Merge commit '5df904233c11a65bd131ead7268f84cca7804275' into ifu230810-2

Conflicts:
	include/triton/Dialect/Triton/Transforms/Passes.h
	include/triton/Dialect/TritonGPU/IR/Dialect.h
	include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
	lib/Analysis/Allocation.cpp
	lib/Analysis/Utility.cpp
	lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
	lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp
	lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
	lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp
	lib/Dialect/Triton/Transforms/RewriteTensorPointer.cpp
	lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp
	lib/Dialect/TritonGPU/Transforms/ReorderInstructions.cpp
	lib/Target/LLVMIR/LLVMIRTranslation.cpp
	python/src/triton.cc
	python/triton/compiler/compiler.py
	python/triton/ops/flash_attention.py
	python/triton/runtime/autotuner.py
	python/triton/runtime/jit.py
	python/triton/tools/aot.py
	python/tutorials/06-fused-attention.py
	test/Conversion/tritongpu_to_llvm.mlir
	test/Target/tritongpu_to_llvmir.mlir
	test/Target/tritongpu_to_llvmir_noinline.mlir
This commit is contained in:
Jason Furmanek
2023-09-01 03:25:33 +00:00
122 changed files with 7341 additions and 2234 deletions

View File

@@ -183,6 +183,13 @@ private:
ReduceOpHelper helper(reduceOp);
unsigned bytes = helper.getScratchSizeInBytes();
maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes);
<<<<<<< HEAD
=======
} else if (auto scanOp = dyn_cast<triton::ScanOp>(op)) {
ScanLoweringHelper helper(scanOp);
unsigned bytes = helper.getScratchSizeInBytes();
maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes);
>>>>>>> 5df904233c11a65bd131ead7268f84cca7804275
} else if (auto cvtLayout = dyn_cast<triton::gpu::ConvertLayoutOp>(op)) {
auto srcTy = cvtLayout.getSrc().getType().cast<RankedTensorType>();
auto dstTy = cvtLayout.getResult().getType().cast<RankedTensorType>();
@@ -393,10 +400,19 @@ private:
DenseMap<BufferT *, size_t> bufferStart;
calculateStarts(buffers, bufferStart);
// NOTE: The original paper doesn't consider interference between
// the bumped ranges. Buffers that previously did not interfere
// could interfere after offset bumping if their liveness ranges overlap.
// Therefore, we rerun the interference graph algorithm after bumping so
// that we regroup the buffers and color them again. Since we always
// increase the buffer offset and keep reducing conflicts, we will
// eventually reach a fixed point.
GraphT interference;
buildInterferenceGraph(buffers, bufferStart, interference);
allocate(buffers, bufferStart, interference);
do {
allocate(buffers, interference, bufferStart);
buildInterferenceGraph(buffers, bufferStart, interference);
} while (!interference.empty());
}
/// Computes the initial shared memory offsets.
@@ -462,6 +478,8 @@ private:
void buildInterferenceGraph(const SmallVector<BufferT *> &buffers,
const DenseMap<BufferT *, size_t> &bufferStart,
GraphT &interference) {
// Reset interference graph
interference.clear();
for (auto x : buffers) {
for (auto y : buffers) {
if (x == y)
@@ -484,8 +502,10 @@ private:
/// Finalizes shared memory offsets considering interference.
void allocate(const SmallVector<BufferT *> &buffers,
const DenseMap<BufferT *, size_t> &bufferStart,
const GraphT &interference) {
const GraphT &interference,
DenseMap<BufferT *, size_t> &bufferStart) {
// Reset shared memory size
allocation->sharedMemorySize = 0;
// First-fit graph coloring
// Neighbors are nodes that interfere with each other.
// We color a node by finding the index of the first available
@@ -519,6 +539,7 @@ private:
adj = std::max(adj, bufferStart.lookup(y) + y->size);
}
x->offset = bufferStart.lookup(x) + colors.lookup(x) * adj;
bufferStart[x] = x->offset;
allocation->sharedMemorySize =
std::max(allocation->sharedMemorySize, x->offset + x->size);
}