Merge commit '5df904233c11a65bd131ead7268f84cca7804275' into ifu230810-2

Conflicts: include/triton/Dialect/Triton/Transforms/Passes.h include/triton/Dialect/TritonGPU/IR/Dialect.h include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td lib/Analysis/Allocation.cpp lib/Analysis/Utility.cpp lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp lib/Dialect/Triton/Transforms/RewriteTensorPointer.cpp lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp lib/Dialect/TritonGPU/Transforms/ReorderInstructions.cpp lib/Target/LLVMIR/LLVMIRTranslation.cpp python/src/triton.cc python/triton/compiler/compiler.py python/triton/ops/flash_attention.py python/triton/runtime/autotuner.py python/triton/runtime/jit.py python/triton/tools/aot.py python/tutorials/06-fused-attention.py test/Conversion/tritongpu_to_llvm.mlir test/Target/tritongpu_to_llvmir.mlir test/Target/tritongpu_to_llvmir_noinline.mlir
2026-04-05 03:01:17 -04:00 · 2023-09-01 03:25:33 +00:00
parent 9cdf3a58c3 5df904233c
commit 3eaeb89d18
122 changed files with 7341 additions and 2234 deletions
--- a/lib/Analysis/Allocation.cpp
+++ b/lib/Analysis/Allocation.cpp
@@ -183,6 +183,13 @@ private:
      ReduceOpHelper helper(reduceOp);
      unsigned bytes = helper.getScratchSizeInBytes();
      maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes);
+<<<<<<< HEAD
+=======
+    } else if (auto scanOp = dyn_cast<triton::ScanOp>(op)) {
+      ScanLoweringHelper helper(scanOp);
+      unsigned bytes = helper.getScratchSizeInBytes();
+      maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes);
+>>>>>>> 5df904233c11a65bd131ead7268f84cca7804275
    } else if (auto cvtLayout = dyn_cast<triton::gpu::ConvertLayoutOp>(op)) {
      auto srcTy = cvtLayout.getSrc().getType().cast<RankedTensorType>();
      auto dstTy = cvtLayout.getResult().getType().cast<RankedTensorType>();
@@ -393,10 +400,19 @@ private:
    DenseMap<BufferT *, size_t> bufferStart;
    calculateStarts(buffers, bufferStart);

+    // NOTE: The original paper doesn't consider interference between
+    // the bumped ranges. Buffers that previously do not interfere with
+    // could interfere after offset bumping if their liveness ranges overlap.
+    // Therefore, we rerun the interference graph algorithm after bumping so
+    // that we regroup the buffers and color them again. Since we always
+    // increase the buffer offset and keep reducing conflicts, we will
+    // eventually reach a fixed point.
    GraphT interference;
    buildInterferenceGraph(buffers, bufferStart, interference);
-
-    allocate(buffers, bufferStart, interference);
+    do {
+      allocate(buffers, interference, bufferStart);
+      buildInterferenceGraph(buffers, bufferStart, interference);
+    } while (!interference.empty());
  }

  /// Computes the initial shared memory offsets.
@@ -462,6 +478,8 @@ private:
  void buildInterferenceGraph(const SmallVector<BufferT *> &buffers,
                              const DenseMap<BufferT *, size_t> &bufferStart,
                              GraphT &interference) {
+    // Reset interference graph
+    interference.clear();
    for (auto x : buffers) {
      for (auto y : buffers) {
        if (x == y)
@@ -484,8 +502,10 @@ private:

  /// Finalizes shared memory offsets considering interference.
  void allocate(const SmallVector<BufferT *> &buffers,
-                const DenseMap<BufferT *, size_t> &bufferStart,
-                const GraphT &interference) {
+                const GraphT &interference,
+                DenseMap<BufferT *, size_t> &bufferStart) {
+    // Reset shared memory size
+    allocation->sharedMemorySize = 0;
    // First-fit graph coloring
    // Neighbors are nodes that interfere with each other.
    // We color a node by finding the index of the first available
@@ -519,6 +539,7 @@ private:
        adj = std::max(adj, bufferStart.lookup(y) + y->size);
      }
      x->offset = bufferStart.lookup(x) + colors.lookup(x) * adj;
+      bufferStart[x] = x->offset;
      allocation->sharedMemorySize =
          std::max(allocation->sharedMemorySize, x->offset + x->size);
    }