[ANALYSIS] Fix allocation algorithm (#1929)

2026-04-05 03:01:17 -04:00 · 2023-07-12 15:20:36 -04:00
parent 571c92f2a8
commit fd89aa1d2b
2 changed files with 46 additions and 4 deletions
--- a/lib/Analysis/Allocation.cpp
+++ b/lib/Analysis/Allocation.cpp
@@ -388,10 +388,19 @@ private:
    DenseMap<BufferT *, size_t> bufferStart;
    calculateStarts(buffers, bufferStart);

+    // NOTE: The original paper doesn't consider interference between
+    // the bumped ranges. Buffers that previously did not interfere could
+    // interfere after offset bumping if their liveness ranges overlap.
+    // Therefore, we rerun the interference graph algorithm after bumping so
+    // that we regroup the buffers and color them again. Since we always
+    // increase the buffer offset and keep reducing conflicts, we will
+    // eventually reach a fixed point.
    GraphT interference;
    buildInterferenceGraph(buffers, bufferStart, interference);
-
-    allocate(buffers, bufferStart, interference);
+    do {
+      allocate(buffers, interference, bufferStart);
+      buildInterferenceGraph(buffers, bufferStart, interference);
+    } while (!interference.empty());
  }

  /// Computes the initial shared memory offsets.
@@ -457,6 +466,8 @@ private:
  void buildInterferenceGraph(const SmallVector<BufferT *> &buffers,
                              const DenseMap<BufferT *, size_t> &bufferStart,
                              GraphT &interference) {
+    // Reset interference graph
+    interference.clear();
    for (auto x : buffers) {
      for (auto y : buffers) {
        if (x == y)
@@ -479,8 +490,10 @@ private:

  /// Finalizes shared memory offsets considering interference.
  void allocate(const SmallVector<BufferT *> &buffers,
-                const DenseMap<BufferT *, size_t> &bufferStart,
-                const GraphT &interference) {
+                const GraphT &interference,
+                DenseMap<BufferT *, size_t> &bufferStart) {
+    // Reset shared memory size
+    allocation->sharedMemorySize = 0;
    // First-fit graph coloring
    // Neighbors are nodes that interfere with each other.
    // We color a node by finding the index of the first available
@@ -514,6 +527,7 @@ private:
        adj = std::max(adj, bufferStart.lookup(y) + y->size);
      }
      x->offset = bufferStart.lookup(x) + colors.lookup(x) * adj;
+      bufferStart[x] = x->offset;
      allocation->sharedMemorySize =
          std::max(allocation->sharedMemorySize, x->offset + x->size);
    }
--- a/test/Analysis/test-allocation.mlir
+++ b/test/Analysis/test-allocation.mlir
@@ -202,6 +202,34 @@ tt.func @multi_color(%A : !tt.ptr<f16>) {
  tt.return
 }

+// This example triggers graph coloring with multiple rounds
+// CHECK-LABEL: multi_color_multi_rounds
+tt.func @multi_color_multi_rounds(%arg0: !tt.ptr<f16>) {
+  // CHECK: offset = 0, size = 32
+  %cst = arith.constant dense<0.000000e+00> : tensor<4x4xf16, #A_SHARED>
+  // CHECK-NEXT: offset = 1184, size = 128
+  %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x4xf16, #A_SHARED>
+  // CHECK-NEXT: offset = 1312, size = 8192
+  %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024x4xf16, #A_SHARED>
+  %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
+  // CHECK-NEXT: scratch offset = 32, size = 1152
+  %0 = triton_gpu.convert_layout %cst_2 : (tensor<16x32xf16, #AL>) -> tensor<16x32xf16, #AL>
+  %1 = triton_gpu.convert_layout %cst : (tensor<4x4xf16, #A_SHARED>) -> tensor<4x4xf16, #AL>
+  // CHECK-NEXT: offset = 11968, size = 128
+  %cst_3 = arith.constant dense<0.000000e+00> : tensor<2x32xf16, #A_SHARED>
+  %2 = triton_gpu.convert_layout %cst : (tensor<4x4xf16, #A_SHARED>) -> tensor<4x4xf16, #AL>
+  // CHECK-NEXT: offset = 0, size = 512
+  %cst_4 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
+  %3 = triton_gpu.convert_layout %cst_0 : (tensor<16x4xf16, #A_SHARED>) -> tensor<16x4xf16, #AL>
+  %4 = triton_gpu.convert_layout %cst_1 : (tensor<1024x4xf16, #A_SHARED>) -> tensor<1024x4xf16, #AL>
+  // CHECK-NEXT: scratch offset = 0, size = 1152
+  %5 = triton_gpu.convert_layout %cst_2 : (tensor<16x32xf16, #AL>) -> tensor<16x32xf16, #AL>
+  %6 = triton_gpu.convert_layout %cst_3 : (tensor<2x32xf16, #A_SHARED>) -> tensor<2x32xf16, #AL>
+  // CHECK-NEXT: size = 12096
+  tt.return
+}
+
+
 // CHECK-LABEL: alloc
 tt.func @alloc(%A : !tt.ptr<f16>) {
  // CHECK: offset = 0, size = 512