[ANALYSIS] Fix allocation algorithm (#1929)

This commit is contained in:
Keren Zhou
2023-07-12 15:20:36 -04:00
committed by GitHub
parent 571c92f2a8
commit fd89aa1d2b
2 changed files with 46 additions and 4 deletions

View File

@@ -388,10 +388,19 @@ private:
DenseMap<BufferT *, size_t> bufferStart;
calculateStarts(buffers, bufferStart);
// NOTE: The original paper doesn't consider interference between
// the bumped ranges. Buffers that previously did not interfere with each
// other could interfere after offset bumping if their liveness ranges overlap.
// Therefore, we rerun the interference graph algorithm after bumping so
// that we regroup the buffers and color them again. Since we always
// increase the buffer offset and keep reducing conflicts, we will
// eventually reach a fixed point.
GraphT interference;
buildInterferenceGraph(buffers, bufferStart, interference);
allocate(buffers, bufferStart, interference);
do {
allocate(buffers, interference, bufferStart);
buildInterferenceGraph(buffers, bufferStart, interference);
} while (!interference.empty());
}
/// Computes the initial shared memory offsets.
@@ -457,6 +466,8 @@ private:
void buildInterferenceGraph(const SmallVector<BufferT *> &buffers,
const DenseMap<BufferT *, size_t> &bufferStart,
GraphT &interference) {
// Reset interference graph
interference.clear();
for (auto x : buffers) {
for (auto y : buffers) {
if (x == y)
@@ -479,8 +490,10 @@ private:
/// Finalizes shared memory offsets considering interference.
void allocate(const SmallVector<BufferT *> &buffers,
const DenseMap<BufferT *, size_t> &bufferStart,
const GraphT &interference) {
const GraphT &interference,
DenseMap<BufferT *, size_t> &bufferStart) {
// Reset shared memory size
allocation->sharedMemorySize = 0;
// First-fit graph coloring
// Neighbors are nodes that interfere with each other.
// We color a node by finding the index of the first available
@@ -514,6 +527,7 @@ private:
adj = std::max(adj, bufferStart.lookup(y) + y->size);
}
x->offset = bufferStart.lookup(x) + colors.lookup(x) * adj;
bufferStart[x] = x->offset;
allocation->sharedMemorySize =
std::max(allocation->sharedMemorySize, x->offset + x->size);
}

View File

@@ -202,6 +202,34 @@ tt.func @multi_color(%A : !tt.ptr<f16>) {
tt.return
}
// This example triggers graph coloring with multiple rounds
// The expected values below assign offset 0 three times (the 32-byte buffer,
// the 512-byte buffer, and the second scratch buffer), while every other
// buffer gets a distinct, higher offset.
// NOTE(review): presumably the repeated offset 0 is legal because those
// buffers are never live at the same time -- confirm against the liveness
// analysis.
// For the #A_SHARED constants, each expected size equals the element count
// times 2 bytes (f16), e.g. 4x4xf16 -> 32 and 1024x4xf16 -> 8192.
// CHECK-LABEL: multi_color_multi_rounds
tt.func @multi_color_multi_rounds(%arg0: !tt.ptr<f16>) {
// CHECK: offset = 0, size = 32
%cst = arith.constant dense<0.000000e+00> : tensor<4x4xf16, #A_SHARED>
// CHECK-NEXT: offset = 1184, size = 128
%cst_0 = arith.constant dense<0.000000e+00> : tensor<16x4xf16, #A_SHARED>
// CHECK-NEXT: offset = 1312, size = 8192
%cst_1 = arith.constant dense<0.000000e+00> : tensor<1024x4xf16, #A_SHARED>
%cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
// CHECK-NEXT: scratch offset = 32, size = 1152
%0 = triton_gpu.convert_layout %cst_2 : (tensor<16x32xf16, #AL>) -> tensor<16x32xf16, #AL>
%1 = triton_gpu.convert_layout %cst : (tensor<4x4xf16, #A_SHARED>) -> tensor<4x4xf16, #AL>
// CHECK-NEXT: offset = 11968, size = 128
%cst_3 = arith.constant dense<0.000000e+00> : tensor<2x32xf16, #A_SHARED>
%2 = triton_gpu.convert_layout %cst : (tensor<4x4xf16, #A_SHARED>) -> tensor<4x4xf16, #AL>
// CHECK-NEXT: offset = 0, size = 512
%cst_4 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
%3 = triton_gpu.convert_layout %cst_0 : (tensor<16x4xf16, #A_SHARED>) -> tensor<16x4xf16, #AL>
%4 = triton_gpu.convert_layout %cst_1 : (tensor<1024x4xf16, #A_SHARED>) -> tensor<1024x4xf16, #AL>
// CHECK-NEXT: scratch offset = 0, size = 1152
%5 = triton_gpu.convert_layout %cst_2 : (tensor<16x32xf16, #AL>) -> tensor<16x32xf16, #AL>
%6 = triton_gpu.convert_layout %cst_3 : (tensor<2x32xf16, #A_SHARED>) -> tensor<2x32xf16, #AL>
// The expected total below is 11968 + 128 = 12096, i.e. the end of the
// highest-placed buffer among the expectations above.
// CHECK-NEXT: size = 12096
tt.return
}
// CHECK-LABEL: alloc
tt.func @alloc(%A : !tt.ptr<f16>) {
// CHECK: offset = 0, size = 512