mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
[ANALYSIS] Fix allocation algorithm (#1929)
This commit is contained in:
@@ -388,10 +388,19 @@ private:
|
||||
DenseMap<BufferT *, size_t> bufferStart;
|
||||
calculateStarts(buffers, bufferStart);
|
||||
|
||||
// NOTE: The original paper doesn't consider interference between
|
||||
// the bumped ranges. Buffers that previously did not interfere with each other
|
||||
// could interfere after offset bumping if their liveness ranges overlap.
|
||||
// Therefore, we rerun the interference graph algorithm after bumping so
|
||||
// that we regroup the buffers and color them again. Since we always
|
||||
// increase the buffer offset and keep reducing conflicts, we will
|
||||
// eventually reach a fixed point.
|
||||
GraphT interference;
|
||||
buildInterferenceGraph(buffers, bufferStart, interference);
|
||||
|
||||
allocate(buffers, bufferStart, interference);
|
||||
do {
|
||||
allocate(buffers, interference, bufferStart);
|
||||
buildInterferenceGraph(buffers, bufferStart, interference);
|
||||
} while (!interference.empty());
|
||||
}
|
||||
|
||||
/// Computes the initial shared memory offsets.
|
||||
@@ -457,6 +466,8 @@ private:
|
||||
void buildInterferenceGraph(const SmallVector<BufferT *> &buffers,
|
||||
const DenseMap<BufferT *, size_t> &bufferStart,
|
||||
GraphT &interference) {
|
||||
// Reset interference graph
|
||||
interference.clear();
|
||||
for (auto x : buffers) {
|
||||
for (auto y : buffers) {
|
||||
if (x == y)
|
||||
@@ -479,8 +490,10 @@ private:
|
||||
|
||||
/// Finalizes shared memory offsets considering interference.
|
||||
void allocate(const SmallVector<BufferT *> &buffers,
|
||||
const DenseMap<BufferT *, size_t> &bufferStart,
|
||||
const GraphT &interference) {
|
||||
const GraphT &interference,
|
||||
DenseMap<BufferT *, size_t> &bufferStart) {
|
||||
// Reset shared memory size
|
||||
allocation->sharedMemorySize = 0;
|
||||
// First-fit graph coloring
|
||||
// Neighbors are nodes that interfere with each other.
|
||||
// We color a node by finding the index of the first available
|
||||
@@ -514,6 +527,7 @@ private:
|
||||
adj = std::max(adj, bufferStart.lookup(y) + y->size);
|
||||
}
|
||||
x->offset = bufferStart.lookup(x) + colors.lookup(x) * adj;
|
||||
bufferStart[x] = x->offset;
|
||||
allocation->sharedMemorySize =
|
||||
std::max(allocation->sharedMemorySize, x->offset + x->size);
|
||||
}
|
||||
|
||||
@@ -202,6 +202,34 @@ tt.func @multi_color(%A : !tt.ptr<f16>) {
|
||||
tt.return
|
||||
}
|
||||
|
||||
// This example triggers graph coloring with multiple rounds
|
||||
// CHECK-LABEL: multi_color_multi_rounds
|
||||
tt.func @multi_color_multi_rounds(%arg0: !tt.ptr<f16>) {
|
||||
// CHECK: offset = 0, size = 32
|
||||
%cst = arith.constant dense<0.000000e+00> : tensor<4x4xf16, #A_SHARED>
|
||||
// CHECK-NEXT: offset = 1184, size = 128
|
||||
%cst_0 = arith.constant dense<0.000000e+00> : tensor<16x4xf16, #A_SHARED>
|
||||
// CHECK-NEXT: offset = 1312, size = 8192
|
||||
%cst_1 = arith.constant dense<0.000000e+00> : tensor<1024x4xf16, #A_SHARED>
|
||||
%cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
|
||||
// CHECK-NEXT: scratch offset = 32, size = 1152
|
||||
%0 = triton_gpu.convert_layout %cst_2 : (tensor<16x32xf16, #AL>) -> tensor<16x32xf16, #AL>
|
||||
%1 = triton_gpu.convert_layout %cst : (tensor<4x4xf16, #A_SHARED>) -> tensor<4x4xf16, #AL>
|
||||
// CHECK-NEXT: offset = 11968, size = 128
|
||||
%cst_3 = arith.constant dense<0.000000e+00> : tensor<2x32xf16, #A_SHARED>
|
||||
%2 = triton_gpu.convert_layout %cst : (tensor<4x4xf16, #A_SHARED>) -> tensor<4x4xf16, #AL>
|
||||
// CHECK-NEXT: offset = 0, size = 512
|
||||
%cst_4 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
|
||||
%3 = triton_gpu.convert_layout %cst_0 : (tensor<16x4xf16, #A_SHARED>) -> tensor<16x4xf16, #AL>
|
||||
%4 = triton_gpu.convert_layout %cst_1 : (tensor<1024x4xf16, #A_SHARED>) -> tensor<1024x4xf16, #AL>
|
||||
// CHECK-NEXT: scratch offset = 0, size = 1152
|
||||
%5 = triton_gpu.convert_layout %cst_2 : (tensor<16x32xf16, #AL>) -> tensor<16x32xf16, #AL>
|
||||
%6 = triton_gpu.convert_layout %cst_3 : (tensor<2x32xf16, #A_SHARED>) -> tensor<2x32xf16, #AL>
|
||||
// CHECK-NEXT: size = 12096
|
||||
tt.return
|
||||
}
|
||||
|
||||
|
||||
// CHECK-LABEL: alloc
|
||||
tt.func @alloc(%A : !tt.ptr<f16>) {
|
||||
// CHECK: offset = 0, size = 512
|
||||
|
||||
Reference in New Issue
Block a user