[BACKEND] Fix crash in reductions on i1 (#1996)

`getScratchSizeInBytes` was assuming that the size of every type in bits is a
multiple of 8. When it is not, the integer division `bits / 8` truncates, so an
i1 element contributed 0 bytes and the function could return 0. This caused a
bug for the boolean (i1) type, where the reduction lowering would attempt to use
shared memory, which was not assigned to the op.

Fix this issue by setting the number of bytes per element to `ceil(bits / 8)`.
This commit is contained in:
Goran Flegar
2023-08-09 19:28:05 +02:00
committed by GitHub
parent 3be74fa92d
commit 29bfdb6eef
2 changed files with 17 additions and 1 deletions

View File

@@ -174,7 +174,7 @@ unsigned ReduceOpHelper::getScratchSizeInBytes() {
unsigned bytesPerElem = 0;
for (const auto &ty : srcElementTypes) {
- bytesPerElem += ty.getIntOrFloatBitWidth() / 8;
+ bytesPerElem += ceil<unsigned>(ty.getIntOrFloatBitWidth(), 8);
}
return bytesPerElem * elems;
}

View File

@@ -1380,3 +1380,19 @@ module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-c
tt.return
}
}
// -----
// Regression test for reducing a tensor of booleans (i1).
// The 256x2 input uses the #blocked layout below and is reduced along axis 1
// with a logical OR, producing a 256-element result in the #slice layout.
// The FileCheck directive inside the function expects the lowered output to
// take the address of @global_smem.
#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
// Layout of the reduction result: #blocked with dimension 1 sliced away.
#slice = #triton_gpu.slice<{dim = 1, parent = #blocked}>
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32} {
// CHECK-LABEL: reduce_bools
tt.func public @reduce_bools(%arg: tensor<256x2xi1, #blocked>) {
// CHECK: llvm.mlir.addressof @global_smem
%24 = "tt.reduce"(%arg) <{axis = 1 : i32}> ({
// Combiner region: OR the two i1 operands and return the result.
^bb0(%arg4: i1, %arg5: i1):
%48 = arith.ori %arg4, %arg5 : i1
tt.reduce.return %48 : i1
}) : (tensor<256x2xi1, #blocked>) -> tensor<256xi1, #slice>
tt.return
}
}