mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
[BACKEND] Fix crash in reductions on i1 (#1996)
`getScratchSizeInBytes` assumed that the bit width of every element type is a multiple of 8. Because it computed the per-element size with integer division (`bits / 8`), any type narrower than 8 bits contributed 0 bytes, so the resulting scratch size could be 0. This caused a crash for the boolean (i1) type: the reduction lowering would attempt to use shared memory, but none had been assigned to the op. Fix this by rounding the number of bytes per element up, i.e. `ceil(bits / 8)`.
This commit is contained in:
@@ -174,7 +174,7 @@ unsigned ReduceOpHelper::getScratchSizeInBytes() {
|
||||
|
||||
unsigned bytesPerElem = 0;
|
||||
for (const auto &ty : srcElementTypes) {
|
||||
bytesPerElem += ty.getIntOrFloatBitWidth() / 8;
|
||||
bytesPerElem += ceil<unsigned>(ty.getIntOrFloatBitWidth(), 8);
|
||||
}
|
||||
return bytesPerElem * elems;
|
||||
}
|
||||
|
||||
@@ -1380,3 +1380,19 @@ module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-c
|
||||
tt.return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
||||
#slice = #triton_gpu.slice<{dim = 1, parent = #blocked}>
|
||||
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32} {
|
||||
// CHECK-LABEL: reduce_bools
|
||||
tt.func public @reduce_bools(%arg: tensor<256x2xi1, #blocked>) {
|
||||
// CHECK: llvm.mlir.addressof @global_smem
|
||||
%24 = "tt.reduce"(%arg) <{axis = 1 : i32}> ({
|
||||
^bb0(%arg4: i1, %arg5: i1):
|
||||
%48 = arith.ori %arg4, %arg5 : i1
|
||||
tt.reduce.return %48 : i1
|
||||
}) : (tensor<256x2xi1, #blocked>) -> tensor<256xi1, #slice>
|
||||
tt.return
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user