[BACKEND] Fix crash in reductions on i1 (#1996)

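`getScratchSizeInBytes` assumed that the bit width of every element type is a multiple of 8. It computed the per-element size with the integer division `bits / 8`, which truncates: a sub-byte type contributed 0 bytes, so a reduction whose operands are all sub-byte got a scratch size of 0. This caused a bug for the boolean (i1) type, where the reduction lowering would attempt to use shared memory that had not been assigned to the op. Fix this issue by setting the number of bytes per element to `ceil(bits / 8)`.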
2026-04-05 03:01:17 -04:00 · 2023-08-09 19:28:05 +02:00
parent 3be74fa92d
commit 29bfdb6eef
2 changed files with 17 additions and 1 deletions
--- a/lib/Analysis/Utility.cpp
+++ b/lib/Analysis/Utility.cpp
@@ -174,7 +174,7 @@ unsigned ReduceOpHelper::getScratchSizeInBytes() {

  unsigned bytesPerElem = 0;
  for (const auto &ty : srcElementTypes) {
-    bytesPerElem += ty.getIntOrFloatBitWidth() / 8;
+    bytesPerElem += ceil<unsigned>(ty.getIntOrFloatBitWidth(), 8);
  }
  return bytesPerElem * elems;
 }
--- a/test/Conversion/tritongpu_to_llvm.mlir
+++ b/test/Conversion/tritongpu_to_llvm.mlir
@@ -1380,3 +1380,19 @@ module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-c
    tt.return
  }
 }
+
+// -----
+#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#slice = #triton_gpu.slice<{dim = 1, parent = #blocked}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32} {
+  // CHECK-LABEL: reduce_bools
+  tt.func public @reduce_bools(%arg: tensor<256x2xi1, #blocked>) {
+    // CHECK: llvm.mlir.addressof @global_smem
+    %24 = "tt.reduce"(%arg) <{axis = 1 : i32}> ({
+    ^bb0(%arg4: i1, %arg5: i1):
+      %48 = arith.ori %arg4, %arg5 : i1
+      tt.reduce.return %48 : i1
+    }) : (tensor<256x2xi1, #blocked>) -> tensor<256xi1, #slice>
+    tt.return
+  }
+}