Set vecSize = 4 and maxPhase = BLOCK_K/4

2026-04-05 03:01:17 -04:00 · 2023-08-09 21:31:08 -05:00
parent 398d2c7dd0
commit 7156fcb0ef
2 changed files with 8 additions and 4 deletions
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -97,9 +97,13 @@ A_{3, 2}  A_{3, 3}  A_{3, 0}  A_{3, 1} ...   [phase 1] /
            int elemsPerOneBanksRow = (numBanks * bankBitWidth) / typeBitWidth;

            int perPhase = std::max(1, elemsPerOneBanksRow / innerDimLength);
-            int maxPhase = outerDimGranularity / perPhase;
-            int vecSize = innerDimLength / maxPhase;
-            assert(vecSize > 0);
+            // Note: the following settings are customized for mfma_32x32x8f16
+            // to avoid **load** bank conflicts.
+            // vecSize is set to k_base, which is 4.
+            // maxPhase is set to BLOCK_K/4 so that every 16 workitems will access
+            // different banks.
+            int vecSize = 4;
+            int maxPhase = innerDimLength / 4;

            return $_get(context, vecSize, perPhase, maxPhase, order);
          } else {
--- a/python/triton/compiler/compiler.py
+++ b/python/triton/compiler/compiler.py
@@ -77,7 +77,7 @@ def optimize_ttgir(mod, num_stages, arch):
    pm = ir.pass_manager(mod.context)
    pm.enable_debug()
    pm.add_tritongpu_coalesce_pass()
-    pm.add_tritongpu_remove_layout_conversions_pass()
+    #pm.add_tritongpu_remove_layout_conversions_pass()
    if _is_cuda(arch):
        pm.add_tritongpu_accelerate_matmul_pass(arch)
    # TODO change interface of accelerate_matmul_pass