Merge pull request #268 from ROCmSoftwarePlatform/improve_reduce_for_fa

[CHERRY-PICKED FROM UPSTREAM][BACKEND] no longer uses shared mem or barriers for single-warp reductions (openai#1915)
2026-04-05 03:01:17 -04:00 · 2023-08-21 13:29:11 -05:00
parent d86b19f7a3 d0b7793935
commit fa429316d4
6 changed files with 83 additions and 33 deletions
--- a/lib/Analysis/Utility.cpp
+++ b/lib/Analysis/Utility.cpp
@@ -60,9 +60,10 @@ SmallVector<SmallVector<unsigned>> ReduceOpHelper::getScratchConfigsFast() {

  auto argLayout = getSrcLayout();
  auto argLayoutMma = argLayout.dyn_cast<triton::gpu::MmaEncodingAttr>();
-  // if (argLayoutMma && argLayoutMma.getVersionMajor() == 2 &&
-  //     triton::gpu::getWarpsPerCTA(argLayout)[axis] == 1)
-  //   return {{1, 1}, {1, 1}};
+
  // a single warp along the reduction axis needs no inter-warp communication
+  if (isFastReduction() && triton::gpu::getWarpsPerCTA(argLayout)[axis] == 1)
+    return {{0, 0}, {0, 0}};

  /// shared memory block0
  smemShapes[0] = convertType<unsigned>(getSrcShape());