mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
Merge pull request #268 from ROCmSoftwarePlatform/improve_reduce_for_fa
[CHERRY-PICKED FROM UPSTREAM][BACKEND] no longer uses shared mem or barriers for single-warp reductions (openai#1915)
This commit is contained in:
@@ -60,9 +60,10 @@ SmallVector<SmallVector<unsigned>> ReduceOpHelper::getScratchConfigsFast() {
|
||||
|
||||
auto argLayout = getSrcLayout();
|
||||
auto argLayoutMma = argLayout.dyn_cast<triton::gpu::MmaEncodingAttr>();
|
||||
// if (argLayoutMma && argLayoutMma.getVersionMajor() == 2 &&
|
||||
// triton::gpu::getWarpsPerCTA(argLayout)[axis] == 1)
|
||||
// return {{1, 1}, {1, 1}};
|
||||
|
||||
// a fast reduction with a single warp along `axis` doesn't need inter-warp communication
|
||||
if (isFastReduction() && triton::gpu::getWarpsPerCTA(argLayout)[axis] == 1)
|
||||
return {{0, 0}, {0, 0}};
|
||||
|
||||
/// shared memory block0
|
||||
smemShapes[0] = convertType<unsigned>(getSrcShape());
|
||||
|
||||
Reference in New Issue
Block a user