Set vecSize = 4 and maxPhase = BLOCK_K/4

This commit is contained in:
Lixun Zhang
2023-08-09 21:31:08 -05:00
committed by Lixun Zhang
parent 398d2c7dd0
commit 7156fcb0ef
2 changed files with 8 additions and 4 deletions

View File

@@ -97,9 +97,13 @@ A_{3, 2} A_{3, 3} A_{3, 0} A_{3, 1} ... [phase 1] /
int elemsPerOneBanksRow = (numBanks * bankBitWidth) / typeBitWidth;
int perPhase = std::max(1, elemsPerOneBanksRow / innerDimLength);
int maxPhase = outerDimGranularity / perPhase;
int vecSize = innerDimLength / maxPhase;
assert(vecSize > 0);
// Note: the following settings are customized for mfma_32x32x8f16
// to avoid **load** bank conflicts
// vecSize is set to k_base, which is 4
// maxPhase is set to BLOCK_K/4 so that every 16 workitems will access
// different banks
int vecSize = 4;
int maxPhase = innerDimLength / 4;
return $_get(context, vecSize, perPhase, maxPhase, order);
} else {

View File

@@ -77,7 +77,7 @@ def optimize_ttgir(mod, num_stages, arch):
pm = ir.pass_manager(mod.context)
pm.enable_debug()
pm.add_tritongpu_coalesce_pass()
pm.add_tritongpu_remove_layout_conversions_pass()
#pm.add_tritongpu_remove_layout_conversions_pass()
if _is_cuda(arch):
pm.add_tritongpu_accelerate_matmul_pass(arch)
# TODO change interface of accelerate_matmul_pass