mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
Set vecSize = 4 and maxPhase = BLOCK_K/4
This commit is contained in:
@@ -97,9 +97,13 @@ A_{3, 2} A_{3, 3} A_{3, 0} A_{3, 1} ... [phase 1] /
|
||||
int elemsPerOneBanksRow = (numBanks * bankBitWidth) / typeBitWidth;
|
||||
|
||||
int perPhase = std::max(1, elemsPerOneBanksRow / innerDimLength);
|
||||
int maxPhase = outerDimGranularity / perPhase;
|
||||
int vecSize = innerDimLength / maxPhase;
|
||||
assert(vecSize > 0);
|
||||
// Note: the following settings are customized for mfma_32x32x8f16
|
||||
// to avoid **load** bank conflicts
|
||||
// vecSize is set to k_base, which is 4
|
||||
// maxPhase is set to BLOCK_K/4 so that every 16 workitems will access
|
||||
// different banks
|
||||
int vecSize = 4;
|
||||
int maxPhase = innerDimLength / 4;
|
||||
|
||||
return $_get(context, vecSize, perPhase, maxPhase, order);
|
||||
} else {
|
||||
|
||||
@@ -77,7 +77,7 @@ def optimize_ttgir(mod, num_stages, arch):
|
||||
pm = ir.pass_manager(mod.context)
|
||||
pm.enable_debug()
|
||||
pm.add_tritongpu_coalesce_pass()
|
||||
pm.add_tritongpu_remove_layout_conversions_pass()
|
||||
#pm.add_tritongpu_remove_layout_conversions_pass()
|
||||
if _is_cuda(arch):
|
||||
pm.add_tritongpu_accelerate_matmul_pass(arch)
|
||||
# TODO change interface of accelerate_matmul_pass
|
||||
|
||||
Reference in New Issue
Block a user