[MFMA] Implementation of MFMA DotOp pipeline (#180)
* [MFMA] Implementation of MFMA DotOp pipeline
* Added MFMA test_dot unit tests
* Added missing ifdefs
* Update offline tests
* Removing duplicate parts
* fix build after rebase
* remove redundant stuff
* simplify MMAv3.cpp
* move reps function into operand attr description, remove coreMatrixType type from layout conversion, refactored type conversion
* remove duplication of MFMA instruction shape computation
* move all MFMA instruction shape details into layout attribute
* fix formatting
* reenable matmul acceleration
* fix dot operator type conversion
* add offline test for dotop
* add missing ifdef wrappers
* run clang format on changes
* review and rebase fix
* add switch for MFMA instructions
* change check precision for float16 test
* disable redundant check for allowTF32
* skip unsupported block sizes in matmul autotuner; support transposed inputs of dot
* reenable matmul acceleration
* Add first part of an FMA fallback for the dot operation on HW without MFMA support.
* Fix offline tests.
* Fix lit tests
* refactor mmav3 to mfma
* fix rebase issues
* fix detection of MFMA support and wrong assert
* remove unnecessary macros
* Add documentation for MFMA layout.
* fix line size computation for B argument
* Fix getElemsPerThread() and getSizePerThread() functions for MFMA layout.

---------

Co-authored-by: Alexander Efimov <efimov.alexander@gmail.com>
Co-authored-by: dfukalov <1671137+dfukalov@users.noreply.github.com>
Co-authored-by: weihan13 <weihan13@amd.com>
Co-authored-by: Ognjen Plavsic <ognjen.plavsic@dxc.com>
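For context on the test change below: the new MFMA parametrization builds its test matrix from nested list comprehensions. The following is a minimal standalone sketch of how the first (epilogue) grid expands into (M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype) tuples; the `cases` name is only illustrative and not part of the patch:

# Sketch: expansion of the MFMA epilogue test grid from the diff below.
# 'cases' is a local illustrative name, not from the patch itself.
cases = [(*shape, 2, False, False, epilogue, allow_tf32, dtype)
         for shape in [(64, 64, 64), (32, 32, 32)]
         for epilogue in ['none', 'trans', 'add-matrix']
         for allow_tf32 in [True, False]
         for dtype in ['float16', 'float32']
         # tf32 is a float32-only mode, so tf32 + float16 combinations are skipped
         if not (allow_tf32 and (dtype in ['float16']))]

print(len(cases))  # 2 shapes * 3 epilogues * 3 surviving (allow_tf32, dtype) pairs = 18
print(cases[0])    # (64, 64, 64, 2, False, False, 'none', True, 'float32')

Each resulting tuple becomes one test_dot invocation via @pytest.mark.parametrize.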
@@ -1176,30 +1176,68 @@ def test_permute(dtype_str, shape, perm, device='cuda'):
 # test dot
 # ---------------
 
+# @pytest.mark.parametrize("M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype",
+#                          [(*shape, 4, False, False, epilogue, allow_tf32, dtype)
+#                           for shape in [(64, 64, 64), (16, 16, 16)]
+#                           for epilogue in ['none', 'trans', 'add-matrix', 'add-rows', 'add-cols', 'softmax', 'chain-dot']
+#                           for allow_tf32 in [True, False]
+#                           for dtype in ['float16', 'float32']
+#                           if not (allow_tf32 and (dtype in ['float16']))] +
+#                          [(*shape_nw, col_a, col_b, 'none', allow_tf32, dtype)
+#                           for shape_nw in [[128, 256, 32, 8],
+#                                            [128, 16, 32, 4],
+#                                            [32, 128, 64, 4],
+#                                            [128, 128, 64, 4],
+#                                            [64, 128, 128, 4],
+#                                            [32, 128, 64, 2],
+#                                            [128, 128, 64, 2],
+#                                            [64, 128, 128, 2]]
+#                           for allow_tf32 in [True]
+#                           for col_a in [True, False]
+#                           for col_b in [True, False]
+#                           for dtype in ['int8', 'float16', 'float32']])
+
+
+# MFMA Test Dot tests
 @pytest.mark.parametrize("M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype",
-                         [(*shape, 4, False, False, epilogue, allow_tf32, dtype)
-                          for shape in [(64, 64, 64), (16, 16, 16)]
-                          for epilogue in ['none', 'trans', 'add-matrix', 'add-rows', 'add-cols', 'softmax', 'chain-dot']
+                         [(*shape, 2, False, False, epilogue, allow_tf32, dtype)
+                          for shape in [(64, 64, 64), (32, 32, 32)]
+                          for epilogue in ['none', 'trans', 'add-matrix']
                           for allow_tf32 in [True, False]
                           for dtype in ['float16', 'float32']
                           if not (allow_tf32 and (dtype in ['float16']))] +
                          [(*shape_nw, col_a, col_b, 'none', allow_tf32, dtype)
-                          for shape_nw in [[128, 256, 32, 8],
-                                           [128, 16, 32, 4],
-                                           [32, 128, 64, 4],
-                                           [128, 128, 64, 4],
-                                           [64, 128, 128, 4],
-                                           [32, 128, 64, 2],
-                                           [128, 128, 64, 2],
-                                           [64, 128, 128, 2]]
-                          for allow_tf32 in [True]
-                          for col_a in [True, False]
-                          for col_b in [True, False]
+                          for shape_nw in [[128, 128, 32, 2],
+                                           [128, 32, 32, 2],
+                                           [128, 32, 64, 2],
+                                           [32, 32, 32, 4],
+                                           [32, 32, 64, 4],
+                                           [32, 32, 128, 4],
+                                           [32, 32, 256, 4],
+                                           [64, 64, 32, 4],
+                                           [64, 64, 64, 4],
+                                           [64, 64, 128, 4],
+                                           [32, 128, 64, 2],
+                                           [64, 128, 128, 2],
+                                           [32, 32, 64, 1],
+                                           [32, 128, 64, 2],
+                                           [64, 128, 128, 2],
+                                           [32, 256, 32, 8],
+                                           ]
+                          for allow_tf32 in [False, True]
+                          for col_a in [False]
+                          for col_b in [False]
                           for dtype in ['int8', 'float16', 'float32']])
 def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype, device='cuda'):
     capability = torch.cuda.get_device_capability()
+
+    if torch.version.hip is not None:
+        # set capability to a large number to jump over the checks below;
+        # the checks are not relevant to AMD GPUs, but are kept for a smaller
+        # diff between the test_core.py and test_core_amd.py tests
+        capability = (100, 100)
+
     if capability[0] < 7:
         pytest.skip("Only test tl.dot() on devices with sm >= 70")
     if capability[0] < 8:
@@ -1320,7 +1358,7 @@ def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype, devi
         z_ref = np.matmul(z_ref, w)
     # compare
     # print(z_ref[:,0], z_tri[:,0])
-    if dtype == 'float32':
+    if dtype == 'float32' or dtype == 'float16':
         # XXX: Somehow there's a larger difference when we use float32
         np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01, atol=1e-3)
     else:
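As a side note, the capability override added in the first hunk is a portability guard rather than MFMA logic. A minimal sketch of the same pattern, assuming only a stock PyTorch install (`capability` is a local name here):

import torch

# On ROCm builds torch.version.hip is a version string; on CUDA builds it is None.
# A sentinel capability larger than any real sm version lets CUDA-oriented
# capability checks fall through on AMD GPUs.
if torch.version.hip is not None:
    capability = (100, 100)
elif torch.cuda.is_available():
    capability = torch.cuda.get_device_capability()
else:
    capability = (0, 0)  # no GPU available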