[MFMA] Implementation of MFMA DotOp pipeline (#180)

* [MFMA] Implementation of MFMA DotOp pipeline

* Added MFMA test_dot unit tests

* Added missing ifdefs

* Update offline tests

* Removing duplicate parts

* fix build after rebase

* remove redundant stuff

* simplify MMAv3.cpp

* move reps function into operand attr description,
remove coreMatrixType type from layout conversion,
refactored type conversion

* remove duplication of mfma intruction shape computation

* move all MFMA instruction shape details into layout attribute

* fix formatting

* reenable matmul acceleration

* fix dot operator type conversion

* add offline test for dotop

* add missing ifdef wrappers

* run clang format on changes

* review and rebase fix

* add switch for MFMA instructions

* change check precision for float16 test

* disable redundant check for allowTF32

* - skip unsupported block size in matmul autotuner
- support transposed inputs of dot

* reenable matmul acceleration

* Add first part to FMA for dot operation on HW without MFMA support.

* Fix offline tests.

* Fix lit tests

* refactor mmav3 to mfma

* fix rebase issues

* fix detection of mfma support and wrong assert

* remove unnecessary macros

* Add documentation for MFMA layout.

* fix line size computation for B argument

* Fix getElemsPerThread() and getSizePerThread() functions for MFMA layout.

---------

Co-authored-by: Alexander Efimov <efimov.alexander@gmail.com>
Co-authored-by: dfukalov <1671137+dfukalov@users.noreply.github.com>
Co-authored-by: weihan13 <weihan13@amd.com>
Co-authored-by: Ognjen Plavsic <ognjen.plavsic@dxc.com>
This commit is contained in:
Andrey Shukshov
2023-05-30 23:10:28 +02:00
committed by GitHub
parent 1ee82e2a8e
commit fee5950893
17 changed files with 1204 additions and 81 deletions

View File

@@ -1176,30 +1176,68 @@ def test_permute(dtype_str, shape, perm, device='cuda'):
# test dot
# ---------------
# @pytest.mark.parametrize("M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype",
# [(*shape, 4, False, False, epilogue, allow_tf32, dtype)
# for shape in [(64, 64, 64), (16, 16, 16)]
# for epilogue in ['none', 'trans', 'add-matrix', 'add-rows', 'add-cols', 'softmax', 'chain-dot']
# for allow_tf32 in [True, False]
# for dtype in ['float16', 'float32']
# if not (allow_tf32 and (dtype in ['float16']))] +
# [(*shape_nw, col_a, col_b, 'none', allow_tf32, dtype)
# for shape_nw in [[128, 256, 32, 8],
# [128, 16, 32, 4],
# [32, 128, 64, 4],
# [128, 128, 64, 4],
# [64, 128, 128, 4],
# [32, 128, 64, 2],
# [128, 128, 64, 2],
# [64, 128, 128, 2]]
# for allow_tf32 in [True]
# for col_a in [True, False]
# for col_b in [True, False]
# for dtype in ['int8', 'float16', 'float32']])
# MFMA Test Dot tests
@pytest.mark.parametrize("M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype",
[(*shape, 4, False, False, epilogue, allow_tf32, dtype)
for shape in [(64, 64, 64), (16, 16, 16)]
for epilogue in ['none', 'trans', 'add-matrix', 'add-rows', 'add-cols', 'softmax', 'chain-dot']
[(*shape, 2, False, False, epilogue, allow_tf32, dtype)
for shape in [(64, 64, 64), (32, 32, 32)]
for epilogue in ['none', 'trans', 'add-matrix']
for allow_tf32 in [True, False]
for dtype in ['float16', 'float32']
if not (allow_tf32 and (dtype in ['float16']))] +
[(*shape_nw, col_a, col_b, 'none', allow_tf32, dtype)
for shape_nw in [[128, 256, 32, 8],
[128, 16, 32, 4],
[32, 128, 64, 4],
[128, 128, 64, 4],
[64, 128, 128, 4],
[32, 128, 64, 2],
for shape_nw in [[128, 128, 32, 2],
[128, 128, 64, 2],
[64, 128, 128, 2]]
for allow_tf32 in [True]
for col_a in [True, False]
for col_b in [True, False]
[128, 32, 32, 2],
[128, 32, 64, 2],
[32, 32, 32, 4],
[32, 32, 64, 4],
[32, 32, 128, 4],
[32, 32, 256, 4],
[64, 64, 32, 4],
[64, 64, 64, 4],
[64, 64, 128, 4],
[32, 128, 64, 2],
[64, 128, 128, 2],
[32, 32, 64, 1],
[32, 128, 64, 2],
[64, 128, 128, 2],
[32, 256, 32, 8],
]
for allow_tf32 in [False, True]
for col_a in [False]
for col_b in [False]
for dtype in ['int8', 'float16', 'float32']])
def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype, device='cuda'):
capability = torch.cuda.get_device_capability()
if torch.version.hip is not None:
# set capability to large number to jump over check below
# check are not relevant to amd gpu, left them for smaller diff between test_core.py and test_core_amd.py tests
capability = (100, 100)
if capability[0] < 7:
pytest.skip("Only test tl.dot() on devices with sm >= 70")
if capability[0] < 8:
@@ -1320,7 +1358,7 @@ def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype, devi
z_ref = np.matmul(z_ref, w)
# compare
# print(z_ref[:,0], z_tri[:,0])
if dtype == 'float32':
if dtype == 'float32' or dtype == 'float16':
# XXX: Somehow there's a larger difference when we use float32
np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01, atol=1e-3)
else: