[MFMA] Implementation of MFMA DotOp pipeline (#180)

* [MFMA] Implementation of MFMA DotOp pipeline * Added MFMA test_dot unit tests * Added missing ifdefs * Update offline tests * Removing duplicate parts * fix build after rebase * remove redundant stuff * simplify MMAv3.cpp * move reps function into operand attr description, remove coreMatrixType type from layout conversion, refactored type conversion * remove duplication of mfma intruction shape computation * move all MFMA instruction shape details into layout attribute * fix formatting * reenable matmul acceleration * fix dot operator type conversion * add offline test for dotop * add missing ifdef wrappers * run clang format on changes * review and rebase fix * add switch for MFMA instructions * change check precision for float16 test * disable redundant check for allowTF32 * - skip unsupported block size in matmul autotuner - support transposed inputs of dot * reenable matmul acceleration * Add first part to FMA for dot operation on HW without MFMA support. * Fix offline tests. * Fix lit tests * refactor mmav3 to mfma * fix rebase issues * fix detection of mfma support and wrong assert * remove unnecessary macros * Add documentation for MFMA layout. * fix line size computation for B argument * Fix getElemsPerThread() and getSizePerThread() functions for MFMA layout. --------- Co-authored-by: Alexander Efimov <efimov.alexander@gmail.com> Co-authored-by: dfukalov <1671137+dfukalov@users.noreply.github.com> Co-authored-by: weihan13 <weihan13@amd.com> Co-authored-by: Ognjen Plavsic <ognjen.plavsic@dxc.com>
2026-04-05 03:01:17 -04:00 · 2023-05-30 23:10:28 +02:00
parent 1ee82e2a8e
commit fee5950893
17 changed files with 1204 additions and 81 deletions
--- a/python/test/unit/language/test_core_amd.py
+++ b/python/test/unit/language/test_core_amd.py
@@ -1176,30 +1176,68 @@ def test_permute(dtype_str, shape, perm, device='cuda'):
 # test dot
 # ---------------

+# @pytest.mark.parametrize("M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype",
+#                          [(*shape, 4, False, False, epilogue, allow_tf32, dtype)
+#                           for shape in [(64, 64, 64), (16, 16, 16)]
+#                           for epilogue in ['none', 'trans', 'add-matrix', 'add-rows', 'add-cols', 'softmax', 'chain-dot']
+#                           for allow_tf32 in [True, False]
+#                           for dtype in ['float16', 'float32']
+#                           if not (allow_tf32 and (dtype in ['float16']))] +
+#                          [(*shape_nw, col_a, col_b, 'none', allow_tf32, dtype)
+#                           for shape_nw in [[128, 256, 32, 8],
+#                                            [128, 16, 32, 4],
+#                                            [32, 128, 64, 4],
+#                                            [128, 128, 64, 4],
+#                                            [64, 128, 128, 4],
+#                                            [32, 128, 64, 2],
+#                                            [128, 128, 64, 2],
+#                                            [64, 128, 128, 2]]
+#                           for allow_tf32 in [True]
+#                           for col_a in [True, False]
+#                           for col_b in [True, False]
+#                           for dtype in ['int8', 'float16', 'float32']])

+
+# MFMA Test Dot tests
@pytest.mark.parametrize("M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype",
-                         [(*shape, 4, False, False, epilogue, allow_tf32, dtype)
-                          for shape in [(64, 64, 64), (16, 16, 16)]
-                          for epilogue in ['none', 'trans', 'add-matrix', 'add-rows', 'add-cols', 'softmax', 'chain-dot']
+                         [(*shape, 2, False, False, epilogue, allow_tf32, dtype)
+                          for shape in [(64, 64, 64), (32, 32, 32)]
+                          for epilogue in ['none', 'trans', 'add-matrix']
                          for allow_tf32 in [True, False]
                          for dtype in ['float16', 'float32']
                          if not (allow_tf32 and (dtype in ['float16']))] +

                         [(*shape_nw, col_a, col_b, 'none', allow_tf32, dtype)
-                          for shape_nw in [[128, 256, 32, 8],
-                                           [128, 16, 32, 4],
-                                           [32, 128, 64, 4],
-                                           [128, 128, 64, 4],
-                                           [64, 128, 128, 4],
-                                           [32, 128, 64, 2],
+                          for shape_nw in [[128, 128, 32, 2],
                                           [128, 128, 64, 2],
-                                           [64, 128, 128, 2]]
-                          for allow_tf32 in [True]
-                          for col_a in [True, False]
-                          for col_b in [True, False]
+                                           [128, 32, 32, 2],
+                                           [128, 32, 64, 2],
+                                           [32, 32, 32, 4],
+                                           [32, 32, 64, 4],
+                                           [32, 32, 128, 4],
+                                           [32, 32, 256, 4],
+                                           [64, 64, 32, 4],
+                                           [64, 64, 64, 4],
+                                           [64, 64, 128, 4],
+                                           [32, 128, 64, 2],
+                                           [64, 128, 128, 2],
+                                           [32, 32, 64, 1],
+                                           [32, 128, 64, 2],
+                                           [64, 128, 128, 2],
+                                           [32, 256, 32, 8],
+                                           ]
+                          for allow_tf32 in [False, True]
+                          for col_a in [False]
+                          for col_b in [False]
                          for dtype in ['int8', 'float16', 'float32']])
 def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype, device='cuda'):
    capability = torch.cuda.get_device_capability()
+
+    if torch.version.hip is not None:
+        # set capability to large number to jump over check below
+        # check are not relevant to amd gpu, left them for smaller diff between test_core.py and test_core_amd.py tests
+        capability = (100, 100)
+
    if capability[0] < 7:
        pytest.skip("Only test tl.dot() on devices with sm >= 70")
    if capability[0] < 8:
@@ -1320,7 +1358,7 @@ def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype, devi
        z_ref = np.matmul(z_ref, w)
    # compare
    # print(z_ref[:,0], z_tri[:,0])
-    if dtype == 'float32':
+    if dtype == 'float32' or dtype == 'float16':
        # XXX: Somehow there's a larger difference when we use float32
        np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01, atol=1e-3)
    else: