From 5c56cac0a027fbc3e1c0260572d7a2989282a1b9 Mon Sep 17 00:00:00 2001 From: Ignacio Sica Date: Tue, 18 Mar 2025 14:33:30 -0300 Subject: [PATCH] MI300 mfma support (#9417) * add f16/f32 mfma support for MI300 - add 16x16 mfma shape support for f16 with f32 acc - add ops_python mfma emulation - add arch to AMDRenderer * minor cleanup * minor cleanup * add mfma emulation task to ci * add back todo * hotfix: comment * add tc=3 job to ci --- .github/workflows/test.yml | 5 +++++ tinygrad/renderer/cstyle.py | 16 +++++++++++++--- tinygrad/runtime/ops_python.py | 6 ++++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a6cb1ebb34..4873e153a9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -234,6 +234,10 @@ jobs: PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=1 python3 ./extra/gemm/simple_matmul.py PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=1 python3 ./extra/gemm/simple_matmul.py PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded + - name: Test emulated AMD MFMA tensor cores + run: | + PYTHONPATH=. DEBUG=2 EMULATE_AMD_MFMA=1 FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py + PYTHONPATH=. DEBUG=2 EMULATE_AMD_MFMA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded - name: Test emulated CUDA tensor cores run: | DEBUG=2 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16 @@ -253,6 +257,7 @@ jobs: run: | TC=3 DEBUG=3 EMULATE_METAL=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm TC=3 PYTHONPATH=. DEBUG=3 EMULATE_AMD=1 PYTHON=1 N=16 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py + TC=3 PYTHONPATH=. DEBUG=3 EMULATE_AMD_MFMA=1 PYTHON=1 N=16 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py TC=3 DEBUG=3 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16 TC=3 PYTHONPATH=. DEBUG=3 EMULATE_INTEL=1 PYTHON=1 N=16 HALF=1 python3 ./extra/gemm/simple_matmul.py TC=3 PYTHONPATH=. DEBUG=3 AMX=1 EMULATE_AMX=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index 01b8a85d56..fde65b53cb 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -403,10 +403,19 @@ class AMDRenderer(CStyleLanguage): tensor_cores = [TensorCore(dims=(16,16,16), threads=32, elements_per_thread=(16,16,8), dtype_in=di, dtype_out=do, opts=("l0","l0","l0","l0","l1","u1","u1","u1"), swizzle=(((4,9,10,11,0),(1,2,3,5,6,7,8)), ((0,1,2,3,4),(9,10,11,5,6,7,8)))) for di,do in [(dtypes.half,dtypes.float),(dtypes.half,dtypes.half)]] + # https://gpuopen.com/learn/amd-lab-notes/amd-lab-notes-matrix-cores-readme + tensor_cores_mfma = [TensorCore(dims=(16,16,16), threads=64, elements_per_thread=(4,4,4), dtype_in=di, dtype_out=do, + opts=("l0","l0","l0","l0","u1","u1","l1","l1"), swizzle=(((10,11,4,5,8,9),(0,1,2,3,6,7)),((0,1,2,3,8,9),(4,5,10,11,6,7)))) + for di,do in [(dtypes.half,dtypes.float)]] - def __init__(self, arch:str): # gfx942 => MI300, gfx1100 => RX 7900 + def __init__(self, arch:str): # gfx942 => MI300, gfx1100 => RX 7900, gfx1201 => RX 9700 + self.arch = arch # TODO: fix tensor cores for gfx1201 - self.tensor_cores, self.arch = AMDRenderer.tensor_cores if arch != "gfx1201" else [], arch + self.tensor_cores = \ + AMDRenderer.tensor_cores_mfma if arch.split(":")[0] == "gfx942" else AMDRenderer.tensor_cores if arch.split(":")[0] != "gfx1201" else [] + if self.arch.split(":")[0] == "gfx942": + self.string_rewrite = PatternMatcher([ + (UPat(Ops.WMMA, name="x"), lambda ctx,x: f"__{x.arg[0]}({ctx[x.src[0]]}, {ctx[x.src[1]]}, {ctx[x.src[2]]}, 0, 0, 0)")]) + base_rewrite def __reduce__(self): return self.__class__, (self.arch,) # language options @@ -458,7 +467,8 @@ class AMDRenderer(CStyleLanguage): prefix += [self.render_vector_prefix(dt) for dt in used_dtypes if dt.count > 1] for arg in dedup([uop.arg for uop in uops if uop.op is Ops.WMMA]): # TODO: handle TCs f32_bf16 and bf16_bf16 w/ wrapper - if arg[3] == dtypes.float: prefix.append(f"#define __{arg[0]} __builtin_amdgcn_wmma_f32_16x16x16_f16_w32") + if self.arch.split(":")[0] == "gfx942": prefix.append(f"#define __{arg[0]} __builtin_amdgcn_mfma_f32_16x16x16f16") + elif arg[3] == dtypes.float: prefix.append(f"#define __{arg[0]} __builtin_amdgcn_wmma_f32_16x16x16_f16_w32") else: prefix.append(f"static inline __attribute__((device)) half8 __{arg[0]}"+"""(half16 a, half16 b, half8 c) { half16 c_frag = {}; half8 d; for (int n = 0; n < 8; n++) { c_frag[n*2] = c[n]; } c_frag = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a, b, c_frag, false); diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py index 427340cb61..32d9742837 100644 --- a/tinygrad/runtime/ops_python.py +++ b/tinygrad/runtime/ops_python.py @@ -136,6 +136,11 @@ class PythonProgram: # (i, j), C, D (2 elements on 32 threads): row major same as A/B def c_map(lane, elem): return (elem + ((lane%2)*2) + ((lane//8)%2)*4, ((lane//2)%4) + (lane//16)*4) ul[i] = wmma_helper(32, 8, 2, 2, 2, a_b_elem, a_b_elem, c_map) + elif arg[4] == "AMD" and arg[5] == 64: + def a_elem(x, k, row, goff): return x[k%4][goff + (k//4)*16 + row] + def b_elem(x, col, k, goff): return a_elem(x, k, col, goff) # pylint: disable=arguments-out-of-order + def c_map(lane, elem): return (lane%16, (lane//16)*4 + elem) + ul[i] = wmma_helper(64, 16, 4, 4, 4, a_elem, b_elem, c_map) elif arg[4] == "AMD": # A (16 elements on 32 threads): col major, lane 16-32 == lane 0-15 def a_elem(x, k, row, goff): @@ -191,6 +196,7 @@ class PythonRenderer(Renderer): def __init__(self): if getenv("EMULATE_METAL"): self.device, self.tensor_cores = "METAL", MetalRenderer.tensor_cores if getenv("EMULATE_AMD"): self.device, self.tensor_cores = "AMD", AMDRenderer.tensor_cores + if getenv("EMULATE_AMD_MFMA"): self.device, self.tensor_cores = "AMD", AMDRenderer.tensor_cores_mfma if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm80 if getenv("EMULATE_CUDA_SM75"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm75 if getenv("EMULATE_INTEL"): self.device, self.suffix, self.tensor_cores = "INTEL", "INTEL", IntelRenderer.tensor_cores