diff --git a/extra/assembly/amd/test/test_custom_kernel.py b/extra/assembly/amd/test/test_custom_kernel.py
index 6102b217c7..2884246f2c 100644
--- a/extra/assembly/amd/test/test_custom_kernel.py
+++ b/extra/assembly/amd/test/test_custom_kernel.py
@@ -57,7 +57,7 @@ def custom_add_var(A:UOp, B:UOp, arch:str) -> UOp:
 class TestCustomKernel(unittest.TestCase):
   def test_simple(self):
     a = Tensor.full((16, 16), 1.).contiguous().realize()
-    a = Tensor.custom_kernel(a, fxn=functools.partial(custom_add_one, arch=Device[Device.DEFAULT].arch))[0]
+    a = Tensor.custom_kernel(a, fxn=functools.partial(custom_add_one, arch=Device[Device.DEFAULT].renderer.arch))[0]
     ei = a.schedule()[-1].lower()
     self.assertEqual(ei.prg.estimates.ops, a.numel())
     self.assertEqual(ei.prg.estimates.mem, a.nbytes()*2)
@@ -67,7 +67,7 @@ class TestCustomKernel(unittest.TestCase):
   def test_variable(self):
     b = Tensor.full((16, 16), 1, dtype=dtypes.uint32).contiguous().realize()
     a = Tensor.zeros_like(b).contiguous().realize()
-    a = Tensor.custom_kernel(a, b, fxn=functools.partial(custom_add_var, arch=Device[Device.DEFAULT].arch))[0]
+    a = Tensor.custom_kernel(a, b, fxn=functools.partial(custom_add_var, arch=Device[Device.DEFAULT].renderer.arch))[0]
     ei = a.schedule()[-1].lower()
     for i in range(4):
       ei.run({"var":i})
diff --git a/extra/mmapeak/mmapeak.py b/extra/mmapeak/mmapeak.py
index 286d4e0f2e..7046d58fe3 100644
--- a/extra/mmapeak/mmapeak.py
+++ b/extra/mmapeak/mmapeak.py
@@ -5,7 +5,6 @@ os.environ["AMD_AQL"] = "1"
 
 from tinygrad.device import Device
 from tinygrad.runtime.support.compiler_amd import HIPCompiler
-from tinygrad.runtime.ops_amd import AMDProgram
 
 NUM_WORKGROUPS = 96
 WAVE_SIZE = 32
@@ -34,29 +33,26 @@ def launchBenchmark(instruction, vgprIndices, dense=True, accum=False, extra="")
   src = assemblyTemplate.replace("INTERNAL_LOOP", str(INTERNAL_LOOP)).replace("INSTRUCTION", instructions*INSTRUCTIONS_PER_LOOP)
   src = src.replace("DIRECTIVE", DIRECTIVE)
   lib = COMPILER.compile(src)
-  fxn = AMDProgram(DEV, "matmul", lib)
+  fxn = DEV.runtime("matmul", lib)
   elapsed = min([fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True) for _ in range(2)])
   FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
   print(f"{instruction:<29} : {FLOPs/elapsed/10**12:.2f} T(FL)OPS")
 
 if __name__=="__main__":
-  DEVICENUM = os.getenv("DEVICENUM", "0")
-  try:
-    DEV = Device['AMD:' + DEVICENUM]
-  except:
-    raise RuntimeError("Error while initiating AMD device")
+  DEV = Device[Device.DEFAULT]
+  arch = DEV.renderer.arch
 
-  COMPILER = HIPCompiler(DEV.arch)
-  if DEV.arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
-    if DEV.arch == 'gfx1103': NUM_WORKGROUPS = 8
-    if DEV.arch == 'gfx1151': NUM_WORKGROUPS = 32
+  COMPILER = HIPCompiler(arch)
+  if arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
+    if arch == 'gfx1103': NUM_WORKGROUPS = 8
+    if arch == 'gfx1151': NUM_WORKGROUPS = 32
     launchBenchmark("v_wmma_bf16_16x16x16_bf16", (7,8,15))
     launchBenchmark("v_wmma_f16_16x16x16_f16", (7,8,15))
     launchBenchmark("v_wmma_f32_16x16x16_bf16", (7,8,15))
     launchBenchmark("v_wmma_f32_16x16x16_f16", (7,8,15))
     launchBenchmark("v_wmma_i32_16x16x16_iu4", (7,8,9))
     launchBenchmark("v_wmma_i32_16x16x16_iu8", (7,8,11))
-  elif DEV.arch == 'gfx1201':
+  elif arch == 'gfx1201':
     NUM_WORKGROUPS = 64
     launchBenchmark("v_wmma_bf16_16x16x16_bf16", (3,4,7))
     launchBenchmark("v_wmma_f16_16x16x16_f16", (3,4,7))
@@ -82,7 +78,7 @@ if __name__=="__main__":
     launchBenchmark("v_swmmac_f32_16x16x32_bf8_bf8", (7,8,9,10,13,14), False)
     FLOPS_PER_MATMUL = 16*16*64*2
     launchBenchmark("v_swmmac_i32_16x16x64_iu4", (7,8,9,10,13,14), False)
-  elif DEV.arch == 'gfx950':
+  elif arch == 'gfx950':
     DIRECTIVE = ".amdhsa_accum_offset 4"
     NUM_WORKGROUPS = 256
     WAVE_SIZE = 64
@@ -97,4 +93,4 @@ if __name__=="__main__":
     launchBenchmark("v_mfma_f32_16x16x128_f8f6f4", (3,0,5), accum=True, extra=", cbsz:2 blgp:2") # fp6
     launchBenchmark("v_mfma_f32_16x16x128_f8f6f4", (3,0,3), accum=True, extra=", cbsz:4 blgp:4") # fp4
   else:
-    raise RuntimeError(f"arch {DEV.arch} not supported.")
+    raise RuntimeError(f"arch {arch} not supported.")
diff --git a/tinygrad/runtime/ops_null.py b/tinygrad/runtime/ops_null.py
index 0fb11ad572..64c3b41407 100644
--- a/tinygrad/runtime/ops_null.py
+++ b/tinygrad/runtime/ops_null.py
@@ -2,10 +2,10 @@ import functools
 from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet, CompilerPair
 from tinygrad.engine.jit import MultiGraphRunner
 from tinygrad.renderer.cstyle import Renderer, CStyleLanguage
-from tinygrad.renderer.llvmir import AMDLLVMRenderer
 from tinygrad.uop.ops import Ops
 from tinygrad.helpers import cpu_profile, EMULATE, NULL_IR3, NULL_NAK
 from tinygrad.renderer.nir import IR3Renderer, NAKRenderer
+from tinygrad.renderer.llvmir import AMDLLVMRenderer
 
 class NullRenderer(CStyleLanguage):
   device = "NULL"
@@ -36,6 +36,7 @@ class NullDevice(Compiled):
     match str(EMULATE.value):
       case "AMD": renderer = functools.partial(AMDLLVMRenderer, "gfx1100")
      case "AMD_RDNA4": renderer = functools.partial(AMDLLVMRenderer, "gfx1201")
+      case "AMD_CDNA4": renderer = functools.partial(AMDLLVMRenderer, "gfx950")
       case "": renderer = NullRenderer
       case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}")
     compilers = CompilerSet([CompilerPair(renderer, Compiler), CompilerPair(functools.partial(IR3Renderer, 0x6030001), None, NULL_IR3), # adreno 630