support rendering assembly kernels on the NULL backend (#14283)
* assembly custom kernels in DEV=NULL, use renderer arch
* update mmapeak
* llvm
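For orientation, a minimal usage sketch of what this enables (illustrative, not part of the commit): with DEV=NULL and an EMULATE value from the ops_null table below, kernels render for the emulated arch with no GPU present. Env vars must be set before tinygrad is imported.

import os
os.environ["DEV"], os.environ["EMULATE"] = "NULL", "AMD"  # AMD -> AMDLLVMRenderer("gfx1100")
from tinygrad import Tensor, Device

print(Device[Device.DEFAULT].renderer.arch)  # arch now lives on the renderer: "gfx1100"
Tensor.ones(4, 4).contiguous().realize()     # renders a gfx1100 kernel; NULL executes nothing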
In the custom-kernel tests, the arch string now comes from the renderer rather than the device:

@@ -57,7 +57,7 @@ def custom_add_var(A:UOp, B:UOp, arch:str) -> UOp:
 class TestCustomKernel(unittest.TestCase):
   def test_simple(self):
     a = Tensor.full((16, 16), 1.).contiguous().realize()
-    a = Tensor.custom_kernel(a, fxn=functools.partial(custom_add_one, arch=Device[Device.DEFAULT].arch))[0]
+    a = Tensor.custom_kernel(a, fxn=functools.partial(custom_add_one, arch=Device[Device.DEFAULT].renderer.arch))[0]
     ei = a.schedule()[-1].lower()
     self.assertEqual(ei.prg.estimates.ops, a.numel())
     self.assertEqual(ei.prg.estimates.mem, a.nbytes()*2)
@@ -67,7 +67,7 @@ class TestCustomKernel(unittest.TestCase):
   def test_variable(self):
     b = Tensor.full((16, 16), 1, dtype=dtypes.uint32).contiguous().realize()
     a = Tensor.zeros_like(b).contiguous().realize()
-    a = Tensor.custom_kernel(a, b, fxn=functools.partial(custom_add_var, arch=Device[Device.DEFAULT].arch))[0]
+    a = Tensor.custom_kernel(a, b, fxn=functools.partial(custom_add_var, arch=Device[Device.DEFAULT].renderer.arch))[0]
     ei = a.schedule()[-1].lower()
     for i in range(4):
       ei.run({"var":i})
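The pattern these tests exercise, as a standalone sketch: functools.partial pins the renderer's arch onto the kernel builder before Tensor.custom_kernel sees it. The toy builder below is hypothetical; the real custom_add_one/custom_add_var build UOps in the test file.

import functools
from tinygrad import Device

def toy_builder(x, arch:str):
  # stand-in for custom_add_one/custom_add_var, which emit UOps targeting this arch
  print(f"would build an assembly kernel for {arch!r}")
  return x

# getattr hedges renderers that define no arch (e.g. the plain NullRenderer)
fxn = functools.partial(toy_builder, arch=getattr(Device[Device.DEFAULT].renderer, "arch", ""))
fxn(None)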
The mmapeak benchmark (the "update mmapeak" above) drops its direct AMDProgram import:

@@ -5,7 +5,6 @@ os.environ["AMD_AQL"] = "1"
 
 from tinygrad.device import Device
 from tinygrad.runtime.support.compiler_amd import HIPCompiler
-from tinygrad.runtime.ops_amd import AMDProgram
 
 NUM_WORKGROUPS = 96
 WAVE_SIZE = 32
@@ -34,29 +33,26 @@ def launchBenchmark(instruction, vgprIndices, dense=True, accum=False, extra="")
   src = assemblyTemplate.replace("INTERNAL_LOOP", str(INTERNAL_LOOP)).replace("INSTRUCTION", instructions*INSTRUCTIONS_PER_LOOP)
   src = src.replace("DIRECTIVE", DIRECTIVE)
   lib = COMPILER.compile(src)
-  fxn = AMDProgram(DEV, "matmul", lib)
+  fxn = DEV.runtime("matmul", lib)
   elapsed = min([fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True) for _ in range(2)])
   FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
   print(f"{instruction:<29} : {FLOPs/elapsed/10**12:.2f} T(FL)OPS")
 
 if __name__=="__main__":
-  DEVICENUM = os.getenv("DEVICENUM", "0")
-  try:
-    DEV = Device['AMD:' + DEVICENUM]
-  except:
-    raise RuntimeError("Error while initiating AMD device")
+  DEV = Device[Device.DEFAULT]
+  arch = DEV.renderer.arch
 
-  COMPILER = HIPCompiler(DEV.arch)
-  if DEV.arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
-    if DEV.arch == 'gfx1103': NUM_WORKGROUPS = 8
-    if DEV.arch == 'gfx1151': NUM_WORKGROUPS = 32
+  COMPILER = HIPCompiler(arch)
+  if arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
+    if arch == 'gfx1103': NUM_WORKGROUPS = 8
+    if arch == 'gfx1151': NUM_WORKGROUPS = 32
     launchBenchmark("v_wmma_bf16_16x16x16_bf16", (7,8,15))
     launchBenchmark("v_wmma_f16_16x16x16_f16", (7,8,15))
     launchBenchmark("v_wmma_f32_16x16x16_bf16", (7,8,15))
     launchBenchmark("v_wmma_f32_16x16x16_f16", (7,8,15))
     launchBenchmark("v_wmma_i32_16x16x16_iu4", (7,8,9))
     launchBenchmark("v_wmma_i32_16x16x16_iu8", (7,8,11))
-  elif DEV.arch == 'gfx1201':
+  elif arch == 'gfx1201':
     NUM_WORKGROUPS = 64
     launchBenchmark("v_wmma_bf16_16x16x16_bf16", (3,4,7))
     launchBenchmark("v_wmma_f16_16x16x16_f16", (3,4,7))
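On the fxn change above: a Compiled device already exposes its program constructor, bound to itself, as DEV.runtime, so the benchmark no longer needs the AMD-specific import. A sketch under that assumption:

from tinygrad import Device

DEV = Device[Device.DEFAULT]
# before: fxn = AMDProgram(DEV, "matmul", lib); after: the device supplies its own
# program class, so the same two-argument call works on any backend
print(DEV.runtime)  # e.g. AMDProgram partially applied to DEV on an AMD device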
@@ -82,7 +78,7 @@ if __name__=="__main__":
     launchBenchmark("v_swmmac_f32_16x16x32_bf8_bf8", (7,8,9,10,13,14), False)
     FLOPS_PER_MATMUL = 16*16*64*2
     launchBenchmark("v_swmmac_i32_16x16x64_iu4", (7,8,9,10,13,14), False)
-  elif DEV.arch == 'gfx950':
+  elif arch == 'gfx950':
     DIRECTIVE = ".amdhsa_accum_offset 4"
     NUM_WORKGROUPS = 256
     WAVE_SIZE = 64
@@ -97,4 +93,4 @@ if __name__=="__main__":
     launchBenchmark("v_mfma_f32_16x16x128_f8f6f4", (3,0,5), accum=True, extra=", cbsz:2 blgp:2") # fp6
     launchBenchmark("v_mfma_f32_16x16x128_f8f6f4", (3,0,3), accum=True, extra=", cbsz:4 blgp:4") # fp4
   else:
-    raise RuntimeError(f"arch {DEV.arch} not supported.")
+    raise RuntimeError(f"arch {arch} not supported.")
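For reference, the throughput arithmetic behind the script's T(FL)OPS print, worked as a sketch. NUM_WAVES, INTERNAL_LOOP and INSTRUCTIONS_PER_LOOP are defined elsewhere in the script, so the values below are assumed:

FLOPS_PER_MATMUL = 16*16*16*2  # a 16x16x16 matmul: 16^3 multiply-adds, 2 FLOPs each
NUM_WORKGROUPS, NUM_WAVES = 96, 16               # NUM_WAVES assumed
INTERNAL_LOOP, INSTRUCTIONS_PER_LOOP = 1024, 32  # assumed loop shape
elapsed = 1e-3  # stand-in for the measured best-of-two kernel time, in seconds
FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
print(f"{FLOPs/elapsed/10**12:.2f} T(FL)OPS")  # the same formula launchBenchmark prints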
And in the NULL backend itself (class NullDevice), the EMULATE table gains a CDNA4 entry:

@@ -2,10 +2,10 @@ import functools
 from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet, CompilerPair
 from tinygrad.engine.jit import MultiGraphRunner
 from tinygrad.renderer.cstyle import Renderer, CStyleLanguage
-from tinygrad.renderer.llvmir import AMDLLVMRenderer
 from tinygrad.uop.ops import Ops
 from tinygrad.helpers import cpu_profile, EMULATE, NULL_IR3, NULL_NAK
 from tinygrad.renderer.nir import IR3Renderer, NAKRenderer
+from tinygrad.renderer.llvmir import AMDLLVMRenderer
 
 class NullRenderer(CStyleLanguage):
   device = "NULL"
@@ -36,6 +36,7 @@ class NullDevice(Compiled):
     match str(EMULATE.value):
       case "AMD": renderer = functools.partial(AMDLLVMRenderer, "gfx1100")
       case "AMD_RDNA4": renderer = functools.partial(AMDLLVMRenderer, "gfx1201")
+      case "AMD_CDNA4": renderer = functools.partial(AMDLLVMRenderer, "gfx950")
       case "": renderer = NullRenderer
       case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}")
     compilers = CompilerSet([CompilerPair(renderer, Compiler), CompilerPair(functools.partial(IR3Renderer, 0x6030001), None, NULL_IR3), # adreno 630
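The AMD_CDNA4 entry pairs with mmapeak's gfx950 branch: a short sketch of exercising it, assuming env vars are read at import time as above:

import os
os.environ["DEV"], os.environ["EMULATE"] = "NULL", "AMD_CDNA4"
from tinygrad import Device

# the NULL device now renders CDNA4 (gfx950) assembly with no AMD hardware attached
print(Device[Device.DEFAULT].renderer.arch)  # expected: "gfx950"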