diff --git a/extra/assembly/amd/test/test_custom_kernel.py b/extra/assembly/amd/test/test_custom_kernel.py
index 6102b217c7..2884246f2c 100644
--- a/extra/assembly/amd/test/test_custom_kernel.py
+++ b/extra/assembly/amd/test/test_custom_kernel.py
@@ -57,7 +57,7 @@ def custom_add_var(A:UOp, B:UOp, arch:str) -> UOp:
 class TestCustomKernel(unittest.TestCase):
   def test_simple(self):
     a = Tensor.full((16, 16), 1.).contiguous().realize()
-    a = Tensor.custom_kernel(a, fxn=functools.partial(custom_add_one, arch=Device[Device.DEFAULT].arch))[0]
+    a = Tensor.custom_kernel(a, fxn=functools.partial(custom_add_one, arch=Device[Device.DEFAULT].renderer.arch))[0]
     ei = a.schedule()[-1].lower()
     self.assertEqual(ei.prg.estimates.ops, a.numel())
     self.assertEqual(ei.prg.estimates.mem, a.nbytes()*2)
@@ -67,7 +67,7 @@ class TestCustomKernel(unittest.TestCase):
   def test_variable(self):
     b = Tensor.full((16, 16), 1, dtype=dtypes.uint32).contiguous().realize()
     a = Tensor.zeros_like(b).contiguous().realize()
-    a = Tensor.custom_kernel(a, b, fxn=functools.partial(custom_add_var, arch=Device[Device.DEFAULT].arch))[0]
+    a = Tensor.custom_kernel(a, b, fxn=functools.partial(custom_add_var, arch=Device[Device.DEFAULT].renderer.arch))[0]
     ei = a.schedule()[-1].lower()
     for i in range(4):
       ei.run({"var":i})
diff --git a/extra/mmapeak/mmapeak.py b/extra/mmapeak/mmapeak.py
index 286d4e0f2e..7046d58fe3 100644
--- a/extra/mmapeak/mmapeak.py
+++ b/extra/mmapeak/mmapeak.py
@@ -5,7 +5,6 @@ os.environ["AMD_AQL"] = "1"
 
 from tinygrad.device import Device
 from tinygrad.runtime.support.compiler_amd import HIPCompiler
-from tinygrad.runtime.ops_amd import AMDProgram
 
 NUM_WORKGROUPS = 96
 WAVE_SIZE = 32
@@ -34,29 +33,26 @@ def launchBenchmark(instruction, vgprIndices, dense=True, accum=False, extra="")
   src = assemblyTemplate.replace("INTERNAL_LOOP", str(INTERNAL_LOOP)).replace("INSTRUCTION", instructions*INSTRUCTIONS_PER_LOOP)
   src = src.replace("DIRECTIVE", DIRECTIVE)
   lib = COMPILER.compile(src)
-  fxn = AMDProgram(DEV, "matmul", lib)
+  fxn = DEV.runtime("matmul", lib)
   elapsed = min([fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True) for _ in range(2)])
   FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
   print(f"{instruction:<29} : {FLOPs/elapsed/10**12:.2f} T(FL)OPS")
 
 if __name__=="__main__":
-  DEVICENUM = os.getenv("DEVICENUM", "0")
-  try:
-    DEV = Device['AMD:' + DEVICENUM]
-  except:
-    raise RuntimeError("Error while initiating AMD device")
+  DEV = Device[Device.DEFAULT]
+  arch = DEV.renderer.arch
 
-  COMPILER = HIPCompiler(DEV.arch)
-  if DEV.arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
-    if DEV.arch == 'gfx1103': NUM_WORKGROUPS = 8
-    if DEV.arch == 'gfx1151': NUM_WORKGROUPS = 32
+  COMPILER = HIPCompiler(arch)
+  if arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
+    if arch == 'gfx1103': NUM_WORKGROUPS = 8
+    if arch == 'gfx1151': NUM_WORKGROUPS = 32
     launchBenchmark("v_wmma_bf16_16x16x16_bf16", (7,8,15))
     launchBenchmark("v_wmma_f16_16x16x16_f16", (7,8,15))
     launchBenchmark("v_wmma_f32_16x16x16_bf16", (7,8,15))
     launchBenchmark("v_wmma_f32_16x16x16_f16", (7,8,15))
     launchBenchmark("v_wmma_i32_16x16x16_iu4", (7,8,9))
     launchBenchmark("v_wmma_i32_16x16x16_iu8", (7,8,11))
-  elif DEV.arch == 'gfx1201':
+  elif arch == 'gfx1201':
     NUM_WORKGROUPS = 64
     launchBenchmark("v_wmma_bf16_16x16x16_bf16", (3,4,7))
     launchBenchmark("v_wmma_f16_16x16x16_f16", (3,4,7))
@@ -82,7 +78,7 @@ if __name__=="__main__":
     launchBenchmark("v_swmmac_f32_16x16x32_bf8_bf8", (7,8,9,10,13,14), False)
     FLOPS_PER_MATMUL = 16*16*64*2
     launchBenchmark("v_swmmac_i32_16x16x64_iu4", (7,8,9,10,13,14), False)
-  elif DEV.arch == 'gfx950':
+  elif arch == 'gfx950':
     DIRECTIVE = ".amdhsa_accum_offset 4"
     NUM_WORKGROUPS = 256
     WAVE_SIZE = 64
@@ -97,4 +93,4 @@ if __name__=="__main__":
     launchBenchmark("v_mfma_f32_16x16x128_f8f6f4", (3,0,5), accum=True, extra=", cbsz:2 blgp:2") # fp6
     launchBenchmark("v_mfma_f32_16x16x128_f8f6f4", (3,0,3), accum=True, extra=", cbsz:4 blgp:4") # fp4
   else:
-    raise RuntimeError(f"arch {DEV.arch} not supported.")
+    raise RuntimeError(f"arch {arch} not supported.")
diff --git a/tinygrad/runtime/ops_null.py b/tinygrad/runtime/ops_null.py
index 0fb11ad572..64c3b41407 100644
--- a/tinygrad/runtime/ops_null.py
+++ b/tinygrad/runtime/ops_null.py
@@ -2,10 +2,10 @@ import functools
 from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet, CompilerPair
 from tinygrad.engine.jit import MultiGraphRunner
 from tinygrad.renderer.cstyle import Renderer, CStyleLanguage
-from tinygrad.renderer.llvmir import AMDLLVMRenderer
 from tinygrad.uop.ops import Ops
 from tinygrad.helpers import cpu_profile, EMULATE, NULL_IR3, NULL_NAK
 from tinygrad.renderer.nir import IR3Renderer, NAKRenderer
+from tinygrad.renderer.llvmir import AMDLLVMRenderer
 
 class NullRenderer(CStyleLanguage):
   device = "NULL"
@@ -36,6 +36,7 @@ class NullDevice(Compiled):
     match str(EMULATE.value):
       case "AMD": renderer = functools.partial(AMDLLVMRenderer, "gfx1100")
      case "AMD_RDNA4": renderer = functools.partial(AMDLLVMRenderer, "gfx1201")
+      case "AMD_CDNA4": renderer = functools.partial(AMDLLVMRenderer, "gfx950")
       case "": renderer = NullRenderer
       case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}")
     compilers = CompilerSet([CompilerPair(renderer, Compiler), CompilerPair(functools.partial(IR3Renderer, 0x6030001), None, NULL_IR3), # adreno 630