support rendering assembly kernels on the NULL backend (#14283)

* support assembly custom kernels with DEV=NULL by reading the arch from the renderer

* update mmapeak

* llvm
This commit is contained in:
qazal
2026-01-22 01:49:07 -05:00
committed by GitHub
parent dfefeddeed
commit dff5f361b0
3 changed files with 14 additions and 17 deletions

View File

@@ -57,7 +57,7 @@ def custom_add_var(A:UOp, B:UOp, arch:str) -> UOp:
class TestCustomKernel(unittest.TestCase):
def test_simple(self):
a = Tensor.full((16, 16), 1.).contiguous().realize()
a = Tensor.custom_kernel(a, fxn=functools.partial(custom_add_one, arch=Device[Device.DEFAULT].arch))[0]
a = Tensor.custom_kernel(a, fxn=functools.partial(custom_add_one, arch=Device[Device.DEFAULT].renderer.arch))[0]
ei = a.schedule()[-1].lower()
self.assertEqual(ei.prg.estimates.ops, a.numel())
self.assertEqual(ei.prg.estimates.mem, a.nbytes()*2)
@@ -67,7 +67,7 @@ class TestCustomKernel(unittest.TestCase):
def test_variable(self):
b = Tensor.full((16, 16), 1, dtype=dtypes.uint32).contiguous().realize()
a = Tensor.zeros_like(b).contiguous().realize()
a = Tensor.custom_kernel(a, b, fxn=functools.partial(custom_add_var, arch=Device[Device.DEFAULT].arch))[0]
a = Tensor.custom_kernel(a, b, fxn=functools.partial(custom_add_var, arch=Device[Device.DEFAULT].renderer.arch))[0]
ei = a.schedule()[-1].lower()
for i in range(4):
ei.run({"var":i})

View File

@@ -5,7 +5,6 @@ os.environ["AMD_AQL"] = "1"
from tinygrad.device import Device
from tinygrad.runtime.support.compiler_amd import HIPCompiler
from tinygrad.runtime.ops_amd import AMDProgram
NUM_WORKGROUPS = 96
WAVE_SIZE = 32
@@ -34,29 +33,26 @@ def launchBenchmark(instruction, vgprIndices, dense=True, accum=False, extra="")
src = assemblyTemplate.replace("INTERNAL_LOOP", str(INTERNAL_LOOP)).replace("INSTRUCTION", instructions*INSTRUCTIONS_PER_LOOP)
src = src.replace("DIRECTIVE", DIRECTIVE)
lib = COMPILER.compile(src)
fxn = AMDProgram(DEV, "matmul", lib)
fxn = DEV.runtime("matmul", lib)
elapsed = min([fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True) for _ in range(2)])
FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
print(f"{instruction:<29} : {FLOPs/elapsed/10**12:.2f} T(FL)OPS")
if __name__=="__main__":
DEVICENUM = os.getenv("DEVICENUM", "0")
try:
DEV = Device['AMD:' + DEVICENUM]
except:
raise RuntimeError("Error while initiating AMD device")
DEV = Device[Device.DEFAULT]
arch = DEV.renderer.arch
COMPILER = HIPCompiler(DEV.arch)
if DEV.arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
if DEV.arch == 'gfx1103': NUM_WORKGROUPS = 8
if DEV.arch == 'gfx1151': NUM_WORKGROUPS = 32
COMPILER = HIPCompiler(arch)
if arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
if arch == 'gfx1103': NUM_WORKGROUPS = 8
if arch == 'gfx1151': NUM_WORKGROUPS = 32
launchBenchmark("v_wmma_bf16_16x16x16_bf16", (7,8,15))
launchBenchmark("v_wmma_f16_16x16x16_f16", (7,8,15))
launchBenchmark("v_wmma_f32_16x16x16_bf16", (7,8,15))
launchBenchmark("v_wmma_f32_16x16x16_f16", (7,8,15))
launchBenchmark("v_wmma_i32_16x16x16_iu4", (7,8,9))
launchBenchmark("v_wmma_i32_16x16x16_iu8", (7,8,11))
elif DEV.arch == 'gfx1201':
elif arch == 'gfx1201':
NUM_WORKGROUPS = 64
launchBenchmark("v_wmma_bf16_16x16x16_bf16", (3,4,7))
launchBenchmark("v_wmma_f16_16x16x16_f16", (3,4,7))
@@ -82,7 +78,7 @@ if __name__=="__main__":
launchBenchmark("v_swmmac_f32_16x16x32_bf8_bf8", (7,8,9,10,13,14), False)
FLOPS_PER_MATMUL = 16*16*64*2
launchBenchmark("v_swmmac_i32_16x16x64_iu4", (7,8,9,10,13,14), False)
elif DEV.arch == 'gfx950':
elif arch == 'gfx950':
DIRECTIVE = ".amdhsa_accum_offset 4"
NUM_WORKGROUPS = 256
WAVE_SIZE = 64
@@ -97,4 +93,4 @@ if __name__=="__main__":
launchBenchmark("v_mfma_f32_16x16x128_f8f6f4", (3,0,5), accum=True, extra=", cbsz:2 blgp:2") # fp6
launchBenchmark("v_mfma_f32_16x16x128_f8f6f4", (3,0,3), accum=True, extra=", cbsz:4 blgp:4") # fp4
else:
raise RuntimeError(f"arch {DEV.arch} not supported.")
raise RuntimeError(f"arch {arch} not supported.")

View File

@@ -2,10 +2,10 @@ import functools
from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet, CompilerPair
from tinygrad.engine.jit import MultiGraphRunner
from tinygrad.renderer.cstyle import Renderer, CStyleLanguage
from tinygrad.renderer.llvmir import AMDLLVMRenderer
from tinygrad.uop.ops import Ops
from tinygrad.helpers import cpu_profile, EMULATE, NULL_IR3, NULL_NAK
from tinygrad.renderer.nir import IR3Renderer, NAKRenderer
from tinygrad.renderer.llvmir import AMDLLVMRenderer
class NullRenderer(CStyleLanguage):
device = "NULL"
@@ -36,6 +36,7 @@ class NullDevice(Compiled):
match str(EMULATE.value):
case "AMD": renderer = functools.partial(AMDLLVMRenderer, "gfx1100")
case "AMD_RDNA4": renderer = functools.partial(AMDLLVMRenderer, "gfx1201")
case "AMD_CDNA4": renderer = functools.partial(AMDLLVMRenderer, "gfx950")
case "": renderer = NullRenderer
case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}")
compilers = CompilerSet([CompilerPair(renderer, Compiler), CompilerPair(functools.partial(IR3Renderer, 0x6030001), None, NULL_IR3), # adreno 630