support rendering assembly kernels on the NULL backend (#14283)
* assembly custom kernels in DEV=NULL, use renderer arch
* update mmapeak
* llvm
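For orientation, a minimal usage sketch of what this enables (illustrative, not part of the commit): with DEV=NULL and an EMULATE value from the ops_null table below, kernels render for the emulated arch with no GPU present. Env vars must be set before tinygrad is imported.

import os
os.environ["DEV"], os.environ["EMULATE"] = "NULL", "AMD"  # AMD -> AMDLLVMRenderer("gfx1100")
from tinygrad import Tensor, Device

print(Device[Device.DEFAULT].renderer.arch)  # arch now lives on the renderer: "gfx1100"
Tensor.ones(4, 4).contiguous().realize()     # renders a gfx1100 kernel; NULL executes nothing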
In the custom-kernel tests, the arch string now comes from the renderer rather than the device:

@@ -57,7 +57,7 @@ def custom_add_var(A:UOp, B:UOp, arch:str) -> UOp:
 class TestCustomKernel(unittest.TestCase):
   def test_simple(self):
     a = Tensor.full((16, 16), 1.).contiguous().realize()
-    a = Tensor.custom_kernel(a, fxn=functools.partial(custom_add_one, arch=Device[Device.DEFAULT].arch))[0]
+    a = Tensor.custom_kernel(a, fxn=functools.partial(custom_add_one, arch=Device[Device.DEFAULT].renderer.arch))[0]
     ei = a.schedule()[-1].lower()
     self.assertEqual(ei.prg.estimates.ops, a.numel())
     self.assertEqual(ei.prg.estimates.mem, a.nbytes()*2)
@@ -67,7 +67,7 @@ class TestCustomKernel(unittest.TestCase):
   def test_variable(self):
     b = Tensor.full((16, 16), 1, dtype=dtypes.uint32).contiguous().realize()
     a = Tensor.zeros_like(b).contiguous().realize()
-    a = Tensor.custom_kernel(a, b, fxn=functools.partial(custom_add_var, arch=Device[Device.DEFAULT].arch))[0]
+    a = Tensor.custom_kernel(a, b, fxn=functools.partial(custom_add_var, arch=Device[Device.DEFAULT].renderer.arch))[0]
     ei = a.schedule()[-1].lower()
     for i in range(4):
       ei.run({"var":i})
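The pattern these tests exercise, as a standalone sketch: functools.partial pins the renderer's arch onto the kernel builder before Tensor.custom_kernel sees it. The toy builder below is hypothetical; the real custom_add_one/custom_add_var build UOps in the test file.

import functools
from tinygrad import Device

def toy_builder(x, arch:str):
  # stand-in for custom_add_one/custom_add_var, which emit UOps targeting this arch
  print(f"would build an assembly kernel for {arch!r}")
  return x

# getattr hedges renderers that define no arch (e.g. the plain NullRenderer)
fxn = functools.partial(toy_builder, arch=getattr(Device[Device.DEFAULT].renderer, "arch", ""))
fxn(None)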
The mmapeak benchmark (the "update mmapeak" above) drops its direct AMDProgram import:

@@ -5,7 +5,6 @@ os.environ["AMD_AQL"] = "1"
 
 from tinygrad.device import Device
 from tinygrad.runtime.support.compiler_amd import HIPCompiler
-from tinygrad.runtime.ops_amd import AMDProgram
 
 NUM_WORKGROUPS = 96
 WAVE_SIZE = 32
@@ -34,29 +33,26 @@ def launchBenchmark(instruction, vgprIndices, dense=True, accum=False, extra="")
   src = assemblyTemplate.replace("INTERNAL_LOOP", str(INTERNAL_LOOP)).replace("INSTRUCTION", instructions*INSTRUCTIONS_PER_LOOP)
   src = src.replace("DIRECTIVE", DIRECTIVE)
   lib = COMPILER.compile(src)
-  fxn = AMDProgram(DEV, "matmul", lib)
+  fxn = DEV.runtime("matmul", lib)
   elapsed = min([fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True) for _ in range(2)])
   FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
   print(f"{instruction:<29} : {FLOPs/elapsed/10**12:.2f} T(FL)OPS")
 
 if __name__=="__main__":
-  DEVICENUM = os.getenv("DEVICENUM", "0")
-  try:
-    DEV = Device['AMD:' + DEVICENUM]
-  except:
-    raise RuntimeError("Error while initiating AMD device")
+  DEV = Device[Device.DEFAULT]
+  arch = DEV.renderer.arch
 
-  COMPILER = HIPCompiler(DEV.arch)
-  if DEV.arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
-    if DEV.arch == 'gfx1103': NUM_WORKGROUPS = 8
-    if DEV.arch == 'gfx1151': NUM_WORKGROUPS = 32
+  COMPILER = HIPCompiler(arch)
+  if arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
+    if arch == 'gfx1103': NUM_WORKGROUPS = 8
+    if arch == 'gfx1151': NUM_WORKGROUPS = 32
     launchBenchmark("v_wmma_bf16_16x16x16_bf16", (7,8,15))
     launchBenchmark("v_wmma_f16_16x16x16_f16", (7,8,15))
     launchBenchmark("v_wmma_f32_16x16x16_bf16", (7,8,15))
     launchBenchmark("v_wmma_f32_16x16x16_f16", (7,8,15))
     launchBenchmark("v_wmma_i32_16x16x16_iu4", (7,8,9))
     launchBenchmark("v_wmma_i32_16x16x16_iu8", (7,8,11))
-  elif DEV.arch == 'gfx1201':
+  elif arch == 'gfx1201':
     NUM_WORKGROUPS = 64
     launchBenchmark("v_wmma_bf16_16x16x16_bf16", (3,4,7))
     launchBenchmark("v_wmma_f16_16x16x16_f16", (3,4,7))
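On the fxn change above: a Compiled device already exposes its program constructor, bound to itself, as DEV.runtime, so the benchmark no longer needs the AMD-specific import. A sketch under that assumption:

from tinygrad import Device

DEV = Device[Device.DEFAULT]
# before: fxn = AMDProgram(DEV, "matmul", lib); after: the device supplies its own
# program class, so the same two-argument call works on any backend
print(DEV.runtime)  # e.g. AMDProgram partially applied to DEV on an AMD device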
@@ -82,7 +78,7 @@ if __name__=="__main__":
     launchBenchmark("v_swmmac_f32_16x16x32_bf8_bf8", (7,8,9,10,13,14), False)
     FLOPS_PER_MATMUL = 16*16*64*2
     launchBenchmark("v_swmmac_i32_16x16x64_iu4", (7,8,9,10,13,14), False)
-  elif DEV.arch == 'gfx950':
+  elif arch == 'gfx950':
     DIRECTIVE = ".amdhsa_accum_offset 4"
     NUM_WORKGROUPS = 256
     WAVE_SIZE = 64
@@ -97,4 +93,4 @@ if __name__=="__main__":
     launchBenchmark("v_mfma_f32_16x16x128_f8f6f4", (3,0,5), accum=True, extra=", cbsz:2 blgp:2") # fp6
     launchBenchmark("v_mfma_f32_16x16x128_f8f6f4", (3,0,3), accum=True, extra=", cbsz:4 blgp:4") # fp4
   else:
-    raise RuntimeError(f"arch {DEV.arch} not supported.")
+    raise RuntimeError(f"arch {arch} not supported.")
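For reference, the throughput arithmetic behind the script's T(FL)OPS print, worked as a sketch. NUM_WAVES, INTERNAL_LOOP and INSTRUCTIONS_PER_LOOP are defined elsewhere in the script, so the values below are assumed:

FLOPS_PER_MATMUL = 16*16*16*2  # a 16x16x16 matmul: 16^3 multiply-adds, 2 FLOPs each
NUM_WORKGROUPS, NUM_WAVES = 96, 16               # NUM_WAVES assumed
INTERNAL_LOOP, INSTRUCTIONS_PER_LOOP = 1024, 32  # assumed loop shape
elapsed = 1e-3  # stand-in for the measured best-of-two kernel time, in seconds
FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
print(f"{FLOPs/elapsed/10**12:.2f} T(FL)OPS")  # the same formula launchBenchmark prints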
And in the NULL backend itself (class NullDevice), the EMULATE table gains a CDNA4 entry:

@@ -2,10 +2,10 @@ import functools
 from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet, CompilerPair
 from tinygrad.engine.jit import MultiGraphRunner
 from tinygrad.renderer.cstyle import Renderer, CStyleLanguage
-from tinygrad.renderer.llvmir import AMDLLVMRenderer
 from tinygrad.uop.ops import Ops
 from tinygrad.helpers import cpu_profile, EMULATE, NULL_IR3, NULL_NAK
 from tinygrad.renderer.nir import IR3Renderer, NAKRenderer
+from tinygrad.renderer.llvmir import AMDLLVMRenderer
 
 class NullRenderer(CStyleLanguage):
   device = "NULL"
@@ -36,6 +36,7 @@ class NullDevice(Compiled):
     match str(EMULATE.value):
       case "AMD": renderer = functools.partial(AMDLLVMRenderer, "gfx1100")
       case "AMD_RDNA4": renderer = functools.partial(AMDLLVMRenderer, "gfx1201")
+      case "AMD_CDNA4": renderer = functools.partial(AMDLLVMRenderer, "gfx950")
       case "": renderer = NullRenderer
       case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}")
     compilers = CompilerSet([CompilerPair(renderer, Compiler), CompilerPair(functools.partial(IR3Renderer, 0x6030001), None, NULL_IR3), # adreno 630
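The AMD_CDNA4 entry pairs with mmapeak's gfx950 branch: a short sketch of exercising it, assuming env vars are read at import time as above:

import os
os.environ["DEV"], os.environ["EMULATE"] = "NULL", "AMD_CDNA4"
from tinygrad import Device

# the NULL device now renders CDNA4 (gfx950) assembly with no AMD hardware attached
print(Device[Device.DEFAULT].renderer.arch)  # expected: "gfx950"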