# ruff: noqa: F405, F403 # allow define from star imports
import numpy as np
import unittest
import subprocess, struct, math, textwrap
from tinygrad import Tensor, dtypes, Device, UOp
from tinygrad.uop.ops import Ops
from tinygrad.helpers import getenv
from tinygrad.runtime.support.compiler_amd import amdgpu_disassemble
from tinygrad.renderer import ProgramSpec
from tinygrad.engine.realize import CompiledRunner
from extra.assembly.rdna3.autogen import *
from extra.assembly.rdna3.asm import waitcnt
from test.testextra.test_cfg_viz import template

def get_output(asm:list, n_threads:int=1, vdst:VGPR=v[1]):
  """Run `asm` wrapped in a fixed prologue/epilogue and return the output buffer as a list.

  The caller's instructions are placed after an `s_load_b64` into s[0:1] and before a
  shift of v[0], a wait on lgkmcnt=0, a `global_store_b32` of `vdst`, and `s_endpgm`.
  The disassembled text is substituted into `template` and launched through
  `CompiledRunner` with a single workgroup of `n_threads` threads.

  Args:
    asm: instruction objects (each must provide `.disasm()`) to splice into the kernel.
    n_threads: local size on the first axis, and the number of elements in the output tensor.
    vdst: register passed as `data` to the final `global_store_b32`.

  Returns:
    `tolist()` of the zero-initialised uint32 output tensor after the kernel has run.
  """
  result = Tensor([0]*n_threads, dtype=dtypes.uint32).realize()

  program = [
    s_load_b64(s[0:1], s[0:1], NULL),
    *asm,
    # presumably turns the per-thread index in v[0] into a byte offset (<< 2) -- TODO confirm
    v_lshlrev_b32_e32(v[0], 2, v[0]),
    s_waitcnt(simm16=waitcnt(lgkmcnt=0)),
    global_store_b32(addr=v[0], data=vdst, saddr=s[0:1]),
    s_endpgm(),
  ]
  listing = "\n".join([inst.disasm() for inst in program])

  kernel_src = template.replace("fn_name", "test").replace("INSTRUCTION", textwrap.dedent(listing))
  spec = ProgramSpec("test", kernel_src, Device.DEFAULT, UOp(Ops.SINK),
                     global_size=[1, 1, 1], local_size=[n_threads, 1, 1], globals=[0])
  runner = CompiledRunner(spec)
  # Optional debugging aid: dump the compiled binary's disassembly when PRINT_ASM is set.
  if getenv("PRINT_ASM"):
    amdgpu_disassemble(runner.lib)
  runner([result.uop.buffer], {}, wait=True)
  return result.tolist()

# NOTE(review): the text of the helper definitions that follow is garbled/truncated in the
# reviewed copy (string literals are cut off), so it is preserved verbatim here as a comment
# instead of being reconstructed by guesswork. Restore from the original file.
# def f16_to_bits(x:float) -> int: return struct.unpack(' float: return struct.unpack(' int: return struct.unpack('