# ruff: noqa: F405, F403 # allow define from star imports
import numpy as np
import unittest
import subprocess, struct, math, functools
from tinygrad import Tensor, dtypes, Device
from tinygrad.helpers import getenv
from extra.assembly.amd.autogen.rdna3.ins import *
from extra.assembly.amd.asm import waitcnt
from test.testextra.test_cfg_viz import asm_kernel

def get_output(asm:list, n_threads:int=1, vdst:VGPR=v[1]):
  """Wrap `asm` in a fixed prologue/epilogue, run it as a custom kernel and return the output as a list.

  Args:
    asm: instructions spliced between the initial `s_load_b64` and the final store sequence.
    n_threads: length of the zero-initialised uint32 output tensor; also forwarded to `asm_kernel`.
    vdst: register passed as `data` to the final `global_store_b32` (defaults to `v[1]`).

  Returns:
    `tolist()` of the tensor produced by `Tensor.custom_kernel`.
  """
  buf = Tensor([0]*n_threads, dtype=dtypes.uint32).realize()

  # presumably s[0:1] holds the kernel-argument pointer and receives the output buffer address -- TODO confirm
  program = [s_load_b64(s[0:1], s[0:1], NULL)]
  program.extend(asm)
  program += [
    # v[0] <<= 2; looks like a per-thread index turned into a 4-byte element offset -- confirm
    v_lshlrev_b32_e32(v[0], 2, v[0]),
    s_waitcnt(simm16=waitcnt(lgkmcnt=0)),
    # store the caller-selected register rather than a hard-coded v[1]
    global_store_b32(addr=v[0], data=vdst, saddr=s[0:1]),
    s_endpgm(),
  ]

  kernel_fxn = functools.partial(asm_kernel, name="test", insts=program, device=buf.device, n_threads=n_threads)
  result = Tensor.custom_kernel(buf, fxn=kernel_fxn)[0]
  result.realize()
  return result.tolist()

# NOTE(review): the remainder of this line arrived garbled and truncated (text between '<' and '>' appears to
# have been stripped). It is preserved verbatim below; restore these helpers from the original file, do not guess:
# def f16_to_bits(x:float) -> int: return struct.unpack(' float: return struct.unpack(' int: return struct.unpack('