From d7afa0208595b1211eea1914c827bcd37548069b Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Thu, 22 Jan 2026 05:10:59 -0500 Subject: [PATCH] clean up the extra/sqtt directory (#14284) * remove legacy test_timing stuff * remove legacy test_pmc, update active_sqtt_parse --- extra/sqtt/active_sqtt_parse.py | 5 +- extra/sqtt/test_pmc.py | 87 ------------------- extra/sqtt/test_timing.py | 143 -------------------------------- 3 files changed, 3 insertions(+), 232 deletions(-) delete mode 100644 extra/sqtt/test_pmc.py delete mode 100644 extra/sqtt/test_timing.py diff --git a/extra/sqtt/active_sqtt_parse.py b/extra/sqtt/active_sqtt_parse.py index 80f2984849..80652750f9 100644 --- a/extra/sqtt/active_sqtt_parse.py +++ b/extra/sqtt/active_sqtt_parse.py @@ -11,7 +11,7 @@ from tinygrad import Tensor from tinygrad.helpers import system, OSX from tinygrad.runtime.ops_amd import AMDProgram from extra.sqtt.roc import decode, WaveExec, ProfileSQTTEvent -from tinygrad.device import Device, ProfileDeviceEvent +from tinygrad.device import Device from extra.sqtt.attempt_sqtt_parse import parse_sqtt_print_packets @@ -23,7 +23,7 @@ def save_sqtt(): dev.profile_events.clear() sqtt:dict[str, list[WaveExec]] = {} yield sqtt - events = dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())] + events = dev.profile_events #rctx = decode(events) #assert len(rctx.inst_execs) > 0, "empty sqtt output" @@ -45,6 +45,7 @@ matmul: .rodata .p2align 6 .amdhsa_kernel matmul + .amdhsa_kernarg_size 8 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_next_free_vgpr .amdgcn.next_free_vgpr .amdhsa_next_free_sgpr .amdgcn.next_free_sgpr diff --git a/extra/sqtt/test_pmc.py b/extra/sqtt/test_pmc.py deleted file mode 100644 index 0143ff02dc..0000000000 --- a/extra/sqtt/test_pmc.py +++ /dev/null @@ -1,87 +0,0 @@ -import os -os.environ["PROFILE"] = "1" -os.environ["PMC"] = "1" - -import unittest -import functools, contextlib -import numpy as np -from tinygrad import Tensor, Context, Device -from tinygrad.dtype import dtypes, AddrSpace -from tinygrad.uop.ops import UOp, Ops, KernelInfo, AxisType -from tinygrad.runtime.ops_amd import ProfilePMCEvent -from tinygrad.engine.realize import get_runner -from tinygrad.viz.serve import unpack_pmc -from extra.sqtt.roc import print_pmc - -def copy_kernel(B, A, stride=1): - n_threads = 32 - assert A.size >= n_threads, f"{A.size} is too small, min size {n_threads}" - g = UOp.range(A.size//n_threads, 0, AxisType.GLOBAL) - l = UOp.range(n_threads, 1, AxisType.LOCAL) - i = g * n_threads + l - index = (i * stride) % A.size - return B[index].store(A[index]).sink(arg=KernelInfo(name=f"copy_{A.size}_stride_{stride}", opts_to_apply=())) - -def lds_kernel(offset:UOp, size:int, inst:str) -> UOp: - tid = UOp.range(offset.size, 0, AxisType.LOCAL) - dst = UOp.placeholder((size,), dtypes.float32, 1, AddrSpace.REG) - #lds = UOp.placeholder((1024,), dtypes.float32, 2, AddrSpace.LOCAL) - u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_s_waitcnt(0);') - u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_s_barrier();', src=(u,)) - u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_sched_barrier(0);', src=(u,)) - u = UOp(Ops.CUSTOM, arg=f'asm volatile("{inst} '+'%0, %1" : "=v"({0}) : "v"({1}));', src=(dst, offset[tid], u)) - return UOp.sink(u, arg=KernelInfo(name="test_lds", opts_to_apply=())) - -dev = Device[Device.DEFAULT] - -@contextlib.contextmanager -def save_pmc(): - # clear the old traces - dev.profile_events.clear() - pmc:list[ProfilePMCEvent] = [] - yield pmc - for e in dev.profile_events: - if isinstance(e, ProfilePMCEvent): pmc.append(e) - -@unittest.skipIf(dev.device != "AMD", "tests PMC counters on AMD") -class TestPMC(unittest.TestCase): - @Context(IGNORE_OOB=0) - def test_copy(self, stride:int=1): - N = 1 << 25 # ~134MB - a = Tensor(np.arange(N, dtype=np.uint32)+1).realize() - b = Tensor(np.zeros(N, dtype=np.uint32)).realize() - b = Tensor.custom_kernel(b, a, fxn=functools.partial(copy_kernel, stride=stride))[0] - with save_pmc() as pmc: - b.realize() - print_pmc(pmc) - np.testing.assert_equal(a.numpy(), b.numpy()) - - def test_copy_uncoalesced(self): return self.test_copy(stride=17) - - # test with two threads issuing ds_reads at different offsets - def test_ds_read(self, size=1, inst='ds_read_b32'): - test_banks = 256 - offsets = [Tensor([0, b*4]) for b in range(1, test_banks)] - with Context(DEBUG=0): Tensor.realize(*offsets) - k = Tensor.custom_kernel(offsets[0], fxn=functools.partial(lds_kernel, size=size, inst=inst))[0] - # sample all kernels - with save_pmc() as pmc_events: - runner = get_runner(Device.DEFAULT, k.schedule()[0].ast) - # TODO: llvm eliminates lds definition from the ELF, is there another way to pin lds size? - runner._prg.group_segment_size = 1024 - for offset in offsets: runner([offset.uop.buffer]) - # find read offsets that created bank conflicts from the pmc counters - found:list[Tensor] = [] - for i,e in enumerate(pmc_events): - pmc = unpack_pmc(e)["rows"] - # SQ on gfx9, renamed to SQC after gfx10 - val = next(total for name,total,_all_instances in pmc if name in {"SQ_LDS_BANK_CONFLICT", "SQC_LDS_BANK_CONFLICT"}) - if val > 0: found.append(offsets[i]) - print("Found bank conflicts at offsets:", [s.numpy() for s in found]) - - def test_ds_read_b64(self): self.test_ds_read(2, 'ds_read_b64') - - def test_ds_read_b128(self): self.test_ds_read(4, 'ds_read_b128') - -if __name__ == "__main__": - unittest.main() diff --git a/extra/sqtt/test_timing.py b/extra/sqtt/test_timing.py deleted file mode 100644 index 80ee504233..0000000000 --- a/extra/sqtt/test_timing.py +++ /dev/null @@ -1,143 +0,0 @@ -import os -os.environ["PYTHONPATH"] = "." -os.environ["SQTT"] = "1" -if "DEV" not in os.environ: os.environ["DEV"] = "AMD" -os.environ["PROFILE"] = "1" -# VIZ=1 to launch server -# os.environ["VIZ"] = "1" -os.environ["AMD_LLVM"] = "0" - -import unittest -import sys, contextlib -from tinygrad import Tensor, dtypes -from tinygrad.helpers import getenv -from tinygrad.uop.ops import UOp, Ops, KernelInfo -from tinygrad.device import Device, ProfileDeviceEvent - -from extra.sqtt.roc import decode, WaveExec - -dev = Device[os.environ["DEV"]] - -def custom(arg:str, s:UOp|None=None) -> UOp: return UOp(Ops.CUSTOM, src=(s,) if s is not None else (), arg=arg) - -def asm_kernel(instrs:list[str], l:int=1, g:int=1) -> Tensor: - name = sys._getframe(1).f_code.co_name - def fxn(_): - L = UOp.special(l, "lidx0") - G = UOp.special(g, "gidx0") - op = custom("asm volatile (") - for inst in instrs: op = custom(f' "{inst}\\n\\t"', op) - op = custom(");", op) - return UOp.sink(op, L, G, arg=KernelInfo(name=name)) - k = Tensor.custom_kernel(Tensor.empty(1), fxn=fxn)[0] - return k - -@contextlib.contextmanager -def save_sqtt(): - # clear the old traces - dev.profile_events.clear() - sqtt:dict[str, list[WaveExec]] = {} - yield sqtt - # decode sqtt - if os.environ["DEV"] != "AMD": return - rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())]) - assert len(rctx.inst_execs) > 0, "empty sqtt output" - sqtt.update(rctx.inst_execs) - -class TestTiming(unittest.TestCase): - def test_v_add(self): - with save_sqtt() as sqtt: - asm_kernel([f"v_add_f32 v{10+i} v{10+i+1} {10+i}" for i in range(3)]).realize() - wave = list(sqtt.values())[0][:-1] - assert all(s.dur == 1 for s in wave) - assert all(s.stall == 0 for s in wave) - - def test_chain_v_add_1l(self): - with save_sqtt() as sqtt: - asm_kernel([ - "v_add_f32_e32 v1 v0 v0", - "v_add_f32_e32 v2 v1 v1", - ]).realize() - wave = list(sqtt.values())[0][:-1] - assert all(s.dur == 1 for s in wave) - assert all(s.stall == 0 for s in wave) - - def test_multi_cycle_inst(self): - def custom_vrcp(A, B): - op = custom("float a = 0.0;") - op = custom("float b = (*(data1_1+0));", op) - #op = custom('asm volatile("v_mul_f32_e32 %2 %2 %1" : "+v"(a) : "v"(b));', op) - op = custom('asm volatile("v_rcp_f32_e32 %2 %1" : "+v"(a) : "v"(b));', op) - op = custom('asm volatile("v_add_f32_e64 %1 %1 1.0" : "+v"(a));', op) - op = custom("*(data0_1+0) = a;", op) - return UOp.sink(op, A, B, arg=KernelInfo(name="custom_vrcp")) - out = Tensor([0.]).realize() - inp = Tensor([-2.0]).realize() - with save_sqtt() as sqtt: - Tensor.custom_kernel(out, inp, fxn=custom_vrcp)[0].realize() - wave = list(sqtt.values())[0][0] - for i in range(len(wave.insts)): - if wave.insts[i].inst.startswith("global_store"): - print(f"store diff {wave.insts[i].time-(wave.insts[i-1].time)}") - self.assertEqual(out.item(), 0.5) - - def test_wmma(self): - with save_sqtt() as sqtt: - for tc in dev.renderer.get_tensor_cores(dev.arch): - M, K, N = tc.dims - s = 32 - a = Tensor.empty(M*s, K*s, dtype=tc.dtype_in)@Tensor.empty(K*s, N*s, dtype=tc.dtype_in) - a.realize() - print(a) - for p,waves in sqtt.items(): - for e in waves[0].insts: - if (e.inst.startswith("v_wmma")): - instruction = e.inst.split(" ")[0] - print(f"{instruction:<29} : {e.dur} cycles") - - def test_sleep(self): - n = 1 - def sleep_kernel(data0): - assert data0.dtype.base == dtypes.ulong - op = custom("unsigned long long t0 = __builtin_readcyclecounter();") - op = custom(f"__builtin_amdgcn_s_sleep({n});", op) - op = custom("unsigned long long t1 = __builtin_readcyclecounter();", op) - op = custom(f"data0_{data0.size}[0] = t1 - t0;", op) - return UOp.sink(data0, op, arg=KernelInfo(name=f"sleep_{n}")) - diff_hw_reg = Tensor.empty(1, dtype=dtypes.ulong) - diff_hw_reg = Tensor.custom_kernel(diff_hw_reg, fxn=sleep_kernel)[0] - with save_sqtt() as sqtt: - diff_hw_reg.realize() - sleep = next((e for e in sqtt[f"sleep_{n}"][0].insts if e.inst.startswith("s_sleep"))) - # cycles = sleep dur + overhead of storing hi/lo REG_SHADER_CYCLES - self.assertGreaterEqual(diff_hw_reg.item(), sleep.dur) - - def test_nop(self): - with save_sqtt() as sqtt: - asm_kernel(["s_nop 1"]*10).realize() - wave = list(sqtt.values())[0][0] - for e in wave.insts: - print(f"{e.inst} {e.dur=} {e.stall=}") - - def test_wave_sched(self): - num_waves = getenv("NUM_WAVES", 16) - num_wgps = getenv("NUM_WGPS", 2) - num_vgpr = getenv("NUM_VGPR", 256) - with save_sqtt() as sqtt: - # 1 cycle decode, no stall - asm_kernel([f"v_mov_b32_e32 v{i} {i}" for i in range(num_vgpr)], l=32*num_waves, g=num_wgps).realize() - waves = list(sqtt.values())[0] - print(len(waves), "waves decoded") - for w in waves: - print(f"{w.wave_id:<2} {w.simd=} {w.cu=} {w.se=} @ clk {w.begin_time}") - - def test_ones(self): - N = getenv("N", 4096) - CNT = getenv("CNT", 2) - with save_sqtt() as sqtt: - for _ in range(CNT): - Tensor.ones(N, N).contiguous().realize() - self.assertEqual(len(sqtt), CNT) - -if __name__ == "__main__": - unittest.main()