From d7afa0208595b1211eea1914c827bcd37548069b Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Thu, 22 Jan 2026 05:10:59 -0500
Subject: [PATCH] clean up the extra/sqtt directory (#14284)

* remove the legacy test_timing.py SQTT timing tests

* remove the legacy test_pmc.py PMC tests; update active_sqtt_parse.py (drop the ProfileDeviceEvent append, add .amdhsa_kernarg_size 8)
---
 extra/sqtt/active_sqtt_parse.py |   5 +-
 extra/sqtt/test_pmc.py          |  87 -------------------
 extra/sqtt/test_timing.py       | 143 --------------------------------
 3 files changed, 3 insertions(+), 232 deletions(-)
 delete mode 100644 extra/sqtt/test_pmc.py
 delete mode 100644 extra/sqtt/test_timing.py

diff --git a/extra/sqtt/active_sqtt_parse.py b/extra/sqtt/active_sqtt_parse.py
index 80f2984849..80652750f9 100644
--- a/extra/sqtt/active_sqtt_parse.py
+++ b/extra/sqtt/active_sqtt_parse.py
@@ -11,7 +11,7 @@ from tinygrad import Tensor
 from tinygrad.helpers import system, OSX
 from tinygrad.runtime.ops_amd import AMDProgram
 from extra.sqtt.roc import decode, WaveExec, ProfileSQTTEvent
-from tinygrad.device import Device, ProfileDeviceEvent
+from tinygrad.device import Device
 
 from extra.sqtt.attempt_sqtt_parse import parse_sqtt_print_packets
 
@@ -23,7 +23,7 @@ def save_sqtt():
   dev.profile_events.clear()
   sqtt:dict[str, list[WaveExec]] = {}
   yield sqtt
-  events = dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())]
+  events = dev.profile_events
 
   #rctx = decode(events)
   #assert len(rctx.inst_execs) > 0, "empty sqtt output"
@@ -45,6 +45,7 @@ matmul:
 .rodata
 .p2align 6
 .amdhsa_kernel matmul
+  .amdhsa_kernarg_size 8
   .amdhsa_user_sgpr_kernarg_segment_ptr 1
   .amdhsa_next_free_vgpr .amdgcn.next_free_vgpr
   .amdhsa_next_free_sgpr .amdgcn.next_free_sgpr
diff --git a/extra/sqtt/test_pmc.py b/extra/sqtt/test_pmc.py
deleted file mode 100644
index 0143ff02dc..0000000000
--- a/extra/sqtt/test_pmc.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import os
-os.environ["PROFILE"] = "1"
-os.environ["PMC"] = "1"
-
-import unittest
-import functools, contextlib
-import numpy as np
-from tinygrad import Tensor, Context, Device
-from tinygrad.dtype import dtypes, AddrSpace
-from tinygrad.uop.ops import UOp, Ops, KernelInfo, AxisType
-from tinygrad.runtime.ops_amd import ProfilePMCEvent
-from tinygrad.engine.realize import get_runner
-from tinygrad.viz.serve import unpack_pmc
-from extra.sqtt.roc import print_pmc
-
-def copy_kernel(B, A, stride=1):
-  n_threads = 32
-  assert A.size >= n_threads, f"{A.size} is too small, min size {n_threads}"
-  g = UOp.range(A.size//n_threads, 0, AxisType.GLOBAL)
-  l = UOp.range(n_threads, 1, AxisType.LOCAL)
-  i = g * n_threads + l
-  index = (i * stride) % A.size
-  return B[index].store(A[index]).sink(arg=KernelInfo(name=f"copy_{A.size}_stride_{stride}", opts_to_apply=()))
-
-def lds_kernel(offset:UOp, size:int, inst:str) -> UOp:
-  tid = UOp.range(offset.size, 0, AxisType.LOCAL)
-  dst = UOp.placeholder((size,), dtypes.float32, 1, AddrSpace.REG)
-  #lds = UOp.placeholder((1024,), dtypes.float32, 2, AddrSpace.LOCAL)
-  u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_s_waitcnt(0);')
-  u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_s_barrier();', src=(u,))
-  u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_sched_barrier(0);', src=(u,))
-  u = UOp(Ops.CUSTOM, arg=f'asm volatile("{inst} '+'%0, %1" : "=v"({0}) : "v"({1}));', src=(dst, offset[tid], u))
-  return UOp.sink(u, arg=KernelInfo(name="test_lds", opts_to_apply=()))
-
-dev = Device[Device.DEFAULT]
-
-@contextlib.contextmanager
-def save_pmc():
-  # clear the old traces
-  dev.profile_events.clear()
-  pmc:list[ProfilePMCEvent] = []
-  yield pmc
-  for e in dev.profile_events:
-    if isinstance(e, ProfilePMCEvent): pmc.append(e)
-
-@unittest.skipIf(dev.device != "AMD", "tests PMC counters on AMD")
-class TestPMC(unittest.TestCase):
-  @Context(IGNORE_OOB=0)
-  def test_copy(self, stride:int=1):
-    N = 1 << 25 # ~134MB
-    a = Tensor(np.arange(N, dtype=np.uint32)+1).realize()
-    b = Tensor(np.zeros(N, dtype=np.uint32)).realize()
-    b = Tensor.custom_kernel(b, a, fxn=functools.partial(copy_kernel, stride=stride))[0]
-    with save_pmc() as pmc:
-      b.realize()
-    print_pmc(pmc)
-    np.testing.assert_equal(a.numpy(), b.numpy())
-
-  def test_copy_uncoalesced(self): return self.test_copy(stride=17)
-
-  # test with two threads issuing ds_reads at different offsets
-  def test_ds_read(self, size=1, inst='ds_read_b32'):
-    test_banks = 256
-    offsets = [Tensor([0, b*4]) for b in range(1, test_banks)]
-    with Context(DEBUG=0): Tensor.realize(*offsets)
-    k = Tensor.custom_kernel(offsets[0], fxn=functools.partial(lds_kernel, size=size, inst=inst))[0]
-    # sample all kernels
-    with save_pmc() as pmc_events:
-      runner = get_runner(Device.DEFAULT, k.schedule()[0].ast)
-      # TODO: llvm eliminates lds definition from the ELF, is there another way to pin lds size?
-      runner._prg.group_segment_size = 1024
-      for offset in offsets: runner([offset.uop.buffer])
-    # find read offsets that created bank conflicts from the pmc counters
-    found:list[Tensor] = []
-    for i,e in enumerate(pmc_events):
-      pmc = unpack_pmc(e)["rows"]
-      # SQ on gfx9, renamed to SQC after gfx10
-      val = next(total for name,total,_all_instances in pmc if name in {"SQ_LDS_BANK_CONFLICT", "SQC_LDS_BANK_CONFLICT"})
-      if val > 0: found.append(offsets[i])
-    print("Found bank conflicts at offsets:", [s.numpy() for s in found])
-
-  def test_ds_read_b64(self): self.test_ds_read(2, 'ds_read_b64')
-
-  def test_ds_read_b128(self): self.test_ds_read(4, 'ds_read_b128')
-
-if __name__ == "__main__":
-  unittest.main()
diff --git a/extra/sqtt/test_timing.py b/extra/sqtt/test_timing.py
deleted file mode 100644
index 80ee504233..0000000000
--- a/extra/sqtt/test_timing.py
+++ /dev/null
@@ -1,143 +0,0 @@
-import os
-os.environ["PYTHONPATH"] = "."
-os.environ["SQTT"] = "1"
-if "DEV" not in os.environ: os.environ["DEV"] = "AMD"
-os.environ["PROFILE"] = "1"
-# VIZ=1 to launch server
-# os.environ["VIZ"] = "1"
-os.environ["AMD_LLVM"] = "0"
-
-import unittest
-import sys, contextlib
-from tinygrad import Tensor, dtypes
-from tinygrad.helpers import getenv
-from tinygrad.uop.ops import UOp, Ops, KernelInfo
-from tinygrad.device import Device, ProfileDeviceEvent
-
-from extra.sqtt.roc import decode, WaveExec
-
-dev = Device[os.environ["DEV"]]
-
-def custom(arg:str, s:UOp|None=None) -> UOp: return UOp(Ops.CUSTOM, src=(s,) if s is not None else (), arg=arg)
-
-def asm_kernel(instrs:list[str], l:int=1, g:int=1) -> Tensor:
-  name = sys._getframe(1).f_code.co_name
-  def fxn(_):
-    L = UOp.special(l, "lidx0")
-    G = UOp.special(g, "gidx0")
-    op = custom("asm volatile (")
-    for inst in instrs: op = custom(f'  "{inst}\\n\\t"', op)
-    op = custom(");", op)
-    return UOp.sink(op, L, G, arg=KernelInfo(name=name))
-  k = Tensor.custom_kernel(Tensor.empty(1), fxn=fxn)[0]
-  return k
-
-@contextlib.contextmanager
-def save_sqtt():
-  # clear the old traces
-  dev.profile_events.clear()
-  sqtt:dict[str, list[WaveExec]] = {}
-  yield sqtt
-  # decode sqtt
-  if os.environ["DEV"] != "AMD": return
-  rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())])
-  assert len(rctx.inst_execs) > 0, "empty sqtt output"
-  sqtt.update(rctx.inst_execs)
-
-class TestTiming(unittest.TestCase):
-  def test_v_add(self):
-    with save_sqtt() as sqtt:
-      asm_kernel([f"v_add_f32 v{10+i} v{10+i+1} {10+i}" for i in range(3)]).realize()
-    wave = list(sqtt.values())[0][:-1]
-    assert all(s.dur == 1 for s in wave)
-    assert all(s.stall == 0 for s in wave)
-
-  def test_chain_v_add_1l(self):
-    with save_sqtt() as sqtt:
-      asm_kernel([
-        "v_add_f32_e32 v1 v0 v0",
-        "v_add_f32_e32 v2 v1 v1",
-      ]).realize()
-    wave = list(sqtt.values())[0][:-1]
-    assert all(s.dur == 1 for s in wave)
-    assert all(s.stall == 0 for s in wave)
-
-  def test_multi_cycle_inst(self):
-    def custom_vrcp(A, B):
-      op = custom("float a = 0.0;")
-      op = custom("float b = (*(data1_1+0));", op)
-      #op = custom('asm volatile("v_mul_f32_e32 %2 %2 %1" : "+v"(a) : "v"(b));', op)
-      op = custom('asm volatile("v_rcp_f32_e32 %2 %1" : "+v"(a) : "v"(b));', op)
-      op = custom('asm volatile("v_add_f32_e64 %1 %1 1.0" : "+v"(a));', op)
-      op = custom("*(data0_1+0) = a;", op)
-      return UOp.sink(op, A, B, arg=KernelInfo(name="custom_vrcp"))
-    out = Tensor([0.]).realize()
-    inp = Tensor([-2.0]).realize()
-    with save_sqtt() as sqtt:
-      Tensor.custom_kernel(out, inp, fxn=custom_vrcp)[0].realize()
-    wave = list(sqtt.values())[0][0]
-    for i in range(len(wave.insts)):
-      if wave.insts[i].inst.startswith("global_store"):
-        print(f"store diff {wave.insts[i].time-(wave.insts[i-1].time)}")
-    self.assertEqual(out.item(), 0.5)
-
-  def test_wmma(self):
-    with save_sqtt() as sqtt:
-      for tc in dev.renderer.get_tensor_cores(dev.arch):
-        M, K, N = tc.dims
-        s = 32
-        a = Tensor.empty(M*s, K*s, dtype=tc.dtype_in)@Tensor.empty(K*s, N*s, dtype=tc.dtype_in)
-        a.realize()
-        print(a)
-    for p,waves in sqtt.items():
-      for e in waves[0].insts:
-        if (e.inst.startswith("v_wmma")):
-          instruction = e.inst.split(" ")[0]
-          print(f"{instruction:<29} : {e.dur} cycles")
-
-  def test_sleep(self):
-    n = 1
-    def sleep_kernel(data0):
-      assert data0.dtype.base == dtypes.ulong
-      op = custom("unsigned long long t0 = __builtin_readcyclecounter();")
-      op = custom(f"__builtin_amdgcn_s_sleep({n});", op)
-      op = custom("unsigned long long t1 = __builtin_readcyclecounter();", op)
-      op = custom(f"data0_{data0.size}[0] = t1 - t0;", op)
-      return UOp.sink(data0, op, arg=KernelInfo(name=f"sleep_{n}"))
-    diff_hw_reg = Tensor.empty(1, dtype=dtypes.ulong)
-    diff_hw_reg = Tensor.custom_kernel(diff_hw_reg, fxn=sleep_kernel)[0]
-    with save_sqtt() as sqtt:
-      diff_hw_reg.realize()
-    sleep = next((e for e in sqtt[f"sleep_{n}"][0].insts if e.inst.startswith("s_sleep")))
-    # cycles = sleep dur + overhead of storing hi/lo REG_SHADER_CYCLES
-    self.assertGreaterEqual(diff_hw_reg.item(), sleep.dur)
-
-  def test_nop(self):
-    with save_sqtt() as sqtt:
-      asm_kernel(["s_nop 1"]*10).realize()
-    wave = list(sqtt.values())[0][0]
-    for e in wave.insts:
-      print(f"{e.inst} {e.dur=} {e.stall=}")
-
-  def test_wave_sched(self):
-    num_waves = getenv("NUM_WAVES", 16)
-    num_wgps = getenv("NUM_WGPS", 2)
-    num_vgpr = getenv("NUM_VGPR", 256)
-    with save_sqtt() as sqtt:
-      # 1 cycle decode, no stall
-      asm_kernel([f"v_mov_b32_e32 v{i} {i}" for i in range(num_vgpr)], l=32*num_waves, g=num_wgps).realize()
-    waves = list(sqtt.values())[0]
-    print(len(waves), "waves decoded")
-    for w in waves:
-      print(f"{w.wave_id:<2} {w.simd=} {w.cu=} {w.se=} @ clk {w.begin_time}")
-
-  def test_ones(self):
-    N = getenv("N", 4096)
-    CNT = getenv("CNT", 2)
-    with save_sqtt() as sqtt:
-      for _ in range(CNT):
-        Tensor.ones(N, N).contiguous().realize()
-    self.assertEqual(len(sqtt), CNT)
-
-if __name__ == "__main__":
-  unittest.main()