From 4573e91e61a1370d8802a767e227430bc9c24994 Mon Sep 17 00:00:00 2001
From: George Hotz <geohot@gmail.com>
Date: Thu, 1 Jan 2026 18:51:31 -0500
Subject: [PATCH] more

---
 extra/assembly/amd/test/discover_instops.py | 90 ++++++++++++++++++---
 extra/assembly/amd/test/test_sqtt_hw.py     |  4 +-
 tinygrad/runtime/ops_amd.py                 |  7 +-
 3 files changed, 82 insertions(+), 19 deletions(-)

diff --git a/extra/assembly/amd/test/discover_instops.py b/extra/assembly/amd/test/discover_instops.py
index b333ed82fd..ef470d1da5 100644
--- a/extra/assembly/amd/test/discover_instops.py
+++ b/extra/assembly/amd/test/discover_instops.py
@@ -7,11 +7,10 @@ For full traces: DEBUG=2 python extra/assembly/amd/test/discover_instops.py
 import os
 os.environ["SQTT"] = "1"
 os.environ["PROFILE"] = "1"
-os.environ["SQTT_ITRACE_SE_MASK"] = "2"  # Enable instruction tracing on SE1
+os.environ["SQTT_ITRACE_SE_MASK"] = "1"  # Enable instruction tracing on SE0
 os.environ["SQTT_LIMIT_SE"] = "2"        # Force work to traced SE only
 
 from tinygrad.helpers import DEBUG, colored
-from tinygrad.runtime.ops_amd import SQTT_SIMD_SEL
 
 from extra.assembly.amd.autogen.rdna3.ins import (
   # VALU - basic (these are safe, just register ops)
@@ -28,11 +27,19 @@ from extra.assembly.amd.autogen.rdna3.ins import (
   # SALU - basic (safe, just register ops)
   s_mov_b32, s_add_u32, s_and_b32, s_or_b32,
   s_lshl_b32, s_lshr_b32,
-  s_nop, s_endpgm,
+  s_nop, s_endpgm, s_waitcnt,
   # SALU - branch (safe if offset is 0 = next instruction)
   s_branch, s_cbranch_scc0, s_cbranch_execz,
   # SALU - message
   s_sendmsg,
+  # SMEM - scalar memory (load from kernarg pointer in s[0:1])
+  s_load_b32, s_load_b64,
+  # VMEM - vector memory (global load/store)
+  global_load_b32, global_store_b32,
+  # LDS - local data share
+  ds_load_b32, ds_store_b32,
+  # SrcEnum for NULL soffset
+  SrcEnum,
 )
 from extra.assembly.amd.dsl import v, s
 from extra.assembly.amd.sqtt import InstOp, INST
@@ -45,12 +52,20 @@ from extra.assembly.amd.test.test_sqtt_hw import (
 # INSTRUCTION TEST CASES - only safe instructions that don't access memory
 # ═══════════════════════════════════════════════════════════════════════════════
 
+# Helper: load buffer address from kernarg (s[0:1] -> s[2:3])
+# The runtime passes kernarg pointer in s[0:1], kernarg contains buffer address
+def _load_buf_addr():
+  return [
+    s_load_b64(s[2:3], s[0], 0, soffset=SrcEnum.NULL),  # load buf addr from kernarg
+    s_waitcnt(lgkmcnt=0),  # wait for SMEM load
+  ]
+
 INSTRUCTION_TESTS: dict[str, tuple[str, list]] = {
   # SALU (0x0) - scalar ALU, just register operations
-  "SALU_mov": ("s_mov_b32", [s_mov_b32(s[0], 0), s_mov_b32(s[1], 1)]),
-  "SALU_add": ("s_add_u32", [s_mov_b32(s[0], 1), s_mov_b32(s[1], 2), s_add_u32(s[2], s[0], s[1])]),
-  "SALU_logic": ("s_and/or", [s_and_b32(s[2], s[0], s[1]), s_or_b32(s[3], s[0], s[1])]),
-  "SALU_shift": ("s_lshl/lshr", [s_lshl_b32(s[2], s[0], 1), s_lshr_b32(s[3], s[0], 1)]),
+  "SALU_mov": ("s_mov_b32", [s_mov_b32(s[4], 0), s_mov_b32(s[5], 1)]),
+  "SALU_add": ("s_add_u32", [s_mov_b32(s[4], 1), s_mov_b32(s[5], 2), s_add_u32(s[6], s[4], s[5])]),
+  "SALU_logic": ("s_and/or", [s_and_b32(s[6], s[4], s[5]), s_or_b32(s[7], s[4], s[5])]),
+  "SALU_shift": ("s_lshl/lshr", [s_lshl_b32(s[6], s[4], 1), s_lshr_b32(s[7], s[4], 1)]),
   "SALU_nop": ("s_nop", [s_nop(0)]),
 
   # JUMP (0x3) - branch to next instruction (offset 0)
@@ -80,13 +95,62 @@ INSTRUCTION_TESTS: dict[str, tuple[str, list]] = {
 
   # VALU CMPX (0x73) - modifies EXEC
   "VALU_cmpx": ("v_cmpx_eq_u32", [v_cmpx_eq_u32_e32(v[0], v[1])]),
+
+  # ═══════════════════════════════════════════════════════════════════════════════
+  # MEMORY INSTRUCTIONS - access real buffer passed via kernarg
+  # ═══════════════════════════════════════════════════════════════════════════════
+
+  # SMEM (0x1) - scalar memory load from buffer
+  "SMEM_load": ("s_load_b32", [
+    s_load_b64(s[2:3], s[0], 0, soffset=SrcEnum.NULL),  # load buf addr from kernarg
+    s_waitcnt(lgkmcnt=0),
+    s_load_b32(s[4], s[2], 0, soffset=SrcEnum.NULL),  # load from buffer
+    s_waitcnt(lgkmcnt=0),
+  ]),
+
+  # VMEM load (0x21 VMEM_LOAD) - global load
+  "VMEM_load": ("global_load_b32", [
+    s_load_b64(s[2:3], s[0], 0, soffset=SrcEnum.NULL),  # load buf addr from kernarg
+    s_waitcnt(lgkmcnt=0),
+    v_mov_b32_e32(v[0], 0),  # offset = 0
+    global_load_b32(v[1], addr=v[0], saddr=s[2], offset=0),  # load from buffer
+    s_waitcnt(vmcnt=0),
+  ]),
+
+  # VMEM store (0x24 VMEM_STORE) - global store
+  "VMEM_store": ("global_store_b32", [
+    s_load_b64(s[2:3], s[0], 0, soffset=SrcEnum.NULL),  # load buf addr from kernarg
+    s_waitcnt(lgkmcnt=0),
+    v_mov_b32_e32(v[0], 0),  # offset = 0
+    v_mov_b32_e32(v[1], 42),  # data to store
+    global_store_b32(addr=v[0], data=v[1], saddr=s[2], offset=0),  # store to buffer
+    s_waitcnt(vmcnt=0),
+  ]),
+
+  # LDS load (0x29 LDS_LOAD) - local data share read
+  "LDS_load": ("ds_load_b32", [
+    v_mov_b32_e32(v[0], 0),  # LDS address = 0
+    ds_load_b32(v[1], v[0], offset=0),  # read from LDS
+    s_waitcnt(lgkmcnt=0),
+  ]),
+
+  # LDS store (0x2b LDS_STORE) - local data share write
+  "LDS_store": ("ds_store_b32", [
+    v_mov_b32_e32(v[0], 0),  # LDS address = 0
+    v_mov_b32_e32(v[1], 42),  # data to store
+    ds_store_b32(v[0], v[1], offset=0),  # write to LDS
+    s_waitcnt(lgkmcnt=0),
+  ]),
 }
 
 
-def run_with_simd_retry(instructions: list, max_retries: int = 4) -> tuple[list[bytes], list, set]:
-  """Run instructions and retry with different SIMD selections until we get INST packets."""
-  for simd in range(max_retries):
-    SQTT_SIMD_SEL.value = simd
+def run_with_retry(instructions: list, max_attempts: int = 10) -> tuple[list[bytes], list, set]:
+  """Run instructions and retry until we get INST packets.
+
+  The hardware scheduler picks which SIMD to run on (~50/50), and SQTT traces simd_sel=0.
+  We retry until the wave lands on the traced SIMD.
+  """
+  for _ in range(max_attempts):
     blobs = run_asm_sqtt(instructions)
     packets = decode_all_blobs(blobs)
     ops = get_inst_ops(packets)
@@ -102,7 +166,7 @@ def discover_all_instops() -> tuple[dict[int, set[str]], dict[str, Exception]]:
 
   for test_name, (instr_name, instructions) in INSTRUCTION_TESTS.items():
     try:
-      blobs, packets, ops = run_with_simd_retry(instructions)
+      blobs, packets, ops = run_with_retry(instructions)
 
       for op in ops:
         if op not in discovered:
@@ -111,7 +175,7 @@ def discover_all_instops() -> tuple[dict[int, set[str]], dict[str, Exception]]:
 
       if DEBUG >= 2:
         print(f"\n{'─'*60}")
-        print(f"{test_name} ({instr_name}): ops={[hex(op) for op in sorted(ops)]} simd_sel={SQTT_SIMD_SEL.value}")
+        print(f"{test_name} ({instr_name}): ops={[hex(op) for op in sorted(ops)]}")
         print_blobs(blobs, filter_timing=True)
       if DEBUG >= 1:
         status = colored("✓", "green") if ops else colored("∅", "yellow")
diff --git a/extra/assembly/amd/test/test_sqtt_hw.py b/extra/assembly/amd/test/test_sqtt_hw.py
index 5712f3f2e6..8e887678c9 100644
--- a/extra/assembly/amd/test/test_sqtt_hw.py
+++ b/extra/assembly/amd/test/test_sqtt_hw.py
@@ -9,8 +9,8 @@ For pretty trace output: DEBUG=2 python -m pytest extra/assembly/amd/test/test_s
 import os
 os.environ["SQTT"] = "1"
 os.environ["PROFILE"] = "1"
-os.environ["SQTT_ITRACE_SE_MASK"] = "2"  # Enable instruction tracing on SE1
-os.environ["SQTT_LIMIT_SE"] = "1"        # Limit execution
+os.environ["SQTT_ITRACE_SE_MASK"] = "1"  # Enable instruction tracing on SE0
+os.environ["SQTT_LIMIT_SE"] = "2"        # Force work to traced SE only
 
 import unittest
 from tinygrad.helpers import DEBUG, colored
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index f5145fce0a..b3a45dec96 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -21,7 +21,7 @@ from tinygrad.runtime.support.memory import AddrSpace
 if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl  # noqa: F401 # pylint: disable=unused-import
 
 SQTT = ContextVar("SQTT", abs(VIZ.value)>=2)
-SQTT_ITRACE_SE_MASK, SQTT_LIMIT_SE, SQTT_SIMD_SEL = ContextVar("SQTT_ITRACE_SE_MASK", 0b11), ContextVar("SQTT_LIMIT_SE", 0), ContextVar("SQTT_SIMD_SEL", 0)
+SQTT_ITRACE_SE_MASK, SQTT_LIMIT_SE = ContextVar("SQTT_ITRACE_SE_MASK", 0b11), ContextVar("SQTT_LIMIT_SE", 0)
 PMC = ContextVar("PMC", abs(VIZ.value)>=2)
 EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
 WAIT_REG_MEM_FUNCTION_EQ  = 3 # ==
@@ -251,15 +251,14 @@ class AMDComputeQueue(HWQueue):
         else:
           self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, base_hi=buf0_hi, size=buf0s[se].size >> 12)
           self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE, base_lo=buf0_lo)
-        # NOTE: SQTT can only trace instructions on one simd per se, this selects the simd in first wgp in first sa.
+        # NOTE: SQTT can only trace instructions on one simd per se, this selects first simd in first wgp in first sa.
         # For RGP to display instruction trace it has to see it on first SE. Howerver ACE/MEC/whatever does the dispatching starting with second se,
         # and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the last cu, but
         # sometimes not, especially if the kernel has more than one wavefront which means that kernels with small global size might get unlucky and
         # be dispatched on something else and not be seen in instruction tracing tab. You can force the wavefronts of a kernel to be dispatched on the
         # CUs you want to by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE<x> and trace even kernels that only have one wavefront.
-        # Use SQTT_SIMD_SEL (0-3) to select which SIMD to trace within the WGP.
         cs_wtype = (1 << 6) if self.dev.target >= (12,0,0) else self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT
-        self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=cs_wtype, simd_sel=SQTT_SIMD_SEL.value, wgp_sel=0, sa_sel=0)
+        self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=cs_wtype, simd_sel=0, wgp_sel=0, sa_sel=0)
         reg_include = self.soc.SQ_TT_TOKEN_MASK_SQDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_SHDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
                       self.soc.SQ_TT_TOKEN_MASK_COMP_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT
         token_exclude = (1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT) if self.dev.target < (12,0,0) else 0