From 60bd546593059f5763937b6e85069ccbbbddbcc6 Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Wed, 25 Mar 2026 16:19:54 +0200 Subject: [PATCH] sqtt: add cycle count to rdna3 enums (#15473) * update rdna3 sqtt enums to include cycle_count * dispatch_to_exec --- test/mockgpu/amd/emu.py | 28 ++++++------ tinygrad/renderer/amd/sqtt.py | 80 +++++++++++++++++------------------ tinygrad/viz/serve.py | 4 +- 3 files changed, 56 insertions(+), 56 deletions(-) diff --git a/test/mockgpu/amd/emu.py b/test/mockgpu/amd/emu.py index 42948e87f0..527763d3af 100644 --- a/test/mockgpu/amd/emu.py +++ b/test/mockgpu/amd/emu.py @@ -131,25 +131,25 @@ def _init_sqtt_encoder(): SOPPOp3.S_CBRANCH_EXECZ.value, SOPPOp3.S_CBRANCH_EXECNZ.value} # VALU sub-classification patterns - _VALU_TRANS_RE = re.compile(r'V_(EXP|LOG|RCP|RSQ|SQRT|SIN|COS|CEIL|FLOOR|TRUNC|RNDNE|FRACT|FREXP)_') - _VALU_64_SHIFT_RE = re.compile(r'V_(LSHLREV|LSHRREV|ASHRREV)_(B|I)64') - _VALU_MAD64_RE = re.compile(r'V_MAD_(U|I)64') - _VALU_64_RE = re.compile(r'V_\w+_F64') + _VALUT_4_RE = re.compile(r'V_(EXP|LOG|RCP|RSQ|SQRT|SIN|COS|CEIL|FLOOR|TRUNC|RNDNE|FRACT|FREXP)_') + _VALUB_2_RE = re.compile(r'V_(LSHLREV|LSHRREV|ASHRREV)_(B|I)64') + _VALUB_4_RE = re.compile(r'V_MAD_(U|I)64') + _VALUB_16_RE = re.compile(r'V_\w+_F64') def _valu_op(op_name: str) -> InstOp|None: - if 'CMPX' in op_name: return InstOp.VALU_CMPX - if _VALU_64_SHIFT_RE.search(op_name): return InstOp.VALU_64_SHIFT - if _VALU_MAD64_RE.search(op_name): return InstOp.VALU_MAD64 - if _VALU_64_RE.search(op_name): return InstOp.VALU_64 - if _VALU_TRANS_RE.search(op_name): return InstOp.VALU_TRANS + if 'CMPX' in op_name: return InstOp.VALU1_WR_EXEC + if _VALUB_2_RE.search(op_name): return InstOp.VALUB_2 + if _VALUB_4_RE.search(op_name): return InstOp.VALUB_4 + if _VALUB_16_RE.search(op_name): return InstOp.VALUB_16 + if _VALUT_4_RE.search(op_name): return InstOp.VALUT_4 return None def _mem_op(t, op_name: str) -> InstOp: is_store = "STORE" in op_name - if issubclass(t, _DS): return InstOp.LDS_STORE if is_store else InstOp.LDS_LOAD - if issubclass(t, _GLOBAL): return InstOp.GLOBAL_STORE if is_store else InstOp.GLOBAL_LOAD - if issubclass(t, _FLAT): return InstOp.FLAT_STORE if is_store else InstOp.FLAT_LOAD - if issubclass(t, _SCRATCH): return InstOp.FLAT_STORE if is_store else InstOp.FLAT_LOAD + if issubclass(t, _DS): return InstOp.LDS_WR_2 if is_store else InstOp.LDS_RD + if issubclass(t, _GLOBAL): return InstOp.SGMEM_WR_2 if is_store else InstOp.SGMEM_RD_1 + if issubclass(t, _FLAT): return InstOp.FLAT_WR_3 if is_store else InstOp.FLAT_RD_2 + if issubclass(t, _SCRATCH): return InstOp.FLAT_WR_3 if is_store else InstOp.FLAT_RD_2 return InstOp.SALU nibbles: list[int] = [] @@ -174,7 +174,7 @@ def _init_sqtt_encoder(): op = _valu_op(op_name) if op is None: _emit_nibbles(nibbles, VALUINST, delta=1, wave=w) else: _emit_nibbles(nibbles, INST, delta=1, wave=w, op=op) - elif issubclass(inst_type, _SMEM): _emit_nibbles(nibbles, INST, delta=1, wave=w, op=InstOp.SMEM) + elif issubclass(inst_type, _SMEM): _emit_nibbles(nibbles, INST, delta=1, wave=w, op=InstOp.SMEM_RD) else: _emit_nibbles(nibbles, INST, delta=1, wave=w, op=_mem_op(inst_type, op_name)) def finish(wave_id: int): diff --git a/tinygrad/renderer/amd/sqtt.py b/tinygrad/renderer/amd/sqtt.py index fef92d8df2..f1911fc37e 100644 --- a/tinygrad/renderer/amd/sqtt.py +++ b/tinygrad/renderer/amd/sqtt.py @@ -44,62 +44,62 @@ class InstOp(Enum): OTHER_ range follows same pattern but values overlap differently. """ SALU = 0x0 - SMEM = 0x1 + SMEM_RD = 0x1 JUMP = 0x3 # branch taken JUMP_NO = 0x4 # branch not taken CALL = 0x5 # s_call_b64 MESSAGE = 0x9 - VALU_TRANS = 0xb # transcendental: exp, log, rcp, sqrt, sin, cos - VALU_64_SHIFT = 0xd # 64-bit shifts: lshl, lshr, ashr - VALU_MAD64 = 0xe # 64-bit multiply-add - VALU_64 = 0xf # 64-bit: add, mul, fma, rcp, sqrt, rounding, frexp, div helpers + VALUT_4 = 0xb # transcendental: exp, log, rcp, sqrt, sin, cos + VALUB_2 = 0xd # 64-bit shifts: lshl, lshr, ashr + VALUB_4 = 0xe # 64-bit multiply-add + VALUB_16 = 0xf # 64-bit: add, mul, fma, rcp, sqrt, rounding, frexp, div helpers VINTERP = 0x12 # interpolation: v_interp_p10_f32, v_interp_p2_f32 BARRIER = 0x13 # FLAT memory ops on traced SIMD (0x1x range) - FLAT_LOAD = 0x1c - FLAT_STORE = 0x1d - FLAT_STORE_64 = 0x1e - FLAT_STORE_96 = 0x1f - FLAT_STORE_128 = 0x20 + FLAT_RD_2 = 0x1c + FLAT_WR_3 = 0x1d + FLAT_WR_4 = 0x1e + FLAT_WR_5 = 0x1f + FLAT_WR_6 = 0x20 # GLOBAL memory ops on traced SIMD (0x2x range) - GLOBAL_LOAD = 0x21 # saddr=SGPR, all sizes - GLOBAL_LOAD_VADDR = 0x22 # saddr=NULL, all sizes - GLOBAL_STORE = 0x24 # saddr=SGPR, 32-bit - GLOBAL_STORE_64 = 0x25 # saddr=SGPR 64 or saddr=NULL 32 - GLOBAL_STORE_96 = 0x26 # saddr=SGPR 96 or saddr=NULL 64 - GLOBAL_STORE_128 = 0x27 # saddr=SGPR 128 or saddr=NULL 96 - GLOBAL_STORE_VADDR_128 = 0x28 # saddr=NULL, 128-bit + SGMEM_RD_1 = 0x21 # saddr=SGPR, all sizes + SGMEM_RD_2 = 0x22 # saddr=NULL, all sizes + SGMEM_WR_2 = 0x24 # saddr=SGPR, 32-bit + SGMEM_WR_3 = 0x25 # saddr=SGPR 64 or saddr=NULL 32 + SGMEM_WR_4 = 0x26 # saddr=SGPR 96 or saddr=NULL 64 + SGMEM_WR_5 = 0x27 # saddr=SGPR 128 or saddr=NULL 96 + SGMEM_WR_6 = 0x28 # saddr=NULL, 128-bit # LDS ops on traced SIMD - LDS_LOAD = 0x29 - LDS_ATOMIC = 0x2a # ds_append, ds_consume, ds_store_addtid_b32 - LDS_STORE = 0x2b - LDS_STORE_64 = 0x2c - LDS_STORE_96 = 0x2d - LDS_STORE_128 = 0x2e + LDS_RD = 0x29 + LDS_WR_1 = 0x2a # ds_append, ds_consume, ds_store_addtid_b32 + LDS_WR_2 = 0x2b + LDS_WR_3 = 0x2c + LDS_WR_4 = 0x2d + LDS_WR_5 = 0x2e # Memory ops on other SIMD (0x5x range) - OTHER_LDS_LOAD = 0x50 - OTHER_LDS_STORE = 0x51 - OTHER_LDS_STORE_64 = 0x52 - OTHER_LDS_STORE_128 = 0x54 - OTHER_FLAT_LOAD = 0x55 - OTHER_FLAT_STORE = 0x56 - OTHER_FLAT_STORE_64 = 0x57 - OTHER_FLAT_STORE_96 = 0x58 - OTHER_FLAT_STORE_128 = 0x59 - OTHER_GLOBAL_LOAD = 0x5a # saddr=SGPR, all sizes - OTHER_GLOBAL_LOAD_VADDR = 0x5b # saddr=NULL or saddr=SGPR store 32 - OTHER_GLOBAL_STORE_64 = 0x5c # saddr=SGPR 64 or saddr=NULL 32 - OTHER_GLOBAL_STORE_96 = 0x5d # saddr=SGPR 96 or saddr=NULL 64 - OTHER_GLOBAL_STORE_128 = 0x5e # saddr=SGPR 128 or saddr=NULL 96 - OTHER_GLOBAL_STORE_VADDR_128 = 0x5f # saddr=NULL, 128-bit + OTHER_LDS_1 = 0x50 + OTHER_LDS_2 = 0x51 + OTHER_LDS_3 = 0x52 + OTHER_LDS_5 = 0x54 + OTHER_FLAT_2 = 0x55 + OTHER_FLAT_3 = 0x56 + OTHER_FLAT_4 = 0x57 + OTHER_FLAT_5 = 0x58 + OTHER_FLAT_6 = 0x59 + OTHER_VMEM_1 = 0x5a # saddr=SGPR, all sizes + OTHER_VMEM_2 = 0x5b # saddr=NULL or saddr=SGPR store 32 + OTHER_VMEM_3 = 0x5c # saddr=SGPR 64 or saddr=NULL 32 + OTHER_VMEM_4 = 0x5d # saddr=SGPR 96 or saddr=NULL 64 + OTHER_VMEM_5 = 0x5e # saddr=SGPR 128 or saddr=NULL 96 + OTHER_VMEM_6 = 0x5f # saddr=NULL, 128-bit # EXEC-modifying ops (0x7x range) - SALU_SAVEEXEC = 0x72 # s_*_saveexec_b32/b64 - VALU_CMPX = 0x73 # v_cmpx_* + SALU_WR_EXEC = 0x72 # s_*_saveexec_b32/b64 + VALU1_WR_EXEC = 0x73 # v_cmpx_* class InstOpRDNA4(Enum): """SQTT instruction operation types for RDNA4 (gfx1200). Different encoding from RDNA3.""" diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py index 4ba38cf73b..8af1cfc3e6 100755 --- a/tinygrad/viz/serve.py +++ b/tinygrad/viz/serve.py @@ -349,8 +349,8 @@ def sqtt_timeline(data:bytes, lib:bytes, target:str) -> Generator[ProfileEvent, NS_PER_TICK = 10 # 100MHz prev_pair:tuple[int, int]|None = None # (shader, realtime) is_cdna = target.startswith("gfx9") - dispatch_to_exec = {"WMMA":"VALU", "VALU":"VALU", "VALUINST":"VALU", "VINTERP":"VALU", "GLOBAL":"VMEM", "FLAT":"VMEM", "LDS":"LDS", "SALU":"SALU", - "SMEM":"SALU", "VMEM":"VMEM"} + dispatch_to_exec = {"WMMA":"VALU", "VALU":"VALU", "VALU1":"VALU", "VALUT":"VALU", "VALUB":"VALU", "VALUINST":"VALU", "VINTERP":"VALU", + "SGMEM":"VMEM", "FLAT":"VMEM", "LDS":"LDS", "SALU":"SALU", "SMEM":"SALU", "VMEM":"VMEM"} def add(name:str, p:PacketType, op:str|None=None, wave:int|None=None, info:InstructionInfo|None=None) -> Generator[ProfileEvent, None, None]: row = f"WAVE:{wave}" if (wave:=getattr(p, "wave", wave)) is not None else f"{p.__class__.__name__}:0 {name}" if row not in row_ends: yield ProfilePointEvent(row, "JSON", "pcMap", pc_map, ts=Decimal(0))