From 89be3590aa61f870730cc0279d38184a5952db30 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Fri, 10 Oct 2025 17:54:14 +0800 Subject: [PATCH] amd: sqtt on gfx12 (#12564) * amd: sqtt on gfx12 * cleaner * thi * and this * ops * ugh * back * rm this * rm --- autogen_stubs.sh | 4 ++-- extra/sqtt/rgptool.py | 9 +++++--- extra/sqtt/roc.py | 19 ++++++++-------- extra/sqtt/rocprof/rocprof.py | 5 ++-- extra/sqtt/sqtt.h | 5 ++++ tinygrad/runtime/autogen/sqtt.py | 22 ++++++++++++++---- tinygrad/runtime/ops_amd.py | 39 ++++++++++++++++++++------------ 7 files changed, 67 insertions(+), 36 deletions(-) diff --git a/autogen_stubs.sh b/autogen_stubs.sh index 1ff229a5d3..1ea3b583db 100755 --- a/autogen_stubs.sh +++ b/autogen_stubs.sh @@ -435,8 +435,8 @@ generate_sqtt() { -o extra/sqtt/rocprof/rocprof.py fixup extra/sqtt/rocprof/rocprof.py sed -i '1s/^/# pylint: skip-file\n/' extra/sqtt/rocprof/rocprof.py - sed -i "s/import ctypes/import ctypes, tinygrad.helpers.fetch as tgfetch/g" extra/sqtt/rocprof/rocprof.py - sed -i "s|FunctionFactoryStub()|ctypes.CDLL(str(tgfetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so')))|g" extra/sqtt/rocprof/rocprof.py + sed -i "s/import ctypes/import ctypes\nfrom tinygrad.helpers import fetch/g" extra/sqtt/rocprof/rocprof.py + sed -i "s|FunctionFactoryStub()|ctypes.CDLL(str(fetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so')))|g" extra/sqtt/rocprof/rocprof.py } generate_webgpu() { diff --git a/extra/sqtt/rgptool.py b/extra/sqtt/rgptool.py index b246f5e731..21b2959a37 100755 --- a/extra/sqtt/rgptool.py +++ b/extra/sqtt/rgptool.py @@ -156,6 +156,9 @@ class RGP: sqtt_events = [x for x in profile if isinstance(x, ProfileSQTTEvent) and x.device == device_event.device] if len(sqtt_events) == 0: raise RuntimeError(f"Device {device_event.device} doesn't contain SQTT data") device_props = sqtt_events[0].props + gfx_ver = device_props['gfx_target_version'] // 10000 + gfx_iplvl = getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}_{(device_props['gfx_target_version']//100)%100}", + getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}", None)) sqtt_itrace_enabled = any([event.itrace for event in sqtt_events]) sqtt_itrace_masked = not all_same([event.itrace for event in sqtt_events]) sqtt_itrace_se_mask = functools.reduce(lambda a,b: a|b, [int(event.itrace) << event.se for event in sqtt_events], 0) if sqtt_itrace_masked else 0 @@ -193,7 +196,7 @@ class RGP: flags=0, trace_shader_core_clock=0x93f05080, trace_memory_clock=0x4a723a40, - device_id={110000: 0x744c, 110003: 0x7480}[device_props['gfx_target_version']], + device_id={110000: 0x744c, 110003: 0x7480, 120001: 0x7550}[device_props['gfx_target_version']], device_revision_id=0xc8, vgprs_per_simd=1536, sgprs_per_simd=128*16, @@ -207,7 +210,7 @@ class RGP: sgpr_alloc_granularity=128, hardware_contexts=8, gpu_type=sqtt.SQTT_GPU_TYPE_DISCRETE, - gfxip_level=sqtt.SQTT_GFXIP_LEVEL_GFXIP_11_0, + gfxip_level=gfx_iplvl, gpu_index=0, gds_size=0, gds_per_shader_engine=0, @@ -258,7 +261,7 @@ class RGP: major_version=0, minor_version=2, ), shader_engine_index=sqtt_event.se, - sqtt_version=sqtt.SQTT_VERSION_3_2, + sqtt_version={11: sqtt.SQTT_VERSION_3_2, 12: sqtt.SQTT_VERSION_3_3}.get(gfx_ver), _0=sqtt.union_sqtt_file_chunk_sqtt_desc_0( v1=sqtt.struct_sqtt_file_chunk_sqtt_desc_0_v1( instrumentation_spec_version=1, diff --git a/extra/sqtt/roc.py b/extra/sqtt/roc.py index c4eaa95f76..9044904e4c 100644 --- a/extra/sqtt/roc.py +++ b/extra/sqtt/roc.py @@ -20,14 +20,15 @@ class InstInfo: class _ROCParseCtx: def __init__(self, sqtt_evs:list[ProfileSQTTEvent], prog_evs:list[ProfileProgramEvent]): self.sqtt_evs, self.prog_evs = iter(sqtt_evs), prog_evs - self.wave_events = {} + self.wave_events, self.disasms, self.addr2prg = {}, {}, {} + + for prog in prog_evs: + for addr, info in comgr_get_address_table(prog.lib).items(): + self.disasms[prog.base + addr] = info + self.addr2prg[prog.base + addr] = prog def next_sqtt(self): return next(self.sqtt_evs, None) - def find_program(self, idx): return self.prog_evs[idx] - def get_instr_info(self, idx, exec_addr): return self.disasm_program(idx)[exec_addr - self.find_program(idx).base] - - @functools.lru_cache(None) - def disasm_program(self, idx): return comgr_get_address_table(self.find_program(idx).lib) + def find_program(self, addr): return self.addr2prg[addr] def on_occupancy_ev(self, ev): if DEBUG >= 4: print("OCC", ev.time, ev.cu, ev.simd, ev.wave_id, ev.start) @@ -39,10 +40,10 @@ class _ROCParseCtx: for j in range(ev.instructions_size): inst_ev = ev.instructions_array[j] inst_typ = rocprof.rocprofiler_thread_trace_decoder_inst_category_t__enumvalues[inst_ev.category] - asm.setdefault(inst_ev.pc.address, InstInfo(typ=inst_typ, inst=self.get_instr_info(inst_ev.pc.code_object_id, inst_ev.pc.address)[0])) + asm.setdefault(inst_ev.pc.address, InstInfo(typ=inst_typ, inst=self.disasms[inst_ev.pc.address][0])) asm[inst_ev.pc.address].on_ev(inst_ev) - self.wave_events[(self.find_program(ev.instructions_array[0].pc.code_object_id).name, ev.wave_id, ev.cu, ev.simd)] = asm + self.wave_events[(self.find_program(ev.instructions_array[0].pc.address).name, ev.wave_id, ev.cu, ev.simd)] = asm if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -78,7 +79,7 @@ if __name__ == "__main__": @rocprof.rocprof_trace_decoder_isa_callback_t def isa_cb(instr_ptr, mem_size_ptr, size_ptr, pc, data_ptr): - instr, mem_size_ptr[0] = ROCParseCtx.get_instr_info(pc.code_object_id, pc.address) + instr, mem_size_ptr[0] = ROCParseCtx.disasms[pc.address] # this is the number of bytes to next instruction, set to 0 for end_pgm if instr == "s_endpgm": mem_size_ptr[0] = 0 diff --git a/extra/sqtt/rocprof/rocprof.py b/extra/sqtt/rocprof/rocprof.py index c864d31da7..1d0b151bb1 100644 --- a/extra/sqtt/rocprof/rocprof.py +++ b/extra/sqtt/rocprof/rocprof.py @@ -7,7 +7,8 @@ # POINTER_SIZE is: 8 # LONGDOUBLE_SIZE is: 16 # -import ctypes, tinygrad.helpers.fetch as tgfetch +import ctypes +from tinygrad.helpers import fetch class AsDictMixin: @@ -155,7 +156,7 @@ class FunctionFactoryStub: # You can either re-run clan2py with -l /path/to/library.so # Or manually fix this by comment the ctypes.CDLL loading _libraries = {} -_libraries['FIXME_STUB'] = ctypes.CDLL(str(tgfetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so'))) # ctypes.CDLL('FIXME_STUB') +_libraries['FIXME_STUB'] = ctypes.CDLL(str(fetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so'))) # ctypes.CDLL('FIXME_STUB') diff --git a/extra/sqtt/sqtt.h b/extra/sqtt/sqtt.h index 775655840c..ac641acabd 100644 --- a/extra/sqtt/sqtt.h +++ b/extra/sqtt/sqtt.h @@ -43,6 +43,7 @@ enum sqtt_version SQTT_VERSION_2_3 = 0x6, /* GFX9 */ SQTT_VERSION_2_4 = 0x7, /* GFX10+ */ SQTT_VERSION_3_2 = 0xb, /* GFX11+ */ + SQTT_VERSION_3_3 = 0xc, /* GFX12+ */ }; enum sqtt_file_chunk_type @@ -144,6 +145,8 @@ enum sqtt_gfxip_level SQTT_GFXIP_LEVEL_GFXIP_10_1 = 0x7, SQTT_GFXIP_LEVEL_GFXIP_10_3 = 0x9, SQTT_GFXIP_LEVEL_GFXIP_11_0 = 0xc, + SQTT_GFXIP_LEVEL_GFXIP_11_5 = 0xd, + SQTT_GFXIP_LEVEL_GFXIP_12 = 0x10, }; enum sqtt_memory_type @@ -427,6 +430,8 @@ enum elf_gfxip_level EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036, EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, + EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043, + EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x04e, }; struct sqtt_file_chunk_spm_db { diff --git a/tinygrad/runtime/autogen/sqtt.py b/tinygrad/runtime/autogen/sqtt.py index 5d246bff15..3234c6edca 100644 --- a/tinygrad/runtime/autogen/sqtt.py +++ b/tinygrad/runtime/autogen/sqtt.py @@ -174,12 +174,14 @@ sqtt_version__enumvalues = { 6: 'SQTT_VERSION_2_3', 7: 'SQTT_VERSION_2_4', 11: 'SQTT_VERSION_3_2', + 12: 'SQTT_VERSION_3_3', } SQTT_VERSION_NONE = 0 SQTT_VERSION_2_2 = 5 SQTT_VERSION_2_3 = 6 SQTT_VERSION_2_4 = 7 SQTT_VERSION_3_2 = 11 +SQTT_VERSION_3_3 = 12 sqtt_version = ctypes.c_uint32 # enum # values for enumeration 'sqtt_file_chunk_type' @@ -336,6 +338,8 @@ sqtt_gfxip_level__enumvalues = { 7: 'SQTT_GFXIP_LEVEL_GFXIP_10_1', 9: 'SQTT_GFXIP_LEVEL_GFXIP_10_3', 12: 'SQTT_GFXIP_LEVEL_GFXIP_11_0', + 13: 'SQTT_GFXIP_LEVEL_GFXIP_11_5', + 16: 'SQTT_GFXIP_LEVEL_GFXIP_12', } SQTT_GFXIP_LEVEL_NONE = 0 SQTT_GFXIP_LEVEL_GFXIP_6 = 1 @@ -346,6 +350,8 @@ SQTT_GFXIP_LEVEL_GFXIP_9 = 5 SQTT_GFXIP_LEVEL_GFXIP_10_1 = 7 SQTT_GFXIP_LEVEL_GFXIP_10_3 = 9 SQTT_GFXIP_LEVEL_GFXIP_11_0 = 12 +SQTT_GFXIP_LEVEL_GFXIP_11_5 = 13 +SQTT_GFXIP_LEVEL_GFXIP_12 = 16 sqtt_gfxip_level = ctypes.c_uint32 # enum # values for enumeration 'sqtt_memory_type' @@ -806,12 +812,16 @@ elf_gfxip_level__enumvalues = { 51: 'EF_AMDGPU_MACH_AMDGCN_GFX1010', 54: 'EF_AMDGPU_MACH_AMDGCN_GFX1030', 65: 'EF_AMDGPU_MACH_AMDGCN_GFX1100', + 67: 'EF_AMDGPU_MACH_AMDGCN_GFX1150', + 78: 'EF_AMDGPU_MACH_AMDGCN_GFX1200', } EF_AMDGPU_MACH_AMDGCN_GFX801 = 40 EF_AMDGPU_MACH_AMDGCN_GFX900 = 44 EF_AMDGPU_MACH_AMDGCN_GFX1010 = 51 EF_AMDGPU_MACH_AMDGCN_GFX1030 = 54 EF_AMDGPU_MACH_AMDGCN_GFX1100 = 65 +EF_AMDGPU_MACH_AMDGCN_GFX1150 = 67 +EF_AMDGPU_MACH_AMDGCN_GFX1200 = 78 elf_gfxip_level = ctypes.c_uint32 # enum class struct_sqtt_file_chunk_spm_db(Structure): pass @@ -1607,7 +1617,8 @@ __all__ = \ 'ApiCmdUpdateBuffer', 'ApiCmdWaitEvents', 'ApiCmdWriteTimestamp', 'ApiInvalid', 'ApiRayTracingSeparateCompiled', 'EF_AMDGPU_MACH_AMDGCN_GFX1010', 'EF_AMDGPU_MACH_AMDGCN_GFX1030', - 'EF_AMDGPU_MACH_AMDGCN_GFX1100', 'EF_AMDGPU_MACH_AMDGCN_GFX801', + 'EF_AMDGPU_MACH_AMDGCN_GFX1100', 'EF_AMDGPU_MACH_AMDGCN_GFX1150', + 'EF_AMDGPU_MACH_AMDGCN_GFX1200', 'EF_AMDGPU_MACH_AMDGCN_GFX801', 'EF_AMDGPU_MACH_AMDGCN_GFX900', 'EventCmdBlitImage', 'EventCmdBuildAccelerationStructuresIndirectKHR', 'EventCmdBuildAccelerationStructuresKHR', @@ -1671,7 +1682,8 @@ __all__ = \ 'SQTT_FILE_CHUNK_TYPE_SQTT_DESC', 'SQTT_FILE_MAGIC_NUMBER', 'SQTT_FILE_VERSION_MAJOR', 'SQTT_FILE_VERSION_MINOR', 'SQTT_GFXIP_LEVEL_GFXIP_10_1', 'SQTT_GFXIP_LEVEL_GFXIP_10_3', - 'SQTT_GFXIP_LEVEL_GFXIP_11_0', 'SQTT_GFXIP_LEVEL_GFXIP_6', + 'SQTT_GFXIP_LEVEL_GFXIP_11_0', 'SQTT_GFXIP_LEVEL_GFXIP_11_5', + 'SQTT_GFXIP_LEVEL_GFXIP_12', 'SQTT_GFXIP_LEVEL_GFXIP_6', 'SQTT_GFXIP_LEVEL_GFXIP_7', 'SQTT_GFXIP_LEVEL_GFXIP_8', 'SQTT_GFXIP_LEVEL_GFXIP_8_1', 'SQTT_GFXIP_LEVEL_GFXIP_9', 'SQTT_GFXIP_LEVEL_NONE', 'SQTT_GPU_NAME_MAX_SIZE', @@ -1697,9 +1709,9 @@ __all__ = \ 'SQTT_QUEUE_TYPE_COMPUTE', 'SQTT_QUEUE_TYPE_DMA', 'SQTT_QUEUE_TYPE_UNIVERSAL', 'SQTT_QUEUE_TYPE_UNKNOWN', 'SQTT_SA_PER_SE', 'SQTT_VERSION_2_2', 'SQTT_VERSION_2_3', - 'SQTT_VERSION_2_4', 'SQTT_VERSION_3_2', 'SQTT_VERSION_NONE', - 'UserEventObjectName', 'UserEventPop', 'UserEventPush', - 'UserEventTrigger', 'elf_gfxip_level', + 'SQTT_VERSION_2_4', 'SQTT_VERSION_3_2', 'SQTT_VERSION_3_3', + 'SQTT_VERSION_NONE', 'UserEventObjectName', 'UserEventPop', + 'UserEventPush', 'UserEventTrigger', 'elf_gfxip_level', 'rgp_sqtt_marker_event_type', 'rgp_sqtt_marker_general_api_type', 'rgp_sqtt_marker_identifier', 'rgp_sqtt_marker_user_event_type', 'sqtt_api_type', 'sqtt_engine_type', diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index b47ee3259f..d89eaff8b6 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -130,8 +130,9 @@ class AMDComputeQueue(HWQueue): self.wreg(self.gc.regSQ_THREAD_TRACE_USERDATA_2, *data_ints[i:i+2]) def sqtt_config(self, tracing:bool): - self.wreg(self.gc.regSQ_THREAD_TRACE_CTRL, draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1, - rt_freq=self.soc.SQ_TT_RT_FREQ_4096_CLK, util_timer=self.soc.SQ_TT_UTIL_TIMER_250_CLK, mode=int(tracing)) + trace_ctrl = {'rt_freq': self.soc.SQ_TT_RT_FREQ_4096_CLK} if self.dev.target < (12,0,0) else {} + self.wreg(self.gc.regSQ_THREAD_TRACE_CTRL, draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1, util_timer=1, + mode=int(tracing), **trace_ctrl) # Magic values from mesa/src/amd/vulkan/radv_sqtt.c:radv_emit_spi_config_cntl and src/amd/common/ac_sqtt.c:ac_sqtt_emit_start def sqtt_start(self, buf0s:list[HCQBuffer], se_mask:int): @@ -140,24 +141,35 @@ class AMDComputeQueue(HWQueue): # One buffer for one SE, mesa does it with a single buffer and ac_sqtt_get_data_offset, but this is simpler and should work just as well for se in range(len(buf0s)): self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, instance_broadcast_writes=1) - buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr>>12) - self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, base_hi=buf0_hi, size=buf0s[se].size>>12) - self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE, base_lo=buf0_lo) + buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr >> 12) + if self.dev.target >= (12,0,0): + self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, size=buf0s[se].size >> 12) + self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE_LO, base_lo=buf0_lo) + self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE_HI, base_hi=buf0_hi) + else: + self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, base_hi=buf0_hi, size=buf0s[se].size >> 12) + self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE, base_lo=buf0_lo) # NOTE: SQTT can only trace instructions on one simd per se, this selects first simd in first wgp in first sa. # For RGP to display instruction trace it has to see it on first SE. Howerver ACE/MEC/whatever does the dispatching starting with second se, # and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the last cu, but # sometimes not, especially if the kernel has more than one wavefront which means that kernels with small global size might get unlucky and # be dispatched on something else and not be seen in instruction tracing tab. You can force the wavefronts of a kernel to be dispatched on the # CUs you want to by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE and trace even kernels that only have one wavefront. - self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0) + cs_wtype = (1 << 6) if self.dev.target >= (12,0,0) else self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT + self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=cs_wtype, simd_sel=0, wgp_sel=0, sa_sel=0) reg_include = self.soc.SQ_TT_TOKEN_MASK_SQDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_SHDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \ self.soc.SQ_TT_TOKEN_MASK_COMP_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT - token_exclude = 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT + token_exclude = (1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT) if self.dev.target < (12,0,0) else 0 + + # disable tracing if not (se_mask >> se) & 0b1: - token_exclude |= 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \ + # gfx12 doesn't have enums with all fields, so it's hardcoded, but it's the same as gfx11. + token_exclude |= (1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \ 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \ - 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT - self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_include=reg_include, token_exclude=token_exclude, bop_events_token_include=1) + 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT) if self.dev.target < (12,0,0) else 0x927 + + token_mask = {} if self.dev.target < (12,0,0) else {'exclude_barrier_wait': 1} + self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_include=reg_include, token_exclude=token_exclude, bop_events_token_include=1, **token_mask) # Enable SQTT self.sqtt_config(tracing=True) # Restore global broadcasting @@ -178,9 +190,6 @@ class AMDComputeQueue(HWQueue): # Wait for FINISH_PENDING==0 self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ), self.gc.regSQ_THREAD_TRACE_STATUS.addr[0], 0, 0, self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('finish_pending'), 4) - # Wait for FINISH_DONE!=0 - self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_NEQ), - self.gc.regSQ_THREAD_TRACE_STATUS.addr[0], 0, 0, self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('finish_done'), 4) # Disable SQTT self.sqtt_config(tracing=False) # Wait for BUSY==0 @@ -804,7 +813,7 @@ class AMDDevice(HCQCompiled): # SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them) self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0)) if self.sqtt_enabled: - if self.target[0] != 11: raise RuntimeError(f'SQ Thread Tracing is not supported on gc:{self.target}') + if self.target[0] < 11: raise RuntimeError(f'SQ Thread Tracing is not supported on gc:{self.target}') if not self.is_am() and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000: raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use AMD_IFACE=PCI or add " f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n" @@ -872,7 +881,7 @@ class AMDDevice(HCQCompiled): self.synchronize() if DEBUG >= 2: print(f'{self.device}: Saving SQTT in profile...') for i,buf0 in enumerate(self.sqtt_buffers): - wptr = ((struct.unpack('= 2: print(f'\t{self.device}: SE {i} blob size {wptr:#x}') assert wptr >= 0 and wptr <= buf0.size, f"{wptr} > {buf0.size}, should never happen" # When sqtt buffer overflows, wptr stops at the last dword