mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 15:08:02 -05:00
amd: sqtt on gfx12 (#12564)
* amd: sqtt on gfx12 * cleaner * thi * and this * ops * ugh * back * rm this * rm
This commit is contained in:
@@ -435,8 +435,8 @@ generate_sqtt() {
|
||||
-o extra/sqtt/rocprof/rocprof.py
|
||||
fixup extra/sqtt/rocprof/rocprof.py
|
||||
sed -i '1s/^/# pylint: skip-file\n/' extra/sqtt/rocprof/rocprof.py
|
||||
sed -i "s/import ctypes/import ctypes, tinygrad.helpers.fetch as tgfetch/g" extra/sqtt/rocprof/rocprof.py
|
||||
sed -i "s|FunctionFactoryStub()|ctypes.CDLL(str(tgfetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so')))|g" extra/sqtt/rocprof/rocprof.py
|
||||
sed -i "s/import ctypes/import ctypes\nfrom tinygrad.helpers import fetch/g" extra/sqtt/rocprof/rocprof.py
|
||||
sed -i "s|FunctionFactoryStub()|ctypes.CDLL(str(fetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so')))|g" extra/sqtt/rocprof/rocprof.py
|
||||
}
|
||||
|
||||
generate_webgpu() {
|
||||
|
||||
@@ -156,6 +156,9 @@ class RGP:
|
||||
sqtt_events = [x for x in profile if isinstance(x, ProfileSQTTEvent) and x.device == device_event.device]
|
||||
if len(sqtt_events) == 0: raise RuntimeError(f"Device {device_event.device} doesn't contain SQTT data")
|
||||
device_props = sqtt_events[0].props
|
||||
gfx_ver = device_props['gfx_target_version'] // 10000
|
||||
gfx_iplvl = getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}_{(device_props['gfx_target_version']//100)%100}",
|
||||
getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}", None))
|
||||
sqtt_itrace_enabled = any([event.itrace for event in sqtt_events])
|
||||
sqtt_itrace_masked = not all_same([event.itrace for event in sqtt_events])
|
||||
sqtt_itrace_se_mask = functools.reduce(lambda a,b: a|b, [int(event.itrace) << event.se for event in sqtt_events], 0) if sqtt_itrace_masked else 0
|
||||
@@ -193,7 +196,7 @@ class RGP:
|
||||
flags=0,
|
||||
trace_shader_core_clock=0x93f05080,
|
||||
trace_memory_clock=0x4a723a40,
|
||||
device_id={110000: 0x744c, 110003: 0x7480}[device_props['gfx_target_version']],
|
||||
device_id={110000: 0x744c, 110003: 0x7480, 120001: 0x7550}[device_props['gfx_target_version']],
|
||||
device_revision_id=0xc8,
|
||||
vgprs_per_simd=1536,
|
||||
sgprs_per_simd=128*16,
|
||||
@@ -207,7 +210,7 @@ class RGP:
|
||||
sgpr_alloc_granularity=128,
|
||||
hardware_contexts=8,
|
||||
gpu_type=sqtt.SQTT_GPU_TYPE_DISCRETE,
|
||||
gfxip_level=sqtt.SQTT_GFXIP_LEVEL_GFXIP_11_0,
|
||||
gfxip_level=gfx_iplvl,
|
||||
gpu_index=0,
|
||||
gds_size=0,
|
||||
gds_per_shader_engine=0,
|
||||
@@ -258,7 +261,7 @@ class RGP:
|
||||
major_version=0, minor_version=2,
|
||||
),
|
||||
shader_engine_index=sqtt_event.se,
|
||||
sqtt_version=sqtt.SQTT_VERSION_3_2,
|
||||
sqtt_version={11: sqtt.SQTT_VERSION_3_2, 12: sqtt.SQTT_VERSION_3_3}.get(gfx_ver),
|
||||
_0=sqtt.union_sqtt_file_chunk_sqtt_desc_0(
|
||||
v1=sqtt.struct_sqtt_file_chunk_sqtt_desc_0_v1(
|
||||
instrumentation_spec_version=1,
|
||||
|
||||
@@ -20,14 +20,15 @@ class InstInfo:
|
||||
class _ROCParseCtx:
|
||||
def __init__(self, sqtt_evs:list[ProfileSQTTEvent], prog_evs:list[ProfileProgramEvent]):
|
||||
self.sqtt_evs, self.prog_evs = iter(sqtt_evs), prog_evs
|
||||
self.wave_events = {}
|
||||
self.wave_events, self.disasms, self.addr2prg = {}, {}, {}
|
||||
|
||||
for prog in prog_evs:
|
||||
for addr, info in comgr_get_address_table(prog.lib).items():
|
||||
self.disasms[prog.base + addr] = info
|
||||
self.addr2prg[prog.base + addr] = prog
|
||||
|
||||
def next_sqtt(self): return next(self.sqtt_evs, None)
|
||||
def find_program(self, idx): return self.prog_evs[idx]
|
||||
def get_instr_info(self, idx, exec_addr): return self.disasm_program(idx)[exec_addr - self.find_program(idx).base]
|
||||
|
||||
@functools.lru_cache(None)
|
||||
def disasm_program(self, idx): return comgr_get_address_table(self.find_program(idx).lib)
|
||||
def find_program(self, addr): return self.addr2prg[addr]
|
||||
|
||||
def on_occupancy_ev(self, ev):
|
||||
if DEBUG >= 4: print("OCC", ev.time, ev.cu, ev.simd, ev.wave_id, ev.start)
|
||||
@@ -39,10 +40,10 @@ class _ROCParseCtx:
|
||||
for j in range(ev.instructions_size):
|
||||
inst_ev = ev.instructions_array[j]
|
||||
inst_typ = rocprof.rocprofiler_thread_trace_decoder_inst_category_t__enumvalues[inst_ev.category]
|
||||
asm.setdefault(inst_ev.pc.address, InstInfo(typ=inst_typ, inst=self.get_instr_info(inst_ev.pc.code_object_id, inst_ev.pc.address)[0]))
|
||||
asm.setdefault(inst_ev.pc.address, InstInfo(typ=inst_typ, inst=self.disasms[inst_ev.pc.address][0]))
|
||||
asm[inst_ev.pc.address].on_ev(inst_ev)
|
||||
|
||||
self.wave_events[(self.find_program(ev.instructions_array[0].pc.code_object_id).name, ev.wave_id, ev.cu, ev.simd)] = asm
|
||||
self.wave_events[(self.find_program(ev.instructions_array[0].pc.address).name, ev.wave_id, ev.cu, ev.simd)] = asm
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
@@ -78,7 +79,7 @@ if __name__ == "__main__":
|
||||
|
||||
@rocprof.rocprof_trace_decoder_isa_callback_t
|
||||
def isa_cb(instr_ptr, mem_size_ptr, size_ptr, pc, data_ptr):
|
||||
instr, mem_size_ptr[0] = ROCParseCtx.get_instr_info(pc.code_object_id, pc.address)
|
||||
instr, mem_size_ptr[0] = ROCParseCtx.disasms[pc.address]
|
||||
|
||||
# this is the number of bytes to next instruction, set to 0 for end_pgm
|
||||
if instr == "s_endpgm": mem_size_ptr[0] = 0
|
||||
|
||||
@@ -7,7 +7,8 @@
|
||||
# POINTER_SIZE is: 8
|
||||
# LONGDOUBLE_SIZE is: 16
|
||||
#
|
||||
import ctypes, tinygrad.helpers.fetch as tgfetch
|
||||
import ctypes
|
||||
from tinygrad.helpers import fetch
|
||||
|
||||
|
||||
class AsDictMixin:
|
||||
@@ -155,7 +156,7 @@ class FunctionFactoryStub:
|
||||
# You can either re-run clang2py with -l /path/to/library.so
|
||||
# Or manually fix this by commenting out the ctypes.CDLL loading
|
||||
_libraries = {}
|
||||
_libraries['FIXME_STUB'] = ctypes.CDLL(str(tgfetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so'))) # ctypes.CDLL('FIXME_STUB')
|
||||
_libraries['FIXME_STUB'] = ctypes.CDLL(str(fetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so'))) # ctypes.CDLL('FIXME_STUB')
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -43,6 +43,7 @@ enum sqtt_version
|
||||
SQTT_VERSION_2_3 = 0x6, /* GFX9 */
|
||||
SQTT_VERSION_2_4 = 0x7, /* GFX10+ */
|
||||
SQTT_VERSION_3_2 = 0xb, /* GFX11+ */
|
||||
SQTT_VERSION_3_3 = 0xc, /* GFX12+ */
|
||||
};
|
||||
|
||||
enum sqtt_file_chunk_type
|
||||
@@ -144,6 +145,8 @@ enum sqtt_gfxip_level
|
||||
SQTT_GFXIP_LEVEL_GFXIP_10_1 = 0x7,
|
||||
SQTT_GFXIP_LEVEL_GFXIP_10_3 = 0x9,
|
||||
SQTT_GFXIP_LEVEL_GFXIP_11_0 = 0xc,
|
||||
SQTT_GFXIP_LEVEL_GFXIP_11_5 = 0xd,
|
||||
SQTT_GFXIP_LEVEL_GFXIP_12 = 0x10,
|
||||
};
|
||||
|
||||
enum sqtt_memory_type
|
||||
@@ -427,6 +430,8 @@ enum elf_gfxip_level
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x04e,
|
||||
};
|
||||
|
||||
struct sqtt_file_chunk_spm_db {
|
||||
|
||||
@@ -174,12 +174,14 @@ sqtt_version__enumvalues = {
|
||||
6: 'SQTT_VERSION_2_3',
|
||||
7: 'SQTT_VERSION_2_4',
|
||||
11: 'SQTT_VERSION_3_2',
|
||||
12: 'SQTT_VERSION_3_3',
|
||||
}
|
||||
SQTT_VERSION_NONE = 0
|
||||
SQTT_VERSION_2_2 = 5
|
||||
SQTT_VERSION_2_3 = 6
|
||||
SQTT_VERSION_2_4 = 7
|
||||
SQTT_VERSION_3_2 = 11
|
||||
SQTT_VERSION_3_3 = 12
|
||||
sqtt_version = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'sqtt_file_chunk_type'
|
||||
@@ -336,6 +338,8 @@ sqtt_gfxip_level__enumvalues = {
|
||||
7: 'SQTT_GFXIP_LEVEL_GFXIP_10_1',
|
||||
9: 'SQTT_GFXIP_LEVEL_GFXIP_10_3',
|
||||
12: 'SQTT_GFXIP_LEVEL_GFXIP_11_0',
|
||||
13: 'SQTT_GFXIP_LEVEL_GFXIP_11_5',
|
||||
16: 'SQTT_GFXIP_LEVEL_GFXIP_12',
|
||||
}
|
||||
SQTT_GFXIP_LEVEL_NONE = 0
|
||||
SQTT_GFXIP_LEVEL_GFXIP_6 = 1
|
||||
@@ -346,6 +350,8 @@ SQTT_GFXIP_LEVEL_GFXIP_9 = 5
|
||||
SQTT_GFXIP_LEVEL_GFXIP_10_1 = 7
|
||||
SQTT_GFXIP_LEVEL_GFXIP_10_3 = 9
|
||||
SQTT_GFXIP_LEVEL_GFXIP_11_0 = 12
|
||||
SQTT_GFXIP_LEVEL_GFXIP_11_5 = 13
|
||||
SQTT_GFXIP_LEVEL_GFXIP_12 = 16
|
||||
sqtt_gfxip_level = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'sqtt_memory_type'
|
||||
@@ -806,12 +812,16 @@ elf_gfxip_level__enumvalues = {
|
||||
51: 'EF_AMDGPU_MACH_AMDGCN_GFX1010',
|
||||
54: 'EF_AMDGPU_MACH_AMDGCN_GFX1030',
|
||||
65: 'EF_AMDGPU_MACH_AMDGCN_GFX1100',
|
||||
67: 'EF_AMDGPU_MACH_AMDGCN_GFX1150',
|
||||
78: 'EF_AMDGPU_MACH_AMDGCN_GFX1200',
|
||||
}
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX801 = 40
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX900 = 44
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1010 = 51
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1030 = 54
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1100 = 65
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1150 = 67
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1200 = 78
|
||||
elf_gfxip_level = ctypes.c_uint32 # enum
|
||||
class struct_sqtt_file_chunk_spm_db(Structure):
|
||||
pass
|
||||
@@ -1607,7 +1617,8 @@ __all__ = \
|
||||
'ApiCmdUpdateBuffer', 'ApiCmdWaitEvents', 'ApiCmdWriteTimestamp',
|
||||
'ApiInvalid', 'ApiRayTracingSeparateCompiled',
|
||||
'EF_AMDGPU_MACH_AMDGCN_GFX1010', 'EF_AMDGPU_MACH_AMDGCN_GFX1030',
|
||||
'EF_AMDGPU_MACH_AMDGCN_GFX1100', 'EF_AMDGPU_MACH_AMDGCN_GFX801',
|
||||
'EF_AMDGPU_MACH_AMDGCN_GFX1100', 'EF_AMDGPU_MACH_AMDGCN_GFX1150',
|
||||
'EF_AMDGPU_MACH_AMDGCN_GFX1200', 'EF_AMDGPU_MACH_AMDGCN_GFX801',
|
||||
'EF_AMDGPU_MACH_AMDGCN_GFX900', 'EventCmdBlitImage',
|
||||
'EventCmdBuildAccelerationStructuresIndirectKHR',
|
||||
'EventCmdBuildAccelerationStructuresKHR',
|
||||
@@ -1671,7 +1682,8 @@ __all__ = \
|
||||
'SQTT_FILE_CHUNK_TYPE_SQTT_DESC', 'SQTT_FILE_MAGIC_NUMBER',
|
||||
'SQTT_FILE_VERSION_MAJOR', 'SQTT_FILE_VERSION_MINOR',
|
||||
'SQTT_GFXIP_LEVEL_GFXIP_10_1', 'SQTT_GFXIP_LEVEL_GFXIP_10_3',
|
||||
'SQTT_GFXIP_LEVEL_GFXIP_11_0', 'SQTT_GFXIP_LEVEL_GFXIP_6',
|
||||
'SQTT_GFXIP_LEVEL_GFXIP_11_0', 'SQTT_GFXIP_LEVEL_GFXIP_11_5',
|
||||
'SQTT_GFXIP_LEVEL_GFXIP_12', 'SQTT_GFXIP_LEVEL_GFXIP_6',
|
||||
'SQTT_GFXIP_LEVEL_GFXIP_7', 'SQTT_GFXIP_LEVEL_GFXIP_8',
|
||||
'SQTT_GFXIP_LEVEL_GFXIP_8_1', 'SQTT_GFXIP_LEVEL_GFXIP_9',
|
||||
'SQTT_GFXIP_LEVEL_NONE', 'SQTT_GPU_NAME_MAX_SIZE',
|
||||
@@ -1697,9 +1709,9 @@ __all__ = \
|
||||
'SQTT_QUEUE_TYPE_COMPUTE', 'SQTT_QUEUE_TYPE_DMA',
|
||||
'SQTT_QUEUE_TYPE_UNIVERSAL', 'SQTT_QUEUE_TYPE_UNKNOWN',
|
||||
'SQTT_SA_PER_SE', 'SQTT_VERSION_2_2', 'SQTT_VERSION_2_3',
|
||||
'SQTT_VERSION_2_4', 'SQTT_VERSION_3_2', 'SQTT_VERSION_NONE',
|
||||
'UserEventObjectName', 'UserEventPop', 'UserEventPush',
|
||||
'UserEventTrigger', 'elf_gfxip_level',
|
||||
'SQTT_VERSION_2_4', 'SQTT_VERSION_3_2', 'SQTT_VERSION_3_3',
|
||||
'SQTT_VERSION_NONE', 'UserEventObjectName', 'UserEventPop',
|
||||
'UserEventPush', 'UserEventTrigger', 'elf_gfxip_level',
|
||||
'rgp_sqtt_marker_event_type', 'rgp_sqtt_marker_general_api_type',
|
||||
'rgp_sqtt_marker_identifier', 'rgp_sqtt_marker_user_event_type',
|
||||
'sqtt_api_type', 'sqtt_engine_type',
|
||||
|
||||
@@ -130,8 +130,9 @@ class AMDComputeQueue(HWQueue):
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_USERDATA_2, *data_ints[i:i+2])
|
||||
|
||||
def sqtt_config(self, tracing:bool):
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_CTRL, draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1,
|
||||
rt_freq=self.soc.SQ_TT_RT_FREQ_4096_CLK, util_timer=self.soc.SQ_TT_UTIL_TIMER_250_CLK, mode=int(tracing))
|
||||
trace_ctrl = {'rt_freq': self.soc.SQ_TT_RT_FREQ_4096_CLK} if self.dev.target < (12,0,0) else {}
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_CTRL, draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1, util_timer=1,
|
||||
mode=int(tracing), **trace_ctrl)
|
||||
|
||||
# Magic values from mesa/src/amd/vulkan/radv_sqtt.c:radv_emit_spi_config_cntl and src/amd/common/ac_sqtt.c:ac_sqtt_emit_start
|
||||
def sqtt_start(self, buf0s:list[HCQBuffer], se_mask:int):
|
||||
@@ -140,24 +141,35 @@ class AMDComputeQueue(HWQueue):
|
||||
# One buffer for one SE, mesa does it with a single buffer and ac_sqtt_get_data_offset, but this is simpler and should work just as well
|
||||
for se in range(len(buf0s)):
|
||||
self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, instance_broadcast_writes=1)
|
||||
buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr>>12)
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, base_hi=buf0_hi, size=buf0s[se].size>>12)
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE, base_lo=buf0_lo)
|
||||
buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr >> 12)
|
||||
if self.dev.target >= (12,0,0):
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, size=buf0s[se].size >> 12)
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE_LO, base_lo=buf0_lo)
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE_HI, base_hi=buf0_hi)
|
||||
else:
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, base_hi=buf0_hi, size=buf0s[se].size >> 12)
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE, base_lo=buf0_lo)
|
||||
# NOTE: SQTT can only trace instructions on one simd per se, this selects first simd in first wgp in first sa.
|
||||
# For RGP to display instruction trace it has to see it on first SE. However ACE/MEC/whatever does the dispatching starting with second se,
|
||||
# and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the last cu, but
|
||||
# sometimes not, especially if the kernel has more than one wavefront which means that kernels with small global size might get unlucky and
|
||||
# be dispatched on something else and not be seen in instruction tracing tab. You can force the wavefronts of a kernel to be dispatched on the
|
||||
# CUs you want to by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE<x> and trace even kernels that only have one wavefront.
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0)
|
||||
cs_wtype = (1 << 6) if self.dev.target >= (12,0,0) else self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=cs_wtype, simd_sel=0, wgp_sel=0, sa_sel=0)
|
||||
reg_include = self.soc.SQ_TT_TOKEN_MASK_SQDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_SHDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
|
||||
self.soc.SQ_TT_TOKEN_MASK_COMP_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT
|
||||
token_exclude = 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT
|
||||
token_exclude = (1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT) if self.dev.target < (12,0,0) else 0
|
||||
|
||||
# disable tracing
|
||||
if not (se_mask >> se) & 0b1:
|
||||
token_exclude |= 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
|
||||
# gfx12 doesn't have enums with all fields, so it's hardcoded, but it's the same as gfx11.
|
||||
token_exclude |= (1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
|
||||
1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \
|
||||
1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_include=reg_include, token_exclude=token_exclude, bop_events_token_include=1)
|
||||
1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT) if self.dev.target < (12,0,0) else 0x927
|
||||
|
||||
token_mask = {} if self.dev.target < (12,0,0) else {'exclude_barrier_wait': 1}
|
||||
self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_include=reg_include, token_exclude=token_exclude, bop_events_token_include=1, **token_mask)
|
||||
# Enable SQTT
|
||||
self.sqtt_config(tracing=True)
|
||||
# Restore global broadcasting
|
||||
@@ -178,9 +190,6 @@ class AMDComputeQueue(HWQueue):
|
||||
# Wait for FINISH_PENDING==0
|
||||
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
|
||||
self.gc.regSQ_THREAD_TRACE_STATUS.addr[0], 0, 0, self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('finish_pending'), 4)
|
||||
# Wait for FINISH_DONE!=0
|
||||
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_NEQ),
|
||||
self.gc.regSQ_THREAD_TRACE_STATUS.addr[0], 0, 0, self.gc.regSQ_THREAD_TRACE_STATUS.fields_mask('finish_done'), 4)
|
||||
# Disable SQTT
|
||||
self.sqtt_config(tracing=False)
|
||||
# Wait for BUSY==0
|
||||
@@ -804,7 +813,7 @@ class AMDDevice(HCQCompiled):
|
||||
# SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them)
|
||||
self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
|
||||
if self.sqtt_enabled:
|
||||
if self.target[0] != 11: raise RuntimeError(f'SQ Thread Tracing is not supported on gc:{self.target}')
|
||||
if self.target[0] < 11: raise RuntimeError(f'SQ Thread Tracing is not supported on gc:{self.target}')
|
||||
if not self.is_am() and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000:
|
||||
raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use AMD_IFACE=PCI or add "
|
||||
f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n"
|
||||
@@ -872,7 +881,7 @@ class AMDDevice(HCQCompiled):
|
||||
self.synchronize()
|
||||
if DEBUG >= 2: print(f'{self.device}: Saving SQTT in profile...')
|
||||
for i,buf0 in enumerate(self.sqtt_buffers):
|
||||
wptr = ((struct.unpack('<I', wptrs[i*4:i*4+4])[0] & 0x1FFFFFFF) - ((buf0.va_addr//32) & 0x1FFFFFFF)) * 32
|
||||
wptr = ((struct.unpack('<I', wptrs[i*4:i*4+4])[0] & 0x1FFFFFFF) - (((buf0.va_addr//32) & 0x1FFFFFFF) if self.target < (12,0,0) else 0)) * 32
|
||||
if DEBUG >= 2: print(f'\t{self.device}: SE {i} blob size {wptr:#x}')
|
||||
assert wptr >= 0 and wptr <= buf0.size, f"{wptr} > {buf0.size}, should never happen"
|
||||
# When sqtt buffer overflows, wptr stops at the last dword
|
||||
|
||||
Reference in New Issue
Block a user