amd: sqtt on gfx12 (#12564)

* amd: sqtt on gfx12

* cleaner

* thi

* and this

* ops

* ugh

* back

* rm this

* rm
This commit is contained in:
nimlgen
2025-10-10 17:54:14 +08:00
committed by GitHub
parent 95ad047445
commit 89be3590aa
7 changed files with 67 additions and 36 deletions

View File

@@ -156,6 +156,9 @@ class RGP:
sqtt_events = [x for x in profile if isinstance(x, ProfileSQTTEvent) and x.device == device_event.device]
if len(sqtt_events) == 0: raise RuntimeError(f"Device {device_event.device} doesn't contain SQTT data")
device_props = sqtt_events[0].props
gfx_ver = device_props['gfx_target_version'] // 10000
gfx_iplvl = getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}_{(device_props['gfx_target_version']//100)%100}",
getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}", None))
sqtt_itrace_enabled = any([event.itrace for event in sqtt_events])
sqtt_itrace_masked = not all_same([event.itrace for event in sqtt_events])
sqtt_itrace_se_mask = functools.reduce(lambda a,b: a|b, [int(event.itrace) << event.se for event in sqtt_events], 0) if sqtt_itrace_masked else 0
@@ -193,7 +196,7 @@ class RGP:
flags=0,
trace_shader_core_clock=0x93f05080,
trace_memory_clock=0x4a723a40,
device_id={110000: 0x744c, 110003: 0x7480}[device_props['gfx_target_version']],
device_id={110000: 0x744c, 110003: 0x7480, 120001: 0x7550}[device_props['gfx_target_version']],
device_revision_id=0xc8,
vgprs_per_simd=1536,
sgprs_per_simd=128*16,
@@ -207,7 +210,7 @@ class RGP:
sgpr_alloc_granularity=128,
hardware_contexts=8,
gpu_type=sqtt.SQTT_GPU_TYPE_DISCRETE,
gfxip_level=sqtt.SQTT_GFXIP_LEVEL_GFXIP_11_0,
gfxip_level=gfx_iplvl,
gpu_index=0,
gds_size=0,
gds_per_shader_engine=0,
@@ -258,7 +261,7 @@ class RGP:
major_version=0, minor_version=2,
),
shader_engine_index=sqtt_event.se,
sqtt_version=sqtt.SQTT_VERSION_3_2,
sqtt_version={11: sqtt.SQTT_VERSION_3_2, 12: sqtt.SQTT_VERSION_3_3}.get(gfx_ver),
_0=sqtt.union_sqtt_file_chunk_sqtt_desc_0(
v1=sqtt.struct_sqtt_file_chunk_sqtt_desc_0_v1(
instrumentation_spec_version=1,

View File

@@ -20,14 +20,15 @@ class InstInfo:
class _ROCParseCtx:
def __init__(self, sqtt_evs:list[ProfileSQTTEvent], prog_evs:list[ProfileProgramEvent]):
self.sqtt_evs, self.prog_evs = iter(sqtt_evs), prog_evs
self.wave_events = {}
self.wave_events, self.disasms, self.addr2prg = {}, {}, {}
for prog in prog_evs:
for addr, info in comgr_get_address_table(prog.lib).items():
self.disasms[prog.base + addr] = info
self.addr2prg[prog.base + addr] = prog
def next_sqtt(self): return next(self.sqtt_evs, None)
def find_program(self, idx): return self.prog_evs[idx]
def get_instr_info(self, idx, exec_addr): return self.disasm_program(idx)[exec_addr - self.find_program(idx).base]
@functools.lru_cache(None)
def disasm_program(self, idx): return comgr_get_address_table(self.find_program(idx).lib)
def find_program(self, addr): return self.addr2prg[addr]
def on_occupancy_ev(self, ev):
if DEBUG >= 4: print("OCC", ev.time, ev.cu, ev.simd, ev.wave_id, ev.start)
@@ -39,10 +40,10 @@ class _ROCParseCtx:
for j in range(ev.instructions_size):
inst_ev = ev.instructions_array[j]
inst_typ = rocprof.rocprofiler_thread_trace_decoder_inst_category_t__enumvalues[inst_ev.category]
asm.setdefault(inst_ev.pc.address, InstInfo(typ=inst_typ, inst=self.get_instr_info(inst_ev.pc.code_object_id, inst_ev.pc.address)[0]))
asm.setdefault(inst_ev.pc.address, InstInfo(typ=inst_typ, inst=self.disasms[inst_ev.pc.address][0]))
asm[inst_ev.pc.address].on_ev(inst_ev)
self.wave_events[(self.find_program(ev.instructions_array[0].pc.code_object_id).name, ev.wave_id, ev.cu, ev.simd)] = asm
self.wave_events[(self.find_program(ev.instructions_array[0].pc.address).name, ev.wave_id, ev.cu, ev.simd)] = asm
if __name__ == "__main__":
parser = argparse.ArgumentParser()
@@ -78,7 +79,7 @@ if __name__ == "__main__":
@rocprof.rocprof_trace_decoder_isa_callback_t
def isa_cb(instr_ptr, mem_size_ptr, size_ptr, pc, data_ptr):
instr, mem_size_ptr[0] = ROCParseCtx.get_instr_info(pc.code_object_id, pc.address)
instr, mem_size_ptr[0] = ROCParseCtx.disasms[pc.address]
# this is the number of bytes to next instruction, set to 0 for end_pgm
if instr == "s_endpgm": mem_size_ptr[0] = 0

View File

@@ -7,7 +7,8 @@
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes, tinygrad.helpers.fetch as tgfetch
import ctypes
from tinygrad.helpers import fetch
class AsDictMixin:
@@ -155,7 +156,7 @@ class FunctionFactoryStub:
# You can either re-run clang2py with -l /path/to/library.so
# Or manually fix this by commenting out the ctypes.CDLL loading
_libraries = {}
_libraries['FIXME_STUB'] = ctypes.CDLL(str(tgfetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so'))) # ctypes.CDLL('FIXME_STUB')
_libraries['FIXME_STUB'] = ctypes.CDLL(str(fetch('https://github.com/ROCm/rocprof-trace-decoder/raw/5420409ad0963b2d76450add067b9058493ccbd0/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so'))) # ctypes.CDLL('FIXME_STUB')

View File

@@ -43,6 +43,7 @@ enum sqtt_version
SQTT_VERSION_2_3 = 0x6, /* GFX9 */
SQTT_VERSION_2_4 = 0x7, /* GFX10+ */
SQTT_VERSION_3_2 = 0xb, /* GFX11+ */
SQTT_VERSION_3_3 = 0xc, /* GFX12+ */
};
enum sqtt_file_chunk_type
@@ -144,6 +145,8 @@ enum sqtt_gfxip_level
SQTT_GFXIP_LEVEL_GFXIP_10_1 = 0x7,
SQTT_GFXIP_LEVEL_GFXIP_10_3 = 0x9,
SQTT_GFXIP_LEVEL_GFXIP_11_0 = 0xc,
SQTT_GFXIP_LEVEL_GFXIP_11_5 = 0xd,
SQTT_GFXIP_LEVEL_GFXIP_12 = 0x10,
};
enum sqtt_memory_type
@@ -427,6 +430,8 @@ enum elf_gfxip_level
EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043,
EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x04e,
};
struct sqtt_file_chunk_spm_db {