rocprof: use llvm disasm (#13077)

* rocprof: use llvm disasm

* rm
This commit is contained in:
nimlgen
2025-11-03 23:58:58 +08:00
committed by GitHub
parent 27d42fd575
commit dfde3f54d9
5 changed files with 37 additions and 76 deletions

View File

@@ -1,68 +0,0 @@
import ctypes
from dataclasses import dataclass
import tinygrad.runtime.autogen.comgr as comgr
from tinygrad.runtime.support.compiler_amd import check
@dataclass
class InstrCtx:
  """Mutable scratch record shared with the disassembler callbacks.

  `pc` holds the address of the instruction being disassembled and `inst`
  receives the text produced for that instruction.
  """
  pc: int = 0
  inst: str = ""
@comgr.amd_comgr_create_disassembly_info.argtypes[2]
def instr_cb(text, user_data):
  # user_data is a void* to a ctypes.py_object; unwrap it to get the Python context object
  wrapped = ctypes.cast(user_data, ctypes.POINTER(ctypes.py_object))
  target = wrapped.contents.value
  # copy the C string out, decode it leniently and store the trimmed text on the context
  raw = ctypes.string_at(text)
  target.inst = raw.decode("utf-8","replace").strip()
  return comgr.AMD_COMGR_STATUS_SUCCESS
# no-op callback: accepts whatever arguments it is called with and always reports success
@comgr.amd_comgr_create_disassembly_info.argtypes[3]
def addr_cb(*_ignored):
  return comgr.AMD_COMGR_STATUS_SUCCESS
# Disassemble every function symbol of the code object `lib` with comgr.
# Returns a dict keyed by the instruction's ELF virtual address (symbol value + offset inside the symbol);
# each value is (disassembled text, instruction size in bytes).
def comgr_get_address_table(lib:bytes) -> dict[int, tuple[str, int]]:
# wrap the raw bytes in a comgr "executable" data object; lib_buf is the ctypes copy comgr reads from
check(comgr.amd_comgr_create_data(comgr.AMD_COMGR_DATA_KIND_EXECUTABLE, ctypes.byref(data_src:=comgr.amd_comgr_data_t())))
lib_buf = ctypes.create_string_buffer(lib, len(lib))
check(comgr.amd_comgr_set_data(data_src, len(lib), lib_buf))
# query the ISA name into a 128-byte buffer; it is handed to the disassembler below
check(comgr.amd_comgr_get_data_isa_name(data_src, isa_sz:=ctypes.c_size_t(128), isa:=(ctypes.c_char*isa_sz.value)()))
# memory-read callback: addresses are absolute addresses inside lib_buf
@comgr.amd_comgr_create_disassembly_info.argtypes[1]
def memory_cb(from_addr, to, size, _):
base, buf_len = ctypes.addressof(lib_buf), len(lib_buf)
start = int(from_addr) - base
# reads starting outside the buffer copy nothing
if start < 0 or start >= buf_len: return 0
# clamp the copy so it never runs past the end of lib_buf, and return how many bytes were copied
ctypes.memmove(to, base + start, n:=min(int(size), buf_len - start))
return n
info_src = comgr.amd_comgr_disassembly_info_t()
check(comgr.amd_comgr_create_disassembly_info(ctypes.cast(isa, ctypes.POINTER(ctypes.c_char)), memory_cb, instr_cb, addr_cb, info_src))
# per-symbol callback: udata is a void* to a py_object wrapping the result dict
@comgr.amd_comgr_iterate_symbols.argtypes[1]
def sym_callback(sym, udata):
check(comgr.amd_comgr_symbol_get_info(sym, comgr.AMD_COMGR_SYMBOL_INFO_TYPE, ctypes.byref(sym_type:=ctypes.c_int())))
# only function symbols are disassembled
if sym_type.value != comgr.AMD_COMGR_SYMBOL_TYPE_FUNC: return comgr.AMD_COMGR_STATUS_SUCCESS
check(comgr.amd_comgr_symbol_get_info(sym, comgr.AMD_COMGR_SYMBOL_INFO_VALUE, ctypes.byref(vaddr:=ctypes.c_uint64())))
check(comgr.amd_comgr_symbol_get_info(sym, comgr.AMD_COMGR_SYMBOL_INFO_SIZE, ctypes.byref(size:=ctypes.c_uint64())))
# translate the symbol's virtual address into a byte offset inside the code object
check(comgr.amd_comgr_map_elf_virtual_address_to_code_object_offset(data_src, vaddr.value, ctypes.byref(offset:=ctypes.c_uint64()),
ctypes.byref(ctypes.c_uint64()), ctypes.byref(nobits:=ctypes.c_bool())))
# NOTE(review): the nobits flag is passed to check() as if it were a status code; presumably check() raises on a
# non-zero value, so this rejects symbols whose bytes are not present in the file -- confirm against check()
check(nobits.value)
# walk the symbol's bytes using absolute addresses inside lib_buf (the address space memory_cb serves)
base = ctypes.addressof(lib_buf)
pc = base + offset.value
end = pc + size.value
addr_table = ctypes.cast(udata, ctypes.POINTER(ctypes.py_object)).contents.value
# ctx is handed to comgr as the user-data pointer; instr_cb writes the decoded text into ctx.inst
instr_ref = ctypes.py_object(ctx:=InstrCtx())
instr_ptr = ctypes.cast(ctypes.pointer(instr_ref), ctypes.c_void_p)
while pc < end:
size_read = ctypes.c_uint64(0)
ctx.pc = pc
st = comgr.amd_comgr_disassemble_instruction(info_src, ctypes.c_uint64(pc), instr_ptr, ctypes.byref(size_read))
if st == comgr.AMD_COMGR_STATUS_SUCCESS and size_read.value:
# rel is the instruction's offset from the start of the symbol
rel = (pc - base) - offset.value
addr_table[vaddr.value + rel] = (ctx.inst, int(size_read.value))
pc += size_read.value
else: # don't inf loop if comgr fails
# record a one-byte placeholder carrying the offending byte value and step forward by one byte
b = ctypes.c_ubyte.from_buffer(lib_buf, pc - base).value
addr_table[vaddr.value + (pc - base - offset.value)] = (f"DISASSEMBLER ISSUE 0x{b:02x}", 1)
pc += 1
return comgr.AMD_COMGR_STATUS_SUCCESS
addr_table:dict[int, tuple[str, int]] = {}
# the result dict travels through comgr as a void* to a py_object and is filled in by sym_callback
check(comgr.amd_comgr_iterate_symbols(data_src, sym_callback, ctypes.cast(ctypes.pointer(ctypes.py_object(addr_table)), ctypes.c_void_p)))
return addr_table

View File

@@ -1,9 +1,33 @@
import ctypes, pathlib, argparse, pickle, re, functools, dataclasses, itertools
from extra.sqtt.rocprof import rocprof
from extra.sqtt.disasm import comgr_get_address_table
from tinygrad.helpers import temp, DEBUG
from tinygrad.device import ProfileEvent, ProfileProgramEvent
from tinygrad.device import ProfileEvent, ProfileDeviceEvent, ProfileProgramEvent
from tinygrad.runtime.ops_amd import ProfileSQTTEvent, ProfilePMCEvent
from tinygrad.runtime.autogen import llvm
from tinygrad.runtime.support.elf import elf_loader
# Keep the first five generated argtypes and declare the last two parameters as plain void pointers,
# so that None (NULL) can be passed for them when the disassembler is created.
llvm.LLVMCreateDisasmCPUFeatures.argtypes = llvm.LLVMCreateDisasmCPUFeatures.argtypes[:5] + [ctypes.c_void_p, ctypes.c_void_p]
def llvm_disasm(arch:str, lib:bytes) -> dict[int, tuple[str, int]]:
  """Disassemble the `.text` section of an AMDGPU ELF using LLVM.

  Args:
    arch: CPU name passed to LLVM when creating the disassembler.
    lib: raw bytes of the ELF program.

  Returns:
    A dict mapping each instruction's address (based on the section's `sh_addr`) to
    `(disassembled text, size in bytes)`. Bytes LLVM cannot decode are recorded as
    `"DISASSEMBLER ISSUE 0x.."` placeholders so the walk always makes progress.

  Raises:
    RuntimeError: if LLVM cannot create a disassembler for `arch`, or the ELF has no `.text` section.
  """
  llvm.LLVMInitializeAMDGPUTargetInfo()
  llvm.LLVMInitializeAMDGPUTargetMC()
  llvm.LLVMInitializeAMDGPUAsmParser()
  llvm.LLVMInitializeAMDGPUDisassembler()
  ctx = llvm.LLVMCreateDisasmCPUFeatures("amdgcn-amd-amdhsa".encode(), arch.encode(), "".encode(), None, 0, None, None)
  # a NULL context would crash inside LLVMDisasmInstruction, so fail loudly here instead
  if not ctx: raise RuntimeError(f"LLVM could not create an AMDGPU disassembler for arch {arch!r}")
  # NOTE(review): ctx is never released; consider disposing it if the binding exposes LLVMDisasmDispose

  image, sections, _ = elf_loader(lib)
  text = next((sh.header for sh in sections if sh.name == ".text"), None)
  if text is None: raise RuntimeError("no .text section found in program ELF")
  off, sz = text.sh_addr, text.sh_size

  # copy the section once; each instruction then gets a zero-copy window into this buffer
  code = (ctypes.c_ubyte * sz).from_buffer_copy(memoryview(image)[off:off+sz])
  addr_table:dict[int, tuple[str, int]] = {}
  out = ctypes.create_string_buffer(128)
  pos = 0
  while pos < sz:
    view = (ctypes.c_ubyte * (sz - pos)).from_buffer(code, pos)
    instr_sz = llvm.LLVMDisasmInstruction(ctx, view, ctypes.c_uint64(len(view)), ctypes.c_uint64(0), out, ctypes.c_size_t(128))
    if instr_sz:
      addr_table[off + pos] = (out.value.decode("utf-8", "replace").strip(), instr_sz)
    else:
      # LLVM returns 0 for undecodable bytes: record a placeholder and skip ahead, otherwise this loop never ends.
      # AMDGPU encodings are multiples of 4 bytes, so step one dword (clamped to what is left).
      instr_sz = min(4, sz - pos)
      addr_table[off + pos] = (f"DISASSEMBLER ISSUE 0x{view[0]:02x}", instr_sz)
    pos += instr_sz
  return addr_table
@dataclasses.dataclass
class InstInfo:
@@ -18,12 +42,12 @@ class InstInfo:
self.hit, self.lat, self.stall = self.hit + 1, self.lat + ev.duration, self.stall + ev.stall
class _ROCParseCtx:
def __init__(self, sqtt_evs:list[ProfileSQTTEvent], prog_evs:list[ProfileProgramEvent]):
self.sqtt_evs, self.prog_evs = iter(sqtt_evs), prog_evs
def __init__(self, dev_evs:dict[str, ProfileDeviceEvent], sqtt_evs:list[ProfileSQTTEvent], prog_evs:list[ProfileProgramEvent]):
self.dev_evs, self.sqtt_evs, self.prog_evs = dev_evs, iter(sqtt_evs), prog_evs
self.wave_events, self.disasms, self.addr2prg = {}, {}, {}
for prog in prog_evs:
for addr, info in comgr_get_address_table(prog.lib).items():
for addr, info in llvm_disasm(dev_evs[prog.device].arch, prog.lib).items():
self.disasms[prog.base + addr] = info
self.addr2prg[prog.base + addr] = prog
@@ -50,13 +74,15 @@ class _ROCParseCtx:
self.wave_events[(self.find_program(ev.instructions_array[0].pc.address).name, ev.wave_id, ev.cu, ev.simd)] = asm
def decode(profile:list[ProfileEvent]) -> _ROCParseCtx:
dev_events:dict[str, ProfileDeviceEvent] = {}
sqtt_events:list[ProfileSQTTEvent] = []
prog_events:list[ProfileProgramEvent] = []
for e in profile:
if isinstance(e, ProfileDeviceEvent): dev_events[e.device] = e
if isinstance(e, ProfileSQTTEvent): sqtt_events.append(e)
if isinstance(e, ProfileProgramEvent) and e.device.startswith("AMD"): prog_events.append(e)
ROCParseCtx = _ROCParseCtx(sqtt_events, prog_events)
ROCParseCtx = _ROCParseCtx(dev_events, sqtt_events, prog_events)
@rocprof.rocprof_trace_decoder_se_data_callback_t
def copy_cb(buf, buf_size, data_ptr):

View File

@@ -54,7 +54,7 @@ atexit.register(lambda: [Device[dn].finalize() for dn in Device._opened_devices]
@dataclass(frozen=True)
class ProfileDeviceEvent(ProfileEvent):
device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0) # noqa: E702
device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); arch:str="" # noqa: E702
@dataclass(frozen=True)
class ProfileProgramEvent(ProfileEvent):
  """Profile event describing one program: its device, name, optional binary and optional base address."""
  device: str
  name: str
  lib: bytes|None
  base: int|None

View File

@@ -982,6 +982,7 @@ class AMDDevice(HCQCompiled):
def on_device_hang(self):
  """Delegate hang handling to the device interface object."""
  self.iface.on_device_hang()
def device_info(self):
  """Report this device's `arch` attribute as its device info."""
  return self.arch
def _at_profile_finalize(self):
if self.sqtt_enabled:
wptrs_buf = self.allocator.alloc(round_up(len(self.sqtt_buffers), 0x1000), BufferSpec(cpu_access=True, nolru=True))

View File

@@ -409,6 +409,8 @@ class HCQCompiled(Compiled, Generic[SignalType]):
for dev in HCQCompiled.peer_groups[pg]: cast(HCQAllocator, dev.allocator).map(alc)
return self.signal_t(base_buf=HCQCompiled.signal_pool[pg].pop(), owner=self, **kwargs)
def device_info(self) -> str:
  """Default device info is an empty string; subclasses override this when they have something to report."""
  return ""
def _at_profile_finalize(self):
self.synchronize() # Expect device to be synchronized
@@ -422,7 +424,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
gpu2cpu_compute_time_diff = statistics.median([_sync(self, self.hw_compute_queue_t) for _ in range(40)])
if self.hw_copy_queue_t is None: gpu2cpu_copy_time_diff = decimal.Decimal(0)
else: gpu2cpu_copy_time_diff = statistics.median([_sync(self, self.hw_copy_queue_t) for _ in range(40)])
Compiled.profile_events += [ProfileDeviceEvent(self.device, gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff)]
Compiled.profile_events += [ProfileDeviceEvent(self.device, gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff, arch=self.device_info())]
def _wrap_timeline_signal(self):
self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1