mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 23:18:04 -05:00
@@ -1,68 +0,0 @@
|
||||
import ctypes
|
||||
from dataclasses import dataclass
|
||||
import tinygrad.runtime.autogen.comgr as comgr
|
||||
from tinygrad.runtime.support.compiler_amd import check
|
||||
|
||||
@dataclass
class InstrCtx:
  """Mutable record handed to the disassembler callbacks as user data."""
  # address of the instruction currently being disassembled
  pc: int = 0
  # text most recently delivered for an instruction
  inst: str = ""
|
||||
|
||||
@comgr.amd_comgr_create_disassembly_info.argtypes[2]
def instr_cb(text, user_data):
  """Store the instruction text on the Python object that `user_data` points to.

  `user_data` is treated as a pointer to a `ctypes.py_object`; the wrapped object's `inst` attribute is set to
  the decoded, whitespace-stripped C string at `text`. Always reports success.
  """
  holder = ctypes.cast(user_data, ctypes.POINTER(ctypes.py_object))
  target = holder.contents.value
  raw = ctypes.string_at(text)
  target.inst = raw.decode("utf-8","replace").strip()
  return comgr.AMD_COMGR_STATUS_SUCCESS
|
||||
|
||||
# no-op callback: accepts any arguments and always reports success
@comgr.amd_comgr_create_disassembly_info.argtypes[3]
def addr_cb(*args):
  """Ignore every argument and return the comgr success status."""
  return comgr.AMD_COMGR_STATUS_SUCCESS
|
||||
|
||||
def comgr_get_address_table(lib:bytes) -> dict[int, tuple[str, int]]:
  """Disassemble every function symbol of the code object `lib` through comgr.

  Returns a dict keyed by the symbol's value plus the instruction's offset inside that symbol; each entry is
  `(instruction text, size in bytes)`. When comgr fails on an instruction, a one-byte
  `"DISASSEMBLER ISSUE 0x.."` entry is recorded and decoding resumes at the next byte.
  """
  # wrap the code object bytes in a comgr data handle, backed by a ctypes copy
  data_src = comgr.amd_comgr_data_t()
  check(comgr.amd_comgr_create_data(comgr.AMD_COMGR_DATA_KIND_EXECUTABLE, ctypes.byref(data_src)))
  lib_buf = ctypes.create_string_buffer(lib, len(lib))
  check(comgr.amd_comgr_set_data(data_src, len(lib), lib_buf))

  # ask comgr for the ISA name of the code object (128-byte buffer)
  isa_sz = ctypes.c_size_t(128)
  isa = (ctypes.c_char*isa_sz.value)()
  check(comgr.amd_comgr_get_data_isa_name(data_src, isa_sz, isa))

  @comgr.amd_comgr_create_disassembly_info.argtypes[1]
  def memory_cb(from_addr, to, size, _):
    # serve reads out of lib_buf; an address outside the buffer yields zero bytes
    buf_start, buf_len = ctypes.addressof(lib_buf), len(lib_buf)
    rel = int(from_addr) - buf_start
    if not 0 <= rel < buf_len: return 0
    n = min(int(size), buf_len - rel)
    ctypes.memmove(to, buf_start + rel, n)
    return n

  info_src = comgr.amd_comgr_disassembly_info_t()
  isa_ptr = ctypes.cast(isa, ctypes.POINTER(ctypes.c_char))
  check(comgr.amd_comgr_create_disassembly_info(isa_ptr, memory_cb, instr_cb, addr_cb, info_src))

  @comgr.amd_comgr_iterate_symbols.argtypes[1]
  def sym_callback(sym, udata):
    # only function symbols are disassembled
    sym_type = ctypes.c_int()
    check(comgr.amd_comgr_symbol_get_info(sym, comgr.AMD_COMGR_SYMBOL_INFO_TYPE, ctypes.byref(sym_type)))
    if sym_type.value != comgr.AMD_COMGR_SYMBOL_TYPE_FUNC: return comgr.AMD_COMGR_STATUS_SUCCESS

    vaddr = ctypes.c_uint64()
    check(comgr.amd_comgr_symbol_get_info(sym, comgr.AMD_COMGR_SYMBOL_INFO_VALUE, ctypes.byref(vaddr)))
    size = ctypes.c_uint64()
    check(comgr.amd_comgr_symbol_get_info(sym, comgr.AMD_COMGR_SYMBOL_INFO_SIZE, ctypes.byref(size)))

    # translate the symbol's virtual address into an offset inside the code object
    offset, nobits = ctypes.c_uint64(), ctypes.c_bool()
    check(comgr.amd_comgr_map_elf_virtual_address_to_code_object_offset(data_src, vaddr.value, ctypes.byref(offset),
                                                                        ctypes.byref(ctypes.c_uint64()), ctypes.byref(nobits)))
    # NOTE(review): the nobits flag is passed to check(); presumably a set flag is treated as an error -- confirm
    check(nobits.value)

    # the disassembler is driven with real addresses inside lib_buf (memory_cb reads from there)
    base = ctypes.addressof(lib_buf)
    pc = base + offset.value
    end = pc + size.value
    table = ctypes.cast(udata, ctypes.POINTER(ctypes.py_object)).contents.value

    # instr_cb receives this context as user data and writes the instruction text into it
    ctx = InstrCtx()
    instr_ref = ctypes.py_object(ctx)
    instr_ptr = ctypes.cast(ctypes.pointer(instr_ref), ctypes.c_void_p)
    while pc < end:
      size_read = ctypes.c_uint64(0)
      ctx.pc = pc
      st = comgr.amd_comgr_disassemble_instruction(info_src, ctypes.c_uint64(pc), instr_ptr, ctypes.byref(size_read))
      key = vaddr.value + (pc - base - offset.value)
      if st == comgr.AMD_COMGR_STATUS_SUCCESS and size_read.value:
        table[key] = (ctx.inst, int(size_read.value))
        pc += size_read.value
      else: # don't inf loop if comgr fails
        b = ctypes.c_ubyte.from_buffer(lib_buf, pc - base).value
        table[key] = (f"DISASSEMBLER ISSUE 0x{b:02x}", 1)
        pc += 1
    return comgr.AMD_COMGR_STATUS_SUCCESS

  # hand the result dict to sym_callback through a py_object pointer
  addr_table:dict[int, tuple[str, int]] = {}
  table_ref = ctypes.py_object(addr_table)
  check(comgr.amd_comgr_iterate_symbols(data_src, sym_callback, ctypes.cast(ctypes.pointer(table_ref), ctypes.c_void_p)))
  return addr_table
|
||||
@@ -1,9 +1,33 @@
|
||||
import ctypes, pathlib, argparse, pickle, re, functools, dataclasses, itertools
|
||||
from extra.sqtt.rocprof import rocprof
|
||||
from extra.sqtt.disasm import comgr_get_address_table
|
||||
from tinygrad.helpers import temp, DEBUG
|
||||
from tinygrad.device import ProfileEvent, ProfileProgramEvent
|
||||
from tinygrad.device import ProfileEvent, ProfileDeviceEvent, ProfileProgramEvent
|
||||
from tinygrad.runtime.ops_amd import ProfileSQTTEvent, ProfilePMCEvent
|
||||
from tinygrad.runtime.autogen import llvm
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
|
||||
# to pass NULL to callbacks
|
||||
# Keep the first five argtypes and retype the last two parameters as plain void pointers, so that None can be
# passed for them when creating the disassembler.
llvm.LLVMCreateDisasmCPUFeatures.argtypes = llvm.LLVMCreateDisasmCPUFeatures.argtypes[:5] + [ctypes.c_void_p, ctypes.c_void_p]
|
||||
def llvm_disasm(arch:str, lib:bytes) -> dict[int, tuple[str, int]]:
  """Disassemble the `.text` section of an AMDGPU code object with LLVM's disassembler.

  Args:
    arch: CPU name handed to LLVM for the `amdgcn-amd-amdhsa` triple.
    lib: raw bytes of the ELF code object.

  Returns:
    Dict mapping each instruction's address (starting at the section's `sh_addr`) to
    `(instruction text, size in bytes)`. Bytes LLVM cannot decode are recorded one at a time as
    `"DISASSEMBLER ISSUE 0x.."` entries of size 1.

  Raises:
    RuntimeError: if LLVM cannot create a disassembler for `arch`, or `lib` has no `.text` section.
  """
  llvm.LLVMInitializeAMDGPUTargetInfo()
  llvm.LLVMInitializeAMDGPUTargetMC()
  llvm.LLVMInitializeAMDGPUAsmParser()
  llvm.LLVMInitializeAMDGPUDisassembler()
  ctx = llvm.LLVMCreateDisasmCPUFeatures("amdgcn-amd-amdhsa".encode(), arch.encode(), "".encode(), None, 0, None, None)
  # a NULL context means the triple/CPU was rejected; using it below would crash inside LLVM
  if not ctx: raise RuntimeError(f"LLVM could not create an AMDGPU disassembler for arch {arch!r}")

  try:
    image, sections, _ = elf_loader(lib)
    text = next((sh.header for sh in sections if sh.name == ".text"), None)
    if text is None: raise RuntimeError("code object has no .text section")
    off, sz = text.sh_addr, text.sh_size
    end = off + sz

    addr_table:dict[int, tuple[str, int]] = {}
    out = ctypes.create_string_buffer(128)
    cur_off = off
    while cur_off < end:
      view = (ctypes.c_ubyte * (end - cur_off)).from_buffer_copy(memoryview(image)[cur_off:])
      instr_sz = llvm.LLVMDisasmInstruction(ctx, view, ctypes.c_uint64(len(view)), ctypes.c_uint64(0), out, ctypes.c_size_t(128))
      if instr_sz:
        addr_table[cur_off] = (out.value.decode("utf-8", "replace").strip(), instr_sz)
      else:
        # LLVM reports 0 bytes when it cannot decode an instruction; consume a single byte so the loop
        # always makes progress instead of spinning forever on the same offset
        instr_sz = 1
        addr_table[cur_off] = (f"DISASSEMBLER ISSUE 0x{view[0]:02x}", instr_sz)
      cur_off += instr_sz
    return addr_table
  finally:
    # release the disassembler context on every exit path
    llvm.LLVMDisasmDispose(ctx)
|
||||
|
||||
@dataclasses.dataclass
|
||||
class InstInfo:
|
||||
@@ -18,12 +42,12 @@ class InstInfo:
|
||||
self.hit, self.lat, self.stall = self.hit + 1, self.lat + ev.duration, self.stall + ev.stall
|
||||
|
||||
class _ROCParseCtx:
|
||||
def __init__(self, sqtt_evs:list[ProfileSQTTEvent], prog_evs:list[ProfileProgramEvent]):
|
||||
self.sqtt_evs, self.prog_evs = iter(sqtt_evs), prog_evs
|
||||
def __init__(self, dev_evs:dict[str, ProfileDeviceEvent], sqtt_evs:list[ProfileSQTTEvent], prog_evs:list[ProfileProgramEvent]):
|
||||
self.dev_evs, self.sqtt_evs, self.prog_evs = dev_evs, iter(sqtt_evs), prog_evs
|
||||
self.wave_events, self.disasms, self.addr2prg = {}, {}, {}
|
||||
|
||||
for prog in prog_evs:
|
||||
for addr, info in comgr_get_address_table(prog.lib).items():
|
||||
for addr, info in llvm_disasm(dev_evs[prog.device].arch, prog.lib).items():
|
||||
self.disasms[prog.base + addr] = info
|
||||
self.addr2prg[prog.base + addr] = prog
|
||||
|
||||
@@ -50,13 +74,15 @@ class _ROCParseCtx:
|
||||
self.wave_events[(self.find_program(ev.instructions_array[0].pc.address).name, ev.wave_id, ev.cu, ev.simd)] = asm
|
||||
|
||||
def decode(profile:list[ProfileEvent]) -> _ROCParseCtx:
|
||||
dev_events:dict[str, ProfileDeviceEvent] = {}
|
||||
sqtt_events:list[ProfileSQTTEvent] = []
|
||||
prog_events:list[ProfileProgramEvent] = []
|
||||
for e in profile:
|
||||
if isinstance(e, ProfileDeviceEvent): dev_events[e.device] = e
|
||||
if isinstance(e, ProfileSQTTEvent): sqtt_events.append(e)
|
||||
if isinstance(e, ProfileProgramEvent) and e.device.startswith("AMD"): prog_events.append(e)
|
||||
|
||||
ROCParseCtx = _ROCParseCtx(sqtt_events, prog_events)
|
||||
ROCParseCtx = _ROCParseCtx(dev_events, sqtt_events, prog_events)
|
||||
|
||||
@rocprof.rocprof_trace_decoder_se_data_callback_t
|
||||
def copy_cb(buf, buf_size, data_ptr):
|
||||
|
||||
@@ -54,7 +54,7 @@ atexit.register(lambda: [Device[dn].finalize() for dn in Device._opened_devices]
|
||||
|
||||
@dataclass(frozen=True)
class ProfileDeviceEvent(ProfileEvent):
  """Per-device profiling record: device name, two time offsets and an architecture string."""
  device:str
  # NOTE(review): presumably the GPU-to-CPU clock difference measured on the compute queue -- confirm with the emitter
  comp_tdiff:decimal.Decimal = decimal.Decimal(0)
  # NOTE(review): presumably the same measurement on the copy queue -- confirm with the emitter
  copy_tdiff:decimal.Decimal = decimal.Decimal(0)
  # architecture string of the device; empty when not provided
  arch:str = ""
|
||||
|
||||
@dataclass(frozen=True)
class ProfileProgramEvent(ProfileEvent):
  """Profiling record describing one program on a device."""
  device:str
  name:str
  # program binary, or None when not available
  lib:bytes|None
  # base address associated with the program, or None when not available
  base:int|None
|
||||
|
||||
@@ -982,6 +982,7 @@ class AMDDevice(HCQCompiled):
|
||||
|
||||
def on_device_hang(self):
  """Forward the hang notification to this device's interface object."""
  self.iface.on_device_hang()
|
||||
|
||||
def device_info(self):
  """Return this device's `arch` attribute as its device info."""
  return self.arch
|
||||
def _at_profile_finalize(self):
|
||||
if self.sqtt_enabled:
|
||||
wptrs_buf = self.allocator.alloc(round_up(len(self.sqtt_buffers), 0x1000), BufferSpec(cpu_access=True, nolru=True))
|
||||
|
||||
@@ -409,6 +409,8 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
for dev in HCQCompiled.peer_groups[pg]: cast(HCQAllocator, dev.allocator).map(alc)
|
||||
return self.signal_t(base_buf=HCQCompiled.signal_pool[pg].pop(), owner=self, **kwargs)
|
||||
|
||||
def device_info(self) -> str:
  """Return a device description string; this default is empty."""
  # to be overridden if needed
  return ""
|
||||
|
||||
def _at_profile_finalize(self):
|
||||
self.synchronize() # Expect device to be synchronizes
|
||||
|
||||
@@ -422,7 +424,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
gpu2cpu_compute_time_diff = statistics.median([_sync(self, self.hw_compute_queue_t) for _ in range(40)])
|
||||
if self.hw_copy_queue_t is None: gpu2cpu_copy_time_diff = decimal.Decimal(0)
|
||||
else: gpu2cpu_copy_time_diff = statistics.median([_sync(self, self.hw_copy_queue_t) for _ in range(40)])
|
||||
Compiled.profile_events += [ProfileDeviceEvent(self.device, gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff)]
|
||||
Compiled.profile_events += [ProfileDeviceEvent(self.device, gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff, arch=self.device_info())]
|
||||
|
||||
def _wrap_timeline_signal(self):
|
||||
self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1
|
||||
|
||||
Reference in New Issue
Block a user