From eff80beeed30143efc63ce3adb0611f234ca4075 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Wed, 5 Nov 2025 23:43:20 +0800 Subject: [PATCH] amd: props in device not sqtt (#13106) * amd: props in device not sqtt * fix * f * fix * fix --- extra/sqtt/rgptool.py | 3 +-- extra/sqtt/roc.py | 3 ++- extra/sqtt/test_timing.py | 4 ++-- tinygrad/device.py | 2 +- tinygrad/runtime/ops_amd.py | 6 +++--- tinygrad/runtime/support/hcq.py | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/extra/sqtt/rgptool.py b/extra/sqtt/rgptool.py index 6c3e3470b1..453ae9fca2 100755 --- a/extra/sqtt/rgptool.py +++ b/extra/sqtt/rgptool.py @@ -154,6 +154,7 @@ class RGP: if device not in device_events: raise RuntimeError(f"Device {device} not found in profile, devices in profile: {', '.join(device_events.keys())} ") device_event = device_events[device] sqtt_events = [x for x in profile if isinstance(x, ProfileSQTTEvent) and x.device == device_event.device] + device_props = device_event.props # merge events per SE merged_sqtt_events:dict[int, ProfileSQTTEvent] = {} for ev in sqtt_events: @@ -164,12 +165,10 @@ class RGP: se=ev.se, itrace=merged_sqtt_events[ev.se].itrace or ev.itrace, blob=merged_sqtt_events[ev.se].blob + ev.blob, - props=ev.props, ) sqtt_events = list(merged_sqtt_events.values()) if len(sqtt_events) == 0: raise RuntimeError(f"Device {device_event.device} doesn't contain SQTT data") - device_props = sqtt_events[0].props gfx_ver = device_props['gfx_target_version'] // 10000 gfx_iplvl = getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}_{(device_props['gfx_target_version']//100)%100}", getattr(sqtt, f"SQTT_GFXIP_LEVEL_GFXIP_{device_props['gfx_target_version']//10000}", None)) diff --git a/extra/sqtt/roc.py b/extra/sqtt/roc.py index 791ca0e54b..aac9a6194c 100644 --- a/extra/sqtt/roc.py +++ b/extra/sqtt/roc.py @@ -57,7 +57,8 @@ class _ROCParseCtx: self.inst_execs:dict[tuple[str, int, int, int], list[InstExec]] = {} for prog in prog_evs: - for addr, info in llvm_disasm(dev_evs[prog.device].arch, unwrap(prog.lib)).items(): + arch = "gfx%d%x%x" % ((trgt:=dev_evs[prog.device].props['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100) + for addr, info in llvm_disasm(arch, unwrap(prog.lib)).items(): self.disasms[unwrap(prog.base) + addr] = info self.addr2prg[unwrap(prog.base) + addr] = prog diff --git a/extra/sqtt/test_timing.py b/extra/sqtt/test_timing.py index 1e12051e9d..dd4951e04a 100644 --- a/extra/sqtt/test_timing.py +++ b/extra/sqtt/test_timing.py @@ -32,7 +32,7 @@ def get_sqtt(asm:list[str], l:int=1, g:int=1) -> list[InstExec]: k = Tensor.custom_kernel(Tensor.empty(1), fxn=fxn)[0] # exec and decode sqtt k.realize() - rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", arch=dev.device_info())]) + rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())]) assert len(rctx.inst_execs) > 0, "empty sqtt output" return list(rctx.inst_execs.values())[0][:-1] @@ -83,7 +83,7 @@ class TestTiming(unittest.TestCase): diff_hw_reg = Tensor.empty(1, dtype=dtypes.ulong) diff_hw_reg = Tensor.custom_kernel(diff_hw_reg, fxn=sleep_kernel)[0] diff_hw_reg.realize() - rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", arch=dev.device_info())]) + rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())]) diff_sqtt = list(rctx.inst_execs.values())[0][2] self.assertEqual(diff_sqtt.dur, diff_hw_reg.item()-1) # 1 cycle for reading the counter register diff --git a/tinygrad/device.py b/tinygrad/device.py index 374aa289bf..ffc8c1fc6f 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -54,7 +54,7 @@ atexit.register(lambda: [Device[dn].finalize() for dn in Device._opened_devices] @dataclass(frozen=True) class ProfileDeviceEvent(ProfileEvent): - device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); arch:str="" # noqa: E702 + device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); props:dict[str,Any]|None=None # noqa: E702 @dataclass(frozen=True) class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None # noqa: E702 diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 0f3b7c70d1..0ee414cae7 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -28,7 +28,7 @@ AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM << | (hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE) @dataclass(frozen=True) -class ProfileSQTTEvent(ProfileEvent): device:str; se:int; props:dict; blob:bytes; itrace:bool # noqa: E702 +class ProfileSQTTEvent(ProfileEvent): device:str; se:int; blob:bytes; itrace:bool # noqa: E702 @dataclass(frozen=True) class PMCSample: name:str; block:str; xcc:int; inst:int; se:int; sa:int; wgp:int; off:int; size:int; regsample:str # noqa: E702 @@ -605,7 +605,7 @@ class AMDProgram(HCQProgram): self.dev.allocator._copyout(sqtt_mv:=memoryview(bytearray(wptr)), buf) resbuf = (struct.pack('> se) & 1))] + Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, se, resbuf, bool((SQTT_ITRACE_SE_MASK.value >> se) & 1))] return res class AMDAllocator(HCQAllocator['AMDDevice']): @@ -999,4 +999,4 @@ class AMDDevice(HCQCompiled): def on_device_hang(self): self.iface.on_device_hang() - def device_info(self): return self.arch + def device_props(self): return self.iface.props diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index a4675fd317..b8aa2f747e 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -409,7 +409,7 @@ class HCQCompiled(Compiled, Generic[SignalType]): for dev in HCQCompiled.peer_groups[pg]: cast(HCQAllocator, dev.allocator).map(alc) return self.signal_t(base_buf=HCQCompiled.signal_pool[pg].pop(), owner=self, **kwargs) - def device_info(self) -> str: return "" # to be overridden if needed + def device_props(self) -> dict[str,Any]: return {} # to be overridden if needed. dict keys are backend dependent. def _at_profile_finalize(self): self.synchronize() # Expect device to be synchronizes @@ -424,7 +424,7 @@ class HCQCompiled(Compiled, Generic[SignalType]): gpu2cpu_compute_time_diff = statistics.median([_sync(self, self.hw_compute_queue_t) for _ in range(40)]) if self.hw_copy_queue_t is None: gpu2cpu_copy_time_diff = decimal.Decimal(0) else: gpu2cpu_copy_time_diff = statistics.median([_sync(self, self.hw_copy_queue_t) for _ in range(40)]) - Compiled.profile_events += [ProfileDeviceEvent(self.device, gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff, arch=self.device_info())] + Compiled.profile_events += [ProfileDeviceEvent(self.device, gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff, props=self.device_props())] def _wrap_timeline_signal(self): self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1