From a5f3d464230f52f12cedc3be820846ae16886b67 Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Tue, 27 Jan 2026 09:03:15 -0500 Subject: [PATCH] hcq: do not assume kernel names are unique (#14371) * hcq: do not assume kernel names are unique * colored kernel name --- tinygrad/device.py | 2 +- tinygrad/runtime/ops_amd.py | 11 ++++++----- tinygrad/runtime/support/hcq.py | 4 +++- tinygrad/viz/serve.py | 8 ++++---- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/tinygrad/device.py b/tinygrad/device.py index 8c96e3ec97..1123f05b18 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -59,7 +59,7 @@ class ProfileDeviceEvent(ProfileEvent): device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); props:dict[str,Any]|None=None # noqa: E702 @dataclass(frozen=True) -class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None # noqa: E702 +class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None; tag:int|None=None # noqa: E702 @dataclass(frozen=True) class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int; is_copy:bool # noqa: E702 diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 48db08cc4e..8be30035ab 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -32,13 +32,13 @@ AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM << | (hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE) @dataclass(frozen=True) -class ProfileSQTTEvent(ProfileEvent): device:str; kern:str; se:int; blob:bytes; itrace:bool; exec_tag:int # noqa: E702 +class ProfileSQTTEvent(ProfileEvent): device:str; kern:int; se:int; blob:bytes; itrace:bool; exec_tag:int # noqa: E702 @dataclass(frozen=True) class PMCSample: name:str; block:str; xcc:int; inst:int; se:int; sa:int; wgp:int; off:int; size:int; regsample:str # noqa: E702 @dataclass(frozen=True) -class ProfilePMCEvent(ProfileEvent): device:str; kern:str; sched:list[PMCSample]; blob:bytes; exec_tag:int # noqa: E702 +class ProfilePMCEvent(ProfileEvent): device:str; kern:int; sched:list[PMCSample]; blob:bytes; exec_tag:int # noqa: E702 class AMDSignal(HCQSignal): def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 100}) @@ -600,7 +600,8 @@ class AMDProgram(HCQProgram): cast(AMDComputeQueue, self.dev.hw_compute_queue_t()).pmc_read(self.dev.pmc_buffer, self.dev.pmc_sched) \ .signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev) self.dev.allocator._copyout(pmc_buf:=memoryview(bytearray(self.dev.pmc_buffer.size)), self.dev.pmc_buffer) - Compiled.profile_events += [ProfilePMCEvent(self.dev.device, self.name, self.dev.pmc_sched, bytes(pmc_buf), self.dev.prof_exec_counter)] + Compiled.profile_events += [ProfilePMCEvent(self.dev.device, self.dev.prof_prg_counter, self.dev.pmc_sched, bytes(pmc_buf), + self.dev.prof_exec_counter)] if self.dev.sqtt_enabled: cast(AMDComputeQueue, self.dev.hw_compute_queue_t()).sqtt_stop(self.dev.sqtt_wptrs) \ .signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev) @@ -619,8 +620,8 @@ class AMDProgram(HCQProgram): self.dev.allocator._copyout(sqtt_mv:=memoryview(bytearray(wptr)), buf) resbuf = (struct.pack('> se) & 1), - self.dev.prof_exec_counter)] + Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, self.dev.prof_prg_counter, se, resbuf, + bool((SQTT_ITRACE_SE_MASK.value >> se) & 1), self.dev.prof_exec_counter)] return res class AMDAllocator(HCQAllocator['AMDDevice']): diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 16fda389ac..b941208f99 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -301,7 +301,8 @@ class CLikeArgsState(HCQArgsState[ProgramType]): class HCQProgram(Generic[HCQDeviceType]): def __init__(self, args_state_t:Type[HCQArgsState], dev:HCQDeviceType, name:str, kernargs_alloc_size:int, lib:bytes|None=None, base:int|None=None): self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size - if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base)] + self.dev.prof_prg_counter += 1 + if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base, self.dev.prof_prg_counter)] @staticmethod def _fini(dev, buf, spec): dev.allocator.free(buf, buf.size, spec) @@ -377,6 +378,7 @@ class HCQCompiled(Compiled, Generic[SignalType]): self.timeline_signal, self._shadow_timeline_signal = self.new_signal(value=0, is_timeline=True), self.new_signal(value=0, is_timeline=True) self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = [] self.prof_exec_counter:int = 0 + self.prof_prg_counter:int = 0 self.kernargs_buf:HCQBuffer = self.allocator.alloc(kernargs_size, BufferSpec(cpu_access=True)) self.kernargs_offset_allocator:BumpAllocator = BumpAllocator(self.kernargs_buf.size, wrap=True) diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py index 5d3d0d4e27..74cf6a52b7 100755 --- a/tinygrad/viz/serve.py +++ b/tinygrad/viz/serve.py @@ -283,20 +283,20 @@ def unpack_pmc(e) -> dict: def load_counters(profile:list[ProfileEvent]) -> None: from tinygrad.runtime.ops_amd import ProfileSQTTEvent, ProfilePMCEvent - counter_events:dict[tuple[str, int], dict] = {} + counter_events:dict[tuple[int, int], dict] = {} durations:dict[str, list[float]] = {} - prg_events:dict[str, ProfileProgramEvent] = {} + prg_events:dict[int, ProfileProgramEvent] = {} for e in profile: if isinstance(e, (ProfilePMCEvent, ProfileSQTTEvent)): counter_events.setdefault((e.kern, e.exec_tag), {}).setdefault(type(e), []).append(e) if isinstance(e, ProfileRangeEvent) and e.device.startswith("AMD") and e.en is not None: durations.setdefault(str(e.name), []).append(float(e.en-e.st)) - if isinstance(e, ProfileProgramEvent): prg_events[str(e.name)] = e + if isinstance(e, ProfileProgramEvent) and e.tag is not None: prg_events[e.tag] = e if len(counter_events) == 0: return None ctxs.append({"name":"All Counters", "steps":[create_step("PMC", ("/all-pmc", len(ctxs), 0), (durations, all_counters:={}))]}) run_number = {n:0 for n,_ in counter_events} for (k, tag),v in counter_events.items(): # use the colored name if it exists - name = trace.keys[r].ret.name if (r:=ref_map.get(k)) is not None else k + name = trace.keys[r].ret.name if (r:=ref_map.get(pname:=prg_events[k].name)) is not None else pname run_number[k] += 1 steps:list[dict] = [] if (pmc:=v.get(ProfilePMCEvent)):