hcq: do not assume kernel names are unique (#14371)

* hcq: do not assume kernel names are unique

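* colored kernel name: look it up via the program event's name, since counter events are now keyed by an integer program tag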
This commit is contained in:
qazal
2026-01-27 09:03:15 -05:00
committed by GitHub
parent e5df7e640b
commit a5f3d46423
4 changed files with 14 additions and 11 deletions

View File

@@ -59,7 +59,7 @@ class ProfileDeviceEvent(ProfileEvent):
device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); props:dict[str,Any]|None=None # noqa: E702
@dataclass(frozen=True)
class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None # noqa: E702
class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None; tag:int|None=None # noqa: E702
@dataclass(frozen=True)
class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int; is_copy:bool # noqa: E702

View File

@@ -32,13 +32,13 @@ AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM <<
| (hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE)
@dataclass(frozen=True)
class ProfileSQTTEvent(ProfileEvent): device:str; kern:str; se:int; blob:bytes; itrace:bool; exec_tag:int # noqa: E702
class ProfileSQTTEvent(ProfileEvent): device:str; kern:int; se:int; blob:bytes; itrace:bool; exec_tag:int # noqa: E702
@dataclass(frozen=True)
class PMCSample: name:str; block:str; xcc:int; inst:int; se:int; sa:int; wgp:int; off:int; size:int; regsample:str # noqa: E702
@dataclass(frozen=True)
class ProfilePMCEvent(ProfileEvent): device:str; kern:str; sched:list[PMCSample]; blob:bytes; exec_tag:int # noqa: E702
class ProfilePMCEvent(ProfileEvent): device:str; kern:int; sched:list[PMCSample]; blob:bytes; exec_tag:int # noqa: E702
class AMDSignal(HCQSignal):
def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 100})
@@ -600,7 +600,8 @@ class AMDProgram(HCQProgram):
cast(AMDComputeQueue, self.dev.hw_compute_queue_t()).pmc_read(self.dev.pmc_buffer, self.dev.pmc_sched) \
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
self.dev.allocator._copyout(pmc_buf:=memoryview(bytearray(self.dev.pmc_buffer.size)), self.dev.pmc_buffer)
Compiled.profile_events += [ProfilePMCEvent(self.dev.device, self.name, self.dev.pmc_sched, bytes(pmc_buf), self.dev.prof_exec_counter)]
Compiled.profile_events += [ProfilePMCEvent(self.dev.device, self.dev.prof_prg_counter, self.dev.pmc_sched, bytes(pmc_buf),
self.dev.prof_exec_counter)]
if self.dev.sqtt_enabled:
cast(AMDComputeQueue, self.dev.hw_compute_queue_t()).sqtt_stop(self.dev.sqtt_wptrs) \
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
@@ -619,8 +620,8 @@ class AMDProgram(HCQProgram):
self.dev.allocator._copyout(sqtt_mv:=memoryview(bytearray(wptr)), buf)
resbuf = (struct.pack('<Q', 0x11 | (4 << 13) | (0xf << 16) | (se << 24)) + bytes(sqtt_mv)) if self.dev.target[0] == 9 else bytes(sqtt_mv)
Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, self.name, se, resbuf, bool((SQTT_ITRACE_SE_MASK.value >> se) & 1),
self.dev.prof_exec_counter)]
Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, self.dev.prof_prg_counter, se, resbuf,
bool((SQTT_ITRACE_SE_MASK.value >> se) & 1), self.dev.prof_exec_counter)]
return res
class AMDAllocator(HCQAllocator['AMDDevice']):

View File

@@ -301,7 +301,8 @@ class CLikeArgsState(HCQArgsState[ProgramType]):
class HCQProgram(Generic[HCQDeviceType]):
def __init__(self, args_state_t:Type[HCQArgsState], dev:HCQDeviceType, name:str, kernargs_alloc_size:int, lib:bytes|None=None, base:int|None=None):
self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size
if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base)]
self.dev.prof_prg_counter += 1
if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base, self.dev.prof_prg_counter)]
@staticmethod
def _fini(dev, buf, spec): dev.allocator.free(buf, buf.size, spec)
@@ -377,6 +378,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
self.timeline_signal, self._shadow_timeline_signal = self.new_signal(value=0, is_timeline=True), self.new_signal(value=0, is_timeline=True)
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
self.prof_exec_counter:int = 0
self.prof_prg_counter:int = 0
self.kernargs_buf:HCQBuffer = self.allocator.alloc(kernargs_size, BufferSpec(cpu_access=True))
self.kernargs_offset_allocator:BumpAllocator = BumpAllocator(self.kernargs_buf.size, wrap=True)

View File

@@ -283,20 +283,20 @@ def unpack_pmc(e) -> dict:
def load_counters(profile:list[ProfileEvent]) -> None:
from tinygrad.runtime.ops_amd import ProfileSQTTEvent, ProfilePMCEvent
counter_events:dict[tuple[str, int], dict] = {}
counter_events:dict[tuple[int, int], dict] = {}
durations:dict[str, list[float]] = {}
prg_events:dict[str, ProfileProgramEvent] = {}
prg_events:dict[int, ProfileProgramEvent] = {}
for e in profile:
if isinstance(e, (ProfilePMCEvent, ProfileSQTTEvent)): counter_events.setdefault((e.kern, e.exec_tag), {}).setdefault(type(e), []).append(e)
if isinstance(e, ProfileRangeEvent) and e.device.startswith("AMD") and e.en is not None:
durations.setdefault(str(e.name), []).append(float(e.en-e.st))
if isinstance(e, ProfileProgramEvent): prg_events[str(e.name)] = e
if isinstance(e, ProfileProgramEvent) and e.tag is not None: prg_events[e.tag] = e
if len(counter_events) == 0: return None
ctxs.append({"name":"All Counters", "steps":[create_step("PMC", ("/all-pmc", len(ctxs), 0), (durations, all_counters:={}))]})
run_number = {n:0 for n,_ in counter_events}
for (k, tag),v in counter_events.items():
# use the colored name if it exists
name = trace.keys[r].ret.name if (r:=ref_map.get(k)) is not None else k
name = trace.keys[r].ret.name if (r:=ref_map.get(pname:=prg_events[k].name)) is not None else pname
run_number[k] += 1
steps:list[dict] = []
if (pmc:=v.get(ProfilePMCEvent)):