mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
hcq: do not assume kernel names are unique (#14371)
* hcq: do not assume kernel names are unique
* colored kernel name
This commit is contained in:
@@ -59,7 +59,7 @@ class ProfileDeviceEvent(ProfileEvent):
|
||||
device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); props:dict[str,Any]|None=None # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None # noqa: E702
|
||||
# Profile event describing one program: its device, name, optional binary (`lib`) and optional `base` address.
# `tag` is an optional integer id (default None). HCQProgram.__init__ fills it from the device's prof_prg_counter,
# and load_counters indexes program events by it, so two programs that share a name remain distinguishable.
class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None; tag:int|None=None # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
# One entry of a profiled graph: the device and name it belongs to, a start/end id pair and a copy flag.
# NOTE(review): st_id/en_id presumably index the start and end timestamps of the entry — confirm against the producer.
class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int; is_copy:bool # noqa: E702
|
||||
|
||||
@@ -32,13 +32,13 @@ AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM <<
|
||||
| (hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE)
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileSQTTEvent(ProfileEvent): device:str; kern:str; se:int; blob:bytes; itrace:bool; exec_tag:int # noqa: E702
|
||||
# Profile event carrying one SQTT trace blob for index `se` (presumably the shader engine — confirm).
# `kern` is an integer program id (the emit site passes dev.prof_prg_counter), not the kernel name.
# `itrace` is the SQTT_ITRACE_SE_MASK bit for `se`; `exec_tag` is dev.prof_exec_counter at emit time.
class ProfileSQTTEvent(ProfileEvent): device:str; kern:int; se:int; blob:bytes; itrace:bool; exec_tag:int # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
# Description of a single PMC sample; a list of these forms ProfilePMCEvent.sched.
# NOTE(review): xcc/inst/se/sa/wgp presumably locate the hardware unit and off/size the sample's bytes in the
# event blob — these meanings are inferred from the field names only, confirm before relying on them.
class PMCSample: name:str; block:str; xcc:int; inst:int; se:int; sa:int; wgp:int; off:int; size:int; regsample:str # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfilePMCEvent(ProfileEvent): device:str; kern:str; sched:list[PMCSample]; blob:bytes; exec_tag:int # noqa: E702
|
||||
# Profile event carrying the PMC buffer contents (`blob`) together with the sample layout (`sched`) used to read it.
# `kern` is an integer program id, not the kernel name; `exec_tag` is dev.prof_exec_counter at emit time.
# NOTE(review): the emit site passes the device-wide dev.prof_prg_counter as `kern`; confirm this still identifies
# the executing program when another program was constructed on the same device after it.
class ProfilePMCEvent(ProfileEvent): device:str; kern:int; sched:list[PMCSample]; blob:bytes; exec_tag:int # noqa: E702
|
||||
|
||||
class AMDSignal(HCQSignal):
|
||||
# Forward everything to HCQSignal.__init__, forcing timestamp_divider=100: the key is merged last, so it
# overrides any timestamp_divider supplied by the caller.
def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 100})
|
||||
@@ -600,7 +600,8 @@ class AMDProgram(HCQProgram):
|
||||
cast(AMDComputeQueue, self.dev.hw_compute_queue_t()).pmc_read(self.dev.pmc_buffer, self.dev.pmc_sched) \
|
||||
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
|
||||
self.dev.allocator._copyout(pmc_buf:=memoryview(bytearray(self.dev.pmc_buffer.size)), self.dev.pmc_buffer)
|
||||
Compiled.profile_events += [ProfilePMCEvent(self.dev.device, self.name, self.dev.pmc_sched, bytes(pmc_buf), self.dev.prof_exec_counter)]
|
||||
Compiled.profile_events += [ProfilePMCEvent(self.dev.device, self.dev.prof_prg_counter, self.dev.pmc_sched, bytes(pmc_buf),
|
||||
self.dev.prof_exec_counter)]
|
||||
if self.dev.sqtt_enabled:
|
||||
cast(AMDComputeQueue, self.dev.hw_compute_queue_t()).sqtt_stop(self.dev.sqtt_wptrs) \
|
||||
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
|
||||
@@ -619,8 +620,8 @@ class AMDProgram(HCQProgram):
|
||||
|
||||
self.dev.allocator._copyout(sqtt_mv:=memoryview(bytearray(wptr)), buf)
|
||||
resbuf = (struct.pack('<Q', 0x11 | (4 << 13) | (0xf << 16) | (se << 24)) + bytes(sqtt_mv)) if self.dev.target[0] == 9 else bytes(sqtt_mv)
|
||||
Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, self.name, se, resbuf, bool((SQTT_ITRACE_SE_MASK.value >> se) & 1),
|
||||
self.dev.prof_exec_counter)]
|
||||
Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, self.dev.prof_prg_counter, se, resbuf,
|
||||
bool((SQTT_ITRACE_SE_MASK.value >> se) & 1), self.dev.prof_exec_counter)]
|
||||
return res
|
||||
|
||||
class AMDAllocator(HCQAllocator['AMDDevice']):
|
||||
|
||||
@@ -301,7 +301,8 @@ class CLikeArgsState(HCQArgsState[ProgramType]):
|
||||
class HCQProgram(Generic[HCQDeviceType]):
|
||||
def __init__(self, args_state_t:Type[HCQArgsState], dev:HCQDeviceType, name:str, kernargs_alloc_size:int, lib:bytes|None=None, base:int|None=None):
|
||||
self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size
|
||||
if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base)]
|
||||
self.dev.prof_prg_counter += 1
|
||||
if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base, self.dev.prof_prg_counter)]
|
||||
|
||||
@staticmethod
|
||||
def _fini(dev, buf, spec): dev.allocator.free(buf, buf.size, spec)
|
||||
@@ -377,6 +378,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
self.timeline_signal, self._shadow_timeline_signal = self.new_signal(value=0, is_timeline=True), self.new_signal(value=0, is_timeline=True)
|
||||
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
|
||||
self.prof_exec_counter:int = 0
|
||||
self.prof_prg_counter:int = 0
|
||||
|
||||
self.kernargs_buf:HCQBuffer = self.allocator.alloc(kernargs_size, BufferSpec(cpu_access=True))
|
||||
self.kernargs_offset_allocator:BumpAllocator = BumpAllocator(self.kernargs_buf.size, wrap=True)
|
||||
|
||||
@@ -283,20 +283,20 @@ def unpack_pmc(e) -> dict:
|
||||
|
||||
def load_counters(profile:list[ProfileEvent]) -> None:
|
||||
from tinygrad.runtime.ops_amd import ProfileSQTTEvent, ProfilePMCEvent
|
||||
counter_events:dict[tuple[str, int], dict] = {}
|
||||
counter_events:dict[tuple[int, int], dict] = {}
|
||||
durations:dict[str, list[float]] = {}
|
||||
prg_events:dict[str, ProfileProgramEvent] = {}
|
||||
prg_events:dict[int, ProfileProgramEvent] = {}
|
||||
for e in profile:
|
||||
if isinstance(e, (ProfilePMCEvent, ProfileSQTTEvent)): counter_events.setdefault((e.kern, e.exec_tag), {}).setdefault(type(e), []).append(e)
|
||||
if isinstance(e, ProfileRangeEvent) and e.device.startswith("AMD") and e.en is not None:
|
||||
durations.setdefault(str(e.name), []).append(float(e.en-e.st))
|
||||
if isinstance(e, ProfileProgramEvent): prg_events[str(e.name)] = e
|
||||
if isinstance(e, ProfileProgramEvent) and e.tag is not None: prg_events[e.tag] = e
|
||||
if len(counter_events) == 0: return None
|
||||
ctxs.append({"name":"All Counters", "steps":[create_step("PMC", ("/all-pmc", len(ctxs), 0), (durations, all_counters:={}))]})
|
||||
run_number = {n:0 for n,_ in counter_events}
|
||||
for (k, tag),v in counter_events.items():
|
||||
# use the colored name if it exists
|
||||
name = trace.keys[r].ret.name if (r:=ref_map.get(k)) is not None else k
|
||||
name = trace.keys[r].ret.name if (r:=ref_map.get(pname:=prg_events[k].name)) is not None else pname
|
||||
run_number[k] += 1
|
||||
steps:list[dict] = []
|
||||
if (pmc:=v.get(ProfilePMCEvent)):
|
||||
|
||||
Reference in New Issue
Block a user