hcq: add tag to exec events (#13311)

* hcq: add tag to exec events

* f

* fix

* fix
This commit is contained in:
nimlgen
2025-11-17 16:59:30 +03:00
committed by GitHub
parent 50a443f558
commit f63ded5817
5 changed files with 24 additions and 16 deletions

View File

@@ -166,6 +166,7 @@ class RGP:
se=ev.se,
itrace=merged_sqtt_events[ev.se].itrace or ev.itrace,
blob=merged_sqtt_events[ev.se].blob + ev.blob,
exec_tag=0,
)
sqtt_events = list(merged_sqtt_events.values())

View File

@@ -1,5 +1,5 @@
import ctypes, pathlib, argparse, pickle, re, functools, dataclasses, itertools
from tinygrad.helpers import temp, unwrap, DEBUG
from tinygrad.helpers import temp, unwrap, DEBUG, ProfileRangeEvent
from tinygrad.device import ProfileEvent, ProfileDeviceEvent, ProfileProgramEvent
from tinygrad.runtime.ops_amd import ProfileSQTTEvent, ProfilePMCEvent
from tinygrad.runtime.autogen import llvm, rocprof
@@ -47,8 +47,9 @@ class WaveExec:
insts:list[InstExec]
class _ROCParseCtx:
def __init__(self, dev_evs:dict[str, ProfileDeviceEvent], sqtt_evs:list[ProfileSQTTEvent], prog_evs:list[ProfileProgramEvent]):
self.dev_evs, self.sqtt_evs, self.prog_evs = dev_evs, iter(sqtt_evs), prog_evs
def __init__(self, dev_evs:dict[str, ProfileDeviceEvent], exec_evs:dict[int, ProfileRangeEvent], sqtt_evs:list[ProfileSQTTEvent],
prog_evs:list[ProfileProgramEvent]):
self.dev_evs, self.exec_evs, self.sqtt_evs, self.prog_evs = dev_evs, exec_evs, iter(sqtt_evs), prog_evs
self.disasms:dict[tuple[str, int], tuple[str, int]] = {}
self.inst_execs:dict[str, list[WaveExec]] = {}
@@ -62,6 +63,7 @@ class _ROCParseCtx:
self.active_kern = x.kern if x is not None else None
self.active_se = x.se if x is not None else None
self.active_blob = (ctypes.c_ubyte * len(x.blob)).from_buffer_copy(x.blob) if x is not None else None
self.active_range = self.exec_evs[x.exec_tag] if x is not None else None
return self.active_blob
def on_occupancy_ev(self, ev:rocprof.rocprofiler_thread_trace_decoder_occupancy_t):
@@ -84,14 +86,16 @@ class _ROCParseCtx:
def decode(profile:list[ProfileEvent]) -> _ROCParseCtx:
dev_events:dict[str, ProfileDeviceEvent] = {}
exec_events:dict[int, ProfileRangeEvent] = {}
sqtt_events:list[ProfileSQTTEvent] = []
prog_events:list[ProfileProgramEvent] = []
for e in profile:
if isinstance(e, ProfileRangeEvent) and e.device.startswith("AMD"): exec_events[e.tag] = e
if isinstance(e, ProfileDeviceEvent): dev_events[e.device] = e
if isinstance(e, ProfileSQTTEvent): sqtt_events.append(e)
if isinstance(e, ProfileProgramEvent) and e.device.startswith("AMD"): prog_events.append(e)
ROCParseCtx = _ROCParseCtx(dev_events, sqtt_events, prog_events)
ROCParseCtx = _ROCParseCtx(dev_events, exec_events, sqtt_events, prog_events)
@rocprof.rocprof_trace_decoder_se_data_callback_t
def copy_cb(buf, buf_size, data_ptr):

View File

@@ -271,7 +271,7 @@ class ProfileEvent: pass
@dataclass
class ProfileRangeEvent(ProfileEvent):
device:str; name:str|TracingKey; st:decimal.Decimal; en:decimal.Decimal|None=None; is_copy:bool=False # noqa: E702
device:str; name:str|TracingKey; st:decimal.Decimal; en:decimal.Decimal|None=None; is_copy:bool=False; tag:int|None=None # noqa: E702
@dataclass(frozen=True)
class ProfilePointEvent(ProfileEvent):

View File

@@ -28,13 +28,13 @@ AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM <<
| (hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE)
@dataclass(frozen=True)
class ProfileSQTTEvent(ProfileEvent): device:str; kern:str; se:int; blob:bytes; itrace:bool # noqa: E702
class ProfileSQTTEvent(ProfileEvent): device:str; kern:str; se:int; blob:bytes; itrace:bool; exec_tag:int # noqa: E702
# One entry of the `sched` list carried by ProfilePMCEvent. The dataclass is
# frozen, so instances cannot be modified after construction.
# NOTE(review): the field names suggest a counter name and block, a hardware
# location (xcc, inst, se, sa, wgp) and an offset/size pair -- confirm against
# the code that builds these records.
@dataclass(frozen=True)
class PMCSample:
  name: str
  block: str
  xcc: int
  inst: int
  se: int
  sa: int
  wgp: int
  off: int
  size: int
  regsample: str
@dataclass(frozen=True)
class ProfilePMCEvent(ProfileEvent): device:str; kern:str; sched:list[PMCSample]; blob:bytes # noqa: E702
class ProfilePMCEvent(ProfileEvent): device:str; kern:str; sched:list[PMCSample]; blob:bytes; exec_tag:int # noqa: E702
class AMDSignal(HCQSignal):
def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 100})
@@ -583,7 +583,7 @@ class AMDProgram(HCQProgram):
cast(AMDComputeQueue, self.dev.hw_compute_queue_t()).pmc_read(self.dev.pmc_buffer, self.dev.pmc_sched) \
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
self.dev.allocator._copyout(pmc_buf:=memoryview(bytearray(self.dev.pmc_buffer.size)), self.dev.pmc_buffer)
Compiled.profile_events += [ProfilePMCEvent(self.dev.device, self.name, self.dev.pmc_sched, bytes(pmc_buf))]
Compiled.profile_events += [ProfilePMCEvent(self.dev.device, self.name, self.dev.pmc_sched, bytes(pmc_buf), self.dev.prof_exec_counter)]
if self.dev.sqtt_enabled:
cast(AMDComputeQueue, self.dev.hw_compute_queue_t()).sqtt_stop(self.dev.sqtt_wptrs) \
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
@@ -602,7 +602,8 @@ class AMDProgram(HCQProgram):
self.dev.allocator._copyout(sqtt_mv:=memoryview(bytearray(wptr)), buf)
resbuf = (struct.pack('<Q', 0x11 | (4 << 13) | (0xf << 16) | (se << 24)) + bytes(sqtt_mv)) if self.dev.target[0] == 9 else bytes(sqtt_mv)
Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, self.name, se, resbuf, bool((SQTT_ITRACE_SE_MASK.value >> se) & 1))]
Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, self.name, se, resbuf, bool((SQTT_ITRACE_SE_MASK.value >> se) & 1),
self.dev.prof_exec_counter)]
return res
class AMDAllocator(HCQAllocator['AMDDevice']):

View File

@@ -3,7 +3,7 @@ from typing import cast, Callable, Type, TypeVar, Generic, Any, Sequence
import contextlib, decimal, statistics, time, ctypes, array, os, struct, collections, functools
try: import fcntl # windows misses that
except ImportError: fcntl = None #type:ignore[assignment]
from tinygrad.helpers import PROFILE, getenv, to_mv, ProfileRangeEvent, select_first_inited
from tinygrad.helpers import PROFILE, getenv, to_mv, ProfileRangeEvent, select_first_inited, unwrap
from tinygrad.device import BufferSpec, Compiled, LRUAllocator, ProfileDeviceEvent, ProfileProgramEvent, CompilerPairT
from tinygrad.uop.ops import sym_infer, sint, UOp
from tinygrad.runtime.autogen import libc
@@ -262,7 +262,7 @@ class HCQSignal(Generic[HCQDeviceType]):
if not_passed and self.value < value: raise RuntimeError(f"Wait timeout: {timeout} ms! (the signal is not set to {value}, but {self.value})")
@contextlib.contextmanager
def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]|None=None, queue:HWQueue|None=None):
def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]|None=None, queue:HWQueue|None=None, tag=None):
st, en = (dev.new_signal(), dev.new_signal()) if enabled else (None, None)
assert queue is not None or queue_type is not None, "Either queue or queue_type must be provided"
@@ -277,7 +277,7 @@ def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]
queue_type().wait(dev.timeline_signal, dev.timeline_value - 1).timestamp(en).signal(dev.timeline_signal, dev.next_timeline()).submit(dev)
if enabled and PROFILE:
dev.sig_prof_records.append((cast(HCQSignal, st), cast(HCQSignal, en), desc, (queue_type or type(queue)) is dev.hw_copy_queue_t))
dev.prof_exec_recs.append((tag, unwrap(st), unwrap(en), desc, (queue_type or type(queue)) is dev.hw_copy_queue_t))
class HCQArgsState(Generic[ProgramType]):
def __init__(self, buf:HCQBuffer, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=()):
@@ -336,7 +336,8 @@ class HCQProgram(Generic[HCQDeviceType]):
kernargs = self.fill_kernargs(bufs, vals)
q = self.dev.hw_compute_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1).memory_barrier()
with hcq_profile(self.dev, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
self.dev.prof_exec_counter += 1
with hcq_profile(self.dev, queue=q, desc=self.name, enabled=wait or PROFILE, tag=self.dev.prof_exec_counter) as (sig_st, sig_en):
q.exec(self, kernargs, global_size, local_size)
q.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
@@ -371,7 +372,8 @@ class HCQCompiled(Compiled, Generic[SignalType]):
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
self.timeline_value:int = 1
self.timeline_signal, self._shadow_timeline_signal = self.new_signal(value=0, is_timeline=True), self.new_signal(value=0, is_timeline=True)
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
self.prof_exec_recs:list[tuple[int, HCQSignal, HCQSignal, str, bool]] = []
self.prof_exec_counter:int = 0
self.kernargs_buf:HCQBuffer = self.allocator.alloc(kernargs_size, BufferSpec(cpu_access=True))
self.kernargs_offset_allocator:BumpAllocator = BumpAllocator(self.kernargs_buf.size, wrap=True)
@@ -395,8 +397,8 @@ class HCQCompiled(Compiled, Generic[SignalType]):
if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
if PROFILE:
Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp) for st,en,name,cp in self.sig_prof_records]
self.sig_prof_records = []
Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp, t) for t,st,en,name,cp in self.prof_exec_recs]
self.prof_exec_recs = []
def next_timeline(self):
self.timeline_value += 1