mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 15:08:02 -05:00
hcq: add tag to exec events (#13311)
* hcq: add tag to exec events * f * fix * fix
This commit is contained in:
@@ -166,6 +166,7 @@ class RGP:
|
||||
se=ev.se,
|
||||
itrace=merged_sqtt_events[ev.se].itrace or ev.itrace,
|
||||
blob=merged_sqtt_events[ev.se].blob + ev.blob,
|
||||
exec_tag=0,
|
||||
)
|
||||
sqtt_events = list(merged_sqtt_events.values())
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import ctypes, pathlib, argparse, pickle, re, functools, dataclasses, itertools
|
||||
from tinygrad.helpers import temp, unwrap, DEBUG
|
||||
from tinygrad.helpers import temp, unwrap, DEBUG, ProfileRangeEvent
|
||||
from tinygrad.device import ProfileEvent, ProfileDeviceEvent, ProfileProgramEvent
|
||||
from tinygrad.runtime.ops_amd import ProfileSQTTEvent, ProfilePMCEvent
|
||||
from tinygrad.runtime.autogen import llvm, rocprof
|
||||
@@ -47,8 +47,9 @@ class WaveExec:
|
||||
insts:list[InstExec]
|
||||
|
||||
class _ROCParseCtx:
|
||||
def __init__(self, dev_evs:dict[str, ProfileDeviceEvent], sqtt_evs:list[ProfileSQTTEvent], prog_evs:list[ProfileProgramEvent]):
|
||||
self.dev_evs, self.sqtt_evs, self.prog_evs = dev_evs, iter(sqtt_evs), prog_evs
|
||||
def __init__(self, dev_evs:dict[str, ProfileDeviceEvent], exec_evs:dict[int, ProfileRangeEvent], sqtt_evs:list[ProfileSQTTEvent],
|
||||
prog_evs:list[ProfileProgramEvent]):
|
||||
self.dev_evs, self.exec_evs, self.sqtt_evs, self.prog_evs = dev_evs, exec_evs, iter(sqtt_evs), prog_evs
|
||||
self.disasms:dict[tuple[str, int], tuple[str, int]] = {}
|
||||
self.inst_execs:dict[str, list[WaveExec]] = {}
|
||||
|
||||
@@ -62,6 +63,7 @@ class _ROCParseCtx:
|
||||
self.active_kern = x.kern if x is not None else None
|
||||
self.active_se = x.se if x is not None else None
|
||||
self.active_blob = (ctypes.c_ubyte * len(x.blob)).from_buffer_copy(x.blob) if x is not None else None
|
||||
self.active_range = self.exec_evs[x.exec_tag] if x is not None else None
|
||||
return self.active_blob
|
||||
|
||||
def on_occupancy_ev(self, ev:rocprof.rocprofiler_thread_trace_decoder_occupancy_t):
|
||||
@@ -84,14 +86,16 @@ class _ROCParseCtx:
|
||||
|
||||
def decode(profile:list[ProfileEvent]) -> _ROCParseCtx:
|
||||
dev_events:dict[str, ProfileDeviceEvent] = {}
|
||||
exec_events:dict[int, ProfileRangeEvent] = {}
|
||||
sqtt_events:list[ProfileSQTTEvent] = []
|
||||
prog_events:list[ProfileProgramEvent] = []
|
||||
for e in profile:
|
||||
if isinstance(e, ProfileRangeEvent) and e.device.startswith("AMD"): exec_events[e.tag] = e
|
||||
if isinstance(e, ProfileDeviceEvent): dev_events[e.device] = e
|
||||
if isinstance(e, ProfileSQTTEvent): sqtt_events.append(e)
|
||||
if isinstance(e, ProfileProgramEvent) and e.device.startswith("AMD"): prog_events.append(e)
|
||||
|
||||
ROCParseCtx = _ROCParseCtx(dev_events, sqtt_events, prog_events)
|
||||
ROCParseCtx = _ROCParseCtx(dev_events, exec_events, sqtt_events, prog_events)
|
||||
|
||||
@rocprof.rocprof_trace_decoder_se_data_callback_t
|
||||
def copy_cb(buf, buf_size, data_ptr):
|
||||
|
||||
@@ -271,7 +271,7 @@ class ProfileEvent: pass
|
||||
|
||||
@dataclass
|
||||
class ProfileRangeEvent(ProfileEvent):
|
||||
device:str; name:str|TracingKey; st:decimal.Decimal; en:decimal.Decimal|None=None; is_copy:bool=False # noqa: E702
|
||||
device:str; name:str|TracingKey; st:decimal.Decimal; en:decimal.Decimal|None=None; is_copy:bool=False; tag:int|None=None # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfilePointEvent(ProfileEvent):
|
||||
|
||||
@@ -28,13 +28,13 @@ AQL_HDR = (1 << hsa.HSA_PACKET_HEADER_BARRIER) | (hsa.HSA_FENCE_SCOPE_SYSTEM <<
|
||||
| (hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE)
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileSQTTEvent(ProfileEvent): device:str; kern:str; se:int; blob:bytes; itrace:bool # noqa: E702
|
||||
class ProfileSQTTEvent(ProfileEvent): device:str; kern:str; se:int; blob:bytes; itrace:bool; exec_tag:int # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class PMCSample: name:str; block:str; xcc:int; inst:int; se:int; sa:int; wgp:int; off:int; size:int; regsample:str # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfilePMCEvent(ProfileEvent): device:str; kern:str; sched:list[PMCSample]; blob:bytes # noqa: E702
|
||||
class ProfilePMCEvent(ProfileEvent): device:str; kern:str; sched:list[PMCSample]; blob:bytes; exec_tag:int # noqa: E702
|
||||
|
||||
class AMDSignal(HCQSignal):
|
||||
def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 100})
|
||||
@@ -583,7 +583,7 @@ class AMDProgram(HCQProgram):
|
||||
cast(AMDComputeQueue, self.dev.hw_compute_queue_t()).pmc_read(self.dev.pmc_buffer, self.dev.pmc_sched) \
|
||||
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
|
||||
self.dev.allocator._copyout(pmc_buf:=memoryview(bytearray(self.dev.pmc_buffer.size)), self.dev.pmc_buffer)
|
||||
Compiled.profile_events += [ProfilePMCEvent(self.dev.device, self.name, self.dev.pmc_sched, bytes(pmc_buf))]
|
||||
Compiled.profile_events += [ProfilePMCEvent(self.dev.device, self.name, self.dev.pmc_sched, bytes(pmc_buf), self.dev.prof_exec_counter)]
|
||||
if self.dev.sqtt_enabled:
|
||||
cast(AMDComputeQueue, self.dev.hw_compute_queue_t()).sqtt_stop(self.dev.sqtt_wptrs) \
|
||||
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
|
||||
@@ -602,7 +602,8 @@ class AMDProgram(HCQProgram):
|
||||
|
||||
self.dev.allocator._copyout(sqtt_mv:=memoryview(bytearray(wptr)), buf)
|
||||
resbuf = (struct.pack('<Q', 0x11 | (4 << 13) | (0xf << 16) | (se << 24)) + bytes(sqtt_mv)) if self.dev.target[0] == 9 else bytes(sqtt_mv)
|
||||
Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, self.name, se, resbuf, bool((SQTT_ITRACE_SE_MASK.value >> se) & 1))]
|
||||
Compiled.profile_events += [ProfileSQTTEvent(self.dev.device, self.name, se, resbuf, bool((SQTT_ITRACE_SE_MASK.value >> se) & 1),
|
||||
self.dev.prof_exec_counter)]
|
||||
return res
|
||||
|
||||
class AMDAllocator(HCQAllocator['AMDDevice']):
|
||||
|
||||
@@ -3,7 +3,7 @@ from typing import cast, Callable, Type, TypeVar, Generic, Any, Sequence
|
||||
import contextlib, decimal, statistics, time, ctypes, array, os, struct, collections, functools
|
||||
try: import fcntl # windows misses that
|
||||
except ImportError: fcntl = None #type:ignore[assignment]
|
||||
from tinygrad.helpers import PROFILE, getenv, to_mv, ProfileRangeEvent, select_first_inited
|
||||
from tinygrad.helpers import PROFILE, getenv, to_mv, ProfileRangeEvent, select_first_inited, unwrap
|
||||
from tinygrad.device import BufferSpec, Compiled, LRUAllocator, ProfileDeviceEvent, ProfileProgramEvent, CompilerPairT
|
||||
from tinygrad.uop.ops import sym_infer, sint, UOp
|
||||
from tinygrad.runtime.autogen import libc
|
||||
@@ -262,7 +262,7 @@ class HCQSignal(Generic[HCQDeviceType]):
|
||||
if not_passed and self.value < value: raise RuntimeError(f"Wait timeout: {timeout} ms! (the signal is not set to {value}, but {self.value})")
|
||||
|
||||
@contextlib.contextmanager
|
||||
def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]|None=None, queue:HWQueue|None=None):
|
||||
def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]|None=None, queue:HWQueue|None=None, tag=None):
|
||||
st, en = (dev.new_signal(), dev.new_signal()) if enabled else (None, None)
|
||||
assert queue is not None or queue_type is not None, "Either queue or queue_type must be provided"
|
||||
|
||||
@@ -277,7 +277,7 @@ def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]
|
||||
queue_type().wait(dev.timeline_signal, dev.timeline_value - 1).timestamp(en).signal(dev.timeline_signal, dev.next_timeline()).submit(dev)
|
||||
|
||||
if enabled and PROFILE:
|
||||
dev.sig_prof_records.append((cast(HCQSignal, st), cast(HCQSignal, en), desc, (queue_type or type(queue)) is dev.hw_copy_queue_t))
|
||||
dev.prof_exec_recs.append((tag, unwrap(st), unwrap(en), desc, (queue_type or type(queue)) is dev.hw_copy_queue_t))
|
||||
|
||||
class HCQArgsState(Generic[ProgramType]):
|
||||
def __init__(self, buf:HCQBuffer, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=()):
|
||||
@@ -336,7 +336,8 @@ class HCQProgram(Generic[HCQDeviceType]):
|
||||
kernargs = self.fill_kernargs(bufs, vals)
|
||||
q = self.dev.hw_compute_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1).memory_barrier()
|
||||
|
||||
with hcq_profile(self.dev, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
|
||||
self.dev.prof_exec_counter += 1
|
||||
with hcq_profile(self.dev, queue=q, desc=self.name, enabled=wait or PROFILE, tag=self.dev.prof_exec_counter) as (sig_st, sig_en):
|
||||
q.exec(self, kernargs, global_size, local_size)
|
||||
|
||||
q.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
|
||||
@@ -371,7 +372,8 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
|
||||
self.timeline_value:int = 1
|
||||
self.timeline_signal, self._shadow_timeline_signal = self.new_signal(value=0, is_timeline=True), self.new_signal(value=0, is_timeline=True)
|
||||
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
|
||||
self.prof_exec_recs:list[tuple[int, HCQSignal, HCQSignal, str, bool]] = []
|
||||
self.prof_exec_counter:int = 0
|
||||
|
||||
self.kernargs_buf:HCQBuffer = self.allocator.alloc(kernargs_size, BufferSpec(cpu_access=True))
|
||||
self.kernargs_offset_allocator:BumpAllocator = BumpAllocator(self.kernargs_buf.size, wrap=True)
|
||||
@@ -395,8 +397,8 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
|
||||
if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
|
||||
if PROFILE:
|
||||
Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp) for st,en,name,cp in self.sig_prof_records]
|
||||
self.sig_prof_records = []
|
||||
Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp, t) for t,st,en,name,cp in self.prof_exec_recs]
|
||||
self.prof_exec_recs = []
|
||||
|
||||
def next_timeline(self):
|
||||
self.timeline_value += 1
|
||||
|
||||
Reference in New Issue
Block a user