viz profiler (#8287)
* only hcq
* fix get_metadata
* linter
* oops
* tiny
* linter
* time
* print pm
* hmm
* nits
tinygrad/runtime/graph/hcq.py:

@@ -1,8 +1,8 @@
 import collections, time
 from typing import List, Any, Dict, cast, Optional, Tuple, Set
-from tinygrad.helpers import round_up, PROFILE, memsize_to_str
+from tinygrad.helpers import round_up, PROFILE
 from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQSignal, HCQBuffer, HWQueue, HCQArgsState, BumpAllocator
-from tinygrad.device import Buffer, BufferSpec, Compiled, Device
+from tinygrad.device import Buffer, BufferSpec, Compiled, Device, ProfileGraphEntry, ProfileGraphEvent
 from tinygrad.dtype import dtypes
 from tinygrad.ops import UOp, Variable
 from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
@@ -51,8 +51,11 @@ class HCQGraph(MultiGraphRunner):
     self.kickoff_value: int = 0
     self.kickoff_var = UOp.variable("kickoff_var", 0, 0xffffffff, dtype=dtypes.uint32)

+    # When profiling, allocate 2 signals for each jit item to measure speed. The jth jit item has signals at 2*j and 2*j+1.
+    # TODO: This logic might allocate a few extra signals...
     self.prof_signals: List[HCQSignal] = [self.devices[0].signal_t() for i in range(len(jit_cache) * 2)] if PROFILE else []
-    self.prof_records: List[Tuple[Tuple[int, bool], Tuple[int, bool], HCQCompiled, str, bool, List[int], Optional[Dict]]] = []
+    self.prog_graph_deps: List[List[int]] = []
+    self.prof_graph_entries: List[ProfileGraphEntry] = []

     last_j: Dict[HWQueue, Optional[int]] = collections.defaultdict(lambda: None)
     queue_access: Dict[HWQueue, Dict[HWQueue, Optional[int]]] = collections.defaultdict(lambda: collections.defaultdict(lambda: None))
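A minimal sketch of the signal-indexing scheme the comment above describes (all names below are made up for illustration):

```python
# jit item j gets its start timestamp at signal 2*j and its end timestamp at 2*j+1
n_jit_items = 3  # hypothetical
signals = [f"signal_{i}" for i in range(n_jit_items * 2)]

for j in range(n_jit_items):
    st, en = 2 * j, 2 * j + 1  # start/end timestamp slots for item j
    print(f"jit item {j}: start={signals[st]}, end={signals[en]}")
```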
@@ -102,18 +105,20 @@
       # Collect profile information if profiling is enabled.
       if PROFILE:
+        # When executions are chained, we can reuse the end timestamp from the previous command as the start timestamp for the current command.
+        sig_st = prev_ji * 2 + 1 if len(opt_deps) == 0 and (prev_ji:=last_j[enqueue_queue]) is not None else j * 2
+
         # Description based on the command.
         prof_ji_desc = ji.prg._prg.name if is_exec_prg else f"{ji.bufs[1].device} -> {ji.bufs[0].device}" # type: ignore

-        sig_st, sig_en = (j * 2, True), (j * 2 + 1, True)
-        if len(opt_deps) == 0 and (prev_ji:=last_j[enqueue_queue]) is not None: sig_st = (prev_ji * 2 + 1, False)
-
-        if is_exec_prg: prof_args = None
-        else: prof_args = {"Size": memsize_to_str(ji.bufs[0].nbytes), "GB/S": lambda dur, b=ji.bufs[0].nbytes: f"{b/1e3/dur:.2f}"} # type: ignore
-
-        self.prof_records.append((sig_st, sig_en, enqueue_dev, prof_ji_desc, not is_exec_prg, [d - 1 for _, d in rdeps], prof_args))
+        self.prof_graph_entries.append(ProfileGraphEntry(enqueue_dev.device, prof_ji_desc, sig_st, j * 2 + 1, is_copy=not is_exec_prg))
+        self.prog_graph_deps.append([d - 1 for _, d in rdeps])

       last_j[enqueue_queue] = j

+    # Check which signals are used in the profile graph.
+    self.prof_signal_is_used = [any(ent.st_id == j or ent.en_id == j for ent in self.prof_graph_entries) for j in range(len(self.prof_signals))]
+
     # Build hardware queues.
     self.copy_to_devs: Dict[HCQCompiled, Set[HCQCompiled]] = {dev: set() for dev in self.devices}
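A toy model of the timestamp-chaining rule above, keeping the diff's `last_j` bookkeeping (the `start_slot` helper and queue name are hypothetical): when a command has no extra dependencies, its start can alias the previous command's end slot on the same queue, saving one timestamp write.

```python
last_j = {"q0": None}  # last jit item enqueued per queue, as in the diff

def start_slot(j: int, queue: str, has_deps: bool) -> int:
    prev_ji = last_j[queue]
    # mirrors: sig_st = prev_ji*2+1 if no deps and a predecessor exists else j*2
    sig_st = prev_ji * 2 + 1 if not has_deps and prev_ji is not None else j * 2
    last_j[queue] = j
    return sig_st

print(start_slot(0, "q0", has_deps=False))  # 0: first item uses its own slot
print(start_slot(1, "q0", has_deps=False))  # 1: reuses item 0's end slot (0*2+1)
print(start_slot(2, "q0", has_deps=True))   # 4: a dependency breaks the chain
```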
@@ -132,7 +137,7 @@ class HCQGraph(MultiGraphRunner):
       for sig, val in sync_signals + deps: enqueue_queue.wait(sig, val)

       # Encode waits and start profile timestamp (if needed).
-      if PROFILE and self.prof_records[j][0][1]: enqueue_queue.timestamp(self.prof_signals[self.prof_records[j][0][0]])
+      if PROFILE and self.prof_signal_is_used[j * 2]: enqueue_queue.timestamp(self.prof_signals[j * 2])

       # Encode main commands based on ji type.
       if isinstance(ji.prg, CompiledRunner):
@@ -145,7 +150,7 @@ class HCQGraph(MultiGraphRunner):
         self.copy_to_devs[cast(HCQCompiled, Device[dest.device])].add(cast(HCQCompiled, Device[src.device]))

       # Encode finish profile timestamp (if needed).
-      if PROFILE and self.prof_records[j][1][1]: enqueue_queue.timestamp(self.prof_signals[self.prof_records[j][1][0]])
+      if PROFILE and self.prof_signal_is_used[j * 2 + 1]: enqueue_queue.timestamp(self.prof_signals[j * 2 + 1])

       if signal_val is not None: enqueue_queue.signal(signal, signal_val)
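Both timestamp sites are now gated on the precomputed `prof_signal_is_used`, so slots that were aliased away cost nothing at enqueue time. A tiny illustration (values are made up):

```python
# slot 2 was aliased to slot 1 by chaining, so no graph entry references it
prof_signal_is_used = [True, True, False, True]

j = 1
if prof_signal_is_used[j * 2]: print("encode start timestamp")    # slot 2: skipped
if prof_signal_is_used[j * 2 + 1]: print("encode end timestamp")  # slot 3: encoded
```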
@@ -189,14 +194,8 @@ class HCQGraph(MultiGraphRunner):
     return None

   def collect_timestamps(self):
-    timestamps = [s.timestamp for s in self.prof_signals]
-
-    for (st,_), (en,_), dev, desc, is_cp, deps, args in self.prof_records:
-      dev.raw_prof_records += [(timestamps[st], timestamps[en], desc, is_cp, args)]
-
-      for x in deps:
-        (b_st,_), (b_en,_), b_dev, _, b_is_cp, _, _ = self.prof_records[x]
-        dev.dep_prof_records += [(timestamps[b_st], timestamps[b_en], b_dev, b_is_cp, timestamps[st], timestamps[en], dev, is_cp)]
+    # NOTE: Appending to any device is fine...
+    self.devices[0].profile_events += [ProfileGraphEvent(self.prof_graph_entries, self.prog_graph_deps, [s.timestamp for s in self.prof_signals])]

   def __del__(self):
     for dev in self.devices: self.last_timeline[dev][0].wait(self.last_timeline[dev][1])
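A stand-in sketch of what `collect_timestamps` now emits. The dataclass definitions below are illustrative, not tinygrad's actual declarations; field names follow their usage in the diff (`st_id`/`en_id`, the entry list, per-entry dependency lists, and raw signal timestamps).

```python
from dataclasses import dataclass
from typing import List

@dataclass
class ProfileGraphEntry:  # hypothetical stand-in
    device: str; name: str; st_id: int; en_id: int; is_copy: bool

@dataclass
class ProfileGraphEvent:  # hypothetical stand-in
    ents: List[ProfileGraphEntry]; deps: List[List[int]]; sigs: List[float]

ents = [ProfileGraphEntry("NV:0", "kernel_0", 0, 1, is_copy=False),
        ProfileGraphEntry("NV:0", "kernel_1", 1, 3, is_copy=False)]  # chained: st_id aliases entry 0's en_id
ev = ProfileGraphEvent(ents, deps=[[], [0]], sigs=[10.0, 12.5, 0.0, 20.0])  # slot 2 never timestamped
```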
tinygrad/runtime/support/hcq.py:

@@ -1,9 +1,9 @@
 from __future__ import annotations
-from typing import List, Optional, Dict, Tuple, cast, Type, Union, TypeVar, Generic, Any
-import contextlib, decimal, statistics, random, json, atexit, time, ctypes, array
-from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv, round_up
+from typing import List, Optional, Dict, Tuple, cast, Type, TypeVar, Generic, Any
+import contextlib, decimal, statistics, time, ctypes, array
+from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
 from tinygrad.renderer import Renderer
-from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator
+from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator, ProfileRangeEvent, ProfileDeviceEvent
 from tinygrad.ops import sym_infer, sint, Variable

 # **************** for HCQ Compatible Devices ****************
@@ -294,51 +294,11 @@ class HCQProgram(Generic[DeviceType]):
     if wait: self.dev.synchronize()
     return (float(sig_en.timestamp - sig_st.timestamp) / 1e6) if wait else None

-class ProfileLogger:
-  writers: int = 0
-  mjson: List[Dict] = []
-  actors: Dict[Union[str, Tuple[str, str]], int] = {}
-
-  def __init__(self): self.events, self.deps, ProfileLogger.writers = [], [], ProfileLogger.writers + 1
-
-  def add_event(self, ev_name, ev_start, ev_end, actor, subactor=None, args=None): self.events += [(ev_name, ev_start, ev_end, actor, subactor, args)]
-
-  def _ensure_actor(self, actor_name, subactor_name):
-    if actor_name not in self.actors:
-      self.actors[actor_name] = (pid:=len(self.actors))
-      self.mjson.append({"name": "process_name", "ph": "M", "pid": pid, "args": {"name": actor_name}})
-
-    if (subactor_key:=(actor_name,subactor_name)) not in self.actors:
-      self.actors[subactor_key] = (tid:=len(self.actors))
-      self.mjson.append({"name": "thread_name", "ph": "M", "pid": self.actors[actor_name], "tid":tid, "args": {"name": subactor_name}})
-
-    return self.actors[actor_name], self.actors.get(subactor_key, -1)
-
-  def __del__(self):
-    # perfetto json docs: https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
-    for name, st, et, actor_name, subactor_name, args in self.events:
-      pid, tid = self._ensure_actor(actor_name,subactor_name)
-      args = {k: (v if v.__class__ is str else v(et-st)) for k, v in args.items()} if args is not None else None
-      self.mjson.append({"name": name, "ph": "X", "pid": pid, "tid": tid, "ts": st, "dur": et-st, "args": args})
-
-    for en,st,dep_actor_name,dep_subactor_name,actor_name,subactor_name in self.deps:
-      dep_pid, dep_tid = self._ensure_actor(dep_actor_name,dep_subactor_name)
-      pid, tid = self._ensure_actor(actor_name,subactor_name)
-      self.mjson.append({"ph": "s", "pid": dep_pid, "tid": dep_tid, "id": len(self.mjson), "ts": en, "bp": "e"})
-      self.mjson.append({"ph": "f", "pid": pid, "tid": tid, "id": len(self.mjson)-1, "ts": st, "bp": "e"})
-
-    ProfileLogger.writers -= 1
-    if ProfileLogger.writers == 0 and len(self.mjson) > 0:
-      with open(PROFILEPATH.value, "w") as f: f.write(json.dumps({"traceEvents": self.mjson}))
-      print(f"Saved profile to {PROFILEPATH.value}. Use https://ui.perfetto.dev/ to open it.")
-
 class HCQCompiled(Compiled, Generic[SignalType]):
   """
   A base class for devices compatible with the HCQ (Hardware Command Queue) API.
   """
   devices: List[HCQCompiled] = []
-  gpu2cpu_copy_time_diff: decimal.Decimal = decimal.Decimal('nan')
-  gpu2cpu_compute_time_diff: decimal.Decimal = decimal.Decimal('nan')

   def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType],
                comp_queue_t:Type[HWQueue], copy_queue_t:Optional[Type[HWQueue]]):
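The deleted `ProfileLogger` wrote Chrome/Perfetto trace-event JSON directly (see the doc link in the removed `__del__`). For reference, a sketch of the output it produced, with made-up values:

```python
import json

mjson = [
  {"name": "process_name", "ph": "M", "pid": 0, "args": {"name": "NV:0"}},    # "M": actor metadata
  {"name": "thread_name", "ph": "M", "pid": 0, "tid": 1, "args": {"name": "COMPUTE"}},
  {"name": "kernel_0", "ph": "X", "pid": 0, "tid": 1, "ts": 100, "dur": 25},  # "X": complete event
  {"ph": "s", "pid": 0, "tid": 1, "id": 3, "ts": 125, "bp": "e"},             # "s"/"f": a flow (dependency) arrow
  {"ph": "f", "pid": 0, "tid": 1, "id": 3, "ts": 130, "bp": "e"},
]
print(json.dumps({"traceEvents": mjson}))
```

After this commit, devices append structured events to `Compiled.profile_events` instead, leaving rendering to the viz tooling.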
@@ -350,7 +310,6 @@ class HCQCompiled(Compiled, Generic[SignalType]):
     self.sig_prof_records:List[Tuple[HCQSignal, HCQSignal, str, bool]] = []
-    self.raw_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[Dict]]] = []
-    self.dep_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = []
-    if PROFILE: self._prof_setup()

     from tinygrad.runtime.graph.hcq import HCQGraph
     super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
@@ -367,13 +326,11 @@ class HCQCompiled(Compiled, Generic[SignalType]):

     if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
     if PROFILE:
-      self.raw_prof_records += [(st.timestamp, en.timestamp, name, is_cp, None) for st, en, name, is_cp in self.sig_prof_records]
+      Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp) for st,en,name,cp in self.sig_prof_records]
       self.sig_prof_records = []

-  def _ensure_shared_time_base(self):
-    if not self.gpu2cpu_compute_time_diff.is_nan(): return
-
-    def _sync_cpu_queue(d:HCQCompiled, q_t:Type[HWQueue]):
+  def _at_profile_finalize(self):
+    def _sync(d:HCQCompiled, q_t:Type[HWQueue]):
       q_t().timestamp(d.timeline_signal).signal(d.timeline_signal, d.timeline_value).submit(d)
       d.timeline_value += 1
       st = time.perf_counter_ns()
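A stand-in sketch of the flush above: on synchronize, buffered signal pairs become `ProfileRangeEvent`s on the shared event list. The dataclass below is a hypothetical stand-in; its fields follow the call site's argument order.

```python
from dataclasses import dataclass
import decimal

@dataclass
class ProfileRangeEvent:  # hypothetical stand-in
    device: str; name: str; st: decimal.Decimal; en: decimal.Decimal; is_copy: bool

profile_events: list = []  # stands in for Compiled.profile_events
sig_prof_records = [(decimal.Decimal(100), decimal.Decimal(125), "kernel_0", False)]

profile_events += [ProfileRangeEvent("NV:0", name, st, en, cp) for st, en, name, cp in sig_prof_records]
sig_prof_records = []  # drained after the flush, as in the diff
```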
@@ -381,65 +338,10 @@ class HCQCompiled(Compiled, Generic[SignalType]):
       et = time.perf_counter_ns()
       return (decimal.Decimal(et+st) / 2000) - d.timeline_signal.timestamp

-    # randomly sample the timing from GPU to CPU
-    choices: List = [(d, d.hw_compute_queue_t, []) for d in self.devices]
-    choices += [(d, d.hw_copy_queue_t, []) for d in self.devices if d.hw_copy_queue_t is not None]
-    for _ in range(100*len(self.devices)):
-      d,q,l = random.choice(choices)
-      l.append(_sync_cpu_queue(d,q))
-    for d,q,l in choices:
-      if q == d.hw_compute_queue_t: d.gpu2cpu_compute_time_diff = statistics.median(l)
-      if q == d.hw_copy_queue_t: d.gpu2cpu_copy_time_diff = statistics.median(l)
-
-    def _sync_gpu_to_gpu_queue(d1:HCQCompiled, d2:HCQCompiled, q1_t:Type[HWQueue], q2_t:Type[HWQueue]):
-      q1_t().signal(d1.timeline_signal, d1.timeline_value).wait(d2.timeline_signal, d2.timeline_value) \
-            .timestamp(d1.timeline_signal).signal(d1.timeline_signal, d1.timeline_value+1).submit(d1)
-      q2_t().signal(d2.timeline_signal, d2.timeline_value).wait(d1.timeline_signal, d1.timeline_value) \
-            .timestamp(d2.timeline_signal).signal(d2.timeline_signal, d2.timeline_value+1).submit(d2)
-      d1.timeline_value += 2
-      d2.timeline_value += 2
-      d1.timeline_signal.wait(d1.timeline_value - 1)
-      d2.timeline_signal.wait(d2.timeline_value - 1)
-      return d2.timeline_signal.timestamp - d1.timeline_signal.timestamp
-
-    # then test it by timing the GPU to GPU times
-    jitter_matrix = [[float('nan')]*len(self.devices) for _ in range(len(self.devices))]
-    for i1, d1 in enumerate(self.devices):
-      for i2, d2 in enumerate(self.devices):
-        if d1 == d2: continue
-        d1_to_d2 = statistics.median(_sync_gpu_to_gpu_queue(d1, d2, d1.hw_compute_queue_t, d2.hw_compute_queue_t) - \
-                                     _sync_gpu_to_gpu_queue(d2, d1, d2.hw_compute_queue_t, d1.hw_compute_queue_t) for _ in range(20)) / 2
-        jitter_matrix[i1][i2] = d1_to_d2 - (d1.gpu2cpu_compute_time_diff - d2.gpu2cpu_compute_time_diff)
-    print("pairwise clock jitter matrix (us):\n" + '\n'.join([''.join([f'{float(item):8.3f}' for item in row]) for row in jitter_matrix]))
-
-  def _gpu2cpu_time(self, gpu_time:decimal.Decimal, is_copy:bool) -> float:
-    """
-    Translates local gpu time (timestamp) into global cpu time.
-    """
-    self._ensure_shared_time_base()
-    return float(gpu_time + (self.gpu2cpu_copy_time_diff if is_copy else self.gpu2cpu_compute_time_diff))
-
-  def _prof_setup(self):
-    if hasattr(self, 'profile_logger'): return
-    atexit.register(self._prof_finalize)
-    self.profile_logger = ProfileLogger()
-
-  def _prof_finalize(self):
-    qname = ["COMPUTE", "DMA"]
-
-    # Sync to be sure all events on the device are recorded.
-    self.synchronize()
-
-    for st, en, name, is_cp, args in self.raw_prof_records:
-      self.profile_logger.events += [(name, self._gpu2cpu_time(st, is_cp), self._gpu2cpu_time(en, is_cp), self.device, qname[is_cp], args)]
-    for a_st, a_en, a_dev, a_is_copy, b_st, b_en, b_dev, b_is_copy in self.dep_prof_records:
-      # Perfetto connects nodes based on timing data, ensuring every choice is valid by averaging times to a midpoint.
-      a_tm, b_tm = a_dev._gpu2cpu_time((a_st+a_en)/decimal.Decimal(2), a_is_copy), b_dev._gpu2cpu_time((b_st+b_en)/decimal.Decimal(2), b_is_copy)
-      self.profile_logger.deps += [(a_tm, b_tm, a_dev.device, qname[a_is_copy], b_dev.device, qname[b_is_copy])]
-    self.raw_prof_records, self.dep_prof_records = [], []
-
-    # Remove the logger, this flushes all data written by the device.
-    del self.profile_logger
+    gpu2cpu_compute_time_diff = statistics.median([_sync(self, self.hw_compute_queue_t) for _ in range(40)])
+    if self.hw_copy_queue_t is None: gpu2cpu_copy_time_diff = decimal.Decimal(0)
+    else: gpu2cpu_copy_time_diff = statistics.median([_sync(self, self.hw_copy_queue_t) for _ in range(40)])
+    Compiled.profile_events += [ProfileDeviceEvent(self.device, gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff)]

   def _wrap_timeline_signal(self):
     self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1
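The replacement `_at_profile_finalize` keeps only the CPU-to-GPU offset estimation: sample `(cpu_before + cpu_after)/2 - gpu_timestamp` repeatedly and take the median (now a fixed 40 samples per queue, replacing the random 100-per-device sampling and the GPU-to-GPU jitter matrix). A self-contained sketch of the idea with a faked device clock:

```python
import decimal, random, statistics, time

def _sync_once(gpu_epoch_offset_us: decimal.Decimal) -> decimal.Decimal:
    st = time.perf_counter_ns()
    # pretend the device timestamps "now" on its own clock (microseconds, skewed epoch)
    gpu_ts = decimal.Decimal(time.perf_counter_ns()) / 1000 - gpu_epoch_offset_us
    gpu_ts += decimal.Decimal(random.uniform(-2, 2))  # measurement jitter
    et = time.perf_counter_ns()
    return (decimal.Decimal(et + st) / 2000) - gpu_ts  # CPU midpoint in us, minus GPU time

true_offset = decimal.Decimal(123456)
diff = statistics.median([_sync_once(true_offset) for _ in range(40)])
print(f"estimated gpu->cpu offset: {diff:.1f} us (true: {true_offset} us)")
```

The median keeps the estimate robust to occasional slow round trips; `ProfileDeviceEvent` then carries one compute and one copy offset per device so the viewer can place every timeline on the CPU clock.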