diff --git a/test/null/test_viz.py b/test/null/test_viz.py index 7b322d1575..fb5ca51990 100644 --- a/test/null/test_viz.py +++ b/test/null/test_viz.py @@ -405,6 +405,16 @@ class TestVizProfiler(BaseTestViz): self.assertEqual(j["dur"], (event2["st"]+event2["dur"])-event["st"]) + def test_copy_node_bandwidth(self): + sz = 256*1024*1024 + dur = 10_000 + prof = [ProfileRangeEvent(device='NV:SDMA:0', name=TracingKey("NV -> NV:1", ret=sz), st=decimal.Decimal(1000), en=decimal.Decimal(1000+dur)), + ProfileDeviceEvent(device='NV:SDMA:0', tdiff=decimal.Decimal(-1000))] + j = load_profile(prof) + event = j['layout']['NV:SDMA:0']['events'][0] + gbs = sz/(dur*1e-6)*1e-9 + self.assertEqual(event['fmt'], f"{gbs:.0f} GB/s") + def test_graph(self): prof = [ProfileDeviceEvent(device='NV', tdiff=decimal.Decimal(-1000)), ProfileDeviceEvent(device='NV:1:SDMA:0', tdiff=decimal.Decimal(-50)), diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index aaa2bc886d..bbf3f8ff94 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -4,6 +4,7 @@ import contextlib, decimal, statistics, time, ctypes, array, os, struct, collect try: import fcntl # windows misses that except ImportError: fcntl = None #type:ignore[assignment] from tinygrad.helpers import PROFILE, getenv, to_mv, from_mv, cpu_profile, ProfileRangeEvent, select_first_inited, unwrap, suppress_finalizing +from tinygrad.helpers import TracingKey from tinygrad.device import BufferSpec, Compiled, LRUAllocator, ProfileDeviceEvent, ProfileProgramEvent, CompilerSet from tinygrad.uop.ops import sym_infer, sint, UOp from tinygrad.runtime.autogen import libc @@ -375,7 +376,7 @@ class HCQCompiled(Compiled, Generic[SignalType]): self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t self.timeline_value:int = 1 self.timeline_signal, self._shadow_timeline_signal = self.new_signal(value=0, is_timeline=True), self.new_signal(value=0, is_timeline=True) - self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, str]] = [] + self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str|TracingKey, str]] = [] self.prof_exec_counter:int = 0 self.prof_prg_counter:int = 0 @@ -519,7 +520,8 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]): with cpu_profile(f'TINY -> {self.dev.device}', f"{self.dev.device}:COPY"): ctypes.memmove(int(dest.va_addr), from_mv(src), len(src)) return - with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"TINY -> {self.dev.device}", enabled=PROFILE, dev_suff="SDMA:0"): + with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=TracingKey(f"TINY -> {self.dev.device}", ret=src.nbytes), enabled=PROFILE, + dev_suff="SDMA:0"): for i in range(0, src.nbytes, self.b[0].size): self.b_next = (self.b_next + 1) % len(self.b) self.dev.timeline_signal.wait(self.b_timeline[self.b_next]) @@ -540,7 +542,8 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]): return None assert self.dev.hw_copy_queue_t is not None - with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"DISK -> {self.dev.device}", enabled=PROFILE, dev_suff="SDMA:0"): + with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=TracingKey(f"DISK -> {self.dev.device}", ret=size), enabled=PROFILE, + dev_suff="SDMA:0"): for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size): self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \ .copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \ @@ -553,7 +556,8 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]): with cpu_profile(f'{self.dev.device} -> TINY', f"{self.dev.device}:COPY"): ctypes.memmove(from_mv(dest), int(src.va_addr), len(dest)) return - with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> TINY", enabled=PROFILE, dev_suff="SDMA:0"): + with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=TracingKey(f"{self.dev.device} -> TINY", ret=dest.nbytes), enabled=PROFILE, + dev_suff="SDMA:0"): for i in range(0, dest.nbytes, cp_size:=(self.max_copyout_size or self.b[0].size)): self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \ .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(cp_size, dest.nbytes-i)) \ @@ -565,7 +569,8 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]): cast(HCQAllocator, src_dev.allocator).map(dest) assert src_dev.hw_copy_queue_t is not None - with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=f"{src_dev.device} -> {dest_dev.device}", enabled=PROFILE, dev_suff="SDMA:0"): + with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=TracingKey(f"{src_dev.device} -> {dest_dev.device}", ret=sz), enabled=PROFILE, + dev_suff="SDMA:0"): src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \ .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \ .copy(dest.va_addr, src.va_addr, sz) \ diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py index 725b6733ba..26c7978279 100755 --- a/tinygrad/viz/serve.py +++ b/tinygrad/viz/serve.py @@ -209,6 +209,9 @@ def timeline_layout(dev_events:list[tuple[int, int, float, DevEvent]], start_ts: name = e.name.display_name ref = next((v for k in e.name.keys if (v:=ref_map.get(k)) is not None), None) if isinstance(e.name.ret, str): fmt.append(e.name.ret) + elif isinstance(e.name.ret, int): + membw = e.name.ret / (dur * 1e-6) + fmt.append(f"{membw*1e-9:.0f} GB/s" if membw < 1e13 else f"{membw*1e-12:.0f} TB/s") events.append(struct.pack("