mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
@@ -405,6 +405,16 @@ class TestVizProfiler(BaseTestViz):
|
||||
|
||||
self.assertEqual(j["dur"], (event2["st"]+event2["dur"])-event["st"])
|
||||
|
||||
def test_copy_node_bandwidth(self):
|
||||
sz = 256*1024*1024
|
||||
dur = 10_000
|
||||
prof = [ProfileRangeEvent(device='NV:SDMA:0', name=TracingKey("NV -> NV:1", ret=sz), st=decimal.Decimal(1000), en=decimal.Decimal(1000+dur)),
|
||||
ProfileDeviceEvent(device='NV:SDMA:0', tdiff=decimal.Decimal(-1000))]
|
||||
j = load_profile(prof)
|
||||
event = j['layout']['NV:SDMA:0']['events'][0]
|
||||
gbs = sz/(dur*1e-6)*1e-9
|
||||
self.assertEqual(event['fmt'], f"{gbs:.0f} GB/s")
|
||||
|
||||
def test_graph(self):
|
||||
prof = [ProfileDeviceEvent(device='NV', tdiff=decimal.Decimal(-1000)),
|
||||
ProfileDeviceEvent(device='NV:1:SDMA:0', tdiff=decimal.Decimal(-50)),
|
||||
|
||||
@@ -4,6 +4,7 @@ import contextlib, decimal, statistics, time, ctypes, array, os, struct, collect
|
||||
try: import fcntl # windows misses that
|
||||
except ImportError: fcntl = None #type:ignore[assignment]
|
||||
from tinygrad.helpers import PROFILE, getenv, to_mv, from_mv, cpu_profile, ProfileRangeEvent, select_first_inited, unwrap, suppress_finalizing
|
||||
from tinygrad.helpers import TracingKey
|
||||
from tinygrad.device import BufferSpec, Compiled, LRUAllocator, ProfileDeviceEvent, ProfileProgramEvent, CompilerSet
|
||||
from tinygrad.uop.ops import sym_infer, sint, UOp
|
||||
from tinygrad.runtime.autogen import libc
|
||||
@@ -375,7 +376,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
|
||||
self.timeline_value:int = 1
|
||||
self.timeline_signal, self._shadow_timeline_signal = self.new_signal(value=0, is_timeline=True), self.new_signal(value=0, is_timeline=True)
|
||||
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, str]] = []
|
||||
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str|TracingKey, str]] = []
|
||||
self.prof_exec_counter:int = 0
|
||||
self.prof_prg_counter:int = 0
|
||||
|
||||
@@ -519,7 +520,8 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
||||
with cpu_profile(f'TINY -> {self.dev.device}', f"{self.dev.device}:COPY"): ctypes.memmove(int(dest.va_addr), from_mv(src), len(src))
|
||||
return
|
||||
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"TINY -> {self.dev.device}", enabled=PROFILE, dev_suff="SDMA:0"):
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=TracingKey(f"TINY -> {self.dev.device}", ret=src.nbytes), enabled=PROFILE,
|
||||
dev_suff="SDMA:0"):
|
||||
for i in range(0, src.nbytes, self.b[0].size):
|
||||
self.b_next = (self.b_next + 1) % len(self.b)
|
||||
self.dev.timeline_signal.wait(self.b_timeline[self.b_next])
|
||||
@@ -540,7 +542,8 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
||||
return None
|
||||
|
||||
assert self.dev.hw_copy_queue_t is not None
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"DISK -> {self.dev.device}", enabled=PROFILE, dev_suff="SDMA:0"):
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=TracingKey(f"DISK -> {self.dev.device}", ret=size), enabled=PROFILE,
|
||||
dev_suff="SDMA:0"):
|
||||
for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size):
|
||||
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
|
||||
.copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \
|
||||
@@ -553,7 +556,8 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
||||
with cpu_profile(f'{self.dev.device} -> TINY', f"{self.dev.device}:COPY"): ctypes.memmove(from_mv(dest), int(src.va_addr), len(dest))
|
||||
return
|
||||
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> TINY", enabled=PROFILE, dev_suff="SDMA:0"):
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=TracingKey(f"{self.dev.device} -> TINY", ret=dest.nbytes), enabled=PROFILE,
|
||||
dev_suff="SDMA:0"):
|
||||
for i in range(0, dest.nbytes, cp_size:=(self.max_copyout_size or self.b[0].size)):
|
||||
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
|
||||
.copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(cp_size, dest.nbytes-i)) \
|
||||
@@ -565,7 +569,8 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
||||
cast(HCQAllocator, src_dev.allocator).map(dest)
|
||||
|
||||
assert src_dev.hw_copy_queue_t is not None
|
||||
with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=f"{src_dev.device} -> {dest_dev.device}", enabled=PROFILE, dev_suff="SDMA:0"):
|
||||
with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=TracingKey(f"{src_dev.device} -> {dest_dev.device}", ret=sz), enabled=PROFILE,
|
||||
dev_suff="SDMA:0"):
|
||||
src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
|
||||
.wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
|
||||
.copy(dest.va_addr, src.va_addr, sz) \
|
||||
|
||||
@@ -209,6 +209,9 @@ def timeline_layout(dev_events:list[tuple[int, int, float, DevEvent]], start_ts:
|
||||
name = e.name.display_name
|
||||
ref = next((v for k in e.name.keys if (v:=ref_map.get(k)) is not None), None)
|
||||
if isinstance(e.name.ret, str): fmt.append(e.name.ret)
|
||||
elif isinstance(e.name.ret, int):
|
||||
membw = e.name.ret / (dur * 1e-6)
|
||||
fmt.append(f"{membw*1e-9:.0f} GB/s" if membw < 1e13 else f"{membw*1e-12:.0f} TB/s")
|
||||
events.append(struct.pack("<IIIIfI", enum_str(name, scache), option(ref), option(key), rel_ts(st,start_ts), dur, enum_str("\n".join(fmt),scache)))
|
||||
return struct.pack("<BI", 0, len(events))+b"".join(events) if events else None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user