diff --git a/extra/perfetto/to_perfetto.py b/extra/perfetto/to_perfetto.py index f12d1c494c..f5ff58050b 100644 --- a/extra/perfetto/to_perfetto.py +++ b/extra/perfetto/to_perfetto.py @@ -2,27 +2,26 @@ import sys, pickle, decimal, json from tinygrad.device import ProfileDeviceEvent, ProfileGraphEvent from tinygrad.helpers import tqdm, temp, ProfileEvent, ProfileRangeEvent, TracingKey -devices:dict[str, tuple[decimal.Decimal, decimal.Decimal, int]] = {} -def prep_ts(device:str, ts:decimal.Decimal, is_copy): return int(decimal.Decimal(ts) + devices[device][is_copy]) -def dev_to_pid(device:str, is_copy=False): return {"pid": devices[device][2], "tid": int(is_copy)} +devices:dict[str, tuple[decimal.Decimal, int]] = {} +def prep_ts(device:str, ts:decimal.Decimal): return int(decimal.Decimal(ts) + devices[device][0]) +def dev_to_pid(device:str): return {"pid": devices[device][1], "tid": 0} def dev_ev_to_perfetto_json(ev:ProfileDeviceEvent): - devices[ev.device] = (ev.comp_tdiff, ev.copy_tdiff if ev.copy_tdiff is not None else ev.comp_tdiff, len(devices)) + devices[ev.device] = (ev.tdiff, len(devices)) return [{"name": "process_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "args": {"name": ev.device}}, - {"name": "thread_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "tid": 0, "args": {"name": "COMPUTE"}}, - {"name": "thread_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "tid": 1, "args": {"name": "COPY"}}] + {"name": "thread_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "tid": 0, "args": {"name": ev.device}}] def range_ev_to_perfetto_json(ev:ProfileRangeEvent): name = ev.name.display_name if isinstance(ev.name, TracingKey) else ev.name - return [{"name": name, "ph": "X", "ts": prep_ts(ev.device, ev.st, ev.is_copy), "dur": float(ev.en-ev.st), **dev_to_pid(ev.device, ev.is_copy)}] + return [{"name": name, "ph": "X", "ts": prep_ts(ev.device, ev.st), "dur": float(ev.en-ev.st), **dev_to_pid(ev.device)}] def graph_ev_to_perfetto_json(ev:ProfileGraphEvent, reccnt): ret = [] for i,e in enumerate(ev.ents): st, en = ev.sigs[e.st_id], ev.sigs[e.en_id] name = e.name.display_name if isinstance(e.name, TracingKey) else e.name - ret += [{"name": name, "ph": "X", "ts": prep_ts(e.device, st, e.is_copy), "dur": float(en-st), **dev_to_pid(e.device, e.is_copy)}] + ret += [{"name": name, "ph": "X", "ts": prep_ts(e.device, st), "dur": float(en-st), **dev_to_pid(e.device)}] for dep in ev.deps[i]: d = ev.ents[dep] - ret += [{"ph": "s", **dev_to_pid(d.device, d.is_copy), "id": reccnt+len(ret), "ts": prep_ts(d.device, ev.sigs[d.en_id], d.is_copy), "bp": "e"}] - ret += [{"ph": "f", **dev_to_pid(e.device, e.is_copy), "id": reccnt+len(ret)-1, "ts": prep_ts(e.device, st, e.is_copy), "bp": "e"}] + ret += [{"ph": "s", **dev_to_pid(d.device), "id": reccnt+len(ret), "ts": prep_ts(d.device, ev.sigs[d.en_id]), "bp": "e"}] + ret += [{"ph": "f", **dev_to_pid(e.device), "id": reccnt+len(ret)-1, "ts": prep_ts(e.device, st), "bp": "e"}] return ret def to_perfetto(profile:list[ProfileEvent]): # Start json with devices. diff --git a/extra/sqtt/rgptool.py b/extra/sqtt/rgptool.py index 84c5a3094d..0ce150a0f3 100755 --- a/extra/sqtt/rgptool.py +++ b/extra/sqtt/rgptool.py @@ -145,7 +145,8 @@ class RGP: @staticmethod def from_profile(profile_pickled, device:str|None=None): profile: list[ProfileEvent] = pickle.loads(profile_pickled) - device_events = {x.device:x for x in profile if isinstance(x, ProfileDeviceEvent) and x.device.startswith('AMD')} + def _is_base_dev(d): return all(p.isdigit() for p in d.split(":")[1:]) + device_events = {x.device:x for x in profile if isinstance(x, ProfileDeviceEvent) and x.device.startswith('AMD') and _is_base_dev(x.device)} if device is None: if len(device_events) == 0: raise RuntimeError('No supported devices found in profile') if len(device_events) > 1: raise RuntimeError(f"More than one supported device found, select which one to export: {', '.join(device_events.keys())}") diff --git a/test/null/test_viz.py b/test/null/test_viz.py index bc1a57828e..9b41dd1577 100644 --- a/test/null/test_viz.py +++ b/test/null/test_viz.py @@ -365,8 +365,8 @@ def load_profile(lst:list[ProfileEvent]) -> dict: class TestVizProfiler(BaseTestViz): def test_node(self): - prof = [ProfileRangeEvent(device='NV', name='E_2', st=decimal.Decimal(1000), en=decimal.Decimal(1010), is_copy=False), - ProfileDeviceEvent(device='NV', comp_tdiff=decimal.Decimal(-1000), copy_tdiff=decimal.Decimal(-100))] + prof = [ProfileRangeEvent(device='NV', name='E_2', st=decimal.Decimal(1000), en=decimal.Decimal(1010)), + ProfileDeviceEvent(device='NV', tdiff=decimal.Decimal(-1000))] j = load_profile(prof) @@ -379,28 +379,28 @@ class TestVizProfiler(BaseTestViz): assert event['ref'] is None def test_copy_node(self): - prof = [ProfileRangeEvent(device='NV', name='COPYxx', st=decimal.Decimal(1000), en=decimal.Decimal(1010), is_copy=True), - ProfileRangeEvent(device='NV:2', name='COPYxx', st=decimal.Decimal(1000), en=decimal.Decimal(1010), is_copy=True), - ProfileDeviceEvent(device='NV', comp_tdiff=decimal.Decimal(-1000), copy_tdiff=decimal.Decimal(-100)), - ProfileDeviceEvent(device='NV:2', comp_tdiff=decimal.Decimal(-800), copy_tdiff=decimal.Decimal(-80))] + prof = [ProfileRangeEvent(device='NV:SDMA:0', name='COPYxx', st=decimal.Decimal(1000), en=decimal.Decimal(1010)), + ProfileRangeEvent(device='NV:2:SDMA:0', name='COPYxx', st=decimal.Decimal(1000), en=decimal.Decimal(1010)), + ProfileDeviceEvent(device='NV:SDMA:0', tdiff=decimal.Decimal(-100)), + ProfileDeviceEvent(device='NV:2:SDMA:0', tdiff=decimal.Decimal(-80))] j = load_profile(prof) - event = j['layout']['NV']['events'][0] + event = j['layout']['NV:SDMA:0']['events'][0] self.assertEqual(event['name'], 'COPYxx') self.assertEqual(event['st'], 0) # first event self.assertEqual(event['dur'], 10) - event2 = j['layout']['NV:2']['events'][0] + event2 = j['layout']['NV:2:SDMA:0']['events'][0] self.assertEqual(event2['st'], 20) # second event, diff clock self.assertEqual(j["dur"], (event2["st"]+event2["dur"])-event["st"]) def test_graph(self): - prof = [ProfileDeviceEvent(device='NV', comp_tdiff=decimal.Decimal(-1000), copy_tdiff=decimal.Decimal(-100)), - ProfileDeviceEvent(device='NV:1', comp_tdiff=decimal.Decimal(-500), copy_tdiff=decimal.Decimal(-50)), - ProfileGraphEvent(ents=[ProfileGraphEntry(device='NV', name='E_25_4n2', st_id=0, en_id=1, is_copy=False), - ProfileGraphEntry(device='NV:1', name='NV -> NV:1', st_id=2, en_id=3, is_copy=True)], + prof = [ProfileDeviceEvent(device='NV', tdiff=decimal.Decimal(-1000)), + ProfileDeviceEvent(device='NV:1:SDMA:0', tdiff=decimal.Decimal(-50)), + ProfileGraphEvent(ents=[ProfileGraphEntry(device='NV', name='E_25_4n2', st_id=0, en_id=1), + ProfileGraphEntry(device='NV:1:SDMA:0', name='NV -> NV:1', st_id=2, en_id=3)], deps=[[], [0]], sigs=[decimal.Decimal(1000), decimal.Decimal(1002), decimal.Decimal(1004), decimal.Decimal(1008)])] @@ -409,22 +409,20 @@ class TestVizProfiler(BaseTestViz): tracks = list(j['layout']) self.assertEqual(tracks[0], 'NV') self.assertEqual(tracks[1], 'NV Graph') - self.assertEqual(tracks[2], 'NV:1') + self.assertEqual(tracks[2], 'NV:1:SDMA:0') nv_events = j['layout']['NV']['events'] self.assertEqual(nv_events[0]['name'], 'E_25_4n2') self.assertEqual(nv_events[0]['st'], 0) self.assertEqual(nv_events[0]['dur'], 2) - #self.assertEqual(j['devEvents'][6]['pid'], j['devEvents'][0]['pid']) - nv1_events = j['layout']['NV:1']['events'] - self.assertEqual(nv1_events[0]['name'], 'NV -> NV:1') - self.assertEqual(nv1_events[0]['st'], 954) - #self.assertEqual(j['devEvents'][7]['pid'], j['devEvents'][3]['pid']) + sdma_events = j['layout']['NV:1:SDMA:0']['events'] + self.assertEqual(sdma_events[0]['name'], 'NV -> NV:1') + self.assertEqual(sdma_events[0]['st'], 954) graph_events = j['layout']['NV Graph']['events'] self.assertEqual(graph_events[0]['st'], nv_events[0]['st']) - self.assertEqual(graph_events[0]['st']+graph_events[0]['dur'], nv1_events[0]['st']+nv1_events[0]['dur']) + self.assertEqual(graph_events[0]['st']+graph_events[0]['dur'], sdma_events[0]['st']+sdma_events[0]['dur']) def test_bytes_per_kernel(self): step = 10 diff --git a/test/test_profiler.py b/test/test_profiler.py index 02d675210e..03cadb434d 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -6,6 +6,9 @@ from tinygrad.runtime.support.hcq import HCQCompiled from tinygrad.engine.realize import get_runner MOCKGPU = getenv("MOCKGPU") +def _dev_base(d): + p = d.split(":") + return p[0] if len(p) < 2 or not p[1].isdigit() else f"{p[0]}:{p[1]}" @contextlib.contextmanager def helper_collect_profile(*devs): @@ -55,7 +58,7 @@ class TestProfiler(unittest.TestCase): kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent)] assert len(kernel_runs) == 1, "one kernel run is expected" assert kernel_runs[0].name == runner_name, "kernel name is not correct" - assert not kernel_runs[0].is_copy, "kernel should not be copy" + assert _dev_base(kernel_runs[0].device) == kernel_runs[0].device, "kernel should not be on a sub-device" def test_profile_copyin(self): buf1 = Buffer(Device.DEFAULT, 2, dtypes.float, options=BufferSpec(nolru=True)).ensure_allocated() @@ -63,10 +66,8 @@ class TestProfiler(unittest.TestCase): with helper_collect_profile(TestProfiler.d0) as profile: buf1.copyin(memoryview(bytearray(struct.pack("ff", 0, 1)))) - profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device) - kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent)] + kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent) and x.device.startswith(TestProfiler.d0.device)] assert len(kernel_runs) == 1, "one kernel run is expected" - assert kernel_runs[0].is_copy, "kernel should be copy" def test_profile_multiops(self): runner_name = TestProfiler.runner._prg.name @@ -77,16 +78,12 @@ class TestProfiler(unittest.TestCase): TestProfiler.runner([buf1, TestProfiler.a.uop.buffer], var_vals={}) buf1.copyout(memoryview(bytearray(buf1.nbytes))) - profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device) - evs = [x for x in profile if isinstance(x, ProfileRangeEvent)] + evs = [x for x in profile if isinstance(x, ProfileRangeEvent) and x.device.startswith(TestProfiler.d0.device)] assert len(evs) == 3, "3 kernel runs are expected" # NOTE: order of events does not matter, the tool is responsible for sorting them - copy_events = [e for e in evs if e.is_copy] - self.assertEqual(len(copy_events), 2) - - prg_events = [e for e in evs if not e.is_copy] - assert prg_events[0].name == runner_name, "kernel name is not correct" + prg_events = [e for e in evs if e.device == TestProfiler.d0.device] + assert any(e.name == runner_name for e in prg_events), "kernel name is not correct" #for i in range(1, 3): # assert evs[i].st > evs[i-1].en, "timestamp not aranged" @@ -102,13 +99,9 @@ class TestProfiler(unittest.TestCase): buf1.copyin(memoryview(bytearray(struct.pack("ff", 0, 1)))) buf2.copyin(memoryview(bytearray(struct.pack("ff", 0, 1)))) - profile0, _ = helper_profile_filter_device(profile, TestProfiler.d0.device) - profile1, _ = helper_profile_filter_device(profile, d1.device) - - for p in [profile0, profile1]: - evs = [x for x in p if isinstance(x, ProfileRangeEvent)] + for dev in [TestProfiler.d0.device, d1.device]: + evs = [x for x in profile if isinstance(x, ProfileRangeEvent) and _dev_base(x.device) == dev] assert len(evs) == 1, "one kernel runs are expected" - assert evs[0].is_copy, "kernel should be copy" def test_profile_multidev_transfer(self): try: d1 = Device[f"{Device.DEFAULT}:1"] @@ -118,10 +111,8 @@ class TestProfiler(unittest.TestCase): with helper_collect_profile(TestProfiler.d0, d1) as profile: buf1.to(f"{Device.DEFAULT}:1").realize() - profile0, _ = helper_profile_filter_device(profile, TestProfiler.d0.device) - kernel_runs = [x for x in profile0 if isinstance(x, ProfileRangeEvent)] + kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent) and x.device.startswith(TestProfiler.d0.device)] assert len(kernel_runs) == 1, "one kernel run is expected" - assert kernel_runs[0].is_copy, "kernel should be copy" @unittest.skipIf(Device.DEFAULT in "METAL" or (MOCKGPU and Device.DEFAULT == "AMD"), "AMD mockgpu does not support queue wait interrupts") def test_profile_graph(self): @@ -167,17 +158,18 @@ class TestProfiler(unittest.TestCase): return d2.timeline_signal.timestamp - d1.timeline_signal.timestamp # then test it by timing the GPU to GPU times + dev_evs = {x.device:x for x in Compiled.profile_events if isinstance(x, ProfileDeviceEvent)} jitter_matrix = [[float('nan')] * len(devs) for _ in range(len(devs))] pairs = [(p1, p2) for p1 in enumerate(devs) for p2 in enumerate(devs) if p1 != p2] for (i1, d1), (i2, d2) in pairs: - cpu_diff = d1.gpu2cpu_compute_time_diff - d2.gpu2cpu_compute_time_diff + cpu_diff = dev_evs[d1.device].tdiff - dev_evs[d2.device].tdiff jitter_matrix[i1][i2] = statistics.median(_sync_d2d(d1, d2) - _sync_d2d(d2, d1) for _ in range(20)) / 2 - cpu_diff + print("pairwise clock jitter matrix (us):\n" + '\n'.join([''.join([f'{float(item):8.3f}' for item in row]) for row in jitter_matrix])) + for (i1, d1), (i2, d2) in pairs: assert abs(jitter_matrix[i1][i2]) < 0.5, "jitter should be less than 0.5us" - print("pairwise clock jitter matrix (us):\n" + '\n'.join([''.join([f'{float(item):8.3f}' for item in row]) for row in jitter_matrix])) - @unittest.skip("this test is flaky") def test_cpu_profile(self): def test_fxn(err=False): @@ -228,7 +220,7 @@ class TestProfiler(unittest.TestCase): Tensor.realize(a, b) profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device) exec_points = [e for e in profile if isinstance(e, ProfilePointEvent) and e.name == "exec"] - range_events = [e for e in profile if isinstance(e, ProfileRangeEvent) and not e.is_copy] + range_events = [e for e in profile if isinstance(e, ProfileRangeEvent) and _dev_base(e.device) == e.device] self.assertEqual(len(exec_points), len(range_events), 2) self.assertEqual(len(dedup(e.arg['name'] for e in exec_points)), 1) self.assertEqual(len(dedup(e.arg['metadata'] for e in exec_points)), 1) diff --git a/tinygrad/device.py b/tinygrad/device.py index 09577091aa..dc39aa2e12 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -56,14 +56,13 @@ atexit.register(lambda: [Device[dn].finalize() for dn in Device._opened_devices] # **************** Profile **************** @dataclass(frozen=True) -class ProfileDeviceEvent(ProfileEvent): - device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); props:dict[str,Any]|None=None # noqa: E702 +class ProfileDeviceEvent(ProfileEvent): device:str; tdiff:decimal.Decimal=decimal.Decimal(0); props:dict[str,Any]|None=None # noqa: E702 @dataclass(frozen=True) class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None; tag:int|None=None # noqa: E702 @dataclass(frozen=True) -class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int; is_copy:bool # noqa: E702 +class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int # noqa: E702 @dataclass(frozen=True) class ProfileGraphEvent(ProfileEvent): ents:list[ProfileGraphEntry]; deps:list[list[int]]; sigs:list[decimal.Decimal] # noqa: E702 diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 682f25e6f2..8242cf581f 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -287,8 +287,7 @@ class TracingKey: class ProfileEvent: pass @dataclass -class ProfileRangeEvent(ProfileEvent): - device:str; name:str|TracingKey; st:decimal.Decimal; en:decimal.Decimal|None=None; is_copy:bool=False # noqa: E702 +class ProfileRangeEvent(ProfileEvent): device:str; name:str|TracingKey; st:decimal.Decimal; en:decimal.Decimal|None=None # noqa: E702 @dataclass(frozen=True) class ProfilePointEvent(ProfileEvent): @@ -296,8 +295,8 @@ class ProfilePointEvent(ProfileEvent): cpu_events:list[ProfileEvent] = [] @contextlib.contextmanager -def cpu_profile(name:str|TracingKey, device="TINY", is_copy=False, display=True) -> Generator[ProfileRangeEvent, None, None]: - res = ProfileRangeEvent(device, name, perf_counter_us(), is_copy=is_copy) +def cpu_profile(name:str|TracingKey, device="TINY", display=True) -> Generator[ProfileRangeEvent, None, None]: + res = ProfileRangeEvent(device, name, perf_counter_us()) try: yield res finally: res.en = perf_counter_us() diff --git a/tinygrad/runtime/graph/hcq.py b/tinygrad/runtime/graph/hcq.py index 710e1a43bb..718e55f67b 100644 --- a/tinygrad/runtime/graph/hcq.py +++ b/tinygrad/runtime/graph/hcq.py @@ -131,7 +131,8 @@ class HCQGraph(MultiGraphRunner): # Description based on the command. prof_ji_desc = ji.prg._prg.name if is_exec_prg else f"{ji.bufs[1].device} -> {ji.bufs[0].device}" # type: ignore - self.prof_graph_entries.append(ProfileGraphEntry(enqueue_dev.device, prof_ji_desc, sig_st, j * 2 + 1, is_copy=not is_exec_prg)) + prof_name = f"{enqueue_dev.device}:SDMA:{queue_idx}" if not is_exec_prg else enqueue_dev.device + self.prof_graph_entries.append(ProfileGraphEntry(prof_name, prof_ji_desc, sig_st, j * 2 + 1)) self.prof_graph_deps.append([d - 1 for _, d in rdeps]) last_j[enqueue_queue] = j diff --git a/tinygrad/runtime/graph/metal.py b/tinygrad/runtime/graph/metal.py index c1931295a6..9105942228 100644 --- a/tinygrad/runtime/graph/metal.py +++ b/tinygrad/runtime/graph/metal.py @@ -101,7 +101,7 @@ class MetalGraph(GraphRunner): def collect_timestamps(self): # create a graph event and evenly space each program st, en = decimal.Decimal(self.command_buffer.GPUStartTime()) * 1000000, decimal.Decimal(self.command_buffer.GPUEndTime()) * 1000000 - ents = [ProfileGraphEntry(self.device, cast(CompiledRunner, ji.prg)._prg.name, i, i+1, is_copy=False) for i,ji in enumerate(self.jit_cache)] + ents = [ProfileGraphEntry(self.device, cast(CompiledRunner, ji.prg)._prg.name, i, i+1) for i,ji in enumerate(self.jit_cache)] step = (en-st)/len(ents) self.dev.profile_events += [ProfileGraphEvent(ents, [], [st+step*i for i in range(len(ents)+1)])] diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 0d8b58fe9d..8cb09b5cfc 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -8,7 +8,7 @@ from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator, hcq_filte from tinygrad.uop.ops import sint from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerSet, CompilerPair from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, lo32, hi32, colored, prod, ContextVar -from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, ceildiv +from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, ceildiv, unwrap from tinygrad.renderer.cstyle import AMDHIPRenderer, AMDHIPCCRenderer from tinygrad.renderer.llvmir import AMDLLVMRenderer from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt, amdgpu_kd, amdgpu_drm @@ -955,6 +955,7 @@ class AMDDevice(HCQCompiled): ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size) self.max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000 + self.sdma_queues:dict = {} self.has_sdma_queue = self.sdma_queue(0) is not None compilers = CompilerSet([CompilerPair(functools.partial(AMDHIPRenderer, self.arch), None), @@ -1018,11 +1019,12 @@ class AMDDevice(HCQCompiled): wptr=getattr(hsa.amd_queue_t, 'write_dispatch_id').offset, eop_buffer=eop_buffer, cwsr_buffer=cwsr_buffer, ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size, idx=idx)) - @functools.lru_cache(None) def sdma_queue(self, idx:int): if getenv("AMD_DISABLE_SDMA"): return None - with contextlib.suppress(OSError): return self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20), idx=idx) - return None + if idx in self.sdma_queues: return self.sdma_queues[idx] + with contextlib.suppress(OSError): + self.sdma_queues[idx] = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20), idx=idx) + return self.sdma_queues.get(idx, None) def _ensure_has_local_memory(self, private_segment_size): if self.max_private_segment_size >= private_segment_size: return @@ -1063,3 +1065,5 @@ class AMDDevice(HCQCompiled): def on_device_hang(self): self.iface.on_device_hang() def device_props(self): return self.iface.props + + def hw_copy_queues(self): return [(f"SDMA:{i}", functools.partial(unwrap(self.hw_copy_queue_t), queue_idx=i)) for i in self.sdma_queues] diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index b956890bcc..d0621f15b0 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -51,7 +51,7 @@ class MetalDevice(Compiled): st, en = decimal.Decimal(cbuf.GPUStartTime()) * 1000000, decimal.Decimal(cbuf.GPUEndTime()) * 1000000 # NOTE: command buffers from MetalGraph are not profiled here if PROFILE and (lb:=cmdbuf_label(cbuf)) is not None and not lb.startswith("batched"): - Compiled.profile_events += [ProfileRangeEvent(self.device, lb, st, en, is_copy=lb.startswith("COPY"))] + Compiled.profile_events += [ProfileRangeEvent(self.device, lb, st, en)] self.mtl_buffers_in_flight.clear() def metal_src_to_library(device:MetalDevice, src:str) -> metal.MTLLibrary: @@ -191,7 +191,7 @@ class MetalAllocator(LRUAllocator[MetalDevice]): # There is no real metal multidevice support for now, so transfer is used only for tests. src_dev.synchronize() def _cp_mv(self, dst, src, prof_desc): - with cpu_profile(prof_desc, self.dev.device, is_copy=True): dst[:] = src + with cpu_profile(prof_desc, self.dev.device): dst[:] = src def _as_buffer(self, src:MetalBuffer) -> memoryview: self.dev.synchronize() return to_mv(src.buf.contents(), src.size + src.offset)[src.offset:] diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 10e02a0ca2..bd40a8604b 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -333,7 +333,7 @@ class NVAllocator(HCQAllocator['NVDevice']): self.dev._ensure_has_vid_hw(w, h) q = NVVideoQueue().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) - with hcq_profile(self.dev, queue=q, desc="NVDEC", enabled=PROFILE): + with hcq_profile(self.dev, queue=q, desc="HEVC Decode", enabled=PROFILE, dev_suff="NVDEC"): q.decode_hevc_chunk(desc_buf, bufin, bufout, frame_pos, hist, [(frame_pos-x) % (len(hist) + 1) for x in range(len(hist), 0, -1)], round_up(w, 64)*round_up(h, 64), self.dev.vid_coloc_buf, self.dev.vid_filter_buf, self.dev.intra_top_off, self.dev.intra_unk_off, self.dev.vid_stat_buf) diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index 5f4d5c1a3a..4c8c3adfec 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -331,7 +331,7 @@ class QCOMAllocator(HCQAllocatorBase): return self.dev._gpu_map(opts.external_ptr, size, image=opts.image) if opts.external_ptr else self.dev._gpu_alloc(size, image=opts.image) def _do_copy(self, src_addr, dest_addr, src_size, real_size, src_stride, dest_stride, prof_text, dest_off=0, src_off=0): - with cpu_profile(prof_text, self.dev.device, is_copy=True): + with cpu_profile(prof_text, self.dev.device): while src_off < src_size: ctypes.memmove(dest_addr+dest_off, src_addr+src_off, real_size) src_off, dest_off = src_off+src_stride, dest_off+dest_stride diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 6351a7de7f..d2b67a66f5 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -265,7 +265,7 @@ class HCQSignal(Generic[HCQDeviceType]): if not_passed and self.value < value: raise RuntimeError(f"Wait timeout: {timeout} ms! (the signal is not set to {value}, but {self.value})") @contextlib.contextmanager -def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]|None=None, queue:HWQueue|None=None): +def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]|None=None, queue:HWQueue|None=None, dev_suff:str|None=None): st, en = (dev.new_signal(), dev.new_signal()) if enabled else (None, None) assert queue is not None or queue_type is not None, "Either queue or queue_type must be provided" @@ -279,7 +279,7 @@ def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue] elif enabled and queue_type is not None: queue_type().wait(dev.timeline_signal, dev.timeline_value - 1).timestamp(en).signal(dev.timeline_signal, dev.next_timeline()).submit(dev) - if enabled and PROFILE: dev.sig_prof_records.append((unwrap(st), unwrap(en), desc, (queue_type or type(queue)) is dev.hw_copy_queue_t)) + if enabled and PROFILE: dev.sig_prof_records.append((unwrap(st), unwrap(en), desc, f"{dev.device}:{dev_suff}" if dev_suff else dev.device)) class HCQArgsState(Generic[ProgramType]): def __init__(self, buf:HCQBuffer, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint|None, ...]=()): @@ -376,7 +376,7 @@ class HCQCompiled(Compiled, Generic[SignalType]): self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t self.timeline_value:int = 1 self.timeline_signal, self._shadow_timeline_signal = self.new_signal(value=0, is_timeline=True), self.new_signal(value=0, is_timeline=True) - self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = [] + self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, str]] = [] self.prof_exec_counter:int = 0 self.prof_prg_counter:int = 0 @@ -402,7 +402,7 @@ class HCQCompiled(Compiled, Generic[SignalType]): if self.timeline_value > (1 << 31): self._wrap_timeline_signal() if PROFILE: - Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp) for st,en,name,cp in self.sig_prof_records] + Compiled.profile_events += [ProfileRangeEvent(dev, name, st.timestamp, en.timestamp) for st,en,name,dev in self.sig_prof_records] self.sig_prof_records = [] def next_timeline(self): @@ -418,6 +418,10 @@ class HCQCompiled(Compiled, Generic[SignalType]): def device_props(self) -> dict[str,Any]: return {} # to be overridden if needed. dict keys are backend dependent. + def hw_compute_queues(self) -> list[tuple[str|None, Callable[[], HWQueue]]]: return [(None, self.hw_compute_queue_t)] + def hw_copy_queues(self) -> list[tuple[str, Callable[[], HWQueue]]]: + return [("SDMA:0", self.hw_copy_queue_t)] if self.hw_copy_queue_t is not None else [] + def _at_profile_finalize(self): self.synchronize() # Expect device to be synchronizes @@ -428,10 +432,9 @@ class HCQCompiled(Compiled, Generic[SignalType]): et = time.perf_counter_ns() return (decimal.Decimal(et+st) / 2000) - d.timeline_signal.timestamp - self.gpu2cpu_compute_time_diff = statistics.median([_sync(self, self.hw_compute_queue_t) for _ in range(40)]) - if self.hw_copy_queue_t is None: gpu2cpu_copy_time_diff = decimal.Decimal(0) - else: gpu2cpu_copy_time_diff = statistics.median([_sync(self, self.hw_copy_queue_t) for _ in range(40)]) - Compiled.profile_events += [ProfileDeviceEvent(self.device, self.gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff, props=self.device_props())] + for prefix, q_t in self.hw_compute_queues() + self.hw_copy_queues(): + devname = f"{self.device}:{prefix}" if prefix else self.device + Compiled.profile_events += [ProfileDeviceEvent(devname, statistics.median([_sync(self, q_t) for _ in range(40)]), props=self.device_props())] def _wrap_timeline_signal(self): self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1 @@ -514,10 +517,10 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]): def _copyin(self, dest:HCQBuffer, src:memoryview): if self.dev.hw_copy_queue_t is None: self.dev.synchronize() - with cpu_profile(f'TINY -> {self.dev.device}', self.dev.device, is_copy=True): ctypes.memmove(int(dest.va_addr), from_mv(src), len(src)) + with cpu_profile(f'TINY -> {self.dev.device}', self.dev.device): ctypes.memmove(int(dest.va_addr), from_mv(src), len(src)) return - with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"TINY -> {self.dev.device}", enabled=PROFILE): + with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"TINY -> {self.dev.device}", enabled=PROFILE, dev_suff="SDMA:0"): for i in range(0, src.nbytes, self.b[0].size): self.b_next = (self.b_next + 1) % len(self.b) self.dev.timeline_signal.wait(self.b_timeline[self.b_next]) @@ -538,7 +541,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]): return None assert self.dev.hw_copy_queue_t is not None - with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"DISK -> {self.dev.device}", enabled=PROFILE): + with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"DISK -> {self.dev.device}", enabled=PROFILE, dev_suff="SDMA:0"): for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size): self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \ .copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \ @@ -548,10 +551,10 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]): def _copyout(self, dest:memoryview, src:HCQBuffer): self.dev.synchronize() if self.dev.hw_copy_queue_t is None: - with cpu_profile(f'{self.dev.device} -> TINY', self.dev.device, is_copy=True): ctypes.memmove(from_mv(dest), int(src.va_addr), len(dest)) + with cpu_profile(f'{self.dev.device} -> TINY', self.dev.device): ctypes.memmove(from_mv(dest), int(src.va_addr), len(dest)) return - with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> TINY", enabled=PROFILE): + with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> TINY", enabled=PROFILE, dev_suff="SDMA:0"): for i in range(0, dest.nbytes, cp_size:=(self.max_copyout_size or self.b[0].size)): self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \ .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(cp_size, dest.nbytes-i)) \ @@ -563,7 +566,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]): cast(HCQAllocator, src_dev.allocator).map(dest) assert src_dev.hw_copy_queue_t is not None - with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=f"{src_dev.device} -> {dest_dev.device}", enabled=PROFILE): + with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=f"{src_dev.device} -> {dest_dev.device}", enabled=PROFILE, dev_suff="SDMA:0"): src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \ .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \ .copy(dest.va_addr, src.va_addr, sz) \ diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py index fc3a5a24f1..90893a2bdf 100755 --- a/tinygrad/viz/serve.py +++ b/tinygrad/viz/serve.py @@ -168,19 +168,19 @@ def option(s:int|None) -> int: return 0 if s is None else s+1 # Profiler API -device_ts_diffs:dict[str, tuple[Decimal, Decimal]] = {} -def cpu_ts_diff(device:str, thread=0) -> Decimal: return device_ts_diffs.get(device, (Decimal(0),))[thread] +device_ts_diffs:dict[str, Decimal] = {} +def cpu_ts_diff(device:str) -> Decimal: return device_ts_diffs.get(device, Decimal(0)) amdgpu_targets:dict[str, int] = {} DevEvent = ProfileRangeEvent|ProfileGraphEntry|ProfilePointEvent def flatten_events(profile:list[ProfileEvent]) -> Generator[tuple[Decimal, Decimal, DevEvent], None, None]: for e in profile: - if isinstance(e, ProfileRangeEvent): yield (e.st+(diff:=cpu_ts_diff(e.device, e.is_copy)), (e.en if e.en is not None else e.st)+diff, e) + if isinstance(e, ProfileRangeEvent): yield (e.st+(diff:=cpu_ts_diff(e.device)), (e.en if e.en is not None else e.st)+diff, e) elif isinstance(e, ProfilePointEvent): yield (e.ts, e.ts, e) elif isinstance(e, ProfileGraphEvent): cpu_ts = [] - for ent in e.ents: cpu_ts += [e.sigs[ent.st_id]+(diff:=cpu_ts_diff(ent.device, ent.is_copy)), e.sigs[ent.en_id]+diff] + for ent in e.ents: cpu_ts += [e.sigs[ent.st_id]+(diff:=cpu_ts_diff(ent.device)), e.sigs[ent.en_id]+diff] yield (st:=min(cpu_ts)), (et:=max(cpu_ts)), ProfileRangeEvent(f"{e.ents[0].device.split(':')[0]} Graph", f"batched {len(e.ents)}", st, et) for i,ent in enumerate(e.ents): yield (cpu_ts[i*2], cpu_ts[i*2+1], ent) @@ -384,7 +384,7 @@ def get_profile(profile:list[ProfileEvent], sort_fn:Callable[[str], Any]=device_ device_decoders:dict[str, Callable[[list[ProfileEvent]], None]] = {} for ev in profile: if isinstance(ev, ProfileDeviceEvent): - device_ts_diffs[ev.device] = (ev.comp_tdiff,ev.copy_tdiff if ev.copy_tdiff is not None else ev.comp_tdiff) + device_ts_diffs[ev.device] = ev.tdiff if (d:=ev.device.split(":")[0]) == "AMD": device_decoders[d] = load_counters amdgpu_targets[d] = unwrap(ev.props)["gfx_target_version"]