diff devices for sdma (#14589)

* start

* x

* fix

* sdma

* c

* clean

* x

* hm

* cleaner
This commit is contained in:
nimlgen
2026-02-06 16:39:12 +03:00
committed by GitHub
parent 7cb996e153
commit fbeb978170
14 changed files with 86 additions and 90 deletions

View File

@@ -2,27 +2,26 @@ import sys, pickle, decimal, json
from tinygrad.device import ProfileDeviceEvent, ProfileGraphEvent
from tinygrad.helpers import tqdm, temp, ProfileEvent, ProfileRangeEvent, TracingKey
devices:dict[str, tuple[decimal.Decimal, decimal.Decimal, int]] = {}
def prep_ts(device:str, ts:decimal.Decimal, is_copy): return int(decimal.Decimal(ts) + devices[device][is_copy])
def dev_to_pid(device:str, is_copy=False): return {"pid": devices[device][2], "tid": int(is_copy)}
# Maps device name -> (GPU-to-CPU timestamp diff, perfetto process id).
# Populated by dev_ev_to_perfetto_json as ProfileDeviceEvents are seen.
devices:dict[str, tuple[decimal.Decimal, int]] = {}
# Convert a device-local timestamp to the unified CPU timeline by adding the device's stored tdiff.
def prep_ts(device:str, ts:decimal.Decimal): return int(decimal.Decimal(ts) + devices[device][0])
# Perfetto pid/tid for a device: each device gets its own pid, with a single thread (tid 0).
def dev_to_pid(device:str): return {"pid": devices[device][1], "tid": 0}
def dev_ev_to_perfetto_json(ev:ProfileDeviceEvent):
devices[ev.device] = (ev.comp_tdiff, ev.copy_tdiff if ev.copy_tdiff is not None else ev.comp_tdiff, len(devices))
devices[ev.device] = (ev.tdiff, len(devices))
return [{"name": "process_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "args": {"name": ev.device}},
{"name": "thread_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "tid": 0, "args": {"name": "COMPUTE"}},
{"name": "thread_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "tid": 1, "args": {"name": "COPY"}}]
{"name": "thread_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "tid": 0, "args": {"name": ev.device}}]
def range_ev_to_perfetto_json(ev:ProfileRangeEvent):
name = ev.name.display_name if isinstance(ev.name, TracingKey) else ev.name
return [{"name": name, "ph": "X", "ts": prep_ts(ev.device, ev.st, ev.is_copy), "dur": float(ev.en-ev.st), **dev_to_pid(ev.device, ev.is_copy)}]
return [{"name": name, "ph": "X", "ts": prep_ts(ev.device, ev.st), "dur": float(ev.en-ev.st), **dev_to_pid(ev.device)}]
def graph_ev_to_perfetto_json(ev:ProfileGraphEvent, reccnt):
ret = []
for i,e in enumerate(ev.ents):
st, en = ev.sigs[e.st_id], ev.sigs[e.en_id]
name = e.name.display_name if isinstance(e.name, TracingKey) else e.name
ret += [{"name": name, "ph": "X", "ts": prep_ts(e.device, st, e.is_copy), "dur": float(en-st), **dev_to_pid(e.device, e.is_copy)}]
ret += [{"name": name, "ph": "X", "ts": prep_ts(e.device, st), "dur": float(en-st), **dev_to_pid(e.device)}]
for dep in ev.deps[i]:
d = ev.ents[dep]
ret += [{"ph": "s", **dev_to_pid(d.device, d.is_copy), "id": reccnt+len(ret), "ts": prep_ts(d.device, ev.sigs[d.en_id], d.is_copy), "bp": "e"}]
ret += [{"ph": "f", **dev_to_pid(e.device, e.is_copy), "id": reccnt+len(ret)-1, "ts": prep_ts(e.device, st, e.is_copy), "bp": "e"}]
ret += [{"ph": "s", **dev_to_pid(d.device), "id": reccnt+len(ret), "ts": prep_ts(d.device, ev.sigs[d.en_id]), "bp": "e"}]
ret += [{"ph": "f", **dev_to_pid(e.device), "id": reccnt+len(ret)-1, "ts": prep_ts(e.device, st), "bp": "e"}]
return ret
def to_perfetto(profile:list[ProfileEvent]):
# Start json with devices.

View File

@@ -145,7 +145,8 @@ class RGP:
@staticmethod
def from_profile(profile_pickled, device:str|None=None):
profile: list[ProfileEvent] = pickle.loads(profile_pickled)
device_events = {x.device:x for x in profile if isinstance(x, ProfileDeviceEvent) and x.device.startswith('AMD')}
def _is_base_dev(d): return all(p.isdigit() for p in d.split(":")[1:])
device_events = {x.device:x for x in profile if isinstance(x, ProfileDeviceEvent) and x.device.startswith('AMD') and _is_base_dev(x.device)}
if device is None:
if len(device_events) == 0: raise RuntimeError('No supported devices found in profile')
if len(device_events) > 1: raise RuntimeError(f"More than one supported device found, select which one to export: {', '.join(device_events.keys())}")

View File

@@ -365,8 +365,8 @@ def load_profile(lst:list[ProfileEvent]) -> dict:
class TestVizProfiler(BaseTestViz):
def test_node(self):
prof = [ProfileRangeEvent(device='NV', name='E_2', st=decimal.Decimal(1000), en=decimal.Decimal(1010), is_copy=False),
ProfileDeviceEvent(device='NV', comp_tdiff=decimal.Decimal(-1000), copy_tdiff=decimal.Decimal(-100))]
prof = [ProfileRangeEvent(device='NV', name='E_2', st=decimal.Decimal(1000), en=decimal.Decimal(1010)),
ProfileDeviceEvent(device='NV', tdiff=decimal.Decimal(-1000))]
j = load_profile(prof)
@@ -379,28 +379,28 @@ class TestVizProfiler(BaseTestViz):
assert event['ref'] is None
def test_copy_node(self):
prof = [ProfileRangeEvent(device='NV', name='COPYxx', st=decimal.Decimal(1000), en=decimal.Decimal(1010), is_copy=True),
ProfileRangeEvent(device='NV:2', name='COPYxx', st=decimal.Decimal(1000), en=decimal.Decimal(1010), is_copy=True),
ProfileDeviceEvent(device='NV', comp_tdiff=decimal.Decimal(-1000), copy_tdiff=decimal.Decimal(-100)),
ProfileDeviceEvent(device='NV:2', comp_tdiff=decimal.Decimal(-800), copy_tdiff=decimal.Decimal(-80))]
prof = [ProfileRangeEvent(device='NV:SDMA:0', name='COPYxx', st=decimal.Decimal(1000), en=decimal.Decimal(1010)),
ProfileRangeEvent(device='NV:2:SDMA:0', name='COPYxx', st=decimal.Decimal(1000), en=decimal.Decimal(1010)),
ProfileDeviceEvent(device='NV:SDMA:0', tdiff=decimal.Decimal(-100)),
ProfileDeviceEvent(device='NV:2:SDMA:0', tdiff=decimal.Decimal(-80))]
j = load_profile(prof)
event = j['layout']['NV']['events'][0]
event = j['layout']['NV:SDMA:0']['events'][0]
self.assertEqual(event['name'], 'COPYxx')
self.assertEqual(event['st'], 0) # first event
self.assertEqual(event['dur'], 10)
event2 = j['layout']['NV:2']['events'][0]
event2 = j['layout']['NV:2:SDMA:0']['events'][0]
self.assertEqual(event2['st'], 20) # second event, diff clock
self.assertEqual(j["dur"], (event2["st"]+event2["dur"])-event["st"])
def test_graph(self):
prof = [ProfileDeviceEvent(device='NV', comp_tdiff=decimal.Decimal(-1000), copy_tdiff=decimal.Decimal(-100)),
ProfileDeviceEvent(device='NV:1', comp_tdiff=decimal.Decimal(-500), copy_tdiff=decimal.Decimal(-50)),
ProfileGraphEvent(ents=[ProfileGraphEntry(device='NV', name='E_25_4n2', st_id=0, en_id=1, is_copy=False),
ProfileGraphEntry(device='NV:1', name='NV -> NV:1', st_id=2, en_id=3, is_copy=True)],
prof = [ProfileDeviceEvent(device='NV', tdiff=decimal.Decimal(-1000)),
ProfileDeviceEvent(device='NV:1:SDMA:0', tdiff=decimal.Decimal(-50)),
ProfileGraphEvent(ents=[ProfileGraphEntry(device='NV', name='E_25_4n2', st_id=0, en_id=1),
ProfileGraphEntry(device='NV:1:SDMA:0', name='NV -> NV:1', st_id=2, en_id=3)],
deps=[[], [0]],
sigs=[decimal.Decimal(1000), decimal.Decimal(1002), decimal.Decimal(1004), decimal.Decimal(1008)])]
@@ -409,22 +409,20 @@ class TestVizProfiler(BaseTestViz):
tracks = list(j['layout'])
self.assertEqual(tracks[0], 'NV')
self.assertEqual(tracks[1], 'NV Graph')
self.assertEqual(tracks[2], 'NV:1')
self.assertEqual(tracks[2], 'NV:1:SDMA:0')
nv_events = j['layout']['NV']['events']
self.assertEqual(nv_events[0]['name'], 'E_25_4n2')
self.assertEqual(nv_events[0]['st'], 0)
self.assertEqual(nv_events[0]['dur'], 2)
#self.assertEqual(j['devEvents'][6]['pid'], j['devEvents'][0]['pid'])
nv1_events = j['layout']['NV:1']['events']
self.assertEqual(nv1_events[0]['name'], 'NV -> NV:1')
self.assertEqual(nv1_events[0]['st'], 954)
#self.assertEqual(j['devEvents'][7]['pid'], j['devEvents'][3]['pid'])
sdma_events = j['layout']['NV:1:SDMA:0']['events']
self.assertEqual(sdma_events[0]['name'], 'NV -> NV:1')
self.assertEqual(sdma_events[0]['st'], 954)
graph_events = j['layout']['NV Graph']['events']
self.assertEqual(graph_events[0]['st'], nv_events[0]['st'])
self.assertEqual(graph_events[0]['st']+graph_events[0]['dur'], nv1_events[0]['st']+nv1_events[0]['dur'])
self.assertEqual(graph_events[0]['st']+graph_events[0]['dur'], sdma_events[0]['st']+sdma_events[0]['dur'])
def test_bytes_per_kernel(self):
step = 10

View File

@@ -6,6 +6,9 @@ from tinygrad.runtime.support.hcq import HCQCompiled
from tinygrad.engine.realize import get_runner
MOCKGPU = getenv("MOCKGPU")
def _dev_base(d):
p = d.split(":")
return p[0] if len(p) < 2 or not p[1].isdigit() else f"{p[0]}:{p[1]}"
@contextlib.contextmanager
def helper_collect_profile(*devs):
@@ -55,7 +58,7 @@ class TestProfiler(unittest.TestCase):
kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent)]
assert len(kernel_runs) == 1, "one kernel run is expected"
assert kernel_runs[0].name == runner_name, "kernel name is not correct"
assert not kernel_runs[0].is_copy, "kernel should not be copy"
assert _dev_base(kernel_runs[0].device) == kernel_runs[0].device, "kernel should not be on a sub-device"
def test_profile_copyin(self):
buf1 = Buffer(Device.DEFAULT, 2, dtypes.float, options=BufferSpec(nolru=True)).ensure_allocated()
@@ -63,10 +66,8 @@ class TestProfiler(unittest.TestCase):
with helper_collect_profile(TestProfiler.d0) as profile:
buf1.copyin(memoryview(bytearray(struct.pack("ff", 0, 1))))
profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device)
kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent)]
kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent) and x.device.startswith(TestProfiler.d0.device)]
assert len(kernel_runs) == 1, "one kernel run is expected"
assert kernel_runs[0].is_copy, "kernel should be copy"
def test_profile_multiops(self):
runner_name = TestProfiler.runner._prg.name
@@ -77,16 +78,12 @@ class TestProfiler(unittest.TestCase):
TestProfiler.runner([buf1, TestProfiler.a.uop.buffer], var_vals={})
buf1.copyout(memoryview(bytearray(buf1.nbytes)))
profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device)
evs = [x for x in profile if isinstance(x, ProfileRangeEvent)]
evs = [x for x in profile if isinstance(x, ProfileRangeEvent) and x.device.startswith(TestProfiler.d0.device)]
assert len(evs) == 3, "3 kernel runs are expected"
# NOTE: order of events does not matter, the tool is responsible for sorting them
copy_events = [e for e in evs if e.is_copy]
self.assertEqual(len(copy_events), 2)
prg_events = [e for e in evs if not e.is_copy]
assert prg_events[0].name == runner_name, "kernel name is not correct"
prg_events = [e for e in evs if e.device == TestProfiler.d0.device]
assert any(e.name == runner_name for e in prg_events), "kernel name is not correct"
#for i in range(1, 3):
# assert evs[i].st > evs[i-1].en, "timestamp not aranged"
@@ -102,13 +99,9 @@ class TestProfiler(unittest.TestCase):
buf1.copyin(memoryview(bytearray(struct.pack("ff", 0, 1))))
buf2.copyin(memoryview(bytearray(struct.pack("ff", 0, 1))))
profile0, _ = helper_profile_filter_device(profile, TestProfiler.d0.device)
profile1, _ = helper_profile_filter_device(profile, d1.device)
for p in [profile0, profile1]:
evs = [x for x in p if isinstance(x, ProfileRangeEvent)]
for dev in [TestProfiler.d0.device, d1.device]:
evs = [x for x in profile if isinstance(x, ProfileRangeEvent) and _dev_base(x.device) == dev]
assert len(evs) == 1, "one kernel runs are expected"
assert evs[0].is_copy, "kernel should be copy"
def test_profile_multidev_transfer(self):
try: d1 = Device[f"{Device.DEFAULT}:1"]
@@ -118,10 +111,8 @@ class TestProfiler(unittest.TestCase):
with helper_collect_profile(TestProfiler.d0, d1) as profile:
buf1.to(f"{Device.DEFAULT}:1").realize()
profile0, _ = helper_profile_filter_device(profile, TestProfiler.d0.device)
kernel_runs = [x for x in profile0 if isinstance(x, ProfileRangeEvent)]
kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent) and x.device.startswith(TestProfiler.d0.device)]
assert len(kernel_runs) == 1, "one kernel run is expected"
assert kernel_runs[0].is_copy, "kernel should be copy"
@unittest.skipIf(Device.DEFAULT in "METAL" or (MOCKGPU and Device.DEFAULT == "AMD"), "AMD mockgpu does not support queue wait interrupts")
def test_profile_graph(self):
@@ -167,17 +158,18 @@ class TestProfiler(unittest.TestCase):
return d2.timeline_signal.timestamp - d1.timeline_signal.timestamp
# then test it by timing the GPU to GPU times
dev_evs = {x.device:x for x in Compiled.profile_events if isinstance(x, ProfileDeviceEvent)}
jitter_matrix = [[float('nan')] * len(devs) for _ in range(len(devs))]
pairs = [(p1, p2) for p1 in enumerate(devs) for p2 in enumerate(devs) if p1 != p2]
for (i1, d1), (i2, d2) in pairs:
cpu_diff = d1.gpu2cpu_compute_time_diff - d2.gpu2cpu_compute_time_diff
cpu_diff = dev_evs[d1.device].tdiff - dev_evs[d2.device].tdiff
jitter_matrix[i1][i2] = statistics.median(_sync_d2d(d1, d2) - _sync_d2d(d2, d1) for _ in range(20)) / 2 - cpu_diff
print("pairwise clock jitter matrix (us):\n" + '\n'.join([''.join([f'{float(item):8.3f}' for item in row]) for row in jitter_matrix]))
for (i1, d1), (i2, d2) in pairs:
assert abs(jitter_matrix[i1][i2]) < 0.5, "jitter should be less than 0.5us"
print("pairwise clock jitter matrix (us):\n" + '\n'.join([''.join([f'{float(item):8.3f}' for item in row]) for row in jitter_matrix]))
@unittest.skip("this test is flaky")
def test_cpu_profile(self):
def test_fxn(err=False):
@@ -228,7 +220,7 @@ class TestProfiler(unittest.TestCase):
Tensor.realize(a, b)
profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device)
exec_points = [e for e in profile if isinstance(e, ProfilePointEvent) and e.name == "exec"]
range_events = [e for e in profile if isinstance(e, ProfileRangeEvent) and not e.is_copy]
range_events = [e for e in profile if isinstance(e, ProfileRangeEvent) and _dev_base(e.device) == e.device]
self.assertEqual(len(exec_points), len(range_events), 2)
self.assertEqual(len(dedup(e.arg['name'] for e in exec_points)), 1)
self.assertEqual(len(dedup(e.arg['metadata'] for e in exec_points)), 1)

View File

@@ -56,14 +56,13 @@ atexit.register(lambda: [Device[dn].finalize() for dn in Device._opened_devices]
# **************** Profile ****************
@dataclass(frozen=True)
class ProfileDeviceEvent(ProfileEvent):
device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); props:dict[str,Any]|None=None # noqa: E702
class ProfileDeviceEvent(ProfileEvent): device:str; tdiff:decimal.Decimal=decimal.Decimal(0); props:dict[str,Any]|None=None # noqa: E702
@dataclass(frozen=True)
class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None; tag:int|None=None # noqa: E702
@dataclass(frozen=True)
class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int; is_copy:bool # noqa: E702
class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int # noqa: E702
@dataclass(frozen=True)
class ProfileGraphEvent(ProfileEvent): ents:list[ProfileGraphEntry]; deps:list[list[int]]; sigs:list[decimal.Decimal] # noqa: E702

View File

@@ -287,8 +287,7 @@ class TracingKey:
class ProfileEvent: pass
@dataclass
class ProfileRangeEvent(ProfileEvent):
device:str; name:str|TracingKey; st:decimal.Decimal; en:decimal.Decimal|None=None; is_copy:bool=False # noqa: E702
class ProfileRangeEvent(ProfileEvent): device:str; name:str|TracingKey; st:decimal.Decimal; en:decimal.Decimal|None=None # noqa: E702
@dataclass(frozen=True)
class ProfilePointEvent(ProfileEvent):
@@ -296,8 +295,8 @@ class ProfilePointEvent(ProfileEvent):
cpu_events:list[ProfileEvent] = []
@contextlib.contextmanager
def cpu_profile(name:str|TracingKey, device="TINY", is_copy=False, display=True) -> Generator[ProfileRangeEvent, None, None]:
res = ProfileRangeEvent(device, name, perf_counter_us(), is_copy=is_copy)
def cpu_profile(name:str|TracingKey, device="TINY", display=True) -> Generator[ProfileRangeEvent, None, None]:
res = ProfileRangeEvent(device, name, perf_counter_us())
try: yield res
finally:
res.en = perf_counter_us()

View File

@@ -131,7 +131,8 @@ class HCQGraph(MultiGraphRunner):
# Description based on the command.
prof_ji_desc = ji.prg._prg.name if is_exec_prg else f"{ji.bufs[1].device} -> {ji.bufs[0].device}" # type: ignore
self.prof_graph_entries.append(ProfileGraphEntry(enqueue_dev.device, prof_ji_desc, sig_st, j * 2 + 1, is_copy=not is_exec_prg))
prof_name = f"{enqueue_dev.device}:SDMA:{queue_idx}" if not is_exec_prg else enqueue_dev.device
self.prof_graph_entries.append(ProfileGraphEntry(prof_name, prof_ji_desc, sig_st, j * 2 + 1))
self.prof_graph_deps.append([d - 1 for _, d in rdeps])
last_j[enqueue_queue] = j

View File

@@ -101,7 +101,7 @@ class MetalGraph(GraphRunner):
def collect_timestamps(self):
# create a graph event and evenly space each program
st, en = decimal.Decimal(self.command_buffer.GPUStartTime()) * 1000000, decimal.Decimal(self.command_buffer.GPUEndTime()) * 1000000
ents = [ProfileGraphEntry(self.device, cast(CompiledRunner, ji.prg)._prg.name, i, i+1, is_copy=False) for i,ji in enumerate(self.jit_cache)]
ents = [ProfileGraphEntry(self.device, cast(CompiledRunner, ji.prg)._prg.name, i, i+1) for i,ji in enumerate(self.jit_cache)]
step = (en-st)/len(ents)
self.dev.profile_events += [ProfileGraphEvent(ents, [], [st+step*i for i in range(len(ents)+1)])]

View File

@@ -8,7 +8,7 @@ from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator, hcq_filte
from tinygrad.uop.ops import sint
from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerSet, CompilerPair
from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, lo32, hi32, colored, prod, ContextVar
from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, ceildiv
from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, ceildiv, unwrap
from tinygrad.renderer.cstyle import AMDHIPRenderer, AMDHIPCCRenderer
from tinygrad.renderer.llvmir import AMDLLVMRenderer
from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt, amdgpu_kd, amdgpu_drm
@@ -955,6 +955,7 @@ class AMDDevice(HCQCompiled):
ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
self.max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
self.sdma_queues:dict = {}
self.has_sdma_queue = self.sdma_queue(0) is not None
compilers = CompilerSet([CompilerPair(functools.partial(AMDHIPRenderer, self.arch), None),
@@ -1018,11 +1019,12 @@ class AMDDevice(HCQCompiled):
wptr=getattr(hsa.amd_queue_t, 'write_dispatch_id').offset, eop_buffer=eop_buffer, cwsr_buffer=cwsr_buffer,
ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size, idx=idx))
@functools.lru_cache(None)
def sdma_queue(self, idx:int):
if getenv("AMD_DISABLE_SDMA"): return None
with contextlib.suppress(OSError): return self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20), idx=idx)
return None
if idx in self.sdma_queues: return self.sdma_queues[idx]
with contextlib.suppress(OSError):
self.sdma_queues[idx] = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20), idx=idx)
return self.sdma_queues.get(idx, None)
def _ensure_has_local_memory(self, private_segment_size):
if self.max_private_segment_size >= private_segment_size: return
@@ -1063,3 +1065,5 @@ class AMDDevice(HCQCompiled):
def on_device_hang(self): self.iface.on_device_hang()
def device_props(self): return self.iface.props
def hw_copy_queues(self): return [(f"SDMA:{i}", functools.partial(unwrap(self.hw_copy_queue_t), queue_idx=i)) for i in self.sdma_queues]

View File

@@ -51,7 +51,7 @@ class MetalDevice(Compiled):
st, en = decimal.Decimal(cbuf.GPUStartTime()) * 1000000, decimal.Decimal(cbuf.GPUEndTime()) * 1000000
# NOTE: command buffers from MetalGraph are not profiled here
if PROFILE and (lb:=cmdbuf_label(cbuf)) is not None and not lb.startswith("batched"):
Compiled.profile_events += [ProfileRangeEvent(self.device, lb, st, en, is_copy=lb.startswith("COPY"))]
Compiled.profile_events += [ProfileRangeEvent(self.device, lb, st, en)]
self.mtl_buffers_in_flight.clear()
def metal_src_to_library(device:MetalDevice, src:str) -> metal.MTLLibrary:
@@ -191,7 +191,7 @@ class MetalAllocator(LRUAllocator[MetalDevice]):
# There is no real metal multidevice support for now, so transfer is used only for tests.
src_dev.synchronize()
def _cp_mv(self, dst, src, prof_desc):
with cpu_profile(prof_desc, self.dev.device, is_copy=True): dst[:] = src
with cpu_profile(prof_desc, self.dev.device): dst[:] = src
def _as_buffer(self, src:MetalBuffer) -> memoryview:
self.dev.synchronize()
return to_mv(src.buf.contents(), src.size + src.offset)[src.offset:]

View File

@@ -333,7 +333,7 @@ class NVAllocator(HCQAllocator['NVDevice']):
self.dev._ensure_has_vid_hw(w, h)
q = NVVideoQueue().wait(self.dev.timeline_signal, self.dev.timeline_value - 1)
with hcq_profile(self.dev, queue=q, desc="NVDEC", enabled=PROFILE):
with hcq_profile(self.dev, queue=q, desc="HEVC Decode", enabled=PROFILE, dev_suff="NVDEC"):
q.decode_hevc_chunk(desc_buf, bufin, bufout, frame_pos, hist, [(frame_pos-x) % (len(hist) + 1) for x in range(len(hist), 0, -1)],
round_up(w, 64)*round_up(h, 64), self.dev.vid_coloc_buf, self.dev.vid_filter_buf, self.dev.intra_top_off,
self.dev.intra_unk_off, self.dev.vid_stat_buf)

View File

@@ -331,7 +331,7 @@ class QCOMAllocator(HCQAllocatorBase):
return self.dev._gpu_map(opts.external_ptr, size, image=opts.image) if opts.external_ptr else self.dev._gpu_alloc(size, image=opts.image)
def _do_copy(self, src_addr, dest_addr, src_size, real_size, src_stride, dest_stride, prof_text, dest_off=0, src_off=0):
with cpu_profile(prof_text, self.dev.device, is_copy=True):
with cpu_profile(prof_text, self.dev.device):
while src_off < src_size:
ctypes.memmove(dest_addr+dest_off, src_addr+src_off, real_size)
src_off, dest_off = src_off+src_stride, dest_off+dest_stride

View File

@@ -265,7 +265,7 @@ class HCQSignal(Generic[HCQDeviceType]):
if not_passed and self.value < value: raise RuntimeError(f"Wait timeout: {timeout} ms! (the signal is not set to {value}, but {self.value})")
@contextlib.contextmanager
def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]|None=None, queue:HWQueue|None=None):
def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]|None=None, queue:HWQueue|None=None, dev_suff:str|None=None):
st, en = (dev.new_signal(), dev.new_signal()) if enabled else (None, None)
assert queue is not None or queue_type is not None, "Either queue or queue_type must be provided"
@@ -279,7 +279,7 @@ def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]
elif enabled and queue_type is not None:
queue_type().wait(dev.timeline_signal, dev.timeline_value - 1).timestamp(en).signal(dev.timeline_signal, dev.next_timeline()).submit(dev)
if enabled and PROFILE: dev.sig_prof_records.append((unwrap(st), unwrap(en), desc, (queue_type or type(queue)) is dev.hw_copy_queue_t))
if enabled and PROFILE: dev.sig_prof_records.append((unwrap(st), unwrap(en), desc, f"{dev.device}:{dev_suff}" if dev_suff else dev.device))
class HCQArgsState(Generic[ProgramType]):
def __init__(self, buf:HCQBuffer, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint|None, ...]=()):
@@ -376,7 +376,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
self.timeline_value:int = 1
self.timeline_signal, self._shadow_timeline_signal = self.new_signal(value=0, is_timeline=True), self.new_signal(value=0, is_timeline=True)
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, str]] = []
self.prof_exec_counter:int = 0
self.prof_prg_counter:int = 0
@@ -402,7 +402,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
if PROFILE:
Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp) for st,en,name,cp in self.sig_prof_records]
Compiled.profile_events += [ProfileRangeEvent(dev, name, st.timestamp, en.timestamp) for st,en,name,dev in self.sig_prof_records]
self.sig_prof_records = []
def next_timeline(self):
@@ -418,6 +418,10 @@ class HCQCompiled(Compiled, Generic[SignalType]):
def device_props(self) -> dict[str,Any]: return {} # to be overridden if needed. dict keys are backend dependent.
def hw_compute_queues(self) -> list[tuple[str|None, Callable[[], HWQueue]]]: return [(None, self.hw_compute_queue_t)]
def hw_copy_queues(self) -> list[tuple[str, Callable[[], HWQueue]]]:
return [("SDMA:0", self.hw_copy_queue_t)] if self.hw_copy_queue_t is not None else []
def _at_profile_finalize(self):
self.synchronize() # Expect device to be synchronizes
@@ -428,10 +432,9 @@ class HCQCompiled(Compiled, Generic[SignalType]):
et = time.perf_counter_ns()
return (decimal.Decimal(et+st) / 2000) - d.timeline_signal.timestamp
self.gpu2cpu_compute_time_diff = statistics.median([_sync(self, self.hw_compute_queue_t) for _ in range(40)])
if self.hw_copy_queue_t is None: gpu2cpu_copy_time_diff = decimal.Decimal(0)
else: gpu2cpu_copy_time_diff = statistics.median([_sync(self, self.hw_copy_queue_t) for _ in range(40)])
Compiled.profile_events += [ProfileDeviceEvent(self.device, self.gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff, props=self.device_props())]
for prefix, q_t in self.hw_compute_queues() + self.hw_copy_queues():
devname = f"{self.device}:{prefix}" if prefix else self.device
Compiled.profile_events += [ProfileDeviceEvent(devname, statistics.median([_sync(self, q_t) for _ in range(40)]), props=self.device_props())]
def _wrap_timeline_signal(self):
self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1
@@ -514,10 +517,10 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
def _copyin(self, dest:HCQBuffer, src:memoryview):
if self.dev.hw_copy_queue_t is None:
self.dev.synchronize()
with cpu_profile(f'TINY -> {self.dev.device}', self.dev.device, is_copy=True): ctypes.memmove(int(dest.va_addr), from_mv(src), len(src))
with cpu_profile(f'TINY -> {self.dev.device}', self.dev.device): ctypes.memmove(int(dest.va_addr), from_mv(src), len(src))
return
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"TINY -> {self.dev.device}", enabled=PROFILE):
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"TINY -> {self.dev.device}", enabled=PROFILE, dev_suff="SDMA:0"):
for i in range(0, src.nbytes, self.b[0].size):
self.b_next = (self.b_next + 1) % len(self.b)
self.dev.timeline_signal.wait(self.b_timeline[self.b_next])
@@ -538,7 +541,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
return None
assert self.dev.hw_copy_queue_t is not None
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"DISK -> {self.dev.device}", enabled=PROFILE):
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"DISK -> {self.dev.device}", enabled=PROFILE, dev_suff="SDMA:0"):
for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size):
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
.copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \
@@ -548,10 +551,10 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
def _copyout(self, dest:memoryview, src:HCQBuffer):
self.dev.synchronize()
if self.dev.hw_copy_queue_t is None:
with cpu_profile(f'{self.dev.device} -> TINY', self.dev.device, is_copy=True): ctypes.memmove(from_mv(dest), int(src.va_addr), len(dest))
with cpu_profile(f'{self.dev.device} -> TINY', self.dev.device): ctypes.memmove(from_mv(dest), int(src.va_addr), len(dest))
return
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> TINY", enabled=PROFILE):
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> TINY", enabled=PROFILE, dev_suff="SDMA:0"):
for i in range(0, dest.nbytes, cp_size:=(self.max_copyout_size or self.b[0].size)):
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
.copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(cp_size, dest.nbytes-i)) \
@@ -563,7 +566,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
cast(HCQAllocator, src_dev.allocator).map(dest)
assert src_dev.hw_copy_queue_t is not None
with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=f"{src_dev.device} -> {dest_dev.device}", enabled=PROFILE):
with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=f"{src_dev.device} -> {dest_dev.device}", enabled=PROFILE, dev_suff="SDMA:0"):
src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
.wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
.copy(dest.va_addr, src.va_addr, sz) \

View File

@@ -168,19 +168,19 @@ def option(s:int|None) -> int: return 0 if s is None else s+1
# Profiler API
device_ts_diffs:dict[str, tuple[Decimal, Decimal]] = {}
def cpu_ts_diff(device:str, thread=0) -> Decimal: return device_ts_diffs.get(device, (Decimal(0),))[thread]
device_ts_diffs:dict[str, Decimal] = {}
def cpu_ts_diff(device:str) -> Decimal: return device_ts_diffs.get(device, Decimal(0))
amdgpu_targets:dict[str, int] = {}
DevEvent = ProfileRangeEvent|ProfileGraphEntry|ProfilePointEvent
def flatten_events(profile:list[ProfileEvent]) -> Generator[tuple[Decimal, Decimal, DevEvent], None, None]:
for e in profile:
if isinstance(e, ProfileRangeEvent): yield (e.st+(diff:=cpu_ts_diff(e.device, e.is_copy)), (e.en if e.en is not None else e.st)+diff, e)
if isinstance(e, ProfileRangeEvent): yield (e.st+(diff:=cpu_ts_diff(e.device)), (e.en if e.en is not None else e.st)+diff, e)
elif isinstance(e, ProfilePointEvent): yield (e.ts, e.ts, e)
elif isinstance(e, ProfileGraphEvent):
cpu_ts = []
for ent in e.ents: cpu_ts += [e.sigs[ent.st_id]+(diff:=cpu_ts_diff(ent.device, ent.is_copy)), e.sigs[ent.en_id]+diff]
for ent in e.ents: cpu_ts += [e.sigs[ent.st_id]+(diff:=cpu_ts_diff(ent.device)), e.sigs[ent.en_id]+diff]
yield (st:=min(cpu_ts)), (et:=max(cpu_ts)), ProfileRangeEvent(f"{e.ents[0].device.split(':')[0]} Graph", f"batched {len(e.ents)}", st, et)
for i,ent in enumerate(e.ents): yield (cpu_ts[i*2], cpu_ts[i*2+1], ent)
@@ -384,7 +384,7 @@ def get_profile(profile:list[ProfileEvent], sort_fn:Callable[[str], Any]=device_
device_decoders:dict[str, Callable[[list[ProfileEvent]], None]] = {}
for ev in profile:
if isinstance(ev, ProfileDeviceEvent):
device_ts_diffs[ev.device] = (ev.comp_tdiff,ev.copy_tdiff if ev.copy_tdiff is not None else ev.comp_tdiff)
device_ts_diffs[ev.device] = ev.tdiff
if (d:=ev.device.split(":")[0]) == "AMD":
device_decoders[d] = load_counters
amdgpu_targets[d] = unwrap(ev.props)["gfx_target_version"]