mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
diff devices for sdma (#14589)
* start * x * fix * sdma * c * clean * x * hm * cleaer
This commit is contained in:
@@ -2,27 +2,26 @@ import sys, pickle, decimal, json
|
||||
from tinygrad.device import ProfileDeviceEvent, ProfileGraphEvent
|
||||
from tinygrad.helpers import tqdm, temp, ProfileEvent, ProfileRangeEvent, TracingKey
|
||||
|
||||
devices:dict[str, tuple[decimal.Decimal, decimal.Decimal, int]] = {}
|
||||
def prep_ts(device:str, ts:decimal.Decimal, is_copy): return int(decimal.Decimal(ts) + devices[device][is_copy])
|
||||
def dev_to_pid(device:str, is_copy=False): return {"pid": devices[device][2], "tid": int(is_copy)}
|
||||
devices:dict[str, tuple[decimal.Decimal, int]] = {}
|
||||
def prep_ts(device:str, ts:decimal.Decimal): return int(decimal.Decimal(ts) + devices[device][0])
|
||||
def dev_to_pid(device:str): return {"pid": devices[device][1], "tid": 0}
|
||||
def dev_ev_to_perfetto_json(ev:ProfileDeviceEvent):
|
||||
devices[ev.device] = (ev.comp_tdiff, ev.copy_tdiff if ev.copy_tdiff is not None else ev.comp_tdiff, len(devices))
|
||||
devices[ev.device] = (ev.tdiff, len(devices))
|
||||
return [{"name": "process_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "args": {"name": ev.device}},
|
||||
{"name": "thread_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "tid": 0, "args": {"name": "COMPUTE"}},
|
||||
{"name": "thread_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "tid": 1, "args": {"name": "COPY"}}]
|
||||
{"name": "thread_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "tid": 0, "args": {"name": ev.device}}]
|
||||
def range_ev_to_perfetto_json(ev:ProfileRangeEvent):
|
||||
name = ev.name.display_name if isinstance(ev.name, TracingKey) else ev.name
|
||||
return [{"name": name, "ph": "X", "ts": prep_ts(ev.device, ev.st, ev.is_copy), "dur": float(ev.en-ev.st), **dev_to_pid(ev.device, ev.is_copy)}]
|
||||
return [{"name": name, "ph": "X", "ts": prep_ts(ev.device, ev.st), "dur": float(ev.en-ev.st), **dev_to_pid(ev.device)}]
|
||||
def graph_ev_to_perfetto_json(ev:ProfileGraphEvent, reccnt):
|
||||
ret = []
|
||||
for i,e in enumerate(ev.ents):
|
||||
st, en = ev.sigs[e.st_id], ev.sigs[e.en_id]
|
||||
name = e.name.display_name if isinstance(e.name, TracingKey) else e.name
|
||||
ret += [{"name": name, "ph": "X", "ts": prep_ts(e.device, st, e.is_copy), "dur": float(en-st), **dev_to_pid(e.device, e.is_copy)}]
|
||||
ret += [{"name": name, "ph": "X", "ts": prep_ts(e.device, st), "dur": float(en-st), **dev_to_pid(e.device)}]
|
||||
for dep in ev.deps[i]:
|
||||
d = ev.ents[dep]
|
||||
ret += [{"ph": "s", **dev_to_pid(d.device, d.is_copy), "id": reccnt+len(ret), "ts": prep_ts(d.device, ev.sigs[d.en_id], d.is_copy), "bp": "e"}]
|
||||
ret += [{"ph": "f", **dev_to_pid(e.device, e.is_copy), "id": reccnt+len(ret)-1, "ts": prep_ts(e.device, st, e.is_copy), "bp": "e"}]
|
||||
ret += [{"ph": "s", **dev_to_pid(d.device), "id": reccnt+len(ret), "ts": prep_ts(d.device, ev.sigs[d.en_id]), "bp": "e"}]
|
||||
ret += [{"ph": "f", **dev_to_pid(e.device), "id": reccnt+len(ret)-1, "ts": prep_ts(e.device, st), "bp": "e"}]
|
||||
return ret
|
||||
def to_perfetto(profile:list[ProfileEvent]):
|
||||
# Start json with devices.
|
||||
|
||||
@@ -145,7 +145,8 @@ class RGP:
|
||||
@staticmethod
|
||||
def from_profile(profile_pickled, device:str|None=None):
|
||||
profile: list[ProfileEvent] = pickle.loads(profile_pickled)
|
||||
device_events = {x.device:x for x in profile if isinstance(x, ProfileDeviceEvent) and x.device.startswith('AMD')}
|
||||
def _is_base_dev(d): return all(p.isdigit() for p in d.split(":")[1:])
|
||||
device_events = {x.device:x for x in profile if isinstance(x, ProfileDeviceEvent) and x.device.startswith('AMD') and _is_base_dev(x.device)}
|
||||
if device is None:
|
||||
if len(device_events) == 0: raise RuntimeError('No supported devices found in profile')
|
||||
if len(device_events) > 1: raise RuntimeError(f"More than one supported device found, select which one to export: {', '.join(device_events.keys())}")
|
||||
|
||||
@@ -365,8 +365,8 @@ def load_profile(lst:list[ProfileEvent]) -> dict:
|
||||
|
||||
class TestVizProfiler(BaseTestViz):
|
||||
def test_node(self):
|
||||
prof = [ProfileRangeEvent(device='NV', name='E_2', st=decimal.Decimal(1000), en=decimal.Decimal(1010), is_copy=False),
|
||||
ProfileDeviceEvent(device='NV', comp_tdiff=decimal.Decimal(-1000), copy_tdiff=decimal.Decimal(-100))]
|
||||
prof = [ProfileRangeEvent(device='NV', name='E_2', st=decimal.Decimal(1000), en=decimal.Decimal(1010)),
|
||||
ProfileDeviceEvent(device='NV', tdiff=decimal.Decimal(-1000))]
|
||||
|
||||
j = load_profile(prof)
|
||||
|
||||
@@ -379,28 +379,28 @@ class TestVizProfiler(BaseTestViz):
|
||||
assert event['ref'] is None
|
||||
|
||||
def test_copy_node(self):
|
||||
prof = [ProfileRangeEvent(device='NV', name='COPYxx', st=decimal.Decimal(1000), en=decimal.Decimal(1010), is_copy=True),
|
||||
ProfileRangeEvent(device='NV:2', name='COPYxx', st=decimal.Decimal(1000), en=decimal.Decimal(1010), is_copy=True),
|
||||
ProfileDeviceEvent(device='NV', comp_tdiff=decimal.Decimal(-1000), copy_tdiff=decimal.Decimal(-100)),
|
||||
ProfileDeviceEvent(device='NV:2', comp_tdiff=decimal.Decimal(-800), copy_tdiff=decimal.Decimal(-80))]
|
||||
prof = [ProfileRangeEvent(device='NV:SDMA:0', name='COPYxx', st=decimal.Decimal(1000), en=decimal.Decimal(1010)),
|
||||
ProfileRangeEvent(device='NV:2:SDMA:0', name='COPYxx', st=decimal.Decimal(1000), en=decimal.Decimal(1010)),
|
||||
ProfileDeviceEvent(device='NV:SDMA:0', tdiff=decimal.Decimal(-100)),
|
||||
ProfileDeviceEvent(device='NV:2:SDMA:0', tdiff=decimal.Decimal(-80))]
|
||||
|
||||
j = load_profile(prof)
|
||||
|
||||
event = j['layout']['NV']['events'][0]
|
||||
event = j['layout']['NV:SDMA:0']['events'][0]
|
||||
self.assertEqual(event['name'], 'COPYxx')
|
||||
self.assertEqual(event['st'], 0) # first event
|
||||
self.assertEqual(event['dur'], 10)
|
||||
|
||||
event2 = j['layout']['NV:2']['events'][0]
|
||||
event2 = j['layout']['NV:2:SDMA:0']['events'][0]
|
||||
self.assertEqual(event2['st'], 20) # second event, diff clock
|
||||
|
||||
self.assertEqual(j["dur"], (event2["st"]+event2["dur"])-event["st"])
|
||||
|
||||
def test_graph(self):
|
||||
prof = [ProfileDeviceEvent(device='NV', comp_tdiff=decimal.Decimal(-1000), copy_tdiff=decimal.Decimal(-100)),
|
||||
ProfileDeviceEvent(device='NV:1', comp_tdiff=decimal.Decimal(-500), copy_tdiff=decimal.Decimal(-50)),
|
||||
ProfileGraphEvent(ents=[ProfileGraphEntry(device='NV', name='E_25_4n2', st_id=0, en_id=1, is_copy=False),
|
||||
ProfileGraphEntry(device='NV:1', name='NV -> NV:1', st_id=2, en_id=3, is_copy=True)],
|
||||
prof = [ProfileDeviceEvent(device='NV', tdiff=decimal.Decimal(-1000)),
|
||||
ProfileDeviceEvent(device='NV:1:SDMA:0', tdiff=decimal.Decimal(-50)),
|
||||
ProfileGraphEvent(ents=[ProfileGraphEntry(device='NV', name='E_25_4n2', st_id=0, en_id=1),
|
||||
ProfileGraphEntry(device='NV:1:SDMA:0', name='NV -> NV:1', st_id=2, en_id=3)],
|
||||
deps=[[], [0]],
|
||||
sigs=[decimal.Decimal(1000), decimal.Decimal(1002), decimal.Decimal(1004), decimal.Decimal(1008)])]
|
||||
|
||||
@@ -409,22 +409,20 @@ class TestVizProfiler(BaseTestViz):
|
||||
tracks = list(j['layout'])
|
||||
self.assertEqual(tracks[0], 'NV')
|
||||
self.assertEqual(tracks[1], 'NV Graph')
|
||||
self.assertEqual(tracks[2], 'NV:1')
|
||||
self.assertEqual(tracks[2], 'NV:1:SDMA:0')
|
||||
|
||||
nv_events = j['layout']['NV']['events']
|
||||
self.assertEqual(nv_events[0]['name'], 'E_25_4n2')
|
||||
self.assertEqual(nv_events[0]['st'], 0)
|
||||
self.assertEqual(nv_events[0]['dur'], 2)
|
||||
#self.assertEqual(j['devEvents'][6]['pid'], j['devEvents'][0]['pid'])
|
||||
|
||||
nv1_events = j['layout']['NV:1']['events']
|
||||
self.assertEqual(nv1_events[0]['name'], 'NV -> NV:1')
|
||||
self.assertEqual(nv1_events[0]['st'], 954)
|
||||
#self.assertEqual(j['devEvents'][7]['pid'], j['devEvents'][3]['pid'])
|
||||
sdma_events = j['layout']['NV:1:SDMA:0']['events']
|
||||
self.assertEqual(sdma_events[0]['name'], 'NV -> NV:1')
|
||||
self.assertEqual(sdma_events[0]['st'], 954)
|
||||
|
||||
graph_events = j['layout']['NV Graph']['events']
|
||||
self.assertEqual(graph_events[0]['st'], nv_events[0]['st'])
|
||||
self.assertEqual(graph_events[0]['st']+graph_events[0]['dur'], nv1_events[0]['st']+nv1_events[0]['dur'])
|
||||
self.assertEqual(graph_events[0]['st']+graph_events[0]['dur'], sdma_events[0]['st']+sdma_events[0]['dur'])
|
||||
|
||||
def test_bytes_per_kernel(self):
|
||||
step = 10
|
||||
|
||||
@@ -6,6 +6,9 @@ from tinygrad.runtime.support.hcq import HCQCompiled
|
||||
from tinygrad.engine.realize import get_runner
|
||||
|
||||
MOCKGPU = getenv("MOCKGPU")
|
||||
def _dev_base(d):
|
||||
p = d.split(":")
|
||||
return p[0] if len(p) < 2 or not p[1].isdigit() else f"{p[0]}:{p[1]}"
|
||||
|
||||
@contextlib.contextmanager
|
||||
def helper_collect_profile(*devs):
|
||||
@@ -55,7 +58,7 @@ class TestProfiler(unittest.TestCase):
|
||||
kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent)]
|
||||
assert len(kernel_runs) == 1, "one kernel run is expected"
|
||||
assert kernel_runs[0].name == runner_name, "kernel name is not correct"
|
||||
assert not kernel_runs[0].is_copy, "kernel should not be copy"
|
||||
assert _dev_base(kernel_runs[0].device) == kernel_runs[0].device, "kernel should not be on a sub-device"
|
||||
|
||||
def test_profile_copyin(self):
|
||||
buf1 = Buffer(Device.DEFAULT, 2, dtypes.float, options=BufferSpec(nolru=True)).ensure_allocated()
|
||||
@@ -63,10 +66,8 @@ class TestProfiler(unittest.TestCase):
|
||||
with helper_collect_profile(TestProfiler.d0) as profile:
|
||||
buf1.copyin(memoryview(bytearray(struct.pack("ff", 0, 1))))
|
||||
|
||||
profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device)
|
||||
kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent)]
|
||||
kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent) and x.device.startswith(TestProfiler.d0.device)]
|
||||
assert len(kernel_runs) == 1, "one kernel run is expected"
|
||||
assert kernel_runs[0].is_copy, "kernel should be copy"
|
||||
|
||||
def test_profile_multiops(self):
|
||||
runner_name = TestProfiler.runner._prg.name
|
||||
@@ -77,16 +78,12 @@ class TestProfiler(unittest.TestCase):
|
||||
TestProfiler.runner([buf1, TestProfiler.a.uop.buffer], var_vals={})
|
||||
buf1.copyout(memoryview(bytearray(buf1.nbytes)))
|
||||
|
||||
profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device)
|
||||
evs = [x for x in profile if isinstance(x, ProfileRangeEvent)]
|
||||
evs = [x for x in profile if isinstance(x, ProfileRangeEvent) and x.device.startswith(TestProfiler.d0.device)]
|
||||
|
||||
assert len(evs) == 3, "3 kernel runs are expected"
|
||||
# NOTE: order of events does not matter, the tool is responsible for sorting them
|
||||
copy_events = [e for e in evs if e.is_copy]
|
||||
self.assertEqual(len(copy_events), 2)
|
||||
|
||||
prg_events = [e for e in evs if not e.is_copy]
|
||||
assert prg_events[0].name == runner_name, "kernel name is not correct"
|
||||
prg_events = [e for e in evs if e.device == TestProfiler.d0.device]
|
||||
assert any(e.name == runner_name for e in prg_events), "kernel name is not correct"
|
||||
|
||||
#for i in range(1, 3):
|
||||
# assert evs[i].st > evs[i-1].en, "timestamp not aranged"
|
||||
@@ -102,13 +99,9 @@ class TestProfiler(unittest.TestCase):
|
||||
buf1.copyin(memoryview(bytearray(struct.pack("ff", 0, 1))))
|
||||
buf2.copyin(memoryview(bytearray(struct.pack("ff", 0, 1))))
|
||||
|
||||
profile0, _ = helper_profile_filter_device(profile, TestProfiler.d0.device)
|
||||
profile1, _ = helper_profile_filter_device(profile, d1.device)
|
||||
|
||||
for p in [profile0, profile1]:
|
||||
evs = [x for x in p if isinstance(x, ProfileRangeEvent)]
|
||||
for dev in [TestProfiler.d0.device, d1.device]:
|
||||
evs = [x for x in profile if isinstance(x, ProfileRangeEvent) and _dev_base(x.device) == dev]
|
||||
assert len(evs) == 1, "one kernel runs are expected"
|
||||
assert evs[0].is_copy, "kernel should be copy"
|
||||
|
||||
def test_profile_multidev_transfer(self):
|
||||
try: d1 = Device[f"{Device.DEFAULT}:1"]
|
||||
@@ -118,10 +111,8 @@ class TestProfiler(unittest.TestCase):
|
||||
with helper_collect_profile(TestProfiler.d0, d1) as profile:
|
||||
buf1.to(f"{Device.DEFAULT}:1").realize()
|
||||
|
||||
profile0, _ = helper_profile_filter_device(profile, TestProfiler.d0.device)
|
||||
kernel_runs = [x for x in profile0 if isinstance(x, ProfileRangeEvent)]
|
||||
kernel_runs = [x for x in profile if isinstance(x, ProfileRangeEvent) and x.device.startswith(TestProfiler.d0.device)]
|
||||
assert len(kernel_runs) == 1, "one kernel run is expected"
|
||||
assert kernel_runs[0].is_copy, "kernel should be copy"
|
||||
|
||||
@unittest.skipIf(Device.DEFAULT in "METAL" or (MOCKGPU and Device.DEFAULT == "AMD"), "AMD mockgpu does not support queue wait interrupts")
|
||||
def test_profile_graph(self):
|
||||
@@ -167,17 +158,18 @@ class TestProfiler(unittest.TestCase):
|
||||
return d2.timeline_signal.timestamp - d1.timeline_signal.timestamp
|
||||
|
||||
# then test it by timing the GPU to GPU times
|
||||
dev_evs = {x.device:x for x in Compiled.profile_events if isinstance(x, ProfileDeviceEvent)}
|
||||
jitter_matrix = [[float('nan')] * len(devs) for _ in range(len(devs))]
|
||||
pairs = [(p1, p2) for p1 in enumerate(devs) for p2 in enumerate(devs) if p1 != p2]
|
||||
for (i1, d1), (i2, d2) in pairs:
|
||||
cpu_diff = d1.gpu2cpu_compute_time_diff - d2.gpu2cpu_compute_time_diff
|
||||
cpu_diff = dev_evs[d1.device].tdiff - dev_evs[d2.device].tdiff
|
||||
jitter_matrix[i1][i2] = statistics.median(_sync_d2d(d1, d2) - _sync_d2d(d2, d1) for _ in range(20)) / 2 - cpu_diff
|
||||
|
||||
print("pairwise clock jitter matrix (us):\n" + '\n'.join([''.join([f'{float(item):8.3f}' for item in row]) for row in jitter_matrix]))
|
||||
|
||||
for (i1, d1), (i2, d2) in pairs:
|
||||
assert abs(jitter_matrix[i1][i2]) < 0.5, "jitter should be less than 0.5us"
|
||||
|
||||
print("pairwise clock jitter matrix (us):\n" + '\n'.join([''.join([f'{float(item):8.3f}' for item in row]) for row in jitter_matrix]))
|
||||
|
||||
@unittest.skip("this test is flaky")
|
||||
def test_cpu_profile(self):
|
||||
def test_fxn(err=False):
|
||||
@@ -228,7 +220,7 @@ class TestProfiler(unittest.TestCase):
|
||||
Tensor.realize(a, b)
|
||||
profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device)
|
||||
exec_points = [e for e in profile if isinstance(e, ProfilePointEvent) and e.name == "exec"]
|
||||
range_events = [e for e in profile if isinstance(e, ProfileRangeEvent) and not e.is_copy]
|
||||
range_events = [e for e in profile if isinstance(e, ProfileRangeEvent) and _dev_base(e.device) == e.device]
|
||||
self.assertEqual(len(exec_points), len(range_events), 2)
|
||||
self.assertEqual(len(dedup(e.arg['name'] for e in exec_points)), 1)
|
||||
self.assertEqual(len(dedup(e.arg['metadata'] for e in exec_points)), 1)
|
||||
|
||||
@@ -56,14 +56,13 @@ atexit.register(lambda: [Device[dn].finalize() for dn in Device._opened_devices]
|
||||
# **************** Profile ****************
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileDeviceEvent(ProfileEvent):
|
||||
device:str; comp_tdiff:decimal.Decimal=decimal.Decimal(0); copy_tdiff:decimal.Decimal=decimal.Decimal(0); props:dict[str,Any]|None=None # noqa: E702
|
||||
class ProfileDeviceEvent(ProfileEvent): device:str; tdiff:decimal.Decimal=decimal.Decimal(0); props:dict[str,Any]|None=None # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None; tag:int|None=None # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int; is_copy:bool # noqa: E702
|
||||
class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfileGraphEvent(ProfileEvent): ents:list[ProfileGraphEntry]; deps:list[list[int]]; sigs:list[decimal.Decimal] # noqa: E702
|
||||
|
||||
@@ -287,8 +287,7 @@ class TracingKey:
|
||||
class ProfileEvent: pass
|
||||
|
||||
@dataclass
|
||||
class ProfileRangeEvent(ProfileEvent):
|
||||
device:str; name:str|TracingKey; st:decimal.Decimal; en:decimal.Decimal|None=None; is_copy:bool=False # noqa: E702
|
||||
class ProfileRangeEvent(ProfileEvent): device:str; name:str|TracingKey; st:decimal.Decimal; en:decimal.Decimal|None=None # noqa: E702
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProfilePointEvent(ProfileEvent):
|
||||
@@ -296,8 +295,8 @@ class ProfilePointEvent(ProfileEvent):
|
||||
|
||||
cpu_events:list[ProfileEvent] = []
|
||||
@contextlib.contextmanager
|
||||
def cpu_profile(name:str|TracingKey, device="TINY", is_copy=False, display=True) -> Generator[ProfileRangeEvent, None, None]:
|
||||
res = ProfileRangeEvent(device, name, perf_counter_us(), is_copy=is_copy)
|
||||
def cpu_profile(name:str|TracingKey, device="TINY", display=True) -> Generator[ProfileRangeEvent, None, None]:
|
||||
res = ProfileRangeEvent(device, name, perf_counter_us())
|
||||
try: yield res
|
||||
finally:
|
||||
res.en = perf_counter_us()
|
||||
|
||||
@@ -131,7 +131,8 @@ class HCQGraph(MultiGraphRunner):
|
||||
# Description based on the command.
|
||||
prof_ji_desc = ji.prg._prg.name if is_exec_prg else f"{ji.bufs[1].device} -> {ji.bufs[0].device}" # type: ignore
|
||||
|
||||
self.prof_graph_entries.append(ProfileGraphEntry(enqueue_dev.device, prof_ji_desc, sig_st, j * 2 + 1, is_copy=not is_exec_prg))
|
||||
prof_name = f"{enqueue_dev.device}:SDMA:{queue_idx}" if not is_exec_prg else enqueue_dev.device
|
||||
self.prof_graph_entries.append(ProfileGraphEntry(prof_name, prof_ji_desc, sig_st, j * 2 + 1))
|
||||
self.prof_graph_deps.append([d - 1 for _, d in rdeps])
|
||||
|
||||
last_j[enqueue_queue] = j
|
||||
|
||||
@@ -101,7 +101,7 @@ class MetalGraph(GraphRunner):
|
||||
def collect_timestamps(self):
|
||||
# create a graph event and evenly space each program
|
||||
st, en = decimal.Decimal(self.command_buffer.GPUStartTime()) * 1000000, decimal.Decimal(self.command_buffer.GPUEndTime()) * 1000000
|
||||
ents = [ProfileGraphEntry(self.device, cast(CompiledRunner, ji.prg)._prg.name, i, i+1, is_copy=False) for i,ji in enumerate(self.jit_cache)]
|
||||
ents = [ProfileGraphEntry(self.device, cast(CompiledRunner, ji.prg)._prg.name, i, i+1) for i,ji in enumerate(self.jit_cache)]
|
||||
step = (en-st)/len(ents)
|
||||
self.dev.profile_events += [ProfileGraphEvent(ents, [], [st+step*i for i in range(len(ents)+1)])]
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator, hcq_filte
|
||||
from tinygrad.uop.ops import sint
|
||||
from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerSet, CompilerPair
|
||||
from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, lo32, hi32, colored, prod, ContextVar
|
||||
from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, ceildiv
|
||||
from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, ceildiv, unwrap
|
||||
from tinygrad.renderer.cstyle import AMDHIPRenderer, AMDHIPCCRenderer
|
||||
from tinygrad.renderer.llvmir import AMDLLVMRenderer
|
||||
from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt, amdgpu_kd, amdgpu_drm
|
||||
@@ -955,6 +955,7 @@ class AMDDevice(HCQCompiled):
|
||||
ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
|
||||
|
||||
self.max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
|
||||
self.sdma_queues:dict = {}
|
||||
self.has_sdma_queue = self.sdma_queue(0) is not None
|
||||
|
||||
compilers = CompilerSet([CompilerPair(functools.partial(AMDHIPRenderer, self.arch), None),
|
||||
@@ -1018,11 +1019,12 @@ class AMDDevice(HCQCompiled):
|
||||
wptr=getattr(hsa.amd_queue_t, 'write_dispatch_id').offset, eop_buffer=eop_buffer, cwsr_buffer=cwsr_buffer,
|
||||
ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size, idx=idx))
|
||||
|
||||
@functools.lru_cache(None)
|
||||
def sdma_queue(self, idx:int):
|
||||
if getenv("AMD_DISABLE_SDMA"): return None
|
||||
with contextlib.suppress(OSError): return self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20), idx=idx)
|
||||
return None
|
||||
if idx in self.sdma_queues: return self.sdma_queues[idx]
|
||||
with contextlib.suppress(OSError):
|
||||
self.sdma_queues[idx] = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20), idx=idx)
|
||||
return self.sdma_queues.get(idx, None)
|
||||
|
||||
def _ensure_has_local_memory(self, private_segment_size):
|
||||
if self.max_private_segment_size >= private_segment_size: return
|
||||
@@ -1063,3 +1065,5 @@ class AMDDevice(HCQCompiled):
|
||||
def on_device_hang(self): self.iface.on_device_hang()
|
||||
|
||||
def device_props(self): return self.iface.props
|
||||
|
||||
def hw_copy_queues(self):
  # Build one ("SDMA:<idx>", queue factory) pair per index currently present in self.sdma_queues.
  # unwrap() is evaluated per entry, so an empty sdma_queues never touches hw_copy_queue_t.
  queues = []
  for idx in self.sdma_queues:
    queues.append((f"SDMA:{idx}", functools.partial(unwrap(self.hw_copy_queue_t), queue_idx=idx)))
  return queues
|
||||
|
||||
@@ -51,7 +51,7 @@ class MetalDevice(Compiled):
|
||||
st, en = decimal.Decimal(cbuf.GPUStartTime()) * 1000000, decimal.Decimal(cbuf.GPUEndTime()) * 1000000
|
||||
# NOTE: command buffers from MetalGraph are not profiled here
|
||||
if PROFILE and (lb:=cmdbuf_label(cbuf)) is not None and not lb.startswith("batched"):
|
||||
Compiled.profile_events += [ProfileRangeEvent(self.device, lb, st, en, is_copy=lb.startswith("COPY"))]
|
||||
Compiled.profile_events += [ProfileRangeEvent(self.device, lb, st, en)]
|
||||
self.mtl_buffers_in_flight.clear()
|
||||
|
||||
def metal_src_to_library(device:MetalDevice, src:str) -> metal.MTLLibrary:
|
||||
@@ -191,7 +191,7 @@ class MetalAllocator(LRUAllocator[MetalDevice]):
|
||||
# There is no real metal multidevice support for now, so transfer is used only for tests.
|
||||
src_dev.synchronize()
|
||||
def _cp_mv(self, dst, src, prof_desc):
|
||||
with cpu_profile(prof_desc, self.dev.device, is_copy=True): dst[:] = src
|
||||
with cpu_profile(prof_desc, self.dev.device): dst[:] = src
|
||||
def _as_buffer(self, src:MetalBuffer) -> memoryview:
|
||||
self.dev.synchronize()
|
||||
return to_mv(src.buf.contents(), src.size + src.offset)[src.offset:]
|
||||
|
||||
@@ -333,7 +333,7 @@ class NVAllocator(HCQAllocator['NVDevice']):
|
||||
self.dev._ensure_has_vid_hw(w, h)
|
||||
|
||||
q = NVVideoQueue().wait(self.dev.timeline_signal, self.dev.timeline_value - 1)
|
||||
with hcq_profile(self.dev, queue=q, desc="NVDEC", enabled=PROFILE):
|
||||
with hcq_profile(self.dev, queue=q, desc="HEVC Decode", enabled=PROFILE, dev_suff="NVDEC"):
|
||||
q.decode_hevc_chunk(desc_buf, bufin, bufout, frame_pos, hist, [(frame_pos-x) % (len(hist) + 1) for x in range(len(hist), 0, -1)],
|
||||
round_up(w, 64)*round_up(h, 64), self.dev.vid_coloc_buf, self.dev.vid_filter_buf, self.dev.intra_top_off,
|
||||
self.dev.intra_unk_off, self.dev.vid_stat_buf)
|
||||
|
||||
@@ -331,7 +331,7 @@ class QCOMAllocator(HCQAllocatorBase):
|
||||
return self.dev._gpu_map(opts.external_ptr, size, image=opts.image) if opts.external_ptr else self.dev._gpu_alloc(size, image=opts.image)
|
||||
|
||||
def _do_copy(self, src_addr, dest_addr, src_size, real_size, src_stride, dest_stride, prof_text, dest_off=0, src_off=0):
|
||||
with cpu_profile(prof_text, self.dev.device, is_copy=True):
|
||||
with cpu_profile(prof_text, self.dev.device):
|
||||
while src_off < src_size:
|
||||
ctypes.memmove(dest_addr+dest_off, src_addr+src_off, real_size)
|
||||
src_off, dest_off = src_off+src_stride, dest_off+dest_stride
|
||||
|
||||
@@ -265,7 +265,7 @@ class HCQSignal(Generic[HCQDeviceType]):
|
||||
if not_passed and self.value < value: raise RuntimeError(f"Wait timeout: {timeout} ms! (the signal is not set to {value}, but {self.value})")
|
||||
|
||||
@contextlib.contextmanager
|
||||
def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]|None=None, queue:HWQueue|None=None):
|
||||
def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]|None=None, queue:HWQueue|None=None, dev_suff:str|None=None):
|
||||
st, en = (dev.new_signal(), dev.new_signal()) if enabled else (None, None)
|
||||
assert queue is not None or queue_type is not None, "Either queue or queue_type must be provided"
|
||||
|
||||
@@ -279,7 +279,7 @@ def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]
|
||||
elif enabled and queue_type is not None:
|
||||
queue_type().wait(dev.timeline_signal, dev.timeline_value - 1).timestamp(en).signal(dev.timeline_signal, dev.next_timeline()).submit(dev)
|
||||
|
||||
if enabled and PROFILE: dev.sig_prof_records.append((unwrap(st), unwrap(en), desc, (queue_type or type(queue)) is dev.hw_copy_queue_t))
|
||||
if enabled and PROFILE: dev.sig_prof_records.append((unwrap(st), unwrap(en), desc, f"{dev.device}:{dev_suff}" if dev_suff else dev.device))
|
||||
|
||||
class HCQArgsState(Generic[ProgramType]):
|
||||
def __init__(self, buf:HCQBuffer, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint|None, ...]=()):
|
||||
@@ -376,7 +376,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
|
||||
self.timeline_value:int = 1
|
||||
self.timeline_signal, self._shadow_timeline_signal = self.new_signal(value=0, is_timeline=True), self.new_signal(value=0, is_timeline=True)
|
||||
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
|
||||
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, str]] = []
|
||||
self.prof_exec_counter:int = 0
|
||||
self.prof_prg_counter:int = 0
|
||||
|
||||
@@ -402,7 +402,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
|
||||
if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
|
||||
if PROFILE:
|
||||
Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp) for st,en,name,cp in self.sig_prof_records]
|
||||
Compiled.profile_events += [ProfileRangeEvent(dev, name, st.timestamp, en.timestamp) for st,en,name,dev in self.sig_prof_records]
|
||||
self.sig_prof_records = []
|
||||
|
||||
def next_timeline(self):
|
||||
@@ -418,6 +418,10 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
|
||||
def device_props(self) -> dict[str,Any]: return {} # to be overridden if needed. dict keys are backend dependent.
|
||||
|
||||
def hw_compute_queues(self) -> list[tuple[str|None, Callable[[], HWQueue]]]:
  # A single (name suffix, queue factory) pair; the suffix is None for the compute queue.
  compute_entry = (None, self.hw_compute_queue_t)
  return [compute_entry]
|
||||
def hw_copy_queues(self) -> list[tuple[str, Callable[[], HWQueue]]]:
  """Return the (name suffix, queue factory) pairs for this device's copy queues.

  The default is a single entry named "SDMA:0" built from `hw_copy_queue_t`, or an empty list
  when the device has no copy queue type.
  """
  if self.hw_copy_queue_t is None: return []
  return [("SDMA:0", self.hw_copy_queue_t)]
|
||||
|
||||
def _at_profile_finalize(self):
|
||||
self.synchronize() # Expect device to be synchronizes
|
||||
|
||||
@@ -428,10 +432,9 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
et = time.perf_counter_ns()
|
||||
return (decimal.Decimal(et+st) / 2000) - d.timeline_signal.timestamp
|
||||
|
||||
self.gpu2cpu_compute_time_diff = statistics.median([_sync(self, self.hw_compute_queue_t) for _ in range(40)])
|
||||
if self.hw_copy_queue_t is None: gpu2cpu_copy_time_diff = decimal.Decimal(0)
|
||||
else: gpu2cpu_copy_time_diff = statistics.median([_sync(self, self.hw_copy_queue_t) for _ in range(40)])
|
||||
Compiled.profile_events += [ProfileDeviceEvent(self.device, self.gpu2cpu_compute_time_diff, gpu2cpu_copy_time_diff, props=self.device_props())]
|
||||
for prefix, q_t in self.hw_compute_queues() + self.hw_copy_queues():
|
||||
devname = f"{self.device}:{prefix}" if prefix else self.device
|
||||
Compiled.profile_events += [ProfileDeviceEvent(devname, statistics.median([_sync(self, q_t) for _ in range(40)]), props=self.device_props())]
|
||||
|
||||
def _wrap_timeline_signal(self):
|
||||
self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1
|
||||
@@ -514,10 +517,10 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
||||
def _copyin(self, dest:HCQBuffer, src:memoryview):
|
||||
if self.dev.hw_copy_queue_t is None:
|
||||
self.dev.synchronize()
|
||||
with cpu_profile(f'TINY -> {self.dev.device}', self.dev.device, is_copy=True): ctypes.memmove(int(dest.va_addr), from_mv(src), len(src))
|
||||
with cpu_profile(f'TINY -> {self.dev.device}', self.dev.device): ctypes.memmove(int(dest.va_addr), from_mv(src), len(src))
|
||||
return
|
||||
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"TINY -> {self.dev.device}", enabled=PROFILE):
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"TINY -> {self.dev.device}", enabled=PROFILE, dev_suff="SDMA:0"):
|
||||
for i in range(0, src.nbytes, self.b[0].size):
|
||||
self.b_next = (self.b_next + 1) % len(self.b)
|
||||
self.dev.timeline_signal.wait(self.b_timeline[self.b_next])
|
||||
@@ -538,7 +541,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
||||
return None
|
||||
|
||||
assert self.dev.hw_copy_queue_t is not None
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"DISK -> {self.dev.device}", enabled=PROFILE):
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"DISK -> {self.dev.device}", enabled=PROFILE, dev_suff="SDMA:0"):
|
||||
for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size):
|
||||
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
|
||||
.copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \
|
||||
@@ -548,10 +551,10 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
||||
def _copyout(self, dest:memoryview, src:HCQBuffer):
|
||||
self.dev.synchronize()
|
||||
if self.dev.hw_copy_queue_t is None:
|
||||
with cpu_profile(f'{self.dev.device} -> TINY', self.dev.device, is_copy=True): ctypes.memmove(from_mv(dest), int(src.va_addr), len(dest))
|
||||
with cpu_profile(f'{self.dev.device} -> TINY', self.dev.device): ctypes.memmove(from_mv(dest), int(src.va_addr), len(dest))
|
||||
return
|
||||
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> TINY", enabled=PROFILE):
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> TINY", enabled=PROFILE, dev_suff="SDMA:0"):
|
||||
for i in range(0, dest.nbytes, cp_size:=(self.max_copyout_size or self.b[0].size)):
|
||||
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
|
||||
.copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(cp_size, dest.nbytes-i)) \
|
||||
@@ -563,7 +566,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
||||
cast(HCQAllocator, src_dev.allocator).map(dest)
|
||||
|
||||
assert src_dev.hw_copy_queue_t is not None
|
||||
with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=f"{src_dev.device} -> {dest_dev.device}", enabled=PROFILE):
|
||||
with hcq_profile(src_dev, queue_type=src_dev.hw_copy_queue_t, desc=f"{src_dev.device} -> {dest_dev.device}", enabled=PROFILE, dev_suff="SDMA:0"):
|
||||
src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
|
||||
.wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
|
||||
.copy(dest.va_addr, src.va_addr, sz) \
|
||||
|
||||
@@ -168,19 +168,19 @@ def option(s:int|None) -> int: return 0 if s is None else s+1
|
||||
|
||||
# Profiler API
|
||||
|
||||
device_ts_diffs:dict[str, tuple[Decimal, Decimal]] = {}
|
||||
def cpu_ts_diff(device:str, thread=0) -> Decimal: return device_ts_diffs.get(device, (Decimal(0),))[thread]
|
||||
device_ts_diffs:dict[str, Decimal] = {}
|
||||
def cpu_ts_diff(device:str) -> Decimal:
  # Look up the offset recorded for this device name; names with no recorded offset get zero.
  if device in device_ts_diffs: return device_ts_diffs[device]
  return Decimal(0)
|
||||
|
||||
amdgpu_targets:dict[str, int] = {}
|
||||
|
||||
DevEvent = ProfileRangeEvent|ProfileGraphEntry|ProfilePointEvent
|
||||
def flatten_events(profile:list[ProfileEvent]) -> Generator[tuple[Decimal, Decimal, DevEvent], None, None]:
|
||||
for e in profile:
|
||||
if isinstance(e, ProfileRangeEvent): yield (e.st+(diff:=cpu_ts_diff(e.device, e.is_copy)), (e.en if e.en is not None else e.st)+diff, e)
|
||||
if isinstance(e, ProfileRangeEvent): yield (e.st+(diff:=cpu_ts_diff(e.device)), (e.en if e.en is not None else e.st)+diff, e)
|
||||
elif isinstance(e, ProfilePointEvent): yield (e.ts, e.ts, e)
|
||||
elif isinstance(e, ProfileGraphEvent):
|
||||
cpu_ts = []
|
||||
for ent in e.ents: cpu_ts += [e.sigs[ent.st_id]+(diff:=cpu_ts_diff(ent.device, ent.is_copy)), e.sigs[ent.en_id]+diff]
|
||||
for ent in e.ents: cpu_ts += [e.sigs[ent.st_id]+(diff:=cpu_ts_diff(ent.device)), e.sigs[ent.en_id]+diff]
|
||||
yield (st:=min(cpu_ts)), (et:=max(cpu_ts)), ProfileRangeEvent(f"{e.ents[0].device.split(':')[0]} Graph", f"batched {len(e.ents)}", st, et)
|
||||
for i,ent in enumerate(e.ents): yield (cpu_ts[i*2], cpu_ts[i*2+1], ent)
|
||||
|
||||
@@ -384,7 +384,7 @@ def get_profile(profile:list[ProfileEvent], sort_fn:Callable[[str], Any]=device_
|
||||
device_decoders:dict[str, Callable[[list[ProfileEvent]], None]] = {}
|
||||
for ev in profile:
|
||||
if isinstance(ev, ProfileDeviceEvent):
|
||||
device_ts_diffs[ev.device] = (ev.comp_tdiff,ev.copy_tdiff if ev.copy_tdiff is not None else ev.comp_tdiff)
|
||||
device_ts_diffs[ev.device] = ev.tdiff
|
||||
if (d:=ev.device.split(":")[0]) == "AMD":
|
||||
device_decoders[d] = load_counters
|
||||
amdgpu_targets[d] = unwrap(ev.props)["gfx_target_version"]
|
||||
|
||||
Reference in New Issue
Block a user