diff --git a/test/test_profiler.py b/test/test_profiler.py index 71755d2afd..b307c45ac4 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -10,15 +10,15 @@ MOCKGPU = getenv("MOCKGPU") @contextlib.contextmanager def helper_collect_profile(*devs): - Compiled.profile_events = [] + for dev in devs: dev.synchronize() + Compiled.profile_events = [x for x in Compiled.profile_events if isinstance(x, ProfileDeviceEvent) and x.device.startswith("METAL")] profile_list = [] with Context(PROFILE=1): - try: yield profile_list - finally: - for dev in devs: dev.synchronize() - for dev in devs: dev._at_profile_finalize() - for x in Compiled.profile_events: profile_list.append(x) + yield profile_list + for dev in devs: dev.synchronize() + for dev in devs: dev._at_profile_finalize() + for x in Compiled.profile_events: profile_list.append(x) def helper_profile_filter_device(profile, device:str): assert any(getattr(x, "device", None) == device and isinstance(x, ProfileDeviceEvent) for x in profile), f"device {device} is not registred" @@ -26,7 +26,7 @@ def helper_profile_filter_device(profile, device:str): assert len(dev_events) == 1, "only one device registration event is expected" return [x for x in profile if getattr(x, "device", None) == device], dev_events[0] -@unittest.skipUnless(issubclass(type(Device[Device.DEFAULT]), HCQCompiled), "HCQ device required to run") +@unittest.skipUnless(issubclass(type(Device[Device.DEFAULT]), HCQCompiled) or Device.DEFAULT in {"METAL"}, "HCQ device required to run") class TestProfiler(unittest.TestCase): @classmethod def setUpClass(self): @@ -39,9 +39,6 @@ class TestProfiler(unittest.TestCase): TestProfiler.runner = get_runner(TestProfiler.d0.device, si.ast) TestProfiler.b.lazydata.buffer.allocate() - TestProfiler.kernargs_ba_ptr = TestProfiler.runner._prg.fill_kernargs([TestProfiler.b.lazydata.buffer._buf, TestProfiler.a.lazydata.buffer._buf]) - TestProfiler.kernargs_ab_ptr = 
TestProfiler.runner._prg.fill_kernargs([TestProfiler.a.lazydata.buffer._buf, TestProfiler.b.lazydata.buffer._buf]) - def test_profile_kernel_run(self): runner_name = TestProfiler.runner._prg.name with helper_collect_profile(TestProfiler.d0) as profile: @@ -71,12 +68,12 @@ class TestProfiler(unittest.TestCase): with helper_collect_profile(TestProfiler.d0) as profile: buf1.copyin(memoryview(bytearray(struct.pack("ff", 0, 1)))) TestProfiler.runner([buf1, TestProfiler.a.lazydata.buffer], var_vals={}) - buf1.as_buffer() + buf1.copyout(memoryview(bytearray(buf1.nbytes))) profile, _ = helper_profile_filter_device(profile, TestProfiler.d0.device) evs = [x for x in profile if isinstance(x, ProfileRangeEvent)] - assert len(evs) == 3, "two kernel runs are expected" + assert len(evs) == 3, "3 kernel runs are expected" assert evs[0].is_copy, "kernel should be copy" assert evs[1].name == runner_name, "kernel name is not correct" assert not evs[1].is_copy, "kernel should not be copy" @@ -102,7 +99,7 @@ class TestProfiler(unittest.TestCase): assert len(evs) == 1, "one kernel runs are expected" assert evs[0].is_copy, "kernel should be copy" - @unittest.skipIf(MOCKGPU and Device.DEFAULT == "AMD", "AMD mockgpu with indirect buffers does not support queue wait interrupts") + @unittest.skipIf(Device.DEFAULT in {"METAL"} or (MOCKGPU and Device.DEFAULT == "AMD"), "METAL or AMD mockgpu does not support queue wait interrupts") def test_profile_graph(self): d1 = Device[f"{Device.DEFAULT}:1"] @@ -124,7 +121,7 @@ class TestProfiler(unittest.TestCase): assert len(graph_evs) == 1, "one graph event is expected" assert len(graph_evs[0].ents) == 2, "two entities are expected" - @unittest.skipIf(CI, "skip CI") + @unittest.skipIf(CI or not issubclass(type(Device[Device.DEFAULT]), HCQCompiled), "skip CI") def test_dev_jitter_matrix(self): dev_cnt = 6 devs = [Device[f"{Device.DEFAULT}:{i}"] for i in range(dev_cnt)] diff --git a/tinygrad/device.py b/tinygrad/device.py index 56d51fccff..99e52831dc 100644 ---
a/tinygrad/device.py +++ b/tinygrad/device.py @@ -1,8 +1,8 @@ from __future__ import annotations from dataclasses import dataclass, replace from collections import defaultdict -from typing import Optional, Dict, Tuple, Any, Iterator, List, Set -import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, contextlib, sys, re, atexit, pickle, decimal +from typing import Optional, Dict, Tuple, Any, Iterator, List, Set, Generator +import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, contextlib, sys, re, atexit, pickle, decimal, time from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes from tinygrad.renderer import Renderer @@ -61,6 +61,16 @@ class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int; is_copy:boo @dataclass(frozen=True) class ProfileGraphEvent(ProfileEvent): ents:List[ProfileGraphEntry]; deps:List[List[int]]; sigs:List[decimal.Decimal] # noqa: E702 +@dataclass +class ProfileResult: st:Optional[int]=None; en:Optional[int]=None # noqa: E702 + +@contextlib.contextmanager +def cpu_profile(name, device="CPU", is_copy=False, display=True) -> Generator[ProfileResult, None, None]: + yield (res:=ProfileResult(st:=time.perf_counter_ns())) + res.en = en = time.perf_counter_ns() + if PROFILE and display: + Compiled.profile_events += [ProfileRangeEvent(device, name, decimal.Decimal(st) / 1000, decimal.Decimal(en) / 1000, is_copy=is_copy)] + # **************** Buffer + Allocators **************** @@ -221,7 +231,7 @@ class Compiler: def disassemble(self, lib:bytes): pass class Compiled: - profile_events:List[ProfileEvent] = [] + profile_events:List[ProfileEvent] = [ProfileDeviceEvent("CPU")] # NOTE: CPU is the default device. 
def __init__(self, device:str, allocator:Allocator, renderer:Optional[Renderer], compiler:Optional[Compiler], runtime, graph=None): self.device, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler or Compiler(), runtime, graph diff --git a/tinygrad/runtime/graph/metal.py b/tinygrad/runtime/graph/metal.py index 84c5f7d1dc..62afcaf439 100644 --- a/tinygrad/runtime/graph/metal.py +++ b/tinygrad/runtime/graph/metal.py @@ -7,7 +7,7 @@ from tinygrad.engine.realize import ExecItem, CompiledRunner from tinygrad.engine.jit import GraphRunner, GraphException from tinygrad.ops import Variable from tinygrad.runtime.ops_metal import wait_check, msg, libobjc, to_struct, objc_instance,\ - MTLResourceOptions, elapsed_time, objc_id + MTLResourceOptions, cmdbuf_st_time, cmdbuf_en_time, objc_id, to_ns_str class MTLIndirectCommandType: MTLIndirectCommandTypeConcurrentDispatch = (1 << 5) @@ -90,11 +90,12 @@ class MetalGraph(GraphRunner): msg(encoder, "executeCommandsInBuffer:withRange:", self.icb, self.range) msg(encoder, "endEncoding") + msg(command_buffer, "setLabel:", to_ns_str(f"batched {len(self.jit_cache)}")) msg(command_buffer, "commit") self.command_buffer = command_buffer + self.dev.mtl_buffers_in_flight.append(command_buffer) if wait: wait_check(command_buffer) - return elapsed_time(command_buffer) - self.dev.mtl_buffers_in_flight.append(command_buffer) + return cmdbuf_en_time(command_buffer) - cmdbuf_st_time(command_buffer) return None diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index 82b50c929d..69d96a20bd 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -1,8 +1,8 @@ from __future__ import annotations -import os, pathlib, struct, ctypes, tempfile, functools +import os, pathlib, struct, ctypes, tempfile, functools, decimal from typing import List, Any, Union, Tuple, cast -from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, T, init_c_struct_t -from 
tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator +from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, T, init_c_struct_t, PROFILE +from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, cpu_profile, ProfileDeviceEvent, ProfileRangeEvent from tinygrad.renderer.cstyle import MetalRenderer class objc_id(ctypes.c_void_p): # This prevents ctypes from converting response to plain int, and dict.fromkeys() can use it to dedup @@ -44,6 +44,7 @@ def msg(ptr: objc_id, selector: str, /, *args: Any, restype: type[T] = objc_id) return sender(ptr, sel(selector), *args) def to_ns_str(s: str): return msg(libobjc.objc_getClass(b"NSString"), "stringWithUTF8String:", s.encode(), restype=objc_instance) +def from_ns_str(s): return bytes(msg(s, "UTF8String", restype=ctypes.c_char_p)).decode() def to_struct(*t: int, _type: type = ctypes.c_ulong): return init_c_struct_t(tuple([(f"field{i}", _type) for i in range(len(t))]))(*t) @@ -51,12 +52,13 @@ def wait_check(cbuf: Any): msg(cbuf, "waitUntilCompleted") error_check(msg(cbuf, "error", restype=objc_instance)) -def elapsed_time(cbuf: objc_id): - return cast(float, msg(cbuf, "GPUEndTime", restype=ctypes.c_double)) - cast(float, msg(cbuf, "GPUStartTime", restype=ctypes.c_double)) +def cmdbuf_label(cbuf: objc_id) -> str: return from_ns_str(msg(cbuf, "label", restype=objc_id)) +def cmdbuf_st_time(cbuf: objc_id) -> float: return cast(float, msg(cbuf, "GPUStartTime", restype=ctypes.c_double)) +def cmdbuf_en_time(cbuf: objc_id) -> float: return cast(float, msg(cbuf, "GPUEndTime", restype=ctypes.c_double)) def error_check(error: objc_instance, error_constructor: type[Exception] = RuntimeError): if error.value is None: return None - raise error_constructor(bytes(msg(msg(error, "localizedDescription", restype=objc_instance), "UTF8String", restype=ctypes.c_char_p)).decode()) + raise error_constructor(from_ns_str(msg(error, "localizedDescription", restype=objc_instance))) def 
metal_src_to_library(device:MetalDevice, src:str) -> objc_instance: options = msg(libobjc.objc_getClass(b"MTLCompileOptions"), "new", restype=objc_instance) @@ -136,11 +138,12 @@ class MetalProgram: for i,a in enumerate(vals, start=len(bufs)): msg(encoder, "setBytes:length:atIndex:", bytes(ctypes.c_int(a)), 4, i) msg(encoder, "dispatchThreadgroups:threadsPerThreadgroup:", to_struct(*global_size), to_struct(*local_size)) msg(encoder, "endEncoding") + msg(command_buffer, "setLabel:", to_ns_str(self.name)) msg(command_buffer, "commit") + self.dev.mtl_buffers_in_flight.append(command_buffer) if wait: wait_check(command_buffer) - return elapsed_time(command_buffer) - self.dev.mtl_buffers_in_flight.append(command_buffer) + return cmdbuf_en_time(command_buffer) - cmdbuf_st_time(command_buffer) class MetalBuffer: def __init__(self, buf:Any, size:int, offset=0): self.buf, self.size, self.offset = buf, size, offset @@ -172,11 +175,13 @@ class MetalAllocator(LRUAllocator): src_dev.timeline_value += 1 msg(src_command_buffer, "commit") src_dev.mtl_buffers_in_flight.append(src_command_buffer) + def _cp_mv(self, dst, src, prof_desc): + with cpu_profile(prof_desc, self.dev.device, is_copy=True): dst[:] = src def _as_buffer(self, src:MetalBuffer) -> memoryview: self.dev.synchronize() return to_mv(cast(int, msg(src.buf, "contents", restype=objc_id).value), src.size + src.offset)[src.offset:] - def _copyin(self, dest:MetalBuffer, src:memoryview): self._as_buffer(dest)[:] = src - def _copyout(self, dest:memoryview, src:MetalBuffer): dest[:] = self._as_buffer(src) + def _copyin(self, dest:MetalBuffer, src:memoryview): self._cp_mv(self._as_buffer(dest), src, "CPU -> METAL") + def _copyout(self, dest:memoryview, src:MetalBuffer): self._cp_mv(dest, self._as_buffer(src), "METAL -> CPU") def _offset(self, buf:MetalBuffer, size:int, offset:int): return MetalBuffer(buf.buf, size, offset) class MetalDevice(Compiled): @@ -185,14 +190,18 @@ class MetalDevice(Compiled): self.mtl_queue = 
msg(self.sysdevice, "newCommandQueueWithMaxCommandBufferCount:", 1024, restype=objc_instance) if self.mtl_queue is None: raise RuntimeError("Cannot allocate a new command queue") self.mtl_buffers_in_flight: List[Any] = [] - self.mv_in_metal: List[memoryview] = [] self.timeline_signal = msg(self.sysdevice, "newSharedEvent", restype=objc_instance) self.timeline_value = 0 + Compiled.profile_events += [ProfileDeviceEvent(device)] + from tinygrad.runtime.graph.metal import MetalGraph super().__init__(device, MetalAllocator(self), MetalRenderer(), MetalCompiler() if getenv("METAL_DIRECT", 1) else Compiler(), functools.partial(MetalProgram, self), MetalGraph) + def synchronize(self): - for cbuf in self.mtl_buffers_in_flight: wait_check(cbuf) - self.mv_in_metal.clear() + for cbuf in self.mtl_buffers_in_flight: + wait_check(cbuf) + st, en = decimal.Decimal(cmdbuf_st_time(cbuf)) * 1000000, decimal.Decimal(cmdbuf_en_time(cbuf)) * 1000000 + if PROFILE: Compiled.profile_events += [ProfileRangeEvent(self.device, cmdbuf_label(cbuf), st, en, is_copy=False)] self.mtl_buffers_in_flight.clear() diff --git a/tinygrad/viz/README b/tinygrad/viz/README index 95f503e6bd..4565529c67 100644 --- a/tinygrad/viz/README +++ b/tinygrad/viz/README @@ -8,16 +8,17 @@ https://tiny-tools-client.vercel.app/ and a viewer for: SAVE_SCHEDULE=1 TRACK_MATCH_STATS=2 -PROFILE=1 (eventually) +PROFILE=1 to use: -1. Run tinygrad with VIZ=1 (this saves the pkls and launches the server (new process please!)) +1. Run tinygrad with VIZ=1 and/or PROFILE=1 (this saves the pkls and launches the server (new process please!)) 2. That's it! This should be able to: -1. See all schedules -2. See all graphs and how they were rewritten -3. See generated code +1. See all schedules (VIZ=1) +2. See all graphs and how they were rewritten (VIZ=1) +3. See generated code (VIZ=1) +4. See profile (PROFILE=1) bunch of dev rules: * everything must be responsive to keyboard smashing! lag should never happen