diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index 07fa2b63d4..7040bd2db1 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -1,6 +1,6 @@ from __future__ import annotations import platform, subprocess, sys, ctypes, functools, time, mmap -from tinygrad.helpers import capstone_flatdump, getenv, from_mv, to_mv, OSX, mv_address, wait_cond +from tinygrad.helpers import capstone_flatdump, getenv, from_mv, to_mv, OSX, mv_address, wait_cond, cpu_profile from tinygrad.device import Compiler, BufferSpec, DMACPURef from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocatorBase, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface from tinygrad.runtime.support.elf import jit_loader @@ -92,8 +92,10 @@ class CPUAllocator(HCQAllocatorBase): return HCQBuffer(va:=addr, sz:=size, meta=buf, view=MMIOInterface(va, sz, fmt='B'), owner=self.dev) def _as_buffer(self, src) -> memoryview: return to_mv(src.va_addr, src.size) def _as_dmaref(self, buf): return DMACPURef(buf.va_addr, buf.size) - def _copyin(self, dest, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), len(src)) - def _copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src.va_addr, len(dest)) + def _copyin(self, dest, src:memoryview): + with cpu_profile('TINY -> CPU', self.dev.device, is_copy=True): ctypes.memmove(dest.va_addr, from_mv(src), len(src)) + def _copyout(self, dest:memoryview, src): + with cpu_profile('CPU -> TINY', self.dev.device, is_copy=True): ctypes.memmove(from_mv(dest), src.va_addr, len(dest)) def _map(self, buf:HCQBuffer): if buf.view is None or not isinstance(buf.view, MMIOInterface): raise RuntimeError("Cannot map buffer without view to cpu") diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index 719377105a..027e5080ab 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -226,6 +226,6 @@ class MetalAllocator(LRUAllocator[MetalDevice]): def _as_buffer(self, src:MetalBuffer) -> memoryview: self.dev.synchronize() return to_mv(cast(int, msg("contents", objc_id)(src.buf).value), src.size + src.offset)[src.offset:] - def _copyin(self, dest:MetalBuffer, src:memoryview): self._cp_mv(self._as_buffer(dest), src, "CPU -> METAL") - def _copyout(self, dest:memoryview, src:MetalBuffer): self._cp_mv(dest, self._as_buffer(src), "METAL -> CPU") + def _copyin(self, dest:MetalBuffer, src:memoryview): self._cp_mv(self._as_buffer(dest), src, "TINY -> METAL") + def _copyout(self, dest:memoryview, src:MetalBuffer): self._cp_mv(dest, self._as_buffer(src), "METAL -> TINY") def _offset(self, buf:MetalBuffer, size:int, offset:int): return MetalBuffer(buf.buf, size, offset) diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 0dda28f499..38296547ac 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -496,7 +496,7 @@ class HCQAllocatorBase(LRUAllocator[HCQDeviceType], Generic[HCQDeviceType]): class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]): def _copyin(self, dest:HCQBuffer, src:memoryview): assert self.dev.hw_copy_queue_t is not None - with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"CPU -> {self.dev.device}", enabled=PROFILE): + with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"TINY -> {self.dev.device}", enabled=PROFILE): for i in range(0, src.nbytes, self.b[0].size): self.b_next = (self.b_next + 1) % len(self.b) self.dev.timeline_signal.wait(self.b_timeline[self.b_next]) @@ -528,7 +528,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]): self.dev.synchronize() assert self.dev.hw_copy_queue_t is not None - with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> CPU", enabled=PROFILE): + with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> TINY", enabled=PROFILE): for i in range(0, dest.nbytes, cp_size:=(self.max_copyout_size or self.b[0].size)): self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \ .copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(cp_size, dest.nbytes-i)) \