mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
cpu: copies in profile (#11392)
* cpu: copies in profile * fix * rename to tiny?
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
import platform, subprocess, sys, ctypes, functools, time, mmap
|
||||
from tinygrad.helpers import capstone_flatdump, getenv, from_mv, to_mv, OSX, mv_address, wait_cond
|
||||
from tinygrad.helpers import capstone_flatdump, getenv, from_mv, to_mv, OSX, mv_address, wait_cond, cpu_profile
|
||||
from tinygrad.device import Compiler, BufferSpec, DMACPURef
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocatorBase, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
|
||||
from tinygrad.runtime.support.elf import jit_loader
|
||||
@@ -92,8 +92,10 @@ class CPUAllocator(HCQAllocatorBase):
|
||||
return HCQBuffer(va:=addr, sz:=size, meta=buf, view=MMIOInterface(va, sz, fmt='B'), owner=self.dev)
|
||||
def _as_buffer(self, src) -> memoryview: return to_mv(src.va_addr, src.size)
|
||||
def _as_dmaref(self, buf): return DMACPURef(buf.va_addr, buf.size)
|
||||
def _copyin(self, dest, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), len(src))
|
||||
def _copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src.va_addr, len(dest))
|
||||
def _copyin(self, dest, src:memoryview):
|
||||
with cpu_profile('TINY -> CPU', self.dev.device, is_copy=True): ctypes.memmove(dest.va_addr, from_mv(src), len(src))
|
||||
def _copyout(self, dest:memoryview, src):
|
||||
with cpu_profile('CPU -> TINY', self.dev.device, is_copy=True): ctypes.memmove(from_mv(dest), src.va_addr, len(dest))
|
||||
def _map(self, buf:HCQBuffer):
|
||||
if buf.view is None or not isinstance(buf.view, MMIOInterface): raise RuntimeError("Cannot map buffer without view to cpu")
|
||||
|
||||
|
||||
@@ -226,6 +226,6 @@ class MetalAllocator(LRUAllocator[MetalDevice]):
|
||||
def _as_buffer(self, src:MetalBuffer) -> memoryview:
|
||||
self.dev.synchronize()
|
||||
return to_mv(cast(int, msg("contents", objc_id)(src.buf).value), src.size + src.offset)[src.offset:]
|
||||
def _copyin(self, dest:MetalBuffer, src:memoryview): self._cp_mv(self._as_buffer(dest), src, "CPU -> METAL")
|
||||
def _copyout(self, dest:memoryview, src:MetalBuffer): self._cp_mv(dest, self._as_buffer(src), "METAL -> CPU")
|
||||
def _copyin(self, dest:MetalBuffer, src:memoryview): self._cp_mv(self._as_buffer(dest), src, "TINY -> METAL")
|
||||
def _copyout(self, dest:memoryview, src:MetalBuffer): self._cp_mv(dest, self._as_buffer(src), "METAL -> TINY")
|
||||
def _offset(self, buf:MetalBuffer, size:int, offset:int): return MetalBuffer(buf.buf, size, offset)
|
||||
|
||||
@@ -496,7 +496,7 @@ class HCQAllocatorBase(LRUAllocator[HCQDeviceType], Generic[HCQDeviceType]):
|
||||
class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
||||
def _copyin(self, dest:HCQBuffer, src:memoryview):
|
||||
assert self.dev.hw_copy_queue_t is not None
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"CPU -> {self.dev.device}", enabled=PROFILE):
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"TINY -> {self.dev.device}", enabled=PROFILE):
|
||||
for i in range(0, src.nbytes, self.b[0].size):
|
||||
self.b_next = (self.b_next + 1) % len(self.b)
|
||||
self.dev.timeline_signal.wait(self.b_timeline[self.b_next])
|
||||
@@ -528,7 +528,7 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
|
||||
self.dev.synchronize()
|
||||
|
||||
assert self.dev.hw_copy_queue_t is not None
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> CPU", enabled=PROFILE):
|
||||
with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> TINY", enabled=PROFILE):
|
||||
for i in range(0, dest.nbytes, cp_size:=(self.max_copyout_size or self.b[0].size)):
|
||||
self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
|
||||
.copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(cp_size, dest.nbytes-i)) \
|
||||
|
||||
Reference in New Issue
Block a user