amd: copies w/o sdma (#14036)
* amd: copies w/o sdma
* as_args
* fixes
* f
@@ -113,7 +113,7 @@ class TestGraph(unittest.TestCase):
   def skip_if_not_multigraph(self):
     graph = g.func if isinstance(g:=(d:=Device[Device.DEFAULT]).graph, functools.partial) else g
     if not issubclass(graph, MultiGraphRunner): self.skipTest("graph is not supported (not MultiGraphRunner)")
-    if not hasattr(d.allocator, '_transfer'): self.skipTest("device is not supported (no transfers)")
+    if not hasattr(d.allocator, '_transfer') or not d.allocator.supports_transfer: self.skipTest("device is not supported (no transfers)")

   def test_order_copy_writed(self):
     self.skip_if_not_multigraph()
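The capability flag is what the test now keys on: the graph test skips unless the default device's allocator both defines `_transfer` and reports `supports_transfer`. A minimal standalone sketch of that capability-gated skip, using hypothetical FakeAllocator/FakeDevice stand-ins rather than the real tinygrad classes:

```python
# Hypothetical stand-ins for illustration; not the tinygrad test or device classes.
import unittest

class FakeAllocator:
    supports_transfer = False                 # device reports no peer-to-peer transfer support
    def _transfer(self, dest, src, sz): ...   # the method exists, but the flag disables it

class FakeDevice:
    allocator = FakeAllocator()

class TestTransferGated(unittest.TestCase):
    def skip_if_no_transfer(self, dev):
        # mirrors the diff's check: the method must exist AND the capability flag must be set
        if not hasattr(dev.allocator, '_transfer') or not dev.allocator.supports_transfer:
            self.skipTest("device is not supported (no transfers)")

    def test_something_needing_transfers(self):
        self.skip_if_no_transfer(FakeDevice())
        # ... would exercise buffer transfers here ...

if __name__ == "__main__":
    unittest.main()
```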
@@ -218,10 +218,10 @@ DeviceType = TypeVar('DeviceType', bound='Compiled')

 # TODO: size, dest, src are the same type. can we enforce this?
 class Allocator(Generic[DeviceType]):
-  def __init__(self, dev:DeviceType):
+  def __init__(self, dev:DeviceType, supports_copy_from_disk:bool=True, supports_transfer:bool=True):
     self.dev: DeviceType = dev
     self.default_buffer_spec: BufferSpec = BufferSpec()
-    self.supports_copy_from_disk: bool = True
+    self.supports_copy_from_disk, self.supports_transfer = supports_copy_from_disk, supports_transfer
   # overridden in LRUAllocator
   def alloc(self, size:int, options:BufferSpec|None=None):
     assert size > 0, f"alloc size must be positive, getting {size}"
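The base `Allocator` now takes the capability flags as constructor arguments instead of hard-coding `supports_copy_from_disk = True`. A simplified sketch of the new constructor shape, with illustrative stand-in classes (not the real tinygrad hierarchy), showing how a backend without a copy engine opts out:

```python
# Simplified stand-ins following the diff's constructor shape; not the real tinygrad classes.
class Allocator:
    def __init__(self, dev, supports_copy_from_disk: bool = True, supports_transfer: bool = True):
        self.dev = dev
        # capability flags are injected by the backend instead of being hard-coded to True
        self.supports_copy_from_disk, self.supports_transfer = supports_copy_from_disk, supports_transfer

class NoTransferAllocator(Allocator):
    # a backend without a DMA/copy engine opts out of both optimized paths
    def __init__(self, dev):
        super().__init__(dev, supports_copy_from_disk=False, supports_transfer=False)

alloc = NoTransferAllocator(dev="FAKE:0")
print(alloc.supports_transfer)  # False -> callers fall back to a plain host copy
```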
@@ -244,9 +244,9 @@ class LRUAllocator(Allocator, Generic[DeviceType]):
   The LRU Allocator is responsible for caching buffers.
   It ensures that buffers are not freed until it is absolutely necessary, optimizing performance.
   """
-  def __init__(self, dev:DeviceType):
+  def __init__(self, dev:DeviceType, **kwargs):
     self.cache: dict[tuple[int, BufferSpec|None], Any] = defaultdict(list)
-    super().__init__(dev)
+    super().__init__(dev, **kwargs)
   def alloc(self, size:int, options:BufferSpec|None=None):
     if len(c := self.cache[(size, options)]): return c.pop()
     try: return super().alloc(size, options)
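`LRUAllocator` only threads unknown keyword arguments through to `Allocator`, so a concrete backend can set the new flags without every intermediate class re-declaring them. A generic sketch of that cooperative `**kwargs` forwarding, with illustrative Base/Middle/Leaf classes:

```python
# Generic kwargs-forwarding sketch; Base/Middle/Leaf are illustrative, not tinygrad classes.
class Base:
    def __init__(self, dev, supports_transfer: bool = True):
        self.dev, self.supports_transfer = dev, supports_transfer

class Middle(Base):
    def __init__(self, dev, **kwargs):
        self.cache = {}                  # Middle only adds its own state...
        super().__init__(dev, **kwargs)  # ...and passes capability flags straight through

class Leaf(Middle):
    def __init__(self, dev):
        super().__init__(dev, supports_transfer=False)

print(Leaf("FAKE:0").supports_transfer)  # False, threaded through Middle untouched
```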
@@ -2,8 +2,7 @@ from typing import cast, Callable
 import time, pprint, random, itertools, math
 from dataclasses import dataclass, replace, field
 from tinygrad.helpers import all_same, colored, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int, CAPTURING, Metadata, TRACEMETA, TracingKey
-from tinygrad.helpers import DEVECTORIZE, time_to_str, VALIDATE_WITH_CPU, cpu_profile, PROFILE, ProfilePointEvent, cpu_events, prod, Context
-from tinygrad.helpers import unwrap
+from tinygrad.helpers import DEVECTORIZE, time_to_str, VALIDATE_WITH_CPU, cpu_profile, PROFILE, ProfilePointEvent, cpu_events, prod, Context, unwrap
 from tinygrad.uop.ops import Ops, PatternMatcher, UOp, UPat, sym_infer
 from tinygrad.device import Device, Buffer
 from tinygrad.renderer import ProgramSpec, Estimates
@@ -128,7 +127,7 @@ si_lowerer = PatternMatcher([
   (UPat((Ops.SINK, Ops.PROGRAM), name="sink"), lambda ctx,sink: get_runner(ctx[0].device, sink)),
   (UPat(Ops.BUFFER_VIEW), lambda ctx: ViewOp(ctx[0])),
   (UPat(Ops.COPY, name="copy"), lambda ctx,copy: (BufferXfer(ctx[0].nbytes, ctx[0].device, ctx[1].device) \
-    if hasattr(Device[ctx[0].device].allocator, '_transfer') and all_same([x.device.split(":")[0] for x in ctx]) \
+    if hasattr(alc:=Device[ctx[0].device].allocator, '_transfer') and alc.supports_transfer and all_same([x.device.split(":")[0] for x in ctx]) \
     else BufferCopy(ctx[0].nbytes, ctx[0].device, ctx[1].device))),
   (UPat(Ops.ENCDEC, name="encdec"), lambda ctx,encdec: EncDec(encdec, ctx[0].nbytes, ctx[1].device)),
 ])
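The lowerer picks the peer-to-peer `BufferXfer` runner only when the destination allocator defines `_transfer`, now also reports `supports_transfer`, and both buffers sit on the same device family; otherwise it falls back to `BufferCopy`. A standalone sketch of that decision, with a hypothetical `pick_copy` helper and a toy `Alloc` class rather than the real si_lowerer:

```python
# Illustrative decision helper mirroring the lowerer's condition; not the real si_lowerer.
def all_same(xs): return all(x == xs[0] for x in xs)

class Alloc:
    def __init__(self, supports_transfer): self.supports_transfer = supports_transfer
    def _transfer(self, dest, src, sz): ...

def pick_copy(dest_device: str, src_device: str, allocator) -> str:
    same_family = all_same([d.split(":")[0] for d in (dest_device, src_device)])
    if hasattr(allocator, '_transfer') and allocator.supports_transfer and same_family:
        return "BufferXfer"   # device-to-device DMA transfer
    return "BufferCopy"       # generic copy through the host

print(pick_copy("AMD:0", "AMD:1", Alloc(supports_transfer=True)))   # BufferXfer
print(pick_copy("AMD:0", "AMD:1", Alloc(supports_transfer=False)))  # BufferCopy (no SDMA)
```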
@@ -620,12 +620,12 @@ class AMDProgram(HCQProgram):

 class AMDAllocator(HCQAllocator['AMDDevice']):
   def __init__(self, dev:AMDDevice):
-    super().__init__(dev, copy_bufs=getattr(dev.iface, 'copy_bufs', None), max_copyout_size=0x1000 if dev.is_usb() else None)
+    super().__init__(dev, copy_bufs=getattr(dev.iface, 'copy_bufs', None), max_copyout_size=0x1000 if dev.is_usb() else None,
+                     supports_copy_from_disk=not dev.is_usb() and dev.has_sdma_queue, supports_transfer=dev.has_sdma_queue)
     if hasattr(dev.iface, "as_dmaref"): self._as_dmaref = dev.iface.as_dmaref
-    self.supports_copy_from_disk = not dev.is_usb()

   def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
-    return self.dev.iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)
+    return self.dev.iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access or not self.dev.has_sdma_queue)

   def _do_free(self, opaque, options:BufferSpec): self.dev.iface.free(opaque)
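On the AMD side, a device without an SDMA queue disables both optimized paths and forces `cpu_access` on allocations, since every fallback copy is a host `memmove` and the CPU must be able to reach the memory. A sketch of that flag wiring under a hypothetical `FakeAMDDevice` stand-in:

```python
# Sketch of the capability wiring in the diff; FakeAMDDevice and these helpers are illustrative.
class FakeAMDDevice:
    def __init__(self, has_sdma_queue: bool, usb: bool = False):
        self.has_sdma_queue, self._usb = has_sdma_queue, usb
    def is_usb(self): return self._usb

def allocator_flags(dev: FakeAMDDevice) -> dict:
    return dict(
        # p2p loads from disk also need the copy engine, and never work over USB
        supports_copy_from_disk=not dev.is_usb() and dev.has_sdma_queue,
        supports_transfer=dev.has_sdma_queue,
    )

def needs_cpu_access(dev: FakeAMDDevice, requested_cpu_access: bool) -> bool:
    # without SDMA every copy is a host memmove, so the mapping must be CPU-visible
    return requested_cpu_access or not dev.has_sdma_queue

print(allocator_flags(FakeAMDDevice(has_sdma_queue=False)))                            # both False
print(needs_cpu_access(FakeAMDDevice(has_sdma_queue=False), requested_cpu_access=False))  # True
```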
@@ -940,6 +940,7 @@ class AMDDevice(HCQCompiled):
       ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)

     self.max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
+    self.has_sdma_queue = self.sdma_queue(0) is not None

     compilers = CompilerSet([CompilerPair(functools.partial(AMDHIPRenderer, self.arch), None),
                              CompilerPair(functools.partial(AMDLLVMRenderer, self.arch), None, AMD_LLVM),
@@ -947,7 +948,7 @@ class AMDDevice(HCQCompiled):

     super().__init__(device, AMDAllocator(self), compilers, functools.partial(AMDProgram, self), AMDSignal,
                      functools.partial(AMDComputeAQLQueue if self.is_aql else AMDComputeQueue, self),
-                     functools.partial(AMDCopyQueue, self, max_copy_size=self.max_copy_size),
+                     functools.partial(AMDCopyQueue, self, max_copy_size=self.max_copy_size) if self.has_sdma_queue else None,
                      kernargs_size=(8 << 10) if self.is_usb() else (16 << 20), sigalloc_size=0x100 if self.is_usb() else 0x1000)

     # Scratch setup
@@ -1003,7 +1004,9 @@ class AMDDevice(HCQCompiled):
       ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size, idx=idx))

   @functools.lru_cache(None)
-  def sdma_queue(self, idx:int=0): return self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20), idx=idx)
+  def sdma_queue(self, idx:int):
+    with contextlib.suppress(OSError): return self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20), idx=idx)
+    return None

   def _ensure_has_local_memory(self, private_segment_size):
     if self.max_private_segment_size >= private_segment_size: return
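`sdma_queue` now probes the driver once per index and returns `None` when queue creation fails with `OSError`; `has_sdma_queue` (added above) just checks whether probing queue 0 succeeded, and the copy-queue factory passed to `super().__init__` becomes `None` in that case. A standalone sketch of the probe-once-and-cache pattern, with a fake `create_queue` that simulates a missing copy engine:

```python
# Standalone sketch of "probe once, cache, return None on failure";
# Dev.create_queue() is a fake stand-in for the KFD ioctl in the diff.
import contextlib, functools

class Dev:
    def __init__(self, sdma_broken: bool): self.sdma_broken = sdma_broken

    def create_queue(self, kind: str, ring_size: int, idx: int):
        if self.sdma_broken: raise OSError("queue type not supported")
        return f"{kind}-queue-{idx} ({ring_size:#x} bytes)"

    @functools.lru_cache(None)            # probe each idx at most once
    def sdma_queue(self, idx: int):
        with contextlib.suppress(OSError):
            return self.create_queue("SDMA", 16 << 20, idx=idx)
        return None                        # no copy engine available

dev = Dev(sdma_broken=True)
has_sdma_queue = dev.sdma_queue(0) is not None
print(has_sdma_queue)  # False -> the device registers no copy queue and uses CPU fallbacks
```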
@@ -1,9 +1,9 @@
 from __future__ import annotations
 import platform, sys, ctypes, functools, time, mmap, threading, queue
-from tinygrad.helpers import from_mv, to_mv, OSX, WIN, mv_address, wait_cond, cpu_profile, suppress_finalizing, unwrap, data64_le
+from tinygrad.helpers import to_mv, OSX, WIN, mv_address, wait_cond, suppress_finalizing, unwrap, data64_le
 from tinygrad.helpers import CPU_CC, CPU_LVP, CPU_LLVM
 from tinygrad.device import BufferSpec, DMACPURef, CompilerSet, CompilerPair
-from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocatorBase, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
+from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
 from tinygrad.runtime.support.hcq import CLikeArgsState
 from tinygrad.renderer.cstyle import ClangJITRenderer
 from tinygrad.renderer.llvmir import LLVMRenderer
@@ -111,7 +111,8 @@ class CPUProgram(HCQProgram):
   def __del__(self):
     if sys.platform == 'win32': ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(self.mem), ctypes.c_size_t(0), 0x8000) #0x8000 - MEM_RELEASE

-class CPUAllocator(HCQAllocatorBase):
+class CPUAllocator(HCQAllocator):
+  def __init__(self, dev:CPUDevice): super().__init__(dev, supports_copy_from_disk=False, supports_transfer=False)
   def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
     if options.external_ptr: addr, buf = options.external_ptr, None
     elif WIN: addr = mv_address(buf:=mmap.mmap(-1, size, access=mmap.ACCESS_WRITE))
@@ -123,12 +124,6 @@ class CPUAllocator(HCQAllocatorBase):
   def _as_dmaref(self, buf):
     self.dev.synchronize()
     return DMACPURef(buf.va_addr, buf.size)
-  def _copyin(self, dest, src:memoryview):
-    self.dev.synchronize()
-    with cpu_profile('TINY -> CPU', self.dev.device, is_copy=True): ctypes.memmove(dest.va_addr, from_mv(src), len(src))
-  def _copyout(self, dest:memoryview, src):
-    self.dev.synchronize()
-    with cpu_profile('CPU -> TINY', self.dev.device, is_copy=True): ctypes.memmove(from_mv(dest), src.va_addr, len(dest))
   def _map(self, buf:HCQBuffer):
     if buf.view is None or not isinstance(buf.view, MMIOInterface): raise RuntimeError("Cannot map buffer without view to cpu")
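Since `HCQAllocator` now handles devices whose `hw_copy_queue_t` is `None` (see the hcq.py hunks below), the CPU backend can drop its own `_copyin`/`_copyout` overrides and inherit the shared fallback. A minimal inheritance sketch with illustrative classes, not the real ones:

```python
# Minimal inheritance sketch (illustrative classes): once the base class grows a CPU
# fallback, the CPU subclass can drop its own _copyin/_copyout overrides entirely.
class HCQAllocatorSketch:
    def _copyin(self, dest, src):  return f"base copyin  ({len(src)} bytes)"
    def _copyout(self, dest, src): return f"base copyout ({len(dest)} bytes)"

class CPUAllocatorSketch(HCQAllocatorSketch):
    pass   # no overrides: both copy directions resolve to the base implementations

a = CPUAllocatorSketch()
print(a._copyin(None, b"abcd"), "|", a._copyout(bytearray(2), None))
print(CPUAllocatorSketch._copyin is HCQAllocatorSketch._copyin)  # True
```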
@@ -3,7 +3,7 @@ from typing import cast, Callable, Type, TypeVar, Generic, Any
 import contextlib, decimal, statistics, time, ctypes, array, os, struct, collections, functools
 try: import fcntl # windows misses that
 except ImportError: fcntl = None #type:ignore[assignment]
-from tinygrad.helpers import PROFILE, getenv, to_mv, ProfileRangeEvent, select_first_inited, unwrap, suppress_finalizing
+from tinygrad.helpers import PROFILE, getenv, to_mv, from_mv, cpu_profile, ProfileRangeEvent, select_first_inited, unwrap, suppress_finalizing
 from tinygrad.device import BufferSpec, Compiled, LRUAllocator, ProfileDeviceEvent, ProfileProgramEvent, CompilerSet
 from tinygrad.uop.ops import sym_infer, sint, UOp
 from tinygrad.runtime.autogen import libc
@@ -486,8 +486,8 @@ class HCQAllocatorBase(LRUAllocator[HCQDeviceType], Generic[HCQDeviceType]):
   This class implements basic copy operations following the HCQ API, utilizing both types of `HWQueue`.
   """

-  def __init__(self, dev:HCQDeviceType, batch_size:int=(2 << 20), batch_cnt:int=32, copy_bufs=None, max_copyout_size:int|None=None):
-    super().__init__(dev)
+  def __init__(self, dev:HCQDeviceType, batch_size:int=(2 << 20), batch_cnt:int=32, copy_bufs=None, max_copyout_size:int|None=None, **kwargs):
+    super().__init__(dev, **kwargs)
     self.b = copy_bufs or [self._alloc(batch_size, BufferSpec(host=True)) for _ in range(batch_cnt)]
     self.b_timeline, self.b_next, self.max_copyout_size = [0] * len(self.b), 0, max_copyout_size
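`HCQAllocatorBase` keeps its ring of staging buffers (`self.b`, `self.b_next`, `self.b_timeline`) and now also forwards `**kwargs` up to `LRUAllocator`. A tiny illustrative sketch of the round-robin chunking that ring supports (planning only, no real buffers):

```python
# Illustrative sketch of a round-robin staging ring for chunked copies; not the real HCQ code.
def chunked_copy(nbytes: int, batch_size: int, batch_cnt: int):
    b_next, plan = 0, []
    for offset in range(0, nbytes, batch_size):
        b_next = (b_next + 1) % batch_cnt          # rotate through the staging buffers
        size = min(batch_size, nbytes - offset)
        plan.append((offset, size, b_next))        # (src offset, chunk size, staging slot)
    return plan

for step in chunked_copy(nbytes=5 << 20, batch_size=2 << 20, batch_cnt=32):
    print(step)   # three chunks: 2 MiB, 2 MiB, 1 MiB, each assigned a staging slot
```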
@@ -510,7 +511,11 @@ class HCQAllocatorBase(LRUAllocator[HCQDeviceType], Generic[HCQDeviceType]):

 class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
   def _copyin(self, dest:HCQBuffer, src:memoryview):
-    assert self.dev.hw_copy_queue_t is not None
+    if self.dev.hw_copy_queue_t is None:
+      self.dev.synchronize()
+      with cpu_profile(f'TINY -> {self.dev.device}', self.dev.device, is_copy=True): ctypes.memmove(cast(int, dest.va_addr), from_mv(src), len(src))
+      return
+
     with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"TINY -> {self.dev.device}", enabled=PROFILE):
       for i in range(0, src.nbytes, self.b[0].size):
         self.b_next = (self.b_next + 1) % len(self.b)
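With no hardware copy queue, `_copyin` now synchronizes and does a single host `memmove` from the source memoryview into the destination buffer, returning before the staged SDMA path. A runnable sketch of that fallback branch; `FakeBuffer`/`FakeDev` are stand-ins and `cpu_profile` here is a trivial timer, not the tinygrad helper:

```python
# Runnable sketch of the CPU fallback in _copyin; all classes here are illustrative stand-ins.
import ctypes, time
from contextlib import contextmanager

@contextmanager
def cpu_profile(desc: str):
    st = time.perf_counter()
    yield
    print(f"{desc}: {(time.perf_counter() - st) * 1e6:.1f} us")

class FakeBuffer:
    def __init__(self, size: int):
        self._storage = (ctypes.c_char * size)()
        self.va_addr, self.size = ctypes.addressof(self._storage), size

class FakeDev:
    device, hw_copy_queue_t = "FAKE:0", None   # no hardware copy queue on this device
    def synchronize(self): pass                # wait for outstanding GPU work first

def copyin(dev: FakeDev, dest: FakeBuffer, src: memoryview):
    if dev.hw_copy_queue_t is None:            # host fallback path from the diff
        dev.synchronize()
        with cpu_profile(f"TINY -> {dev.device}"):
            ctypes.memmove(dest.va_addr, (ctypes.c_char * len(src)).from_buffer_copy(src), len(src))
        return
    # ...otherwise the staged SDMA path would run here...

dest = FakeBuffer(4)
copyin(FakeDev(), dest, memoryview(b"abcd"))
print(bytes(dest._storage))  # b'abcd'
```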
@@ -541,8 +545,10 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):

   def _copyout(self, dest:memoryview, src:HCQBuffer):
     self.dev.synchronize()
+    if self.dev.hw_copy_queue_t is None:
+      with cpu_profile(f'{self.dev.device} -> TINY', self.dev.device, is_copy=True): ctypes.memmove(from_mv(dest), cast(int, src.va_addr), len(dest))
+      return

-    assert self.dev.hw_copy_queue_t is not None
     with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> TINY", enabled=PROFILE):
       for i in range(0, dest.nbytes, cp_size:=(self.max_copyout_size or self.b[0].size)):
         self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
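`_copyout` mirrors this: without a copy queue it memmoves straight into the destination memoryview, and on the hardware path it walks the buffer in `cp_size` steps, where `cp_size` is `max_copyout_size` when set (0x1000 on USB devices) and the staging-buffer size otherwise. A small sketch of that chunk-size selection, with illustrative defaults:

```python
# Illustrative chunking sketch for the copyout path; values mimic the diff's defaults
# (0x1000 max_copyout_size on USB devices, 2 MiB staging buffers otherwise).
def copyout_chunks(nbytes: int, staging_size: int = 2 << 20, max_copyout_size: int | None = None):
    cp_size = max_copyout_size or staging_size     # a walrus expression in the real code
    return [(i, min(cp_size, nbytes - i)) for i in range(0, nbytes, cp_size)]

print(len(copyout_chunks(1 << 20)))                           # 1 chunk of 1 MiB on a normal device
print(len(copyout_chunks(1 << 20, max_copyout_size=0x1000)))  # 256 chunks of 4 KiB over USB
```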