amd: copies w/o sdma (#14036)

* amd: copies w/o sdma

* as_args

* fixes

* f
Author: nimlgen
Date: 2026-01-06 21:15:58 +03:00
Committed by: GitHub
Parent: 7fb18f7e47
Commit: 325f4006ff
6 changed files with 30 additions and 27 deletions

View File

@@ -113,7 +113,7 @@ class TestGraph(unittest.TestCase):
   def skip_if_not_multigraph(self):
     graph = g.func if isinstance(g:=(d:=Device[Device.DEFAULT]).graph, functools.partial) else g
     if not issubclass(graph, MultiGraphRunner): self.skipTest("graph is not supported (not MultiGraphRunner)")
-    if not hasattr(d.allocator, '_transfer'): self.skipTest("device is not supported (no transfers)")
+    if not hasattr(d.allocator, '_transfer') or not d.allocator.supports_transfer: self.skipTest("device is not supported (no transfers)")
   def test_order_copy_writed(self):
     self.skip_if_not_multigraph()

View File

@@ -218,10 +218,10 @@ DeviceType = TypeVar('DeviceType', bound='Compiled')
 # TODO: size, dest, src are the same type. can we enforce this?
 class Allocator(Generic[DeviceType]):
-  def __init__(self, dev:DeviceType):
+  def __init__(self, dev:DeviceType, supports_copy_from_disk:bool=True, supports_transfer:bool=True):
     self.dev: DeviceType = dev
     self.default_buffer_spec: BufferSpec = BufferSpec()
-    self.supports_copy_from_disk: bool = True
+    self.supports_copy_from_disk, self.supports_transfer = supports_copy_from_disk, supports_transfer
   # overridden in LRUAllocator
   def alloc(self, size:int, options:BufferSpec|None=None):
     assert size > 0, f"alloc size must be positive, getting {size}"
@@ -244,9 +244,9 @@ class LRUAllocator(Allocator, Generic[DeviceType]):
   The LRU Allocator is responsible for caching buffers.
   It ensures that buffers are not freed until it is absolutely necessary, optimizing performance.
   """
-  def __init__(self, dev:DeviceType):
+  def __init__(self, dev:DeviceType, **kwargs):
     self.cache: dict[tuple[int, BufferSpec|None], Any] = defaultdict(list)
-    super().__init__(dev)
+    super().__init__(dev, **kwargs)
   def alloc(self, size:int, options:BufferSpec|None=None):
     if len(c := self.cache[(size, options)]): return c.pop()
     try: return super().alloc(size, options)
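
The old per-instance supports_copy_from_disk attribute becomes two constructor flags, so each backend declares its copy capabilities when it calls super().__init__. A minimal sketch of a backend without a DMA engine opting out, assuming only the new signature above (the class and device are made up for illustration):

from tinygrad.device import LRUAllocator

class NoDMAAllocator(LRUAllocator):  # hypothetical backend allocator
  def __init__(self, dev):
    # this (made-up) device has no peer-to-peer transfer engine and no direct disk path
    super().__init__(dev, supports_copy_from_disk=False, supports_transfer=False)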

View File

@@ -2,8 +2,7 @@ from typing import cast, Callable
 import time, pprint, random, itertools, math
 from dataclasses import dataclass, replace, field
 from tinygrad.helpers import all_same, colored, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int, CAPTURING, Metadata, TRACEMETA, TracingKey
-from tinygrad.helpers import DEVECTORIZE, time_to_str, VALIDATE_WITH_CPU, cpu_profile, PROFILE, ProfilePointEvent, cpu_events, prod, Context
-from tinygrad.helpers import unwrap
+from tinygrad.helpers import DEVECTORIZE, time_to_str, VALIDATE_WITH_CPU, cpu_profile, PROFILE, ProfilePointEvent, cpu_events, prod, Context, unwrap
 from tinygrad.uop.ops import Ops, PatternMatcher, UOp, UPat, sym_infer
 from tinygrad.device import Device, Buffer
 from tinygrad.renderer import ProgramSpec, Estimates
@@ -128,7 +127,7 @@ si_lowerer = PatternMatcher([
   (UPat((Ops.SINK, Ops.PROGRAM), name="sink"), lambda ctx,sink: get_runner(ctx[0].device, sink)),
   (UPat(Ops.BUFFER_VIEW), lambda ctx: ViewOp(ctx[0])),
   (UPat(Ops.COPY, name="copy"), lambda ctx,copy: (BufferXfer(ctx[0].nbytes, ctx[0].device, ctx[1].device) \
-    if hasattr(Device[ctx[0].device].allocator, '_transfer') and all_same([x.device.split(":")[0] for x in ctx]) \
+    if hasattr(alc:=Device[ctx[0].device].allocator, '_transfer') and alc.supports_transfer and all_same([x.device.split(":")[0] for x in ctx]) \
     else BufferCopy(ctx[0].nbytes, ctx[0].device, ctx[1].device))),
   (UPat(Ops.ENCDEC, name="encdec"), lambda ctx,encdec: EncDec(encdec, ctx[0].nbytes, ctx[1].device)),
 ])
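
Unrolled from the lambda above: the copy-lowering rule now requires both a _transfer method and the new supports_transfer flag before it emits a device-to-device BufferXfer, otherwise it falls back to the generic BufferCopy. A rough sketch of the same decision as a plain function (the names dest, src, and pick_copy_runner are illustrative, not part of the diff):

from tinygrad.device import Device
from tinygrad.engine.realize import BufferXfer, BufferCopy

def pick_copy_runner(dest, src):
  alc = Device[dest.device].allocator
  same_backend = dest.device.split(":")[0] == src.device.split(":")[0]
  if hasattr(alc, '_transfer') and alc.supports_transfer and same_backend:
    return BufferXfer(dest.nbytes, dest.device, src.device)  # direct device-to-device transfer
  return BufferCopy(dest.nbytes, dest.device, src.device)    # generic copy through host memory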

View File

@@ -620,12 +620,12 @@ class AMDProgram(HCQProgram):
 class AMDAllocator(HCQAllocator['AMDDevice']):
   def __init__(self, dev:AMDDevice):
-    super().__init__(dev, copy_bufs=getattr(dev.iface, 'copy_bufs', None), max_copyout_size=0x1000 if dev.is_usb() else None)
+    super().__init__(dev, copy_bufs=getattr(dev.iface, 'copy_bufs', None), max_copyout_size=0x1000 if dev.is_usb() else None,
+                     supports_copy_from_disk=not dev.is_usb() and dev.has_sdma_queue, supports_transfer=dev.has_sdma_queue)
     if hasattr(dev.iface, "as_dmaref"): self._as_dmaref = dev.iface.as_dmaref
-    self.supports_copy_from_disk = not dev.is_usb()
   def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
-    return self.dev.iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)
+    return self.dev.iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access or not self.dev.has_sdma_queue)
   def _do_free(self, opaque, options:BufferSpec): self.dev.iface.free(opaque)
@@ -940,6 +940,7 @@ class AMDDevice(HCQCompiled):
       ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
     self.max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
+    self.has_sdma_queue = self.sdma_queue(0) is not None
     compilers = CompilerSet([CompilerPair(functools.partial(AMDHIPRenderer, self.arch), None),
                              CompilerPair(functools.partial(AMDLLVMRenderer, self.arch), None, AMD_LLVM),
@@ -947,7 +948,7 @@ class AMDDevice(HCQCompiled):
     super().__init__(device, AMDAllocator(self), compilers, functools.partial(AMDProgram, self), AMDSignal,
                      functools.partial(AMDComputeAQLQueue if self.is_aql else AMDComputeQueue, self),
-                     functools.partial(AMDCopyQueue, self, max_copy_size=self.max_copy_size),
+                     functools.partial(AMDCopyQueue, self, max_copy_size=self.max_copy_size) if self.has_sdma_queue else None,
                      kernargs_size=(8 << 10) if self.is_usb() else (16 << 20), sigalloc_size=0x100 if self.is_usb() else 0x1000)
     # Scratch setup
@@ -1003,7 +1004,9 @@ class AMDDevice(HCQCompiled):
       ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size, idx=idx))
   @functools.lru_cache(None)
-  def sdma_queue(self, idx:int=0): return self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20), idx=idx)
+  def sdma_queue(self, idx:int):
+    with contextlib.suppress(OSError): return self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20), idx=idx)
+    return None
   def _ensure_has_local_memory(self, private_segment_size):
     if self.max_private_segment_size >= private_segment_size: return
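
sdma_queue now probes instead of assuming: if the queue-create call raises OSError, it returns None, has_sdma_queue becomes False, and the device is built without a copy queue, without transfer/disk-copy support, and with CPU-accessible allocations. A tiny sketch of that probe pattern, with create_fn standing in for the real create_queue call:

import contextlib

def try_create_queue(create_fn):
  # return the queue if the driver can create it, None if the ioctl fails with OSError
  with contextlib.suppress(OSError):
    return create_fn()
  return None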

View File

@@ -1,9 +1,9 @@
 from __future__ import annotations
 import platform, sys, ctypes, functools, time, mmap, threading, queue
-from tinygrad.helpers import from_mv, to_mv, OSX, WIN, mv_address, wait_cond, cpu_profile, suppress_finalizing, unwrap, data64_le
+from tinygrad.helpers import to_mv, OSX, WIN, mv_address, wait_cond, suppress_finalizing, unwrap, data64_le
 from tinygrad.helpers import CPU_CC, CPU_LVP, CPU_LLVM
 from tinygrad.device import BufferSpec, DMACPURef, CompilerSet, CompilerPair
-from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocatorBase, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
+from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
 from tinygrad.runtime.support.hcq import CLikeArgsState
 from tinygrad.renderer.cstyle import ClangJITRenderer
 from tinygrad.renderer.llvmir import LLVMRenderer
@@ -111,7 +111,8 @@ class CPUProgram(HCQProgram):
   def __del__(self):
     if sys.platform == 'win32': ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(self.mem), ctypes.c_size_t(0), 0x8000) #0x8000 - MEM_RELEASE
-class CPUAllocator(HCQAllocatorBase):
+class CPUAllocator(HCQAllocator):
+  def __init__(self, dev:CPUDevice): super().__init__(dev, supports_copy_from_disk=False, supports_transfer=False)
   def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
     if options.external_ptr: addr, buf = options.external_ptr, None
     elif WIN: addr = mv_address(buf:=mmap.mmap(-1, size, access=mmap.ACCESS_WRITE))
@@ -123,12 +124,6 @@ class CPUAllocator(HCQAllocatorBase):
   def _as_dmaref(self, buf):
     self.dev.synchronize()
     return DMACPURef(buf.va_addr, buf.size)
-  def _copyin(self, dest, src:memoryview):
-    self.dev.synchronize()
-    with cpu_profile('TINY -> CPU', self.dev.device, is_copy=True): ctypes.memmove(dest.va_addr, from_mv(src), len(src))
-  def _copyout(self, dest:memoryview, src):
-    self.dev.synchronize()
-    with cpu_profile('CPU -> TINY', self.dev.device, is_copy=True): ctypes.memmove(from_mv(dest), src.va_addr, len(dest))
   def _map(self, buf:HCQBuffer):
     if buf.view is None or not isinstance(buf.view, MMIOInterface): raise RuntimeError("Cannot map buffer without view to cpu")
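
With CPUAllocator now subclassing HCQAllocator and declaring supports_copy_from_disk=False, supports_transfer=False, its hand-rolled _copyin/_copyout are removed and the generic HCQ fallback (see the hcq.py hunk below) handles the memmove path. A quick, illustrative way to inspect the new flags on a build that includes this change:

from tinygrad import Device

alc = Device["CPU"].allocator
print(alc.supports_transfer, alc.supports_copy_from_disk)  # expected: False False after this diff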

View File

@@ -3,7 +3,7 @@ from typing import cast, Callable, Type, TypeVar, Generic, Any
 import contextlib, decimal, statistics, time, ctypes, array, os, struct, collections, functools
 try: import fcntl # windows misses that
 except ImportError: fcntl = None #type:ignore[assignment]
-from tinygrad.helpers import PROFILE, getenv, to_mv, ProfileRangeEvent, select_first_inited, unwrap, suppress_finalizing
+from tinygrad.helpers import PROFILE, getenv, to_mv, from_mv, cpu_profile, ProfileRangeEvent, select_first_inited, unwrap, suppress_finalizing
 from tinygrad.device import BufferSpec, Compiled, LRUAllocator, ProfileDeviceEvent, ProfileProgramEvent, CompilerSet
 from tinygrad.uop.ops import sym_infer, sint, UOp
 from tinygrad.runtime.autogen import libc
@@ -486,8 +486,8 @@ class HCQAllocatorBase(LRUAllocator[HCQDeviceType], Generic[HCQDeviceType]):
   This class implements basic copy operations following the HCQ API, utilizing both types of `HWQueue`.
   """
-  def __init__(self, dev:HCQDeviceType, batch_size:int=(2 << 20), batch_cnt:int=32, copy_bufs=None, max_copyout_size:int|None=None):
-    super().__init__(dev)
+  def __init__(self, dev:HCQDeviceType, batch_size:int=(2 << 20), batch_cnt:int=32, copy_bufs=None, max_copyout_size:int|None=None, **kwargs):
+    super().__init__(dev, **kwargs)
     self.b = copy_bufs or [self._alloc(batch_size, BufferSpec(host=True)) for _ in range(batch_cnt)]
     self.b_timeline, self.b_next, self.max_copyout_size = [0] * len(self.b), 0, max_copyout_size
@@ -510,7 +510,11 @@ class HCQAllocatorBase(LRUAllocator[HCQDeviceType], Generic[HCQDeviceType]):
 class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
   def _copyin(self, dest:HCQBuffer, src:memoryview):
-    assert self.dev.hw_copy_queue_t is not None
+    if self.dev.hw_copy_queue_t is None:
+      self.dev.synchronize()
+      with cpu_profile(f'TINY -> {self.dev.device}', self.dev.device, is_copy=True): ctypes.memmove(cast(int, dest.va_addr), from_mv(src), len(src))
+      return
     with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"TINY -> {self.dev.device}", enabled=PROFILE):
       for i in range(0, src.nbytes, self.b[0].size):
         self.b_next = (self.b_next + 1) % len(self.b)
@@ -541,8 +545,10 @@ class HCQAllocator(HCQAllocatorBase, Generic[HCQDeviceType]):
   def _copyout(self, dest:memoryview, src:HCQBuffer):
     self.dev.synchronize()
-    assert self.dev.hw_copy_queue_t is not None
+    if self.dev.hw_copy_queue_t is None:
+      with cpu_profile(f'{self.dev.device} -> TINY', self.dev.device, is_copy=True): ctypes.memmove(from_mv(dest), cast(int, src.va_addr), len(dest))
+      return
     with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"{self.dev.device} -> TINY", enabled=PROFILE):
       for i in range(0, dest.nbytes, cp_size:=(self.max_copyout_size or self.b[0].size)):
         self.dev.hw_copy_queue_t().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
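
When a device has no hardware copy queue (hw_copy_queue_t is None), _copyin/_copyout now fall back to a plain ctypes.memmove against a CPU-accessible buffer, which is why the AMD allocator forces cpu_access on allocations when there is no SDMA queue. A standalone sketch of that fallback, with dest_va/src_va standing in for a buffer's va_addr:

import ctypes
from tinygrad.helpers import from_mv

def cpu_copyin(dest_va:int, src:memoryview): ctypes.memmove(dest_va, from_mv(src), len(src))    # host -> device buffer
def cpu_copyout(dest:memoryview, src_va:int): ctypes.memmove(from_mv(dest), src_va, len(dest))  # device buffer -> host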