_alloc and _free with options (#3934)

* _alloc takes a BufferOptions argument

* linter

* fix HSA
This commit is contained in:
nimlgen
2024-03-26 19:11:41 +03:00
committed by GitHub
parent 739f47eb0f
commit e2d6f76723
9 changed files with 34 additions and 38 deletions

View File

@@ -1,7 +1,7 @@
import ctypes, collections, time, itertools
from typing import List, Any, Dict, cast, Optional, Union, Tuple
from tinygrad.helpers import GraphException, init_c_var, round_up
from tinygrad.device import Compiled, Buffer, CompiledASTRunner, BufferXfer, MultiDeviceJITGraph, update_stats
from tinygrad.device import Compiled, Buffer, BufferOptions, CompiledASTRunner, BufferXfer, MultiDeviceJITGraph, update_stats
from tinygrad.shape.symbolic import Variable
from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler
from tinygrad.features.jit import JitItem, get_input_replace, get_jit_stats, \
@@ -47,7 +47,7 @@ class HSAGraph(MultiDeviceJITGraph):
kernargs_size: Dict[Compiled, int] = collections.defaultdict(int)
for ji in self.jit_cache:
if isinstance(ji.prg, CompiledASTRunner): kernargs_size[ji.prg.device] += round_up(ctypes.sizeof(ji.prg.clprg.args_struct_t), 16)
kernargs_ptrs: Dict[Compiled, int] = {dev:dev.allocator._alloc(sz) for dev,sz in kernargs_size.items()}
kernargs_ptrs: Dict[Compiled, int] = {dev:dev.allocator._alloc(sz, BufferOptions()) for dev,sz in kernargs_size.items()}
# Fill initial arguments.
self.ji_kargs_structs: Dict[int, ctypes.Structure] = {}

View File

@@ -123,13 +123,13 @@ class CUDAAllocator(LRUAllocator):
def __init__(self, device:CUDADevice):
self.device = device
super().__init__()
def _alloc(self, size):
def _alloc(self, size, options:BufferOptions):
check(cuda.cuCtxSetCurrent(self.device.context))
return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
def _alloc_with_options(self, size:int, options:BufferOptions):
if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0)))
else: raise ValueError("no options")
def _free(self, opaque): check(cuda.cuMemFree_v2(opaque))
else: return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
def _free(self, opaque, options:BufferOptions):
if options.host: return check(cuda.cuMemFreeHost(opaque))
else: check(cuda.cuMemFree_v2(opaque))
def copyin(self, dest, src:memoryview):
check(cuda.cuCtxSetCurrent(self.device.context))
host_mem = self.alloc(len(src), BufferOptions(host=True))

View File

@@ -20,7 +20,7 @@ class DiskBuffer:
MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
class DiskAllocator(Allocator):
def __init__(self, device:str): self.device = device
def _alloc(self, size:int):
def _alloc(self, size:int, options):
if self.device.startswith("shm:"):
fd = _posixshmem.shm_open("/"+self.device[4:].lstrip("/"), os.O_RDWR, 0o600)
mem = mmap.mmap(fd, size, mmap.MAP_SHARED | MAP_POPULATE | MAP_LOCKED)

View File

@@ -65,15 +65,13 @@ class CLAllocator(LRUAllocator):
def __init__(self, device:CLDevice):
self.device = device
super().__init__()
def _alloc(self, size:int) -> ctypes._CData:
return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status)
def _alloc_with_options(self, size:int, options:BufferOptions) -> ctypes._CData:
def _alloc(self, size:int, options:BufferOptions) -> ctypes._CData:
if options.image is not None:
return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
options.image.shape[1], options.image.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status)
else: return self._alloc(size)
def _free(self, buf:ctypes._CData): check(cl.clReleaseMemObject(buf))
else: return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status)
def _free(self, buf:ctypes._CData, options:BufferOptions): check(cl.clReleaseMemObject(buf))
def copyin(self, dest:ctypes._CData, src:memoryview):
check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
self.device.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command

View File

@@ -105,27 +105,25 @@ class HSAAllocator(LRUAllocator):
self.device = device
super().__init__()
def _alloc(self, size:int):
c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU])
check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p())))
check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf))
return buf.value
def _alloc_with_options(self, size:int, options:BufferOptions):
def _alloc(self, size:int, options:BufferOptions):
if options.host:
check(hsa.hsa_amd_memory_pool_allocate(HSADevice.cpu_mempool, size, 0, ctypes.byref(mem := ctypes.c_void_p())))
check(hsa.hsa_amd_agents_allow_access(2, (hsa.hsa_agent_t*2)(HSADevice.cpu_agent, self.device.agent), None, mem))
return mem.value
else: raise ValueError("no options")
else:
c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU])
check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p())))
check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf))
return buf.value
def _free(self, opaque:T):
def _free(self, opaque:T, options:BufferOptions):
HSADevice.synchronize_system()
check(hsa.hsa_amd_memory_pool_free(opaque))
def copyin(self, dest:T, src: memoryview):
# Async copyin sync model uses barriers on the main hw queue, since barriers are guaranteed to execute in order with all other packets.
self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True))
mem = self._alloc_with_options(src.nbytes, BufferOptions(host=True))
mem = self._alloc(src.nbytes, BufferOptions(host=True))
ctypes.memmove(mem, from_mv(src), src.nbytes)
check(hsa.hsa_amd_memory_async_copy_on_engine(dest, self.device.agent, mem, HSADevice.cpu_agent, src.nbytes, 1, ctypes.byref(sync_signal),
copy_signal := self.device.alloc_signal(reusable=True), hsa.HSA_AMD_SDMA_ENGINE_0, True))
@@ -137,7 +135,7 @@ class HSAAllocator(LRUAllocator):
self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True))
if not hasattr(self, 'hb'):
self.hb = [self._alloc_with_options(CHUNK_SIZE, BufferOptions(host=True)) for _ in range(2)]
self.hb = [self._alloc(CHUNK_SIZE, BufferOptions(host=True)) for _ in range(2)]
self.hb_signals = [self.device.alloc_signal(reusable=False) for _ in range(2)]
self.hb_polarity = 0
self.sdma = [hsa.HSA_AMD_SDMA_ENGINE_0, hsa.HSA_AMD_SDMA_ENGINE_1]
@@ -262,7 +260,7 @@ class HSADevice(Compiled):
def _new_kernargs_region(self, sz:int):
if hasattr(self, 'kernarg_start_addr'): self.delayed_free.append(self.kernarg_start_addr)
self.kernarg_start_addr: int = self.allocator._alloc(sz)
self.kernarg_start_addr: int = self.allocator._alloc(sz, BufferOptions())
self.kernarg_next_addr = self.kernarg_start_addr
self.kernarg_pool_sz: int = sz

View File

@@ -67,7 +67,7 @@ class MetalAllocator(LRUAllocator):
for x in self.track_cross_device: x.synchronize()
self.track_cross_device.clear()
return super().free_cache()
def _alloc(self, size:int) -> Any:
def _alloc(self, size:int, options) -> Any:
ret = self.device.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared)
if ret is None: raise MemoryError(f"Metal OOM while allocating {size=}")
return ret
@@ -82,7 +82,7 @@ class MetalAllocator(LRUAllocator):
ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src, len(src), Metal.MTLResourceStorageModeShared, None)
if ret: self.device.mv_in_metal.append(src)
return ret
def _free(self, opaque:Any): opaque.release()
def _free(self, opaque:Any, options): opaque.release()
def as_buffer(self, src:Any) -> memoryview:
self.device.synchronize()
return src.contents().as_buffer(src.length())

View File

@@ -188,7 +188,7 @@ class PythonCompiler(Compiler):
def compile(self, src:str) -> bytes: return base64.b64decode(src)
class PythonAllocator(Allocator):
def _alloc(self, size): return memoryview(bytearray(size))
def _alloc(self, size, options): return memoryview(bytearray(size))
def copyin(self, dest, src:memoryview): dest[:] = src
def copyout(self, dest:memoryview, src): dest[:] = src