From e2d6f76723aa38f333fb55eefb354a4c1d6815c1 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Tue, 26 Mar 2024 19:11:41 +0300 Subject: [PATCH] _alloc and _free with options (#3934) * _alloc has options * linter * fix hsa --- test/external/external_test_speed_llama.py | 2 +- tinygrad/device.py | 18 +++++++++--------- tinygrad/runtime/graph/hsa.py | 4 ++-- tinygrad/runtime/ops_cuda.py | 10 +++++----- tinygrad/runtime/ops_disk.py | 2 +- tinygrad/runtime/ops_gpu.py | 8 +++----- tinygrad/runtime/ops_hsa.py | 22 ++++++++++------------ tinygrad/runtime/ops_metal.py | 4 ++-- tinygrad/runtime/ops_python.py | 2 +- 9 files changed, 34 insertions(+), 38 deletions(-) diff --git a/test/external/external_test_speed_llama.py b/test/external/external_test_speed_llama.py index 8d381569cf..768d9683d3 100644 --- a/test/external/external_test_speed_llama.py +++ b/test/external/external_test_speed_llama.py @@ -12,7 +12,7 @@ class FakeProgram: def __call__(self, *bufs, global_size, local_size, vals=(), wait=False): pass class FakeAllocator(Allocator): - def _alloc(self, sz): return None + def _alloc(self, sz, options): return None def copyin(self, dest, src:memoryview): pass class TestLLaMASpeed(unittest.TestCase): diff --git a/tinygrad/device.py b/tinygrad/device.py index 496ec85f64..10a9469a3d 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -148,11 +148,11 @@ class BufferXfer(BufferCopy): class Allocator: def alloc(self, size:int, options:Optional[BufferOptions]=None): assert not isinstance(size, int) or size > 0, f"alloc size must be positve, getting {size}" - return self._alloc_with_options(size, options) if options is not None else self._alloc(size) - def _alloc(self, size:int): raise NotImplementedError("need alloc") - def _alloc_with_options(self, size:int, options:BufferOptions): return self._alloc(size) # TODO: override this if you support options - def free(self, opaque, size:int, options:Optional[BufferOptions]=None): 
self._free(opaque) - def _free(self, opaque): pass # if opaque is a Python object, you don't need a free + return self._alloc(size, options if options is not None else BufferOptions()) + def _alloc(self, size:int, options:BufferOptions): raise NotImplementedError("need alloc") + def free(self, opaque, size:int, options:Optional[BufferOptions]=None): + self._free(opaque, options if options is not None else BufferOptions()) + def _free(self, opaque, options:BufferOptions): pass # if opaque is a Python object, you don't need a free def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin") def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout") @@ -165,15 +165,15 @@ class LRUAllocator(Allocator): # pylint: disable=abstract-method self.free_cache() return super().alloc(size, options) def free_cache(self): - for opaques in self.cache.values(): - for opaque in opaques: self._free(opaque) + for (sz,options),opaques in self.cache.items(): + for opaque in opaques: super().free(opaque, sz, options) opaques.clear() def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None): if getenv("LRU", 1) and (options is None or not options.signal): self.cache[(size, options)].append(opaque) - else: self._free(opaque) + else: super().free(opaque, size, options) class _MallocAllocator(LRUAllocator): - def _alloc(self, size:int): return (ctypes.c_uint8 * size)() + def _alloc(self, size:int, options:BufferOptions): return (ctypes.c_uint8 * size)() def as_buffer(self, src) -> memoryview: return flat_mv(memoryview(src)) def copyin(self, dest, src:memoryview): ctypes.memmove(dest, from_mv(src), len(src)) def copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src, len(dest)) diff --git a/tinygrad/runtime/graph/hsa.py b/tinygrad/runtime/graph/hsa.py index f92135cbfc..763b060505 100644 --- a/tinygrad/runtime/graph/hsa.py +++ b/tinygrad/runtime/graph/hsa.py @@ -1,7 +1,7 @@ import ctypes, collections, time, itertools
from typing import List, Any, Dict, cast, Optional, Union, Tuple from tinygrad.helpers import GraphException, init_c_var, round_up -from tinygrad.device import Compiled, Buffer, CompiledASTRunner, BufferXfer, MultiDeviceJITGraph, update_stats +from tinygrad.device import Compiled, Buffer, BufferOptions, CompiledASTRunner, BufferXfer, MultiDeviceJITGraph, update_stats from tinygrad.shape.symbolic import Variable from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler from tinygrad.features.jit import JitItem, get_input_replace, get_jit_stats, \ @@ -47,7 +47,7 @@ class HSAGraph(MultiDeviceJITGraph): kernargs_size: Dict[Compiled, int] = collections.defaultdict(int) for ji in self.jit_cache: if isinstance(ji.prg, CompiledASTRunner): kernargs_size[ji.prg.device] += round_up(ctypes.sizeof(ji.prg.clprg.args_struct_t), 16) - kernargs_ptrs: Dict[Compiled, int] = {dev:dev.allocator._alloc(sz) for dev,sz in kernargs_size.items()} + kernargs_ptrs: Dict[Compiled, int] = {dev:dev.allocator._alloc(sz, BufferOptions()) for dev,sz in kernargs_size.items()} # Fill initial arguments. 
self.ji_kargs_structs: Dict[int, ctypes.Structure] = {} diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py index 227c1e8fba..367b3327ac 100644 --- a/tinygrad/runtime/ops_cuda.py +++ b/tinygrad/runtime/ops_cuda.py @@ -123,13 +123,13 @@ class CUDAAllocator(LRUAllocator): def __init__(self, device:CUDADevice): self.device = device super().__init__() - def _alloc(self, size): + def _alloc(self, size, options:BufferOptions): check(cuda.cuCtxSetCurrent(self.device.context)) - return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size))) - def _alloc_with_options(self, size:int, options:BufferOptions): if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0))) - else: raise ValueError("no options") - def _free(self, opaque): check(cuda.cuMemFree_v2(opaque)) + else: return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size))) + def _free(self, opaque, options:BufferOptions): + if options.host: return check(cuda.cuMemFreeHost(opaque)) + else: check(cuda.cuMemFree_v2(opaque)) def copyin(self, dest, src:memoryview): check(cuda.cuCtxSetCurrent(self.device.context)) host_mem = self.alloc(len(src), BufferOptions(host=True)) diff --git a/tinygrad/runtime/ops_disk.py b/tinygrad/runtime/ops_disk.py index 85de0c52df..b8b0c8b1ba 100644 --- a/tinygrad/runtime/ops_disk.py +++ b/tinygrad/runtime/ops_disk.py @@ -20,7 +20,7 @@ class DiskBuffer: MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000) class DiskAllocator(Allocator): def __init__(self, device:str): self.device = device - def _alloc(self, size:int): + def _alloc(self, size:int, options): if self.device.startswith("shm:"): fd = _posixshmem.shm_open("/"+self.device[4:].lstrip("/"), os.O_RDWR, 0o600) mem = mmap.mmap(fd, size, mmap.MAP_SHARED | MAP_POPULATE | MAP_LOCKED) diff --git a/tinygrad/runtime/ops_gpu.py 
b/tinygrad/runtime/ops_gpu.py index 819444b177..4ce0aa2c53 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -65,15 +65,13 @@ class CLAllocator(LRUAllocator): def __init__(self, device:CLDevice): self.device = device super().__init__() - def _alloc(self, size:int) -> ctypes._CData: - return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status) - def _alloc_with_options(self, size:int, options:BufferOptions) -> ctypes._CData: + def _alloc(self, size:int, options:BufferOptions) -> ctypes._CData: if options.image is not None: return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE, cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]), options.image.shape[1], options.image.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status) - else: return self._alloc(size) - def _free(self, buf:ctypes._CData): check(cl.clReleaseMemObject(buf)) + else: return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status) + def _free(self, buf:ctypes._CData, options:BufferOptions): check(cl.clReleaseMemObject(buf)) def copyin(self, dest:ctypes._CData, src:memoryview): check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None)) self.device.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command diff --git a/tinygrad/runtime/ops_hsa.py b/tinygrad/runtime/ops_hsa.py index bcba4bdf93..ee21a025fc 100644 --- a/tinygrad/runtime/ops_hsa.py +++ b/tinygrad/runtime/ops_hsa.py @@ -105,27 +105,25 @@ class HSAAllocator(LRUAllocator): self.device = device super().__init__() - def _alloc(self, size:int): - c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]) - 
check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p()))) - check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf)) - return buf.value - - def _alloc_with_options(self, size:int, options:BufferOptions): + def _alloc(self, size:int, options:BufferOptions): if options.host: check(hsa.hsa_amd_memory_pool_allocate(HSADevice.cpu_mempool, size, 0, ctypes.byref(mem := ctypes.c_void_p()))) check(hsa.hsa_amd_agents_allow_access(2, (hsa.hsa_agent_t*2)(HSADevice.cpu_agent, self.device.agent), None, mem)) return mem.value - else: raise ValueError("no options") + else: + c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]) + check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p()))) + check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf)) + return buf.value - def _free(self, opaque:T): + def _free(self, opaque:T, options:BufferOptions): HSADevice.synchronize_system() check(hsa.hsa_amd_memory_pool_free(opaque)) def copyin(self, dest:T, src: memoryview): # Async copyin sync model uses barriers on the main hw queue, since barriers are guaranteed to execute in order with all other packets. 
self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True)) - mem = self._alloc_with_options(src.nbytes, BufferOptions(host=True)) + mem = self._alloc(src.nbytes, BufferOptions(host=True)) ctypes.memmove(mem, from_mv(src), src.nbytes) check(hsa.hsa_amd_memory_async_copy_on_engine(dest, self.device.agent, mem, HSADevice.cpu_agent, src.nbytes, 1, ctypes.byref(sync_signal), copy_signal := self.device.alloc_signal(reusable=True), hsa.HSA_AMD_SDMA_ENGINE_0, True)) @@ -137,7 +135,7 @@ class HSAAllocator(LRUAllocator): self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True)) if not hasattr(self, 'hb'): - self.hb = [self._alloc_with_options(CHUNK_SIZE, BufferOptions(host=True)) for _ in range(2)] + self.hb = [self._alloc(CHUNK_SIZE, BufferOptions(host=True)) for _ in range(2)] self.hb_signals = [self.device.alloc_signal(reusable=False) for _ in range(2)] self.hb_polarity = 0 self.sdma = [hsa.HSA_AMD_SDMA_ENGINE_0, hsa.HSA_AMD_SDMA_ENGINE_1] @@ -262,7 +260,7 @@ class HSADevice(Compiled): def _new_kernargs_region(self, sz:int): if hasattr(self, 'kernarg_start_addr'): self.delayed_free.append(self.kernarg_start_addr) - self.kernarg_start_addr: int = self.allocator._alloc(sz) + self.kernarg_start_addr: int = self.allocator._alloc(sz, BufferOptions()) self.kernarg_next_addr = self.kernarg_start_addr self.kernarg_pool_sz: int = sz diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index 39787c46dc..f209240798 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -67,7 +67,7 @@ class MetalAllocator(LRUAllocator): for x in self.track_cross_device: x.synchronize() self.track_cross_device.clear() return super().free_cache() - def _alloc(self, size:int) -> Any: + def _alloc(self, size:int, options) -> Any: ret = self.device.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared) if ret is None: raise MemoryError(f"Metal OOM while 
allocating {size=}") return ret @@ -82,7 +82,7 @@ class MetalAllocator(LRUAllocator): ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src, len(src), Metal.MTLResourceStorageModeShared, None) if ret: self.device.mv_in_metal.append(src) return ret - def _free(self, opaque:Any): opaque.release() + def _free(self, opaque:Any, options): opaque.release() def as_buffer(self, src:Any) -> memoryview: self.device.synchronize() return src.contents().as_buffer(src.length()) diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py index b9a2977a95..244a9a541e 100644 --- a/tinygrad/runtime/ops_python.py +++ b/tinygrad/runtime/ops_python.py @@ -188,7 +188,7 @@ class PythonCompiler(Compiler): def compile(self, src:str) -> bytes: return base64.b64decode(src) class PythonAllocator(Allocator): - def _alloc(self, size): return memoryview(bytearray(size)) + def _alloc(self, size, options): return memoryview(bytearray(size)) def copyin(self, dest, src:memoryview): dest[:] = src def copyout(self, dest:memoryview, src): dest[:] = src