From 4a07ea355d51d33daca224c221cbf31a061722b4 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Mon, 22 Jan 2024 19:23:55 -0800 Subject: [PATCH] buffer options should work (#3211) * buffer options should work * minor * fix dtype --- tinygrad/device.py | 42 ++++++++++++++++++++----------------- tinygrad/jit.py | 7 ++++--- tinygrad/runtime/ops_gpu.py | 10 ++++----- tinygrad/runtime/ops_hip.py | 6 +++++- 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/tinygrad/device.py b/tinygrad/device.py index 144908fd71..f08116d3cc 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -7,6 +7,7 @@ from tinygrad.helpers import ansilen, DEBUG, getenv, colored, BEAM, NOOPT, all_i from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.symbolic import Variable, sym_infer, sint from tinygrad.ops import LazyOp, TernaryOps, get_lazyop_info, ReduceOps, BufferOps, BinaryOps, UnaryOps, Op, GlobalCounters, MovementOps +from dataclasses import dataclass if TYPE_CHECKING: from tinygrad.codegen.linearizer import Linearizer @@ -65,13 +66,18 @@ def update_stats(name:str, op_estimate:sint, mem_estimate:int, var_vals: Optiona # **************** Buffer / Allocator **************** +@dataclass(frozen=True, eq=True) +class BufferOptions: + image: Optional[ImageDType] = None + uncached: bool = False + class Buffer: - def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None): + def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None, options:Optional[BufferOptions]=None): assert isinstance(dtype, DType) - self.device, self.size, self.dtype, self.d = device, size, dtype, Device[device] + if isinstance(dtype, ImageDType): options = BufferOptions(image=dtype) # TODO: image hack shouldn't be here. where should it be? + self.device, self.size, self.dtype, self.d, self.options = device, size, dtype, Device[device], options self.allocator = self.d.allocator - # TODO: image hack shouldn't be here. where should it be? - self._buf = opaque if opaque is not None else self.allocator.alloc(dtype if isinstance(dtype, ImageDType) else self.nbytes) + self._buf = opaque if opaque is not None else self.allocator.alloc(self.nbytes, options) # TODO: mem_used for all devices if not self.device.startswith("DISK"): GlobalCounters.mem_used += self.nbytes @property @@ -79,8 +85,7 @@ class Buffer: def __del__(self): if not hasattr(self, '_buf'): return # happens when __init__ has raised exception if not self.device.startswith("DISK"): GlobalCounters.mem_used -= self.nbytes - if isinstance(self.dtype, ImageDType): self.allocator.free(self._buf, self.dtype) - else: self.allocator.free(self._buf, self.nbytes) + self.allocator.free(self._buf, self.nbytes, self.options) def __repr__(self): return f"" def as_buffer(self, allow_zero_copy=False, force_zero_copy=False) -> memoryview: # zero copy with as_buffer (disabled by default due to use after free) @@ -133,32 +138,31 @@ class BufferXfer(BufferCopy): else: src.d.synchronize() # TODO: size, dest, src are the same type. can we enforce this? -sz_type = Union[ImageDType, int] class Allocator: - def alloc(self, size:sz_type): + def alloc(self, size:int, options:Optional[BufferOptions]=None): assert not isinstance(size, int) or size > 0, f"alloc size must be positve, getting {size}" - return self._alloc_image(size) if isinstance(size, ImageDType) else self._alloc(size) + return self._alloc_with_options(size, options) if options is not None else self._alloc(size) def _alloc(self, size:int): raise NotImplementedError("need alloc") - def _alloc_image(self, dtype:ImageDType): raise RuntimeError("need alloc image") - def free(self, opaque, size:sz_type): self._free(opaque) # if you are returning a Python object, you don't need a free - def _free(self, opaque): pass + def _alloc_with_options(self, size:int, options:BufferOptions): return self._alloc(size) # TODO: override this if you support options + def free(self, opaque, size:int, options:Optional[BufferOptions]=None): self._free(opaque) + def _free(self, opaque): pass # if opaque is a Python object, you don't need a free def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin") def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout") class LRUAllocator(Allocator): # pylint: disable=abstract-method - def __init__(self): self.cache: Dict[sz_type, Any] = defaultdict(list) - def alloc(self, size:sz_type): - if len(c := self.cache[size]): return c.pop() - try: return super().alloc(size) + def __init__(self): self.cache: Dict[Tuple[int, Optional[BufferOptions]], Any] = defaultdict(list) + def alloc(self, size:int, options:Optional[BufferOptions]=None): + if len(c := self.cache[(size, options)]): return c.pop() + try: return super().alloc(size, options) except (RuntimeError, MemoryError): self.free_cache() - return super().alloc(size) + return super().alloc(size, options) def free_cache(self): for opaques in self.cache.values(): for opaque in opaques: self._free(opaque) opaques.clear() - def free(self, opaque:Any, size:sz_type): - if getenv("LRU", 1): self.cache[size].append(opaque) + def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None): + if getenv("LRU", 1): self.cache[(size, options)].append(opaque) else: self._free(opaque) class _MallocAllocator(LRUAllocator): diff --git a/tinygrad/jit.py b/tinygrad/jit.py index 3f3c329b20..e04f111dd6 100644 --- a/tinygrad/jit.py +++ b/tinygrad/jit.py @@ -140,14 +140,15 @@ class TinyJit(Generic[ReturnType]): return cast(ReturnType, self.ret) class PlaceHolder: - def __init__(self, buf:Buffer): self.size, self.dtype, self.device, self.ref, self.bufid = buf.size, buf.dtype, buf.device, ref(buf), id(buf._buf) - def to_tuple(self): return (self.size, self.dtype, self.device, self.bufid) + def __init__(self, buf:Buffer): + self.size, self.dtype, self.device, self.ref, self.bufid, self.options = buf.size, buf.dtype, buf.device, ref(buf), id(buf._buf), buf.options + def to_tuple(self): return (self.size, self.dtype, self.device, self.bufid, self.options) def __hash__(self): return hash(self.to_tuple()) def __eq__(self, x): return isinstance(x, PlaceHolder) and self.to_tuple() == x.to_tuple() def alloc_if_needed(self, buffer_cache: Dict[PlaceHolder, Buffer]) -> Buffer: ret = self.ref() if ret: return ret - if self not in buffer_cache: buffer_cache[self] = Buffer(self.device, self.size, self.dtype) + if self not in buffer_cache: buffer_cache[self] = Buffer(self.device, self.size, self.dtype, options=self.options) return buffer_cache[self] class _CacheCollector: diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py index 70df2de364..4a6c860ed3 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -3,10 +3,9 @@ from typing import Tuple, Optional, List import ctypes, functools, hashlib import gpuctypes.opencl as cl from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG -from tinygrad.dtype import ImageDType from tinygrad.codegen.kernel import LinearizerOptions from tinygrad.renderer.cstyle import OpenCLRenderer -from tinygrad.device import Compiled, LRUAllocator +from tinygrad.device import Compiled, LRUAllocator, BufferOptions # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something OSX_TIMING_RATIO = (125/3) if OSX else 1.0 @@ -61,10 +60,11 @@ class CLAllocator(LRUAllocator): super().__init__() def _alloc(self, size:int) -> cl.cl_mem: return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status) - def _alloc_image(self, dtype:ImageDType) -> cl.cl_mem: + def _alloc_with_options(self, size:int, options:BufferOptions) -> cl.cl_mem: + assert options.image is not None return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE, - cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[dtype.itemsize]), - dtype.shape[1], dtype.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status) + cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]), + options.image.shape[1], options.image.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status) def _free(self, buf:cl.cl_mem): check(cl.clReleaseMemObject(buf)) def copyin(self, dest:cl.cl_mem, src:memoryview): check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None)) diff --git a/tinygrad/runtime/ops_hip.py b/tinygrad/runtime/ops_hip.py index 9c48695691..5174e2af95 100644 --- a/tinygrad/runtime/ops_hip.py +++ b/tinygrad/runtime/ops_hip.py @@ -4,7 +4,7 @@ from typing import Tuple, TypeVar, List import gpuctypes.hip as hip from tinygrad.helpers import DEBUG, getenv, init_c_var, compile_cuda_style, encode_args_cuda_style, time_execution_cuda_style from tinygrad.helpers import from_mv, round_up, to_mv -from tinygrad.device import Compiled, LRUAllocator, MallocAllocator +from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions from tinygrad.renderer.cstyle import HIPRenderer from tinygrad.codegen.kernel import LinearizerOptions @@ -52,6 +52,10 @@ class HIPAllocator(LRUAllocator): def _alloc(self, size:int): check(hip.hipSetDevice(self.device.device)) return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size))) + def _alloc_with_options(self, size:int, options:BufferOptions): + assert options.uncached + check(hip.hipSetDevice(self.device.device)) + return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipExtMallocWithFlags(ctypes.byref(x), size, 3))) # hipDeviceMallocUncached = 3 def _free(self, opaque:T): check(hip.hipFree(opaque)) def _hostalloc(self, size:int): return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), size, 0))) def copy_from_fd(self, dest, fd, offset, size):