From 4a07ea355d51d33daca224c221cbf31a061722b4 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Mon, 22 Jan 2024 19:23:55 -0800
Subject: [PATCH] buffer options should work (#3211)

* buffer options should work

* minor

* fix dtype
---
 tinygrad/device.py          | 42 ++++++++++++++++++++-----------------
 tinygrad/jit.py             |  7 ++++---
 tinygrad/runtime/ops_gpu.py | 10 ++++-----
 tinygrad/runtime/ops_hip.py |  6 +++++-
 4 files changed, 37 insertions(+), 28 deletions(-)
diff --git a/tinygrad/device.py b/tinygrad/device.py
index 144908fd71..f08116d3cc 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -7,6 +7,7 @@ from tinygrad.helpers import ansilen, DEBUG, getenv, colored, BEAM, NOOPT, all_i
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.symbolic import Variable, sym_infer, sint
 from tinygrad.ops import LazyOp, TernaryOps, get_lazyop_info, ReduceOps, BufferOps, BinaryOps, UnaryOps, Op, GlobalCounters, MovementOps
+from dataclasses import dataclass
 
 if TYPE_CHECKING:
   from tinygrad.codegen.linearizer import Linearizer
@@ -65,13 +66,18 @@ def update_stats(name:str, op_estimate:sint, mem_estimate:int, var_vals: Optiona
 
 # **************** Buffer / Allocator ****************
 
+@dataclass(frozen=True, eq=True)
+class BufferOptions:
+  image: Optional[ImageDType] = None
+  uncached: bool = False
+
 class Buffer:
-  def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None):
+  def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None, options:Optional[BufferOptions]=None):
     assert isinstance(dtype, DType)
-    self.device, self.size, self.dtype, self.d = device, size, dtype, Device[device]
+    if isinstance(dtype, ImageDType): options = BufferOptions(image=dtype) # TODO: image hack shouldn't be here. where should it be?
+    self.device, self.size, self.dtype, self.d, self.options = device, size, dtype, Device[device], options
     self.allocator = self.d.allocator
-    # TODO: image hack shouldn't be here. where should it be?
-    self._buf = opaque if opaque is not None else self.allocator.alloc(dtype if isinstance(dtype, ImageDType) else self.nbytes)
+    self._buf = opaque if opaque is not None else self.allocator.alloc(self.nbytes, options)
     # TODO: mem_used for all devices
     if not self.device.startswith("DISK"): GlobalCounters.mem_used += self.nbytes
   @property
@@ -79,8 +85,7 @@ class Buffer:
   def __del__(self):
     if not hasattr(self, '_buf'): return # happens when __init__ has raised exception
     if not self.device.startswith("DISK"): GlobalCounters.mem_used -= self.nbytes
-    if isinstance(self.dtype, ImageDType): self.allocator.free(self._buf, self.dtype)
-    else: self.allocator.free(self._buf, self.nbytes)
+    self.allocator.free(self._buf, self.nbytes, self.options)
   def __repr__(self): return f"<buf device:{self.device} size:{self.size} dtype:{self.dtype}>"
   def as_buffer(self, allow_zero_copy=False, force_zero_copy=False) -> memoryview:
     # zero copy with as_buffer (disabled by default due to use after free)
@@ -133,32 +138,31 @@ class BufferXfer(BufferCopy):
     else: src.d.synchronize()
 
 # TODO: size, dest, src are the same type. can we enforce this?
-sz_type = Union[ImageDType, int]
 class Allocator:
-  def alloc(self, size:sz_type):
+  def alloc(self, size:int, options:Optional[BufferOptions]=None):
     assert not isinstance(size, int) or size > 0, f"alloc size must be positve, getting {size}"
-    return self._alloc_image(size) if isinstance(size, ImageDType) else self._alloc(size)
+    return self._alloc_with_options(size, options) if options is not None else self._alloc(size)
   def _alloc(self, size:int): raise NotImplementedError("need alloc")
-  def _alloc_image(self, dtype:ImageDType): raise RuntimeError("need alloc image")
-  def free(self, opaque, size:sz_type): self._free(opaque) # if you are returning a Python object, you don't need a free
-  def _free(self, opaque): pass
+  def _alloc_with_options(self, size:int, options:BufferOptions): return self._alloc(size)  # TODO: override this if you support options
+  def free(self, opaque, size:int, options:Optional[BufferOptions]=None): self._free(opaque)
+  def _free(self, opaque): pass  # if opaque is a Python object, you don't need a free
   def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin")
   def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout")
 
 class LRUAllocator(Allocator):  # pylint: disable=abstract-method
-  def __init__(self): self.cache: Dict[sz_type, Any] = defaultdict(list)
-  def alloc(self, size:sz_type):
-    if len(c := self.cache[size]): return c.pop()
-    try: return super().alloc(size)
+  def __init__(self): self.cache: Dict[Tuple[int, Optional[BufferOptions]], Any] = defaultdict(list)
+  def alloc(self, size:int, options:Optional[BufferOptions]=None):
+    if len(c := self.cache[(size, options)]): return c.pop()
+    try: return super().alloc(size, options)
     except (RuntimeError, MemoryError):
       self.free_cache()
-      return super().alloc(size)
+      return super().alloc(size, options)
   def free_cache(self):
     for opaques in self.cache.values():
       for opaque in opaques: self._free(opaque)
       opaques.clear()
-  def free(self, opaque:Any, size:sz_type):
-    if getenv("LRU", 1): self.cache[size].append(opaque)
+  def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None):
+    if getenv("LRU", 1): self.cache[(size, options)].append(opaque)
     else: self._free(opaque)
 
 class _MallocAllocator(LRUAllocator):
diff --git a/tinygrad/jit.py b/tinygrad/jit.py
index 3f3c329b20..e04f111dd6 100644
--- a/tinygrad/jit.py
+++ b/tinygrad/jit.py
@@ -140,14 +140,15 @@ class TinyJit(Generic[ReturnType]):
     return cast(ReturnType, self.ret)
 
 class PlaceHolder:
-  def __init__(self, buf:Buffer): self.size, self.dtype, self.device, self.ref, self.bufid = buf.size, buf.dtype, buf.device, ref(buf), id(buf._buf)
-  def to_tuple(self): return (self.size, self.dtype, self.device, self.bufid)
+  def __init__(self, buf:Buffer):
+    self.size, self.dtype, self.device, self.ref, self.bufid, self.options = buf.size, buf.dtype, buf.device, ref(buf), id(buf._buf), buf.options
+  def to_tuple(self): return (self.size, self.dtype, self.device, self.bufid, self.options)
   def __hash__(self): return hash(self.to_tuple())
   def __eq__(self, x): return isinstance(x, PlaceHolder) and self.to_tuple() == x.to_tuple()
   def alloc_if_needed(self, buffer_cache: Dict[PlaceHolder, Buffer]) -> Buffer:
     ret = self.ref()
     if ret: return ret
-    if self not in buffer_cache: buffer_cache[self] = Buffer(self.device, self.size, self.dtype)
+    if self not in buffer_cache: buffer_cache[self] = Buffer(self.device, self.size, self.dtype, options=self.options)
     return buffer_cache[self]
 
 class _CacheCollector:
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index 70df2de364..4a6c860ed3 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -3,10 +3,9 @@ from typing import Tuple, Optional, List
 import ctypes, functools, hashlib
 import gpuctypes.opencl as cl
 from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
-from tinygrad.dtype import ImageDType
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import OpenCLRenderer
-from tinygrad.device import Compiled, LRUAllocator
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions
 
 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0
@@ -61,10 +60,11 @@ class CLAllocator(LRUAllocator):
     super().__init__()
   def _alloc(self, size:int) -> cl.cl_mem:
     return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status)
-  def _alloc_image(self, dtype:ImageDType) -> cl.cl_mem:
+  def _alloc_with_options(self, size:int, options:BufferOptions) -> cl.cl_mem:
+    assert options.image is not None
     return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
-                                      cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[dtype.itemsize]),
-                                      dtype.shape[1], dtype.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status)
+                                      cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
+                                      options.image.shape[1], options.image.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status)
   def _free(self, buf:cl.cl_mem): check(cl.clReleaseMemObject(buf))
   def copyin(self, dest:cl.cl_mem, src:memoryview):
     check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
diff --git a/tinygrad/runtime/ops_hip.py b/tinygrad/runtime/ops_hip.py
index 9c48695691..5174e2af95 100644
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -4,7 +4,7 @@ from typing import Tuple, TypeVar, List
 import gpuctypes.hip as hip
 from tinygrad.helpers import DEBUG, getenv, init_c_var, compile_cuda_style, encode_args_cuda_style, time_execution_cuda_style
 from tinygrad.helpers import from_mv, round_up, to_mv
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.codegen.kernel import LinearizerOptions
 
@@ -52,6 +52,10 @@ class HIPAllocator(LRUAllocator):
   def _alloc(self, size:int):
     check(hip.hipSetDevice(self.device.device))
     return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
+  def _alloc_with_options(self, size:int, options:BufferOptions):
+    assert options.uncached
+    check(hip.hipSetDevice(self.device.device))
+    return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipExtMallocWithFlags(ctypes.byref(x), size, 3)))  # hipDeviceMallocUncached = 3
   def _free(self, opaque:T): check(hip.hipFree(opaque))
   def _hostalloc(self, size:int): return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), size, 0)))
   def copy_from_fd(self, dest, fd, offset, size):