No dtype alloc (#2570)

* fix all allocs * improve docs * ugh fix fake alloc
2026-04-07 03:00:26 -04:00 · 2023-12-02 13:29:40 -08:00
parent c8774713c5
commit d6b404ac11
14 changed files with 157 additions and 123 deletions
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 from typing import Tuple, Optional, Union, List, cast
 import ctypes, functools
 import gpuctypes.opencl as cl
-from tinygrad.helpers import to_char_p_p, from_mv, diskcache, OSX, DType, ImageDType
+from tinygrad.helpers import to_char_p_p, from_mv, diskcache, OSX, ImageDType
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.opencl import OpenCLRenderer
 from tinygrad.device import Compiled, LRUAllocator
@@ -66,14 +66,15 @@ class CLAllocator(LRUAllocator):
  def __init__(self, device:CLDevice):
    self.device = device
    super().__init__()
-  def _alloc(self, size:int, dtype:DType):
-    if isinstance(dtype, ImageDType):
-      return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
-                                        cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[dtype.itemsize]), dtype.shape[1], dtype.shape[0],
-                                        0, None, ctypes.byref(status := ctypes.c_int32())), status)
-    else:
-      return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size*dtype.itemsize, None, ctypes.byref(status := ctypes.c_int32())), status)
+  def _alloc(self, size:int) -> cl.cl_mem:
+    return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status)
  def _free(self, buf:cl.cl_mem): check(cl.clReleaseMemObject(buf))
+  def _cast_image(self, buf:cl.cl_mem, dtype:ImageDType, row_pitch:int) -> cl.cl_mem:
+    desc = cl.cl_image_desc(image_type=cl.CL_MEM_OBJECT_IMAGE2D, image_width=dtype.shape[1], image_height=dtype.shape[0], image_row_pitch=row_pitch)
+    desc._0.mem_object = buf
+    return checked(cl.clCreateImage(self.device.context, cl.CL_MEM_READ_WRITE,
+                                    cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[dtype.itemsize]),
+                                    desc, None, ctypes.byref(status := ctypes.c_int32())), status)
  def copyin(self, dest:cl.cl_mem, src:memoryview):
    check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
    self.device.pending_copyin.append(src)    # NOTE: these can't be freed until the GPU actually executes this command