compile works (#688)

* compile works * runtimes * line count * fix custom, to tg dtype * meh, that's fine with lazy import
2026-02-17 01:51:40 -05:00 · 2023-03-12 11:01:25 -07:00
parent af7745073f
commit 15e0b56e39
17 changed files with 71 additions and 78 deletions
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -1,10 +1,10 @@
 from __future__ import annotations
-import platform, functools
+import platform
 import numpy as np
 import pyopencl as cl  # type: ignore
 from typing import Optional, List, Final
 from tinygrad.helpers import IMAGE, DEBUG, getenv, dtypes
-from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer
+from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer, Specialized
 from tinygrad.codegen.gpu import GPUCodegen, GPULanguage

 OSX = platform.system() == "Darwin"
@@ -12,16 +12,12 @@ OSX_TIMING_RATIO = (125/3) if OSX else 1.0   # see test/external_osx_profiling.p
 FLOAT16 = getenv("FLOAT16", 0)

 class _CL:
-  @functools.cached_property
-  def cl_ctx(self) -> cl.Context:
+  def __init__(self):
    devices: List[cl.Device] = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], [])
    if len(devices) == 0: devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], []) # settle for CPU
    if len(devices) > 1 or DEBUG >= 1: print(f"using {devices[getenv('CL_DEVICE', 0)]}")
-    return cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]])
-
-  @functools.cached_property
-  def cl_queue(self) -> cl.CommandQueue:
-    return cl.CommandQueue(CL.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)  # this is an in-order command queue
+    self.cl_ctx: cl.Context = cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]])
+    self.cl_queue: cl.CommandQueue = cl.CommandQueue(self.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)  # this is an in-order command queue
 CL = _CL()

 class CLBuffer(RawBufferCopyInOut):
@@ -39,7 +35,7 @@ class CLImage(RawBuffer):  # pylint: disable=abstract-method
    GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height
  def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height

-@functools.lru_cache(maxsize=None)
+#@functools.lru_cache(maxsize=None)
 class CLProgram:
  def __init__(self, name:str, prg:str, binary=False, argdtypes=None):
    self.name, self.argdtypes, self.clprogram = name, argdtypes, cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]) if binary else cl.Program(CL.cl_ctx, prg)  # type: ignore
@@ -77,11 +73,8 @@ class CLCodegen(GPUCodegen):
    gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)])

 class GPUBuffer(CompiledBuffer):
-  raw_buffer_type = CLBuffer
+  spec = Specialized(CLBuffer, CLCodegen, CLProgram)
  # override this method for image
-  @classmethod
-  def create_raw_buffer(cls, shape, backing, dtype) -> RawBuffer:
+  def create_raw_buffer(self, shape, backing, dtype) -> RawBuffer:
    if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape)   # NOTE: this is a hack. we don't pass in the dtype here, it's controlled by the FLOAT16 env var
    else: return super().create_raw_buffer(shape, backing, dtype)
-  codegen_type = CLCodegen
-  runtime_type = CLProgram