Refactor ASTs (#622)

* ugh worst branch name

* compiler refactor continues

* scc -> cloc

* buf -> _buf

* finish _buf, and program -> runtime

* gpu is still working, clang isn't

* clang in new style

* ops_metal

* something broke it

* improve metal

* clean up tons of cl crap

* hack fix sync

* cleaner gpu

* gpu metal clang

* cleanups

* minor refactor

* GPUCodegen

* fix up LLVM

* blind CUDA refactor

* codegen / runtime

* keep ops naming

* linter passes

* woah, llvm was allocing 4x what it needed to

* bugfixes

* fix openpilot compiler

* fix compile_efficientnet

* method cache should fix tests (see the sketch after this list)

* deal with duped functions
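
The method cache here presumably memoizes compiled programs by kernel source, so duplicated functions collapse into a single compile; a minimal hypothetical sketch of the idea (illustrative names, not the actual tinygrad API):

# sketch: memoize kernel compilation keyed on name + rendered source
from typing import Any, Callable, Dict, Tuple
method_cache : Dict[Tuple[str, str], Any] = {}
def get_program(name:str, src:str, compile_fn:Callable[[str, str], Any]) -> Any:
  key = (name, src)
  if key not in method_cache:
    method_cache[key] = compile_fn(name, src)  # compile only on a cache miss
  return method_cache[key]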
Author: George Hotz (committed by GitHub)
Date:   2023-03-01 18:57:29 -08:00
Parent: 5e41d5857c
Commit: bfcec234a2

34 changed files with 557 additions and 627 deletions


@@ -0,0 +1,92 @@
from __future__ import annotations
import platform, functools
import numpy as np
import pyopencl as cl # type: ignore
from typing import Dict, Optional, List, ClassVar, Final
from collections import defaultdict
from tinygrad.helpers import IMAGE, DEBUG, getenv
from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
OSX = platform.system() == "Darwin"
OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
CLCACHE = getenv("CLCACHE", 1)
FLOAT16 = getenv("FLOAT16", 0)

class _CL:
  @functools.cached_property
  def cl_ctx(self) -> cl.Context:
    devices : List[cl.Device] = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], [])
    if len(devices) == 0: devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], []) # settle for CPU
    if len(devices) > 1 or DEBUG >= 1: print(f"using {devices[getenv('CL_DEVICE', 0)]}")
    return cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]])
  @functools.cached_property
  def cl_queue(self) -> cl.CommandQueue:
    return cl.CommandQueue(CL.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # this is an in-order command queue
CL = _CL()

class CLBuffer(RawBufferCopyInOut):
  # TODO: this can be in RawBuffer generically
  BUFFER_CACHE : ClassVar[Dict[int, List[cl.Buffer]]] = defaultdict(list)
  def __init__(self, size):
    self.size = size
    if len(CLBuffer.BUFFER_CACHE[size]) > 0:
      self._cl = CLBuffer.BUFFER_CACHE[size].pop()
    else:
      # TODO: on GPU OOM, clear the cache
      self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size)
      GlobalCounters.mem_used += self._cl.size  # count only fresh allocations; cached buffers were never released
  def __del__(self):
    if CLCACHE: CLBuffer.BUFFER_CACHE[self._cl.size].append(self._cl)
    else: GlobalCounters.mem_used -= self._cl.size
  def copyin(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, self._cl, x, is_blocking=False)
  def copyout(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, x, self._cl, is_blocking=True)

class CLImage(RawBuffer):
  fmt : Final = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT)
  IMAGE : Final = True
  def __init__(self, shape):
    self._cl = cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0]))
    GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height
  def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height

class CLProgram:
  def __init__(self, name:str, prg:str, binary=False, argdtypes=None):
    self.name, self.argdtypes, self.clprogram = name, argdtypes, cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]) if binary else cl.Program(CL.cl_ctx, prg)  # type: ignore
    try:
      self._clprg = self.clprogram.build()
    except cl.RuntimeError as e:
      if DEBUG >= 3: print("FAILED TO BUILD", prg)
      raise e
    self.clprg = self._clprg.__getattr__(name)
    if DEBUG >= 5 and not OSX: print(self.clprogram.get_info(cl.program_info.BINARIES)[0].decode('utf-8'))  # print the PTX for NVIDIA. TODO: probably broken for everything else
    if self.argdtypes is not None: self.clprg.set_scalar_arg_dtypes(self.argdtypes)
  def __call__(self, global_size, local_size, *bufs, wait=False) -> Optional[float]:
    e = self.clprg(CL.cl_queue, global_size, local_size, *[x._cl if isinstance(x, (CLBuffer, CLImage)) else x for x in bufs])
    if wait:
      CL.cl_queue.finish()
      return ((e.profile.end - e.profile.start) * OSX_TIMING_RATIO) * 1e-9  # elapsed time in seconds
    return None

class CLCodegen(GPUCodegen):
  lang = GPULanguage(
    kernel_prefix = "__kernel", buffer_prefix = "__global ", smem_prefix = "__local ",
    barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
    gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)])

class GPUBuffer(CompiledBuffer):
  raw_buffer_type = CLBuffer
  # override this method for image
  @classmethod
  def create_raw_buffer(cls, shape, backing) -> RawBuffer:
    if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and not backing: return CLImage(shape)
    else: return super().create_raw_buffer(shape, backing)
  codegen_type = CLCodegen
  runtime_type = CLProgram
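
For reference, a minimal usage sketch of the classes above (hypothetical, not part of this commit): it assumes an OpenCL device is available and this module is importable; the kernel source and sizes are made up for illustration.

import numpy as np
a_np, b_np = np.full(4, 2, dtype=np.float32), np.full(4, 3, dtype=np.float32)
a, b, out = CLBuffer(a_np.nbytes), CLBuffer(b_np.nbytes), CLBuffer(a_np.nbytes)
a.copyin(a_np)  # non-blocking copies on the in-order queue
b.copyin(b_np)
prg = CLProgram("add", "__kernel void add(__global float *out, __global const float *a, __global const float *b) { int i = get_global_id(0); out[i] = a[i] + b[i]; }")
et = prg((4,), None, out, a, b, wait=True)  # wait=True returns elapsed kernel time in seconds
res = np.empty(4, dtype=np.float32)
out.copyout(res)  # blocking; res is now [5. 5. 5. 5.]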