CL class, debugging

George Hotz
2022-06-21 20:16:29 -07:00
parent 0b820f7966
commit 9d06a86f7f
3 changed files with 44 additions and 24 deletions

View File

@@ -77,6 +77,8 @@ class LazyBuffer:
   def unary_op(x, op): return elementwise_op(op, (x,))
   def binary_op(x, op, y:LazyBuffer): return elementwise_op(op, (x,y))
+  @functools.lru_cache(maxsize=None)
   def contiguous_op(x): return x if x.st.contiguous else LazyBuffer(x.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (x,)))
+  @functools.lru_cache(maxsize=None)
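The two @functools.lru_cache decorators added above memoize op construction on the lazy graph: asking for the same op on the same LazyBuffer twice now returns the one cached node instead of building a duplicate. A minimal standalone sketch of the mechanism (Node and Buf are hypothetical stand-ins, not tinygrad's classes; lru_cache keys on the argument's hash, which for a plain object is its identity):

import functools

class Node:
  # stands in for a LazyBuffer wrapping a LazyOp
  def __init__(self, src): self.src = src

@functools.lru_cache(maxsize=None)
def contiguous_op(x):
  return Node(x)  # constructed once per distinct x, then served from the cache

class Buf: pass  # hashable by identity, like LazyBuffer

b = Buf()
assert contiguous_op(b) is contiguous_op(b)  # same graph node both times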

View File

@@ -1,10 +1,10 @@
 from __future__ import annotations
-import pyopencl as cl
-from tinygrad.llops.ops_gpu import GPUBuffer, get_cl_ctx, get_cl_queue, CLProgram, code_for_op
+from tinygrad.llops.ops_gpu import GPUBuffer, CL, CLProgram, code_for_op
 from tinygrad.ops import ProcessingOps
 from tinygrad.helpers import prod, ConvArgs
 from typing import List, Tuple, Optional, Dict
 import numpy as np
+import pyopencl as cl
 import pathlib
 def load(x):
@@ -13,6 +13,15 @@ def load(x):
   return ret
 CONV_SRC = load(pathlib.Path(__file__).parent.parent.parent / 'accel/opencl/conv.cl')
+class ECL(CL):
+  @staticmethod
+  def image(shape):
+    if CL.DEBUG >= 2: print(f"cl: create image({shape})")
+    fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT)
+    # HALF_FLOAT breaks tests
+    #fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT)
+    return cl.Image(CL().cl_ctx, cl.mem_flags.READ_WRITE, fmt, shape=shape)
+
 def get_replacements(prg_src:str, opencl_type:List[str]) -> Dict[str, str]:
   middle_code = []
@@ -46,16 +55,16 @@ class OpenCLBuffer(GPUBuffer):
   def fromCPU(x):
     ret = OpenCLBuffer(x.shape)
     # TODO: this is blocking even though we told it not to
-    cl.enqueue_copy(get_cl_queue(), ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
+    CL.enqueue_copy(ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
     return ret
   @property
   def cl(self):
     if self._buf is None:
       if self.st.contiguous:
-        self._buf = cl.Buffer(get_cl_ctx(), cl.mem_flags.READ_WRITE, 4*roundup(prod(self.shape)))
+        self._buf = CL.malloc(4*roundup(prod(self.shape)))
       if self._image is not None:
-        self._buf = cl.Buffer(get_cl_ctx(), cl.mem_flags.READ_WRITE, 4*roundup(prod(self._image.shape)*4))
+        self._buf = CL.malloc(4*roundup(prod(self._image.shape)*4))
         #print(f"converting {self.shape} back to buffer, image shape is {self._image.shape}")
         CLProgram("from_image", """
           __kernel void from_image(
@@ -78,10 +87,7 @@ class OpenCLBuffer(GPUBuffer):
   def image(self):
     if self._image is None:
       assert self.shape[2] == 4 and len(self.shape) == 3
-      fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT)
-      # HALF_FLOAT breaks tests
-      #fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT)
-      self._image = cl.Image(get_cl_ctx(), cl.mem_flags.READ_WRITE, fmt, shape=(self.shape[1], self.shape[0]))
+      self._image = ECL.image(shape=(self.shape[1], self.shape[0]))
     if self._buf is not None:
       assert prod(self.shape) == prod(self._image.shape)*4
       #print(f"converting {self.shape} to image with shape {self._image.shape}")

View File

@@ -1,4 +1,5 @@
 from __future__ import annotations
+import os
 import functools
 import numpy as np
 import pyopencl as cl
@@ -7,28 +8,40 @@ from tinygrad.helpers import prod, ConvArgs
 from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, MovementOps, ProcessingOps
 from tinygrad.shapetracker import ShapeTracker, View, strides_for_shape
-cl_ctx, cl_queue = None, None
-def get_cl_ctx(): return cl_ctx
-def get_cl_queue(): return cl_queue
-def require_init_gpu():
-  global cl_ctx, cl_queue
-  if cl_ctx is None:
+class CL:
+  DEBUG = int(os.getenv("DEBUGCL", "0"))
+  def __init__(self):
+    if getattr(CL, "cl_queue", None) is not None: return
     devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.GPU)
     if len(devices) == 0: # settle for CPU
       devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.CPU)
-    cl_ctx = cl.Context(devices=devices)
-    cl_queue = cl.CommandQueue(cl_ctx) # this is an in-order command queue
+    CL.cl_ctx = cl.Context(devices=devices)
+    CL.cl_queue = cl.CommandQueue(self.cl_ctx) # this is an in-order command queue
+  @staticmethod
+  def enqueue_copy(a, b, is_blocking=False):
+    if CL.DEBUG: print(f"cl: copy into {type(a)} sz {a.size} block {is_blocking}")
+    cl.enqueue_copy(CL().cl_queue, a, b, is_blocking=is_blocking)
+  @staticmethod
+  def malloc(sz):
+    if CL.DEBUG >= 2: print(f"cl: malloc({sz})")
+    return cl.Buffer(CL().cl_ctx, cl.mem_flags.READ_WRITE, sz)
 @functools.lru_cache(maxsize=None)
 class CLProgram:
   def __init__(self, name, prg, options=tuple(), argdtypes=None):
     self.name = name
-    self.built = cl.Program(cl_ctx, prg).build(options=options)
+    if CL.DEBUG >= 2: print(f"cl: building {self.name:20s} with {options}")
+    self.built = cl.Program(CL().cl_ctx, prg).build(options=options)
     self.clprg = self.built.__getattr__(name)
     if argdtypes is not None: self.clprg.set_scalar_arg_dtypes(argdtypes)
   def __call__(self, *args):
-    #print(f"running {self.name} with {args[0]} count {len(args)-2}")
-    self.clprg(cl_queue, *args)
+    if CL.DEBUG: print(f"cl: running {self.name:20s} with {str(args[0]):15s} {str(args[1]):15s} count {len(args)-2:2d}")
+    self.clprg(CL().cl_queue, *args)
 # **** end CL wrappers ****
 code_for_op = {
   UnaryOps.NOOP: "(A)", UnaryOps.RELU: "max(A, (float)0.)", UnaryOps.EXP: "exp(A)", UnaryOps.LOG: "log(A)", UnaryOps.NEG: "(-(A))", UnaryOps.SIGN: "sign(A)",
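The CL class above replaces the module-level globals with a lazily initialized class-attribute singleton: the first CL() creates the context and queue, every later CL() returns immediately, so CL().cl_queue is cheap to reach from anywhere. A standalone sketch of the pattern (CLLike and its resource attribute are hypothetical names, standing in for CL and its context/queue):

class CLLike:
  def __init__(self):
    if getattr(CLLike, "resource", None) is not None: return  # already initialized
    CLLike.resource = object()  # stands in for cl.Context / cl.CommandQueue

assert CLLike().resource is CLLike().resource  # one shared resource, created once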
@@ -37,14 +50,13 @@ code_for_op = {
 class GPUBuffer:
   def __init__(self, shape, hostbuf:Optional[GPUBuffer]=None):
-    require_init_gpu()
     self.st = ShapeTracker(shape)
     self.shape = self.st.shape
     self._buf = hostbuf._buf if hostbuf is not None else None
   @property
   def cl(self):
-    if self._buf is None: self._buf = cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE, 4*prod(self.shape))
+    if self._buf is None: self._buf = CL.malloc(4*prod(self.shape))
     return self._buf
   def __repr__(self):
@@ -54,12 +66,12 @@ class GPUBuffer:
   def fromCPU(x):
     ret = GPUBuffer(x.shape)
     # TODO: this is blocking even though we told it not to
-    cl.enqueue_copy(cl_queue, ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
+    CL.enqueue_copy(ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
     return ret
   def toCPU(self):
     data = np.empty(self.shape, dtype=np.float32)
-    cl.enqueue_copy(cl_queue, data, self.contiguous_op().cl, is_blocking=True)
+    CL.enqueue_copy(data, self.contiguous_op().cl, is_blocking=True)
     return data
   def contiguous_view(x, name:str) -> str:
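Taken together, the debugging half of this commit gates its tracing on the DEBUGCL environment variable: level 1 logs copies and kernel launches, level 2 additionally logs mallocs, image creation, and program builds. A sketch of how the new wrappers surface, assuming this repo at this commit plus an OpenCL runtime; run it with DEBUGCL=1 set in the environment to see the traces:

import numpy as np
from tinygrad.llops.ops_gpu import GPUBuffer

buf = GPUBuffer.fromCPU(np.ones((4, 4), dtype=np.float32))  # traced via CL.enqueue_copy
print(buf.toCPU())  # blocking CL.enqueue_copy back into a numpy array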