Precompute weights for OpenCL

This commit is contained in:
George Hotz
2022-07-08 10:56:48 -07:00
parent d8e7f1f8bc
commit 68959be05d
3 changed files with 14 additions and 12 deletions

View File

@@ -51,16 +51,12 @@ def get_replacements(prg_src:str, opencl_type:List[str]) -> Dict[str, str]:
def roundup(x, n=4): return (x+(n-1))//n * n
class OpenCLBuffer(GPUBuffer):
def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None):
super().__init__(shape, hostbuf)
def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None, backing:Optional[np.ndarray]=None):
self._image = hostbuf._image if hostbuf is not None else None
super().__init__(shape, hostbuf, backing)
@staticmethod
def fromCPU(x):
ret = OpenCLBuffer(x.shape)
# TODO: this is blocking even though we told it not to
CL.enqueue_copy(ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
return ret
def fromCPU(x): return OpenCLBuffer(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel())
def __repr__(self): return f"<OpenCLBuffer with shape {self.shape!r}>"
@@ -69,8 +65,14 @@ class OpenCLBuffer(GPUBuffer):
if self._buf is None:
if self.st.contiguous:
self._buf = CLBuffer(4*roundup(prod(self.shape)))
if self._backing is not None:
CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
self._backing = None
if self._image is not None:
self._buf = CLBuffer(4*roundup(prod(self._image.shape)*4))
if self._backing is not None:
CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
self._backing = None
#print(f"converting {self.shape} back to buffer, image shape is {self._image.shape}")
CLProgram("from_image", """
__kernel void from_image(
@@ -154,7 +156,6 @@ class OpenCLBuffer(GPUBuffer):
getters.append(f"inline float4 get4_{name}(__global const float *x, const sampler_t smp, int2 loc, int gid) {{"+
f"return (float4)(get_{name}(x,gid+0), get_{name}(x,gid+1), get_{name}(x,gid+2), get_{name}(x,gid+3)); }}")
else:
print("folded")
fakebufs.append(name)
getters.append(f"inline float4 get4_{name}(int gid) {{"+
f"return (float4)(get_{name}(gid+0), get_{name}(gid+1), get_{name}(gid+2), get_{name}(gid+3)); }}")

View File

@@ -56,7 +56,7 @@ def preprocessing_op(x,w,C):
C = C._replace(out_shape = (C.bs*C.oy, C.ox*C.cout//4, 4))
#x = contiguous(ctx, x, x.shapetracker) if not x.shapetracker.contiguous else x
#w = contiguous(ctx, w, w.shapetracker) if not w.shapetracker.contiguous else w
return x,w,C
return x,w.contiguous_op(),C
def postprocessing_op(ret, C, C_initial):
added_output_channels = C.rcout - C_initial.rcout
@@ -75,5 +75,7 @@ def postprocessing_op(ret, C, C_initial):
def processed_conv(x, w, C):
x,w,Cn = preprocessing_op(x,w,C)
# precompute the weight
w.realize().image
ret = x.processing_op(ProcessingOps.CONV, w, Cn)
return postprocessing_op(ret, Cn, C)

View File

@@ -194,14 +194,12 @@ class LazyBuffer:
self.shape = self.st.shape
self.optype, self.op = optype, op
self.realized : Optional[DeviceBuffer] = None
self.device = device
self.device, self.dbuffer = device, Device._buffers[device]
self.children : weakref.WeakSet[LazyBuffer] = weakref.WeakSet()
# NOTE: op should be read only after construction of LazyBuffer
for x in get_lazybuffers(op): x.children.add(self)
if not LAZY: self.realize()
@property
def dbuffer(self) -> DeviceBuffer: return Device._buffers[self.device]
def __repr__(self): return f"<LB {self.shape} op:{self.op.op if self.realized is None else 'realized'}>"
# this produces a device buffer
@@ -225,6 +223,7 @@ class LazyBuffer:
def unary_op(x:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, x)
def binary_op(x:LazyBuffer, op:BinaryOps, y:LazyBuffer) -> LazyBuffer: return elementwise_op(op, x, y)
def contiguous_op(x:LazyBuffer) -> LazyBuffer: return x if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
def reduce_op(x:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
return LazyBuffer(x.device, tuple(new_shape), ReduceOps, LazyOp(op, (x,), tuple(new_shape)))