From 68959be05d52d6cf94d782cb366f0aa89898d454 Mon Sep 17 00:00:00 2001
From: George Hotz
Date: Fri, 8 Jul 2022 10:56:48 -0700
Subject: [PATCH] precompute weights for opencl

---
 accel/opencl/ops_opencl.py    | 17 +++++++++--------
 accel/opencl/preprocessing.py |  4 +++-
 tinygrad/ops.py               |  5 ++---
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/accel/opencl/ops_opencl.py b/accel/opencl/ops_opencl.py
index b84aa05e5d..906b04e775 100644
--- a/accel/opencl/ops_opencl.py
+++ b/accel/opencl/ops_opencl.py
@@ -51,16 +51,12 @@ def get_replacements(prg_src:str, opencl_type:List[str]) -> Dict[str, str]:
 def roundup(x, n=4): return (x+(n-1))//n * n
 
 class OpenCLBuffer(GPUBuffer):
-  def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None):
-    super().__init__(shape, hostbuf)
+  def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None, backing:Optional[np.ndarray]=None):
     self._image = hostbuf._image if hostbuf is not None else None
+    super().__init__(shape, hostbuf, backing)
 
   @staticmethod
-  def fromCPU(x):
-    ret = OpenCLBuffer(x.shape)
-    # TODO: this is blocking even though we told it not to
-    CL.enqueue_copy(ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
-    return ret
+  def fromCPU(x): return OpenCLBuffer(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel())
 
   def __repr__(self):
     return f""
@@ -69,8 +65,14 @@ class OpenCLBuffer(GPUBuffer):
     if self._buf is None:
       if self.st.contiguous:
         self._buf = CLBuffer(4*roundup(prod(self.shape)))
+        if self._backing is not None:
+          CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
+          self._backing = None
       if self._image is not None:
         self._buf = CLBuffer(4*roundup(prod(self._image.shape)*4))
+        if self._backing is not None:
+          CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
+          self._backing = None
         #print(f"converting {self.shape} back to buffer, image shape is {self._image.shape}")
         CLProgram("from_image", """
           __kernel void from_image(
@@ -154,7 +156,6 @@ class OpenCLBuffer(GPUBuffer):
         getters.append(f"inline float4 get4_{name}(__global const float *x, const sampler_t smp, int2 loc, int gid) {{"+
          f"return (float4)(get_{name}(x,gid+0), get_{name}(x,gid+1), get_{name}(x,gid+2), get_{name}(x,gid+3)); }}")
       else:
-        print("folded")
         fakebufs.append(name)
         getters.append(f"inline float4 get4_{name}(int gid) {{"+
          f"return (float4)(get_{name}(gid+0), get_{name}(gid+1), get_{name}(gid+2), get_{name}(gid+3)); }}")
diff --git a/accel/opencl/preprocessing.py b/accel/opencl/preprocessing.py
index 568c34cc64..a8ffdc9af6 100644
--- a/accel/opencl/preprocessing.py
+++ b/accel/opencl/preprocessing.py
@@ -56,7 +56,7 @@ def preprocessing_op(x,w,C):
   C = C._replace(out_shape = (C.bs*C.oy, C.ox*C.cout//4, 4))
   #x = contiguous(ctx, x, x.shapetracker) if not x.shapetracker.contiguous else x
   #w = contiguous(ctx, w, w.shapetracker) if not w.shapetracker.contiguous else w
-  return x,w,C
+  return x,w.contiguous_op(),C
 
 def postprocessing_op(ret, C, C_initial):
   added_output_channels = C.rcout - C_initial.rcout
@@ -75,5 +75,7 @@ def postprocessing_op(ret, C, C_initial):
 
 def processed_conv(x, w, C):
   x,w,Cn = preprocessing_op(x,w,C)
+  # precompute the weight
+  w.realize().image
   ret = x.processing_op(ProcessingOps.CONV, w, Cn)
   return postprocessing_op(ret, Cn, C)
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index c9762c5309..65104dbc6b 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -194,14 +194,12 @@ class LazyBuffer:
     self.shape = self.st.shape
     self.optype, self.op = optype, op
     self.realized : Optional[DeviceBuffer] = None
-    self.device = device
+    self.device, self.dbuffer = device, Device._buffers[device]
     self.children : weakref.WeakSet[LazyBuffer] = weakref.WeakSet()
     # NOTE: op should be read only after construction of LazyBuffer
     for x in get_lazybuffers(op): x.children.add(self)
     if not LAZY: self.realize()
 
-  @property
-  def dbuffer(self) -> DeviceBuffer: return Device._buffers[self.device]
   def __repr__(self): return f""
 
   # this produces a device buffer
@@ -225,6 +223,7 @@ class LazyBuffer:
 
   def unary_op(x:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, x)
   def binary_op(x:LazyBuffer, op:BinaryOps, y:LazyBuffer) -> LazyBuffer: return elementwise_op(op, x, y)
+  def contiguous_op(x:LazyBuffer) -> LazyBuffer: return x if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
 
   def reduce_op(x:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
     return LazyBuffer(x.device, tuple(new_shape), ReduceOps, LazyOp(op, (x,), tuple(new_shape)))
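
The core of this patch is the new `backing` argument: `fromCPU` no longer enqueues a host-to-device copy eagerly (the removed TODO notes that copy was blocking anyway), it just stashes the flattened numpy array, and the upload happens the first time the CL buffer is actually materialized. Below is a minimal runnable sketch of that deferred-upload pattern, with a plain numpy array standing in for `CLBuffer`/`CL.enqueue_copy`; the `DeferredBuffer` name and the simulated device are illustrative, not tinygrad's API:

import numpy as np

class DeferredBuffer:
  # sketch: keep the host array as `backing`; allocate and upload on first use
  def __init__(self, shape, backing=None):
    self.shape, self._backing, self._buf = shape, backing, None

  @staticmethod
  def fromCPU(x):
    # no device work here, just stash the flattened float32 host data
    return DeferredBuffer(x.shape, backing=x.astype(np.float32).ravel())

  @property
  def buf(self):
    if self._buf is None:
      # stand-in for CLBuffer(...) + CL.enqueue_copy(...): a real backend
      # would enqueue an async host->device copy here, then drop the backing
      self._buf = np.empty(int(np.prod(self.shape)), np.float32)
      if self._backing is not None:
        self._buf[:] = self._backing
        self._backing = None
    return self._buf

a = DeferredBuffer.fromCPU(np.arange(6).reshape(2, 3))
assert a._buf is None   # nothing uploaded yet
print(a.buf[:3])        # first access triggers the "upload"

In `processed_conv` this pairs with the new `w.realize().image` line: the weight is pushed into its OpenCL image form once, ahead of the first conv, instead of on the critical path of the first run.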
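
`contiguous_op` is the other small but load-bearing addition: it returns the buffer untouched when its ShapeTracker is already contiguous, and otherwise lowers to a `UnaryOps.NOOP` elementwise op, which realizes a fresh contiguous copy. That is why `preprocessing_op` can return `w.contiguous_op()` unconditionally: already-contiguous weights cost nothing extra. A rough numpy analogue of the early-exit idiom (the `contiguous` helper below is a stand-in, not tinygrad code):

import numpy as np

def contiguous(x: np.ndarray) -> np.ndarray:
  # return x untouched when the layout is already contiguous;
  # otherwise pay for one copy, like lowering a NOOP elementwise op
  return x if x.flags["C_CONTIGUOUS"] else np.ascontiguousarray(x)

t = np.arange(12).reshape(3, 4).T   # transposed view: not contiguous
c = contiguous(t)
assert c.flags["C_CONTIGUOUS"] and c is not t
assert contiguous(c) is c           # already contiguous: no copy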