From 68959be05d52d6cf94d782cb366f0aa89898d454 Mon Sep 17 00:00:00 2001
From: George Hotz
Date: Fri, 8 Jul 2022 10:56:48 -0700
Subject: [PATCH] precompute weights for opencl

---
 accel/opencl/ops_opencl.py    | 17 +++++++++--------
 accel/opencl/preprocessing.py |  4 +++-
 tinygrad/ops.py               |  5 ++---
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/accel/opencl/ops_opencl.py b/accel/opencl/ops_opencl.py
index b84aa05e5d..906b04e775 100644
--- a/accel/opencl/ops_opencl.py
+++ b/accel/opencl/ops_opencl.py
@@ -51,16 +51,12 @@ def get_replacements(prg_src:str, opencl_type:List[str]) -> Dict[str, str]:
 def roundup(x, n=4): return (x+(n-1))//n * n
 
 class OpenCLBuffer(GPUBuffer):
-  def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None):
-    super().__init__(shape, hostbuf)
+  def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None, backing:Optional[np.ndarray]=None):
     self._image = hostbuf._image if hostbuf is not None else None
+    super().__init__(shape, hostbuf, backing)
 
   @staticmethod
-  def fromCPU(x):
-    ret = OpenCLBuffer(x.shape)
-    # TODO: this is blocking even though we told it not to
-    CL.enqueue_copy(ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
-    return ret
+  def fromCPU(x): return OpenCLBuffer(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel())
 
   def __repr__(self):
     return f""
@@ -69,8 +65,14 @@ class OpenCLBuffer(GPUBuffer):
     if self._buf is None:
       if self.st.contiguous:
         self._buf = CLBuffer(4*roundup(prod(self.shape)))
+        if self._backing is not None:
+          CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
+          self._backing = None
       if self._image is not None:
         self._buf = CLBuffer(4*roundup(prod(self._image.shape)*4))
+        if self._backing is not None:
+          CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
+          self._backing = None
         #print(f"converting {self.shape} back to buffer, image shape is {self._image.shape}")
         CLProgram("from_image", """
           __kernel void from_image(
@@ -154,7 +156,6 @@ class OpenCLBuffer(GPUBuffer):
         getters.append(f"inline float4 get4_{name}(__global const float *x, const sampler_t smp, int2 loc, int gid) {{"+
          f"return (float4)(get_{name}(x,gid+0), get_{name}(x,gid+1), get_{name}(x,gid+2), get_{name}(x,gid+3)); }}")
       else:
-        print("folded")
         fakebufs.append(name)
         getters.append(f"inline float4 get4_{name}(int gid) {{"+
          f"return (float4)(get_{name}(gid+0), get_{name}(gid+1), get_{name}(gid+2), get_{name}(gid+3)); }}")
diff --git a/accel/opencl/preprocessing.py b/accel/opencl/preprocessing.py
index 568c34cc64..a8ffdc9af6 100644
--- a/accel/opencl/preprocessing.py
+++ b/accel/opencl/preprocessing.py
@@ -56,7 +56,7 @@ def preprocessing_op(x,w,C):
   C = C._replace(out_shape = (C.bs*C.oy, C.ox*C.cout//4, 4))
   #x = contiguous(ctx, x, x.shapetracker) if not x.shapetracker.contiguous else x
   #w = contiguous(ctx, w, w.shapetracker) if not w.shapetracker.contiguous else w
-  return x,w,C
+  return x,w.contiguous_op(),C
 
 def postprocessing_op(ret, C, C_initial):
   added_output_channels = C.rcout - C_initial.rcout
@@ -75,5 +75,7 @@ def postprocessing_op(ret, C, C_initial):
 
 def processed_conv(x, w, C):
   x,w,Cn = preprocessing_op(x,w,C)
+  # precompute the weight
+  w.realize().image
   ret = x.processing_op(ProcessingOps.CONV, w, Cn)
   return postprocessing_op(ret, Cn, C)
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index c9762c5309..65104dbc6b 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -194,14 +194,12 @@ class LazyBuffer:
     self.shape = self.st.shape
     self.optype, self.op = optype, op
     self.realized : Optional[DeviceBuffer] = None
-    self.device = device
+    self.device, self.dbuffer = device, Device._buffers[device]
     self.children : weakref.WeakSet[LazyBuffer] = weakref.WeakSet()
     # NOTE: op should be read only after construction of LazyBuffer
     for x in get_lazybuffers(op): x.children.add(self)
     if not LAZY: self.realize()
 
-  @property
-  def dbuffer(self) -> DeviceBuffer: return Device._buffers[self.device]
   def __repr__(self): return f""
 
   # this produces a device buffer
@@ -225,6 +223,7 @@ class LazyBuffer:
 
   def unary_op(x:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, x)
   def binary_op(x:LazyBuffer, op:BinaryOps, y:LazyBuffer) -> LazyBuffer: return elementwise_op(op, x, y)
+  def contiguous_op(x:LazyBuffer) -> LazyBuffer: return x if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
 
   def reduce_op(x:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
     return LazyBuffer(x.device, tuple(new_shape), ReduceOps, LazyOp(op, (x,), tuple(new_shape)))
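
The core of this patch is the new `backing` argument: `fromCPU` no longer enqueues a host-to-device copy eagerly (the removed TODO notes that copy was blocking anyway), it just stashes the flattened numpy array, and the upload happens the first time the CL buffer is actually materialized. Below is a minimal runnable sketch of that deferred-upload pattern, with a plain numpy array standing in for `CLBuffer`/`CL.enqueue_copy`; the `DeferredBuffer` name and the simulated device are illustrative, not tinygrad's API:

import numpy as np

class DeferredBuffer:
  # sketch: keep the host array as `backing`; allocate and upload on first use
  def __init__(self, shape, backing=None):
    self.shape, self._backing, self._buf = shape, backing, None

  @staticmethod
  def fromCPU(x):
    # no device work here, just stash the flattened float32 host data
    return DeferredBuffer(x.shape, backing=x.astype(np.float32).ravel())

  @property
  def buf(self):
    if self._buf is None:
      # stand-in for CLBuffer(...) + CL.enqueue_copy(...): a real backend
      # would enqueue an async host->device copy here, then drop the backing
      self._buf = np.empty(int(np.prod(self.shape)), np.float32)
      if self._backing is not None:
        self._buf[:] = self._backing
        self._backing = None
    return self._buf

a = DeferredBuffer.fromCPU(np.arange(6).reshape(2, 3))
assert a._buf is None   # nothing uploaded yet
print(a.buf[:3])        # first access triggers the "upload"

In `processed_conv` this pairs with the new `w.realize().image` line: the weight is pushed into its OpenCL image form once, ahead of the first conv, instead of on the critical path of the first run.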
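
`contiguous_op` is the other small but load-bearing addition: it returns the buffer untouched when its ShapeTracker is already contiguous, and otherwise lowers to a `UnaryOps.NOOP` elementwise op, which realizes a fresh contiguous copy. That is why `preprocessing_op` can return `w.contiguous_op()` unconditionally: already-contiguous weights cost nothing extra. A rough numpy analogue of the early-exit idiom (the `contiguous` helper below is a stand-in, not tinygrad code):

import numpy as np

def contiguous(x: np.ndarray) -> np.ndarray:
  # return x untouched when the layout is already contiguous;
  # otherwise pay for one copy, like lowering a NOOP elementwise op
  return x if x.flags["C_CONTIGUOUS"] else np.ascontiguousarray(x)

t = np.arange(12).reshape(3, 4).T   # transposed view: not contiguous
c = contiguous(t)
assert c.flags["C_CONTIGUOUS"] and c is not t
assert contiguous(c) is c           # already contiguous: no copy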