diff --git a/accel/opencl/ops_opencl.py b/accel/opencl/ops_opencl.py
index 81a208ee02..7c684ca21e 100644
--- a/accel/opencl/ops_opencl.py
+++ b/accel/opencl/ops_opencl.py
@@ -141,6 +141,7 @@ class OpenCLBuffer(GPUBuffer):
   }
   def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None, backing:Optional[np.ndarray]=None):
     self._image = hostbuf._image if hostbuf is not None else None
+    self.copied_backing = False
     super().__init__(shape, hostbuf, backing)
     assert not (self._image and self._buf)
@@ -152,18 +153,18 @@ class OpenCLBuffer(GPUBuffer):
   @property
   def cl(self):
     if self._buf is None:
-      if self._backing is not None:
+      if self._backing is not None and not self.copied_backing:
         self._buf = CLBuffer(4*roundup(prod(self._backing.shape)))
         CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
-        self._backing = None
+        self.copied_backing = True
       elif self.st.contiguous:
         self._buf = CLBuffer(4*roundup(prod(self.shape)))
     if self._image is not None:
       self._buf = CLBuffer(4*roundup(prod(self._image.shape)*4))
-      if self._backing is not None:
+      if self._backing is not None and not self.copied_backing:
         CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
-        self._backing = None
+        self.copied_backing = True
       #print(f"converting {self.shape} back to buffer, image shape is {self._image.shape}")
       CLProgram("from_image", f"""
         __kernel void from_image(
@@ -187,6 +188,7 @@ class OpenCLBuffer(GPUBuffer):
   def image(self):
     if self._image is None:
       assert len(self.shape) == 3 and self.shape[2] == 4, f"bad shape for image {self.shape}"
+      assert self.st.contiguous, f"{self} is not contiguous"
       self._image = CLImage(shape=(self.shape[1], self.shape[0]))
       if self._buf is not None:
         assert prod(self.shape) <= prod(self._image.cl.shape)*4
diff --git a/accel/opencl/preprocessing.py b/accel/opencl/preprocessing.py
index 611c700a1f..4cc306aab8 100644
--- a/accel/opencl/preprocessing.py
+++ b/accel/opencl/preprocessing.py
@@ -57,8 +57,8 @@ def preprocessing_op(x,w,C):
   #x = contiguous(ctx, x, x.shapetracker) if not x.shapetracker.contiguous else x
   #w = contiguous(ctx, w, w.shapetracker) if not w.shapetracker.contiguous else w
 
-  # okay but unneeded
-  #x = x.contiguous_op()
+  # contiguous before image, always
+  x = x.contiguous_op()
   w = w.contiguous_op()
 
   # early realize on the weights
diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py
index b5b658289c..4d9f7cb47c 100644
--- a/tinygrad/llops/ops_gpu.py
+++ b/tinygrad/llops/ops_gpu.py
@@ -125,7 +125,7 @@ class GPUBuffer:
   def unary_op(x, op:UnaryOps): return type(x)(x.shape)._processing_op([("A", x)], GPUBuffer.code_for_op[op])
   def binary_op(x, op:BinaryOps, y:GPUBuffer): return type(x)(x.shape)._processing_op([("A", x), ("B", y)], GPUBuffer.code_for_op[op])
-  def contiguous_op(x): return type(x)(x.shape, x, x._backing) if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
+  def contiguous_op(x): return x if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
   def movement_op(x, op:MovementOps, arg) -> GPUBuffer: return type(x)(ShapeTracker(x.st).movement_op(op, arg), x)
   def reduce_op(x, op:ReduceOps, new_shape:Tuple[int, ...]): return type(x)(new_shape)._processing_op([("A", x)], code="acc", earlycode=GPUBuffer.code_for_op[op], earlybufs=set("A"), op=op)
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index bf8fe115ff..bf82325d67 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -124,7 +124,8 @@ def _realize_loadops(self:LazyBuffer) -> Tuple[DeviceBuffer, List[DeviceBuffer],
     return Device._buffers[self.device].fromCPU(self.op.arg), [], LoadOps
   elif self.op.op == LoadOps.CONTIGUOUS:
     real_src = self.op.src[0].realize(self.device)
-    return real_src.contiguous_op(), [real_src], LoadOps
+    ret = real_src.contiguous_op()
+    return ret, [real_src], LoadOps if ret != real_src else None
   else:
     assert NotImplementedError(f"unknown LoadOp {self.op.op}")
@@ -260,7 +261,8 @@ class LazyBuffer:
       # we haven't realized the Buffer yet
       self.realized, real_srcs, real_type = _realize[self.optype](self)
       # in lazy mode, we don't log until we realize
-      log_op(real_type, [x.op for x in get_lazyops(self.op)], self.realized, real_srcs)
+      if real_type is not None:
+        log_op(real_type, [x.op for x in get_lazyops(self.op)], self.realized, real_srcs)
       # no need to keep the op after realization
       del self.op