diff --git a/accel/opencl/ops_opencl.py b/accel/opencl/ops_opencl.py
index 81a208ee02..7c684ca21e 100644
--- a/accel/opencl/ops_opencl.py
+++ b/accel/opencl/ops_opencl.py
@@ -141,6 +141,7 @@ class OpenCLBuffer(GPUBuffer):
   }
   def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None, backing:Optional[np.ndarray]=None):
     self._image = hostbuf._image if hostbuf is not None else None
+    self.copied_backing = False
     super().__init__(shape, hostbuf, backing)
     assert not (self._image and self._buf)
@@ -152,18 +153,18 @@ class OpenCLBuffer(GPUBuffer):
   @property
   def cl(self):
     if self._buf is None:
-      if self._backing is not None:
+      if self._backing is not None and not self.copied_backing:
         self._buf = CLBuffer(4*roundup(prod(self._backing.shape)))
         CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
-        self._backing = None
+        self.copied_backing = True
       elif self.st.contiguous:
         self._buf = CLBuffer(4*roundup(prod(self.shape)))
     if self._image is not None:
       self._buf = CLBuffer(4*roundup(prod(self._image.shape)*4))
-      if self._backing is not None:
+      if self._backing is not None and not self.copied_backing:
         CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
-        self._backing = None
+        self.copied_backing = True
       #print(f"converting {self.shape} back to buffer, image shape is {self._image.shape}")
       CLProgram("from_image", f"""
         __kernel void from_image(
@@ -187,6 +188,7 @@ class OpenCLBuffer(GPUBuffer):
   def image(self):
     if self._image is None:
       assert len(self.shape) == 3 and self.shape[2] == 4, f"bad shape for image {self.shape}"
+      assert self.st.contiguous, f"{self} is not contiguous"
       self._image = CLImage(shape=(self.shape[1], self.shape[0]))
       if self._buf is not None:
         assert prod(self.shape) <= prod(self._image.cl.shape)*4
diff --git a/accel/opencl/preprocessing.py b/accel/opencl/preprocessing.py
index 611c700a1f..4cc306aab8 100644
--- a/accel/opencl/preprocessing.py
+++ b/accel/opencl/preprocessing.py
@@ -57,8 +57,8 @@ def preprocessing_op(x,w,C):
   #x = contiguous(ctx, x, x.shapetracker) if not x.shapetracker.contiguous else x
   #w = contiguous(ctx, w, w.shapetracker) if not w.shapetracker.contiguous else w
 
-  # okay but unneeded
-  #x = x.contiguous_op()
+  # contiguous before image, always
+  x = x.contiguous_op()
   w = w.contiguous_op()
 
   # early realize on the weights
diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py
index b5b658289c..4d9f7cb47c 100644
--- a/tinygrad/llops/ops_gpu.py
+++ b/tinygrad/llops/ops_gpu.py
@@ -125,7 +125,7 @@ class GPUBuffer:
   def unary_op(x, op:UnaryOps): return type(x)(x.shape)._processing_op([("A", x)], GPUBuffer.code_for_op[op])
   def binary_op(x, op:BinaryOps, y:GPUBuffer): return type(x)(x.shape)._processing_op([("A", x), ("B", y)], GPUBuffer.code_for_op[op])
-  def contiguous_op(x): return type(x)(x.shape, x, x._backing) if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
+  def contiguous_op(x): return x if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
   def movement_op(x, op:MovementOps, arg) -> GPUBuffer: return type(x)(ShapeTracker(x.st).movement_op(op, arg), x)
   def reduce_op(x, op:ReduceOps, new_shape:Tuple[int, ...]): return type(x)(new_shape)._processing_op([("A", x)], code="acc", earlycode=GPUBuffer.code_for_op[op], earlybufs=set("A"), op=op)
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index bf8fe115ff..bf82325d67 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -124,7 +124,8 @@ def _realize_loadops(self:LazyBuffer) -> Tuple[DeviceBuffer, List[DeviceBuffer],
     return Device._buffers[self.device].fromCPU(self.op.arg), [], LoadOps
   elif self.op.op == LoadOps.CONTIGUOUS:
     real_src = self.op.src[0].realize(self.device)
-    return real_src.contiguous_op(), [real_src], LoadOps
+    ret = real_src.contiguous_op()
+    return ret, [real_src], LoadOps if ret != real_src else None
   else:
     assert NotImplementedError(f"unknown LoadOp {self.op.op}")
@@ -260,7 +261,8 @@ class LazyBuffer:
       # we haven't realized the Buffer yet
       self.realized, real_srcs, real_type = _realize[self.optype](self)
       # in lazy mode, we don't log until we realize
-      log_op(real_type, [x.op for x in get_lazyops(self.op)], self.realized, real_srcs)
+      if real_type is not None:
+        log_op(real_type, [x.op for x in get_lazyops(self.op)], self.realized, real_srcs)
       # no need to keep the op after realization
       del self.op