might fix tests

2026-01-23 22:08:08 -05:00 · 2022-10-20 16:27:11 -07:00
parent fd6ba8e7ac
commit 9f8c414589
4 changed files with 13 additions and 9 deletions
--- a/accel/opencl/ops_opencl.py
+++ b/accel/opencl/ops_opencl.py
@@ -141,6 +141,7 @@ class OpenCLBuffer(GPUBuffer):
  }
  def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None, backing:Optional[np.ndarray]=None):
    self._image = hostbuf._image if hostbuf is not None else None
+    self.copied_backing = False
    super().__init__(shape, hostbuf, backing)
    assert not (self._image and self._buf)

@@ -152,18 +153,18 @@ class OpenCLBuffer(GPUBuffer):
  @property
  def cl(self):
    if self._buf is None:
-      if self._backing is not None:
+      if self._backing is not None and not self.copied_backing:
        self._buf = CLBuffer(4*roundup(prod(self._backing.shape)))
        CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
-        self._backing = None
+        self.copied_backing = True
      elif self.st.contiguous:
        self._buf = CLBuffer(4*roundup(prod(self.shape)))

      if self._image is not None:
        self._buf = CLBuffer(4*roundup(prod(self._image.shape)*4))
-        if self._backing is not None:
+        if self._backing is not None and not self.copied_backing:
          CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
-          self._backing = None
+          self.copied_backing = True
        #print(f"converting {self.shape} back to buffer, image shape is {self._image.shape}")
        CLProgram("from_image", f"""
          __kernel void from_image(
@@ -187,6 +188,7 @@ class OpenCLBuffer(GPUBuffer):
  def image(self):
    if self._image is None:
      assert len(self.shape) == 3 and self.shape[2] == 4, f"bad shape for image {self.shape}"
+      assert self.st.contiguous, f"{self} is not contiguous"
      self._image = CLImage(shape=(self.shape[1], self.shape[0]))
      if self._buf is not None:
        assert prod(self.shape) <= prod(self._image.cl.shape)*4
--- a/accel/opencl/preprocessing.py
+++ b/accel/opencl/preprocessing.py
@@ -57,8 +57,8 @@ def preprocessing_op(x,w,C):
  #x = contiguous(ctx, x, x.shapetracker) if not x.shapetracker.contiguous else x
  #w = contiguous(ctx, w, w.shapetracker) if not w.shapetracker.contiguous else w

-  # okay but unneeded
-  #x = x.contiguous_op()
+  # contiguous before image, always
+  x = x.contiguous_op()
  w = w.contiguous_op()

  # early realize on the weights
--- a/tinygrad/llops/ops_gpu.py
+++ b/tinygrad/llops/ops_gpu.py
@@ -125,7 +125,7 @@ class GPUBuffer:

  def unary_op(x, op:UnaryOps): return type(x)(x.shape)._processing_op([("A", x)], GPUBuffer.code_for_op[op])
  def binary_op(x, op:BinaryOps, y:GPUBuffer): return type(x)(x.shape)._processing_op([("A", x), ("B", y)], GPUBuffer.code_for_op[op])
-  def contiguous_op(x): return type(x)(x.shape, x, x._backing) if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
+  def contiguous_op(x): return x if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
  def movement_op(x, op:MovementOps, arg) -> GPUBuffer: return type(x)(ShapeTracker(x.st).movement_op(op, arg), x)
  def reduce_op(x, op:ReduceOps, new_shape:Tuple[int, ...]): return type(x)(new_shape)._processing_op([("A", x)], code="acc", earlycode=GPUBuffer.code_for_op[op], earlybufs=set("A"), op=op)

--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -124,7 +124,8 @@ def _realize_loadops(self:LazyBuffer) -> Tuple[DeviceBuffer, List[DeviceBuffer],
    return Device._buffers[self.device].fromCPU(self.op.arg), [], LoadOps
  elif self.op.op == LoadOps.CONTIGUOUS:
    real_src = self.op.src[0].realize(self.device)
-    return real_src.contiguous_op(), [real_src], LoadOps
+    ret = real_src.contiguous_op()
+    return ret, [real_src], LoadOps if ret != real_src else None
  else:
    assert NotImplementedError(f"unknown LoadOp {self.op.op}")

@@ -260,7 +261,8 @@ class LazyBuffer:
      # we haven't realized the Buffer yet
      self.realized, real_srcs, real_type = _realize[self.optype](self)
      # in lazy mode, we don't log until we realize
-      log_op(real_type, [x.op for x in get_lazyops(self.op)], self.realized, real_srcs)
+      if real_type is not None:
+        log_op(real_type, [x.op for x in get_lazyops(self.op)], self.realized, real_srcs)
      # no need to keep the op after realization
      del self.op