From d449b3bef1b5c876e846accde27792d9828c0a52 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Wed, 4 Oct 2023 07:18:58 -0700
Subject: [PATCH] think about removing realize from lazybuffer (#1965)

* remove realize from lazybuffer

* okay fine, back that off

* fix tests maybe

* fix test
---
 test/test_lazybuffer.py       |  2 +-
 tinygrad/lazy.py              | 11 +----------
 tinygrad/realize.py           |  7 +++++--
 tinygrad/runtime/ops_torch.py |  2 +-
 tinygrad/tensor.py            |  9 ++++++---
 5 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/test/test_lazybuffer.py b/test/test_lazybuffer.py
index 27f978c64b..4800a649ae 100644
--- a/test/test_lazybuffer.py
+++ b/test/test_lazybuffer.py
@@ -18,7 +18,7 @@ class TestLazyBuffer(unittest.TestCase):
       b = LazyBuffer.fromCPU(a).realize()
       #assert b.st.contiguous == a.flags.c_contiguous
       assert b.st.shape == a.shape
-      np.testing.assert_equal(a, b.toCPU())
+      np.testing.assert_equal(a, Tensor(b).numpy())
 
     for ndims in range(1, 4):
       a = np.random.randn(*(4,)*ndims).astype(np.float32)
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index f5dcb93281..c8446869c5 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -4,7 +4,7 @@ from typing import Callable, Optional, Tuple, Union, List, Dict, Any, cast, Mapping
 from weakref import ref, WeakSet, WeakValueDictionary
 import numpy as np
 
-from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, ImageDType, partition, all_int, dedup, merge_dicts
+from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, ImageDType, partition, dedup, merge_dicts
 from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, MemBuffer, ConstBuffer, BufferOps
 from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
 from tinygrad.shape.symbolic import Variable, sint
@@ -223,15 +223,6 @@ class LazyBuffer:
   def fromCPU(x: np.ndarray) -> LazyBuffer:
     return LazyBuffer("CPU", ShapeTracker.from_shape(x.shape), LoadOps, None, dtypes.from_np(x.dtype), {}, RawNumpyBuffer.fromCPU(x))
 
-  def prepare_transfer(self):
-    self_casted = self.e(UnaryOps.CAST, arg=(dtypes.from_np(self.dtype.np), False)) if dtypes.from_np(self.dtype.np) != self.dtype else self
-    return self_casted.contiguous().realize().realized
-
-  def toCPU(self) -> np.ndarray:
-    assert self.dtype.np, f"{self.dtype} is not supported in toCPU"
-    assert all_int(self.shape), f"no toCPU if shape is symbolic, {self.shape=}"
-    return cast(RawBuffer, self.prepare_transfer()).toCPU().reshape(self.shape)
-
   # *** elementwise ops ***
 
   def e(self:LazyBuffer, op:Union[UnaryOps, BinaryOps, TernaryOps], *srcs:LazyBuffer, arg:Optional[Any]=None) -> LazyBuffer:
diff --git a/tinygrad/realize.py b/tinygrad/realize.py
index 19829dfbcf..9c53aa413d 100644
--- a/tinygrad/realize.py
+++ b/tinygrad/realize.py
@@ -47,16 +47,19 @@ def _realize_contiguous(buffer: LazyBuffer, src: LazyBuffer) -> None:
   assert buffer.dtype == src.dtype, f"contiguous dtype mismatch, expecting {buffer.dtype}, got {src.dtype}"
 
 def _realize_from(buffer: LazyBuffer, src: LazyBuffer) -> None:
+  assert src.realized.size == buffer.st.size(), f"size mismatch on FROM {src.realized.size} != {buffer.st.size()}"
+  assert src.st.contiguous and buffer.st.contiguous, "all must be contiguous for from"
   if DEBUG >= 3: print(f"*** copy {buffer.device} <- {src.device} size {src.realized.size} dtype {src.realized.dtype}")
   # TODO: make this generic
   if isinstance(src.realized, RawDiskBuffer) and issubclass(Device[buffer.device].buffer, RawBufferMapped):
     assert all_int(buffer.shape), "does not support symbolic shape"
     buffer.realized = Device[buffer.device].buffer(prod(buffer.shape), buffer.dtype, **buffer._device_extra_args())
-    src.prepare_transfer().readinto(cast(RawBufferMapped, buffer.realized)._buffer())
+    src.realized.readinto(cast(RawBufferMapped, buffer.realized)._buffer())
   elif isinstance(src.realized, RawBufferTransfer) and issubclass(Device[buffer.device].buffer, RawBufferTransfer) and P2P >= 1:
     buffer.realized = cast(RawBufferTransfer, Device[buffer.device].buffer).transfer(src.realized, buffer.shape, buffer.dtype, **buffer._device_extra_args())
   else:
-    buffer.realized = Device[buffer.device].buffer.fromCPU(src.toCPU(), **buffer._device_extra_args())
+    # TODO: schedule this as FROM to go to CPU, and a FROM to go to device
+    buffer.realized = Device[buffer.device].buffer.fromCPU(src.realized.toCPU(), **buffer._device_extra_args())
 
 # *** n op LoadOps ***
 
diff --git a/tinygrad/runtime/ops_torch.py b/tinygrad/runtime/ops_torch.py
index 38c380818a..fe5d2f3a05 100644
--- a/tinygrad/runtime/ops_torch.py
+++ b/tinygrad/runtime/ops_torch.py
@@ -31,7 +31,7 @@ class RawTorchBuffer(RawBuffer):
   def __init__(self, size:int, dtype:DType, buf:Optional[torch.Tensor]=None): super().__init__(size, dtype, buf if buf is not None else torch.empty([size], dtype=inverse_type_map[dtype]))
   @classmethod
   def fromCPU(cls, x):
-    buf = torch.from_numpy(x).requires_grad_(False).to(device)
+    buf = torch.from_numpy(x if all(s>=0 for s in x.strides) else x.copy()).requires_grad_(False).to(device)
     return cls(prod(x.shape), type_map[buf.dtype], buf)
   def toCPU(self): return self._buf.cpu().numpy()
 TorchBuffer = Interpreted(RawTorchBuffer, torch_fxn_for_op, from_underlying=lambda x: RawTorchBuffer(prod(x.shape), type_map[x.dtype], x))
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 218089ddfc..863e33e219 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -97,7 +97,7 @@ class Tensor:
     # TODO: this is a hack for writing to DISK
     if self.device.startswith("DISK"):
       if x.__class__ is not Tensor: x = Tensor(x, device="CPU", dtype=self.dtype)
-      self.lazydata.contiguous().realize().realized._copyin(x.numpy()) # type: ignore
+      self.contiguous().realize().lazydata.realized._copyin(x.numpy()) # type: ignore
       return self
     if x.__class__ is not Tensor: x = Tensor(x, device=self.device, dtype=self.dtype)
     assert self.shape == x.shape and self.device == x.device, f"assign shape mismatch {self.shape} != {x.shape} or device mismatch {self.device} != {x.device}"
@@ -107,8 +107,11 @@ class Tensor:
     self.lazydata = x.lazydata
     return self
 
-  def detach(self): return Tensor(self.lazydata, device=self.device, requires_grad=False)
-  def numpy(self) -> np.ndarray: return self.lazydata.toCPU()
+  def detach(self) -> Tensor: return Tensor(self.lazydata, device=self.device, requires_grad=False)
+  def numpy(self) -> np.ndarray:
+    assert all_int(self.shape), f"no numpy if shape is symbolic, {self.shape=}"
+    assert self.dtype.np is not None, f"no numpy dtype for {self.dtype}"
+    return self.detach().cast(dtypes.from_np(self.dtype.np)).contiguous().to('CPU').realize().lazydata.realized._buf.reshape(self.shape)
 
   # TODO: if things are realized this won't work
   def to_(self, device:str):
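
A minimal usage sketch of the new user-facing path, assuming a tinygrad checkout at this commit: Tensor.numpy() now owns the detach/cast/contiguous/to('CPU')/realize pipeline that previously lived in LazyBuffer.toCPU and LazyBuffer.prepare_transfer.

    import numpy as np
    from tinygrad.tensor import Tensor

    # Realization and the device->CPU copy now happen inside
    # Tensor.numpy(), not on the LazyBuffer itself.
    a = np.random.randn(4, 4).astype(np.float32)
    t = Tensor(a)
    out = (t + 1).numpy()  # detach -> cast -> contiguous -> to('CPU') -> realize -> reshape
    np.testing.assert_allclose(out, a + 1, rtol=1e-6)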
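
The ops_torch.py change guards torch.from_numpy against NumPy arrays with negative strides (which can arise from flipped or reverse-strided views); torch rejects those with a ValueError, so the patch falls back to a contiguous copy. A small illustration of the failure mode, assuming torch and numpy are installed:

    import numpy as np
    import torch

    a = np.arange(6, dtype=np.float32)[::-1]  # negative-stride view
    try:
        torch.from_numpy(a)                   # raises: negative strides unsupported
    except ValueError:
        pass
    # The patched fromCPU copies first, so the conversion succeeds:
    t = torch.from_numpy(a if all(s >= 0 for s in a.strides) else a.copy())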