From 0b88c5f9236bd6774663ed74ff1d0a07125f5270 Mon Sep 17 00:00:00 2001
From: kposborne2 <53231580+kposborne2@users.noreply.github.com>
Date: Sun, 4 Jun 2023 08:55:50 -0700
Subject: [PATCH] Eliminate LoadOps.FROMCPU (#920)

* Add fromCPU method to init LazyBuffer to eliminate LoadOps.FROMCPU

* squish

* remove failing test

* seems logical

* Revert "seems logical"

This reverts commit bbdcdc8713f60725012d9f1602e18de4bdd3a4ed.

* inline and remove assertion

* fromCPU staticmethod, defer non-cpu device to loadop

* restore test
---
 tinygrad/lazy.py        | 21 ++++++++++++---------
 tinygrad/ops.py         |  2 +-
 tinygrad/runtime/lib.py |  2 +-
 tinygrad/tensor.py      |  5 ++---
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index 74bbc2bade..b17988c560 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -7,6 +7,7 @@ from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, ImageDType, D
 from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
 from tinygrad.ops import Compiled, Interpreted, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_lazyops, get_buffers, map_buffers
 from tinygrad.runtime.lib import RawConst, RawBuffer, RawBufferMapped
+from tinygrad.runtime.ops_cpu import RawNumpyBuffer
 from tinygrad.runtime.ops_disk import RawDiskBuffer
 
 # lazy can recurse a lot
@@ -73,7 +74,7 @@ def create_lazybuffer(device:str, shape:Union[ShapeTracker, Tuple[int, ...]], op
   st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
 
   # fromcpu aren't cached
-  if optype == LoadOps and op.op in [LoadOps.FROMCPU, LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST]: return LazyBuffer(device, st, optype, op, dtype)
+  if optype == LoadOps and op.op in [LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST]: return LazyBuffer(device, st, optype, op, dtype)
 
   #print("create_lazybuffer", device, shape, optype, op, dtype)
 
@@ -87,16 +88,17 @@ def create_lazybuffer(device:str, shape:Union[ShapeTracker, Tuple[int, ...]], op
 
 class LazyBuffer:
   __deletable__ = ('op',)
-  def __init__(self, device:str, st:ShapeTracker, optype:OpType, op:LazyOp, dtype:DType):
+  def __init__(self, device:str, st:ShapeTracker, optype:OpType, src:Union[LazyOp, RawBuffer], dtype:DType):
     self.st = st  # NOTE: this is not a copy! this should be a "read-only" ShapeTracker
     self.device, self.shape, self.optype, self.dtype = device, self.st.shape, optype, dtype
-    self.op: LazyOp = op
-    self.realized: Optional[RawBuffer] = None
+    self.realized: Optional[RawBuffer] = src if isinstance(src, RawBuffer) else None
     self.output_buffer: Optional[RawBuffer] = None   # TODO: do we really need this? or can we just use realized
     # TODO: does children have to be a ref count instead of a set? can a Buffer be a double child?
     self.children: weakref.WeakSet[LazyBuffer] = weakref.WeakSet()
     # NOTE: op should be read only after construction of LazyBuffer
-    for x in get_buffers(op): x.children.add(self)
+    if isinstance(src, LazyOp):
+      self.op: LazyOp = src
+      for x in get_buffers(self.op): x.children.add(self)
     if not LAZY: self.realize()
 
     # log phantom ops to the graph
@@ -109,10 +111,7 @@ class LazyBuffer:
   def realize(self:LazyBuffer) -> LazyBuffer:
     if self.realized is None:
       # get real ops first
-      if self.op.op == LoadOps.FROMCPU:
-        if DEBUG >= 4: print(f"copying {self.op.arg.shape}:{dtypes.from_np(self.op.arg.dtype)} -> {self.device}")
-        self.realized = Device[self.device].buffer.fromCPU(self.op.arg, **self._device_extra_args())
-      elif self.op.op == LoadOps.CONTIGUOUS:
+      if self.op.op == LoadOps.CONTIGUOUS:
         realized = self.op.src[0].realize().realized
         if self.op.src[0].st.contiguous and not isinstance(realized, RawConst) and realized.size == prod(self.shape):
           # no need to run an AST, this is already contiguous
@@ -179,6 +178,10 @@ class LazyBuffer:
   def loadop(op, shape, dtype, device, arg=None, src=None) -> LazyBuffer:
     return create_lazybuffer(device, shape, LoadOps, LazyOp(op, tuple() if src is None else (src,), arg), dtype)
 
+  @staticmethod
+  def fromCPU(x: np.ndarray) -> LazyBuffer:
+    return LazyBuffer("CPU", ShapeTracker(x.shape), LoadOps, RawNumpyBuffer.fromCPU(x), dtypes.from_np(x.dtype))
+
   # create a constant with the shape and dtype of self
   def const_like(self, val) -> LazyBuffer:
     # NOTE: dtypes.from_np(self.dtype.np) to deal with image types
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index f2cd5e0a49..720b22cafd 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -12,7 +12,7 @@ class UnaryOps(Enum): NOOP = auto(); EXP = auto(); LOG = auto(); CAST = auto();
 class BinaryOps(Enum): ADD = auto(); SUB = auto(); MUL = auto(); DIV = auto(); POW = auto(); CMPEQ = auto(); MAX = auto() # noqa: E702
 class ReduceOps(Enum): SUM = auto(); MAX = auto() # noqa: E702
 class FusedOps(Enum): MULACC = auto() # noqa: E702
-class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); FROMCPU = auto(); CONTIGUOUS = auto(); CUSTOM = auto() # noqa: E702
+class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); CONTIGUOUS = auto(); CUSTOM = auto() # noqa: E702
 
 Op = Union[UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, FusedOps]
 OpType = Union[Type[UnaryOps], Type[BinaryOps], Type[ReduceOps], Type[MovementOps], Type[LoadOps], Type[FusedOps]]
diff --git a/tinygrad/runtime/lib.py b/tinygrad/runtime/lib.py
index d29d81b006..313e13b0a4 100644
--- a/tinygrad/runtime/lib.py
+++ b/tinygrad/runtime/lib.py
@@ -36,7 +36,7 @@ class RawBufferMapped(RawBufferCopyIn):
 
 # this one is simple enough that i moved it out of the runtimes
 class RawMallocBuffer(RawBufferMapped):
-  def __init__(self, size, dtype: DType): super().__init__(size, dtype, ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16, dtypes.int8: ctypes.c_int8, dtypes.uint8: ctypes.c_uint8, dtypes.bool: ctypes.c_uint8, dtypes.int64: ctypes.c_int64}[dtype] * size)())
+  def __init__(self, size, dtype: DType): super().__init__(size, dtype, ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16, dtypes.int8: ctypes.c_int8, dtypes.uint8: ctypes.c_uint8, dtypes.bool: ctypes.c_uint8, dtypes.int32: ctypes.c_int32, dtypes.int64: ctypes.c_int64}[dtype] * size)())
   def _buffer(self): return memoryview(self._buf)
 
 class RawBufferCopyInOut(RawBufferCopyIn):
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index d7bd9724e7..877bc03b7a 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -39,13 +39,12 @@ class Tensor:
     device = Device.canonicalize(device)
     if isinstance(data, (list, tuple)):
       data = np.array(data, dtype=(dtype if dtype is not None else Tensor.default_type).np)
+    if isinstance(data, np.ndarray):
+      data = LazyBuffer.fromCPU(data)
 
     if isinstance(data, LazyBuffer):
       assert dtype is None or dtype == data.dtype, "dtype doesn't match, and casting isn't supported"
       lazydata = data if data.device == device else LazyBuffer.loadop(LoadOps.FROM, data.shape, data.dtype, device, src=data)
-    elif isinstance(data, np.ndarray):
-      # TODO: create CPUBuffer directly
-      lazydata = LazyBuffer.loadop(LoadOps.FROMCPU, data.shape, dtypes.from_np(data.dtype), device, data)
     elif isinstance(data, (int, float)):
       lazydata = LazyBuffer.loadop(LoadOps.CONST, tuple(), dtype if dtype is not None else Tensor.default_type, device, data)
     else: