Eliminate LoadOps.FROMCPU (#920)

* Add fromCPU method to init LazyBuffer to eliminate LoadOps.FROMCPU

* squish

* remove failing test

* seems logical

* Revert "seems logical"

This reverts commit bbdcdc8713.

* inline and remove assertion

* fromCPU staticmethod, defer non-cpu device to loadop

* restore test
Author: kposborne2
Date: 2023-06-04 08:55:50 -07:00
Committed by: GitHub
Parent: 3e0b37f050
Commit: 0b88c5f923
4 changed files with 16 additions and 14 deletions
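
The gist of the change, as a minimal sketch of the new flow (assumes the post-change layout, with LazyBuffer in tinygrad.lazy and LoadOps in tinygrad.ops; "GPU" is just an example device):

import numpy as np
from tinygrad.lazy import LazyBuffer
from tinygrad.ops import LoadOps

x = np.arange(6, dtype=np.float32).reshape(2, 3)

# CPU data is wrapped directly: the LazyBuffer is born realized (a RawNumpyBuffer),
# so no FROMCPU load op is ever recorded.
cpu_lb = LazyBuffer.fromCPU(x)
assert cpu_lb.realized is not None and cpu_lb.device == "CPU"

# Moving to a non-CPU device is deferred to an ordinary LoadOps.FROM copy.
gpu_lb = LazyBuffer.loadop(LoadOps.FROM, cpu_lb.shape, cpu_lb.dtype, "GPU", src=cpu_lb)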

@@ -7,6 +7,7 @@ from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, ImageDType, D
 from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
 from tinygrad.ops import Compiled, Interpreted, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_lazyops, get_buffers, map_buffers
 from tinygrad.runtime.lib import RawConst, RawBuffer, RawBufferMapped
+from tinygrad.runtime.ops_cpu import RawNumpyBuffer
 from tinygrad.runtime.ops_disk import RawDiskBuffer
 # lazy can recurse a lot
@@ -73,7 +74,7 @@ def create_lazybuffer(device:str, shape:Union[ShapeTracker, Tuple[int, ...]], op
   st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
   # fromcpu aren't cached
-  if optype == LoadOps and op.op in [LoadOps.FROMCPU, LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST]: return LazyBuffer(device, st, optype, op, dtype)
+  if optype == LoadOps and op.op in [LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST]: return LazyBuffer(device, st, optype, op, dtype)
   #print("create_lazybuffer", device, shape, optype, op, dtype)
@@ -87,16 +88,17 @@ def create_lazybuffer(device:str, shape:Union[ShapeTracker, Tuple[int, ...]], op
 class LazyBuffer:
   __deletable__ = ('op',)
-  def __init__(self, device:str, st:ShapeTracker, optype:OpType, op:LazyOp, dtype:DType):
+  def __init__(self, device:str, st:ShapeTracker, optype:OpType, src:Union[LazyOp, RawBuffer], dtype:DType):
     self.st = st # NOTE: this is not a copy! this should be a "read-only" ShapeTracker
     self.device, self.shape, self.optype, self.dtype = device, self.st.shape, optype, dtype
-    self.op: LazyOp = op
-    self.realized: Optional[RawBuffer] = None
+    self.realized: Optional[RawBuffer] = src if isinstance(src, RawBuffer) else None
     self.output_buffer: Optional[RawBuffer] = None # TODO: do we really need this? or can we just use realized
     # TODO: does children have to be a ref count instead of a set? can a Buffer be a double child?
     self.children: weakref.WeakSet[LazyBuffer] = weakref.WeakSet()
     # NOTE: op should be read only after construction of LazyBuffer
-    for x in get_buffers(op): x.children.add(self)
+    if isinstance(src, LazyOp):
+      self.op: LazyOp = src
+      for x in get_buffers(self.op): x.children.add(self)
     if not LAZY: self.realize()
     # log phantom ops to the graph
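
With src typed as Union[LazyOp, RawBuffer], __init__ now covers two construction paths: a RawBuffer lands directly in self.realized and no op is stored, while a LazyOp is kept as self.op and wired into its sources' children sets. A hypothetical illustration of both paths (normally loadop/fromCPU and create_lazybuffer are the entry points, not direct construction):

import numpy as np
from tinygrad.lazy import LazyBuffer
from tinygrad.ops import LoadOps, LazyOp
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.runtime.ops_cpu import RawNumpyBuffer
from tinygrad.helpers import dtypes

arr = np.ones((4,), dtype=np.float32)

# Path 1: src is a RawBuffer -> the buffer starts out realized, no self.op is set.
realized_lb = LazyBuffer("CPU", ShapeTracker(arr.shape), LoadOps, RawNumpyBuffer.fromCPU(arr), dtypes.float32)
assert realized_lb.realized is not None

# Path 2: src is a LazyOp -> it is stored as self.op and realized only later.
lazy_lb = LazyBuffer("CPU", ShapeTracker((4,)), LoadOps, LazyOp(LoadOps.EMPTY, tuple(), None), dtypes.float32)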
@@ -109,10 +111,7 @@ class LazyBuffer:
   def realize(self:LazyBuffer) -> LazyBuffer:
     if self.realized is None:
       # get real ops first
-      if self.op.op == LoadOps.FROMCPU:
-        if DEBUG >= 4: print(f"copying {self.op.arg.shape}:{dtypes.from_np(self.op.arg.dtype)} -> {self.device}")
-        self.realized = Device[self.device].buffer.fromCPU(self.op.arg, **self._device_extra_args())
-      elif self.op.op == LoadOps.CONTIGUOUS:
+      if self.op.op == LoadOps.CONTIGUOUS:
         realized = self.op.src[0].realize().realized
         if self.op.src[0].st.contiguous and not isinstance(realized, RawConst) and realized.size == prod(self.shape):
           # no need to run an AST, this is already contiguous
@@ -179,6 +178,10 @@ class LazyBuffer:
   def loadop(op, shape, dtype, device, arg=None, src=None) -> LazyBuffer:
     return create_lazybuffer(device, shape, LoadOps, LazyOp(op, tuple() if src is None else (src,), arg), dtype)
+  @staticmethod
+  def fromCPU(x: np.ndarray) -> LazyBuffer:
+    return LazyBuffer("CPU", ShapeTracker(x.shape), LoadOps, RawNumpyBuffer.fromCPU(x), dtypes.from_np(x.dtype))
   # create a constant with the shape and dtype of self
   def const_like(self, val) -> LazyBuffer:
     # NOTE: dtypes.from_np(self.dtype.np) to deal with image types
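
A quick usage check of the new staticmethod (hypothetical values; shape and dtype come from the numpy array, and the buffer is always created on "CPU" — other devices are reached via loadop as above):

import numpy as np
from tinygrad.lazy import LazyBuffer

lb = LazyBuffer.fromCPU(np.zeros((3, 3), dtype=np.float32))
assert lb.device == "CPU" and lb.shape == (3, 3) and lb.realized is not None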

@@ -12,7 +12,7 @@ class UnaryOps(Enum): NOOP = auto(); EXP = auto(); LOG = auto(); CAST = auto();
 class BinaryOps(Enum): ADD = auto(); SUB = auto(); MUL = auto(); DIV = auto(); POW = auto(); CMPEQ = auto(); MAX = auto() # noqa: E702
 class ReduceOps(Enum): SUM = auto(); MAX = auto() # noqa: E702
 class FusedOps(Enum): MULACC = auto() # noqa: E702
-class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); FROMCPU = auto(); CONTIGUOUS = auto(); CUSTOM = auto() # noqa: E702
+class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); CONTIGUOUS = auto(); CUSTOM = auto() # noqa: E702
 Op = Union[UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, FusedOps]
 OpType = Union[Type[UnaryOps], Type[BinaryOps], Type[ReduceOps], Type[MovementOps], Type[LoadOps], Type[FusedOps]]

@@ -36,7 +36,7 @@ class RawBufferMapped(RawBufferCopyIn):
 # this one is simple enough that i moved it out of the runtimes
 class RawMallocBuffer(RawBufferMapped):
-  def __init__(self, size, dtype: DType): super().__init__(size, dtype, ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16, dtypes.int8: ctypes.c_int8, dtypes.uint8: ctypes.c_uint8, dtypes.bool: ctypes.c_uint8, dtypes.int64: ctypes.c_int64}[dtype] * size)())
+  def __init__(self, size, dtype: DType): super().__init__(size, dtype, ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16, dtypes.int8: ctypes.c_int8, dtypes.uint8: ctypes.c_uint8, dtypes.bool: ctypes.c_uint8, dtypes.int32: ctypes.c_int32, dtypes.int64: ctypes.c_int64}[dtype] * size)())
   def _buffer(self): return memoryview(self._buf)
 class RawBufferCopyInOut(RawBufferCopyIn):
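
The runtime change simply extends the ctypes table so a malloc-backed buffer can hold int32 data as well. A small sketch (imports as used in the diff):

from tinygrad.runtime.lib import RawMallocBuffer
from tinygrad.helpers import dtypes

# 16 int32 elements backed by a ctypes array; before this change dtypes.int32 was
# missing from the mapping and construction failed with a KeyError.
buf = RawMallocBuffer(16, dtypes.int32)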

@@ -39,13 +39,12 @@ class Tensor:
     device = Device.canonicalize(device)
     if isinstance(data, (list, tuple)):
       data = np.array(data, dtype=(dtype if dtype is not None else Tensor.default_type).np)
+    if isinstance(data, np.ndarray):
+      data = LazyBuffer.fromCPU(data)
     if isinstance(data, LazyBuffer):
       assert dtype is None or dtype == data.dtype, "dtype doesn't match, and casting isn't supported"
       lazydata = data if data.device == device else LazyBuffer.loadop(LoadOps.FROM, data.shape, data.dtype, device, src=data)
-    elif isinstance(data, np.ndarray):
-      # TODO: create CPUBuffer directly
-      lazydata = LazyBuffer.loadop(LoadOps.FROMCPU, data.shape, dtypes.from_np(data.dtype), device, data)
     elif isinstance(data, (int, float)):
       lazydata = LazyBuffer.loadop(LoadOps.CONST, tuple(), dtype if dtype is not None else Tensor.default_type, device, data)
     else:
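
At the Tensor level nothing changes for users: lists, numpy arrays, and LazyBuffers all funnel into the same LazyBuffer path, numpy data just no longer goes through a FROMCPU load op. A hedged sketch of the constructor's accepted inputs (assuming Tensor lives in tinygrad.tensor):

import numpy as np
from tinygrad.tensor import Tensor

a = Tensor([1.0, 2.0, 3.0])                 # list -> np.array -> LazyBuffer.fromCPU
b = Tensor(np.arange(4, dtype=np.float32))  # ndarray -> LazyBuffer.fromCPU
c = Tensor(3.0)                             # int/float -> LoadOps.CONST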