From 0b88c5f9236bd6774663ed74ff1d0a07125f5270 Mon Sep 17 00:00:00 2001 From: kposborne2 <53231580+kposborne2@users.noreply.github.com> Date: Sun, 4 Jun 2023 08:55:50 -0700 Subject: [PATCH] Eliminate LoadOps.FROMCPU (#920) * Add fromCPU method to init LazyBuffer to eliminate LoadOps.FROMCPU * squish * remove failing test * seems logical * Revert "seems logical" This reverts commit bbdcdc8713f60725012d9f1602e18de4bdd3a4ed. * inline and remove assertion * fromCPU staticmethod, defer non-cpu device to loadop * restore test --- tinygrad/lazy.py | 21 ++++++++++++--------- tinygrad/ops.py | 2 +- tinygrad/runtime/lib.py | 2 +- tinygrad/tensor.py | 5 ++--- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py index 74bbc2bade..b17988c560 100644 --- a/tinygrad/lazy.py +++ b/tinygrad/lazy.py @@ -7,6 +7,7 @@ from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, ImageDType, D from tinygrad.shape.shapetracker import ShapeTracker, get_contraction from tinygrad.ops import Compiled, Interpreted, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_lazyops, get_buffers, map_buffers from tinygrad.runtime.lib import RawConst, RawBuffer, RawBufferMapped +from tinygrad.runtime.ops_cpu import RawNumpyBuffer from tinygrad.runtime.ops_disk import RawDiskBuffer # lazy can recurse a lot @@ -73,7 +74,7 @@ def create_lazybuffer(device:str, shape:Union[ShapeTracker, Tuple[int, ...]], op st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape)) # fromcpu aren't cached - if optype == LoadOps and op.op in [LoadOps.FROMCPU, LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST]: return LazyBuffer(device, st, optype, op, dtype) + if optype == LoadOps and op.op in [LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST]: return LazyBuffer(device, st, optype, op, dtype) #print("create_lazybuffer", device, shape, optype, op, dtype) @@ -87,16 +88,17 @@ def create_lazybuffer(device:str, shape:Union[ShapeTracker, Tuple[int, ...]], op class LazyBuffer: __deletable__ = ('op',) - def __init__(self, device:str, st:ShapeTracker, optype:OpType, op:LazyOp, dtype:DType): + def __init__(self, device:str, st:ShapeTracker, optype:OpType, src:Union[LazyOp, RawBuffer], dtype:DType): self.st = st # NOTE: this is not a copy! this should be a "read-only" ShapeTracker self.device, self.shape, self.optype, self.dtype = device, self.st.shape, optype, dtype - self.op: LazyOp = op - self.realized: Optional[RawBuffer] = None + self.realized: Optional[RawBuffer] = src if isinstance(src, RawBuffer) else None self.output_buffer: Optional[RawBuffer] = None # TODO: do we really need this? or can we just use realized # TODO: does children have to be a ref count instead of a set? can a Buffer be a double child? self.children: weakref.WeakSet[LazyBuffer] = weakref.WeakSet() # NOTE: op should be read only after construction of LazyBuffer - for x in get_buffers(op): x.children.add(self) + if isinstance(src, LazyOp): + self.op: LazyOp = src + for x in get_buffers(self.op): x.children.add(self) if not LAZY: self.realize() # log phantom ops to the graph @@ -109,10 +111,7 @@ class LazyBuffer: def realize(self:LazyBuffer) -> LazyBuffer: if self.realized is None: # get real ops first - if self.op.op == LoadOps.FROMCPU: - if DEBUG >= 4: print(f"copying {self.op.arg.shape}:{dtypes.from_np(self.op.arg.dtype)} -> {self.device}") - self.realized = Device[self.device].buffer.fromCPU(self.op.arg, **self._device_extra_args()) - elif self.op.op == LoadOps.CONTIGUOUS: + if self.op.op == LoadOps.CONTIGUOUS: realized = self.op.src[0].realize().realized if self.op.src[0].st.contiguous and not isinstance(realized, RawConst) and realized.size == prod(self.shape): # no need to run an AST, this is already contiguous @@ -179,6 +178,10 @@ class LazyBuffer: def loadop(op, shape, dtype, device, arg=None, src=None) -> LazyBuffer: return create_lazybuffer(device, shape, LoadOps, LazyOp(op, tuple() if src is None else (src,), arg), dtype) + @staticmethod + def fromCPU(x: np.ndarray) -> LazyBuffer: + return LazyBuffer("CPU", ShapeTracker(x.shape), LoadOps, RawNumpyBuffer.fromCPU(x), dtypes.from_np(x.dtype)) + # create a constant with the shape and dtype of self def const_like(self, val) -> LazyBuffer: # NOTE: dtypes.from_np(self.dtype.np) to deal with image types diff --git a/tinygrad/ops.py b/tinygrad/ops.py index f2cd5e0a49..720b22cafd 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -12,7 +12,7 @@ class UnaryOps(Enum): NOOP = auto(); EXP = auto(); LOG = auto(); CAST = auto(); class BinaryOps(Enum): ADD = auto(); SUB = auto(); MUL = auto(); DIV = auto(); POW = auto(); CMPEQ = auto(); MAX = auto() # noqa: E702 class ReduceOps(Enum): SUM = auto(); MAX = auto() # noqa: E702 class FusedOps(Enum): MULACC = auto() # noqa: E702 -class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); FROMCPU = auto(); CONTIGUOUS = auto(); CUSTOM = auto() # noqa: E702 +class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); CONTIGUOUS = auto(); CUSTOM = auto() # noqa: E702 Op = Union[UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, FusedOps] OpType = Union[Type[UnaryOps], Type[BinaryOps], Type[ReduceOps], Type[MovementOps], Type[LoadOps], Type[FusedOps]] diff --git a/tinygrad/runtime/lib.py b/tinygrad/runtime/lib.py index d29d81b006..313e13b0a4 100644 --- a/tinygrad/runtime/lib.py +++ b/tinygrad/runtime/lib.py @@ -36,7 +36,7 @@ class RawBufferMapped(RawBufferCopyIn): # this one is simple enough that i moved it out of the runtimes class RawMallocBuffer(RawBufferMapped): - def __init__(self, size, dtype: DType): super().__init__(size, dtype, ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16, dtypes.int8: ctypes.c_int8, dtypes.uint8: ctypes.c_uint8, dtypes.bool: ctypes.c_uint8, dtypes.int64: ctypes.c_int64}[dtype] * size)()) + def __init__(self, size, dtype: DType): super().__init__(size, dtype, ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16, dtypes.int8: ctypes.c_int8, dtypes.uint8: ctypes.c_uint8, dtypes.bool: ctypes.c_uint8, dtypes.int32: ctypes.c_int32, dtypes.int64: ctypes.c_int64}[dtype] * size)()) def _buffer(self): return memoryview(self._buf) class RawBufferCopyInOut(RawBufferCopyIn): diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index d7bd9724e7..877bc03b7a 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -39,13 +39,12 @@ class Tensor: device = Device.canonicalize(device) if isinstance(data, (list, tuple)): data = np.array(data, dtype=(dtype if dtype is not None else Tensor.default_type).np) + if isinstance(data, np.ndarray): + data = LazyBuffer.fromCPU(data) if isinstance(data, LazyBuffer): assert dtype is None or dtype == data.dtype, "dtype doesn't match, and casting isn't supported" lazydata = data if data.device == device else LazyBuffer.loadop(LoadOps.FROM, data.shape, data.dtype, device, src=data) - elif isinstance(data, np.ndarray): - # TODO: create CPUBuffer directly - lazydata = LazyBuffer.loadop(LoadOps.FROMCPU, data.shape, dtypes.from_np(data.dtype), device, data) elif isinstance(data, (int, float)): lazydata = LazyBuffer.loadop(LoadOps.CONST, tuple(), dtype if dtype is not None else Tensor.default_type, device, data) else: