Eliminate LoadOps.FROMCPU (#920)

* Add fromCPU method to init LazyBuffer to eliminate LoadOps.FROMCPU

* squish

* remove failing test

* seems logical

* Revert "seems logical"

This reverts commit bbdcdc8713.

* inline and remove assertion

* fromCPU staticmethod, defer non-cpu device to loadop

* restore test
Author: kposborne2
Date: 2023-06-04 08:55:50 -07:00
Committed by: GitHub
Parent: 3e0b37f050
Commit: 0b88c5f923
4 changed files with 16 additions and 14 deletions
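
The gist of the change, as a minimal sketch of the new flow (assumes the post-change layout, with LazyBuffer in tinygrad.lazy and LoadOps in tinygrad.ops; "GPU" is just an example device):

import numpy as np
from tinygrad.lazy import LazyBuffer
from tinygrad.ops import LoadOps

x = np.arange(6, dtype=np.float32).reshape(2, 3)

# CPU data is wrapped directly: the LazyBuffer is born realized (a RawNumpyBuffer),
# so no FROMCPU load op is ever recorded.
cpu_lb = LazyBuffer.fromCPU(x)
assert cpu_lb.realized is not None and cpu_lb.device == "CPU"

# Moving to a non-CPU device is deferred to an ordinary LoadOps.FROM copy.
gpu_lb = LazyBuffer.loadop(LoadOps.FROM, cpu_lb.shape, cpu_lb.dtype, "GPU", src=cpu_lb)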

@@ -7,6 +7,7 @@ from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, ImageDType, D
 from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
 from tinygrad.ops import Compiled, Interpreted, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_lazyops, get_buffers, map_buffers
 from tinygrad.runtime.lib import RawConst, RawBuffer, RawBufferMapped
+from tinygrad.runtime.ops_cpu import RawNumpyBuffer
 from tinygrad.runtime.ops_disk import RawDiskBuffer
 # lazy can recurse a lot
@@ -73,7 +74,7 @@ def create_lazybuffer(device:str, shape:Union[ShapeTracker, Tuple[int, ...]], op
   st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
   # fromcpu aren't cached
-  if optype == LoadOps and op.op in [LoadOps.FROMCPU, LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST]: return LazyBuffer(device, st, optype, op, dtype)
+  if optype == LoadOps and op.op in [LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST]: return LazyBuffer(device, st, optype, op, dtype)
   #print("create_lazybuffer", device, shape, optype, op, dtype)
@@ -87,16 +88,17 @@ def create_lazybuffer(device:str, shape:Union[ShapeTracker, Tuple[int, ...]], op
 class LazyBuffer:
   __deletable__ = ('op',)
-  def __init__(self, device:str, st:ShapeTracker, optype:OpType, op:LazyOp, dtype:DType):
+  def __init__(self, device:str, st:ShapeTracker, optype:OpType, src:Union[LazyOp, RawBuffer], dtype:DType):
     self.st = st # NOTE: this is not a copy! this should be a "read-only" ShapeTracker
     self.device, self.shape, self.optype, self.dtype = device, self.st.shape, optype, dtype
-    self.op: LazyOp = op
-    self.realized: Optional[RawBuffer] = None
+    self.realized: Optional[RawBuffer] = src if isinstance(src, RawBuffer) else None
     self.output_buffer: Optional[RawBuffer] = None # TODO: do we really need this? or can we just use realized
     # TODO: does children have to be a ref count instead of a set? can a Buffer be a double child?
     self.children: weakref.WeakSet[LazyBuffer] = weakref.WeakSet()
     # NOTE: op should be read only after construction of LazyBuffer
-    for x in get_buffers(op): x.children.add(self)
+    if isinstance(src, LazyOp):
+      self.op: LazyOp = src
+      for x in get_buffers(self.op): x.children.add(self)
     if not LAZY: self.realize()
     # log phantom ops to the graph
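
With src typed as Union[LazyOp, RawBuffer], __init__ now covers two construction paths: a RawBuffer lands directly in self.realized and no op is stored, while a LazyOp is kept as self.op and wired into its sources' children sets. A hypothetical illustration of both paths (normally loadop/fromCPU and create_lazybuffer are the entry points, not direct construction):

import numpy as np
from tinygrad.lazy import LazyBuffer
from tinygrad.ops import LoadOps, LazyOp
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.runtime.ops_cpu import RawNumpyBuffer
from tinygrad.helpers import dtypes

arr = np.ones((4,), dtype=np.float32)

# Path 1: src is a RawBuffer -> the buffer starts out realized, no self.op is set.
realized_lb = LazyBuffer("CPU", ShapeTracker(arr.shape), LoadOps, RawNumpyBuffer.fromCPU(arr), dtypes.float32)
assert realized_lb.realized is not None

# Path 2: src is a LazyOp -> it is stored as self.op and realized only later.
lazy_lb = LazyBuffer("CPU", ShapeTracker((4,)), LoadOps, LazyOp(LoadOps.EMPTY, tuple(), None), dtypes.float32)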
@@ -109,10 +111,7 @@ class LazyBuffer:
   def realize(self:LazyBuffer) -> LazyBuffer:
     if self.realized is None:
       # get real ops first
-      if self.op.op == LoadOps.FROMCPU:
-        if DEBUG >= 4: print(f"copying {self.op.arg.shape}:{dtypes.from_np(self.op.arg.dtype)} -> {self.device}")
-        self.realized = Device[self.device].buffer.fromCPU(self.op.arg, **self._device_extra_args())
-      elif self.op.op == LoadOps.CONTIGUOUS:
+      if self.op.op == LoadOps.CONTIGUOUS:
         realized = self.op.src[0].realize().realized
         if self.op.src[0].st.contiguous and not isinstance(realized, RawConst) and realized.size == prod(self.shape):
           # no need to run an AST, this is already contiguous
@@ -179,6 +178,10 @@ class LazyBuffer:
   def loadop(op, shape, dtype, device, arg=None, src=None) -> LazyBuffer:
     return create_lazybuffer(device, shape, LoadOps, LazyOp(op, tuple() if src is None else (src,), arg), dtype)
+  @staticmethod
+  def fromCPU(x: np.ndarray) -> LazyBuffer:
+    return LazyBuffer("CPU", ShapeTracker(x.shape), LoadOps, RawNumpyBuffer.fromCPU(x), dtypes.from_np(x.dtype))
   # create a constant with the shape and dtype of self
   def const_like(self, val) -> LazyBuffer:
     # NOTE: dtypes.from_np(self.dtype.np) to deal with image types
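
A quick usage check of the new staticmethod (hypothetical values; shape and dtype come from the numpy array, and the buffer is always created on "CPU" — other devices are reached via loadop as above):

import numpy as np
from tinygrad.lazy import LazyBuffer

lb = LazyBuffer.fromCPU(np.zeros((3, 3), dtype=np.float32))
assert lb.device == "CPU" and lb.shape == (3, 3) and lb.realized is not None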

@@ -12,7 +12,7 @@ class UnaryOps(Enum): NOOP = auto(); EXP = auto(); LOG = auto(); CAST = auto();
 class BinaryOps(Enum): ADD = auto(); SUB = auto(); MUL = auto(); DIV = auto(); POW = auto(); CMPEQ = auto(); MAX = auto() # noqa: E702
 class ReduceOps(Enum): SUM = auto(); MAX = auto() # noqa: E702
 class FusedOps(Enum): MULACC = auto() # noqa: E702
-class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); FROMCPU = auto(); CONTIGUOUS = auto(); CUSTOM = auto() # noqa: E702
+class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); CONTIGUOUS = auto(); CUSTOM = auto() # noqa: E702
 Op = Union[UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, FusedOps]
 OpType = Union[Type[UnaryOps], Type[BinaryOps], Type[ReduceOps], Type[MovementOps], Type[LoadOps], Type[FusedOps]]

@@ -36,7 +36,7 @@ class RawBufferMapped(RawBufferCopyIn):
 # this one is simple enough that i moved it out of the runtimes
 class RawMallocBuffer(RawBufferMapped):
-  def __init__(self, size, dtype: DType): super().__init__(size, dtype, ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16, dtypes.int8: ctypes.c_int8, dtypes.uint8: ctypes.c_uint8, dtypes.bool: ctypes.c_uint8, dtypes.int64: ctypes.c_int64}[dtype] * size)())
+  def __init__(self, size, dtype: DType): super().__init__(size, dtype, ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16, dtypes.int8: ctypes.c_int8, dtypes.uint8: ctypes.c_uint8, dtypes.bool: ctypes.c_uint8, dtypes.int32: ctypes.c_int32, dtypes.int64: ctypes.c_int64}[dtype] * size)())
   def _buffer(self): return memoryview(self._buf)
 class RawBufferCopyInOut(RawBufferCopyIn):
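
The runtime change simply extends the ctypes table so a malloc-backed buffer can hold int32 data as well. A small sketch (imports as used in the diff):

from tinygrad.runtime.lib import RawMallocBuffer
from tinygrad.helpers import dtypes

# 16 int32 elements backed by a ctypes array; before this change dtypes.int32 was
# missing from the mapping and construction failed with a KeyError.
buf = RawMallocBuffer(16, dtypes.int32)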

@@ -39,13 +39,12 @@ class Tensor:
     device = Device.canonicalize(device)
     if isinstance(data, (list, tuple)):
       data = np.array(data, dtype=(dtype if dtype is not None else Tensor.default_type).np)
+    if isinstance(data, np.ndarray):
+      data = LazyBuffer.fromCPU(data)
     if isinstance(data, LazyBuffer):
       assert dtype is None or dtype == data.dtype, "dtype doesn't match, and casting isn't supported"
       lazydata = data if data.device == device else LazyBuffer.loadop(LoadOps.FROM, data.shape, data.dtype, device, src=data)
-    elif isinstance(data, np.ndarray):
-      # TODO: create CPUBuffer directly
-      lazydata = LazyBuffer.loadop(LoadOps.FROMCPU, data.shape, dtypes.from_np(data.dtype), device, data)
     elif isinstance(data, (int, float)):
       lazydata = LazyBuffer.loadop(LoadOps.CONST, tuple(), dtype if dtype is not None else Tensor.default_type, device, data)
     else:
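
At the Tensor level nothing changes for users: lists, numpy arrays, and LazyBuffers all funnel into the same LazyBuffer path, numpy data just no longer goes through a FROMCPU load op. A hedged sketch of the constructor's accepted inputs (assuming Tensor lives in tinygrad.tensor):

import numpy as np
from tinygrad.tensor import Tensor

a = Tensor([1.0, 2.0, 3.0])                 # list -> np.array -> LazyBuffer.fromCPU
b = Tensor(np.arange(4, dtype=np.float32))  # ndarray -> LazyBuffer.fromCPU
c = Tensor(3.0)                             # int/float -> LoadOps.CONST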