Eliminate LoadOps.FROMCPU (#920)
* Add fromCPU method to init LazyBuffer to eliminate LoadOps.FROMCPU
* squish
* remove failing test
* seems logical
* Revert "seems logical"
This reverts commit bbdcdc8713.
* inline and remove assertion
* fromCPU staticmethod, defer non-cpu device to loadop
* restore test
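
In practice the change makes numpy-backed tensors born realized: Tensor.__init__ now routes ndarrays through LazyBuffer.fromCPU, which wraps the data in a RawNumpyBuffer up front instead of queueing a LoadOps.FROMCPU node for realize() to handle later. A minimal sketch of the resulting behavior (illustrative, not part of the diff):

    import numpy as np
    from tinygrad.tensor import Tensor

    # the ndarray is wrapped in a RawNumpyBuffer immediately, so the
    # LazyBuffer backing this Tensor starts out with .realized already set
    t = Tensor(np.arange(4, dtype=np.float32), device="CPU")
    assert t.lazydata.realized is not None

    # for any other device, Tensor.__init__ instead defers to
    # LazyBuffer.loadop(LoadOps.FROM, ...), which copies off the CPU buffer lazily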
tinygrad/lazy.py
@@ -7,6 +7,7 @@ from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, ImageDType, D
 from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
 from tinygrad.ops import Compiled, Interpreted, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_lazyops, get_buffers, map_buffers
 from tinygrad.runtime.lib import RawConst, RawBuffer, RawBufferMapped
+from tinygrad.runtime.ops_cpu import RawNumpyBuffer
 from tinygrad.runtime.ops_disk import RawDiskBuffer
 
 # lazy can recurse a lot
@@ -73,7 +74,7 @@ def create_lazybuffer(device:str, shape:Union[ShapeTracker, Tuple[int, ...]], op
   st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
 
   # fromcpu aren't cached
-  if optype == LoadOps and op.op in [LoadOps.FROMCPU, LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST]: return LazyBuffer(device, st, optype, op, dtype)
+  if optype == LoadOps and op.op in [LoadOps.EMPTY, LoadOps.RAND, LoadOps.CONST]: return LazyBuffer(device, st, optype, op, dtype)
 
   #print("create_lazybuffer", device, shape, optype, op, dtype)
 
@@ -87,16 +88,17 @@ def create_lazybuffer(device:str, shape:Union[ShapeTracker, Tuple[int, ...]], op
 
 class LazyBuffer:
   __deletable__ = ('op',)
-  def __init__(self, device:str, st:ShapeTracker, optype:OpType, op:LazyOp, dtype:DType):
+  def __init__(self, device:str, st:ShapeTracker, optype:OpType, src:Union[LazyOp, RawBuffer], dtype:DType):
     self.st = st  # NOTE: this is not a copy! this should be a "read-only" ShapeTracker
     self.device, self.shape, self.optype, self.dtype = device, self.st.shape, optype, dtype
-    self.op: LazyOp = op
-    self.realized: Optional[RawBuffer] = None
+    self.realized: Optional[RawBuffer] = src if isinstance(src, RawBuffer) else None
     self.output_buffer: Optional[RawBuffer] = None  # TODO: do we really need this? or can we just use realized
     # TODO: does children have to be a ref count instead of a set? can a Buffer be a double child?
     self.children: weakref.WeakSet[LazyBuffer] = weakref.WeakSet()
     # NOTE: op should be read only after construction of LazyBuffer
-    for x in get_buffers(op): x.children.add(self)
+    if isinstance(src, LazyOp):
+      self.op: LazyOp = src
+      for x in get_buffers(self.op): x.children.add(self)
     if not LAZY: self.realize()
 
     # log phantom ops to the graph
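
The widened src parameter is the core of the refactor: a RawBuffer makes the LazyBuffer born realized and no op graph is ever attached, which is also why realize() in the next hunk loses its FROMCPU branch. A toy reduction of the dispatch-on-type pattern (hypothetical stand-in names, not tinygrad code):

    from typing import Optional, Union

    class Op: pass     # stand-in for LazyOp (hypothetical)
    class Raw: pass    # stand-in for RawBuffer (hypothetical)

    class Buf:
        # mirrors the new LazyBuffer.__init__: src is a graph node or real data
        def __init__(self, src: Union[Op, Raw]):
            self.realized: Optional[Raw] = src if isinstance(src, Raw) else None
            if isinstance(src, Op):
                self.op = src  # only graph-backed buffers carry an op
        def realize(self):
            if self.realized is None:
                pass  # run the op graph; raw-backed buffers skip this entirely
            return self

    assert Buf(Raw()).realized is not None  # born realized; realize() is a no-op

Note that since op may now never be set, any code that touches it has to check realized first.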
@@ -109,10 +111,7 @@ class LazyBuffer:
   def realize(self:LazyBuffer) -> LazyBuffer:
     if self.realized is None:
       # get real ops first
-      if self.op.op == LoadOps.FROMCPU:
-        if DEBUG >= 4: print(f"copying {self.op.arg.shape}:{dtypes.from_np(self.op.arg.dtype)} -> {self.device}")
-        self.realized = Device[self.device].buffer.fromCPU(self.op.arg, **self._device_extra_args())
-      elif self.op.op == LoadOps.CONTIGUOUS:
+      if self.op.op == LoadOps.CONTIGUOUS:
         realized = self.op.src[0].realize().realized
         if self.op.src[0].st.contiguous and not isinstance(realized, RawConst) and realized.size == prod(self.shape):
           # no need to run an AST, this is already contiguous
@@ -179,6 +178,10 @@ class LazyBuffer:
   def loadop(op, shape, dtype, device, arg=None, src=None) -> LazyBuffer:
     return create_lazybuffer(device, shape, LoadOps, LazyOp(op, tuple() if src is None else (src,), arg), dtype)
 
+  @staticmethod
+  def fromCPU(x: np.ndarray) -> LazyBuffer:
+    return LazyBuffer("CPU", ShapeTracker(x.shape), LoadOps, RawNumpyBuffer.fromCPU(x), dtypes.from_np(x.dtype))
+
   # create a constant with the shape and dtype of self
   def const_like(self, val) -> LazyBuffer:
     # NOTE: dtypes.from_np(self.dtype.np) to deal with image types
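
Per the commit message, fromCPU is a staticmethod that always lands on the CPU device; placement on any other device is deferred to loadop, with Tensor.__init__ issuing the LoadOps.FROM copy. A hedged usage sketch:

    import numpy as np
    from tinygrad.lazy import LazyBuffer

    x = np.ones((2, 3), dtype=np.float32)
    lb = LazyBuffer.fromCPU(x)
    assert lb.device == "CPU" and lb.shape == (2, 3)
    assert lb.realized is not None  # backed by a RawNumpyBuffer, no op to run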
tinygrad/ops.py
@@ -12,7 +12,7 @@ class UnaryOps(Enum): NOOP = auto(); EXP = auto(); LOG = auto(); CAST = auto();
 class BinaryOps(Enum): ADD = auto(); SUB = auto(); MUL = auto(); DIV = auto(); POW = auto(); CMPEQ = auto(); MAX = auto() # noqa: E702
 class ReduceOps(Enum): SUM = auto(); MAX = auto() # noqa: E702
 class FusedOps(Enum): MULACC = auto() # noqa: E702
-class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); FROMCPU = auto(); CONTIGUOUS = auto(); CUSTOM = auto() # noqa: E702
+class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); CONTIGUOUS = auto(); CUSTOM = auto() # noqa: E702
 
 Op = Union[UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, FusedOps]
 OpType = Union[Type[UnaryOps], Type[BinaryOps], Type[ReduceOps], Type[MovementOps], Type[LoadOps], Type[FusedOps]]
tinygrad/runtime/lib.py
@@ -36,7 +36,7 @@ class RawBufferMapped(RawBufferCopyIn):
 
 # this one is simple enough that i moved it out of the runtimes
 class RawMallocBuffer(RawBufferMapped):
-  def __init__(self, size, dtype: DType): super().__init__(size, dtype, ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16, dtypes.int8: ctypes.c_int8, dtypes.uint8: ctypes.c_uint8, dtypes.bool: ctypes.c_uint8, dtypes.int64: ctypes.c_int64}[dtype] * size)())
+  def __init__(self, size, dtype: DType): super().__init__(size, dtype, ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16, dtypes.int8: ctypes.c_int8, dtypes.uint8: ctypes.c_uint8, dtypes.bool: ctypes.c_uint8, dtypes.int32: ctypes.c_int32, dtypes.int64: ctypes.c_int64}[dtype] * size)())
   def _buffer(self): return memoryview(self._buf)
 
 class RawBufferCopyInOut(RawBufferCopyIn):
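
This hunk only adds the missing dtypes.int32 entry to the ctype lookup table. The allocation idiom it sits in — multiplying a ctype by a length yields an array type, and calling that type allocates zero-initialized memory — can be seen with plain ctypes, no tinygrad required:

    import ctypes

    # ctypes.c_int32 * 4 builds an array type; instantiating it allocates
    # four zero-initialized int32s, which is what RawMallocBuffer wraps
    buf = (ctypes.c_int32 * 4)()
    buf[0] = 42
    mv = memoryview(buf)  # RawMallocBuffer._buffer() returns exactly this view
    assert mv.nbytes == 4 * ctypes.sizeof(ctypes.c_int32)
    assert mv[0] == 42    # the view unpacks elements using the int32 format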
tinygrad/tensor.py
@@ -39,13 +39,12 @@ class Tensor:
     device = Device.canonicalize(device)
     if isinstance(data, (list, tuple)):
       data = np.array(data, dtype=(dtype if dtype is not None else Tensor.default_type).np)
+    if isinstance(data, np.ndarray):
+      data = LazyBuffer.fromCPU(data)
+
     if isinstance(data, LazyBuffer):
       assert dtype is None or dtype == data.dtype, "dtype doesn't match, and casting isn't supported"
       lazydata = data if data.device == device else LazyBuffer.loadop(LoadOps.FROM, data.shape, data.dtype, device, src=data)
-    elif isinstance(data, np.ndarray):
-      # TODO: create CPUBuffer directly
-      lazydata = LazyBuffer.loadop(LoadOps.FROMCPU, data.shape, dtypes.from_np(data.dtype), device, data)
     elif isinstance(data, (int, float)):
       lazydata = LazyBuffer.loadop(LoadOps.CONST, tuple(), dtype if dtype is not None else Tensor.default_type, device, data)
     else: