Mirror of https://github.com/tinygrad/tinygrad.git, synced 2026-01-10 23:48:01 -05:00
* cpu tests pass
* torch works
* works
* metal works
* fix ops_disk
* metal jit works
* fix openpilot
* llvm and clang work
* fix webgpu
* docs are rly broken
* LRU works on metal
* delete comment
* revert name to ._buf. LRU only on Compiled
* changes
* allocator
* allocator, getting closer
* lru alloc
* LRUAllocator
* all pass
* metal
* cuda
* test examples
* linearizer
* test fixes
* fix custom + clean realize
* fix hip
* skip tests
* fix tests
* fix size=0
* fix MOCKHIP
* fix thneed
* copy better
* simple
* old style metal copy
* fix thneed
* np reshape
* give cuda a device
85 lines
4.8 KiB
Python
from typing import List, cast, Dict, Callable
import numpy as np
from tinygrad.ops import ScheduleItem, LazyOp, LoadOps, BufferOps
from tinygrad.device import Device, Buffer
from tinygrad.graph import log_schedule_item, print_tree
from tinygrad.helpers import DEBUG, prod

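# A ScheduleItem (imported from tinygrad.ops) as consumed below carries: ast (the LazyOp to run),
# out (the LazyBuffer being realized), inputs (its already-realized source LazyBuffers), and
# var_vals (concrete values for any symbolic shape variables).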
def run_schedule(schedule:List[ScheduleItem], disable_logging=False):
  # NOTE: pop items off the schedule instead of for-looping over it; a plain for loop is slow because nothing gets freed
  while len(schedule):
    si = schedule.pop(0)
    if not disable_logging: log_schedule_item(si)
    assert all(x.realized for x in si.inputs), "can't run schedule, some inputs aren't realized"
    assert all(si.out.device == x.device for x in si.inputs) or si.ast.op is LoadOps.FROM, f"all devices must be the same, {si.out.device} != {[x.device for x in si.inputs]} {print_tree(si.ast) or ''}"

    # check if we can reuse the output buffer
    # if it's aliased, don't use it
    # TODO: this is pretty wrong actually, who knows where else this buffer is used?
    # TODO: what if an assign is required? this is silently wrong
    # TODO: this logic doesn't belong here, it should be checked in assign or at least at schedule time
    if si.out.output_buffer is not None:
      for i,a in enumerate(si.inputs):
        # TODO: if this is contiguous it's fine
        if a.realized == si.out.output_buffer:
          if any(not x.arg.st.contiguous for x in si.ast.get_lazyops() if x.op == BufferOps.LOAD and x.arg.idx == i+1):
            si.out.output_buffer = None
            break

    # if we don't have an output buffer, create one, at max size if the shape is symbolic
    si.out.realized = si.out.output_buffer if si.out.output_buffer is not None else \
      Buffer(si.out.device, prod((s if isinstance(s, int) else s.max for s in si.out.shape)), si.out.dtype)
      #Device[si.out.device].buffer(prod((s if isinstance(s, int) else s.max for s in si.out.shape)), si.out.dtype, **si.out._device_extra_args())

    # TODO: size 0 should be removed from the schedule
    if si.out.realized.size != 0:
      if si.ast.op in LoadOps:
        # confirm the LoadOps are contiguous and in order
        for i,s in enumerate(si.ast.src): assert isinstance(s, LazyOp) and s.op == BufferOps.LOAD and s.arg.idx == i+1 and s.arg.st.contiguous, f"bad LoadOps src {i}: {s}"
        kwargs = {"arg": si.ast.arg} if si.ast.arg is not None else {}
        LOAD_OPS_DISPATCHER[cast(LoadOps, si.ast.op)](si.out.realized, *[x.realized for x in si.inputs], **kwargs)
      else:
        Device[si.out.device].get_runner(si.ast).exec([si.out.realized] + [x.realized for x in si.inputs], si.var_vals)
    del si.out.op
    for v in si.out.views: del v.op
    #assert si.out.realized and isinstance(si.out.realized, Device[si.out.device].buffer), f"device mismatch on realized got {type(si.out.realized)} expected {si.out.device}"
    assert si.out.realized.dtype == si.out.dtype, f"realized dtype is incorrect, {si.out.realized.dtype} != {si.out.dtype}"
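
# Usage sketch (illustrative, not part of the original file): a schedule is normally built
# from the lazy graph and handed to run_schedule by Tensor.realize(); the exact
# schedule-building entry point is an assumption here and may differ between versions.
#
#   from tinygrad.tensor import Tensor
#   out = (Tensor.rand(4, 4) + 1).sum()
#   out.realize()  # internally builds a List[ScheduleItem] and calls run_schedule on it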

# *** zero op LoadOps ***

def _realize_empty(buffer: Buffer) -> None:
  if DEBUG >= 2: print(f"*** empty {buffer.device} shape {buffer.size:5d} dtype {buffer.dtype}")
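
# Note: EMPTY does no work here; the output Buffer was already allocated in run_schedule,
# so its contents are whatever the allocator left there (uninitialized memory).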

# TODO: remove this and write the RNG in tinygrad
def _realize_rand(buffer: Buffer, arg) -> None:
  if DEBUG >= 2: print(f"*** rand {buffer.device} seed {arg:<10d} shape {buffer.size:5d} dtype {buffer.dtype}")
  rng = np.random.default_rng(arg)
  rng_np_buffer = rng.random(size=buffer.size, dtype=np.float32).astype(dtype=buffer.dtype.np, copy=False)
  buffer.copyin(rng_np_buffer.data)
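
# Note (illustrative): arg is the RNG seed, so LoadOps.RAND is deterministic; realizing
# with the same seed always produces the same buffer contents.
#
#   a = np.random.default_rng(42).random(size=8, dtype=np.float32)
#   b = np.random.default_rng(42).random(size=8, dtype=np.float32)
#   assert (a == b).all()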

# *** one op LoadOps ***

#from tinygrad.runtime.lib import RawBufferMapped, RawBufferTransfer
#from tinygrad.runtime.ops_disk import RawDiskBuffer
def _realize_from(buffer: Buffer, src: Buffer) -> None:
  assert src.size == buffer.size, f"size mismatch on FROM {src.size=} != {buffer.size=}"
  if DEBUG >= 2: print(f"*** copy {buffer.device} <- {src.device} size {src.size:<16d} shape {buffer.size:5d} dtype {src.dtype}")
  buffer.copyin(src.toCPU().data)
  # TODO: make this generic
  #if isinstance(src.realized, RawDiskBuffer) and isinstance(buffer.realized, RawBufferMapped):
  #  src.realized.readinto(buffer.realized._buffer())
  #elif isinstance(src.realized, RawBufferTransfer) and isinstance(buffer.realized, RawBufferTransfer) and getenv("P2P", 0) >= 1:
  #  buffer.realized._transfer(src.realized)
  #else:
  #  buffer.realized._copyin(src.realized.toCPU())
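
# Sketch (illustrative, names hedged): LoadOps.FROM moves data between two realized Buffers,
# possibly on different devices, by bouncing through host memory (src.toCPU() -> buffer.copyin()).
# The device names and dtypes import path below are assumptions for this version of tinygrad.
#
#   from tinygrad.helpers import dtypes
#   src = Buffer("CLANG", 16, dtypes.float32)   # realized elsewhere, holds the data
#   dst = Buffer("METAL", 16, dtypes.float32)
#   _realize_from(dst, src)                     # dst now contains src's contents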

# *** n op LoadOps ***

def _realize_custom(buffer: Buffer, *inputs: Buffer, arg) -> None:
  if DEBUG >= 2: print(f"*** custom {buffer.device} shape {buffer.size:5d} dtype {buffer.dtype}")
  arg(buffer, *inputs)

LOAD_OPS_DISPATCHER: Dict[LoadOps, Callable] = {
  LoadOps.EMPTY: _realize_empty,
  LoadOps.RAND: _realize_rand,
  LoadOps.FROM: _realize_from,
  LoadOps.CUSTOM: _realize_custom,
}
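
# Dispatch sketch (mirrors the call site in run_schedule above): for a ScheduleItem whose
# ast.op is a LoadOps, the handler receives the realized output Buffer first, then the
# realized input Buffers, with arg passed as a keyword only when it is set. out_buffer
# below stands for an already-allocated Buffer and is hypothetical.
#
#   handler = LOAD_OPS_DISPATCHER[LoadOps.RAND]
#   handler(out_buffer, arg=1337)   # fills out_buffer with seeded random float data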