diff --git a/examples/mlperf/dataloader.py b/examples/mlperf/dataloader.py
index 118b045ebc..3e78c18801 100644
--- a/examples/mlperf/dataloader.py
+++ b/examples/mlperf/dataloader.py
@@ -67,11 +67,11 @@ def loader_process(q_in, q_out, X:Tensor, seed):
 
         # broken out
         #img_tensor = Tensor(img.tobytes(), device='CPU')
-        #storage_tensor = X[idx].contiguous().realize().lazydata.realized
+        #storage_tensor = X[idx].contiguous().realize().lazydata.base.realized
         #storage_tensor._copyin(img_tensor.numpy())
 
         # faster
-        X[idx].contiguous().realize().lazydata.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
+        X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = img.tobytes()
 
         # ideal
         #X[idx].assign(img.tobytes())   # NOTE: this is slow!
@@ -267,8 +267,8 @@ def load_unet3d_data(preprocessed_dataset_dir, seed, queue_in, queue_out, X:Tens
       x = random_brightness_augmentation(x)
       x = gaussian_noise(x)
 
-    X[idx].contiguous().realize().lazydata.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes()
-    Y[idx].contiguous().realize().lazydata.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes()
+    X[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = x.tobytes()
+    Y[idx].contiguous().realize().lazydata.base.realized.as_buffer(force_zero_copy=True)[:] = y.tobytes()
     queue_out.put(idx)
   queue_out.put(None)
 
diff --git a/test/external/fuzz_graph.py b/test/external/fuzz_graph.py
index ac189e65f5..0fa0b55b02 100644
--- a/test/external/fuzz_graph.py
+++ b/test/external/fuzz_graph.py
@@ -29,7 +29,7 @@ def alloc_rawbuffer(device, fill=False):
   if fill:
     with Context(DEBUG=0):
       data = np.random.randint(-10000, 10000, size=rawbuf.size, dtype=_to_np_dtype(rawbuf.dtype))
-      rawbuf.copyin(Tensor(data).realize().lazydata.realized.as_buffer())
+      rawbuf.copyin(Tensor(data).realize().lazydata.base.realized.as_buffer())
   return rawbuf
 
 def gen_kernel_ji(device, deps):
diff --git a/test/external/fuzz_linearizer.py b/test/external/fuzz_linearizer.py
index 6245f3b5d6..101b473f1a 100644
--- a/test/external/fuzz_linearizer.py
+++ b/test/external/fuzz_linearizer.py
@@ -75,7 +75,7 @@ def get_fuzz_rawbufs(lin):
       data = np.random.uniform(-1, 1, size=rawbuf.size).astype(dtype=_to_np_dtype(rawbuf.dtype))
     else:
       data = np.random.uniform(-10, 10, size=rawbuf.size).astype(dtype=_to_np_dtype(rawbuf.dtype))
-    rawbuf.copyin(Tensor(data, device=lin.opts.device).realize().lazydata.realized.as_buffer())
+    rawbuf.copyin(Tensor(data, device=lin.opts.device).realize().lazydata.base.realized.as_buffer())
   return rawbufs
 
 def get_fuzz_rawbuf_like(old_rawbuf, zero=False, copy=False, size=None, force_device=None):
diff --git a/test/external/process_replay/process_replay.py b/test/external/process_replay/process_replay.py
index 0ee010ec56..8b8d8ea9a3 100755
--- a/test/external/process_replay/process_replay.py
+++ b/test/external/process_replay/process_replay.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # compare kernels created by HEAD against master
 import os, multiprocessing, logging, pickle, sqlite3, difflib, functools
-from typing import Callable, List, Tuple, Union, cast
+from typing import Callable, List, Set, Tuple, Union, cast
 from tinygrad.helpers import VERSION, Context, ContextVar, colored, db_connection, getenv, tqdm
 from tinygrad.engine.schedule import ScheduleContext, full_ast_rewrite
 from tinygrad.codegen.kernel import Kernel, Opt
@@ -28,7 +28,8 @@ if REF == "master": SKIP_PROCESS_REPLAY = True
 
 # *** recreators
 
-def recreate_sched(ast:UOp) -> UOp: return full_ast_rewrite(ast, ScheduleContext())[0]
+def recreate_sched(ast:UOp, assigns:Set[UOp]) -> UOp:
+  return full_ast_rewrite(ast, ScheduleContext(assigns=assigns))[0]
 def recreate_kernel(ast:UOp, opts:Renderer, applied_opts:List[Opt], name:str, _) -> str:
   k = Kernel(ast, opts=opts)
   for opt in applied_opts: k.apply_opt(opt)
diff --git a/test/test_graph.py b/test/test_graph.py
index 7be34ba0f1..e55ee912a0 100644
--- a/test/test_graph.py
+++ b/test/test_graph.py
@@ -36,7 +36,7 @@ def helper_alloc_rawbuffer(device, fill=False):
   if fill:
     with Context(DEBUG=0):
       data = np.random.randint(-10000, 10000, size=rawbuf.size, dtype=_to_np_dtype(rawbuf.dtype))
-      rawbuf.copyin(Tensor(data).realize().lazydata.realized.as_buffer())
+      rawbuf.copyin(Tensor(data).realize().lazydata.base.realized.as_buffer())
   return rawbuf
 
 def helper_run_jit(jis, bufs, out_buffers):
diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 6eec229acd..5a3d5f75c4 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -228,14 +228,14 @@ def full_ast_rewrite(pre:UOp, ctx:ScheduleContext) -> Tuple[UOp, ScheduleItemCon
              and ShapeTracker.from_shape(s.shape).shrink(m) == s.shrink(m)) for x in ops):
     raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
                        +colored("   - a += a.T\n", "red")+colored("   + a += a.T.contiguous()", "green"))
-  if getenv("RUN_PROCESS_REPLAY"): PROCESS_REPLAY_CAPTURE.append((pre, sink))
+  if getenv("RUN_PROCESS_REPLAY"): PROCESS_REPLAY_CAPTURE.append(((pre, ctx.assigns), sink))
   return sink, si_ctx
 
-PROCESS_REPLAY_CAPTURE: List[Tuple[UOp, UOp]] = []
+PROCESS_REPLAY_CAPTURE: List[Tuple[Tuple[UOp, Set[UOp]], UOp]] = []
 if getenv("RUN_PROCESS_REPLAY"):
   @atexit.register
   def save_process_replay() -> None:
-    for x,ret in PROCESS_REPLAY_CAPTURE: diskcache_put("schedule_process_replay", str(x.key), (x, {}, ret))
+    for x,ret in PROCESS_REPLAY_CAPTURE: diskcache_put("schedule_process_replay", str(x[0].key), (*x, {}, ret))
 
 # **** Schedule grouping
 
@@ -381,7 +381,7 @@ break_sched = PatternMatcher([
 
 @track_rewrites(named=True)
 def create_schedule_with_vars(outs:List[LazyBuffer]) -> Tuple[List[ScheduleItem], Dict[Variable, int]]:
-  if len(outs:=dedup(x.base for x in outs if x.realized is None and x.base.op is not Ops.CONST)) == 0: return [], {}
+  if len(outs:=dedup(x.base for x in outs if x.base.realized is None and x.base.op is not Ops.CONST)) == 0: return [], {}
   for out in outs: out.forced_realize = True
   # create the big graph
   ctx = ScheduleContext()
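
Context for the recurring change above (not part of the diff): every call site moves from `lazydata.realized` to `lazydata.base.realized` because a tensor's `lazydata` can be a view, and only the base of a view chain owns the realized device `Buffer`; a view's own `.realized` is `None`. The `create_schedule_with_vars` hunk applies the same fix to the realized-check, and the process-replay hunks thread `ctx.assigns` through so `recreate_sched` rebuilds schedules with the captured assign set. Below is a minimal sketch of the buffer-access pattern, assuming a tinygrad checkout contemporary with this diff and the `CLANG` (CPU) backend so the buffer is host-visible and `force_zero_copy=True` is valid; it is an illustration, not part of the change.

```python
import numpy as np
from tinygrad import Tensor, dtypes

# Realize a tensor; after realize() the device Buffer is owned by
# lazydata.base. lazydata itself may be a view (e.g. after a reshape
# or permute), and a view's .realized is None -- hence .base.realized.
X = Tensor.empty(16, dtype=dtypes.float32, device="CLANG").contiguous().realize()
buf = X.lazydata.base.realized  # the underlying tinygrad Buffer

# Write raw bytes straight into the realized storage, the same trick the
# mlperf dataloader uses to fill preallocated tensors from worker processes.
data = np.arange(16, dtype=np.float32)
buf.as_buffer(force_zero_copy=True)[:] = data.tobytes()

print(X.numpy())  # [ 0.  1.  2. ... 15.]
```

The zero-copy memoryview write avoids both an extra host allocation and a `Tensor.assign` round trip through the scheduler, which is why the dataloader comments label `X[idx].assign(...)` as the "ideal" but slow path.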