From 24ca8eeaa76b020212e86c7f0da307d41d453704 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Wed, 3 Dec 2025 15:41:16 -0800
Subject: [PATCH] small fixups from schedule_cache (#13557)

---
 .github/workflows/test.yml |   4 +-
 docs/abstractions2.py      | 135 -------------------------------------
 test/test_tensor.py        |   4 +-
 tinygrad/uop/ops.py        |   8 ++-
 4 files changed, 10 insertions(+), 141 deletions(-)
 delete mode 100644 docs/abstractions2.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5b46d598c1..4b7c2a7d4e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -71,9 +71,7 @@ jobs:
     - name: Test Docs Build
       run: python -m mkdocs build --strict
     - name: Test Docs
-      run: |
-        python docs/abstractions2.py
-        python docs/abstractions3.py
+      run: python docs/abstractions3.py
     - name: Test README
       run: awk '/```python/{flag=1;next}/```/{flag=0}flag' README.md > README.py &&  python README.py
     - name: Test Quickstart
diff --git a/docs/abstractions2.py b/docs/abstractions2.py
deleted file mode 100644
index c1d13a86cf..0000000000
--- a/docs/abstractions2.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# tinygrad is a tensor library, and as a tensor library it has multiple parts
-# 1. a "runtime". this allows buffer management, compilation, and running programs
-# 2. a "Device" that uses the runtime but specifies compute in an abstract way for all
-# 3. a "UOp" that fuses the compute into kernels, using memory only when needed
-# 4. a "Tensor" that provides an easy to use frontend with autograd ".backward()"
-
-
-print("******** first, the runtime ***********")
-
-from tinygrad.runtime.ops_cpu import ClangJITCompiler, CPUDevice, CPUProgram
-
-cpu = CPUDevice()
-
-# allocate some buffers
-out = cpu.allocator.alloc(4)
-a = cpu.allocator.alloc(4)
-b = cpu.allocator.alloc(4)
-
-# load in some values (little endian)
-cpu.allocator._copyin(a, memoryview(bytearray([2,0,0,0])))
-cpu.allocator._copyin(b, memoryview(bytearray([3,0,0,0])))
-
-# compile a program to a binary
-lib = ClangJITCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }")
-
-# create a runtime for the program
-fxn = cpu.runtime("add", lib)
-
-# run the program
-fxn(out, a, b)
-
-# check the data out
-print(val := cpu.allocator._as_buffer(out).cast("I").tolist()[0])
-assert val == 5
-
-
-print("******** second, the Device ***********")
-
-DEVICE = "CPU"   # NOTE: you can change this!
-
-import struct
-from tinygrad.dtype import dtypes
-from tinygrad.device import Buffer, Device
-from tinygrad.uop.ops import UOp, Ops
-
-# allocate some buffers + load in values
-out = Buffer(DEVICE, 1, dtypes.int32).allocate()
-a = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 2))))
-b = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 3))))
-# NOTE: a._buf is the same as the return from cpu.allocator.alloc
-
-# describe the computation
-idx = UOp.const(dtypes.index, 0)
-buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1)
-buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2)
-alu = buf_1.index(idx) + buf_2.index(idx)
-output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0)
-st_0 = UOp(Ops.STORE, dtypes.void, (output_buf.index(idx), alu))
-s = UOp(Ops.SINK, dtypes.void, (st_0,))
-
-# convert the computation to a "linearized" format (print the format)
-from tinygrad.engine.realize import get_program, CompiledRunner
-program = get_program(s, Device[DEVICE].renderer)
-
-# compile a program (and print the source)
-fxn = CompiledRunner(program)
-print(fxn.p.src)
-# NOTE: fxn.clprg is the CPUProgram
-
-# run the program
-fxn.exec([out, a, b])
-
-# check the data out
-assert out.as_buffer().cast('I')[0] == 5
-
-
-print("******** third, the UOp ***********")
-
-from tinygrad.engine.realize import run_schedule
-from tinygrad.engine.schedule import create_schedule_with_vars
-from tinygrad.schedule.rangeify import get_rangeify_map
-
-# allocate some values + load in values
-a = UOp.new_buffer(DEVICE, 1, dtypes.int32)
-b = UOp.new_buffer(DEVICE, 1, dtypes.int32)
-a.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 2))))
-b.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 3))))
-
-# describe the computation
-out = a + b
-s = UOp(Ops.SINK, dtypes.void, (out,))
-
-# group the computation into kernels
-becomes_map = get_rangeify_map(s)
-
-# the compute maps to an assign
-assign = becomes_map[a+b].base
-
-# the first source is the output buffer (data)
-assert assign.src[0].op is Ops.BUFFER
-# the second source is the kernel (compute)
-assert assign.src[1].op is Ops.KERNEL
-
-# schedule the kernel graph in a linear list
-s = UOp(Ops.SINK, dtypes.void, (assign,))
-sched, _ = create_schedule_with_vars(s)
-assert len(sched) == 1
-
-# DEBUGGING: print the compute ast
-print(sched[-1].ast)
-# NOTE: sched[-1].ast is the same as st_0 above
-
-# the output will be stored in a new buffer
-out = assign.buf_uop
-assert out.op is Ops.BUFFER and not out.buffer.is_allocated()
-print(out)
-
-# run that schedule
-run_schedule(sched)
-
-# check the data out
-assert out.is_realized and out.buffer.as_buffer().cast('I')[0] == 5
-
-
-print("******** fourth, the Tensor ***********")
-
-from tinygrad import Tensor
-
-a = Tensor([2], dtype=dtypes.int32, device=DEVICE)
-b = Tensor([3], dtype=dtypes.int32, device=DEVICE)
-out = a + b
-
-# check the data out
-print(val:=out.item())
-assert val == 5
diff --git a/test/test_tensor.py b/test/test_tensor.py
index a053549781..4fe776934a 100644
--- a/test/test_tensor.py
+++ b/test/test_tensor.py
@@ -841,11 +841,13 @@ class TestTensorMetadata(unittest.TestCase):
     self.assertTrue(y.grad.uop.metadata[0].backward)
     si = Tensor.schedule(out, x.grad, y.grad)[-1]
     #self.assertEqual(len(si.metadata), 3, f"failed with {si.metadata}")
-    self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "relu"})
+    # ignore "numpy" entries: with the schedule cache, metadata is no longer promised to be exact
+    self.assertSetEqual(set(m.name for m in si.metadata if m.name != "numpy"), {"sigmoid", "relu"})
     #bw = [m for m in si.metadata if m.backward]
     #self.assertEqual(len(bw), 1)
     #self.assertEqual(bw[0].name, "sigmoid")
 
+  @unittest.skip("metadata is no longer promised to be exact with schedulecache")
   def test_tracemeta_0(self):
     with Context(TRACEMETA=0):
       x = Tensor.rand(3, requires_grad=True)
diff --git a/tinygrad/uop/ops.py b/tinygrad/uop/ops.py
index 83130beef0..9de708d93e 100644
--- a/tinygrad/uop/ops.py
+++ b/tinygrad/uop/ops.py
@@ -663,7 +663,7 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
       assert all_same([x.size for x in ret.bufs]) and all_same([x.dtype for x in ret.bufs]), "multibuffers mismatch buffers"
       return ret
     assert self.op is Ops.BUFFER, f"must be BUFFER {self.op}"
-    assert self.src[0].op is Ops.UNIQUE, "buffer src[0] must be UNIQUE"
+    assert self.src[0].op is Ops.UNIQUE, f"buffer src[0] must be UNIQUE, not {self.src[0].op}"
     if (cret:=buffers.get(self)) is not None: return cret
     rdtype = self.dtype if isinstance(self.dtype, ImageDType) else self.dtype.base
     if isinstance(self.device, tuple): ret = MultiBuffer(self.device, self.size, rdtype).ref(1)
@@ -672,8 +672,12 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
     return ret
   @property
   def realized(self) -> Buffer|MultiBuffer|None:
+    # only BUFFER and MSTACK can be realized
+    if self.op not in (Ops.BUFFER, Ops.MSTACK): return None
+    # anything with an LUNIQUE in its backward slice is never realized
+    if self.op_in_backward_slice_with_self(Ops.LUNIQUE): return None
     # NOTE: this is used by the JIT to determine which inputs we capture
-    return self.buffer if self.op in {Ops.BUFFER, Ops.MSTACK} and self.buffer.is_allocated() else None
+    return self.buffer if self.buffer.is_allocated() else None
   @property
   def is_realized(self) -> bool:
     return all(x.base.realized is not None for x in self.base.src) if self.base.op is Ops.MULTI else self.base.realized is not None