From 24ca8eeaa76b020212e86c7f0da307d41d453704 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Wed, 3 Dec 2025 15:41:16 -0800 Subject: [PATCH] small fixups from schedule_cache (#13557) --- .github/workflows/test.yml | 4 +- docs/abstractions2.py | 135 ------------------------------------- test/test_tensor.py | 4 +- tinygrad/uop/ops.py | 8 ++- 4 files changed, 10 insertions(+), 141 deletions(-) delete mode 100644 docs/abstractions2.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5b46d598c1..4b7c2a7d4e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -71,9 +71,7 @@ jobs: - name: Test Docs Build run: python -m mkdocs build --strict - name: Test Docs - run: | - python docs/abstractions2.py - python docs/abstractions3.py + run: python docs/abstractions3.py - name: Test README run: awk '/```python/{flag=1;next}/```/{flag=0}flag' README.md > README.py && python README.py - name: Test Quickstart diff --git a/docs/abstractions2.py b/docs/abstractions2.py deleted file mode 100644 index c1d13a86cf..0000000000 --- a/docs/abstractions2.py +++ /dev/null @@ -1,135 +0,0 @@ -# tinygrad is a tensor library, and as a tensor library it has multiple parts -# 1. a "runtime". this allows buffer management, compilation, and running programs -# 2. a "Device" that uses the runtime but specifies compute in an abstract way for all -# 3. a "UOp" that fuses the compute into kernels, using memory only when needed -# 4. a "Tensor" that provides an easy to use frontend with autograd ".backward()" - - -print("******** first, the runtime ***********") - -from tinygrad.runtime.ops_cpu import ClangJITCompiler, CPUDevice, CPUProgram - -cpu = CPUDevice() - -# allocate some buffers -out = cpu.allocator.alloc(4) -a = cpu.allocator.alloc(4) -b = cpu.allocator.alloc(4) - -# load in some values (little endian) -cpu.allocator._copyin(a, memoryview(bytearray([2,0,0,0]))) -cpu.allocator._copyin(b, memoryview(bytearray([3,0,0,0]))) - -# compile a program to a binary -lib = ClangJITCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }") - -# create a runtime for the program -fxn = cpu.runtime("add", lib) - -# run the program -fxn(out, a, b) - -# check the data out -print(val := cpu.allocator._as_buffer(out).cast("I").tolist()[0]) -assert val == 5 - - -print("******** second, the Device ***********") - -DEVICE = "CPU" # NOTE: you can change this! - -import struct -from tinygrad.dtype import dtypes -from tinygrad.device import Buffer, Device -from tinygrad.uop.ops import UOp, Ops - -# allocate some buffers + load in values -out = Buffer(DEVICE, 1, dtypes.int32).allocate() -a = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 2)))) -b = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 3)))) -# NOTE: a._buf is the same as the return from cpu.allocator.alloc - -# describe the computation -idx = UOp.const(dtypes.index, 0) -buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1) -buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2) -alu = buf_1.index(idx) + buf_2.index(idx) -output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0) -st_0 = UOp(Ops.STORE, dtypes.void, (output_buf.index(idx), alu)) -s = UOp(Ops.SINK, dtypes.void, (st_0,)) - -# convert the computation to a "linearized" format (print the format) -from tinygrad.engine.realize import get_program, CompiledRunner -program = get_program(s, Device[DEVICE].renderer) - -# compile a program (and print the source) -fxn = CompiledRunner(program) -print(fxn.p.src) -# NOTE: fxn.clprg is the CPUProgram - -# run the program -fxn.exec([out, a, b]) - -# check the data out -assert out.as_buffer().cast('I')[0] == 5 - - -print("******** third, the UOp ***********") - -from tinygrad.engine.realize import run_schedule -from tinygrad.engine.schedule import create_schedule_with_vars -from tinygrad.schedule.rangeify import get_rangeify_map - -# allocate some values + load in values -a = UOp.new_buffer(DEVICE, 1, dtypes.int32) -b = UOp.new_buffer(DEVICE, 1, dtypes.int32) -a.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 2)))) -b.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 3)))) - -# describe the computation -out = a + b -s = UOp(Ops.SINK, dtypes.void, (out,)) - -# group the computation into kernels -becomes_map = get_rangeify_map(s) - -# the compute maps to an assign -assign = becomes_map[a+b].base - -# the first source is the output buffer (data) -assert assign.src[0].op is Ops.BUFFER -# the second source is the kernel (compute) -assert assign.src[1].op is Ops.KERNEL - -# schedule the kernel graph in a linear list -s = UOp(Ops.SINK, dtypes.void, (assign,)) -sched, _ = create_schedule_with_vars(s) -assert len(sched) == 1 - -# DEBUGGING: print the compute ast -print(sched[-1].ast) -# NOTE: sched[-1].ast is the same as st_0 above - -# the output will be stored in a new buffer -out = assign.buf_uop -assert out.op is Ops.BUFFER and not out.buffer.is_allocated() -print(out) - -# run that schedule -run_schedule(sched) - -# check the data out -assert out.is_realized and out.buffer.as_buffer().cast('I')[0] == 5 - - -print("******** fourth, the Tensor ***********") - -from tinygrad import Tensor - -a = Tensor([2], dtype=dtypes.int32, device=DEVICE) -b = Tensor([3], dtype=dtypes.int32, device=DEVICE) -out = a + b - -# check the data out -print(val:=out.item()) -assert val == 5 diff --git a/test/test_tensor.py b/test/test_tensor.py index a053549781..4fe776934a 100644 --- a/test/test_tensor.py +++ b/test/test_tensor.py @@ -841,11 +841,13 @@ class TestTensorMetadata(unittest.TestCase): self.assertTrue(y.grad.uop.metadata[0].backward) si = Tensor.schedule(out, x.grad, y.grad)[-1] #self.assertEqual(len(si.metadata), 3, f"failed with {si.metadata}") - self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "relu"}) + # skip numpy, this is schedule cache + self.assertSetEqual(set(m.name for m in si.metadata if m.name != "numpy"), {"sigmoid", "relu"}) #bw = [m for m in si.metadata if m.backward] #self.assertEqual(len(bw), 1) #self.assertEqual(bw[0].name, "sigmoid") + @unittest.skip("metadata is no longer promised to be exact with schedulecache") def test_tracemeta_0(self): with Context(TRACEMETA=0): x = Tensor.rand(3, requires_grad=True) diff --git a/tinygrad/uop/ops.py b/tinygrad/uop/ops.py index 83130beef0..9de708d93e 100644 --- a/tinygrad/uop/ops.py +++ b/tinygrad/uop/ops.py @@ -663,7 +663,7 @@ class UOp(OpMixin, metaclass=UOpMetaClass): assert all_same([x.size for x in ret.bufs]) and all_same([x.dtype for x in ret.bufs]), "multibuffers mismatch buffers" return ret assert self.op is Ops.BUFFER, f"must be BUFFER {self.op}" - assert self.src[0].op is Ops.UNIQUE, "buffer src[0] must be UNIQUE" + assert self.src[0].op is Ops.UNIQUE, f"buffer src[0] must be UNIQUE, not {self.src[0].op}" if (cret:=buffers.get(self)) is not None: return cret rdtype = self.dtype if isinstance(self.dtype, ImageDType) else self.dtype.base if isinstance(self.device, tuple): ret = MultiBuffer(self.device, self.size, rdtype).ref(1) @@ -672,8 +672,12 @@ class UOp(OpMixin, metaclass=UOpMetaClass): return ret @property def realized(self) -> Buffer|MultiBuffer|None: + # only these can be realized + if self.op not in (Ops.BUFFER, Ops.MSTACK): return None + # LUNIQUEs are never realized + if self.op_in_backward_slice_with_self(Ops.LUNIQUE): return None # NOTE: this is used by the JIT to determine which inputs we capture - return self.buffer if self.op in {Ops.BUFFER, Ops.MSTACK} and self.buffer.is_allocated() else None + return self.buffer if self.buffer.is_allocated() else None @property def is_realized(self) -> bool: return all(x.base.realized is not None for x in self.base.src) if self.base.op is Ops.MULTI else self.base.realized is not None