small fixups from schedule_cache (#13557)
.github/workflows/test.yml (vendored): 4 changed lines
@@ -71,9 +71,7 @@ jobs:
    - name: Test Docs Build
      run: python -m mkdocs build --strict
    - name: Test Docs
-      run: |
-        python docs/abstractions2.py
-        python docs/abstractions3.py
+      run: python docs/abstractions3.py
    - name: Test README
      run: awk '/```python/{flag=1;next}/```/{flag=0}flag' README.md > README.py && python README.py
    - name: Test Quickstart
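For reference, the Test README step's awk one-liner extracts the ```python fenced blocks from README.md into README.py and runs them. A rough Python equivalent (illustration only, not part of the workflow):

# collect the lines between ```python and ``` fences, then write them out as README.py
code, flag = [], False
for line in open("README.md"):
  if line.lstrip().startswith("```python"): flag = True; continue  # opening fence: start copying
  if line.lstrip().startswith("```"): flag = False; continue       # closing fence: stop copying
  if flag: code.append(line)
open("README.py", "w").writelines(code)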
docs/abstractions2.py (deleted, 135 lines)
@@ -1,135 +0,0 @@
# tinygrad is a tensor library, and as a tensor library it has multiple parts
# 1. a "runtime". this allows buffer management, compilation, and running programs
# 2. a "Device" that uses the runtime but specifies compute in an abstract way for all
# 3. a "UOp" that fuses the compute into kernels, using memory only when needed
# 4. a "Tensor" that provides an easy to use frontend with autograd ".backward()"


print("******** first, the runtime ***********")

from tinygrad.runtime.ops_cpu import ClangJITCompiler, CPUDevice, CPUProgram

cpu = CPUDevice()

# allocate some buffers
out = cpu.allocator.alloc(4)
a = cpu.allocator.alloc(4)
b = cpu.allocator.alloc(4)

# load in some values (little endian)
cpu.allocator._copyin(a, memoryview(bytearray([2,0,0,0])))
cpu.allocator._copyin(b, memoryview(bytearray([3,0,0,0])))

# compile a program to a binary
lib = ClangJITCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }")

# create a runtime for the program
fxn = cpu.runtime("add", lib)

# run the program
fxn(out, a, b)

# check the data out
print(val := cpu.allocator._as_buffer(out).cast("I").tolist()[0])
assert val == 5
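# aside (not in the original file): the hand-written little-endian bytes above are exactly
# what struct.pack produces on a little-endian machine, which is how the later sections
# write their values
import struct
assert bytes(bytearray([2,0,0,0])) == struct.pack("I", 2)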


print("******** second, the Device ***********")

DEVICE = "CPU" # NOTE: you can change this!

import struct
from tinygrad.dtype import dtypes
from tinygrad.device import Buffer, Device
from tinygrad.uop.ops import UOp, Ops

# allocate some buffers + load in values
out = Buffer(DEVICE, 1, dtypes.int32).allocate()
a = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 2))))
b = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 3))))
# NOTE: a._buf is the same as the return from cpu.allocator.alloc

# describe the computation
idx = UOp.const(dtypes.index, 0)
buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1)
buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2)
alu = buf_1.index(idx) + buf_2.index(idx)
output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0)
st_0 = UOp(Ops.STORE, dtypes.void, (output_buf.index(idx), alu))
s = UOp(Ops.SINK, dtypes.void, (st_0,))
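# aside (not in the original file): the DEFINE_GLOBAL arg is the kernel argument slot,
# so output_buf (arg 0) comes first and buf_1/buf_2 (args 1 and 2) follow; this matches
# the order of the buffers passed to fxn.exec([out, a, b]) below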

# convert the computation to a "linearized" format (print the format)
from tinygrad.engine.realize import get_program, CompiledRunner
program = get_program(s, Device[DEVICE].renderer)

# compile a program (and print the source)
fxn = CompiledRunner(program)
print(fxn.p.src)
# NOTE: fxn.clprg is the CPUProgram

# run the program
fxn.exec([out, a, b])

# check the data out
assert out.as_buffer().cast('I')[0] == 5
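# aside (not in the original file): instead of hard-coding DEVICE, tinygrad can pick a
# backend itself; Device.DEFAULT names it (e.g. "CPU", "METAL", "CUDA")
print("default device:", Device.DEFAULT)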


print("******** third, the UOp ***********")

from tinygrad.engine.realize import run_schedule
from tinygrad.engine.schedule import create_schedule_with_vars
from tinygrad.schedule.rangeify import get_rangeify_map

# allocate some values + load in values
a = UOp.new_buffer(DEVICE, 1, dtypes.int32)
b = UOp.new_buffer(DEVICE, 1, dtypes.int32)
a.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 2))))
b.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 3))))

# describe the computation
out = a + b
s = UOp(Ops.SINK, dtypes.void, (out,))

# group the computation into kernels
becomes_map = get_rangeify_map(s)

# the compute maps to an assign
assign = becomes_map[a+b].base

# the first source is the output buffer (data)
assert assign.src[0].op is Ops.BUFFER
# the second source is the kernel (compute)
assert assign.src[1].op is Ops.KERNEL

# schedule the kernel graph in a linear list
s = UOp(Ops.SINK, dtypes.void, (assign,))
sched, _ = create_schedule_with_vars(s)
assert len(sched) == 1

# DEBUGGING: print the compute ast
print(sched[-1].ast)
# NOTE: sched[-1].ast is the same as st_0 above
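# aside (not in the original file): each entry of sched is a ScheduleItem; besides .ast it
# also carries the concrete Buffers the kernel will read and write (here the output, a and b)
print(len(sched[-1].bufs))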

# the output will be stored in a new buffer
out = assign.buf_uop
assert out.op is Ops.BUFFER and not out.buffer.is_allocated()
print(out)

# run that schedule
run_schedule(sched)

# check the data out
assert out.is_realized and out.buffer.as_buffer().cast('I')[0] == 5


print("******** fourth, the Tensor ***********")

from tinygrad import Tensor

a = Tensor([2], dtype=dtypes.int32, device=DEVICE)
b = Tensor([3], dtype=dtypes.int32, device=DEVICE)
out = a + b

# check the data out
print(val:=out.item())
assert val == 5
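# aside (not in the original file): Tensor is lazy, so nothing ran until .item() forced it
# above; Tensor.schedule (also used in the test diff below) exposes the ScheduleItem list
c = a + b
print(len(c.schedule()))  # expected: a single kernel, since a and b are already realized here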
@@ -841,11 +841,13 @@ class TestTensorMetadata(unittest.TestCase):
    self.assertTrue(y.grad.uop.metadata[0].backward)
    si = Tensor.schedule(out, x.grad, y.grad)[-1]
    #self.assertEqual(len(si.metadata), 3, f"failed with {si.metadata}")
-    self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "relu"})
+    # skip numpy, this is schedule cache
+    self.assertSetEqual(set(m.name for m in si.metadata if m.name != "numpy"), {"sigmoid", "relu"})
    #bw = [m for m in si.metadata if m.backward]
    #self.assertEqual(len(bw), 1)
    #self.assertEqual(bw[0].name, "sigmoid")

+  @unittest.skip("metadata is no longer promised to be exact with schedulecache")
  def test_tracemeta_0(self):
    with Context(TRACEMETA=0):
      x = Tensor.rand(3, requires_grad=True)
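For context, Context (from tinygrad.helpers) temporarily overrides environment-backed settings such as TRACEMETA or DEBUG inside a with block. A minimal sketch of the mechanism (not part of the diff):

from tinygrad.helpers import Context, DEBUG
with Context(DEBUG=2):
  print(DEBUG.value)  # 2 inside the block
print(DEBUG.value)    # restored to the environment/default value outside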
@@ -663,7 +663,7 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
      assert all_same([x.size for x in ret.bufs]) and all_same([x.dtype for x in ret.bufs]), "multibuffers mismatch buffers"
      return ret
    assert self.op is Ops.BUFFER, f"must be BUFFER {self.op}"
-    assert self.src[0].op is Ops.UNIQUE, "buffer src[0] must be UNIQUE"
+    assert self.src[0].op is Ops.UNIQUE, f"buffer src[0] must be UNIQUE, not {self.src[0].op}"
    if (cret:=buffers.get(self)) is not None: return cret
    rdtype = self.dtype if isinstance(self.dtype, ImageDType) else self.dtype.base
    if isinstance(self.device, tuple): ret = MultiBuffer(self.device, self.size, rdtype).ref(1)
@@ -672,8 +672,12 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
    return ret
  @property
  def realized(self) -> Buffer|MultiBuffer|None:
+    # only these can be realized
+    if self.op not in (Ops.BUFFER, Ops.MSTACK): return None
+    # LUNIQUEs are never realized
+    if self.op_in_backward_slice_with_self(Ops.LUNIQUE): return None
    # NOTE: this is used by the JIT to determine which inputs we capture
-    return self.buffer if self.op in {Ops.BUFFER, Ops.MSTACK} and self.buffer.is_allocated() else None
+    return self.buffer if self.buffer.is_allocated() else None
  @property
  def is_realized(self) -> bool:
    return all(x.base.realized is not None for x in self.base.src) if self.base.op is Ops.MULTI else self.base.realized is not None
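To make the new realized logic concrete, here is a minimal sketch of how it behaves on a buffer UOp like the ones in abstractions2.py above (a sketch against the public API, not part of the diff):

from tinygrad.dtype import dtypes
from tinygrad.uop.ops import UOp, Ops

u = UOp.new_buffer("CPU", 1, dtypes.int32)
assert u.op is Ops.BUFFER
assert u.realized is None       # a BUFFER op, but its Buffer is not allocated yet
u.buffer.allocate()
assert u.realized is not None   # now allocated, so realized returns the Buffer
assert u.is_realized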