From 0bee225a5809eb7d1e69ac276e07c0844f773269 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Mon, 21 Apr 2025 11:34:03 +0300
Subject: [PATCH] Tensor.kernelize docs (#9946)

* Tensor.kernelize docs

* syntax

* test_kernelize_bw

* Tensor.kernelize docstring

* pruning

* tiny details

* details 2

* becomes_map terminology

* more changes to becomes
---
 docs/developer/kernelize.md | 109 ++++++++++++++++++++++++++++++++++++
 mkdocs.yml                  |   2 +
 tinygrad/tensor.py          |   5 ++
 3 files changed, 116 insertions(+)
 create mode 100644 docs/developer/kernelize.md

diff --git a/docs/developer/kernelize.md b/docs/developer/kernelize.md
new file mode 100644
index 0000000000..9731464504
--- /dev/null
+++ b/docs/developer/kernelize.md
@@ -0,0 +1,109 @@
# Kernel Creation

Tinygrad lazily builds up a graph of Tensor operations. The Tensor graph includes a mix of:

- Buffer and Assignment Ops: `BUFFER`, `BUFFER_VIEW`, `COPY`, `ASSIGN`
- Movement Ops: `RESHAPE`, `EXPAND`, `PERMUTE`, `PAD`, `SHRINK`, `FLIP`
- Compute Ops: `ADD`, `MUL`, `REDUCE_AXIS`, ...

`Tensor.kernelize` creates the kernels and buffers needed to realize the output Tensor(s).

## Kernelize flow

Let's see how a multiply-add Tensor graph becomes a fused elementwise kernel.

```py
from tinygrad import Tensor

# initialize 3 input buffers on the device
a = Tensor([1]).realize()
b = Tensor([2]).realize()
c = Tensor([3]).realize()

# create the Tensor graph
mul = a*b
out = mul+c

print(mul) # <Tensor <UOp METAL (1,) int (Ops.MUL, None)> on METAL with grad None>
print(out) # <Tensor <UOp METAL (1,) int (Ops.ADD, None)> on METAL with grad None>

out.kernelize()

print(mul) # <Tensor <UOp METAL (1,) int (Ops.MUL, None)> on METAL with grad None>
print(out) # <Tensor <UOp METAL (1,) int (Ops.ASSIGN, None)> on METAL with grad None>
```

The multiply Tensor stays the same because it is fused. The output Tensor's UOp becomes a new ASSIGN UOp:

```py
print(out.lazydata)
```

The first source is the output BUFFER:

```
UOp(Ops.BUFFER, dtypes.int, arg=1, src=(
  UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),
  UOp(Ops.UNIQUE, dtypes.void, arg=6, src=()),))
```

And the second source is the KERNEL and its 4 buffer edges (output_buffer, a, b, c):

```
UOp(Ops.KERNEL, dtypes.void, arg=<Kernel ... SINK(Ops.STORE,) (__add__, __mul__)>, src=(
  UOp(Ops.BUFFER, dtypes.int, arg=1, src=(
    x1:=UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),
    UOp(Ops.UNIQUE, dtypes.void, arg=6, src=()),)),
  UOp(Ops.BUFFER, dtypes.int, arg=1, src=(
    x1,
    UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),)),
  UOp(Ops.BUFFER, dtypes.int, arg=1, src=(
    x1,
    UOp(Ops.UNIQUE, dtypes.void, arg=3, src=()),)),
  UOp(Ops.BUFFER, dtypes.int, arg=1, src=(
    x1,
    UOp(Ops.UNIQUE, dtypes.void, arg=5, src=()),)),))
```

KERNEL describes the compute AST, metadata and memory dependencies.

BUFFER holds a reference to the device memory where the output will be stored.
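
For a closer look, the two sources of that ASSIGN can be pulled apart directly. This is a minimal sketch continuing the session above; it relies only on the `lazydata`, `src`, `op` and `arg.ast` fields that appear in the dumps:

```py
assign = out.lazydata        # the ASSIGN UOp created by kernelize
target, kernel = assign.src  # first source: output BUFFER, second source: KERNEL
print(target.op)             # Ops.BUFFER
print(kernel.op)             # Ops.KERNEL
print(kernel.arg.ast)        # the fused SINK/STORE AST that computes (a*b)+c
```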

Once a Tensor is kernelized, all children will LOAD its BUFFER instead of fusing it:

```py
child = out+2
child.kernelize()
print(child.lazydata.src[1].arg.ast)
```

```
UOp(Ops.SINK, dtypes.void, arg=None, src=(
  UOp(Ops.STORE, dtypes.void, arg=None, src=(
    UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1), arg=0, src=()),
    x2:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()),
    UOp(Ops.ADD, dtypes.int, arg=None, src=(
      UOp(Ops.LOAD, dtypes.int, arg=None, src=(
        UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1), arg=1, src=()),
        x2,)),
      UOp(Ops.CONST, dtypes.int, arg=2, src=(
        x2,)),)),)),))
```

`Tensor.realize` will execute the kernels and write outputs to memory:

```py
Tensor.realize(out)
print(out) # <Tensor <UOp METAL (1,) int (Ops.BUFFER, <buf ...>)> on METAL with grad None>
print(out.item()) # 5
```
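
Because `kernelize` accepts any number of output Tensors and can be called again as the graph grows, the fusion layout of a larger graph can be staged incrementally before anything runs. A rough sketch of that pattern (hypothetical Tensors `x`, `y`, `z` built from the same inputs; it assumes only the `Tensor.kernelize(*lst)` signature shown in the patch below):

```py
x = a + b
y = a * c
Tensor.kernelize(x, y)  # kernels and buffers for both outputs, same as x.kernelize(y)

# keep building on top of the kernelized graph, then kernelize again
z = x + y
z.kernelize()

# nothing has executed yet; realize runs the accumulated kernels
z.realize()
print(z.item())  # 6
```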

**Summary**

- The large Tensor graph is built from a mix of data, compute and movement Ops.

- `Tensor.kernelize` splits the Tensor graph into data (BUFFER) and compute (KERNEL), and links dependencies with ASSIGN.

- `Tensor.realize` executes KERNELs on device and replaces the Tensor graph with just a BUFFER.

- Kernelize can be called multiple times on a Tensor. This allows incrementally building the kernel fusion layout of a large Tensor graph without having to call `realize` or `schedule`.

diff --git a/mkdocs.yml b/mkdocs.yml
index a09a4b47fc..d06f81c201 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -24,6 +24,8 @@ nav:
     - Intro: developer/developer.md
     - Speed: developer/speed.md
     - UOp: developer/uop.md
+    - Grouper:
+      - developer/kernelize.md
     - Runtime:
       - developer/runtime.md
       - HCQ: developer/hcq.md
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index c377af2abc..ac6af3e7f9 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -225,6 +225,11 @@ class Tensor(SimpleMathTrait):
   # ***** data handlers ****
 
   def kernelize(self, *lst:Tensor) -> Tensor:
+    """
+    Creates the kernels and buffers needed to realize these Tensor(s).
+
+    NOTE: Kernelize can be called multiple times on a Tensor
+    """
     big_sink = UOp.sink(*[x.lazydata for x in (self,)+lst])
 
     # TODO: move this to scheduler tensor_map pass