From 0bee225a5809eb7d1e69ac276e07c0844f773269 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Mon, 21 Apr 2025 11:34:03 +0300
Subject: [PATCH] Tensor.kernelize docs (#9946)

* Tensor.kernelize docs

* syntax

* test_kernelize_bw

* Tensor.kernelize docstring

* pruning

* tiny details

* details 2

* becomes_map terminology

* more changes to becomes
---
 docs/developer/kernelize.md | 109 ++++++++++++++++++++++++++++++++++++
 mkdocs.yml                  |   2 +
 tinygrad/tensor.py          |   5 ++
 3 files changed, 116 insertions(+)
 create mode 100644 docs/developer/kernelize.md

diff --git a/docs/developer/kernelize.md b/docs/developer/kernelize.md
new file mode 100644
index 0000000000..9731464504
--- /dev/null
+++ b/docs/developer/kernelize.md
@@ -0,0 +1,109 @@
# Kernel Creation

Tinygrad lazily builds up a graph of Tensor operations. The Tensor graph includes a mix of:

- Buffer and Assignment Ops: `BUFFER`, `BUFFER_VIEW`, `COPY`, `ASSIGN`
- Movement Ops: `RESHAPE`, `EXPAND`, `PERMUTE`, `PAD`, `SHRINK`, `FLIP`
- Compute Ops: `ADD`, `MUL`, `REDUCE_AXIS`, ...

`Tensor.kernelize` creates the kernels and buffers needed to realize the output Tensor(s).

## Kernelize flow

Let's see how a multiply-add Tensor graph becomes a fused elementwise kernel.

```py
from tinygrad import Tensor

# initialize 3 input buffers on the device
a = Tensor([1]).realize()
b = Tensor([2]).realize()
c = Tensor([3]).realize()

# create the Tensor graph
mul = a*b
out = mul+c

print(mul) # <Tensor <UOp METAL (1,) int (Ops.MUL, None)> on METAL with grad None>
print(out) # <Tensor <UOp METAL (1,) int (Ops.ADD, None)> on METAL with grad None>

out.kernelize()

print(mul) # <Tensor <UOp METAL (1,) int (Ops.MUL, None)> on METAL with grad None>
print(out) # <Tensor <UOp METAL (1,) int (Ops.ASSIGN, None)> on METAL with grad None>
```

The multiply Tensor stays the same because it is fused. The output Tensor's UOp becomes a new ASSIGN UOp:

```py
print(out.lazydata)
```

The first source is the output BUFFER:

```
UOp(Ops.BUFFER, dtypes.int, arg=1, src=(
  UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),
  UOp(Ops.UNIQUE, dtypes.void, arg=6, src=()),))
```

And the second source is the KERNEL and its 4 buffer edges (output_buffer, a, b, c):

```
UOp(Ops.KERNEL, dtypes.void, arg=<Kernel ... SINK(Ops.STORE,) (__add__, __mul__)>, src=(
  UOp(Ops.BUFFER, dtypes.int, arg=1, src=(
    x1:=UOp(Ops.DEVICE, dtypes.void, arg='METAL', src=()),
    UOp(Ops.UNIQUE, dtypes.void, arg=6, src=()),)),
  UOp(Ops.BUFFER, dtypes.int, arg=1, src=(
    x1,
    UOp(Ops.UNIQUE, dtypes.void, arg=1, src=()),)),
  UOp(Ops.BUFFER, dtypes.int, arg=1, src=(
    x1,
    UOp(Ops.UNIQUE, dtypes.void, arg=3, src=()),)),
  UOp(Ops.BUFFER, dtypes.int, arg=1, src=(
    x1,
    UOp(Ops.UNIQUE, dtypes.void, arg=5, src=()),)),))
```

KERNEL describes the compute AST, metadata and memory dependencies.

BUFFER holds a reference to the device memory where the output will be stored.
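
For a closer look, the two sources of that ASSIGN can be pulled apart directly. This is a minimal sketch continuing the session above; it relies only on the `lazydata`, `src`, `op` and `arg.ast` fields that appear in the dumps:

```py
assign = out.lazydata        # the ASSIGN UOp created by kernelize
target, kernel = assign.src  # first source: output BUFFER, second source: KERNEL
print(target.op)             # Ops.BUFFER
print(kernel.op)             # Ops.KERNEL
print(kernel.arg.ast)        # the fused SINK/STORE AST that computes (a*b)+c
```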

Once a Tensor is kernelized, all children will LOAD its BUFFER instead of fusing it:

```py
child = out+2
child.kernelize()
print(child.lazydata.src[1].arg.ast)
```

```
UOp(Ops.SINK, dtypes.void, arg=None, src=(
  UOp(Ops.STORE, dtypes.void, arg=None, src=(
    UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1), arg=0, src=()),
    x2:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()),
    UOp(Ops.ADD, dtypes.int, arg=None, src=(
      UOp(Ops.LOAD, dtypes.int, arg=None, src=(
        UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1), arg=1, src=()),
        x2,)),
      UOp(Ops.CONST, dtypes.int, arg=2, src=(
        x2,)),)),)),))
```

`Tensor.realize` will execute the kernels and write outputs to memory:

```py
Tensor.realize(out)
print(out) # <Tensor <UOp METAL (1,) int (Ops.BUFFER, <buf ...>)> on METAL with grad None>
print(out.item()) # 5
```
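
Because `kernelize` accepts any number of output Tensors and can be called again as the graph grows, the fusion layout of a larger graph can be staged incrementally before anything runs. A rough sketch of that pattern (hypothetical Tensors `x`, `y`, `z` built from the same inputs; it assumes only the `Tensor.kernelize(*lst)` signature shown in the patch below):

```py
x = a + b
y = a * c
Tensor.kernelize(x, y)  # kernels and buffers for both outputs, same as x.kernelize(y)

# keep building on top of the kernelized graph, then kernelize again
z = x + y
z.kernelize()

# nothing has executed yet; realize runs the accumulated kernels
z.realize()
print(z.item())  # 6
```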

**Summary**

- The large Tensor graph is built from a mix of data, compute and movement Ops.

- `Tensor.kernelize` splits the Tensor graph into data (BUFFER) and compute (KERNEL), and links dependencies with ASSIGN.

- `Tensor.realize` executes KERNELs on device and replaces the Tensor graph with just a BUFFER.

- Kernelize can be called multiple times on a Tensor. This allows incrementally building the kernel fusion layout of a large Tensor graph without having to call `realize` or `schedule`.

diff --git a/mkdocs.yml b/mkdocs.yml
index a09a4b47fc..d06f81c201 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -24,6 +24,8 @@ nav:
     - Intro: developer/developer.md
     - Speed: developer/speed.md
     - UOp: developer/uop.md
+    - Grouper:
+      - developer/kernelize.md
     - Runtime:
       - developer/runtime.md
       - HCQ: developer/hcq.md
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index c377af2abc..ac6af3e7f9 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -225,6 +225,11 @@ class Tensor(SimpleMathTrait):
   # ***** data handlers ****
 
   def kernelize(self, *lst:Tensor) -> Tensor:
+    """
+    Creates the kernels and buffers needed to realize these Tensor(s).
+
+    NOTE: Kernelize can be called multiple times on a Tensor
+    """
     big_sink = UOp.sink(*[x.lazydata for x in (self,)+lst])
 
     # TODO: move this to scheduler tensor_map pass