Mirror of https://github.com/tinygrad/tinygrad.git
@@ -39,8 +39,7 @@ optim._step() # this will step the optimizer without running realize
 
 # l1.lazydata and l2.lazydata define a computation graph
 from tinygrad.ops import ScheduleItem
-from tinygrad.engine.schedule import create_schedule
-schedule: List[ScheduleItem] = create_schedule([l1.lazydata, l2.lazydata])
+schedule: List[ScheduleItem] = Tensor.schedule(l1, l2)
 
 print(f"The schedule contains {len(schedule)} items.")
 for si in schedule: print(str(si)[:80])
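This hunk is from the developer docs' scheduling example: `create_schedule` on `lazydata` gives way to `Tensor.schedule`. For readers following along, here is a self-contained sketch of the updated example; the hunk does not show how `l1` and `l2` are built, so the two tensors below are illustrative stand-ins:

```python
from typing import List
from tinygrad import Tensor
from tinygrad.ops import ScheduleItem

# illustrative stand-ins for the doc's l1 and l2 (defined earlier in that file)
l1 = Tensor.rand(4, 4)
l2 = (l1 * 2).sum()

# the new API: schedule Tensors directly instead of calling
# create_schedule([l1.lazydata, l2.lazydata])
schedule: List[ScheduleItem] = Tensor.schedule(l1, l2)

print(f"The schedule contains {len(schedule)} items.")
for si in schedule: print(str(si)[:80])
```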
@@ -1,21 +1,40 @@
 The tinygrad framework has four pieces
 
 * a PyTorch-like <b>frontend</b>.
 * a <b>scheduler</b> which breaks the compute into kernels.
 * a <b>lowering</b> engine which converts ASTs into code that can run on the accelerator.
 * an <b>execution</b> engine which can run that code.
 
 ## Frontend
 
-Everything in [Tensor](tensor.md) is syntactic sugar around [function.py](function.md), where the forwards and backwards passes are implemented for the different ops. That goes on to construct a graph of
+Everything in [Tensor](tensor.md) is syntactic sugar around [function.py](function.md), where the forwards and backwards passes are implemented for the different mlops. There are about 25 of them, implemented using about 20 basic ops. Those basic ops go on to construct a graph of:
 
 ::: tinygrad.lazy.LazyBuffer
     options:
         show_source: false
 
-## Lowering
+The `LazyBuffer` graph specifies the compute in terms of low level tinygrad ops. Not all LazyBuffers will actually become realized. There are two types of LazyBuffer, base and view: a base holds compute that writes into a contiguous buffer, while a view is a view of a base (specified by a ShapeTracker). Inputs to a base can be either base or view; inputs to a view can only be a single base.
+
+## Scheduling
 
-The [scheduler](/tinygrad/engine/schedule.py) converts the graph of LazyBuffers into a list of `ScheduleItem`. `ast` specifies what compute to run, and `bufs` specifies what buffers to run it on.
+The [scheduler](/tinygrad/engine/schedule.py) converts the graph of LazyBuffers into a list of `ScheduleItem`. One `ScheduleItem` is one kernel on the GPU, and the scheduler is responsible for breaking the large compute graph into subgraphs that can fit in a kernel. `ast` specifies what compute to run, and `bufs` specifies what buffers to run it on.
+
+::: tinygrad.ops.ScheduleItem
+
+## Lowering
+
+The code in [realize](/tinygrad/engine/realize.py) lowers `ScheduleItem` to `ExecItem` with:
+
+::: tinygrad.engine.realize.lower_schedule
+
+There's a ton of complexity hidden behind this; see the `codegen/` directory.
+
+First we lower the AST to UOps, which is a linear list of the compute to be run. This is where the BEAM search happens. The UOps can be changed by `CompilerOptions`.
+
+::: tinygrad.device.CompilerOptions
+
+Then we render the UOps into code, and compile that code to binary.
 
 ## Execution
 
+Creating `ExecItem`, which has a run method
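Read end to end, the rewritten page (evidently developer.md) describes a pipeline you can drive by hand: the frontend builds LazyBuffers, the scheduler turns them into `ScheduleItem`s, lowering turns those into `ExecItem`s, and execution runs them. A minimal sketch, assuming (consistently with this version of the code) that `lower_schedule` yields `ExecItem`s with a `run` method; the tensor is illustrative:

```python
from tinygrad import Tensor
from tinygrad.engine.realize import lower_schedule

# frontend: build a lazy compute graph
out = (Tensor.rand(8, 8) + 1).sum()

# scheduler: split the graph into kernel-sized ScheduleItems
schedule = Tensor.schedule(out)

# lowering + execution: ScheduleItem -> ExecItem, then run each kernel
for ei in lower_schedule(schedule):
  ei.run()
```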
@@ -1,6 +1,4 @@
-Welcome to the docs for tinygrad. This page is for users of the tinygrad library. We also have [developer docs](developer.md)
-
-tinygrad is not 1.0 yet, but it will be soon. The API has been pretty stable for a while.
+Welcome to the docs for tinygrad. This page is for users of the tinygrad library. tinygrad is not 1.0 yet, but it will be soon. The API has been pretty stable for a while.
 
 While you can `pip install tinygrad`, we encourage you to install from source:
 
@@ -10,6 +8,8 @@ cd tinygrad
 python3 -m pip install -e .
 ```
 
+We also have [developer docs](developer.md), and Di Zhu has created a [bunch of tutorials](https://mesozoic-egg.github.io/tinygrad-notes/) to help understand how tinygrad works.
+
 ## tinygrad Usage
 
 The main class you will interact with is [Tensor](tensor.md). It functions very similarly to PyTorch, but has a bit more of a functional style. tinygrad supports [many datatypes](dtypes.md). All operations in tinygrad are lazy, meaning they won't do anything until you realize.
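The laziness described in that last context line is easy to demonstrate. A small example (not from the docs page itself; the output is shown in the comment):

```python
from tinygrad import Tensor

a = Tensor([1.0, 2.0, 3.0])
b = a * 2 + 1     # lazy: this only records the computation
print(b.numpy())  # realization happens here -> [3. 5. 7.]
```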
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from collections import defaultdict
-from typing import TYPE_CHECKING, Any, List, Optional, Dict, Tuple, ClassVar, NamedTuple, cast
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, List, Optional, Dict, Tuple, ClassVar, cast
 import importlib, inspect, functools, pathlib, time, ctypes, os
 from tinygrad.helpers import prod, getenv, colored, all_int, to_function_name, from_mv, flat_mv, diskcache_get, diskcache_put, DEBUG, BEAM, NOOPT
 from tinygrad.shape.symbolic import Variable, sym_infer, sint
@@ -114,7 +115,8 @@ MallocAllocator = _MallocAllocator()
 
 # **************** for Compiled Devices ****************
 
-class CompilerOptions(NamedTuple):
+@dataclass(frozen=True)
+class CompilerOptions:
   device: str = ""
   suffix: str = ""
   # TODO: make this generic with a list of supported types
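This change (apparently in tinygrad/device.py, given the `tinygrad.device.CompilerOptions` reference above) swaps a `NamedTuple` for a frozen dataclass, so functional updates move from the `_replace` method to `dataclasses.replace`, which is exactly what the runtime hunks below do. A minimal standalone sketch of the idiom, trimmed to the two fields shown in the hunk:

```python
from dataclasses import dataclass, replace

@dataclass(frozen=True)
class CompilerOptions:
  device: str = ""
  suffix: str = ""

opts = CompilerOptions(device="CUDA")
# frozen dataclasses are immutable: replace() constructs a new instance
new_opts = replace(opts, suffix="ptx")
print(opts)      # CompilerOptions(device='CUDA', suffix='')
print(new_opts)  # CompilerOptions(device='CUDA', suffix='ptx')
```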
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import subprocess, hashlib, tempfile, ctypes, ctypes.util, functools, re
 from pathlib import Path
+from dataclasses import replace
 from typing import Tuple, Optional, List
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
@@ -57,7 +58,7 @@ class PTXCompiler(Compiler):
   def __init__(self, arch:str):
     self.arch = arch
     self.version = "7.8" if arch >= "sm_89" else "7.5"
-    PTXCompiler.compiler_opts = PTXCompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    PTXCompiler.compiler_opts = replace(PTXCompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)
     super().__init__(f"compile_ptx_{self.arch}")
   def render(self, name:str, uops) -> str: return PTXRenderer(name, uops).replace("TARGET", self.arch).replace("VERSION", self.version)
   def compile(self, src:str) -> bytes: return src.encode()
@@ -66,7 +67,7 @@ class CUDACompiler(Compiler):
   compiler_opts = CompilerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
-    CUDACompiler.compiler_opts = CUDACompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    CUDACompiler.compiler_opts = replace(CUDACompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)
     check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
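Aside from the `_replace` to `replace` migration, the context lines of this CUDA runtime hunk show a compact NVRTC version probe: walrus assignments create the ctypes out-parameters inline in the call, and a tuple comparison gates the `--minimal` flag. A standalone sketch using a stand-in for `cuda.nvrtcVersion` (the version numbers below are made up):

```python
import ctypes

def fake_nvrtc_version(major: ctypes.c_int, minor: ctypes.c_int) -> int:
  # stand-in for cuda.nvrtcVersion: fills the two c_int out-parameters
  major.value, minor.value = 12, 4
  return 0

# walrus assignments declare the out-parameters inline in the call
fake_nvrtc_version((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int()))

compile_options = ["--gpu-architecture=sm_89"]
# tuple comparison: (12, 4) >= (12, 4) passes, (12, 3) would not
if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): compile_options.append("--minimal")
print(compile_options)  # ['--gpu-architecture=sm_89', '--minimal']
```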
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import os, ctypes, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time
 from typing import Tuple, List, Any
+from dataclasses import replace
 from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions
 from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG
 from tinygrad.renderer.cstyle import CUDARenderer
@@ -69,7 +70,7 @@ class NVCompiler(Compiler):
   compiler_opts = CompilerOptions("NV", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
-    NVCompiler.compiler_opts = NVCompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    NVCompiler.compiler_opts = replace(NVCompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)
     cuda_check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
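One behavior all three compiler classes (PTX, CUDA, NV) share across these hunks: `compiler_opts` is a class attribute, and `__init__` rebinds it on the class itself, so the tensor-core flag computed for the most recently constructed instance is what every instance sees. A toy illustration of that pattern (class names are illustrative, not from the repo):

```python
from dataclasses import dataclass, replace

@dataclass(frozen=True)
class Opts:
  has_tensor_cores: bool = False

class FakeCompiler:
  compiler_opts = Opts()
  def __init__(self, arch: str):
    # rebinds the class attribute, not an instance attribute
    FakeCompiler.compiler_opts = replace(FakeCompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)

FakeCompiler("sm_89")
FakeCompiler("sm_75")
print(FakeCompiler.compiler_opts.has_tensor_cores)  # False: the last construction wins
```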