Mirror of https://github.com/tinygrad/tinygrad.git
@@ -39,8 +39,7 @@ optim._step() # this will step the optimizer without running realize
 
 # l1.lazydata and l2.lazydata define a computation graph
 from tinygrad.ops import ScheduleItem
-from tinygrad.engine.schedule import create_schedule
-schedule: List[ScheduleItem] = create_schedule([l1.lazydata, l2.lazydata])
+schedule: List[ScheduleItem] = Tensor.schedule(l1, l2)
 
 print(f"The schedule contains {len(schedule)} items.")
 for si in schedule: print(str(si)[:80])
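This hunk is from the developer docs' scheduling example: `create_schedule` on `lazydata` gives way to `Tensor.schedule`. For readers following along, here is a self-contained sketch of the updated example; the hunk does not show how `l1` and `l2` are built, so the two tensors below are illustrative stand-ins:

```python
from typing import List
from tinygrad import Tensor
from tinygrad.ops import ScheduleItem

# illustrative stand-ins for the doc's l1 and l2 (defined earlier in that file)
l1 = Tensor.rand(4, 4)
l2 = (l1 * 2).sum()

# the new API: schedule Tensors directly instead of calling
# create_schedule([l1.lazydata, l2.lazydata])
schedule: List[ScheduleItem] = Tensor.schedule(l1, l2)

print(f"The schedule contains {len(schedule)} items.")
for si in schedule: print(str(si)[:80])
```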
@@ -1,21 +1,40 @@
 The tinygrad framework has four pieces
 
 * a PyTorch-like <b>frontend</b>.
 * a <b>scheduler</b> which breaks the compute into kernels.
 * a <b>lowering</b> engine which converts ASTs into code that can run on the accelerator.
 * an <b>execution</b> engine which can run that code.
 
 ## Frontend
 
-Everything in [Tensor](tensor.md) is syntactic sugar around [function.py](function.md), where the forwards and backwards passes are implemented for the different ops. That goes on to construct a graph of
+Everything in [Tensor](tensor.md) is syntactic sugar around [function.py](function.md), where the forwards and backwards passes are implemented for the different mlops. There are about 25 of them, implemented using about 20 basic ops. Those basic ops go on to construct a graph of:
 
 ::: tinygrad.lazy.LazyBuffer
     options:
         show_source: false
 
-## Lowering
+The `LazyBuffer` graph specifies the compute in terms of low level tinygrad ops. Not all LazyBuffers will actually become realized. There are two types of LazyBuffer, base and view: a base holds compute that writes into a contiguous buffer, while a view is a view of a base (specified by a ShapeTracker). Inputs to a base can be either base or view; inputs to a view can only be a single base.
+
+## Scheduling
 
-The [scheduler](/tinygrad/engine/schedule.py) converts the graph of LazyBuffers into a list of `ScheduleItem`. `ast` specifies what compute to run, and `bufs` specifies what buffers to run it on.
+The [scheduler](/tinygrad/engine/schedule.py) converts the graph of LazyBuffers into a list of `ScheduleItem`. One `ScheduleItem` is one kernel on the GPU, and the scheduler is responsible for breaking the large compute graph into subgraphs that can fit in a kernel. `ast` specifies what compute to run, and `bufs` specifies what buffers to run it on.
+
+::: tinygrad.ops.ScheduleItem
+
+## Lowering
+
+The code in [realize](/tinygrad/engine/realize.py) lowers `ScheduleItem` to `ExecItem` with:
+
+::: tinygrad.engine.realize.lower_schedule
+
+There's a ton of complexity hidden behind this; see the `codegen/` directory.
+
+First we lower the AST to UOps, which is a linear list of the compute to be run. This is where the BEAM search happens. The UOps can be changed by `CompilerOptions`.
+
+::: tinygrad.device.CompilerOptions
+
+Then we render the UOps into code, and compile that code to binary.
 
 ## Execution
 
+Creating `ExecItem`, which has a run method
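Read end to end, the rewritten page (evidently developer.md) describes a pipeline you can drive by hand: the frontend builds LazyBuffers, the scheduler turns them into `ScheduleItem`s, lowering turns those into `ExecItem`s, and execution runs them. A minimal sketch, assuming (consistently with this version of the code) that `lower_schedule` yields `ExecItem`s with a `run` method; the tensor is illustrative:

```python
from tinygrad import Tensor
from tinygrad.engine.realize import lower_schedule

# frontend: build a lazy compute graph
out = (Tensor.rand(8, 8) + 1).sum()

# scheduler: split the graph into kernel-sized ScheduleItems
schedule = Tensor.schedule(out)

# lowering + execution: ScheduleItem -> ExecItem, then run each kernel
for ei in lower_schedule(schedule):
  ei.run()
```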
@@ -1,6 +1,4 @@
-Welcome to the docs for tinygrad. This page is for users of the tinygrad library. We also have [developer docs](developer.md)
-
-tinygrad is not 1.0 yet, but it will be soon. The API has been pretty stable for a while.
+Welcome to the docs for tinygrad. This page is for users of the tinygrad library. tinygrad is not 1.0 yet, but it will be soon. The API has been pretty stable for a while.
 
 While you can `pip install tinygrad`, we encourage you to install from source:
 
@@ -10,6 +8,8 @@ cd tinygrad
 python3 -m pip install -e .
 ```
 
+We also have [developer docs](developer.md), and Di Zhu has created a [bunch of tutorials](https://mesozoic-egg.github.io/tinygrad-notes/) to help understand how tinygrad works.
+
 ## tinygrad Usage
 
 The main class you will interact with is [Tensor](tensor.md). It functions very similarly to PyTorch, but has a bit more of a functional style. tinygrad supports [many datatypes](dtypes.md). All operations in tinygrad are lazy, meaning they won't do anything until you realize.
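The laziness described in that last context line is easy to demonstrate. A small example (not from the docs page itself; the output is shown in the comment):

```python
from tinygrad import Tensor

a = Tensor([1.0, 2.0, 3.0])
b = a * 2 + 1     # lazy: this only records the computation
print(b.numpy())  # realization happens here -> [3. 5. 7.]
```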
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from collections import defaultdict
-from typing import TYPE_CHECKING, Any, List, Optional, Dict, Tuple, ClassVar, NamedTuple, cast
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, List, Optional, Dict, Tuple, ClassVar, cast
 import importlib, inspect, functools, pathlib, time, ctypes, os
 from tinygrad.helpers import prod, getenv, colored, all_int, to_function_name, from_mv, flat_mv, diskcache_get, diskcache_put, DEBUG, BEAM, NOOPT
 from tinygrad.shape.symbolic import Variable, sym_infer, sint
@@ -114,7 +115,8 @@ MallocAllocator = _MallocAllocator()
 
 # **************** for Compiled Devices ****************
 
-class CompilerOptions(NamedTuple):
+@dataclass(frozen=True)
+class CompilerOptions:
   device: str = ""
   suffix: str = ""
   # TODO: make this generic with a list of supported types
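This change (apparently in tinygrad/device.py, given the `tinygrad.device.CompilerOptions` reference above) swaps a `NamedTuple` for a frozen dataclass, so functional updates move from the `_replace` method to `dataclasses.replace`, which is exactly what the runtime hunks below do. A minimal standalone sketch of the idiom, trimmed to the two fields shown in the hunk:

```python
from dataclasses import dataclass, replace

@dataclass(frozen=True)
class CompilerOptions:
  device: str = ""
  suffix: str = ""

opts = CompilerOptions(device="CUDA")
# frozen dataclasses are immutable: replace() constructs a new instance
new_opts = replace(opts, suffix="ptx")
print(opts)      # CompilerOptions(device='CUDA', suffix='')
print(new_opts)  # CompilerOptions(device='CUDA', suffix='ptx')
```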
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import subprocess, hashlib, tempfile, ctypes, ctypes.util, functools, re
 from pathlib import Path
+from dataclasses import replace
 from typing import Tuple, Optional, List
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
@@ -57,7 +58,7 @@ class PTXCompiler(Compiler):
   def __init__(self, arch:str):
     self.arch = arch
     self.version = "7.8" if arch >= "sm_89" else "7.5"
-    PTXCompiler.compiler_opts = PTXCompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    PTXCompiler.compiler_opts = replace(PTXCompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)
     super().__init__(f"compile_ptx_{self.arch}")
   def render(self, name:str, uops) -> str: return PTXRenderer(name, uops).replace("TARGET", self.arch).replace("VERSION", self.version)
   def compile(self, src:str) -> bytes: return src.encode()
@@ -66,7 +67,7 @@ class CUDACompiler(Compiler):
   compiler_opts = CompilerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
-    CUDACompiler.compiler_opts = CUDACompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    CUDACompiler.compiler_opts = replace(CUDACompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)
     check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
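Aside from the `_replace` to `replace` migration, the context lines of this CUDA runtime hunk show a compact NVRTC version probe: walrus assignments create the ctypes out-parameters inline in the call, and a tuple comparison gates the `--minimal` flag. A standalone sketch using a stand-in for `cuda.nvrtcVersion` (the version numbers below are made up):

```python
import ctypes

def fake_nvrtc_version(major: ctypes.c_int, minor: ctypes.c_int) -> int:
  # stand-in for cuda.nvrtcVersion: fills the two c_int out-parameters
  major.value, minor.value = 12, 4
  return 0

# walrus assignments declare the out-parameters inline in the call
fake_nvrtc_version((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int()))

compile_options = ["--gpu-architecture=sm_89"]
# tuple comparison: (12, 4) >= (12, 4) passes, (12, 3) would not
if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): compile_options.append("--minimal")
print(compile_options)  # ['--gpu-architecture=sm_89', '--minimal']
```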
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import os, ctypes, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time
 from typing import Tuple, List, Any
+from dataclasses import replace
 from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions
 from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG
 from tinygrad.renderer.cstyle import CUDARenderer
@@ -69,7 +70,7 @@ class NVCompiler(Compiler):
   compiler_opts = CompilerOptions("NV", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
-    NVCompiler.compiler_opts = NVCompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    NVCompiler.compiler_opts = replace(NVCompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)
     cuda_check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
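One behavior all three compiler classes (PTX, CUDA, NV) share across these hunks: `compiler_opts` is a class attribute, and `__init__` rebinds it on the class itself, so the tensor-core flag computed for the most recently constructed instance is what every instance sees. A toy illustration of that pattern (class names are illustrative, not from the repo):

```python
from dataclasses import dataclass, replace

@dataclass(frozen=True)
class Opts:
  has_tensor_cores: bool = False

class FakeCompiler:
  compiler_opts = Opts()
  def __init__(self, arch: str):
    # rebinds the class attribute, not an instance attribute
    FakeCompiler.compiler_opts = replace(FakeCompiler.compiler_opts, has_tensor_cores=int(arch[3:]) >= 80)

FakeCompiler("sm_89")
FakeCompiler("sm_75")
print(FakeCompiler.compiler_opts.has_tensor_cores)  # False: the last construction wins
```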