diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 680848810b..41054519cc 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -55,6 +55,8 @@ jobs:
         python-version: 3.8
     - name: Install Dependencies
       run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
+    - name: Test Docs
+      run: python docs/abstractions.py
     - name: Run Pytest
       run: python -m pytest -s -v -n=auto test/
diff --git a/README.md b/README.md
index 49e45f09a5..3f56ee9b57 100644
--- a/README.md
+++ b/README.md
@@ -88,7 +88,7 @@ tinygrad already supports numerous accelerators, including:
 - [x] Triton
 - [x] PyTorch
 
-And it is easy to add more! Your accelerator of choice only needs to support a total of 20 (optionally 21) low level ops.
+And it is easy to add more! Your accelerator of choice only needs to support a total of 26 (optionally 27) low level ops.
 More information can be found in the [documentation for adding new accelerators](/docs/adding_new_accelerators.md).
 
 ## Installation
diff --git a/docs/abstractions.py b/docs/abstractions.py
index 06bb943e3b..28d9e514c9 100644
--- a/docs/abstractions.py
+++ b/docs/abstractions.py
@@ -98,16 +98,17 @@ class LazyOp:
   src: Tuple[Union[LazyOp, LazyBuffer], ...]  # the sources
   arg: Optional[Any] = None                   # and an optional static argument
 
-# there's currently 20 Ops you have to implement for an accelerator.
-class UnaryOps(Enum): NOOP = auto(); EXP = auto(); LOG = auto(); NEG = auto(); NOT = auto()
-class BinaryOps(Enum): ADD = auto(); SUB = auto(); MUL = auto(); DIV = auto(); POW = auto(); CMPEQ = auto(); MAX = auto()
+# there's currently 27 Ops you have to implement for an accelerator.
+class UnaryOps(Enum): NOOP = auto(); EXP = auto(); LOG = auto(); CAST = auto(); SIN = auto()
+class BinaryOps(Enum): ADD = auto(); SUB = auto(); MUL = auto(); DIV = auto(); POW = auto(); CMPEQ = auto(); MAX = auto()
 class ReduceOps(Enum): SUM = auto(); MAX = auto()
 class MovementOps(Enum): RESHAPE = auto(); PERMUTE = auto(); EXPAND = auto(); PAD = auto(); SHRINK = auto(); STRIDE = auto()
-class LoadOps(Enum): FROMCPU = auto()
+class FusedOps(Enum): MULACC = auto()
+class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); CONTIGUOUS = auto(); CUSTOM = auto()
 # NOTE: if you have a CompiledBuffer(DeviceBuffer)
 # you do not need to implement the MovementOps
 # as they are handled by the ShapeTracker(in tinygrad/shape/shapetracker.py, code 7/10)
-Op = Union[UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps]
+Op = Union[UnaryOps, BinaryOps, ReduceOps, MovementOps, FusedOps, LoadOps]
 
 # most of tinygrad/lazy.py is concerned with fusing Ops into LazyOps ASTs that map to GPUKernels
 # it's beyond the scope of this tutorial, but you can read the file if interested
@@ -130,11 +131,11 @@ assert lazyop.op == BinaryOps.ADD
 assert len(lazyop.src) == 2
 
 # the first source is the 2, it comes from the CPU
-# the source is a LazyBuffer, since FROMCPU cannot be folded into LazyOp ASTs
+# the source is a LazyBuffer, holding the data as an ndarray
 # again, a LazyOp AST is like a GPU kernel. you have to copy the data on the device first
 print(lazyop.src[0].op)
-assert lazyop.src[0].op.op == LoadOps.FROMCPU
-assert lazyop.src[0].op.arg == [2], "the arg of the FROMCPU LazyOP is the [2.]"
+assert lazyop.src[0].op.op == LoadOps.FROM
+assert lazyop.src[0].op.src[0].realized.toCPU()[0] == 2, "the arg of the FROM LazyOP is a LazyBuffer holding [2.]"
 assert result.lazydata.realized is None, "the LazyBuffer is not realized yet"
 
 # now we realize the LazyBuffer
diff --git a/docs/adding_new_accelerators.md b/docs/adding_new_accelerators.md
index cd8a280fe0..8957435cfb 100644
--- a/docs/adding_new_accelerators.md
+++ b/docs/adding_new_accelerators.md
@@ -1,16 +1,17 @@
 # Adding a new accelerator to tinygrad
 
-It's pretty easy to add a new accelerator to tinygrad. All you need to do is implement a total of 20 (optionally 21) low level ops. Then tinygrad takes care of the rest, handling derivatives and syntactic sugar.
+It's pretty easy to add a new accelerator to tinygrad. All you need to do is implement a total of 26 (optionally 27) low level ops. Then tinygrad takes care of the rest, handling derivatives and syntactic sugar.
 
 ## llops
 
-These are the ops that you must implement for your accelerator of choice.
+These are the ops that you must implement for your accelerator of choice. Compiled accelerators do not need to implement movement_ops, as they are handled by the ShapeTracker.
 
 ```
 Buffer                                                       # class of memory on this device
 unary_op    (NOOP, EXP, LOG, CAST, SIN)                      # A -> A
 reduce_op   (SUM, MAX)                                       # A -> B (smaller size, B has 1 in shape)
 binary_op   (ADD, SUB, MUL, DIV, POW, CMPEQ, MAX)            # A + A -> A (all the same size)
 movement_op (EXPAND, RESHAPE, PERMUTE, PAD, SHRINK, STRIDE)  # A -> B (different size)
+load_op     (EMPTY, RAND, CONST, FROM, CONTIGUOUS, CUSTOM)   # -> A (initialize data on device)
 fused_op [[optional]] (MULACC)                               # A * A -> B
 ```
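For reference, the "26 (optionally 27)" figure used throughout this change follows directly from the enums in the updated docs/abstractions.py. The sketch below is a standalone sanity check of that arithmetic: it re-declares the enums verbatim from the diff rather than importing anything from tinygrad, so it is illustrative only, not part of the change itself.

```python
# Standalone check of the op count behind "26 (optionally 27)".
# The enum bodies are copied from the new docs/abstractions.py; nothing is imported from tinygrad.
from enum import Enum, auto

class UnaryOps(Enum): NOOP = auto(); EXP = auto(); LOG = auto(); CAST = auto(); SIN = auto()
class BinaryOps(Enum): ADD = auto(); SUB = auto(); MUL = auto(); DIV = auto(); POW = auto(); CMPEQ = auto(); MAX = auto()
class ReduceOps(Enum): SUM = auto(); MAX = auto()
class MovementOps(Enum): RESHAPE = auto(); PERMUTE = auto(); EXPAND = auto(); PAD = auto(); SHRINK = auto(); STRIDE = auto()
class FusedOps(Enum): MULACC = auto()
class LoadOps(Enum): EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); CONTIGUOUS = auto(); CUSTOM = auto()

required = [UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps]  # 5 + 7 + 2 + 6 + 6
optional = [FusedOps]                                              # + 1 (MULACC)

assert sum(len(e) for e in required) == 26                         # the required llops
assert sum(len(e) for e in required + optional) == 27              # with the optional fused op
```

As the NOTE in docs/abstractions.py states, a CompiledBuffer-style backend gets the six MovementOps for free from the ShapeTracker, so the number of ops it has to hand-write is correspondingly smaller.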