diff --git a/docs/design b/docs/design deleted file mode 100644 index 56058ae738..0000000000 --- a/docs/design +++ /dev/null @@ -1,68 +0,0 @@ -Getting the core instruction set correct is the value of tinygrad - -Max size tensor is 6-D for the pool2d - -Unary Ops -=== - -These are the simplest to reason about, and have pointwise mem access. -A and B are always the same size - -Forward : A -> B -Backward (binary): (B', A) -> A' - - - -Reduce Ops (with axis) -=== - -These take in an axis argument. B is smaller than A -Max and Sum are pretty different, do we really need Max? - -Forward : A -> B -Backward : B' -> A' - - - -Binary Ops (with broadcasting) -=== - -Pointwise mem access also. -Broadcasting adds complexity, aliased input. -Unbroadcasting for grad is a sum, but should be combined with the ternary op. - -Forward : (A, B) -> C -Backward (ternary): (C', A, B) -> (A', B') - -C.shape = max(A.shape, B.shape) - - - -Movement Ops (2 or 1) -=== - -Reshape, Transpose, Slice - -Depending on your Tensor implementation, these are free. -Reshape is almost always free. -Slice can be made free, but probably shouldn't be. -Transpose is hard to make free except in trivial cases. - -Regardless, these are "reindexings" of existing arrays -Transpose and Slice are similar enough I think they can be merged. -They should use a DMA engine - - - -Processing Ops (4) -=== - -Matmul is 1 matmul for forward, 2 for backward. -* It's actually three matmuls transposed -* cublasSgemm() - -Conv2D is very complex. It seems to need three. 
# NOTE: hack to allow DEBUG(x) to set the debug value at runtime
class DebugSingleton:
  """Runtime-adjustable debug level.

  The level is read from the ``DEBUG`` environment variable (default 0) when
  the object is created, and can be overridden later by calling the object:
  ``DEBUG(3)`` sets the level to 3.

  The object is truthy when the level is non-zero and supports the ordering
  comparisons against plain numbers (``DEBUG >= 4``, ``DEBUG > 3``,
  ``DEBUG < 2``, ``DEBUG <= 1``), so it can be used wherever an integer level
  was compared before.

  Equality is intentionally left identity-based so the object stays hashable;
  use ``int(DEBUG) == n`` to compare the level by value.
  """
  def __init__(self): self.value = getenv("DEBUG", 0)

  # calling the object replaces the current level; returns None like the original
  def __call__(self, x): self.value = x

  def __bool__(self): return self.value != 0
  def __int__(self): return int(self.value)
  def __repr__(self): return f"DebugSingleton({self.value!r})"

  # ordering comparisons all delegate to the stored level
  def __ge__(self, x): return self.value >= x
  def __gt__(self, x): return self.value > x
  def __le__(self, x): return self.value <= x
  def __lt__(self, x): return self.value < x
>= 4: print(f"copying {self.op.arg.shape}:{dtypes.from_np(self.op.arg.dtype)} -> {self.device}") self.realized = Device[self.device].buffer.fromCPU(self.op.arg(), **self._device_extra_args()) elif self.op.op == LoadOps.CONTIGUOUS: realized = self.op.src[0].realize().realized