From 6e763dc446d8d31fb72f686ca547653b33140791 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Mon, 6 Mar 2023 08:25:13 -0800 Subject: [PATCH] matmul example in readme --- README.md | 11 +++++++++++ tinygrad/codegen/gpu.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8360517ea7..a5fdad1b1f 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,17 @@ print(x.grad) # dz/dx print(y.grad) # dz/dy ``` +## Is tinygrad fast? + +Try a matmul. See how, despite the style, it is fused into one kernel with the power of laziness. Currently getting 2.2 TFLOPS on my M1 Max, and will be 8 TFLOPS once it's using the M1 Tensor Cores. + +```sh +OPTLOCAL=1 GPU=1 DEBUG=3 python3 -c "from tinygrad.tensor import Tensor; +N = 1024; a, b = Tensor.randn(N, N), Tensor.randn(N, N); +c = (a.reshape(N, 1, N) * b.permute(1,0).reshape(1, N, N)).sum(axis=2); +print((c.numpy() - (a.numpy() @ b.numpy())).mean())" +``` + ## Neural networks? It turns out, a decent autograd tensor library is 90% of what you need for neural networks. Add an optimizer (SGD, RMSprop, and Adam implemented) from tinygrad.nn.optim, write some boilerplate minibatching code, and you have all you need. diff --git a/tinygrad/codegen/gpu.py b/tinygrad/codegen/gpu.py index 1bf8bdd2d7..88beafc754 100644 --- a/tinygrad/codegen/gpu.py +++ b/tinygrad/codegen/gpu.py @@ -275,7 +275,7 @@ class GPUCodegen(ASTKernel): for i in range(final_dimension-1, -1, -1): self.kernel += [f"int idx{i} = idx{final_dimension} % {self.output_shape[i]};", f"idx{final_dimension} = idx{final_dimension} / {self.output_shape[i]};\n"] self.output_shape = (prod(self.output_shape[0:final_dimension+1]), ) + self.output_shape[final_dimension+1:] - if DEBUG >= 3: print(f"replaced output shape with {self.output_shape}") + if DEBUG >= 4: print(f"replaced output shape with {self.output_shape}") # early ast accumulators : List[Token] = []