two stage cumsum in tensor.py (#2331)

* two stage cumsum in tensor.py

* 2 more kernels for llama cumsum

* gpt-2 and llama use fast multinomial
This commit is contained in:
George Hotz
2023-11-16 12:09:53 -08:00
committed by GitHub
parent 163b2bc26a
commit 3baaf298d6
7 changed files with 40 additions and 11 deletions

View File

@@ -89,7 +89,7 @@ class TestInferenceMinKernels(unittest.TestCase):
args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
model = Transformer(**args_tiny)
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
-with CLCache(98):
+with CLCache(100):
model(Tensor([[1,2,3,4]]), 0).realize()
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")