two stage cumsum in tensor.py (#2331)

* two stage cumsum in tensor.py
* 2 more kernels for llama cumsum
* gpt-2 and llama use fast multinomial
2026-01-09 15:08:02 -05:00 · 2023-11-16 12:09:53 -08:00
parent 163b2bc26a
commit 3baaf298d6
7 changed files with 40 additions and 11 deletions
--- a/test/external/external_test_opt.py
+++ b/test/external/external_test_opt.py
@@ -89,7 +89,7 @@ class TestInferenceMinKernels(unittest.TestCase):
    args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
    model = Transformer(**args_tiny)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
-    with CLCache(98):
+    with CLCache(100):
      model(Tensor([[1,2,3,4]]), 0).realize()

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")