coder.py can write and run code (#2439)

* wip mistral
* coder
* touchups
* cleanups
* mistral cleanups
* clean up cache create
* download the weights, fix tests
* fix llama loading
* global fixup
* clean up all
* move llama model
* cleanups
* Revert "cleanups"
  This reverts commit a71c5d59eb.
* fine, leave it
2026-04-29 03:00:14 -04:00 · 2023-11-25 12:27:54 -08:00
parent df41a57e09
commit 7170a9a057
10 changed files with 334 additions and 167 deletions
--- a/test/external/external_test_allocator_on_models.py
+++ b/test/external/external_test_allocator_on_models.py
@@ -93,7 +93,7 @@ class TestAllocators(unittest.TestCase):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

-    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
+    args_tiny = {"dim": 1024, "hidden_dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def __test():
      model = Transformer(**args_tiny)
      derandomize_model(model)
@@ -105,7 +105,7 @@ class TestAllocators(unittest.TestCase):

  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama_alloc_counts(self):
-    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
+    args_tiny = {"dim": 1024, "hidden_dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def test_alloc_count(t):
      model = Transformer(**args_tiny)
      for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
--- a/test/external/external_test_jit_on_models.py
+++ b/test/external/external_test_jit_on_models.py
@@ -19,7 +19,7 @@ class TestJittedModels(unittest.TestCase):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

-    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
+    args_tiny = {"dim": 1024, "hidden_dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    model = Transformer(**args_tiny)
    derandomize_model(model)
    def test(t): return model(t, 0).realize()
--- a/test/external/external_test_opt.py
+++ b/test/external/external_test_opt.py
@@ -86,7 +86,7 @@ class TestInferenceMinKernels(unittest.TestCase):
  def test_llama(self):
    from examples.llama import Transformer
    from tinygrad.shape.symbolic import Variable
-    args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
+    args_tiny = {"dim": 512, "hidden_dim": 1024, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
    model = Transformer(**args_tiny)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    with CLCache(100):
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -51,7 +51,7 @@ class TestRealWorld(unittest.TestCase):
  def test_llama(self):
    Tensor.default_type = dtypes.float16

-    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
+    args_tiny = {"dim": 1024, "hidden_dim": 2048, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    model = LLaMaTransformer(**(args_tiny if CI else LLAMA_MODEL_PARAMS["1"]["7B"]["args"]))
    derandomize_model(model)
    @TinyJit