diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3ae361cd36..5dbe140436 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -505,14 +505,14 @@ jobs: with: key: apps_llm - name: Test 1B LLM (llama) - run: echo "What's a male chicken called? Answer with only one word." | MAX_BUFFER_SIZE=0 python3 -m tinygrad.apps.llm --model llama3.2:1b | tee /dev/stderr | grep -i rooster + run: echo "What's a male chicken called? Answer with only one word." | MAX_BUFFER_SIZE=0 python3 -m tinygrad.llm --model llama3.2:1b | tee /dev/stderr | grep -i rooster - name: Test 1B LLM (llama q4) - run: echo "What's a male chicken called? Answer with only one word." | MAX_BUFFER_SIZE=0 python3 -m tinygrad.apps.llm --model llama3.2:1b-q4 | tee /dev/stderr | grep -i rooster + run: echo "What's a male chicken called? Answer with only one word." | MAX_BUFFER_SIZE=0 python3 -m tinygrad.llm --model llama3.2:1b-q4 | tee /dev/stderr | grep -i rooster - name: Test 1B LLM (qwen3.5) - run: echo "What's a male chicken called? Answer with only one word." | MAX_BUFFER_SIZE=0 python3 -m tinygrad.apps.llm --model qwen3.5:0.8b | tee /dev/stderr | grep -i rooster + run: echo "What's a male chicken called? Answer with only one word." | MAX_BUFFER_SIZE=0 python3 -m tinygrad.llm --model qwen3.5:0.8b | tee /dev/stderr | grep -i rooster - name: Test 1B LLM (qwen) # NOTE: qwen is dumb and only knows about female chickens - run: echo "What's a female chicken called? Answer with only one word." | MAX_BUFFER_SIZE=0 python3 -m tinygrad.apps.llm --model qwen3:0.6b | tee /dev/stderr | grep -i hen + run: echo "What's a female chicken called? Answer with only one word." | MAX_BUFFER_SIZE=0 python3 -m tinygrad.llm --model qwen3:0.6b | tee /dev/stderr | grep -i hen # ****** Models Tests ****** diff --git a/docs/tinygpu.md b/docs/tinygpu.md index 127aa0db32..8d7897ed7c 100644 --- a/docs/tinygpu.md +++ b/docs/tinygpu.md @@ -55,7 +55,7 @@ export PATH="$HOME/.local/bin:$PATH" ### 5. Use it! ```bash -DEV={AMD|NV} python3 tinygrad/apps/llm.py +DEV={AMD|NV} python3 -m tinygrad.llm ``` **Note:** Use `JITBEAM=2` to search for faster kernels (one-time search cost, results cached). diff --git a/pyproject.toml b/pyproject.toml index 6bb6b8d493..084f4272ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,11 +19,11 @@ build-backend = "setuptools.build_meta" include-package-data = true packages = [ 'tinygrad', - 'tinygrad.apps', 'tinygrad.codegen', 'tinygrad.codegen.opt', 'tinygrad.codegen.late', 'tinygrad.engine', + 'tinygrad.llm', 'tinygrad.mixin', 'tinygrad.nn', 'tinygrad.renderer', @@ -112,9 +112,9 @@ docs = [ [tool.mutmut] paths_to_mutate = ["tinygrad/"] do_not_mutate = [ - "tinygrad/apps/*", "tinygrad/codegen/*", "tinygrad/engine/*", + "tinygrad/llm/*", "tinygrad/nn/*", "tinygrad/renderer/*", "tinygrad/runtime/*", diff --git a/sz.py b/sz.py index 324a8e9935..b47e8c7920 100755 --- a/sz.py +++ b/sz.py @@ -56,7 +56,7 @@ def gen_diff(table_old, table_new): def display_diff(diff): return "+"+str(diff) if diff > 0 else str(diff) -NONCORE_DIRS = {"tinygrad/apps", "tinygrad/nn", "tinygrad/renderer", "tinygrad/runtime", "tinygrad/viz"} +NONCORE_DIRS = {"tinygrad/llm", "tinygrad/nn", "tinygrad/renderer", "tinygrad/runtime", "tinygrad/viz"} if __name__ == "__main__": if len(sys.argv) == 3: diff --git a/test/external/external_llm_eval.py b/test/external/external_llm_eval.py index 0841b9740e..9e9e4c9946 100644 --- a/test/external/external_llm_eval.py +++ b/test/external/external_llm_eval.py @@ -1,4 +1,4 @@ -# eval for tinygrad.apps.llm -- hits the server via OpenAI API +# eval for OpenAI API server # uses Meta's exact ARC-Challenge prompt template from lm-evaluation-harness llama3 tasks import argparse, re, pyarrow.parquet as pq from openai import OpenAI diff --git a/test/external/external_test_simple_tokenizer.py b/test/external/external_test_simple_tokenizer.py index 8fc3299ee1..5d69f6859f 100644 --- a/test/external/external_test_simple_tokenizer.py +++ b/test/external/external_test_simple_tokenizer.py @@ -1,7 +1,7 @@ import functools, multiprocessing from transformers import AutoTokenizer from datasets import load_dataset -from tinygrad.apps.llm import SimpleTokenizer +from tinygrad.llm.cli import SimpleTokenizer from tinygrad.helpers import tqdm, getenv, partition @functools.cache diff --git a/test/null/test_attention.py b/test/null/test_attention.py index 39f93c5bce..82bc751388 100644 --- a/test/null/test_attention.py +++ b/test/null/test_attention.py @@ -1,6 +1,6 @@ import unittest from tinygrad import Tensor, dtypes, TinyJit, UOp -from tinygrad.apps.llm import apply_rope as apply_rope_new, precompute_freqs_cis +from tinygrad.llm.cli import apply_rope as apply_rope_new, precompute_freqs_cis from test.helpers import assert_jit_cache_len def apply_rope(x:Tensor, start_pos:int): diff --git a/test/null/test_llm_server.py b/test/null/test_llm_server.py index 5bc00eb84f..8dad3c12c2 100644 --- a/test/null/test_llm_server.py +++ b/test/null/test_llm_server.py @@ -22,18 +22,15 @@ class TestLLMServer(unittest.TestCase): cls.bos_id = 1 cls.eos_id = 999 - import tinygrad.apps.llm as llm_module - llm_module.model = cls.mock_model - llm_module.model_name = "test-model" - llm_module.tok = cls.mock_tok - llm_module.bos_id = cls.bos_id - llm_module.eos_id = cls.eos_id - llm_module.eot_id = None + from tinygrad.llm.cli import Handler, LLMServer - from tinygrad.apps.llm import Handler - from tinygrad.viz.serve import TCPServerWithReuse - - cls.server = TCPServerWithReuse(('127.0.0.1', 0), Handler) + cls.server = LLMServer(('127.0.0.1', 0), Handler) + cls.server.model = cls.mock_model + cls.server.model_name = "test-model" + cls.server.tok = cls.mock_tok + cls.server.bos_id = cls.bos_id + cls.server.eos_id = cls.eos_id + cls.server.eot_id = None cls.port = cls.server.server_address[1] cls.server_thread = threading.Thread(target=cls.server.serve_forever, daemon=True) cls.server_thread.start() diff --git a/test/null/test_llm_tokenizer.py b/test/null/test_llm_tokenizer.py index 99e5ff5fa5..f6ec4cfb66 100644 --- a/test/null/test_llm_tokenizer.py +++ b/test/null/test_llm_tokenizer.py @@ -1,5 +1,5 @@ import unittest, base64, functools, sys -from tinygrad.apps.llm import SimpleTokenizer +from tinygrad.llm.cli import SimpleTokenizer from tinygrad.helpers import fetch @unittest.skipIf(sys.platform == 'win32', "fetch race condition on Windows") diff --git a/test/unit/test_attention.py b/test/unit/test_attention.py index 7eab6c80c8..33c4a272c5 100644 --- a/test/unit/test_attention.py +++ b/test/unit/test_attention.py @@ -1,7 +1,7 @@ import unittest import numpy as np from tinygrad import Tensor, dtypes -from tinygrad.apps.llm import ( +from tinygrad.llm.cli import ( GatedDeltaNetBlock, SSMConfig, TransformerBlock, TransformerConfig, apply_rope as apply_rope_new, precompute_freqs_cis, pairwise_topk, ) diff --git a/test/unit/test_llm_mla.py b/test/unit/test_llm_mla.py index 53b39bbedc..3bc1f19bb5 100644 --- a/test/unit/test_llm_mla.py +++ b/test/unit/test_llm_mla.py @@ -1,7 +1,7 @@ import unittest import numpy as np from tinygrad import Tensor -from tinygrad.apps.llm import Transformer, TransformerConfig, apply_rope +from tinygrad.llm.cli import Transformer, TransformerConfig, apply_rope class TestMLA(unittest.TestCase): def _make_config(self, **kwargs): @@ -13,7 +13,7 @@ class TestMLA(unittest.TestCase): def test_mla_attention_matches_naive(self): config = self._make_config(max_context=16) - from tinygrad.apps.llm import MLATransformerBlock, precompute_freqs_cis + from tinygrad.llm.cli import MLATransformerBlock, precompute_freqs_cis block = MLATransformerBlock(config) c = config diff --git a/test/unit/test_llm_moe.py b/test/unit/test_llm_moe.py index d87ce7586e..54968b8d66 100644 --- a/test/unit/test_llm_moe.py +++ b/test/unit/test_llm_moe.py @@ -2,7 +2,7 @@ import unittest import numpy as np from dataclasses import replace from tinygrad import Tensor -from tinygrad.apps.llm import TransformerBlock, TransformerConfig +from tinygrad.llm.cli import TransformerBlock, TransformerConfig def _moe_config(dim=8, hidden=16, n_heads=2, num_experts=4, num_experts_per_tok=2): return TransformerConfig( diff --git a/test/unit/test_llm_server.py b/test/unit/test_llm_server.py index 4924ba3a65..9f2638af47 100644 --- a/test/unit/test_llm_server.py +++ b/test/unit/test_llm_server.py @@ -2,7 +2,7 @@ import unittest from unittest.mock import patch from tinygrad import Tensor, UOp from tinygrad.schedule import schedule_cache -from tinygrad.apps.llm import Transformer, TransformerConfig +from tinygrad.llm.cli import Transformer, TransformerConfig TEST_CONFIG = TransformerConfig(num_blocks=1, dim=64, hidden_dim=128, n_heads=2, n_kv_heads=2, norm_eps=1e-5, vocab_size=100, head_dim=32, rope_theta=10000.0, rope_dim=32, v_head_dim=32, max_context=32) diff --git a/tinygrad/llm/__init__.py b/tinygrad/llm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tinygrad/llm/__main__.py b/tinygrad/llm/__main__.py new file mode 100644 index 0000000000..832facdaac --- /dev/null +++ b/tinygrad/llm/__main__.py @@ -0,0 +1,2 @@ +from tinygrad.llm.cli import main +if __name__ == "__main__": main() \ No newline at end of file diff --git a/tinygrad/apps/llm.py b/tinygrad/llm/cli.py similarity index 97% rename from tinygrad/apps/llm.py rename to tinygrad/llm/cli.py index 8eabc00d36..6978682781 100644 --- a/tinygrad/apps/llm.py +++ b/tinygrad/llm/cli.py @@ -56,7 +56,7 @@ class SimpleTokenizer: return tokens + self._encode_sentence(text[pos:]) def decode(self, ids:list[int]) -> str: return b''.join(self._tok2bytes[tid] for tid in ids).decode(errors='replace') - def stream_decoder(self) -> typing.Callable[[int|None], str]: + def stream_decoder(self) -> typing.Callable[..., str]: dec = codecs.getincrementaldecoder('utf-8')('replace') def _decode(tid:int|None=None) -> str: return dec.decode(self._tok2bytes[tid]) if tid is not None else dec.decode(b'', final=True) return _decode @@ -545,12 +545,23 @@ CHAT_HTML = b'''tinygrad chat