# examples/self_tokenize.py — tokenize tinygrad's own source with the llama 3 tokenizer.
# (Reconstructed from a git patch whose lines were collapsed; formatting restored.)
import os, pathlib
from examples.llama3 import Tokenizer
from tinygrad import fetch

def stringify(base_path):
  """Concatenate every hand-written tinygrad .py file under base_path.

  Walks base_path/tinygrad, skipping the generated runtime/autogen bindings,
  and returns a single string of alternating (filename, file contents) entries
  joined by NUL ('\x00') separators.
  """
  ret = []
  for path, _, files in os.walk(os.path.join(base_path, "tinygrad")):
    # generated bindings are machine-produced and huge; exclude the whole dir.
    # normalize Windows separators so the substring test works on any OS.
    # NOTE: this check is per-directory, so it is hoisted out of the file loop.
    if 'tinygrad/runtime/autogen' in path.replace('\\', '/'): continue
    for name in files:
      if not name.endswith(".py"): continue
      code = pathlib.Path(os.path.join(path, name)).read_text()
      ret += [name, code]
  return '\x00'.join(ret)

if __name__ == "__main__":
  code_str = stringify(".")
  print(f"code has {len(code_str)} chars")
  # hoisted out of the f-string: a same-quote nested call with a "\n" escape
  # inside the replacement field is a SyntaxError before Python 3.12 (PEP 701).
  newline_count = code_str.count("\n")
  print(f"code has {newline_count} newlines")

  # llama 3 tokenizer
  tokenizer = Tokenizer(fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model").as_posix())

  encoded = tokenizer.encode(code_str)
  print(f"code has {len(encoded)} tokens")