example: self_tokenize. someday tinygrad will be recursively self-improving
examples/self_tokenize.py (new file, 24 lines)
@@ -0,0 +1,24 @@
import os, pathlib
from examples.llama3 import Tokenizer
from tinygrad import fetch

def stringify(base_path):
  ret = []
  for path, _, files in os.walk(os.path.join(base_path, "tinygrad")):  # walk the tinygrad package sources
    for name in files:
      if not name.endswith(".py"): continue
      if 'tinygrad/runtime/autogen' in path.replace('\\', '/'): continue  # skip generated bindings
      code = pathlib.Path(os.path.join(path, name)).read_text()
      ret += [name, code]
  return '\x00'.join(ret)

if __name__ == "__main__":
  code_str = stringify(".")
  print(f"code has {len(code_str)} chars")
  print(f"code has {code_str.count(chr(10))} newlines")  # chr(10) is "\n"; avoids a backslash in the f-string on Python < 3.12

  # llama 3 tokenizer
  tokenizer = Tokenizer(fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model").as_posix())

  encoded = tokenizer.encode(code_str)
  print(f"code has {len(encoded)} tokens")
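
The script is presumably run from the repo root (e.g. python -m examples.self_tokenize), since it imports examples.llama3 and walks ./tinygrad. Because stringify packs the tree as an alternating filename, source, filename, source, ... sequence joined on '\x00', the packed string can be split back apart. Below is a minimal sketch of that unpacking; the unpack helper is an illustration, not part of the commit.

# minimal sketch (not part of the commit): invert stringify's '\x00' packing
def unpack(code_str):
  parts = code_str.split('\x00')
  # stringify appends [name, code] pairs, so entries alternate filename, source
  return list(zip(parts[0::2], parts[1::2]))

# example usage, assuming the repo root is the working directory:
#   from examples.self_tokenize import stringify
#   for name, code in unpack(stringify(".")): print(name, len(code))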