update self_tokenize, fix pylint maybe

George Hotz
2024-12-06 13:49:41 +08:00
parent 344fd4845c
commit b28d660172


@@ -1,24 +1,34 @@
 import os, pathlib
 from examples.llama3 import Tokenizer
+from tabulate import tabulate
 from tinygrad import fetch
+from tinygrad.helpers import flatten
 
-def stringify(base_path):
+# llama 3 tokenizer
+tokenizer = Tokenizer(fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model").as_posix())
+
+def read_code(base_path):
   ret = []
   for path, _, files in os.walk(os.path.join(base_path, "tinygrad")):
     for name in files:
       if not name.endswith(".py"): continue
       if 'tinygrad/runtime/autogen' in path.replace('\\', '/'): continue
-      code = pathlib.Path(os.path.join(path, name)).read_text()
-      ret += [name, code]
-  return '\x00'.join(ret)
+      fullpath = os.path.join(path, name)
+      code = pathlib.Path(fullpath).read_text()
+      ret += [(fullpath.split("tinygrad/", 1)[1], code)]
+  return ret
 
 if __name__ == "__main__":
-  code_str = stringify(".")
+  ret = read_code(".")
+  table = []
+  for name,code in ret:
+    table.append([name, len(tokenizer.encode(name+"\x00"+code))])
+  print(tabulate([["name", "llm tokens"]]+sorted(table, key=lambda x: -x[1]), headers="firstrow"))
+  code_str = '\x00'.join(flatten(ret))
   print(f"code has {len(code_str)} chars")
   print(f"code has {code_str.count("\n")} newlines")
-  # llama 3 tokenizer
-  tokenizer = Tokenizer(fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model").as_posix())
   encoded = tokenizer.encode(code_str)
   print(f"code has {len(encoded)} tokens")