set PAGE_SIZE=1 and generate new dataset (#7559)

13080 rows in total. Both generating and loading this are pretty broken right now; the filters are wrong, for example.
chenyu
2024-11-05 11:25:01 -05:00
committed by GitHub
parent 7581a57aac
commit 207bca6cea
6 changed files with 6 additions and 8 deletions

@@ -1,11 +1,8 @@
 #!/usr/bin/env python3
 # extract asts from process replay artifacts
 import os
 from tinygrad.helpers import db_connection, VERSION
 from test.external.process_replay.process_replay import _pmap
-PAGE_SIZE = 100
 TABLE_NAME = f"kernel_process_replay_{VERSION}"
 LOGOPS = os.getenv("LOGOPS", "/tmp/sops")
 def extract_ast(*args) -> bool:
@@ -13,6 +10,4 @@ def extract_ast(*args) -> bool:
   return args[-1]
 if __name__ == "__main__":
   conn = db_connection()
-  row_count = conn.execute(f"SELECT COUNT(*) FROM '{TABLE_NAME}'").fetchone()[0]
   _pmap("kernel", extract_ast)

@@ -1,4 +1,5 @@
 #!/bin/bash
+export PAGE_SIZE=1
 export LOGOPS=/tmp/ops
 export RUN_PROCESS_REPLAY=1
 rm $LOGOPS
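
For the new export PAGE_SIZE=1 to take effect, the Python side has to read the page size from the environment instead of hard-coding it. A minimal sketch of that pattern, assuming process replay picks the value up via tinygrad.helpers.getenv (how it actually reads it is not shown in this diff):

# Read the page size from the environment, falling back to a default.
# The exact variable name and default used inside process_replay are assumptions here.
from tinygrad.helpers import getenv

PAGE_SIZE = getenv("PAGE_SIZE", 100)  # PAGE_SIZE=1 -> one kernel per page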

@@ -24,14 +24,16 @@ def kern_str_to_lin(kern_str:str, opts=None):
 import gzip
 from pathlib import Path
 import random
-from tinygrad.helpers import dedup
+from tinygrad.helpers import dedup, DEBUG
 def load_worlds(filter_reduce=True, filter_noimage=True, filter_novariable=True):
   fn = Path(__file__).parent.parent / "datasets/sops.gz"
   ast_strs = dedup(gzip.open(fn).read().decode('utf-8').strip().split("\n"))
   assert len(ast_strs) > 5000, f"dataset size = {len(ast_strs)} is too small"
+  if DEBUG >= 1: print(f"loaded {len(ast_strs)=} before filters")
   if filter_reduce: ast_strs = [x for x in ast_strs if "REDUCE_AXIS" in x]
   if filter_noimage: ast_strs = [x for x in ast_strs if "dtypes.image" not in x]
   if filter_novariable: ast_strs = [x for x in ast_strs if "Variable" not in x]
+  if DEBUG >= 1: print(f"loaded {len(ast_strs)=} after filters")
   random.seed(1337)
   random.shuffle(ast_strs)
   return ast_strs
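
With the new DEBUG prints in place, the effect of each filter on the regenerated dataset can be checked directly. A usage sketch, assuming load_worlds lives in extra/optimization/helpers.py as in the tinygrad repo (the module path is not visible in this diff):

# Hypothetical usage: run with DEBUG=1 in the environment to see the
# before/after filter counts added in the diff above.
from extra.optimization.helpers import load_worlds

ast_strs = load_worlds(filter_reduce=True, filter_noimage=True, filter_novariable=True)
print(f"{len(ast_strs)} kernel ASTs left after filtering and shuffling")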