From 66dfd5e7bfe42723fff0681c80bf63bf982f8230 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Fri, 7 Jun 2024 21:20:57 +0800
Subject: [PATCH] faster codegen process replay (#4858)

* faster codegen process replay

* use self.copy

* regenerate

* delete copy

* test a real error [run_process_replay]

* revert the error change
---
 .github/workflows/test.yml      | 35 +++++++++------------------------
 test/external/replay_codegen.py | 29 +++++++++++++++++++++++++++
 test/test_fusion_op.py          |  2 ++
 tinygrad/codegen/linearizer.py  |  3 ++-
 4 files changed, 42 insertions(+), 27 deletions(-)
 create mode 100644 test/external/replay_codegen.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0687881139..ec277c7dd4 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -465,14 +465,7 @@ jobs:
         diff /tmp/amd_gpu.py.bak tinygrad/runtime/autogen/amd_gpu.py
     - name: Run pytest (not cuda or amd)
       if: matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'amd' && matrix.backend != 'nv'
-      run: |
-        if [ "$RUN_PROCESS_REPLAY" ]; then
-          git fetch origin master && git checkout origin/master
-          DERANDOMIZE_CI=1 python -m pytest test/ --ignore test/test_gc.py --durations=20
-          git checkout $GITHUB_SHA && ASSERT_COMPILE=1 DERANDOMIZE_CI=1 python -m pytest test/ --ignore test/test_gc.py --durations=20
-        else
-          python -m pytest -n=auto test/ --durations=20
-        fi
+      run: python -m pytest -n=auto test/ --durations=20
     # - name: Run test_ops with FUZZ_UOPS=1
     #   if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'amd' && matrix.backend != 'nv'
     #   run: FUZZ_UOPS=1 python -m pytest -n=auto test/test_ops.py --durations=20
@@ -481,32 +474,22 @@ jobs:
       run: python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
     - name: Run pytest (cuda)
       if: matrix.backend=='ptx'||matrix.backend=='triton'||matrix.backend=='nv'
-      run: |
-        if [ "$RUN_PROCESS_REPLAY" ]; then
-          git fetch origin master && git checkout origin/master
-          DERANDOMIZE_CI=1 python -m pytest test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --ignore test/test_gc.py --durations=20
-          git checkout $GITHUB_SHA
-          ASSERT_COMPILE=1 DERANDOMIZE_CI=1 python -m pytest test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --ignore test/test_gc.py --durations=20
-        else
-          python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --ignore test/test_gc.py --durations=20
-        fi
+      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --ignore test/test_gc.py --durations=20
     - name: Run pytest (amd)
       if: matrix.backend=='amd'
-      run: |
-        if [ "$RUN_PROCESS_REPLAY" ]; then
-          git fetch origin master && git checkout origin/master
-          DERANDOMIZE_CI=1 python -m pytest test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/external/external_test_hcq.py --durations=20
-          git checkout $GITHUB_SHA
-          ASSERT_COMPILE=1 DERANDOMIZE_CI=1 python -m pytest test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/external/external_test_hcq.py --durations=20
-        else
-          python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/external/external_test_hcq.py --durations=20
-        fi
+      run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/external/external_test_hcq.py --durations=20
     - name: Compile EfficientNet to C and test it
       if: matrix.backend=='clang'
       run: |
         PYTHONPATH="." python examples/compile_efficientnet.py > recognize.c
         clang -O2 recognize.c -lm -o recognize
         cat test/models/efficientnet/Chicken.jpg | ./recognize | grep cock
+    - name: Run process replay tests
+      if: env.RUN_PROCESS_REPLAY == '1'
+      run: |
+        cp test/external/replay_codegen.py ./replay_codegen.py
+        git fetch origin master && git checkout origin/master
+        PYTHONPATH=. python3 replay_codegen.py
 
   #testunicorn:
   #  name: ARM64 unicorn Test
diff --git a/test/external/replay_codegen.py b/test/external/replay_codegen.py
new file mode 100644
index 0000000000..ed2617c49d
--- /dev/null
+++ b/test/external/replay_codegen.py
@@ -0,0 +1,29 @@
+# compare kernels created by HEAD against master
+import difflib, pickle
+from tqdm import tqdm
+from tinygrad.codegen.linearizer import Linearizer
+from tinygrad.helpers import colored, db_connection, VERSION
+
+page_size = 100
+conn = db_connection()
+cur = conn.cursor()
+row_count = cur.execute(f"select count(*) from 'process_replay_{VERSION}'").fetchone()[0]
+for offset in tqdm(range(0, row_count, page_size)):
+  cur.execute(f"SELECT val FROM 'process_replay_{VERSION}' LIMIT ? OFFSET ?", (page_size, offset))
+  for row in cur.fetchall():
+    compare_k: Linearizer = pickle.loads(row[0])
+    compare_src = compare_k.opts.render("test", compare_k.uops)
+    k = Linearizer(*compare_k.ast, opts=compare_k.opts)
+    for opt in compare_k.applied_opts: k.apply_opt(opt)
+    good_uops = k.linearize().uops
+    good_src = k.opts.render("test", good_uops)
+    try: assert compare_src == good_src
+    except AssertionError:
+      print("PROCESS REPLAY FAILED")
+      print(compare_k.ast)
+      print(compare_k.applied_opts)
+      diff = list(difflib.unified_diff(good_src.splitlines(), compare_src.splitlines()))
+      for line in diff:
+        print(colored(line, "red" if line.startswith("-") else "green" if line.startswith("+") else None))
+      # TODO: fix nondeterminism in ASTs with Variable 4860
+      #raise e
diff --git a/test/test_fusion_op.py b/test/test_fusion_op.py
index faab57cf8c..b0de3aed6b 100644
--- a/test/test_fusion_op.py
+++ b/test/test_fusion_op.py
@@ -4,6 +4,7 @@ import numpy as np
 from tinygrad import Tensor, dtypes
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.engine.realize import lower_schedule_item, run_schedule
+from tinygrad.helpers import getenv
 
 class TestFusionOp(unittest.TestCase):
   def test_contiguous_add(self):
@@ -22,6 +23,7 @@ class TestFusionOp(unittest.TestCase):
     outd = out.tolist()
     assert all(x == 20.0 for x in outd)
 
+  @unittest.skipIf(getenv("RUN_PROCESS_REPLAY"), "very slow")
   def test_recursive_add(self):
     st = time.perf_counter()
     a = Tensor([1,2,3,4])
diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py
index 607c7c4d73..767997392a 100644
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -4,7 +4,7 @@ import itertools, math, functools
 from collections import defaultdict
 
 from tinygrad.dtype import ImageDType, dtypes, DType, PtrDType, ConstType
-from tinygrad.helpers import colored, DEBUG, dedup, prod, getenv, to_function_name
+from tinygrad.helpers import colored, DEBUG, dedup, diskcache_put, prod, getenv, to_function_name
 from tinygrad.ops import LazyOp, UnaryOps, BinaryOps, TernaryOps, ReduceOps, ConstBuffer, MemBuffer, BufferOps, get_lazyop_info
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.symbolic import Variable, NumNode, Node, SumNode, MulNode, DivNode, ModNode, LtNode, AndNode, create_lt_node
@@ -466,6 +466,7 @@ class Linearizer(Kernel):
     self.linearize()
     info = get_lazyop_info(self.ast[0])
     src = self.opts.render(to_function_name(self.name), self.uops)
+    if getenv("RUN_PROCESS_REPLAY"): diskcache_put("process_replay", "".join(map(str,[self.ast,self.applied_opts])), self)
     ops, mem = self.uops.flops_mem()
     run_count = prod((self.global_size if self.global_size else []) + (self.local_size if self.local_size else []))
     # NOTE: we use min here to ignore the indexing FLOPS
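
How the pieces above fit together: with RUN_PROCESS_REPLAY set, Linearizer.to_program() pickles each kernel into the local diskcache, and the new CI step checks out master and runs replay_codegen.py, which re-linearizes every cached kernel and diffs the rendered source. Below is a minimal sketch of that cache round trip, assuming only the helpers already referenced in this patch (diskcache_put, db_connection, VERSION); the key and payload are made up for illustration and are not what to_program() actually stores.

# illustrative sketch of the diskcache round trip that process replay relies on
import pickle
from tinygrad.helpers import diskcache_put, db_connection, VERSION

# writer side (what to_program() does with a pickled Linearizer); key/value here are hypothetical
diskcache_put("process_replay", "example_key", {"demo": "any picklable object"})

# reader side (what replay_codegen.py does after checking out master)
conn = db_connection()
row = conn.cursor().execute(f"SELECT val FROM 'process_replay_{VERSION}' LIMIT 1").fetchone()
print(pickle.loads(row[0]))  # some cached entry, e.g. the dict stored above

In CI the writer runs on the PR branch while the test suite executes, and the reader runs after git fetch/checkout of origin/master, so any change in rendered kernel source between the two revisions surfaces as a unified diff in the replay step's output.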