From 66dfd5e7bfe42723fff0681c80bf63bf982f8230 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Fri, 7 Jun 2024 21:20:57 +0800
Subject: [PATCH] faster codegen process replay (#4858)

* faster codegen process replay

* use self.copy

* regenerate

* delete copy

* test a real error [run_process_replay]

* revert the error change
---
 .github/workflows/test.yml      | 35 +++++++++------------------------
 test/external/replay_codegen.py | 29 +++++++++++++++++++++++++++
 test/test_fusion_op.py          |  2 ++
 tinygrad/codegen/linearizer.py  |  3 ++-
 4 files changed, 42 insertions(+), 27 deletions(-)
 create mode 100644 test/external/replay_codegen.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0687881139..ec277c7dd4 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -465,14 +465,7 @@ jobs:
         diff /tmp/amd_gpu.py.bak tinygrad/runtime/autogen/amd_gpu.py
     - name: Run pytest (not cuda or amd)
       if: matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'amd' && matrix.backend != 'nv'
-      run: |
-        if [ "$RUN_PROCESS_REPLAY" ]; then
-          git fetch origin master && git checkout origin/master
-          DERANDOMIZE_CI=1 python -m pytest test/ --ignore test/test_gc.py --durations=20
-          git checkout $GITHUB_SHA && ASSERT_COMPILE=1 DERANDOMIZE_CI=1 python -m pytest test/ --ignore test/test_gc.py --durations=20
-        else
-          python -m pytest -n=auto test/ --durations=20
-        fi
+      run: python -m pytest -n=auto test/ --durations=20
     # - name: Run test_ops with FUZZ_UOPS=1
     #   if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'amd' && matrix.backend != 'nv'
     #   run: FUZZ_UOPS=1 python -m pytest -n=auto test/test_ops.py --durations=20
@@ -481,32 +474,22 @@ jobs:
       run: python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
     - name: Run pytest (cuda)
       if: matrix.backend=='ptx'||matrix.backend=='triton'||matrix.backend=='nv'
-      run: |
-        if [ "$RUN_PROCESS_REPLAY" ]; then
-          git fetch origin master && git checkout origin/master
-          DERANDOMIZE_CI=1 python -m pytest test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --ignore test/test_gc.py --durations=20
-          git checkout $GITHUB_SHA
-          ASSERT_COMPILE=1 DERANDOMIZE_CI=1 python -m pytest test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --ignore test/test_gc.py --durations=20
-        else
-          python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --ignore test/test_gc.py --durations=20
-        fi
+      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --ignore test/test_gc.py --durations=20
     - name: Run pytest (amd)
       if: matrix.backend=='amd'
-      run: |
-        if [ "$RUN_PROCESS_REPLAY" ]; then
-          git fetch origin master && git checkout origin/master
-          DERANDOMIZE_CI=1 python -m pytest test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/external/external_test_hcq.py --durations=20
-          git checkout $GITHUB_SHA
-          ASSERT_COMPILE=1 DERANDOMIZE_CI=1 python -m pytest test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/external/external_test_hcq.py --durations=20
-        else
-          python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/external/external_test_hcq.py --durations=20
-        fi
+      run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/external/external_test_hcq.py --durations=20
     - name: Compile EfficientNet to C and test it
       if: matrix.backend=='clang'
       run: |
         PYTHONPATH="." python examples/compile_efficientnet.py > recognize.c
         clang -O2 recognize.c -lm -o recognize
         cat test/models/efficientnet/Chicken.jpg | ./recognize | grep cock
+    - name: Run process replay tests
+      if: env.RUN_PROCESS_REPLAY == '1'
+      run: |
+        cp test/external/replay_codegen.py ./replay_codegen.py
+        git fetch origin master && git checkout origin/master
+        PYTHONPATH=. python3 replay_codegen.py
 
   #testunicorn:
   #  name: ARM64 unicorn Test
diff --git a/test/external/replay_codegen.py b/test/external/replay_codegen.py
new file mode 100644
index 0000000000..ed2617c49d
--- /dev/null
+++ b/test/external/replay_codegen.py
@@ -0,0 +1,29 @@
+# compare kernels created by HEAD against master
+import difflib, pickle
+from tqdm import tqdm
+from tinygrad.codegen.linearizer import Linearizer
+from tinygrad.helpers import colored, db_connection, VERSION
+
+page_size = 100
+conn = db_connection()
+cur = conn.cursor()
+row_count = cur.execute(f"select count(*) from 'process_replay_{VERSION}'").fetchone()[0]
+for offset in tqdm(range(0, row_count, page_size)):
+  cur.execute(f"SELECT val FROM 'process_replay_{VERSION}' LIMIT ? OFFSET ?", (page_size, offset))
+  for row in cur.fetchall():
+    compare_k: Linearizer = pickle.loads(row[0])
+    compare_src = compare_k.opts.render("test", compare_k.uops)
+    k = Linearizer(*compare_k.ast, opts=compare_k.opts)
+    for opt in compare_k.applied_opts: k.apply_opt(opt)
+    good_uops = k.linearize().uops
+    good_src = k.opts.render("test", good_uops)
+    try: assert compare_src == good_src
+    except AssertionError:
+      print("PROCESS REPLAY FAILED")
+      print(compare_k.ast)
+      print(compare_k.applied_opts)
+      diff = list(difflib.unified_diff(good_src.splitlines(), compare_src.splitlines()))
+      for line in diff:
+        print(colored(line, "red" if line.startswith("-") else "green" if line.startswith("+") else None))
+      # TODO: fix nondeterminism in ASTs with Variable 4860
+      #raise e
diff --git a/test/test_fusion_op.py b/test/test_fusion_op.py
index faab57cf8c..b0de3aed6b 100644
--- a/test/test_fusion_op.py
+++ b/test/test_fusion_op.py
@@ -4,6 +4,7 @@ import numpy as np
 from tinygrad import Tensor, dtypes
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.engine.realize import lower_schedule_item, run_schedule
+from tinygrad.helpers import getenv
 
 class TestFusionOp(unittest.TestCase):
   def test_contiguous_add(self):
@@ -22,6 +23,7 @@ class TestFusionOp(unittest.TestCase):
     outd = out.tolist()
     assert all(x == 20.0 for x in outd)
 
+  @unittest.skipIf(getenv("RUN_PROCESS_REPLAY"), "very slow")
   def test_recursive_add(self):
     st = time.perf_counter()
     a = Tensor([1,2,3,4])
diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py
index 607c7c4d73..767997392a 100644
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -4,7 +4,7 @@ import itertools, math, functools
 from collections import defaultdict
 
 from tinygrad.dtype import ImageDType, dtypes, DType, PtrDType, ConstType
-from tinygrad.helpers import colored, DEBUG, dedup, prod, getenv, to_function_name
+from tinygrad.helpers import colored, DEBUG, dedup, diskcache_put, prod, getenv, to_function_name
 from tinygrad.ops import LazyOp, UnaryOps, BinaryOps, TernaryOps, ReduceOps, ConstBuffer, MemBuffer, BufferOps, get_lazyop_info
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.symbolic import Variable, NumNode, Node, SumNode, MulNode, DivNode, ModNode, LtNode, AndNode, create_lt_node
@@ -466,6 +466,7 @@ class Linearizer(Kernel):
     self.linearize()
     info = get_lazyop_info(self.ast[0])
     src = self.opts.render(to_function_name(self.name), self.uops)
+    if getenv("RUN_PROCESS_REPLAY"): diskcache_put("process_replay", "".join(map(str,[self.ast,self.applied_opts])), self)
     ops, mem = self.uops.flops_mem()
     run_count = prod((self.global_size if self.global_size else []) + (self.local_size if self.local_size else []))
     # NOTE: we use min here to ignore the indexing FLOPS
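
How the pieces above fit together: with RUN_PROCESS_REPLAY set, Linearizer.to_program() pickles each kernel into the local diskcache, and the new CI step checks out master and runs replay_codegen.py, which re-linearizes every cached kernel and diffs the rendered source. Below is a minimal sketch of that cache round trip, assuming only the helpers already referenced in this patch (diskcache_put, db_connection, VERSION); the key and payload are made up for illustration and are not what to_program() actually stores.

# illustrative sketch of the diskcache round trip that process replay relies on
import pickle
from tinygrad.helpers import diskcache_put, db_connection, VERSION

# writer side (what to_program() does with a pickled Linearizer); key/value here are hypothetical
diskcache_put("process_replay", "example_key", {"demo": "any picklable object"})

# reader side (what replay_codegen.py does after checking out master)
conn = db_connection()
row = conn.cursor().execute(f"SELECT val FROM 'process_replay_{VERSION}' LIMIT 1").fetchone()
print(pickle.loads(row[0]))  # some cached entry, e.g. the dict stored above

In CI the writer runs on the PR branch while the test suite executes, and the reader runs after git fetch/checkout of origin/master, so any change in rendered kernel source between the two revisions surfaces as a unified diff in the replay step's output.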