From 81b9c04574c3899af605581af13cd8e36338f351 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Sun, 8 Jun 2025 14:05:56 -0700 Subject: [PATCH] move high level stuff to unit tests [pr] (#10708) * move high level stuff to unit tests [pr] * process replay on unit tests * fix pr, less compute * set omp num threads * set 200MB buffer size limit * delete junk * fix tests * faster * move test_indexing to unit * faster --- .github/actions/setup-tinygrad/action.yml | 5 +++- .github/workflows/test.yml | 13 ++++++---- .pre-commit-config.yaml | 2 +- test/external/external_test_datasets.py | 4 +-- .../external/process_replay/process_replay.py | 6 +++-- test/imported/__init__.py | 0 test/models/test_real_world.py | 2 +- test/test_dtype.py | 1 + test/test_linearizer_failures.py | 1 + test/test_multitensor.py | 4 +-- test/test_ops.py | 26 +++++++++---------- test/test_schedule.py | 10 +++---- test/{ => unit}/test_conv.py | 0 test/{imported => unit}/test_indexing.py | 0 14 files changed, 42 insertions(+), 32 deletions(-) delete mode 100644 test/imported/__init__.py rename test/{ => unit}/test_conv.py (100%) rename test/{imported => unit}/test_indexing.py (100%) diff --git a/.github/actions/setup-tinygrad/action.yml b/.github/actions/setup-tinygrad/action.yml index 51ad0897f9..cc685d763d 100644 --- a/.github/actions/setup-tinygrad/action.yml +++ b/.github/actions/setup-tinygrad/action.yml @@ -93,10 +93,13 @@ runs: . .venv/bin/activate fi python -m pip install -e . ${{ inputs.pydeps }} - - name: Export venv + - name: Set up venv environment shell: bash run: | echo "VIRTUAL_ENV=${{ github.workspace }}/.venv" >> "$GITHUB_ENV" + echo "OMP_NUM_THREADS=1" >> "$GITHUB_ENV" + # no buffers should be over 300MB in CI + echo "MAX_BUFFER_SIZE=300000000" >> "$GITHUB_ENV" if [[ "$RUNNER_OS" == "Windows" ]]; then echo "${{ github.workspace }}/.venv/Scripts" >> "$GITHUB_PATH" else diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e36194c3cb..a1e8a9456b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -364,6 +364,8 @@ jobs: # run: NULL=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 3 --temperature 0 --timing - name: Run GC tests run: PYTHONPATH="." python test/external/external_uop_gc.py + - name: Run process replay tests + uses: ./.github/actions/process-replay - name: Repo line count < 14000 lines run: MAX_LINE_COUNT=14000 python sz.py @@ -475,9 +477,9 @@ jobs: run: CPU=1 PYTHONPATH=. python3 test/test_quantize_onnx.py - name: Run REMOTE=1 Test run: | - REMOTEDEV=CPU REMOTE=1 python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py - REMOTEDEV=GPU REMOTE=1 python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py - REMOTEDEV=GPU IMAGE=2 REMOTE=1 python3 -m pytest test/test_tiny.py test/test_image_dtype.py + REMOTEDEV=CPU REMOTE=1 python3 -m pytest -n=auto test/test_tiny.py test/test_jit.py test/test_multitensor.py + REMOTEDEV=GPU REMOTE=1 python3 -m pytest -n=auto test/test_tiny.py test/test_image_dtype.py test/test_jit.py + REMOTEDEV=GPU IMAGE=2 REMOTE=1 python3 -m pytest -n=auto test/test_tiny.py test/test_image_dtype.py - name: Test Optimization Helpers run: PYTHONPATH="." DEBUG=1 python3 extra/optimization/test_helpers.py - name: Test Action Space @@ -618,7 +620,7 @@ jobs: if: matrix.backend=='amdllvm' run: python test/test_amd_llvm.py - name: Run pytest (amd) - run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py test/external/external_test_am.py --durations=20 + run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py test/external/external_test_am.py --durations=20 - name: Run TRANSCENDENTAL math run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20 - name: Run TestOps.test_add with SQTT @@ -654,7 +656,8 @@ jobs: PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT" DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add - name: Run pytest (cuda) - run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore test/test_gc.py --durations=20 + # skip multitensor because it's slow + run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore test/test_gc.py --ignore test/test_multitensor.py --durations=20 - name: Run process replay tests uses: ./.github/actions/process-replay diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4968fbaf13..0e23d11069 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,7 +28,7 @@ repos: pass_filenames: false - id: tests name: subset of tests - entry: env MAX_BUFFER_SIZE=200000000 PYTHONPATH="." python3 -m pytest -n=4 test/unit/ test/test_ops.py test/test_dtype.py test/test_schedule.py test/test_assign.py test/test_symbolic_shapetracker.py + entry: env MAX_BUFFER_SIZE=300000000 PYTHONPATH="." python3 -m pytest -n=4 --ignore=test/unit/test_keccak.py --ignore=test/unit/test_indexing.py test/unit/ test/test_ops.py test/test_dtype.py test/test_schedule.py test/test_assign.py test/test_symbolic_shapetracker.py language: system always_run: true pass_filenames: false diff --git a/test/external/external_test_datasets.py b/test/external/external_test_datasets.py index 8fd3c77623..19e58d80de 100644 --- a/test/external/external_test_datasets.py +++ b/test/external/external_test_datasets.py @@ -152,7 +152,7 @@ class TestOpenImagesDataset(ExternalTestDatasets): ref_tgt = postprocess_targets(ref_tgt, anchors.unsqueeze(0)) ref_boxes, ref_labels = ref_tgt[0]["boxes"], ref_tgt[0]["labels"] - np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy()) + np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy(), rtol=1e-6) np.testing.assert_equal(tinygrad_boxes[0].numpy(), ref_boxes.numpy()) np.testing.assert_equal(tinygrad_labels[0].numpy(), ref_labels.numpy()) @@ -165,7 +165,7 @@ class TestOpenImagesDataset(ExternalTestDatasets): for ((tinygrad_img, _, _, _), (ref_img, _)) in zip(tinygrad_dataloader, ref_dataloader): ref_img, _ = transform(ref_img.unsqueeze(0)) - np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy()) + np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy(), rtol=1e-6) if __name__ == '__main__': unittest.main() diff --git a/test/external/process_replay/process_replay.py b/test/external/process_replay/process_replay.py index 9dfa32e36a..1ff2873fcf 100755 --- a/test/external/process_replay/process_replay.py +++ b/test/external/process_replay/process_replay.py @@ -34,7 +34,7 @@ class ProcessReplayWarning(Warning): pass def replay_kernelize(ret:dict[UOp, UOp], big_sink:UOp) -> tuple[str, str, tuple[Any, ...]]: UOp.unique_num = itertools.count(max([u.arg for u in big_sink.toposort() if u.op is Ops.UNIQUE], default=0)+1) new_sink = big_sink.substitute(get_kernelize_map(big_sink)) - def to_str(ret:UOp): + def to_str(ret:UOp) -> str: asts = [repr(u.arg.ast) for u in ret.toposort() if u.op is Ops.KERNEL] return "\n".join([f"{len(asts)} kernels", *asts]) return to_str(new_sink), to_str(ret[big_sink]), (big_sink,) @@ -44,7 +44,9 @@ def replay_linearize(k:Kernel, _:Kernel, name_override=None, ast_transform=None) # this should be made fully functional. It's fine for process replay since copy returns a fresh instance k2 = k.copy() k2.linearize(name_override=name_override or to_function_name(k.name), ast_transform=ast_transform) - def to_str(ret:Kernel): return ret.opts.render(ret.uops) + def to_str(ret:Kernel) -> str: + try: return ret.opts.render(ret.uops) + except NotImplementedError: return "" # NULL backend doesn't have a renderer, this is okay return to_str(k2), to_str(k), (k.ast, k.opts, k.applied_opts) replayers: dict[str, Callable[..., tuple[str, str, tuple[Any, ...]]]] = {"get_kernelize_map":replay_kernelize, "linearize":replay_linearize} diff --git a/test/imported/__init__.py b/test/imported/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py index fd19e728b3..e6facd4a64 100644 --- a/test/models/test_real_world.py +++ b/test/models/test_real_world.py @@ -61,7 +61,7 @@ class TestRealWorld(unittest.TestCase): derandomize_model(model) @TinyJit def test(t, t2): return model(t, Tensor([801]), t2).realize() - helper_test("test_sd", lambda: (Tensor.randn(1, 4, 64, 64),Tensor.randn(1, 77, params["ctx_dim"])), test, 18.0, 515) + helper_test("test_sd", lambda: (Tensor.randn(1, 4, 32, 32),Tensor.randn(1, 77, params["ctx_dim"])), test, 18.0, 515) def test_unet_resblock(self): model = [ResBlock(16, 24, 16) for _ in range(4)] diff --git a/test/test_dtype.py b/test/test_dtype.py index 0c5f9b41ec..734d629887 100644 --- a/test/test_dtype.py +++ b/test/test_dtype.py @@ -523,6 +523,7 @@ class TestTypeSpec(unittest.TestCase): dtypes.default_float = default_float assert dtypes.default_float == default_float + @unittest.skip("this test is slow and spawning whole pythons") def test_env_set_default_float(self): # check default subprocess.run(['python3 -c "from tinygrad import dtypes; assert dtypes.default_float == dtypes.float"'], diff --git a/test/test_linearizer_failures.py b/test/test_linearizer_failures.py index bce7278d08..b07936799a 100644 --- a/test/test_linearizer_failures.py +++ b/test/test_linearizer_failures.py @@ -1254,6 +1254,7 @@ class TestLinearizerFailures(unittest.TestCase): opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))] helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) + @unittest.skip("allocating over 200MB buffer") @unittest.skipIf(CI and Device.DEFAULT in {"METAL"}, "hangs metal gpu CI") def test_failure_52(self): # resnet beam. diff --git a/test/test_multitensor.py b/test/test_multitensor.py index f06dc866a6..0f6f16b48a 100644 --- a/test/test_multitensor.py +++ b/test/test_multitensor.py @@ -1213,8 +1213,8 @@ class TestMultiTransformer(unittest.TestCase): device = tuple(f"{Device.DEFAULT}:{i}" for i in range(2)) from extra.models.llama import Transformer - args = {"dim": 64, "n_heads": 1, "n_kv_heads": 1, "n_layers": 2, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 1024, - "hidden_dim": 64, "max_context": 12} + args = {"dim": 32, "n_heads": 1, "n_kv_heads": 1, "n_layers": 2, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 1024, + "hidden_dim": 32, "max_context": 12} real_model = Transformer(**args) shard_model = Transformer(**args) diff --git a/test/test_ops.py b/test/test_ops.py index c97270199a..46a89dd8bb 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -2122,8 +2122,8 @@ class TestOps(unittest.TestCase): lambda x,w: Tensor.conv2d(x,w,padding=p).relu()) def _test_conv2d(self, bs=1, cin=1, cout=6): - for H in [1,2,3]: - for W in [1,2,3,5]: + for H in [2,3]: + for W in [1,3,5]: for groups in [1,3] if cin == 3 and cout == 6 and H == 3 and W == 3 else [1]: with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H, width=W): helper_test_op([(bs,cin,5,7), (cout,cin//groups,H,W)], @@ -2300,7 +2300,7 @@ class TestOps(unittest.TestCase): def test_max_pool2d(self): for ksz in [(2,2), (3,3), 2, 3, (3,2), (5,5), (5,1)]: with self.subTest(kernel_size=ksz): - helper_test_op([(32,2,110,28)], + helper_test_op([(32,2,11,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz), lambda x: Tensor.max_pool2d(x, kernel_size=ksz)) @@ -2308,7 +2308,7 @@ class TestOps(unittest.TestCase): for ksz in [(2,2), (3,3), 2, 3, (3,2)]: for p in [1, (1,0), (0,1)]: with self.subTest(kernel_size=ksz, padding=p): - helper_test_op([(32,2,110,28)], + helper_test_op([(32,2,11,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz, padding=p), lambda x: Tensor.max_pool2d(x, kernel_size=ksz, padding=p)) self.helper_test_exception([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), padding=(1,1,1)), @@ -2324,40 +2324,40 @@ class TestOps(unittest.TestCase): def test_max_pool2d_padding_int(self): ksz = (2,2) - helper_test_op([(32,2,110,28)], + helper_test_op([(32,2,11,28)], lambda x: torch.nn.functional.max_pool2d(x.int(), kernel_size=ksz, padding=1), lambda x: Tensor.max_pool2d(x.int(), kernel_size=ksz, padding=1), forward_only=True) def test_max_pool2d_bigger_stride(self): for stride in [(2,3), (3,2), 2, 3]: with self.subTest(stride=stride): - helper_test_op([(32,2,110,28)], + helper_test_op([(32,2,11,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), stride=stride), lambda x: Tensor.max_pool2d(x, kernel_size=(2,2), stride=stride)) def test_max_pool2d_bigger_stride_dilation(self): for stride, dilation in zip([(2,3), (3,2), 2, 3, 4], [(3,2), (2,3), 2, 3, 6]): with self.subTest(stride=stride): - helper_test_op([(32,2,110,28)], + helper_test_op([(32,2,11,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), stride=stride, dilation=dilation), lambda x: Tensor.max_pool2d(x, kernel_size=(2,2), stride=stride, dilation=dilation)) @unittest.skipIf( Device.DEFAULT in {"CUDA", "NV"}, "CUDA fails on this") def test_max_pool2d_unit_stride(self): - helper_test_op([(8, 2, 17, 14)], + helper_test_op([(3, 2, 17, 14)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), stride=1), lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), stride=1)) def test_max_pool2d_smaller_stride(self): for stride in [(2,3), (3,2), 2, 3]: with self.subTest(stride=stride): - helper_test_op([(8, 2, 17, 14)], + helper_test_op([(3, 2, 17, 14)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), stride=stride), lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), stride=stride)) def test_max_pool2d_dilation(self): for dilation in [(2, 3), (3, 2), 2, 3]: - helper_test_op([(8, 2, 17, 14)], + helper_test_op([(3, 2, 17, 14)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), dilation=dilation), lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), dilation=dilation)) @@ -2541,13 +2541,13 @@ class TestOps(unittest.TestCase): def test_interpolate_nearest_exact(self): self.test_interpolate_nearest("nearest-exact") def test_interpolate_bilinear(self): - for in_sz, out_sz in [((52,40),(29,31)), ((52,29),(31,40)), ((29,31),(40,52))]: + for in_sz, out_sz in [((12,20),(9,31)), ((12,9),(31,20)), ((9,31),(20,12))]: helper_test_op([(2,3)+in_sz], lambda x: torch.nn.functional.interpolate(x, size=out_sz, mode="bilinear"), lambda x: Tensor.interpolate(x, size=out_sz, mode="linear"), atol=1e-4) def test_interpolate_bilinear_corners_aligned(self): - for in_sz, out_sz in [((52,40),(29,31)), ((52,29),(31,40)), ((29,31),(40,52))]: + for in_sz, out_sz in [((12,20),(9,31)), ((12,9),(31,20)), ((9,31),(20,12))]: helper_test_op([(2,3)+in_sz], lambda x: torch.nn.functional.interpolate(x, size=out_sz, mode="bilinear", align_corners=True), lambda x: Tensor.interpolate(x, size=out_sz, mode="linear", align_corners=True), atol=1e-4) @@ -2838,7 +2838,7 @@ class TestOps(unittest.TestCase): b = torch.randint(3, size=[3,4,5], dtype=torch.int64, requires_grad=False) a = Tensor(b.detach().cpu().numpy().astype(np.int32), dtype=dtypes.int32, requires_grad=False) for reduce in ("sum", "prod", "mean", "amin", "amax"): - for dim in (0,1,2,-1,-2,-3): + for dim in (-1,1,-3): helper_test_op([(4,5,6), (4,5,6)], lambda x,src: x.scatter_reduce(dim=dim, index=b, src=src, reduce=reduce), lambda x,src: x.scatter_reduce(dim=dim, index=a, src=src, reduce=reduce), forward_only=True) diff --git a/test/test_schedule.py b/test/test_schedule.py index ba12ae26aa..c6e7cd0b00 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -1053,9 +1053,9 @@ class TestSchedule(unittest.TestCase): def test_scaled_dot_product_attention_multireduce_fusion(self): Tensor.manual_seed(0) - q = Tensor.randn(32,8,16,64).realize() - k = Tensor.randn(32,8,16,64).realize() - v = Tensor.randn(32,8,16,64).realize() + q = Tensor.randn(32,8,16,8).realize() + k = Tensor.randn(32,8,16,8).realize() + v = Tensor.randn(32,8,16,8).realize() out = Tensor.scaled_dot_product_attention(q,k,v) run_schedule(check_schedule(out, 5)) if getenv("CHECK", 1): @@ -1296,7 +1296,7 @@ class TestSchedule(unittest.TestCase): def test_sgd_4convs_fuse(self): with Tensor.train(): - img = Tensor.empty(2,3,64,64) + img = Tensor.empty(2,3,16,16) c1 = nn.Conv2d(3,4,3,bias=False) c2 = nn.Conv2d(4,8,3,bias=False) c3 = nn.Conv2d(8,16,3,bias=False) @@ -1309,7 +1309,7 @@ class TestSchedule(unittest.TestCase): def test_sgd_4convs_fuse_conv_bw(self): with Tensor.train(): - img = Tensor.empty(2,3,64,64) + img = Tensor.empty(2,3,16,16) c1 = nn.Conv2d(3,4,3,bias=False) c2 = nn.Conv2d(4,8,3,bias=False) c3 = nn.Conv2d(8,16,3,bias=False) diff --git a/test/test_conv.py b/test/unit/test_conv.py similarity index 100% rename from test/test_conv.py rename to test/unit/test_conv.py diff --git a/test/imported/test_indexing.py b/test/unit/test_indexing.py similarity index 100% rename from test/imported/test_indexing.py rename to test/unit/test_indexing.py