From 81b9c04574c3899af605581af13cd8e36338f351 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Sun, 8 Jun 2025 14:05:56 -0700
Subject: [PATCH] move high level stuff to unit tests [pr] (#10708)

* move high level stuff to unit tests [pr]

* process replay on unit tests

* fix pr, less compute

* set omp num threads

* set 300MB buffer size limit

* delete junk

* fix tests

* faster

* move test_indexing to unit

* faster
---
 .github/actions/setup-tinygrad/action.yml     |  5 +++-
 .github/workflows/test.yml                    | 13 ++++++----
 .pre-commit-config.yaml                       |  2 +-
 test/external/external_test_datasets.py       |  4 +--
 .../external/process_replay/process_replay.py |  6 +++--
 test/imported/__init__.py                     |  0
 test/models/test_real_world.py                |  2 +-
 test/test_dtype.py                            |  1 +
 test/test_linearizer_failures.py              |  1 +
 test/test_multitensor.py                      |  4 +--
 test/test_ops.py                              | 26 +++++++++----------
 test/test_schedule.py                         | 10 +++----
 test/{ => unit}/test_conv.py                  |  0
 test/{imported => unit}/test_indexing.py      |  0
 14 files changed, 42 insertions(+), 32 deletions(-)
 delete mode 100644 test/imported/__init__.py
 rename test/{ => unit}/test_conv.py (100%)
 rename test/{imported => unit}/test_indexing.py (100%)

diff --git a/.github/actions/setup-tinygrad/action.yml b/.github/actions/setup-tinygrad/action.yml
index 51ad0897f9..cc685d763d 100644
--- a/.github/actions/setup-tinygrad/action.yml
+++ b/.github/actions/setup-tinygrad/action.yml
@@ -93,10 +93,13 @@ runs:
           . .venv/bin/activate
         fi
         python -m pip install -e . ${{ inputs.pydeps }}
-    - name: Export venv
+    - name: Set up venv environment
       shell: bash
       run: |
         echo "VIRTUAL_ENV=${{ github.workspace }}/.venv" >> "$GITHUB_ENV"
+        echo "OMP_NUM_THREADS=1" >> "$GITHUB_ENV"
+        # no buffers should be over 300MB in CI
+        echo "MAX_BUFFER_SIZE=300000000" >> "$GITHUB_ENV"
         if [[ "$RUNNER_OS" == "Windows" ]]; then
           echo "${{ github.workspace }}/.venv/Scripts" >> "$GITHUB_PATH"
         else
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e36194c3cb..a1e8a9456b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -364,6 +364,8 @@ jobs:
     #  run: NULL=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 3 --temperature 0 --timing
     - name: Run GC tests
       run: PYTHONPATH="." python test/external/external_uop_gc.py
+    - name: Run process replay tests
+      uses: ./.github/actions/process-replay
     - name: Repo line count < 14000 lines
       run: MAX_LINE_COUNT=14000 python sz.py
 
@@ -475,9 +477,9 @@ jobs:
         run: CPU=1 PYTHONPATH=. python3 test/test_quantize_onnx.py
       - name: Run REMOTE=1 Test
         run: |
-          REMOTEDEV=CPU REMOTE=1 python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py
-          REMOTEDEV=GPU REMOTE=1 python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py
-          REMOTEDEV=GPU IMAGE=2 REMOTE=1 python3 -m pytest test/test_tiny.py test/test_image_dtype.py
+          REMOTEDEV=CPU REMOTE=1 python3 -m pytest -n=auto test/test_tiny.py test/test_jit.py test/test_multitensor.py
+          REMOTEDEV=GPU REMOTE=1 python3 -m pytest -n=auto test/test_tiny.py test/test_image_dtype.py test/test_jit.py
+          REMOTEDEV=GPU IMAGE=2 REMOTE=1 python3 -m pytest -n=auto test/test_tiny.py test/test_image_dtype.py
       - name: Test Optimization Helpers
         run: PYTHONPATH="." DEBUG=1 python3 extra/optimization/test_helpers.py
       - name: Test Action Space
@@ -618,7 +620,7 @@ jobs:
         if: matrix.backend=='amdllvm'
         run: python test/test_amd_llvm.py
       - name: Run pytest (amd)
-        run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py test/external/external_test_am.py --durations=20
+        run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py test/external/external_test_am.py --durations=20
       - name: Run TRANSCENDENTAL math
         run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
       - name: Run TestOps.test_add with SQTT
@@ -654,7 +656,8 @@ jobs:
           PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
           DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
       - name: Run pytest (cuda)
-        run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore test/test_gc.py --durations=20
+        # skip multitensor because it's slow
+        run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore test/test_gc.py --ignore test/test_multitensor.py --durations=20
       - name: Run process replay tests
         uses: ./.github/actions/process-replay
 
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4968fbaf13..0e23d11069 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
         pass_filenames: false
       - id: tests
         name: subset of tests
-        entry: env MAX_BUFFER_SIZE=200000000 PYTHONPATH="." python3 -m pytest -n=4 test/unit/ test/test_ops.py test/test_dtype.py test/test_schedule.py test/test_assign.py test/test_symbolic_shapetracker.py
+        entry: env MAX_BUFFER_SIZE=300000000 PYTHONPATH="." python3 -m pytest -n=4 --ignore=test/unit/test_keccak.py --ignore=test/unit/test_indexing.py test/unit/ test/test_ops.py test/test_dtype.py test/test_schedule.py test/test_assign.py test/test_symbolic_shapetracker.py
         language: system
         always_run: true
         pass_filenames: false
diff --git a/test/external/external_test_datasets.py b/test/external/external_test_datasets.py
index 8fd3c77623..19e58d80de 100644
--- a/test/external/external_test_datasets.py
+++ b/test/external/external_test_datasets.py
@@ -152,7 +152,7 @@ class TestOpenImagesDataset(ExternalTestDatasets):
       ref_tgt = postprocess_targets(ref_tgt, anchors.unsqueeze(0))
       ref_boxes, ref_labels = ref_tgt[0]["boxes"], ref_tgt[0]["labels"]
 
-      np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy())
+      np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy(), rtol=1e-6)
       np.testing.assert_equal(tinygrad_boxes[0].numpy(), ref_boxes.numpy())
       np.testing.assert_equal(tinygrad_labels[0].numpy(), ref_labels.numpy())
 
@@ -165,7 +165,7 @@ class TestOpenImagesDataset(ExternalTestDatasets):
 
     for ((tinygrad_img, _, _, _), (ref_img, _)) in zip(tinygrad_dataloader, ref_dataloader):
       ref_img, _ = transform(ref_img.unsqueeze(0))
-      np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy())
+      np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy(), rtol=1e-6)
 
 if __name__ == '__main__':
   unittest.main()
diff --git a/test/external/process_replay/process_replay.py b/test/external/process_replay/process_replay.py
index 9dfa32e36a..1ff2873fcf 100755
--- a/test/external/process_replay/process_replay.py
+++ b/test/external/process_replay/process_replay.py
@@ -34,7 +34,7 @@ class ProcessReplayWarning(Warning): pass
 def replay_kernelize(ret:dict[UOp, UOp], big_sink:UOp) -> tuple[str, str, tuple[Any, ...]]:
   UOp.unique_num = itertools.count(max([u.arg for u in big_sink.toposort() if u.op is Ops.UNIQUE], default=0)+1)
   new_sink = big_sink.substitute(get_kernelize_map(big_sink))
-  def to_str(ret:UOp):
+  def to_str(ret:UOp) -> str:
     asts = [repr(u.arg.ast) for u in ret.toposort() if u.op is Ops.KERNEL]
     return "\n".join([f"{len(asts)} kernels", *asts])
   return to_str(new_sink), to_str(ret[big_sink]), (big_sink,)
@@ -44,7 +44,9 @@ def replay_linearize(k:Kernel, _:Kernel, name_override=None, ast_transform=None)
   # this should be made fully functional. It's fine for process replay since copy returns a fresh instance
   k2 = k.copy()
   k2.linearize(name_override=name_override or to_function_name(k.name), ast_transform=ast_transform)
-  def to_str(ret:Kernel): return ret.opts.render(ret.uops)
+  def to_str(ret:Kernel) -> str:
+    try: return ret.opts.render(ret.uops)
+    except NotImplementedError: return "" # NULL backend doesn't have a renderer, this is okay
   return to_str(k2), to_str(k), (k.ast, k.opts, k.applied_opts)
 
 replayers: dict[str, Callable[..., tuple[str, str, tuple[Any, ...]]]] = {"get_kernelize_map":replay_kernelize, "linearize":replay_linearize}
diff --git a/test/imported/__init__.py b/test/imported/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py
index fd19e728b3..e6facd4a64 100644
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -61,7 +61,7 @@ class TestRealWorld(unittest.TestCase):
     derandomize_model(model)
     @TinyJit
     def test(t, t2): return model(t, Tensor([801]), t2).realize()
-    helper_test("test_sd", lambda: (Tensor.randn(1, 4, 64, 64),Tensor.randn(1, 77, params["ctx_dim"])), test, 18.0, 515)
+    helper_test("test_sd", lambda: (Tensor.randn(1, 4, 32, 32),Tensor.randn(1, 77, params["ctx_dim"])), test, 18.0, 515)
 
   def test_unet_resblock(self):
     model = [ResBlock(16, 24, 16) for _ in range(4)]
diff --git a/test/test_dtype.py b/test/test_dtype.py
index 0c5f9b41ec..734d629887 100644
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@@ -523,6 +523,7 @@ class TestTypeSpec(unittest.TestCase):
       dtypes.default_float = default_float
       assert dtypes.default_float == default_float
 
+  @unittest.skip("this test is slow and spawning whole pythons")
   def test_env_set_default_float(self):
     # check default
     subprocess.run(['python3 -c "from tinygrad import dtypes; assert dtypes.default_float == dtypes.float"'],
diff --git a/test/test_linearizer_failures.py b/test/test_linearizer_failures.py
index bce7278d08..b07936799a 100644
--- a/test/test_linearizer_failures.py
+++ b/test/test_linearizer_failures.py
@@ -1254,6 +1254,7 @@ class TestLinearizerFailures(unittest.TestCase):
     opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))]
     helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[])
 
+  @unittest.skip("allocating over 200MB buffer")
   @unittest.skipIf(CI and Device.DEFAULT in {"METAL"}, "hangs metal gpu CI")
   def test_failure_52(self):
     # resnet beam.
diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index f06dc866a6..0f6f16b48a 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -1213,8 +1213,8 @@ class TestMultiTransformer(unittest.TestCase):
     device = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))
 
     from extra.models.llama import Transformer
-    args = {"dim": 64, "n_heads": 1, "n_kv_heads": 1, "n_layers": 2, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 1024,
-            "hidden_dim": 64, "max_context": 12}
+    args = {"dim": 32, "n_heads": 1, "n_kv_heads": 1, "n_layers": 2, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 1024,
+            "hidden_dim": 32, "max_context": 12}
     real_model = Transformer(**args)
     shard_model = Transformer(**args)
 
diff --git a/test/test_ops.py b/test/test_ops.py
index c97270199a..46a89dd8bb 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -2122,8 +2122,8 @@ class TestOps(unittest.TestCase):
               lambda x,w: Tensor.conv2d(x,w,padding=p).relu())
 
   def _test_conv2d(self, bs=1, cin=1, cout=6):
-    for H in [1,2,3]:
-      for W in [1,2,3,5]:
+    for H in [2,3]:
+      for W in [1,3,5]:
         for groups in [1,3] if cin == 3 and cout == 6 and H == 3 and W == 3 else [1]:
           with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H, width=W):
             helper_test_op([(bs,cin,5,7), (cout,cin//groups,H,W)],
@@ -2300,7 +2300,7 @@ class TestOps(unittest.TestCase):
   def test_max_pool2d(self):
     for ksz in [(2,2), (3,3), 2, 3, (3,2), (5,5), (5,1)]:
       with self.subTest(kernel_size=ksz):
-        helper_test_op([(32,2,110,28)],
+        helper_test_op([(32,2,11,28)],
           lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz),
           lambda x: Tensor.max_pool2d(x, kernel_size=ksz))
 
@@ -2308,7 +2308,7 @@ class TestOps(unittest.TestCase):
     for ksz in [(2,2), (3,3), 2, 3, (3,2)]:
       for p in [1, (1,0), (0,1)]:
         with self.subTest(kernel_size=ksz, padding=p):
-          helper_test_op([(32,2,110,28)],
+          helper_test_op([(32,2,11,28)],
             lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz, padding=p),
             lambda x: Tensor.max_pool2d(x, kernel_size=ksz, padding=p))
     self.helper_test_exception([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), padding=(1,1,1)),
@@ -2324,40 +2324,40 @@ class TestOps(unittest.TestCase):
 
   def test_max_pool2d_padding_int(self):
     ksz = (2,2)
-    helper_test_op([(32,2,110,28)],
+    helper_test_op([(32,2,11,28)],
       lambda x: torch.nn.functional.max_pool2d(x.int(), kernel_size=ksz, padding=1),
       lambda x: Tensor.max_pool2d(x.int(), kernel_size=ksz, padding=1), forward_only=True)
 
   def test_max_pool2d_bigger_stride(self):
     for stride in [(2,3), (3,2), 2, 3]:
       with self.subTest(stride=stride):
-        helper_test_op([(32,2,110,28)],
+        helper_test_op([(32,2,11,28)],
           lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), stride=stride),
           lambda x: Tensor.max_pool2d(x, kernel_size=(2,2), stride=stride))
 
   def test_max_pool2d_bigger_stride_dilation(self):
     for stride, dilation in zip([(2,3), (3,2), 2, 3, 4], [(3,2), (2,3), 2, 3, 6]):
       with self.subTest(stride=stride):
-        helper_test_op([(32,2,110,28)],
+        helper_test_op([(32,2,11,28)],
           lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), stride=stride, dilation=dilation),
           lambda x: Tensor.max_pool2d(x, kernel_size=(2,2), stride=stride, dilation=dilation))
 
   @unittest.skipIf( Device.DEFAULT in {"CUDA", "NV"}, "CUDA fails on this")
   def test_max_pool2d_unit_stride(self):
-    helper_test_op([(8, 2, 17, 14)],
+    helper_test_op([(3, 2, 17, 14)],
       lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), stride=1),
       lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), stride=1))
 
   def test_max_pool2d_smaller_stride(self):
     for stride in [(2,3), (3,2), 2, 3]:
       with self.subTest(stride=stride):
-        helper_test_op([(8, 2, 17, 14)],
+        helper_test_op([(3, 2, 17, 14)],
           lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), stride=stride),
           lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), stride=stride))
 
   def test_max_pool2d_dilation(self):
     for dilation in [(2, 3), (3, 2), 2, 3]:
-      helper_test_op([(8, 2, 17, 14)],
+      helper_test_op([(3, 2, 17, 14)],
         lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), dilation=dilation),
         lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), dilation=dilation))
 
@@ -2541,13 +2541,13 @@ class TestOps(unittest.TestCase):
   def test_interpolate_nearest_exact(self): self.test_interpolate_nearest("nearest-exact")
 
   def test_interpolate_bilinear(self):
-    for in_sz, out_sz in [((52,40),(29,31)), ((52,29),(31,40)), ((29,31),(40,52))]:
+    for in_sz, out_sz in [((12,20),(9,31)), ((12,9),(31,20)), ((9,31),(20,12))]:
       helper_test_op([(2,3)+in_sz],
         lambda x: torch.nn.functional.interpolate(x, size=out_sz, mode="bilinear"),
         lambda x: Tensor.interpolate(x, size=out_sz, mode="linear"), atol=1e-4)
 
   def test_interpolate_bilinear_corners_aligned(self):
-    for in_sz, out_sz in [((52,40),(29,31)), ((52,29),(31,40)), ((29,31),(40,52))]:
+    for in_sz, out_sz in [((12,20),(9,31)), ((12,9),(31,20)), ((9,31),(20,12))]:
       helper_test_op([(2,3)+in_sz],
         lambda x: torch.nn.functional.interpolate(x, size=out_sz, mode="bilinear", align_corners=True),
         lambda x: Tensor.interpolate(x, size=out_sz, mode="linear", align_corners=True), atol=1e-4)
@@ -2838,7 +2838,7 @@ class TestOps(unittest.TestCase):
     b = torch.randint(3, size=[3,4,5], dtype=torch.int64, requires_grad=False)
     a = Tensor(b.detach().cpu().numpy().astype(np.int32), dtype=dtypes.int32, requires_grad=False)
     for reduce in ("sum", "prod", "mean", "amin", "amax"):
-      for dim in (0,1,2,-1,-2,-3):
+      for dim in (-1,1,-3):
         helper_test_op([(4,5,6), (4,5,6)],
           lambda x,src: x.scatter_reduce(dim=dim, index=b, src=src, reduce=reduce),
           lambda x,src: x.scatter_reduce(dim=dim, index=a, src=src, reduce=reduce), forward_only=True)
diff --git a/test/test_schedule.py b/test/test_schedule.py
index ba12ae26aa..c6e7cd0b00 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -1053,9 +1053,9 @@ class TestSchedule(unittest.TestCase):
 
   def test_scaled_dot_product_attention_multireduce_fusion(self):
     Tensor.manual_seed(0)
-    q = Tensor.randn(32,8,16,64).realize()
-    k = Tensor.randn(32,8,16,64).realize()
-    v = Tensor.randn(32,8,16,64).realize()
+    q = Tensor.randn(32,8,16,8).realize()
+    k = Tensor.randn(32,8,16,8).realize()
+    v = Tensor.randn(32,8,16,8).realize()
     out = Tensor.scaled_dot_product_attention(q,k,v)
     run_schedule(check_schedule(out, 5))
     if getenv("CHECK", 1):
@@ -1296,7 +1296,7 @@ class TestSchedule(unittest.TestCase):
 
   def test_sgd_4convs_fuse(self):
     with Tensor.train():
-      img = Tensor.empty(2,3,64,64)
+      img = Tensor.empty(2,3,16,16)
       c1 = nn.Conv2d(3,4,3,bias=False)
       c2 = nn.Conv2d(4,8,3,bias=False)
       c3 = nn.Conv2d(8,16,3,bias=False)
@@ -1309,7 +1309,7 @@ class TestSchedule(unittest.TestCase):
 
   def test_sgd_4convs_fuse_conv_bw(self):
     with Tensor.train():
-      img = Tensor.empty(2,3,64,64)
+      img = Tensor.empty(2,3,16,16)
       c1 = nn.Conv2d(3,4,3,bias=False)
       c2 = nn.Conv2d(4,8,3,bias=False)
       c3 = nn.Conv2d(8,16,3,bias=False)
diff --git a/test/test_conv.py b/test/unit/test_conv.py
similarity index 100%
rename from test/test_conv.py
rename to test/unit/test_conv.py
diff --git a/test/imported/test_indexing.py b/test/unit/test_indexing.py
similarity index 100%
rename from test/imported/test_indexing.py
rename to test/unit/test_indexing.py