move high level stuff to unit tests [pr] (#10708)

* move high level stuff to unit tests [pr]

* process replay on unit tests

* fix pr, less compute

* set omp num threads

* set 200MB buffer size limit

* delete junk

* fix tests

* faster

* move test_indexing to unit

* faster
Author: George Hotz
Date: 2025-06-08 14:05:56 -07:00
Committed by: GitHub
commit 81b9c04574 (parent 171580e9ec)
14 changed files with 42 additions and 32 deletions

View File

@@ -93,10 +93,13 @@ runs:
           . .venv/bin/activate
         fi
         python -m pip install -e . ${{ inputs.pydeps }}
-    - name: Export venv
+    - name: Set up venv environment
       shell: bash
       run: |
         echo "VIRTUAL_ENV=${{ github.workspace }}/.venv" >> "$GITHUB_ENV"
+        echo "OMP_NUM_THREADS=1" >> "$GITHUB_ENV"
+        # no buffers should be over 300MB in CI
+        echo "MAX_BUFFER_SIZE=300000000" >> "$GITHUB_ENV"
         if [[ "$RUNNER_OS" == "Windows" ]]; then
           echo "${{ github.workspace }}/.venv/Scripts" >> "$GITHUB_PATH"
         else
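For context on the two new knobs: OMP_NUM_THREADS=1 keeps OpenMP/BLAS thread pools from oversubscribing the small CI runners, and MAX_BUFFER_SIZE caps single allocations so a runaway test fails fast instead of OOM-killing the job. A minimal sketch of how such a cap can be honored at allocation time; the AllocationError type and _check_size hook are illustrative, not tinygrad's actual allocator internals:

import os

class AllocationError(RuntimeError): pass  # illustrative error type, not tinygrad's

# read the cap once; unset (0) means no limit, e.g. for local runs
MAX_BUFFER_SIZE = int(os.environ.get("MAX_BUFFER_SIZE", "0"))

def _check_size(nbytes: int) -> None:
  # reject an oversized buffer before asking the device allocator for it
  if MAX_BUFFER_SIZE and nbytes > MAX_BUFFER_SIZE:
    raise AllocationError(f"buffer of {nbytes} bytes exceeds MAX_BUFFER_SIZE={MAX_BUFFER_SIZE}")

try: _check_size(400_000_000)   # over the 300MB CI cap, so this raises
except AllocationError as e: print(e)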

View File

@@ -364,6 +364,8 @@ jobs:
     # run: NULL=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 3 --temperature 0 --timing
     - name: Run GC tests
       run: PYTHONPATH="." python test/external/external_uop_gc.py
+    - name: Run process replay tests
+      uses: ./.github/actions/process-replay
     - name: Repo line count < 14000 lines
       run: MAX_LINE_COUNT=14000 python sz.py
@@ -475,9 +477,9 @@ jobs:
       run: CPU=1 PYTHONPATH=. python3 test/test_quantize_onnx.py
     - name: Run REMOTE=1 Test
       run: |
-        REMOTEDEV=CPU REMOTE=1 python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py
-        REMOTEDEV=GPU REMOTE=1 python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py
-        REMOTEDEV=GPU IMAGE=2 REMOTE=1 python3 -m pytest test/test_tiny.py test/test_image_dtype.py
+        REMOTEDEV=CPU REMOTE=1 python3 -m pytest -n=auto test/test_tiny.py test/test_jit.py test/test_multitensor.py
+        REMOTEDEV=GPU REMOTE=1 python3 -m pytest -n=auto test/test_tiny.py test/test_image_dtype.py test/test_jit.py
+        REMOTEDEV=GPU IMAGE=2 REMOTE=1 python3 -m pytest -n=auto test/test_tiny.py test/test_image_dtype.py
     - name: Test Optimization Helpers
       run: PYTHONPATH="." DEBUG=1 python3 extra/optimization/test_helpers.py
     - name: Test Action Space
@@ -618,7 +620,7 @@ jobs:
       if: matrix.backend=='amdllvm'
       run: python test/test_amd_llvm.py
     - name: Run pytest (amd)
-      run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py test/external/external_test_am.py --durations=20
+      run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py test/external/external_test_am.py --durations=20
     - name: Run TRANSCENDENTAL math
       run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
     - name: Run TestOps.test_add with SQTT
@@ -654,7 +656,8 @@ jobs:
         PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
         DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
     - name: Run pytest (cuda)
-      run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore test/test_gc.py --durations=20
+      # skip multitensor because it's slow
+      run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore test/test_gc.py --ignore test/test_multitensor.py --durations=20
     - name: Run process replay tests
       uses: ./.github/actions/process-replay

View File

@@ -28,7 +28,7 @@ repos:
       pass_filenames: false
     - id: tests
       name: subset of tests
-      entry: env MAX_BUFFER_SIZE=200000000 PYTHONPATH="." python3 -m pytest -n=4 test/unit/ test/test_ops.py test/test_dtype.py test/test_schedule.py test/test_assign.py test/test_symbolic_shapetracker.py
+      entry: env MAX_BUFFER_SIZE=300000000 PYTHONPATH="." python3 -m pytest -n=4 --ignore=test/unit/test_keccak.py --ignore=test/unit/test_indexing.py test/unit/ test/test_ops.py test/test_dtype.py test/test_schedule.py test/test_assign.py test/test_symbolic_shapetracker.py
       language: system
       always_run: true
       pass_filenames: false

View File

@@ -152,7 +152,7 @@ class TestOpenImagesDataset(ExternalTestDatasets):
     ref_tgt = postprocess_targets(ref_tgt, anchors.unsqueeze(0))
     ref_boxes, ref_labels = ref_tgt[0]["boxes"], ref_tgt[0]["labels"]
-    np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy())
+    np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy(), rtol=1e-6)
     np.testing.assert_equal(tinygrad_boxes[0].numpy(), ref_boxes.numpy())
     np.testing.assert_equal(tinygrad_labels[0].numpy(), ref_labels.numpy())
@@ -165,7 +165,7 @@ class TestOpenImagesDataset(ExternalTestDatasets):
     for ((tinygrad_img, _, _, _), (ref_img, _)) in zip(tinygrad_dataloader, ref_dataloader):
       ref_img, _ = transform(ref_img.unsqueeze(0))
-      np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy())
+      np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy(), rtol=1e-6)

 if __name__ == '__main__':
   unittest.main()
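A note on the tolerance bump: a bare assert_allclose uses NumPy's default rtol=1e-7, tighter than float32 image pipelines can reliably hit when the tinygrad and reference paths order their resize/normalize arithmetic differently. The check is elementwise |actual - desired| <= atol + rtol * |desired|, so rtol=1e-6 still flags real mismatches while absorbing last-ulp noise:

import numpy as np

a = np.array([1.0], dtype=np.float32)
b = a * np.float32(1 + 5e-7)                 # ~4 ulps of float32 rounding noise
np.testing.assert_allclose(a, b, rtol=1e-6)  # passes
# np.testing.assert_allclose(a, b)           # default rtol=1e-7 would raise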

View File

@@ -34,7 +34,7 @@ class ProcessReplayWarning(Warning): pass
 def replay_kernelize(ret:dict[UOp, UOp], big_sink:UOp) -> tuple[str, str, tuple[Any, ...]]:
   UOp.unique_num = itertools.count(max([u.arg for u in big_sink.toposort() if u.op is Ops.UNIQUE], default=0)+1)
   new_sink = big_sink.substitute(get_kernelize_map(big_sink))
-  def to_str(ret:UOp):
+  def to_str(ret:UOp) -> str:
     asts = [repr(u.arg.ast) for u in ret.toposort() if u.op is Ops.KERNEL]
     return "\n".join([f"{len(asts)} kernels", *asts])
   return to_str(new_sink), to_str(ret[big_sink]), (big_sink,)
@@ -44,7 +44,9 @@ def replay_linearize(k:Kernel, _:Kernel, name_override=None, ast_transform=None)
   # this should be made fully functional. It's fine for process replay since copy returns a fresh instance
   k2 = k.copy()
   k2.linearize(name_override=name_override or to_function_name(k.name), ast_transform=ast_transform)
-  def to_str(ret:Kernel): return ret.opts.render(ret.uops)
+  def to_str(ret:Kernel) -> str:
+    try: return ret.opts.render(ret.uops)
+    except NotImplementedError: return "" # NULL backend doesn't have a renderer, this is okay
   return to_str(k2), to_str(k), (k.ast, k.opts, k.applied_opts)
 replayers: dict[str, Callable[..., tuple[str, str, tuple[Any, ...]]]] = {"get_kernelize_map":replay_kernelize, "linearize":replay_linearize}
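The replayer signature makes the contract visible: each traced function name maps to a callable returning (new output, recorded output, context for reporting). A process-replay driver can then be a generic loop over that mapping; this sketch is illustrative of the shape only, with the recorded list standing in for the pickled trace the real harness reads:

import difflib
from typing import Any, Callable

# stand-in for the recorded trace: (traced function name, captured args)
recorded: list[tuple[str, tuple[Any, ...]]] = []
replayers: dict[str, Callable[..., tuple[str, str, tuple[Any, ...]]]] = {}

def compare_all() -> int:
  mismatches = 0
  for name, args in recorded:
    new, old, _ctx = replayers[name](*args)
    if new != old:  # any textual drift in kernels or rendered code is a failure
      mismatches += 1
      print("\n".join(difflib.unified_diff(old.splitlines(), new.splitlines(), lineterm="")))
  return mismatches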

View File

@@ -61,7 +61,7 @@ class TestRealWorld(unittest.TestCase):
     derandomize_model(model)
     @TinyJit
     def test(t, t2): return model(t, Tensor([801]), t2).realize()
-    helper_test("test_sd", lambda: (Tensor.randn(1, 4, 64, 64),Tensor.randn(1, 77, params["ctx_dim"])), test, 18.0, 515)
+    helper_test("test_sd", lambda: (Tensor.randn(1, 4, 32, 32),Tensor.randn(1, 77, params["ctx_dim"])), test, 18.0, 515)

   def test_unet_resblock(self):
     model = [ResBlock(16, 24, 16) for _ in range(4)]

View File

@@ -523,6 +523,7 @@ class TestTypeSpec(unittest.TestCase):
       dtypes.default_float = default_float
       assert dtypes.default_float == default_float

+  @unittest.skip("this test is slow and spawning whole pythons")
   def test_env_set_default_float(self):
     # check default
     subprocess.run(['python3 -c "from tinygrad import dtypes; assert dtypes.default_float == dtypes.float"'],

View File

@@ -1254,6 +1254,7 @@ class TestLinearizerFailures(unittest.TestCase):
     opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))]
     helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[])

+  @unittest.skip("allocating over 200MB buffer")
   @unittest.skipIf(CI and Device.DEFAULT in {"METAL"}, "hangs metal gpu CI")
   def test_failure_52(self):
     # resnet beam.

View File

@@ -1213,8 +1213,8 @@ class TestMultiTransformer(unittest.TestCase):
     device = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))
     from extra.models.llama import Transformer
-    args = {"dim": 64, "n_heads": 1, "n_kv_heads": 1, "n_layers": 2, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 1024,
-            "hidden_dim": 64, "max_context": 12}
+    args = {"dim": 32, "n_heads": 1, "n_kv_heads": 1, "n_layers": 2, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 1024,
+            "hidden_dim": 32, "max_context": 12}
     real_model = Transformer(**args)
     shard_model = Transformer(**args)

View File

@@ -2122,8 +2122,8 @@ class TestOps(unittest.TestCase):
                      lambda x,w: Tensor.conv2d(x,w,padding=p).relu())

   def _test_conv2d(self, bs=1, cin=1, cout=6):
-    for H in [1,2,3]:
-      for W in [1,2,3,5]:
+    for H in [2,3]:
+      for W in [1,3,5]:
         for groups in [1,3] if cin == 3 and cout == 6 and H == 3 and W == 3 else [1]:
           with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H, width=W):
             helper_test_op([(bs,cin,5,7), (cout,cin//groups,H,W)],
@@ -2300,7 +2300,7 @@
   def test_max_pool2d(self):
     for ksz in [(2,2), (3,3), 2, 3, (3,2), (5,5), (5,1)]:
       with self.subTest(kernel_size=ksz):
-        helper_test_op([(32,2,110,28)],
+        helper_test_op([(32,2,11,28)],
           lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz),
           lambda x: Tensor.max_pool2d(x, kernel_size=ksz))
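The 110 -> 11 shrink on the height axis keeps every kernel in the sweep valid while cutting the pooled area roughly 10x. With the standard pooling shape rule, floor((n + 2*pad - dilation*(k-1) - 1) / stride) + 1 with stride defaulting to the kernel size, even the largest (5,5) kernel still produces output rows; a quick check of that rule, not tinygrad-specific code:

from typing import Optional

def pool_out(n: int, k: int, stride: Optional[int] = None, pad: int = 0, dilation: int = 1) -> int:
  if stride is None: stride = k  # max_pool2d defaults stride to the kernel size
  return (n + 2*pad - dilation*(k-1) - 1) // stride + 1

for k in (2, 3, 5):
  assert pool_out(11, k) >= 1 and pool_out(28, k) >= 1  # every sweep kernel still fits
print(pool_out(110, 5), "->", pool_out(11, 5))          # output rows of work: 22 -> 2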
@@ -2308,7 +2308,7 @@
     for ksz in [(2,2), (3,3), 2, 3, (3,2)]:
       for p in [1, (1,0), (0,1)]:
         with self.subTest(kernel_size=ksz, padding=p):
-          helper_test_op([(32,2,110,28)],
+          helper_test_op([(32,2,11,28)],
            lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz, padding=p),
            lambda x: Tensor.max_pool2d(x, kernel_size=ksz, padding=p))
     self.helper_test_exception([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), padding=(1,1,1)),
@@ -2324,40 +2324,40 @@
   def test_max_pool2d_padding_int(self):
     ksz = (2,2)
-    helper_test_op([(32,2,110,28)],
+    helper_test_op([(32,2,11,28)],
       lambda x: torch.nn.functional.max_pool2d(x.int(), kernel_size=ksz, padding=1),
       lambda x: Tensor.max_pool2d(x.int(), kernel_size=ksz, padding=1), forward_only=True)

   def test_max_pool2d_bigger_stride(self):
     for stride in [(2,3), (3,2), 2, 3]:
       with self.subTest(stride=stride):
-        helper_test_op([(32,2,110,28)],
+        helper_test_op([(32,2,11,28)],
          lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), stride=stride),
          lambda x: Tensor.max_pool2d(x, kernel_size=(2,2), stride=stride))

   def test_max_pool2d_bigger_stride_dilation(self):
     for stride, dilation in zip([(2,3), (3,2), 2, 3, 4], [(3,2), (2,3), 2, 3, 6]):
       with self.subTest(stride=stride):
-        helper_test_op([(32,2,110,28)],
+        helper_test_op([(32,2,11,28)],
          lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), stride=stride, dilation=dilation),
          lambda x: Tensor.max_pool2d(x, kernel_size=(2,2), stride=stride, dilation=dilation))

   @unittest.skipIf( Device.DEFAULT in {"CUDA", "NV"}, "CUDA fails on this")
   def test_max_pool2d_unit_stride(self):
-    helper_test_op([(8, 2, 17, 14)],
+    helper_test_op([(3, 2, 17, 14)],
      lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), stride=1),
      lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), stride=1))

   def test_max_pool2d_smaller_stride(self):
     for stride in [(2,3), (3,2), 2, 3]:
       with self.subTest(stride=stride):
-        helper_test_op([(8, 2, 17, 14)],
+        helper_test_op([(3, 2, 17, 14)],
          lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), stride=stride),
          lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), stride=stride))

   def test_max_pool2d_dilation(self):
     for dilation in [(2, 3), (3, 2), 2, 3]:
-      helper_test_op([(8, 2, 17, 14)],
+      helper_test_op([(3, 2, 17, 14)],
        lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), dilation=dilation),
        lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), dilation=dilation))
@@ -2541,13 +2541,13 @@
   def test_interpolate_nearest_exact(self): self.test_interpolate_nearest("nearest-exact")

   def test_interpolate_bilinear(self):
-    for in_sz, out_sz in [((52,40),(29,31)), ((52,29),(31,40)), ((29,31),(40,52))]:
+    for in_sz, out_sz in [((12,20),(9,31)), ((12,9),(31,20)), ((9,31),(20,12))]:
       helper_test_op([(2,3)+in_sz],
         lambda x: torch.nn.functional.interpolate(x, size=out_sz, mode="bilinear"),
         lambda x: Tensor.interpolate(x, size=out_sz, mode="linear"), atol=1e-4)

   def test_interpolate_bilinear_corners_aligned(self):
-    for in_sz, out_sz in [((52,40),(29,31)), ((52,29),(31,40)), ((29,31),(40,52))]:
+    for in_sz, out_sz in [((12,20),(9,31)), ((12,9),(31,20)), ((9,31),(20,12))]:
       helper_test_op([(2,3)+in_sz],
         lambda x: torch.nn.functional.interpolate(x, size=out_sz, mode="bilinear", align_corners=True),
         lambda x: Tensor.interpolate(x, size=out_sz, mode="linear", align_corners=True), atol=1e-4)
@@ -2838,7 +2838,7 @@
     b = torch.randint(3, size=[3,4,5], dtype=torch.int64, requires_grad=False)
     a = Tensor(b.detach().cpu().numpy().astype(np.int32), dtype=dtypes.int32, requires_grad=False)
     for reduce in ("sum", "prod", "mean", "amin", "amax"):
-      for dim in (0,1,2,-1,-2,-3):
+      for dim in (-1,1,-3):
         helper_test_op([(4,5,6), (4,5,6)],
           lambda x,src: x.scatter_reduce(dim=dim, index=b, src=src, reduce=reduce),
           lambda x,src: x.scatter_reduce(dim=dim, index=a, src=src, reduce=reduce), forward_only=True)
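Trimming the dim sweep from (0,1,2,-1,-2,-3) to (-1,1,-3) halves the scatter_reduce cases without losing coverage: for a rank-3 tensor, negative dims wrap as dim % 3, so the old list hit each axis twice and the new one hits each exactly once:

# for ndim=3, negative dims normalize as dim % 3
assert sorted(d % 3 for d in (-1, 1, -3)) == [0, 1, 2]                     # new sweep: each axis once
assert sorted(d % 3 for d in (0, 1, 2, -1, -2, -3)) == [0, 0, 1, 1, 2, 2]  # old sweep: each axis twice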

View File

@@ -1053,9 +1053,9 @@ class TestSchedule(unittest.TestCase):
   def test_scaled_dot_product_attention_multireduce_fusion(self):
     Tensor.manual_seed(0)
-    q = Tensor.randn(32,8,16,64).realize()
-    k = Tensor.randn(32,8,16,64).realize()
-    v = Tensor.randn(32,8,16,64).realize()
+    q = Tensor.randn(32,8,16,8).realize()
+    k = Tensor.randn(32,8,16,8).realize()
+    v = Tensor.randn(32,8,16,8).realize()
     out = Tensor.scaled_dot_product_attention(q,k,v)
     run_schedule(check_schedule(out, 5))
     if getenv("CHECK", 1):
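For scale: the kernel-count assertion stays at 5 schedules, since it tracks the fusion structure of attention rather than tensor sizes, while shrinking the head dim 64 -> 8 cuts each realized q/k/v buffer 8x. A quick size check, assuming float32 at 4 bytes per element:

# bytes per tensor for shape (32, 8, 16, head_dim) in float32
for hd in (64, 8):
  print(hd, 32*8*16*hd*4 // 1024, "KiB")  # 1024 KiB -> 128 KiB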
@@ -1296,7 +1296,7 @@ class TestSchedule(unittest.TestCase):
   def test_sgd_4convs_fuse(self):
     with Tensor.train():
-      img = Tensor.empty(2,3,64,64)
+      img = Tensor.empty(2,3,16,16)
       c1 = nn.Conv2d(3,4,3,bias=False)
       c2 = nn.Conv2d(4,8,3,bias=False)
       c3 = nn.Conv2d(8,16,3,bias=False)
@@ -1309,7 +1309,7 @@ class TestSchedule(unittest.TestCase):
   def test_sgd_4convs_fuse_conv_bw(self):
     with Tensor.train():
-      img = Tensor.empty(2,3,64,64)
+      img = Tensor.empty(2,3,16,16)
       c1 = nn.Conv2d(3,4,3,bias=False)
       c2 = nn.Conv2d(4,8,3,bias=False)
       c3 = nn.Conv2d(8,16,3,bias=False)