move device tests to test/device + test cleanups (#11735)

* move device tests to test/device

* test speedups

* test device

* linalg to unit

* upd

* so pytest just works

* more divide and skip

* speed

* test devectorize

* add pillow
George Hotz
2025-08-19 16:02:20 -07:00
committed by GitHub
parent bcc7623025
commit 1d307f568c
18 changed files with 49 additions and 125 deletions

View File

@@ -63,7 +63,7 @@ jobs:
- name: Run model inference benchmark
run: METAL=1 python3.11 test/external/external_model_benchmark.py
- name: Test speed vs torch
-run: BIG=2 MPS=1 python3.11 test/test_speed_v_torch.py | tee torch_speed.txt
+run: BIG=2 MPS=1 python3.11 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
- name: Test tensor cores
run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
- name: Test AMX tensor cores
@@ -187,7 +187,7 @@ jobs:
- name: Run model inference benchmark
run: NV=1 CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py
- name: Test speed vs torch
-run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
+run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
- name: Test speed vs theoretical
run: NV=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
- name: Test benchmark allreduce
@@ -389,7 +389,7 @@ jobs:
#- name: Test speed vs torch
# run: |
# python3 -c "import torch; print(torch.__version__)"
-# LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
+# LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
- name: Test speed vs theoretical
run: AMD=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
- name: Test tensor cores

View File

@@ -30,9 +30,9 @@ jobs:
- name: External Benchmark Schedule
run: PYTHONPATH="." python3 test/external/external_benchmark_schedule.py
- name: Speed Test
-run: LLVM=1 python3 test/test_speed_v_torch.py
+run: LLVM=1 python3 test/external/external_test_speed_v_torch.py
- name: Speed Test (BEAM=2)
-run: BEAM=2 LLVM=1 python3 test/test_speed_v_torch.py
+run: BEAM=2 LLVM=1 python3 test/external/external_test_speed_v_torch.py
docs:
name: Docs
@@ -458,7 +458,7 @@ jobs:
testopenpilot:
name: 'openpilot Compile Tests'
runs-on: ubuntu-22.04
-timeout-minutes: 10
+timeout-minutes: 15
env:
IGNORE_OOB: 0
steps:
@@ -589,6 +589,29 @@ jobs:
- name: Run process replay tests
uses: ./.github/actions/process-replay
+testdevectorize:
+name: Linux (devectorize)
+runs-on: ubuntu-24.04
+timeout-minutes: 15
+env:
+IGNORE_OOB: 0
+steps:
+- name: Checkout Code
+uses: actions/checkout@v4
+- name: Setup Environment
+uses: ./.github/actions/setup-tinygrad
+with:
+key: devectorize-minimal
+deps: testing_minimal
+pydeps: "pillow"
+llvm: "true"
+- name: Test LLVM=1 DEVECTORIZE=0
+run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
+- name: Test LLVM=1 DEVECTORIZE=0 for model
+run: PYTHONPATH="." LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
+- name: Test CPU=1 DEVECTORIZE=0
+run: CPU=1 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
testdsp:
name: Linux (DSP)
runs-on: ubuntu-24.04
@@ -624,12 +647,6 @@ jobs:
run: CC=clang-20 PYTHONPATH="." DEBUG=2 DSP=1 python test/test_transcendental.py TestTranscendentalVectorized
- name: Test quantize onnx
run: PYTHONPATH="." DEBUG=2 DSP=1 python3 test/test_quantize_onnx.py
-- name: Test LLVM=1 DEVECTORIZE=0
-run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
-- name: Test LLVM=1 DEVECTORIZE=0 for model
-run: PYTHONPATH="." LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
-- name: Test CPU=1 DEVECTORIZE=0
-run: CPU=1 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
testwebgpu:
name: Linux (WebGPU)
@@ -689,9 +706,9 @@ jobs:
DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
- name: Run LLVM test
if: matrix.backend=='amdllvm'
-run: python test/test_amd_llvm.py
+run: python test/device/test_amd_llvm.py
- name: Run pytest (amd)
-run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py --durations=20
+run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/device/test_hcq.py --durations=20
- name: Run pytest (amd)
run: python -m pytest test/external/external_test_am.py --durations=20
- name: Run TRANSCENDENTAL math
@@ -816,14 +833,14 @@ jobs:
AMD: 1
FORWARD_ONLY: 1
run: |
-python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
+python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
- name: Run pytest (amd with llvm backend)
env:
MOCKGPU: 1
AMD: 1
FORWARD_ONLY: 1
run: |
-python -m pytest -n=auto test/test_hcq.py test/test_tiny.py test/test_amd_llvm.py --durations=20
+python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
- name: Run pytest (ptx)
env:
MOCKGPU: 1
@@ -831,7 +848,7 @@ jobs:
NV: 1
FORWARD_ONLY: 1
run: |
-python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
+python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
- name: Run process replay tests
uses: ./.github/actions/process-replay
@@ -942,18 +959,18 @@ jobs:
env:
HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
run: |
-python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py
+python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations 20
- name: Run REMOTE=1 Test (GPU)
env:
HOST: 127.0.0.1:7667*6
run: |
-python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py
+python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py --durations 20
IMAGE=2 python3 -m pytest test/test_tiny.py test/test_image_dtype.py
- name: Run REMOTE=1 Test (CPU)
env:
HOST: 127.0.0.1:8667*6
run: |
-python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py
+python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py --durations 20
- name: Show remote server logs
if: always()
run: |
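
For context on the new testdevectorize job above: it re-runs the core op tests with the devectorize codegen pass disabled (DEVECTORIZE=0) on both the LLVM and CPU backends. A minimal local sketch of the same check (an illustration, not part of this commit; it assumes tinygrad reads DEVECTORIZE and the backend selector from the environment at import time):

import os
# env flags must be set before tinygrad is imported
os.environ["DEVECTORIZE"] = "0"  # disable the devectorize pass, as in the CI job
os.environ["CPU"] = "1"          # pick the CPU backend, mirroring the last CI step

from tinygrad import Tensor

a, b = Tensor([1.0, 2.0, 3.0]), Tensor([4.0, 5.0, 6.0])
print((a + b).tolist())  # expect [5.0, 7.0, 9.0] regardless of the codegen path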

pytest.ini Normal file
View File

@@ -0,0 +1,2 @@
+[pytest]
+norecursedirs = extra
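
This new pytest.ini is the "so pytest just works" part of the commit: norecursedirs keeps collection from recursing into extra/, which holds non-test helper code. A quick collection check, as a sketch (assumes a tinygrad checkout with pytest installed; the CollectCheck plugin is illustrative, not from the repo):

import pytest

class CollectCheck:
    def __init__(self): self.nodeids = []
    # standard pytest hook, called once per collected test item
    def pytest_itemcollected(self, item): self.nodeids.append(item.nodeid)

check = CollectCheck()
# collect from the repo root; with norecursedirs = extra, nothing under extra/ shows up
pytest.main(["--collect-only", "-q"], plugins=[check])
assert not any(n.startswith("extra/") for n in check.nodeids)
print(f"collected {len(check.nodeids)} tests, none from extra/")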

View File

@@ -2,7 +2,7 @@ import time
from tinygrad import Tensor, TinyJit, Device, Context
from tinygrad.helpers import Profiling, Timing, GlobalCounters
-# python3 test/test_speed_v_torch.py TestSpeed.test_add_a
+# python3 test/external/external_test_speed_v_torch.py TestSpeed.test_add_a
@TinyJit
def plus(a:Tensor, b:Tensor): return a+b

View File

@@ -32,6 +32,7 @@ OPENPILOT_MODEL = "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/mod
np.random.seed(1337)
class TestOnnxModel(unittest.TestCase):
+@unittest.skip("this isn't a test, it can't fail")
def test_benchmark_openpilot_model(self):
onnx_model = fetch(OPENPILOT_MODEL)
run_onnx = OnnxRunner(onnx_model)

View File

@@ -16,7 +16,7 @@ TRANSCRIPTION_2 = "a slightly longer audio file so that we can test batch transc
TEST_FILE_3_URL = 'https://homepage.ntu.edu.tw/~karchung/miniconversations/mc45.mp3'
TRANSCRIPTION_3 = "Just lie back and relax. Is the level of pressure about right? Yes, it's fine, and I'd like conditioner please. Sure. I'm going to start the second lathering now. Would you like some Q-tips? How'd you like it cut? I'd like my bangs and the back trimmed, and I'd like the rest thinned out a bit and layered. Where would you like the part? On the left, right about here. Here, have a look. What do you think? It's fine. Here's a thousand anti-dollars. It's 30-ant extra for the rants. Here's your change and receipt. Thank you, and please come again. So how do you like it? It could have been worse, but you'll notice that I didn't ask her for her card. Hmm, yeah. Maybe you can try that place over there next time." # noqa: E501
-@unittest.skipIf(CI and Device.DEFAULT in ["CPU"], "slow")
+@unittest.skipIf(Device.DEFAULT in ["CPU", "LLVM"], "slow")
@unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16 support")
class TestWhisper(unittest.TestCase):
@classmethod

View File

@@ -3,7 +3,7 @@ from tinygrad import Tensor, Device, TinyJit
from tinygrad.helpers import Timing, CI, OSX
import multiprocessing.shared_memory as shared_memory
-N = 256 if CI else 4096
+N = 256
class TestCopySpeed(unittest.TestCase):
@classmethod
def setUpClass(cls): Device[Device.DEFAULT].synchronize()

View File

@@ -373,11 +373,11 @@ class TestMultiTensor(unittest.TestCase):
np.testing.assert_allclose(y.numpy(), y_shard.numpy(), atol=1e-6, rtol=1e-6)
# NOTE: this is failing on LLVM CI, no idea why. Works locally.
-@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
+@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU", "AMD"), "slow, and flaky on LLVM/CPU")
def test_data_parallel_resnet(self):
from extra.models.resnet import ResNet18
-fake_image = Tensor.rand((2, 3, 224//8, 224//8))
+fake_image = Tensor.rand((2, 3, 224//16, 224//16))
fake_image_sharded = fake_image.shard(devices_2, axis=0)
m = ResNet18()
m.load_from_pretrained()
@@ -409,10 +409,10 @@ class TestMultiTensor(unittest.TestCase):
# sometimes there is zeros in these grads... why?
np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)
-@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
+@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU", "AMD"), "slow, and flaky on LLVM/CPU")
def test_data_parallel_resnet_train_step(self):
from extra.models.resnet import ResNet18
-fake_image = Tensor.rand((2, 3, 224//8, 224//8))
+fake_image = Tensor.rand((2, 3, 224//16, 224//16))
labels = Tensor.randint(2, low=0, high=1000)
m = ResNet18()
self._test_model_train_step(m, fake_image, labels)
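
The two resnet tests above exercise tinygrad's data-parallel sharding; the change shrinks the fake input (224//8 to 224//16 per side) and skips AMD in CI to cut runtime. The underlying pattern, as a short sketch (assumes at least two visible devices; devices_2 in the real test is built the same way):

from tinygrad import Tensor, Device

devices_2 = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))
fake_image = Tensor.rand((2, 3, 224//16, 224//16))        # the new, smaller CI input
fake_image_sharded = fake_image.shard(devices_2, axis=0)  # batch split across devices
print(fake_image_sharded.shape)  # (2, 3, 14, 14), one sample per device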

View File

@@ -1,87 +0,0 @@
-#!/usr/bin/env python
-import time
-import unittest
-import torch
-from tinygrad import Tensor, Device
-from tinygrad.helpers import Profiling, CI
-@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
-class TestConvSpeed(unittest.TestCase):
-  def test_mnist(self):
-    # https://keras.io/examples/vision/mnist_convnet/
-    conv = 3
-    inter_chan, out_chan = 32, 64
-    # ****** torch baseline *******
-    torch.backends.mkldnn.enabled = False
-    conv = 3
-    inter_chan, out_chan = 32, 64
-    c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
-    c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
-    l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)
-    c2d = torch.nn.functional.conv2d
-    mp = torch.nn.MaxPool2d((2,2))
-    lsm = torch.nn.LogSoftmax(dim=1)
-    cnt = 5
-    fpt, bpt = 0.0, 0.0
-    for i in range(cnt):
-      et0 = time.time()
-      x = torch.randn(128, 1, 28, 28, requires_grad=True)
-      x = mp(c2d(x,c1).relu())
-      x = mp(c2d(x,c2).relu())
-      x = x.reshape(x.shape[0], -1)
-      out = lsm(x.matmul(l1))
-      out = out.mean()
-      et1 = time.time()
-      out.backward()
-      et2 = time.time()
-      fpt += (et1-et0)
-      bpt += (et2-et1)
-    fpt_baseline = (fpt*1000/cnt)
-    bpt_baseline = (bpt*1000/cnt)
-    print("torch forward pass: %.3f ms" % fpt_baseline)
-    print("torch backward pass: %.3f ms" % bpt_baseline)
-    # ****** tinygrad compare *******
-    c1 = Tensor(c1.detach().numpy(), requires_grad=True)
-    c2 = Tensor(c2.detach().numpy(), requires_grad=True)
-    l1 = Tensor(l1.detach().numpy(), requires_grad=True)
-    cnt = 5
-    fpt, bpt = 0.0, 0.0
-    for i in range(1+cnt):
-      et0 = time.time()
-      x = Tensor.randn(128, 1, 28, 28)
-      x = x.conv2d(c1).relu().avg_pool2d()
-      x = x.conv2d(c2).relu().max_pool2d()
-      x = x.reshape(shape=(x.shape[0], -1))
-      out = x.dot(l1).log_softmax()
-      out = out.mean()
-      out.backward()  # NOTE: we have to now compute this here, but it doesn't realize
-      out.realize()
-      et1 = time.time()
-      [x.grad.realize() for x in [c1, c2, l1]]
-      et2 = time.time()
-      if i == 0:
-        pr = Profiling(sort='time', frac=0.2)
-        pr.__enter__()
-      else:
-        fpt += (et1-et0)
-        bpt += (et2-et1)
-    pr.__exit__()
-    fpt = (fpt*1000/cnt)
-    bpt = (bpt*1000/cnt)
-    print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
-    print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))
-if __name__ == '__main__':
-  unittest.main()

test/test_nn.py Executable file → Normal file
View File

View File

@@ -2,7 +2,6 @@ from typing_extensions import Callable
import hashlib, random, unittest
from tinygrad import Tensor, Device, getenv, dtypes
from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import CI
@unittest.skipUnless(is_dtype_supported(dtypes.uint8) and is_dtype_supported(dtypes.uint64), "Device must support uint8 and uint64")
@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "NV", "crashes in NV CI")
@@ -12,7 +11,7 @@ class TestHashing(unittest.TestCase):
chunk_hashes = [hashlib.shake_128(chunk).digest(16) for chunk in chunks]
return hashlib.shake_128(b''.join(chunk_hashes)).digest(16)
-@unittest.skipIf(CI, "very slow")
+@unittest.skip("very slow")
def test_abc(self):
expected = self._python_hash_1mb(b"abc" + b"\x00" * (2**20 - 3))
out = Tensor(b"abc").hash()
@@ -65,7 +64,7 @@ class TestKeccak(unittest.TestCase):
data = b"\x00" * 4
self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16))
-data = b"\x00" * (1000 if CI else 4096)
+data = b"\x00" * 1000
self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16))
if __name__ == "__main__":

View File

@@ -1,8 +0,0 @@
-#!/bin/bash
-python3 test/external/process_replay/reset.py
-CAPTURE_PROCESS_REPLAY=1 pytest -n auto test/test_tiny.py test/test_uop_graph.py test/test_ops.py test/test_linearizer.py
-while true; do
-  if python3 test/test_tiny.py; then
-    PYTHONPATH="." python3 test/external/process_replay/process_replay.py
-  fi
-done