move device tests to test/device + test cleanups (#11735)

* move device tests to test/device

* test speedups

* test device

* linalg to unit

* upd

* so pytest just works

* more divide and skip

* speed

* test devectorize

* add pillow
George Hotz
2025-08-19 16:02:20 -07:00
committed by GitHub
parent bcc7623025
commit 1d307f568c
18 changed files with 49 additions and 125 deletions

View File

@@ -63,7 +63,7 @@ jobs:
- name: Run model inference benchmark
run: METAL=1 python3.11 test/external/external_model_benchmark.py
- name: Test speed vs torch
-run: BIG=2 MPS=1 python3.11 test/test_speed_v_torch.py | tee torch_speed.txt
+run: BIG=2 MPS=1 python3.11 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
- name: Test tensor cores
run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
- name: Test AMX tensor cores
@@ -187,7 +187,7 @@ jobs:
- name: Run model inference benchmark
run: NV=1 CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py
- name: Test speed vs torch
-run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
+run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
- name: Test speed vs theoretical
run: NV=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
- name: Test benchmark allreduce
@@ -389,7 +389,7 @@ jobs:
#- name: Test speed vs torch
# run: |
# python3 -c "import torch; print(torch.__version__)"
-# LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
+# LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
- name: Test speed vs theoretical
run: AMD=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
- name: Test tensor cores

View File

@@ -30,9 +30,9 @@ jobs:
- name: External Benchmark Schedule
run: PYTHONPATH="." python3 test/external/external_benchmark_schedule.py
- name: Speed Test
-run: LLVM=1 python3 test/test_speed_v_torch.py
+run: LLVM=1 python3 test/external/external_test_speed_v_torch.py
- name: Speed Test (BEAM=2)
-run: BEAM=2 LLVM=1 python3 test/test_speed_v_torch.py
+run: BEAM=2 LLVM=1 python3 test/external/external_test_speed_v_torch.py
docs:
name: Docs
@@ -458,7 +458,7 @@ jobs:
testopenpilot:
name: 'openpilot Compile Tests'
runs-on: ubuntu-22.04
-timeout-minutes: 10
+timeout-minutes: 15
env:
IGNORE_OOB: 0
steps:
@@ -589,6 +589,29 @@ jobs:
- name: Run process replay tests
uses: ./.github/actions/process-replay
+testdevectorize:
+name: Linux (devectorize)
+runs-on: ubuntu-24.04
+timeout-minutes: 15
+env:
+IGNORE_OOB: 0
+steps:
+- name: Checkout Code
+uses: actions/checkout@v4
+- name: Setup Environment
+uses: ./.github/actions/setup-tinygrad
+with:
+key: devectorize-minimal
+deps: testing_minimal
+pydeps: "pillow"
+llvm: "true"
+- name: Test LLVM=1 DEVECTORIZE=0
+run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
+- name: Test LLVM=1 DEVECTORIZE=0 for model
+run: PYTHONPATH="." LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
+- name: Test CPU=1 DEVECTORIZE=0
+run: CPU=1 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
testdsp:
name: Linux (DSP)
runs-on: ubuntu-24.04
@@ -624,12 +647,6 @@ jobs:
run: CC=clang-20 PYTHONPATH="." DEBUG=2 DSP=1 python test/test_transcendental.py TestTranscendentalVectorized
- name: Test quantize onnx
run: PYTHONPATH="." DEBUG=2 DSP=1 python3 test/test_quantize_onnx.py
-- name: Test LLVM=1 DEVECTORIZE=0
-run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
-- name: Test LLVM=1 DEVECTORIZE=0 for model
-run: PYTHONPATH="." LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
-- name: Test CPU=1 DEVECTORIZE=0
-run: CPU=1 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
testwebgpu:
name: Linux (WebGPU)
@@ -689,9 +706,9 @@ jobs:
DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
- name: Run LLVM test
if: matrix.backend=='amdllvm'
-run: python test/test_amd_llvm.py
+run: python test/device/test_amd_llvm.py
- name: Run pytest (amd)
-run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py --durations=20
+run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/device/test_hcq.py --durations=20
- name: Run pytest (amd)
run: python -m pytest test/external/external_test_am.py --durations=20
- name: Run TRANSCENDENTAL math
@@ -816,14 +833,14 @@ jobs:
AMD: 1
FORWARD_ONLY: 1
run: |
-python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
+python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
- name: Run pytest (amd with llvm backend)
env:
MOCKGPU: 1
AMD: 1
FORWARD_ONLY: 1
run: |
-python -m pytest -n=auto test/test_hcq.py test/test_tiny.py test/test_amd_llvm.py --durations=20
+python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
- name: Run pytest (ptx)
env:
MOCKGPU: 1
@@ -831,7 +848,7 @@ jobs:
NV: 1
FORWARD_ONLY: 1
run: |
-python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
+python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
- name: Run process replay tests
uses: ./.github/actions/process-replay
@@ -942,18 +959,18 @@ jobs:
env:
HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
run: |
-python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py
+python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations 20
- name: Run REMOTE=1 Test (GPU)
env:
HOST: 127.0.0.1:7667*6
run: |
-python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py
+python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py --durations 20
IMAGE=2 python3 -m pytest test/test_tiny.py test/test_image_dtype.py
- name: Run REMOTE=1 Test (CPU)
env:
HOST: 127.0.0.1:8667*6
run: |
-python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py
+python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py --durations 20
- name: Show remote server logs
if: always()
run: |
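
For context on the new testdevectorize job above: it re-runs the core op tests with the devectorize codegen pass disabled (DEVECTORIZE=0) on both the LLVM and CPU backends. A minimal local sketch of the same check (an illustration, not part of this commit; it assumes tinygrad reads DEVECTORIZE and the backend selector from the environment at import time):

import os
# env flags must be set before tinygrad is imported
os.environ["DEVECTORIZE"] = "0"  # disable the devectorize pass, as in the CI job
os.environ["CPU"] = "1"          # pick the CPU backend, mirroring the last CI step

from tinygrad import Tensor

a, b = Tensor([1.0, 2.0, 3.0]), Tensor([4.0, 5.0, 6.0])
print((a + b).tolist())  # expect [5.0, 7.0, 9.0] regardless of the codegen path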

pytest.ini Normal file
View File

@@ -0,0 +1,2 @@
+[pytest]
+norecursedirs = extra
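
This new pytest.ini is the "so pytest just works" part of the commit: norecursedirs keeps collection from recursing into extra/, which holds non-test helper code. A quick collection check, as a sketch (assumes a tinygrad checkout with pytest installed; the CollectCheck plugin is illustrative, not from the repo):

import pytest

class CollectCheck:
    def __init__(self): self.nodeids = []
    # standard pytest hook, called once per collected test item
    def pytest_itemcollected(self, item): self.nodeids.append(item.nodeid)

check = CollectCheck()
# collect from the repo root; with norecursedirs = extra, nothing under extra/ shows up
pytest.main(["--collect-only", "-q"], plugins=[check])
assert not any(n.startswith("extra/") for n in check.nodeids)
print(f"collected {len(check.nodeids)} tests, none from extra/")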

View File

@@ -2,7 +2,7 @@ import time
from tinygrad import Tensor, TinyJit, Device, Context
from tinygrad.helpers import Profiling, Timing, GlobalCounters
-# python3 test/test_speed_v_torch.py TestSpeed.test_add_a
+# python3 test/external/external_test_speed_v_torch.py TestSpeed.test_add_a
@TinyJit
def plus(a:Tensor, b:Tensor): return a+b

View File

@@ -32,6 +32,7 @@ OPENPILOT_MODEL = "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/mod
np.random.seed(1337)
class TestOnnxModel(unittest.TestCase):
+@unittest.skip("this isn't a test, it can't fail")
def test_benchmark_openpilot_model(self):
onnx_model = fetch(OPENPILOT_MODEL)
run_onnx = OnnxRunner(onnx_model)

View File

@@ -16,7 +16,7 @@ TRANSCRIPTION_2 = "a slightly longer audio file so that we can test batch transc
TEST_FILE_3_URL = 'https://homepage.ntu.edu.tw/~karchung/miniconversations/mc45.mp3'
TRANSCRIPTION_3 = "Just lie back and relax. Is the level of pressure about right? Yes, it's fine, and I'd like conditioner please. Sure. I'm going to start the second lathering now. Would you like some Q-tips? How'd you like it cut? I'd like my bangs and the back trimmed, and I'd like the rest thinned out a bit and layered. Where would you like the part? On the left, right about here. Here, have a look. What do you think? It's fine. Here's a thousand anti-dollars. It's 30-ant extra for the rants. Here's your change and receipt. Thank you, and please come again. So how do you like it? It could have been worse, but you'll notice that I didn't ask her for her card. Hmm, yeah. Maybe you can try that place over there next time." # noqa: E501
-@unittest.skipIf(CI and Device.DEFAULT in ["CPU"], "slow")
+@unittest.skipIf(Device.DEFAULT in ["CPU", "LLVM"], "slow")
@unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16 support")
class TestWhisper(unittest.TestCase):
@classmethod

View File

@@ -3,7 +3,7 @@ from tinygrad import Tensor, Device, TinyJit
from tinygrad.helpers import Timing, CI, OSX
import multiprocessing.shared_memory as shared_memory
-N = 256 if CI else 4096
+N = 256
class TestCopySpeed(unittest.TestCase):
@classmethod
def setUpClass(cls): Device[Device.DEFAULT].synchronize()

View File

@@ -373,11 +373,11 @@ class TestMultiTensor(unittest.TestCase):
np.testing.assert_allclose(y.numpy(), y_shard.numpy(), atol=1e-6, rtol=1e-6)
# NOTE: this is failing on LLVM CI, no idea why. Works locally.
-@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
+@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU", "AMD"), "slow, and flaky on LLVM/CPU")
def test_data_parallel_resnet(self):
from extra.models.resnet import ResNet18
-fake_image = Tensor.rand((2, 3, 224//8, 224//8))
+fake_image = Tensor.rand((2, 3, 224//16, 224//16))
fake_image_sharded = fake_image.shard(devices_2, axis=0)
m = ResNet18()
m.load_from_pretrained()
@@ -409,10 +409,10 @@ class TestMultiTensor(unittest.TestCase):
# sometimes there is zeros in these grads... why?
np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)
-@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
+@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU", "AMD"), "slow, and flaky on LLVM/CPU")
def test_data_parallel_resnet_train_step(self):
from extra.models.resnet import ResNet18
-fake_image = Tensor.rand((2, 3, 224//8, 224//8))
+fake_image = Tensor.rand((2, 3, 224//16, 224//16))
labels = Tensor.randint(2, low=0, high=1000)
m = ResNet18()
self._test_model_train_step(m, fake_image, labels)
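
The two resnet tests above exercise tinygrad's data-parallel sharding; the change shrinks the fake input (224//8 to 224//16 per side) and skips AMD in CI to cut runtime. The underlying pattern, as a short sketch (assumes at least two visible devices; devices_2 in the real test is built the same way):

from tinygrad import Tensor, Device

devices_2 = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))
fake_image = Tensor.rand((2, 3, 224//16, 224//16))        # the new, smaller CI input
fake_image_sharded = fake_image.shard(devices_2, axis=0)  # batch split across devices
print(fake_image_sharded.shape)  # (2, 3, 14, 14), one sample per device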

View File

@@ -1,87 +0,0 @@
-#!/usr/bin/env python
-import time
-import unittest
-import torch
-from tinygrad import Tensor, Device
-from tinygrad.helpers import Profiling, CI
-@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
-class TestConvSpeed(unittest.TestCase):
-  def test_mnist(self):
-    # https://keras.io/examples/vision/mnist_convnet/
-    conv = 3
-    inter_chan, out_chan = 32, 64
-    # ****** torch baseline *******
-    torch.backends.mkldnn.enabled = False
-    conv = 3
-    inter_chan, out_chan = 32, 64
-    c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
-    c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
-    l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)
-    c2d = torch.nn.functional.conv2d
-    mp = torch.nn.MaxPool2d((2,2))
-    lsm = torch.nn.LogSoftmax(dim=1)
-    cnt = 5
-    fpt, bpt = 0.0, 0.0
-    for i in range(cnt):
-      et0 = time.time()
-      x = torch.randn(128, 1, 28, 28, requires_grad=True)
-      x = mp(c2d(x,c1).relu())
-      x = mp(c2d(x,c2).relu())
-      x = x.reshape(x.shape[0], -1)
-      out = lsm(x.matmul(l1))
-      out = out.mean()
-      et1 = time.time()
-      out.backward()
-      et2 = time.time()
-      fpt += (et1-et0)
-      bpt += (et2-et1)
-    fpt_baseline = (fpt*1000/cnt)
-    bpt_baseline = (bpt*1000/cnt)
-    print("torch forward pass: %.3f ms" % fpt_baseline)
-    print("torch backward pass: %.3f ms" % bpt_baseline)
-    # ****** tinygrad compare *******
-    c1 = Tensor(c1.detach().numpy(), requires_grad=True)
-    c2 = Tensor(c2.detach().numpy(), requires_grad=True)
-    l1 = Tensor(l1.detach().numpy(), requires_grad=True)
-    cnt = 5
-    fpt, bpt = 0.0, 0.0
-    for i in range(1+cnt):
-      et0 = time.time()
-      x = Tensor.randn(128, 1, 28, 28)
-      x = x.conv2d(c1).relu().avg_pool2d()
-      x = x.conv2d(c2).relu().max_pool2d()
-      x = x.reshape(shape=(x.shape[0], -1))
-      out = x.dot(l1).log_softmax()
-      out = out.mean()
-      out.backward()  # NOTE: we have to now compute this here, but it doesn't realize
-      out.realize()
-      et1 = time.time()
-      [x.grad.realize() for x in [c1, c2, l1]]
-      et2 = time.time()
-      if i == 0:
-        pr = Profiling(sort='time', frac=0.2)
-        pr.__enter__()
-      else:
-        fpt += (et1-et0)
-        bpt += (et2-et1)
-    pr.__exit__()
-    fpt = (fpt*1000/cnt)
-    bpt = (bpt*1000/cnt)
-    print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
-    print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))
-if __name__ == '__main__':
-  unittest.main()

test/test_nn.py Executable file → Normal file
View File

View File

@@ -2,7 +2,6 @@ from typing_extensions import Callable
import hashlib, random, unittest
from tinygrad import Tensor, Device, getenv, dtypes
from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import CI
@unittest.skipUnless(is_dtype_supported(dtypes.uint8) and is_dtype_supported(dtypes.uint64), "Device must support uint8 and uint64")
@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "NV", "crashes in NV CI")
@@ -12,7 +11,7 @@ class TestHashing(unittest.TestCase):
chunk_hashes = [hashlib.shake_128(chunk).digest(16) for chunk in chunks]
return hashlib.shake_128(b''.join(chunk_hashes)).digest(16)
-@unittest.skipIf(CI, "very slow")
+@unittest.skip("very slow")
def test_abc(self):
expected = self._python_hash_1mb(b"abc" + b"\x00" * (2**20 - 3))
out = Tensor(b"abc").hash()
@@ -65,7 +64,7 @@ class TestKeccak(unittest.TestCase):
data = b"\x00" * 4
self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16))
-data = b"\x00" * (1000 if CI else 4096)
+data = b"\x00" * 1000
self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16))
if __name__ == "__main__":

View File

@@ -1,8 +0,0 @@
-#!/bin/bash
-python3 test/external/process_replay/reset.py
-CAPTURE_PROCESS_REPLAY=1 pytest -n auto test/test_tiny.py test/test_uop_graph.py test/test_ops.py test/test_linearizer.py
-while true; do
-  if python3 test/test_tiny.py; then
-    PYTHONPATH="." python3 test/external/process_replay/process_replay.py
-  fi
-done