diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 0a1ea908b4..9eb1248a9c 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -63,7 +63,7 @@ jobs: - name: Run model inference benchmark run: METAL=1 python3.11 test/external/external_model_benchmark.py - name: Test speed vs torch - run: BIG=2 MPS=1 python3.11 test/test_speed_v_torch.py | tee torch_speed.txt + run: BIG=2 MPS=1 python3.11 test/external/external_test_speed_v_torch.py | tee torch_speed.txt - name: Test tensor cores run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops - name: Test AMX tensor cores @@ -187,7 +187,7 @@ jobs: - name: Run model inference benchmark run: NV=1 CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py - name: Test speed vs torch - run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt + run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt - name: Test speed vs theoretical run: NV=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20 - name: Test benchmark allreduce @@ -389,7 +389,7 @@ jobs: #- name: Test speed vs torch # run: | # python3 -c "import torch; print(torch.__version__)" - # LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt + # LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt - name: Test speed vs theoretical run: AMD=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20 - name: Test tensor cores diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a3997e4f68..73a951af3e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,9 +30,9 @@ jobs: - name: External Benchmark Schedule run: PYTHONPATH="." python3 test/external/external_benchmark_schedule.py - name: Speed Test - run: LLVM=1 python3 test/test_speed_v_torch.py + run: LLVM=1 python3 test/external/external_test_speed_v_torch.py - name: Speed Test (BEAM=2) - run: BEAM=2 LLVM=1 python3 test/test_speed_v_torch.py + run: BEAM=2 LLVM=1 python3 test/external/external_test_speed_v_torch.py docs: name: Docs @@ -458,7 +458,7 @@ jobs: testopenpilot: name: 'openpilot Compile Tests' runs-on: ubuntu-22.04 - timeout-minutes: 10 + timeout-minutes: 15 env: IGNORE_OOB: 0 steps: @@ -589,6 +589,29 @@ jobs: - name: Run process replay tests uses: ./.github/actions/process-replay + testdevectorize: + name: Linux (devectorize) + runs-on: ubuntu-24.04 + timeout-minutes: 15 + env: + IGNORE_OOB: 0 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Setup Environment + uses: ./.github/actions/setup-tinygrad + with: + key: devectorize-minimal + deps: testing_minimal + pydeps: "pillow" + llvm: "true" + - name: Test LLVM=1 DEVECTORIZE=0 + run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure" + - name: Test LLVM=1 DEVECTORIZE=0 for model + run: PYTHONPATH="." LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py + - name: Test CPU=1 DEVECTORIZE=0 + run: CPU=1 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure" + testdsp: name: Linux (DSP) runs-on: ubuntu-24.04 @@ -624,12 +647,6 @@ jobs: run: CC=clang-20 PYTHONPATH="." DEBUG=2 DSP=1 python test/test_transcendental.py TestTranscendentalVectorized - name: Test quantize onnx run: PYTHONPATH="." DEBUG=2 DSP=1 python3 test/test_quantize_onnx.py - - name: Test LLVM=1 DEVECTORIZE=0 - run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure" - - name: Test LLVM=1 DEVECTORIZE=0 for model - run: PYTHONPATH="." LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py - - name: Test CPU=1 DEVECTORIZE=0 - run: CPU=1 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure" testwebgpu: name: Linux (WebGPU) @@ -689,9 +706,9 @@ jobs: DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add - name: Run LLVM test if: matrix.backend=='amdllvm' - run: python test/test_amd_llvm.py + run: python test/device/test_amd_llvm.py - name: Run pytest (amd) - run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py --durations=20 + run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/device/test_hcq.py --durations=20 - name: Run pytest (amd) run: python -m pytest test/external/external_test_am.py --durations=20 - name: Run TRANSCENDENTAL math @@ -816,14 +833,14 @@ jobs: AMD: 1 FORWARD_ONLY: 1 run: | - python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20 + python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20 - name: Run pytest (amd with llvm backend) env: MOCKGPU: 1 AMD: 1 FORWARD_ONLY: 1 run: | - python -m pytest -n=auto test/test_hcq.py test/test_tiny.py test/test_amd_llvm.py --durations=20 + python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20 - name: Run pytest (ptx) env: MOCKGPU: 1 @@ -831,7 +848,7 @@ jobs: NV: 1 FORWARD_ONLY: 1 run: | - python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20 + python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20 - name: Run process replay tests uses: ./.github/actions/process-replay @@ -942,18 +959,18 @@ jobs: env: HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6 run: | - python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py + python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations 20 - name: Run REMOTE=1 Test (GPU) env: HOST: 127.0.0.1:7667*6 run: | - python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py + python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py --durations 20 IMAGE=2 python3 -m pytest test/test_tiny.py test/test_image_dtype.py - name: Run REMOTE=1 Test (CPU) env: HOST: 127.0.0.1:8667*6 run: | - python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py + python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py --durations 20 - name: Show remote server logs if: always() run: | diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000000..cccc62e404 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +norecursedirs = extra diff --git a/test/test_amd_llvm.py b/test/device/test_amd_llvm.py similarity index 100% rename from test/test_amd_llvm.py rename to test/device/test_amd_llvm.py diff --git a/test/test_hcq.py b/test/device/test_hcq.py similarity index 100% rename from test/test_hcq.py rename to test/device/test_hcq.py diff --git a/test/test_hcq_iface.py b/test/device/test_hcq_iface.py similarity index 100% rename from test/test_hcq_iface.py rename to test/device/test_hcq_iface.py diff --git a/test/test_metal.py b/test/device/test_metal.py similarity index 100% rename from test/test_metal.py rename to test/device/test_metal.py diff --git a/test/external/external_benchmark_kernel_launch.py b/test/external/external_benchmark_kernel_launch.py index 5744ba0923..7011f7ab28 100644 --- a/test/external/external_benchmark_kernel_launch.py +++ b/test/external/external_benchmark_kernel_launch.py @@ -2,7 +2,7 @@ import time from tinygrad import Tensor, TinyJit, Device, Context from tinygrad.helpers import Profiling, Timing, GlobalCounters -# python3 test/test_speed_v_torch.py TestSpeed.test_add_a +# python3 test/external/external_test_speed_v_torch.py TestSpeed.test_add_a @TinyJit def plus(a:Tensor, b:Tensor): return a+b diff --git a/test/test_speed_v_torch.py b/test/external/external_test_speed_v_torch.py similarity index 100% rename from test/test_speed_v_torch.py rename to test/external/external_test_speed_v_torch.py diff --git a/test/models/test_onnx.py b/test/models/test_onnx.py index c2b1f8677d..79ce2046c6 100644 --- a/test/models/test_onnx.py +++ b/test/models/test_onnx.py @@ -32,6 +32,7 @@ OPENPILOT_MODEL = "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/mod np.random.seed(1337) class TestOnnxModel(unittest.TestCase): + @unittest.skip("this isn't a test, it can't fail") def test_benchmark_openpilot_model(self): onnx_model = fetch(OPENPILOT_MODEL) run_onnx = OnnxRunner(onnx_model) diff --git a/test/models/test_whisper.py b/test/models/test_whisper.py index f1696fd490..056615b227 100644 --- a/test/models/test_whisper.py +++ b/test/models/test_whisper.py @@ -16,7 +16,7 @@ TRANSCRIPTION_2 = "a slightly longer audio file so that we can test batch transc TEST_FILE_3_URL = 'https://homepage.ntu.edu.tw/~karchung/miniconversations/mc45.mp3' TRANSCRIPTION_3 = "Just lie back and relax. Is the level of pressure about right? Yes, it's fine, and I'd like conditioner please. Sure. I'm going to start the second lathering now. Would you like some Q-tips? How'd you like it cut? I'd like my bangs and the back trimmed, and I'd like the rest thinned out a bit and layered. Where would you like the part? On the left, right about here. Here, have a look. What do you think? It's fine. Here's a thousand anti-dollars. It's 30-ant extra for the rants. Here's your change and receipt. Thank you, and please come again. So how do you like it? It could have been worse, but you'll notice that I didn't ask her for her card. Hmm, yeah. Maybe you can try that place over there next time." # noqa: E501 -@unittest.skipIf(CI and Device.DEFAULT in ["CPU"], "slow") +@unittest.skipIf(Device.DEFAULT in ["CPU", "LLVM"], "slow") @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16 support") class TestWhisper(unittest.TestCase): @classmethod diff --git a/test/test_copy_speed.py b/test/test_copy_speed.py index ed2080922d..391a4da0c6 100644 --- a/test/test_copy_speed.py +++ b/test/test_copy_speed.py @@ -3,7 +3,7 @@ from tinygrad import Tensor, Device, TinyJit from tinygrad.helpers import Timing, CI, OSX import multiprocessing.shared_memory as shared_memory -N = 256 if CI else 4096 +N = 256 class TestCopySpeed(unittest.TestCase): @classmethod def setUpClass(cls): Device[Device.DEFAULT].synchronize() diff --git a/test/test_multitensor.py b/test/test_multitensor.py index 14e1cb1266..9766753daa 100644 --- a/test/test_multitensor.py +++ b/test/test_multitensor.py @@ -373,11 +373,11 @@ class TestMultiTensor(unittest.TestCase): np.testing.assert_allclose(y.numpy(), y_shard.numpy(), atol=1e-6, rtol=1e-6) # NOTE: this is failing on LLVM CI, no idea why. Works locally. - @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU") + @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU", "AMD"), "slow, and flaky on LLVM/CPU") def test_data_parallel_resnet(self): from extra.models.resnet import ResNet18 - fake_image = Tensor.rand((2, 3, 224//8, 224//8)) + fake_image = Tensor.rand((2, 3, 224//16, 224//16)) fake_image_sharded = fake_image.shard(devices_2, axis=0) m = ResNet18() m.load_from_pretrained() @@ -409,10 +409,10 @@ class TestMultiTensor(unittest.TestCase): # sometimes there is zeros in these grads... why? np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5) - @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU") + @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU", "AMD"), "slow, and flaky on LLVM/CPU") def test_data_parallel_resnet_train_step(self): from extra.models.resnet import ResNet18 - fake_image = Tensor.rand((2, 3, 224//8, 224//8)) + fake_image = Tensor.rand((2, 3, 224//16, 224//16)) labels = Tensor.randint(2, low=0, high=1000) m = ResNet18() self._test_model_train_step(m, fake_image, labels) diff --git a/test/test_net_speed.py b/test/test_net_speed.py deleted file mode 100644 index 9344ebe720..0000000000 --- a/test/test_net_speed.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python -import time -import unittest -import torch -from tinygrad import Tensor, Device -from tinygrad.helpers import Profiling, CI - -@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow") -class TestConvSpeed(unittest.TestCase): - - def test_mnist(self): - # https://keras.io/examples/vision/mnist_convnet/ - conv = 3 - inter_chan, out_chan = 32, 64 - - # ****** torch baseline ******* - - torch.backends.mkldnn.enabled = False - - conv = 3 - inter_chan, out_chan = 32, 64 - c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True) - c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True) - l1 = torch.randn(out_chan*5*5, 10, requires_grad=True) - - c2d = torch.nn.functional.conv2d - mp = torch.nn.MaxPool2d((2,2)) - lsm = torch.nn.LogSoftmax(dim=1) - - cnt = 5 - fpt, bpt = 0.0, 0.0 - for i in range(cnt): - et0 = time.time() - x = torch.randn(128, 1, 28, 28, requires_grad=True) - x = mp(c2d(x,c1).relu()) - x = mp(c2d(x,c2).relu()) - x = x.reshape(x.shape[0], -1) - out = lsm(x.matmul(l1)) - out = out.mean() - et1 = time.time() - out.backward() - et2 = time.time() - fpt += (et1-et0) - bpt += (et2-et1) - - fpt_baseline = (fpt*1000/cnt) - bpt_baseline = (bpt*1000/cnt) - print("torch forward pass: %.3f ms" % fpt_baseline) - print("torch backward pass: %.3f ms" % bpt_baseline) - - # ****** tinygrad compare ******* - - c1 = Tensor(c1.detach().numpy(), requires_grad=True) - c2 = Tensor(c2.detach().numpy(), requires_grad=True) - l1 = Tensor(l1.detach().numpy(), requires_grad=True) - - cnt = 5 - fpt, bpt = 0.0, 0.0 - for i in range(1+cnt): - et0 = time.time() - x = Tensor.randn(128, 1, 28, 28) - x = x.conv2d(c1).relu().avg_pool2d() - x = x.conv2d(c2).relu().max_pool2d() - x = x.reshape(shape=(x.shape[0], -1)) - out = x.dot(l1).log_softmax() - out = out.mean() - out.backward() # NOTE: we have to now compute this here, but it doesn't realize - out.realize() - et1 = time.time() - [x.grad.realize() for x in [c1, c2, l1]] - et2 = time.time() - if i == 0: - pr = Profiling(sort='time', frac=0.2) - pr.__enter__() - else: - fpt += (et1-et0) - bpt += (et2-et1) - - pr.__exit__() - fpt = (fpt*1000/cnt) - bpt = (bpt*1000/cnt) - print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline)) - print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline)) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/test_nn.py b/test/test_nn.py old mode 100755 new mode 100644 diff --git a/test/unit/test_hashing.py b/test/unit/test_hashing.py index 803883b78f..1ab969b2fd 100644 --- a/test/unit/test_hashing.py +++ b/test/unit/test_hashing.py @@ -2,7 +2,6 @@ from typing_extensions import Callable import hashlib, random, unittest from tinygrad import Tensor, Device, getenv, dtypes from tinygrad.device import is_dtype_supported -from tinygrad.helpers import CI @unittest.skipUnless(is_dtype_supported(dtypes.uint8) and is_dtype_supported(dtypes.uint64), "Device must support uint8 and uint64") @unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "NV", "crashes in NV CI") @@ -12,7 +11,7 @@ class TestHashing(unittest.TestCase): chunk_hashes = [hashlib.shake_128(chunk).digest(16) for chunk in chunks] return hashlib.shake_128(b''.join(chunk_hashes)).digest(16) - @unittest.skipIf(CI, "very slow") + @unittest.skip("very slow") def test_abc(self): expected = self._python_hash_1mb(b"abc" + b"\x00" * (2**20 - 3)) out = Tensor(b"abc").hash() @@ -65,7 +64,7 @@ class TestKeccak(unittest.TestCase): data = b"\x00" * 4 self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16)) - data = b"\x00" * (1000 if CI else 4096) + data = b"\x00" * 1000 self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16)) if __name__ == "__main__": diff --git a/test/test_linalg.py b/test/unit/test_linalg.py similarity index 100% rename from test/test_linalg.py rename to test/unit/test_linalg.py diff --git a/test_driven_development.sh b/test_driven_development.sh deleted file mode 100755 index e868497e91..0000000000 --- a/test_driven_development.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -python3 test/external/process_replay/reset.py -CAPTURE_PROCESS_REPLAY=1 pytest -n auto test/test_tiny.py test/test_uop_graph.py test/test_ops.py test/test_linearizer.py -while true; do - if python3 test/test_tiny.py; then - PYTHONPATH="." python3 test/external/process_replay/process_replay.py - fi -done