mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-07 22:23:55 -05:00
move device tests to test/device + test cleanups (#11735)
* move device tests to test/device
* test speedups
* test device
* linalg to unit
* upd
* so pytest just works
* more divide and skip
* speed
* test devectorize
* add pillow
This commit is contained in:
6
.github/workflows/benchmark.yml
vendored
6
.github/workflows/benchmark.yml
vendored
@@ -63,7 +63,7 @@ jobs:
|
||||
- name: Run model inference benchmark
|
||||
run: METAL=1 python3.11 test/external/external_model_benchmark.py
|
||||
- name: Test speed vs torch
|
||||
run: BIG=2 MPS=1 python3.11 test/test_speed_v_torch.py | tee torch_speed.txt
|
||||
run: BIG=2 MPS=1 python3.11 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
|
||||
- name: Test tensor cores
|
||||
run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
|
||||
- name: Test AMX tensor cores
|
||||
@@ -187,7 +187,7 @@ jobs:
|
||||
- name: Run model inference benchmark
|
||||
run: NV=1 CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py
|
||||
- name: Test speed vs torch
|
||||
run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
|
||||
run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
|
||||
- name: Test speed vs theoretical
|
||||
run: NV=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
|
||||
- name: Test benchmark allreduce
|
||||
@@ -389,7 +389,7 @@ jobs:
|
||||
#- name: Test speed vs torch
|
||||
# run: |
|
||||
# python3 -c "import torch; print(torch.__version__)"
|
||||
# LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
|
||||
# LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
|
||||
- name: Test speed vs theoretical
|
||||
run: AMD=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
|
||||
- name: Test tensor cores
|
||||
|
||||
51
.github/workflows/test.yml
vendored
51
.github/workflows/test.yml
vendored
@@ -30,9 +30,9 @@ jobs:
|
||||
- name: External Benchmark Schedule
|
||||
run: PYTHONPATH="." python3 test/external/external_benchmark_schedule.py
|
||||
- name: Speed Test
|
||||
run: LLVM=1 python3 test/test_speed_v_torch.py
|
||||
run: LLVM=1 python3 test/external/external_test_speed_v_torch.py
|
||||
- name: Speed Test (BEAM=2)
|
||||
run: BEAM=2 LLVM=1 python3 test/test_speed_v_torch.py
|
||||
run: BEAM=2 LLVM=1 python3 test/external/external_test_speed_v_torch.py
|
||||
|
||||
docs:
|
||||
name: Docs
|
||||
@@ -458,7 +458,7 @@ jobs:
|
||||
testopenpilot:
|
||||
name: 'openpilot Compile Tests'
|
||||
runs-on: ubuntu-22.04
|
||||
timeout-minutes: 10
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
IGNORE_OOB: 0
|
||||
steps:
|
||||
@@ -589,6 +589,29 @@ jobs:
|
||||
- name: Run process replay tests
|
||||
uses: ./.github/actions/process-replay
|
||||
|
||||
testdevectorize:
|
||||
name: Linux (devectorize)
|
||||
runs-on: ubuntu-24.04
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
IGNORE_OOB: 0
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v4
|
||||
- name: Setup Environment
|
||||
uses: ./.github/actions/setup-tinygrad
|
||||
with:
|
||||
key: devectorize-minimal
|
||||
deps: testing_minimal
|
||||
pydeps: "pillow"
|
||||
llvm: "true"
|
||||
- name: Test LLVM=1 DEVECTORIZE=0
|
||||
run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
|
||||
- name: Test LLVM=1 DEVECTORIZE=0 for model
|
||||
run: PYTHONPATH="." LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
|
||||
- name: Test CPU=1 DEVECTORIZE=0
|
||||
run: CPU=1 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
|
||||
|
||||
testdsp:
|
||||
name: Linux (DSP)
|
||||
runs-on: ubuntu-24.04
|
||||
@@ -624,12 +647,6 @@ jobs:
|
||||
run: CC=clang-20 PYTHONPATH="." DEBUG=2 DSP=1 python test/test_transcendental.py TestTranscendentalVectorized
|
||||
- name: Test quantize onnx
|
||||
run: PYTHONPATH="." DEBUG=2 DSP=1 python3 test/test_quantize_onnx.py
|
||||
- name: Test LLVM=1 DEVECTORIZE=0
|
||||
run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
|
||||
- name: Test LLVM=1 DEVECTORIZE=0 for model
|
||||
run: PYTHONPATH="." LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
|
||||
- name: Test CPU=1 DEVECTORIZE=0
|
||||
run: CPU=1 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
|
||||
|
||||
testwebgpu:
|
||||
name: Linux (WebGPU)
|
||||
@@ -689,9 +706,9 @@ jobs:
|
||||
DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
|
||||
- name: Run LLVM test
|
||||
if: matrix.backend=='amdllvm'
|
||||
run: python test/test_amd_llvm.py
|
||||
run: python test/device/test_amd_llvm.py
|
||||
- name: Run pytest (amd)
|
||||
run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py --durations=20
|
||||
run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/device/test_hcq.py --durations=20
|
||||
- name: Run pytest (amd)
|
||||
run: python -m pytest test/external/external_test_am.py --durations=20
|
||||
- name: Run TRANSCENDENTAL math
|
||||
@@ -816,14 +833,14 @@ jobs:
|
||||
AMD: 1
|
||||
FORWARD_ONLY: 1
|
||||
run: |
|
||||
python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
|
||||
python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
|
||||
- name: Run pytest (amd with llvm backend)
|
||||
env:
|
||||
MOCKGPU: 1
|
||||
AMD: 1
|
||||
FORWARD_ONLY: 1
|
||||
run: |
|
||||
python -m pytest -n=auto test/test_hcq.py test/test_tiny.py test/test_amd_llvm.py --durations=20
|
||||
python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
|
||||
- name: Run pytest (ptx)
|
||||
env:
|
||||
MOCKGPU: 1
|
||||
@@ -831,7 +848,7 @@ jobs:
|
||||
NV: 1
|
||||
FORWARD_ONLY: 1
|
||||
run: |
|
||||
python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
|
||||
python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
|
||||
- name: Run process replay tests
|
||||
uses: ./.github/actions/process-replay
|
||||
|
||||
@@ -942,18 +959,18 @@ jobs:
|
||||
env:
|
||||
HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
|
||||
run: |
|
||||
python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py
|
||||
python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations 20
|
||||
- name: Run REMOTE=1 Test (GPU)
|
||||
env:
|
||||
HOST: 127.0.0.1:7667*6
|
||||
run: |
|
||||
python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py
|
||||
python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py --durations 20
|
||||
IMAGE=2 python3 -m pytest test/test_tiny.py test/test_image_dtype.py
|
||||
- name: Run REMOTE=1 Test (CPU)
|
||||
env:
|
||||
HOST: 127.0.0.1:8667*6
|
||||
run: |
|
||||
python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py
|
||||
python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py --durations 20
|
||||
- name: Show remote server logs
|
||||
if: always()
|
||||
run: |
|
||||
|
||||
2
pytest.ini
Normal file
2
pytest.ini
Normal file
@@ -0,0 +1,2 @@
|
||||
[pytest]
|
||||
norecursedirs = extra
|
||||
@@ -2,7 +2,7 @@ import time
|
||||
from tinygrad import Tensor, TinyJit, Device, Context
|
||||
from tinygrad.helpers import Profiling, Timing, GlobalCounters
|
||||
|
||||
# python3 test/test_speed_v_torch.py TestSpeed.test_add_a
|
||||
# python3 test/external/external_test_speed_v_torch.py TestSpeed.test_add_a
|
||||
|
||||
@TinyJit
|
||||
def plus(a:Tensor, b:Tensor): return a+b
|
||||
|
||||
@@ -32,6 +32,7 @@ OPENPILOT_MODEL = "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/mod
|
||||
np.random.seed(1337)
|
||||
|
||||
class TestOnnxModel(unittest.TestCase):
|
||||
@unittest.skip("this isn't a test, it can't fail")
|
||||
def test_benchmark_openpilot_model(self):
|
||||
onnx_model = fetch(OPENPILOT_MODEL)
|
||||
run_onnx = OnnxRunner(onnx_model)
|
||||
|
||||
@@ -16,7 +16,7 @@ TRANSCRIPTION_2 = "a slightly longer audio file so that we can test batch transc
|
||||
TEST_FILE_3_URL = 'https://homepage.ntu.edu.tw/~karchung/miniconversations/mc45.mp3'
|
||||
TRANSCRIPTION_3 = "Just lie back and relax. Is the level of pressure about right? Yes, it's fine, and I'd like conditioner please. Sure. I'm going to start the second lathering now. Would you like some Q-tips? How'd you like it cut? I'd like my bangs and the back trimmed, and I'd like the rest thinned out a bit and layered. Where would you like the part? On the left, right about here. Here, have a look. What do you think? It's fine. Here's a thousand anti-dollars. It's 30-ant extra for the rants. Here's your change and receipt. Thank you, and please come again. So how do you like it? It could have been worse, but you'll notice that I didn't ask her for her card. Hmm, yeah. Maybe you can try that place over there next time." # noqa: E501
|
||||
|
||||
@unittest.skipIf(CI and Device.DEFAULT in ["CPU"], "slow")
|
||||
@unittest.skipIf(Device.DEFAULT in ["CPU", "LLVM"], "slow")
|
||||
@unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16 support")
|
||||
class TestWhisper(unittest.TestCase):
|
||||
@classmethod
|
||||
|
||||
@@ -3,7 +3,7 @@ from tinygrad import Tensor, Device, TinyJit
|
||||
from tinygrad.helpers import Timing, CI, OSX
|
||||
import multiprocessing.shared_memory as shared_memory
|
||||
|
||||
N = 256 if CI else 4096
|
||||
N = 256
|
||||
class TestCopySpeed(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls): Device[Device.DEFAULT].synchronize()
|
||||
|
||||
@@ -373,11 +373,11 @@ class TestMultiTensor(unittest.TestCase):
|
||||
np.testing.assert_allclose(y.numpy(), y_shard.numpy(), atol=1e-6, rtol=1e-6)
|
||||
|
||||
# NOTE: this is failing on LLVM CI, no idea why. Works locally.
|
||||
@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
|
||||
@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU", "AMD"), "slow, and flaky on LLVM/CPU")
|
||||
def test_data_parallel_resnet(self):
|
||||
from extra.models.resnet import ResNet18
|
||||
|
||||
fake_image = Tensor.rand((2, 3, 224//8, 224//8))
|
||||
fake_image = Tensor.rand((2, 3, 224//16, 224//16))
|
||||
fake_image_sharded = fake_image.shard(devices_2, axis=0)
|
||||
m = ResNet18()
|
||||
m.load_from_pretrained()
|
||||
@@ -409,10 +409,10 @@ class TestMultiTensor(unittest.TestCase):
|
||||
# sometimes there is zeros in these grads... why?
|
||||
np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)
|
||||
|
||||
@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
|
||||
@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU", "AMD"), "slow, and flaky on LLVM/CPU")
|
||||
def test_data_parallel_resnet_train_step(self):
|
||||
from extra.models.resnet import ResNet18
|
||||
fake_image = Tensor.rand((2, 3, 224//8, 224//8))
|
||||
fake_image = Tensor.rand((2, 3, 224//16, 224//16))
|
||||
labels = Tensor.randint(2, low=0, high=1000)
|
||||
m = ResNet18()
|
||||
self._test_model_train_step(m, fake_image, labels)
|
||||
|
||||
@@ -1,87 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
import time
|
||||
import unittest
|
||||
import torch
|
||||
from tinygrad import Tensor, Device
|
||||
from tinygrad.helpers import Profiling, CI
|
||||
|
||||
@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
|
||||
class TestConvSpeed(unittest.TestCase):
|
||||
|
||||
def test_mnist(self):
|
||||
# https://keras.io/examples/vision/mnist_convnet/
|
||||
conv = 3
|
||||
inter_chan, out_chan = 32, 64
|
||||
|
||||
# ****** torch baseline *******
|
||||
|
||||
torch.backends.mkldnn.enabled = False
|
||||
|
||||
conv = 3
|
||||
inter_chan, out_chan = 32, 64
|
||||
c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
|
||||
c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
|
||||
l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)
|
||||
|
||||
c2d = torch.nn.functional.conv2d
|
||||
mp = torch.nn.MaxPool2d((2,2))
|
||||
lsm = torch.nn.LogSoftmax(dim=1)
|
||||
|
||||
cnt = 5
|
||||
fpt, bpt = 0.0, 0.0
|
||||
for i in range(cnt):
|
||||
et0 = time.time()
|
||||
x = torch.randn(128, 1, 28, 28, requires_grad=True)
|
||||
x = mp(c2d(x,c1).relu())
|
||||
x = mp(c2d(x,c2).relu())
|
||||
x = x.reshape(x.shape[0], -1)
|
||||
out = lsm(x.matmul(l1))
|
||||
out = out.mean()
|
||||
et1 = time.time()
|
||||
out.backward()
|
||||
et2 = time.time()
|
||||
fpt += (et1-et0)
|
||||
bpt += (et2-et1)
|
||||
|
||||
fpt_baseline = (fpt*1000/cnt)
|
||||
bpt_baseline = (bpt*1000/cnt)
|
||||
print("torch forward pass: %.3f ms" % fpt_baseline)
|
||||
print("torch backward pass: %.3f ms" % bpt_baseline)
|
||||
|
||||
# ****** tinygrad compare *******
|
||||
|
||||
c1 = Tensor(c1.detach().numpy(), requires_grad=True)
|
||||
c2 = Tensor(c2.detach().numpy(), requires_grad=True)
|
||||
l1 = Tensor(l1.detach().numpy(), requires_grad=True)
|
||||
|
||||
cnt = 5
|
||||
fpt, bpt = 0.0, 0.0
|
||||
for i in range(1+cnt):
|
||||
et0 = time.time()
|
||||
x = Tensor.randn(128, 1, 28, 28)
|
||||
x = x.conv2d(c1).relu().avg_pool2d()
|
||||
x = x.conv2d(c2).relu().max_pool2d()
|
||||
x = x.reshape(shape=(x.shape[0], -1))
|
||||
out = x.dot(l1).log_softmax()
|
||||
out = out.mean()
|
||||
out.backward() # NOTE: we have to now compute this here, but it doesn't realize
|
||||
out.realize()
|
||||
et1 = time.time()
|
||||
[x.grad.realize() for x in [c1, c2, l1]]
|
||||
et2 = time.time()
|
||||
if i == 0:
|
||||
pr = Profiling(sort='time', frac=0.2)
|
||||
pr.__enter__()
|
||||
else:
|
||||
fpt += (et1-et0)
|
||||
bpt += (et2-et1)
|
||||
|
||||
pr.__exit__()
|
||||
fpt = (fpt*1000/cnt)
|
||||
bpt = (bpt*1000/cnt)
|
||||
print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
|
||||
print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
0
test/test_nn.py
Executable file → Normal file
0
test/test_nn.py
Executable file → Normal file
@@ -2,7 +2,6 @@ from typing_extensions import Callable
|
||||
import hashlib, random, unittest
|
||||
from tinygrad import Tensor, Device, getenv, dtypes
|
||||
from tinygrad.device import is_dtype_supported
|
||||
from tinygrad.helpers import CI
|
||||
|
||||
@unittest.skipUnless(is_dtype_supported(dtypes.uint8) and is_dtype_supported(dtypes.uint64), "Device must support uint8 and uint64")
|
||||
@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "NV", "crashes in NV CI")
|
||||
@@ -12,7 +11,7 @@ class TestHashing(unittest.TestCase):
|
||||
chunk_hashes = [hashlib.shake_128(chunk).digest(16) for chunk in chunks]
|
||||
return hashlib.shake_128(b''.join(chunk_hashes)).digest(16)
|
||||
|
||||
@unittest.skipIf(CI, "very slow")
|
||||
@unittest.skip("very slow")
|
||||
def test_abc(self):
|
||||
expected = self._python_hash_1mb(b"abc" + b"\x00" * (2**20 - 3))
|
||||
out = Tensor(b"abc").hash()
|
||||
@@ -65,7 +64,7 @@ class TestKeccak(unittest.TestCase):
|
||||
data = b"\x00" * 4
|
||||
self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16))
|
||||
|
||||
data = b"\x00" * (1000 if CI else 4096)
|
||||
data = b"\x00" * 1000
|
||||
self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16))
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
#!/bin/bash
|
||||
python3 test/external/process_replay/reset.py
|
||||
CAPTURE_PROCESS_REPLAY=1 pytest -n auto test/test_tiny.py test/test_uop_graph.py test/test_ops.py test/test_linearizer.py
|
||||
while true; do
|
||||
if python3 test/test_tiny.py; then
|
||||
PYTHONPATH="." python3 test/external/process_replay/process_replay.py
|
||||
fi
|
||||
done
|
||||
Reference in New Issue
Block a user