name: Unit Tests

env:
  # increment this when downloads substantially change to avoid the internet
  DOWNLOAD_CACHE_VERSION: '12'
  PYTHON_CACHE_VERSION: '3'
  APT_CACHE_VERSION: '1'
  BUILD_CACHE_VERSION: '1'
  CAPTURE_PROCESS_REPLAY: 1
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  PYTHONPATH: ${{ github.workspace }}

on:
  push:
    branches:
      - master
  pull_request:
  workflow_dispatch:

jobs:
  llvmspeed:
    name: LLVM Speed
    runs-on: ubuntu-24.04
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: llvm-speed
          deps: testing_minimal
          llvm: 'true'
      - name: Speed Test
        run: CPU=1 CPU_LLVM=1 python3 test/speed/external_test_speed_v_torch.py
      - name: Speed Test (BEAM=2)
        run: BEAM=2 CPU=1 CPU_LLVM=1 python3 test/speed/external_test_speed_v_torch.py

  docs:
    name: Docs
    runs-on: ubuntu-22.04
    timeout-minutes: 10
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          deps: docs
          pydeps: "capstone torch"
      - name: Build wheel and show size
        run: |
          pip install build
          python -m build --wheel --outdir dist
          ls -lh dist/*.whl
      - name: Use as an external package
        run: |
          mkdir $HOME/test_external_dir
          cd $HOME/test_external_dir
          python -m venv venv
          source venv/bin/activate
          pip install $GITHUB_WORKSPACE
          python -c "from tinygrad.tensor import Tensor; print(Tensor([1,2,3,4,5]))"
          pip install mypy
          mypy -c "from tinygrad.tensor import Tensor; print(Tensor([1,2,3,4,5]))"
      - name: Run beautiful_mnist with tinygrad only
        run: |
          mkdir $GITHUB_WORKSPACE/test_dir
          cd $GITHUB_WORKSPACE/test_dir
          python -m venv venv
          source venv/bin/activate
          pip install $GITHUB_WORKSPACE
          cp $GITHUB_WORKSPACE/examples/beautiful_mnist.py .
          BS=2 STEPS=10 python beautiful_mnist.py
      - name: Test Docs Build
        run: python -m mkdocs build --strict
      - name: Test Docs
        run: |
          python docs/abstractions2.py
          python docs/abstractions3.py
      - name: Test README
        run: awk '/```python/{flag=1;next}/```/{flag=0}flag' README.md > README.py && python README.py
      - name: Test Quickstart
        run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && python quickstart.py
      - name: Test DEBUG
        run: DEBUG=100 python3 -c "from tinygrad import Tensor; N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N); c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2); print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
      - name: Compile EfficientNet to C and test it
        run: |
          CPU=1 CPU_LLVM=0 python examples/compile_efficientnet.py > recognize.c
          clang -O2 recognize.c -lm -o recognize
          cat test/models/efficientnet/Chicken.jpg | ./recognize | grep cock

  torchbackend:
    name: Torch Backend Tests
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: torch-backend-pillow-torchvision-et-pt
          deps: testing_minimal
          pydeps: "pillow torchvision expecttest"
          llvm: 'true'
      - name: Install ninja
        run: |
          sudo apt update || true
          sudo apt install -y --no-install-recommends ninja-build
      - name: Lint with ruff
        run: |
          pip3 install --upgrade --force-reinstall ruff==0.11.0
          python3 -m ruff check extra/torch_backend/backend.py
      - name: Test one op
        run: FORWARD_ONLY=1 TINY_BACKEND=1 python3 test/test_ops.py TestOps.test_add
      - name: Test ResNet-18
        run: DEBUG=2 python3 extra/torch_backend/example.py
      - name: My (custom) tests
        run: python3 extra/torch_backend/test.py
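      # note: torch_tests.py appears to drive PyTorch's own device test suite against the tinygrad backend
      # (PRIVATEUSE1 is PyTorch's hook for out-of-tree devices); only one known-good test runs here,
      # the wider (expected-to-fail) sweep lives in torchbackendmore below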
      - name: Test one op in torch tests
        run: DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32
      - name: Test Ops with TINY_BACKEND
        run: CPU=1 CPU_LLVM=1 LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/test_ops.py --durations=20
      - name: Test in-place operations on views
        run: TORCH_DEBUG=1 python3 extra/torch_backend/test_inplace.py
      - name: Test multi-gpu
        run: CPU=1 CPU_LLVM=1 GPUS=4 TORCH_DEBUG=1 python3 extra/torch_backend/test_multigpu.py

  torchbackendmore:
    name: Torch Backend Tests More
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: torch-backend-pillow-torchvision-et-pt
          deps: testing_minimal
          llvm: 'true'
      - name: Install ninja
        run: |
          sudo apt update || true
          sudo apt install -y --no-install-recommends ninja-build
      - name: Test beautiful_mnist in torch with TINY_BACKEND
        run: SPLIT_REDUCEOP=0 FUSE_ARANGE=1 CPU=1 CPU_LLVM=1 TARGET_EVAL_ACC_PCT=96.0 TINY_BACKEND=1 python3 examples/other_mnist/beautiful_mnist_torch.py
      - name: Test some torch tests (expect failure)
        run: python3 -m pytest extra/torch_backend/torch_tests.py -v --tb=no || true

  bepython:
    name: Python Backend
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: be-minimal
          deps: testing_minimal
      - name: Test dtype with Python emulator
        run: DEBUG=1 PYTHON=1 python3 -m pytest -n=auto test/test_dtype.py test/test_dtype_alu.py
      - name: Test ops with Python emulator
        run: DEBUG=2 SKIP_SLOW_TEST=1 PYTHON=1 python3 -m pytest -n=auto test/test_ops.py --durations=20
      - name: Test uops with Python emulator
        run: PYTHON=1 python3 -m pytest test/test_uops.py --durations=20
      - name: Test symbolic with Python emulator
        run: PYTHON=1 python3 test/test_symbolic_ops.py
      - name: test_renderer_failures with Python emulator
        run: PYTHON=1 python3 -m pytest -rA test/test_renderer_failures.py::TestRendererFailures
      - name: Test IMAGE=2 support
        run: |
          IMAGE=2 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
          IMAGE=2 PYTHON=1 python3 test/test_ops.py TestOps.test_simple_conv2d
      - name: Test emulated METAL tensor cores
        run: |
          DEBUG=2 EMULATE=METAL FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_big_gemm
          DEBUG=2 EMULATE=METAL FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
      - name: Test emulated AMX tensor cores
        run: DEBUG=2 AMX=1 EMULATE=AMX FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
      - name: Test emulated AMD tensor cores
        run: |
          DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
          DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
          DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=1 ATOL=1e-3 python3 ./extra/gemm/simple_matmul.py
          DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=1 ATOL=1e-3 python3 ./extra/gemm/simple_matmul.py
          DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
      - name: Test emulated AMD MFMA tensor cores
        run: |
          DEBUG=2 EMULATE=AMD_MFMA FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
          DEBUG=2 EMULATE=AMD_MFMA FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
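      # note: these EMULATE=<target> runs exercise the PYTHON emulator's model of each device's tensor cores;
      # ACC_HALF=1 presumably accumulates in half precision, hence the looser ATOL=1e-3 on those runs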
      - name: Test emulated AMD RDNA4 tensor cores
        run: |
          DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
          DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
          DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=1 ATOL=1e-3 python3 ./extra/gemm/simple_matmul.py
          DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=1 ATOL=1e-3 python3 ./extra/gemm/simple_matmul.py
          DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
      - name: Test emulated CUDA tensor cores
        run: |
          DEBUG=2 EMULATE=CUDA FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
          DEBUG=2 EMULATE=CUDA ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
          DEBUG=2 EMULATE=CUDA_SM75 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
          DEBUG=2 EMULATE=CUDA ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
      - name: Test emulated INTEL OpenCL tensor cores
        run: DEBUG=2 EMULATE=INTEL FORWARD_ONLY=1 PYTHON=1 HALF=1 N=64 python3 ./extra/gemm/simple_matmul.py
      - name: Test emulated AMX tensor cores
        run: DEBUG=2 AMX=1 EMULATE=AMX FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
      - name: Test device flop counts
        run: |
          DEBUG=2 EMULATE=METAL PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf
          DEBUG=2 EMULATE=AMD PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf
          DEBUG=2 EMULATE=CUDA PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf
          DEBUG=2 EMULATE=INTEL PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf
          DEBUG=2 AMX=1 EMULATE=AMX PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStats.test_simple_matmul

  linter:
    name: Linters
    runs-on: ubuntu-latest
    timeout-minutes: 10
    # TODO: run the pre-commit hook to replace a lot of this
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: linting-only
          python-version: '3.10'
          deps: linting
      - name: Lint bad-indentation and trailing-whitespace with pylint
        run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string='  ' --recursive=y .
      - name: Lint with ruff
        run: |
          pip3 install --upgrade --force-reinstall ruff==0.11.0
          python3 -m ruff check .
          python3 -m ruff check examples/mlperf/ --ignore E501
      - name: Lint tinygrad with pylint
        run: python -m pylint tinygrad/
      - name: Run mypy
        run: |
          python -m mypy --strict-equality --lineprecision-report .
          cat lineprecision.txt
      - name: Run TYPED=1
        run: TYPED=1 python -c "import tinygrad"

  unittest:
    name: Unit Tests
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: unittest-12
          pydeps: "pillow numpy ftfy regex"
          deps: testing_unit
      - name: Run unit tests
        run: python -m pytest -n=auto test/unit/ --durations=20
      - name: Run targeted tests on NULL backend
        run: NULL=1 python3 test/test_multitensor.py TestMultiTensor.test_data_parallel_resnet_train_step
      - name: Run SDXL on NULL backend
        run: MAX_BUFFER_SIZE=0 NULL=1 DEBUG=1 python3 examples/sdxl.py --seed 0 --noshow --timing --fakeweights
      - name: Run Clip tests for SD MLPerf on NULL backend
        run: MAX_BUFFER_SIZE=0 NULL=1 python -m pytest -n=auto test/external/mlperf_stable_diffusion/external_test_models.py::TestOpenClip --durations=20
      # TODO: support fake weights
      #- name: Run LLaMA 7B on 4 fake devices
      #  run: NULL=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 3 --temperature 0 --timing
      - name: Run GC tests
        run: python test/external/external_uop_gc.py
      - name: External Benchmark Schedule
        run: python3 test/external/external_benchmark_schedule.py
      - name: Run process replay tests
        uses: ./.github/actions/process-replay
      - name: Regen dataset on test_tiny
        run: |
          test/external/process_replay/reset.py
          CAPTURE_PROCESS_REPLAY=1 python test/test_tiny.py TestTiny.test_plus
          python extra/optimization/extract_dataset.py
          gzip -c /tmp/sops > extra/datasets/sops.gz
          #DEBUG=1 MIN_ASTS=1 python extra/optimization/get_action_space.py
      - name: Repo line count < 18000 lines
        run: MAX_LINE_COUNT=18000 python sz.py

  fuzzing:
    name: Fuzzing
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: fuzzing-unit
          deps: testing_unit
      - name: Fuzz Test symbolic
        run: python test/external/fuzz_symbolic.py
      - name: Fuzz Test fast idiv
        run: python test/external/fuzz_fast_idiv.py
      - name: Fuzz Test shapetracker
        run: |
          python test/external/fuzz_shapetracker.py
          python test/external/fuzz_shapetracker_math.py
      - name: Fuzz Test shape ops
        run: python test/external/fuzz_shape_ops.py

  testopenclimage:
    name: CL IMAGE Tests
    runs-on: ubuntu-22.04
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: gpu-image
          deps: testing_minimal
          opencl: 'true'
      - name: Test CL IMAGE=2 ops + training
        run: |
          CL=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20
          CL=1 IMAGE=2 python test/models/test_end2end.py TestEnd2End.test_linear_mnist
      - name: Test CL IMAGE=2 ops + training (rangeify)
        run: |
          RANGEIFY=1 CL=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20
          RANGEIFY=1 CL=1 IMAGE=2 python test/models/test_end2end.py TestEnd2End.test_linear_mnist
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testgpumisc:
    name: CL Misc tests
    runs-on: ubuntu-22.04
    timeout-minutes: 10
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: gen-dataset
          deps: testing_minimal
          opencl: 'true'
      - name: Generate Dataset
        run: CL=1 extra/optimization/generate_dataset.sh
      - name: Run Kernel Count Test
        run: CL=1 python -m pytest -n=auto test/external/external_test_opt.py
      - name: Run fused optimizer tests
        run: CL=1 FUSE_OPTIM=1 python -m pytest -n=auto test/models/test_mnist.py
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: sops.gz
          path: /tmp/sops.gz

  testopenpilot:
    name: openpilot Compile Tests
    runs-on: ubuntu-22.04
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: openpilot-compile
          deps: testing
          opencl: 'true'
          llvm: 'true'
      - name: Test openpilot model kernel count and gate usage
        run: |
          ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2160 ALLOWED_GATED_READ_IMAGE=16 FLOAT16=0 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx
      - name: Test openpilot alt model correctness (float32)
        run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx
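      # note: each correctness run pins supercombo.onnx to a fixed openpilot tag or commit so the reference model cannot drift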
      - name: Test openpilot fastvits model correctness (float32)
        run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
      # - name: Test openpilot simple_plan vision model correctness (float32)
      #   run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/35ff4f4577002f2685e50c8346addae33fe8da27a41dd4d6a0f14d1f4b1af81b
      - name: Test openpilot LLVM compile
        run: CPU=1 CPU_LLVM=1 LLVMOPT=1 JIT=2 BEAM=0 IMAGE=0 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
      - name: Test openpilot compile4
        run: NOLOCALS=1 CL=1 IMAGE=2 FLOAT16=1 DEBUG=2 python3 examples/openpilot/compile4.py
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  # ****** ONNX Tests ******

  testonnxcpu:
    name: ONNX (CPU) Tests
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: onnxoptc
          deps: testing
          python-version: '3.11'
          llvm: 'true'
      - name: Test ONNX (CPU)
        run: CPU=1 CPU_LLVM=0 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
      - name: Test ONNX (LLVM)
        run: CPU=1 CPU_LLVM=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
      - name: Test ONNX Runner (CPU)
        run: CPU=1 CPU_LLVM=0 python3 test/external/external_test_onnx_runner.py
      - name: Test Additional ONNX Ops (CPU)
        run: CPU=1 CPU_LLVM=0 python3 test/external/external_test_onnx_ops.py
      - name: Test Quantize ONNX
        run: CPU=1 CPU_LLVM=0 python3 test/test_quantize_onnx.py
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testopencl:
    name: ONNX (CL)+Optimization Tests
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: onnxoptl
          deps: testing
          pydeps: "tensorflow==2.15.1 tensorflow_addons"
          python-version: '3.11'
          opencl: 'true'
      - name: Test ONNX (CL)
        run: CL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
      #- name: Test Optimization Helpers
      #  run: DEBUG=1 python3 extra/optimization/test_helpers.py
      #- name: Test Action Space
      #  run: DEBUG=1 CL=1 python3 extra/optimization/get_action_space.py
      - name: Test Beam Search
        run: CL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
      - name: Test MLPerf stuff
        run: CL=1 python -m pytest -n=auto test/external/external_test_optim.py test/external/external_test_losses.py test/external/external_test_metrics.py test/external/external_test_datasets.py --durations=20
      - name: Test llama 3 training
        run: MAX_BUFFER_SIZE=0 DEV=NULL SAMPLES=300 BS=8 SEQLEN=512 GRADIENT_ACC_STEPS=8 FAKEDATA=1 DEFAULT_FLOAT=bfloat16 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=1B MODEL=llama3 python3 examples/mlperf/model_train.py
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testllm:
    name: Test LLM
    runs-on: ubuntu-24.04
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: apps_llm
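      # end-to-end smoke test: pipe a prompt into tinygrad.apps.llm and grep the reply for the expected answer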
      - name: Test 1B LLM
        run: echo "What's a male chicken called? Answer with only one word." | MAX_BUFFER_SIZE=0 python3 -m tinygrad.apps.llm | grep -i rooster

  # ****** Models Tests ******

  testmodels:
    name: Models (llvm+cpu+gpu)
    runs-on: ubuntu-22.04
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: models
          deps: testing
          opencl: 'true'
          llvm: 'true'
      - name: Test models (llvm)
        run: CPU=1 CPU_LLVM=1 python -m pytest -n=auto test/models --durations=20
      - name: Test models (opencl)
        run: CL=1 python -m pytest -n=auto test/models --durations=20
      - name: Test models (cpu)
        run: CPU=1 CPU_LLVM=0 python -m pytest -n=auto test/models --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testmetalmodels:
    name: Models (metal)
    runs-on: macos-14
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: metal
          deps: testing
          python-version: '3.11'
      - name: Test models (Metal)
        run: METAL=1 python -m pytest -n=auto test/models --durations=20
      - name: Test LLaMA compile speed
        run: METAL=1 python test/external/external_test_speed_llama.py

  # ****** Feature Tests ******

  testrangeifycpu:
    name: Linux (rangeify) CPU
    runs-on: ubuntu-24.04
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: rangeify-minimal-llvm
          deps: testing_minimal
          opencl: 'true'
          llvm: "true"
      - name: Test CPU=1 RANGEIFY=1
        # TODO: add more passing tests here
        # test_instancenorm_3d is very slow
        # rangeify diamond cycle gives the wrong answer
        run: |
          CPU=1 CPU_LLVM=0 RANGEIFY=1 python3 -m pytest -n auto --durations 20 \
            -k "not test_instancenorm_3d and not test_assign_diamond_cycle" \
            test/test_tiny.py test/test_rangeify.py test/test_ops.py test/test_symbolic_ops.py test/test_symbolic_jit.py test/test_tensor_variable.py \
            test/test_outerworld_range.py test/test_randomness.py test/test_nn.py test/test_arange.py test/test_tensor.py test/test_optim.py \
            test/test_setitem.py test/test_assign.py
      - name: Test const folding
        run: CPU=1 RANGEIFY=1 python3 -m pytest -n auto --durations 20 test/test_const_folding.py -k "not test_cast_padded and not TestReduceOpsConstFolding and not TestMultiConstFolding"
      - name: Test multitensor
        run: |
          CPU=1 RANGEIFY=1 python3 test/test_multitensor.py TestMultiTensor.test_matmul_shard_1_1 TestMultiTensor.test_simple_add_W TestMultiTensor.test_simple_reduce \
            TestMultiTensor.test_elementwise_dtype TestMultiTensor.test_shard_no_recompile TestHandleData.test_copied_to_device TestMultiRamUsage
          CPU=1 RANGEIFY=1 python3 -m pytest test/test_multitensor.py::TestMultiAssign -k 'not (multi_assign_piece_noncontig or multi_assign_var_offset)'
          CPU=1 RANGEIFY=1 python3 -m pytest -n=auto test/test_multitensor.py::TestMultiTensor test/unit/test_allreduce.py -k 'not const_folding'
      - name: Test CPU=1 RANGEIFY=2
        run: CPU=1 CPU_LLVM=0 RANGEIFY=2 python3 -m pytest -n auto test/test_tiny.py test/test_rangeify.py test/test_ops.py --durations 20
      # slow (and still wrong on beautiful_mnist)
      #- name: Test LLVM RANGEIFY=1 (slow tests)
      #  run: CPU=1 CPU_LLVM=1 RANGEIFY=1 python3 -m pytest -n auto test/models/test_mnist.py --durations 20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testrangeifycl:
    name: Linux (rangeify) CL
    runs-on: ubuntu-24.04
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: rangeify-cl
          deps: testing
          opencl: 'true'
          llvm: "true"
      - name: Test CL=1 RANGEIFY=1
        run: CL=1 RANGEIFY=1 pytest -n auto test/test_ops.py test/test_schedule.py test/test_symbolic_ops.py test/test_jit.py test/unit/test_disk_tensor.py test/models/test_mnist.py test/unit/test_mnist_dataset.py test/test_optim.py --durations 20
      - name: Test Fuse
        run: CL=1 RANGEIFY=2 python3 -m pytest --durations 20 test/test_softmax_fusion.py -k "not test_auto_softmax"
      - name: Test ONNX
        run: CL=1 RANGEIFY=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testrangeifymacos:
    name: MacOS (rangeify)
    runs-on: macos-14
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: metal
          deps: testing
      - name: some unit tests
        run: METAL=1 RANGEIFY=1 python -m pytest -n=auto test/unit/test_winograd.py --durations=20
      - name: Test METAL=1 RANGEIFY=1
        run: METAL=1 RANGEIFY=1 python -m pytest -n=auto test/test_ops.py --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testdevectorize:
    name: Linux (devectorize)
    runs-on: ubuntu-24.04
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: devectorize-minimal
          deps: testing_minimal
          pydeps: "pillow"
          llvm: "true"
      - name: Test LLVM=1 DEVECTORIZE=0
        run: CPU=1 CPU_LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
      - name: Test LLVM=1 DEVECTORIZE=0 for model
        run: CPU=1 CPU_LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
      - name: Test CPU=1 DEVECTORIZE=0
        run: CPU=1 CPU_LLVM=0 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"

  testdsp:
    name: Linux (DSP)
    runs-on: ubuntu-24.04
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: dsp-minimal
          deps: testing_minimal
          pydeps: "onnx==1.18.0 onnxruntime pillow"
          llvm: "true"
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build QEMU Docker with cache
        uses: docker/build-push-action@v4
        with:
          file: extra/dsp/Dockerfile
          push: false
          load: true
          tags: qemu-hexagon:latest
          cache-from: type=gha
          cache-to: type=gha,mode=min
      - name: Set MOCKDSP env
        run: printf "MOCKDSP=1" >> $GITHUB_ENV
      - name: Run test_tiny on DSP
        run: DEBUG=2 DSP=1 python test/test_tiny.py
      - name: Test transcendentals
        run: CC=clang-20 DEBUG=2 DSP=1 python test/test_transcendental.py TestTranscendentalVectorized
      - name: Test quantize onnx
        run: DEBUG=2 DSP=1 python3 test/test_quantize_onnx.py

  testwebgpu:
    name: Linux (WebGPU)
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: webgpu-minimal
          deps: testing_minimal
          python-version: '3.11'
          webgpu: 'true'
      - name: Check Device.DEFAULT (WEBGPU) and print some source
        run: |
          WEBGPU=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'WEBGPU', Device.DEFAULT"
          WEBGPU=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
      - name: Run selected webgpu tests
        run: |
          WEBGPU=1 WEBGPU_BACKEND="WGPUBackendType_Vulkan" python3 -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit \
            --ignore=test/test_copy_speed.py --ignore=test/test_rearrange_einops.py \
            --ignore=test/test_fuzz_shape_ops.py --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testamd:
    strategy:
      fail-fast: false
      matrix:
        backend: [amd, amdllvm]
    name: Linux (${{ matrix.backend }})
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    env:
      AMD: 1
      MOCKGPU: 1
      FORWARD_ONLY: 1
      AMD_LLVM: ${{ matrix.backend == 'amdllvm' && '1' || matrix.backend != 'amdllvm' && '0' }}
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: ${{ matrix.backend }}-minimal
          deps: testing_minimal
          amd: 'true'
          llvm: ${{ matrix.backend == 'amdllvm' && 'true' }}
      - name: Check Device.DEFAULT and print some source
        run: |
          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['AMD'], Device.DEFAULT"
          DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
      - name: Run LLVM test
        if: matrix.backend=='amdllvm'
        run: python test/device/test_amd_llvm.py
      - name: Run pytest (amd)
        run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/device/test_hcq.py --durations=20
      - name: Run pytest (amd)
        run: python -m pytest test/external/external_test_am.py --durations=20
      - name: Run TRANSCENDENTAL math
        run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
      - name: Run TestOps.test_add with SQTT
        run: |
          VIZ=1 SQTT=1 DEBUG=5 python3 test/test_ops.py TestOps.test_add
          extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -o /tmp/gpu0.rgp
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testnvidia:
    strategy:
      fail-fast: false
      matrix:
        backend: [ptx, nv]
    name: Linux (${{ matrix.backend }})
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    env:
      MOCKGPU: 1
      FORWARD_ONLY: 1
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: ${{ matrix.backend }}-minimal
          deps: testing_minimal
          cuda: 'true'
          ocelot: 'true'
      - name: Set env
        run: printf "${{ matrix.backend == 'PTX' && 'CUDA=1\nCUDA_PTX=1' || matrix.backend == 'nv' && 'NV=1\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
      - name: Check Device.DEFAULT and print some source
        run: |
          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
          DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
      - name: Run pytest (cuda)
        # skip multitensor because it's slow
        run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore test/test_gc.py --ignore test/test_multitensor.py --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testcpuopencl:
    strategy:
      fail-fast: false
      matrix:
        backend: [llvm, cpu, opencl]
    name: Linux (${{ matrix.backend }})
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: ${{ matrix.backend }}-minimal
          deps: testing_minimal
          opencl: ${{ matrix.backend == 'opencl' && 'true' }}
          llvm: ${{ matrix.backend == 'llvm' && 'true' }}
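      # the expression below expands to one KEY=VALUE pair per line for the selected backend;
      # printf appends them to $GITHUB_ENV so every later step in this job inherits them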
      - name: Set env
        run: printf "${{ matrix.backend == 'llvm' && 'CPU=1\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'CPU=1\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'opencl' && 'CL=1' }}" >> $GITHUB_ENV
      - name: Check Device.DEFAULT and print some source
        run: |
          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','CL'], Device.DEFAULT"
          DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
      - name: Run pytest (${{ matrix.backend }})
        run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --durations=20
      - name: Run TRANSCENDENTAL math
        run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  amdremote:
    name: Linux (remote)
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    env:
      REMOTE: 1
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: linux-remote
          deps: testing_minimal
          amd: 'true'
          llvm: 'true'
          opencl: 'true'
      - name: Start remote server
        run: |
          start_server() {
            systemd-run --user \
              --unit="$1" \
              --setenv=REMOTEDEV="$2" \
              --setenv=MOCKGPU=1 \
              --setenv=PYTHONPATH=. \
              --setenv=PORT="$3" \
              --working-directory="$(pwd)" \
              python tinygrad/runtime/ops_remote.py
          }
          start_server "remote-server-amd-1" "AMD" 6667
          start_server "remote-server-amd-2" "AMD" 6668
          start_server "remote-server-gpu" "CL" 7667
          start_server "remote-server-cpu" "CPU" 8667
      - name: Check Device.DEFAULT and print some source
        env:
          HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
        run: |
          python -c "from tinygrad import Device; assert Device.DEFAULT == 'REMOTE', Device.DEFAULT"
          python -c "from tinygrad import Device; assert Device.default.properties.real_device == 'AMD', Device.default.properties.real_device"
          DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus
      - name: Run REMOTE=1 Test (AMD)
        env:
          HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
        run: |
          python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations 20
      - name: Run REMOTE=1 Test (CL)
        env:
          HOST: 127.0.0.1:7667*6
        run: |
          python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py --durations 20
          IMAGE=2 python3 -m pytest test/test_tiny.py test/test_image_dtype.py
      - name: Run REMOTE=1 Test (CPU)
        env:
          HOST: 127.0.0.1:8667*6
        run: |
          python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py --durations 20
      - name: Show remote server logs
        if: always()
        run: |
          journalctl --user -u remote-server-amd-1 --no-pager
          journalctl --user -u remote-server-amd-2 --no-pager
          journalctl --user -u remote-server-gpu --no-pager
          journalctl --user -u remote-server-cpu --no-pager

  # ****** OSX Tests ******

  testmetal:
    name: MacOS (unit)
    runs-on: macos-14
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: metal
          deps: testing
          python-version: '3.11'
          amd: 'true'
          cuda: 'true'
          ocelot: 'true'
          llvm: 'true'
      - name: Run unit tests
        run: METAL=1 python -m pytest -n=auto test/unit/ --durations=20
      - name: Run ONNX
        run: METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
      - name: Test tensor core ops (fake)
        run: METAL=1 DEBUG=3 TC=2 python test/test_ops.py TestOps.test_gemm
      - name: Test tensor core ops (real)
        run: METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_big_gemm
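      # IGNORE_BEAM_CACHE=1 presumably forces a fresh BEAM search rather than reusing cached kernel picks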
      - name: Test Beam Search
        run: METAL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
      #- name: Fuzz Test linearizer
      #  run: METAL=1 DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py
      - name: Run TRANSCENDENTAL math
        run: METAL=1 TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
      - name: Run pytest (amd)
        env:
          MOCKGPU: 1
          AMD: 1
          AMD_LLVM: 0
          FORWARD_ONLY: 1
        run: |
          python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
      - name: Run pytest (amd with llvm backend)
        env:
          MOCKGPU: 1
          AMD: 1
          AMD_LLVM: 1
          FORWARD_ONLY: 1
        run: |
          python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
      - name: Run pytest (ptx)
        env:
          MOCKGPU: 1
          NV_PTX: 1
          NV: 1
          FORWARD_ONLY: 1
        run: |
          python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  osxwebgpu:
    name: MacOS (WebGPU)
    runs-on: macos-14
    timeout-minutes: 10
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: osx-webgpu
          deps: testing
          webgpu: 'true'
      - name: Test infinity math in WGSL
        run: WEBGPU=1 python -m pytest -n=auto test/test_renderer_failures.py::TestWGSLFailures::test_multiply_infinity --durations=20
      - name: Build WEBGPU Efficientnet
        run: WEBGPU=1 WEBGPU_BACKEND="WGPUBackendType_Metal" python3 -m examples.compile_efficientnet
      - name: Clean npm cache
        run: npm cache clean --force
      - name: Install Puppeteer
        run: npm install puppeteer
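      # puppeteer provides the headless browser for the WebGPU/VIZ checks below, both currently disabled as flaky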
      # this is also flaky
      #- name: Run WEBGPU Efficientnet
      #  run: node test/web/test_webgpu.js
      # this is flaky
      #- name: Run VIZ tests as external package
      #  run: |
      #    mkdir $GITHUB_WORKSPACE/test_dir
      #    cd $GITHUB_WORKSPACE/test_dir
      #    python -m venv venv
      #    source venv/bin/activate
      #    pip install $GITHUB_WORKSPACE
      #    cp $GITHUB_WORKSPACE/test/web/test_viz.js .
      #    node test_viz.js
      - name: Test ONNX Runner (WEBGPU)
        run: WEBGPU=1 python3 test/external/external_test_onnx_runner.py

  osxremote:
    name: MacOS (remote metal)
    runs-on: macos-15
    timeout-minutes: 10
    env:
      REMOTE: 1
      REMOTEDEV: METAL
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: macos-remote
          deps: testing_minimal
      - name: Check Device.DEFAULT and print some source
        run: |
          python -c "from tinygrad import Device; assert Device.DEFAULT == 'REMOTE', Device.DEFAULT"
          python -c "from tinygrad import Device; assert Device.default.properties.real_device == 'METAL', Device.default.properties.real_device"
          DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus
      - name: Run REMOTE=1 Test
        run: |
          python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_tensor_variable.py

  osxtests:
    strategy:
      fail-fast: false
      matrix:
        backend: [metal, llvm, cpu]
    name: MacOS (${{ matrix.backend }})
    runs-on: macos-15
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: macos-${{ matrix.backend }}-minimal
          deps: testing_minimal
          pydeps: "capstone"
          llvm: ${{ matrix.backend == 'llvm' && 'true' }}
      - name: Set env
        run: printf "${{ matrix.backend == 'llvm' && 'CPU=1\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'CPU=1\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'metal' && 'METAL=1' }}" >> $GITHUB_ENV
      - name: Check Device.DEFAULT and print some source
        run: |
          python -c "from tinygrad import Device; assert Device.DEFAULT == {'LLVM':'CPU'}.get(x:='${{ matrix.backend }}'.upper(), x), Device.DEFAULT"
          DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus
      - name: Run pytest (${{ matrix.backend }})
        run: python3 -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay
      - name: Run macOS-specific unit test
        if: matrix.backend == 'cpu'
        run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated

  # ****** Windows Tests ******

  wintests:
    strategy:
      fail-fast: false
      matrix:
        backend: [llvm, cpu, webgpu]
    name: Windows (${{ matrix.backend }})
    runs-on: windows-latest
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: windows-${{ matrix.backend }}-minimal
          deps: testing_unit
          pydeps: ${{ matrix.backend == 'webgpu' && 'dawn-python' || '' }}
      - name: Set env
        shell: bash
        run: printf "${{ matrix.backend == 'llvm' && 'CPU=1\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'CPU=1\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'webgpu' && 'WEBGPU=1' }}" >> $GITHUB_ENV
      - name: Run unit tests
        if: matrix.backend=='llvm'
        # test_newton_schulz hits RecursionError
        run: python -m pytest -n=auto test/unit/ --ignore=test/unit/test_disk_tensor.py --ignore=test/unit/test_elf.py --ignore=test/unit/test_tar.py --ignore=test/unit/test_linalg.py --durations=20
      - name: Run pytest (${{ matrix.backend }})
        shell: bash
        run: |
          python -c "from tinygrad import Device; assert Device.DEFAULT == {'LLVM':'CPU'}.get(x:='${{ matrix.backend }}'.upper(), x), Device.DEFAULT"
          python -m pytest -n=auto test/test_tiny.py test/test_ops.py --durations=20