From 5bdd6a1cc4b274bdcf4758049b48a18ed765c2a0 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Sat, 8 Feb 2025 09:04:36 +0800 Subject: [PATCH] increase CI speed with more runners [pr] (#8961) * increase CI speed with more runners [pr] * splits + cleanups [pr] * more runners * need that dep * split that too * can't be minimal * move test readme * bugfix + naming * one more split * bump to 22.04 --- .github/workflows/test.yml | 303 +++++++++++++++++++++++-------------- 1 file changed, 192 insertions(+), 111 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 269eb8e1c7..a7e23b7f18 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -137,10 +137,10 @@ jobs: ./autogen_stubs.sh llvm diff /tmp/llvm.py.bak tinygrad/runtime/autogen/llvm.py - uops: - name: uops tests + tc: + name: Tensor Core tests runs-on: ubuntu-latest - timeout-minutes: 20 + timeout-minutes: 10 steps: - name: Checkout Code uses: actions/checkout@v4 @@ -195,6 +195,19 @@ jobs: PYTHONPATH=. DEBUG=2 EMULATE_CUDA=1 PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf PYTHONPATH=. DEBUG=2 EMULATE_INTEL=1 PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf PYTHONPATH=. DEBUG=2 AMX=1 EMULATE_AMX=1 PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStats.test_simple_matmul + + bepython: + name: Python Backend + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Setup Environment + uses: ./.github/actions/setup-tinygrad + with: + key: be-minimal + deps: testing_minimal - name: Test dtype with Python emulator run: DEBUG=1 PYTHONPATH=. PYTHON=1 python3 -m pytest -n=auto test/test_dtype.py test/test_dtype_alu.py - name: Test ops with Python emulator @@ -207,9 +220,9 @@ jobs: run: PYTHONPATH=. PYTHON=1 python3 -m pytest -rA test/test_linearizer_failures.py::TestLinearizerFailures::test_failure_1 linter: - name: Linters+fuzz+unit Tests + name: Linters runs-on: ubuntu-latest - timeout-minutes: 20 + timeout-minutes: 10 # TODO: run the pre-commit hook to replace a lot of this steps: @@ -218,9 +231,9 @@ jobs: - name: Setup Environment uses: ./.github/actions/setup-tinygrad with: - key: linting + key: linting-only python-version: '3.10' - deps: linting,testing + deps: linting - name: Lint bad-indentation and trailing-whitespace with pylint run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string=' ' --recursive=y . - name: Lint with ruff @@ -231,32 +244,58 @@ jobs: run: python -m pylint tinygrad/ - name: Run mypy run: python -m mypy --strict-equality --lineprecision-report . && cat lineprecision.txt + + unittest: + name: Unit Tests + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Setup Environment + uses: ./.github/actions/setup-tinygrad + with: + key: unittest-12 + deps: testing - name: Test README run: awk '/```python/{flag=1;next}/```/{flag=0}flag' README.md > README.py && PYTHONPATH=. python README.py - name: Run unit tests run: PYTHONPATH="." 
python -m pytest -n=auto test/unit/ + - name: Repo line count < 11200 lines + run: MAX_LINE_COUNT=11200 python sz.py + + fuzzing: + name: Fuzzing + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Setup Environment + uses: ./.github/actions/setup-tinygrad + with: + key: fuzzing-minimal + deps: testing_minimal - name: Fuzz Test symbolic run: python test/external/fuzz_symbolic.py - name: Fuzz Test shapetracker run: | PYTHONPATH="." python test/external/fuzz_shapetracker.py PYTHONPATH="." python test/external/fuzz_shapetracker_math.py - - name: Repo line count < 11200 lines - run: MAX_LINE_COUNT=11200 python sz.py testgpuimage: - name: 'GPU IMAGE+compile Tests' + name: 'GPU IMAGE Tests' runs-on: ubuntu-20.04 - timeout-minutes: 20 + timeout-minutes: 10 steps: - name: Checkout Code uses: actions/checkout@v4 - name: Setup Environment uses: ./.github/actions/setup-tinygrad with: - key: gpuimage - deps: testing - python-version: '3.11' + key: gpu-image + deps: testing_minimal opencl: 'true' - name: Run Kernel Count Test run: PYTHONPATH="." GPU=1 python -m pytest -n=auto test/external/external_test_opt.py @@ -266,6 +305,22 @@ jobs: run: | PYTHONPATH="." GPU=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20 PYTHONPATH="." GPU=1 IMAGE=2 python3 test/models/test_end2end.py TestEnd2End.test_linear_mnist + - name: Run process replay tests + uses: ./.github/actions/process-replay + + testopenpilot: + name: 'openpilot Compile Tests' + runs-on: ubuntu-20.04 + timeout-minutes: 10 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Setup Environment + uses: ./.github/actions/setup-tinygrad + with: + key: openpilot-compile + deps: testing + opencl: 'true' - name: Test openpilot model kernel count and gate usage run: | PYTHONPATH="." 
ALLOWED_KERNEL_COUNT=209 ALLOWED_READ_IMAGE=2105 ALLOWED_GATED_READ_IMAGE=29 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx @@ -278,7 +333,7 @@ jobs: testopencl: name: 'ONNX+Optimization Tests' - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 timeout-minutes: 20 steps: @@ -287,14 +342,17 @@ jobs: - name: Setup Environment uses: ./.github/actions/setup-tinygrad with: - key: onnxopt + key: onnxoptl deps: testing,testing_tf python-version: '3.11' opencl: 'true' + llvm: 'true' - name: Test ONNX (GPU) run: GPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 - name: Test ONNX (CLANG) run: CLANG=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 + - name: Test ONNX (LLVM) + run: LLVM=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 - name: Run CLOUD=1 Test run: | CLOUDDEV=CLANG CLOUD=1 python3 test/test_tiny.py @@ -314,7 +372,7 @@ jobs: uses: ./.github/actions/process-replay testdsp: - name: DSP Tests + name: Linux (DSP) runs-on: ubuntu-24.04 timeout-minutes: 10 steps: @@ -337,7 +395,7 @@ jobs: run: DEBUG=2 DSP=1 python test/test_tiny.py testwebgpu: - name: WebGPU Tests + name: Linux (WebGPU) runs-on: ubuntu-22.04 timeout-minutes: 20 steps: @@ -362,56 +420,6 @@ jobs: - name: Run process replay tests uses: ./.github/actions/process-replay - testmetal: - name: Metal Tests - runs-on: macos-14 - timeout-minutes: 20 - - steps: - - name: Checkout Code - uses: actions/checkout@v4 - - name: Setup Environment - uses: ./.github/actions/setup-tinygrad - with: - key: metal - deps: testing - python-version: '3.11' - webgpu: 'true' - - name: Check Device.DEFAULT (METAL) and print some source - run: | - METAL=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'METAL', Device.DEFAULT" - METAL=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add - - name: Run metal test - run: JIT=2 METAL=1 python -m pytest -n=auto test/ --ignore=test/external --ignore=test/models --ignore=test/unit --durations=20 - - name: Run real world test - run: JIT=2 METAL=1 python -m pytest -n=auto test/models/test_real_world.py --durations=20 - - name: Run ONNX - run: JIT=2 METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 - - name: Test tensor core ops (fake) - run: TC=2 METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_gemm - - name: Test tensor core ops (real) - run: METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_big_gemm - - name: Test LLaMA compile speed - run: PYTHONPATH="." METAL=1 python test/external/external_test_speed_llama.py - - name: Test Beam Search - run: PYTHONPATH="." METAL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py - - name: Fuzz Test linearizer - run: PYTHONPATH="." 
METAL=1 DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py - # - name: Fuzz Test models schedule - # run: FUZZ_SCHEDULE=1 FUZZ_SCHEDULE_MAX_PATHS=5 python -m pytest test/models/test_train.py test/models/test_end2end.py - - name: Run TRANSCENDENTAL math - run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20 - # WebGPU e2e tests - - name: Build WEBGPU Efficientnet - run: WEBGPU=1 python3 -m examples.compile_efficientnet - - name: Clean npm cache - run: npm cache clean --force - - name: Install Puppeteer - run: npm install puppeteer - - name: Run WEBGPU Efficientnet - run: node test/web/test_webgpu.js - - name: Run process replay tests - uses: ./.github/actions/process-replay tests: strategy: @@ -419,7 +427,7 @@ jobs: matrix: backend: [llvm, clang, gpu, ptx, amd, nv] #, triton] - name: Tests on (${{ matrix.backend }}) + name: Linux (${{ matrix.backend }}) runs-on: ubuntu-22.04 timeout-minutes: 20 @@ -443,9 +451,6 @@ jobs: - name: Run pytest (not cuda or amd) if: matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'amd' && matrix.backend != 'nv' run: python -m pytest -n=auto test/ --ignore=test/unit --durations=20 - - name: Run ONNX (only LLVM) - if: matrix.backend == 'llvm' - run: python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 - name: Run pytest (cuda) if: matrix.backend=='ptx'||matrix.backend=='triton'||matrix.backend=='nv' run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --ignore=test/unit --ignore test/test_gc.py --durations=20 @@ -457,13 +462,110 @@ jobs: - name: Run process replay tests uses: ./.github/actions/process-replay - osxtests: - strategy: - fail-fast: false +# ****** OSX Tests ****** - name: Tests on MacOS + testmetal2: + name: MacOS (unit) + runs-on: macos-14 + timeout-minutes: 10 + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Setup Environment + uses: ./.github/actions/setup-tinygrad + with: + key: metal2 + deps: testing + python-version: '3.11' + - name: Run real world test + run: JIT=2 METAL=1 python -m pytest -n=auto test/models/test_real_world.py --durations=20 + - name: Run ONNX + run: JIT=2 METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 + - name: Test tensor core ops (fake) + run: TC=2 METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_gemm + - name: Test tensor core ops (real) + run: METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_big_gemm + - name: Test LLaMA compile speed + run: PYTHONPATH="." METAL=1 python test/external/external_test_speed_llama.py + - name: Test Beam Search + run: PYTHONPATH="." METAL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py + - name: Fuzz Test linearizer + run: PYTHONPATH="." 
METAL=1 DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py + # - name: Fuzz Test models schedule + # run: FUZZ_SCHEDULE=1 FUZZ_SCHEDULE_MAX_PATHS=5 python -m pytest test/models/test_train.py test/models/test_end2end.py + - name: Run TRANSCENDENTAL math + run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20 + - name: Run process replay tests + uses: ./.github/actions/process-replay + + testmetal: + name: MacOS (metal) + runs-on: macos-14 + timeout-minutes: 10 + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Setup Environment + uses: ./.github/actions/setup-tinygrad + with: + key: metal + deps: testing + python-version: '3.11' + - name: Check Device.DEFAULT (METAL) and print some source + run: | + METAL=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'METAL', Device.DEFAULT" + METAL=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add + - name: Run metal test + run: JIT=2 METAL=1 python -m pytest -n=auto test/ --ignore=test/external --ignore=test/models --ignore=test/unit --durations=20 + - name: Run process replay tests + uses: ./.github/actions/process-replay + + osxwebgpu: + name: MacOS (WebGPU) + runs-on: macos-14 + timeout-minutes: 10 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Setup Environment + uses: ./.github/actions/setup-tinygrad + with: + key: osx-webgpu + webgpu: 'true' + - name: Build WEBGPU Efficientnet + run: WEBGPU=1 python3 -m examples.compile_efficientnet + - name: Clean npm cache + run: npm cache clean --force + - name: Install Puppeteer + run: npm install puppeteer + - name: Run WEBGPU Efficientnet + run: node test/web/test_webgpu.js + + osxclang: + name: MacOS (clang) runs-on: macos-15 - timeout-minutes: 45 + timeout-minutes: 10 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Setup Environment + uses: ./.github/actions/setup-tinygrad + with: + key: macos-clang + deps: testing + - name: Run pytest (clang) + env: + CLANG: 1 + run: | + python3 -m pytest -n=auto test/ --ignore=test/unit --durations=20 + ! (DEBUG=7 python3 test/test_zero_copy.py 2>&1 || true) | grep -E '^0x.*[^0](x18|w18).*$' + + osxtests: + name: MacOS (amd+llvm) + runs-on: macos-15 + timeout-minutes: 10 steps: - name: Checkout Code uses: actions/checkout@v4 @@ -474,27 +576,6 @@ jobs: deps: testing amd: 'true' llvm: 'true' - - name: Check Device.DEFAULT and print some source (AMD) - env: - PYTHONPATH: ${{ github.workspace }} - MOCKGPU: 1 - AMD: 1 - FORWARD_ONLY: 1 - run: | - python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'AMD', Device.DEFAULT" - DEBUG=5 python3 test/test_ops.py TestOps.test_add - - name: Check Device.DEFAULT and print some source (LLVM) - env: - LLVM: 1 - run: | - python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'LLVM', Device.DEFAULT" - DEBUG=5 python3 test/test_ops.py TestOps.test_add - - name: Check Device.DEFAULT and print some source (CLANG) - env: - CLANG: 1 - run: | - python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'CLANG', Device.DEFAULT" - DEBUG=5 python3 test/test_ops.py TestOps.test_add - name: Run pytest (amd) env: MOCKGPU: 1 @@ -508,17 +589,18 @@ jobs: run: | python3 -m pytest -n=auto test/ --ignore=test/unit --durations=20 ! 
(DEBUG=7 python3 test/test_zero_copy.py 2>&1 || true) | grep -E '^0x.*[^0](x18|w18).*$' - - name: Run pytest (clang) - env: - CLANG: 1 - run: | - python3 -m pytest -n=auto test/ --ignore=test/unit --durations=20 - ! (DEBUG=7 python3 test/test_zero_copy.py 2>&1 || true) | grep -E '^0x.*[^0](x18|w18).*$' + +# ****** Windows Tests ****** wintests: - name: Tests on Windows (llvm+clang) + strategy: + fail-fast: false + matrix: + backend: [llvm, clang] + + name: Windows (${{ matrix.backend }}) runs-on: windows-latest - timeout-minutes: 45 + timeout-minutes: 10 steps: - name: Checkout Code uses: actions/checkout@v4 @@ -527,9 +609,8 @@ jobs: with: key: windows-minimal deps: testing_minimal - - name: Run pytest (llvm) + - name: Set env + run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1'}}" + - name: Run pytest (${{ matrix.backend }}) shell: bash - run: LLVM=1 python -m pytest -n=auto test/test_tiny.py test/test_ops.py --durations=20 - - name: Run pytest (clang) - shell: bash - run: CLANG=1 python -m pytest -n=auto test/test_tiny.py test/test_ops.py --durations=20 + run: python -m pytest -n=auto test/test_tiny.py test/test_ops.py --durations=20
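
Note on the per-backend toggle used by the new Windows matrix above: the job fans out over backend: [llvm, clang], and the "Set env" step turns the matrix value into an environment variable that the shared pytest step reads. On GitHub Actions, a variable only reaches later steps of a job if the NAME=value pair is appended to the file that $GITHUB_ENV points to. Below is a minimal sketch of that pattern, assuming bash and the standard $GITHUB_ENV mechanism; it is illustrative only and not necessarily the exact upstream step.

      # pick LLVM=1 or CLANG=1 from the matrix entry and export it to later steps in this job
      - name: Set env
        shell: bash
        run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' }}\n" >> "$GITHUB_ENV"
      # subsequent steps now see LLVM=1 (or CLANG=1) without hard-coding a backend per job
      - name: Run pytest (${{ matrix.backend }})
        shell: bash
        run: python -m pytest -n=auto test/test_tiny.py test/test_ops.py --durations=20

Keeping one step list and letting the matrix assign each backend its own runner is the mechanism this commit uses to spread the suite across more machines and lower the per-job timeouts.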