From a0965ee1988cb4bfd08e829d4cff1829fa591ff0 Mon Sep 17 00:00:00 2001
From: cheeetoo <102839837+cheeetoo@users.noreply.github.com>
Date: Sun, 23 Jul 2023 15:00:56 -0500
Subject: [PATCH] CI < 5 minutes (#1252)

* models matrix

* fix typo and install gpu deps

* install llvm deps if needed

* fix

* testops with cuda

* remove pip cache since not work

* cuda env

* install cuda deps

* maybe it will work now

* i can't read

* all tests in matrix

* trim down more

* opencl stuff in matrix

* opencl pip cache

* test split

* change cuda test exclusion

* test

* fix cuda maybe

* add models

* add more n=auto

* third thing

* fix bug

* cache pip more

* change name

* update tests

* try again cause why not

* balance

* try again...

* try apt cache for cuda

* try on gpu:

* try cuda again

* update packages step

* replace libz-dev with zlib1g-dev

* only cache cuda

* why error

* fix gpuocelot bug

* apt cache err

* apt cache to slow?

* opt and image in single runner

* add a couple n=autos

* remove test matrix

* try cuda apt cache again

* libz-dev -> zlib1g-dev

* remove -s since not supported by xdist

* the cache takes too long and doesn't work

* combine webgpu and metal tests

* combine imagenet to c and cpu tests

* torch tests with linters

* torch back by itself

* small windows clang test with torch tests

* fix a goofy windows bug

* im dumb

* bro

* clang with linters

* fix pylint error

* linter not work on windows

* try with clang again

* clang and imagenet?

* install deps

* fix

* fix quote

* clang by itself (windows too slow)

* env vars for imagenet

* cache pip for metal and webgpu tests

* try torch with metal and webgpu

* doesn't work, too long

* remove -v

* try -n=logical

* don't use logical

* revert accidental thing

* remove some prints unless CI

* fix print unless CI

* ignore speed tests for slow tests

* clang windows in matrix (ubuntu being tested in imagenet->c test)

* try manual pip cache

* fix windows pip cache path

* all manual pip cache

* fix pip cache dir for macos

* print_ci function in helpers

* CI as variable, no print_ci

* missed one

* cuda tests with docker image

* remove setup-python action for cuda

* python->python3?

* remove -s -v

* try fix pip cache

* maybe fix

* try to fix pip cache

* is this the path?

* maybe cache pip

* try again

* create wheels dir

* ?

* cuda pip deps in dockerfile

* disable pip cache for clang

* image from ghcr instead of docker hub

* why is clang like this

* fast deps

* try use different caches

* remove the fast thing

* try with lighter image

* remove setup python for cuda

* small docker and cuda fast deps

* ignore a few more tests

* cool docker thing (maybe)

* oops

* quotes

* fix docker command

* fix bug

* ignore train efficientnet test

* remove dockerfile (docker stuff takes too long)

* remove docker stuff and normal cuda

* oops

* ignore the tests for cuda

* does this work

* ignore test_train on slow backends

* add space

* llvm ignore same tests as cuda

* nvm

* ignore lr scheduler tests

* get some stats

* fix ignore bug

* remove extra '

* remove and

* ignore test for llvm

* change ignored tests and durationon all backends

* fix

* and -> or

* ignore some more cuda tests

* finally?

* does this fix it

* remove durations=0

* add some more tests to llvm

* make last pytest more readable

* fix

* don't train efficientnet on cpu

* try w/out pip cache

* pip cache seems to be generally better

* pytest file markers

* try apt fast for cuda

* use quick install for apt-fast

* apt-fast not worth

* apt-get to apt

* fix typo

* suppress warnings

* register markers

* disable debug on fuzz tests

* change marker names

* apt update and apt install in one command

* update marker names in test.yml

* webgpu pytest marker
---
 .github/workflows/test.yml            | 346 +++++++++++---------------
 pytest.ini                            |   2 +
 test/external/external_test_yolov8.py |   2 +-
 test/extra/test_lr_scheduler.py       |   3 +
 test/models/test_end2end.py           |  11 +-
 test/models/test_mnist.py             |   3 +
 test/models/test_onnx.py              |  30 ++-
 test/models/test_train.py             |   3 +
 test/test_assign.py                   |   3 +
 test/test_conv.py                     |   3 +
 test/test_conv_shapetracker.py        |   3 +
 test/test_custom_function.py          |   3 +
 test/test_dtype.py                    |   2 +-
 test/test_jit.py                      |   3 +
 test/test_net_speed.py                |   3 +
 test/test_nn.py                       |   3 +
 test/test_ops.py                      |  13 +-
 test/test_optim.py                    |   3 +
 test/test_randomness.py               |   3 +
 test/test_specific_conv.py            |   3 +
 test/test_speed_v_torch.py            |   7 +-
 test/test_tensor.py                   |   3 +
 test/unit/test_example.py             |   8 +-
 23 files changed, 237 insertions(+), 226 deletions(-)
 create mode 100644 pytest.ini

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 83f137e024..be1076926b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -18,6 +18,11 @@ jobs:
       uses: actions/setup-python@v4
       with:
         python-version: 3.8
+    - name: Cache pip
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pip
+        key: linting
     - name: Install dependencies
       run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
     - name: Repo line count
@@ -31,12 +36,12 @@ jobs:
     - name: Run mypy
       run: mypy tinygrad/ --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
     - name: Install SLOCCount
-      run: sudo apt-get install sloccount
+      run: sudo apt install sloccount
     - name: Check <5000 lines
       run: sloccount tinygrad test examples extra; if [ $(sloccount tinygrad | sed -n 's/.*Total Physical Source Lines of Code (SLOC)[ ]*= \([^ ]*\).*/\1/p' | tr -d ',') -gt 5000 ]; then exit 1; fi
 
-  testcpu:
-    name: CPU Tests
+  testcpuimagenet:
+    name: CPU and ImageNet to C Tests
     runs-on: ubuntu-latest
     timeout-minutes: 20
 
@@ -47,6 +52,11 @@ jobs:
       uses: actions/setup-python@v4
       with:
         python-version: 3.8
+    - name: Cache pip
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pip
+        key: testing
     - name: Install Dependencies
       run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
     - name: Test Docs
@@ -54,49 +64,11 @@ jobs:
     - name: Test Quickstart
       run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && PYTHONPATH=. python3 quickstart.py
     - name: Run Pytest
-      run: python -m pytest -s -v -n=auto test/
+      run: python -m pytest -n=auto test/ -k "not (test_efficientnet and models/test_train.py)"
     - name: Fuzz Test symbolic
-      run: DEBUG=1 python test/external/fuzz_symbolic.py
+      run: python test/external/fuzz_symbolic.py
     - name: Fuzz Test shapetracker
-      run: PYTHONPATH="." DEBUG=1 python test/external/fuzz_shapetracker.py
-  
-  testwebgpu:
-    name: WebGPU Tests
-    runs-on: macos-13
-
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.8
-    - name: Install Dependencies
-      run: pip install -e '.[testing,webgpu]' --extra-index-url https://download.pytorch.org/whl/cpu
-    # - name: Set Env
-    #   run: printf "WEBGPU=1\nWGPU_BACKEND_TYPE=D3D12\n" >> $GITHUB_ENV
-    - name: Run Pytest
-      run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -s -v -n=auto test/test_ops.py test/test_speed_v_torch.py test/test_nn.py test/test_jit.py test/test_randomness.py test/test_tensor.py test/test_assign.py test/test_conv.py test/test_nn.py test/test_custom_function.py test/test_conv_shapetracker.py
-    - name: Build WEBGPU Efficientnet
-      run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.webgpu.compile_webgpu
-    # - name: Install Puppeteer
-    #   run: npm install puppeteer
-    # - name: Run Efficientnet
-    #   run: node test/test_webgpu.js
-  testimagenet:
-    name: ImageNet to C Compile Test
-    runs-on: ubuntu-latest
-    timeout-minutes: 20
-
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.8
-    - name: Install Dependencies
-      run: pip install -e .
+      run: PYTHONPATH="." python test/external/fuzz_shapetracker.py
     - name: Compile EfficientNet to C
       run: PYTHONPATH="." CLANG=1 python3 examples/compile_efficientnet.py > recognize.c
     - name: Compile C to native
@@ -104,44 +76,6 @@ jobs:
     - name: Test EfficientNet
       run: curl https://media.istockphoto.com/photos/hen-picture-id831791190 | ./recognize | grep hen
 
-  testllvm:
-    name: LLVM Tests
-    runs-on: ubuntu-latest
-    timeout-minutes: 20
-
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.8
-    - name: Install Dependencies
-      run: pip install -e '.[llvm,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Run Pytest
-      run: ENABLE_METHOD_CACHE=1 LLVM=1 python -m pytest -s -v -n=auto test/
-
-  testclang:
-    strategy:
-      matrix:
-        os: [ubuntu-latest, windows-latest]
-    runs-on: ${{ matrix.os }}
-    name: CLANG Tests ${{ matrix.os }} (w method cache)
-
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.8
-    - name: Install Dependencies
-      run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Set env
-      run: printf "CI=1\nCLANG=1\nENABLE_METHOD_CACHE=1" >> $GITHUB_ENV
-    - name: Run Pytest
-      run: python -m pytest -s -v -n=auto test/
-
   testtorch:
     name: Torch Tests
     runs-on: ubuntu-latest
@@ -154,79 +88,72 @@ jobs:
       uses: actions/setup-python@v4
       with:
         python-version: 3.8
+    - name: Cache pip
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pip
+        key: testing
     - name: Install Dependencies
       run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
     - name: Run Pytest
-      run: TORCH=1 python -m pytest -s -v -n=auto test/
+      run: TORCH=1 python -m pytest -n=auto test/
     - name: Run ONNX
-      run: TORCH=1 python -m pytest test/external/external_test_onnx_backend.py --tb=no --disable-warnings || true
-
-  testgpu:
-    name: GPU Tests
-    runs-on: ubuntu-20.04
-    timeout-minutes: 20
-
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Update packages
-      run: |
-        wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
-        echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
-        sudo apt-get update
-    - name: Install OpenCL
-      #run: sudo apt-get install -y pocl-opencl-icd
-      run: sudo apt-get install -y intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.8
-    - name: Install Dependencies
-      run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Run Optimizer Test (OPT 2 and 3)
-      run: |
-        PYTHONPATH="." OPT=2 GPU=1 python test/external/external_test_opt.py
-        PYTHONPATH="." OPT=3 GPU=1 python test/external/external_test_opt.py
-    - name: Run Pytest (default)
-      run: GPU=1 python -m pytest -s -v -n=auto test/
+      run: TORCH=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --tb=no --disable-warnings || true
 
   testopencl:
-    name: openpilot (OpenCL) Test
+    strategy:
+      matrix:
+        task: [optimage, openpilot]
+    name: ${{ matrix.task=='optimage'&&'GPU OPT and IMAGE Tests'||'openpilot (OpenCL) Tests'}}
     runs-on: ubuntu-20.04
     timeout-minutes: 20
 
     steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Update packages
-      run: |
-        wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
-        echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
-        sudo apt-get update
-    - name: Install OpenCL
-      #run: sudo apt-get install -y pocl-opencl-icd
-      run: sudo apt-get install -y intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.8
-    - name: Install Dependencies
-      run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Test openpilot model compile and size
-      run: |
-        DEBUG=2 ALLOWED_KERNEL_COUNT=199 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py
-        python3 -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000'
-    - name: Test GPU IMAGE ops
-      run: |
-        GPU=1 IMAGE=1 python3 test/test_ops.py
-        FORWARD_ONLY=1 GPU=1 IMAGE=2 python3 test/test_ops.py
-    - name: Test openpilot model correctness (float32)
-      run: DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py
-    - name: Test tensor core ops
-      run: GPU=1 TC=2 python3 test/test_ops.py
+      - name: Checkout Code
+        uses: actions/checkout@v3
+      - name: Update packages
+        run: |
+          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+          echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+          sudo apt update
+      - name: Install OpenCL
+        #run: sudo apt-get install -y pocl-opencl-icd
+        run: sudo apt install -y intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+      - name: Cache pip
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pip
+          key: testing
+      - name: Install Dependencies
+        run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
+      - if: ${{ matrix.task == 'optimage' }}
+        name: Run Optimizer Test (OPT 2 and 3)
+        run: |
+          PYTHONPATH="." OPT=2 GPU=1 python -m pytest -n=auto test/external/external_test_opt.py
+          PYTHONPATH="." OPT=3 GPU=1 python -m pytest -n=auto test/external/external_test_opt.py
+      - if: ${{ matrix.task == 'optimage'}}
+        name: Test GPU IMAGE ops
+        run: |
+          GPU=1 IMAGE=1 python3 -m pytest -n=auto test/test_ops.py
+          FORWARD_ONLY=1 GPU=1 IMAGE=2 python3 -m pytest -n=auto test/test_ops.py
+      - if: ${{ matrix.task == 'openpilot' }}
+        name: Test openpilot model compile and size
+        run: |
+          DEBUG=2 ALLOWED_KERNEL_COUNT=199 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py
+          python3 -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000'
+      - if: ${{ matrix.task == 'openpilot' }}
+        name: Test openpilot model correctness (float32)
+        run: DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py
+      - if: ${{ matrix.task == 'openpilot' }}
+        name: Test tensor core ops
+        run: GPU=1 TC=2 python3 -m pytest -n=auto test/test_ops.py
 
-  testmetal:
-    name: Metal Tests
+  testmetalwebgpu:
+    name: Metal and WebGPU Tests
     runs-on: macos-13
     timeout-minutes: 20
 
@@ -237,19 +164,27 @@ jobs:
       uses: actions/setup-python@v4
       with:
         python-version: 3.11
+    - name: Cache pip
+      uses: actions/cache@v3
+      with:
+        path: ~/Library/Caches/pip
+        key: metalwebgpu
     - name: Install Dependencies
-      run: pip install -e '.[metal,testing]'
+      run: pip install -e '.[metal,webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
     - name: Test LLaMA compile speed
       run: PYTHONPATH="." METAL=1 python3 test/external/external_test_speed_llama.py
     #- name: Run dtype test
     #  run: DEBUG=4 METAL=1 python -m pytest test/test_dtype.py
     # dtype test has issues on test_half_to_int8
-    - name: Run ops test
+    - name: Run metal ops test
       run: DEBUG=2 METAL=1 python -m pytest test/test_ops.py
     - name: Run JIT test
       run: DEBUG=2 METAL=1 python -m pytest test/test_jit.py
     # TODO: why not testing the whole test/?
-
+    - name: Run webgpu pytest
+      run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto -m 'webgpu'
+    - name: Build WEBGPU Efficientnet
+      run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.webgpu.compile_webgpu
 
   testdocker:
     name: Docker Test
@@ -264,58 +199,73 @@ jobs:
     - name: Test Docker
       run: docker run --rm tinygrad /usr/bin/env python3 -c "from tinygrad.tensor import Tensor; print(Tensor.eye(3).numpy())"
 
+  tests:
+    strategy:
+      matrix:
+        backend: [llvm, clang, gpu, cuda]
 
-  testcuda:
-    name: (emulated) cuda test
-    runs-on: ubuntu-22.04
+    name: Tests on (${{ matrix.backend }})
+    runs-on: ${{ matrix.backend == 'gpu'  && 'ubuntu-20.04' || matrix.backend=='clang'&&'windows-latest'|| 'ubuntu-latest' }}
     timeout-minutes: 20
 
     steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Update packages
-      run: |
-        export DEBIAN_FRONTEND=noninteractive
-        sudo apt-get update -y
-    - name: Install packages
-      run: sudo apt-get install -y --no-install-recommends git g++ cmake ninja-build llvm-15-dev libz-dev libglew-dev flex bison libfl-dev libboost-thread-dev libboost-filesystem-dev nvidia-cuda-toolkit-gcc
-    - name: Cache gpuocelot
-      id: cache-build
-      uses: actions/cache@v3
-      env:
-        cache-name: cache-gpuocelot-build
-      with:
-        path: ${{ github.workspace }}/gpuocelot/ocelot/
-        key: ubuntu22.04-gpuocelot-19626fc00b6ee321638c3111074269c69050e091
-        restore-keys: |
-          ubuntu22.04-gpuocelot-19626fc00b6ee321638c3111074269c69050e091
-    - if: ${{ steps.cache-build.outputs.cache-hit != 'true' }}
-      name: Clone gpuocelot
-      uses: actions/checkout@v3
-      with:
-        repository: gpuocelot/gpuocelot
-        ref: 19626fc00b6ee321638c3111074269c69050e091
-        path: ${{ github.workspace }}/gpuocelot
-        submodules: true
-    - if: ${{ steps.cache-build.outputs.cache-hit != 'true' }}
-      name: Compile gpuocelot
-      run: |
-        cd ${{ github.workspace }}/gpuocelot/ocelot
-        mkdir build
-        cd build
-        cmake .. -Wno-dev -G Ninja -DOCELOT_BUILD_TOOLS=OFF
-        ninja
-    - name: Install gpuocelot
-      run: |
-        cd ${{ github.workspace }}/gpuocelot/ocelot/build
-        sudo ninja install
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.8
-        cache: 'pip'
-        cache-dependency-path: setup.py
-    - name: Install tinygrad dependencies
-      run: pip install -e '.[testing, cuda]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Run pytest
-      run: FORWARD_ONLY=1 JIT=1 OPT=2 CUDA=1 CUDACPU=1 python -m pytest -s -v -n=auto test --ignore=test/external --ignore=test/models --ignore=test/test_speed_v_torch.py --ignore=test/test_specific_conv.py --ignore=test/test_net_speed.py --ignore=test/test_nn.py -k "not half"
+      - name: Checkout Code
+        uses: actions/checkout@v3
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+      - name: Cache pip
+        uses: actions/cache@v3
+        with:
+          path: ${{ matrix.backend=='clang'&&'~\AppData\Local\pip\cache'||'~/.cache/pip' }}
+          key: ${{ matrix.backend }}
+      - name: Set env
+        run: printf "${{ matrix.backend == 'llvm' && 'ENABLE_METHOD_CACHE=1\nLLVM=1' || matrix.backend == 'clang' && 'CLANG=1\nENABLED_METHOD_CACHE=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n'}}" >> $GITHUB_ENV
+      - name: Install packages (gpu)
+        if: matrix.backend == 'gpu'
+        run: |
+          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+          echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+          sudo apt update && \
+          sudo apt install -y intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl 
+      - name: Install packages (cuda)
+        if: matrix.backend == 'cuda'
+        run: |
+          export DEBIAN_FRONTEND=noninteractive
+          sudo apt update -y && \
+          sudo apt install -y --no-install-recommends git g++ cmake ninja-build llvm-15-dev zlib1g-dev libglew-dev flex bison libfl-dev libboost-thread-dev libboost-filesystem-dev nvidia-cuda-toolkit-gcc
+      - name: Cache gpuocelot
+        if: matrix.backend == 'cuda'
+        id: cache-build
+        uses: actions/cache@v3
+        env:
+          cache-name: cache-gpuocelot-build
+        with:
+          path: ${{ github.workspace }}/gpuocelot/ocelot/
+          key: ubuntu22.04-gpuocelot-19626fc00b6ee321638c3111074269c69050e091
+          restore-keys: |
+            ubuntu22.04-gpuocelot-19626fc00b6ee321638c3111074269c69050e091
+      - name: Clone/compile gpuocelot
+        if: matrix.backend == 'cuda' && steps.cache-build.outputs.cache-hit != 'true'
+        run: |
+          git clone --recurse-submodules https://github.com/gpuocelot/gpuocelot.git ${{ github.workspace }}/gpuocelot
+          cd ${{ github.workspace }}/gpuocelot/ocelot
+          git checkout 19626fc00b6ee321638c3111074269c69050e091
+          mkdir build
+          cd build
+          cmake .. -Wno-dev -G Ninja -DOCELOT_BUILD_TOOLS=OFF
+          ninja
+      - name: Install gpuocelot
+        if: matrix.backend == 'cuda'
+        run: |
+          cd ${{ github.workspace }}/gpuocelot/ocelot/build
+          sudo ninja install
+      - name: Install dependencies
+        run: pip install -e '.[testing${{matrix.backend=='llvm'&&',llvm'||matrix.backend=='cuda'&&',cuda'||''}}]' --extra-index-url https://download.pytorch.org/whl/cpu
+      - name: Run pytest (not cuda)
+        if: matrix.backend!='cuda'
+        run: python -m pytest -n=auto test/ -k '${{matrix.backend=='llvm'&&'not (test_nn.py and test_conv_transpose2d)'||'test'}}' -m 'not exclude_${{matrix.backend}}'
+      - name: Run pytest (cuda)
+        if: matrix.backend=='cuda'
+        run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
\ No newline at end of file
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000000..0f6e52a427
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+markers = ['exclude_cuda', 'exclude_gpu', 'exclude_clang', 'webgpu']
\ No newline at end of file
diff --git a/test/external/external_test_yolov8.py b/test/external/external_test_yolov8.py
index 2053f2203d..e250c59c2e 100644
--- a/test/external/external_test_yolov8.py
+++ b/test/external/external_test_yolov8.py
@@ -73,5 +73,5 @@ class TestYOLOv8(unittest.TestCase):
     np.testing.assert_allclose(onnx_output[0], tiny_output.cpu().numpy(), atol=5e-4, rtol=0.025)
     
 if __name__ == '__main__':
-    unittest.main()
+  unittest.main()
     
\ No newline at end of file
diff --git a/test/extra/test_lr_scheduler.py b/test/extra/test_lr_scheduler.py
index 1e39c3d4e0..283652b48a 100644
--- a/test/extra/test_lr_scheduler.py
+++ b/test/extra/test_lr_scheduler.py
@@ -7,6 +7,9 @@ from tinygrad.nn.optim import Adam
 from extra.lr_scheduler import MultiStepLR, ReduceLROnPlateau, CosineAnnealingLR, OneCycleLR
 from extra.training import train, evaluate
 from extra.datasets import fetch_mnist
+import pytest
+
+pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu]
 
 np.random.seed(1337)
 Tensor.manual_seed(1337)
diff --git a/test/models/test_end2end.py b/test/models/test_end2end.py
index 09a7f0ed19..b206e50bc7 100644
--- a/test/models/test_end2end.py
+++ b/test/models/test_end2end.py
@@ -6,13 +6,14 @@ from tinygrad.state import get_parameters, get_state_dict
 from tinygrad.nn import optim, Linear, Conv2d, BatchNorm2d
 from tinygrad.tensor import Tensor
 from extra.datasets import fetch_mnist
+from tinygrad.helpers import CI
 
 def compare_tiny_torch(model, model_torch, X, Y):
   Tensor.training = True
   model_torch.train()
   model_state_dict = get_state_dict(model)
   for k,v in model_torch.named_parameters():
-    print(f"initting {k} from torch")
+    if not CI: print(f"initting {k} from torch")
     model_state_dict[k].assign(Tensor(v.detach().numpy())).realize()
 
   optimizer = optim.SGD(get_parameters(model), lr=0.01)
@@ -23,11 +24,11 @@ def compare_tiny_torch(model, model_torch, X, Y):
 
   out = model(X)
   loss = (out * Y).mean()
-  print(loss.realize().numpy())
+  if not CI: print(loss.realize().numpy())
 
   out_torch = model_torch(torch.Tensor(X.numpy()))
   loss_torch = (out_torch * torch.Tensor(Y.numpy())).mean()
-  print(loss_torch.detach().numpy())
+  if not CI: print(loss_torch.detach().numpy())
 
   # assert losses match
   np.testing.assert_allclose(loss.realize().numpy(), loss_torch.detach().numpy(), atol=1e-4)
@@ -41,7 +42,7 @@ def compare_tiny_torch(model, model_torch, X, Y):
   for k,v in list(model_torch.named_parameters())[::-1]:
     g = model_state_dict[k].grad.numpy()
     gt = v.grad.detach().numpy()
-    print("testing grads", k)
+    if not CI: print("testing grads", k)
     np.testing.assert_allclose(g, gt, atol=1e-3, err_msg=f'grad mismatch {k}')
 
   # take the steps
@@ -50,7 +51,7 @@ def compare_tiny_torch(model, model_torch, X, Y):
 
   # assert weights match (they don't!)
   for k,v in model_torch.named_parameters():
-    print("testing weight", k)
+    if not CI: print("testing weight", k)
     np.testing.assert_allclose(model_state_dict[k].numpy(), v.detach().numpy(), atol=1e-3, err_msg=f'weight mismatch {k}')
 
 def get_mnist_data():
diff --git a/test/models/test_mnist.py b/test/models/test_mnist.py
index e990f3a0e5..fca4c85084 100644
--- a/test/models/test_mnist.py
+++ b/test/models/test_mnist.py
@@ -6,6 +6,9 @@ from tinygrad.tensor import Tensor, Device
 from tinygrad.nn import optim, BatchNorm2d
 from extra.training import train, evaluate
 from extra.datasets import fetch_mnist
+import pytest
+
+pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
 
 # load the mnist dataset
 X_train, Y_train, X_test, Y_test = fetch_mnist()
diff --git a/test/models/test_onnx.py b/test/models/test_onnx.py
index e09be76f00..156dd52131 100644
--- a/test/models/test_onnx.py
+++ b/test/models/test_onnx.py
@@ -8,6 +8,10 @@ import onnx
 from extra.utils import fetch, temp
 from extra.onnx import get_run_onnx
 from tinygrad.tensor import Tensor
+from tinygrad.helpers import CI
+import pytest
+
+pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
 
 def run_onnx_torch(onnx_model, inputs):
   import torch
@@ -48,22 +52,24 @@ class TestOnnxModel(unittest.TestCase):
       mt2 = time.monotonic()
       tinygrad_out = tinygrad_out.numpy()
       et = time.monotonic()
-      print(f"ran openpilot model in {(et-st)*1000.0:.2f} ms, waited {(mt2-mt)*1000.0:.2f} ms for realize, {(et-mt2)*1000.0:.2f} ms for GPU queue")
+      if not CI: print(f"ran openpilot model in {(et-st)*1000.0:.2f} ms, waited {(mt2-mt)*1000.0:.2f} ms for realize, {(et-mt2)*1000.0:.2f} ms for GPU queue")
 
-    import cProfile
-    import pstats
-    inputs = get_inputs()
-    pr = cProfile.Profile(timer=time.perf_counter_ns, timeunit=1e-6)
-    pr.enable()
+    if not CI:
+      import cProfile
+      import pstats
+      inputs = get_inputs()
+      pr = cProfile.Profile(timer=time.perf_counter_ns, timeunit=1e-6)
+      pr.enable()
     tinygrad_out = run_onnx(inputs)['outputs']
     tinygrad_out.realize()
     tinygrad_out = tinygrad_out.numpy()
-    pr.disable()
-    stats = pstats.Stats(pr)
-    stats.dump_stats(temp("net.prof"))
-    os.system(f"flameprof {temp('net.prof')} > {temp('prof.svg')}")
-    ps = stats.sort_stats(pstats.SortKey.TIME)
-    ps.print_stats(30)
+    if not CI:
+      pr.disable()
+      stats = pstats.Stats(pr)
+      stats.dump_stats(temp("net.prof"))
+      os.system(f"flameprof {temp('net.prof')} > {temp('prof.svg')}")
+      ps = stats.sort_stats(pstats.SortKey.TIME)
+      ps.print_stats(30)
 
   def test_openpilot_model(self):
     dat = fetch(OPENPILOT_MODEL)
diff --git a/test/models/test_train.py b/test/models/test_train.py
index 6ea30ab0c8..3f58358564 100644
--- a/test/models/test_train.py
+++ b/test/models/test_train.py
@@ -11,6 +11,9 @@ from models.efficientnet import EfficientNet
 from models.transformer import Transformer
 from models.vit import ViT
 from models.resnet import ResNet18
+import pytest
+
+pytestmark = pytest.mark.exclude_gpu
 
 BS = getenv("BS", 2)
 
diff --git a/test/test_assign.py b/test/test_assign.py
index d979901cb5..37b322168c 100644
--- a/test/test_assign.py
+++ b/test/test_assign.py
@@ -5,6 +5,9 @@ from tinygrad.tensor import Tensor
 from tinygrad.lazy import LAZY
 from tinygrad.ops import GlobalCounters
 from tinygrad.graph import nm
+import pytest
+
+pytestmark = pytest.mark.webgpu
 
 N = 200  # has to be bigger than the cache to fail
 
diff --git a/test/test_conv.py b/test/test_conv.py
index 433a705345..8042754b2f 100644
--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -1,6 +1,9 @@
 import unittest
 import numpy as np
 from tinygrad.tensor import Tensor
+import pytest
+
+pytestmark = [pytest.mark.exclude_cuda, pytest.mark.webgpu]
 
 class TestConv(unittest.TestCase):
   def test_simple(self):
diff --git a/test/test_conv_shapetracker.py b/test/test_conv_shapetracker.py
index 7975d3aebb..1a1219b217 100644
--- a/test/test_conv_shapetracker.py
+++ b/test/test_conv_shapetracker.py
@@ -3,6 +3,9 @@ import unittest
 from tinygrad.tensor import Tensor, Device
 from tinygrad.nn import Conv2d
 from tinygrad.ops import GlobalCounters
+import pytest
+
+pytestmark = pytest.mark.webgpu
 
 #@unittest.skipUnless(Device.DEFAULT == "GPU", "Only GPU supports cache")
 @unittest.skip("with JIT changes, you only get the raw buffer")
diff --git a/test/test_custom_function.py b/test/test_custom_function.py
index 2583ff79c8..e35a53c02b 100644
--- a/test/test_custom_function.py
+++ b/test/test_custom_function.py
@@ -11,6 +11,9 @@ from tinygrad.helpers import prod, dtypes
 from tinygrad.lazy import LazyBuffer, create_lazybuffer, Device
 from tinygrad.ops import ASTRunner
 from tinygrad.shape.shapetracker import ShapeTracker
+import pytest
+
+pytestmark = pytest.mark.webgpu
 
 # we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer
 def atan2_gpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
diff --git a/test/test_dtype.py b/test/test_dtype.py
index 301c39e0a7..608a959b47 100644
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@@ -1,6 +1,6 @@
 import unittest
 import numpy as np
-from tinygrad.helpers import getenv, DType, DEBUG
+from tinygrad.helpers import getenv, DType, DEBUG, CI
 from tinygrad.lazy import Device
 from tinygrad.tensor import Tensor, dtypes
 from typing import List, Optional
diff --git a/test/test_jit.py b/test/test_jit.py
index fa53e44d02..6c1d929d0d 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -3,6 +3,9 @@ import unittest
 import numpy as np
 from tinygrad.tensor import Tensor, Device
 from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
+import pytest
+
+pytestmark = pytest.mark.webgpu
 
 # NOTE: METAL fails, might be platform and optimization options dependent.
 @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["METAL", "WEBGPU"], f"no JIT on {Device.DEFAULT}")
diff --git a/test/test_net_speed.py b/test/test_net_speed.py
index e382783505..69675b1081 100644
--- a/test/test_net_speed.py
+++ b/test/test_net_speed.py
@@ -5,6 +5,9 @@ import pstats
 import unittest
 import torch
 from tinygrad.tensor import Tensor, Device
+import pytest
+
+pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
 
 def start_profile():
   import time
diff --git a/test/test_nn.py b/test/test_nn.py
index c31d780bec..dc91446550 100755
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -7,6 +7,9 @@ from tinygrad.jit import TinyJit
 from tinygrad.tensor import Tensor, Device
 from tinygrad.nn import BatchNorm2d, Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, GroupNorm, LayerNorm, LayerNorm2d, Embedding, InstanceNorm
 import torch
+import pytest
+
+pytestmark = [pytest.mark.exclude_cuda, pytest.mark.webgpu]
 
 class TestNN(unittest.TestCase):
 
diff --git a/test/test_ops.py b/test/test_ops.py
index efb04e6dd2..067f6d2198 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -4,8 +4,15 @@ import math
 import numpy as np
 import unittest
 from tinygrad.tensor import Tensor
-from tinygrad.helpers import getenv, IMAGE, DEBUG
+from tinygrad.helpers import getenv, IMAGE, DEBUG, CI
 from tinygrad.lazy import Device
+import pytest
+
+pytestmark = pytest.mark.webgpu
+
+if CI:
+  import warnings
+  warnings.filterwarnings("ignore", message="Non-empty compiler output encountered")
 
 FORWARD_ONLY = getenv("FORWARD_ONLY", 0)
 PRINT_TENSORS = getenv("PRINT_TENSORS", 0)
@@ -49,7 +56,7 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn=None, atol=1e-6, rtol=1e-3, gra
     for i, (t, tt) in enumerate(zip(ts, tst)):
       compare(f"backward pass tensor {i}", tt.grad.numpy(), t.grad.detach().numpy(), atol=grad_atol, rtol=grad_rtol)
 
-  print("\ntesting %40r   torch/tinygrad fp: %.2f / %.2f ms  bp: %.2f / %.2f ms " % (shps, torch_fp*1000, tinygrad_fp*1000, torch_fbp*1000, tinygrad_fbp*1000), end="")
+  if not CI: print("\ntesting %40r   torch/tinygrad fp: %.2f / %.2f ms  bp: %.2f / %.2f ms " % (shps, torch_fp*1000, tinygrad_fp*1000, torch_fbp*1000, tinygrad_fbp*1000), end="")
 
 def prepare_test_op(a, b, shps, vals):
   torch.manual_seed(0)
@@ -68,7 +75,7 @@ class TestOps(unittest.TestCase):
     with self.assertRaises(expected) as tinygrad_cm:
       tinygrad_fxn(*tst)
     if exact: self.assertEqual(str(torch_cm.exception), str(tinygrad_cm.exception))
-    print("\ntesting %40r   torch/tinygrad exception: %s / %s" % (shps, torch_cm.exception, tinygrad_cm.exception), end="")
+    if not CI: print("\ntesting %40r   torch/tinygrad exception: %s / %s" % (shps, torch_cm.exception, tinygrad_cm.exception), end="")
 
   def test_full_like(self):
     a = Tensor([[1,2,3],[4,5,6]])
diff --git a/test/test_optim.py b/test/test_optim.py
index b9bef30a21..c973873fdc 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -5,6 +5,9 @@ import torch
 import unittest
 from tinygrad.tensor import Tensor
 from tinygrad.nn.optim import Adam, SGD, AdamW
+import pytest
+
+pytestmark = pytest.mark.exclude_cuda
 
 np.random.seed(1337)
 x_init = np.random.randn(1,4).astype(np.float32)
diff --git a/test/test_randomness.py b/test/test_randomness.py
index 91a4a38b05..3a3dce38b0 100644
--- a/test/test_randomness.py
+++ b/test/test_randomness.py
@@ -4,6 +4,9 @@ import numpy as np
 import torch
 from tinygrad.tensor import Tensor
 import tinygrad.nn as nn
+import pytest
+
+pytestmark = pytest.mark.webgpu
 
 # https://gist.github.com/devries/11405101
 def ksprob(a):
diff --git a/test/test_specific_conv.py b/test/test_specific_conv.py
index 1737a78d45..8e14ccd6e1 100644
--- a/test/test_specific_conv.py
+++ b/test/test_specific_conv.py
@@ -2,8 +2,11 @@ import unittest
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import dtypes
 from tinygrad.lazy import Device
+import pytest
 # similar to test/external/external_test_gpu_ast.py, but universal
 
+pytestmark = pytest.mark.exclude_cuda
+
 class TestSpecific(unittest.TestCase):
   # from openpilot
 
diff --git a/test/test_speed_v_torch.py b/test/test_speed_v_torch.py
index 5264f74826..3c07f1439f 100644
--- a/test/test_speed_v_torch.py
+++ b/test/test_speed_v_torch.py
@@ -14,8 +14,11 @@ from tinygrad.lazy import Device
 from tinygrad.ops import GlobalCounters
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Conv2d
-from tinygrad.helpers import colored, getenv, DEBUG
+from tinygrad.helpers import colored, getenv, DEBUG, CI
 from tinygrad.jit import TinyJit
+import pytest
+
+pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang, pytest.mark.webgpu]
 
 IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]
 
@@ -93,7 +96,7 @@ def helper_test_generic(name, f1, f1_args, f2, f2_args):
   desc = "faster" if et_torch > et_tinygrad else "slower"
   flops = save_ops*1e-6
   mem = save_mem*1e-6
-  print(f"\r{name:42s} {et_torch:7.2f} ms ({flops/et_torch:8.2f} GFLOPS {mem/et_torch:8.2f} GB/s) in torch, {et_tinygrad:7.2f} ms ({flops/et_tinygrad:8.2f} GFLOPS {mem/et_tinygrad:8.2f} GB/s) in tinygrad, {colorize_float(et_tinygrad/et_torch)} {desc} {flops:10.2f} MOPS {mem:8.2f} MB")
+  if not CI: print(f"\r{name:42s} {et_torch:7.2f} ms ({flops/et_torch:8.2f} GFLOPS {mem/et_torch:8.2f} GB/s) in torch, {et_tinygrad:7.2f} ms ({flops/et_tinygrad:8.2f} GFLOPS {mem/et_tinygrad:8.2f} GB/s) in tinygrad, {colorize_float(et_tinygrad/et_torch)} {desc} {flops:10.2f} MOPS {mem:8.2f} MB")
   np.testing.assert_allclose(val_tinygrad, val_torch, atol=1e-4, rtol=1e-3)
 
 def helper_test_conv(bs, in_chans, out_chans, kernel_size, img_size_y, img_size_x):
diff --git a/test/test_tensor.py b/test/test_tensor.py
index 5c745a6307..b903cf09ff 100644
--- a/test/test_tensor.py
+++ b/test/test_tensor.py
@@ -6,6 +6,9 @@ import itertools
 from tinygrad.tensor import Tensor, Device
 from tinygrad.helpers import dtypes
 from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
+import pytest
+
+pytestmark = pytest.mark.webgpu
 
 x_init = np.random.randn(1,3).astype(np.float32)
 U_init = np.random.randn(3,3).astype(np.float32)
diff --git a/test/unit/test_example.py b/test/unit/test_example.py
index db5e7a7aea..dc6567607d 100644
--- a/test/unit/test_example.py
+++ b/test/unit/test_example.py
@@ -2,22 +2,22 @@ import unittest
 import numpy as np
 from tinygrad.lazy import Device
 from tinygrad.tensor import Tensor
-from tinygrad.helpers import getenv
+from tinygrad.helpers import getenv, CI
 
 def multidevice_test(fxn):
   exclude_devices = getenv("EXCLUDE_DEVICES", "").split(",")
   def ret(self):
     for device in Device._buffers:
       if device in ["DISK", "FAKE"]: continue
-      print(device)
+      if not CI: print(device)
       if device in exclude_devices:
-        print(f"WARNING: {device} test is excluded")
+        if not CI: print(f"WARNING: {device} test is excluded")
         continue
       with self.subTest(device=device):
         try:
           Device[device]
         except Exception:
-          print(f"WARNING: {device} test isn't running")
+          if not CI: print(f"WARNING: {device} test isn't running")
           continue
         fxn(self, device)
   return ret