From 8af8808c61a97f1a7b4fb79b190bd517e43bba8b Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Tue, 19 Aug 2025 21:21:07 -0700
Subject: [PATCH] cleanup tests, bump caches (#11746)

---
 .github/actions/setup-tinygrad/action.yml     | 14 ++++++-------
 .github/workflows/benchmark.yml               |  6 +++---
 .github/workflows/test.yml                    | 10 +++++----
 test/{ => device}/test_ocl.py                 |  0
 .../external_benchmark_kernel_launch.py       |  2 +-
 .../external_test_copy_speed.py}              |  0
 .../external_test_device_speed.py}            |  0
 .../external_test_speed_v_torch.py            |  0
 test/test_compile_failures.py                 | 17 ++++++++++++++-
 test/test_disassembly.py                      | 21 -------------------
 10 files changed, 33 insertions(+), 37 deletions(-)
 rename test/{ => device}/test_ocl.py (100%)
 rename test/{test_copy_speed.py => speed/external_test_copy_speed.py} (100%)
 rename test/{test_device_speed.py => speed/external_test_device_speed.py} (100%)
 rename test/{external => speed}/external_test_speed_v_torch.py (100%)
 delete mode 100644 test/test_disassembly.py

diff --git a/.github/actions/setup-tinygrad/action.yml b/.github/actions/setup-tinygrad/action.yml
index e6e95248ef..051acada34 100644
--- a/.github/actions/setup-tinygrad/action.yml
+++ b/.github/actions/setup-tinygrad/action.yml
@@ -121,7 +121,7 @@ runs:
         echo 'Acquire::GzipIndexes "true";' | sudo tee /etc/apt/apt.conf.d/gzip
         echo 'Acquire::http::Pipeline-Depth "5";' | sudo tee -a /etc/apt/apt.conf.d/99parallel
         echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' | sudo tee -a /etc/apt/apt.conf.d/99keep-debs
-    
+
     - name: Add OpenCL Repo
       if: inputs.opencl == 'true' && runner.os == 'Linux'
       shell: bash
@@ -174,7 +174,7 @@ runs:
         if [[ "${{ inputs.llvm }}" == "true" ]]; then
           pkgs+=" libllvm20 clang-20 lld-20"
         fi
-        
+
         echo "pkgs=$pkgs" >> "$GITHUB_OUTPUT"
         echo "hash=$(echo -n "$pkgs" | sha256sum | cut -d' ' -f1)" >> "$GITHUB_OUTPUT"
 
@@ -183,21 +183,21 @@ runs:
       uses: actions/cache@v4
       with:
         path: /var/cache/apt/archives/
-        key: ${{ runner.os }}-apt-${{ steps.apt-pkgs.outputs.hash }}
+        key: ${{ runner.os }}-apt-${{ steps.apt-pkgs.outputs.hash }}-${{ env.APT_CACHE_VERSION }}
 
     - name: Run apt Update + Install
       if: runner.os == 'Linux' && (inputs.opencl == 'true' || inputs.amd == 'true' || inputs.cuda == 'true' || inputs.webgpu == 'true' || inputs.llvm == 'true')
       shell: bash
       run: |
         sudo apt -qq update || true
-        
+
         # ******** do install ********
         if [[ -n "${{ steps.apt-pkgs.outputs.pkgs }}" ]]; then
           sudo apt-get -y --allow-unauthenticated --no-install-recommends install ${{ steps.apt-pkgs.outputs.pkgs }}
         fi
-        
+
         sudo chown -R $USER:$USER /var/cache/apt/archives/
-    
+
     # **** AMD ****
     - name: Setup AMD (Linux)
       if: inputs.amd == 'true' && runner.os == 'Linux'
@@ -234,7 +234,7 @@ runs:
         cache-name: cache-gpuocelot-build
       with:
         path: ${{ github.workspace }}/gpuocelot/ocelot
-        key: ${{ runner.os }}-gpuocelot-b16039dc940dc6bc4ea0a98380495769ff35ed99-rebuild-0
+        key: ${{ runner.os }}-gpuocelot-b16039dc940dc6bc4ea0a98380495769ff35ed99-rebuild-${{ env.BUILD_CACHE_VERSION }}
     - name: Clone/compile gpuocelot
       if: inputs.ocelot == 'true' && steps.cache-build.outputs.cache-hit != 'true'
       shell: bash
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 9eb1248a9c..d01bb69314 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -63,7 +63,7 @@ jobs:
     - name: Run model inference benchmark
       run: METAL=1 python3.11 test/external/external_model_benchmark.py
     - name: Test speed vs torch
-      run: BIG=2 MPS=1 python3.11 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
+      run: BIG=2 MPS=1 python3.11 test/speed/external_test_speed_v_torch.py | tee torch_speed.txt
     - name: Test tensor cores
       run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
     - name: Test AMX tensor cores
@@ -187,7 +187,7 @@ jobs:
     - name: Run model inference benchmark
       run: NV=1 CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py
     - name: Test speed vs torch
-      run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
+      run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py | tee torch_speed.txt
     - name: Test speed vs theoretical
       run: NV=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
     - name: Test benchmark allreduce
@@ -389,7 +389,7 @@ jobs:
     #- name: Test speed vs torch
     #  run: |
     #    python3 -c "import torch; print(torch.__version__)"
-    #    LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
+    #    LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py | tee torch_speed.txt
     - name: Test speed vs theoretical
       run: AMD=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
     - name: Test tensor cores
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 73a951af3e..59e3b0ee1c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,8 +1,10 @@
 name: Unit Tests
 env:
   # increment this when downloads substantially change to avoid the internet
-  DOWNLOAD_CACHE_VERSION: '11'
-  PYTHON_CACHE_VERSION: '2'
+  DOWNLOAD_CACHE_VERSION: '12'
+  PYTHON_CACHE_VERSION: '3'
+  APT_CACHE_VERSION: '1'
+  BUILD_CACHE_VERSION: '1'
   CAPTURE_PROCESS_REPLAY: 1
   GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
@@ -30,9 +32,9 @@ jobs:
     - name: External Benchmark Schedule
       run: PYTHONPATH="." python3 test/external/external_benchmark_schedule.py
     - name: Speed Test
-      run: LLVM=1 python3 test/external/external_test_speed_v_torch.py
+      run: LLVM=1 python3 test/speed/external_test_speed_v_torch.py
     - name: Speed Test (BEAM=2)
-      run: BEAM=2 LLVM=1 python3 test/external/external_test_speed_v_torch.py
+      run: BEAM=2 LLVM=1 python3 test/speed/external_test_speed_v_torch.py
 
   docs:
     name: Docs
diff --git a/test/test_ocl.py b/test/device/test_ocl.py
similarity index 100%
rename from test/test_ocl.py
rename to test/device/test_ocl.py
diff --git a/test/external/external_benchmark_kernel_launch.py b/test/external/external_benchmark_kernel_launch.py
index 7011f7ab28..1369ddc0d5 100644
--- a/test/external/external_benchmark_kernel_launch.py
+++ b/test/external/external_benchmark_kernel_launch.py
@@ -2,7 +2,7 @@ import time
 from tinygrad import Tensor, TinyJit, Device, Context
 from tinygrad.helpers import Profiling, Timing, GlobalCounters
 
-# python3 test/external/external_test_speed_v_torch.py TestSpeed.test_add_a
+# python3 test/speed/external_test_speed_v_torch.py TestSpeed.test_add_a
 
 @TinyJit
 def plus(a:Tensor, b:Tensor): return a+b
diff --git a/test/test_copy_speed.py b/test/speed/external_test_copy_speed.py
similarity index 100%
rename from test/test_copy_speed.py
rename to test/speed/external_test_copy_speed.py
diff --git a/test/test_device_speed.py b/test/speed/external_test_device_speed.py
similarity index 100%
rename from test/test_device_speed.py
rename to test/speed/external_test_device_speed.py
diff --git a/test/external/external_test_speed_v_torch.py b/test/speed/external_test_speed_v_torch.py
similarity index 100%
rename from test/external/external_test_speed_v_torch.py
rename to test/speed/external_test_speed_v_torch.py
diff --git a/test/test_compile_failures.py b/test/test_compile_failures.py
index 5ab87c0b62..16559e907a 100644
--- a/test/test_compile_failures.py
+++ b/test/test_compile_failures.py
@@ -1,7 +1,10 @@
-import unittest
+import unittest, io
+from contextlib import redirect_stdout
 from tinygrad import Tensor, dtypes, Device
+from tinygrad.helpers import OSX
 from tinygrad.engine.realize import lower_schedule
 from tinygrad.device import is_dtype_supported
+from tinygrad.engine.realize import get_program
 
 class TestCompileFailures(unittest.TestCase):
   def compile(self, out:Tensor):
@@ -14,5 +17,17 @@ class TestCompileFailures(unittest.TestCase):
   def test_add_max_uchar(self):
     self.compile((Tensor.empty(1024, dtype='uint8') + Tensor.empty(1024, dtype='uint8')).max())
 
+class TestDisassembly(unittest.TestCase):  # checks the text the default device's compiler emits from disassemble()
+  # TODO: fails on llvm. llvm.LLVMGetHostCPUName() returns "generic"
+  @unittest.skipUnless(Device.DEFAULT in ("CPU",) and OSX, "m series cpus support fp16 arithmetic")
+  def test_float16_alu(self):
+    c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16)  # a single float16 + float16 add
+    s = c.schedule()[-1]  # last item of the schedule
+    p = get_program(s.ast, Device[Device.DEFAULT].renderer)  # program for this AST, using the default device's renderer
+    lib = Device[Device.DEFAULT].compiler.compile(p.src)  # compile the program's source
+    out = io.StringIO()
+    with redirect_stdout(out): Device[Device.DEFAULT].compiler.disassemble(lib)  # capture whatever disassemble() writes to stdout
+    assert "fcvt" not in out.getvalue()  # presumably "fcvt" would mean an fp16 conversion instead of native fp16 math -- confirm
+
 if __name__ == '__main__':
   unittest.main()
diff --git a/test/test_disassembly.py b/test/test_disassembly.py
deleted file mode 100644
index e908b83710..0000000000
--- a/test/test_disassembly.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import unittest, io
-from tinygrad import Tensor, dtypes
-from contextlib import redirect_stdout
-from tinygrad.device import Device
-from tinygrad.helpers import OSX
-from tinygrad.engine.realize import get_program
-
-class TestDisassembly(unittest.TestCase):
-  # TODO: fails on llvm. llvm.LLVMGetHostCPUName() returns "generic"
-  @unittest.skipUnless(Device.DEFAULT in ("CPU",) and OSX, "m series cpus support fp16 arithmetic")
-  def test_float16_alu(self):
-    c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16)
-    s = c.schedule()[-1]
-    p = get_program(s.ast, Device[Device.DEFAULT].renderer)
-    lib = Device[Device.DEFAULT].compiler.compile(p.src)
-    out = io.StringIO()
-    with redirect_stdout(out): Device[Device.DEFAULT].compiler.disassemble(lib)
-    assert "fcvt" not in out.getvalue()
-
-if __name__ == "__main__":
-  unittest.main()
\ No newline at end of file