diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 9c7b59eba4..cf5f1e33b6 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -637,3 +637,127 @@ jobs:
         openpilot_0_9_7.txt
         openpilot_image_0_9_4.txt
         openpilot_image_0_9_7.txt
+
+  testreddriverbenchmark:
+    name: AM Benchmark
+    runs-on: [self-hosted, Linux, tinyboxrandom]
+    timeout-minutes: 10
+    defaults:
+      run:
+        shell: bash -e -o pipefail {0}
+    if: github.repository_owner == 'tinygrad'
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+      - name: Remove amdgpu
+        run: sudo rmmod amdgpu || true
+      - name: Cleanup running AM processes
+        run: python extra/amdpci/am_smi.py --pids --kill
+      - name: Symlink models and datasets
+        run: |
+          mkdir -p weights
+          mkdir -p extra/datasets
+          ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
+          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
+          ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
+          ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
+          ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
+          ln -s /raid/datasets/imagenet extra/datasets/imagenet
+      - name: setup staging db
+        if: github.ref == 'refs/heads/update_benchmark_staging'
+        run: |
+          echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
+          rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
+      - name: reset process replay
+        run: test/external/process_replay/reset.py
+      # Fails on 9070
+      # - name: Test tensor cores
+      #   run: |
+      #     AMD=1 AMD_LLVM=0 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
+      #     AMD=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
+      #     AMD=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
+      - name: Run Tensor Core GEMM (AMD)
+        run: AMD=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee am_matmul_amd.txt
+      - name: Test AMD=1
+        run: DEBUG=2 AMD=1 python -m pytest -rA test/test_tiny.py
+      - name: Test driver cold start time
+        run: time AMD=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus
+      - name: Test driver warm start time
+        run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
+      - name: Test DISK copy time
+        run: AMD=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
+      - name: Run full CIFAR training w 1 GPU
+        run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee am_train_cifar_one_gpu.txt
+      - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
+        run: BENCHMARK_LOG=resnet_10steps AMD=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee am_train_resnet_one_gpu.txt
+      - name: Run 10 MLPerf Bert training steps (1 gpu)
+        # TODO: remove BERT_LAYERS once scheduler is fast
+        run: BENCHMARK_LOG=bert_10steps AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee am_train_bert_one_gpu.txt
+      - uses: actions/upload-artifact@v4
+        with:
+          name: Speed (AM Driver)
+          path: |
+            am_matmul_amd.txt
+            am_train_cifar_one_gpu.txt
+            am_train_resnet_one_gpu.txt
+            am_train_bert_one_gpu.txt
+      - name: Run process replay tests
+        run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
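For context: TestTiny.test_plus, used by the cold/warm start steps above, is a trivial tensor add, so the `time` wrapper is effectively measuring interpreter startup plus driver bring-up; running it once with AM_RESET=1 and once without separates full device re-init from the warm path. A minimal sketch of that kind of smoke test, assuming only that tinygrad is importable (the real test lives in test/test_tiny.py):

```python
# illustrative smoke test in the style of test/test_tiny.py; the add itself is
# trivial, so timing this process measures import + device init, not compute
import unittest
from tinygrad import Tensor

class TestTiny(unittest.TestCase):
  def test_plus(self):
    out = Tensor([1.0, 2.0, 3.0]) + Tensor([4.0, 5.0, 6.0])
    self.assertEqual(out.tolist(), [5.0, 7.0, 9.0])

if __name__ == "__main__":
  unittest.main()
```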
+
+  testgreendriverbenchmark:
+    name: NV Benchmark
+    runs-on: [self-hosted, Linux, tinyboxrandom]
+    timeout-minutes: 10
+    defaults:
+      run:
+        shell: bash -e -o pipefail {0}
+    if: github.repository_owner == 'tinygrad'
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+      - name: Remove nv modules
+        run: ./extra/nvpci/nv_smi.py rmmod
+      - name: Symlink models and datasets
+        run: |
+          mkdir -p weights
+          mkdir -p extra/datasets
+          ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
+          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
+          ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
+          ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
+          ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
+          ln -s /raid/datasets/imagenet extra/datasets/imagenet
+      - name: setup staging db
+        if: github.ref == 'refs/heads/update_benchmark_staging'
+        run: |
+          echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
+          rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
+      - name: reset process replay
+        run: test/external/process_replay/reset.py
+      - name: Test tensor cores
+        run: NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
+      - name: Test driver cold start time
+        run: time NV=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus
+      - name: Test driver warm start time
+        run: time NV=1 python3 test/test_tiny.py TestTiny.test_plus
+      - name: Test DISK copy time
+        run: NV=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
+      - name: Test LLAMA-3
+        run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --benchmark --temperature 0 | tee nv_llama3_beam.txt
+      - name: Run full CIFAR training w 1 GPU
+        run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee nv_train_cifar_one_gpu.txt
+      - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
+        run: BENCHMARK_LOG=resnet_10steps NV=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee nv_train_resnet_one_gpu.txt
+      - name: Run 10 MLPerf Bert training steps (1 gpu)
+        # TODO: remove BERT_LAYERS once scheduler is fast
+        run: BENCHMARK_LOG=bert_10steps NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee nv_train_bert_one_gpu.txt
+      - uses: actions/upload-artifact@v4
+        with:
+          name: Speed (NV Driver)
+          path: |
+            nv_llama3_beam.txt
+            nv_train_cifar_one_gpu.txt
+            nv_train_resnet_one_gpu.txt
+            nv_train_bert_one_gpu.txt
+      - name: Run process replay tests
+        run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
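The tensor core GEMM steps rely on simple_matmul.py's built-in correctness check, with ATOL loosened to 2e-2 for HALF inputs. A rough sketch of that kind of check; N, the RNG, and the rtol below are illustrative placeholders, not the script's exact values:

```python
# illustrative version of a simple_matmul.py-style check: a HALF GEMM on the
# device compared against a float32 NumPy reference within a loose tolerance
import numpy as np
from tinygrad import Tensor, dtypes

N = 4096
a = Tensor.rand(N, N, dtype=dtypes.half)
b = Tensor.rand(N, N, dtype=dtypes.half)
c = (a @ b).realize()  # with SHOULD_USE_TC=1 this is expected to hit the tensor-core path
ref = a.numpy().astype(np.float32) @ b.numpy().astype(np.float32)
np.testing.assert_allclose(c.numpy().astype(np.float32), ref, atol=2e-2, rtol=1e-2)
```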
diff --git a/test/external/external_benchmark_disk_raw.py b/test/external/external_benchmark_disk_raw.py
index 6566ae4109..4d4962352d 100644
--- a/test/external/external_benchmark_disk_raw.py
+++ b/test/external/external_benchmark_disk_raw.py
@@ -1,7 +1,8 @@
 import pathlib
 from tinygrad import Tensor, Device, Context
+from tinygrad.helpers import getenv
 
 if __name__ == "__main__":
   with Context(DEBUG=2):
-    disk_llama = Tensor(pathlib.Path("/raid/weights/LLaMA-3/8B/consolidated.00.pth"))
+    disk_llama = Tensor(pathlib.Path(getenv("TESTFILE", "/raid/weights/LLaMA-3/8B/consolidated.00.pth")))
     device_llama = disk_llama.to(Device.DEFAULT).realize()
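The TESTFILE override above is what lets both workflow jobs point the DISK copy benchmark at a specific safetensors shard instead of the hardcoded LLaMA-3 checkpoint. tinygrad.helpers.getenv coerces the environment value to the type of its default and returns the default when the variable is unset; a minimal sketch of that behavior (the variable names besides TESTFILE are just examples):

```python
# sketch of tinygrad.helpers.getenv semantics: the env string is coerced to the
# type of the default, so a str default yields a str and an int default an int
from tinygrad.helpers import getenv

path = getenv("TESTFILE", "/raid/weights/LLaMA-3/8B/consolidated.00.pth")  # str -> str
steps = getenv("BENCHMARK", 0)  # "10" in the environment becomes the int 10
```

Usage, as in the workflow: `TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py`.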