diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 0a1ea908b4..9eb1248a9c 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -63,7 +63,7 @@ jobs:
     - name: Run model inference benchmark
       run: METAL=1 python3.11 test/external/external_model_benchmark.py
     - name: Test speed vs torch
-      run: BIG=2 MPS=1 python3.11 test/test_speed_v_torch.py | tee torch_speed.txt
+      run: BIG=2 MPS=1 python3.11 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
     - name: Test tensor cores
       run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
     - name: Test AMX tensor cores
@@ -187,7 +187,7 @@ jobs:
     - name: Run model inference benchmark
       run: NV=1 CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py
     - name: Test speed vs torch
-      run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
+      run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
     - name: Test speed vs theoretical
       run: NV=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
     - name: Test benchmark allreduce
@@ -389,7 +389,7 @@ jobs:
     #- name: Test speed vs torch
     #  run: |
     #    python3 -c "import torch; print(torch.__version__)"
-    #    LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
+    #    LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
     - name: Test speed vs theoretical
       run: AMD=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
     - name: Test tensor cores
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a3997e4f68..73a951af3e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -30,9 +30,9 @@ jobs:
     - name: External Benchmark Schedule
       run: PYTHONPATH="." python3 test/external/external_benchmark_schedule.py
     - name: Speed Test
-      run: LLVM=1 python3 test/test_speed_v_torch.py
+      run: LLVM=1 python3 test/external/external_test_speed_v_torch.py
     - name: Speed Test (BEAM=2)
-      run: BEAM=2 LLVM=1 python3 test/test_speed_v_torch.py
+      run: BEAM=2 LLVM=1 python3 test/external/external_test_speed_v_torch.py
 
   docs:
     name: Docs
@@ -458,7 +458,7 @@ jobs:
   testopenpilot:
     name: 'openpilot Compile Tests'
     runs-on: ubuntu-22.04
-    timeout-minutes: 10
+    timeout-minutes: 15
     env:
       IGNORE_OOB: 0
     steps:
@@ -589,6 +589,29 @@ jobs:
       - name: Run process replay tests
         uses: ./.github/actions/process-replay
 
+  testdevectorize:
+    name: Linux (devectorize)
+    runs-on: ubuntu-24.04
+    timeout-minutes: 15
+    env:
+      IGNORE_OOB: 0
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v4
+    - name: Setup Environment
+      uses: ./.github/actions/setup-tinygrad
+      with:
+        key: devectorize-minimal
+        deps: testing_minimal
+        pydeps: "pillow"
+        llvm: "true"
+    - name: Test LLVM=1 DEVECTORIZE=0
+      run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
+    - name: Test LLVM=1 DEVECTORIZE=0 for model
+      run: PYTHONPATH="." LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
+    - name: Test CPU=1 DEVECTORIZE=0
+      run: CPU=1 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
+
   testdsp:
     name: Linux (DSP)
     runs-on: ubuntu-24.04
@@ -624,12 +647,6 @@ jobs:
       run: CC=clang-20 PYTHONPATH="." DEBUG=2 DSP=1 python test/test_transcendental.py TestTranscendentalVectorized
     - name: Test quantize onnx
       run: PYTHONPATH="." DEBUG=2 DSP=1 python3 test/test_quantize_onnx.py
-    - name: Test LLVM=1 DEVECTORIZE=0
-      run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
-    - name: Test LLVM=1 DEVECTORIZE=0 for model
-      run: PYTHONPATH="." LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
-    - name: Test CPU=1 DEVECTORIZE=0
-      run: CPU=1 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
 
   testwebgpu:
     name: Linux (WebGPU)
@@ -689,9 +706,9 @@ jobs:
           DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
       - name: Run LLVM test
         if: matrix.backend=='amdllvm'
-        run: python test/test_amd_llvm.py
+        run: python test/device/test_amd_llvm.py
       - name: Run pytest (amd)
-        run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py --durations=20
+        run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/device/test_hcq.py --durations=20
       - name: Run pytest (amd)
         run: python -m pytest test/external/external_test_am.py --durations=20
       - name: Run TRANSCENDENTAL math
@@ -816,14 +833,14 @@ jobs:
         AMD: 1
         FORWARD_ONLY: 1
       run: |
-        python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
+        python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
     - name: Run pytest (amd with llvm backend)
       env:
         MOCKGPU: 1
         AMD: 1
         FORWARD_ONLY: 1
       run: |
-        python -m pytest -n=auto test/test_hcq.py test/test_tiny.py test/test_amd_llvm.py --durations=20
+        python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
     - name: Run pytest (ptx)
       env:
         MOCKGPU: 1
@@ -831,7 +848,7 @@ jobs:
         NV: 1
         FORWARD_ONLY: 1
       run: |
-        python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
+        python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
     - name: Run process replay tests
       uses: ./.github/actions/process-replay
 
@@ -942,18 +959,18 @@ jobs:
         env:
           HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
         run: |
-          python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py
+          python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations=20
       - name: Run REMOTE=1 Test (GPU)
         env:
           HOST: 127.0.0.1:7667*6
         run: |
-          python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py
+          python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py --durations=20
           IMAGE=2 python3 -m pytest test/test_tiny.py test/test_image_dtype.py
       - name: Run REMOTE=1 Test (CPU)
         env:
           HOST: 127.0.0.1:8667*6
         run: |
-          python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py
+          python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py --durations=20
       - name: Show remote server logs
         if: always()
         run: |
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000000..cccc62e404
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+norecursedirs = extra .* *.egg _darcs build CVS dist node_modules venv {arch}
diff --git a/test/test_amd_llvm.py b/test/device/test_amd_llvm.py
similarity index 100%
rename from test/test_amd_llvm.py
rename to test/device/test_amd_llvm.py
diff --git a/test/test_hcq.py b/test/device/test_hcq.py
similarity index 100%
rename from test/test_hcq.py
rename to test/device/test_hcq.py
diff --git a/test/test_hcq_iface.py b/test/device/test_hcq_iface.py
similarity index 100%
rename from test/test_hcq_iface.py
rename to test/device/test_hcq_iface.py
diff --git a/test/test_metal.py b/test/device/test_metal.py
similarity index 100%
rename from test/test_metal.py
rename to test/device/test_metal.py
diff --git a/test/external/external_benchmark_kernel_launch.py b/test/external/external_benchmark_kernel_launch.py
index 5744ba0923..7011f7ab28 100644
--- a/test/external/external_benchmark_kernel_launch.py
+++ b/test/external/external_benchmark_kernel_launch.py
@@ -2,7 +2,7 @@ import time
 from tinygrad import Tensor, TinyJit, Device, Context
 from tinygrad.helpers import Profiling, Timing, GlobalCounters
 
-# python3 test/test_speed_v_torch.py TestSpeed.test_add_a
+# python3 test/external/external_test_speed_v_torch.py TestSpeed.test_add_a
 
 @TinyJit
 def plus(a:Tensor, b:Tensor): return a+b
diff --git a/test/test_speed_v_torch.py b/test/external/external_test_speed_v_torch.py
similarity index 100%
rename from test/test_speed_v_torch.py
rename to test/external/external_test_speed_v_torch.py
diff --git a/test/models/test_onnx.py b/test/models/test_onnx.py
index c2b1f8677d..79ce2046c6 100644
--- a/test/models/test_onnx.py
+++ b/test/models/test_onnx.py
@@ -32,6 +32,7 @@ OPENPILOT_MODEL = "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/mod
 np.random.seed(1337)
 
 class TestOnnxModel(unittest.TestCase):
+  @unittest.skip("this isn't a test, it can't fail")
   def test_benchmark_openpilot_model(self):
     onnx_model = fetch(OPENPILOT_MODEL)
     run_onnx = OnnxRunner(onnx_model)
diff --git a/test/models/test_whisper.py b/test/models/test_whisper.py
index f1696fd490..056615b227 100644
--- a/test/models/test_whisper.py
+++ b/test/models/test_whisper.py
@@ -16,7 +16,7 @@ TRANSCRIPTION_2 = "a slightly longer audio file so that we can test batch transc
 TEST_FILE_3_URL = 'https://homepage.ntu.edu.tw/~karchung/miniconversations/mc45.mp3'
 TRANSCRIPTION_3 = "Just lie back and relax. Is the level of pressure about right? Yes, it's fine, and I'd like conditioner please. Sure. I'm going to start the second lathering now. Would you like some Q-tips? How'd you like it cut? I'd like my bangs and the back trimmed, and I'd like the rest thinned out a bit and layered. Where would you like the part? On the left, right about here. Here, have a look. What do you think? It's fine. Here's a thousand anti-dollars. It's 30-ant extra for the rants. Here's your change and receipt. Thank you, and please come again. So how do you like it? It could have been worse, but you'll notice that I didn't ask her for her card. Hmm, yeah. Maybe you can try that place over there next time."   # noqa: E501
 
-@unittest.skipIf(CI and Device.DEFAULT in ["CPU"], "slow")
+@unittest.skipIf(Device.DEFAULT in ["CPU", "LLVM"], "slow")
 @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16 support")
 class TestWhisper(unittest.TestCase):
   @classmethod
diff --git a/test/test_copy_speed.py b/test/test_copy_speed.py
index ed2080922d..391a4da0c6 100644
--- a/test/test_copy_speed.py
+++ b/test/test_copy_speed.py
@@ -3,7 +3,7 @@ from tinygrad import Tensor, Device, TinyJit
 from tinygrad.helpers import Timing, CI, OSX
 import multiprocessing.shared_memory as shared_memory
 
-N = 256 if CI else 4096
+N = 256
 class TestCopySpeed(unittest.TestCase):
   @classmethod
   def setUpClass(cls): Device[Device.DEFAULT].synchronize()
diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index 14e1cb1266..9766753daa 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -373,11 +373,11 @@ class TestMultiTensor(unittest.TestCase):
     np.testing.assert_allclose(y.numpy(), y_shard.numpy(), atol=1e-6, rtol=1e-6)
 
   # NOTE: this is failing on LLVM CI, no idea why. Works locally.
-  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
+  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU", "AMD"), "slow, and flaky on LLVM/CPU")
   def test_data_parallel_resnet(self):
     from extra.models.resnet import ResNet18
 
-    fake_image = Tensor.rand((2, 3, 224//8, 224//8))
+    fake_image = Tensor.rand((2, 3, 224//16, 224//16))
     fake_image_sharded = fake_image.shard(devices_2, axis=0)
     m = ResNet18()
     m.load_from_pretrained()
@@ -409,10 +409,10 @@ class TestMultiTensor(unittest.TestCase):
     # sometimes there is zeros in these grads... why?
     np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)
 
-  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
+  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU", "AMD"), "slow, and flaky on LLVM/CPU")
   def test_data_parallel_resnet_train_step(self):
     from extra.models.resnet import ResNet18
-    fake_image = Tensor.rand((2, 3, 224//8, 224//8))
+    fake_image = Tensor.rand((2, 3, 224//16, 224//16))
     labels = Tensor.randint(2, low=0, high=1000)
     m = ResNet18()
     self._test_model_train_step(m, fake_image, labels)
diff --git a/test/test_net_speed.py b/test/test_net_speed.py
deleted file mode 100644
index 9344ebe720..0000000000
--- a/test/test_net_speed.py
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env python
-import time
-import unittest
-import torch
-from tinygrad import Tensor, Device
-from tinygrad.helpers import Profiling, CI
-
-@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
-class TestConvSpeed(unittest.TestCase):
-
-  def test_mnist(self):
-    # https://keras.io/examples/vision/mnist_convnet/
-    conv = 3
-    inter_chan, out_chan = 32, 64
-
-    # ****** torch baseline *******
-
-    torch.backends.mkldnn.enabled = False
-
-    conv = 3
-    inter_chan, out_chan = 32, 64
-    c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
-    c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
-    l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)
-
-    c2d = torch.nn.functional.conv2d
-    mp = torch.nn.MaxPool2d((2,2))
-    lsm = torch.nn.LogSoftmax(dim=1)
-
-    cnt = 5
-    fpt, bpt = 0.0, 0.0
-    for i in range(cnt):
-      et0 = time.time()
-      x = torch.randn(128, 1, 28, 28, requires_grad=True)
-      x = mp(c2d(x,c1).relu())
-      x = mp(c2d(x,c2).relu())
-      x = x.reshape(x.shape[0], -1)
-      out = lsm(x.matmul(l1))
-      out = out.mean()
-      et1 = time.time()
-      out.backward()
-      et2 = time.time()
-      fpt += (et1-et0)
-      bpt += (et2-et1)
-
-    fpt_baseline = (fpt*1000/cnt)
-    bpt_baseline = (bpt*1000/cnt)
-    print("torch forward pass:  %.3f ms" % fpt_baseline)
-    print("torch backward pass: %.3f ms" % bpt_baseline)
-
-    # ****** tinygrad compare *******
-
-    c1 = Tensor(c1.detach().numpy(), requires_grad=True)
-    c2 = Tensor(c2.detach().numpy(), requires_grad=True)
-    l1 = Tensor(l1.detach().numpy(), requires_grad=True)
-
-    cnt = 5
-    fpt, bpt = 0.0, 0.0
-    for i in range(1+cnt):
-      et0 = time.time()
-      x = Tensor.randn(128, 1, 28, 28)
-      x = x.conv2d(c1).relu().avg_pool2d()
-      x = x.conv2d(c2).relu().max_pool2d()
-      x = x.reshape(shape=(x.shape[0], -1))
-      out = x.dot(l1).log_softmax()
-      out = out.mean()
-      out.backward()  # NOTE: we have to now compute this here, but it doesn't realize
-      out.realize()
-      et1 = time.time()
-      [x.grad.realize() for x in [c1, c2, l1]]
-      et2 = time.time()
-      if i == 0:
-        pr = Profiling(sort='time', frac=0.2)
-        pr.__enter__()
-      else:
-        fpt += (et1-et0)
-        bpt += (et2-et1)
-
-    pr.__exit__()
-    fpt = (fpt*1000/cnt)
-    bpt = (bpt*1000/cnt)
-    print("forward pass:  %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
-    print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))
-
-
-if __name__ == '__main__':
-  unittest.main()
diff --git a/test/test_nn.py b/test/test_nn.py
old mode 100755
new mode 100644
diff --git a/test/unit/test_hashing.py b/test/unit/test_hashing.py
index 803883b78f..1ab969b2fd 100644
--- a/test/unit/test_hashing.py
+++ b/test/unit/test_hashing.py
@@ -2,7 +2,6 @@ from typing_extensions import Callable
 import hashlib, random, unittest
 from tinygrad import Tensor, Device, getenv, dtypes
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import CI
 
 @unittest.skipUnless(is_dtype_supported(dtypes.uint8) and is_dtype_supported(dtypes.uint64), "Device must support uint8 and uint64")
 @unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "NV", "crashes in NV CI")
@@ -12,7 +11,7 @@ class TestHashing(unittest.TestCase):
     chunk_hashes = [hashlib.shake_128(chunk).digest(16) for chunk in chunks]
     return hashlib.shake_128(b''.join(chunk_hashes)).digest(16)
 
-  @unittest.skipIf(CI, "very slow")
+  @unittest.skip("very slow")
   def test_abc(self):
     expected = self._python_hash_1mb(b"abc" + b"\x00" * (2**20 - 3))
     out = Tensor(b"abc").hash()
@@ -65,7 +64,7 @@ class TestKeccak(unittest.TestCase):
     data = b"\x00" * 4
     self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16))
 
-    data = b"\x00" * (1000 if CI else 4096)
+    data = b"\x00" * 1000
     self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16))
 
 if __name__ == "__main__":
diff --git a/test/test_linalg.py b/test/unit/test_linalg.py
similarity index 100%
rename from test/test_linalg.py
rename to test/unit/test_linalg.py
diff --git a/test_driven_development.sh b/test_driven_development.sh
deleted file mode 100755
index e868497e91..0000000000
--- a/test_driven_development.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-python3 test/external/process_replay/reset.py
-CAPTURE_PROCESS_REPLAY=1 pytest -n auto test/test_tiny.py test/test_uop_graph.py test/test_ops.py test/test_linearizer.py
-while true; do
-  if python3 test/test_tiny.py; then
-    PYTHONPATH="." python3 test/external/process_replay/process_replay.py
-  fi
-done