diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0b88ec8e6f..8a13dc7e2f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -47,7 +47,7 @@ jobs:
         DEBUG=2 EMULATE_HIP=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
         DEBUG=2 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
     - name: Test dtype with Python emulator
-      run: DEBUG=2 PYTHON=1 python3 test/test_dtype.py
+      run: PYTHONPATH=. DEBUG=2 PYTHON=1 python3 test/test_dtype.py
     - name: Test ops with Python emulator
       run: DEBUG=2 PYTHON=1 python3 -m pytest test/test_ops.py -k "not (test_split or test_simple_cumsum or test_cumsum or test_einsum or test_dot or test_dot_1d or test_big_gemm or test_broadcastdot or test_multidot or test_var_axis or test_std_axis or test_broadcast_full or test_broadcast_partial or test_simple_conv3d or test_dilated_conv_transpose2d or test_simple_conv_transpose3d or test_large_input_conv2d or test_maxpool2d or test_maxpool2d_simple or test_maxpool2d_bigger_stride or test_avgpool2d or test_cat or test_scaled_product_attention or test_scaled_product_attention_causal)" --durations=20
     - name: Test symbolic with Python emulator
diff --git a/test/external/external_test_onnx_backend.py b/test/external/external_test_onnx_backend.py
index c5cf06257f..12781a026f 100644
--- a/test/external/external_test_onnx_backend.py
+++ b/test/external/external_test_onnx_backend.py
@@ -3,9 +3,9 @@ from typing import Any, Tuple
 from onnx.backend.base import Backend, BackendRep
 import onnx.backend.test
 import numpy as np
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import getenv, CI, OSX
-from tinygrad.device import Device
+from tinygrad import Tensor, Device, dtypes
+from tinygrad.helpers import getenv, OSX
+from test.helpers import is_dtype_supported

 # pip3 install tabulate
 pytest_plugins = 'onnx.backend.test.report',
@@ -49,7 +49,7 @@ backend_test.exclude('test_adam_multiple_cpu')
 backend_test.exclude('test_nesterov_momentum_cpu')

 # about different dtypes
-if Device.DEFAULT in ["METAL"] or (OSX and Device.DEFAULT == "GPU"):
+if not is_dtype_supported(dtypes.float64):
   backend_test.exclude('float64')
   backend_test.exclude('DOUBLE')
 # these have float64 inputs
@@ -59,8 +59,7 @@ if Device.DEFAULT in ["METAL"] or (OSX and Device.DEFAULT == "GPU"):
 backend_test.exclude('test_einsum_*')
 backend_test.exclude('test_cumsum_*')

-# no float16 in CI, LLVM segfaults, GPU requires cl_khr_fp16
-if Device.DEFAULT in ['LLVM', 'CUDA', 'GPU'] and CI:
+if not is_dtype_supported(dtypes.float16):
   backend_test.exclude('float16')
   backend_test.exclude('FLOAT16')

diff --git a/test/helpers.py b/test/helpers.py
index e14a054f32..273705bae5 100644
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -1,7 +1,9 @@
+import sys
+from tinygrad import Tensor, Device, dtypes
 from tinygrad.device import JITRunner
+from tinygrad.dtype import DType
 from tinygrad.nn.state import get_parameters
-from tinygrad import Tensor
-from tinygrad.helpers import Context
+from tinygrad.helpers import Context, CI, OSX

 def derandomize_model(model):
   with Context(GRAPH=0):
@@ -17,3 +19,18 @@ def assert_jit_cache_len(fxn, expected_len):
   else:
     assert len(fxn.jit_cache) == 1
     assert len(fxn.jit_cache[0].prg.jit_cache) == expected_len
+
+def is_dtype_supported(dtype: DType, device: str = Device.DEFAULT):
+  if dtype == dtypes.bfloat16:
+    # NOTE: this requires bf16 buffer support
+    return device in ["HIP"]
+  if device in ["WEBGPU", "WEBGL"]: return dtype in [dtypes.float, dtypes.int32, dtypes.uint32]
+  # for CI GPU, cl_khr_fp16 isn't supported
+  # for CI LLVM, it segfaults because it can't link to the casting function
+  # CUDACPU architecture is sm_35 but we need at least sm_70 to run fp16 ALUs
+  # PYTHON supports half memoryview in 3.12+ https://github.com/python/cpython/issues/90751
+  if dtype == dtypes.half:
+    if device in ["GPU", "LLVM", "CUDA"]: return not CI
+    if device == "PYTHON": return sys.version_info >= (3, 12)
+  if dtype == dtypes.float64: return device != "METAL" and not (OSX and device == "GPU")
+  return True
\ No newline at end of file
diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py
index 50a7a4ae07..4615006910 100644
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -7,8 +7,7 @@ from tinygrad import Tensor, Device, GlobalCounters, dtypes
 from tinygrad.helpers import CI, getenv
 from tinygrad.shape.symbolic import Variable
 from extra.lr_scheduler import OneCycleLR
-from test.helpers import derandomize_model
-from test.test_dtype import is_dtype_supported
+from test.helpers import derandomize_model, is_dtype_supported
 from examples.gpt2 import Transformer as GPT2Transformer, MODEL_PARAMS as GPT2_MODEL_PARAMS
 from examples.hlb_cifar10 import SpeedyResNet, hyp

diff --git a/test/models/test_whisper.py b/test/models/test_whisper.py
index ea61c19390..7dd87bca32 100644
--- a/test/models/test_whisper.py
+++ b/test/models/test_whisper.py
@@ -2,7 +2,8 @@ import unittest
 import pathlib
 from examples.whisper import init_whisper, load_file_waveform, transcribe_file, transcribe_waveform
 from tinygrad.helpers import CI, fetch
-from tinygrad import Device
+from tinygrad import Device, dtypes
+from test.helpers import is_dtype_supported

 # Audio generated with the command on MacOS:
 # say "Could you please let me out of the box?" --file-format=WAVE --data-format=LEUI8@16000 -o test
@@ -15,7 +16,8 @@ TRANSCRIPTION_2 = "a slightly longer audio file so that we can test batch transc
 TEST_FILE_3_URL = 'https://homepage.ntu.edu.tw/~karchung/miniconversations/mc45.mp3'
 TRANSCRIPTION_3 = "Just lie back and relax. Is the level of pressure about right? Yes, it's fine, and I'd like conditioner please. Sure. I'm going to start the second lathering now. Would you like some Q-tips? How'd you like it cut? I'd like my bangs and the back trimmed, and I'd like the rest thinned out a bit and layered. Where would you like the part? On the left, right about here. Here, have a look. What do you think? It's fine. Here's a thousand anti-dollars. It's 30-ant extra for the rants. Here's your change and receipt. Thank you, and please come again. So how do you like it? It could have been worse, but you'll notice that I didn't ask her for her card. Hmm, yeah. Maybe you can try that place over there next time." # noqa: E501

-@unittest.skipIf(CI and Device.DEFAULT in ["LLVM", "CLANG", "CPU", "GPU"], "Not working on LLVM, slow on others. GPU reequires cl_khr_fp16")
+@unittest.skipIf(CI and Device.DEFAULT in ["CLANG"], "slow")
+@unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16 support")
 class TestWhisper(unittest.TestCase):
   @classmethod
   def setUpClass(cls):
diff --git a/test/test_dtype.py b/test/test_dtype.py
index 8952eaa913..c6142b9bee 100644
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@@ -1,11 +1,12 @@
-import unittest, operator, sys
+import unittest, operator
 import numpy as np
 import torch
 from typing import Any, List
-from tinygrad.helpers import CI, getenv, DEBUG, OSX
+from tinygrad.helpers import getenv, DEBUG
 from tinygrad.dtype import DType, DTYPES_DICT, ImageDType, PtrDType, least_upper_float, least_upper_dtype
 from tinygrad import Device, Tensor, dtypes
 from hypothesis import given, settings, strategies as strat
+from test.helpers import is_dtype_supported

 settings.register_profile("my_profile", max_examples=200, deadline=None)
 settings.load_profile("my_profile")
@@ -13,20 +14,6 @@ settings.load_profile("my_profile")
 core_dtypes = list(DTYPES_DICT.values())
 if Device.DEFAULT == "CPU": core_dtypes.remove(dtypes.bfloat16) # NOTE: this is for teenygrad, don't remove
 floats = [dt for dt in core_dtypes if dtypes.is_float(dt)]
-def is_dtype_supported(dtype: DType, device: str = Device.DEFAULT):
-  if dtype == dtypes.bfloat16:
-    # NOTE: this requires bf16 buffer support
-    return device in ["HIP"]
-  if device in ["WEBGPU", "WEBGL"]: return dtype in [dtypes.float, dtypes.int32, dtypes.uint32]
-  # for CI GPU, cl_khr_fp16 isn't supported
-  # for CI LLVM, it segfaults because it can't link to the casting function
-  # CUDA in CI uses CUDACPU that does not support half
-  # PYTHON supports half memoryview in 3.12+ https://github.com/python/cpython/issues/90751
-  if dtype == dtypes.half:
-    if device in ["GPU", "LLVM", "CUDA"]: return not CI
-    if device == "PYTHON": return sys.version_info >= (3, 12)
-  if dtype == dtypes.float64: return device != "METAL" and not (OSX and device == "GPU")
-  return True

 def get_available_cast_dtypes(dtype: DType) -> List[DType]:
   if not is_dtype_supported(dtype): return []
diff --git a/test/test_dtype_alu.py b/test/test_dtype_alu.py
index 753a68cfe6..d196937c0e 100644
--- a/test/test_dtype_alu.py
+++ b/test/test_dtype_alu.py
@@ -8,7 +8,7 @@ from tinygrad.dtype import DType
 from tinygrad.helpers import CI, getenv
 from tinygrad.realize import create_schedule
 from tinygrad.ops import UnaryOps, get_lazyop_info
-from test.test_dtype import is_dtype_supported
+from test.helpers import is_dtype_supported

 settings.register_profile("my_profile", max_examples=200, deadline=None)
 settings.load_profile("my_profile")
diff --git a/test/test_linearizer_failures.py b/test/test_linearizer_failures.py
index ea4813aac9..dde54cd3ec 100644
--- a/test/test_linearizer_failures.py
+++ b/test/test_linearizer_failures.py
@@ -7,7 +7,7 @@ from tinygrad.features.search import Opt, OptOps
 from tinygrad import Device, dtypes, Tensor
 from tinygrad.helpers import CI
 from test.external.fuzz_linearizer import run_linearizer, get_fuzz_rawbufs, get_fuzz_rawbuf_like
-from test.test_dtype import is_dtype_supported
+from test.helpers import is_dtype_supported
 from tinygrad.ops import LazyOp, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer, get_lazyop_info
 from tinygrad.shape.shapetracker import ShapeTracker

diff --git a/test/test_specific_conv.py b/test/test_specific_conv.py
index c0d6e76cca..cf79479027 100644
--- a/test/test_specific_conv.py
+++ b/test/test_specific_conv.py
@@ -1,7 +1,7 @@
 import unittest
-from tinygrad.tensor import Tensor
 from tinygrad.helpers import CI
-from tinygrad import Device, dtypes
+from tinygrad import Tensor, Device, dtypes
+from test.helpers import is_dtype_supported

 # similar to test/external/external_test_gpu_ast.py, but universal
 @unittest.skipIf(Device.DEFAULT == "CUDA" and CI, "slow on CUDA CI")
@@ -20,7 +20,7 @@ class TestSpecific(unittest.TestCase):
     w = Tensor.randn(2048, 512)
     (x @ w).reshape(1, 128, 4).contiguous().realize()

-  @unittest.skipIf(Device.DEFAULT in ["LLVM", "WEBGPU", "GPU", "CUDA"], "Broken on LLVM and webgpu, GPU requires cl_khr_fp16")
+  @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16 support")
   def test_big_vec_mul(self):
     # from LLaMA
     # 0 buffer<4096, dtypes.float> [View((1024, 1, 1, 4), (4, 0, 0, 1), 0, None)]
diff --git a/test/test_uops.py b/test/test_uops.py
index a8bfa55f55..e0c0954385 100644
--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -8,7 +8,7 @@ from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
 from tinygrad.realize import create_schedule
 from tinygrad.codegen.linearizer import UOps, UOp
 from tinygrad.codegen.uops import exec_alu, UOpGraph
-from test.test_dtype import is_dtype_supported
+from test.helpers import is_dtype_supported

 def _uops_to_prg(uops):
   src = Device[Device.DEFAULT].compiler.render("test", uops)
diff --git a/test/unit/test_disk_tensor.py b/test/unit/test_disk_tensor.py
index e581470a3b..154a514bff 100644
--- a/test/unit/test_disk_tensor.py
+++ b/test/unit/test_disk_tensor.py
@@ -2,7 +2,8 @@ import pathlib, unittest
 import numpy as np
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
-from tinygrad.helpers import Timing, CI, fetch, temp, getenv
+from tinygrad.helpers import Timing, fetch, temp, getenv
+from test.helpers import is_dtype_supported

 def compare_weights_both(url):
   import torch
@@ -25,10 +26,7 @@ class TestTorchLoad(unittest.TestCase):
   # pytorch zip format
   def test_load_convnext(self): compare_weights_both('https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth')

-  # for GPU, cl_khr_fp16 isn't supported
-  # for LLVM, it segfaults because it can't link to the casting function
-  # CUDACPU architecture is sm_35 but we need at least sm_70 to run fp16 ALUs
-  @unittest.skipIf(Device.DEFAULT in ["GPU", "LLVM", "CUDA"] and CI, "fp16 broken in some backends")
+  @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16 support")
   def test_load_llama2bfloat(self): compare_weights_both("https://huggingface.co/qazalin/bf16-lightweight/resolve/main/consolidated.00.pth?download=true")

 # pytorch tar format
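
For reviewers, a minimal sketch of how the relocated helper is meant to be used after this change, mirroring the skipUnless pattern the diff introduces in test_whisper.py, test_specific_conv.py, and test_disk_tensor.py. The test class, tensor values, and shapes below are hypothetical and not part of the diff:

import unittest
from tinygrad import Tensor, dtypes
from test.helpers import is_dtype_supported  # moved here from test/test_dtype.py by this change

class TestFloat16Example(unittest.TestCase):  # hypothetical test, for illustration only
  @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16 support")
  def test_half_add(self):
    # runs only on backends where the helper reports float16 support
    a = Tensor([1, 2, 3], dtype=dtypes.float16)
    b = Tensor([4, 5, 6], dtype=dtypes.float16)
    assert (a + b).numpy().tolist() == [5.0, 7.0, 9.0]

if __name__ == "__main__":
  unittest.main()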