mark slow tests as slow instead of as CI (#13736)

* mark slow tests as slow instead of as CI

* CI shouldn't have different behavior

* more skips / CI

* slow
George Hotz
2025-12-17 10:29:57 -04:00
committed by GitHub
parent 9015a22523
commit 3dbde178c1
26 changed files with 80 additions and 264 deletions
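
The pattern in brief: instead of probing the CI environment and silently skipping (or shrinking) tests when it is set, slow tests now carry an explicit opt-in marker that behaves the same everywhere. A minimal sketch of the before and after, assuming a stand-in CI flag in place of tinygrad.helpers.CI and placeholder test bodies:

import os, unittest

CI = os.getenv("CI", "") != ""  # stand-in for tinygrad.helpers.CI

class Before(unittest.TestCase):
    @unittest.skipIf(CI, "skip in CI")  # old: CI runs behave differently
    def test_copy_64bit(self): ...

slow = unittest.skipUnless(os.getenv("RUN_SLOW"), "slow test, set RUN_SLOW=1 to run")

class After(unittest.TestCase):
    @slow  # new: opt-in everywhere, CI or local
    def test_copy_64bit(self): ...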

View File

@@ -1,6 +1,7 @@
import unittest, ctypes, struct, os, random, numpy as np
from tinygrad import Device, Tensor, dtypes
from tinygrad.helpers import getenv, CI, mv_address, DEBUG
from tinygrad.helpers import getenv, mv_address, DEBUG
from test.helpers import slow
from tinygrad.device import Buffer, BufferSpec
from tinygrad.runtime.support.hcq import HCQCompiled, HCQBuffer
from tinygrad.runtime.autogen import libc
@@ -220,7 +221,7 @@ class TestHCQ(unittest.TestCase):
mv_buf1 = buf1.as_buffer().cast('Q')
assert libc.memcmp(mv_address(mv_buf1), buf2._buf.va_addr, sz) == 0
@unittest.skipIf(CI, "skip in CI")
@slow
def test_copy_64bit(self):
if TestHCQ.d0.hw_copy_queue_t is None: self.skipTest("device does not support copy queue")

View File

@@ -2,12 +2,11 @@ import unittest
from tinygrad import Device
from tinygrad.device import Buffer
from tinygrad.dtype import dtypes
from tinygrad.helpers import CI
from tinygrad.runtime.ops_cl import CLDevice, CLAllocator, CLCompiler, CLProgram
@unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
class TestCLError(unittest.TestCase):
@unittest.skipIf(CI, "dangerous for CI, it allocates tons of memory")
@unittest.skip("allocates tons of memory")
def test_oom(self):
with self.assertRaises(RuntimeError) as err:
allocator = CLAllocator(CLDevice())

View File

@@ -261,7 +261,7 @@ class TestHCQ(unittest.TestCase):
et = _time_queue(q, TestHCQ.d0)
gb_s = (SZ/1e9)/et
print(f"same device copy: {et*1e3:.2f} ms, {gb_s:.2f} GB/s")
assert (0.3 if CI else 10) <= gb_s <= 1000
assert 0.3 <= gb_s <= 1000
def test_cross_device_copy_bandwidth(self):
SZ = 2_000_000_000
@@ -273,7 +273,7 @@ class TestHCQ(unittest.TestCase):
et = _time_queue(q, TestHCQ.d0)
gb_s = (SZ/1e9)/et
print(f"cross device copy: {et*1e3:.2f} ms, {gb_s:.2f} GB/s")
assert (0.3 if CI else 2) <= gb_s <= 50
assert 0.3 <= gb_s <= 50
def test_interleave_compute_and_copy(self):
q = TestHCQ.compute_queue()

View File

@@ -1,12 +1,12 @@
import unittest
from tinygrad.helpers import CI
from test.helpers import slow
from examples.mamba import Mamba, generate
from transformers import AutoTokenizer
PROMPT = 'Why is gravity '
TOKENIZER = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
@unittest.skipIf(CI, "model is slow for CI")
@slow
class TestMamba(unittest.TestCase):
def test_mamba_130M(self):
OUT_130M = '''Why is gravity \nnot a good idea?\n\nA:'''

View File

@@ -8,11 +8,11 @@ from hypothesis.extra import numpy as stn
import numpy as np
import torch
from tinygrad import Tensor
from tinygrad.helpers import CI, getenv
from tinygrad.helpers import getenv
settings.register_profile(__file__, settings.default,
max_examples=100 if CI else 250, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
max_examples=100, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
# torch wraparound for large numbers
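
For context on the hunk above: hypothesis profiles are registered against a parent settings object and only take effect once loaded; the change pins max_examples at a flat 100 instead of branching on CI. A minimal sketch of the register/load flow (the profile name here is illustrative):

from hypothesis import settings

settings.register_profile("fixed", settings.default, max_examples=100, deadline=None)
settings.load_profile("fixed")  # a registered profile does nothing until loaded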

View File

@@ -1,4 +1,4 @@
import time, struct, functools
import os, time, struct, functools, unittest
from typing import Any, Callable
import numpy as np
from tinygrad import Tensor, dtypes, Device
@@ -9,6 +9,9 @@ from tinygrad.dtype import DType
from tinygrad.nn.state import get_parameters
from tinygrad.helpers import T, CI
from tinygrad.codegen import full_rewrite
# decorator to skip slow tests by default, run with RUN_SLOW=1 to include them
slow = unittest.skipUnless(os.getenv("RUN_SLOW"), "slow test, set RUN_SLOW=1 to run")
from tinygrad.runtime.ops_python import PythonProgram, PythonRenderer, PythonCompiler
def derandomize_model(model):
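
Two properties of the new helper do the work in the rest of this commit: unittest skip decorators apply to classes as well as methods (a class-level @slow skips every test in the class), and unittest.skipUnless skips only when its condition is falsy, so any non-empty RUN_SLOW value opts in. A usage sketch with placeholder test names:

import os, unittest

slow = unittest.skipUnless(os.getenv("RUN_SLOW"), "slow test, set RUN_SLOW=1 to run")

@slow  # class level: every test in the class is skipped unless RUN_SLOW is set
class TestBigModel(unittest.TestCase):
    def test_forward(self): ...

class TestMixed(unittest.TestCase):
    @slow  # method level: only this test is gated
    def test_huge_copy(self): ...

# example invocation: RUN_SLOW=1 python -m pytest test/test_nn.py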

View File

@@ -4,7 +4,8 @@ import numpy as np
from PIL import Image
from tinygrad import Tensor
from tinygrad.helpers import getenv, CI
from tinygrad.helpers import getenv
from test.helpers import slow
from extra.models.efficientnet import EfficientNet
from extra.models.vit import ViT
from extra.models.resnet import ResNet50
@@ -56,12 +57,12 @@ class TestEfficientNet(unittest.TestCase):
def tearDownClass(cls):
del cls.model
@unittest.skipIf(CI, "covered by test_chicken_car")
@slow
def test_chicken(self):
labels = _infer(self.model, chicken_img)
self.assertEqual(_LABELS[labels[0]], "hen")
@unittest.skipIf(CI, "covered by test_chicken_car")
@slow
def test_car(self):
labels = _infer(self.model, car_img)
self.assertEqual(_LABELS[labels[0]], "sports car, sport car")

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad import Tensor, Device
from tinygrad.helpers import CI
from tinygrad import Tensor
from test.helpers import slow
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim, BatchNorm2d
from extra.training import train, evaluate
@@ -49,7 +49,7 @@ class TinyConvNet:
x = x.reshape(shape=[x.shape[0], -1])
return x.dot(self.l1)
@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
@slow
class TestMNIST(unittest.TestCase):
def test_sgd_onestep(self):
np.random.seed(1337)

View File

@@ -5,11 +5,12 @@ from tinygrad.nn import optim
from tinygrad.nn.state import get_parameters
from tinygrad.engine.jit import TinyJit
from tinygrad import Tensor, Device, GlobalCounters, dtypes, Variable
from tinygrad.helpers import CI, Context
from tinygrad.helpers import Context
from test.helpers import slow
from extra.lr_scheduler import OneCycleLR
from test.helpers import derandomize_model
from examples.gpt2 import Transformer as GPT2Transformer, MODEL_PARAMS as GPT2_MODEL_PARAMS
from examples.gpt2 import Transformer as GPT2Transformer
from examples.hlb_cifar10 import SpeedyResNet, hyp
from examples.llama import Transformer as LLaMaTransformer
from examples.stable_diffusion import UNetModel, unet_params
@@ -20,7 +21,7 @@ global_mem_used = 0
def helper_test(nm, gen, model, max_memory_allowed, max_kernels_allowed, all_jitted=False):
with Context(JIT=2):
tms = []
for _ in range(2 if CI else 4):
for _ in range(2):
early_gen = [x.realize() if isinstance(x, Tensor) else x for x in gen()]
GlobalCounters.reset()
Device[Device.DEFAULT].synchronize()
@@ -52,7 +53,7 @@ class TestRealWorld(unittest.TestCase):
def tearDown(self):
dtypes.default_float = self.old_float
@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow, covered by METAL")
@slow
@unittest.skipUnless(is_dtype_supported(dtypes.float16), "need dtypes.float16")
def test_stable_diffusion(self):
params = unet_params
@@ -92,14 +93,14 @@ class TestRealWorld(unittest.TestCase):
dtypes.default_float = dtypes.float16
args_tiny = {"dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-5, "vocab_size": 1000}
model = GPT2Transformer(**(args_tiny if CI else GPT2_MODEL_PARAMS["gpt2-medium"]))
model = GPT2Transformer(**args_tiny)
derandomize_model(model)
@TinyJit
def test(t, v):
with Context(JIT=0): return model(t, v).realize()
helper_test("test_gpt2", lambda: (Tensor([[1,]]),Variable("pos", 1, 100).bind(1)), test, 0.23 if CI else 0.9, 160 if CI else 468, all_jitted=True)
helper_test("test_gpt2", lambda: (Tensor([[1,]]),Variable("pos", 1, 100).bind(1)), test, 0.23, 160, all_jitted=True)
@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
@slow
def test_train_mnist(self):
from examples.beautiful_mnist import Model
with Tensor.train():
@@ -117,7 +118,7 @@ class TestRealWorld(unittest.TestCase):
helper_test("train_mnist", lambda: (Tensor.randn(BS, 1, 28, 28),), train, 0.017, 103)
@unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow")
@slow
def test_forward_cifar(self):
BS = 32
# with training batchnorm still though
@@ -127,7 +128,7 @@ class TestRealWorld(unittest.TestCase):
def run(X): return model(X)
helper_test("forward_cifar", lambda: (Tensor.randn(BS, 3, 32, 32),), run, 0.033, 27)
@unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow")
@slow
def test_train_cifar(self):
with Tensor.train():
model = SpeedyResNet(Tensor.ones((12,3,2,2)))
@@ -157,7 +158,7 @@ class TestRealWorld(unittest.TestCase):
final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=4)
assert not np.isnan(lr_scheduler.min_lr), "lr too small or initial_div_factor too big for half"
@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
@slow
def test_bert(self):
with Tensor.train():
args_tiny = {"attention_probs_dropout_prob": 0.0, "hidden_dropout_prob": 0.0, "vocab_size": 30522, "type_vocab_size": 2,

View File

@@ -3,7 +3,8 @@ import numpy as np
from tinygrad import Device
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim
from tinygrad.helpers import getenv, CI
from tinygrad.helpers import getenv
from test.helpers import slow
from extra.training import train
from extra.models.convnext import ConvNeXt
from extra.models.efficientnet import EfficientNet
@@ -38,7 +39,7 @@ class TestTrain(unittest.TestCase):
train_one_step(model,X,Y)
check_gc()
@unittest.skipIf(CI, "slow")
@slow
def test_efficientnet(self):
model = EfficientNet(0)
X = np.zeros((BS,3,224,224), dtype=np.float32)
@@ -46,7 +47,7 @@ class TestTrain(unittest.TestCase):
train_one_step(model,X,Y)
check_gc()
@unittest.skipIf(CI, "slow")
@slow
def test_vit(self):
model = ViT()
X = np.zeros((BS,3,224,224), dtype=np.float32)
@@ -54,7 +55,7 @@ class TestTrain(unittest.TestCase):
train_one_step(model,X,Y)
check_gc()
@unittest.skipIf(CI, "slow")
@slow
def test_transformer(self):
# this should be small GPT-2, but the param count is wrong
# (real ff_dim is 768*4)
@@ -64,7 +65,7 @@ class TestTrain(unittest.TestCase):
train_one_step(model,X,Y)
check_gc()
@unittest.skipIf(CI, "slow")
@slow
def test_resnet(self):
X = np.zeros((BS, 3, 224, 224), dtype=np.float32)
Y = np.zeros((BS), dtype=np.int32)

View File

@@ -2,7 +2,8 @@ import unittest
import pathlib
from examples.whisper import init_whisper, load_file_waveform, transcribe_file, transcribe_waveform
import examples.mlperf.metrics as metrics
from tinygrad.helpers import CI, fetch, CPU_LLVM
from tinygrad.helpers import fetch
from test.helpers import slow
from tinygrad import Device, dtypes
from tinygrad.device import is_dtype_supported
@@ -75,11 +76,11 @@ class TestWhisper(unittest.TestCase):
def test_transcribe_file1(self):
self.assertEqual(transcribe_file(self.model, self.enc, TEST_FILE_1), TRANSCRIPTION_1)
@unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too many tests for CI")
@slow
def test_transcribe_file2(self):
self.assertEqual(transcribe_file(self.model, self.enc, TEST_FILE_2), TRANSCRIPTION_2)
@unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too many tests for CI")
@slow
def test_transcribe_batch12(self):
waveforms = [load_file_waveform(TEST_FILE_1), load_file_waveform(TEST_FILE_2)]
transcriptions = transcribe_waveform(self.model, self.enc, waveforms)
@@ -95,14 +96,14 @@ class TestWhisper(unittest.TestCase):
self.assertEqual(TRANSCRIPTION_1, transcriptions[1])
@unittest.skip("file 3 url is broken")
@unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too long for CI")
@slow
def test_transcribe_long(self):
waveform = [load_file_waveform(fetch(TEST_FILE_3_URL))]
transcription = transcribe_waveform(self.model, self.enc, waveform)
self.assertWER(transcription, TRANSCRIPTION_3, 0.085)
@unittest.skip("file 3 url is broken")
@unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too long for CI")
@slow
def test_transcribe_long_no_batch(self):
waveforms = [load_file_waveform(fetch(TEST_FILE_3_URL)), load_file_waveform(TEST_FILE_1)]

View File

@@ -7,7 +7,8 @@ from tinygrad.tensor import _to_np_dtype
from tinygrad.uop.ops import Ops
from tinygrad.dtype import DType
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import AMX, CI, AMD_LLVM, CPU_LLVM
from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM
from test.helpers import slow
from tinygrad.engine.realize import CompiledRunner, get_program
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
@@ -119,7 +120,7 @@ class TestTensorCores(unittest.TestCase):
helper_tc_ensure_uops_and_opts_count(tc.dims[0], tc.dims[1], tc.dims[2]//8, tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)
@unittest.skipIf(Device.DEFAULT == "PYTHON", "not generated on EMULATED device")
@unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI is really slow here")
@slow
@unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
def test_tensor_cores_multi_reduce(self):
for tc in Device[Device.DEFAULT].renderer.tensor_cores:

View File

@@ -1,7 +1,7 @@
import unittest
import numpy as np
from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable
from tinygrad.helpers import CI, Context, getenv
from tinygrad.helpers import Context, getenv
from tinygrad.engine.realize import run_schedule
from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
from tinygrad.uop.ops import Ops
@@ -143,7 +143,7 @@ class TestIndexing(unittest.TestCase):
def test_llama_embedding(self, noopt=1, op_limit=65536):
# llama3 is 128256
vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
vocab_size, embed_size = (10, 3)
emb = nn.Embedding(vocab_size, embed_size)
emb_w = emb.weight.numpy()
x = Tensor([1,2,3,4])
@@ -161,7 +161,7 @@ class TestIndexing(unittest.TestCase):
# TODO: reshape to match torch, should we do this in nn?
np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
# at least the arange is being fused
def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1_736_704_000 if CI else 5_898_240_000)
def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1_736_704_000)
if __name__ == "__main__":
unittest.main()

View File

@@ -3,7 +3,7 @@ import functools, unittest, ctypes
from tinygrad.device import Device, Buffer
from tinygrad.tensor import Tensor, _to_np_dtype
from tinygrad.helpers import Context, CI, dedup, from_mv
from tinygrad.helpers import Context, dedup, from_mv
from tinygrad.dtype import dtypes
from tinygrad.engine.jit import MultiGraphRunner
from tinygrad.engine.realize import ExecItem, BufferXfer, get_runner, CompiledRunner
@@ -12,8 +12,8 @@ from test.helpers import needs_second_gpu
np.random.seed(1337)
Tensor.manual_seed(1337)
BUF_SIZE = 4096 if CI else 4096 * 128
RUN_CNT = 4 if CI else 32
BUF_SIZE = 4096
RUN_CNT = 4
cached_prgs = {}
def helper_exec_op(device, outbuf, inbufs):

View File

@@ -2,12 +2,12 @@ import unittest, functools, random
from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes, Variable
from tinygrad.device import is_dtype_supported
from tinygrad.uop.ops import Ops, UOp
from tinygrad.helpers import CI, getenv, prod, Context
from tinygrad.helpers import getenv, prod, Context
from tinygrad.nn.state import get_parameters, get_state_dict
from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
import numpy as np
from hypothesis import given, strategies as strat, settings
from test.helpers import REAL_DEV, not_support_multi_device, needs_second_gpu
from test.helpers import not_support_multi_device, needs_second_gpu, slow
settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
settings.load_profile("my_profile")
@@ -420,7 +420,7 @@ class TestMultiTensor(unittest.TestCase):
np.testing.assert_allclose(y.numpy(), y_shard.numpy(), atol=1e-6, rtol=1e-6)
# NOTE: this is failing on LLVM CI, no idea why. Works locally.
@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "CPU", "AMD"), "slow, and flaky on CPU")
@slow
def test_data_parallel_resnet(self):
from extra.models.resnet import ResNet18
@@ -456,7 +456,7 @@ class TestMultiTensor(unittest.TestCase):
# sometimes there is zeros in these grads... why?
np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)
@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "CPU", "AMD"), "slow, and flaky on CPU")
@slow
@unittest.skip("TODO: pm_rangeify hangs")
def test_data_parallel_resnet_train_step(self):
from extra.models.resnet import ResNet18

View File

@@ -4,14 +4,14 @@ import numpy as np
import torch
from tinygrad import Tensor, Device, TinyJit, dtypes
from tinygrad.uop.ops import Ops
from tinygrad.helpers import GlobalCounters, CI, Context
from tinygrad.helpers import GlobalCounters, Context
from tinygrad.nn import Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, Embedding
from tinygrad.nn import BatchNorm, LayerNorm, LayerNorm2d, GroupNorm, InstanceNorm, RMSNorm, LSTMCell
from tinygrad.nn.state import load_state_dict
from tinygrad.engine.realize import run_schedule
from test.helpers import not_support_multi_device, needs_second_gpu
from test.helpers import not_support_multi_device, needs_second_gpu, slow
@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
@slow
class TestNN(unittest.TestCase):
def test_batchnorm2d(self, training=False, threed=False, track_running_stats=True):
with Tensor.train(training):

View File

@@ -11,8 +11,7 @@ if getenv("TINY_BACKEND"):
import tinygrad.nn.torch # noqa: F401 # pylint: disable=unused-import
torch.set_default_device("tiny")
if CI:
warnings.filterwarnings("ignore", message="Non-empty compiler output encountered")
warnings.filterwarnings("ignore", message="Non-empty compiler output encountered")
FORWARD_ONLY = getenv("FORWARD_ONLY", 0)
PRINT_TENSORS = getenv("PRINT_TENSORS", 0)

View File

@@ -3,9 +3,8 @@ import torch
import unittest
from tinygrad import Tensor, Device, dtypes
from tinygrad.nn.optim import Adam, SGD, AdamW, Muon, LAMB
from tinygrad.helpers import CI
from tinygrad.device import is_dtype_supported
from test.helpers import needs_second_gpu
from test.helpers import needs_second_gpu, slow
np.random.seed(1337)
x_init = np.random.randn(1,4).astype(np.float32)
@@ -42,7 +41,7 @@ def step(tensor, optim, steps=1, teeny=False, **kwargs):
optim.step()
return net.x.detach().numpy(), net.W.detach().numpy()
@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
@slow
class TestOptim(unittest.TestCase):
def setUp(self):
self.old_training = Tensor.training

View File

@@ -1,6 +1,6 @@
import unittest
from tinygrad import Tensor, nn, Device
from tinygrad.helpers import Context, GlobalCounters, CI, getenv, PCONTIG, DEBUG
from tinygrad.helpers import Context, GlobalCounters, getenv, PCONTIG, DEBUG
from tinygrad.uop.ops import graph_rewrite, PatternMatcher, UPat, Ops
from tinygrad.codegen.opt import OptOps, Opt
from tinygrad.renderer.ptx import PTXRenderer
@@ -153,199 +153,6 @@ class TestPcontig(unittest.TestCase):
opts += (Opt(OptOps.UPCAST, 4, 4),)
self.test_flash_attention(opts)
# *** non CI rangeify tests below this line ***
N = 256
@unittest.skipIf(CI, "useless in CI, doesn't test anything")
class TestRangeifyOpt(unittest.TestCase):
def test_randperm(self):
Tensor.randperm(10000).realize()
def test_one_getitem(self):
X = Tensor.empty(10000)
sel = Tensor.arange(1000).contiguous().realize()
Xsel = X[sel]
Tensor.realize(Xsel)
def test_two_getitem(self):
# this is splitting on the child even when it really shouldn't
X = Tensor.empty(10000)
Y = Tensor.empty(10000)
sel = Tensor.arange(1000).contiguous().realize()
Xsel, Ysel = X[sel], Y[sel]
Tensor.realize(Xsel, Ysel)
def test_resnetconv(self):
conv1 = nn.Conv2d(3, 8, kernel_size=7, stride=2, bias=False, padding=3)
conv1.weight.replace(conv1.weight.empty_like())
x = Tensor.empty(1, 3, 56, 56)
x = conv1(x).pad([1,1,1,1])+1
x.realize()
# CPU=1 NOOPT=1 DEBUG=4 RANGEIFY=1 python3 test/test_rangeify.py TestRangeifyOpt.test_matmul_reshaped
def test_matmul_reshaped(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
(A@B).reshape(N*N).contiguous().realize()
def test_reduce_reshapes(self):
A = Tensor.empty(8,8,8,8).permute(1,0,3,2).flatten()
A.sum().realize()
@unittest.skipIf(CI, "useless in CI, doesn't test anything")
class TestRangeify(unittest.TestCase):
def test_groupnorm(self):
# ranges 1 and 3 are merging
x = nn.GroupNorm(32, 128)
x(Tensor.empty(1, 128, 64, 64)).realize()
def test_expand_children(self):
A = Tensor.empty(N, N).sum(axis=1)
ba = A.expand(N, N)
((ba+1).sum(axis=1) + (ba+2).sum(axis=0)).realize()
def test_partial_contig(self):
A = Tensor.empty(64, 64, 64)
ret = A.sum(axis=2).contiguous(arg=(1,)).sum(axis=1)
ret.realize()
@unittest.skip("RANGEIFY=0 does nothing")
def test_double_gemm_real(self):
def go():
with Context(DEBUG=0):
Tensor.manual_seed(1337)
A,B,C = [Tensor.randn(N, N) for _ in range(3)]
Tensor.realize(A, B, C)
GlobalCounters.reset()
return (A@B@C).realize()
rng = go()
with Context(RANGEIFY=0, DEBUG=2):
ref = go()
mse = ((rng-ref)**2).sum().item()
print(f"mse: {mse}")
self.assertLessEqual(mse, 1e-2)
def test_double_gemm(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
(A@B@C).realize()
def test_double_gemm_exp(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
(((A@B).exp()@C).exp()).realize()
def test_double_gemm_exp_child(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
# A@B is used with exp, and also on the sum. this is two kernels now, is this right?
ret = A@B
((ret.exp()@C)+ret).realize()
def test_double_gemm_relu(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
(((A@B).relu()@C).relu()).realize()
def test_double_gemm_relu_half_contig(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
(((A@B).relu().contiguous(arg=(1,))@C).relu()).realize()
def test_double_gemm_half_contig(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
((A@B).contiguous(arg=(1,))@C).realize()
def test_double_gemm_contig(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
((A@B).contiguous()@C).realize()
def test_many_gemm(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
D = Tensor.empty(N, N)
E = Tensor.empty(N, N)
F = Tensor.empty(N, N)
(A@B@C@D@E@F).realize()
def test_conv2d(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
x.conv2d(w1).realize()
def test_conv2d_elu(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
x.conv2d(w1).elu().realize()
def test_conv2d_t(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
(x*2).conv2d(w1).realize()
def test_double_conv2d(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
w2 = Tensor.empty(12, 8, 3, 3)
x.conv2d(w1).conv2d(w2).realize()
def test_resnet_conv2d(self):
x = Tensor.empty(1, 8, 32, 32)
w1 = Tensor.empty(8, 8, 3, 3)
w2 = Tensor.empty(8, 8, 1, 1)
x.conv2d(w1).conv2d(w2).realize()
def test_xception_conv2d(self):
# NOTE: this fusion is bad, it's recomputing the inner many times
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 1, 1)
w2 = Tensor.empty(8, 1, 3, 3)
x.conv2d(w1).conv2d(w2, groups=8).realize()
def test_conv_maxpool_contig(self): self.test_conv_maxpool(True)
def test_conv_maxpool(self, contig=False):
GlobalCounters.reset()
x = Tensor.empty(32, 16, 64, 64)
l1 = nn.Conv2d(16, 16, 3)
for p in nn.state.get_parameters(l1): p.replace(Tensor.empty(p.shape))
x = l1(x)
if contig: x = x.contiguous()
x.max_pool2d().realize()
def test_double_conv2d_half_contig(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
w2 = Tensor.empty(12, 8, 3, 3)
# NOTE: this contiguous doesn't help
x.conv2d(w1).contiguous(arg=(1,)).conv2d(w2).permute(0,2,3,1).contiguous().realize()
def test_double_conv2d_contig(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
w2 = Tensor.empty(12, 8, 3, 3)
x.conv2d(w1).contiguous().conv2d(w2).realize()
def test_transformer_ffn(self):
from tinygrad.apps.llm import TransformerBlock
from tinygrad import nn
blk = TransformerBlock(1024, 4096, 1, 1, 1e-5, head_dim=1024, rope_theta=10000.0)
for p in nn.state.get_parameters(blk): p.replace(Tensor.empty(p.shape))
x = Tensor.empty(128, 1024)
out = blk._feed_forward(x)
out.realize()
# contiguous + reduce can support ranges?
@unittest.skip("pm_rangeify no longer exists. test this in a different way")

View File

@@ -1755,7 +1755,7 @@ class TestSchedule(unittest.TestCase):
@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
def test_precompute_freqs_cis(self):
from extra.models.llama import precompute_freqs_cis
args = {"dim":32 if CI else 128, "end":2048 if CI else 8192, "theta":10000}
args = {"dim":32, "end":2048, "theta":10000}
fused = precompute_freqs_cis(**args)
run_schedule(check_schedule(fused, 1))
if getenv("CHECK", 1):

View File

@@ -3,7 +3,8 @@ import numpy as np
from tinygrad import Tensor, Device, dtypes
from tinygrad.dtype import DType
from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
from tinygrad.helpers import Timing, fetch, temp, CI, OSX
from tinygrad.helpers import Timing, fetch, temp, OSX
from test.helpers import slow
from tinygrad.device import is_dtype_supported
def compare_weights_both(url):
@@ -340,8 +341,8 @@ class TestDiskTensor(unittest.TestCase):
on_dev = t.to(Device.DEFAULT).realize()
np.testing.assert_equal(on_dev.numpy(), t.numpy())
@slow
def test_copy_from_disk_huge(self):
if CI and not hasattr(Device["DISK"], 'io_uring'): self.skipTest("slow on ci without iouring")
fn = pathlib.Path(temp("dt_copy_from_disk_huge"))
fn.unlink(missing_ok=True)

View File

@@ -2,7 +2,8 @@ import unittest, math, operator, subprocess, struct
from tinygrad.tensor import Tensor, dtypes, Device
from tinygrad.dtype import DType, DTYPES_DICT, truncate, float_to_fp16, float_to_bf16, _to_np_dtype, least_upper_dtype, least_upper_float
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import getenv, CI, DEBUG
from tinygrad.helpers import getenv, DEBUG
from test.helpers import slow
from hypothesis import given, settings, strategies as strat
import numpy as np
import torch
@@ -594,7 +595,7 @@ class TestAutoCastType(unittest.TestCase):
dtypes.default_float = old_default_float
@unittest.skipIf(Device.DEFAULT == "PYTHON", "very slow")
@unittest.skipIf(CI and Device.DEFAULT == "AMD", "very slow")
@slow
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "Binding size is larger than the maximum storage buffer binding size")
@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
def test_mean_half_precision_underflow(self):

View File

@@ -1,8 +1,8 @@
from typing_extensions import Callable
import hashlib, random, unittest
from tinygrad import Tensor, Device, getenv, dtypes
from test.helpers import slow
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import CI
from tinygrad.uop.ops import UOp
from tinygrad.engine.jit import TinyJit
@@ -58,7 +58,7 @@ class TestKeccak(unittest.TestCase):
self.assertEqual(bytes(Tensor(b"abc").keccak().tolist()),
bytearray.fromhex("3a985da74fe225b2 045c172d6bd390bd 855f086e3e9d525b 46bfe24511431532"))
@unittest.skipIf(CI, "times out in ci")
@slow
def test_long(self):
data = b"\x00" * 4
self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16))
@@ -74,7 +74,7 @@ class TestKeccak(unittest.TestCase):
self.assertEqual(bytes(out[1].tolist()), bytearray.fromhex("3a985da74fe225b2 045c172d6bd390bd 855f086e3e9d525b 46bfe24511431532"))
self.assertEqual(bytes(out[2].tolist()), bytearray.fromhex("8e0d8f672252acb0 ffc5093db8653b18 1513bf9a2097e737 b4f73533dcaf46df"))
@unittest.skipIf(CI, "redundant with test_variable_bs")
@slow
def test_variable_bs_jit(self):
def f(data):
return data.keccak()

View File

@@ -1,6 +1,6 @@
import ctypes, gzip, unittest, timeit
from tinygrad import Variable
from tinygrad.helpers import Context, ContextVar, argfix, colored, word_wrap, is_numpy_ndarray, CI, mv_address, get_contraction
from tinygrad.helpers import Context, ContextVar, argfix, colored, word_wrap, is_numpy_ndarray, mv_address, get_contraction
from tinygrad.helpers import merge_dicts, strip_parens, prod, round_up, fetch, fully_flatten, from_mv, to_mv, polyN, time_to_str, cdiv, cmod, getbits
from tinygrad.tensor import Tensor, get_shape
import numpy as np
@@ -198,7 +198,7 @@ class TestMemoryview(unittest.TestCase):
mv[0] = 2
assert base[0] == 2
@unittest.skipIf(CI, "dangerous for CI, it allocates tons of memory")
@unittest.skip("allocates tons of memory")
def test_to_mv(self):
sizes = [
(16, "16 B"),

View File

@@ -5,7 +5,8 @@ import numpy as np
from tinygrad import Tensor, dtypes, Device, TinyJit
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import CI, all_same, prod
from tinygrad.helpers import all_same, prod
from test.helpers import slow
random.seed(42)
@@ -1140,7 +1141,7 @@ def get_set_tensor(indexed: Tensor, indexer):
set_tensor = Tensor.randint(set_count, high=set_count).reshape(set_size) #.cast(dtypes.float64)
return set_tensor
@unittest.skipIf(CI and Device.DEFAULT in ["CPU", "CL", "METAL", "NV", "AMD"], "slow")
@slow
class TestAdvancedIndexing(unittest.TestCase):
def test_integer_array_indexing(self):
# pick a random valid indexer type

View File

@@ -20,7 +20,7 @@ class TestRawShmBuffer(unittest.TestCase):
assert np.allclose(t.numpy(), t2.numpy())
s.unlink()
@unittest.skipIf(CI, "CI doesn't like big shared memory")
@unittest.skip("big shared memory")
def test_e2e_big(self):
# bigger than this doesn't work on Linux, maybe this is a limit somewhere?
t = Tensor.randn(2048, 128, 8).realize()