Use REAL_DEV for test skips (#10420)

This should fix remote cpu tests flakiness (segfaults were in `test_data_parallel_resnet_train_step` which is skipped on cpu but wasn't skipped on remote cpu)
2026-01-09 06:58:11 -05:00 · 2025-05-20 05:32:14 +05:00
parent 9a199ccd81
commit ec9955c956
2 changed files with 9 additions and 8 deletions
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -8,7 +8,7 @@ from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner,
 import numpy as np
 from hypothesis import given, strategies as strat, settings
 from tinygrad.device import is_dtype_supported
-from test.helpers import not_support_multi_device
+from test.helpers import REAL_DEV, not_support_multi_device

 settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
 settings.load_profile("my_profile")
@@ -358,8 +358,8 @@ class TestMultiTensor(unittest.TestCase):
    np.testing.assert_allclose(y.numpy(), y_shard.numpy(), atol=1e-6, rtol=1e-6)

  # NOTE: this is failing on LLVM CI, no idea why. Works locally.
-  @unittest.skipIf(CI and Device.DEFAULT in ("CUDA", "NV", "LLVM"), "slow")
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
+  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM"), "slow")
+  @unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_data_parallel_resnet(self):
    from extra.models.resnet import ResNet18

@@ -395,8 +395,8 @@ class TestMultiTensor(unittest.TestCase):
    # sometimes there is zeros in these grads... why?
    np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)

-  @unittest.skipIf(CI and Device.DEFAULT in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
+  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
+  @unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_data_parallel_resnet_train_step(self):
    from extra.models.resnet import ResNet18
    fake_image = Tensor.rand((2, 3, 224//8, 224//8))
@@ -983,7 +983,7 @@ class TestShrinkMultiTensorShardedAxis(unittest.TestCase):
    np.testing.assert_allclose(output.numpy(), expected)

@unittest.skipIf(not_support_multi_device(), "no multi")
-@unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
+@unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
 class TestBatchNorm(unittest.TestCase):
  def test_unsynced_backprop_conv_bn(self):
    with Tensor.train():
@@ -1011,7 +1011,7 @@ class TestBatchNorm(unittest.TestCase):
      optim.step()
      out.numpy()

-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
+  @unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_unsynced_backprop_standalone_bn(self):
    from extra.lr_scheduler import OneCycleLR
    GPUS = (d1, d2)
--- a/test/test_subbuffer.py
+++ b/test/test_subbuffer.py
@@ -2,6 +2,7 @@ import unittest
 from tinygrad import Device, dtypes, Tensor
 from tinygrad.device import Buffer
 from tinygrad.helpers import Context
+from test.helpers import REAL_DEV

@unittest.skipUnless(hasattr(Device[Device.DEFAULT].allocator, "_offset"), "subbuffer not supported")
 class TestSubBuffer(unittest.TestCase):
@@ -39,7 +40,7 @@ class TestSubBuffer(unittest.TestCase):
    out = (vt + 100).tolist()
    assert out == [102, 103]

-  @unittest.skipIf(Device.DEFAULT not in {"CUDA", "NV", "AMD"}, "only NV, AMD, CUDA")
+  @unittest.skipIf(REAL_DEV not in {"CUDA", "NV", "AMD"}, "only NV, AMD, CUDA")
  def test_subbuffer_transfer(self):
    t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()
    vt = t[2:5].contiguous().realize()