add needs_second_gpu decorator (#13543)

* add needs_second_gpu decorator

* more skips

* two more fixes
George Hotz
2025-12-02 19:08:23 -08:00
committed by GitHub
parent 0d55aec605
commit 6bd355fa26
10 changed files with 52 additions and 8 deletions

View File

@@ -1,4 +1,4 @@
-import time, struct
+import time, struct, functools
 from typing import Any, Callable
 import numpy as np
 from tinygrad import Tensor, dtypes, Device
@@ -60,5 +60,14 @@ def not_support_multi_device():
   # CL and CUDA don't support multi device if in CI
   return CI and REAL_DEV in ("CL", "CUDA")
+def needs_second_gpu(fn):
+  @functools.wraps(fn)
+  def wrapper(self, *args, **kwargs):
+    # check if there's a second GPU, if not, skip multi tests
+    try: Tensor.zeros(10, device=f"{Device.DEFAULT}:1").contiguous().realize()
+    except Exception as e: self.skipTest(f"second device not available: {e}")
+    return fn(self, *args, **kwargs)
+  return wrapper
 # NOTE: This will open REMOTE if it's the default device
 REAL_DEV = (Device.DEFAULT if Device.DEFAULT != "REMOTE" else Device['REMOTE'].properties.real_device)
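
For reference, the decorator wraps individual test methods (or setUp, as the hunks below do). A minimal usage sketch, assuming a tinygrad checkout with this helper on the path; the test class and method names here are hypothetical, not from this commit:

import unittest
from tinygrad import Tensor, Device
from test.helpers import needs_second_gpu

class TestSecondDeviceExample(unittest.TestCase):  # hypothetical example class
  @needs_second_gpu
  def test_add_on_second_device(self):
    # runs only when f"{Device.DEFAULT}:1" exists; otherwise the wrapper calls skipTest
    a = Tensor.ones(4, device=f"{Device.DEFAULT}:1").contiguous().realize()
    self.assertEqual((a + 1).tolist(), [2.0, 2.0, 2.0, 2.0])

If the probe allocation in the wrapper raises, the test is reported as skipped rather than failed, which is the point of the decorator.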

View File

@@ -113,7 +113,7 @@ class TestEnd2End(unittest.TestCase):
   def test_bn_linear(self):
     BS, K = 2, 1
-    eps = 0
+    eps = 1e-12 # torch asserts if this is 0
     X = Tensor([1,0]).reshape(BS, K, 1, 1)
     Y = Tensor([-1,0]).reshape(BS, K, 1, 1)
     class LinTiny:

View File

@@ -8,6 +8,8 @@ from tinygrad.dtype import dtypes
 from tinygrad.engine.jit import MultiGraphRunner
 from tinygrad.engine.realize import ExecItem, BufferXfer, get_runner, CompiledRunner
+from test.helpers import needs_second_gpu
 np.random.seed(1337)
 Tensor.manual_seed(1337)
 BUF_SIZE = 4096 if CI else 4096 * 128
@@ -154,6 +156,7 @@ class TestGraph(unittest.TestCase):
     helper_test_graphs(Device[d0].graph, graphs)
+  @needs_second_gpu
   def test_copies_2_devs(self):
     self.skip_if_not_multigraph()
@@ -167,6 +170,7 @@ class TestGraph(unittest.TestCase):
     helper_test_graphs(Device[d0].graph, graphs)
+  @needs_second_gpu
   def test_copies_after_graph_global(self):
     self.skip_if_not_multigraph()
@@ -215,6 +219,7 @@ class TestGraph(unittest.TestCase):
     helper_test_graphs(Device[d0].graph, graphs)
+  @needs_second_gpu
   def test_graph_after_copies_devs(self):
     self.skip_if_not_multigraph()

View File

@@ -3,7 +3,7 @@ import unittest, functools
 import numpy as np
 from hypothesis import given, settings, strategies as strat
-from test.helpers import assert_jit_cache_len, not_support_multi_device, REAL_DEV
+from test.helpers import assert_jit_cache_len, not_support_multi_device, REAL_DEV, needs_second_gpu
 from tinygrad.tensor import Tensor
 from tinygrad.engine.jit import TinyJit, GraphRunner, MultiGraphRunner, graph_class
 from tinygrad.engine.realize import CompiledRunner, BufferCopy, BufferXfer
@@ -439,6 +439,7 @@ class TestJit(unittest.TestCase):
     ja = jf(a)
     np.testing.assert_allclose(a.numpy(), ja.numpy(), atol=1e-4, rtol=1e-5)
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_jitted_transfers(self):
     d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1"
@@ -472,6 +473,7 @@ class TestJit(unittest.TestCase):
     np.testing.assert_allclose((a.numpy()+b.numpy()), zc.numpy(), atol=1e-4, rtol=1e-5)
     np.testing.assert_allclose((a.numpy()*b.numpy()), wc.numpy(), atol=1e-4, rtol=1e-5)
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_jitted_view(self):
     d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1"

View File

@@ -7,7 +7,7 @@ from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
 import numpy as np
 from hypothesis import given, strategies as strat, settings
-from test.helpers import REAL_DEV, not_support_multi_device
+from test.helpers import REAL_DEV, not_support_multi_device, needs_second_gpu
 settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
 settings.load_profile("my_profile")
@@ -35,6 +35,9 @@ def _test_allreduce(t:Tensor):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestMultiTensor(unittest.TestCase):
+  @needs_second_gpu
+  def setUp(self): pass
   def test_to(self):
     X = Tensor.ones(256).contiguous().realize()
     X.to_(devices_2)
@@ -827,6 +830,7 @@ class TestMultiTensor(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestHandleData(unittest.TestCase):
+  @needs_second_gpu
   def test_copied_to_device(self):
     device = (d0, d1, d2, d3)
     t = Tensor([1, 2, 3, 4]).shard(device).realize()
@@ -851,6 +855,9 @@ class TestHandleData(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestShrinkMultiTensorShardedAxis(unittest.TestCase):
+  @needs_second_gpu
+  def setUp(self): pass
   # shrink a multitensor on sharded axis
   def test_shrink_bad_args(self):
     t = Tensor.arange(64).reshape(8, 8).contiguous().realize()
@@ -972,6 +979,9 @@ class TestShrinkMultiTensorShardedAxis(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestBatchNorm(unittest.TestCase):
+  @needs_second_gpu
+  def setUp(self): pass
   def test_unsynced_backprop_conv_bn(self):
     with Tensor.train():
       from extra.lr_scheduler import OneCycleLR
@@ -1126,9 +1136,11 @@ def helper_test_shard_op(shps, fxn, atol=1e-6, rtol=1e-3):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestTensorOps(unittest.TestCase):
+  @needs_second_gpu
   def test_interpolate(self):
     helper_test_shard_op([(4,16,16),(4,24,24)], lambda x: Tensor.interpolate(x, (19,19)))
+  @needs_second_gpu
   def test_bitcast(self):
     helper_test_shard_op([(256,), (256,)], lambda x: x.bitcast(dtypes.int))
@@ -1171,6 +1183,7 @@ class TestMultiRamUsage(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "need multi")
 class TestMultiFromUnrenderable(unittest.TestCase):
+  @needs_second_gpu
   def test_from_npy(self):
     t = Tensor(np.arange(100, dtype=np.uint32))
     ll = t.shard((d0, d1), axis=0) + 1
@@ -1180,6 +1193,9 @@ class TestMultiFromUnrenderable(unittest.TestCase):
 class TestMultiAssign(unittest.TestCase):
   device = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))
+  @needs_second_gpu
+  def setUp(self): pass
   def test_multi_assign_realized(self):
     out = Tensor.zeros(4).shard(self.device, 0).contiguous().realize()
     ones = Tensor.ones(4).shard(self.device, 0).contiguous().realize()
@@ -1242,6 +1258,7 @@ class TestMultiAssign(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "need multi")
 class TestMultiTransformer(unittest.TestCase):
+  @needs_second_gpu
   def test_transformer(self):
     device = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))
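
A note on the setUp pattern used throughout this file: unittest runs setUp before every test, so decorating a no-op setUp with @needs_second_gpu makes the probe run once per test and skip it when no second device exists, without touching each test method. Written out by hand, the decorated setUp behaves roughly like this sketch (same logic as the helper above, not new code in this commit):

class TestMultiTensor(unittest.TestCase):
  def setUp(self):
    # equivalent of @needs_second_gpu applied to a pass-body setUp
    try: Tensor.zeros(10, device=f"{Device.DEFAULT}:1").contiguous().realize()
    except Exception as e: self.skipTest(f"second device not available: {e}")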

View File

@@ -9,7 +9,7 @@ from tinygrad.nn import Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear
 from tinygrad.nn import BatchNorm, LayerNorm, LayerNorm2d, GroupNorm, InstanceNorm, RMSNorm, LSTMCell
 from tinygrad.nn.state import load_state_dict
 from tinygrad.engine.realize import run_schedule
-from test.helpers import not_support_multi_device
+from test.helpers import not_support_multi_device, needs_second_gpu
 @unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
 class TestNN(unittest.TestCase):
@@ -481,6 +481,7 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(layer.weight.numpy(), state_dict['weight'].numpy())
     np.testing.assert_allclose(layer.bias.numpy(), state_dict['bias'].numpy())
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_load_state_dict_sharded_model(self):
     devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3")
@@ -519,6 +520,7 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(layer.weight.numpy(), state_dict['weight'].numpy())
     np.testing.assert_allclose(layer.bias.numpy(), state_dict['bias'].numpy())
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_load_state_dict_sharded_model_dict_same_axis(self):
     devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3")

View File

@@ -7,7 +7,7 @@ from tinygrad.device import is_dtype_supported
 from tinygrad.engine.realize import lower_schedule, CompiledRunner
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.nir import NIRRenderer
-from test.helpers import not_support_multi_device
+from test.helpers import not_support_multi_device, needs_second_gpu
 import numpy as np
 import torch
@@ -141,6 +141,7 @@ class TestRandomness(unittest.TestCase):
     r = Tensor.rand(10).numpy()
     np.testing.assert_allclose(r, jr, atol=1e-5, rtol=1e-5)
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_threefry_tensors_cnt(self):
     Tensor.manual_seed(1337)
@@ -160,6 +161,7 @@ class TestRandomness(unittest.TestCase):
     assert len(Tensor._device_rng_counters) == 0
     assert len(Tensor._device_seeds) == 0
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_threefry_same_kernels(self):
     Tensor.manual_seed(0)

View File

@@ -2,7 +2,7 @@ import unittest
 from tinygrad import Device, dtypes, Tensor
 from tinygrad.device import Buffer
 from tinygrad.helpers import Context
-from test.helpers import REAL_DEV
+from test.helpers import REAL_DEV, needs_second_gpu
 @unittest.skipUnless(hasattr(Device[Device.DEFAULT].allocator, "_offset"), "subbuffer not supported")
 class TestSubBuffer(unittest.TestCase):
@@ -41,6 +41,7 @@ class TestSubBuffer(unittest.TestCase):
     out = (vt + 100).tolist()
     assert out == [102, 103]
+  @needs_second_gpu
   @unittest.skipIf(REAL_DEV not in {"CUDA", "NV", "AMD"}, "only NV, AMD, CUDA")
   def test_subbuffer_transfer(self):
     t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()

View File

@@ -12,6 +12,11 @@ from extra.thunder.tiny.tk.tiles import ST_16X32, RT_16X32, RT_16X16, TileLayout
 @unittest.skipIf(CI or Device.DEFAULT not in ["AMD"], "only amd")
 class TestTK(unittest.TestCase):
+  def setUp(self):
+    arch = Device["AMD"].arch
+    if not arch.startswith("gfx9"):
+      self.skipTest(f"arch {arch} not supported")
   @unittest.skipIf(CI, "no wmma in ci")
   def test_simple_matmul(self):
     N = 8192
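
The arch check in TestTK.setUp follows the same skip-from-setUp idea as needs_second_gpu. If more gfx9-only tests appear, it could be factored into a decorator in the same style; a hypothetical sketch, not part of this commit:

import functools
from tinygrad import Device

def needs_gfx9(fn):  # hypothetical helper, mirrors needs_second_gpu
  @functools.wraps(fn)
  def wrapper(self, *args, **kwargs):
    arch = Device["AMD"].arch
    if not arch.startswith("gfx9"): self.skipTest(f"arch {arch} not supported")
    return fn(self, *args, **kwargs)
  return wrapper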

View File

@@ -22,7 +22,8 @@ class TestRawShmBuffer(unittest.TestCase):
   @unittest.skipIf(CI, "CI doesn't like big shared memory")
   def test_e2e_big(self):
-    t = Tensor.randn(2048, 2048, 8).realize()
+    # bigger than this doesn't work on Linux, maybe this is a limit somewhere?
+    t = Tensor.randn(2048, 128, 8).realize()
     # copy to shm
     shm_name = (s := shared_memory.SharedMemory(create=True, size=t.nbytes())).name
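
On the "maybe this is a limit somewhere?" question: on Linux, Python's shared_memory objects are typically backed by the /dev/shm tmpfs, so one quick check is how much space that mount has versus the tensor size. A small hedged probe (assumes Linux and the usual /dev/shm backing; not part of this commit):

import shutil
from tinygrad import Tensor

t = Tensor.randn(2048, 128, 8).realize()
free = shutil.disk_usage("/dev/shm").free  # tmpfs that backs POSIX shared memory on most Linux setups
print(f"tensor needs {t.nbytes()} bytes, /dev/shm has {free} bytes free")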