add needs_second_gpu decorator (#13543)

* add needs_second_gpu decorator

* more skips

* two more fixes
George Hotz
2025-12-02 19:08:23 -08:00
committed by GitHub
parent 0d55aec605
commit 6bd355fa26
10 changed files with 52 additions and 8 deletions

View File

@@ -1,4 +1,4 @@
-import time, struct
+import time, struct, functools
 from typing import Any, Callable
 import numpy as np
 from tinygrad import Tensor, dtypes, Device
@@ -60,5 +60,14 @@ def not_support_multi_device():
   # CL and CUDA don't support multi device if in CI
   return CI and REAL_DEV in ("CL", "CUDA")
+def needs_second_gpu(fn):
+  @functools.wraps(fn)
+  def wrapper(self, *args, **kwargs):
+    # check if there's a second GPU, if not, skip multi tests
+    try: Tensor.zeros(10, device=f"{Device.DEFAULT}:1").contiguous().realize()
+    except Exception as e: self.skipTest(f"second device not available: {e}")
+    return fn(self, *args, **kwargs)
+  return wrapper
 # NOTE: This will open REMOTE if it's the default device
 REAL_DEV = (Device.DEFAULT if Device.DEFAULT != "REMOTE" else Device['REMOTE'].properties.real_device)
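
For reference, the decorator wraps individual test methods (or setUp, as the hunks below do). A minimal usage sketch, assuming a tinygrad checkout with this helper on the path; the test class and method names here are hypothetical, not from this commit:

import unittest
from tinygrad import Tensor, Device
from test.helpers import needs_second_gpu

class TestSecondDeviceExample(unittest.TestCase):  # hypothetical example class
  @needs_second_gpu
  def test_add_on_second_device(self):
    # runs only when f"{Device.DEFAULT}:1" exists; otherwise the wrapper calls skipTest
    a = Tensor.ones(4, device=f"{Device.DEFAULT}:1").contiguous().realize()
    self.assertEqual((a + 1).tolist(), [2.0, 2.0, 2.0, 2.0])

If the probe allocation in the wrapper raises, the test is reported as skipped rather than failed, which is the point of the decorator.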

View File

@@ -113,7 +113,7 @@ class TestEnd2End(unittest.TestCase):
   def test_bn_linear(self):
     BS, K = 2, 1
-    eps = 0
+    eps = 1e-12 # torch asserts if this is 0
     X = Tensor([1,0]).reshape(BS, K, 1, 1)
     Y = Tensor([-1,0]).reshape(BS, K, 1, 1)
     class LinTiny:

View File

@@ -8,6 +8,8 @@ from tinygrad.dtype import dtypes
 from tinygrad.engine.jit import MultiGraphRunner
 from tinygrad.engine.realize import ExecItem, BufferXfer, get_runner, CompiledRunner
+from test.helpers import needs_second_gpu
 np.random.seed(1337)
 Tensor.manual_seed(1337)
 BUF_SIZE = 4096 if CI else 4096 * 128
@@ -154,6 +156,7 @@ class TestGraph(unittest.TestCase):
     helper_test_graphs(Device[d0].graph, graphs)
+  @needs_second_gpu
   def test_copies_2_devs(self):
     self.skip_if_not_multigraph()
@@ -167,6 +170,7 @@ class TestGraph(unittest.TestCase):
     helper_test_graphs(Device[d0].graph, graphs)
+  @needs_second_gpu
   def test_copies_after_graph_global(self):
     self.skip_if_not_multigraph()
@@ -215,6 +219,7 @@ class TestGraph(unittest.TestCase):
     helper_test_graphs(Device[d0].graph, graphs)
+  @needs_second_gpu
   def test_graph_after_copies_devs(self):
     self.skip_if_not_multigraph()

View File

@@ -3,7 +3,7 @@ import unittest, functools
 import numpy as np
 from hypothesis import given, settings, strategies as strat
-from test.helpers import assert_jit_cache_len, not_support_multi_device, REAL_DEV
+from test.helpers import assert_jit_cache_len, not_support_multi_device, REAL_DEV, needs_second_gpu
 from tinygrad.tensor import Tensor
 from tinygrad.engine.jit import TinyJit, GraphRunner, MultiGraphRunner, graph_class
 from tinygrad.engine.realize import CompiledRunner, BufferCopy, BufferXfer
@@ -439,6 +439,7 @@ class TestJit(unittest.TestCase):
     ja = jf(a)
     np.testing.assert_allclose(a.numpy(), ja.numpy(), atol=1e-4, rtol=1e-5)
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_jitted_transfers(self):
     d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1"
@@ -472,6 +473,7 @@ class TestJit(unittest.TestCase):
     np.testing.assert_allclose((a.numpy()+b.numpy()), zc.numpy(), atol=1e-4, rtol=1e-5)
     np.testing.assert_allclose((a.numpy()*b.numpy()), wc.numpy(), atol=1e-4, rtol=1e-5)
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_jitted_view(self):
     d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1"

View File

@@ -7,7 +7,7 @@ from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
 import numpy as np
 from hypothesis import given, strategies as strat, settings
-from test.helpers import REAL_DEV, not_support_multi_device
+from test.helpers import REAL_DEV, not_support_multi_device, needs_second_gpu
 settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
 settings.load_profile("my_profile")
@@ -35,6 +35,9 @@ def _test_allreduce(t:Tensor):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestMultiTensor(unittest.TestCase):
+  @needs_second_gpu
+  def setUp(self): pass
   def test_to(self):
     X = Tensor.ones(256).contiguous().realize()
     X.to_(devices_2)
@@ -827,6 +830,7 @@ class TestMultiTensor(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestHandleData(unittest.TestCase):
+  @needs_second_gpu
   def test_copied_to_device(self):
     device = (d0, d1, d2, d3)
     t = Tensor([1, 2, 3, 4]).shard(device).realize()
@@ -851,6 +855,9 @@ class TestHandleData(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestShrinkMultiTensorShardedAxis(unittest.TestCase):
+  @needs_second_gpu
+  def setUp(self): pass
   # shrink a multitensor on sharded axis
   def test_shrink_bad_args(self):
     t = Tensor.arange(64).reshape(8, 8).contiguous().realize()
@@ -972,6 +979,9 @@ class TestShrinkMultiTensorShardedAxis(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestBatchNorm(unittest.TestCase):
+  @needs_second_gpu
+  def setUp(self): pass
   def test_unsynced_backprop_conv_bn(self):
     with Tensor.train():
       from extra.lr_scheduler import OneCycleLR
@@ -1126,9 +1136,11 @@ def helper_test_shard_op(shps, fxn, atol=1e-6, rtol=1e-3):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestTensorOps(unittest.TestCase):
+  @needs_second_gpu
   def test_interpolate(self):
     helper_test_shard_op([(4,16,16),(4,24,24)], lambda x: Tensor.interpolate(x, (19,19)))
+  @needs_second_gpu
   def test_bitcast(self):
     helper_test_shard_op([(256,), (256,)], lambda x: x.bitcast(dtypes.int))
@@ -1171,6 +1183,7 @@ class TestMultiRamUsage(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "need multi")
 class TestMultiFromUnrenderable(unittest.TestCase):
+  @needs_second_gpu
   def test_from_npy(self):
     t = Tensor(np.arange(100, dtype=np.uint32))
     ll = t.shard((d0, d1), axis=0) + 1
@@ -1180,6 +1193,9 @@ class TestMultiFromUnrenderable(unittest.TestCase):
 class TestMultiAssign(unittest.TestCase):
   device = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))
+  @needs_second_gpu
+  def setUp(self): pass
   def test_multi_assign_realized(self):
     out = Tensor.zeros(4).shard(self.device, 0).contiguous().realize()
     ones = Tensor.ones(4).shard(self.device, 0).contiguous().realize()
@@ -1242,6 +1258,7 @@ class TestMultiAssign(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "need multi")
 class TestMultiTransformer(unittest.TestCase):
+  @needs_second_gpu
   def test_transformer(self):
     device = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))
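
A note on the setUp pattern used throughout this file: unittest runs setUp before every test, so decorating a no-op setUp with @needs_second_gpu makes the probe run once per test and skip it when no second device exists, without touching each test method. Written out by hand, the decorated setUp behaves roughly like this sketch (same logic as the helper above, not new code in this commit):

class TestMultiTensor(unittest.TestCase):
  def setUp(self):
    # equivalent of @needs_second_gpu applied to a pass-body setUp
    try: Tensor.zeros(10, device=f"{Device.DEFAULT}:1").contiguous().realize()
    except Exception as e: self.skipTest(f"second device not available: {e}")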

View File

@@ -9,7 +9,7 @@ from tinygrad.nn import Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear
 from tinygrad.nn import BatchNorm, LayerNorm, LayerNorm2d, GroupNorm, InstanceNorm, RMSNorm, LSTMCell
 from tinygrad.nn.state import load_state_dict
 from tinygrad.engine.realize import run_schedule
-from test.helpers import not_support_multi_device
+from test.helpers import not_support_multi_device, needs_second_gpu
 @unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
 class TestNN(unittest.TestCase):
@@ -481,6 +481,7 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(layer.weight.numpy(), state_dict['weight'].numpy())
     np.testing.assert_allclose(layer.bias.numpy(), state_dict['bias'].numpy())
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_load_state_dict_sharded_model(self):
     devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3")
@@ -519,6 +520,7 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(layer.weight.numpy(), state_dict['weight'].numpy())
     np.testing.assert_allclose(layer.bias.numpy(), state_dict['bias'].numpy())
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_load_state_dict_sharded_model_dict_same_axis(self):
     devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3")

View File

@@ -7,7 +7,7 @@ from tinygrad.device import is_dtype_supported
 from tinygrad.engine.realize import lower_schedule, CompiledRunner
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.nir import NIRRenderer
-from test.helpers import not_support_multi_device
+from test.helpers import not_support_multi_device, needs_second_gpu
 import numpy as np
 import torch
@@ -141,6 +141,7 @@ class TestRandomness(unittest.TestCase):
     r = Tensor.rand(10).numpy()
     np.testing.assert_allclose(r, jr, atol=1e-5, rtol=1e-5)
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_threefry_tensors_cnt(self):
     Tensor.manual_seed(1337)
@@ -160,6 +161,7 @@ class TestRandomness(unittest.TestCase):
     assert len(Tensor._device_rng_counters) == 0
     assert len(Tensor._device_seeds) == 0
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_threefry_same_kernels(self):
     Tensor.manual_seed(0)

View File

@@ -2,7 +2,7 @@ import unittest
 from tinygrad import Device, dtypes, Tensor
 from tinygrad.device import Buffer
 from tinygrad.helpers import Context
-from test.helpers import REAL_DEV
+from test.helpers import REAL_DEV, needs_second_gpu
 @unittest.skipUnless(hasattr(Device[Device.DEFAULT].allocator, "_offset"), "subbuffer not supported")
 class TestSubBuffer(unittest.TestCase):
@@ -41,6 +41,7 @@ class TestSubBuffer(unittest.TestCase):
     out = (vt + 100).tolist()
     assert out == [102, 103]
+  @needs_second_gpu
   @unittest.skipIf(REAL_DEV not in {"CUDA", "NV", "AMD"}, "only NV, AMD, CUDA")
   def test_subbuffer_transfer(self):
     t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()

View File

@@ -12,6 +12,11 @@ from extra.thunder.tiny.tk.tiles import ST_16X32, RT_16X32, RT_16X16, TileLayout
 @unittest.skipIf(CI or Device.DEFAULT not in ["AMD"], "only amd")
 class TestTK(unittest.TestCase):
+  def setUp(self):
+    arch = Device["AMD"].arch
+    if not arch.startswith("gfx9"):
+      self.skipTest(f"arch {arch} not supported")
   @unittest.skipIf(CI, "no wmma in ci")
   def test_simple_matmul(self):
     N = 8192
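
The arch check in TestTK.setUp follows the same skip-from-setUp idea as needs_second_gpu. If more gfx9-only tests appear, it could be factored into a decorator in the same style; a hypothetical sketch, not part of this commit:

import functools
from tinygrad import Device

def needs_gfx9(fn):  # hypothetical helper, mirrors needs_second_gpu
  @functools.wraps(fn)
  def wrapper(self, *args, **kwargs):
    arch = Device["AMD"].arch
    if not arch.startswith("gfx9"): self.skipTest(f"arch {arch} not supported")
    return fn(self, *args, **kwargs)
  return wrapper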

View File

@@ -22,7 +22,8 @@ class TestRawShmBuffer(unittest.TestCase):
   @unittest.skipIf(CI, "CI doesn't like big shared memory")
   def test_e2e_big(self):
-    t = Tensor.randn(2048, 2048, 8).realize()
+    # bigger than this doesn't work on Linux, maybe this is a limit somewhere?
+    t = Tensor.randn(2048, 128, 8).realize()
     # copy to shm
     shm_name = (s := shared_memory.SharedMemory(create=True, size=t.nbytes())).name
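
On the "maybe this is a limit somewhere?" question: on Linux, Python's shared_memory objects are typically backed by the /dev/shm tmpfs, so one quick check is how much space that mount has versus the tensor size. A small hedged probe (assumes Linux and the usual /dev/shm backing; not part of this commit):

import shutil
from tinygrad import Tensor

t = Tensor.randn(2048, 128, 8).realize()
free = shutil.disk_usage("/dev/shm").free  # tmpfs that backs POSIX shared memory on most Linux setups
print(f"tensor needs {t.nbytes()} bytes, /dev/shm has {free} bytes free")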