add needs_second_gpu decorator (#13543)

* add needs_second_gpu decorator
* more skips
* two more fixes

Author: George Hotz (committed by GitHub)
Date: 2025-12-02 19:08:23 -08:00
Parent: 0d55aec605
Commit: 6bd355fa26

10 changed files with 52 additions and 8 deletions

@@ -1,4 +1,4 @@
-import time, struct
+import time, struct, functools
 from typing import Any, Callable
 import numpy as np
 from tinygrad import Tensor, dtypes, Device
@@ -60,5 +60,14 @@ def not_support_multi_device():
   # CL and CUDA don't support multi device if in CI
   return CI and REAL_DEV in ("CL", "CUDA")
 
+def needs_second_gpu(fn):
+  @functools.wraps(fn)
+  def wrapper(self, *args, **kwargs):
+    # check if there's a second GPU, if not, skip multi tests
+    try: Tensor.zeros(10, device=f"{Device.DEFAULT}:1").contiguous().realize()
+    except Exception as e: self.skipTest(f"second device not available: {e}")
+    return fn(self, *args, **kwargs)
+  return wrapper
+
 # NOTE: This will open REMOTE if it's the default device
 REAL_DEV = (Device.DEFAULT if Device.DEFAULT != "REMOTE" else Device['REMOTE'].properties.real_device)
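
As a usage reference, here is a minimal sketch of how a test opts into the new decorator; the class and test names below are hypothetical, not part of this commit. The decorator realizes a tiny tensor on f"{Device.DEFAULT}:1" and skips the test when that fails, so the body only runs when a second device instance exists.

# hypothetical usage sketch (not from the tinygrad test suite)
import unittest
from tinygrad import Tensor, Device
from test.helpers import needs_second_gpu

class TestTwoDevices(unittest.TestCase):
  @needs_second_gpu
  def test_copy_to_second_device(self):
    a = Tensor.arange(4, device=f"{Device.DEFAULT}:0").realize()
    b = a.to(f"{Device.DEFAULT}:1").realize()     # cross-device copy
    self.assertEqual(b.tolist(), [0, 1, 2, 3])

if __name__ == "__main__":
  unittest.main()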

@@ -113,7 +113,7 @@ class TestEnd2End(unittest.TestCase):
   def test_bn_linear(self):
     BS, K = 2, 1
-    eps = 0
+    eps = 1e-12  # torch asserts if this is 0
     X = Tensor([1,0]).reshape(BS, K, 1, 1)
     Y = Tensor([-1,0]).reshape(BS, K, 1, 1)
     class LinTiny:
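
For context on the eps change: in the standard batch norm formula, eps is added to the variance under the square root, so a tiny positive value barely perturbs the result while avoiding the zero-eps case the diff comment says torch rejects. A small numpy reference sketch (not code from this commit; the shapes mirror the test's BS=2, K=1 inputs):

# reference batch norm math: y = (x - mean) / sqrt(var + eps) * gamma + beta
import numpy as np

def batchnorm_ref(x, gamma=1.0, beta=0.0, eps=1e-12):
  mean, var = x.mean(axis=0), x.var(axis=0)
  return (x - mean) / np.sqrt(var + eps) * gamma + beta

x = np.array([[1.0], [0.0]])   # BS=2, K=1, like the test inputs
print(batchnorm_ref(x))        # ~[[1.], [-1.]]; var is 0.25, so eps=1e-12 is negligible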

@@ -8,6 +8,8 @@ from tinygrad.dtype import dtypes
 from tinygrad.engine.jit import MultiGraphRunner
 from tinygrad.engine.realize import ExecItem, BufferXfer, get_runner, CompiledRunner
+from test.helpers import needs_second_gpu
+
 np.random.seed(1337)
 Tensor.manual_seed(1337)
 BUF_SIZE = 4096 if CI else 4096 * 128
@@ -154,6 +156,7 @@ class TestGraph(unittest.TestCase):
     helper_test_graphs(Device[d0].graph, graphs)
 
+  @needs_second_gpu
   def test_copies_2_devs(self):
     self.skip_if_not_multigraph()
@@ -167,6 +170,7 @@ class TestGraph(unittest.TestCase):
     helper_test_graphs(Device[d0].graph, graphs)
 
+  @needs_second_gpu
   def test_copies_after_graph_global(self):
     self.skip_if_not_multigraph()
@@ -215,6 +219,7 @@ class TestGraph(unittest.TestCase):
     helper_test_graphs(Device[d0].graph, graphs)
 
+  @needs_second_gpu
   def test_graph_after_copies_devs(self):
     self.skip_if_not_multigraph()
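
These graph tests build and replay graphs that include device-to-device copies, which is exactly what needs_second_gpu probes for. A hedged sketch of that kind of transfer (illustrative, not taken from the test file):

# copy a realized buffer from the first to the second instance of the default device
from tinygrad import Tensor, Device

d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1"
a = Tensor.rand(16, device=d0).realize()   # source buffer on device 0
b = a.to(d1).realize()                     # transferred copy on device 1
assert b.device == d1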

@@ -3,7 +3,7 @@ import unittest, functools
 import numpy as np
 from hypothesis import given, settings, strategies as strat
-from test.helpers import assert_jit_cache_len, not_support_multi_device, REAL_DEV
+from test.helpers import assert_jit_cache_len, not_support_multi_device, REAL_DEV, needs_second_gpu
 from tinygrad.tensor import Tensor
 from tinygrad.engine.jit import TinyJit, GraphRunner, MultiGraphRunner, graph_class
 from tinygrad.engine.realize import CompiledRunner, BufferCopy, BufferXfer
@@ -439,6 +439,7 @@ class TestJit(unittest.TestCase):
     ja = jf(a)
     np.testing.assert_allclose(a.numpy(), ja.numpy(), atol=1e-4, rtol=1e-5)
 
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_jitted_transfers(self):
     d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1"
@@ -472,6 +473,7 @@ class TestJit(unittest.TestCase):
     np.testing.assert_allclose((a.numpy()+b.numpy()), zc.numpy(), atol=1e-4, rtol=1e-5)
     np.testing.assert_allclose((a.numpy()*b.numpy()), wc.numpy(), atol=1e-4, rtol=1e-5)
 
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_jitted_view(self):
     d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1"

@@ -7,7 +7,7 @@ from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
 import numpy as np
 from hypothesis import given, strategies as strat, settings
-from test.helpers import REAL_DEV, not_support_multi_device
+from test.helpers import REAL_DEV, not_support_multi_device, needs_second_gpu
 settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
 settings.load_profile("my_profile")
@@ -35,6 +35,9 @@ def _test_allreduce(t:Tensor):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestMultiTensor(unittest.TestCase):
+  @needs_second_gpu
+  def setUp(self): pass
+
   def test_to(self):
     X = Tensor.ones(256).contiguous().realize()
     X.to_(devices_2)
@@ -827,6 +830,7 @@ class TestMultiTensor(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestHandleData(unittest.TestCase):
+  @needs_second_gpu
   def test_copied_to_device(self):
     device = (d0, d1, d2, d3)
     t = Tensor([1, 2, 3, 4]).shard(device).realize()
@@ -851,6 +855,9 @@ class TestHandleData(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestShrinkMultiTensorShardedAxis(unittest.TestCase):
+  @needs_second_gpu
+  def setUp(self): pass
+
   # shrink a multitensor on sharded axis
   def test_shrink_bad_args(self):
     t = Tensor.arange(64).reshape(8, 8).contiguous().realize()
@@ -972,6 +979,9 @@ class TestShrinkMultiTensorShardedAxis(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestBatchNorm(unittest.TestCase):
+  @needs_second_gpu
+  def setUp(self): pass
+
   def test_unsynced_backprop_conv_bn(self):
     with Tensor.train():
       from extra.lr_scheduler import OneCycleLR
@@ -1126,9 +1136,11 @@ def helper_test_shard_op(shps, fxn, atol=1e-6, rtol=1e-3):
 @unittest.skipIf(not_support_multi_device(), "no multi")
 class TestTensorOps(unittest.TestCase):
+  @needs_second_gpu
   def test_interpolate(self):
     helper_test_shard_op([(4,16,16),(4,24,24)], lambda x: Tensor.interpolate(x, (19,19)))
 
+  @needs_second_gpu
   def test_bitcast(self):
     helper_test_shard_op([(256,), (256,)], lambda x: x.bitcast(dtypes.int))
@@ -1171,6 +1183,7 @@ class TestMultiRamUsage(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "need multi")
 class TestMultiFromUnrenderable(unittest.TestCase):
+  @needs_second_gpu
   def test_from_npy(self):
     t = Tensor(np.arange(100, dtype=np.uint32))
     ll = t.shard((d0, d1), axis=0) + 1
@@ -1180,6 +1193,9 @@ class TestMultiFromUnrenderable(unittest.TestCase):
 class TestMultiAssign(unittest.TestCase):
   device = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))
 
+  @needs_second_gpu
+  def setUp(self): pass
+
   def test_multi_assign_realized(self):
     out = Tensor.zeros(4).shard(self.device, 0).contiguous().realize()
     ones = Tensor.ones(4).shard(self.device, 0).contiguous().realize()
@@ -1242,6 +1258,7 @@ class TestMultiAssign(unittest.TestCase):
 @unittest.skipIf(not_support_multi_device(), "need multi")
 class TestMultiTransformer(unittest.TestCase):
+  @needs_second_gpu
   def test_transformer(self):
     device = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))
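
The classes touched here all start from tensors sharded across two or more instances of the default device, which is why the decorator is now applied either to setUp or to the individual tests. A hedged sketch of that sharding pattern (illustrative, not from the test file):

# shard a realized tensor across two device instances and compute on it
from tinygrad import Tensor, Device

devices = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))
t = Tensor.arange(8).contiguous().realize()
s = t.shard(devices, axis=0).realize()   # axis 0 is split between the two devices
print(s.device)                          # tuple of the two device strings
print((s + 1).tolist())                  # [1, 2, 3, 4, 5, 6, 7, 8]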

@@ -9,7 +9,7 @@ from tinygrad.nn import Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear
 from tinygrad.nn import BatchNorm, LayerNorm, LayerNorm2d, GroupNorm, InstanceNorm, RMSNorm, LSTMCell
 from tinygrad.nn.state import load_state_dict
 from tinygrad.engine.realize import run_schedule
-from test.helpers import not_support_multi_device
+from test.helpers import not_support_multi_device, needs_second_gpu
 
 @unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
 class TestNN(unittest.TestCase):
@@ -481,6 +481,7 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(layer.weight.numpy(), state_dict['weight'].numpy())
     np.testing.assert_allclose(layer.bias.numpy(), state_dict['bias'].numpy())
 
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_load_state_dict_sharded_model(self):
     devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3")
@@ -519,6 +520,7 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(layer.weight.numpy(), state_dict['weight'].numpy())
     np.testing.assert_allclose(layer.bias.numpy(), state_dict['bias'].numpy())
 
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_load_state_dict_sharded_model_dict_same_axis(self):
     devices = (f"{Device.DEFAULT}:1", f"{Device.DEFAULT}:2", f"{Device.DEFAULT}:3")

@@ -7,7 +7,7 @@ from tinygrad.device import is_dtype_supported
 from tinygrad.engine.realize import lower_schedule, CompiledRunner
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.nir import NIRRenderer
-from test.helpers import not_support_multi_device
+from test.helpers import not_support_multi_device, needs_second_gpu
 import numpy as np
 import torch
@@ -141,6 +141,7 @@ class TestRandomness(unittest.TestCase):
     r = Tensor.rand(10).numpy()
     np.testing.assert_allclose(r, jr, atol=1e-5, rtol=1e-5)
 
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_threefry_tensors_cnt(self):
     Tensor.manual_seed(1337)
@@ -160,6 +161,7 @@ class TestRandomness(unittest.TestCase):
     assert len(Tensor._device_rng_counters) == 0
     assert len(Tensor._device_seeds) == 0
 
+  @needs_second_gpu
   @unittest.skipIf(not_support_multi_device(), "no multi")
   def test_threefry_same_kernels(self):
     Tensor.manual_seed(0)
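
The threefry tests create random tensors on more than one instance of the default device, which is why they now get the decorator. A hedged sketch of that per-device RNG pattern (illustrative, not taken from the test file):

# seed once, then draw on two device instances; the second draw needs <default>:1
from tinygrad import Tensor, Device

Tensor.manual_seed(1337)
a = Tensor.rand(4, device=f"{Device.DEFAULT}:0").realize()
b = Tensor.rand(4, device=f"{Device.DEFAULT}:1").realize()
print(a.numpy(), b.numpy())  # separate draws backed by per-device RNG state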

@@ -2,7 +2,7 @@ import unittest
 from tinygrad import Device, dtypes, Tensor
 from tinygrad.device import Buffer
 from tinygrad.helpers import Context
-from test.helpers import REAL_DEV
+from test.helpers import REAL_DEV, needs_second_gpu
 
 @unittest.skipUnless(hasattr(Device[Device.DEFAULT].allocator, "_offset"), "subbuffer not supported")
 class TestSubBuffer(unittest.TestCase):
@@ -41,6 +41,7 @@ class TestSubBuffer(unittest.TestCase):
     out = (vt + 100).tolist()
     assert out == [102, 103]
 
+  @needs_second_gpu
   @unittest.skipIf(REAL_DEV not in {"CUDA", "NV", "AMD"}, "only NV, AMD, CUDA")
   def test_subbuffer_transfer(self):
     t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()
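
test_subbuffer_transfer moves a realized slice of a buffer to the second device, so it needs both sub-buffer support and a second device instance. A hedged sketch of that pattern (illustrative, not the test itself):

# realize a slice of an existing buffer, then copy just that slice to <default>:1
from tinygrad import Tensor, Device, dtypes

t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()
view = t[2:4].contiguous().realize()              # slice of the realized buffer
moved = view.to(f"{Device.DEFAULT}:1").realize()  # transfer only the slice
print(moved.tolist())  # expected [2, 3]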

@@ -12,6 +12,11 @@ from extra.thunder.tiny.tk.tiles import ST_16X32, RT_16X32, RT_16X16, TileLayout
 
 @unittest.skipIf(CI or Device.DEFAULT not in ["AMD"], "only amd")
 class TestTK(unittest.TestCase):
+  def setUp(self):
+    arch = Device["AMD"].arch
+    if not arch.startswith("gfx9"):
+      self.skipTest(f"arch {arch} not supported")
+
   @unittest.skipIf(CI, "no wmma in ci")
   def test_simple_matmul(self):
     N = 8192

@@ -22,7 +22,8 @@ class TestRawShmBuffer(unittest.TestCase):
   @unittest.skipIf(CI, "CI doesn't like big shared memory")
   def test_e2e_big(self):
-    t = Tensor.randn(2048, 2048, 8).realize()
+    # bigger than this doesn't work on Linux, maybe this is a limit somewhere?
+    t = Tensor.randn(2048, 128, 8).realize()
 
     # copy to shm
     shm_name = (s := shared_memory.SharedMemory(create=True, size=t.nbytes())).name