diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index 82d6880583..14e1cb1266 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -2,7 +2,7 @@ import unittest, functools, random
 from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes, Variable
 from tinygrad.device import is_dtype_supported
 from tinygrad.uop.ops import Ops, UOp
-from tinygrad.helpers import CI, getenv, prod, Context, OSX
+from tinygrad.helpers import CI, getenv, prod, Context
 from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
 import numpy as np
@@ -374,7 +374,6 @@ class TestMultiTensor(unittest.TestCase):
 
   # NOTE: this is failing on LLVM CI, no idea why. Works locally.
   @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
-  @unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
   def test_data_parallel_resnet(self):
     from extra.models.resnet import ResNet18
 
@@ -411,7 +410,6 @@ class TestMultiTensor(unittest.TestCase):
       np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)
 
   @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
-  @unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
   def test_data_parallel_resnet_train_step(self):
     from extra.models.resnet import ResNet18
     fake_image = Tensor.rand((2, 3, 224//8, 224//8))
@@ -938,7 +936,6 @@ class TestShrinkMultiTensorShardedAxis(unittest.TestCase):
     np.testing.assert_allclose(output.numpy(), expected)
 
 @unittest.skipIf(not_support_multi_device(), "no multi")
-@unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
 class TestBatchNorm(unittest.TestCase):
   def test_unsynced_backprop_conv_bn(self):
     with Tensor.train():
@@ -966,7 +963,6 @@ class TestBatchNorm(unittest.TestCase):
       optim.step()
       out.numpy()
 
-  @unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
   def test_unsynced_backprop_standalone_bn(self):
     from extra.lr_scheduler import OneCycleLR
     GPUS = (d1, d2)
diff --git a/test/test_nn.py b/test/test_nn.py
index f85aa43f59..86d0bb99a3 100755
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -4,7 +4,7 @@ import numpy as np
 import torch
 from tinygrad import Tensor, Device, TinyJit
 from tinygrad.uop.ops import Ops
-from tinygrad.helpers import GlobalCounters, CI, Context, OSX
+from tinygrad.helpers import GlobalCounters, CI, Context
 from tinygrad.nn import Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, Embedding
 from tinygrad.nn import BatchNorm, LayerNorm, LayerNorm2d, GroupNorm, InstanceNorm, RMSNorm, LSTMCell
 from tinygrad.nn.state import load_state_dict
@@ -284,7 +284,6 @@ class TestNN(unittest.TestCase):
     torch_z = torch_layer(torch_x)
     np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
 
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
   def test_groupnorm(self):
     BS, H, W, C, G = 20, 10, 10, 6, 3
 
@@ -311,7 +310,6 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=5e-4, rtol=5e-4)
     np.testing.assert_allclose(layer.bias.grad.numpy(), torch_layer.bias.grad.detach().numpy(), atol=5e-4, rtol=5e-4)
 
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
   def test_layernorm(self):
     N, C, H, W = 20, 5, 10, 10
 
@@ -338,7 +336,6 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=5e-4, rtol=5e-4)
     np.testing.assert_allclose(layer.bias.grad.numpy(), torch_layer.bias.grad.detach().numpy(), atol=5e-4, rtol=5e-4)
 
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
   def test_layernorm_2d(self):
     N, C, H, W = 20, 5, 10, 10
 
@@ -365,7 +362,6 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=5e-4, rtol=5e-4)
     np.testing.assert_allclose(layer.bias.grad.numpy(), torch_layer.bias.grad.detach().numpy(), atol=5e-4, rtol=5e-4)
 
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
   def test_instancenorm_2d(self):
     N, C, H, W = 20, 10, 10, 10
 
@@ -392,7 +388,6 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=1e-3, rtol=1e-3)
     np.testing.assert_allclose(layer.bias.grad.numpy(), torch_layer.bias.grad.detach().numpy(), atol=1e-3, rtol=1e-3)
 
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
   def test_instancenorm_3d(self):
     N, C, D, H, W = 20, 10, 10, 10, 10
 
@@ -419,7 +414,6 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=2e-3, rtol=1e-3)
     np.testing.assert_allclose(layer.bias.grad.numpy(), torch_layer.bias.grad.detach().numpy(), atol=1e-3, rtol=1e-3)
 
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
   def test_rmsnorm(self):
     class TorchRMSNorm(torch.nn.Module):
       # https://github.com/meta-llama/llama/blob/be327c427cc5e89cc1d3ab3d3fec4484df771245/llama/model.py#L34C1-L77C36
diff --git a/test/test_ops.py b/test/test_ops.py
index 7634edda6e..8ec9e54885 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -2,7 +2,7 @@ import time, math, unittest, functools, platform, warnings
 import numpy as np
 from typing import List, Callable
 import torch
-from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, TRANSCENDENTAL, OSX, AMD_LLVM
+from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, TRANSCENDENTAL, AMD_LLVM
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.device import is_dtype_supported
@@ -2682,7 +2682,6 @@ class TestOps(unittest.TestCase):
     i, j, k, o, p = [Tensor(tor.detach().cpu().numpy().astype(np.int32), requires_grad=False) for tor in [a,b,c,d,e]]
     return a,b,c,d,e,i,j,k,o,p
 
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU", "WEBGPU can only run kernels with up to 10 buffers")
   def test_slice_fancy_indexing_no_dim_collapse(self):
     a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     # no dim collapse from int or dim injection from None
@@ -2734,7 +2733,6 @@ class TestOps(unittest.TestCase):
     helper_test_op([(2,3)], lambda x: x[torch.tensor([[0,1,-1],[-1,-2,0]]), torch.tensor([2,1,-1])],
                             lambda x: x[Tensor([[0,1,-1],[-1,-2,0]]), Tensor([2,1,-1])])
 
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU", "WEBGPU can only run kernels with up to 10 buffers")
   def test_slice_fancy_indexing_list_indices(self):
     a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     helper_test_op([(2,5,6,5,3,4)], lambda x: x[[[0]]], lambda x: x[[[0]]])
@@ -2754,7 +2752,6 @@ class TestOps(unittest.TestCase):
     helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,((2,),(1,),(0,)),c,(2,1,0)], lambda x: x[i,((2,),(1,),(0,)),k,(2,1,0)])
     helper_test_op([(2,5,6,5,3,4)], lambda x: x[1,(2,1,0),None,c,(2,1,0),e], lambda x: x[1,(2,1,0),None,k,(2,1,0),p])
 
-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
   def test_slice_fancy_indexing_list_with_tensors(self):
     a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     helper_test_op([(2,5,6,5,3,4)], lambda x: x[[a]], lambda x: x[[i]])
diff --git a/test/test_sample.py b/test/test_sample.py
index ae71327924..d53474632a 100644
--- a/test/test_sample.py
+++ b/test/test_sample.py
@@ -3,6 +3,7 @@ import numpy as np
 from tinygrad import Tensor, Variable, Device
 from tinygrad.helpers import OSX
 
+# TODO: still fails with MAX_KERNEL_BUFFERS
 @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
 class TestSample(unittest.TestCase):
   def test_sample(self):