enable WEBGPU tests with buffer limit (#11489)

TestSample still seems to fail on WEBGPU Vulkan, so its skip is kept for now (marked with a TODO).
2026-01-09 06:58:11 -05:00 · 2025-08-03 13:02:44 -07:00
parent 8f374ee1f7
commit dbc7807c61
4 changed files with 4 additions and 16 deletions
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -2,7 +2,7 @@ import unittest, functools, random
 from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes, Variable
 from tinygrad.device import is_dtype_supported
 from tinygrad.uop.ops import Ops, UOp
-from tinygrad.helpers import CI, getenv, prod, Context, OSX
+from tinygrad.helpers import CI, getenv, prod, Context
 from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
 import numpy as np
@@ -374,7 +374,6 @@ class TestMultiTensor(unittest.TestCase):

  # NOTE: this is failing on LLVM CI, no idea why. Works locally.
  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
-  @unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_data_parallel_resnet(self):
    from extra.models.resnet import ResNet18

@@ -411,7 +410,6 @@ class TestMultiTensor(unittest.TestCase):
    np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)

  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "LLVM", "CPU"), "slow, and flaky on LLVM/CPU")
-  @unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_data_parallel_resnet_train_step(self):
    from extra.models.resnet import ResNet18
    fake_image = Tensor.rand((2, 3, 224//8, 224//8))
@@ -938,7 +936,6 @@ class TestShrinkMultiTensorShardedAxis(unittest.TestCase):
    np.testing.assert_allclose(output.numpy(), expected)

@unittest.skipIf(not_support_multi_device(), "no multi")
-@unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
 class TestBatchNorm(unittest.TestCase):
  def test_unsynced_backprop_conv_bn(self):
    with Tensor.train():
@@ -966,7 +963,6 @@ class TestBatchNorm(unittest.TestCase):
      optim.step()
      out.numpy()

-  @unittest.skipIf(REAL_DEV == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_unsynced_backprop_standalone_bn(self):
    from extra.lr_scheduler import OneCycleLR
    GPUS = (d1, d2)
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -4,7 +4,7 @@ import numpy as np
 import torch
 from tinygrad import Tensor, Device, TinyJit
 from tinygrad.uop.ops import Ops
-from tinygrad.helpers import GlobalCounters, CI, Context, OSX
+from tinygrad.helpers import GlobalCounters, CI, Context
 from tinygrad.nn import Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, Embedding
 from tinygrad.nn import BatchNorm, LayerNorm, LayerNorm2d, GroupNorm, InstanceNorm, RMSNorm, LSTMCell
 from tinygrad.nn.state import load_state_dict
@@ -284,7 +284,6 @@ class TestNN(unittest.TestCase):
    torch_z = torch_layer(torch_x)
    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)

-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_groupnorm(self):
    BS, H, W, C, G = 20, 10, 10, 6, 3

@@ -311,7 +310,6 @@ class TestNN(unittest.TestCase):
      np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=5e-4, rtol=5e-4)
      np.testing.assert_allclose(layer.bias.grad.numpy(), torch_layer.bias.grad.detach().numpy(), atol=5e-4, rtol=5e-4)

-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_layernorm(self):
    N, C, H, W = 20, 5, 10, 10

@@ -338,7 +336,6 @@ class TestNN(unittest.TestCase):
      np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=5e-4, rtol=5e-4)
      np.testing.assert_allclose(layer.bias.grad.numpy(), torch_layer.bias.grad.detach().numpy(), atol=5e-4, rtol=5e-4)

-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_layernorm_2d(self):
    N, C, H, W = 20, 5, 10, 10

@@ -365,7 +362,6 @@ class TestNN(unittest.TestCase):
      np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=5e-4, rtol=5e-4)
      np.testing.assert_allclose(layer.bias.grad.numpy(), torch_layer.bias.grad.detach().numpy(), atol=5e-4, rtol=5e-4)

-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_instancenorm_2d(self):
    N, C, H, W = 20, 10, 10, 10

@@ -392,7 +388,6 @@ class TestNN(unittest.TestCase):
      np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=1e-3, rtol=1e-3)
      np.testing.assert_allclose(layer.bias.grad.numpy(), torch_layer.bias.grad.detach().numpy(), atol=1e-3, rtol=1e-3)

-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_instancenorm_3d(self):
    N, C, D, H, W = 20, 10, 10, 10, 10

@@ -419,7 +414,6 @@ class TestNN(unittest.TestCase):
      np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=2e-3, rtol=1e-3)
      np.testing.assert_allclose(layer.bias.grad.numpy(), torch_layer.bias.grad.detach().numpy(), atol=1e-3, rtol=1e-3)

-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_rmsnorm(self):
    class TorchRMSNorm(torch.nn.Module):
      # https://github.com/meta-llama/llama/blob/be327c427cc5e89cc1d3ab3d3fec4484df771245/llama/model.py#L34C1-L77C36
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -2,7 +2,7 @@ import time, math, unittest, functools, platform, warnings
 import numpy as np
 from typing import List, Callable
 import torch
-from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, TRANSCENDENTAL, OSX, AMD_LLVM
+from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, TRANSCENDENTAL, AMD_LLVM
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.device import is_dtype_supported
@@ -2682,7 +2682,6 @@ class TestOps(unittest.TestCase):
    i, j, k, o, p = [Tensor(tor.detach().cpu().numpy().astype(np.int32), requires_grad=False) for tor in [a,b,c,d,e]]
    return a,b,c,d,e,i,j,k,o,p

-  @unittest.skipIf(Device.DEFAULT == "WEBGPU", "WEBGPU can only run kernels with up to 10 buffers")
  def test_slice_fancy_indexing_no_dim_collapse(self):
    a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
    # no dim collapse from int or dim injection from None
@@ -2734,7 +2733,6 @@ class TestOps(unittest.TestCase):
    helper_test_op([(2,3)], lambda x: x[torch.tensor([[0,1,-1],[-1,-2,0]]), torch.tensor([2,1,-1])],
                            lambda x: x[Tensor([[0,1,-1],[-1,-2,0]]), Tensor([2,1,-1])])

-  @unittest.skipIf(Device.DEFAULT == "WEBGPU", "WEBGPU can only run kernels with up to 10 buffers")
  def test_slice_fancy_indexing_list_indices(self):
    a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
    helper_test_op([(2,5,6,5,3,4)], lambda x: x[[[0]]], lambda x: x[[[0]]])
@@ -2754,7 +2752,6 @@ class TestOps(unittest.TestCase):
    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,((2,),(1,),(0,)),c,(2,1,0)], lambda x: x[i,((2,),(1,),(0,)),k,(2,1,0)])
    helper_test_op([(2,5,6,5,3,4)], lambda x: x[1,(2,1,0),None,c,(2,1,0),e], lambda x: x[1,(2,1,0),None,k,(2,1,0),p])

-  @unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
  def test_slice_fancy_indexing_list_with_tensors(self):
    a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
    helper_test_op([(2,5,6,5,3,4)], lambda x: x[[a]], lambda x: x[[i]])
--- a/test/test_sample.py
+++ b/test/test_sample.py
@@ -3,6 +3,7 @@ import numpy as np
 from tinygrad import Tensor, Variable, Device
 from tinygrad.helpers import OSX

+# TODO: TestSample still fails on WEBGPU Vulkan even with MAX_KERNEL_BUFFERS; remove this skip once fixed
@unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
 class TestSample(unittest.TestCase):
  def test_sample(self):