fix TINY_BACKEND=1 cumsum (#14138)

* fix TINY_BACKEND=1 cumsum

The old hack was wrong: contiguous must be applied to the input, not the output.

* test time

* test_linalg_svd is slow
This commit is contained in:
chenyu
2026-01-14 09:54:49 -05:00
committed by GitHub
parent 434dbafab5
commit 986e865830
3 changed files with 13 additions and 2 deletions

View File

@@ -110,7 +110,7 @@ jobs:
- name: Test ResNet-18
run: DEBUG=2 python3 extra/torch_backend/example.py
- name: custom tests
run: python3 extra/torch_backend/test.py
run: python3 -m pytest -n auto extra/torch_backend/test.py --durations=20
- name: Test one op in torch tests
run: DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32
- name: Test Ops with TINY_BACKEND

View File

@@ -630,7 +630,8 @@ tiny_backend = {**{k:wrap_out(v) for k,v in tiny_backend_out.items()}, **{
Tensor.linspace(start, stop, steps, **({"dtype": _from_torch_dtype(dtype)} if dtype is not None else {})),
"aten.topk": Tensor.topk,
"aten.constant_pad_nd": lambda self, padding, value=0.0: self.pad(padding, mode="constant", value=value).contiguous(),
"aten.cumsum": lambda self, dim: self.cumsum(dim).contiguous(), # TODO: fix test_simple_cumsum, fails without contiguous for shapes >512
# TODO: input contiguous is needed to prevent CFGContext circular dependency assertion for shapes >512 (see test_cumsum_arange_large)
"aten.cumsum": lambda self, dim: self.contiguous().cumsum(dim),
"aten.logsumexp": lambda self, axis, keepdim=False: self.logsumexp(axis[0], keepdim=keepdim),
"aten.roll": Tensor.roll,
"aten.logcumsumexp": Tensor.logcumsumexp,

View File

@@ -191,6 +191,7 @@ class TestTorchBackend(unittest.TestCase):
assert torch.equal(tensor_a, tensor_b)
assert not torch.equal(tensor_a, tensor_c)
@unittest.skip("# TODO: this test is slow")
def test_linalg_svd(self):
A = torch.randn(5, 5, device=device)
U, S, Vh = torch.linalg.svd(A)
@@ -699,6 +700,15 @@ class TestTorchBackend(unittest.TestCase):
expected = np.array([4.0, 3.0, 2.0, 1.0])
np.testing.assert_allclose(a.grad.cpu().numpy(), expected, rtol=1e-5)
def test_cumsum_arange_large(self):
    """cumsum on an unrealized arange larger than the 512-element split threshold.

    Exercises the two-stage _split_cumalu code path; previously failed without a
    contiguous on the input (see the aten.cumsum wrapper).
    """
    for n in (513, 1022):
        x = torch.arange(n, dtype=torch.float32, device=device)
        got = torch.cumsum(x, dim=0)
        # Reference computed on the default (CPU) backend.
        want = torch.arange(n, dtype=torch.float32).cumsum(dim=0)
        np.testing.assert_allclose(got.cpu().numpy(), want.numpy(), rtol=1e-5)
def test_diag_1d_to_2d(self):
a = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32, device=device, requires_grad=True)
b = torch.diag(a)