fix TINY_BACKEND=1 cumsum (#14138)

* fix TINY_BACKEND=1 cumsum

The old hack was wrong: contiguous must be applied to the input, not the output.

* test time

* test_linalg_svd is slow
This commit is contained in:
chenyu
2026-01-14 09:54:49 -05:00
committed by GitHub
parent 434dbafab5
commit 986e865830
3 changed files with 13 additions and 2 deletions

View File

@@ -110,7 +110,7 @@ jobs:
- name: Test ResNet-18
run: DEBUG=2 python3 extra/torch_backend/example.py
- name: custom tests
run: python3 extra/torch_backend/test.py
run: python3 -m pytest -n auto extra/torch_backend/test.py --durations=20
- name: Test one op in torch tests
run: DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32
- name: Test Ops with TINY_BACKEND

View File

@@ -630,7 +630,8 @@ tiny_backend = {**{k:wrap_out(v) for k,v in tiny_backend_out.items()}, **{
Tensor.linspace(start, stop, steps, **({"dtype": _from_torch_dtype(dtype)} if dtype is not None else {})),
"aten.topk": Tensor.topk,
"aten.constant_pad_nd": lambda self, padding, value=0.0: self.pad(padding, mode="constant", value=value).contiguous(),
"aten.cumsum": lambda self, dim: self.cumsum(dim).contiguous(), # TODO: fix test_simple_cumsum, fails without contiguous for shapes >512
# TODO: input contiguous is needed to prevent CFGContext circular dependency assertion for shapes >512 (see test_cumsum_arange_large)
"aten.cumsum": lambda self, dim: self.contiguous().cumsum(dim),
"aten.logsumexp": lambda self, axis, keepdim=False: self.logsumexp(axis[0], keepdim=keepdim),
"aten.roll": Tensor.roll,
"aten.logcumsumexp": Tensor.logcumsumexp,

View File

@@ -191,6 +191,7 @@ class TestTorchBackend(unittest.TestCase):
assert torch.equal(tensor_a, tensor_b)
assert not torch.equal(tensor_a, tensor_c)
@unittest.skip("# TODO: this test is slow")
def test_linalg_svd(self):
A = torch.randn(5, 5, device=device)
U, S, Vh = torch.linalg.svd(A)
@@ -699,6 +700,15 @@ class TestTorchBackend(unittest.TestCase):
expected = np.array([4.0, 3.0, 2.0, 1.0])
np.testing.assert_allclose(a.grad.cpu().numpy(), expected, rtol=1e-5)
def test_cumsum_arange_large(self):
    """cumsum on an unrealized arange larger than the 512-element split threshold.

    Exercises the two-stage _split_cumalu code path; previously failed without a
    contiguous on the input (see the aten.cumsum wrapper).
    """
    for n in (513, 1022):
        x = torch.arange(n, dtype=torch.float32, device=device)
        got = torch.cumsum(x, dim=0)
        # Reference computed on the default (CPU) backend.
        want = torch.arange(n, dtype=torch.float32).cumsum(dim=0)
        np.testing.assert_allclose(got.cpu().numpy(), want.numpy(), rtol=1e-5)
def test_diag_1d_to_2d(self):
a = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32, device=device, requires_grad=True)
b = torch.diag(a)