diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9c8dcd265c..50c1dc0dd9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -110,7 +110,7 @@ jobs: - name: Test ResNet-18 run: DEBUG=2 python3 extra/torch_backend/example.py - name: custom tests - run: python3 extra/torch_backend/test.py + run: python3 -m pytest -n auto extra/torch_backend/test.py --durations=20 - name: Test one op in torch tests run: DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32 - name: Test Ops with TINY_BACKEND diff --git a/extra/torch_backend/backend.py b/extra/torch_backend/backend.py index 2826428b5a..c1e145f923 100644 --- a/extra/torch_backend/backend.py +++ b/extra/torch_backend/backend.py @@ -630,7 +630,8 @@ tiny_backend = {**{k:wrap_out(v) for k,v in tiny_backend_out.items()}, **{ Tensor.linspace(start, stop, steps, **({"dtype": _from_torch_dtype(dtype)} if dtype is not None else {})), "aten.topk": Tensor.topk, "aten.constant_pad_nd": lambda self, padding, value=0.0: self.pad(padding, mode="constant", value=value).contiguous(), - "aten.cumsum": lambda self, dim: self.cumsum(dim).contiguous(), # TODO: fix test_simple_cumsum, fails without contiguous for shapes >512 + # TODO: making the input contiguous is needed to prevent the CFGContext circular dependency assertion for shapes >512 (see test_cumsum_arange_large) + "aten.cumsum": lambda self, dim: self.contiguous().cumsum(dim), "aten.logsumexp": lambda self, axis, keepdim=False: self.logsumexp(axis[0], keepdim=keepdim), "aten.roll": Tensor.roll, "aten.logcumsumexp": Tensor.logcumsumexp, diff --git a/extra/torch_backend/test.py b/extra/torch_backend/test.py index cde48acb18..b56e126d8e 100644 --- a/extra/torch_backend/test.py +++ b/extra/torch_backend/test.py @@ -191,6 +191,7 @@ class TestTorchBackend(unittest.TestCase): assert torch.equal(tensor_a, tensor_b) assert not torch.equal(tensor_a, tensor_c) + @unittest.skip("TODO: this test is slow") def 
test_linalg_svd(self): A = torch.randn(5, 5, device=device) U, S, Vh = torch.linalg.svd(A) @@ -699,6 +700,15 @@ class TestTorchBackend(unittest.TestCase): expected = np.array([4.0, 3.0, 2.0, 1.0]) np.testing.assert_allclose(a.grad.cpu().numpy(), expected, rtol=1e-5) + def test_cumsum_arange_large(self): + # Tests cumsum with an unrealized arange input with size > 512 (the split threshold) + # This exercises the _split_cumalu path which uses a two-stage algorithm + for size in [513, 1022]: + a = torch.arange(size, dtype=torch.float32, device=device) + result = torch.cumsum(a, dim=0) + expected = torch.arange(size, dtype=torch.float32).cumsum(dim=0) + np.testing.assert_allclose(result.cpu().numpy(), expected.numpy(), rtol=1e-5) + def test_diag_1d_to_2d(self): a = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32, device=device, requires_grad=True) b = torch.diag(a)