upcast overflowed idx to int64 [pr] (#8268)

* use full_shape to determine if index can potentially overflow

* update comment

* use shapetracker to check max index value

* wip

* lint

* handle mask

* upcast to int64 by st is noop on WGSL

* fix comments

* Handle negative overflow, intermediaries overflow, int64 support

handle negative overflow

handle symbolic

wip

handle intermediate values

wip

check if typemap support int64

lint

comment

* add invalid_dtype

lint

* Fix bug on checking mask overflow

wip

wip

* Add more tests, need to resolve partial upcast

test Valid_view_dup

test valid op overflow

refine test cases

clean up

cleanup

wip

refine tests

lint

* Upcast is handled by lower_load_store

upcast as graph_rewrite to backtrack

update test

wip

cleanup

wip

cleanup

do upcast in lower_load_store

lint

* cleanup

* do upcast within lower_load_store and mutate ctx

* do upcast in get_idx and view

revert

lint

* cleanup

* Upcast in vec, const

upcast to const

test case 3

upcast on vector

lint

* simplify idx with symbolic in case of fake overflow

test case4

test case 4

update test

* test case4 is only for metal

* try: upcast inside graph_rewrite instead of shapetracker

wip

* checking overflow can just be done directly on all views, with idxs

* cleanup

* REMOVE hard coded uop test for idx upcast

* refactor

cleanup

refactor

* do actual casting when necessary, instead of rewriting all idx

hard code uop test

new upcast

* check dtype for int64 in webgpu

* cleanup

cleanup

* cleanup

* update tests

cleanup

comment

cleanup

cleanup

* comment

* comment

* update comment

update comment

* refactor

* typo

* keep the scope to only upcasting

* white space

* Revert "white space"

This reverts commit 314d7eb184.

* Revert "keep the scope to only upcasting"

This reverts commit 1ef701dd85.

* sym folding is not necessary

lint1

* fold symbolic

lint

* use symbolic simple when folding shapetracker idx

* full sym folding is required after all...

* Ops.CAST should retain the src's min/max

* put rewrite to lowerer

wip

* start testing on higher level

wip

test higher level in test_tensor

* find Ops.STORE in list instead of recursively

* check dtype support when upcasting

* remove invalid_dtype

* lint

* fix int64 support checks in upcast

lint

* skipif skipunless

* revert fold to find test case

* Revert "revert fold to find test case"

This reverts commit 225bb6e801.

* test sym folding

* handle ptx

* wip

* wip

* delete hard coded uop test

* lint fixes

* wip

* fix checking for None

* lint

* handle ptx

* comment

* dtype for overflow()

* update skipIf skipUnless

* assert in wgsl renderer for int64

wip

* do folded_upcast in to_indexed_op, real_size uses views_to_indexed_ops

* assert in lowerer for dtype support

lint

* Revert "assert in lowerer for dtype support"

This reverts commit 8e9b1b79bf.

* assert dtype in kernel.py

* Revert "assert dtype in kernel.py"

This reverts commit e29b9a9893.

* wip

* assert in render

* remove old assert

* check dtype from renderer, assert in upcast

wip

* smaller arange for sym fold case

* linearize directly

* use expand directly

* lint

* lint

* rename

* no need to check dtype in device.py

* trigger pr

* remove dtype assert in upcast, make wgpu fail in render

* use DType for type hint instead of dtypes

* assert on KeyError in tests for webgpu backend int64

* use a tuple for src

* test real kernel run

wip

* lint error

* restore

* fix real_size

* update test example

* resolve merge stuff

---------

Co-authored-by: Mesozoic Egg <mesozoic.egg@proton.mail>
This commit is contained in:
mesozoic-egg
2025-01-18 00:52:31 +08:00
committed by GitHub
parent 23f0ff0ed8
commit 3506a7585f
2 changed files with 99 additions and 2 deletions

View File

@@ -7,6 +7,12 @@ from tinygrad.helpers import getenv, temp, _METADATA, mv_address
from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
from hypothesis import given, settings, strategies as strat
from tinygrad.device import is_dtype_supported
from tinygrad.ops import Ops, UOp
from tinygrad.runtime.support.compiler_cuda import PTX
from tinygrad.codegen.linearize import linearize_uop
from tinygrad.codegen.uopgraph import full_graph_rewrite
from tinygrad.codegen.lowerer import rewrite_shapetracker_with_index
from tinygrad.dtype import DType
settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
settings.load_profile("my_profile")
@@ -773,5 +779,73 @@ class TestTensorMetadata(unittest.TestCase):
self.assertEqual(len(bw), 1)
self.assertEqual(bw[0].name, "sigmoid")
class TestIdxUpcast(unittest.TestCase):
  """Verify that buffer indices are upcast to int64 only when they can overflow int32."""

  def _find_op(self, ast: UOp, op: Ops):
    # pre-order depth-first search for the first UOp of kind `op`; None when absent
    if ast.op is op:
      return ast
    for child in ast.src:
      found = self._find_op(child, op)
      if found is not None:
        return found
    return None

  def _schedule_render(self, a: Tensor):
    # schedule `a`, lower and render the first SINK kernel, return its linearized uops
    schedule, _ = a.schedule_with_vars()
    for item in schedule:
      if item.ast.op is not Ops.SINK:
        continue
      renderer = Device[item.bufs[0].device].renderer
      lowered = rewrite_shapetracker_with_index(item.ast, renderer)
      uops = linearize_uop(full_graph_rewrite(lowered, renderer))
      renderer.render("test", uops)
      return uops

  def _assert(self, dtype: DType, a: Tensor):
    uops = self._schedule_render(a)
    # Check the dtype of the INDEX value; this will need to be updated if the UOp spec changes
    store = next(u for u in uops if u.op is Ops.STORE)
    assert store.op is Ops.STORE
    idx = self._find_op(store, Ops.INDEX)
    # PTX turns Ops.INDEX into pointer arithmetic earlier than cstyle, plus it's already cast to int64
    if idx is not None:
      assert idx.op is Ops.INDEX
      assert idx.src[1].dtype is dtype

  # use expand to generate a kernel whose flat index can get large
  def do_op_then_assert(self, dtype: DType, dim1, dim2, dim3):
    t = Tensor.empty(dim1, dim2, 1).expand(-1, -1, dim3).contiguous()
    self._assert(dtype, t)

  @unittest.skipUnless(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_overflow(self):
    # 2**11 * 2**11 * 2**11 = 2**33 elements: the index overflows int32
    self.do_op_then_assert(dtypes.long, 2048, 2048, 2048)

  @unittest.skipUnless(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_overflow_sym(self):
    self.do_op_then_assert(dtypes.long, 2048, 2048, UOp.variable("dim3", 0, 2048).bind(32))

  def test_regular(self):
    self.do_op_then_assert(dtypes.int, 64, 64, 64)

  def test_regular_sym(self):
    self.do_op_then_assert(dtypes.int, 2048, 2048, UOp.variable("dim3", 0, 64).bind(32))

  @unittest.skipIf(PTX, "PTX always convert Ops.INDEX to int64")
  def test_symfold(self):
    # would overflow before folding, but after sym fold the index fits in int32
    rendered = self._schedule_render(Tensor.arange(65535))
    assert not any(u.dtype is dtypes.long for u in rendered)

  @unittest.skipIf(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_int64_unsupported_overflow_sym(self):
    with self.assertRaises(KeyError):
      self.do_op_then_assert(dtypes.long, 2048, 2048, UOp.variable("dim3", 0, 2048).bind(32))

  @unittest.skipIf(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_int64_unsupported_overflow(self):
    with self.assertRaises(KeyError):
      self.do_op_then_assert(dtypes.long, 2048, 2048, 2048)

  @unittest.skip("This is kept for reference, it requires large memory to run")
  def test_overflow_kernel_run(self):
    # This creates a total of 2**31+10 elements, requiring at least 2147 MB memory to run
    # Modified example from issue 3271
    a = Tensor.empty(2**11, 2**11, 1, dtype=dtypes.int8).permute((2, 0, 1)).expand((2**9+10, -1, -1)).contiguous()
    a.realize()
# Allow running this test file directly from the command line.
if __name__ == '__main__':
  unittest.main()