upcast overflowed idx to int64 [pr] (#8268)

* use full_shape to determine if index can potentially overflow

* update comment

* use shapetracker to check max index value

* wip

* lint

* handle mask

* upcast to int64 by st is noop on WGSL

* fix comments

* Handle negative overflow, intermediaries overflow, int64 support

handle negative overflow

handle symbolic

wip

handle intermediate values

wip

check if typemap support int64

lint

comment

* add invalid_dtype

lint

* Fix bug on checking mask overflow

wip

wip

* Add more tests, need to resolve partial upcast

test Valid_view_dup

test valid op overflow

refine test cases

clean up

cleanup

wip

refine tests

lint

* Upcast is handled by lower_load_store

upcast as graph_rewrite to backtrack

update test

wip

cleanup

wip

cleanup

do upcast in lower_load_store

lint

* cleanup

* do upcast within lower_load_store and mutate ctx

* do upcast in get_idx and view

revert

lint

* cleanup

* Upcast in vec, const

upcast to const

test case 3

upcast on vector

lint

* simplify idx with symbolic in case of fake overflow

test case4

test case 4

update test

* test case4 is only for metal

* try: upcast inside graph_rewrite instead of shapetracker

wip

* checking overflow can just be done directly on all views, with idxs

* cleanup

* REMOVE hard coded uop test for idx upcast

* refactor

cleanup

refactor

* do actual casting when necessary, instead of rewriting all idx

hard code uop test

new upcast

* check dtype for int64 in webgpu

* cleanup

cleanup

* cleanup

* update tests

cleanup

comment

cleanup

cleanup

* comment

* comment

* update comment

update comment

* refactor

* typo

* keep the scope to only upcasting

* white space

* Revert "white space"

This reverts commit 314d7eb184.

* Revert "keep the scope to only upcasting"

This reverts commit 1ef701dd85.

* sym folding is not necessary

lint1

* fold symbolic

lint

* use symbolic simple when folding shapetracker idx

* full sym folding is required after all...

* Ops.CAST should retain the src's min/max

* put rewrite to lowerer

wip

* start testing on higher level

wip

test higher level in test_tensor

* find Ops.STORE in list instead of recursively

* check dtype support when upcasting

* remove invalid_dtype

* lint

* fix int64 support checks in upcast

lint

* skipif skipunless

* revert fold to find test case

* Revert "revert fold to find test case"

This reverts commit 225bb6e801.

* test sym folding

* handle ptx

* wip

* wip

* delete hard coded uop test

* lint fixes

* wip

* fix checking for None

* lint

* handle ptx

* comment

* dtype for overflow()

* update skipIf skipUnless

* assert in wgsl renderer for int64

wip

* do folded_upcast in to_indexed_op, real_size uses views_to_indexed_ops

* assert in lowerer for dtype support

lint

* Revert "assert in lowerer for dtype support"

This reverts commit 8e9b1b79bf.

* assert dtype in kernel.py

* Revert "assert dtype in kernel.py"

This reverts commit e29b9a9893.

* wip

* assert in render

* remove old assert

* check dtype from renderer, assert in upcast

wip

* smaller arange for sym fold case

* linearize directly

* use expand directly

* lint

* lint

* rename

* no need to check dtype in device.py

* trigger pr

* remove dtype assert in upcast, make wgpu fail in render

* use DType for type hint instead of dtypes

* assert on KeyError in tests for webgpu backend int64

* use a tuple for src

* test real kernel run

wip

* lint error

* restore

* fix real_size

* update test example

* resolve merge stuff

---------

Co-authored-by: Mesozoic Egg <mesozoic.egg@proton.mail>
This commit is contained in:
mesozoic-egg
2025-01-18 00:52:31 +08:00
committed by GitHub
parent 23f0ff0ed8
commit 3506a7585f
2 changed files with 99 additions and 2 deletions

View File

@@ -7,6 +7,12 @@ from tinygrad.helpers import getenv, temp, _METADATA, mv_address
from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
from hypothesis import given, settings, strategies as strat
from tinygrad.device import is_dtype_supported
from tinygrad.ops import Ops, UOp
from tinygrad.runtime.support.compiler_cuda import PTX
from tinygrad.codegen.linearize import linearize_uop
from tinygrad.codegen.uopgraph import full_graph_rewrite
from tinygrad.codegen.lowerer import rewrite_shapetracker_with_index
from tinygrad.dtype import DType
settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
settings.load_profile("my_profile")
@@ -773,5 +779,73 @@ class TestTensorMetadata(unittest.TestCase):
self.assertEqual(len(bw), 1)
self.assertEqual(bw[0].name, "sigmoid")
class TestIdxUpcast(unittest.TestCase):
  """Verify that buffer indices are upcast to int64 only when they can overflow int32."""

  def _find_op(self, ast: UOp, op: Ops):
    # pre-order depth-first search for the first UOp of kind `op`; None when absent
    if ast.op is op:
      return ast
    for child in ast.src:
      found = self._find_op(child, op)
      if found is not None:
        return found
    return None

  def _schedule_render(self, a: Tensor):
    # schedule `a`, lower and render the first SINK kernel, return its linearized uops
    schedule, _ = a.schedule_with_vars()
    for item in schedule:
      if item.ast.op is not Ops.SINK:
        continue
      renderer = Device[item.bufs[0].device].renderer
      lowered = rewrite_shapetracker_with_index(item.ast, renderer)
      uops = linearize_uop(full_graph_rewrite(lowered, renderer))
      renderer.render("test", uops)
      return uops

  def _assert(self, dtype: DType, a: Tensor):
    uops = self._schedule_render(a)
    # Check the dtype of the INDEX value; this will need to be updated if the UOp spec changes
    store = next(u for u in uops if u.op is Ops.STORE)
    assert store.op is Ops.STORE
    idx = self._find_op(store, Ops.INDEX)
    # PTX turns Ops.INDEX into pointer arithmetic earlier than cstyle, plus it's already cast to int64
    if idx is not None:
      assert idx.op is Ops.INDEX
      assert idx.src[1].dtype is dtype

  # use expand to generate a kernel whose flat index can get large
  def do_op_then_assert(self, dtype: DType, dim1, dim2, dim3):
    t = Tensor.empty(dim1, dim2, 1).expand(-1, -1, dim3).contiguous()
    self._assert(dtype, t)

  @unittest.skipUnless(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_overflow(self):
    # 2**11 * 2**11 * 2**11 = 2**33 elements: the index overflows int32
    self.do_op_then_assert(dtypes.long, 2048, 2048, 2048)

  @unittest.skipUnless(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_overflow_sym(self):
    self.do_op_then_assert(dtypes.long, 2048, 2048, UOp.variable("dim3", 0, 2048).bind(32))

  def test_regular(self):
    self.do_op_then_assert(dtypes.int, 64, 64, 64)

  def test_regular_sym(self):
    self.do_op_then_assert(dtypes.int, 2048, 2048, UOp.variable("dim3", 0, 64).bind(32))

  @unittest.skipIf(PTX, "PTX always convert Ops.INDEX to int64")
  def test_symfold(self):
    # would overflow before folding, but after sym fold the index fits in int32
    rendered = self._schedule_render(Tensor.arange(65535))
    assert not any(u.dtype is dtypes.long for u in rendered)

  @unittest.skipIf(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_int64_unsupported_overflow_sym(self):
    with self.assertRaises(KeyError):
      self.do_op_then_assert(dtypes.long, 2048, 2048, UOp.variable("dim3", 0, 2048).bind(32))

  @unittest.skipIf(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_int64_unsupported_overflow(self):
    with self.assertRaises(KeyError):
      self.do_op_then_assert(dtypes.long, 2048, 2048, 2048)

  @unittest.skip("This is kept for reference, it requires large memory to run")
  def test_overflow_kernel_run(self):
    # This creates a total of 2**31+10 elements, requiring at least 2147 MB memory to run
    # Modified example from issue 3271
    a = Tensor.empty(2**11, 2**11, 1, dtype=dtypes.int8).permute((2, 0, 1)).expand((2**9+10, -1, -1)).contiguous()
    a.realize()
# Allow running this test file directly from the command line.
if __name__ == '__main__':
  unittest.main()