diff --git a/.github/actions/process-replay/action.yml b/.github/actions/process-replay/action.yml index efd2e1fc59..b2d1374d8c 100644 --- a/.github/actions/process-replay/action.yml +++ b/.github/actions/process-replay/action.yml @@ -11,5 +11,5 @@ runs: git fetch origin $CURRENT_SHA export COMMIT_MESSAGE=$(git show -s --format=%B "$CURRENT_SHA") export CURRENT_HEAD=$(git rev-parse HEAD) - cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && IGNORE_OOB=1 PYTHONPATH=. python3 process_replay.py + cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && CHECK_OOB=0 PYTHONPATH=. python3 process_replay.py git checkout $CURRENT_HEAD # restore to branch diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f400d38ec1..45dc28f06a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,7 +5,7 @@ env: CAPTURE_PROCESS_REPLAY: 1 GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} PYTHONPATH: ${{ github.workspace }} - IGNORE_OOB: 0 + CHECK_OOB: 1 on: push: @@ -38,7 +38,7 @@ jobs: runs-on: ubuntu-22.04 timeout-minutes: 10 env: - IGNORE_OOB: 1 + CHECK_OOB: 0 steps: - name: Checkout Code uses: actions/checkout@v4 @@ -476,7 +476,7 @@ jobs: runs-on: ubuntu-24.04 timeout-minutes: 15 env: - IGNORE_OOB: 1 + CHECK_OOB: 0 steps: - name: Checkout Code uses: actions/checkout@v4 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh index 68e5fdfcde..0c7818174b 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh +++ 
b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh @@ -4,7 +4,7 @@ export PYTHONPATH="." AMD=1 export MODEL="bert" export DEFAULT_FLOAT="HALF" GPUS=1 BS=128 EVAL_BS=128 -export IGNORE_OOB=1 +export CHECK_OOB=0 export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh index 278eff316d..b50ea17996 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_beam.sh @@ -5,7 +5,7 @@ export MODEL="bert" export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=500000 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh index a6a42a6de0..6655e3bd4e 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/dev_run.sh @@ -8,7 +8,7 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 export TRAIN_STEPS=3900 -export IGNORE_OOB=1 +export CHECK_OOB=0 export 
REWRITE_STACK_LIMIT=500000 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh index 1dbef0e48e..fba66c861a 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh @@ -11,7 +11,7 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 export TRAIN_STEPS=3900 -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=500000 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/dev_run_fa.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/dev_run_fa.sh index ec21783aac..bf10aef6ee 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/dev_run_fa.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/dev_run_fa.sh @@ -8,7 +8,7 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 export TRAIN_STEPS=3900 -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=5000000 export BEAM=0 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 diff --git 
a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/dev_run_fp8.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/dev_run_fp8.sh index 59e1f9240d..b641a15d4d 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/dev_run_fp8.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/dev_run_fp8.sh @@ -8,7 +8,7 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 export TRAIN_STEPS=3900 -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=5000000 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/run_and_time.sh index a2d5cc7df2..8724db7d43 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/run_and_time.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI350X/run_and_time.sh @@ -11,7 +11,7 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024 export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1 export TRAIN_STEPS=3900 -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=5000000 export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh index 
265455d1db..c8e6c508b5 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh @@ -4,7 +4,7 @@ export PYTHONPATH="." NV=1 export MODEL="bert" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=72 EVAL_BS=72 -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=500000 export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh index 38c7966a29..e71359b449 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh @@ -4,7 +4,7 @@ export PYTHONPATH="." 
NV=1 export MODEL="bert" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=72 EVAL_BS=72 -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=500000 export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh index 8dd27b3ea8..bcee96f38a 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh @@ -7,7 +7,7 @@ export MODEL="bert" export SUBMISSION_PLATFORM="tinybox_green" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=72 EVAL_BS=72 -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=500000 export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh index d21bf8d9e8..07b71891a4 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh @@ -4,7 +4,7 @@ export PYTHONPATH="." 
AMD=1 export MODEL="bert" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96 -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=500000 export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh index 3010d3cc4a..019a1239ff 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh @@ -4,7 +4,7 @@ export PYTHONPATH="." AMD=1 export MODEL="bert" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96 -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=500000 export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh index c3025bdcfa..c75bb88b60 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh @@ -7,7 +7,7 @@ export MODEL="bert" export SUBMISSION_PLATFORM="tinybox_red" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96 -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=500000 export BEAM=5 BEAM_UOPS_MAX=8000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam.sh 
b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam.sh index e5e32ed339..fe17f46a27 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_beam.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export DEV=${DEV:-AMD} -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000 export DEBUG=${DEBUG:-2} diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh index daa1448754..f81635ef8c 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." 
export DEV=${DEV:-AMD} -export IGNORE_OOB=1 +export CHECK_OOB=0 export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000 export DEBUG=${DEBUG:-0} diff --git a/extra/assembly/amd/emu2.py b/extra/assembly/amd/emu2.py index 49d1f3b92b..44ac9c1267 100644 --- a/extra/assembly/amd/emu2.py +++ b/extra/assembly/amd/emu2.py @@ -1043,7 +1043,7 @@ def _get_runner(inst_bytes: bytes): canonical_name = f"{_op_name(inst).lower()}_{base.to_bytes(size, 'little').hex()}" sink = sink.replace(arg=KernelInfo(name=canonical_name)).rtag(1) - with Context(NOOPT=1, IGNORE_OOB=1, TUPLE_ORDER=0): + with Context(NOOPT=1, CHECK_OOB=0, TUPLE_ORDER=0): runner = get_runner('CPU', sink) _canonical_runner_cache.append((base, mask, size, runner)) return runner, True diff --git a/test/test_setitem.py b/test/test_setitem.py index c1ac34aabe..d207c3ec66 100644 --- a/test/test_setitem.py +++ b/test/test_setitem.py @@ -131,7 +131,7 @@ class TestSetitem(unittest.TestCase): np.testing.assert_allclose(t.numpy(), n) def test_jit_setitem_variable_offset(self): - with Context(IGNORE_OOB=1): + with Context(CHECK_OOB=0): @TinyJit def f(t:Tensor, a:Tensor, v:Variable): t.shrink(((v,v+1), None)).assign(a).realize() diff --git a/test/unit/test_linalg.py b/test/unit/test_linalg.py index 5bc33590b1..9bdff0b5cf 100644 --- a/test/unit/test_linalg.py +++ b/test/unit/test_linalg.py @@ -27,7 +27,7 @@ class TestLinAlg(unittest.TestCase): reconstruction_helper([U,s_diag,V],a) def _test_svd_nonfull(self, size): - with Context(IGNORE_OOB=1): # sometimes this is slow in CI + with Context(CHECK_OOB=0): # sometimes this is slow in CI a = Tensor.randn(size).realize() U,S,V = a.svd(full_matrices=False) b_shape,m,n = size[0:-2],size[-2],size[-1] diff --git a/test/unit/test_validate_oob.py b/test/unit/test_validate_oob.py index 8f1fab6291..92a4e26e32 100644 --- a/test/unit/test_validate_oob.py +++ b/test/unit/test_validate_oob.py @@ -10,7 +10,7 @@ class TestValidateOOB(unittest.TestCase): # basic index patterns def 
test_const_index(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) to_uops_list([buf.index(UOp.const(dtypes.int, 0), ptr=True).load(dtype=dtypes.int)]) # valid to_uops_list([buf.index(UOp.const(dtypes.int, 15), ptr=True).load(dtype=dtypes.int)]) # valid (last element) @@ -20,7 +20,7 @@ class TestValidateOOB(unittest.TestCase): to_uops_list([buf.index(UOp.const(dtypes.int, 42), ptr=True).load(dtype=dtypes.int)]) # way out def test_variable_index(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) to_uops_list([buf.index(Variable("i", 0, 15), ptr=True).load(dtype=dtypes.int)]) # valid with self.assertRaises(RuntimeError): @@ -29,7 +29,7 @@ class TestValidateOOB(unittest.TestCase): to_uops_list([buf.index(Variable("i", -5, 10), ptr=True).load(dtype=dtypes.int)]) # negative def test_range_with_mask(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) r = UOp.range(42, 0, AxisType.GLOBAL) to_uops_list([buf.index(r.valid(r < 16), ptr=True).load(dtype=dtypes.int)]) # valid @@ -37,7 +37,7 @@ class TestValidateOOB(unittest.TestCase): to_uops_list([buf.index(r.valid(r < 17), ptr=True).load(dtype=dtypes.int)]) # oob def test_variable_with_mask(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) v = Variable("v", -5, 80) to_uops_list([buf.index(v.valid((v >= 0) & (v < 16)), ptr=True).load(dtype=dtypes.int)]) # valid @@ -45,7 +45,7 @@ class TestValidateOOB(unittest.TestCase): to_uops_list([buf.index(v.valid(v < 20), ptr=True).load(dtype=dtypes.int)]) # negative not masked def test_gated_store(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) v = 
Variable("v", 0, 20) to_uops_list([buf.index(v.valid(v < 16)).store(0)]) # valid @@ -54,14 +54,14 @@ class TestValidateOOB(unittest.TestCase): # ALU ops in index def test_idiv(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) to_uops_list([buf.index(UOp.range(32, 0, AxisType.GLOBAL) // 2, ptr=True).load(dtype=dtypes.int)]) # 0..15 valid with self.assertRaises(RuntimeError): to_uops_list([buf.index(UOp.range(34, 0, AxisType.GLOBAL) // 2, ptr=True).load(dtype=dtypes.int)]) # 0..16 oob def test_mod(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) r = UOp.range(100, 0, AxisType.GLOBAL) to_uops_list([buf.index(r % 16, ptr=True).load(dtype=dtypes.int)]) # 0..15 valid @@ -69,14 +69,14 @@ class TestValidateOOB(unittest.TestCase): to_uops_list([buf.index(r % 20, ptr=True).load(dtype=dtypes.int)]) # 0..19 oob def test_shr(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) to_uops_list([buf.index(UOp.range(64, 0, AxisType.GLOBAL) >> 2, ptr=True).load(dtype=dtypes.int)]) # 0..15 valid with self.assertRaises(RuntimeError): to_uops_list([buf.index(UOp.range(128, 0, AxisType.GLOBAL) >> 2, ptr=True).load(dtype=dtypes.int)]) # 0..31 oob def test_shl(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(64), (), 0) r = UOp.range(8, 0, AxisType.GLOBAL) to_uops_list([buf.index(r << 2, ptr=True).load(dtype=dtypes.int)]) # 0..28 valid @@ -84,7 +84,7 @@ class TestValidateOOB(unittest.TestCase): to_uops_list([buf.index(r << 4, ptr=True).load(dtype=dtypes.int)]) # 0..112 oob def test_and(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) r = UOp.range(100, 0, 
AxisType.GLOBAL) to_uops_list([buf.index(r & 15, ptr=True).load(dtype=dtypes.int)]) # 0..15 valid @@ -92,14 +92,14 @@ class TestValidateOOB(unittest.TestCase): to_uops_list([buf.index(r & 31, ptr=True).load(dtype=dtypes.int)]) # 0..31 oob def test_max(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) to_uops_list([buf.index(Variable("v", -10, 15).maximum(0), ptr=True).load(dtype=dtypes.int)]) # 0..15 valid with self.assertRaises(RuntimeError): to_uops_list([buf.index(Variable("v2", -10, 20).maximum(0), ptr=True).load(dtype=dtypes.int)]) # 0..20 oob def test_xor_in_mask(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) r = UOp.range(32, 0, AxisType.GLOBAL) to_uops_list([buf.index(r.valid((r < 8) ^ ((r >= 8) & (r < 16))), ptr=True).load(dtype=dtypes.int)]) # 0..15 valid @@ -108,21 +108,21 @@ class TestValidateOOB(unittest.TestCase): # cast patterns def test_float_cast_in_index(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) r = UOp.range(20, 0) i = (r.cast(dtypes.float) * 0.68).trunc().cast(dtypes.int) to_uops_list([buf.index(i.valid((i >= 0) & (i < 16)), ptr=True).load(dtype=dtypes.int)]) def test_bool_cast_in_mask(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1), (), 0) r = UOp.range(20, 0) to_uops_list([buf.index(r.valid(r.cast(dtypes.bool).logical_not()), ptr=True).load(dtype=dtypes.int)]) # only r=0 valid # load result as index/mask def test_load_as_index(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf0 = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) buf1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(64), (), 1) r = UOp.range(42, 0, AxisType.GLOBAL) @@ -132,7 +132,7 @@ class 
TestValidateOOB(unittest.TestCase): to_uops_list([buf1.index((ld0 * 2).valid((ld0 >= 0) & (ld0 < 64)), ptr=True).load(dtype=dtypes.int)]) # oob def test_load_bool_as_mask(self): - with Context(IGNORE_OOB=0, SPEC=2): + with Context(CHECK_OOB=1, SPEC=2): buf_bool = UOp(Ops.DEFINE_GLOBAL, dtypes.bool.ptr(16), (), 0) buf_int = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(8), (), 1) gidx = UOp(Ops.SPECIAL, dtypes.index, (UOp.const(dtypes.index, 16),), "gidx0") @@ -143,7 +143,7 @@ class TestValidateOOB(unittest.TestCase): # skipped tests (moved from test_uop_graph.py) @unittest.skip("if not allowed in graph") def test_in_bounds_access_gated_local(self): - with Context(IGNORE_OOB=0): + with Context(CHECK_OOB=1): # Define buffers gbuf = UOp(Ops.DEFINE_GLOBAL, dtypes.uint.ptr(400), (), 0) sbuf = UOp(Ops.DEFINE_LOCAL, dtypes.uint.ptr(8, addrspace=AddrSpace.LOCAL), (), "temp0") @@ -168,7 +168,7 @@ class TestValidateOOB(unittest.TestCase): @unittest.skip("Bool load is not supported yet") def test_load_mask(self): - with Context(IGNORE_OOB=0): + with Context(CHECK_OOB=1): glbl0 = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(16), (), 0) mask = UOp(Ops.DEFINE_GLOBAL, dtypes.bool.ptr(16), (), 0) ridx = UOp.range(20, 0) diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 1bc0578617..4f60324e86 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -190,7 +190,7 @@ VIZ = ContextVar("VIZ", 0) PROFILE = ContextVar("PROFILE", abs(VIZ.value)) SPEC = ContextVar("SPEC", 1) # TODO: disable by default due to speed -IGNORE_OOB = ContextVar("IGNORE_OOB", 1) +CHECK_OOB = ContextVar("CHECK_OOB", 0) PCONTIG = ContextVar("PCONTIG", 0) # partial contiguous in rangeify DEBUG_RANGEIFY = ContextVar("DEBUG_RANGEIFY", 0) # set to 1, this uses tuplize in the linearizer sort order diff --git a/tinygrad/uop/ops.py b/tinygrad/uop/ops.py index 769de66b9d..7d07ab762a 100644 --- a/tinygrad/uop/ops.py +++ b/tinygrad/uop/ops.py @@ -89,7 +89,7 @@ class UOpMetaClass(type): if SPEC > 1: from tinygrad.uop.spec 
import full_spec, test_pyrender if SPEC > 2: test_pyrender(created) - with Context(IGNORE_OOB=1): fret = cast(bool|None, full_spec.rewrite(created)) + with Context(CHECK_OOB=0): fret = cast(bool|None, full_spec.rewrite(created)) if fret is not True: raise RuntimeError(f"SPEC ISSUE {fret}: {created}") return created diff --git a/tinygrad/uop/spec.py b/tinygrad/uop/spec.py index 13ee3e6afe..1764ec4ef4 100644 --- a/tinygrad/uop/spec.py +++ b/tinygrad/uop/spec.py @@ -2,13 +2,13 @@ import math from typing import cast, Any from tinygrad.uop.ops import PatternMatcher, UPat, GroupOp, Ops, UOp, print_uops, AxisType, KernelInfo, pyrender, Kernel, CustomKernel from tinygrad.dtype import DType, ImageDType, dtypes, PtrDType, AddrSpace, Invalid, ConstFloat -from tinygrad.helpers import DEBUG, Context, prod, SPEC, Metadata, panic, IGNORE_OOB +from tinygrad.helpers import DEBUG, Context, prod, SPEC, Metadata, panic, CHECK_OOB def validate_index(buf:UOp, idx:UOp, gate:UOp|None=None): if idx.op is Ops.CONST and idx.arg is Invalid: return True if gate is None: gate = UOp.const(dtypes.bool, True) # TODO: check for overflow - if IGNORE_OOB or isinstance(buf.dtype, ImageDType) or (sz := buf.ptrdtype.size) == -1: return True + if not CHECK_OOB or isinstance(buf.dtype, ImageDType) or (sz := buf.ptrdtype.size) == -1: return True # We can use UOp min/max to do a faster check, but it can give false positive since its not an exact bound and doesn't consider the mask if 0<=idx.vmin and idx.vmax= 4.12.4, use IGNORE_OOB=1 to disable, or \"pip install 'z3-solver>=4.12.4\"") + raise ImportError("bounds checking requires z3 >= 4.12.4, use CHECK_OOB=0 to disable, or \"pip install 'z3-solver>=4.12.4'\"") # IDIV is truncated division but z3 does euclidian division (floor if b>0 ceil otherwise); mod by power of two sometimes uses Ops.AND def z3_cdiv(a, b):return z3.If((a<0), z3.If(0