diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bbea0ab112..20d18b139a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -310,8 +310,6 @@ jobs: run: PYTHON=1 python3 -m pytest test/test_uops.py --durations=20 - name: Test symbolic with Python emulator run: PYTHONPATH=. PYTHON=1 python3 test/test_symbolic_ops.py - - name: test_linearizer_failures with Python emulator - run: PYTHONPATH=. PYTHON=1 python3 -m pytest -rA test/test_linearizer_failures.py::TestLinearizerFailures::test_failure_1 - name: test_renderer_failures with Python emulator run: PYTHONPATH=. PYTHON=1 python3 -m pytest -rA test/test_renderer_failures.py::TestRendererFailures @@ -523,8 +521,8 @@ jobs: REMOTEDEV=GPU IMAGE=2 REMOTE=1 python3 -m pytest test/test_tiny.py test/test_image_dtype.py - name: Test Optimization Helpers run: PYTHONPATH="." DEBUG=1 python3 extra/optimization/test_helpers.py - - name: Test Action Space - run: PYTHONPATH="." DEBUG=1 GPU=1 python3 extra/optimization/get_action_space.py + #- name: Test Action Space + # run: PYTHONPATH="." DEBUG=1 GPU=1 python3 extra/optimization/get_action_space.py - name: Test Beam Search run: PYTHONPATH="." GPU=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py - name: Test MLPerf stuff @@ -623,7 +621,7 @@ jobs: run: | WEBGPU=1 WEBGPU_BACKEND="WGPUBackendType_Vulkan" python3 -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit \ --ignore=test/test_copy_speed.py --ignore=test/test_rearrange_einops.py \ - --ignore=test/test_fuzz_shape_ops.py --ignore=test/test_linearizer_failures.py --durations=20 + --ignore=test/test_fuzz_shape_ops.py --durations=20 - name: Run process replay tests uses: ./.github/actions/process-replay @@ -774,8 +772,8 @@ jobs: run: PYTHONPATH="." METAL=1 python test/external/external_test_speed_llama.py - name: Test Beam Search run: PYTHONPATH="." METAL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py - - name: Fuzz Test linearizer - run: PYTHONPATH="." METAL=1 DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py + #- name: Fuzz Test linearizer + # run: PYTHONPATH="." METAL=1 DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py - name: Run TRANSCENDENTAL math run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20 - name: Run pytest (amd) diff --git a/extra/optimization/generate_dataset.sh b/extra/optimization/generate_dataset.sh index 66473154f7..e4a7fe03d1 100755 --- a/extra/optimization/generate_dataset.sh +++ b/extra/optimization/generate_dataset.sh @@ -18,7 +18,7 @@ python3 examples/beautiful_cartpole.py python3 examples/mlperf/model_spec.py python3 examples/yolov8.py ./test/models/efficientnet/Chicken.jpg examples/openpilot/go.sh -JIT=2 BIG=1 MPS=1 pytest -n=auto test/ --ignore=test/test_fusion_op.py --ignore=test/test_linearizer_failures.py --ignore=test/test_gc.py --ignore=test/test_speed_v_torch.py --ignore=test/test_jit.py +JIT=2 BIG=1 MPS=1 pytest -n=auto test/ --ignore=test/test_fusion_op.py --ignore=test/test_gc.py --ignore=test/test_speed_v_torch.py --ignore=test/test_jit.py JIT=2 BIG=1 MPS=1 python -m pytest test/test_gc.py JIT=2 BIG=1 MPS=1 python -m pytest test/test_jit.py JIT=2 BIG=1 MPS=1 python -m pytest test/test_speed_v_torch.py @@ -26,4 +26,4 @@ JIT=2 BIG=1 MPS=1 python -m pytest test/test_speed_v_torch.py # extract, sort and uniq extra/optimization/extract_dataset.py sort -u /tmp/ops > /tmp/sops -ls -lh /tmp/ops /tmp/sops +ls -lh /tmp/ops /tmp/sops diff --git a/test/test_linearizer.py b/test/test_linearizer.py index ba50f9dbfe..3eb98fd6e1 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -3,7 +3,6 @@ import numpy as np import unittest from dataclasses import replace -from test.helpers import ast_const from tinygrad.opt.kernel import Opt, OptOps, KernelOptError, Kernel from tinygrad.codegen.lowerer import get_grouped_dims from tinygrad.uop.ops import UOp, Ops, GroupOp, KernelInfo @@ -137,420 +136,6 @@ class TestLinearizer(unittest.TestCase): x = Tensor.randn(4,).realize() helper_linearizer_ast(store.sink(), [x], wanna_output=[x.numpy()+1], opts=[]) - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared") - def test_multireduce(self): - Tensor.manual_seed(0) - x = Tensor.randn(32, dtype=dtypes.float).realize() - st_x = x.uop.st - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1.view(st_x.reshape((1, 32)).expand((32, 32))),)) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (1,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1.view(st_x.reshape((32, 1))),)) - diff = second_x + first_reduce*ast_const(dtypes.float, -1, (32, 1)) - second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (0,))) - store = UOp(Ops.STORE, dtypes.void, (g0.view(ShapeTracker.from_shape((1, 1))), second_reduce)) - sink = UOp(Ops.SINK, src=(store,)) - opts = [ - [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)], # grouping - [Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 8)], - [Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 16)], - [Opt(OptOps.GROUPTOP, 0, 32), Opt(OptOps.GROUPTOP, 0, 32)], - [Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UNROLL, 1, 2)], # unroll reduce - [Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UNROLL, 1, 4)], - [Opt(OptOps.UNROLL, 0, 8), Opt(OptOps.UNROLL, 1, 8)] if Device.DEFAULT not in {"NV", "METAL"} else [], # can't do float8, - [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.UNROLL, 2, 2), Opt(OptOps.UNROLL, 3, 2)], # grouping + unrolling - [Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UNROLL, 1, 2), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)], - [Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UNROLL, 2, 8), Opt(OptOps.UNROLL, 2, 8)], - [Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UNROLL, 1, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 0, 8)], - ] - wanna_output = (x.numpy()-x.numpy().sum(-1, keepdims=True)).sum(-1).reshape(1,1) - lins = helper_linearizer_ast(sink, [x], wanna_output=[wanna_output], opts=opts) - self._test_no_nested_ranges(lins, [0]) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared") - def test_mid_dim_multireduce(self): - Tensor.manual_seed(0) - x = Tensor.randn(27, 32, 5, dtype=dtypes.float).realize() - st_x = x.uop.st - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1.view(st_x.reshape((27, 1, 32, 5)).expand((27, 32, 32, 5))),)) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1.view(st_x.reshape((27, 32, 1, 5))),)) - diff = second_x + first_reduce*ast_const(dtypes.float, -1, (27, 32, 1, 5)) - second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_reduce)) - sink = UOp(Ops.SINK, src=(store,)) - opts = [ - # locals - [Opt(OptOps.LOCAL, 0, 3)], - [Opt(OptOps.LOCAL, 0, 9)], - [Opt(OptOps.LOCAL, 0, 27)], - # grouping - [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)], - [Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 8)], - [Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 16)], - [Opt(OptOps.GROUPTOP, 0, 32), Opt(OptOps.GROUPTOP, 0, 32)], - # # unroll - [Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UNROLL, 1, 2)], - [Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UNROLL, 1, 4)], - [Opt(OptOps.UNROLL, 0, 8), Opt(OptOps.UNROLL, 1, 8)] if Device.DEFAULT not in {"NV", "METAL"} else [], - # # upcasting - [Opt(OptOps.UPCAST, 0, 3)], - [Opt(OptOps.UPCAST, 0, 9)], - # locals with grouping - [Opt(OptOps.LOCAL, 0, 3), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)], - # locals with unroll - [Opt(OptOps.LOCAL, 0, 3), Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UNROLL, 1, 2)], - # locals with upcasting - [Opt(OptOps.LOCAL, 0, 3), Opt(OptOps.UPCAST, 0, 9)], - # grouping with unrolling - [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.UNROLL, 2, 2), Opt(OptOps.UNROLL, 3, 2)], - [Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UNROLL, 2, 8), Opt(OptOps.UNROLL, 2, 8)], - # grouping with upcasting - [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.UPCAST, 0, 3)], - # locals with grouping with unroll - [Opt(OptOps.LOCAL, 0, 3), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.UNROLL, 2, 2), Opt(OptOps.UNROLL, 3, 2)], - [Opt(OptOps.LOCAL, 0, 3), Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UNROLL, 2, 8), Opt(OptOps.UNROLL, 2, 8)], - # locals with grouping with upcasting - [Opt(OptOps.LOCAL, 0, 3), Opt(OptOps.UPCAST, 0, 3), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)], - [Opt(OptOps.LOCAL, 0, 9), Opt(OptOps.UPCAST, 0, 3), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)], - # grouping with unrolling and upcasting - [Opt(OptOps.UPCAST, 0, 3), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.UNROLL, 2, 2), Opt(OptOps.UNROLL, 3, 2)], - [Opt(OptOps.UPCAST, 0, 3), Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UNROLL, 2, 8), Opt(OptOps.UNROLL, 2, 8)], - # locals + grouping + unrolling + upcasting - [Opt(OptOps.LOCAL, 0, 3), Opt(OptOps.UPCAST, 0, 3), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2), - Opt(OptOps.UNROLL, 2, 2), Opt(OptOps.UNROLL, 3, 2)], - ] - wanna_output = (x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,5) - lins = helper_linearizer_ast(sink, [x], wanna_output=[wanna_output], opts=opts) - self._test_no_nested_ranges(lins, [0]) - - def test_triple_multireduce(self): - Tensor.manual_seed(0) - x0 = Tensor.randn(27, 32, 5, dtype=dtypes.float).realize() - x1 = Tensor.randn(27, 32, 5, dtype=dtypes.float).realize() - x2 = Tensor.randn(27, 32, 5, dtype=dtypes.float).realize() - g0, g1, g2, g3 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(4)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1.view(x0.uop.st.reshape((27, 1, 1, 32, 5)).expand((27, 32, 32, 32, 5))),)) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (3,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g2.view(x1.uop.st.reshape((27, 1, 32, 1, 5)).expand((27, 32, 32, 1, 5))),)) - diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 32, 32, 1, 5))) - second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (2,))) - third_x = UOp(Ops.LOAD, dtypes.float, (g3.view(x2.uop.st.reshape((27, 32, 1, 1, 5))),)) - mul = (third_x*second_reduce) - third_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (mul,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 1, 5))), third_reduce)) - sink = UOp(Ops.SINK, src=(store,)) - wanna_output = (x2.numpy()*(x1.numpy()-x0.numpy().sum(axis=1, keepdims=True)).sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,1,5) - lins = helper_linearizer_ast(sink, [x0,x1,x2], wanna_output=[wanna_output]) - self._test_no_nested_ranges(lins, [0]) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared") - @unittest.skip("this is not supported, it worked by luck") - def test_double_reduce_multireduce(self): - Tensor.manual_seed(0) - x = Tensor.randn(8, 32, 8, 16, dtype=dtypes.float).realize() - st = x.uop.st - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, st.reshape((8, 1, 32, 8, 1, 16)).expand((8, 32, 32, 8, 16, 16)).to_uop())) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2, 5))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, st.reshape((8, 32, 1, 8, 16, 1)).to_uop())) - neg_first_reduce = first_reduce * ast_const(dtypes.float, -1, (8, 32, 1, 8, 16, 1)) - squares = (second_x+neg_first_reduce) - squares_sum = UOp(Ops.REDUCE_AXIS, dtypes.float, (squares,), (Ops.ADD, (1, 4))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((8, 1, 1, 8, 1, 1)).to_uop(), squares_sum,)) - sink = UOp(Ops.SINK, src=(store,)) - wanna_output = (x.numpy()-x.numpy().sum(axis=(1,3), keepdims=True)).sum(axis=(1,3)).reshape((8,1,1,8,1,1)) - opts = [ - # openCL / GPU=1 is 256 max threads - # grouping - [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)], # first dim of both reduces - [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 3, 2)], # both dims of the second reduce - [Opt(OptOps.GROUPTOP, 2, 2), Opt(OptOps.GROUPTOP, 3, 2)], # second dim of both reduces - [Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.GROUPTOP, 3, 2)], # both dims of the first reduce - # group all reduce dims - [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.GROUPTOP, 2, 2), Opt(OptOps.GROUPTOP, 3, 2)], - # checking how it works with 2 grouped reduces + unrolling - [Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.GROUPTOP, 2, 4), Opt(OptOps.GROUPTOP, 3, 4), - Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UNROLL, 1, 2), Opt(OptOps.UNROLL, 2, 2), Opt(OptOps.UNROLL, 3, 2)], - # Checking how it works with 2 grouped reduces + locals. - [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 0, 4), - Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.GROUPTOP, 2, 2), Opt(OptOps.GROUPTOP, 3, 2)], - # Checking how it works with 2 grouped reduces + locals + unroll. - [Opt(OptOps.LOCAL, 0, 2), - Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.GROUPTOP, 2, 4), Opt(OptOps.GROUPTOP, 3, 4), - Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UNROLL, 1, 2), Opt(OptOps.UNROLL, 2, 2), Opt(OptOps.UNROLL, 3, 2)], - # Checking how it works with 2 grouped reduces + locals + upcast. - [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.UPCAST, 0, 2), - Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.GROUPTOP, 2, 2), Opt(OptOps.GROUPTOP, 3, 2)], - # Checking how it works with 2 grouped reduces + locals + upcast + unroll. - [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.UPCAST, 0, 2), - Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.GROUPTOP, 2, 4), Opt(OptOps.GROUPTOP, 3, 4), - Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UNROLL, 1, 2), Opt(OptOps.UNROLL, 2, 2), Opt(OptOps.UNROLL, 3, 2)], - ] - lins = helper_linearizer_ast(sink, [x], wanna_output=[wanna_output], opts=opts) - self._test_no_nested_ranges(lins, [0, 1]) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared") - def test_partial_opt_multireduce(self): - # check how it works with one reduce optimized and one unoptimized - Tensor.manual_seed(0) - x = Tensor.randn(27, 15, 5, dtype=dtypes.float).softmax(1).realize() - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1.view(x.uop.st.reshape((27, 1, 15, 5)).expand((27, 15, 15, 5))),)) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1.view(x.uop.st.reshape((27, 15, 1, 5))),)) - diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 15, 1, 5))) - second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_reduce)) - sink = UOp(Ops.SINK, src=(store,)) - opts = [ - [Opt(OptOps.GROUPTOP, 0, 3)], # grouping - [Opt(OptOps.GROUPTOP, 1, 3)], - [Opt(OptOps.GROUPTOP, 0, 15)], - [Opt(OptOps.GROUPTOP, 1, 15)], - [Opt(OptOps.UNROLL, 0, 3)], - [Opt(OptOps.UNROLL, 1, 3)], - ] - wanna_output = (x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,5) - lins = helper_linearizer_ast(sink, [x], wanna_output=[wanna_output], opts=opts) - self._test_no_nested_ranges(lins, [0]) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared") - def test_multireduce_with_parallel(self): - Tensor.manual_seed(0) - x = Tensor.randn(4, 32, dtype=dtypes.float).realize() - x_p = Tensor.randn(4, 32, dtype=dtypes.float).realize() - g0, g1, g2 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(3)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1.view(x.uop.st.reshape((4, 1, 32)).expand((4, 32, 32))),)) - first_x_p = UOp(Ops.LOAD, dtypes.float, (g2.view(x_p.uop.st.reshape((4, 1, 32)).expand((4, 32, 32))),)) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - first_reduce_p = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x_p.alu(Ops.EXP2),), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1.view(x.uop.st.reshape((4, 32, 1))),)) - diff = (second_x+(first_reduce + first_reduce_p)*ast_const(dtypes.float, -1, (4, 32, 1))) - second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((4, 1, 1))), second_reduce)) - sink = UOp(Ops.SINK, src=(store,)) - opts = [ - # [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)], # grouping - # [Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 8)], - # [Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 16)], - # [Opt(OptOps.GROUPTOP, 0, 32), Opt(OptOps.GROUPTOP, 0, 32)], - [Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UNROLL, 1, 2)], # unroll reduce - [Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UNROLL, 1, 4)], - [Opt(OptOps.UNROLL, 0, 8), Opt(OptOps.UNROLL, 1, 8)] if Device.DEFAULT not in {"NV", "METAL"} else [], # can't do float8, - # [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.UNROLL, 2, 2), Opt(OptOps.UNROLL, 3, 2)], # grouping + unrolling - # [Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UNROLL, 1, 2), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)], - # [Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UNROLL, 2, 8), Opt(OptOps.UNROLL, 2, 8)], - # [Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UNROLL, 1, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 0, 8)], - ] - wanna_output = (x.numpy()-(x.numpy().sum(-1, keepdims=True)+np.exp2(x_p.numpy()).sum(-1, keepdims=True))).sum(-1).reshape(4, 1,1) - lins = helper_linearizer_ast(sink, [x,x_p], wanna_output=[wanna_output], opts=opts) - self._test_no_nested_ranges(lins, [0]) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - def test_multiout_multireduce(self): - # check how multireduce works with multioutput - Tensor.manual_seed(0) - x = Tensor.randn(27, 15, 5, dtype=dtypes.float).realize() - g0, g1, g2 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(3)] - first_x = UOp(Ops.LOAD, dtypes.float, (g2.view(x.uop.st.reshape((27, 1, 15, 5)).expand((27, 15, 15, 5))),)) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g2.view(x.uop.st.reshape((27, 15, 1, 5))),)) - diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 15, 1, 5))) - second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store0 = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_reduce)) - second_out = second_reduce * ast_const(dtypes.float, 1/15, (27, 1, 1, 5)) - store1 = UOp(Ops.STORE, src=(g1.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_out)) - sink = UOp(Ops.SINK, src=(store0, store1)) - wanna_output = (x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,5) - - helper_linearizer_ast(sink, [x], wanna_output=[wanna_output, wanna_output/15]) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - def test_multiout_intermediate_multireduce(self): - # check how it outputing at different stages of the multireduce works - # TODO: Fails because the stores shapes do not match: store1.shape = (27,15,1,5) != store0.shape = (27,1,1,5) - # so the output shapes are different (FAIL!), - # if we change the shape of store1 to be contiguous, it will match store0 but not the value it's storing (FAIL!) - Tensor.manual_seed(0) - x = Tensor.randn(27, 15, 5, dtype=dtypes.float).realize() - g0, g1, g2 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(3)] - first_x = UOp(Ops.LOAD, dtypes.float, src=(g2.view(x.uop.st.reshape((27, 1, 15, 5)).expand((27, 15, 15, 5))),)) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, src=(g2.view(x.uop.st.reshape((27, 15, 1, 5))),)) - diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 15, 1, 5))) - second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store0 = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_reduce)) - store1 = UOp(Ops.STORE, src=(g1.view(ShapeTracker(views=(View(shape=(27,15,1,5), strides=(5,0,1,1), offset=0, mask=None, contiguous=False),))), first_reduce)) # noqa: E501 - wanna_output0 = (x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,5) - wanna_output1 = x.numpy().sum(axis=1).reshape(27,1,1,5) - - sink = UOp(Ops.SINK, src=(store0, store1)) - with self.assertRaises(RuntimeError): # AST is invalid - helper_linearizer_ast(sink, [x], wanna_output=[wanna_output0, wanna_output1]) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - def test_complete_unroll_multireduce(self): - Tensor.manual_seed(0) - x = Tensor.randn(27, 3, 5, dtype=dtypes.float).realize() - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((27, 1, 3, 5)).expand((27, 3, 3, 5))),)) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((27, 3, 1, 5))),)) - diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 3, 1, 5))) - second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_reduce)) - sink = UOp(Ops.SINK, src=(store,)) - opts = [[Opt(OptOps.UNROLL, 0, 3), Opt(OptOps.UNROLL, 0, 3)]] - wanna_output = (x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,5) - helper_linearizer_ast(sink, [x], wanna_output=[wanna_output], opts=opts) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - def test_upcast_multireduce(self): - Tensor.manual_seed(0) - x = Tensor.randn(27, 3, 5, dtype=dtypes.float).realize() - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((27, 1, 3, 5)).expand((27, 3, 3, 5))),)) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((27, 3, 1, 5))),)) - diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 3, 1, 5))) - second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((27, 1, 1, 5))), second_reduce)) - sink = UOp(Ops.SINK, src=(store,)) - opts = [[Opt(OptOps.UPCAST, 0, 3)]] - wanna_output = (x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,5) - helper_linearizer_ast(sink, [x], wanna_output=[wanna_output], opts=opts) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") - @unittest.skip("can't group with multiple reduces yet") - def test_early_endif(self): - # make sure the if block of a grouped reduce can be closed early and the result loaded back in - Tensor.manual_seed(0) - x = Tensor.randn(27, 12, 5, dtype=dtypes.float).realize() - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, (g1, x.uop.st.reshape((27, 1, 12, 5)).expand((27, 12, 12, 5)).to_uop())) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, (g1, x.uop.st.reshape((27, 12, 1, 5)).to_uop())) - diff = (second_x+first_reduce*ast_const(dtypes.float, -1, (27, 12, 1, 5))) - second_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (diff,), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((27, 1, 1, 5)).to_uop(), second_reduce)) - sink = UOp(Ops.SINK, src=(store,)) - opts = [[Opt(OptOps.GROUPTOP, 0, 3), Opt(OptOps.GROUPTOP, 1, 3)]] - wanna_output = (x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(27,1,1,5) - helper_linearizer_ast(sink, [x], wanna_output=[wanna_output], opts=opts) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - def test_mean_std_multireduce(self): - Tensor.manual_seed(0) - x = Tensor.randn(15, 25, 35, dtype=dtypes.float).realize() - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((15, 25, 1, 35)).expand((15, 25, 35, 35))),)) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (3,))) - neg_mean = first_reduce * ast_const(dtypes.float, -1/35, (15, 25, 35, 1)) - second_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((15, 25, 35, 1))),)) - squares = (second_x+neg_mean)*(second_x+neg_mean) - squares_sum = UOp(Ops.REDUCE_AXIS, dtypes.float, (squares,), (Ops.ADD, (2,))) - variance = squares_sum * ast_const(dtypes.float, 1/35, (15, 25, 1, 1)) - std = variance.alu(Ops.SQRT) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((15, 25, 1, 1))), std)) - sink = UOp(Ops.SINK, src=(store,)) - wanna_output = x.numpy().std(axis=2, ddof=0).reshape((15,25,1,1)) - helper_linearizer_ast(sink, [x], wanna_output=[wanna_output]) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - def test_mean_std_multireduce_mid_dim(self): - Tensor.manual_seed(0) - x = Tensor.randn(15, 25, 35, dtype=dtypes.float).realize() - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((15, 1, 25, 35)).expand((15, 25, 25, 35))),)) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (2,))) - neg_mean = first_reduce * ast_const(dtypes.float, -0.04, (15, 25, 1, 35)) - second_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((15, 25, 1, 35))),)) - squares = (second_x+neg_mean)*(second_x+neg_mean) - squares_sum = UOp(Ops.REDUCE_AXIS, dtypes.float, (squares,), (Ops.ADD, (1,))) - variance = squares_sum * ast_const(dtypes.float, 0.04, (15, 1, 1, 35)) - std = variance.alu(Ops.SQRT) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((15, 1, 1, 35))), std)) - sink = UOp(Ops.SINK, src=(store,)) - wanna_output = x.numpy().std(axis=1, ddof=0).reshape((15,1,1,35)) - helper_linearizer_ast(sink, [x], wanna_output=[wanna_output]) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - @unittest.expectedFailure - def test_mean_std_multireduce_multiout(self): - # TODO: Similar error to test_multiout_intermediate_multireduce (implicit expand vs shape mismatch) - Tensor.manual_seed(0) - x = Tensor.randn(15, 25, 35, dtype=dtypes.float).realize() - g0, g1, g2 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(3)] - first_x = UOp(Ops.LOAD, dtypes.float, (g2, x.uop.st.reshape((15, 25, 1, 35)).expand((15, 25, 35, 35)).to_uop())) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (3,))) - neg_mean = first_reduce * ast_const(dtypes.float, -1/35, (15, 25, 35, 1)) - second_x = UOp(Ops.LOAD, dtypes.float, (g2, x.uop.st.reshape((15, 25, 35, 1)).to_uop())) - squares = (second_x+neg_mean)*(second_x+neg_mean) - squares_sum = UOp(Ops.REDUCE_AXIS, dtypes.float, (squares,), (Ops.ADD, (2,))) - variance = squares_sum * ast_const(dtypes.float, 1/35, (15, 25, 1, 1)) - std = variance.alu(Ops.SQRT) - store_mean = UOp(Ops.STORE, src=(g1, ShapeTracker.from_shape((15, 25, 1, 1)).to_uop(), neg_mean)) - store_std = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((15, 25, 1, 1)).to_uop(), std)) - sink = UOp(Ops.SINK, src=(store_std, store_mean)) - wanna_output = [x.numpy().std(axis=2, ddof=0).reshape(15,25,1,1), x.numpy().mean(axis=2).reshape(15,25,1,1)] - - lins = helper_linearizer_ast(sink, [x], wanna_output=wanna_output) - for k in lins: - assert len([u for u in k.uops if u.op is Ops.DEFINE_ACC]) == 2, "got more than two accs (implies the kernel didn't reuse the mean reduce)" - - @unittest.skipIf(CI and Device.DEFAULT in {"PTX", "AMD", "NV"}, "ocelot/remu doesn't have multiple wave syncs yet") - def test_var_multireduce(self): - Tensor.manual_seed(0) - x = Tensor.randn(3, 27, 32, dtype=dtypes.float).realize() - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - # push reduce (3, 27, 32) -> (3, 27, 1) -> (3, 27, 32) expand to LOAD - first_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((3, 27, 1, 32)).expand((3, 27, 32, 32))),)) - first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (3,))) - neg_mean = first_reduce * ast_const(dtypes.float, -0.03125, (3, 27, 32, 1)) - # store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((3, 27, 32, 1)).to_uop(), mean)) - # verify_lazyop(store) - second_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((3, 27, 32, 1))),)) - squares = (second_x+neg_mean)*(second_x+neg_mean) - squares_sum = UOp(Ops.REDUCE_AXIS, dtypes.float, (squares,), (Ops.ADD, (2,))) - variance = squares_sum * ast_const(dtypes.float, 0.03125, (3, 27, 1, 1)) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((3, 27, 1, 1))), variance)) - sink = UOp(Ops.SINK, src=(store,)) - wanna_output = x.numpy().var(axis=2, ddof=0).reshape((3,27,1,1)) - helper_linearizer_ast(sink, [x], wanna_output=[wanna_output]) - # tinygrad ref - y_tiny = x.var(axis=2, correction=0).reshape(3,27,1,1) - np.testing.assert_allclose(y_tiny.numpy(), wanna_output, atol=1e-4, rtol=1e-4) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - def test_softmax_multireduce(self): - x = Tensor.rand(4, 32).realize() - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - first_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((4, 1, 32,)).expand((4, 32, 32))),)) - max_x = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.MAX, (2,))) - second_x = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((4, 32, 1,))),)) - centered_x = second_x+max_x*ast_const(dtypes.float, -1, (4, 32, 1)) - exp_x = centered_x.alu(Ops.EXP2) - sum_exp_x = UOp(Ops.REDUCE_AXIS, dtypes.float, (exp_x,), (Ops.ADD, (1,))) - # y = exp_x * sum_exp_x.alu(Ops.RECIP) # kernels cannot do a return to full shape - recip_sum_exp_x = sum_exp_x.alu(Ops.RECIP) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((4,1,1))), recip_sum_exp_x)) - sink = UOp(Ops.SINK, src=(store,)) - expected = 1/np.exp2(x.numpy() - x.numpy().max(axis=-1, keepdims=True)).sum(axis=-1, keepdims=True).reshape(4,1,1) - helper_linearizer_ast(sink, [x], wanna_output=[expected]) - @unittest.skipIf(CI and Device.DEFAULT in {"PTX", "AMD", "NV"}, "very slow") def test_indexing_multireduce(self): dataset = Tensor.rand(16384, 256).realize() @@ -560,271 +145,6 @@ class TestLinearizer(unittest.TestCase): real_index = dataset.numpy()[idxs.numpy()].reshape(4, 256, 1, 1) helper_linearizer_ast(sink, [dataset, idxs], wanna_output=[real_index]) - # AssertionError: repeated stores in uops - def test_argmax_multireduce_axis0(self): - t = Tensor.randn(10, 20).realize() - t_max = t.max((0,)).realize() - real_argmax = np.argmax(t.numpy(), axis=0, keepdims=False).reshape(1, 20, 1) - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(20), arg=ShapeTracker(views=(View(shape=(1, 20, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(-1), arg=0, src=()),)), - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=10, src=( - x6:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 20, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), # noqa: E501 - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x8:=UOp(Ops.CONST, dtypes.int, arg=-1, src=( - x6,)), - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.MAX, (0,)), src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CAST, dtypes.int, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(200), arg=ShapeTracker(views=(View(shape=(10, 20, 1), strides=(20, 1, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(-1), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(20), arg=ShapeTracker(views=(View(shape=(10, 20, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(-1), arg=2, src=()),)),)),)), - UOp(Ops.CONST, dtypes.bool, arg=True, src=( - x21:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 20, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), # noqa: E501 - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (2,)), src=( - UOp(Ops.WHERE, dtypes.int, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(11, 19), strides=(0, 0), offset=0, mask=((0, 11), (9, 19)), contiguous=False), View(shape=(10, 20, 10), strides=(1, 0, 20), offset=0, mask=None, contiguous=False))), src=()),)), # noqa: E501 - UOp(Ops.CONST, dtypes.int, arg=-1, src=( - x28:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 20, 10), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), # noqa: E501 - UOp(Ops.CONST, dtypes.int, arg=0, src=( - x28,)),)),)), - UOp(Ops.CONST, dtypes.int, arg=10, src=( - x21,)),)),)),)),)),)), - x8,)),)),)) - helper_linearizer_ast(ast, [t, t_max], wanna_output=[real_argmax]) - - def test_argmax_multireduce_flat(self): - t = Tensor.randn(10, 20).realize() - t_max = t.max().realize() - real_argmax = np.argmax(t.numpy()) - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(1), arg=ShapeTracker(views=(View(shape=(1, 1), strides=(0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(-1), arg=0, src=()),)), - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=200, src=( - x6:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1), strides=(0, 0), offset=0, mask=None, contiguous=True),)), src=()),)), # noqa: E501 - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x8:=UOp(Ops.CONST, dtypes.int, arg=-1, src=( - x6,)), - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.MAX, (0,)), src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CAST, dtypes.int, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(200), arg=ShapeTracker(views=(View(shape=(200, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(-1), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(200, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(-1), arg=2, src=()),)),)),)), - UOp(Ops.CONST, dtypes.bool, arg=True, src=( - x21:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(200, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), # noqa: E501 - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.WHERE, dtypes.int, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(201, 399), strides=(0, 0), offset=0, mask=((0, 201), (199, 399)), contiguous=False), View(shape=(200, 200), strides=(1, 400), offset=0, mask=None, contiguous=False))), src=()),)), # noqa: E501 - UOp(Ops.CONST, dtypes.int, arg=-1, src=( - x28:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(200, 200), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), # noqa: E501 - UOp(Ops.CONST, dtypes.int, arg=0, src=( - x28,)),)),)), - UOp(Ops.CONST, dtypes.int, arg=200, src=( - x21,)),)),)),)),)),)), - x8,)),)),)) - helper_linearizer_ast(ast, [t, t_max], wanna_output=[real_argmax]) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - def test_padto_sum_multireduce(self): - Tensor.manual_seed(0) - N = 17 - x = Tensor.rand(N, N).realize() - opts = [ - [Opt(OptOps.PADTO, 0, 32)], - [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8),], - # TODO: multireduce pads - # causes an issue because the acc won't be masked in the second reduce - # [Opt(OptOps.PADTO, 1, 32), Opt(OptOps.PADTO, 2, 32)] - ] - - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - x_ld0 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((1, N, N)).expand((N,N,N))),)) - x_ld1 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((N, 1, N))),)) - r0 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld0,), (Ops.ADD, (1,))) - r1 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld1+r0*ast_const(dtypes.float, -1, (N, 1, N)),),(Ops.ADD, (0,))) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((1,1,N))), r1)) - sink = UOp(Ops.SINK, src=(store,)) - helper_linearizer_ast(sink, [x], wanna_output=[(x.numpy()-x.numpy().sum(axis=0, keepdims=True)).sum(axis=0).reshape(1,1,N)], opts=opts) - - x_ld0 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((N, 1, N)).expand((N,N,N))),)) - x_ld1 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((N, N, 1))),)) - r0 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld0,), (Ops.ADD, (2,))) - r1 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld1+r0*ast_const(dtypes.float, -1, (N, N, 1)),), (Ops.ADD, (1,))) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((N,1,1))), r1)) - sink = UOp(Ops.SINK, src=(store,)) - helper_linearizer_ast(sink, [x], wanna_output=[(x.numpy()-x.numpy().sum(axis=1, keepdims=True)).sum(axis=1).reshape(N,1,1)], opts=opts) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - def test_padto_max_multireduce(self): - Tensor.manual_seed(0) - N = 17 - x = Tensor.rand(N, N).realize() - opts = [ - [Opt(OptOps.PADTO, 0, 32)], - [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8),] - ] - - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(2)] - x_ld0 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((1, N, N)).expand((N,N,N))),)) - x_ld1 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((N, 1, N))),)) - r0 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld0,), (Ops.MAX, (1,))) - r1 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld1+r0*ast_const(dtypes.float, -1, (N, 1, N)),), (Ops.MAX, (0,))) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((1,1,N))), r1)) - sink = UOp(Ops.SINK, src=(store,)) - helper_linearizer_ast(sink, [x], wanna_output=[(x.numpy()-x.numpy().max(axis=0, keepdims=True)).max(axis=0).reshape(1,1,N)], opts=opts) - - x_ld0 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((N, 1, N)).expand((N,N,N))),)) - x_ld1 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(x.uop.st.reshape((N, N, 1))),)) - r0 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld0,), (Ops.MAX, (2,))) - r1 = UOp(Ops.REDUCE_AXIS, dtypes.float, (x_ld1+r0*ast_const(dtypes.float, -1, (N, N, 1)),), (Ops.MAX, (1,))) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((N,1,1))), r1)) - sink = UOp(Ops.SINK, src=(store,)) - helper_linearizer_ast(sink, [x], wanna_output=[(x.numpy()-x.numpy().max(axis=1, keepdims=True)).max(axis=1).reshape(N,1,1)], opts=opts) - - @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI doesn't support multiple sync threads yet") - def test_padto_where_multireduce(self): - # ternary operators try to use both ridxs - - # we need to make sure the ternary operators nest properly - N = 17 - x = Tensor.rand(N, N).realize() - a = Tensor.rand(1, 1).realize() - b = Tensor.rand(1, 1).realize() - opts = [[Opt(OptOps.PADTO, 0, 32)],[Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8),],] - - wanna_output = np.where(0.5*17 < (x.numpy()+np.where(0.75*17 < x.numpy().sum(axis=1,keepdims=True), a.numpy(), b.numpy())).sum(axis=1),0.0,1.0).reshape((N,1,1)) # noqa: E501 - ld0 = x.uop.st.reshape((N, 1, N)).expand((N,N,N)) - ld1 = x.uop.st.reshape((N, N, 1)) - ast = UOp(Ops.SINK, src=( - UOp(Ops.STORE, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0),)), - UOp(Ops.WHERE, dtypes.float, arg=None, src=( - UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( - ast_const(dtypes.float, 0.5*N, (N, 1, 1)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ld1, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1),)),)), - UOp(Ops.WHERE, dtypes.float, arg=None, src=( - UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( - ast_const(dtypes.float, 0.75*N, (N, N, 1)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2,)), src=( - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ld0, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, N, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2),)),)), - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, N, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3),)),)),)),)),)),)), - ast_const(dtypes.float, 0.0, (N, 1, 1)), - ast_const(dtypes.float, 1.0, (N, 1, 1)),)),)),)) - helper_linearizer_ast(ast, [x,a,b], opts=opts, wanna_output=[wanna_output]) - - ld0 = x.uop.st.reshape((1, N, N)).expand((N,N,N)) - ld1 = x.uop.st.reshape((N, 1, N)) - wanna_output = np.where(0.5*17 < (x.numpy()+np.where(0.75*17 < x.numpy().sum(axis=0,keepdims=True), a.numpy(), b.numpy())).sum(axis=0),0.0,1.0).reshape(1,1,N) # noqa: E501 - ast = UOp(Ops.SINK, src=( - UOp(Ops.STORE, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(1, 1, N), strides=(0, 0, 1), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()),)), - UOp(Ops.WHERE, dtypes.float, arg=None, src=( - UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( - ast_const(dtypes.float, 0.5*N, (1, 1, N)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0,)), src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ld1, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()),)),)), - UOp(Ops.WHERE, dtypes.float, arg=None, src=( - UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( - ast_const(dtypes.float, 0.75*N, (N, 1, N)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ld0, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1, src=()),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, 1, N), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, 1, N), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3, src=()),)),)),)),)),)),)), - ast_const(dtypes.float, 0.0, (1, 1, N)), - ast_const(dtypes.float, 1.0, (1, 1, N)),)),)),)) - helper_linearizer_ast(ast, [x,a,b], opts=opts, wanna_output=[wanna_output]) - # pad reduce axis - helper_linearizer_ast(ast, [x,a,b], opts=[[Opt(OptOps.PADTO, 1, 32)],], wanna_output=[wanna_output]) - - ld0 = x.uop.st.reshape((1,1,N,N)).expand((N,N,N,N)) - ld1 = x.uop.st.reshape((N,N,1,1)) - wanna_output = np.where(0.5*17 < (x.numpy()+np.where(0.75*17 < x.numpy().sum(keepdims=True), a.numpy(), b.numpy())).sum(keepdims=True),0.0,1.0).reshape((1,1,1,1))# noqa: E501 - ast = UOp(Ops.SINK, src=( - UOp(Ops.STORE, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()),)), - UOp(Ops.WHERE, dtypes.float, arg=None, src=( - UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( - ast_const(dtypes.float, 0.5*N, (1, 1, 1, 1)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 1)), src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, N, 1, 1), strides=(N, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1),)),)), - UOp(Ops.WHERE, dtypes.float, arg=None, src=( - UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( - ast_const(dtypes.float, 0.75*N, (N, N, 1, 1)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2, 3)), src=( - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, N, N, N), strides=(0, 0, N, 1), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=1),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, N, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2),)),)), - UOp(Ops.LOAD, dtypes.float, src=( - UOp(Ops.VIEW, dtypes.float.ptr(), arg=ShapeTracker(views=(View(shape=(N, N, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=3),)),)),)),)),)),)), - ast_const(dtypes.float, 0.0, (1, 1, 1, 1)), - ast_const(dtypes.float, 1.0, (1, 1, 1, 1)),)),)),)) - helper_linearizer_ast(ast, [x,a,b], opts=[[Opt(OptOps.PADTO, 0, 32)],], wanna_output=[wanna_output]) - - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared") - def test_end_local(self): - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=i) for i in range(2)] - load = UOp(Ops.LOAD, dtypes.int, (g1.view(ShapeTracker.from_shape((32,))),)) - reduce = UOp(Ops.REDUCE_AXIS, dtypes.int, (load,), (Ops.ADD, (0,))) - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker.from_shape((1,))), reduce)) - sink = UOp(Ops.SINK, src=(store,)) - load_t = Tensor.full(load.st_arg.shape, 1).contiguous().realize() - k = helper_linearizer_ast(sink, [load_t], wanna_output=[load_t.numpy().sum()])[1] - self.assertEqual(k.uops[-2].op, Ops.ENDIF) - self.assertEqual(k.uops[-1].op, Ops.SINK) - self.assertLess(k.uops.index([x for x in k.uops if x.op is Ops.STORE][-1]), k.uops.index(k.uops[-1])) - def test_two_nested_range(self): a = Tensor.randn(2, ).realize() out = a.reshape(2, 1).expand(2, 3).sum() @@ -914,24 +234,6 @@ class TestLinearizer(unittest.TestCase): assert num_loads <= 4, "more load uops than needed" assert num_loads >= 4, "unexpected number of uops, maybe this test needs updating?" - @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason") - def test_load_cache_const_bufs(self): - # make sure const buffers are differentiated from local and mem buffers - ST, DT = ShapeTracker(views=(View(shape=((1,)), strides=(0, 0), offset=0, mask=None, contiguous=False),)).to_uop(), dtypes.int - VAL = ast_const(DT, 2, ST.arg.shape) - g0, g1 = [UOp(Ops.DEFINE_GLOBAL, DT.ptr(), arg=i) for i in range(2)] - - # data1[0] + VAL - a = UOp(Ops.LOAD, DT, (g1.view(ST.arg),)) + VAL - # (literal const 1) + VAL - b = ast_const(DT, 1, ST.arg.shape) + VAL - - store = UOp(Ops.STORE, src=(g0.view(ST.arg), (a+b))) - sink = UOp(Ops.SINK, src=(store,)) - sink = sink.replace(arg=KernelInfo(opts_to_apply=tuple())) - program = get_program(sink, Device[Device.DEFAULT].renderer) - assert len(program.uops) <= 10, "too many uops" - def test_upcast_cse(self): # when upcasting, within a subtree, there may be common expressions. @@ -1986,29 +1288,6 @@ class TestKernelOpts(unittest.TestCase): with self.assertRaises(AssertionError): assert k.apply_tensor_cores(use_tensor_cores=1, extra_opts=x), "no valid tensor core" # for METAL in runners - @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores") - def test_buf_index_not_found_tensor_core(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(256), arg=ShapeTracker(views=(View(shape=(1, 256), strides=(0, 1), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(256), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0,)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(256), arg=ShapeTracker(views=(View(shape=(1243, 256), strides=(0, 1), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(256), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(1243), arg=ShapeTracker(views=(View(shape=(1243, 256), strides=(1, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1243), arg=2, src=()),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(1243), arg=ShapeTracker(views=(View(shape=(1243, 256), strides=(1, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501 - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1243), arg=3, src=()),)),)),)),)),)),)) - k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) - with self.assertRaises(KernelOptError): - k.apply_opt(Opt(OptOps.TC, 0, (-1, 1, 1))) - @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores") @unittest.skipUnless(any(tc.dtype_in == tc.dtype_out == dtypes.half for tc in Device[Device.DEFAULT].renderer.tensor_cores), "test requires tensor cores with accumulation in half") # testing with half suffices. @@ -2198,23 +1477,6 @@ class TestKernelOpts(unittest.TestCase): [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8),], ]) - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared") - def test_padto_group(self): - Tensor.manual_seed(0) - g0, g1, g2 = [UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=i) for i in range(3)] - ld0 = UOp(Ops.LOAD, dtypes.float, src=(g1.view(ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=False),))),)) # noqa: E501 - ld1 = UOp(Ops.LOAD, dtypes.float, src=(g2.view(ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),))),)) # noqa: E501 - store = UOp(Ops.STORE, src=(g0.view(ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 4, 1, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=True),))), UOp(Ops.REDUCE_AXIS, dtypes.float, (ld0*ld1,), (Ops.ADD, (0, 2, 4, 6)),))) # noqa: E501 - sink = UOp(Ops.SINK, src=(store,)) - data1 = Tensor.randn(2, 1, 4, 1, 3, 4, 2, 6, 1, 3).realize() - data2 = Tensor.randn(2, 1, 4, 1, 3, 4, 2, 6, 1, 3).realize() - helper_linearizer_ast(sink, [data1, data2], opts=[ - #[Opt(OptOps.PADTO, 0, 32), Opt(OptOps.GROUP, 0, 4)], - #[Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8)], - #[Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8), Opt(OptOps.GROUP, 0, 4)] - ]) - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared") def test_color_shapes_with_local(self): diff --git a/test/test_linearizer_failures.py b/test/test_linearizer_failures.py deleted file mode 100644 index a5e5a1a0ef..0000000000 --- a/test/test_linearizer_failures.py +++ /dev/null @@ -1,1706 +0,0 @@ -# ruff: noqa: E501 -import unittest, random -import numpy as np -from tinygrad.opt.kernel import Kernel, KernelOptError -from tinygrad.device import is_dtype_supported -from tinygrad.uop.ops import UOp, Ops -from tinygrad.opt.search import Opt, OptOps -from tinygrad import Device, dtypes, Tensor -from tinygrad.helpers import CI -from test.external.fuzz_linearizer import compare_linearizer - -from tinygrad.shape.shapetracker import ShapeTracker -from tinygrad.shape.view import View - -def helper_test_lin(lin: Kernel, opts, failed_platforms, rtol=1e-2, atol=1e-2): - if any(b.dtype.base == dtypes.half for b in lin.membufs) and not is_dtype_supported(dtypes.half): return - if any(b.dtype.base == dtypes.bfloat16 for b in lin.membufs) and not is_dtype_supported(dtypes.bfloat16): return - - try: - lin.apply_opts(opts) - except KernelOptError: - # it's considered fixed if we invalidated the opts - assert Device.DEFAULT not in failed_platforms, f"unexpected success on {Device.DEFAULT}" - return - - compare_result = compare_linearizer(lin, rtol=rtol, atol=atol) - if compare_result[0] in ["PASS", "KernelOptError"]: - # it's considered fixed if we invalidated the opts - assert Device.DEFAULT not in failed_platforms, f"unexpected success on {Device.DEFAULT}" - else: - assert Device.DEFAULT in failed_platforms, f"failed on {Device.DEFAULT} with {compare_result[0]}" - return lin - -@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "failed on CUDA CI") -class TestLinearizerFailures(unittest.TestCase): - def setUp(self): - random.seed(42) - np.random.seed(42) - Tensor.manual_seed(42) - - def test_failure_1(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(512), arg=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(16, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(512), arg=0, src=()),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2,)), src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(512), arg=ShapeTracker(views=(View(shape=(32, 16, 16), strides=(16, 1, 0), offset=0, mask=None, contiguous=False),)), src=( - x8:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(512), arg=1, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=2, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(512), arg=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(16, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - x8,)),)),)),)),)) - helper_test_lin(Kernel(ast), [], failed_platforms=[]) - - def test_failure_2(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(21312), arg=ShapeTracker(views=(View(shape=(32, 2, 37, 9, 1, 1), strides=(666, 333, 9, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(21312), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.MAX, (4, 5)), src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(197119), arg=ShapeTracker(views=(View(shape=(32, 2, 111, 27), strides=(6160, 3080, 28, 1), offset=0, mask=((0, 32), (0, 2), (0, 110), (0, 27)), contiguous=False), View(shape=(32, 2, 37, 9, 2, 2), strides=(5994, 2997, 81, 3, 27, 1), offset=0, mask=None, contiguous=False))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(197119), arg=1, src=()),)),)),)),)),)) - opts = [Opt(op=OptOps.LOCAL, axis=0, arg=32)] - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_3(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(4096), arg=ShapeTracker(views=(View(shape=(32, 8, 16, 1), strides=(128, 16, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4096), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (3,)), src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(65536), arg=ShapeTracker(views=(View(shape=(32, 8, 16, 16), strides=(2048, 256, 16, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(65536), arg=1, src=()),)),)),)),)),)) - opts = [Opt(op=OptOps.GROUP, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=2), Opt(op=OptOps.UNROLL, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.LOCAL, axis=0, arg=32)] - # METAL: AssertionError: Error Domain=AGXMetalG13X Code=3 "Threadgroup memory size (65536) exceeds the maximum threadgroup memory allowed (32768)" UserInfo={NSLocalizedDescription=Threadgroup memory size (65536) exceeds the maximum threadgroup memory allowed (32768)} - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_5(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 4, 6)), src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - x5:=UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.CONST, dtypes.float, arg=0.1464405059814453, src=( - x8:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 1, 4, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.CONST, dtypes.float, arg=1.0, src=( - x8,)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 1, 4, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=1, src=()),)),)),)), - x5,)),)),)),)) - opts = [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=0)] - # EXEC_ERROR, it has no global_size - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_6(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(10), arg=ShapeTracker(views=(View(shape=(10, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(10), arg=0, src=()),)), - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.WHERE, dtypes.int, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(11, 19), strides=(0, 0), offset=0, mask=((0, 11), (9, 19)), contiguous=False), View(shape=(10, 10), strides=(1, 20), offset=0, mask=None, contiguous=False))), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=-1, src=( - x9:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 10), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=0, src=( - x9,)),)),)), - UOp(Ops.CONST, dtypes.int, arg=10, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=0, arg=2), Opt(op=OptOps.UPCAST, axis=0, arg=0)] - # COMPILE FAILED, KeyError: Ops.CONST - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_7(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(18939904), arg=ShapeTracker(views=(View(shape=(512, 32, 1, 34, 1, 34), strides=(36992, 1156, 0, 34, 0, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(18939904), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2, 4)), src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(37748736), arg=ShapeTracker(views=(View(shape=(512, 32, 6, 8, 4, 6, 8, 4), strides=(2048, 64, 6291456, 8, 0, 1048576, 1, 0), offset=0, mask=((0, 512), (0, 32), (0, 6), (0, 8), (0, 1), (0, 6), (0, 8), (0, 1)), contiguous=False), View(shape=(512, 32, 6, 35, 6, 35), strides=(1179648, 36864, 6144, 192, 32, 1), offset=0, mask=((0, 512), (0, 32), (0, 6), (0, 32), (0, 6), (0, 32)), contiguous=False), View(shape=(512, 32, 238, 238), strides=(1411200, 44100, 210, 1), offset=0, mask=((0, 512), (0, 32), (0, 210), (0, 210)), contiguous=False), View(shape=(512, 32, 7, 34, 7, 34), strides=(1812608, 56644, 8092, 238, 34, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(37748736), arg=1, src=()),)),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=0, arg=4)] - # test/test_linearizer_failures.py Fatal Python error: Segmentation fault - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_8(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(1, 1, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=0, src=()),)), - UOp(Ops.SQRT, dtypes.float, arg=None, src=( - UOp(Ops.RECIP, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2,)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - x9:=UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(4096), arg=ShapeTracker(views=(View(shape=(1, 1, 4096), strides=(0, 0, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4096), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(4096), arg=ShapeTracker(views=(View(shape=(1, 1, 4096), strides=(0, 0, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4096), arg=2, src=()),)),)),)), - x9,)),)), - UOp(Ops.CONST, dtypes.float, arg=0.000244140625, src=( - x17:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),)),)), - UOp(Ops.CONST, dtypes.float, arg=1e-06, src=( - x17,)),)),)),)),)),)) - opts = [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=4)] - # fatal error: bracket nesting level exceeded maximum of 256 - # note: use -fbracket-depth=N to increase maximum nesting level - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_9(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(13500), arg=ShapeTracker(views=(View(shape=(1, 1, 1, 3, 1, 1, 1, 1, 5, 15, 5, 3, 4), strides=(0, 0, 0, 4500, 0, 0, 0, 0, 900, 60, 12, 4, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(13500), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(6), arg=ShapeTracker(views=(View(shape=(1, 2, 1, 3, 1, 1, 1, 1, 5, 15, 5, 3, 4), strides=(0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(9000), arg=ShapeTracker(views=(View(shape=(1, 2, 1, 3, 1, 1, 1, 1, 5, 15, 5, 3, 4), strides=(0, 4500, 0, 0, 0, 0, 0, 0, 900, 60, 12, 4, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9000), arg=2, src=()),)),)),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=0, arg=32)] - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_10(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(1024), arg=ShapeTracker(views=(View(shape=(1, 1, 1024, 1), strides=(0, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1024), arg=0, src=()),)), - UOp(Ops.ADD, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.half, arg=(Ops.ADD, (3,)), src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(50257), arg=ShapeTracker(views=(View(shape=(1, 1, 1024, 50257), strides=(0, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(50257), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(51463168), arg=ShapeTracker(views=(View(shape=(1, 1, 1024, 50257), strides=(0, 0, 1, 1024), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(51463168), arg=2, src=()),)),)),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(1024), arg=ShapeTracker(views=(View(shape=(1, 1, 1024, 1), strides=(0, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1024), arg=3, src=()),)),)),)),)),)) - helper_test_lin(Kernel(ast), [], failed_platforms=[]) - - def test_failure_11(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(64), arg=ShapeTracker(views=(View(shape=(1, 64, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64), arg=0, src=()),)), - UOp(Ops.RECIP, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 3)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.MAX, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(1179648), arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True),)), src=( - x12:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1179648), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(64), arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - x15:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64), arg=2, src=()),)),)),)), - UOp(Ops.CONST, dtypes.float, arg=0.0, src=( - x17:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(64), arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - x20:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64), arg=3, src=()),)),)),)), - UOp(Ops.CONST, dtypes.float, arg=1.0, src=( - x17,)),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.CONST, dtypes.float, arg=1.0, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 3, 3, 2, 2), strides=(0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)), - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.MAX, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(1179648), arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( - x12,)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(64), arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( - x15,)),)),)), - x39:=UOp(Ops.CONST, dtypes.float, arg=0.0, src=( - x40:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(64), arg=ShapeTracker(views=(View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( - x20,)),)),)), - UOp(Ops.CONST, dtypes.float, arg=1.0, src=( - x40,)),)), - UOp(Ops.SQRT, dtypes.float, arg=None, src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.RECIP, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(64), arg=ShapeTracker(views=(View(shape=(64,), strides=(1,), offset=0, mask=None, contiguous=True), View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64), arg=4, src=()),)),)), - UOp(Ops.CONST, dtypes.float, arg=5.425347222222222e-05, src=( - x53:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64,), strides=(0,), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 3, 2, 2), strides=(2304, 36, 12, 2, 6, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=()),)),)), - UOp(Ops.CONST, dtypes.float, arg=1e-05, src=( - x53,)),)),)),)),)),)), - x39,)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(294912), arg=ShapeTracker(views=(View(shape=(512, 64, 3, 3, 2, 2), strides=(576, 9, 3, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(294912), arg=5, src=()),)),)),)),)),)), - UOp(Ops.RECIP, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(294912), arg=ShapeTracker(views=(View(shape=(512, 64, 3, 3, 2, 2), strides=(576, 9, 3, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(294912), arg=6, src=()),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(294912), arg=ShapeTracker(views=(View(shape=(512, 64, 3, 3, 2, 2), strides=(576, 9, 3, 1, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 3, 2, 3, 2), strides=(2304, 36, 12, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(512, 64, 6, 6), strides=(2304, 36, 6, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(294912), arg=7, src=()),)),)),)),)),)),)),)),)) - helper_test_lin(Kernel(ast), [], failed_platforms=[]) - - def test_failure_12(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(72), arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 4, 1, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(72), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 4, 6)), src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - x5:=UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(72), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(72), arg=1, src=()),)),)), - UOp(Ops.CONST, dtypes.float, arg=1.0, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=2, src=()),)),)),)), - x5,)),)),)),)) - opts = [Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.GROUP, axis=0, arg=4)] - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - # both kernels are correct from a code standpoint, but generate different results due to precision errors (switching to float results in output matches) - def test_failure_13(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(768), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(384, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(768), arg=0, src=()),)), - UOp(Ops.ADD, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.half, arg=(Ops.ADD, (3,)), src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(103728), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 51864), strides=(51864, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(103728), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(19915776), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 51864), strides=(0, 0, 1, 384), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(19915776), arg=2, src=()),)),)),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(19968), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(0, 0, 1, 0), offset=19584, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(19968), arg=3, src=()),)),)),)),)),)) - opts = [Opt(op=OptOps.GROUP, axis=0, arg=4)] - helper_test_lin(Kernel(ast), opts, failed_platforms=["METAL", "GPU", "CUDA"]) - - def test_failure_14(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(72), arg=ShapeTracker(views=(View(shape=(1, 1, 1, 1, 1, 4, 1, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(72), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 4, 6)), src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - x5:=UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(72), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 18, 0, 3, 0, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(72), arg=1, src=()),)),)), - UOp(Ops.CONST, dtypes.float, arg=1.0, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 3, 4, 2, 6, 1, 3), strides=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=2, src=()),)),)),)), - x5,)),)),)),)) - opts = [Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4)] - # COMPILE_ERROR on METAL in fuzz_linearizer: unused variables and undeclared variables - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_15(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(21952), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 196, 14, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(21952), arg=0, src=()),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5,)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(94080), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 480, 1, 1), strides=(0, 0, 0, 14, 1, 196, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(94080), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(53760), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 480, 1, 1), strides=(0, 0, 480, 0, 0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(53760), arg=2, src=()),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(112), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(112), arg=3, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(112), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(112), arg=4, src=()),)),)),)), - UOp(Ops.SQRT, dtypes.float, arg=None, src=( - UOp(Ops.RECIP, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(112), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(112), arg=5, src=()),)),)), - UOp(Ops.CONST, dtypes.float, arg=1e-05, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(112), arg=ShapeTracker(views=(View(shape=(1, 1, 112, 14, 14, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(112), arg=6, src=()),)),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=2), Opt(op=OptOps.PADTO, axis=1, arg=32), Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=3, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.LOCAL, axis=1, arg=16)] - # COMPILE_ERROR on METAL in fuzz_linearizer ast 115: Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_16(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(13), arg=ShapeTracker(views=(View(shape=(1, 13, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(13), arg=0, src=()),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2,)), src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(13312), arg=ShapeTracker(views=(View(shape=(1, 13, 1024), strides=(0, 1024, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(13312), arg=1, src=()),)),)),)), - UOp(Ops.CONST, dtypes.float, arg=0.0009765625, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 13, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) - opts = [Opt(op=OptOps.GROUP, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=0), Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=1, arg=4)] - # COMPILE_ERROR on METAL/GPU (probably HIP/CUDA too) in fuzz_linearizer ast 154: bracket nesting level exceeded maximum of 256 - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_17(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(62720), arg=ShapeTracker(views=(View(shape=(2, 1, 40, 1, 28, 28, 1, 1), strides=(31360, 0, 784, 0, 28, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(62720), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (3,)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(9600), arg=ShapeTracker(views=(View(shape=(2, 1, 40, 240, 28, 28, 1, 1), strides=(0, 0, 1, 40, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9600), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(376320), arg=ShapeTracker(views=(View(shape=(2, 1, 40, 240, 28, 28, 1, 1), strides=(188160, 0, 0, 784, 28, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(376320), arg=2, src=()),)),)),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=1, arg=32), Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.GROUPTOP, axis=0, arg=16), Opt(op=OptOps.PADTO, axis=1, arg=32), Opt(op=OptOps.LOCAL, axis=1, arg=4)] - # COMPILE_ERROR on METAL in fuzz_linearizer ast 178: Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_18(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(768), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(384, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(768), arg=0, src=()),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(768), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(384, 0, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(768), arg=1, src=()),)),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (3,)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(3072), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1536), strides=(1536, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(3072), arg=2, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(589824), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1536), strides=(0, 0, 1536, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(589824), arg=3, src=()),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(384), arg=ShapeTracker(views=(View(shape=(2, 1, 384, 1), strides=(0, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(384), arg=4, src=()),)),)),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUPTOP, axis=0, arg=256), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=3)] - # COMPILE_ERROR on METAL in fuzz_linearizer ast 239: Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_19(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(4536), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 1, 9, 7, 3, 3), strides=(2268, 0, 567, 0, 63, 9, 3, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4536), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (3,)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(144), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 4, 9, 7, 3, 3), strides=(0, 0, 36, 9, 0, 0, -3, -1), offset=8, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(144), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(504), arg=ShapeTracker(views=(View(shape=(2, 1, 4, 4, 9, 7, 3, 3), strides=(252, 0, 0, 63, 7, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(504), arg=2, src=()),)),)),)),)),)),)) - opts = [Opt(op=OptOps.LOCAL, axis=2, arg=3), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=7), Opt(op=OptOps.UPCAST, axis=2, arg=3), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.LOCAL, axis=0, arg=3)] - # COMPILE_ERROR on METAL in fuzz_linearizer ast 379: Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_20(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(4, 4), strides=(4, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=0, src=()),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(4), arg=ShapeTracker(views=(View(shape=(4, 4), strides=(0, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4), arg=1, src=()),)),)), - UOp(Ops.CONST, dtypes.float, arg=1.0, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 4), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=0)] - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_21(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(2925), arg=ShapeTracker(views=(View(shape=(45, 65), strides=(65, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(2925), arg=0, src=()),)), - UOp(Ops.CONST, dtypes.float, arg=1.0, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(45, 65), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)) - opts = [Opt(op=OptOps.PADTO, axis=0, arg=32)] - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - #@unittest.skipIf(Device.DEFAULT in ("LLVM", "METAL", "CPU"), "flaky") - @unittest.skip("flaky everywhere") - def test_failure_22(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=0, src=()),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - x4:=UOp(Ops.CONST, dtypes.float, arg=0.000244140625, src=( - x5:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 3)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(393216), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(393216), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=2, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=3, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=4, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=5, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=6, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(32, 96, 8, 16), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=7, src=()),)),)),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=8, src=()),)),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=9, src=()),)),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=10, src=()),)),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=11, src=()),)),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=12, src=()),)),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=13, src=()),)),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=14, src=()),)),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(276461), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 8640, 180, 18, 1), offset=19, mask=((1, 2), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(276461), arg=15, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(544301), arg=ShapeTracker(views=(View(shape=(2, 32, 48, 8, 16), strides=(0, 17280, 180, 18, 1), offset=19, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(2, 32, 48, 8, 16), strides=(0, 12288, 128, 16, 1), offset=0, mask=((0, 1), (0, 32), (0, 48), (0, 8), (0, 16)), contiguous=False), View(shape=(1536, 2, 128), strides=(128, 196608, 1), offset=0, mask=None, contiguous=False), View(shape=(32, 96, 8, 16), strides=(12288, 128, 16, 1), offset=0, mask=None, contiguous=True))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(544301), arg=16, src=()),)),)),)),)),)),)),)),)),)),)),)),)), - UOp(Ops.RECIP, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=17, src=()),)),)), - UOp(Ops.CONST, dtypes.float, arg=2.0, src=( - x5,)),)),)),)), - x79:=UOp(Ops.RECIP, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(96), arg=ShapeTracker(views=(View(shape=(1, 96, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(96), arg=18, src=()),)),)), - x4,)), - UOp(Ops.CONST, dtypes.float, arg=1e-05, src=( - x5,)),)),)),)), - x79,)),)),)),)) - opts = [] - helper_test_lin(Kernel(ast), opts, failed_platforms=["METAL", "CUDA"]) - - def test_failure_23(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(9600), arg=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(40, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9600), arg=0, src=()),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(9600), arg=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(1, 240, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9600), arg=1, src=()),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=16), Opt(op=OptOps.LOCAL, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=3, arg=2)] - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_24(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(256), arg=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(32, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(256), arg=0, src=()),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(256), arg=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(1, 8, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(256), arg=1, src=()),)),)),)),)) - opts = [Opt(op=OptOps.LOCAL, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=2, arg=2), Opt(op=OptOps.LOCAL, axis=1, arg=8), Opt(op=OptOps.UPCAST, axis=2, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=2)] - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - # this is the cause of the GPT2 BEAM instability. bisects to PR#3530 O(n) arange attempt - def test_failure_25(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(1024), arg=ShapeTracker(views=(View(shape=(1024, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1024), arg=0, src=()),)), - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.WHERE, dtypes.int, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1025, 2047), strides=(0, 0), offset=0, mask=((0, 1025), (1023, 2047)), contiguous=False), View(shape=(1024, 1024), strides=(1, 2048), offset=0, mask=None, contiguous=False))), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=1, src=( - x9:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1024), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=0, src=( - x9,)),)),)), - UOp(Ops.CONST, dtypes.int, arg=-1, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) - opts = [Opt(op=OptOps.GROUP, axis=0, arg=16), Opt(op=OptOps.UNROLL, axis=0, arg=4)] - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - # COMPARE_ERROR from GPT2 kernel - stems from uops.py self.simplify_phi_loops - def test_failure_26(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(128), arg=ShapeTracker(views=(View(shape=(128, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(128), arg=0, src=()),)), - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.WHERE, dtypes.int, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(129, 255), strides=(0, 0), offset=0, mask=((0, 129), (127, 255)), contiguous=False), View(shape=(128, 128), strides=(1, 256), offset=0, mask=None, contiguous=False))), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=1, src=( - x9:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 128), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=0, src=( - x9,)),)),)), - UOp(Ops.CONST, dtypes.int, arg=-1, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) - all_failing_opts = [ - [Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.GROUPTOP, axis=0, arg=32), Opt(op=OptOps.UNROLL, axis=0, arg=0)], - [Opt(op=OptOps.GROUPTOP, axis=0, arg=32), Opt(op=OptOps.UNROLL, axis=0, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=4)], - [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=16), Opt(op=OptOps.UPCAST, axis=0, arg=0)], - [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0)], - [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=16), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.UNROLL, axis=0, arg=4)], - [Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0)], - [Opt(op=OptOps.LOCAL, axis=0, arg=16), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=4)], - [Opt(op=OptOps.LOCAL, axis=0, arg=16), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UNROLL, axis=1, arg=4)], - [Opt(op=OptOps.LOCAL, axis=0, arg=16), Opt(op=OptOps.GROUP, axis=0, arg=16), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.UNROLL, axis=1, arg=4)], - [Opt(op=OptOps.LOCAL, axis=0, arg=16), Opt(op=OptOps.GROUP, axis=0, arg=16), Opt(op=OptOps.UNROLL, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0)], - [Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UNROLL, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=16), Opt(op=OptOps.UPCAST, axis=0, arg=0)], - ] - for opts in all_failing_opts: - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - # COMPARE_ERROR from GPT2 kernel - just the first element off - # testing ast 41 - # 0 ━┳ STORE MemBuffer(idx=0, dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(1, 16, 13, 1), strides=(0, 13, 1, 0), offset=0, mask=None, contiguous=True),))) - # 1 ┗━┳ MAX (3,) - # 2 ┗━━ LOAD MemBuffer(idx=1, dtype=dtypes.half, st=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 169, 13, 1), offset=0, mask=None, contiguous=True),))) - # 208 13 - # ... - # Mismatched elements: 1 / 1232 (0.0812%) - # Max absolute difference: 0.8687 - # Max relative difference: 1. - # x: array([0. , 0.996, 0.829, ..., 0. , 0. , 0. ], dtype=float16) - # y: array([0.8687, 0.996 , 0.829 , ..., 0. , 0. , 0. ], dtype=float16) - # COMPARE FAILED!! - def test_failure_27(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(208), arg=ShapeTracker(views=(View(shape=(1, 16, 13, 1), strides=(0, 13, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(208), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.half, arg=(Ops.MAX, (3,)), src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(2704), arg=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 169, 13, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(2704), arg=1, src=()),)),)),)),)),)) - all_failing_opts = [ - [Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=7), Opt(op=OptOps.UPCAST, axis=0, arg=0)], - ] - for opts in all_failing_opts: - helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - - def test_failure_28(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.bfloat16.ptr(1), arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.bfloat16.ptr(1), arg=0, src=()),)), - UOp(Ops.WHERE, dtypes.bfloat16, arg=None, src=( - UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( - x5:=UOp(Ops.CAST, dtypes.bfloat16, arg=None, src=( - UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(1), arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1), arg=1, src=()),)),)),)), - x9:=UOp(Ops.CONST, dtypes.bfloat16, arg=230.0, src=( - x10:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=()),)),)), - UOp(Ops.ADD, dtypes.bfloat16, arg=None, src=( - UOp(Ops.MUL, dtypes.bfloat16, arg=None, src=( - UOp(Ops.MUL, dtypes.bfloat16, arg=None, src=( - x5, - UOp(Ops.CONST, dtypes.bfloat16, arg=0.004347826086956522, src=( - x10,)),)), - UOp(Ops.CONST, dtypes.bfloat16, arg=0.199374800625, src=( - x10,)),)), - UOp(Ops.CONST, dtypes.bfloat16, arg=1.99375e-07, src=( - x10,)),)), - UOp(Ops.ADD, dtypes.bfloat16, arg=None, src=( - UOp(Ops.MUL, dtypes.bfloat16, arg=None, src=( - UOp(Ops.MUL, dtypes.bfloat16, arg=None, src=( - UOp(Ops.ADD, dtypes.bfloat16, arg=None, src=( - x5, - x9,)), - UOp(Ops.CONST, dtypes.bfloat16, arg=0.0012987012987012987, src=( - x10,)),)), - UOp(Ops.CONST, dtypes.bfloat16, arg=-0.19439062499999998, src=( - x10,)),)), - UOp(Ops.CONST, dtypes.bfloat16, arg=0.199375, src=( - x10,)),)),)),)),)) - helper_test_lin(Kernel(ast), opts=[], failed_platforms=[]) - - def test_failure_29(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(25690112), arg=ShapeTracker(views=(View(shape=(128, 1, 64, 56, 56, 1, 1, 1), strides=(200704, 0, 3136, 56, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(25690112), arg=0, src=()),)), - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(25690112), arg=ShapeTracker(views=(View(shape=(1, 128, 1, 64, 4, 58, 4, 58), strides=(0, 200704, 0, 3136, 0, 56, 0, 1), offset=-57, mask=((0, 1), (0, 128), (0, 1), (0, 64), (0, 4), (1, 57), (0, 4), (1, 57)), contiguous=False), View(shape=(128, 1, 64, 56, 56, 64, 3, 3), strides=(3444736, 0, 0, 232, 1, 53824, 13688, 59), offset=0, mask=None, contiguous=False))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(25690112), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(36864), arg=ShapeTracker(views=(View(shape=(128, 1, 64, 56, 56, 64, 3, 3), strides=(0, 0, 576, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(36864), arg=2, src=()),)),)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 1, 1)), Opt(op=OptOps.PADTO, axis=2, arg=32)] - helper_test_lin(Kernel(ast), opts, failed_platforms=[], atol=1.0) - - def test_failure_30(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(2952192), arg=ShapeTracker(views=(View(shape=(256, 1, 12, 31, 31, 1, 1, 1), strides=(11532, 0, 961, 31, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(2952192), arg=0, src=()),)), - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(786432), arg=ShapeTracker(views=(View(shape=(256, 1, 12, 31, 31, 3, 2, 2), strides=(3072, 0, 0, 32, 1, 1024, 32, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(786432), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(144), arg=ShapeTracker(views=(View(shape=(256, 1, 12, 31, 31, 3, 2, 2), strides=(0, 0, 12, 0, 0, 4, 2, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(144), arg=2, src=()),)),)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.PADTO, axis=3, arg=32), Opt(op=OptOps.LOCAL, axis=3, arg=32), Opt(op=OptOps.UPCAST, axis=3, arg=4), Opt(op=OptOps.UPCAST, axis=3, arg=0)] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - # from METAL=1 fuzz_linearizer command in test.yml - def test_failure_31(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(208), arg=ShapeTracker(views=(View(shape=(1, 16, 13, 1), strides=(0, 13, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(208), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (3,)), src=( - UOp(Ops.EXP2, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(2704), arg=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 169, 13, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(2704), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(208), arg=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 13, 1, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(208), arg=2, src=()),)),)),)), - UOp(Ops.CONST, dtypes.float, arg=1.4426950408889634, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)) - opts = [Opt(op=OptOps.UNROLL, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=1, arg=32)] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - @unittest.skipIf(CI, "for real AMD GPU") - def test_failure_32(self): - # kernel from beaming resnet - # Memory access fault on tinybox red - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(12845056), arg=ShapeTracker(views=(View(shape=(256, 1, 256, 14, 14, 1, 1, 1), strides=(50176, 0, 196, 14, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(12845056), arg=0, src=()),)), - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(12845056), arg=ShapeTracker(views=(View(shape=(1, 256, 1, 256, 4, 16, 4, 16), strides=(0, 50176, 0, 196, 0, 14, 0, 1), offset=-15, mask=((0, 1), (0, 256), (0, 1), (0, 256), (0, 4), (1, 15), (0, 4), (1, 15)), contiguous=False), View(shape=(256, 1, 256, 14, 14, 256, 3, 3), strides=(1048576, 0, 0, 64, 1, 4096, 1088, 17), offset=0, mask=None, contiguous=False))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(12845056), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(589824), arg=ShapeTracker(views=(View(shape=(256, 1, 256, 14, 14, 256, 3, 3), strides=(0, 0, 2304, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(589824), arg=2, src=()),)),)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.TC, axis=2, arg=(-1, 2, 1)), Opt(op=OptOps.UPCAST, axis=2, arg=7), Opt(op=OptOps.UNROLL, axis=1, arg=0), Opt(op=OptOps.LOCAL, axis=1, arg=16)] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[], atol=0.1, rtol=0.05) - - def test_failure_33(self): - # Ops.UNMUL left after linearize - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0,)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - x5:=UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(26040), arg=ShapeTracker(views=(View(shape=(32640,), strides=(1,), offset=0, mask=((0, 26040),), contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(26040), arg=1, src=()),)),)), - UOp(Ops.WHERE, dtypes.float, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - x5, - x10:=UOp(Ops.CONST, dtypes.float, arg=0.0, src=( - x11:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(0,), offset=0, mask=None, contiguous=False),)), src=()),)),)), - UOp(Ops.WHERE, dtypes.float, arg=None, src=( - UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.WHERE, dtypes.float, arg=None, src=( - x18:=UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(0,), offset=0, mask=((0, 26040),), contiguous=False),)), src=()),)), - UOp(Ops.CONST, dtypes.float, arg=0.06788442333021306, src=( - x11,)), - x10,)), - x5,)), - UOp(Ops.WHERE, dtypes.float, arg=None, src=( - x18, - UOp(Ops.CONST, dtypes.float, arg=-0.03394221166510653, src=( - x11,)), - x10,)),)), - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(6600), arg=ShapeTracker(views=(View(shape=(32640,), strides=(1,), offset=-26040, mask=((26040, 32640),), contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6600), arg=2, src=()),)),)), - UOp(Ops.WHERE, dtypes.float, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32640,), strides=(0,), offset=0, mask=((26040, 32640),), contiguous=False),)), src=()),)), - UOp(Ops.CONST, dtypes.float, arg=-0.18257418583505536, src=( - x11,)), - x10,)),)),)), - x10,)), - UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( - x11,)), - UOp(Ops.CONST, dtypes.float, arg=1.0, src=( - x11,)),)), - x10,)),)),)),)),)) - opts = [Opt(op=OptOps.GROUPTOP, axis=0, arg=16)] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - # from fuzzing on metal - def test_failure_34(self, unroll=False): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(720), arg=ShapeTracker(views=(View(shape=(4, 1, 6, 10, 3, 1, 1, 1), strides=(180, 0, 30, 3, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(720), arg=0, src=()),)), - UOp(Ops.MAX, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (6, 7)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(308), arg=ShapeTracker(views=(View(shape=(4, 1, 6, 10, 3, 1, 2, 5), strides=(77, 0, 0, 7, 1, 0, 7, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(308), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(60), arg=ShapeTracker(views=(View(shape=(4, 1, 6, 10, 3, 1, 2, 5), strides=(0, 0, 10, 0, 0, 0, 5, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(60), arg=2, src=()),)),)),)),)), - UOp(Ops.CONST, dtypes.float, arg=0.0, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 1, 6, 10, 3, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) - opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1)), Opt(op=OptOps.UNROLL, axis=0, arg=0)] if unroll else [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - def test_failure_35(self): self.test_failure_34(True) - - # from world fuzz_linearizer: PYTHONPATH=. METAL=1 FUZZ_ALL_ACTIONS=1 DEPTH=1 FUZZ_N=100 FUZZ_NTH=84 python3 ./test/external/fuzz_linearizer.py - def test_failure_36(self): - # Ops.UNMUL left after linearize - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.uchar.ptr(5), arg=ShapeTracker(views=(View(shape=(5, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(5), arg=0, src=()),)), - UOp(Ops.CAST, dtypes.uchar, arg=None, src=( - UOp(Ops.ADD, dtypes.uint, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.uint, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.CAST, dtypes.uint, arg=None, src=( - UOp(Ops.WHERE, dtypes.uchar, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(6, 9), strides=(0, 0), offset=0, mask=((0, 6), (4, 9)), contiguous=False), View(shape=(5, 5), strides=(1, 10), offset=0, mask=None, contiguous=False))), src=()),)), - UOp(Ops.CONST, dtypes.uchar, arg=1, src=( - x11:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(5, 5), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.CONST, dtypes.uchar, arg=0, src=( - x11,)),)),)),)), - UOp(Ops.CONST, dtypes.uint, arg=-1, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(5, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=0, arg=0)] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - # BEGIN METAL=1 ./examples/beautiful_mnist.py failures - # log : PYTHONPATH=. LOGKERNS=/tmp/beautiful_mnist.kernels.txt METAL=1 python3 ./examples/beautiful_mnist.py - def test_failure_37(self): - # beautiful mnist kernel number 28: 6 possible TC axis_choices (3 for axis_buf1 and 2 reduce) and all fail - # fuzz: PYTHONPATH=. METAL=1 FUZZ_ALL_ACTIONS=1 DEPTH=1 FUZZ_NTH=28 DEBUG=2 python3 ./test/external/fuzz_linearizer.py --logfile /tmp/beautiful_mnist.kernels.txt - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(9437184), arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 1, 1), strides=(18432, 0, 576, 24, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9437184), arg=0, src=()),)), - UOp(Ops.MAX, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (6, 7)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.uchar, arg=None, src=( - UOp(Ops.VIEW, dtypes.uchar.ptr(401408), arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 5, 5), strides=(784, 0, 0, 28, 1, 0, 28, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(401408), arg=1, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(800), arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 5, 5), strides=(0, 0, 25, 0, 0, 0, 5, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(800), arg=2, src=()),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(32), arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(32), arg=3, src=()),)),)),)), - UOp(Ops.CONST, dtypes.float, arg=0.0, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 1, 32, 24, 24, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) - for axis in [0,1,2,3,4,5]: - opts = [Opt(op=OptOps.TC, axis=axis, arg=(-1, 2, 1))] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - def test_failure_38(self): - # beautiful mnist kernel number 87: 6 possible TC axis_choices (2 for axis_buf1 and 3 reduce) and first/second reduce axis fail for both axis_buf1 choices - # fuzz: PYTHONPATH=. METAL=1 FUZZ_ALL_ACTIONS=1 DEPTH=1 FUZZ_NTH=87 DEBUG=2 python3 ./test/external/fuzz_linearizer.py --logfile /tmp/beautiful_mnist.kernels.txt - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(204800), arg=ShapeTracker(views=(View(shape=(1, 1, 32, 1, 1, 1, 5, 5, 256), strides=(0, 0, 6400, 0, 0, 0, 1280, 256, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(204800), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 3, 4)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.uchar, arg=None, src=( - UOp(Ops.VIEW, dtypes.uchar.ptr(401408), arg=ShapeTracker(views=(View(shape=(2, 1, 32, 24, 24, 1, 5, 5, 256), strides=(784, 0, 0, 28, 1, 0, 28, 1, 1568), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(401408), arg=1, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(9437184), arg=ShapeTracker(views=(View(shape=(2, 1, 32, 24, 24, 1, 5, 5, 256), strides=(18432, 0, 576, 24, 1, 0, 0, 0, 36864), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9437184), arg=2, src=()),)),)),)),)),)),)) - for axis in [0,1,3,4]: - opts = [Opt(op=OptOps.TC, axis=axis, arg=(-1, 2, 1))] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - @unittest.skip("very slow, similar to test_failure_37") - def test_failure_39(self): - # beautiful mnist kernel number 127: 6 possible TC axis_choices (3 for axis_buf1 and 2 reduce) and all fail - # fuzz: PYTHONPATH=. METAL=1 FUZZ_ALL_ACTIONS=1 DEPTH=1 FUZZ_NTH=127 DEBUG=2 python3 ./test/external/fuzz_linearizer.py --logfile /tmp/beautiful_mnist.kernels.txt - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(184320000), arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 1, 1), strides=(18432, 0, 576, 24, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(184320000), arg=0, src=()),)), - UOp(Ops.MAX, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (6, 7)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.uchar, arg=None, src=( - UOp(Ops.VIEW, dtypes.uchar.ptr(7840000), arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 5, 5), strides=(784, 0, 0, 28, 1, 0, 28, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(7840000), arg=1, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(800), arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 5, 5), strides=(0, 0, 25, 0, 0, 0, 5, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(800), arg=2, src=()),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(32), arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(32), arg=3, src=()),)),)),)), - UOp(Ops.CONST, dtypes.float, arg=0.0, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10000, 1, 32, 24, 24, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) - for axis in [0,1,2,3,4,5]: - opts = [Opt(op=OptOps.TC, axis=axis, arg=(-1, 2, 1))] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - def test_failure_40(self): - # beautiful mnist kernel number 3: - # fuzz: PYTHONPATH=. METAL=1 FUZZ_ALL_ACTIONS=1 DEPTH=2 DEBUG=2 FUZZ_NTH=3 python3 ./test/external/fuzz_linearizer.py --logfile /tmp/beautiful_mnist.kernels.txt - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(60000), arg=ShapeTracker(views=(View(shape=(60000, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(60000), arg=0, src=()),)), - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.WHERE, dtypes.int, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60001, 119999), strides=(0, 0), offset=0, mask=((0, 60001), (59999, 119999)), contiguous=False), View(shape=(60000, 60000), strides=(1, 120000), offset=0, mask=None, contiguous=False))), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=1, src=( - x9:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60000, 60000), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=0, src=( - x9,)),)),)), - UOp(Ops.CONST, dtypes.int, arg=-1, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60000, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) - for amt in [16,32]: - opts = [Opt(op=OptOps.GROUPTOP, axis=0, arg=amt), Opt(op=OptOps.UNROLL, axis=0, arg=0)] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - # END METAL=1 ./examples/beautiful_mnist.py failures - - @unittest.skipIf(CI, "for real AMD GPU") - def test_failure_41(self): - # One more resnet crash with a page fault on AMD. Checked on rocm6.1.3, -O1 works, -O2 fails - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(25690112), arg=ShapeTracker(views=(View(shape=(256, 1, 128, 28, 28, 1, 1, 1), strides=(100352, 0, 784, 28, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(25690112), arg=0, src=()),)), - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(102760448), arg=ShapeTracker(views=(View(shape=(1, 256, 1, 128, 4, 58, 4, 58), strides=(0, 401408, 0, 3136, 0, 56, 0, 1), offset=-57, mask=((0, 1), (0, 256), (0, 1), (0, 128), (0, 4), (1, 57), (0, 4), (1, 57)), contiguous=False), View(shape=(256, 1, 128, 28, 28, 128, 3, 3), strides=(6889472, 0, 0, 464, 2, 53824, 13688, 59), offset=0, mask=None, contiguous=False))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(102760448), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(147456), arg=ShapeTracker(views=(View(shape=(256, 1, 128, 28, 28, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(147456), arg=2, src=()),)),)),)),)),)),)),)),)) - opts=[Opt(op=OptOps.TC, axis=5, arg=(-1, 2, 1)), Opt(op=OptOps.UNROLL, axis=0, arg=0)] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=["AMD", "HIP"], atol=0.02) - - # llama3 8B failure with BEAM=2 https://github.com/tinygrad/tinygrad/actions/runs/10150118124/job/28066519425#step:14:1, these don't compile - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test needs local") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test needs shared") - def test_failure_42(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(25, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=1, src=()),)),)),)),)),)) - opts = [Opt(op=OptOps.GROUP, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.UPCAST, axis=0, arg=2), Opt(op=OptOps.PADTO, axis=0, arg=32)] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test needs local") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test needs shared") - def test_failure_43(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(25, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=1, src=()),)),)),)),)),)) - opts = [Opt(op=OptOps.GROUP, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=0)] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test needs local") - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test needs shared") - def test_failure_44(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(25, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(25), arg=ShapeTracker(views=(View(shape=(26, 49), strides=(0, -1), offset=48, mask=((0, 26), (24, 49)), contiguous=False), View(shape=(25, 25), strides=(1, 50), offset=0, mask=None, contiguous=False))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(25), arg=1, src=()),)),)),)),)),)) - opts = [Opt(op=OptOps.GROUP, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=0, arg=32), Opt(op=OptOps.LOCAL, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4)] - k = helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - assert k is not None - ifs = [u for u in k.uops if u.op is Ops.IF] - self.assertEqual(len(ifs), 3) - #for st in k.uops.sink.src: self.assertEqual(len(st.src), 4) - self.assertLessEqual(len(ifs[0].src[0].toposort()), 17) - - def test_failure_45(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(6), arg=ShapeTracker(views=(View(shape=(2, 3, 1, 1, 1), strides=(3, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2, 3)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(6), arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(0, 0, 3, 1, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6), arg=1, src=()),)),)), - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.bool, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(1), arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1), arg=2, src=()),)),)), - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (4,)), src=( - UOp(Ops.WHERE, dtypes.int, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(3, 3), strides=(0, 0), offset=0, mask=((0, 3), (1, 3)), contiguous=False), View(shape=(2, 3, 2, 3, 3), strides=(0, 0, 1, 0, 4), offset=0, mask=((0, 2), (0, 3), (0, 2), (0, 3), (0, 2)), contiguous=False))), src=()),)), - x20:=UOp(Ops.CONST, dtypes.int, arg=1, src=( - x21:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 3), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - x22:=UOp(Ops.CONST, dtypes.int, arg=0, src=( - x21,)),)),)), - x23:=UOp(Ops.CONST, dtypes.int, arg=-1, src=( - x24:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), - x25:=UOp(Ops.CONST, dtypes.bool, arg=True, src=( - x24,)),)), - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(6), arg=ShapeTracker(views=(View(shape=(2, 3, 2, 3, 1), strides=(3, 1, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(6), arg=3, src=()),)),)), - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (4,)), src=( - UOp(Ops.WHERE, dtypes.int, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(4, 5), strides=(0, 0), offset=0, mask=((0, 4), (2, 5)), contiguous=False), View(shape=(2, 3, 2, 3, 3), strides=(0, 0, 0, 1, 6), offset=0, mask=None, contiguous=False))), src=()),)), - x20, - x22,)),)), - x23,)),)), - x25,)),)),)),)),)),)),)) - # ValueError: size mismatched, can't reshape self.shape=(6, 2, 3, 3) -> new_shape=(6, 2, 3, 1, 2) - opts = [Opt(op=OptOps.UNROLL, axis=2, arg=0)] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - def test_failure_46(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(512), arg=ShapeTracker(views=(View(shape=(512, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(512), arg=0, src=()),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.bool, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(10), arg=ShapeTracker(views=(View(shape=(512, 10), strides=(0, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(10), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(512), arg=ShapeTracker(views=(View(shape=(512, 10), strides=(1, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(512), arg=2, src=()),)),)),)), - UOp(Ops.CONST, dtypes.bool, arg=True, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(512, 10), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), - UOp(Ops.LOAD, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.bool.ptr(512), arg=ShapeTracker(views=(View(shape=(512, 10), strides=(1, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.bool.ptr(512), arg=3, src=()),)),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(1), arg=ShapeTracker(views=(View(shape=(512, 10), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(1), arg=4, src=()),)),)),)),)), - UOp(Ops.RECIP, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(512), arg=ShapeTracker(views=(View(shape=(512, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(512), arg=5, src=()),)),)),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=0, arg=2)] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - def test_failure_47(self): - # upcast an arange, failed with UOP_IS_SYMBOLIC=1 (fixed!) - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(60000), arg=ShapeTracker(views=(View(shape=(60000, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(60000), arg=0, src=()),)), - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.WHERE, dtypes.int, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60001, 119999), strides=(0, 0), offset=0, mask=((0, 60001), (59999, 119999)), contiguous=False), View(shape=(60000, 60000), strides=(1, 120000), offset=0, mask=None, contiguous=False))), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=1, src=( - x9:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60000, 60000), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=0, src=( - x9,)),)),)), - UOp(Ops.CONST, dtypes.int, arg=-1, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(60000, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=0, arg=3)] - helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[]) - - @unittest.skipUnless(not CI and Device.DEFAULT in ("NV", "CUDA"), "for real NV") - def test_failure_48(self): - # with UOP_IS_SYMBOLIC=1, generates the wrong IDIV (fixed!) - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(4194304), arg=ShapeTracker(views=(View(shape=(1, 1, 64, 1, 1, 256, 1, 1, 256), strides=(0, 0, 65536, 0, 0, 256, 0, 0, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4194304), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (3, 4)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(205520896), arg=ShapeTracker(views=(View(shape=(1, 1, 64, 56, 56, 256, 1, 1, 256), strides=(0, 0, 0, 56, 1, 3136, 0, 0, 802816), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(205520896), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(51380224), arg=ShapeTracker(views=(View(shape=(1, 1, 64, 56, 56, 256, 1, 1, 256), strides=(0, 0, 3136, 56, 1, 0, 0, 0, 200704), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(51380224), arg=2, src=()),)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 0, 1)), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=2)] - helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) - - def test_failure_49(self): - # with UOP_IS_SYMBOLIC=1, on METAL it breaks store fusion and has A+B and B+A being two different UOp - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(60), arg=ShapeTracker(views=(View(shape=(10, 6, 1), strides=(6, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(60), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2,)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(100), arg=ShapeTracker(views=(View(shape=(10, 6, 10), strides=(10, 0, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(100), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(60), arg=ShapeTracker(views=(View(shape=(10, 6, 10), strides=(0, 1, 6), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(60), arg=2, src=()),)),)),)),)),)),)) - opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1)), Opt(op=OptOps.UPCAST, axis=0, arg=2)] - helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) - - def test_failure_50(self): - # from BEAM_COMPARE=2 running tinyphysics.onnx model - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.bool.ptr(400), arg=ShapeTracker(views=(View(shape=(1, 1, 20, 1, 20), strides=(0, 0, 20, 0, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.bool.ptr(400), arg=0, src=()),)), - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.bool, arg=(Ops.ADD, (3,)), src=( - UOp(Ops.MUL, dtypes.bool, arg=None, src=( - UOp(Ops.LOAD, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.bool.ptr(400), arg=ShapeTracker(views=(View(shape=(1, 1, 20, 20, 20), strides=(0, 0, 0, 20, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.bool.ptr(400), arg=1, src=()),)),)), - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(20), arg=ShapeTracker(views=(View(shape=(1, 1, 20, 20, 20), strides=(0, 0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(20), arg=2, src=()),)),)), - UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(20), arg=ShapeTracker(views=(View(shape=(1, 1, 20, 20, 20), strides=(0, 0, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(20), arg=3, src=()),)),)),)), - UOp(Ops.CONST, dtypes.bool, arg=True, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 20, 20, 20), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)), - UOp(Ops.CONST, dtypes.bool, arg=True, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1, 20, 1, 20), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=1, arg=2)] - helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) - - def test_failure_51(self): - # regression test for #7019, training bert on tinybox red - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(12288), arg=ShapeTracker(views=(View(shape=(12, 1024, 1), strides=(1024, 1, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(12288), arg=0, src=()),)), - UOp(Ops.RECIP, dtypes.half, arg=None, src=( - UOp(Ops.ADD, dtypes.half, arg=None, src=( - UOp(Ops.CONST, dtypes.half, arg=1.0, src=( - x6:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(12, 1024, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.EXP2, dtypes.half, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.CONST, dtypes.half, arg=2.0, src=( - x6,)), - UOp(Ops.ADD, dtypes.half, arg=None, src=( - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (2,)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(5768192), arg=ShapeTracker(views=(View(shape=(12, 1024, 1024), strides=(524288, 0, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(5768192), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(1048576), arg=ShapeTracker(views=(View(shape=(12, 1024, 1024), strides=(0, 1024, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1048576), arg=2, src=()),)),)),)),)),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(1024), arg=ShapeTracker(views=(View(shape=(12, 1024, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1024), arg=3, src=()),)),)),)),)), - UOp(Ops.CONST, dtypes.half, arg=-1.4426950408889634, src=( - x6,)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))] - helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) - - @unittest.skip("allocating over 200MB buffer") - @unittest.skipIf(CI and Device.DEFAULT in {"METAL"}, "hangs metal gpu CI") - def test_failure_52(self): - # resnet beam. - # NV also fails with a pf. - # CUDA Error 700, an illegal memory access was encountered - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(205520896), arg=ShapeTracker(views=(View(shape=(256, 1, 64, 112, 112, 1, 1, 1), strides=(802816, 0, 12544, 112, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(205520896), arg=0, src=()),)), - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(38535168), arg=ShapeTracker(views=(View(shape=(1, 256, 1, 3, 8, 230, 8, 230), strides=(0, 150528, 0, 50176, 0, 224, 0, 1), offset=-675, mask=((0, 1), (0, 256), (0, 1), (0, 3), (0, 8), (3, 227), (0, 8), (3, 227)), contiguous=False), View(shape=(256, 1, 64, 112, 112, 3, 7, 7), strides=(10156800, 0, 0, 3680, 2, 3385600, 425040, 231), offset=0, mask=None, contiguous=False))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(38535168), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(9408), arg=ShapeTracker(views=(View(shape=(256, 1, 64, 112, 112, 3, 7, 7), strides=(0, 0, 147, 0, 0, 49, 7, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(9408), arg=2, src=()),)),)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1)), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=16)] - helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) - - def test_failure_53(self): - # COMPILE_ERROR, val scope issue - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.uchar.ptr(1024), arg=ShapeTracker(views=(View(shape=(1024, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(1024), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.uchar, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.MUL, dtypes.uchar, arg=None, src=( - UOp(Ops.LOAD, dtypes.uchar, arg=None, src=( - UOp(Ops.VIEW, dtypes.uchar.ptr(50000), arg=ShapeTracker(views=(View(shape=(1024, 50000, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(50000), arg=1, src=()),)),)), - UOp(Ops.CAST, dtypes.uchar, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(1024), arg=ShapeTracker(views=(View(shape=(1024, 50000, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1024), arg=2, src=()),)),)), - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (2,)), src=( - UOp(Ops.WHERE, dtypes.int, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(50001, 99999), strides=(0, 0), offset=0, mask=((0, 50001), (49999, 99999)), contiguous=False), View(shape=(1024, 50000, 50000), strides=(0, 1, 100000), offset=0, mask=None, contiguous=False))), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=1, src=( - x20:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 50000, 50000), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=0, src=( - x20,)),)),)), - UOp(Ops.CONST, dtypes.int, arg=-1, src=( - x23:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 50000, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), - UOp(Ops.CONST, dtypes.bool, arg=True, src=( - x23,)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.GROUPTOP, axis=1, arg=16)] - helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=["AMD", "GPU", "METAL", "NV", "CUDA"]) - - @unittest.skipIf(CI and Device.DEFAULT in {"METAL"}, "hangs metal gpu CI") - def test_failure_54(self): - # resnet beam - # HIP: Memory access fault by GPU node-1 (Agent handle: 0x56c21f1d1480) on address 0x730cc242e000. Reason: Page not present or supervisor privilege. - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(51380224), arg=ShapeTracker(views=(View(shape=(256, 1, 64, 56, 56, 1, 1, 1), strides=(200704, 0, 3136, 56, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(51380224), arg=0, src=()),)), - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(51380224), arg=ShapeTracker(views=(View(shape=(1, 256, 1, 64, 4, 58, 4, 58), strides=(0, 200704, 0, 3136, 0, 56, 0, 1), offset=-57, mask=((0, 1), (0, 256), (0, 1), (0, 64), (0, 4), (1, 57), (0, 4), (1, 57)), contiguous=False), View(shape=(256, 1, 64, 56, 56, 64, 3, 3), strides=(3444736, 0, 0, 232, 1, 53824, 13688, 59), offset=0, mask=None, contiguous=False))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(51380224), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(36864), arg=ShapeTracker(views=(View(shape=(256, 1, 64, 56, 56, 64, 3, 3), strides=(0, 0, 576, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(36864), arg=2, src=()),)),)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.TC, axis=2, arg=(-1, 2, 1)), Opt(op=OptOps.UPCAST, axis=2, arg=7), Opt(op=OptOps.UPCAST, axis=1, arg=2)] - helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=["HIP", "AMD"]) - - @unittest.skipIf(CI and Device.DEFAULT in {"METAL"}, "hangs metal gpu CI") - def test_failure_55(self): - W = 2 - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(W * 200704), arg=ShapeTracker(views=(View(shape=(W, 1, 64, 56, 56, 1, 1, 1), strides=(200704, 0, 3136, 56, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(W * 200704), arg=0, src=()),)), - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(W * 200704), arg=ShapeTracker(views=(View(shape=(1, W, 1, 64, 4, 58, 4, 58), strides=(0, 200704, 0, 3136, 0, 56, 0, 1), offset=-57, mask=((0, 1), (0, W), (0, 1), (0, 64), (0, 4), (1, 57), (0, 4), (1, 57)), contiguous=False), View(shape=(W, 1, 64, 56, 56, 64, 3, 3), strides=(3444736, 0, 0, 232, 1, 53824, 13688, 59), offset=0, mask=None, contiguous=False))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(W * 200704), arg=1, src=()),)),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(W * 18432), arg=ShapeTracker(views=(View(shape=(W, 1, 64, 56, 56, 64, 3, 3), strides=(0, 0, 576, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(W * 18432), arg=2, src=()),)),)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.SWAP, axis=1, arg=2)] - helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) - - def test_failure_56(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(1, 16, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 3)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( - x7:=UOp(Ops.CONST, dtypes.float, arg=0.0, src=( - x8:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.MAX, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(247808), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(1936, 121, 11, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(247808), arg=1, src=()),)),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=2, src=()),)),)), - UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( - x8,)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=3, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=4, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=5, src=()),)),)),)), - x7,)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(204800), arg=ShapeTracker(views=(View(shape=(128, 16, 5, 2, 5, 2), strides=(1600, 100, 20, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(128, 16, 11, 11), strides=(1600, 100, 10, 1), offset=0, mask=((0, 128), (0, 16), (0, 10), (0, 10)), contiguous=False))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(204800), arg=6, src=()),)),)),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=2, arg=32)] - helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) - - def test_failure_57(self): - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(1, 16, 1, 1), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=0, src=()),)), - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (0, 2, 3)), src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.CMPLT, dtypes.bool, arg=None, src=( - x7:=UOp(Ops.CONST, dtypes.float, arg=0.0, src=( - x8:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.MAX, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(247808), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(1936, 121, 11, 1), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(247808), arg=1, src=()),)),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=2, src=()),)),)), - UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( - x8,)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=3, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=4, src=()),)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(16), arg=ShapeTracker(views=(View(shape=(128, 16, 11, 11), strides=(0, 1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16), arg=5, src=()),)),)),)), - x7,)),)),)), - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(204800), arg=ShapeTracker(views=(View(shape=(128, 16, 5, 2, 5, 2), strides=(1600, 100, 20, 2, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(128, 16, 11, 11), strides=(1600, 100, 10, 1), offset=0, mask=((0, 128), (0, 16), (0, 10), (0, 10)), contiguous=False))), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(204800), arg=6, src=()),)),)),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.PADTO, axis=1, arg=32)] - helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[]) - - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test needs local") - def test_failure_59(self): - # stable diffusion with SINGLE_KERNEL_SOFTMAX=1 - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(268435456), arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 1, 1), strides=(134217728, 16777216, 4096, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(268435456), arg=0, src=()),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.EXP2, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(268435456), arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 1, 1), strides=(134217728, 16777216, 4096, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - x9:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(268435456), arg=1, src=()),)),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.MAX, (5,), True), src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(268435456), arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 1, 4096), strides=(134217728, 16777216, 4096, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=( - x9,)),)),)), - UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( - x15:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 1, 1), strides=(0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), - UOp(Ops.CONST, dtypes.float, arg=1.4426950408889634, src=( - x15,)),)),)), - UOp(Ops.RECIP, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (4,)), src=( - UOp(Ops.EXP2, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(268435456), arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 4096, 1), strides=(134217728, 16777216, 4096, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=( - x9,)),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.MAX, (5,), True), src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(268435456), arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 4096, 4096), strides=(134217728, 16777216, 4096, 0, 0, 1), offset=0, mask=None, contiguous=False),)), src=( - x9,)),)),)), - UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( - x29:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 8, 4096, 4096, 4096, 1), strides=(0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), - UOp(Ops.CONST, dtypes.float, arg=1.4426950408889634, src=( - x29,)),)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UNROLL, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.LOCAL, axis=1, arg=16)] - # NOTE: this is slow to run, just confirm it can generate the program without Exception - Kernel(ast, opts=Device[Device.DEFAULT].renderer).apply_opts(opts).to_program() - - @unittest.expectedFailure - @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test needs local") - def test_failure_60(self): - # TestSymbolicOps.test_attention - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(80), arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1, 1), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x2:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)), - x2,)), - UOp(Ops.CONST, dtypes.int, arg=4, src=()),)), UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)), - x1,)), 0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(80), arg=0, src=()),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.EXP2, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(80), arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1, 1), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x2:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)), - x2,)), - UOp(Ops.CONST, dtypes.int, arg=4, src=()),)), UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)), - x1,)), 0, 1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(80), arg=1, src=()),)),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.MAX, (5,), True), src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(80), arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=())), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=4, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x3:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x3,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1, - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x1,)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=0, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x3:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x3,)), 0, 1), offset=0, mask=None, contiguous=False),)), src=( - x14:=UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(80), arg=2, src=()),)),)),)), - UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( - x16:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1, 1), strides=(0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), - UOp(Ops.CONST, dtypes.float, arg=1.4426950408889634, src=( - x16,)),)),)), - UOp(Ops.RECIP, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (4,)), src=( - UOp(Ops.EXP2, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.ADD, dtypes.float, arg=None, src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(80), arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=4, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x3:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x3,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1, - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x1,)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=0, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x3:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x3,)), 1, 0), offset=0, mask=None, contiguous=False),)), src=( - x14,)),)), - UOp(Ops.MUL, dtypes.float, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.MAX, (5,), True), src=( - UOp(Ops.LOAD, dtypes.float, arg=None, src=( - UOp(Ops.VIEW, dtypes.float.ptr(80), arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=())), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=4, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), UOp(Ops.MUL, dtypes.int, arg=None, src=( - x0:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x0, - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=0, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), 1), offset=0, mask=None, contiguous=False), View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=())), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=4, src=()), - x2:=UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x2,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - x2:=UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1, - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x2,)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=0, src=()), - x2:=UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x2,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( - x0:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x0, - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), 1), offset=0, mask=None, contiguous=False))), src=( - x14,)),)),)), - UOp(Ops.CONST, dtypes.float, arg=-1.0, src=( - x30:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=())), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False), View(shape=(2, 4, 1, UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()), 1), strides=(UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=4, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x3:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x3,)), UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x1, - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x1,)), 0, UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.MUL, dtypes.int, arg=None, src=( - UOp(Ops.CONST, dtypes.int, arg=0, src=()), - UOp(Ops.MUL, dtypes.int, arg=None, src=( - x3:=UOp(Ops.CONST, dtypes.int, arg=1, src=()), - UOp(Ops.DEFINE_VAR, dtypes.int, arg=('i', 1, 10), src=()),)),)), - x3,)), 1, 0), offset=0, mask=None, contiguous=False))), src=()),)),)),)), - UOp(Ops.CONST, dtypes.float, arg=1.4426950408889634, src=( - x30,)),)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.LOCAL, axis=0, arg=2), Opt(op=OptOps.LOCAL, axis=0, arg=4)] - # NOTE: this is slow to run, just confirm it can generate the program without Exception - Kernel(ast, opts=Device[Device.DEFAULT].renderer).apply_opts(opts).to_program() - - def test_failure_61(self): - # WINO=1 JITBEAM=4 python3 examples/beautiful_cifar.py - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(1024), arg=ShapeTracker(views=(View(shape=(1024, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1024), arg=0, src=()),)), - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - x7:=UOp(Ops.CONST, dtypes.half, arg=0.6931471805599453, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.CONST, dtypes.half, arg=-1.0, src=( - x14:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 10, 1), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.ADD, dtypes.half, arg=None, src=( - UOp(Ops.CONST, dtypes.half, arg=-0.010000000000000002, src=( - x14,)), - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.CMPNE, dtypes.bool, arg=None, src=( - UOp(Ops.LOAD, dtypes.int, arg=None, src=( - UOp(Ops.VIEW, dtypes.int.ptr(1024), arg=ShapeTracker(views=(View(shape=(1024, 10, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=False),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(1024), arg=1, src=()),)),)), - UOp(Ops.ADD, dtypes.int, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (2,), True), src=( - UOp(Ops.WHERE, dtypes.int, arg=None, src=( - UOp(Ops.VALID, dtypes.bool, arg=None, src=( - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(11, 19), strides=(0, 0), offset=0, mask=((0, 11), (9, 19)), contiguous=False), View(shape=(1024, 10, 10), strides=(0, 1, 20), offset=0, mask=None, contiguous=False))), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=1, src=( - x30:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 10, 10), strides=(0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.CONST, dtypes.int, arg=0, src=( - x30,)),)),)), - UOp(Ops.CONST, dtypes.int, arg=-1, src=( - x14,)),)),)), - UOp(Ops.CONST, dtypes.bool, arg=True, src=( - x14,)),)),)), - UOp(Ops.CONST, dtypes.half, arg=-0.4, src=( - x14,)),)),)),)),)),)),)),)), - UOp(Ops.RECIP, dtypes.half, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.VIEW, dtypes.half.ptr(1024), arg=ShapeTracker(views=(View(shape=(1024, 1, 1), strides=(1, 0, 0), offset=0, mask=None, contiguous=True),)), src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(1024), arg=2, src=()),)),)), - x7,)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.LOCAL, axis=0, arg=32), Opt(op=OptOps.GROUP, axis=1, arg=0)] - helper_test_lin(Kernel(ast), opts, failed_platforms=["AMD", "METAL", "CUDA", "NV"]) - - def test_failure_62(self): - # WINO=1 DEFAULT_FLOAT=HALF FUSE_ARANGE=1 JITBEAM=4 BS=1024 STEPS=500 python examples/hlb_cifar10.py - # RuntimeError: UOp verification failed at 4 on Ops.LOAD dtypes.half 2 [, ] None - ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( - UOp(Ops.STORE, dtypes.void, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(11808768), arg=0, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1, 12, 31, 31, 1, 1, 1), strides=(11532, 0, 961, 31, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()), - UOp(Ops.CAST, dtypes.half, arg=None, src=( - UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=( - UOp(Ops.CAST, dtypes.float, arg=None, src=( - UOp(Ops.MUL, dtypes.half, arg=None, src=( - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(3145728), arg=1, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1, 12, 31, 31, 3, 2, 2), strides=(3072, 0, 0, 32, 1, 1024, 32, 1), offset=0, mask=None, contiguous=False),)), src=()),)), - UOp(Ops.LOAD, dtypes.half, arg=None, src=( - UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(144), arg=2, src=()), - UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1024, 1, 12, 31, 31, 3, 2, 2), strides=(0, 0, 12, 0, 0, 4, 2, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),)),)),)) - opts = [Opt(op=OptOps.LOCAL, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=1, arg=3), Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.PADTO, axis=2, arg=32), Opt(op=OptOps.UPCAST, axis=2, arg=4), Opt(op=OptOps.UPCAST, axis=2, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=0)] - helper_test_lin(Kernel(ast), opts, failed_platforms=["AMD", "HIP", "NV", "CUDA"]) - -if __name__ == '__main__': - unittest.main() diff --git a/test/test_search.py b/test/test_search.py index af2c4f0a39..8faa25b823 100644 --- a/test/test_search.py +++ b/test/test_search.py @@ -83,6 +83,7 @@ class TestBEAM(unittest.TestCase): actions_after = actions.copy() assert actions_after == actions_before, "actions state was not preserved" + @unittest.skip("invalid reduce now") def test_filter_global_buffer(self): # taken from https://github.com/tinygrad/tinygrad/issues/4612 ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( diff --git a/tinygrad/opt/kernel.py b/tinygrad/opt/kernel.py index 581b165e9b..4dd0a8ad15 100644 --- a/tinygrad/opt/kernel.py +++ b/tinygrad/opt/kernel.py @@ -58,11 +58,6 @@ class Kernel: # add a shapetracker to the end to track the full shape, with 0 strides so it can merge self.sts.append(ShapeTracker.from_shape(tuple([smax(*s) for s in zip(*[x.shape for x in self.sts])]), (0,)*self.shape_len)) - # move all reduce axes to the end - reduce = list(enumerate(zip(self.full_shape, self.output_shape))) - permute = tuple([i for i,(s,n) in reduce if not resolve(s != n)] + [i for i,(s,n) in reduce if resolve(s != n)]) - self.reshape_and_permute(None, permute) - # parameters for optimization self.applied_opts: list[Opt] = [] self.group_for_reduces: int = 0 @@ -77,6 +72,11 @@ class Kernel: self.simplify_ones() self.simplify_merge_adjacent() + # confirm all reduce axes are at the end + final_reduces = [i for i,(s,n) in enumerate(zip(self.full_shape, self.output_shape)) if resolve(s != n)] + if final_reduces != list(range(len(self.full_shape)-len(final_reduces), len(self.full_shape))): + raise RuntimeError(f"reduces are not at the end of the shape {self.full_shape} -> {self.output_shape}") + def copy(self): ret = type(self).__new__(type(self))