diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index 5444e600c9..af8d644d9d 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -11,7 +11,7 @@ from tinygrad.shape.symbolic import MulNode, Variable, NumNode, Node
 from tinygrad.tensor import Tensor
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.engine.realize import run_schedule, lower_schedule
-from tinygrad.helpers import prod, Context, getenv
+from tinygrad.helpers import prod, Context, getenv, CI
 from tinygrad.dtype import DType, dtypes
 from tinygrad.codegen.uops import UOpGraph
 
@@ -787,7 +787,7 @@ class TestKernelOpts(unittest.TestCase):
     ], apply_tc=True, atol=atol, rtol=rtol)
 
   def test_padto_matmul(self):
-    if Device.DEFAULT in ["CUDA", "RHIP"]: self.skipTest("super slow on CUDA and RHIP because of the big grid dims")
+    if CI and Device.DEFAULT in ["CUDA", "RHIP"]: self.skipTest("super slow on CUDA and RHIP because of the big grid dims")
     N = 17 * 17
     Tensor.manual_seed(289)
     a = Tensor.rand(N, N)
@@ -802,6 +802,25 @@ class TestKernelOpts(unittest.TestCase):
       [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.PADTO, 1, 32), Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UPCAST, 1, 2),],
     ])
 
+  def test_padto_upcasted_not_ok(self):
+    N = 4
+    a = Tensor.rand(N, N)
+    b = Tensor.rand(N, N)
+    helper_linearizer_opt(a@b, [
+      [Opt(OptOps.UPCAST, 0, 0)],
+      [Opt(OptOps.UPCAST, 1, 0)],
+      [Opt(OptOps.UNROLL, 0, 0)],
+      [Opt(OptOps.PADTO, 0, 8)],
+      [Opt(OptOps.PADTO, 1, 8)],
+      [Opt(OptOps.PADTO, 2, 8)],
+    ])
+    with self.assertRaises(KernelOptError):
+      helper_linearizer_opt(a@b, [[Opt(OptOps.UPCAST, 0, 0), Opt(OptOps.PADTO, 2, 8)]])
+    with self.assertRaises(KernelOptError):
+      helper_linearizer_opt(a@b, [[Opt(OptOps.UPCAST, 1, 0), Opt(OptOps.PADTO, 2, 8)]])
+    with self.assertRaises(KernelOptError):
+      helper_linearizer_opt(a@b, [[Opt(OptOps.UNROLL, 0, 0), Opt(OptOps.PADTO, 2, 8)]])
+
   def test_padto_sum_ok(self):
     N = 18 * 18
     # NOTE: this setup prevents 17 * 17 contiguous merged into one dimension
diff --git a/test/test_linearizer_failures.py b/test/test_linearizer_failures.py
index 963a6e2956..31d4c6eb75 100644
--- a/test/test_linearizer_failures.py
+++ b/test/test_linearizer_failures.py
@@ -22,7 +22,7 @@ def helper_test_lin(lin: Linearizer, opts, failed_platforms, rtol=1e-2, atol=1e-
       lin.apply_opt(opt)
     except KernelOptError:
       # it's considered fixed if we invalidated the opts
-      assert Device.DEFAULT not in failed_platforms
+      assert Device.DEFAULT not in failed_platforms, f"unexpected success on {Device.DEFAULT}"
       return
 
   compare_result = compare_linearizer(lin, rtol=rtol, atol=atol)
@@ -234,7 +234,7 @@ class TestLinearizerFailures(unittest.TestCase):
   def test_failure_31(self):
     ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=UnaryOps.EXP2, src=(LazyOp(op=BinaryOps.MUL, src=(LazyOp(op=BinaryOps.SUB, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 169, 13, 1), offset=0, mask=None, contiguous=True),)))), LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=2, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 13, 1, 0), offset=0, mask=None, contiguous=False),))))), arg=None), LazyOp(op=BufferOps.CONST, src=(), arg=ConstBuffer(val=1.4426950408889634, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(1, 16, 13, 13), strides=(0, 0, 0, 0), offset=0, mask=None, contiguous=False),))))), arg=None),), arg=None),), arg=((3,), dtypes.float)),), arg=MemBuffer(idx=0, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(1, 16, 13, 1), strides=(0, 13, 1, 0), offset=0, mask=None, contiguous=True),))))
     opts = [Opt(op=OptOps.UNROLL, axis=0, amt=0), Opt(op=OptOps.PADTO, axis=1, amt=32)]
-    helper_test_lin(Linearizer(ast), opts=opts, failed_platforms=["METAL", "GPU", "HSA", "CUDA", "CLANG", "LLVM"])
+    helper_test_lin(Linearizer(ast), opts=opts, failed_platforms=[])
 
 if __name__ == '__main__':
   unittest.main()
diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py
index 1fe5b1e29f..2d53ea32c7 100644
--- a/tinygrad/codegen/kernel.py
+++ b/tinygrad/codegen/kernel.py
@@ -493,8 +493,9 @@ class Kernel:
       self.dont_use_locals = True
     elif opt.op is OptOps.PADTO:
       check(not self.vars, "does not work with symbolic shape")
+      check(axis < self.shape_len - self.upcasted, "cannot pad upcasted")
       # ok to pad SUM if all parent ops have f(0) = 0
-      if self.first_reduce <= axis < self.shape_len - self.upcasted:
+      if self.first_reduce <= axis:
         check(self.reduceop.op is ReduceOps.SUM and all(op.op not in UNSAFE_PAD_OPS for ops in self.reduceop.src for op in ops.lazyops), "cannot pad")
       padded = False
       for i,st in enumerate(self.sts):
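
Not part of the patch, but for intuition on the "ok to pad SUM if all parent ops have f(0) = 0" rule that the PADTO check enforces: zero-padding a summed axis is only harmless when every op applied before the SUM maps 0 to 0. A minimal plain-numpy sketch (illustrative only, not tinygrad API):

import numpy as np

x = np.random.rand(13)

# PADTO-style zero padding of the reduce axis: 13 -> 16
padded = np.pad(x, (0, 3))

# MUL keeps f(0) = 0, so the padded zeros contribute nothing to the SUM
assert np.isclose((x * 2.0).sum(), (padded * 2.0).sum())

# EXP2 has f(0) = 2**0 = 1, so each padded element shifts the SUM by 1
assert not np.isclose(np.exp2(x).sum(), np.exp2(padded).sum())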