From 406cb5fd90dae48c66e1ec4ca23b5b37b4b6fb3f Mon Sep 17 00:00:00 2001 From: chenyu Date: Wed, 3 Apr 2024 14:39:28 -0400 Subject: [PATCH] const fold ReduceOps (#4059) --- test/external/external_test_opt.py | 4 ++-- test/test_const_folding.py | 26 +++++++++++++++++++++++++- test/test_linearizer.py | 4 ++-- tinygrad/lazy.py | 4 ++++ 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/test/external/external_test_opt.py b/test/external/external_test_opt.py index 75bacc273c..80e7356ae0 100644 --- a/test/external/external_test_opt.py +++ b/test/external/external_test_opt.py @@ -173,7 +173,7 @@ class TestOpt(unittest.TestCase): np.testing.assert_allclose(d.numpy(), na*nb+nc, rtol=1e-5, atol=1e-7) def test_fold_reduce_elementwise(self): - img = Tensor.ones(32) + img = Tensor.ones(32).contiguous() addme = Tensor.ones(1) with CLCache(): ret = img.sum() + addme @@ -183,7 +183,7 @@ class TestOpt(unittest.TestCase): def test_fold_batchnorm(self): with Tensor.train(): - img = Tensor.ones(1,32,4,4) + img = Tensor.ones(1,32,4,4).contiguous() bn = nn.BatchNorm2d(32, track_running_stats=False) with CLCache(): img_bn = bn(img).realize() diff --git a/test/test_const_folding.py b/test/test_const_folding.py index 38cacb5fba..aa6af4ad1e 100644 --- a/test/test_const_folding.py +++ b/test/test_const_folding.py @@ -12,7 +12,7 @@ def _check_ast_count(desired_count:int, t:Tensor): asts = [s for s in schedule if s.ast[0].op is BufferOps.STORE] assert len(asts) == desired_count -class TestSimpleConstFolding(unittest.TestCase): +class TestUnaryOpsConstFolding(unittest.TestCase): def test_all_consts_ops(self): _check_ast_count(0, Tensor.ones(4).exp()) _check_ast_count(0, Tensor.ones(4).sqrt()) @@ -23,6 +23,7 @@ class TestSimpleConstFolding(unittest.TestCase): _check_ast_count(0, Tensor.ones(4).cast(dtypes.int16)) _check_ast_count(0, Tensor.full(4, fill_value=-1).cast(dtypes.uint16)) +class TestBinaryOpsConstFolding(unittest.TestCase): def test_add_literal_zero(self): _check_ast_count(0, Tensor([1.0, 2, 3, 4]) + 0) def test_add_tensor_zero(self): @@ -98,6 +99,29 @@ class TestMovedConstFolding(unittest.TestCase): _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64)) np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64).numpy(), [0, 1, 1, 1, 1, 0]) +class TestReduceOpsConstFolding(unittest.TestCase): + def test_const_sum(self): + _check_ast_count(0, Tensor.ones(4, 5, 6).sum()) + np.testing.assert_equal(Tensor.ones(4, 5, 6).sum().numpy(), 4 * 5 * 6) + _check_ast_count(0, Tensor.ones(4, 5, 6).sum(axis=0)) + np.testing.assert_equal(Tensor.ones(4, 5, 6).sum(axis=0).numpy(), np.full((5, 6), 4)) + _check_ast_count(0, Tensor(4).sum()) + np.testing.assert_equal(Tensor(4).sum().numpy(), 4) + + def test_padded_const_sum(self): + _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).sum()) + np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).sum().numpy(), 4) + + # NOTE: cannot just count the non-padded area because some UnaryOps f do not have f(0) = 0. 
+ _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).exp().sum()) + np.testing.assert_allclose(Tensor.ones(4).pad(((1, 1),)).exp().sum().numpy(), 4 * math.e + 2) + + def test_const_max(self): + _check_ast_count(0, Tensor.ones(4, 5, 6).max()) + np.testing.assert_equal(Tensor.ones(4, 5, 6).max().numpy(), 1) + _check_ast_count(0, Tensor(4).max()) + np.testing.assert_equal(Tensor(4).max().numpy(), 4) + @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL"}, "no GPU CI") class TestMultiConstFolding(unittest.TestCase): def test_multi_const_folding_literal(self): diff --git a/test/test_linearizer.py b/test/test_linearizer.py index fe8fdf91de..f9ea0bc949 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -203,7 +203,7 @@ class TestLinearizer(unittest.TestCase): lin.limit_dims_to_max(global_max=[16, 16, 16], local_max=[16, 16, 16]) def test_sum_collapse(self): - t = Tensor.ones(256,256).sum() + t = Tensor([2]).reshape(1, 1).expand(256, 256).sum() sched = [si for si in create_schedule([t.lazydata]) if si.ast[0].op not in LoadOps] assert len(sched) == 1 lin = Linearizer(*sched[0].ast) @@ -719,7 +719,7 @@ class TestKernelOpts(unittest.TestCase): def test_padto_max(self): N = 17 * 17 - a = -Tensor.ones(N, N) + a = -Tensor.rand(N, N) helper_linearizer_opt(a.max(0), [ [Opt(OptOps.PADTO, 0, 32)], diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py index 8c896f0520..1c0f231ccf 100644 --- a/tinygrad/lazy.py +++ b/tinygrad/lazy.py @@ -159,6 +159,10 @@ class LazyBuffer: new_shape = tuple(1 if i in axis else s for i,s in enumerate(self.shape)) # TODO: this logic should move to the scheduler if self.size == 0 and 0 not in new_shape: return self.const({ReduceOps.SUM: 0.0, ReduceOps.MAX: -math.inf}[op], new_shape) + + if self.is_unrealized_unpadded_const(): + return self.const(self.base.arg * {ReduceOps.SUM: prod(self.shape[i] for i in axis), ReduceOps.MAX: 1}[op], new_shape) + # TODO: can we split symbolic shape if the reduce axis is not symbolic? if not all_int(self.shape) or (0 in self.shape) or prod(self.shape) // prod(new_shape) < getenv("REDUCEOP_SPLIT_THRESHOLD", 32768): return self._reduce_op(op, axis)
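---

Note on the folding rule (not part of the patch): the new path in tinygrad/lazy.py only fires when `is_unrealized_unpadded_const()` is true, i.e. the buffer is a single broadcast constant with no padding. A SUM over such a constant c multiplies c by the number of reduced elements, and a MAX leaves c unchanged; padded constants still schedule one kernel, as test_padded_const_sum checks. Below is a minimal standalone sketch of that arithmetic with illustrative names only, not the actual LazyBuffer API:

    # Hypothetical sketch of the const-fold rule for ReduceOps; names are illustrative.
    from enum import Enum, auto
    from math import prod

    class ReduceOps(Enum):
        SUM = auto()
        MAX = auto()

    def fold_const_reduce(value: float, shape: tuple, axis: tuple, op: ReduceOps):
        """Reduce a tensor that is one broadcast constant without running a kernel."""
        new_shape = tuple(1 if i in axis else s for i, s in enumerate(shape))
        # SUM of a constant c over the reduced axes is c times the number of summed
        # elements; MAX of a constant c is just c.
        factor = {ReduceOps.SUM: prod(shape[i] for i in axis), ReduceOps.MAX: 1}[op]
        return value * factor, new_shape

    # e.g. Tensor.ones(4, 5, 6).sum() folds to 4 * 5 * 6 = 120 with shape (1, 1, 1),
    # matching test_const_sum above.
    assert fold_const_reduce(1.0, (4, 5, 6), (0, 1, 2), ReduceOps.SUM) == (120.0, (1, 1, 1))
    assert fold_const_reduce(1.0, (4, 5, 6), (0,), ReduceOps.MAX) == (1.0, (1, 5, 6))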