From 406cb5fd90dae48c66e1ec4ca23b5b37b4b6fb3f Mon Sep 17 00:00:00 2001 From: chenyu Date: Wed, 3 Apr 2024 14:39:28 -0400 Subject: [PATCH] const fold ReduceOps (#4059) --- test/external/external_test_opt.py | 4 ++-- test/test_const_folding.py | 26 +++++++++++++++++++++++++- test/test_linearizer.py | 4 ++-- tinygrad/lazy.py | 4 ++++ 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/test/external/external_test_opt.py b/test/external/external_test_opt.py index 75bacc273c..80e7356ae0 100644 --- a/test/external/external_test_opt.py +++ b/test/external/external_test_opt.py @@ -173,7 +173,7 @@ class TestOpt(unittest.TestCase): np.testing.assert_allclose(d.numpy(), na*nb+nc, rtol=1e-5, atol=1e-7) def test_fold_reduce_elementwise(self): - img = Tensor.ones(32) + img = Tensor.ones(32).contiguous() addme = Tensor.ones(1) with CLCache(): ret = img.sum() + addme @@ -183,7 +183,7 @@ class TestOpt(unittest.TestCase): def test_fold_batchnorm(self): with Tensor.train(): - img = Tensor.ones(1,32,4,4) + img = Tensor.ones(1,32,4,4).contiguous() bn = nn.BatchNorm2d(32, track_running_stats=False) with CLCache(): img_bn = bn(img).realize() diff --git a/test/test_const_folding.py b/test/test_const_folding.py index 38cacb5fba..aa6af4ad1e 100644 --- a/test/test_const_folding.py +++ b/test/test_const_folding.py @@ -12,7 +12,7 @@ def _check_ast_count(desired_count:int, t:Tensor): asts = [s for s in schedule if s.ast[0].op is BufferOps.STORE] assert len(asts) == desired_count -class TestSimpleConstFolding(unittest.TestCase): +class TestUnaryOpsConstFolding(unittest.TestCase): def test_all_consts_ops(self): _check_ast_count(0, Tensor.ones(4).exp()) _check_ast_count(0, Tensor.ones(4).sqrt()) @@ -23,6 +23,7 @@ class TestSimpleConstFolding(unittest.TestCase): _check_ast_count(0, Tensor.ones(4).cast(dtypes.int16)) _check_ast_count(0, Tensor.full(4, fill_value=-1).cast(dtypes.uint16)) +class TestBinaryOpsConstFolding(unittest.TestCase): def test_add_literal_zero(self): _check_ast_count(0, Tensor([1.0, 2, 3, 4]) + 0) def test_add_tensor_zero(self): @@ -98,6 +99,29 @@ class TestMovedConstFolding(unittest.TestCase): _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64)) np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64).numpy(), [0, 1, 1, 1, 1, 0]) +class TestReduceOpsConstFolding(unittest.TestCase): + def test_const_sum(self): + _check_ast_count(0, Tensor.ones(4, 5, 6).sum()) + np.testing.assert_equal(Tensor.ones(4, 5, 6).sum().numpy(), 4 * 5 * 6) + _check_ast_count(0, Tensor.ones(4, 5, 6).sum(axis=0)) + np.testing.assert_equal(Tensor.ones(4, 5, 6).sum(axis=0).numpy(), np.full((5, 6), 4)) + _check_ast_count(0, Tensor(4).sum()) + np.testing.assert_equal(Tensor(4).sum().numpy(), 4) + + def test_padded_const_sum(self): + _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).sum()) + np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).sum().numpy(), 4) + + # NOTE: cannot just count the non-padded area because some UnaryOps f do not have f(0) = 0. 
+ _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).exp().sum()) + np.testing.assert_allclose(Tensor.ones(4).pad(((1, 1),)).exp().sum().numpy(), 4 * math.e + 2) + + def test_const_max(self): + _check_ast_count(0, Tensor.ones(4, 5, 6).max()) + np.testing.assert_equal(Tensor.ones(4, 5, 6).max().numpy(), 1) + _check_ast_count(0, Tensor(4).max()) + np.testing.assert_equal(Tensor(4).max().numpy(), 4) + @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL"}, "no GPU CI") class TestMultiConstFolding(unittest.TestCase): def test_multi_const_folding_literal(self): diff --git a/test/test_linearizer.py b/test/test_linearizer.py index fe8fdf91de..f9ea0bc949 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -203,7 +203,7 @@ class TestLinearizer(unittest.TestCase): lin.limit_dims_to_max(global_max=[16, 16, 16], local_max=[16, 16, 16]) def test_sum_collapse(self): - t = Tensor.ones(256,256).sum() + t = Tensor([2]).reshape(1, 1).expand(256, 256).sum() sched = [si for si in create_schedule([t.lazydata]) if si.ast[0].op not in LoadOps] assert len(sched) == 1 lin = Linearizer(*sched[0].ast) @@ -719,7 +719,7 @@ class TestKernelOpts(unittest.TestCase): def test_padto_max(self): N = 17 * 17 - a = -Tensor.ones(N, N) + a = -Tensor.rand(N, N) helper_linearizer_opt(a.max(0), [ [Opt(OptOps.PADTO, 0, 32)], diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py index 8c896f0520..1c0f231ccf 100644 --- a/tinygrad/lazy.py +++ b/tinygrad/lazy.py @@ -159,6 +159,10 @@ class LazyBuffer: new_shape = tuple(1 if i in axis else s for i,s in enumerate(self.shape)) # TODO: this logic should move to the scheduler if self.size == 0 and 0 not in new_shape: return self.const({ReduceOps.SUM: 0.0, ReduceOps.MAX: -math.inf}[op], new_shape) + + if self.is_unrealized_unpadded_const(): + return self.const(self.base.arg * {ReduceOps.SUM: prod(self.shape[i] for i in axis), ReduceOps.MAX: 1}[op], new_shape) + # TODO: can we split symbolic shape if the reduce axis is not symbolic? if not all_int(self.shape) or (0 in self.shape) or prod(self.shape) // prod(new_shape) < getenv("REDUCEOP_SPLIT_THRESHOLD", 32768): return self._reduce_op(op, axis)
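---

Note on the folding rule (not part of the patch): the new path in tinygrad/lazy.py only fires when `is_unrealized_unpadded_const()` is true, i.e. the buffer is a single broadcast constant with no padding. A SUM over such a constant c multiplies c by the number of reduced elements, and a MAX leaves c unchanged; padded constants still schedule one kernel, as test_padded_const_sum checks. Below is a minimal standalone sketch of that arithmetic with illustrative names only, not the actual LazyBuffer API:

    # Hypothetical sketch of the const-fold rule for ReduceOps; names are illustrative.
    from enum import Enum, auto
    from math import prod

    class ReduceOps(Enum):
        SUM = auto()
        MAX = auto()

    def fold_const_reduce(value: float, shape: tuple, axis: tuple, op: ReduceOps):
        """Reduce a tensor that is one broadcast constant without running a kernel."""
        new_shape = tuple(1 if i in axis else s for i, s in enumerate(shape))
        # SUM of a constant c over the reduced axes is c times the number of summed
        # elements; MAX of a constant c is just c.
        factor = {ReduceOps.SUM: prod(shape[i] for i in axis), ReduceOps.MAX: 1}[op]
        return value * factor, new_shape

    # e.g. Tensor.ones(4, 5, 6).sum() folds to 4 * 5 * 6 = 120 with shape (1, 1, 1),
    # matching test_const_sum above.
    assert fold_const_reduce(1.0, (4, 5, 6), (0, 1, 2), ReduceOps.SUM) == (120.0, (1, 1, 1))
    assert fold_const_reduce(1.0, (4, 5, 6), (0,), ReduceOps.MAX) == (1.0, (1, 5, 6))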