From f6eb0574f2e56f7ff03e10183704e2f59019e31a Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Thu, 9 Jan 2025 13:33:21 -0500
Subject: [PATCH] start tests for putting the tensor graph in a single kernel
 [pr] (#8542)

* start tests for putting the tensor graph in a single kernel [pr]

* parallel actually

* better view_left test

* test a softmax

* put all that in sym
---
 test/test_schedule.py | 116 ++++++++++++++++++++----------------------
 1 file changed, 54 insertions(+), 62 deletions(-)

diff --git a/test/test_schedule.py b/test/test_schedule.py
index 0801f530b2..37bfa60a74 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -15,7 +15,7 @@ from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic_simple, merge_views
 from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, getenv, SPLIT_REDUCEOP, unwrap, prod, Context
-from tinygrad.codegen.kernel import Kernel, verify_ast
+from tinygrad.codegen.kernel import verify_ast
 from tinygrad.engine.schedule import BUF_LIMIT, ScheduleItem, create_schedule_with_vars, view_right, view_left, remove_movement_ops
 from tinygrad.engine.realize import CompiledRunner, run_schedule, lower_schedule
 from extra.models.llama import precompute_freqs_cis
@@ -1753,73 +1753,65 @@ def swizzle_rewrite(u:UOp) -> UOp: return graph_rewrite(graph_rewrite(u, view_le
 def swizzle_cnt(u:UOp) -> int: return len([x for x in u.toposort if x.op is Ops.VIEW and len(x.src) != 0])
 
+# these pattern matchers should move to engine/schedule.py
+
+sym = symbolic_simple+PatternMatcher([
+  (UPat(Ops.DETACH, name="x"), lambda x:x.src[0]),
+])
+
+def _load_buffer(ctx:list[UOp], buf:UOp):
+  glbl = UOp(Ops.DEFINE_GLOBAL, buf.dtype.ptr(size=buf.size), (), len(ctx))
+  ctx.append(buf)
+  return UOp(Ops.LOAD, buf.dtype, (glbl, ShapeTracker.from_shape((buf.size,)).to_uop()))
+load_buffers = PatternMatcher([
+  (UPat(Ops.BUFFER, name="buf"), _load_buffer),
+])
+
+# put the entire schedule of the tensor in a single ScheduleItem
+@track_rewrites(named=True)
+def run_tensor_ast(r:Tensor):
+  output = UOp.new_buffer(r.device, r.lazydata.size, r.dtype)
+  glbl = UOp(Ops.DEFINE_GLOBAL, output.dtype.ptr(size=output.size), (), 0)
+  sink = UOp(Ops.STORE, src=(glbl, ShapeTracker.from_shape(r.lazydata.base.shape).to_uop(), r.lazydata.base)).sink()
+  sink = graph_rewrite(sink, remove_movement_ops+sym+load_buffers+view_left, bufs:=[output])
+  sink = graph_rewrite(sink, remove_movement_ops+sym+view_right)
+  si = ScheduleItem(sink, tuple(x.buffer for x in bufs), (), ())
+  run_schedule([si])
+  return output.realized.as_buffer().cast(output.dtype.fmt).tolist()
+
 class TestSwizzle(unittest.TestCase):
   def test_swizzle_simple(self):
-    sink = UOp(Ops.SINK, dtypes.void, arg=None, src=(
-      UOp(Ops.STORE, dtypes.void, arg=None, src=(
-        UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=0, src=()),
-        UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 1), strides=(0, 0), offset=0, mask=None, contiguous=True),)), src=()), # noqa: E501
-        UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (0, 1)), src=(
-          UOp(Ops.ADD, dtypes.int, arg=None, src=(
-            UOp(Ops.VIEW, dtypes.int, arg=ShapeTracker(views=(View(shape=(32, 32), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa E501
-              UOp(Ops.REDUCE_AXIS, dtypes.int, arg=(Ops.ADD, (0, 1)), src=(
-                UOp(Ops.LOAD, dtypes.int, arg=None, src=(
-                  x8:=UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), arg=1, src=()),
-                  UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 32), strides=(32, 1), offset=0, mask=None, contiguous=True),)), src=()),)),)),)), # noqa E501
-            UOp(Ops.LOAD, dtypes.int, arg=None, src=(
-              x8,
-              UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 32), strides=(32, 1), offset=0, mask=None, contiguous=True),)), src=()),)),)),)),)),)) # noqa E501
-    sink = swizzle_rewrite(sink)
-    k = Kernel(sink)
-    p = k.to_program()
-    a = Tensor.randint(32, 32).realize()
-    b = Tensor.empty((), dtype=dtypes.int).realize()
-    CompiledRunner(p).exec([b.lazydata.buffer, a.lazydata.buffer])
-    expected_out = (a.numpy() + a.numpy().sum()).sum()
-    np.testing.assert_equal(b.numpy(), expected_out)
+    with Context(DEBUG=0, TRACK_MATCH_STATS=0):
+      a = Tensor.randint(32, 32).realize()
+      # double reduce collapses to a single reduce
+      r = (a+a).sum(1).sum(0)
+      self.assertEqual(run_tensor_ast(r), (a.numpy()+a.numpy()).sum(1).sum(0))
 
   def test_single_swizzle(self):
-    # ast in tensor style
-    a = Tensor.randint(4,).realize()
-    expected_out = a.numpy().sum(0)+1
-    # LazyBuffer to pre-rewrite AST
-    bufs = [UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), (), i) for i in range(2)]
-    ld = UOp(Ops.LOAD, dtypes.int, (bufs[1], ShapeTracker.from_shape((4,)).to_uop()))
-    r = UOp(Ops.REDUCE_AXIS, dtypes.int, (ld,), (Ops.ADD, (0,)))
-    swizzle_r = UOp(Ops.VIEW, dtypes.int, (r,), unwrap(r.st).reshape(()))
-    alu = swizzle_r+1
-    sink = UOp(Ops.SINK, dtypes.void, (UOp(Ops.STORE, dtypes.void, (bufs[0], ShapeTracker.from_shape(()).to_uop(), alu,),),))
-    # graph rewrite
-    sink = swizzle_rewrite(sink)
-    # verify output
-    k = Kernel(sink)
-    p = k.to_program()
-    b = Tensor.empty((1,), dtype=dtypes.int).realize()
-    CompiledRunner(p).exec([b.lazydata.buffer, a.lazydata.buffer])
-    np.testing.assert_equal(b.numpy(), expected_out)
+    with Context(DEBUG=0, TRACK_MATCH_STATS=0):
+      a = Tensor.randint(4, 1).realize()
+      b = Tensor.ones((1, 1), dtype=a.dtype).contiguous().realize()
+      # ADD(REDUCE(RESHAPE(LOAD)), LOAD) to ADD(REDUCE(RESHAPE(LOAD))), RESHAPE(LOAD)
+      r = a.sum(0)+b
+      self.assertEqual(run_tensor_ast(r), a.numpy().sum(0)+1)
 
   def test_double_swizzle_possible(self):
-    # ast in tensor style
-    Tensor.manual_seed(0)
-    a = Tensor.randint(4,).realize()
-    b = Tensor.randint(4,).realize()
-    expected_out = a.numpy().sum(0)+b.numpy().sum(0)+2
-    # LazyBuffer to pre-rewrite AST
-    bufs = [UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), (), i) for i in range(3)]
-    ld1 = UOp(Ops.LOAD, dtypes.int, (bufs[1], ShapeTracker.from_shape((4,)).to_uop()))
-    r1 = UOp(Ops.REDUCE_AXIS, dtypes.int, (ld1,), (Ops.ADD, (0,)))
-    ld2 = UOp(Ops.LOAD, dtypes.int, (bufs[2], ShapeTracker.from_shape((4,)).to_uop()))
-    r2 = UOp(Ops.REDUCE_AXIS, dtypes.int, (ld2,), (Ops.ADD, (0,)))
-    alu = UOp(Ops.VIEW, r1.dtype, (r1,), ShapeTracker.from_shape(()))+UOp(Ops.VIEW, r2.dtype, (r2,), ShapeTracker.from_shape(()))
-    sink = UOp(Ops.SINK, dtypes.void, (UOp(Ops.STORE, dtypes.void, (bufs[0], ShapeTracker.from_shape(()).to_uop(), alu+2,),),)) # noqa: E501
-    # graph rewrite
-    sink = swizzle_rewrite(sink)
-    # verify output
-    k = Kernel(sink)
-    p = k.to_program()
-    c = Tensor.empty((1,), dtype=dtypes.int).realize()
-    CompiledRunner(p).exec([c.lazydata.buffer, a.lazydata.buffer, b.lazydata.buffer])
-    np.testing.assert_equal(c.numpy(), expected_out)
+    with Context(DEBUG=0, TRACK_MATCH_STATS=0):
+      Tensor.manual_seed(0)
+      a = Tensor.randint(4,).realize()
+      b = Tensor.randint(4,).realize()
+      # parallel reduce!
+      add = a.sum(0)+b.sum(0)
+      self.assertEqual(run_tensor_ast(add), a.numpy().sum(0)+b.numpy().sum(0))
+
+  # TODO: this is failing because it cannot resolve the final shape of two swizzled sources
+  @unittest.expectedFailure
+  def test_softmax(self):
+    with Context(DEBUG=0, TRACK_MATCH_STATS=0):
+      Tensor.manual_seed(0)
+      a = Tensor.randn(32, 32).realize()
+      t = a.softmax()
+      run_tensor_ast(t)
 
   def test_swizzle_rewrite_alt(self):
     swizzle = UOp(Ops.VIEW, dtypes.float, arg=ShapeTracker(views=(View(shape=(2, 3, 3, 65, 3, 65), strides=(103788, 34596, 3, 558, 1, 9), offset=0, mask=((0, 2), (0, 3), (0, 3), (0, 62), (0, 3), (0, 62)), contiguous=False), View(shape=(2, 3, 256, 256), strides=(114075, 38025, 195, 1), offset=0, mask=((0, 2), (0, 3), (0, 195), (0, 195)), contiguous=False), View(shape=(1, 2, 1, 3, 4, 64, 4, 64), strides=(0, 196608, 0, 65536, 16384, 256, 64, 1), offset=0, mask=None, contiguous=True))), src=( # noqa: E501
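
The rewrite rules in this patch all follow the same PatternMatcher shape, so the mechanism can be exercised in isolation. Below is a minimal sketch, not part of the patch, assuming only the tinygrad.ops API already imported at the top of test_schedule.py; it registers the same DETACH rule that sym adds on top of symbolic_simple and checks that graph_rewrite splices the node out of a small graph:

from tinygrad import dtypes
from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite

# sketch: the same rule `sym` adds on top of symbolic_simple — replace a
# DETACH node with its single source, since gradients don't matter at schedule time
drop_detach = PatternMatcher([
  (UPat(Ops.DETACH, name="x"), lambda x: x.src[0]),
])

c = UOp.const(dtypes.int, 1)             # a constant leaf
det = UOp(Ops.DETACH, dtypes.int, (c,))  # wrap it in a DETACH
out = graph_rewrite(det.sink(), drop_detach)
assert out.src[0] is c                   # the DETACH wrapper was rewritten away

graph_rewrite applies the matcher's rules repeatedly until no pattern matches, which is why sym, load_buffers and the view passes compose with + into a single rewrite, as run_tensor_ast does above.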
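For reference, each of the new tests reduces to the standalone pattern below. This is a sketch under the assumption that test/test_schedule.py is importable as a module, so the run_tensor_ast helper defined in this patch can be reused directly:

import numpy as np
from tinygrad import Tensor
from tinygrad.helpers import Context
from test.test_schedule import run_tensor_ast  # the helper added by this patch

with Context(DEBUG=0, TRACK_MATCH_STATS=0):
  a = Tensor.randint(16, 16).realize()
  r = (a+a).sum(1).sum(0)  # a double reduce that should collapse into one kernel
  # run_tensor_ast lowers the whole tensor graph into a single ScheduleItem,
  # runs it, and returns the realized output buffer as a python list
  np.testing.assert_equal(run_tensor_ast(r), (a.numpy()+a.numpy()).sum(1).sum(0))

Creating the output buffer up front and seeding bufs with it mirrors how the store target gets DEFINE_GLOBAL index 0, while loads discovered by load_buffers are numbered 1..n in discovery order via len(ctx).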