From aec4c4f01bdc578feafc9da651da5f81418f13b3 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Tue, 12 Mar 2024 00:39:04 +0200
Subject: [PATCH] linearizer ast as a tuple of lazyops (#3689)

* linearizer: Kernel accepts multiple store ops (variadic *ast of LazyOps)

* currently only one output per kernel is supported (asserted in Kernel.__init__)

* opts is now a keyword-only argument; call sites updated to pass opts=...
---
 docs/abstractions.py              | 2 +-
 examples/handcode_resnet50_opt.py | 6 +++---
 extra/optimization/helpers.py     | 2 +-
 test/test_linearizer.py           | 2 +-
 tinygrad/codegen/kernel.py        | 8 +++++---
 tinygrad/device.py                | 6 +++---
 6 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/docs/abstractions.py b/docs/abstractions.py
index eb66b7f8c9..82dfcfa81c 100644
--- a/docs/abstractions.py
+++ b/docs/abstractions.py
@@ -240,7 +240,7 @@ result = Tensor(2.0).realize() + Tensor(3.0).realize()
 from tinygrad.codegen.linearizer import Linearizer
 from tinygrad.realize import create_schedule
 sched = create_schedule([result.lazydata])
-linearizer = Linearizer(sched[-1].ast, ClangCompiler.linearizer_opts)
+linearizer = Linearizer(sched[-1].ast, opts=ClangCompiler.linearizer_opts)
 linearizer.linearize()
 
 # print the uops
diff --git a/examples/handcode_resnet50_opt.py b/examples/handcode_resnet50_opt.py
index 7f88087f1c..6da160df58 100644
--- a/examples/handcode_resnet50_opt.py
+++ b/examples/handcode_resnet50_opt.py
@@ -43,18 +43,18 @@ if __name__ == "__main__":
     lins:List[Linearizer] = []
 
     # always try hand coded opt
-    lin = Linearizer(si.ast, device.compiler.linearizer_opts)
+    lin = Linearizer(si.ast, opts=device.compiler.linearizer_opts)
     lin.hand_coded_optimizations()
     lins.append(lin)
 
     # maybe try tensor cores
-    lin = Linearizer(si.ast, device.compiler.linearizer_opts)
+    lin = Linearizer(si.ast, opts=device.compiler.linearizer_opts)
     if lin.apply_tensor_cores():
       lins.append(lin)
 
     # try a beam search
     if beam:=getenv("BEAM"):
-      lin = Linearizer(si.ast, device.compiler.linearizer_opts)
+      lin = Linearizer(si.ast, opts=device.compiler.linearizer_opts)
       lin = beam_search(lin, rawbufs, beam, bool(getenv("BEAM_ESTIMATE", 1)))
       lins.append(lin)
 
diff --git a/extra/optimization/helpers.py b/extra/optimization/helpers.py
index 241bae84c9..cefbba3eb6 100644
--- a/extra/optimization/helpers.py
+++ b/extra/optimization/helpers.py
@@ -9,7 +9,7 @@ inf, nan = float('inf'), float('nan')
 # kernel unpacker
 from tinygrad.codegen.linearizer import Linearizer
 def ast_str_to_ast(ast_str:str) -> LazyOp: return eval(ast_str)
-def ast_str_to_lin(ast_str:str, opts=None): return Linearizer(ast_str_to_ast(ast_str), opts)
+def ast_str_to_lin(ast_str:str, opts=None): return Linearizer(ast_str_to_ast(ast_str), opts=opts)
 
 # load worlds, a dataset of about 12k kernels
 import gzip
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index 961869eee9..adbe1617b9 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -204,7 +204,7 @@ class TestLinearizer(unittest.TestCase):
                    ConstBuffer(42, dtypes.float, ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),))))
       ast = LazyOp(BufferOps.STORE, (ast,),
                    MemBuffer(0, dtypes.float, ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),))))
-      lin = Linearizer(ast=ast) # this is a dummy ast
+      lin = Linearizer(ast) # this is a dummy ast
 
       lin.uops = UOpGraph()
       return lin.uops.add(uop, dtype, vin, arg, cachable=False)
diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py
index 0b8fd12b20..da250224b7 100644
--- a/tinygrad/codegen/kernel.py
+++ b/tinygrad/codegen/kernel.py
@@ -86,11 +86,13 @@ class LinearizerOptions(NamedTuple):
   local_max: Optional[List[int]] = None
 
 class Kernel:
-  def __init__(self, ast:LazyOp, opts:Optional[LinearizerOptions]=None):
+  def __init__(self, *ast:LazyOp, opts:Optional[LinearizerOptions]=None):
     self.opts = opts or (device.compiler.linearizer_opts if isinstance(device:=Device[Device.DEFAULT], Compiled) and device.compiler is not None else
                          LinearizerOptions(Device.DEFAULT))
-    self.ast = ast
-    assert ast.op == BufferOps.STORE, f"kernels must have a store as the output, got {ast.op}"
+    assert all(op.op is BufferOps.STORE for op in ast), f"kernels must have stores as the output, got {ast}"
+    assert len(set(op.arg.st.size for op in ast)) == 1, f"all outbufs should have the same size, got {[op.arg.st for op in ast]}"
+    assert len(ast) == 1, "max one output per kernel"
+    self.ast = ast[0]
 
     # fetch lazyop info
     self.info: FlopCounter = get_lazyop_info(self.ast)
diff --git a/tinygrad/device.py b/tinygrad/device.py
index b7d7ae002c..add5235f89 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -248,16 +248,16 @@ class Compiled:
       from tinygrad.features.graph import print_tree
       print_tree(ast)
     from tinygrad.codegen.linearizer import Linearizer
-    k = Linearizer(ast, self.compiler.linearizer_opts)
+    k = Linearizer(ast, opts=self.compiler.linearizer_opts)
     k.required_optimizations()
     if not NOOPT:
       if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
       if BEAM >= 1:
         lins = [(("tc" if used_tensor_cores else "hc"), k)]
         if used_tensor_cores:
-          lins.append(("hc", Linearizer(ast, self.compiler.linearizer_opts)))
+          lins.append(("hc", Linearizer(ast, opts=self.compiler.linearizer_opts)))
           lins[-1][1].hand_coded_optimizations()
-        kb = Linearizer(ast, self.compiler.linearizer_opts)
+        kb = Linearizer(ast, opts=self.compiler.linearizer_opts)
         kb.required_optimizations()
         from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
         test_rawbuffers = bufs_from_lin(kb)    # allocate scratch buffers for optimization