From aec4c4f01bdc578feafc9da651da5f81418f13b3 Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Tue, 12 Mar 2024 00:39:04 +0200 Subject: [PATCH] linearizer ast as a tuple of lazyops (#3689) * multi store op linearizer * currently we do only one output per kernel * named opts --- docs/abstractions.py | 2 +- examples/handcode_resnet50_opt.py | 6 +++--- extra/optimization/helpers.py | 2 +- test/test_linearizer.py | 2 +- tinygrad/codegen/kernel.py | 8 +++++--- tinygrad/device.py | 6 +++--- 6 files changed, 14 insertions(+), 12 deletions(-) diff --git a/docs/abstractions.py b/docs/abstractions.py index eb66b7f8c9..82dfcfa81c 100644 --- a/docs/abstractions.py +++ b/docs/abstractions.py @@ -240,7 +240,7 @@ result = Tensor(2.0).realize() + Tensor(3.0).realize() from tinygrad.codegen.linearizer import Linearizer from tinygrad.realize import create_schedule sched = create_schedule([result.lazydata]) -linearizer = Linearizer(sched[-1].ast, ClangCompiler.linearizer_opts) +linearizer = Linearizer(sched[-1].ast, opts=ClangCompiler.linearizer_opts) linearizer.linearize() # print the uops diff --git a/examples/handcode_resnet50_opt.py b/examples/handcode_resnet50_opt.py index 7f88087f1c..6da160df58 100644 --- a/examples/handcode_resnet50_opt.py +++ b/examples/handcode_resnet50_opt.py @@ -43,18 +43,18 @@ if __name__ == "__main__": lins:List[Linearizer] = [] # always try hand coded opt - lin = Linearizer(si.ast, device.compiler.linearizer_opts) + lin = Linearizer(si.ast, opts=device.compiler.linearizer_opts) lin.hand_coded_optimizations() lins.append(lin) # maybe try tensor cores - lin = Linearizer(si.ast, device.compiler.linearizer_opts) + lin = Linearizer(si.ast, opts=device.compiler.linearizer_opts) if lin.apply_tensor_cores(): lins.append(lin) # try a beam search if beam:=getenv("BEAM"): - lin = Linearizer(si.ast, device.compiler.linearizer_opts) + lin = Linearizer(si.ast, opts=device.compiler.linearizer_opts) lin = beam_search(lin, rawbufs, beam, bool(getenv("BEAM_ESTIMATE", 1))) lins.append(lin) diff --git a/extra/optimization/helpers.py b/extra/optimization/helpers.py index 241bae84c9..cefbba3eb6 100644 --- a/extra/optimization/helpers.py +++ b/extra/optimization/helpers.py @@ -9,7 +9,7 @@ inf, nan = float('inf'), float('nan') # kernel unpacker from tinygrad.codegen.linearizer import Linearizer def ast_str_to_ast(ast_str:str) -> LazyOp: return eval(ast_str) -def ast_str_to_lin(ast_str:str, opts=None): return Linearizer(ast_str_to_ast(ast_str), opts) +def ast_str_to_lin(ast_str:str, opts=None): return Linearizer(ast_str_to_ast(ast_str), opts=opts) # load worlds, a dataset of about 12k kernels import gzip diff --git a/test/test_linearizer.py b/test/test_linearizer.py index 961869eee9..adbe1617b9 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -204,7 +204,7 @@ class TestLinearizer(unittest.TestCase): ConstBuffer(42, dtypes.float, ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)))) ast = LazyOp(BufferOps.STORE, (ast,), MemBuffer(0, dtypes.float, ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)))) - lin = Linearizer(ast=ast) # this is a dummy ast + lin = Linearizer(ast) # this is a dummy ast lin.uops = UOpGraph() return lin.uops.add(uop, dtype, vin, arg, cachable=False) diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 0b8fd12b20..da250224b7 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -86,11 +86,13 @@ class LinearizerOptions(NamedTuple): local_max: Optional[List[int]] = None class Kernel: - def __init__(self, ast:LazyOp, opts:Optional[LinearizerOptions]=None): + def __init__(self, *ast:LazyOp, opts:Optional[LinearizerOptions]=None): self.opts = opts or (device.compiler.linearizer_opts if isinstance(device:=Device[Device.DEFAULT], Compiled) and device.compiler is not None else LinearizerOptions(Device.DEFAULT)) - self.ast = ast - assert ast.op == BufferOps.STORE, f"kernels must have a store as the output, got {ast.op}" + assert all(op.op is BufferOps.STORE for op in ast), f"kernels must have stores as the output, got {ast}" + assert len(set(op.arg.st.size for op in ast)) == 1, f"all outbufs should have the same size, got {[op.arg.st for op in ast]}" + assert len(ast) == 1, "max one output per kernel" + self.ast = ast[0] # fetch lazyop info self.info: FlopCounter = get_lazyop_info(self.ast) diff --git a/tinygrad/device.py b/tinygrad/device.py index b7d7ae002c..add5235f89 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -248,16 +248,16 @@ class Compiled: from tinygrad.features.graph import print_tree print_tree(ast) from tinygrad.codegen.linearizer import Linearizer - k = Linearizer(ast, self.compiler.linearizer_opts) + k = Linearizer(ast, opts=self.compiler.linearizer_opts) k.required_optimizations() if not NOOPT: if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations() if BEAM >= 1: lins = [(("tc" if used_tensor_cores else "hc"), k)] if used_tensor_cores: - lins.append(("hc", Linearizer(ast, self.compiler.linearizer_opts))) + lins.append(("hc", Linearizer(ast, opts=self.compiler.linearizer_opts))) lins[-1][1].hand_coded_optimizations() - kb = Linearizer(ast, self.compiler.linearizer_opts) + kb = Linearizer(ast, opts=self.compiler.linearizer_opts) kb.required_optimizations() from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin test_rawbuffers = bufs_from_lin(kb) # allocate scratch buffers for optimization