From f6f712e6093afbe02c72d0a6dfe21563558633c5 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Sun, 26 Nov 2023 09:07:37 -0800
Subject: [PATCH] split out the three steps of exec_ast (#2446)

* split out the three steps of exec_ast

* clean up extra args

* cleanups, bugfix

* allocate is a more normal name

* get_optimized_linearizer is better
---
 tinygrad/ops.py     | 29 +++++++++++++++--------------
 tinygrad/realize.py |  8 +++++++-
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index 2be284184c..7a8d342b43 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -144,7 +144,7 @@ def update_stats(name:str, op_estimate:sint, mem_estimate:sint, var_vals: Option
   GlobalCounters.global_mem += mem_estimate
   if et is not None: GlobalCounters.time_sum_s += et
 
-# **************** shared AST runner ****************
+# **************** shared Runner that can go in the JIT ****************
 
 class JITRunner:
   def __init__(self):
@@ -182,10 +182,12 @@ class Interpreted:
     self.synchronize, self.codegen, self.graph = lambda: None, None, None
     self.method_cache: Dict[LazyOp, InterpretedASTRunner] = {}
 
-  def exec_ast(self, ast:LazyOp, output:LazyBuffer, inputs:Tuple[LazyBuffer, ...], var_vals:Dict[Variable, int], **kwargs):
-    if ast not in self.method_cache: self.method_cache[ast] = get_interpreted_fxn(self.fxn_for_op, ast)
+  def allocate_output(self, ast:LazyOp, output:LazyBuffer, inputs:Tuple[LazyBuffer, ...]):
     output.realized = output.output_buffer if output.output_buffer is not None else self.buffer.__new__(self.buffer)
-    self.method_cache[ast].exec([output.realized] + [x.realized for x in inputs], var_vals)
+
+  def get_runner(self, ast:LazyOp, rawbuffers:List[RawBuffer]) -> InterpretedASTRunner:
+    if ast not in self.method_cache or getenv("DISABLE_METHOD_CACHE"): self.method_cache[ast] = get_interpreted_fxn(self.fxn_for_op, ast)
+    return self.method_cache[ast]
 
 def get_interpreted_fxn(fxn_for_op:Dict[Op, Callable], ast:LazyOp) -> InterpretedASTRunner:
   if DEBUG >= 3:
@@ -277,11 +279,12 @@ class Compiled:
     src, runtime_args = self.renderer(to_function_name(k.name), k.uops)
     return CompiledASTRunner(k.ast, k.name, src, k.global_size, k.local_size, runtime_args).build(self.compiler, self.runtime)
 
-  def exec_ast(self, ast:LazyOp, output:LazyBuffer, inputs:Tuple[LazyBuffer, ...], var_vals:Dict[Variable, int], **kwargs):
+  def allocate_output(self, ast:LazyOp, output:LazyBuffer, inputs:Tuple[LazyBuffer, ...]):
     # check if we can reuse the output buffer
     # if it's aliased, don't use it
     # TODO: this is pretty wrong actually, who knows where else this buffer is used?
     # TODO: what if an assign is required? this silently is wrong
+    # TODO: this logic just doesn't belong here
     output.realized = output.output_buffer
     if output.realized is not None:
       for i,a in enumerate(inputs):
@@ -293,16 +296,14 @@ class Compiled:
 
     # we don't have an output buffer, we have to create it, and create to max size if it has symbolic shape
     if output.realized is None:
-      output.realized = self.buffer(prod((s if isinstance(s, int) else s.max for s in output.shape)), output.dtype, **kwargs)
-      if output.realized.size == 0: return output.realized
+      output.realized = self.buffer(prod((s if isinstance(s, int) else s.max for s in output.shape)), output.dtype, **output._device_extra_args())
 
-    # all the rawbuffers
-    rawbuffers = [output.realized] + [x.realized for x in inputs]
+  # TODO: the rawbuffers are only used for optimization, they should be removed and optimizer should realloc
+  def get_runner(self, ast:LazyOp, rawbuffers:List[RawBuffer]) -> CompiledASTRunner:
+    if ast not in self.method_cache or getenv("DISABLE_METHOD_CACHE"): self.method_cache[ast] = self.to_program(get_optimized_linearizer(ast, self.linearizer_opts, rawbuffers))
+    return self.method_cache[ast]
 
-    if ast not in self.method_cache or getenv("DISABLE_METHOD_CACHE"): self.method_cache[ast] = get_optimized_program(self.linearizer_opts, self.to_program, ast, rawbuffers)
-    self.method_cache[ast].exec(rawbuffers, var_vals)
-
-def get_optimized_program(linearizer_opts:LinearizerOptions, to_program, ast:LazyOp, rawbuffers:List[RawBuffer]) -> CompiledASTRunner:
+def get_optimized_linearizer(ast:LazyOp, linearizer_opts:LinearizerOptions, rawbuffers:List[RawBuffer]) -> Linearizer:
   if DEBUG >= 3:
     from tinygrad.graph import print_tree
     print_tree(ast)
@@ -327,4 +328,4 @@ def get_optimized_program(linearizer_opts:LinearizerOptions, to_program, ast:Laz
     k = timed[0][1]
   else:
     k.required_optimizations()
-  return to_program(k)
+  return k
diff --git a/tinygrad/realize.py b/tinygrad/realize.py
index 0f325c991b..df5c42e76e 100644
--- a/tinygrad/realize.py
+++ b/tinygrad/realize.py
@@ -21,7 +21,13 @@ def run_schedule(schedule:List[ScheduleItem], disable_logging=False):
       LOAD_OPS_DISPATCHER[cast(LoadOps, si.ast.op)](si.out, *si.inputs)
     else:
       assert all(si.out.device == x.device for x in si.inputs), f"all devices must be the same, {si.out.device} != {[x.device for x in si.inputs]} {print_tree(si.ast) or ''}"
-      Device[si.out.device].exec_ast(si.ast, output=si.out, inputs=si.inputs, var_vals=si.var_vals, **si.out._device_extra_args())
+      # TODO: allocate_output should be at the top of this function for global memory management
+      Device[si.out.device].allocate_output(si.ast, si.out, si.inputs)
+      # TODO: should this be handled here? it probably just shouldn't be in the schedule
+      if not hasattr(si.out.realized, 'size') or si.out.realized.size != 0:
+        rawbuffers = [si.out.realized] + [x.realized for x in si.inputs]
+        # TODO: remove rawbuffers from get_runner, optimizer should reallocate them
+        Device[si.out.device].get_runner(si.ast, rawbuffers).exec(rawbuffers, si.var_vals)
     del si.out.op
     for v in si.out.views: del v.op
     assert si.out.realized and isinstance(si.out.realized, Device[si.out.device].buffer), f"device mismatch on realized got {type(si.out.realized)} expected {si.out.device}"
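
Note on the new flow (not part of the patch itself): exec_ast used to allocate the output buffer, build or fetch a cached program, and execute it in one call. After this change those become the three separate steps allocate_output, get_runner, and JITRunner.exec. Below is a minimal sketch of a caller stringing them together, mirroring the run_schedule change above; the helper name exec_ast_split is hypothetical, while the device methods, the ScheduleItem fields, and the zero-size guard follow what the patch shows.

  def exec_ast_split(device, si):
    # step 1: allocate (or reuse) the output buffer for this schedule item
    device.allocate_output(si.ast, si.out, si.inputs)
    # zero-size outputs are skipped, matching the guard in run_schedule
    if hasattr(si.out.realized, 'size') and si.out.realized.size == 0: return
    # step 2: look up (or build and cache) the runner for this AST
    rawbuffers = [si.out.realized] + [x.realized for x in si.inputs]
    runner = device.get_runner(si.ast, rawbuffers)
    # step 3: execute the runner against the raw buffers
    runner.exec(rawbuffers, si.var_vals)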