From 2d0c1037b1d06df72c021d31775dca7bf5a05d72 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Thu, 5 Oct 2023 05:24:28 -0700
Subject: [PATCH] Fix up latest openpilot model (#1976)

* fix gemv triggering for gemm

* fixup_openpilot

* external test issues
---
 .github/workflows/test.yml                    |  3 ++
 openpilot/compile.py                          |  9 +---
 openpilot/go.sh                               |  2 +-
 .../external_test_allocator_on_models.py      |  2 +-
 test/external/external_test_jit_on_models.py  |  2 +-
 test/models/test_real_world.py                |  2 +-
 tinygrad/codegen/linearizer.py                |  4 +-
 tinygrad/lazy.py                              | 12 ++---
 tinygrad/ops.py                               |  3 +-
 tinygrad/realize.py                           | 44 ++++++++++++++++++-
 10 files changed, 58 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9fc3a907c8..670320afdc 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -151,6 +151,9 @@ jobs:
       - if: ${{ matrix.task == 'openpilot' }}
         name: Test openpilot model correctness (float32)
         run: DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile.py
+      - if: ${{ matrix.task == 'openpilot' }}
+        name: Test openpilot alt model correctness (float32)
+        run: DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx
       - if: ${{ matrix.task == 'openpilot' }}
         name: Test tensor core ops
         run: GPU=1 TC=2 python -m pytest -n=auto test/test_ops.py
diff --git a/openpilot/compile.py b/openpilot/compile.py
index 708fe6fe8b..92e75e8cbd 100644
--- a/openpilot/compile.py
+++ b/openpilot/compile.py
@@ -145,10 +145,5 @@ def compile(dat, output_fn):
 # UNSAFE_FLOAT4=1 DEBUGCL=1 FLOAT16=1 python3 openpilot/compile.py
 # 22.59 ms
 if __name__ == "__main__":
-  if len(sys.argv) >= 3:
-    with open(sys.argv[1], "rb") as f:
-      dat = f.read()
-    compile(dat, sys.argv[2])
-  else:
-    dat = fetch(OPENPILOT_MODEL)
-    compile(dat, "/tmp/output.thneed")
+  dat = fetch(OPENPILOT_MODEL if len(sys.argv) == 1 else sys.argv[1])
+  compile(dat, sys.argv[2] if len(sys.argv) >= 3 else "/tmp/output.thneed")
diff --git a/openpilot/go.sh b/openpilot/go.sh
index dc334f365b..0c7639fc55 100755
--- a/openpilot/go.sh
+++ b/openpilot/go.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-FLOAT16=1 DEBUGCL=1 VALIDHACKS=1 IMAGE=2 GPU=1 ENABLE_METHOD_CACHE=1 python3 openpilot/compile.py
+FLOAT16=1 DEBUGCL=1 IMAGE=2 GPU=1 python3 openpilot/compile.py
diff --git a/test/external/external_test_allocator_on_models.py b/test/external/external_test_allocator_on_models.py
index 4c8d9da48b..105177d11a 100644
--- a/test/external/external_test_allocator_on_models.py
+++ b/test/external/external_test_allocator_on_models.py
@@ -90,7 +90,7 @@ def check_gc():
 def derandomize(x):
   if isinstance(x, LazyOp):
     if x.op == LoadOps.RAND: x.op = LoadOps.EMPTY
-    x.src = [derandomize(s) for s in x.src]
+    x.src = tuple([derandomize(s) for s in x.src])
   else:
     x.op = derandomize(x.op)
   return x
diff --git a/test/external/external_test_jit_on_models.py b/test/external/external_test_jit_on_models.py
index 48bc94880f..2638422f88 100644
--- a/test/external/external_test_jit_on_models.py
+++ b/test/external/external_test_jit_on_models.py
@@ -14,7 +14,7 @@ from examples.llama import Transformer
 def derandomize(x):
   if isinstance(x, LazyOp):
     if x.op == LoadOps.RAND: x.op = LoadOps.EMPTY
-    x.src = [derandomize(s) for s in x.src]
+    x.src = tuple([derandomize(s) for s in x.src])
   else:
     x.op = derandomize(x.op)
   return x
diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py
index 3e60404ac4..3e8782fb2d 100644
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -55,7 +55,7 @@ def helper_test(nm, gen, train, max_memory_allowed, max_kernels_allowed):
 def derandomize(x):
   if isinstance(x, LazyOp):
     if x.op == LoadOps.RAND: x.op = LoadOps.EMPTY
-    x.src = [derandomize(s) for s in x.src]
+    x.src = tuple([derandomize(s) for s in x.src])
   elif hasattr(x, "op"):
     x.op = derandomize(x.op)
   return x
diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py
index 8c43bd3517..e17a4f27c5 100644
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -4,7 +4,7 @@ import itertools, math, functools
 from collections import defaultdict
 from enum import Enum, auto
 
-from tinygrad.helpers import colored, ImageDType, DEBUG, dtypes, DType, prod, PtrDType, all_same
+from tinygrad.helpers import colored, ImageDType, DEBUG, dtypes, DType, prod, PtrDType, all_same, getenv
 from tinygrad.ops import LazyOp, UnaryOps, ConstBuffer, MemBuffer, BufferOps
 from tinygrad.ops import ReduceOps, BinaryOps, TernaryOps
 from tinygrad.shape.shapetracker import ShapeTracker
@@ -88,7 +88,7 @@ def to_image_idx(base_shape:Tuple[int, ...], idxy:Node, valid:Node) -> Tuple[Tup
   # This is the slow part
   # This part is for brute forcing all possible values of idx, idy and valid
   # If valid is both 0 and 1 for the same (idx, idy) we can not delete the valid
-  if valid.min == 0 and not isinstance(idx, ModNode):
+  if getenv("VALIDHACKS", 1) and valid.min == 0 and not isinstance(idx, ModNode):
     variables = tuple(val_vars | idy_vars | idx_vars)
     val_infer, idx_infer, idy_infer = valid.expand(variables), idx.expand(variables), idy.expand(variables)
     val_dict: Dict[int, Set[Tuple[int,int]]] = {0:set(), 1:set()}
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index 19475406dd..ea51adc225 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -4,7 +4,7 @@ from typing import Callable, Optional, Tuple, Union, List, Dict, Any, cast, Mapp
 from weakref import ref, WeakSet, WeakValueDictionary
 
 import numpy as np
-from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, ImageDType, partition, dedup, merge_dicts
+from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, partition, dedup, merge_dicts
 from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, MemBuffer, ConstBuffer, BufferOps
 from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
 from tinygrad.shape.symbolic import Variable, sint
@@ -75,7 +75,7 @@ def _replace_bufferops(op:LazyOp) -> Tuple[LazyOp, List[LazyBuffer]]:
 
 # **** lazy operations ****
 
-def get_single_root(root:LazyBuffer) -> LazyBuffer: return get_single_root(cast(LazyBuffer, root.op.src[0])) if getattr(root, 'op', None) and len(root.op.src) == 1 else root
+def get_single_root(root:LazyBuffer) -> LazyBuffer: return get_single_root(cast(LazyBuffer, root.op.src[0])) if getattr(root, 'op', None) and len(root.op.src) == 1 and isinstance(root.op.src[0], LazyBuffer) else root
 def get_movementroot(root:LazyBuffer, allow_contiguous=False) -> LazyBuffer: return get_movementroot(cast(LazyBuffer, root.op.src[0]), allow_contiguous) if not root.realized and (root.optype == MovementOps or (root.op.op == LoadOps.CONTIGUOUS and allow_contiguous and root.op.src[0].st.contiguous)) else root
 def get_movementroot_contiguous(x:LazyBuffer) -> LazyBuffer: return get_movementroot_contiguous(cast(LazyBuffer, x.op.src[0])) if not x.realized and x.op.op == LoadOps.CONTIGUOUS else (get_movementroot(x, True) if x.optype == MovementOps and x.st.contiguous else x)
 
@@ -152,7 +152,7 @@ class LazyBuffer:
 
   @property
   def buffers(self) -> Tuple[LazyBuffer, ...]: return (self,)
-  def map_buffers(self, real_srcs: Mapping[LazyBuffer, Union[LazyBuffer, LazyOp]]): return real_srcs.get(self, self)
+  def map_buffers(self, real_srcs: Mapping[Any, Union[LazyBuffer, LazyOp]]): return real_srcs.get(self, self)
   def get_lazyops(self) -> List[LazyOp]: return []
 
   # *** scheduling ***
@@ -168,12 +168,6 @@ class LazyBuffer:
     if self.optype is BinaryOps: op = _ast_binaryops(op, self.shape)
     elif self.optype is ReduceOps: op = _ast_reduceops(op)
 
-    # HACK: image shape can be wrong, hot cast it back to a normal float
-    if isinstance(self.dtype, ImageDType) and (prod(self.shape) != prod(self.dtype.shape) or not any(self.shape[x]%4 == 0 for x in self.st.unit_stride_axes())):
-      if op.op == MovementOps.RESHAPE: op = LazyOp(MovementOps.RESHAPE, (LazyOp(UnaryOps.CAST, op.src, (dtypes.float32, False)),), op.arg)
-      else: op = LazyOp(UnaryOps.CAST, (op,), (dtypes.float32, False))
-      self.dtype = dtypes.float32
-
     # realize the past and exec the AST
     ret = []
     for x in op.buffers: ret += x.schedule(seen)
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index f0726b24dd..5ac127afa1 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -56,7 +56,7 @@ class LazyOp:
   @property
   def key(self): return (self.op, tuple(map(lambda x: getattr(x, "key", x), self.src)), getattr(self.arg, "key", self.arg))
 
-  def map_buffers(self, real_srcs: Mapping[LazyBuffer, Union[LazyBuffer, LazyOp]]) -> LazyOp: return LazyOp(self.op, tuple([y.map_buffers(real_srcs) for y in self.src]), self.arg)
+  def map_buffers(self, real_srcs: Mapping[Any, Union[LazyBuffer, LazyOp]]) -> LazyOp: return LazyOp(self.op, tuple([y.map_buffers(real_srcs) if y not in real_srcs else real_srcs[y] for y in self.src]), self.arg)
   def get_lazyops(self) -> List[LazyOp]: return [self] + [item for x in self.src for item in x.get_lazyops()]
 
   def replace_with_movement_ops(self:LazyOp, ops:List[Tuple[MovementOps, Tuple[Any, ...]]]) -> 'LazyBuffer':
@@ -241,6 +241,7 @@ class Compiled:
     def get_program():
       from tinygrad.codegen.linearizer import Linearizer
       k = Linearizer(ast, self.linearizer_opts, var_vals)
+      assert k.info.dtype == output.dtype, f"linearizer must match dtype. linearizer wants {k.info.dtype} but buffer is {output.dtype}"
       from tinygrad.codegen.search import kernel_optimize
       if getenv("KOPT"): kernel_optimize(k, lambda: Linearizer(ast, self.linearizer_opts, var_vals), self.to_program, rawbuffers, key)
       elif not getenv("NOOPT"): k.hand_coded_optimizations()
diff --git a/tinygrad/realize.py b/tinygrad/realize.py
index 0dcad8a3d6..020c1a7bb6 100644
--- a/tinygrad/realize.py
+++ b/tinygrad/realize.py
@@ -1,16 +1,56 @@
 from typing import List, Tuple, cast, Dict, Callable
 import numpy as np
-from tinygrad.ops import LazyOp, LoadOps, BufferOps, Device
+from tinygrad.ops import LazyOp, LoadOps, Device, UnaryOps, BufferOps, MemBuffer, get_lazyop_info
 from tinygrad.graph import log_schedule_item
 from tinygrad.lazy import LazyBuffer
-from tinygrad.helpers import DEBUG, prod, all_int, getenv
+from tinygrad.helpers import DEBUG, prod, all_int, getenv, IMAGE, ImageDType, dtypes
 
 from tinygrad.runtime.lib import RawBufferMapped, RawBufferTransfer
 from tinygrad.runtime.ops_disk import RawDiskBuffer
 
 P2P = getenv("P2P", 0)
 
+def fix_schedule_for_images(schedule:List[Tuple[LazyOp, LazyBuffer, Tuple[LazyBuffer, ...]]]):
+  # this is the fundamental fix, find unwritable or unreadable images and convert them to normal float32 (TODO: should it be float16?)
+  for op,out,buffers in schedule:
+    if isinstance(out.dtype, ImageDType) and (prod(out.shape) != prod(out.dtype.shape) or not any(out.shape[x]%4 == 0 for x in out.st.unit_stride_axes())):
+      out.dtype = dtypes.float32
+    bops = [x for x in op.get_lazyops() if x.op == BufferOps.MEM]
+    for b in bops:
+      if isinstance(buffers[b.arg.idx-1].dtype, ImageDType) and (b.arg.st.real_offset() % 4 != 0 or not any(b.arg.st.shape[x]%4 == 0 for x in b.arg.st.unit_stride_axes())):
+        buffers[b.arg.idx-1].dtype = dtypes.float32
+
+  # fix the contiguous dtype, no cast required
+  for op,out,buffers in schedule:
+    if op.op == LoadOps.CONTIGUOUS and out.dtype != buffers[0].dtype:
+      out.dtype = buffers[0].dtype = dtypes.float32
+
+  # now fix up the schedule to reflect the new dtypes
+  fixed_schedule = []
+  for op,out,buffers in schedule:
+    # fix input dtypes to match what they actually are
+    bops = [x for x in op.get_lazyops() if x.op == BufferOps.MEM]
+    replacements = {}
+    for x in bops:
+      if x.arg.dtype != buffers[x.arg.idx-1].dtype:
+        replacements[x] = LazyOp(BufferOps.MEM, (), MemBuffer(x.arg.idx, buffers[x.arg.idx-1].dtype, x.arg.st))
+    if replacements: op = op.map_buffers(replacements)
+
+    # fix the ops to create the output dtype
+    if op.op not in LoadOps:
+      info = get_lazyop_info(op)
+      if info.dtype != out.dtype:
+        op = LazyOp(UnaryOps.CAST, (op,), (out.dtype, False))
+
+    # put this in the fixed schedule
+    fixed_schedule.append((op, out, buffers))
+  return fixed_schedule
+
+
 def run_schedule(schedule:List[Tuple[LazyOp, LazyBuffer, Tuple[LazyBuffer, ...]]]):
+  # HACK: images can be not usable due to shape
+  if IMAGE >= 2: schedule = fix_schedule_for_images(schedule)
+
   # NOTE: if you for loop the schedule it's slow because nothing frees
   while len(schedule):
     op,out,buffers = schedule.pop(0)