From 2d0c1037b1d06df72c021d31775dca7bf5a05d72 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Thu, 5 Oct 2023 05:24:28 -0700 Subject: [PATCH] Fix up latest openpilot model (#1976) * fix gemv triggering for gemm * fixup_openpilot * external test issues --- .github/workflows/test.yml | 3 ++ openpilot/compile.py | 9 +--- openpilot/go.sh | 2 +- .../external_test_allocator_on_models.py | 2 +- test/external/external_test_jit_on_models.py | 2 +- test/models/test_real_world.py | 2 +- tinygrad/codegen/linearizer.py | 4 +- tinygrad/lazy.py | 12 ++--- tinygrad/ops.py | 3 +- tinygrad/realize.py | 44 ++++++++++++++++++- 10 files changed, 58 insertions(+), 25 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9fc3a907c8..670320afdc 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -151,6 +151,9 @@ jobs: - if: ${{ matrix.task == 'openpilot' }} name: Test openpilot model correctness (float32) run: DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile.py + - if: ${{ matrix.task == 'openpilot' }} + name: Test openpilot alt model correctness (float32) + run: DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx - if: ${{ matrix.task == 'openpilot' }} name: Test tensor core ops run: GPU=1 TC=2 python -m pytest -n=auto test/test_ops.py diff --git a/openpilot/compile.py b/openpilot/compile.py index 708fe6fe8b..92e75e8cbd 100644 --- a/openpilot/compile.py +++ b/openpilot/compile.py @@ -145,10 +145,5 @@ def compile(dat, output_fn): # UNSAFE_FLOAT4=1 DEBUGCL=1 FLOAT16=1 python3 openpilot/compile.py # 22.59 ms if __name__ == "__main__": - if len(sys.argv) >= 3: - with open(sys.argv[1], "rb") as f: - dat = f.read() - compile(dat, sys.argv[2]) - else: - dat = fetch(OPENPILOT_MODEL) - compile(dat, "/tmp/output.thneed") + dat = fetch(OPENPILOT_MODEL if len(sys.argv) == 1 else sys.argv[1]) + compile(dat, sys.argv[2] if len(sys.argv) >= 3 else "/tmp/output.thneed") diff --git a/openpilot/go.sh b/openpilot/go.sh index dc334f365b..0c7639fc55 100755 --- a/openpilot/go.sh +++ b/openpilot/go.sh @@ -1,2 +1,2 @@ #!/bin/bash -FLOAT16=1 DEBUGCL=1 VALIDHACKS=1 IMAGE=2 GPU=1 ENABLE_METHOD_CACHE=1 python3 openpilot/compile.py +FLOAT16=1 DEBUGCL=1 IMAGE=2 GPU=1 python3 openpilot/compile.py diff --git a/test/external/external_test_allocator_on_models.py b/test/external/external_test_allocator_on_models.py index 4c8d9da48b..105177d11a 100644 --- a/test/external/external_test_allocator_on_models.py +++ b/test/external/external_test_allocator_on_models.py @@ -90,7 +90,7 @@ def check_gc(): def derandomize(x): if isinstance(x, LazyOp): if x.op == LoadOps.RAND: x.op = LoadOps.EMPTY - x.src = [derandomize(s) for s in x.src] + x.src = tuple([derandomize(s) for s in x.src]) else: x.op = derandomize(x.op) return x diff --git a/test/external/external_test_jit_on_models.py b/test/external/external_test_jit_on_models.py index 48bc94880f..2638422f88 100644 --- a/test/external/external_test_jit_on_models.py +++ b/test/external/external_test_jit_on_models.py @@ -14,7 +14,7 @@ from examples.llama import Transformer def derandomize(x): if isinstance(x, LazyOp): if x.op == LoadOps.RAND: x.op = LoadOps.EMPTY - x.src = [derandomize(s) for s in x.src] + x.src = tuple([derandomize(s) for s in x.src]) else: x.op = derandomize(x.op) return x diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py index 3e60404ac4..3e8782fb2d 100644 --- a/test/models/test_real_world.py +++ b/test/models/test_real_world.py @@ -55,7 +55,7 @@ def helper_test(nm, gen, train, max_memory_allowed, max_kernels_allowed): def derandomize(x): if isinstance(x, LazyOp): if x.op == LoadOps.RAND: x.op = LoadOps.EMPTY - x.src = [derandomize(s) for s in x.src] + x.src = tuple([derandomize(s) for s in x.src]) elif hasattr(x, "op"): x.op = derandomize(x.op) return x diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py index 8c43bd3517..e17a4f27c5 100644 --- a/tinygrad/codegen/linearizer.py +++ b/tinygrad/codegen/linearizer.py @@ -4,7 +4,7 @@ import itertools, math, functools from collections import defaultdict from enum import Enum, auto -from tinygrad.helpers import colored, ImageDType, DEBUG, dtypes, DType, prod, PtrDType, all_same +from tinygrad.helpers import colored, ImageDType, DEBUG, dtypes, DType, prod, PtrDType, all_same, getenv from tinygrad.ops import LazyOp, UnaryOps, ConstBuffer, MemBuffer, BufferOps from tinygrad.ops import ReduceOps, BinaryOps, TernaryOps from tinygrad.shape.shapetracker import ShapeTracker @@ -88,7 +88,7 @@ def to_image_idx(base_shape:Tuple[int, ...], idxy:Node, valid:Node) -> Tuple[Tup # This is the slow part # This part is for brute forcing all possible values of idx, idy and valid # If valid is both 0 and 1 for the same (idx, idy) we can not delete the valid - if valid.min == 0 and not isinstance(idx, ModNode): + if getenv("VALIDHACKS", 1) and valid.min == 0 and not isinstance(idx, ModNode): variables = tuple(val_vars | idy_vars | idx_vars) val_infer, idx_infer, idy_infer = valid.expand(variables), idx.expand(variables), idy.expand(variables) val_dict: Dict[int, Set[Tuple[int,int]]] = {0:set(), 1:set()} diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py index 19475406dd..ea51adc225 100644 --- a/tinygrad/lazy.py +++ b/tinygrad/lazy.py @@ -4,7 +4,7 @@ from typing import Callable, Optional, Tuple, Union, List, Dict, Any, cast, Mapp from weakref import ref, WeakSet, WeakValueDictionary import numpy as np -from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, ImageDType, partition, dedup, merge_dicts +from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, partition, dedup, merge_dicts from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, MemBuffer, ConstBuffer, BufferOps from tinygrad.shape.shapetracker import ShapeTracker, get_contraction from tinygrad.shape.symbolic import Variable, sint @@ -75,7 +75,7 @@ def _replace_bufferops(op:LazyOp) -> Tuple[LazyOp, List[LazyBuffer]]: # **** lazy operations **** -def get_single_root(root:LazyBuffer) -> LazyBuffer: return get_single_root(cast(LazyBuffer, root.op.src[0])) if getattr(root, 'op', None) and len(root.op.src) == 1 else root +def get_single_root(root:LazyBuffer) -> LazyBuffer: return get_single_root(cast(LazyBuffer, root.op.src[0])) if getattr(root, 'op', None) and len(root.op.src) == 1 and isinstance(root.op.src[0], LazyBuffer) else root def get_movementroot(root:LazyBuffer, allow_contiguous=False) -> LazyBuffer: return get_movementroot(cast(LazyBuffer, root.op.src[0]), allow_contiguous) if not root.realized and (root.optype == MovementOps or (root.op.op == LoadOps.CONTIGUOUS and allow_contiguous and root.op.src[0].st.contiguous)) else root def get_movementroot_contiguous(x:LazyBuffer) -> LazyBuffer: return get_movementroot_contiguous(cast(LazyBuffer, x.op.src[0])) if not x.realized and x.op.op == LoadOps.CONTIGUOUS else (get_movementroot(x, True) if x.optype == MovementOps and x.st.contiguous else x) @@ -152,7 +152,7 @@ class LazyBuffer: @property def buffers(self) -> Tuple[LazyBuffer, ...]: return (self,) - def map_buffers(self, real_srcs: Mapping[LazyBuffer, Union[LazyBuffer, LazyOp]]): return real_srcs.get(self, self) + def map_buffers(self, real_srcs: Mapping[Any, Union[LazyBuffer, LazyOp]]): return real_srcs.get(self, self) def get_lazyops(self) -> List[LazyOp]: return [] # *** scheduling *** @@ -168,12 +168,6 @@ class LazyBuffer: if self.optype is BinaryOps: op = _ast_binaryops(op, self.shape) elif self.optype is ReduceOps: op = _ast_reduceops(op) - # HACK: image shape can be wrong, hot cast it back to a normal float - if isinstance(self.dtype, ImageDType) and (prod(self.shape) != prod(self.dtype.shape) or not any(self.shape[x]%4 == 0 for x in self.st.unit_stride_axes())): - if op.op == MovementOps.RESHAPE: op = LazyOp(MovementOps.RESHAPE, (LazyOp(UnaryOps.CAST, op.src, (dtypes.float32, False)),), op.arg) - else: op = LazyOp(UnaryOps.CAST, (op,), (dtypes.float32, False)) - self.dtype = dtypes.float32 - # realize the past and exec the AST ret = [] for x in op.buffers: ret += x.schedule(seen) diff --git a/tinygrad/ops.py b/tinygrad/ops.py index f0726b24dd..5ac127afa1 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -56,7 +56,7 @@ class LazyOp: @property def key(self): return (self.op, tuple(map(lambda x: getattr(x, "key", x), self.src)), getattr(self.arg, "key", self.arg)) - def map_buffers(self, real_srcs: Mapping[LazyBuffer, Union[LazyBuffer, LazyOp]]) -> LazyOp: return LazyOp(self.op, tuple([y.map_buffers(real_srcs) for y in self.src]), self.arg) + def map_buffers(self, real_srcs: Mapping[Any, Union[LazyBuffer, LazyOp]]) -> LazyOp: return LazyOp(self.op, tuple([y.map_buffers(real_srcs) if y not in real_srcs else real_srcs[y] for y in self.src]), self.arg) def get_lazyops(self) -> List[LazyOp]: return [self] + [item for x in self.src for item in x.get_lazyops()] def replace_with_movement_ops(self:LazyOp, ops:List[Tuple[MovementOps, Tuple[Any, ...]]]) -> 'LazyBuffer': @@ -241,6 +241,7 @@ class Compiled: def get_program(): from tinygrad.codegen.linearizer import Linearizer k = Linearizer(ast, self.linearizer_opts, var_vals) + assert k.info.dtype == output.dtype, f"linearizer must match dtype. linearizer wants {k.info.dtype} but buffer is {output.dtype}" from tinygrad.codegen.search import kernel_optimize if getenv("KOPT"): kernel_optimize(k, lambda: Linearizer(ast, self.linearizer_opts, var_vals), self.to_program, rawbuffers, key) elif not getenv("NOOPT"): k.hand_coded_optimizations() diff --git a/tinygrad/realize.py b/tinygrad/realize.py index 0dcad8a3d6..020c1a7bb6 100644 --- a/tinygrad/realize.py +++ b/tinygrad/realize.py @@ -1,16 +1,56 @@ from typing import List, Tuple, cast, Dict, Callable import numpy as np -from tinygrad.ops import LazyOp, LoadOps, BufferOps, Device +from tinygrad.ops import LazyOp, LoadOps, Device, UnaryOps, BufferOps, MemBuffer, get_lazyop_info from tinygrad.graph import log_schedule_item from tinygrad.lazy import LazyBuffer -from tinygrad.helpers import DEBUG, prod, all_int, getenv +from tinygrad.helpers import DEBUG, prod, all_int, getenv, IMAGE, ImageDType, dtypes from tinygrad.runtime.lib import RawBufferMapped, RawBufferTransfer from tinygrad.runtime.ops_disk import RawDiskBuffer P2P = getenv("P2P", 0) +def fix_schedule_for_images(schedule:List[Tuple[LazyOp, LazyBuffer, Tuple[LazyBuffer, ...]]]): + # this is the fundamental fix, find unwritable or unreadable images and convert them to normal float32 (TODO: should it be float16?) + for op,out,buffers in schedule: + if isinstance(out.dtype, ImageDType) and (prod(out.shape) != prod(out.dtype.shape) or not any(out.shape[x]%4 == 0 for x in out.st.unit_stride_axes())): + out.dtype = dtypes.float32 + bops = [x for x in op.get_lazyops() if x.op == BufferOps.MEM] + for b in bops: + if isinstance(buffers[b.arg.idx-1].dtype, ImageDType) and (b.arg.st.real_offset() % 4 != 0 or not any(b.arg.st.shape[x]%4 == 0 for x in b.arg.st.unit_stride_axes())): + buffers[b.arg.idx-1].dtype = dtypes.float32 + + # fix the contiguous dtype, no cast required + for op,out,buffers in schedule: + if op.op == LoadOps.CONTIGUOUS and out.dtype != buffers[0].dtype: + out.dtype = buffers[0].dtype = dtypes.float32 + + # now fix up the schedule to reflect the new dtypes + fixed_schedule = [] + for op,out,buffers in schedule: + # fix input dtypes to match what they actually are + bops = [x for x in op.get_lazyops() if x.op == BufferOps.MEM] + replacements = {} + for x in bops: + if x.arg.dtype != buffers[x.arg.idx-1].dtype: + replacements[x] = LazyOp(BufferOps.MEM, (), MemBuffer(x.arg.idx, buffers[x.arg.idx-1].dtype, x.arg.st)) + if replacements: op = op.map_buffers(replacements) + + # fix the ops to create the output dtype + if op.op not in LoadOps: + info = get_lazyop_info(op) + if info.dtype != out.dtype: + op = LazyOp(UnaryOps.CAST, (op,), (out.dtype, False)) + + # put this in the fixed schedule + fixed_schedule.append((op, out, buffers)) + return fixed_schedule + + def run_schedule(schedule:List[Tuple[LazyOp, LazyBuffer, Tuple[LazyBuffer, ...]]]): + # HACK: images can be not usable due to shape + if IMAGE >= 2: schedule = fix_schedule_for_images(schedule) + # NOTE: if you for loop the schedule it's slow because nothing frees while len(schedule): op,out,buffers = schedule.pop(0)