From 9defbc7d54a27abb02da3ec29230b1463eaff368 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Thu, 26 Dec 2024 14:05:08 +0200
Subject: [PATCH] add symbolic_simple to the scheduler [pr] (#8419)

---
 .github/workflows/test.yml  |  2 +-
 test/test_image_dtype.py    | 19 ++++++++++++++-----
 test/test_schedule.py       |  7 +++++--
 tinygrad/engine/schedule.py | 26 +++-----------------------
 tinygrad/ops.py             |  4 ++--
 5 files changed, 25 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 739f2b8bff..b0fbbab75e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -295,7 +295,7 @@ jobs:
     - if: ${{ matrix.task == 'optimage' }}
      name: Test openpilot model kernel count and gate usage
      run: |
-        PYTHONPATH="." ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2138 ALLOWED_GATED_READ_IMAGE=13 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx
+        PYTHONPATH="." ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2105 ALLOWED_GATED_READ_IMAGE=29 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx
     - if: ${{ matrix.task == 'optimage' }}
      name: Test openpilot alt model correctness (float32)
      run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx
diff --git a/test/test_image_dtype.py b/test/test_image_dtype.py
index d2f0e80e04..2a8c719aff 100644
--- a/test/test_image_dtype.py
+++ b/test/test_image_dtype.py
@@ -42,10 +42,18 @@ class TestImageDType(unittest.TestCase):
   def test_image_and_back(self):
     data = Tensor.randn(9*27*4).realize()
     tst = data.numpy()
-    it = data.cast(dtypes.imagef((9,27,4))).realize()
+    it = data.cast(dtypes.imagef((9,27,4))).contiguous().realize()
     assert isinstance(it.lazydata.base.realized.dtype, ImageDType)
     np.testing.assert_equal(tst, it.numpy())
 
+  def test_image_cast_and_back_collapses(self):
+    data = Tensor.randn(9*27*4).realize()
+    tst = data.numpy()
+    it = data.cast(dtypes.imagef((9,27,4))).realize()
+    # the underlying UOp is identical
+    self.assertIs(it.lazydata.base.realized, data.lazydata.base.realized)
+    np.testing.assert_equal(tst, it.numpy())
+
   def test_image_and_back_wrong_shape(self):
     data = Tensor.randn(9*27*4).realize()
     tst = data.numpy()
@@ -59,7 +67,8 @@ class TestImageDType(unittest.TestCase):
     np.testing.assert_equal(imgv[0:2], it[0:2].numpy())
 
   def test_mul_stays_image(self):
-    it = Tensor.randn(4).cast(dtypes.imagef((1,1,4))).realize()
+    # NOTE: contiguous is needed otherwise this folds
+    it = Tensor.randn(4).cast(dtypes.imagef((1,1,4))).contiguous().realize()
     out = (it*2).realize()
     assert isinstance(out.lazydata.base.realized.dtype, ImageDType)
 
@@ -88,15 +97,15 @@ class TestImageDType(unittest.TestCase):
 
   def test_no_lru_alloc(self):
     data = Tensor.randn(9*27*4).realize()
-    it = data.cast(dtypes.imagef((9,27,4))).realize()
+    it = data.cast(dtypes.imagef((9,27,4))).contiguous().realize()
     b1 = it.lazydata.base.realized._buf
     del it
-    it = data.cast(dtypes.imagef((10,27,4))).realize()
+    it = data.cast(dtypes.imagef((10,27,4))).contiguous().realize()
     assert it.lazydata.base.realized._buf != b1
 
   def test_no_lru_alloc_dtype(self):
     data = Tensor.randn(9*27*4).realize()
-    it = data.cast(dtypes.imagef((9,27,4))).realize()
+    it = data.cast(dtypes.imagef((9,27,4))).contiguous().realize()
     b1 = it.lazydata.base.realized._buf
     del it
     it = data.cast(dtypes.imageh((9,27,4))).realize()
diff --git a/test/test_schedule.py b/test/test_schedule.py
index 7647e6ff7c..db7726f9bd 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -10,7 +10,7 @@ from typing import List, Optional, Union, cast
 
 from tinygrad import nn, dtypes, Device, Tensor
 from tinygrad.device import is_dtype_supported
-from tinygrad.dtype import DType
+from tinygrad.dtype import DType, ImageDType
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic
@@ -1405,8 +1405,11 @@ class TestSchedule(unittest.TestCase):
     x = Tensor.randn((9, 9)).realize()
     y = Tensor.randn((9, 9)).realize()
     out = x@y
-    run_schedule(check_schedule(out, 4))
+    run_schedule(check_schedule(out, 3))
     np.testing.assert_allclose(out.numpy(), x.numpy()@y.numpy(), atol=1e-4, rtol=1e-4)
+    self.assertIsInstance(out.dtype, ImageDType)
+    self.assertIsNotNone(out.lazydata.base.realized)
+    self.assertIsInstance(out.lazydata.base.realized.dtype, ImageDType)
 
   def _test_fusion(self, shapes, f, cnt):
     with Context(DEBUG=0, TRACK_MATCH_STATS=0): args = [Tensor.randn(s).realize() for s in shapes]
diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 6172275bef..d5892526e5 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -2,7 +2,7 @@ import sys, atexit, functools, pickle
 from collections import defaultdict, deque
 from dataclasses import dataclass, field
 from tinygrad.ops import GroupOp, UOp, Ops, PatternMatcher, UPat, Variable, can_pad, graph_rewrite, resolve, track_rewrites, view_left, merge_views
-from tinygrad.ops import identity_element, buffers, exec_alu, type_verify
+from tinygrad.ops import identity_element, buffers, symbolic_simple, type_verify
 from tinygrad.helpers import Context, Metadata, all_int, all_same, colored, diskcache_put, merge_dicts, prod, dedup, getenv, unwrap
 from tinygrad.helpers import FUSE_CONV_BW, FUSE_ARANGE, DEBUG, ContextVar
 from tinygrad.dtype import DType, ImageDType, dtypes
@@ -34,7 +34,7 @@ tensor_uop_spec = PatternMatcher([
    (isinstance(mv.arg, tuple) and mv.dtype == x.dtype) or
    # TODO: "make things that can't be images not images" can override the source dtype
    # is there a clean way to update its _mop children?
-   (isinstance(mv.dtype, ImageDType) and x.dtype == mv.dtype.base and x.is_realized)),
+   ((isinstance(mv.dtype, ImageDType) or isinstance(x.dtype, ImageDType)) and x.dtype.base == mv.dtype.base and x.is_realized)),
 
   # Tensor variable bindings
   (UPat(Ops.BIND, dtypes.int, (UPat(Ops.DEFINE_VAR), UPat.cvar(dtype=dtypes.int)), arg=None), lambda: True),
@@ -414,21 +414,6 @@ def simplify_reduceop(reduce:UOp, x:UOp) -> UOp|None:
     case _: return None
   return UOp.const(reduce.dtype, ret)
 
-def simplify_alu(alu:UOp):
-  if not all(x.is_unrealized_unmasked_const() for x in alu.src): return None
-  # this needs to have a VIEW next (it has to, right?)
-  return UOp.const(alu.dtype, exec_alu(alu.op, alu.dtype, [s.const_arg for s in alu.src]))
-
-def simplify_binop(binop:UOp, x:UOp, y:UOp):
-  if all_int(x.shape) and x.is_unrealized_unmasked_const(): other, const = y, x
-  elif all_int(y.shape) and y.is_unrealized_unmasked_const():
-    if binop.op is Ops.IDIV and y.const_arg == 1: return x
-    other, const = x, y
-  else: return None
-  if binop.op is Ops.ADD and const.const_arg == 0: return other
-  if binop.op is Ops.MUL and const.const_arg == 1: return other
-  if binop.op is Ops.MUL and const.const_arg == 0: return UOp.const(binop.dtype, 0)
-
 def found_contiguous(ctx:ScheduleContext, contig:UOp, base:UOp, b:UOp):
   if contig.src[0].op is Ops.VIEW and len(contig.src[0].src):
     old_base = contig.src[0].src[0]
@@ -439,18 +424,13 @@ def replace_contiguous(ctx:ScheduleContext, alu:UOp):
     if (replace_src:=ctx.contiguous.get(s, None)) is not None: new_src[i] = replace_src
   if tuple(new_src) != alu.src: return alu.replace(src=tuple(new_src))
 
-ops_folding = PatternMatcher([
+ops_folding = symbolic_simple+PatternMatcher([
   # op with size 0 is zero
   (UPatScheduled(), lambda b,to_store,base: base.const_like(0) if base.size == 0 else None),
   # if the uop folded to a CONST we can delete the BUFFER
   (UPatScheduled(Ops.CONST, name="const"), lambda b,base,const: base.const_like(const.const_arg)),
   # DETACH is a NOOP here
   (UPat(Ops.DETACH, name="detach"), lambda detach: detach.src[0]),
-  # elementwise const folding
-  (UPat(GroupOp.ALU, name="alu"), simplify_alu),
-  (UPat({Ops.ADD, Ops.MUL, Ops.IDIV}, name="binop", src=(UPat.var("x"), UPat.var("y"))), simplify_binop),
-  (UPat(Ops.CAST, src=(UPat.var("x"),), name="cast"),
-   lambda x,cast: UOp.const(cast.dtype, x.const_arg) if all_int(x.shape) and x.is_unrealized_unmasked_const() else None),
   # reduce of size 0 is the identity element
   (UPat(Ops.REDUCE_AXIS, name="reduce", src=(UPat.var("x"),)),
    lambda reduce,x:UOp.const(reduce.dtype, identity_element(reduce.arg[0], reduce.dtype)) if x.size == 0 and reduce.size != 0 else None),
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index cac429433e..b08d54efd5 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -956,11 +956,11 @@ spec = PatternMatcher([
 
   # most ALUs have all matching dtypes, except CMPLT, CMPNE, and WHERE
   (UPat(Ops.WHERE, name="w", src=(UPat(dtype=dtypes.bool), UPat(name="x"), UPat(name="y"))), lambda w,x,y: w.dtype == x.dtype == y.dtype),
-  (UPat((Ops.CMPLT, Ops.CMPNE), dtype=dtypes.bool, src=(UPat(name="x"), UPat(name="y"))), lambda x,y: x.dtype == y.dtype),
+  (UPat((Ops.CMPLT, Ops.CMPNE), dtype=dtypes.bool, src=(UPat(name="x"), UPat(name="y"))), lambda x,y: x.dtype.base == y.dtype.base),
   # and SHL/SHR, the shift distance can be an int
   (UPat((Ops.SHL, Ops.SHR), src=(UPat(name="x"), UPat(name="y")), name="a"), lambda a,x,y: a.dtype == x.dtype and y.dtype in (x.dtype, dtypes.uint)),
   (UPat(Ops.IDIV, name="x"), lambda x: None if dtypes.is_int(x.dtype) else False),
-  (UPat(GroupOp.ALU, name="x"), lambda x: all(x.dtype == y.dtype for y in x.src)),
+  (UPat(GroupOp.ALU, name="x"), lambda x: all(x.dtype.base == y.dtype.base for y in x.src)),
   (UPat(Ops.ASSIGN, src=(UPat((Ops.DEFINE_ACC, Ops.DEFINE_GLOBAL)), UPat())), lambda: True),
   (UPat(Ops.ENDRANGE, dtype=dtypes.void, src=(UPat(Ops.RANGE),)), lambda: True),