scheduler ops_folding from delete_lazy (#8124)

* scheduler diff from delete_lazy

* test_std_mean

* late fold copy of CONST

* clang const is fine
qazal authored 2024-12-09 18:36:01 +02:00, committed by GitHub
parent 87c360c4b5 | commit 80de06c8b9 | 4 changed files with 30 additions and 13 deletions
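In short, the reduce-over-const folding that used to happen eagerly in LazyBuffer.r now runs as ops_folding rewrite rules in the scheduler. A rough sketch of the behaviour this preserves, written against the standard Tensor frontend (Tensor.full/sum/prod/max/item) rather than taken from this commit:

from tinygrad import Tensor

c = Tensor.full((4, 4), 3.0)   # an unrealized, unmasked constant
print(c.sum().item())          # ADD-reduce should fold to 3.0 * 16 = 48.0, no reduce kernel needed
print(c.prod().item())         # MUL-reduce should fold to 3.0 ** 16
print(c.max().item())          # MAX-reduce passes the constant through: 3.0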


@@ -1224,10 +1224,13 @@ class TestOps(unittest.TestCase):
                     lambda x: Tensor.stack(*x.std_mean(correction=5)))
    helper_test_op([(15,25,35)], lambda x: torch.stack(torch.std_mean(x, keepdim=True, correction=0)),
                     lambda x: Tensor.stack(*x.std_mean(keepdim=True, correction=0)))
    helper_test_op([(1,0,3,0,5)], lambda x: torch.stack(torch.std_mean(x, axis=(1,3))),
                     lambda x: Tensor.stack(*x.std_mean(axis=(1,3))))
    helper_test_op([(3,4,5,6)], lambda x: torch.stack(torch.std_mean(x, axis=(1,2))),
                     lambda x: Tensor.stack(*x.std_mean(axis=(1,2))))
  @unittest.skip("TODO: this fails because of loaded nan in mul folding")
  def test_std_mean_loaded_nan(self):
    helper_test_op([(1,0,3,0,5)], lambda x: torch.stack(torch.std_mean(x, axis=(1,3))),
                     lambda x: Tensor.stack(*x.std_mean(axis=(1,3))))
  def test_softmax(self):
    helper_test_op([(45,65)], torch.nn.Softmax(dim=1), Tensor.softmax, atol=1e-7, grad_atol=1e-7)
    helper_test_op([(45)], torch.nn.Softmax(dim=0), Tensor.softmax, atol=1e-7, grad_atol=1e-7)


@@ -3,7 +3,7 @@ from typing import Optional, Any, Tuple, List, get_args
from tinygrad.dtype import dtypes, DType, ConstType, to_dtype, ImageDType
from tinygrad.helpers import prod, getenv, all_int, all_same, DEBUG, _METADATA, Metadata, SPLIT_REDUCEOP, LAZYCACHE
from tinygrad.ops import exec_alu, python_alu
from tinygrad.ops import identity_element, MathTrait, resolve, UOp, sint, GroupOp, Ops, view_supported_devices
from tinygrad.ops import MathTrait, resolve, UOp, sint, GroupOp, Ops, view_supported_devices
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.device import Buffer
from weakref import ref, ReferenceType, WeakValueDictionary
@@ -180,15 +180,6 @@ class LazyBuffer(MathTrait):
  def r(self, op:Ops, axis:Tuple[int, ...]) -> LazyBuffer:
    new_shape = self.st.reduce(axis)
    # TODO: this logic should move to the scheduler
    if 0 in self.shape and 0 not in new_shape: return self.const_with_shape(identity_element(op, self.dtype), new_shape)
    # const folding
    # TODO: fold this for symbolic?
    if self.is_unrealized_unmasked_const() and all_int(self.shape):
      if op is Ops.ADD: return self.const_with_shape(self.base.arg * prod(self.shape[i] for i in axis), new_shape)
      if op is Ops.MUL: return self.const_with_shape(self.base.arg ** prod(self.shape[i] for i in axis), new_shape)
      if op is Ops.MAX: return self.const_with_shape(self.base.arg, new_shape)
    # TODO: can we split symbolic shape if the reduce axis is not symbolic?
    if not SPLIT_REDUCEOP or not all_int(self.shape) or (0 in self.shape) or \

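For reference, the branch deleted above folded a reduce over an unrealized, unmasked const while the lazy graph was still being built. A standalone sketch of the arithmetic it applied (plain Python, illustrative names only):

from math import prod

def fold_const_reduce(op: str, c: float, shape: tuple, axis: tuple) -> float:
  n = prod(shape[i] for i in axis)  # number of elements collapsed by the reduce
  if op == "ADD": return c * n      # sum of n copies of c
  if op == "MUL": return c ** n     # product of n copies of c
  if op == "MAX": return c          # max of identical values is the value itself
  raise NotImplementedError(op)

assert fold_const_reduce("ADD", 3.0, (4, 4), (0, 1)) == 48.0
assert fold_const_reduce("MAX", 3.0, (4, 4), (0, 1)) == 3.0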

@@ -3,6 +3,7 @@ from collections import defaultdict, deque
from dataclasses import dataclass, field
from typing import FrozenSet, Set, Tuple, List, Dict, Optional, DefaultDict
from tinygrad.ops import GroupOp, UOp, Ops, PatternMatcher, UPat, Variable, can_pad, graph_rewrite, resolve, track_rewrites, view_left, merge_views
from tinygrad.ops import identity_element
from tinygrad.helpers import Context, Metadata, all_int, all_same, colored, diskcache_put, merge_dicts, prod, dedup, getenv, unwrap
from tinygrad.helpers import FUSE_CONV_BW, FUSE_ARANGE, DEBUG
from tinygrad.dtype import ConstType, ImageDType, dtypes
@@ -338,9 +339,29 @@ def _as_const(u:UOp, val:ConstType) -> UOp:
  st = (base:=ShapeTracker.from_shape(())).reshape((1,)*len(u.shape)).expand(u.shape)
  return UOp(Ops.VIEW, u.dtype, (u.buf_uop, UOp.const(u.dtype, val)), base).view(st)

def simplify_reduceop(ctx, reduce:UOp, x:UOp) -> Optional[UOp]:
  # remove reduce on unmasked const
  if all_int(x.shape) and x.is_unrealized_unmasked_const():
    prshape = prod(unwrap(x.st).shape[i] for i in reduce.arg[1])
    ret = x.base.src[1].arg
    match reduce.arg[0]:
      case Ops.ADD: ret *= prshape
      case Ops.MUL: ret **= prshape
      case Ops.MAX: pass # NOTE: Ops.MAX is passthrough
      case _: return None
    return UOp.const(reduce.dtype, ret)
  return None

ops_folding = PatternMatcher([
  # op with size 0 is zero
  (UPatScheduled(), lambda ctx,b,to_store,base: _as_const(base, 0) if base.size == 0 else None),
  # reduce of size 0 is the identity element
  (UPat(Ops.REDUCE_AXIS, name="reduce", src=(UPat.var("x"),)),
   lambda ctx,reduce,x:UOp.const(reduce.dtype, identity_element(reduce.arg[0], reduce.dtype)) if x.size == 0 and reduce.size != 0 else None),
  # reduce of const is collapsed (TODO: make this a generic rule for stride0)
  (UPat(Ops.REDUCE_AXIS, name="reduce", src=(UPat.var("x"),)), simplify_reduceop),
  # CONST doesn't need COPY
  (UPat(Ops.COPY, src=(UPat.var("x"),)), lambda ctx,x:x if x.is_unrealized_const() else None),
])
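The two REDUCE_AXIS rules above replace the special cases LazyBuffer.r handled: a reduce whose input has size 0 becomes the identity element of the op, and a reduce over an unrealized const is computed directly by simplify_reduceop. A small illustration of the identity-element half, assuming identity_element keeps the (op, dtype) signature imported above:

from tinygrad.ops import Ops, identity_element
from tinygrad.dtype import dtypes

# what a REDUCE_AXIS over a size-0 input is rewritten to
print(identity_element(Ops.ADD, dtypes.float))  # 0.0: an empty sum
print(identity_element(Ops.MUL, dtypes.float))  # 1.0: an empty product
print(identity_element(Ops.MAX, dtypes.float))  # dtype minimum (-inf for floats): an empty max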
# ** this decides which ops get realized
@@ -366,7 +387,7 @@ def fold_img_cast(ctx:Dict[UOp, UOp], xb:UOp, view:UOp, b:UOp, to_cast:UOp, **kw
  return to_cast.view(unwrap(view.st))

def init_big_graph(ctx:ScheduleContext, sink:UOp) -> Optional[UOp]:
  new_src = tuple(x.base for x in sink.src if is_scheduled(x.base) and uval(x.base).op is not Ops.CONST)
  new_src = tuple(x.base for x in sink.src if is_scheduled(x.base) and x.base.src[1].op is not Ops.CONST)
  return None if new_src == sink.src else UOp(Ops.NOOP) if len(new_src) == 0 else UOp.sink(*new_src)
do_realize = PatternMatcher([


@@ -340,6 +340,8 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
  def const_with_shape(dtype:DType, val:ConstLike, shape:Tuple[sint,...]) -> UOp:
    from tinygrad.shape.shapetracker import ShapeTracker
    return UOp(Ops.VALID, dtypes.bool, (ShapeTracker.from_shape(()).reshape((1,)*len(shape)).expand(shape).to_uop(),)).where(UOp.const(dtype, val), 0)
  def is_unrealized_const(self): return (s:=self.base).op is Ops.VIEW and len(s.src) == 2 and s.src[1].op is Ops.CONST
  def is_unrealized_unmasked_const(self): return self.is_unrealized_const() and all(v.mask is None for v in unwrap(self.st).views)

  # *** uop movement ops ***
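const_with_shape expresses a shaped constant as a masked select: a VALID over the expanded ShapeTracker chooses between the scalar CONST and 0. The unmasked check in is_unrealized_unmasked_const is what keeps the reduce folding sound: once a view carries a mask (for example after padding) the buffer is no longer a uniform constant and folding would give the wrong answer. A hedged illustration using the Tensor frontend (pad is assumed to accept per-axis (before, after) pairs):

from tinygrad import Tensor

c = Tensor.full((2, 2), 3.0)
p = c.pad(((1, 1), (1, 1)))  # padding adds a mask to the view; zeros live outside it
print(c.prod().item())       # 3.0 ** 4 = 81.0, safe to fold as a const reduce
print(p.prod().item())       # 0.0: the padded zeros dominate, so folding 3.0 ** 16 would be wrong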