diff --git a/test/external/external_test_opt.py b/test/external/external_test_opt.py
index d37922a6a8..88dbe2c1a5 100644
--- a/test/external/external_test_opt.py
+++ b/test/external/external_test_opt.py
@@ -63,7 +63,7 @@ class TestInferenceMinKernels(unittest.TestCase):
     for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
     img = Tensor.randn(1, 3, 224, 224)
     # TODO: this seems very high
-    with CLCache(116):
+    with CLCache(115):
       model.forward(img).realize()
 
   def test_resnet(self):
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index e00f0969d7..b2ca2e5790 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -1,5 +1,4 @@
 from __future__ import annotations
-import math
 import operator
 from typing import Callable, Optional, Tuple, Union, List, Dict, Any, cast
 import sys, importlib, inspect, functools, pathlib
@@ -209,23 +208,11 @@ class LazyBuffer:
       return root.reshape(ret.st.shape)
     return ret
 
-  def _reduce_op(self:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
+  def reduce_op(self:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
     if self.shape == tuple(new_shape): return self
     srcs = _push_movement_ops((self,)) if SHUFFLE_MOVEMENT_OPS else (self,)
     return create_lazybuffer(self.device, ShapeTracker(new_shape), ReduceOps, LazyOp(op, srcs, new_shape), self.dtype)
 
-  def reduce_op(self:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
-    if prod(self.shape) // prod(new_shape) > 8192:
-      reduced_dimensions = [(i, math.gcd(256, old), stride) for i, (old, new, stride) in enumerate(zip(self.shape, new_shape, self.st.real_strides())) if old != new]
-      dimension_to_split, divisor, _ = max(reduced_dimensions, key=lambda v: v[1]//(v[2] or math.inf)) # heuristic -> choose largest divisor to split on, penalize large strides
-
-      intermediate_input_shape = self.shape[:dimension_to_split] + (self.shape[dimension_to_split]//divisor, divisor) + self.shape[dimension_to_split+1:]
-      intermediate_output_shape = self.shape[:dimension_to_split] + (self.shape[dimension_to_split]//divisor, 1) + self.shape[dimension_to_split+1:]
-      final_input_shape = self.shape[:dimension_to_split] + (self.shape[dimension_to_split]//divisor,) + self.shape[dimension_to_split+1:]
-
-      return self.reshape(intermediate_input_shape)._reduce_op(op, intermediate_output_shape).reshape(final_input_shape)._reduce_op(op, new_shape)
-    return self._reduce_op(op, new_shape)
-
   def reshape(self:LazyBuffer, arg:Tuple[int, ...]) -> LazyBuffer:
     if self.shape == arg: return self
     if not self.realized and self.op.op == MovementOps.RESHAPE:
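For context, the removed `reduce_op` wrapper split a single large reduce (input/output element ratio above 8192) into two stages: reshape the chosen reduced dimension into `(n//divisor, divisor)`, reduce away the `divisor` chunk, then reduce the partial result down to the final shape. Below is a minimal standalone sketch of that heuristic, assuming plain NumPy sums in place of tinygrad's lazy `ReduceOps` and omitting the stride-penalty term; the helper names are illustrative, not tinygrad API.

```python
# Standalone sketch of the removed two-stage reduce heuristic.
# Assumptions: NumPy stands in for LazyBuffer, the reduce is a sum, and the
# stride-penalty term from the original heuristic is omitted.
import math
import numpy as np

RATIO_THRESHOLD = 8192  # same cutoff the removed code used

def reduce_sum(x: np.ndarray, new_shape: tuple) -> np.ndarray:
  # single-stage reduce: sum every axis whose size differs from new_shape
  axes = tuple(i for i, (old, new) in enumerate(zip(x.shape, new_shape)) if old != new)
  return x.sum(axis=axes, keepdims=True)

def split_reduce_sum(x: np.ndarray, new_shape: tuple) -> np.ndarray:
  if math.prod(x.shape) // math.prod(new_shape) <= RATIO_THRESHOLD:
    return reduce_sum(x, new_shape)
  # choose the reduced dimension with the largest divisor of 256
  candidates = [(i, math.gcd(256, old)) for i, (old, new) in enumerate(zip(x.shape, new_shape)) if old != new]
  dim, divisor = max(candidates, key=lambda v: v[1])
  # stage 1: split dim into (n//divisor, divisor) and reduce the divisor chunk
  split_shape = x.shape[:dim] + (x.shape[dim] // divisor, divisor) + x.shape[dim+1:]
  partial = x.reshape(split_shape).sum(axis=dim+1)
  # stage 2: reduce the remaining n//divisor chunk down to new_shape
  return reduce_sum(partial, new_shape)

x = np.ones((4, 1 << 20), dtype=np.float32)   # reduce ratio is 2**20 > 8192
out = split_reduce_sum(x, (4, 1))
assert out.shape == (4, 1) and np.all(out == float(1 << 20))
```

The split trades one big reduce kernel for two smaller ones, which lines up with the expected kernel count in the test above dropping from 116 to 115 once it is removed.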