_reduce_op is axis based now (#3462)

* _reduce_op is axis based now * axis_ * update lin failures * disable that * fix shape
2026-01-09 23:18:04 -05:00 · 2024-02-21 16:36:31 +01:00
parent 22a90cbb15
commit 871ba73e65
8 changed files with 38 additions and 29 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -213,9 +213,10 @@ jobs:
      - if: ${{ matrix.task == 'onnx' }}
        name: Test ONNX (CLANG)
        run: CLANG=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
-      - if: ${{ matrix.task == 'onnx' }}
-        name: Test Action Space
-        run: PYTHONPATH="." GPU=1 python3 extra/optimization/get_action_space.py
+      # NOTE: need to regenerate the dataset for this after the reduce change. it should probably gen a small dataset here instead
+      #- if: ${{ matrix.task == 'onnx' }}
+      #  name: Test Action Space
+      #  run: PYTHONPATH="." GPU=1 python3 extra/optimization/get_action_space.py
      - if: ${{ matrix.task == 'onnx' }}
        name: Test Beam Search
        run: PYTHONPATH="." GPU=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
--- a/test/test_linearizer_failures.py
+++ b/test/test_linearizer_failures.py
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -745,10 +745,10 @@ class TestOps(unittest.TestCase):
    # exceed per kernel buffer limit with backward
    forward_only = (Device.DEFAULT == "WEBGPU")
    helper_test_op([(45,65)], torch.nn.Softmax(dim=1), Tensor.softmax, atol=1e-7, grad_atol=1e-7, forward_only=forward_only)
-    helper_test_op([()], torch.nn.Softmax(dim=0), Tensor.softmax, atol=1e-7, grad_atol=1e-7, forward_only=forward_only)
+    helper_test_op([(45)], torch.nn.Softmax(dim=0), Tensor.softmax, atol=1e-7, grad_atol=1e-7, forward_only=forward_only)
  def test_log_softmax(self):
    helper_test_op([(45,65)], torch.nn.LogSoftmax(dim=1), Tensor.log_softmax, atol=1e-7, grad_atol=1e-7)
-    helper_test_op([()], torch.nn.LogSoftmax(dim=0), Tensor.log_softmax, atol=1e-7, grad_atol=1e-7)
+    helper_test_op([(45)], torch.nn.LogSoftmax(dim=0), Tensor.log_softmax, atol=1e-7, grad_atol=1e-7)
  def test_log_softmax_other_axis(self):
    helper_test_op([(10,10,10)], lambda x: x.log_softmax(0), atol=1e-7, grad_atol=1e-7)
    helper_test_op([(10,10,10)], lambda x: x.log_softmax(1), atol=1e-7, grad_atol=1e-7)
--- a/test/unit/test_flopcounter.py
+++ b/test/unit/test_flopcounter.py
@@ -35,7 +35,7 @@ class TestFlopCounter(unittest.TestCase):

  def test_flops_red(self):
    op0 = LazyOp(BinaryOps.MUL, (self.buf0,self.buf1,), None)
-    op1 = LazyOp(ReduceOps.SUM, (op0,), (1,))
+    op1 = LazyOp(ReduceOps.SUM, (op0,), (0,))
    op2 = LazyOp(BinaryOps.ADD, (op1, op1,), None)
    info = get_lazyop_info(op2)
    self.assertEqual(info.flops, 9)
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -47,7 +47,9 @@ class Linearizer(Kernel):
  def cast(self, val: UOp, dtype) -> UOp: return self.uop(UOps.CAST, dtype, (val,)) if val.dtype != dtype else val

  def get_reduce_acc(self, reduceop:LazyOp):
-    dtype = get_lazyop_info(reduceop).dtype
+    info = get_lazyop_info(reduceop)
+    assert all(0 <= x < len(info.shape) for x in reduceop.arg), "arg axis out of range"
+    dtype = info.dtype
    if reduceop.op == ReduceOps.SUM: return 0.0 if dtypes.is_float(dtype) else 0
    elif reduceop.op == ReduceOps.MAX:
      if dtypes.is_int(dtype): return 0 if dtypes.is_unsigned(dtype) else -2**(dtype.itemsize*8-1)
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -116,10 +116,12 @@ class LazyBuffer:

  # *** reduce ops ***

-  def _reduce_op(self, op:ReduceOps, new_shape:Tuple[sint, ...]) -> LazyBuffer:
-    if self.shape == new_shape: return self
-    unbound_new_shape = tuple(s.unbind()[0] if not isinstance(s, int) else s for s in new_shape)
-    return create_lazybuffer(self.device, ShapeTracker.from_shape(new_shape), self.dtype, op, unbound_new_shape, (self,))
+  def _reduce_op(self, op:ReduceOps, axis:Tuple[int, ...]) -> LazyBuffer:
+    assert all(0 <= x < len(self.shape) for x in axis), f"axis args {axis} out of range for shape {self.shape}"
+    axis = tuple(x for x in axis if self.shape[x] != 1)
+    if len(axis) == 0: return self
+    new_shape = tuple(1 if i in axis else s for i,s in enumerate(self.shape))
+    return create_lazybuffer(self.device, ShapeTracker.from_shape(new_shape), self.dtype, op, axis, (self,))

  def r(self, op:ReduceOps, axis:Tuple[int, ...]) -> LazyBuffer:
    new_shape = tuple(1 if i in axis else s for i,s in enumerate(self.shape))
@@ -128,13 +130,13 @@ class LazyBuffer:
    assert len(self.shape)==len(new_shape) and all(ns in (1,s) for s,ns in zip(self.shape,new_shape)), f"not a contraction {self.shape=} {new_shape=}"
    # TODO: can we split symbolic shape if the reduce axis is not symbolic?
    if not all_int(self.shape) or (0 in self.shape) or prod(self.shape) // prod(new_shape) < getenv("REDUCEOP_SPLIT_THRESHOLD", 32768):
-      return self._reduce_op(op, new_shape)
+      return self._reduce_op(op, axis)
    heuristic, divisor, dim_to_split = max(((divisor := math.gcd(256, old))/(stride or math.inf), divisor, i) for i, (old, new, stride) in enumerate(zip(self.shape, new_shape, self.st.real_strides())) if old != new) # type: ignore  # noqa: E501
-    if divisor < 16 or heuristic < 0.1: return self._reduce_op(op, new_shape)
+    if divisor < 16 or heuristic < 0.1: return self._reduce_op(op, axis)
    # choose largest divisor (>=16) to split on, penalize large strides
    def splitted_shape(dim_aft_div):
      return self.shape[:dim_to_split] + (self.shape[dim_to_split]//divisor,) + dim_aft_div + self.shape[dim_to_split+1:]
-    return self.reshape(splitted_shape((divisor,)))._reduce_op(op, splitted_shape((1,))).reshape(splitted_shape(()))._reduce_op(op, new_shape)
+    return self.reshape(splitted_shape((divisor,)))._reduce_op(op, (dim_to_split+1,)).reshape(splitted_shape(()))._reduce_op(op, axis)

  # *** movement ops ***

--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -87,7 +87,7 @@ InterpretedFlopCounter: Dict[Op, Callable] = {
  UnaryOps.CAST: lambda self,arg: FlopCounter(self.shape, arg[0], self.consume_flops(), self.mem),   # cast uses no flops
  **{op:lambda self: FlopCounter(self.shape, self.dtype, self.consume_flops() + prod(self.shape), self.mem) for op in UnaryOps if op != UnaryOps.CAST},  # noqa: E501
  **{op:lambda self,y,op=op: FlopCounter(self.shape,  dtypes.bool if op in (BinaryOps.CMPLT, BinaryOps.CMPEQ) else self.dtype, self.consume_flops() + y.consume_flops() + prod(self.shape), {**self.mem, **y.mem}) for op in BinaryOps},  # noqa: E501
-  **{op:lambda self,new_shape: FlopCounter(new_shape, self.dtype, self.consume_flops() + prod(self.shape), self.mem) for op in ReduceOps},
+  **{op:lambda self,axis: FlopCounter(tuple(1 if i in axis else s for i,s in enumerate(self.shape)), self.dtype, self.consume_flops() + prod(self.shape), self.mem) for op in ReduceOps},  # noqa: E501
  TernaryOps.WHERE: lambda self,y,z: FlopCounter(self.shape, y.dtype, self.consume_flops() + y.consume_flops() + z.consume_flops() + prod(self.shape), {**self.mem, **y.mem, **z.mem})}  # noqa: E501

@functools.lru_cache(None)
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -543,8 +543,8 @@ class Tensor:
  # ***** reduce ops *****

  def _reduce(self, fxn:Type[Function], axis:Optional[Union[int, Tuple[int, ...]]]=None, keepdim=False) -> Tensor:
-    axis_: List[int] = list(range(len(self.shape))) if axis is None else ([axis] if isinstance(axis, int) else list(axis))
-    axis_ = [x if x >= 0 else x+len(self.shape) for x in axis_]
+    axis_: Tuple[int, ...] = tuple(range(len(self.shape))) if axis is None else ((axis,) if isinstance(axis, int) else tuple(axis))
+    axis_ = tuple(x if x >= 0 else x+len(self.shape) for x in axis_)
    shape = tuple(s for i,s in enumerate(self.shape) if i not in axis_)
    ret = fxn.apply(self, axis=axis_)
    return ret if keepdim else ret.reshape(shape=shape)