From 46e7dfade1c3278b41daf5721dac8aeca44af01f Mon Sep 17 00:00:00 2001
From: George Hotz
Date: Tue, 19 Jul 2022 11:42:14 -0700
Subject: [PATCH] REQUIRES_SIMPLE_REDUCE

---
 tinygrad/llops/ops_gpu.py |  1 +
 tinygrad/ops.py           | 10 +++++++++-
 tinygrad/tensor.py        | 12 ++++--------
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py
index 2205c73806..a6c168ec2e 100644
--- a/tinygrad/llops/ops_gpu.py
+++ b/tinygrad/llops/ops_gpu.py
@@ -112,6 +112,7 @@ class GPUBuffer:
   def movement_op(x, op:MovementOps, arg) -> GPUBuffer: return type(x)(ShapeTracker(x.st).movement_op(op, arg), x)
 
   def reduce_op(x, op:ReduceOps, new_shape:Tuple[int, ...]): return type(x)(new_shape)._processing_op([("A", x)], code="acc", earlycode=GPUBuffer.code_for_op[op], earlybufs=set("A"), start=GPUBuffer.start_for_op[op])
+  #REQUIRES_SIMPLE_REDUCE = True
 
   def _processing_op(ret, bufs: List[Tuple[str, GPUBuffer]]=[], code:str="acc", C:Optional[ConvArgs]=None, start="0.0", reduce_shape=None, earlybufs:Set[str]=set(), earlycode:str="acc") -> GPUBuffer:
     assert C is None
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index 74dbde0cec..c3d8508d84 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -245,8 +245,16 @@ class LazyBuffer:
   def binary_op(x:LazyBuffer, op:BinaryOps, y:LazyBuffer) -> LazyBuffer: return elementwise_op(op, x, y)
   def contiguous_op(x:LazyBuffer) -> LazyBuffer: return x if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
 
+  # TODO: permute to put all the reduce axis at the end
   def reduce_op(x:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
-    return LazyBuffer(x.device, tuple(new_shape), ReduceOps, LazyOp(op, (x,), tuple(new_shape))) if x.shape != tuple(new_shape) else x
+    if x.shape == tuple(new_shape): return x
+    if getattr(x.dbuffer, "REQUIRES_SIMPLE_REDUCE", False) and (len(new_shape) != 2 or new_shape[1] != 1):
+      num, red = prod([s for s,n in zip(x.shape, new_shape) if n != 1]), prod([s for s,n in zip(x.shape, new_shape) if n == 1])
+      x = x.movement_op(MovementOps.PERMUTE, [i for i,n in enumerate(new_shape) if n != 1] + [i for i,n in enumerate(new_shape) if n == 1])
+      x = x.movement_op(MovementOps.RESHAPE, (num, red))
+      return x.reduce_op(op, (num, 1)).movement_op(MovementOps.RESHAPE, new_shape)
+    else:
+      return LazyBuffer(x.device, tuple(new_shape), ReduceOps, LazyOp(op, (x,), tuple(new_shape)))
 
   # syntactic sugar around PAD and SHRINK
   # TODO: turn RESHAPE into EXPAND and CONTRACT (current EXPAND should be REPEAT)
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 9b4454f4cd..ce0bd5c4d7 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -23,10 +23,8 @@ class Tensor:
     if isinstance(data, np.ndarray):
       if data.shape == tuple(): data = data.reshape((1,))
       self.lazydata = LazyBuffer.fromCPU(data.astype(np.float32), device)
-    elif isinstance(data, LazyBuffer):
-      self.lazydata = data
-    else:
-      raise Exception(f"can't create Tensor from {data}")
+    elif isinstance(data, LazyBuffer): self.lazydata = data
+    else: raise Exception(f"can't create Tensor from {data}")
 
     # tensors have gradients, buffers do not
     self.grad : Optional[Tensor] = None
@@ -55,8 +53,7 @@ class Tensor:
     return self
 
   def assign(self, x):
-    if not isinstance(x, Tensor):
-      x = Tensor(x)
+    if not isinstance(x, Tensor): x = Tensor(x)
     assert self.shape == x.shape
     self.lazydata = x.lazydata
     return x
@@ -127,8 +124,7 @@ class Tensor:
         for g in ([grads] if len(t0._ctx.parents) == 1 else grads)]
       for t, g in zip(t0._ctx.parents, grads):
        if g is not None and t.requires_grad:
-          assert g.shape == t.shape, \
-            f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
+          assert g.shape == t.shape, f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
          t.grad = g if t.grad is None else (t.grad + g)
 
   # ***** non first class ops (hlops) *****
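
The ops.py hunk is the core of the change: when a backend advertises REQUIRES_SIMPLE_REDUCE, reduce_op lowers every reduce into a canonical 2D form by permuting the reduced axes to the end, reshaping to (num, red), reducing to (num, 1), and reshaping back to new_shape, so the backend only ever sees reduces of the shape (num, red) -> (num, 1). A minimal NumPy sketch of that transformation (illustrative only, not part of the patch; simple_reduce_sum and the prod helper here are hypothetical names):

import operator
from functools import reduce
import numpy as np

def prod(xs): return reduce(operator.mul, xs, 1)

def simple_reduce_sum(x: np.ndarray, new_shape):
  # same grouping as the patch: axes kept in new_shape (n != 1) first, reduced axes (n == 1) last
  keep = [i for i, n in enumerate(new_shape) if n != 1]
  red  = [i for i, n in enumerate(new_shape) if n == 1]
  num, rdc = prod([x.shape[i] for i in keep]), prod([x.shape[i] for i in red])
  y = x.transpose(keep + red).reshape(num, rdc)            # PERMUTE then RESHAPE to the "simple" 2D form
  return y.sum(axis=1, keepdims=True).reshape(new_shape)   # reduce (num, red) -> (num, 1), RESHAPE back

a = np.arange(24.0).reshape(2, 3, 4)
assert np.allclose(simple_reduce_sum(a, (2, 1, 4)), a.sum(axis=1, keepdims=True))

The flag itself is left commented out in ops_gpu.py, so the GPU backend keeps taking the general LazyOp path; enabling it would route GPU reduces through the 2D rewrite above.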