remove pushing contig + fix linearizer bug (#2798)

* remove that logic
* fix test, move LOADs
* fix repeat issue on LLVM
* with_phi
2026-01-10 23:48:01 -05:00 · 2023-12-16 09:36:31 -08:00
parent a7264dcb2b
commit 051402625e
5 changed files with 19 additions and 19 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -184,7 +184,7 @@ jobs:
      - if: ${{ matrix.task == 'openpilot' }}
        name: Test openpilot model compile and size
        run: |
-          DEBUG=2 ALLOWED_KERNEL_COUNT=207 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile2.py
+          DEBUG=2 ALLOWED_KERNEL_COUNT=208 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile2.py
          #python -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000'
      - if: ${{ matrix.task == 'openpilot' }}
        name: Test openpilot model correctness (float32)
--- a/test/test_linearizer_failures.py
+++ b/test/test_linearizer_failures.py
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1252,6 +1252,10 @@ class TestOps(unittest.TestCase):

    np.testing.assert_allclose(x.repeat((2, 0, 4)).numpy(), Tensor.zeros(8, 0, 12).numpy())

+  def test_simple_repeat(self):
+    repeats = [3, 3, 4]
+    helper_test_op([(3, 3)], lambda x: x.repeat(*repeats), lambda x: x.repeat(repeats))
+
  def test_clip(self):
    helper_test_op([(45,65)], lambda x: x.clip(-2.3, 1.2), lambda x: x.clip(-2.3, 1.2))

--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -376,9 +376,16 @@ class Linearizer(Kernel):
    # store
    self.global_store(0, global_idxs+local_idxs+fake_reduce_idxs+upcast_idxs, val)

+    # get PHI node loop scope, link anything using a DEFINE_ACC to the loop as a "parent"
+    acc_scope: DefaultDict[UOp, List[UOp]] = defaultdict(list)
+    for u in self.uops:
+      if u.uop == UOps.PHI:
+        acc_scope[u.vin[0]] += u.vin[2:]
+
    # graph helper functions
    @functools.lru_cache(None)
-    def get_recursive_parents(x:UOp) -> Set[UOp]: return set.union(set(x.vin), *[get_recursive_parents(p) for p in x.vin])
+    def get_recursive_parents(x:UOp, with_phi=False) -> Set[UOp]:
+      return set.union(set(x.vin), *[get_recursive_parents(p, with_phi) for p in x.vin], set(acc_scope[x]) if with_phi else set())

    def get_recursive_children(x:UOp) -> Set[UOp]:
      deps = set([x])
@@ -400,9 +407,9 @@ class Linearizer(Kernel):
    for u in self.uops:
      if not loop_stack[-1]: loop_stack[-1].append(u)
      elif u.uop == UOps.LOOP: loop_stack.append([u])
-      elif u.uop not in [UOps.CONST, UOps.ALU, UOps.CAST]: loop_stack[-1].append(u)
+      elif u.uop not in [UOps.CONST, UOps.ALU, UOps.CAST, UOps.LOAD]: loop_stack[-1].append(u)
      else:
-        parents = get_recursive_parents(u)
+        parents = get_recursive_parents(u, with_phi=True)
        for i in reversed(range(len(loop_stack))):
          # check backwards and put the uop in the first encounter with some dependency
          if any(x in parents for x in loop_stack[i]) or i == 0:
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -21,8 +21,7 @@ LAZYCACHE = getenv("LAZYCACHE", 1)
 # TODO: movement ops that only change shape are really nops. treat them as such
 REMOVE_MOVEMENT_NOPS, MERGE_ELEMENTWISE_INTO_REDUCE, SHUFFLE_MOVEMENT_OPS, MERGE_ELEMENTWISE_OPS = OPT>=1, OPT>=1, OPT>=1, OPT>=1
 MERGE_ONE_REDUCE_INTO_ELEMENTWISE, SHUFFLE_PAD_OPS = OPT>=2, OPT>=2
-PUSH_PERMUTES, PUSH_CONTIGUOUS = OPT>=3, OPT>=3
-PUSH_RESHAPES = OPT>=4
+PUSH_PERMUTES = OPT>=3

 # **** ast fixing functions ****

@@ -239,16 +238,6 @@ class LazyBuffer:
    # get outputs now
    out_device, out_shape, out_dtype = srcs[0].device, srcs[0].shape, max([x.dtype for x in srcs]) if op != UnaryOps.CAST else cast(Tuple[DType, bool], arg)[0]

-    # push all contiguous to the end of BinaryOps
-    if PUSH_CONTIGUOUS and any(not x.realized and x.op.op == LoadOps.CONTIGUOUS and len(x.op.src[0].children) <= 1 for x in srcs):
-      new_srcs: List[LazyBuffer] = []
-      for x in srcs:
-        if not x.realized and x.op.op == LoadOps.CONTIGUOUS and len(x.op.src[0].children) <= 1:
-          x.op.src[0].children.discard(x)
-          x = cast(LazyBuffer, x.op.src[0])
-        new_srcs.append(x)
-      return new_srcs[0].e(op, *new_srcs[1:], arg=arg).contiguous()
-
    if MERGE_ELEMENTWISE_OPS:
      # remove the buffers from any (childless) BinaryOps that feed into this
      _srcs = tuple([x.op if x.optype == BinaryOps and not x.children and not x.realized else x for x in srcs])
@@ -329,7 +318,7 @@ class LazyBuffer:

  def _movement_op(self, st: ShapeTracker, op: MovementOps, arg: Union[Tuple[sint, ...], Tuple[Tuple[sint, sint], ...]]) -> LazyBuffer:
    if SHUFFLE_MOVEMENT_OPS and not self.realized and self.optype == BinaryOps and not self.children:
-      if op in {MovementOps.SHRINK, MovementOps.STRIDE, MovementOps.PERMUTE} or (op == MovementOps.RESHAPE and (self.op.op in UnaryOps or PUSH_RESHAPES)):
+      if op in {MovementOps.SHRINK, MovementOps.STRIDE, MovementOps.PERMUTE} or (op == MovementOps.RESHAPE and self.op.op in UnaryOps):
        return self.op.replace_with_movement_ops([(op, arg)])
    if REMOVE_MOVEMENT_NOPS and not self.realized and st.contiguous:
      # MovementOps aren't stacked any more, they each have one parent, find the root