new linearizer with early endrange (#12823)

* new linearizer with early endrange * cleanups * second stage removal * not store * do that later * end cleanup * fix globals * end * multi end * fix ends earlier * work * do_merge_ends * mini change * range_gate * fix cpu * test fixups * ranges on index * not for ptx
2026-01-10 07:28:15 -05:00 · 2025-10-21 17:37:48 +08:00
parent d59d4cdbe4
commit c780cd9abb
21 changed files with 193 additions and 84 deletions
--- a/test/external/external_benchmark_schedule.py
+++ b/test/external/external_benchmark_schedule.py
@@ -3,6 +3,7 @@ from tinygrad import Tensor, nn, Device
 from tinygrad.helpers import Profiling, Timing, getenv
 from tinygrad.uop.ops import Ops
 from tinygrad.codegen import get_rewrites_for_renderer, apply_rewrites, rewrites_for_linearizer
+from tinygrad.codegen.control_flow import linearize
 from tinygrad.uop.spec import type_verify

 if __name__ == "__main__":
@@ -39,7 +40,7 @@ if __name__ == "__main__":
          with Timing("***** model linearize in "):
            uops_line = []
            for u in rewritten_uops:
-              uops_line.append(apply_rewrites(u, rewrites_for_linearizer))
+              uops_line.append(linearize(apply_rewrites(u, rewrites_for_linearizer)))
          with Timing("***** model verify in    "):
-            for u in uops_line: type_verify(u.arg.lst)
-          print(sum(len(u.arg.lst) for u in uops_line))
+            for u in uops_line: type_verify(u)
+          print(sum(len(u) for u in uops_line))
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -214,8 +214,8 @@ class TestLinearizer(unittest.TestCase):
        else:
          assert u.src[1].op in GroupOp.ALU
          assert begin_range < uops.index(u) < end_range
-      # children of STORE are placed after ENDRANGE
-      if any(x.op is Ops.STORE and x.src[1].op in GroupOp.ALU for x in u.src):
+      # children of END are placed after ENDRANGE
+      if any(x.op is Ops.END and x.src[1].op in GroupOp.ALU for x in u.src):
        assert end_range < uops.index(u)

  def test_grouped_dims(self):
@@ -400,7 +400,7 @@ class TestLinearizer(unittest.TestCase):
    # # check the children's vins
    # TODO: src ALU are not the same, should it?
    # assert barrier.src == tuple(local_stores)
-    assert len([u for u in uops if u.op is Ops.IF and u.src[-1] == barrier]) == 1
+    assert len([u for u in uops if u.op is Ops.IF and u.src[1] == barrier]) == 1

  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -2602,6 +2602,7 @@ class TestOps(unittest.TestCase):
      lambda x: torch.nn.functional.avg_pool2d(x, kernel_size=(111,28)),
      lambda x: Tensor.avg_pool2d(x, kernel_size=(111,28)), rtol=1e-5)

+  @unittest.skipIf(Device.DEFAULT == "AMD" and CI, "remu failure?")
  def test_avg_pool3d_failure(self):
    with Context(NOOPT=0):
      helper_test_op([(1,1,16,16,16)],
--- a/test/test_rangeify.py
+++ b/test/test_rangeify.py
@@ -1,7 +1,9 @@
 import unittest
-from tinygrad import Tensor, nn
-from tinygrad.helpers import Context, GlobalCounters, CI, CPU_LVP, getenv
+from tinygrad import Tensor, nn, Device
+from tinygrad.helpers import Context, GlobalCounters, CI, getenv
 from tinygrad.uop.ops import graph_rewrite, PatternMatcher, UPat, Ops
+from tinygrad.renderer.ptx import PTXRenderer
+from tinygrad.renderer.nir import NIRRenderer

 class TestRangeifyAssign(unittest.TestCase):
  def test_assign_permuted(self):
@@ -40,7 +42,7 @@ elif getenv("BIG") > 0:
 else:
  BS, HEADS, SEQLEN, EMB = 4, 2, 16, 8

-@unittest.skipIf(CPU_LVP, "broken in LVP")
+@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, (NIRRenderer, PTXRenderer)), "broken in LVP and PTX")
 class TestPcontig(unittest.TestCase):
  def test_flash_attention_bw(self):
    def fa_bw():
--- a/test/test_tensor_uop.py
+++ b/test/test_tensor_uop.py
@@ -3,7 +3,7 @@ import numpy as np
 import unittest
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.engine.realize import run_schedule
-from tinygrad.uop.ops import Ops, UOp, UPat
+from tinygrad.uop.ops import UOp
 from tinygrad.helpers import SPLIT_REDUCEOP

 class TestTensorUOp(unittest.TestCase):
@@ -93,7 +93,6 @@ class TestTensorUOp(unittest.TestCase):
    out.realize()
    self.assertEqual(out.tolist(), Tensor.zeros(4, 8).tolist())

-reduce_kernel = UPat(Ops.SINK, src=(UPat(Ops.STORE, allow_any_len=True, src=(UPat(), UPat((Ops.REDUCE_AXIS, Ops.REDUCE))))))
@unittest.skipUnless(SPLIT_REDUCEOP, "only for SPLIT_REDUCEOP")
 class TestReduceOp(unittest.TestCase):
  def test_no_split_reduce_kernel(self):
@@ -101,23 +100,18 @@ class TestReduceOp(unittest.TestCase):
    a = a.sum()
    sched = a.schedule()
    assert len(sched) == 1
-    assert reduce_kernel.match(sched[0].ast, {})

  def test_split_reduce_kernel_dim0(self):
    a = Tensor.rand(256, 255).realize()
    a = a.sum()
    sched = a.schedule()
    assert len(sched) == 2
-    for s in sched:
-      assert reduce_kernel.match(s.ast, {})

  def test_split_reduce_kernel_dim1(self):
    a = Tensor.rand(255, 256).realize()
    a = a.sum()
    sched = a.schedule()
    assert len(sched) == 2
-    for s in sched:
-      assert reduce_kernel.match(s.ast, {})

 if __name__ == "__main__":
  unittest.main()
--- a/test/test_uop_graph.py
+++ b/test/test_uop_graph.py
@@ -665,19 +665,6 @@ class TestUOpGraph(unittest.TestCase):
    bad_gate = UOp.const(dtypes.int, 1)
    with self.assertRaises(AssertionError): to_uops_list([UOp(Ops.STORE, dtypes.void, (glbl0, idx, UOp.const(dtypes.int, 42), bad_gate))])

-  def test_switched_range_order(self):
-    glbl = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), (), 0)
-    cf = UOp.const(dtypes.float, 0.0)
-    r1 = UOp.range(2, 0)
-    r2 = UOp.range(2, 1)
-    alu = UOp(Ops.MUL, dtypes.int, (r2, r1))
-    store = UOp(Ops.STORE, dtypes.void, (glbl.index(alu), cf))
-    uops = to_uops_list([store])
-    ranges = [x for x in uops if x.op is Ops.RANGE]
-    endranges = [x for x in uops if x.op is Ops.END]
-    # ranges are closed in the right order
-    self.assertEqual(endranges[-1].src[0], ranges[0])
-
@track_rewrites()
 def expander_rewrite(sink): return graph_rewrite(sink, sym + expander)

--- a/test/unit/test_simplify_valid_idx.py
+++ b/test/unit/test_simplify_valid_idx.py
@@ -5,7 +5,7 @@ from tinygrad.dtype import dtypes
 from tinygrad.uop.ops import UOp, Ops
 from tinygrad.uop.symbolic import simplify_valid
 from tinygrad.helpers import Context
-from .test_uop_symbolic import check_uop_against_string
+from test.unit.test_uop_symbolic import check_uop_against_string

 def get_gated_load_uop(valid:UOp, idx:UOp):
  return UOp(Ops.LOAD, dtypes.float, (
--- a/tinygrad/codegen/init.py
+++ b/tinygrad/codegen/init.py
@@ -14,10 +14,11 @@ from tinygrad.uop.decompositions import get_late_rewrite_patterns
 from tinygrad.codegen.late.expander import migrate_indexing, expander, pm_pre_expander, pm_group_for_reduce
 from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize, pm_reduce, \
  ReduceContext, correct_load_store, pm_render
-from tinygrad.codegen.late.linearize import block_create, pm_blockend_merge, block_merge, pm_finalize, BlockContext
 from tinygrad.codegen.opt.postrange import pm_postrange_opt
 from tinygrad.codegen.simplify import pm_simplify_ranges, pm_reduce_simplify, pm_flatten_range, pm_split_ranges
 from tinygrad.schedule.rangeify import pm_add_buffers, rangeify_codegen
+#from tinygrad.codegen.late.linearize import block_create, pm_blockend_merge, block_merge, pm_finalize, BlockContext
+from tinygrad.codegen.control_flow import CFGContext, pm_merge_ends, pm_add_control_flow, linearize

@dataclass
 class RewriteStep:
@@ -30,11 +31,18 @@ class RewriteStep:

 def apply_rewrites(sink:UOp, rewrites:list[RewriteStep]): return functools.reduce(lambda x,f: f(x), rewrites, sink)

+"""
 rewrites_for_linearizer = [
  RewriteStep(block_create, ctx=BlockContext.from_sink, name="Linearizer: Create Blocks", bottom_up=True),
  RewriteStep(pm_blockend_merge, name="Linearizer: Merge Blockends"),
  RewriteStep(block_merge, name="Linearizer: Merge Blocks"),
  RewriteStep(pm_finalize, name="Linearizer: Finalize")]
+"""
+
+rewrites_for_linearizer = [
+  RewriteStep(pm_merge_ends, CFGContext, name="merge ends", bottom_up=True),
+  RewriteStep(pm_add_control_flow, CFGContext, name="add control flow starts", bottom_up=True),
+]

 def get_rewrites_for_renderer(opts:Renderer, optimize:bool=True, linearizer:bool=True) -> list[RewriteStep]:
  # cache with the values of the context vars
@@ -119,6 +127,6 @@ def full_rewrite(sink:UOp, opts:Renderer|None=None) -> list[UOp]:
    Linear program in UOps.
  """

-  lst = list(full_rewrite_to_sink(sink, opts, optimize=sink.tag is None, linearizer=True).arg.lst)
+  lst = linearize(full_rewrite_to_sink(sink, opts, optimize=sink.tag is None, linearizer=True))
  if __debug__: type_verify(lst)
  return lst
--- a/tinygrad/codegen/control_flow.py
+++ b/tinygrad/codegen/control_flow.py
@@ -0,0 +1,100 @@
+import heapq
+from collections import defaultdict
+from tinygrad.uop.ops import PatternMatcher, UOp, Ops, UPat
+
+def linearize(u:UOp) -> list[UOp]:
+  lst = list(u.toposort())
+  in_this_block = set(lst)
+  local_children: defaultdict[UOp, list[UOp]] = defaultdict(list)
+  in_degree:dict[UOp, int] = {}
+  priorities:dict[UOp, int] = {}
+
+  # get local children and assign priorities
+  # NOTE: this requires the lst be locally toposorted
+  for u in reversed(lst):
+    in_degree[u] = 0
+    for s in u.src:
+      if s in in_this_block:
+        local_children[s].append(u)
+        in_degree[u] += 1
+    # put loads in the beginning of the block and prevent priority inversion. hack for BARRIER grouping too
+    priority = [0] + [priorities[x] for x in local_children[u]]
+    if u.op is Ops.LOAD: priority.append(-1000)
+    if u.op is Ops.BARRIER: priority.append(-1500)
+    # ranges are scheduled as late as possible so anything that can be outside is
+    #if u.op is Ops.RANGE: priority = [2000]
+    # move defines and consts to the top
+    if u.op in {Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL, Ops.DEFINE_REG, Ops.DEFINE_VAR, Ops.SPECIAL, Ops.CONST}: priority.append(-2000)
+    priorities[u] = min(priority)
+
+  # number the uops in "ideal" order
+  nkey = {u:i for i,u in enumerate(sorted(lst, key=lambda x: (priorities[x],)+x.tuplize))}
+
+  # then force then to be toposorted in as close to the ideal order as possible
+  heapq.heapify(heap:=[(nkey[u],u) for u in lst if in_degree[u] == 0])
+  newlst = []
+  while heap:
+    newlst.append(u:=heapq.heappop(heap)[1])
+    for v in local_children[u]:
+      in_degree[v] -= 1
+      if in_degree[v] == 0: heapq.heappush(heap, (nkey[v],v))
+
+  assert len(newlst) == len(lst), f"len mismatch {len(newlst)} != {len(lst)}"
+  return newlst
+
+class CFGContext:
+  def __init__(self, sink:UOp):
+    # there are 3 relationships between ranges:
+    # nested, meaning endrange y is a dependency of endrange x and range x is a dependency of endrange y
+    # dependent, meaning endrange y is a dependency of endrange x and range x is not a dependency of endrange y
+    # independent, endrange y is not a dependency of endrange x
+    # everything is nested inside the sink
+    deps: dict[UOp, set[UOp]] = {}
+    nesting: dict[UOp, UOp] = {}
+    for u in sink.toposort():
+      deps[u] = set().union(*(deps[s] for s in u.src))
+      if u.op in (Ops.END, Ops.ENDIF, Ops.SINK):
+        nesting |= {x:u for x in deps[u] if x.op in (Ops.END, Ops.ENDIF) and (u.op is Ops.SINK or u.src[0] in deps[x]) and x not in nesting}
+      if u.op in (Ops.RANGE, Ops.END, Ops.IF, Ops.ENDIF): deps[u] |= {u}
+
+    self.edges: dict[UOp, UOp] = {}
+    siblings: dict[UOp, list[UOp]] = {}
+    for k,vv in nesting.items(): siblings.setdefault(vv, []).append(k)
+    for k,v in siblings.items():
+      # range/if that have dependencies on other siblings need to run after them
+      order = sorted(v, key=lambda x: len(deps[x].intersection(v)))
+      zipped = zip(order, order[1:]) if k.op is Ops.SINK else zip([k.src[0]] + order, order)
+      for x,y in zipped:
+        # TODO: is this check correct?
+        if y.src[0] not in x.backward_slice_with_self:
+          self.edges[y.src[0]] = x
+
+pm_add_control_flow = PatternMatcher([
+  (UPat((Ops.RANGE, Ops.IF), name="x"), lambda ctx,x: x.replace(src=x.src+(y,)) if (y:=ctx.edges.get(x)) is not None else None),
+])
+
+def do_merge_ends(s:UOp):
+  # NOTE: this can fail
+  stacked: dict[UOp, list[UOp]] = {}
+  dangling_ifs = []
+  for x in s.toposort():
+    if x.op in {Ops.END, Ops.ENDIF}:
+      assert x.op is not Ops.END or x.arg == 1, "ends must be single ends for linearizer"
+      stacked.setdefault(x.src[0], []).append(x)
+    if x.op is Ops.IF: dangling_ifs.append(x)
+  dangling_ifs = [x for x in dangling_ifs if x not in stacked]
+  replaces = {}
+  for k,v in stacked.items():
+    if len(v) == 1: continue
+    rep = UOp(v[0].op, src=tuple([k] + [y for x in v for y in x.src[1:]]), arg=x[0].arg)
+    for x in v: replaces[x] = rep
+  if not len(replaces) and not len(dangling_ifs): return None
+  ret = s.substitute(replaces)
+  if len(dangling_ifs):
+    assert len(dangling_ifs) == 1, "we only support 1 dangling if"
+    ret = ret.replace(src=(UOp(Ops.ENDIF, src=(dangling_ifs[0], *ret.src)),))
+  return ret
+
+pm_merge_ends = PatternMatcher([
+  (UPat(Ops.SINK, name="s"), do_merge_ends),
+])
--- a/tinygrad/codegen/gpudims.py
+++ b/tinygrad/codegen/gpudims.py
@@ -87,15 +87,15 @@ def add_gpudims(ctx:Renderer, s:UOp):
    except ValueError: continue
  return s.substitute(subs)

-def add_barrier_and_if(buf:UOp, s:UOp):
+def add_barrier_and_if(buf:UOp, e:UOp):
  # TODO: this is not generic
-  local_ranges = [x for x in s.src[1:] if x.op is Ops.RANGE and x.arg[-1] == AxisType.GROUP_REDUCE]
+  local_ranges = [x for x in e.ended_ranges if x.op is Ops.RANGE and x.arg[-1] == AxisType.GROUP_REDUCE]
  if len(local_ranges) == 0: return None
-  return buf.after(UOp(Ops.IF, dtype=dtypes.void, src=(functools.reduce(operator.and_, [x.eq(0) for x in local_ranges]), s.barrier())))
+  return buf.after(UOp(Ops.IF, dtype=dtypes.void, src=(functools.reduce(operator.and_, [x.eq(0) for x in local_ranges]), e.barrier())))

 pm_add_gpudims = PatternMatcher([
  # add gpudims must be last
  (UPat(Ops.SINK, name="s"), add_gpudims),
  # add barrier and if
-  (UPat(Ops.AFTER, src=(UPat(Ops.DEFINE_LOCAL, name="buf"), UPat(Ops.STORE, name="s"))), add_barrier_and_if),
+  (UPat(Ops.AFTER, src=(UPat(Ops.DEFINE_LOCAL, name="buf"), UPat(Ops.END, name="e"))), add_barrier_and_if),
 ])
--- a/tinygrad/codegen/late/devectorizer.py
+++ b/tinygrad/codegen/late/devectorizer.py
@@ -268,10 +268,12 @@ pm_render = PatternMatcher([
    UPat.var("a")), lambda c,idx,l,a: l.replace(src=(l.src[0], a.cast(l.dtype))+l.src[2:]).cast(a.dtype)),
  (UPat.var("c").where(UPat.var("a"), UPat(Ops.LOAD, src=(UPat().index(UPat.var("idx"), UPat.var("c").logical_not()).or_casted(),),
    allow_any_len=True, name="l").or_casted()), lambda c,idx,l,a: l.replace(src=(l.src[0], a.cast(l.dtype))+l.src[2:]).cast(a.dtype)),
-  # gate any stores that aren't gated with ifs
+  # gate any stores that aren't gated with if/endif pairs
  (UPat(Ops.STORE, src=(UPat(src=(UPat(), UPat(), UPat(dtype=dtypes.bool)), name="idx").or_casted(), UPat()), name="store", allow_any_len=True),
-    lambda store,idx: UOp(Ops.STORE, dtype=store.dtype, src=store.src[:2]+(UOp(Ops.IF, src=(idx.src[2],)),)+store.src[2:]) if \
+    lambda store,idx: UOp(Ops.ENDIF, src=(uif:=UOp(Ops.IF, src=(idx.src[2],)), UOp(Ops.STORE, src=store.src[:2]+(uif,)+store.src[2:]))) if \
      len(store.src) <= 2 or store.src[2].op != Ops.IF else None),
+  # for renderering and linearizing, all ends must end one loop
+  (UPat(Ops.END, name="e"), lambda e: e.replace(src=e.src[e.arg-1:], arg=1).end(ends=e.src[:e.arg-1]) if e.arg > 1 else None),
 ])

 # *** Ops.REDUCE -> Ops.DEFINE_ACC ***
@@ -295,8 +297,8 @@ def reduce_to_acc(ctx:ReduceContext, red:UOp):
  # if we have a range
  if len(reduce_range) != 0:
    topo = inp.toposort()
-    stored_ranges = flatten([x.src[2:] for x in topo if x.op is Ops.STORE])
-    input_ranges = tuple([x for x in topo if x.op is Ops.RANGE and x not in reduce_range and x not in stored_ranges])
+    ended_ranges = flatten([x.src[:x.arg] for x in topo if x.op is Ops.END])
+    input_ranges = tuple([x for x in topo if x.op is Ops.RANGE and x not in reduce_range and x not in ended_ranges])
    identity = red.const(red.dtype, identity_element(red.arg, red.dtype.scalar()))
    acc = UOp(Ops.DEFINE_REG, red.dtype.ptr(size=1, addrspace=AddrSpace.REG), arg=(ctx.acc_num,))
    acc_init = acc.after(*input_ranges).index(UOp.const(dtypes.int, 0)).store(identity) if len(input_ranges) else \
@@ -305,7 +307,7 @@ def reduce_to_acc(ctx:ReduceContext, red:UOp):
    ctx.acc_num += 1
  ret = functools.reduce(lambda x,y: x.alu(red.arg, y), lst)
  if len(reduce_range) == 0: return ret
-  return acc.after(acc.index(UOp.const(dtypes.int, 0)).store(ret, *reduce_range)).index(UOp.const(dtypes.int, 0)).load()
+  return acc.after(acc.index(UOp.const(dtypes.int, 0)).store(ret).end(ends=reduce_range[::-1])).index(UOp.const(dtypes.int, 0)).load()

 pm_reduce = PatternMatcher([
  # REDUCE -> DEFINE_ACC+ASSIGN
--- a/tinygrad/codegen/late/expander.py
+++ b/tinygrad/codegen/late/expander.py
@@ -87,7 +87,7 @@ expander = PatternMatcher([
   lambda outer, inner: UOp(Ops.UNROLL, outer.dtype, (inner.src[0],), inner.arg+outer.arg)),
  # do expansion
  (UPat((*GroupOp.ALU, Ops.CAST, Ops.BITCAST, Ops.GEP, Ops.WMMA, Ops.LOAD, Ops.STORE, Ops.INDEX, Ops.BUFFERIZE,
-         Ops.VECTORIZE, Ops.IF, Ops.REDUCE), name="root", custom_early_reject=set([Ops.UNROLL])), do_expand),
+         Ops.VECTORIZE, Ops.IF, Ops.REDUCE, Ops.END), name="root", custom_early_reject=set([Ops.UNROLL])), do_expand),
  (UPat(Ops.CONTRACT, name="con"), do_contract),
  # BARRIERs aren't actually expanded
  (UPat(Ops.BARRIER, src=(UPat(Ops.UNROLL, name="ex"),)),
--- a/tinygrad/codegen/opt/postrange.py
+++ b/tinygrad/codegen/opt/postrange.py
@@ -4,8 +4,8 @@ from collections import defaultdict
 from typing import cast, Final
 from tinygrad.uop.ops import PatternMatcher, UPat, Ops, UOp, KernelInfo, graph_rewrite, AxisType, ssimplify, GroupOp
 from tinygrad.device import Buffer
-from tinygrad.dtype import AddrSpace, dtypes, ImageDType
-from tinygrad.helpers import colored, BEAM, getenv, DEBUG, to_function_name, NOOPT, argsort, round_up, prod, merge_dicts, get_single_element, flatten
+from tinygrad.dtype import dtypes, ImageDType
+from tinygrad.helpers import colored, BEAM, getenv, DEBUG, to_function_name, NOOPT, argsort, round_up, prod, merge_dicts, get_single_element
 from tinygrad.codegen.opt import axis_colors, Opt, OptOps, KernelOptError, check, axis_letters
 from tinygrad.codegen.simplify import pm_flatten_range
 from tinygrad.renderer import Renderer
@@ -64,21 +64,8 @@ class Scheduler:
    return self.ast.replace(arg=KernelInfo(name=name, applied_opts=tuple(self.applied_opts), dont_use_locals=self.dont_use_locals), tag=1)

  def _globalizable_rngs(self) -> list[UOp]:
-    store_rngs = self.ast.src[0].src[2:]
-
-    # filter any not in local stores
-    local_store_rngs = [x.ranges for x in self.ast.toposort() if (x.op is Ops.STORE and x.src[0].ptrdtype.addrspace == AddrSpace.LOCAL) \
-                        or (x.op is Ops.BUFFERIZE and x.arg == AddrSpace.LOCAL)]
-    for ls in local_store_rngs: store_rngs = tuple([x for x in store_rngs if x in ls])
-
-    # filter any not in reduces
-    # TODO: enable this
-    """
-    reduce_rngs = [x.ranges for x in self.ast.toposort() if x.op is Ops.REDUCE]
-    for ls in reduce_rngs: store_rngs = tuple([x for x in store_rngs if x in ls])
-    """
-
-    return [x for x in UOp.sink(*store_rngs).toposort() if x.op is Ops.RANGE and x.arg[-1] == AxisType.LOOP] if store_rngs else []
+    # all ranges that end before any STOREs
+    return [x for x in self.ast.toposort(lambda x: x.op is not Ops.STORE) if x.op is Ops.RANGE and x not in self.ast.ranges]

  def convert_loop_to_global(self):
    if not self.opts.has_local: return None
@@ -89,11 +76,11 @@ class Scheduler:
    self.ast = self.ast.substitute(dict(zip(self.rngs, rng)))

  def colors(self) -> list[str]:
-    store_rngs = flatten([x.src[2:] for x in self.ast.src])
+    globalizible_rngs = self._globalizable_rngs()
    ret = []
    for x,r in zip(self.axis_types, self.rngs):
      if self.dont_use_locals and x == AxisType.GLOBAL: ret.append("BLUE")
-      elif r not in store_rngs and x == AxisType.LOOP: ret.append("BLACK")
+      elif r not in globalizible_rngs and x == AxisType.LOOP: ret.append("BLACK")
      else: ret.append(axis_colors[x])
    return ret
  def colored_shape(self) -> str: return ' '.join([colored(f'{x.src[0].render():>4s}', color) for x,color in zip(self.rngs, self.colors())])
--- a/tinygrad/codegen/simplify.py
+++ b/tinygrad/codegen/simplify.py
@@ -13,14 +13,16 @@ def flatten_range(r:UOp):
 pm_flatten_range = PatternMatcher([
  # real ranges only
  (UPat((Ops.REDUCE, Ops.STORE), name="r"), flatten_range),
+  # END is only on RANGES. TODO: this is copied from symbolic
+  (UPat(Ops.END, name="e"), lambda e: UOp.end(*e.src[e.arg:], ends=sorted(UOp.sink(*e.src[:e.arg]).ranges, key=lambda x: x.arg))),
 ])

 def count_divmod(x:UOp): return len([u for u in x.toposort() if u.op in {Ops.IDIV, Ops.MOD}])
 def simplify_merge_adjacent(u:UOp) -> UOp|None:
  reduce_ranges = [x.ranges for x in u.backward_slice_with_self if x.op is Ops.REDUCE]
-  i = range_start[u.op]
-  while i < len(u.src)-1:
-    r0, r1 = u.src[i], u.src[i+1]
+  i = 0
+  while i < len(u.ended_ranges)-1:
+    r0, r1 = u.ended_ranges[i], u.ended_ranges[i+1]
    # check same type
    if r0.arg[-1] == r1.arg[-1]:
      # check if the ranges to merge are in the same reduces
@@ -39,7 +41,7 @@ def simplify_merge_adjacent(u:UOp) -> UOp|None:
  return u

 pm_simplify_ranges = PatternMatcher([
-  (UPat((Ops.STORE, Ops.REDUCE), name="u"), simplify_merge_adjacent),
+  (UPat((Ops.END, Ops.REDUCE), name="u"), simplify_merge_adjacent),
 ])

 def mark_range_mod(ctx, r:UOp, c:UOp):
@@ -57,7 +59,7 @@ def do_substitute(ctx, x: UOp):

 def dont_sub_ranges_for_image(ctx, x:UOp):
  if isinstance(x.src[0].dtype, ImageDType):
-    for s in x.src[1:]: ctx[s] = None
+    for s in x.src[0].ranges: ctx[s] = None

 pm_split_ranges = PatternMatcher([
  (UPat(Ops.RANGE, name="r")%UPat.cvar("c"), mark_range_mod),
--- a/tinygrad/renderer/init.py
+++ b/tinygrad/renderer/init.py
@@ -28,10 +28,11 @@ class Estimates:
    mult_stack: list[sint] = []
    dont_count: set[UOp] = set()
    if ignore_indexing:
+      def range_gate(x): return x.op is not Ops.RANGE
      for u in uops:
        if u.op in {Ops.LOAD, Ops.STORE} and (not isinstance(u.src[0].dtype, PtrDType) or u.src[0].dtype.addrspace != AddrSpace.REG):
          # if u.src[0] is INDEX, we have to include the buffer since it might be an AFTER
-          dont_count = dont_count.union((UOp.sink(*u.src[0].src[1:]) if u.src[0].op is Ops.INDEX else u.src[0]).toposort())
+          dont_count = dont_count.union((UOp.sink(*u.src[0].src[1:]) if u.src[0].op is Ops.INDEX else u.src[0]).toposort(range_gate))
          # TODO: is this correct? this all needs to be cleaned up
          if len(u.src) > 2: dont_count = dont_count.union(u.src[2].toposort())
        elif u.op is Ops.IF:
--- a/tinygrad/renderer/ptx.py
+++ b/tinygrad/renderer/ptx.py
@@ -115,7 +115,7 @@ string_rewrite = PatternMatcher([
     if x.dtype.count > 1 else f"ld.{mem_type(x)}.{ctx.mem_types[x.dtype]} {ctx.r[x]}, [{ctx.r[loc]}+0];"),
  (UPat(Ops.DEFINE_REG, src=()), lambda ctx: []),
  (UPat(Ops.RANGE, name="x"), lambda ctx, x: [f"mov.u32 {ctx.r[x]}, 0;", "LOOP_" + f"{ctx.r[x][1:]}:"]),
-  (UPat(Ops.END, name="x", src=(UPat.var("src0"),)), lambda ctx, x, src0: [
+  (UPat(Ops.END, name="x", src=(UPat.var("src0"),), allow_any_len=True), lambda ctx, x, src0: [
    ctx.code_for_op[Ops.ADD](ctx.r[src0], ctx.r[src0], "1", dtypes.int, ctx.types[dtypes.int]),
    ctx.code_for_op[Ops.CMPLT](ctx.r[x], ctx.r[x.src[0]], ctx.r[src0.src[0]], dtypes.int, ctx.types[dtypes.int]),
    f"@{ctx.r[x]} bra LOOP_{ctx.r[src0][1:]};"]),
--- a/tinygrad/schedule/rangeify.py
+++ b/tinygrad/schedule/rangeify.py
@@ -276,7 +276,7 @@ def bufferize_to_store(x:UOp):
    assert assign_target.op is Ops.INDEX, f"{assign_target.op} is not index"
    # in assign, this is the buffer size, not the bufferize size
    # TODO: assign_mops here
-    do_store = assign_target.replace(dtype=sdtype).store(assign_src, *rngs).replace(tag=x.tag)
+    do_store = assign_target.replace(dtype=sdtype).store(assign_src).replace(tag=x.tag).end(ends=[x for x in rngs if x.op is Ops.RANGE])
    ret = assign_target.src[0].after(do_store)
    mops = []
    walk = assign_mops
@@ -289,7 +289,7 @@ def bufferize_to_store(x:UOp):
  # NOTE: the DEFINE_LOCAL needs to be disambiguated here
  if sdtype.addrspace == AddrSpace.GLOBAL:
    buf = UOp.new_buffer(x.arg.device, size, x.dtype)
-    do_store = buf.reshape(shape).index(*rngs, dtype=sdtype).store(x.src[0], *rngs).replace(tag=x.tag)
+    do_store = buf.reshape(shape).index(*rngs, dtype=sdtype).store(x.src[0]).replace(tag=x.tag).end(ends=[x for x in rngs if x.op is Ops.RANGE])
    ret = buf.after(do_store).forced_reshape(shape)
    # TODO: is this right? what if it's offset
    if any(r.op is Ops.RANGE and r.src[0].op is not Ops.CONST for r in rngs):
@@ -301,7 +301,8 @@ def bufferize_to_store(x:UOp):
  tag = x.arg.device
  if tag is None: tag = UOp.unique().arg # TODO: hack
  buf = UOp(Ops.DEFINE_LOCAL, sdtype, arg=tag)
-  return buf.after(buf.reshape(shape).index(*rngs, dtype=sdtype).store(x.src[0], *rngs)).reshape(shape)
+  do_store = buf.reshape(shape).index(*rngs, dtype=sdtype).store(x.src[0]).end(ends=[x for x in rngs if x.op is Ops.RANGE])
+  return buf.after(do_store).reshape(shape)

 pm_add_buffers = pm_mops+to_bufferview+PatternMatcher([
  (UPat(Ops.BUFFERIZE, name="x"), bufferize_to_store),
@@ -412,7 +413,6 @@ class Kernel:

 def split_store(ctx:list[UOp], x:UOp) -> UOp|None:
  if len(x.ranges): return None
-  if x.src[0].ptrdtype.addrspace is AddrSpace.LOCAL: return None

  # local kernel rewrite
  lctx = LocalAddBufferContext()
@@ -422,8 +422,14 @@ def split_store(ctx:list[UOp], x:UOp) -> UOp|None:
  metadatas = [ctx[y].metadata for y in lctx.parent_tags]

  # NOTE: the hack for COPY is here
-  ret = ret.sink(arg=KernelInfo(opts_to_apply=lctx.opts) if lctx.opts is not None else None) \
-    if ret.src[1].op not in {Ops.COPY, Ops.BUFFER_VIEW} else ret.src[1]
+  for u in ret.toposort():
+    # TODO: this can be wrong if there's multiple of these
+    if u.op in {Ops.COPY, Ops.BUFFER_VIEW}:
+      ret = u
+      break
+  else:
+    ret = ret.sink(arg=KernelInfo(opts_to_apply=lctx.opts) if lctx.opts is not None else None)
+
  kernel_arg = Kernel(ret,tuple(dedup(flatten([x for x in metadatas if x is not None])))[::-1])
  kernel = UOp(Ops.KERNEL, src=tuple(lctx.map.values())+tuple(lctx.vars.keys()), arg=kernel_arg)
  if ret.op is Ops.SINK and not all_same([x.device for x in kernel.src if x.op is not Ops.BIND]):
@@ -431,7 +437,7 @@ def split_store(ctx:list[UOp], x:UOp) -> UOp|None:
  return kernel

 split_kernels = PatternMatcher([
-  (UPat(Ops.STORE, name="x"), split_store),
+  (UPat((Ops.STORE, Ops.END), name="x"), split_store),
 ])

 def tag_uop(ctx:list[UOp], x:UOp):
--- a/tinygrad/uop/ops.py
+++ b/tinygrad/uop/ops.py
@@ -190,7 +190,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
      case Ops.DEFINE_GLOBAL | Ops.DEFINE_LOCAL | Ops.DEFINE_REG: return (self.ptrdtype.size,)

      # passthrough ops
-      case Ops.REDUCE | Ops.MSTACK | Ops.MSELECT | Ops.DETACH | Ops.CONTIGUOUS | Ops.CONTIGUOUS_BACKWARD | Ops.FUSE | Ops.AFTER:
+      case Ops.REDUCE | Ops.MSTACK | Ops.MSELECT | Ops.DETACH | Ops.CONTIGUOUS | Ops.CONTIGUOUS_BACKWARD | Ops.FUSE | Ops.AFTER | Ops.END:
        return self.src[0]._shape

      # ops with custom handling
@@ -276,6 +276,10 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
      for s in self.src[:range_start[self.op]]: ret.update(s.ranges)
      for s in UOp.sink(*self.src[range_start[self.op]:]).ranges:
        if s in ret: del ret[s]
+    elif self.op is Ops.END:
+      for s in self.src[self.arg:]: ret.update(s.ranges)
+      for s in UOp.sink(*self.src[:self.arg]).ranges:
+        if s in ret: del ret[s]
    else:
      for s in self.src: ret.update(s.ranges)
    return ret
@@ -285,6 +289,13 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
    if self.op is Ops.RANGE: return {self:None}
    return self._ranges

+  @functools.cached_property
+  def ended_ranges(self):
+    match self.op:
+      case Ops.REDUCE: return self.src[1:]
+      case Ops.END: return self.src[:self.arg]
+      case _: raise RuntimeError(f"{self.op} doesn't end ranges")
+
  # *** uop evaluation ***

  def simplify(self, tracked=False, full_symbolic=True):
@@ -350,6 +361,9 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
    return UOp(Ops.GEP, self.dtype.scalar().vec(len(i)) if len(i) > 1 else self.dtype.scalar(), (self,), i)
  def load(self, *src:UOp, **kwargs): return UOp(Ops.LOAD, dtype=kwargs.pop("dtype", self.dtype.base), src=(self,)+src, **kwargs)
  def store(self, *src:UOp, **kwargs): return UOp(Ops.STORE, kwargs.pop("dtype", dtypes.void), (self,)+src, **kwargs)
+  def end(self, *src:UOp, ends:Sequence[UOp]):
+    if len(ends) == 0: return self
+    return UOp(Ops.END, src=(*ends, self, *src), arg=len(ends))
  def after(self, *src:UOp): return UOp(Ops.AFTER, self.dtype, (self,)+src)
  def assign(self, x:UOp): return UOp(Ops.ASSIGN, self.dtype, (self, x))
  def barrier(self, *src:UOp): return UOp(Ops.BARRIER, src=(self,)+src)
--- a/tinygrad/uop/spec.py
+++ b/tinygrad/uop/spec.py
@@ -159,8 +159,9 @@ spec = PatternMatcher([
  (UPat(Ops.DEFINE_REG, src=()), lambda: True),
  (UPat(Ops.DEFINE_VAR, name="x"), lambda x: isinstance(x.arg[1], int) and isinstance(x.arg[2], int)),

-  (UPat(Ops.RANGE, src=(UPat.var("x"),), name="rng"), lambda rng,x: rng.dtype == x.dtype and isinstance(rng.arg, tuple) and len(rng.arg) >= 2 and \
-     all(isinstance(ra, int) for ra in rng.arg[0:-1]) and isinstance(rng.arg[-1], AxisType)),
+  (UPat(Ops.RANGE, src=(UPat.var("x"),), allow_any_len=True, name="rng"), lambda rng,x:
+    rng.dtype == x.dtype and isinstance(rng.arg, tuple) and len(rng.arg) >= 2 and \
+      all(isinstance(ra, int) for ra in rng.arg[0:-1]) and isinstance(rng.arg[-1], AxisType)),
  (UPat(Ops.SPECIAL, src=(UPat.var("x"),), name="s"), lambda s,x: s.dtype == x.dtype == dtypes.int32 and isinstance(s.arg, str)),

  (UPat(Ops.CONST, src=(), name="x"), lambda x: type(x.arg) is type(dtypes.as_const(x.arg, x.dtype))),
@@ -195,7 +196,7 @@ spec = PatternMatcher([
  (UPat((Ops.IDIV, Ops.MOD), name="x"), lambda x: None if dtypes.is_int(x.dtype) else False),
  (UPat(GroupOp.ALU, name="x"), lambda x: all(x.dtype.base == y.dtype.base for y in x.src)),

-  (UPat(Ops.END, dtype=dtypes.void, src=(UPat(Ops.RANGE),)), lambda: True),
+  (UPat(Ops.END, dtype=dtypes.void), lambda: True),

  # WMMA has a <a, b, acc>
  (UPat(Ops.WMMA, src=(UPat(), UPat(), UPat()), name="x"), lambda x: isinstance(x.arg, tuple) and len(x.arg) == 8),
@@ -203,9 +204,8 @@ spec = PatternMatcher([
  (UPat(Ops.UNROLL, name="x"), lambda x: x.src[0].dtype.count == prod(y[1] for y in x.arg)),

  # if has a <gate, barrier?>
-  (UPat(Ops.IF, dtype=dtypes.void, src=(UPat(),)), lambda: True),
-  (UPat(Ops.IF, dtype=dtypes.void, src=(UPat(), UPat(Ops.BARRIER))), lambda: True),
-  (UPat(Ops.ENDIF, dtype=dtypes.void, src=(UPat(Ops.IF),)), lambda: True),
+  (UPat(Ops.IF, dtype=dtypes.void, src=(UPat(),), allow_any_len=True), lambda: True),
+  (UPat(Ops.ENDIF, dtype=dtypes.void, src=(UPat(Ops.IF),), allow_any_len=True), lambda: True),

  (UPat(Ops.REDUCE_AXIS, name="x"), lambda x: isinstance(x.arg, tuple) and len(x.arg) >= 2 and x.arg[0] in {Ops.ADD, Ops.MUL, Ops.MAX}),
  (UPat(Ops.GEP, src=(UPat.var("src"),), name="gep"), lambda gep,src: gep.dtype == src.dtype.scalar()),
--- a/tinygrad/uop/symbolic.py
+++ b/tinygrad/uop/symbolic.py
@@ -379,9 +379,11 @@ symbolic = symbolic_simple+commutative+PatternMatcher([
  ((UPat.var("x", dtypes.index) + UPat.cvar("c")).cast(dtypes.sints, name="cast"), lambda x,c,cast:x.cast(cast.dtype)+c.cast(cast.dtype)),
  # only RANGE/IF/STORE/KERNEL have side effects
  (UPat(Ops.AFTER, name="x"), lambda x: x.replace(src=(x.src[0],)+
-    tuple(flatten([(y,) if y.op in {Ops.RANGE, Ops.IF, Ops.STORE, Ops.KERNEL, Ops.BARRIER} else y.src for y in x.src[1:]])))),
+    tuple(flatten([(y,) if y.op in {Ops.RANGE, Ops.IF, Ops.STORE, Ops.KERNEL, Ops.BARRIER, Ops.END} else y.src for y in x.src[1:]])))),
  # after with 1 src is just src[0]
  (UPat(Ops.AFTER, src=(UPat.var("s"),)), lambda s: s),
+  # END is only on RANGES
+  (UPat(Ops.END, name="e"), lambda e: UOp.end(*e.src[e.arg:], ends=sorted(UOp.sink(*e.src[:e.arg]).ranges, key=lambda x: x.arg))),
 ])+gep_pushing

 symbolic_flat = symbolic+PatternMatcher([
--- a/tinygrad/viz/serve.py
+++ b/tinygrad/viz/serve.py
@@ -71,7 +71,7 @@ def uop_to_json(x:UOp, ignore_indexing=False) -> dict[int, dict]:
    if u.op in GroupOp.Movement: argst = (mask_to_str if u.op in {Ops.SHRINK, Ops.PAD} else shape_to_str)(u.marg)
    label = f"{str(u.op).split('.')[1]}{(chr(10)+word_wrap(argst.replace(':', ''))) if u.arg is not None else ''}"
    if u.dtype != dtypes.void: label += f"\n{u.dtype}"
-    for idx,x in enumerate(u.src[:1] if u.op in {Ops.BUFFERIZE, Ops.INDEX} else u.src):
+    for idx,x in enumerate(u.src[:1] if u.op in {Ops.BUFFERIZE, Ops.INDEX} else (u.src if u.op is not Ops.END else [])):
      if x in excluded:
        arg = f"{x.arg:g}" if x.op is Ops.CONST and dtypes.is_float(x.dtype) else f"{x.arg}"
        label += f"\n{x.op.name}{idx} {arg}" + (f" {x.src[0].op}" if len(x.src) else "")
@@ -82,6 +82,8 @@ def uop_to_json(x:UOp, ignore_indexing=False) -> dict[int, dict]:
        label += f"\n{shape_to_str(u.shape)}"
      if u.op in {Ops.INDEX, Ops.BUFFERIZE}:
        label += f"\n{u.render()}"
+      if u.op is Ops.END:
+        label += "\n"+' '.join([f"{colored(u.src[i].arg[0], axis_colors[u.src[i].arg[-1]])}({u.src[i].vmax+1})" for i in range(u.arg)])
    except Exception:
      label += "\n<ISSUE GETTING LABEL>"
    if (ref:=ref_map.get(u.arg.ast) if u.op is Ops.KERNEL else None) is not None: label += f"\ncodegen@{ctxs[ref]['name']}"