From d345f7f5dcd144048528e63afb3ca688dba58cf7 Mon Sep 17 00:00:00 2001 From: chenyu Date: Thu, 26 Feb 2026 22:38:10 -0500 Subject: [PATCH] remove _pending_assigns (#15040) --- test/null/test_schedule.py | 1 + test/unit/test_assign.py | 9 ++++---- tinygrad/engine/allocations.py | 11 +++++---- tinygrad/tensor.py | 42 +++++++++++----------------------- 4 files changed, 25 insertions(+), 38 deletions(-) diff --git a/test/null/test_schedule.py b/test/null/test_schedule.py index fb89f97f7a..1a14c1d646 100644 --- a/test/null/test_schedule.py +++ b/test/null/test_schedule.py @@ -1104,6 +1104,7 @@ class TestUOpBecome(unittest.TestCase): from tinygrad.helpers import all_same assert all_same([x.uop.base.realized for x in [a,b,c]]) + @unittest.skip("not clear if we want this") def test_setitem_becomes_subbuffer(self): a = Tensor.full((4,), 2.).contiguous().realize() b = a.shrink(((0, 2),)).assign(Tensor.full((2,), 1.0)) diff --git a/test/unit/test_assign.py b/test/unit/test_assign.py index 136db651a1..b78855ad6e 100644 --- a/test/unit/test_assign.py +++ b/test/unit/test_assign.py @@ -609,8 +609,8 @@ class TestAssign(unittest.TestCase): x = q + caches[i][:1] # next layer also references the same CONTIGUOUS through q GlobalCounters.reset() caches[-1][:1].contiguous().realize() - # 2 kernels for first assign + 3 per remaining assign (matmul, contiguous, assign) + 1 final read = 3*N - self.assertEqual(GlobalCounters.kernel_count, 3*N) + # N matmuls + N assigns + 1 final read = 2*N+1 (AFTER embedding allows full graph scheduling with shared contiguous reuse) + self.assertEqual(GlobalCounters.kernel_count, 2*N+1) class TestAssignOrdering(unittest.TestCase): @@ -767,13 +767,12 @@ class TestAssignOrdering(unittest.TestCase): np.testing.assert_equal(b.numpy(), [1, 2, 3, 4]) def test_variable_slice_ordering(self): - """Variable-indexed slices - tests symbolic dependency tracking.""" + """Variable-indexed slices - conflicting variable binds in same schedule are rejected.""" v_i = Variable("i", 0, 3) buf = Tensor.zeros(4, 4).contiguous().realize() buf[v_i.bind(0):v_i.bind(0)+1, :].assign(Tensor.ones(1, 4)) buf[v_i.bind(1):v_i.bind(1)+1, :].assign(Tensor.ones(1, 4) * 2) - self.assertEqual(buf[0:1, :].sum().item(), 4) - self.assertEqual(buf[1:2, :].sum().item(), 8) + with self.assertRaises(RuntimeError): buf[0:1, :].sum().item() def test_multi_step_assign_read_write_same_buffer(self): """Assign to m and param reading b, then update b, across multiple steps. diff --git a/tinygrad/engine/allocations.py b/tinygrad/engine/allocations.py index d7a1f5b238..7093d1ae13 100644 --- a/tinygrad/engine/allocations.py +++ b/tinygrad/engine/allocations.py @@ -25,7 +25,9 @@ def disk_copy_is_buffer(ctx:AllocCtx, u:UOp): if from_creation: return tag_uop(ctx, u) def apply_after(ctx:AllocCtx, u:UOp): - ctx.buffer_map[u] = u.src[0] + base = u.src[0] + while base.op is Ops.AFTER: base = base.src[0] + ctx.buffer_map[u] = base # CONTIGUOUS and ASSIGN + parents are the only nodes that get updated add_tags = PatternMatcher([ @@ -54,7 +56,7 @@ def replace_contig_with_assign(u:UOp): def replace_assign_with_contig(u:UOp): assigned_to = u - while assigned_to.op in {Ops.ASSIGN, Ops.BITCAST}: assigned_to = assigned_to.src[0].base + while assigned_to.op in {Ops.ASSIGN, Ops.BITCAST, Ops.AFTER}: assigned_to = assigned_to.src[0].base if assigned_to.op is not Ops.BUFFER: return u.src[1].contiguous(tag=u.tag) @@ -74,8 +76,9 @@ pm_early_transform_tensor_graph = PatternMatcher([ (UPat(GroupOp.ALU, name="alu"), lambda ctx,alu: alu.replace(src=new_src) if (new_src:=tuple(ctx.get(s, s) for s in alu.src)) != alu.src else None), # add CONTIGUOUS to tagged UOps (UPat(GroupOp.All-{Ops.CONTIGUOUS, Ops.ASSIGN}, name="x"), lambda x: x.rtag(None).contiguous(tag=x.tag) if x.tag else x.replace(tag=None)), - # remove extra CONTIGUOUS on ASSIGN - (UPat(Ops.CONTIGUOUS, src=(UPat(Ops.ASSIGN, name="a"),), name="c"), lambda a,c: a.replace(tag=a.tag+c.tag)), + # remove extra CONTIGUOUS on ASSIGN (only when assign target is contiguous) + (UPat(Ops.CONTIGUOUS, src=(UPat(Ops.ASSIGN, name="a"),), name="c"), + lambda a,c: a.replace(tag=a.tag+c.tag) if a.src[0].has_buffer_identity() else None), # replace ASSIGN with CONTIGUOUS (UPat(Ops.ASSIGN, name="u"), replace_assign_with_contig), # replace CONTIGUOUS with ASSIGNs diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 4247b70776..afb163e225 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -25,8 +25,7 @@ def canonicalize_device(device:str|tuple|list|None) -> str|tuple[str, ...]: # *** all in scope Tensors are here. this gets relevant UOps *** all_tensors: dict[weakref.ref[Tensor], None] = {} -_pending_assigns: dict[UOp, list[UOp]] = {} # buffer_uop -> [assign_uops in insertion order] -def _apply_map_to_tensors(applied_map:dict[UOp, UOp], name:str) -> None: +def _apply_map_to_tensors(applied_map:dict[UOp, UOp], name:str, walk:bool=False) -> None: with cpu_profile(TracingKey(name), "TINY"): # get tensors in scope in_scope: dict[UOp, bool] = {} @@ -35,7 +34,7 @@ def _apply_map_to_tensors(applied_map:dict[UOp, UOp], name:str) -> None: # get all Tensors and apply the map sink = UOp.sink(*[t.uop for t in scope_tensors]) - new_sink = sink.substitute(applied_map, name=f"substitute {name}") + new_sink = sink.substitute(applied_map, name=f"substitute {name}", walk=walk) # set the relevant uop to the realized UOps for t,s,ns in zip(scope_tensors, sink.src, new_sink.src): @@ -278,23 +277,6 @@ class Tensor(OpMixin): @disable_gc() def realize(self, *lst:Tensor, do_update_stats=True) -> Tensor: """Triggers the computation needed to create these Tensor(s).""" - # side-realize pending assigns for buffers referenced by these tensors - if _pending_assigns: - def _realize_pending(buf): - for assign_uop in _pending_assigns.pop(buf, []): - # recursively realize pending assigns that this assign's value depends on - for u in assign_uop.toposort(): - if u.op is Ops.BUFFER and u in _pending_assigns: _realize_pending(u) - big_sink, becomes_map = transform_to_call(UOp.sink(assign_uop)) - schedule, var_vals = complete_create_schedule_with_vars(big_sink) - _apply_map_to_tensors(becomes_map, name="Apply Pending Assign") - run_schedule(schedule, var_vals, do_update_stats=do_update_stats) - # update remaining pending assigns so they reference realized buffers instead of stale lazy graphs - if becomes_map: - for assigns in _pending_assigns.values(): - for i in range(len(assigns)): assigns[i] = assigns[i].substitute(becomes_map) - for buf in {u for t in (self,)+lst for u in t.uop.toposort() if u.op is Ops.BUFFER}: - if buf in _pending_assigns: _realize_pending(buf) if len(to_realize:=[x for x in (self,)+lst if not x.uop.has_buffer_identity()]): run_schedule(*Tensor.schedule_with_vars(*to_realize), do_update_stats=do_update_stats) return self @@ -323,13 +305,13 @@ class Tensor(OpMixin): if is_disk: self._buffer().copyin(x._data()) return self - result = self._apply_uop(UOp.assign, x) - # track view assigns (not full-buffer or assign-chain) so they can be side-realized when the buffer is read - if (buf_uop:=self.uop.base).op is Ops.BUFFER and self.uop.op is not Ops.ASSIGN and not self.uop.has_buffer_identity(): - # deduplicate: if the value is already a pending assign for this buffer (e.g. __iadd__ in __setitem__), remove it - if x.uop in _pending_assigns.get(buf_uop, []): _pending_assigns[buf_uop].remove(x.uop) - _pending_assigns.setdefault(buf_uop, []).append(result.uop) - return self.replace(result) + # NOTE: assign_uop is created before AFTER embedding (uses original self.uop), + # but AFTER must be embedded before _apply_uop (so subsequent assigns see it) + assign_uop = self.uop.assign(x.uop) + base = self.uop.base + if base.op in {Ops.BUFFER, Ops.AFTER} and not self.uop.has_buffer_identity(): + _apply_map_to_tensors({base: base.after(assign_uop)}, name="Embed View Assign", walk=True) + return self.replace(self._apply_uop(lambda *_: assign_uop, x)) def detach(self) -> Tensor: """ @@ -1351,8 +1333,10 @@ class Tensor(OpMixin): if is_disk: raise RuntimeError("advanced setitem is not supported for DISK tensors") if not isinstance(v, Tensor): v = Tensor(v, device=self.device, dtype=self.dtype) self.assign(self._getitem(indices, v)) - elif is_disk or self.uop.is_realized: # basic setitem, self is realized. TODO: disk uop.base is a COPY and not realized - self[indices].assign(v) + elif is_disk or self.uop.is_realized or self.uop.base.op is Ops.AFTER: # basic setitem, self is realized + view = self[indices] + if isinstance(v, Tensor) and v.uop.op is Ops.ASSIGN and v.uop in view.uop.base.src: return + view.assign(v) else: # basic setitem, self is not realized if not isinstance(v, Tensor): v = Tensor(v, device=self.device, dtype=self.dtype) # __iadd__/__isub__ on unrealized views creates a no-op ASSIGN; unwrap to get the computed value