From 415b83ba18d52cdcb23591fedc5134353514b6be Mon Sep 17 00:00:00 2001
From: George Hotz <geohot@gmail.com>
Date: Fri, 2 Jan 2026 15:47:39 -0800
Subject: [PATCH] tests pass

---
 extra/assembly/amd/emu.py                    |  93 ++++-
 extra/assembly/amd/test/test_sqtt_compare.py | 392 +++++++++++++++++++
 2 files changed, 470 insertions(+), 15 deletions(-)

diff --git a/extra/assembly/amd/emu.py b/extra/assembly/amd/emu.py
index 53ce50ae86..4c84c38f5d 100644
--- a/extra/assembly/amd/emu.py
+++ b/extra/assembly/amd/emu.py
@@ -498,10 +498,15 @@ class SQTTState:
     self.vgpr_chain_depth: dict[int, int] = {}
     # Pending ALUEXEC completions: list of (completion_cycle, alu_src)
     self.pending_aluexec: list[tuple[int, AluSrc]] = []
+    # Deferred ALUEXECs at exhaustion boundary: list of (src_vgpr, alu_src, chain_depth, dst_reg)
+    # src_vgpr is the VGPR index we depend on; we look up vgpr_ready[src_vgpr] when processing
+    self.deferred_aluexec: list[tuple[int, AluSrc, int, int | None]] = []
     # Track last VALU dispatch cycle for s_nop pipeline delay
     self.last_valu_dispatch: int = -100
     # Track s_nop issue penalty for next VALU (bypass + extra stall)
     self.snop_issue_penalty: int = 0
+    # Track IMMEDIATE cycles (need 2+ IMMEDIATEs after source_ready to keep network warm)
+    self.immediate_cycles: list[int] = []
 
   def emit(self, pkt_class, **kwargs):
     self.packets.append(pkt_class(_time=self.cycle, **kwargs))
@@ -513,11 +518,54 @@ class SQTTState:
 
   def tick(self):
     """Emit any completing ALUEXECs at current cycle, then advance."""
+    # Check if deferred ALUEXEC can be scheduled
+    # There's at most one active deferred at a time; others wait for their source
+    if self.deferred_aluexec:
+      src_vgpr, alu_src, chain_depth, dst_reg = self.deferred_aluexec[0]
+      if src_vgpr == -1:
+        # Independent instruction waiting for previous deferred - schedule immediately after last pending
+        if self.pending_aluexec:
+          last_pending = max(c for c, _ in self.pending_aluexec)
+          completion_cycle = last_pending + 1
+        else:
+          completion_cycle = self.cycle
+        self._schedule_aluexec(completion_cycle, alu_src, dst_reg)
+        self.deferred_aluexec.pop(0)
+      else:
+        source_ready = self.vgpr_ready.get(src_vgpr, 0)
+        if source_ready <= self.cycle:
+          # Check if 2+ IMMEDIATEs happened after source became ready (keeps forwarding warm)
+          imm_count = sum(1 for c in self.immediate_cycles if c > source_ready)
+          if imm_count >= 2:
+            # Forwarding works: 5-cycle latency from source_ready
+            completion_cycle = source_ready + VALU_LATENCY - 1
+            self._schedule_aluexec(completion_cycle, alu_src, dst_reg)
+            self.deferred_aluexec.pop(0)
+          elif self.cycle >= source_ready + 2:
+            # No IMMEDIATE for 2+ cycles after source ready: forwarding exhausted
+            completion_cycle = source_ready + FORWARD_DEEP_LATENCY
+            self._schedule_aluexec(completion_cycle, alu_src, dst_reg)
+            self.deferred_aluexec.pop(0)
+          # else: keep waiting to see if an IMMEDIATE comes
+
+    # Emit any completing ALUEXECs
     while self.pending_aluexec and self.pending_aluexec[0][0] <= self.cycle:
       _, alu_src = self.pending_aluexec.pop(0)
       self.emit(ALUEXEC, src=alu_src)
     self.cycle += 1
 
+  def _schedule_aluexec(self, completion_cycle: int, alu_src: AluSrc, dst_reg: int | None):
+    """Schedule an ALUEXEC at the given completion cycle, ensuring proper ordering."""
+    # Ensure spacing from other pending ALUEXECs
+    if self.pending_aluexec:
+      last_pending = max(c for c, _ in self.pending_aluexec)
+      completion_cycle = max(completion_cycle, last_pending + 1)
+    self.pending_aluexec.append((completion_cycle, alu_src))
+    self.pending_aluexec.sort(key=lambda x: x[0])
+    # Update vgpr_ready for dependent instructions
+    if dst_reg is not None:
+      self.vgpr_ready[dst_reg] = completion_cycle
+
   def _get_valu_src_regs(self, inst: Inst) -> list[int]:
     """Extract source VGPR indices from a VALU instruction."""
     src_vgprs = []
@@ -565,6 +613,7 @@ class SQTTState:
       snop_delay = N + imm_extra_delay + bypass_penalty
       for _ in range(snop_delay): self.tick()
       self.emit(IMMEDIATE, wave=self.wave_id)
+      self.immediate_cycles.append(self.cycle)  # track for warmth check
       self.tick()
 
       # Track issue delay for next instruction (affects when next VALU can dispatch)
@@ -579,7 +628,9 @@ class SQTTState:
       # Find source VGPRs and check when they're ready
       src_vgprs = self._get_valu_src_regs(inst)
       has_vgpr_src = len(src_vgprs) > 0 and any(r in self.vgpr_ready for r in src_vgprs)
-      source_ready = max((self.vgpr_ready.get(r, 0) for r in src_vgprs), default=0)
+      # Find the source VGPR with the latest ready time
+      src_vgpr_max = max((r for r in src_vgprs if r in self.vgpr_ready), key=lambda r: self.vgpr_ready[r], default=None)
+      source_ready = self.vgpr_ready.get(src_vgpr_max, 0) if src_vgpr_max is not None else 0
 
       # Calculate chain depth: max depth of any source VGPR + 1
       chain_depth = max((self.vgpr_chain_depth.get(r, 0) for r in src_vgprs), default=0) + 1 if has_vgpr_src else 0
@@ -597,15 +648,20 @@ class SQTTState:
       # Chain depth affects forwarding:
       # - depth 1: 6-cycle forwarding (producer wrote to bypass, consumer reads it)
       # - depth 2-4: 5-cycle forwarding (result already in bypass network, faster access)
-      # - depth >= 5: 9-cycle latency (forwarding network exhausted, back to register file)
+      # - depth >= 5: depends on forwarding network warmth (deferred until source_ready)
+      dst_reg = self._get_valu_dst_reg(inst)
+
       if has_vgpr_src:
         gap = source_ready - dispatch_cycle
-        if chain_depth >= FORWARD_DEPTH_LIMIT + 1:  # >= 5 (6th+ instruction in chain, depth 5+)
-          # Forwarding network exhausted - must go through register file
-          completion_cycle = source_ready + FORWARD_DEEP_LATENCY
+        if chain_depth >= FORWARD_DEPTH_LIMIT:  # >= 4 (5th+ instruction in chain)
+          # At exhaustion boundary - defer until we know if forwarding network is warm
+          # Store the source VGPR so we can look up its ready time when processing
+          self.deferred_aluexec.append((src_vgpr_max, AluSrc.VALU, chain_depth, dst_reg))
+          # Estimate completion for dependent instructions (will be updated when scheduled)
+          completion_cycle = source_ready + VALU_LATENCY - 1  # optimistic estimate
         elif gap >= 2:
           # Pure forwarding: consumer waits for producer, then latency cycles
-          # Chain depth 2-4 gets 5-cycle forwarding, depth 1 gets 6-cycle
+          # Chain depth 2-3 gets 5-cycle forwarding, depth 1 gets 6-cycle
           forwarding_latency = VALU_LATENCY - 1 if chain_depth >= 2 else VALU_LATENCY
           completion_cycle = source_ready + forwarding_latency
         elif gap == 1:
@@ -618,15 +674,22 @@ class SQTTState:
         # Independent VALU: 6 cycle latency
         completion_cycle = dispatch_cycle + VALU_LATENCY
 
-      # Only one ALUEXEC can be emitted per cycle - ensure spacing
-      if self.pending_aluexec:
-        last_pending = max(c for c, _ in self.pending_aluexec)
-        completion_cycle = max(completion_cycle, last_pending + 1)
-      self.pending_aluexec.append((completion_cycle, AluSrc.VALU))
-      self.pending_aluexec.sort(key=lambda x: x[0])
+      # For non-deferred ALUEXECs, schedule them now
+      # But if there's a deferred ALUEXEC, this one must wait (ALUEXECs are in-order)
+      if not has_vgpr_src or chain_depth < FORWARD_DEPTH_LIMIT:
+        if self.deferred_aluexec:
+          # Must wait for deferred to complete first - add to deferred queue
+          # Use a sentinel src_vgpr of -1 to indicate this is independent (no warmth check)
+          self.deferred_aluexec.append((-1, AluSrc.VALU, 0, dst_reg))
+        else:
+          # Only one ALUEXEC can be emitted per cycle - ensure spacing
+          if self.pending_aluexec:
+            last_pending = max(c for c, _ in self.pending_aluexec)
+            completion_cycle = max(completion_cycle, last_pending + 1)
+          self.pending_aluexec.append((completion_cycle, AluSrc.VALU))
+          self.pending_aluexec.sort(key=lambda x: x[0])
 
       # Forward-ready time matches ALUEXEC completion
-      dst_reg = self._get_valu_dst_reg(inst)
       if dst_reg is not None:
         self.vgpr_ready[dst_reg] = completion_cycle
         self.vgpr_chain_depth[dst_reg] = chain_depth if has_vgpr_src else 0
@@ -636,8 +699,8 @@ class SQTTState:
     self.inst_count += 1
 
   def finalize(self):
-    """Tick until all pending ALUEXECs complete, then emit WAVEEND."""
-    while self.pending_aluexec:
+    """Tick until all pending and deferred ALUEXECs complete, then emit WAVEEND."""
+    while self.pending_aluexec or self.deferred_aluexec:
       self.tick()
     self.emit(WAVEEND, wave=self.wave_id, simd=self.simd, cu_lo=self.cu & 0x7, flag7=self.cu >> 3)
 
diff --git a/extra/assembly/amd/test/test_sqtt_compare.py b/extra/assembly/amd/test/test_sqtt_compare.py
index d5a21a5237..7ef8b26f55 100644
--- a/extra/assembly/amd/test/test_sqtt_compare.py
+++ b/extra/assembly/amd/test/test_sqtt_compare.py
@@ -1517,6 +1517,398 @@ class TestVALUMov(SQTTCompareTestBase):
       v_mov_b32_e32(v[7], v[6]),
     ], "vmov_dep_chain_8")
 
+  def test_vmov_dep_chain_7(self):
+    """Seven v_mov with dependency chain - tests forwarding network exhaustion boundary."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[5], v[4]),
+      v_mov_b32_e32(v[6], v[5]),
+    ], "vmov_dep_chain_7")
+
+  def test_vmov_dep_chain_9(self):
+    """Nine v_mov with dependency chain."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[5], v[4]),
+      v_mov_b32_e32(v[6], v[5]),
+      v_mov_b32_e32(v[7], v[6]),
+      v_mov_b32_e32(v[8], v[7]),
+    ], "vmov_dep_chain_9")
+
+  def test_vmov_dep_chain_10(self):
+    """Ten v_mov with dependency chain."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[5], v[4]),
+      v_mov_b32_e32(v[6], v[5]),
+      v_mov_b32_e32(v[7], v[6]),
+      v_mov_b32_e32(v[8], v[7]),
+      v_mov_b32_e32(v[9], v[8]),
+    ], "vmov_dep_chain_10")
+
+  def test_vmov_dep_chain_12(self):
+    """Twelve v_mov with dependency chain - tests multiple forwarding exhaustion cycles."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[5], v[4]),
+      v_mov_b32_e32(v[6], v[5]),
+      v_mov_b32_e32(v[7], v[6]),
+      v_mov_b32_e32(v[8], v[7]),
+      v_mov_b32_e32(v[9], v[8]),
+      v_mov_b32_e32(v[10], v[9]),
+      v_mov_b32_e32(v[11], v[10]),
+    ], "vmov_dep_chain_12")
+
+  # ─────────────────────────────────────────────────────────────────────────────
+  # Chain with trailing independent VALUs - tests IMMEDIATE interleaving edge case
+  # The number of trailing VALUs affects when s_nops start, which affects
+  # how many IMMEDIATEs are interleaved with ALUEXECs at the forwarding boundary.
+  # ─────────────────────────────────────────────────────────────────────────────
+
+  def test_vmov_chain5_trail0(self):
+    """5-chain with 0 trailing independent VALUs - baseline for edge case."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+    ], "vmov_chain5_trail0")
+
+  def test_vmov_chain5_trail1(self):
+    """5-chain with 1 trailing independent VALU."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[10], 2.0),  # independent
+    ], "vmov_chain5_trail1")
+
+  def test_vmov_chain5_trail2(self):
+    """5-chain with 2 trailing independent VALUs."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[10], 2.0),  # independent
+      v_mov_b32_e32(v[11], 3.0),  # independent
+    ], "vmov_chain5_trail2")
+
+  def test_vmov_chain5_trail3(self):
+    """5-chain with 3 trailing independent VALUs."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[10], 2.0),  # independent
+      v_mov_b32_e32(v[11], 3.0),  # independent
+      v_mov_b32_e32(v[12], 4.0),  # independent
+    ], "vmov_chain5_trail3")
+
+  def test_vmov_chain5_trail4(self):
+    """5-chain with 4 trailing independent VALUs - should change forwarding behavior."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[10], 2.0),  # independent
+      v_mov_b32_e32(v[11], 3.0),  # independent
+      v_mov_b32_e32(v[12], 4.0),  # independent
+      v_mov_b32_e32(v[13], 5.0),  # independent
+    ], "vmov_chain5_trail4")
+
+  def test_vmov_chain5_trail5(self):
+    """5-chain with 5 trailing independent VALUs."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[10], 2.0),  # independent
+      v_mov_b32_e32(v[11], 3.0),  # independent
+      v_mov_b32_e32(v[12], 4.0),  # independent
+      v_mov_b32_e32(v[13], 5.0),  # independent
+      v_mov_b32_e32(v[14], 6.0),  # independent
+    ], "vmov_chain5_trail5")
+
+  # ─────────────────────────────────────────────────────────────────────────────
+  # Chain with trailing s_nops - tests IMMEDIATE interleaving with precise control
+  # Each s_nop(0) delays s_nop processing start by 1 cycle, affecting how many
+  # IMMEDIATEs are interleaved with ALUEXECs at the forwarding exhaustion boundary.
+  # ─────────────────────────────────────────────────────────────────────────────
+
+  def test_vmov_chain5_nop0(self):
+    """5-chain with 0 extra s_nops before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+    ], "vmov_chain5_nop0")
+
+  def test_vmov_chain5_nop1(self):
+    """5-chain with 1 extra s_nop before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      s_nop(0),
+    ], "vmov_chain5_nop1")
+
+  def test_vmov_chain5_nop2(self):
+    """5-chain with 2 extra s_nops before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      s_nop(0),
+      s_nop(0),
+    ], "vmov_chain5_nop2")
+
+  def test_vmov_chain5_nop3(self):
+    """5-chain with 3 extra s_nops before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+    ], "vmov_chain5_nop3")
+
+  def test_vmov_chain5_nop4(self):
+    """5-chain with 4 extra s_nops before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+    ], "vmov_chain5_nop4")
+
+  def test_vmov_chain5_nop5(self):
+    """5-chain with 5 extra s_nops before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+    ], "vmov_chain5_nop5")
+
+  def test_vmov_chain5_nop8(self):
+    """5-chain with 8 extra s_nops before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+    ], "vmov_chain5_nop8")
+
+  def test_vmov_chain5_nop16(self):
+    """5-chain with 16 extra s_nops before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+    ] + [s_nop(0)] * 16, "vmov_chain5_nop16")
+
+  # Test with longer s_nops to add more delay per instruction
+  def test_vmov_chain5_nop3_long(self):
+    """5-chain with 3 s_nop(3) - each adds more delay."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      s_nop(3),
+      s_nop(3),
+      s_nop(3),
+    ], "vmov_chain5_nop3_long")
+
+  # ─────────────────────────────────────────────────────────────────────────────
+  # Chain6 with trailing s_nops - same edge case but with 6-chain
+  # ─────────────────────────────────────────────────────────────────────────────
+
+  def test_vmov_chain6_nop0(self):
+    """6-chain with 0 extra s_nops before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[5], v[4]),
+    ], "vmov_chain6_nop0")
+
+  def test_vmov_chain6_nop1(self):
+    """6-chain with 1 extra s_nop before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[5], v[4]),
+      s_nop(0),
+    ], "vmov_chain6_nop1")
+
+  def test_vmov_chain6_nop2(self):
+    """6-chain with 2 extra s_nops before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[5], v[4]),
+      s_nop(0),
+      s_nop(0),
+    ], "vmov_chain6_nop2")
+
+  def test_vmov_chain6_nop3(self):
+    """6-chain with 3 extra s_nops before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[5], v[4]),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+    ], "vmov_chain6_nop3")
+
+  def test_vmov_chain6_nop4(self):
+    """6-chain with 4 extra s_nops before epilogue."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[5], v[4]),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+    ], "vmov_chain6_nop4")
+
+  # ─────────────────────────────────────────────────────────────────────────────
+  # Chain12 with trailing instructions - tests second forwarding exhaustion cycle
+  # ─────────────────────────────────────────────────────────────────────────────
+
+  def test_vmov_chain12_nop0(self):
+    """12-chain with 0 extra s_nops - tests second exhaustion boundary."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[5], v[4]),
+      v_mov_b32_e32(v[6], v[5]),
+      v_mov_b32_e32(v[7], v[6]),
+      v_mov_b32_e32(v[8], v[7]),
+      v_mov_b32_e32(v[9], v[8]),
+      v_mov_b32_e32(v[10], v[9]),
+      v_mov_b32_e32(v[11], v[10]),
+    ], "vmov_chain12_nop0")
+
+  def test_vmov_chain12_nop2(self):
+    """12-chain with 2 extra s_nops."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[5], v[4]),
+      v_mov_b32_e32(v[6], v[5]),
+      v_mov_b32_e32(v[7], v[6]),
+      v_mov_b32_e32(v[8], v[7]),
+      v_mov_b32_e32(v[9], v[8]),
+      v_mov_b32_e32(v[10], v[9]),
+      v_mov_b32_e32(v[11], v[10]),
+      s_nop(0),
+      s_nop(0),
+    ], "vmov_chain12_nop2")
+
+  def test_vmov_chain12_nop4(self):
+    """12-chain with 4 extra s_nops."""
+    self._run_and_compare([
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], v[0]),
+      v_mov_b32_e32(v[2], v[1]),
+      v_mov_b32_e32(v[3], v[2]),
+      v_mov_b32_e32(v[4], v[3]),
+      v_mov_b32_e32(v[5], v[4]),
+      v_mov_b32_e32(v[6], v[5]),
+      v_mov_b32_e32(v[7], v[6]),
+      v_mov_b32_e32(v[8], v[7]),
+      v_mov_b32_e32(v[9], v[8]),
+      v_mov_b32_e32(v[10], v[9]),
+      v_mov_b32_e32(v[11], v[10]),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+      s_nop(0),
+    ], "vmov_chain12_nop4")
+
   # ─────────────────────────────────────────────────────────────────────────────
   # WAW (write-after-write) - same destination register
   # ─────────────────────────────────────────────────────────────────────────────