From 1baefed530276fb132aa1a283fdc72cf8b660871 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Wed, 21 Jan 2026 08:53:54 +0900 Subject: [PATCH] assembly/amd: add hw tests from ucode branch (#14259) * assembly/amd: add hw tests from ucode branch * fix is per lane --- extra/assembly/amd/pcode.py | 12 +- extra/assembly/amd/test/hw/helpers.py | 46 +- extra/assembly/amd/test/hw/test_ds.py | 50 ++ extra/assembly/amd/test/hw/test_global.py | 163 +++++++ extra/assembly/amd/test/hw/test_sop.py | 242 ++++++++++ extra/assembly/amd/test/hw/test_vop1.py | 292 +++++++++++- extra/assembly/amd/test/hw/test_vop2.py | 414 +++++++++++++++++ extra/assembly/amd/test/hw/test_vop3.py | 531 +++++++++++++++++++++- extra/assembly/amd/test/hw/test_vop3p.py | 79 +++- extra/assembly/amd/test/hw/test_vopc.py | 355 +++++++++++++++ extra/assembly/amd/test/hw/test_vopd.py | 161 +++++++ 11 files changed, 2304 insertions(+), 41 deletions(-) create mode 100644 extra/assembly/amd/test/hw/test_vopd.py diff --git a/extra/assembly/amd/pcode.py b/extra/assembly/amd/pcode.py index bd98f82ff1..d1249b3e22 100644 --- a/extra/assembly/amd/pcode.py +++ b/extra/assembly/amd/pcode.py @@ -653,17 +653,17 @@ def _apply_pseudocode_fixes(op_name: str, code: str) -> str: code = code.replace('D0.f64 = 2.0 ** 64 * fma(S0.f64, S1.f64, S2.f64)', 'D0.f64 = (2.0 ** 128 if exponent(S2.f64) > 1023 else 2.0 ** -128) * fma(S0.f64, S1.f64, S2.f64)') if op_name == 'V_DIV_SCALE_F32': - code = code.replace('D0.f32 = float("nan")', 'VCC = Reg(0x1); D0.f32 = float("nan")') + code = code.replace('D0.f32 = float("nan")', 'VCC = Reg(1 << laneId); D0.f32 = float("nan")') code = code.replace('elif S1.f32 == DENORM.f32:\n D0.f32 = ldexp(S0.f32, 64)', 'elif False:\n pass') code += '\nif S1.f32 == DENORM.f32:\n D0.f32 = float("nan")' - code = code.replace('elif exponent(S2.f32) <= 23:\n D0.f32 = ldexp(S0.f32, 64)', 'elif exponent(S2.f32) <= 23:\n VCC = Reg(0x1); D0.f32 = ldexp(S0.f32, 64)') - code = code.replace('elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(0x1)\n if S0.f32 == S2.f32:\n D0.f32 = ldexp(S0.f32, 64)', 'elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(0x1)') + code = code.replace('elif exponent(S2.f32) <= 23:\n D0.f32 = ldexp(S0.f32, 64)', 'elif exponent(S2.f32) <= 23:\n VCC = Reg(1 << laneId); D0.f32 = ldexp(S0.f32, 64)') + code = code.replace('elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(0x1)\n if S0.f32 == S2.f32:\n D0.f32 = ldexp(S0.f32, 64)', 'elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(1 << laneId)') if op_name == 'V_DIV_SCALE_F64': - code = code.replace('D0.f64 = float("nan")', 'VCC = Reg(0x1); D0.f64 = float("nan")') + code = code.replace('D0.f64 = float("nan")', 'VCC = Reg(1 << laneId); D0.f64 = float("nan")') code = code.replace('elif S1.f64 == DENORM.f64:\n D0.f64 = ldexp(S0.f64, 128)', 'elif False:\n pass') code += '\nif S1.f64 == DENORM.f64:\n D0.f64 = float("nan")' - code = code.replace('elif exponent(S2.f64) <= 52:\n D0.f64 = ldexp(S0.f64, 128)', 'elif exponent(S2.f64) <= 52:\n VCC = Reg(0x1); D0.f64 = ldexp(S0.f64, 128)') - code = code.replace('elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(0x1)\n if S0.f64 == S2.f64:\n D0.f64 = ldexp(S0.f64, 128)', 'elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(0x1)') + code = code.replace('elif exponent(S2.f64) <= 52:\n D0.f64 = ldexp(S0.f64, 128)', 'elif exponent(S2.f64) <= 52:\n VCC = Reg(1 << laneId); D0.f64 = ldexp(S0.f64, 128)') + code = code.replace('elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(0x1)\n if S0.f64 == S2.f64:\n D0.f64 = 
ldexp(S0.f64, 128)', 'elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(1 << laneId)')
   if op_name == 'V_DIV_FIXUP_F32':
     code = code.replace('D0.f32 = ((-abs(S0.f32)) if (sign_out) else (abs(S0.f32)))', 'D0.f32 = ((-OVERFLOW_F32) if (sign_out) else (OVERFLOW_F32)) if isNAN(S0.f32) else ((-abs(S0.f32)) if (sign_out) else (abs(S0.f32)))')
diff --git a/extra/assembly/amd/test/hw/helpers.py b/extra/assembly/amd/test/hw/helpers.py
index 4e40417ad4..efb8f6e893 100644
--- a/extra/assembly/amd/test/hw/helpers.py
+++ b/extra/assembly/amd/test/hw/helpers.py
@@ -1,14 +1,25 @@
 """Test infrastructure for hardware-validated RDNA3 emulator tests.
 Uses run_asm() with memory output, so tests can run on both emulator and real hardware.
-Set USE_HW=1 to run on both emulator and real hardware, comparing results.
+Set USE_HW=1 to run on both emulator and hardware, comparing results.
 """
-import ctypes, os, struct
+import ctypes, math, os, struct
 from extra.assembly.amd.autogen.rdna3.ins import *
-from extra.assembly.amd.emu import WaveState, run_asm, set_valid_mem_ranges
+from extra.assembly.amd.emu import run_asm
 from extra.assembly.amd.dsl import NULL, SCC, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, M0
-from extra.assembly.amd.pcode import _i32, _f32
+
+def _i32(f: float) -> int: return struct.unpack('<i', struct.pack('<f', f))[0]
+def _f32(i: int) -> float: return struct.unpack('<f', struct.pack('<i', i))[0]
+def _f16(i: int) -> float: return struct.unpack('<e', struct.pack('<H', i & 0xffff))[0]
+def f32_to_f16(f) -> int:
+  f = float(f)
+  if math.isnan(f): return 0x7e00
+  if math.isinf(f): return 0x7c00 if f > 0 else 0xfc00
+  try: return struct.unpack('<H', struct.pack('<e', f))[0]
+  except OverflowError: return 0x7c00 if f > 0 else 0xfc00

 # For backwards compatibility with tests using SrcEnum.NULL etc.
 class SrcEnum:
@@ -32,11 +43,11 @@ VCC = VCC_LO  # For VOP3SD sdst field (VCC_LO is exported from dsl)
 USE_HW = os.environ.get("USE_HW", "0") == "1"
 FLOAT_TOLERANCE = 1e-5

-# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc
+# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc, exec
 N_VGPRS, N_SGPRS, WAVE_SIZE = 16, 16, 32
 VGPR_BYTES = N_VGPRS * WAVE_SIZE * 4  # 16 regs * 32 lanes * 4 bytes = 2048
 SGPR_BYTES = N_SGPRS * 4  # 16 regs * 4 bytes = 64
-OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 8  # + vcc + scc
+OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 12  # + vcc + scc + exec

 # Float conversion helpers
 def f2i(f: float) -> int: return _i32(f)
@@ -47,6 +58,14 @@ def i642f(i: int) -> float: return struct.unpack('<d', struct.pack('<Q', i))[0]
 def assemble(instructions: list) -> bytes: return b''.join(inst.to_bytes() for inst in instructions)

+# Simple WaveState class for test output parsing (mirrors emu.py interface for tests)
+class WaveState:
+  def __init__(self):
+    self.vgpr = [[0] * 256 for _ in range(32)]  # vgpr[lane][reg]
+    self.sgpr = [0] * 128
+    self.vcc = 0
+    self.scc = 0
+
 def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
   """Generate prologue and epilogue instructions for state capture."""
   prologue = [
@@ -63,6 +82,10 @@ def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
   epilogue = [
     s_mov_b32(s[90], VCC_LO),
     s_cselect_b32(s[91], 1, 0),
+    # Save EXEC early (before we modify it for VGPR stores)
+    s_mov_b32(s[95], EXEC_LO),
+    # Restore EXEC to all active lanes for VGPR stores (test may have modified EXEC)
+    s_mov_b32(EXEC_LO, (1 << n_lanes) - 1),
     s_load_b64(s[92:93], s[80:81], 0, soffset=NULL),
     s_waitcnt(0),  # simm16=0 waits for all
     v_lshlrev_b32_e32(v[240], 2, v[255]),
@@ -80,6 +103,9 @@ def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
   epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES))
   epilogue.append(v_mov_b32_e32(v[243], s[91]))
   epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES + 4))
+  # Store EXEC (saved earlier in s[95])
+  epilogue.append(v_mov_b32_e32(v[243], s[95]))
+  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES + 8))
   epilogue.append(s_mov_b32(EXEC_LO, s[94]))
   epilogue.append(s_endpgm())
   return prologue, epilogue
@@ -95,6 +121,8 @@ def parse_output(out_buf: bytes, n_lanes: int) -> WaveState:
     st.sgpr[i] = struct.unpack_from('<I', out_buf, VGPR_BYTES + i*4)[0]
   st.vcc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES)[0]
   st.scc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 4)[0]
+  # exec (saved by the epilogue before the VGPR stores)
+  st.exec = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 8)[0]
   return st

 def run_program(instructions: list, n_lanes: int = 1) -> WaveState:
@@ -110,9 +138,9 @@ def run_program_emu(instructions: list, n_lanes: int = 1) -> WaveState:
   kernel_buf = (ctypes.c_char * len(code)).from_buffer_copy(code)
   lib_ptr = ctypes.addressof(kernel_buf)

-  set_valid_mem_ranges({(out_addr, OUT_BYTES), (args_ptr, 8)})
   # rsrc2: USER_SGPR_COUNT=2, ENABLE_SGPR_WORKGROUP_ID_X/Y/Z=1, LDS_SIZE=128 (64KB)
   rsrc2 = 0x19c | (128 << 15)
+  scratch_size = 0x10000  # 64KB per lane, matches .amdhsa_private_segment_fixed_size in run_program_hw
   result = run_asm(lib_ptr, len(code), 1, 1, 1, n_lanes, 1, 1, args_ptr, rsrc2)
   assert result == 0, f"run_asm failed with {result}"
@@ -148,6 +176,8 @@ test:
   .amdhsa_user_sgpr_kernarg_segment_ptr 1
   .amdhsa_kernarg_size 8
   .amdhsa_group_segment_fixed_size 65536
+  .amdhsa_private_segment_fixed_size 65536
+  .amdhsa_enable_private_segment 1
 .end_amdhsa_kernel

 .amdgpu_metadata
@@ -160,7 +190,7 @@ amdhsa.kernels:
     .symbol: test.kd
     .kernarg_segment_size: 8
     .group_segment_fixed_size: 65536
-    .private_segment_fixed_size: 0
+    .private_segment_fixed_size: 65536
     .kernarg_segment_align: 8
     .wavefront_size: 32
     .sgpr_count: 96
diff --git a/extra/assembly/amd/test/hw/test_ds.py b/extra/assembly/amd/test/hw/test_ds.py
index 220984d2d9..2783be043c 100644
--- a/extra/assembly/amd/test/hw/test_ds.py
+++ b/extra/assembly/amd/test/hw/test_ds.py
@@ -138,6 +138,56 @@ class TestDS2AddrMore(unittest.TestCase):
     self.assertEqual(st.vgpr[0][4], 0x12345678, "v4 should be untouched")

+class TestDSB128(unittest.TestCase):
+  """Tests for DS_STORE_B128 and DS_LOAD_B128 (128-bit / 4 dwords)."""
+
+  def test_ds_store_load_b128(self):
+    """DS_STORE_B128 stores 4 VGPRs, DS_LOAD_B128 loads them back."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[0], 0x11111111),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[0], 0x22222222),
+      v_mov_b32_e32(v[1], s[0]),
+      s_mov_b32(s[0], 0x33333333),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0x44444444),
+      v_mov_b32_e32(v[3], s[0]),
+      ds_store_b128(addr=v[10], data0=v[0:3]),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b128(addr=v[10], vdst=v[4:7]),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have first dword")
+    self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should have second dword")
+    self.assertEqual(st.vgpr[0][6], 0x33333333, "v6 should have third dword")
+    self.assertEqual(st.vgpr[0][7], 0x44444444, "v7 should have fourth dword")
+
+  def test_ds_store_b128_with_offset(self):
+    """DS_STORE_B128 with non-zero offset."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[0], 0xAAAAAAAA),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[0], 0xBBBBBBBB),
+      v_mov_b32_e32(v[1], s[0]),
+      s_mov_b32(s[0], 0xCCCCCCCC),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0xDDDDDDDD),
+      v_mov_b32_e32(v[3], s[0]),
+      DS(DSOp.DS_STORE_B128, addr=v[10], data0=v[0:3], offset0=16),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_B128, addr=v[10], vdst=v[4:7], offset0=16),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA)
+    self.assertEqual(st.vgpr[0][5], 0xBBBBBBBB)
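+    # offset0 is a plain byte offset added to the LDS address in v[10], so the four
+    # dwords written at bytes 16..31 read back unchanged (dwords 2-3 checked below)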
+ self.assertEqual(st.vgpr[0][6], 0xCCCCCCCC) + self.assertEqual(st.vgpr[0][7], 0xDDDDDDDD) + + class TestDSAtomic(unittest.TestCase): """Tests for DS atomic operations.""" diff --git a/extra/assembly/amd/test/hw/test_global.py b/extra/assembly/amd/test/hw/test_global.py index 20d5478162..edefc64f62 100644 --- a/extra/assembly/amd/test/hw/test_global.py +++ b/extra/assembly/amd/test/hw/test_global.py @@ -128,6 +128,169 @@ class TestGlobalLoad(unittest.TestCase): class TestGlobalStore(unittest.TestCase): """Tests for GLOBAL store instructions.""" + def test_global_store_b8_basic(self): + """GLOBAL_STORE_B8 stores a single byte from VDATA[7:0].""" + TEST_OFFSET = 256 + instructions = [ + s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + # First store 0xDEADBEEF to memory + s_mov_b32(s[4], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Now store single byte 0x42 to same address (should only change byte 0) + v_mov_b32_e32(v[2], 0x42), + global_store_b8(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Read back and check + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[3]), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + # Only byte 0 should change from 0xEF to 0x42 + self.assertEqual(st.vgpr[0][0], 0xDEADBE42, "Only byte 0 should be modified") + + def test_global_store_b8_byte1(self): + """GLOBAL_STORE_B8 at offset+1 stores to byte 1.""" + TEST_OFFSET = 256 + instructions = [ + s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[4], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[2], 0x42), + global_store_b8(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+1), + s_waitcnt(vmcnt=0), + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[3]), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xDEAD42EF, "Only byte 1 should be modified") + + def test_global_store_b16_basic(self): + """GLOBAL_STORE_B16 stores a 16-bit value from VDATA[15:0].""" + TEST_OFFSET = 256 + instructions = [ + s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[4], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[4], 0xCAFE), + v_mov_b32_e32(v[2], s[4]), + global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[3]), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xDEADCAFE, "Only lower 16 bits should be modified") + + def test_global_store_b16_high_half(self): + """GLOBAL_STORE_B16 at offset+2 stores to high 16 bits.""" + TEST_OFFSET = 256 + instructions = [ + s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL), + 
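+      # s[80:81] holds the kernarg pointer set up by the shared test prologue; the
+      # s_load_b64 above fetches the output-buffer address into s[2:3], and the
+      # s_waitcnt below ensures the load has landed before the stores use it as saddr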
s_waitcnt(lgkmcnt=0), + s_mov_b32(s[4], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[4], 0xCAFE), + v_mov_b32_e32(v[2], s[4]), + global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+2), + s_waitcnt(vmcnt=0), + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[3]), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xCAFEBEEF, "Only upper 16 bits should be modified") + + def test_global_store_b16_byte_offset_1(self): + """GLOBAL_STORE_B16 at byte offset 1 stores bytes 1-2 within the same word.""" + TEST_OFFSET = 256 + instructions = [ + s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[4], 0xDDCCBBAA), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Store 0xBEEF at byte offset 1 (bytes 1-2) + s_mov_b32(s[4], 0xBEEF), + v_mov_b32_e32(v[2], s[4]), + global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+1), + s_waitcnt(vmcnt=0), + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[3]), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + # Bytes 1-2 should be 0xBEEF (0xEF at byte 1, 0xBE at byte 2) + # Original: 0xDDCCBBAA -> bytes [AA, BB, CC, DD] + # After: 0xDDBEEFAA -> bytes [AA, EF, BE, DD] + self.assertEqual(st.vgpr[0][0], 0xDDBEEFAA, "Bytes 1-2 should be 0xBEEF") + + def test_global_store_b16_cross_word_boundary(self): + """GLOBAL_STORE_B16 at byte offset 3 crosses word boundary (byte 3 of word N, byte 0 of word N+1).""" + TEST_OFFSET = 256 + instructions = [ + s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + # Initialize two consecutive words + s_mov_b32(s[4], 0xDDCCBBAA), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_mov_b32(s[4], 0x44332211), + v_mov_b32_e32(v[2], s[4]), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+4), + s_waitcnt(vmcnt=0), + # Store 0xBEEF at byte offset 3 (crosses word boundary) + # Low byte (0xEF) goes to byte 3 of first word + # High byte (0xBE) goes to byte 0 of second word + s_mov_b32(s[4], 0xBEEF), + v_mov_b32_e32(v[2], s[4]), + global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+3), + s_waitcnt(vmcnt=0), + # Load back both words + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET), + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[4], data=v[4], saddr=s[2:3], offset=TEST_OFFSET+4), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[3]), + v_mov_b32_e32(v[1], v[4]), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + # First word: 0xDDCCBBAA -> 0xEFCCBBAA (byte 3 becomes 0xEF) + # Second word: 0x44332211 -> 0x443322BE (byte 0 becomes 0xBE) + self.assertEqual(st.vgpr[0][0], 0xEFCCBBAA, "Byte 3 of first word should be 0xEF") + self.assertEqual(st.vgpr[0][1], 0x443322BE, "Byte 0 of second word should be 0xBE") + def test_global_store_b64_basic(self): """GLOBAL_STORE_B64 stores 8 bytes 
from v[n:n+1] to memory.""" TEST_OFFSET = 256 diff --git a/extra/assembly/amd/test/hw/test_sop.py b/extra/assembly/amd/test/hw/test_sop.py index 1e7e79b438..62ba1f9120 100644 --- a/extra/assembly/amd/test/hw/test_sop.py +++ b/extra/assembly/amd/test/hw/test_sop.py @@ -62,6 +62,28 @@ class TestBasicScalar(unittest.TestCase): st = run_program(instructions, n_lanes=1) self.assertEqual(st.sgpr[1], 0x80000000) + def test_s_fmamk_f32(self): + """S_FMAMK_F32: D = S0 * literal + S1.""" + # 2.0 * 3.0 + 1.0 = 7.0 + instructions = [ + s_mov_b32(s[0], f2i(2.0)), + s_mov_b32(s[1], f2i(1.0)), + s_fmamk_f32(s[2], s[0], s[1], literal=f2i(3.0)), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], f2i(7.0)) + + def test_s_fmamk_f32_negative(self): + """S_FMAMK_F32 with negative values.""" + # -2.0 * 4.0 + 10.0 = 2.0 + instructions = [ + s_mov_b32(s[0], f2i(-2.0)), + s_mov_b32(s[1], f2i(10.0)), + s_fmamk_f32(s[2], s[0], s[1], literal=f2i(4.0)), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], f2i(2.0)) + class TestQuadmaskWqm(unittest.TestCase): """Tests for S_QUADMASK_B32 and S_WQM_B32.""" @@ -298,6 +320,56 @@ class TestSignedArithmetic(unittest.TestCase): st = run_program(instructions, n_lanes=1) self.assertEqual(st.sgpr[2], 2) + def test_s_mul_hi_u32_max(self): + """S_MUL_HI_U32: 0xFFFFFFFF * 0xFFFFFFFF.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), + s_mov_b32(s[1], 0xFFFFFFFF), + s_mul_hi_u32(s[2], s[0], s[1]), # (0xFFFFFFFF * 0xFFFFFFFF) >> 32 = 0xFFFFFFFE + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], 0xFFFFFFFE) + + def test_s_mul_hi_i32_positive(self): + """S_MUL_HI_I32: positive * positive.""" + instructions = [ + s_mov_b32(s[0], 0x40000000), # 2^30 + s_mov_b32(s[1], 4), + s_mul_hi_i32(s[2], s[0], s[1]), # (2^30 * 4) >> 32 = 1 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], 1) + + def test_s_mul_hi_i32_neg_times_neg(self): + """S_MUL_HI_I32: (-1) * (-1) = 1, high bits = 0.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), # -1 + s_mov_b32(s[1], 0xFFFFFFFF), # -1 + s_mul_hi_i32(s[2], s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], 0) + + def test_s_mul_hi_i32_neg_times_pos(self): + """S_MUL_HI_I32: (-1) * 2 = -2, high bits = -1 (sign extension).""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), # -1 + s_mov_b32(s[1], 2), + s_mul_hi_i32(s[2], s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], 0xFFFFFFFF) # -1 sign extends + + def test_s_mul_hi_i32_min_int(self): + """S_MUL_HI_I32: MIN_INT * 2 = -2^32, high = -1.""" + instructions = [ + s_mov_b32(s[0], 0x80000000), # -2^31 (MIN_INT) + s_mov_b32(s[1], 2), + s_mul_hi_i32(s[2], s[0], s[1]), # (-2^31 * 2) >> 32 = -1 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], 0xFFFFFFFF) + def test_s_mul_i32(self): """S_MUL_I32: signed multiply low 32 bits.""" instructions = [ @@ -329,6 +401,176 @@ class TestSignedArithmetic(unittest.TestCase): self.assertEqual(st.sgpr[7], ((dividend * 2) + 1) & 0xFFFFFFFF) +class TestBitSet(unittest.TestCase): + """Tests for S_BITSET0_B32 and S_BITSET1_B32 instructions.""" + + def test_s_bitset1_b32_set_bit0(self): + """S_BITSET1_B32: set bit 0 in destination.""" + instructions = [ + s_mov_b32(s[0], 0), # start with 0 + s_mov_b32(s[1], 0), # bit position = 0 + s_bitset1_b32(s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[0], 1, "Bit 0 should be set") 
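# A minimal reference model of what these bitset tests pin down (a sketch for reading
# the tests, not the emulator's actual implementation): both ops index with the low
# 5 bits of S0, i.e. the bit position is taken mod 32.
def ref_s_bitset1_b32(d: int, s0: int) -> int: return (d | (1 << (s0 & 31))) & 0xFFFFFFFF
def ref_s_bitset0_b32(d: int, s0: int) -> int: return d & ~(1 << (s0 & 31)) & 0xFFFFFFFF
assert ref_s_bitset1_b32(0x00000000, 37) == 0x20          # 37 & 31 = 5
assert ref_s_bitset0_b32(0xFFFFFFFF, 31) == 0x7FFFFFFF    # clears only bit 31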
+ + def test_s_bitset1_b32_set_bit31(self): + """S_BITSET1_B32: set bit 31 in destination.""" + instructions = [ + s_mov_b32(s[0], 0), # start with 0 + s_mov_b32(s[1], 31), # bit position = 31 + s_bitset1_b32(s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[0], 0x80000000, "Bit 31 should be set") + + def test_s_bitset1_b32_preserves_other_bits(self): + """S_BITSET1_B32: preserves bits not being set.""" + instructions = [ + s_mov_b32(s[0], 0xFF00FF00), # existing pattern + s_mov_b32(s[1], 0), # bit position = 0 + s_bitset1_b32(s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[0], 0xFF00FF01, "Should set bit 0 while preserving others") + + def test_s_bitset0_b32_clear_bit0(self): + """S_BITSET0_B32: clear bit 0 in destination.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), # start with all bits set + s_mov_b32(s[1], 0), # bit position = 0 + s_bitset0_b32(s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[0], 0xFFFFFFFE, "Bit 0 should be cleared") + + def test_s_bitset0_b32_clear_bit31(self): + """S_BITSET0_B32: clear bit 31 in destination.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), # start with all bits set + s_mov_b32(s[1], 31), # bit position = 31 + s_bitset0_b32(s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[0], 0x7FFFFFFF, "Bit 31 should be cleared") + + def test_s_bitset1_b32_uses_low5_bits(self): + """S_BITSET1_B32: only uses low 5 bits of position (mod 32).""" + instructions = [ + s_mov_b32(s[0], 0), + s_mov_b32(s[1], 32 + 5), # position = 37, but mod 32 = 5 + s_bitset1_b32(s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[0], 0x20, "Bit 5 should be set (37 mod 32 = 5)") + + +class TestBfeI64(unittest.TestCase): + """Tests for S_BFE_I64 - 64-bit bit field extract with sign extension. + + Regression tests for sign extension bug where 32-bit masks were incorrectly + used for 64-bit operations, causing the high 32 bits to not be sign-extended. + """ + + def test_s_bfe_i64_positive_no_sign_extend(self): + """S_BFE_I64: positive value (1) in 16 bits should not sign extend.""" + # S1 encodes: [22:16] = width, [5:0] = offset + # width=16, offset=0 -> S1 = (16 << 16) | 0 = 0x100000 + instructions = [ + s_mov_b32(s[0], 1), # S0 lo = 1 + s_mov_b32(s[1], 0), # S0 hi = 0 + s_mov_b32(s[2], 0x100000), # width=16, offset=0 + s_bfe_i64(s[4:5], s[0:1], s[2]), + v_mov_b32_e32(v[0], s[4]), + v_mov_b32_e32(v[1], s[5]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 1, "lo should be 1") + self.assertEqual(st.vgpr[0][1], 0, "hi should be 0 (no sign extend)") + + def test_s_bfe_i64_negative_sign_extend(self): + """S_BFE_I64: 0xFFFF (-1 in 16 bits) should sign extend to 64 bits. + + This is the main regression test - before the fix, hi was 0 instead of 0xFFFFFFFF. 
+ """ + instructions = [ + s_mov_b32(s[0], 0xFFFF), # S0 lo = -1 in 16 bits + s_mov_b32(s[1], 0), # S0 hi = 0 + s_mov_b32(s[2], 0x100000), # width=16, offset=0 + s_bfe_i64(s[4:5], s[0:1], s[2]), + v_mov_b32_e32(v[0], s[4]), + v_mov_b32_e32(v[1], s[5]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xFFFFFFFF, "lo should be 0xFFFFFFFF") + self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)") + + def test_s_bfe_i64_8bit_negative_sign_extend(self): + """S_BFE_I64: 0xFF (-1 in 8 bits) should sign extend to 64 bits.""" + # width=8, offset=0 -> S1 = (8 << 16) | 0 = 0x80000 + instructions = [ + s_mov_b32(s[0], 0xFF), # S0 lo = -1 in 8 bits + s_mov_b32(s[1], 0), # S0 hi = 0 + s_mov_b32(s[2], 0x80000), # width=8, offset=0 + s_bfe_i64(s[4:5], s[0:1], s[2]), + v_mov_b32_e32(v[0], s[4]), + v_mov_b32_e32(v[1], s[5]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xFFFFFFFF, "lo should be 0xFFFFFFFF") + self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)") + + def test_s_bfe_i64_8bit_positive(self): + """S_BFE_I64: 0x7F (127 in 8 bits) should not sign extend.""" + # width=8, offset=0 -> S1 = (8 << 16) | 0 = 0x80000 + instructions = [ + s_mov_b32(s[0], 0x7F), # S0 lo = 127 in 8 bits (MSB=0) + s_mov_b32(s[1], 0), # S0 hi = 0 + s_mov_b32(s[2], 0x80000), # width=8, offset=0 + s_bfe_i64(s[4:5], s[0:1], s[2]), + v_mov_b32_e32(v[0], s[4]), + v_mov_b32_e32(v[1], s[5]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0x7F, "lo should be 0x7F") + self.assertEqual(st.vgpr[0][1], 0, "hi should be 0 (no sign extend)") + + def test_s_bfe_i64_with_offset(self): + """S_BFE_I64: extract from non-zero bit offset with sign extension.""" + # Extract 16 bits starting at bit 8: value 0xFF00 >> 8 = 0xFF = -1 in 8 bits? No wait... 
+ # Let's put 0x8000FF00: extract 16 bits at offset 8 = 0x00FF (positive) + # Put 0xFF00_0000: extract 16 bits at offset 16 = 0xFF00 = -256 in signed 16-bit + instructions = [ + s_mov_b32(s[0], 0xFF000000), # bits [31:24] = 0xFF, [23:16] = 0x00 + s_mov_b32(s[1], 0), + # width=16, offset=16 -> S1 = (16 << 16) | 16 = 0x100010 + s_mov_b32(s[2], 0x100010), + s_bfe_i64(s[4:5], s[0:1], s[2]), + v_mov_b32_e32(v[0], s[4]), + v_mov_b32_e32(v[1], s[5]), + ] + st = run_program(instructions, n_lanes=1) + # Extract bits [31:16] = 0xFF00, sign bit is bit 15 of extracted = bit 31 of original = 1 + # So result should be sign-extended 0xFF00 -> 0xFFFFFF00 in lo, 0xFFFFFFFF in hi + self.assertEqual(st.vgpr[0][0], 0xFFFFFF00, "lo should be sign-extended 0xFF00") + self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)") + + def test_s_bfe_i64_32bit_negative(self): + """S_BFE_I64: extract 32 bits with sign extension.""" + # width=32, offset=0 -> S1 = (32 << 16) | 0 = 0x200000 + instructions = [ + s_mov_b32(s[0], 0x80000000), # MIN_INT32 = -2^31 + s_mov_b32(s[1], 0), + s_mov_b32(s[2], 0x200000), # width=32, offset=0 + s_bfe_i64(s[4:5], s[0:1], s[2]), + v_mov_b32_e32(v[0], s[4]), + v_mov_b32_e32(v[1], s[5]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0x80000000, "lo should be 0x80000000") + self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)") + + class Test64BitCompare(unittest.TestCase): """Tests for 64-bit scalar compare instructions.""" diff --git a/extra/assembly/amd/test/hw/test_vop1.py b/extra/assembly/amd/test/hw/test_vop1.py index a1042e72b5..215a2a2fbe 100644 --- a/extra/assembly/amd/test/hw/test_vop1.py +++ b/extra/assembly/amd/test/hw/test_vop1.py @@ -255,7 +255,7 @@ class TestF16Conversions(unittest.TestCase): def test_v_cvt_f16_f32_small(self): """V_CVT_F16_F32 converts small f32 value.""" - from extra.assembly.amd.pcode import f32_to_f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16 instructions = [ v_mov_b32_e32(v[0], 0.5), v_cvt_f16_f32_e32(v[1], v[0]), @@ -293,7 +293,7 @@ class TestF16Conversions(unittest.TestCase): def test_v_cvt_f16_f32_reads_full_32bit_source(self): """V_CVT_F16_F32 must read full 32-bit f32 source.""" - from extra.assembly.amd.pcode import _f16 + from extra.assembly.amd.test.hw.helpers import _f16 instructions = [ s_mov_b32(s[0], 0x3fc00000), # f32 1.5 v_mov_b32_e32(v[0], s[0]), @@ -348,6 +348,142 @@ class TestF16Conversions(unittest.TestCase): self.assertEqual(result, 1, f"Expected 1 from high bits, got {result}") +class TestF64Conversions(unittest.TestCase): + """Tests for f64 conversion instructions. 
Regression tests for f32_to_f64/f64_to_f32.""" + + def test_v_cvt_f64_f32_one(self): + """V_CVT_F64_F32 converts f32 1.0 to f64.""" + instructions = [ + s_mov_b32(s[0], f2i(1.0)), + v_mov_b32_e32(v[0], s[0]), + v_cvt_f64_f32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertAlmostEqual(result, 1.0, places=10) + + def test_v_cvt_f64_f32_negative(self): + """V_CVT_F64_F32 converts f32 -2.5 to f64.""" + instructions = [ + s_mov_b32(s[0], f2i(-2.5)), + v_mov_b32_e32(v[0], s[0]), + v_cvt_f64_f32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertAlmostEqual(result, -2.5, places=10) + + def test_v_cvt_f64_f32_pi(self): + """V_CVT_F64_F32 converts f32 pi to f64.""" + import math + instructions = [ + s_mov_b32(s[0], f2i(3.14159265)), + v_mov_b32_e32(v[0], s[0]), + v_cvt_f64_f32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertAlmostEqual(result, 3.14159265, places=5) + + def test_v_cvt_f64_f32_zero(self): + """V_CVT_F64_F32 converts f32 0.0 to f64.""" + instructions = [ + v_mov_b32_e32(v[0], 0), + v_cvt_f64_f32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertEqual(result, 0.0) + + def test_v_cvt_f32_f64_one(self): + """V_CVT_F32_F64 converts f64 1.0 to f32.""" + f64_bits = f2i64(1.0) + lo, hi = f64_bits & 0xFFFFFFFF, (f64_bits >> 32) & 0xFFFFFFFF + instructions = [ + s_mov_b32(s[0], lo), + s_mov_b32(s[1], hi), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_cvt_f32_f64_e32(v[2], v[0:1]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][2]) + self.assertAlmostEqual(result, 1.0, places=5) + + def test_v_cvt_f32_f64_negative(self): + """V_CVT_F32_F64 converts f64 -3.5 to f32.""" + f64_bits = f2i64(-3.5) + lo, hi = f64_bits & 0xFFFFFFFF, (f64_bits >> 32) & 0xFFFFFFFF + instructions = [ + s_mov_b32(s[0], lo), + s_mov_b32(s[1], hi), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_cvt_f32_f64_e32(v[2], v[0:1]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][2]) + self.assertAlmostEqual(result, -3.5, places=5) + + def test_v_cvt_f32_f64_large(self): + """V_CVT_F32_F64 converts large f64 to f32.""" + f64_bits = f2i64(123456.789) + lo, hi = f64_bits & 0xFFFFFFFF, (f64_bits >> 32) & 0xFFFFFFFF + instructions = [ + s_mov_b32(s[0], lo), + s_mov_b32(s[1], hi), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_cvt_f32_f64_e32(v[2], v[0:1]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][2]) + self.assertAlmostEqual(result, 123456.789, places=0) + + def test_v_cvt_f64_i32_positive(self): + """V_CVT_F64_I32 converts positive i32 to f64.""" + instructions = [ + s_mov_b32(s[0], 42), + v_mov_b32_e32(v[0], s[0]), + v_cvt_f64_i32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertAlmostEqual(result, 42.0, places=10) + + def test_v_cvt_f64_i32_negative(self): + """V_CVT_F64_I32 converts negative i32 to f64.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), # -1 as i32 + v_mov_b32_e32(v[0], s[0]), + v_cvt_f64_i32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertAlmostEqual(result, -1.0, 
places=10) + + def test_v_cvt_f64_u32_large(self): + """V_CVT_F64_U32 converts large u32 to f64.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), # max u32 + v_mov_b32_e32(v[0], s[0]), + v_cvt_f64_u32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertAlmostEqual(result, 4294967295.0, places=0) + + def test_v_cvt_f64_u32_zero(self): + """V_CVT_F64_U32 converts 0 to f64.""" + instructions = [ + v_mov_b32_e32(v[0], 0), + v_cvt_f64_u32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertEqual(result, 0.0) + + class TestClz(unittest.TestCase): """Tests for V_CLZ_I32_U32 - count leading zeros.""" @@ -560,7 +696,7 @@ class TestCvtF16Modifiers(unittest.TestCase): def test_v_cvt_f32_f16_abs_negative(self): """V_CVT_F32_F16 with |abs| on negative value.""" - from extra.assembly.amd.pcode import f32_to_f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16 f16_neg1 = f32_to_f16(-1.0) # 0xbc00 instructions = [ s_mov_b32(s[0], f16_neg1), @@ -573,7 +709,7 @@ class TestCvtF16Modifiers(unittest.TestCase): def test_v_cvt_f32_f16_abs_positive(self): """V_CVT_F32_F16 with |abs| on positive value (should stay positive).""" - from extra.assembly.amd.pcode import f32_to_f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16 f16_2 = f32_to_f16(2.0) # 0x4000 instructions = [ s_mov_b32(s[0], f16_2), @@ -586,7 +722,7 @@ class TestCvtF16Modifiers(unittest.TestCase): def test_v_cvt_f32_f16_neg_positive(self): """V_CVT_F32_F16 with neg on positive value.""" - from extra.assembly.amd.pcode import f32_to_f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16 f16_2 = f32_to_f16(2.0) # 0x4000 instructions = [ s_mov_b32(s[0], f16_2), @@ -599,7 +735,7 @@ class TestCvtF16Modifiers(unittest.TestCase): def test_v_cvt_f32_f16_neg_negative(self): """V_CVT_F32_F16 with neg on negative value (double negative).""" - from extra.assembly.amd.pcode import f32_to_f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16 f16_neg2 = f32_to_f16(-2.0) # 0xc000 instructions = [ s_mov_b32(s[0], f16_neg2), @@ -612,7 +748,7 @@ class TestCvtF16Modifiers(unittest.TestCase): def test_v_cvt_f16_f32_then_pack_for_wmma(self): """CVT F32->F16 followed by pack (common WMMA pattern).""" - from extra.assembly.amd.pcode import _f16 + from extra.assembly.amd.test.hw.helpers import _f16 f32_val = 3.5 instructions = [ s_mov_b32(s[0], f2i(f32_val)), @@ -668,7 +804,7 @@ class TestConversionRounding(unittest.TestCase): def test_f16_to_f32_precision(self): """F16 to F32 conversion precision.""" - from extra.assembly.amd.pcode import f32_to_f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16 f16_val = f32_to_f16(1.5) instructions = [ s_mov_b32(s[0], f16_val), @@ -680,7 +816,7 @@ class TestConversionRounding(unittest.TestCase): def test_f16_denormal_to_f32(self): """F16 denormal converts to small positive f32.""" - from extra.assembly.amd.pcode import _f16 + from extra.assembly.amd.test.hw.helpers import _f16 f16_denorm = 0x0001 # Smallest positive f16 denormal instructions = [ v_mov_b32_e32(v[0], f16_denorm), @@ -1238,5 +1374,143 @@ class TestFloorEdgeCases(unittest.TestCase): self.assertAlmostEqual(i2f(st.vgpr[0][1]), -1.0, places=5) +class TestVop1F16HiHalf(unittest.TestCase): + """Regression tests for VOP1 f16 hi-half source operand handling. 
+ + For 16-bit VOP1 operations, when src0 is in the range v[128]+ (offset >= 384), + the hardware reads from the high 16 bits of v[src0-128]. The emulator must + extract bits [31:16] from the actual VGPR. + """ + + def test_v_cvt_f32_f16_src_hi_half(self): + """V_CVT_F32_F16 with source from hi-half (v[128]+). + + When src0 >= v[128], it reads from the high 16 bits of v[src0-128]. + This is critical for global_load_d16_hi_b16 + v_cvt_f32_f16 patterns. + + Regression test for: VOP1 f16 src0 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0) + s_mov_b32(s[0], 0x40003c00), + v_mov_b32_e32(v[0], s[0]), + # v_cvt_f32_f16 v[1], v[128] (reads hi half of v[0]) + # Should convert f16(2.0) to f32(2.0) + v_cvt_f32_f16_e32(v[1], v[128]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][1]) + self.assertAlmostEqual(result, 2.0, places=5, msg=f"Expected f32(2.0), got {result}") + + def test_v_cvt_f32_f16_src_lo_vs_hi(self): + """V_CVT_F32_F16 comparing lo and hi half reads. + + v[0] has different values in lo and hi halves. + v_cvt_f32_f16 v[1], v[0] should read lo (1.0) + v_cvt_f32_f16 v[2], v[128] should read hi (2.0) + + Regression test for: VOP1 f16 src0 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0) + s_mov_b32(s[0], 0x40003c00), + v_mov_b32_e32(v[0], s[0]), + # Read from lo half + v_cvt_f32_f16_e32(v[1], v[0]), + # Read from hi half + v_cvt_f32_f16_e32(v[2], v[128]), + ] + st = run_program(instructions, n_lanes=1) + result_lo = i2f(st.vgpr[0][1]) + result_hi = i2f(st.vgpr[0][2]) + self.assertAlmostEqual(result_lo, 1.0, places=5, msg=f"Expected f32(1.0) from lo, got {result_lo}") + self.assertAlmostEqual(result_hi, 2.0, places=5, msg=f"Expected f32(2.0) from hi, got {result_hi}") + + def test_v_cvt_i16_f16_src_hi_half(self): + """V_CVT_I16_F16 with source from hi-half. + + Regression test for: VOP1 f16 src0 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0xc000_3c00: hi=f16(-2.0), lo=f16(1.0) + s_mov_b32(s[0], 0xc0003c00), + v_mov_b32_e32(v[0], s[0]), + # v_cvt_i16_f16 v[1], v[128] (reads hi half of v[0]) + # Should convert f16(-2.0) to i16(-2) + v_cvt_i16_f16_e32(v[1], v[128]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xffff + expected = (-2) & 0xffff + self.assertEqual(result, expected, f"Expected i16(-2)=0x{expected:04x}, got 0x{result:04x}") + + def test_v_mov_b16_src_hi_half(self): + """V_MOV_B16 with source from hi-half. + + Regression test for: VOP1 f16 src0 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0xBEEF_DEAD: hi=0xBEEF, lo=0xDEAD + s_mov_b32(s[0], 0xBEEFDEAD), + v_mov_b32_e32(v[0], s[0]), + # v[1] = 0x0000_0000 initially + v_mov_b32_e32(v[1], 0), + # v_mov_b16 v[1], v[128] (reads hi half of v[0]) + # Should move 0xBEEF to v[1].lo + v_mov_b16_e32(v[1], v[128]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xffff + self.assertEqual(result, 0xBEEF, f"Expected 0xBEEF from hi half, got 0x{result:04x}") + + +class TestReciprocalF16(unittest.TestCase): + """Tests for V_RCP_F16 - reciprocal in half precision. + + The pcode uses a 16-bit float literal: D0.f16 = 16'1.0 / S0.f16 + This tests that the sized float literal (16'1.0) is correctly parsed. + """ + + def test_v_rcp_f16_one(self): + """V_RCP_F16: 1/1.0 = 1.0""" + import struct + def f16_to_bits(f): return struct.unpack('= 384 (v[128]+) wasn't extracting hi 16 bits + 2. 
VOP2 vdst >= 384 (v[128]+) wasn't preserving lo 16 bits + """ + + def test_v_add_f16_e32_vsrc1_hi_half(self): + """V_ADD_F16_E32 with vsrc1 from hi-half (v[128]+). + + When vsrc1 >= 384 (representing v[128]+), the hardware reads from the hi 16 bits + of v[vsrc1-128]. The emulator must extract bits [31:16] from the actual VGPR. + + Regression test for: VOP2 f16 vsrc1 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0) + s_mov_b32(s[0], 0x40003c00), + v_mov_b32_e32(v[0], s[0]), + # v_add_f16_e32 v[1], v[0], v[128] (vsrc1=v[128] reads hi of v[0]) + # In VOP2 encoding, vsrc1=384 means v[128], which maps to v[0].hi + # v[1] = v[0].lo + v[0].hi = 1.0 + 2.0 = 3.0 + VOP2(VOP2Op.V_ADD_F16, vdst=v[1], src0=v[0], vsrc1=v[128]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xffff + # 1.0 + 2.0 = 3.0, f16 3.0 = 0x4200 + self.assertEqual(result, 0x4200, f"Expected f16(3.0)=0x4200, got 0x{result:04x}") + + def test_v_mul_f16_e32_vsrc1_hi_half(self): + """V_MUL_F16_E32 with vsrc1 from hi-half. + + Regression test for: VOP2 f16 vsrc1 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4200_4000: hi=f16(3.0), lo=f16(2.0) + s_mov_b32(s[0], 0x42004000), + v_mov_b32_e32(v[0], s[0]), + # v_mul_f16_e32 v[1], v[0], v[128] (vsrc1=v[128] reads hi of v[0]) + # v[1] = v[0].lo * v[0].hi = 2.0 * 3.0 = 6.0 + VOP2(VOP2Op.V_MUL_F16, vdst=v[1], src0=v[0], vsrc1=v[128]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xffff + # 2.0 * 3.0 = 6.0, f16 6.0 = 0x4600 + self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}") + + def test_v_add_f16_e32_vdst_hi_half(self): + """V_ADD_F16_E32 writing to hi-half destination (v[128]+). + + When vdst >= 384 (representing v[128]+), the hardware writes to bits [31:16] + of v[vdst-128] while preserving bits [15:0]. The emulator must merge the result. + + Regression test for: VOP2 f16 vdst hi-half write bug. + """ + instructions = [ + # v[0] = 0x0000_BEEF: lo has marker value + s_mov_b32(s[0], 0x0000BEEF), + v_mov_b32_e32(v[0], s[0]), + # v[1] = f16(1.0), v[2] = f16(2.0) + s_mov_b32(s[1], 0x3c00), + s_mov_b32(s[2], 0x4000), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + # v_add_f16_e32 v[128], v[1], v[2] (vdst=v[128] writes hi of v[0]) + # v[0].hi = 1.0 + 2.0 = 3.0, v[0].lo preserved = 0xBEEF + VOP2(VOP2Op.V_ADD_F16, vdst=v[128], src0=v[1], vsrc1=v[2]), + ] + st = run_program(instructions, n_lanes=1) + hi = (st.vgpr[0][0] >> 16) & 0xffff + lo = st.vgpr[0][0] & 0xffff + # hi = 3.0 = 0x4200, lo preserved = 0xBEEF + self.assertEqual(hi, 0x4200, f"Expected hi=f16(3.0)=0x4200, got 0x{hi:04x}") + self.assertEqual(lo, 0xBEEF, f"Expected lo preserved=0xBEEF, got 0x{lo:04x}") + + def test_v_mul_f16_e32_vdst_hi_half(self): + """V_MUL_F16_E32 writing to hi-half destination. + + Regression test for: VOP2 f16 vdst hi-half write bug. 
+ """ + instructions = [ + # v[0] = 0x0000_DEAD: lo has marker value + s_mov_b32(s[0], 0x0000DEAD), + v_mov_b32_e32(v[0], s[0]), + # v[1] = f16(2.0), v[2] = f16(4.0) + s_mov_b32(s[1], 0x4000), + s_mov_b32(s[2], 0x4400), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + # v_mul_f16_e32 v[128], v[1], v[2] (vdst=v[128] writes hi of v[0]) + # v[0].hi = 2.0 * 4.0 = 8.0, v[0].lo preserved = 0xDEAD + VOP2(VOP2Op.V_MUL_F16, vdst=v[128], src0=v[1], vsrc1=v[2]), + ] + st = run_program(instructions, n_lanes=1) + hi = (st.vgpr[0][0] >> 16) & 0xffff + lo = st.vgpr[0][0] & 0xffff + # hi = 8.0 = 0x4800, lo preserved = 0xDEAD + self.assertEqual(hi, 0x4800, f"Expected hi=f16(8.0)=0x4800, got 0x{hi:04x}") + self.assertEqual(lo, 0xDEAD, f"Expected lo preserved=0xDEAD, got 0x{lo:04x}") + + def test_v_add_f16_e32_both_hi_half(self): + """V_ADD_F16_E32 with both vsrc1 and vdst as hi-half (different underlying regs). + + Tests the combination of both fixes: reading vsrc1 from hi-half AND + writing result to hi-half destination, using different underlying VGPRs. + + Regression test for: VOP2 f16 hi-half bugs (combined). + """ + instructions = [ + # v[0] = 0x4000_xxxx: hi=f16(2.0) for vsrc1 + s_mov_b32(s[0], 0x40000000), + v_mov_b32_e32(v[0], s[0]), + # v[1] = 0x0000_3c00: lo=f16(1.0) for src0 + s_mov_b32(s[1], 0x00003c00), + v_mov_b32_e32(v[1], s[1]), + # v[2] = 0x0000_CAFE: lo=marker for vdst preservation + s_mov_b32(s[2], 0x0000CAFE), + v_mov_b32_e32(v[2], s[2]), + # v_add_f16_e32 v[130], v[1], v[128] + # src0 = v[1].lo = 1.0 + # vsrc1 = v[128] reads v[0].hi = 2.0 + # result = 1.0 + 2.0 = 3.0 + # vdst = v[130] writes to v[2].hi, preserving v[2].lo + VOP2(VOP2Op.V_ADD_F16, vdst=v[130], src0=v[1], vsrc1=v[128]), + ] + st = run_program(instructions, n_lanes=1) + hi = (st.vgpr[0][2] >> 16) & 0xffff + lo = st.vgpr[0][2] & 0xffff + # hi = 3.0 = 0x4200, lo preserved = 0xCAFE + self.assertEqual(hi, 0x4200, f"Expected hi=f16(3.0)=0x4200, got 0x{hi:04x}") + self.assertEqual(lo, 0xCAFE, f"Expected lo preserved=0xCAFE, got 0x{lo:04x}") + + def test_v_fmac_f16_e32_vsrc1_hi_half(self): + """V_FMAC_F16_E32 with vsrc1 from hi-half. + + V_FMAC_F16: vdst = vdst + src0 * vsrc1 + + Regression test for: VOP2 f16 vsrc1 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0) + s_mov_b32(s[0], 0x40003c00), + v_mov_b32_e32(v[0], s[0]), + # v[1] = f16(3.0) = 0x4200 + s_mov_b32(s[1], 0x4200), + v_mov_b32_e32(v[1], s[1]), + # v_fmac_f16_e32 v[1], v[0], v[128] + # vdst = v[1] = 3.0 + v[0].lo * v[0].hi = 3.0 + 1.0 * 2.0 = 5.0 + VOP2(VOP2Op.V_FMAC_F16, vdst=v[1], src0=v[0], vsrc1=v[128]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xffff + # 3.0 + 1.0 * 2.0 = 5.0, f16 5.0 = 0x4500 + self.assertEqual(result, 0x4500, f"Expected f16(5.0)=0x4500, got 0x{result:04x}") + + def test_v_fmac_f16_e32_vdst_hi_half(self): + """V_FMAC_F16_E32 writing to hi-half destination. + + V_FMAC_F16: vdst.h = vdst.h + src0 * vsrc1 + + When vdst is v[128]+, the accumulator D0 must also read from the hi-half. + This tests the bug where D0 was read from lo-half instead of hi-half. + + Regression test for: VOP2 FMAC hi-half D0 accumulator read bug. 
+ """ + instructions = [ + # v[0] = 0x3800_DEAD: hi=f16(0.5), lo=marker (0xDEAD) + s_mov_b32(s[0], 0x3800DEAD), + v_mov_b32_e32(v[0], s[0]), + # v[1] = f16(2.0) = 0x4000 + s_mov_b32(s[1], 0x4000), + v_mov_b32_e32(v[1], s[1]), + # v[2] = f16(3.0) = 0x4200 + s_mov_b32(s[2], 0x4200), + v_mov_b32_e32(v[2], s[2]), + # v_fmac_f16_e32 v[128], v[1], v[2] + # vdst = v[128] means v[0].hi + # D0 = v[0].hi = 0.5 + # result = D0 + src0 * vsrc1 = 0.5 + 2.0 * 3.0 = 6.5 + # v[0].hi = 6.5, v[0].lo preserved = 0xDEAD + VOP2(VOP2Op.V_FMAC_F16, vdst=v[128], src0=v[1], vsrc1=v[2]), + ] + st = run_program(instructions, n_lanes=1) + hi = (st.vgpr[0][0] >> 16) & 0xffff + lo = st.vgpr[0][0] & 0xffff + # hi = 6.5 = 0x4680, lo preserved = 0xDEAD + self.assertEqual(hi, 0x4680, f"Expected hi=f16(6.5)=0x4680, got 0x{hi:04x}") + self.assertEqual(lo, 0xDEAD, f"Expected lo preserved=0xDEAD, got 0x{lo:04x}") + + def test_v_mul_f16_e32_src0_hi_half(self): + """V_MUL_F16_E32 with src0 from hi-half (src0 >= v[128]). + + When src0 >= 384 (representing v[128]+), the hardware reads from the hi 16 bits + of v[src0-128]. The emulator must extract bits [31:16] from the actual VGPR. + + Regression test for: VOP2 f16 src0 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0) + s_mov_b32(s[0], 0x40003c00), + v_mov_b32_e32(v[0], s[0]), + # v[1] = f16(3.0) = 0x4200 + s_mov_b32(s[1], 0x4200), + v_mov_b32_e32(v[1], s[1]), + # v_mul_f16_e32 v[2], v[128], v[1] + # src0 = v[128] reads from v[0].hi = 2.0 + # result = 2.0 * 3.0 = 6.0 + VOP2(VOP2Op.V_MUL_F16, vdst=v[2], src0=v[128], vsrc1=v[1]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][2] & 0xffff + # 2.0 * 3.0 = 6.0, f16 6.0 = 0x4600 + self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}") + + def test_v_add_f16_e32_src0_hi_half(self): + """V_ADD_F16_E32 with src0 from hi-half (src0 >= v[128]). + + Regression test for: VOP2 f16 src0 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0) + s_mov_b32(s[0], 0x40003c00), + v_mov_b32_e32(v[0], s[0]), + # v[1] = f16(5.0) = 0x4500 + s_mov_b32(s[1], 0x4500), + v_mov_b32_e32(v[1], s[1]), + # v_add_f16_e32 v[2], v[128], v[1] + # src0 = v[128] reads from v[0].hi = 2.0 + # result = 2.0 + 5.0 = 7.0 + VOP2(VOP2Op.V_ADD_F16, vdst=v[2], src0=v[128], vsrc1=v[1]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][2] & 0xffff + # 2.0 + 5.0 = 7.0, f16 7.0 = 0x4700 + self.assertEqual(result, 0x4700, f"Expected f16(7.0)=0x4700, got 0x{result:04x}") + + +class TestF16InlineConstants(unittest.TestCase): + """Regression tests for VOP2 F16 inline float constants. + + For 16-bit VOP2 operations (v_add_f16, v_mul_f16, etc.), inline float constants + like 1.0, 2.0 must use F16 encoding (0x3c00, 0x4000) not F32 encoding (0x3f800000). + + The emulator's rsrc() function needs bits=16 to select F16_INLINE constants. + + Regression test for: VOP2 16-bit inline constant using F32 instead of F16. 
+ """ + + def test_v_add_f16_inline_constant_1_0(self): + """V_ADD_F16_E32 with inline constant 1.0 should use F16 encoding.""" + instructions = [ + s_mov_b32(s[0], 0x3c00), # f16 1.0 + v_mov_b32_e32(v[0], s[0]), + # v_add_f16_e32 v[1], 1.0, v[0] -- 1.0 must be F16 0x3c00, not F32 0x3f800000 + v_add_f16_e32(v[1], 1.0, v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xFFFF + # 1.0 + 1.0 = 2.0, f16 2.0 = 0x4000 + self.assertEqual(result, 0x4000, f"Expected f16(2.0)=0x4000, got 0x{result:04x}") + + def test_v_add_f16_inline_constant_2_0(self): + """V_ADD_F16_E32 with inline constant 2.0.""" + instructions = [ + s_mov_b32(s[0], 0x4200), # f16 3.0 + v_mov_b32_e32(v[0], s[0]), + v_add_f16_e32(v[1], 2.0, v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xFFFF + # 2.0 + 3.0 = 5.0, f16 5.0 = 0x4500 + self.assertEqual(result, 0x4500, f"Expected f16(5.0)=0x4500, got 0x{result:04x}") + + def test_v_mul_f16_inline_constant(self): + """V_MUL_F16_E32 with inline constant 2.0.""" + instructions = [ + s_mov_b32(s[0], 0x4200), # f16 3.0 + v_mov_b32_e32(v[0], s[0]), + v_mul_f16_e32(v[1], 2.0, v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xFFFF + # 2.0 * 3.0 = 6.0, f16 6.0 = 0x4600 + self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}") + + class TestCndmask(unittest.TestCase): """Tests for V_CNDMASK_B32 and V_CNDMASK_B16.""" @@ -447,5 +734,132 @@ class TestSpecialFloatValues(unittest.TestCase): self.assertEqual(st.vgpr[0][1], 0x00000000) +class TestCarryOps(unittest.TestCase): + """Tests for VOP2 carry instructions (v_add_co_ci_u32, v_sub_co_ci_u32, v_subrev_co_ci_u32).""" + + def test_v_subrev_co_ci_u32_no_borrow(self): + """V_SUBREV_CO_CI_U32: D0 = S1 - S0 - VCC_IN, when VCC_IN=0.""" + instructions = [ + s_mov_b32(VCC_LO, 0), # VCC = 0 (no borrow in) + v_mov_b32_e32(v[0], 5), # S0 = 5 + v_mov_b32_e32(v[1], 10), # S1 = 10 + v_subrev_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 10 - 5 - 0 = 5 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 5) + self.assertEqual(st.vcc, 0) # No borrow out + + def test_v_subrev_co_ci_u32_with_borrow(self): + """V_SUBREV_CO_CI_U32: D0 = S1 - S0 - VCC_IN, when VCC_IN=1.""" + instructions = [ + s_mov_b32(VCC_LO, 1), # VCC = 1 (borrow in) + v_mov_b32_e32(v[0], 5), # S0 = 5 + v_mov_b32_e32(v[1], 10), # S1 = 10 + v_subrev_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 10 - 5 - 1 = 4 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 4) + self.assertEqual(st.vcc, 0) # No borrow out + + def test_v_subrev_co_ci_u32_generates_borrow(self): + """V_SUBREV_CO_CI_U32: generates borrow when S0 + VCC_IN > S1.""" + instructions = [ + s_mov_b32(VCC_LO, 0), # VCC = 0 + v_mov_b32_e32(v[0], 10), # S0 = 10 + v_mov_b32_e32(v[1], 5), # S1 = 5 + v_subrev_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 5 - 10 - 0 = -5 (underflow) + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0xFFFFFFFB) # -5 as unsigned + self.assertEqual(st.vcc, 1) # Borrow out + + def test_v_add_co_ci_u32_no_carry(self): + """V_ADD_CO_CI_U32: D0 = S0 + S1 + VCC_IN, when VCC_IN=0.""" + instructions = [ + s_mov_b32(VCC_LO, 0), # VCC = 0 (no carry in) + v_mov_b32_e32(v[0], 5), # S0 = 5 + v_mov_b32_e32(v[1], 10), # S1 = 10 + v_add_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 5 + 10 + 0 = 15 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 15) + self.assertEqual(st.vcc, 0) # No carry out + + def 
test_v_add_co_ci_u32_with_carry(self): + """V_ADD_CO_CI_U32: D0 = S0 + S1 + VCC_IN, when VCC_IN=1.""" + instructions = [ + s_mov_b32(VCC_LO, 1), # VCC = 1 (carry in) + v_mov_b32_e32(v[0], 5), # S0 = 5 + v_mov_b32_e32(v[1], 10), # S1 = 10 + v_add_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 5 + 10 + 1 = 16 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 16) + self.assertEqual(st.vcc, 0) # No carry out + + def test_v_add_co_ci_u32_generates_carry(self): + """V_ADD_CO_CI_U32: generates carry when overflow occurs.""" + instructions = [ + s_mov_b32(VCC_LO, 1), # VCC = 1 (carry in) + s_mov_b32(s[0], 0xFFFFFFFF), # max u32 + v_mov_b32_e32(v[0], s[0]), # S0 = 0xFFFFFFFF + v_mov_b32_e32(v[1], 0), # S1 = 0 + v_add_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 0xFFFFFFFF + 0 + 1 = 0 (overflow) + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0) # Overflowed to 0 + self.assertEqual(st.vcc, 1) # Carry out + + def test_v_sub_co_ci_u32_no_borrow(self): + """V_SUB_CO_CI_U32: D0 = S0 - S1 - VCC_IN, when VCC_IN=0.""" + instructions = [ + s_mov_b32(VCC_LO, 0), # VCC = 0 (no borrow in) + v_mov_b32_e32(v[0], 10), # S0 = 10 + v_mov_b32_e32(v[1], 5), # S1 = 5 + v_sub_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 10 - 5 - 0 = 5 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 5) + self.assertEqual(st.vcc, 0) # No borrow out + + def test_v_sub_co_ci_u32_vop3sd_separate_carry_regs(self): + """VOP3SD V_SUB_CO_CI_U32: carry-in from src2, carry-out to sdst (separate registers). + + This tests the VOP3SD encoding where src2 specifies the carry-in register + independently from sdst (carry-out). The bug was reading carry-in from sdst + instead of src2. + + Computation: D0 = S0 - S1 - carry_in = 0 - 0 - 1 = -1 = 0xFFFFFFFF + """ + instructions = [ + s_mov_b32(s[6], 1), # carry-in = 1 (in s[6]) + s_mov_b32(s[10], 0), # carry-out dest = 0 initially (in s[10]) + # VOP3SD: v_sub_co_ci_u32(vdst, sdst, src0, src1, src2) + # src2 is carry-in (s[6]=1), sdst is carry-out (s[10]) + v_sub_co_ci_u32(v[0], s[10], 0, 0, s[6]), # D0 = 0 - 0 - 1 = -1 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xFFFFFFFF) # -1 as unsigned + self.assertEqual(st.sgpr[10], 1) # Borrow out to s[10] + + def test_v_add_co_ci_u32_vop3sd_separate_carry_regs(self): + """VOP3SD V_ADD_CO_CI_U32: carry-in from src2, carry-out to sdst (separate registers). + + This tests the VOP3SD encoding where src2 specifies the carry-in register + independently from sdst (carry-out). + + Computation: D0 = S0 + S1 + carry_in = 5 + 10 + 1 = 16 + """ + instructions = [ + s_mov_b32(s[6], 1), # carry-in = 1 (in s[6]) + s_mov_b32(s[10], 0), # carry-out dest = 0 initially (in s[10]) + # VOP3SD: v_add_co_ci_u32(vdst, sdst, src0, src1, src2) + v_add_co_ci_u32(v[0], s[10], 5, 10, s[6]), # D0 = 5 + 10 + 1 = 16 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 16) + self.assertEqual(st.sgpr[10], 0) # No carry out + + if __name__ == '__main__': unittest.main() diff --git a/extra/assembly/amd/test/hw/test_vop3.py b/extra/assembly/amd/test/hw/test_vop3.py index 7797312edb..9a4bbfa3a9 100644 --- a/extra/assembly/amd/test/hw/test_vop3.py +++ b/extra/assembly/amd/test/hw/test_vop3.py @@ -58,6 +58,95 @@ class TestFMA(unittest.TestCase): self.assertTrue(math.isinf(result) and result > 0) +class TestFmacE64(unittest.TestCase): + """Regression tests for V_FMAC_F32 VOP3 encoding (e64). 
+ + V_FMAC_F32: D0 = D0 + S0 * S1 (fused multiply-add with accumulator) + + The VOP3 encoding needs to read D0 from the destination register as the + accumulator input, not just write to it. + + Regression test for: VOP3 FMAC missing D0 accumulator bug. + """ + + def test_v_fmac_f32_e64_basic(self): + """V_FMAC_F32_E64: basic accumulate test.""" + instructions = [ + v_mov_b32_e32(v[0], 2.0), # S0 = 2.0 + v_mov_b32_e32(v[1], 3.0), # S1 = 3.0 + v_mov_b32_e32(v[2], 1.0), # D0 (accumulator) = 1.0 + # v_fmac_f32_e64 v[2], v[0], v[1] + # D0 = D0 + S0 * S1 = 1.0 + 2.0 * 3.0 = 7.0 + v_fmac_f32_e64(v[2], v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), 7.0, places=5) + + def test_v_fmac_f32_e64_with_sgpr_sources(self): + """V_FMAC_F32_E64 with SGPR sources (common in AMD_LLVM output). + + This tests the exact pattern that was failing: v_fmac_f32_e64(v[0], s[4], 0) + where src0 is SGPR and src1 is inline constant 0. + + Regression test for: VOP3 FMAC missing D0 accumulator bug. + """ + instructions = [ + s_mov_b32(s[4], f2i(2.0)), # S0 = 2.0 in SGPR + v_mov_b32_e32(v[0], 5.0), # D0 (accumulator) = 5.0 + # v_fmac_f32_e64 v[0], s[4], 0 + # D0 = D0 + S0 * S1 = 5.0 + 2.0 * 0.0 = 5.0 + v_fmac_f32_e64(v[0], s[4], 0), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][0]), 5.0, places=5) + + def test_v_fmac_f32_e64_with_two_sgprs(self): + """V_FMAC_F32_E64 with two SGPR sources. + + Tests pattern: v_fmac_f32_e64(v[0], s[a], s[b]) + + Regression test for: VOP3 FMAC missing D0 accumulator bug. + """ + instructions = [ + s_mov_b32(s[10], f2i(3.0)), # S0 = 3.0 + s_mov_b32(s[12], f2i(4.0)), # S1 = 4.0 + v_mov_b32_e32(v[9], 2.0), # D0 (accumulator) = 2.0 + # v_fmac_f32_e64 v[9], s[10], s[12] + # D0 = D0 + S0 * S1 = 2.0 + 3.0 * 4.0 = 14.0 + v_fmac_f32_e64(v[9], s[10], s[12]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][9]), 14.0, places=5) + + def test_v_fmac_f32_e64_accumulates_correctly(self): + """V_FMAC_F32_E64 accumulates multiple times.""" + instructions = [ + v_mov_b32_e32(v[0], 0.0), # D0 = 0.0 + v_mov_b32_e32(v[1], 1.0), # S0 = 1.0 + v_mov_b32_e32(v[2], 2.0), # S1 = 2.0 + # First: D0 = 0.0 + 1.0 * 2.0 = 2.0 + v_fmac_f32_e64(v[0], v[1], v[2]), + # Second: D0 = 2.0 + 1.0 * 2.0 = 4.0 + v_fmac_f32_e64(v[0], v[1], v[2]), + # Third: D0 = 4.0 + 1.0 * 2.0 = 6.0 + v_fmac_f32_e64(v[0], v[1], v[2]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][0]), 6.0, places=5) + + def test_v_fmac_f32_e64_negative_accumulator(self): + """V_FMAC_F32_E64 with negative accumulator.""" + instructions = [ + v_mov_b32_e32(v[0], 2.0), # S0 = 2.0 + v_mov_b32_e32(v[1], 3.0), # S1 = 3.0 + v_mov_b32_e32(v[2], -10.0), # D0 (accumulator) = -10.0 + # D0 = -10.0 + 2.0 * 3.0 = -4.0 + v_fmac_f32_e64(v[2], v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), -4.0, places=5) + + class TestDivScale(unittest.TestCase): """Tests for V_DIV_SCALE_F32.""" @@ -768,7 +857,7 @@ class TestF16Modifiers(unittest.TestCase): def test_v_fma_f16_inline_const_1_0(self): """V_FMA_F16: a*b + 1.0 should use f16 inline constant.""" - from extra.assembly.amd.pcode import f32_to_f16, _f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16 f16_a = f32_to_f16(0.325928) # ~0x3537 f16_b = f32_to_f16(-0.486572) # ~0xb7c9 instructions = [ @@ -785,7 +874,7 @@ class TestF16Modifiers(unittest.TestCase): def 
test_v_fma_f16_inline_const_0_5(self): """V_FMA_F16: a*b + 0.5 should use f16 inline constant.""" - from extra.assembly.amd.pcode import f32_to_f16, _f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16 f16_a = f32_to_f16(2.0) f16_b = f32_to_f16(3.0) instructions = [ @@ -802,7 +891,7 @@ class TestF16Modifiers(unittest.TestCase): def test_v_fma_f16_inline_const_neg_1_0(self): """V_FMA_F16: a*b + (-1.0) should use f16 inline constant.""" - from extra.assembly.amd.pcode import f32_to_f16, _f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16 f16_a = f32_to_f16(2.0) f16_b = f32_to_f16(3.0) instructions = [ @@ -819,7 +908,7 @@ class TestF16Modifiers(unittest.TestCase): def test_v_add_f16_abs_both(self): """V_ADD_F16 with abs on both operands.""" - from extra.assembly.amd.pcode import f32_to_f16, _f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16 f16_neg2 = f32_to_f16(-2.0) f16_neg3 = f32_to_f16(-3.0) instructions = [ @@ -835,7 +924,7 @@ class TestF16Modifiers(unittest.TestCase): def test_v_mul_f16_neg_abs(self): """V_MUL_F16 with neg on one operand and abs on another.""" - from extra.assembly.amd.pcode import f32_to_f16, _f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16 f16_2 = f32_to_f16(2.0) f16_neg3 = f32_to_f16(-3.0) instructions = [ @@ -854,7 +943,7 @@ class TestF16Modifiers(unittest.TestCase): This tests the case from AMD_LLVM sin(0) where V_FMAC_F16 writes to v0.h. """ - from extra.assembly.amd.pcode import _f16 + from extra.assembly.amd.test.hw.helpers import _f16 instructions = [ s_mov_b32(s[0], 0x38003c00), # v0 = {hi=0.5, lo=1.0} v_mov_b32_e32(v[0], s[0]), @@ -1621,6 +1710,27 @@ class TestCarryBorrow(unittest.TestCase): self.assertEqual(st.vgpr[0][4], 0x00000000, "lo result") self.assertEqual(st.vgpr[0][5], 0x00000003, "hi result") + def test_add_co_u32_same_dst_src(self): + """V_ADD_CO_U32 where dst is same as src - VCC must use original src value.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), + v_mov_b32_e32(v[0], s[0]), + v_add_co_u32(v[0], VCC, v[0], 1), # v[0] = v[0] + 1, VCC should be set from overflow + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0, "0xFFFFFFFF + 1 = 0") + self.assertEqual(st.vcc & 1, 1, "Should have carry from 0xFFFFFFFF + 1") + + def test_add_co_u32_same_dst_src_no_carry(self): + """V_ADD_CO_U32 where dst is same as src - no carry case.""" + instructions = [ + v_mov_b32_e32(v[0], 100), + v_add_co_u32(v[0], VCC, v[0], 1), # v[0] = v[0] + 1 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 101, "100 + 1 = 101") + self.assertEqual(st.vcc & 1, 0, "No carry from 100 + 1") + class TestReadlane(unittest.TestCase): """Tests for V_READLANE_B32 and related cross-lane operations.""" @@ -2292,5 +2402,414 @@ class TestAddF32EdgeCases(unittest.TestCase): self.assertEqual(st.vgpr[0][2], 0x80000000) # -0 +class TestDivScaleF64(unittest.TestCase): + """Tests for V_DIV_SCALE_F64 - critical for tan() and division. + + These tests verify that VCC bits are set independently per lane, + which is essential for correct multi-lane f64 division operations. 
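+
+    A minimal sketch of the per-lane update being verified (names here are
+    illustrative, not the emulator's real API):
+
+        for lane in active_lanes:
+            if needs_scaling(lane):          # e.g. zero or denormal denominator
+                vcc |= (1 << lane)           # set only this lane's bit
+            else:
+                vcc &= ~(1 << lane)          # never clobber other lanes' bits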
+ """ + + def test_div_scale_f64_basic_no_scaling(self): + """V_DIV_SCALE_F64: normal values with no scaling needed.""" + sqrt2 = f2i64(1.4142135623730951) + one = f2i64(1.0) + instructions = [ + s_mov_b32(s[0], sqrt2 & 0xffffffff), + s_mov_b32(s[1], sqrt2 >> 32), + s_mov_b32(s[2], one & 0xffffffff), + s_mov_b32(s[3], one >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + v_mov_b32_e32(v[3], s[3]), + VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[0:1], src2=v[2:3]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32)) + self.assertAlmostEqual(result, 1.4142135623730951, places=10) + self.assertEqual(st.vcc & 1, 0, "VCC should be 0 when no scaling needed") + + def test_div_scale_f64_vcc_per_lane_uniform_input(self): + """V_DIV_SCALE_F64: VCC bits should be set independently per lane (uniform input). + + This is a regression test for the bug where VCC = 0x0LL was setting the whole + 64-bit VCC register instead of just the current lane's bit. With uniform input + all lanes should get VCC=0. + """ + val = f2i64(2.0) + instructions = [ + s_mov_b32(s[0], val & 0xffffffff), + s_mov_b32(s[1], val >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[2:3], sdst=VCC, src0=v[0:1], src1=v[0:1], src2=v[0:1]), + ] + st = run_program(instructions, n_lanes=4) + # All lanes should have VCC=0 for normal values + self.assertEqual(st.vcc & 0xf, 0, "All lanes should have VCC=0 for normal values") + # All lanes should have same result + for lane in range(4): + result = i642f(st.vgpr[lane][2] | (st.vgpr[lane][3] << 32)) + self.assertAlmostEqual(result, 2.0, places=10, msg=f"Lane {lane} result mismatch") + + def test_div_scale_f64_vcc_per_lane_varying_input(self): + """V_DIV_SCALE_F64: VCC bits set per-lane with different inputs per lane. + + This test uses different inputs per lane to verify that VCC is tracked + independently. This catches the bug where the emulator was setting VCC + for all lanes to the same value. + """ + import math + # Use lane-varying input: lane 0 gets 2.0, lane 1 gets 3.0, etc. 
+ # All normal values should result in VCC=0 for each lane + instructions = [ + # Set up per-lane values using lane_id + v_cvt_f64_i32_e32(v[0:1], v[255]), # v0:1 = f64(lane_id) + v_add_f64(v[0:1], v[0:1], SrcEnum.POS_TWO), # v0:1 = lane_id + 2.0 + VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[2:3], sdst=VCC, src0=v[0:1], src1=v[0:1], src2=v[0:1]), + ] + st = run_program(instructions, n_lanes=4) + # All lanes should have VCC=0 (no scaling needed for 2.0, 3.0, 4.0, 5.0) + self.assertEqual(st.vcc & 0xf, 0, "All lanes should have VCC=0 for normal values") + # Verify each lane has correct result + for lane in range(4): + expected = float(lane) + 2.0 + result = i642f(st.vgpr[lane][2] | (st.vgpr[lane][3] << 32)) + self.assertAlmostEqual(result, expected, places=10, msg=f"Lane {lane}: expected {expected}, got {result}") + + def test_div_scale_f64_zero_denom_sets_vcc(self): + """V_DIV_SCALE_F64: zero denominator -> NaN, VCC=1.""" + import math + one = f2i64(1.0) + zero = f2i64(0.0) + instructions = [ + s_mov_b32(s[0], one & 0xffffffff), + s_mov_b32(s[1], one >> 32), + s_mov_b32(s[2], zero & 0xffffffff), + s_mov_b32(s[3], zero >> 32), + v_mov_b32_e32(v[0], s[0]), # numer = 1.0 + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), # denom = 0.0 + v_mov_b32_e32(v[3], s[3]), + VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[2:3], src2=v[0:1]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32)) + self.assertTrue(math.isnan(result), "Should be NaN for zero denom") + self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for zero denom") + + def test_div_scale_f64_mixed_vcc_per_lane(self): + """V_DIV_SCALE_F64: some lanes need scaling, others don't. + + This is the key test for the tan() bug - it verifies that VCC is set + correctly for each lane independently when some lanes need scaling and + others don't. 
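+
+        Expected mask for this setup (a sketch): only the zero-denominator
+        lanes raise their bit, so
+
+            expected_vcc = (1 << 1) | (1 << 3)   # == 0b1010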
+ """ + import math + # Lane 0: normal value (VCC=0), Lane 1: zero denom (VCC=1) + # Lane 2: normal value (VCC=0), Lane 3: zero denom (VCC=1) + normal = f2i64(2.0) + zero = f2i64(0.0) + instructions = [ + # Set up numer = 2.0 for all lanes + s_mov_b32(s[0], normal & 0xffffffff), + s_mov_b32(s[1], normal >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + # Set up denom: lane 0,2 get 2.0, lane 1,3 get 0.0 + s_mov_b32(s[2], zero & 0xffffffff), + s_mov_b32(s[3], zero >> 32), + v_mov_b32_e32(v[2], s[0]), # default to 2.0 + v_mov_b32_e32(v[3], s[1]), + # Override lanes 1 and 3 with 0.0 using writelane + v_writelane_b32(v[2], s[2], 1), + v_writelane_b32(v[3], s[3], 1), + v_writelane_b32(v[2], s[2], 3), + v_writelane_b32(v[3], s[3], 3), + VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[2:3], src2=v[0:1]), + ] + st = run_program(instructions, n_lanes=4) + # Lanes 0,2 should have VCC=0 (normal), lanes 1,3 should have VCC=1 (zero denom) + self.assertEqual(st.vcc & 0b0001, 0, "Lane 0 VCC should be 0") + self.assertEqual(st.vcc & 0b0010, 0b0010, "Lane 1 VCC should be 1") + self.assertEqual(st.vcc & 0b0100, 0, "Lane 2 VCC should be 0") + self.assertEqual(st.vcc & 0b1000, 0b1000, "Lane 3 VCC should be 1") + + # Check results + for lane in [0, 2]: + result = i642f(st.vgpr[lane][4] | (st.vgpr[lane][5] << 32)) + self.assertAlmostEqual(result, 2.0, places=10, msg=f"Lane {lane} should be 2.0") + for lane in [1, 3]: + result = i642f(st.vgpr[lane][4] | (st.vgpr[lane][5] << 32)) + self.assertTrue(math.isnan(result), f"Lane {lane} should be NaN") + + +class TestDivFmasF64(unittest.TestCase): + """Tests for V_DIV_FMAS_F64 - scaling FMA for f64 division. + + These tests verify that V_DIV_FMAS applies the correct scaling + based on VCC per lane, which is essential for correct tan() results. 
+ """ + + def test_div_fmas_f64_no_scale_vcc0(self): + """V_DIV_FMAS_F64: VCC=0 -> normal FMA, no scaling.""" + a = f2i64(2.0) + b = f2i64(3.0) + c = f2i64(1.0) + instructions = [ + s_mov_b32(VCC_LO, 0), + s_mov_b32(s[0], a & 0xffffffff), + s_mov_b32(s[1], a >> 32), + s_mov_b32(s[2], b & 0xffffffff), + s_mov_b32(s[3], b >> 32), + s_mov_b32(s[4], c & 0xffffffff), + s_mov_b32(s[5], c >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + v_mov_b32_e32(v[3], s[3]), + v_mov_b32_e32(v[4], s[4]), + v_mov_b32_e32(v[5], s[5]), + v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f(st.vgpr[0][6] | (st.vgpr[0][7] << 32)) + expected = 2.0 * 3.0 + 1.0 # = 7.0 + self.assertAlmostEqual(result, expected, places=10) + + def test_div_fmas_f64_scale_up_vcc1_large_s2(self): + """V_DIV_FMAS_F64: VCC=1 with S2 exponent > 1023 -> scale by 2^+128.""" + a = f2i64(1.0) + b = f2i64(1.0) + c = f2i64(2.0) # exponent = 1024 > 1023, so scale UP + instructions = [ + s_mov_b32(VCC_LO, 1), + s_mov_b32(s[0], a & 0xffffffff), + s_mov_b32(s[1], a >> 32), + s_mov_b32(s[2], b & 0xffffffff), + s_mov_b32(s[3], b >> 32), + s_mov_b32(s[4], c & 0xffffffff), + s_mov_b32(s[5], c >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + v_mov_b32_e32(v[3], s[3]), + v_mov_b32_e32(v[4], s[4]), + v_mov_b32_e32(v[5], s[5]), + v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f(st.vgpr[0][6] | (st.vgpr[0][7] << 32)) + expected = (1.0 * 1.0 + 2.0) * (2.0 ** 128) # = 3.0 * 2^128 + self.assertAlmostEqual(result, expected, delta=abs(expected) * 1e-10) + + def test_div_fmas_f64_scale_down_vcc1_small_s2(self): + """V_DIV_FMAS_F64: VCC=1 with S2 exponent <= 1023 -> scale by 2^-128.""" + a = f2i64(2.0) + b = f2i64(3.0) + c = f2i64(1.0) # exponent = 1023, so scale DOWN + instructions = [ + s_mov_b32(VCC_LO, 1), + s_mov_b32(s[0], a & 0xffffffff), + s_mov_b32(s[1], a >> 32), + s_mov_b32(s[2], b & 0xffffffff), + s_mov_b32(s[3], b >> 32), + s_mov_b32(s[4], c & 0xffffffff), + s_mov_b32(s[5], c >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + v_mov_b32_e32(v[3], s[3]), + v_mov_b32_e32(v[4], s[4]), + v_mov_b32_e32(v[5], s[5]), + v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f(st.vgpr[0][6] | (st.vgpr[0][7] << 32)) + expected = (2.0 * 3.0 + 1.0) * (2.0 ** -128) # = 7.0 * 2^-128 + self.assertAlmostEqual(result, expected, delta=abs(expected) * 1e-10) + + def test_div_fmas_f64_per_lane_vcc_varying(self): + """V_DIV_FMAS_F64: different VCC per lane applies different scaling. + + This is the key test for the tan() bug - verifies that scaling is + applied per-lane based on VCC bits, not uniformly. 
+ """ + a = f2i64(1.0) + b = f2i64(1.0) + c = f2i64(1.0) # exponent = 1023, so when VCC=1 it scales DOWN + instructions = [ + # VCC = 0b0101: lanes 0,2 scale, lanes 1,3 don't + s_mov_b32(VCC_LO, 0b0101), + s_mov_b32(s[0], a & 0xffffffff), + s_mov_b32(s[1], a >> 32), + s_mov_b32(s[2], b & 0xffffffff), + s_mov_b32(s[3], b >> 32), + s_mov_b32(s[4], c & 0xffffffff), + s_mov_b32(s[5], c >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + v_mov_b32_e32(v[3], s[3]), + v_mov_b32_e32(v[4], s[4]), + v_mov_b32_e32(v[5], s[5]), + v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]), + ] + st = run_program(instructions, n_lanes=4) + + scaled = (1.0 * 1.0 + 1.0) * (2.0 ** -128) # = 2.0 * 2^-128 + unscaled = 1.0 * 1.0 + 1.0 # = 2.0 + + # Lane 0: VCC=1, scale + result0 = i642f(st.vgpr[0][6] | (st.vgpr[0][7] << 32)) + self.assertAlmostEqual(result0, scaled, delta=abs(scaled) * 1e-10, msg="Lane 0 should be scaled") + + # Lane 1: VCC=0, no scale + result1 = i642f(st.vgpr[1][6] | (st.vgpr[1][7] << 32)) + self.assertAlmostEqual(result1, unscaled, places=10, msg="Lane 1 should be unscaled") + + # Lane 2: VCC=1, scale + result2 = i642f(st.vgpr[2][6] | (st.vgpr[2][7] << 32)) + self.assertAlmostEqual(result2, scaled, delta=abs(scaled) * 1e-10, msg="Lane 2 should be scaled") + + # Lane 3: VCC=0, no scale + result3 = i642f(st.vgpr[3][6] | (st.vgpr[3][7] << 32)) + self.assertAlmostEqual(result3, unscaled, places=10, msg="Lane 3 should be unscaled") + + +class TestDivScaleFmasF64Integration(unittest.TestCase): + """Integration tests for V_DIV_SCALE_F64 + V_DIV_FMAS_F64. + + These tests verify the full division sequence used by tan() works + correctly with multiple lanes having different values. + """ + + def test_div_scale_then_fmas_multi_lane_tan_pattern(self): + """Test the pattern used by tan(): DIV_SCALE sets VCC, DIV_FMAS uses it. + + This is the exact bug scenario: tan([2.0, 3.0, 4.0]) was failing because + VCC from DIV_SCALE was being set incorrectly for all lanes. 
+ """ + import math + # Set up values like tan() would: different values per lane + instructions = [ + # Create per-lane values: 2.0, 3.0, 4.0, 5.0 + v_cvt_f64_i32_e32(v[0:1], v[255]), # v0:1 = f64(lane_id) + v_add_f64(v[0:1], v[0:1], SrcEnum.POS_TWO), # numer = lane_id + 2.0 + # denom = 1.0 for all lanes (uniform) + v_mov_b32_e32(v[2], f2i64(1.0) & 0xffffffff), + v_mov_b32_e32(v[3], f2i64(1.0) >> 32), + # V_DIV_SCALE_F64: sets VCC per lane + VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[2:3], src2=v[0:1]), + # Copy scaled numer for FMA + v_mov_b32_e32(v[6], v[4]), + v_mov_b32_e32(v[7], v[5]), + # V_DIV_FMAS_F64: uses VCC to apply scaling + v_div_fmas_f64(v[8:9], v[6:7], v[2:3], v[4:5]), + ] + st = run_program(instructions, n_lanes=4) + + # All lanes should have VCC=0 (no scaling needed for normal values) + self.assertEqual(st.vcc & 0xf, 0, "All lanes should have VCC=0 for normal values") + + # Verify each lane has correct intermediate value + for lane in range(4): + expected_numer = float(lane) + 2.0 + # With VCC=0, DIV_FMAS should just do FMA with no scaling + result = i642f(st.vgpr[lane][8] | (st.vgpr[lane][9] << 32)) + # The FMA result should be: scaled_numer * denom + scaled_numer = 2*scaled_numer + expected = expected_numer * 1.0 + expected_numer # Simple FMA for this test setup + self.assertAlmostEqual(result, expected, places=8, + msg=f"Lane {lane}: expected {expected}, got {result}") + + +class TestVOP3VOPC(unittest.TestCase): + """Tests for VOP3-encoded VOPC instructions (comparisons with scalar dest).""" + + def test_v_cmp_ge_f32_e64_nan(self): + """V_CMP_GE_F32_E64: |NaN| >= |0.0| should be FALSE (NaN comparisons always false).""" + from extra.assembly.amd.autogen.rdna3.ins import VOP3_SDST + instructions = [ + s_mov_b32(s[0], 0xffc00000), # NaN + s_mov_b32(s[1], 0x00000000), # 0.0 + v_mov_b32_e32(v[5], s[0]), + v_mov_b32_e32(v[3], s[1]), + VOP3_SDST(VOP3Op.V_CMP_GE_F32, vdst=s[5], src0=v[5], src1=v[3], abs_=3), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[5], 0) # NaN comparison is always FALSE + + +class TestMin3Max3Unsigned(unittest.TestCase): + """Regression tests for V_MIN3/V_MAX3 with unsigned integer types. + + The emulator's _minmax_reduce used UOp.minimum() which implements min(a,b) as + -max(-a,-b). This is broken for unsigned types because negation (mul by -1) + doesn't preserve ordering: for uint16, -0 = 0 but -5 = 65531, so + max(-0, -5) = max(0, 65531) = 65531, and -65531 = 5, giving min(0,5) = 5 (wrong!). + + Fix: use comparison-based min/max for unsigned types: min(a,b) = (abool cast.""" + instructions = [ + v_mov_b32_e32(v[1], 0), + v_cmp_eq_u32_e32(1, v[255]), # vcc = (lane == 1) + v_cndmask_b32_e64(v[1], v[1], 1, VCC_LO), # v1[lane1] = 1 + v_cmp_ne_u32_e32(0, v[1]), # vcc = (0 != v1) + v_cndmask_b32_e64(v[0], 0, 1, VCC_LO), # v0 = vcc ? 
1 : 0 + ] + st = run_program(instructions, n_lanes=2) + self.assertEqual(st.vgpr[0][0], 0, "lane 0: 0 != 0 should be false") + self.assertEqual(st.vgpr[1][0], 1, "lane 1: 0 != 1 should be true") + self.assertEqual(st.vcc & 0x3, 0x2, "VCC should be 0b10") + + def test_v_cmp_ne_u32_all_nonzero(self): + """V_CMP_NE_U32: all lanes have nonzero values.""" + instructions = [ + v_mov_b32_e32(v[1], 5), + v_cmp_ne_u32_e32(0, v[1]), + ] + st = run_program(instructions, n_lanes=4) + self.assertEqual(st.vcc & 0xf, 0xf, "All lanes should be != 0") + def test_cmp_eq_u16_opsel_lo_lo(self): """V_CMP_EQ_U16 comparing lo halves.""" instructions = [ @@ -448,6 +471,242 @@ class TestCmpFloat(unittest.TestCase): self.assertEqual((st.vcc >> 1) & 1, 0, "Lane 1: expected vcc=0 (2.0 < 1.0)") +class TestVOP3VOPCModifiers(unittest.TestCase): + """Tests for VOP3 VOPC with abs/neg modifiers.""" + + def test_v_cmp_ge_f32_abs_both(self): + """v_cmp_ge_f32 with abs on both sources: abs(0.0) >= abs(-1.0) = false. + + Regression test: int16 mod operation uses v_cmp_ge_f32 with abs modifiers. + """ + instructions = [ + v_mov_b32_e32(v[0], 0.0), + v_mov_b32_e32(v[1], -1.0), + # abs=0b11 means abs(src0) and abs(src1) + v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "abs(0.0) >= abs(-1.0) should be false") + + def test_v_cmp_ge_f32_abs_negative_divisor(self): + """v_cmp_ge_f32 with abs: remainder check for negative divisor. + + Tests the exact comparison used in int16 mod: abs(rem_f) >= abs(div_f). + For 1 % -1: rem_f = 0.0, div_f = -1.0, so abs(0.0) >= abs(-1.0) = false. + """ + instructions = [ + v_mov_b32_e32(v[0], 0.0), # remainder as float + v_mov_b32_e32(v[1], -1.0), # divisor as float + v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "abs(0.0) >= abs(-1.0) should be false") + + def test_v_cmp_ge_f32_abs_small_remainder(self): + """v_cmp_ge_f32 with abs: abs(-0.5) >= abs(-3.0) = false.""" + instructions = [ + v_mov_b32_e32(v[0], -0.5), + v_mov_b32_e32(v[1], -3.0), + v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "abs(-0.5) >= abs(-3.0) should be false") + + def test_v_cmp_ge_f32_abs_equal(self): + """v_cmp_ge_f32 with abs: abs(-1.0) >= abs(1.0) = true.""" + instructions = [ + v_mov_b32_e32(v[0], -1.0), + v_mov_b32_e32(v[1], 1.0), + v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "abs(-1.0) >= abs(1.0) should be true") + + +class TestVOP3VOPC64Bit(unittest.TestCase): + """Tests for VOP3 VOPC with 64-bit operands.""" + + def test_v_cmp_lt_f64_basic(self): + """v_cmp_lt_f64: 0.0 < 1.0 = true.""" + zero_f64 = f2i64(0.0) + one_f64 = f2i64(1.0) + instructions = [ + s_mov_b32(s[0], zero_f64 & 0xffffffff), + s_mov_b32(s[1], zero_f64 >> 32), + s_mov_b32(s[2], one_f64 & 0xffffffff), + s_mov_b32(s[3], one_f64 >> 32), + v_cmp_lt_f64_e64(VCC_LO, s[0:1], s[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "0.0 < 1.0 should be true") + + def test_v_cmp_lt_f64_negative(self): + """v_cmp_lt_f64: -1.0 < 0.0 = true.""" + neg_one_f64 = f2i64(-1.0) + zero_f64 = f2i64(0.0) + instructions = [ + s_mov_b32(s[0], neg_one_f64 & 0xffffffff), + s_mov_b32(s[1], neg_one_f64 >> 32), + s_mov_b32(s[2], zero_f64 & 0xffffffff), + s_mov_b32(s[3], zero_f64 >> 32), + v_cmp_lt_f64_e64(VCC_LO, 
s[0:1], s[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "-1.0 < 0.0 should be true") + + def test_v_cmp_lt_i64_signed(self): + """v_cmp_lt_i64: 0 < -1 (signed) = false.""" + instructions = [ + s_mov_b32(s[0], 0), + s_mov_b32(s[1], 0), # s[0:1] = 0 + s_mov_b32(s[2], 0xffffffff), + s_mov_b32(s[3], 0xffffffff), # s[2:3] = -1 + v_cmp_lt_i64_e64(VCC_LO, s[0:1], s[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "0 < -1 (signed) should be false") + + def test_v_cmp_lt_u64_unsigned(self): + """v_cmp_lt_u64: 0 < 0xFFFFFFFFFFFFFFFF (unsigned) = true.""" + instructions = [ + s_mov_b32(s[0], 0), + s_mov_b32(s[1], 0), # s[0:1] = 0 + s_mov_b32(s[2], 0xffffffff), + s_mov_b32(s[3], 0xffffffff), # s[2:3] = max uint64 + v_cmp_lt_u64_e64(VCC_LO, s[0:1], s[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "0 < max_uint64 should be true") + + +class TestVOPCF64(unittest.TestCase): + """Tests for VOPC (E32 encoding) with 64-bit float operands. Regression test for f64 compare bug.""" + + def test_v_cmp_lt_f64_e32_true(self): + """v_cmp_lt_f64_e32: 2.0 < 3.0 = true.""" + lo0, hi0 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32 + lo1, hi1 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32 + instructions = [ + s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0), + s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1), + v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]), + v_cmp_lt_f64_e32(v[0:1], v[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "2.0 < 3.0 should be true") + + def test_v_cmp_lt_f64_e32_false(self): + """v_cmp_lt_f64_e32: 3.0 < 2.0 = false.""" + lo0, hi0 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32 + lo1, hi1 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32 + instructions = [ + s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0), + s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1), + v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]), + v_cmp_lt_f64_e32(v[0:1], v[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "3.0 < 2.0 should be false") + + def test_v_cmp_nlt_f64_e32_true(self): + """v_cmp_nlt_f64_e32: !(3.0 < 2.0) = true.""" + lo0, hi0 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32 + lo1, hi1 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32 + instructions = [ + s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0), + s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1), + v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]), + v_cmp_nlt_f64_e32(v[0:1], v[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "!(3.0 < 2.0) should be true") + + def test_v_cmp_nlt_f64_e32_false(self): + """v_cmp_nlt_f64_e32: !(2.0 < 3.0) = false.""" + lo0, hi0 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32 + lo1, hi1 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32 + instructions = [ + s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0), + s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1), + v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]), + v_cmp_nlt_f64_e32(v[0:1], v[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "!(2.0 < 3.0) should be false") + + +class TestCmpxExec(unittest.TestCase): + """Tests for V_CMPX instructions that modify EXEC mask.""" + + def test_v_cmpx_ngt_f32_e64_all_true(self): + 
"""V_CMPX_NGT_F32_E64: all lanes pass (literal <= all values).""" + # 131072.0 = 0x48000000 + # All values > 131072, so !(131072 > val) = true for all + instructions = [ + s_mov_b32(EXEC_LO, 0x7), # 3 lanes active + v_mov_b32_e32(v[0], f2i(200000.0)), # lane 0 + v_cmp_eq_u32_e32(1, v[255]), + v_cndmask_b32_e64(v[1], v[0], f2i(300000.0), VCC_LO), # lane 1 + v_cmp_eq_u32_e32(2, v[255]), + v_cndmask_b32_e64(v[1], v[1], f2i(400000.0), VCC_LO), # lane 2 + # Now v[1] has: lane0=200000, lane1=300000, lane2=400000 + # Compare: !(131072.0 > v[1]) i.e., 131072.0 <= v[1] + v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[1]), + ] + st = run_program(instructions, n_lanes=3) + # All values > 131072, so all lanes should remain active + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x7, "All 3 lanes should remain active") + + def test_v_cmpx_ngt_f32_e64_some_false(self): + """V_CMPX_NGT_F32_E64: some lanes fail (literal > some values).""" + instructions = [ + s_mov_b32(EXEC_LO, 0x7), # 3 lanes active + v_mov_b32_e32(v[0], f2i(100000.0)), # lane 0: 131072 > 100000 = true, so !(true) = false + v_cmp_eq_u32_e32(1, v[255]), + v_cndmask_b32_e64(v[1], v[0], f2i(200000.0), VCC_LO), # lane 1: 131072 > 200000 = false, so !(false) = true + v_cmp_eq_u32_e32(2, v[255]), + v_cndmask_b32_e64(v[1], v[1], f2i(150000.0), VCC_LO), # lane 2: 131072 > 150000 = false, so !(false) = true + v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[1]), + ] + st = run_program(instructions, n_lanes=3) + # lane 0: fail (100000 < 131072), lanes 1,2: pass + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x6, "Lanes 1,2 should be active, lane 0 inactive") + + def test_v_cmpx_ngt_f32_e64_all_false(self): + """V_CMPX_NGT_F32_E64: all lanes fail (literal > all values).""" + instructions = [ + s_mov_b32(EXEC_LO, 0x7), # 3 lanes active + v_mov_b32_e32(v[0], f2i(100.0)), # all lanes have 100.0 + # 131072 > 100 = true, so !(true) = false for all + v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[0]), + ] + st = run_program(instructions, n_lanes=3) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x0, "All lanes should be inactive") + + def test_v_cmpx_ngt_f32_e64_large_values(self): + """V_CMPX_NGT_F32_E64: test with values that trigger Payne-Hanek in sin(). + + This is a regression test for the sin(859240.0) bug. + Values 859240, 1000000, 100594688 should all pass !(131072 > val). + """ + instructions = [ + s_mov_b32(EXEC_LO, 0x7), # 3 lanes active + v_mov_b32_e32(v[0], f2i(859240.0)), # lane 0 + v_cmp_eq_u32_e32(1, v[255]), + v_cndmask_b32_e64(v[1], v[0], f2i(1000000.0), VCC_LO), # lane 1 + v_cmp_eq_u32_e32(2, v[255]), + v_cndmask_b32_e64(v[1], v[1], f2i(100594688.0), VCC_LO), # lane 2 + v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[1]), + ] + st = run_program(instructions, n_lanes=3) + # All values > 131072, so !(131072 > val) = true for all + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x7, "All 3 lanes should remain active") + + class TestVCCBehavior(unittest.TestCase): """Tests for VCC condition code behavior.""" @@ -472,5 +731,101 @@ class TestVCCBehavior(unittest.TestCase): self.assertEqual(st.vcc >> 16, 0x0000, "Lanes 16-31 should be false") +class TestCmpxPartialWavefront(unittest.TestCase): + """Tests for V_CMPX with partial wavefronts (fewer than 32 active lanes). + + Regression tests for bug where v_cmpx incorrectly set EXEC bits for inactive + lanes when the wavefront had fewer than 32 lanes. This caused garbage data + from uninitialized lanes to corrupt memory writes. 
+ """ + + def test_v_cmpx_eq_u32_partial_wave_3_lanes(self): + """V_CMPX_EQ_U32 with 3 active lanes should only affect those 3 lanes. + + With n_lanes=3, initial EXEC=0x7. After v_cmpx comparing lane_id == 1, + only lane 1 should pass, so EXEC should become 0x2 (not have bits 3-31 set). + """ + instructions = [ + v_cmpx_eq_u32_e32(1, v[255]), # EXEC = lanes where lane_id == 1 + ] + st = run_program(instructions, n_lanes=3) + # Only lane 1 should be active (bit 1 set) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x2, + "Only lane 1 should be active after v_cmpx_eq_u32 with 3 lanes") + + def test_v_cmpx_eq_u32_partial_wave_5_lanes(self): + """V_CMPX_EQ_U32 with 5 active lanes.""" + instructions = [ + v_cmpx_eq_u32_e32(3, v[255]), # EXEC = lanes where lane_id == 3 + ] + st = run_program(instructions, n_lanes=5) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x8, + "Only lane 3 should be active after v_cmpx_eq_u32 with 5 lanes") + + def test_v_cmpx_lt_u32_partial_wave(self): + """V_CMPX_LT_U32 with partial wavefront.""" + # VOPC: src0 < vsrc1, so we need v_cmpx_gt_u32 to get lane_id < 2 + instructions = [ + v_cmpx_gt_u32_e32(2, v[255]), # EXEC = lanes where 2 > lane_id (i.e., lane_id < 2) + ] + st = run_program(instructions, n_lanes=4) + # Lanes 0,1 should be active (bits 0,1 set = 0x3) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x3, + "Only lanes 0,1 should be active after v_cmpx_gt_u32(2, lane_id) with 4 lanes") + + def test_v_cmpx_ge_u32_partial_wave(self): + """V_CMPX_GE_U32 with partial wavefront.""" + # VOPC: src0 >= vsrc1, so v_cmpx_le_u32(1, lane_id) gives lane_id >= 2? No. + # v_cmpx_le_u32(src0, vsrc1) = src0 <= vsrc1 = 1 <= lane_id + instructions = [ + v_cmpx_le_u32_e32(2, v[255]), # EXEC = lanes where 2 <= lane_id (i.e., lane_id >= 2) + ] + st = run_program(instructions, n_lanes=4) + # Lanes 2,3 should be active (bits 2,3 set = 0xC) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0xC, + "Only lanes 2,3 should be active after v_cmpx_le_u32(2, lane_id) with 4 lanes") + + def test_v_cmpx_ne_u32_partial_wave_all_pass(self): + """V_CMPX_NE_U32 where all active lanes pass.""" + instructions = [ + v_cmpx_ne_u32_e32(99, v[255]), # EXEC = lanes where lane_id != 99 + ] + st = run_program(instructions, n_lanes=3) + # All 3 lanes should remain active (bits 0,1,2 set = 0x7) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x7, + "All 3 lanes should remain active when all pass") + + def test_v_cmpx_eq_u32_partial_wave_none_pass(self): + """V_CMPX_EQ_U32 where no active lanes pass.""" + instructions = [ + v_cmpx_eq_u32_e32(99, v[255]), # EXEC = lanes where lane_id == 99 + ] + st = run_program(instructions, n_lanes=3) + # No lanes should be active + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x0, + "No lanes should be active when none pass") + + def test_v_cmpx_f32_partial_wave(self): + """V_CMPX_GT_F32 with partial wavefront - float comparison.""" + instructions = [ + v_cvt_f32_u32_e32(v[0], v[255]), # v[0] = float(lane_id) + v_mov_b32_e32(v[1], f2i(0.5)), # v[1] = 0.5 + v_cmpx_gt_f32_e32(v[0], v[1]), # EXEC = lanes where v[0] > 0.5 + ] + st = run_program(instructions, n_lanes=4) + # Lanes 1,2,3 have values > 0.5, lane 0 has 0.0 + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0xE, + "Lanes 1,2,3 should be active (float > 0.5)") + + def test_v_cmpx_e64_partial_wave(self): + """V_CMPX_EQ_U32_E64 (VOP3 encoding) with partial wavefront.""" + instructions = [ + v_cmpx_eq_u32_e64(EXEC_LO, v[255], 2), # EXEC = lanes where lane_id 
== 2 + ] + st = run_program(instructions, n_lanes=4) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x4, + "Only lane 2 should be active after v_cmpx_eq_u32_e64") + + if __name__ == '__main__': unittest.main() diff --git a/extra/assembly/amd/test/hw/test_vopd.py b/extra/assembly/amd/test/hw/test_vopd.py new file mode 100644 index 0000000000..15c67ba448 --- /dev/null +++ b/extra/assembly/amd/test/hw/test_vopd.py @@ -0,0 +1,161 @@ +"""Tests for VOPD instructions - dual-issue vector operations. + +VOPD executes two operations simultaneously. Key behavior: +- Both ops read their sources BEFORE either writes (dual-issue semantics) +- This means if X writes to a register that Y reads, Y sees the OLD value +- Op X can use ops 0-15 (FMAC, MUL, ADD, MOV, etc.) +- Op Y can use ops 0-18 (includes ADD_NC_U32, LSHLREV, AND) +""" +import unittest +from extra.assembly.amd.test.hw.helpers import run_program, run_program_emu, run_program_hw, compare_wave_states, \ + v, s, v_mov_b32_e32, s_mov_b32 +from extra.assembly.amd.autogen.rdna3.ins import VOPD, VOPD_LIT, VOPDOp + +class TestVOPDBasic(unittest.TestCase): + """Basic VOPD functionality tests.""" + + def test_vopd_dual_mov(self): + """VOPD with two MOV operations to different registers.""" + instructions = [ + v_mov_b32_e32(v[0], 0x12345678), + v_mov_b32_e32(v[1], 0xDEADBEEF), + # X: v[2] = v[0], Y: v[3] = v[1] + VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[1], v[0], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0x12345678) + self.assertEqual(st.vgpr[0][3], 0xDEADBEEF) + + def test_vopd_mov_and_add(self): + """VOPD with MOV (X) and ADD_NC_U32 (Y) - ADD_NC_U32 can only be Y op.""" + instructions = [ + v_mov_b32_e32(v[0], 10), + v_mov_b32_e32(v[1], 5), + # X: v[2] = 100 (literal), Y: v[3] = v[0] + v[1] = 15 + VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[3], 100, v[0], v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 100) + self.assertEqual(st.vgpr[0][3], 15) + + +class TestVOPDReadBeforeWrite(unittest.TestCase): + """Tests for VOPD dual-issue read-before-write semantics. + + In VOPD, both X and Y operations read their sources BEFORE either writes. + This is critical when X's destination is Y's source. + """ + + def test_vopd_x_writes_y_reads_same_reg(self): + """VOPD where X writes to a register that Y reads. + + X: v[2] = 0 (overwrites v[2]) + Y: v[1] = v[2] + v[0] (srcy0=v[2], vsrcy1=v[0]) + + If reads happen before writes: v[1] = OLD_v[2] + v[0] = 0xFFFFFFFF + 1 = 0 + If writes happen before reads: v[1] = 0 + v[0] = 0 + 1 = 1 + + Hardware does reads-before-writes, so v[1] should be 0. 
+ """ + instructions = [ + v_mov_b32_e32(v[0], 1), # v[0] = 1 + v_mov_b32_e32(v[1], 0x99999999), # v[1] = placeholder (will be overwritten) + v_mov_b32_e32(v[2], 0xFFFFFFFF), # v[2] = 0xFFFFFFFF + # X: v[2] = 0 (literal), srcx0=0, vsrcx1=v[0] (unused for MOV) + # Y: v[1] = srcy0 + vsrcy1 = v[2] + v[0] (should read OLD v[2] = 0xFFFFFFFF) + # vdsty encoding: (vdsty << 1) | ((vdstx & 1) ^ 1) where vdsty field = 0, vdstx = v[2] + # So vdsty_reg = (0 << 1) | ((2 & 1) ^ 1) = 0 | 1 = 1 = v[1] + VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 0, v[2], v[0], v[0]), + ] + st = run_program(instructions, n_lanes=1) + # X should have written 0 to v[2] + self.assertEqual(st.vgpr[0][2], 0, "X should write 0 to v[2]") + # Y should have read OLD v[2] (0xFFFFFFFF) and added v[0] (1) + # 0xFFFFFFFF + 1 = 0 (wrap around) + self.assertEqual(st.vgpr[0][1], 0, "Y should read OLD v[2]=0xFFFFFFFF, compute 0xFFFFFFFF+1=0") + + def test_vopd_x_writes_y_reads_same_reg_v2(self): + """VOPD where X writes to a register that Y reads - cleaner test case. + + X: v[2] = 0 (MOV) + Y: v[1] = v[2] + v[2] (ADD_NC_U32 with both sources from v[2]) + + If reads happen before writes: v[1] = OLD_v[2] + OLD_v[2] = 100 + 100 = 200 + If writes happen before reads: v[1] = 0 + 0 = 0 + + Hardware does reads-before-writes, so v[1] should be 200. + """ + instructions = [ + v_mov_b32_e32(v[0], 0x88888888), # v[0] = unused placeholder + v_mov_b32_e32(v[1], 0x99999999), # v[1] = placeholder (will be overwritten) + v_mov_b32_e32(v[2], 100), # v[2] = 100 + # X: v[2] = 0 (literal) + # Y: v[1] = srcy0 + vsrcy1 = v[2] + v[2] (should read OLD v[2] = 100) + VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 0, v[2], v[0], v[2]), + ] + st = run_program(instructions, n_lanes=1) + # X should have written 0 to v[2] + self.assertEqual(st.vgpr[0][2], 0, "X should write 0 to v[2]") + # Y should have read OLD v[2] (100) twice and added them + self.assertEqual(st.vgpr[0][1], 200, "Y should read OLD v[2]=100 twice, compute 100+100=200") + + +class TestVOPDLiterals(unittest.TestCase): + """Tests for VOPD instructions that use SIMM32 literals (FMAAK, FMAMK).""" + + def test_vopd_fmaak_f32(self): + """VOPD V_DUAL_FMAAK_F32: D = S0 * S1 + SIMM32 (literal addend). + + Tests that the 32-bit literal (SIMM32) is correctly passed to the instruction. + fma(2.0, 3.0, 10.0) = 2*3 + 10 = 16.0 + """ + from extra.assembly.amd.test.hw.helpers import f2i, i2f + instructions = [ + v_mov_b32_e32(v[0], f2i(2.0)), # v[0] = 2.0 + v_mov_b32_e32(v[1], f2i(3.0)), # v[1] = 3.0 + # VOPD args: opx, opy, vdstx, vdsty, srcx0, srcy0, vsrcx1, vsrcy1 + # X: v[2] = fma(srcx0, vsrcx1, SIMM32) = v[0]*v[1]+10.0 = 2*3+10 = 16 + # Y: v[3] = srcy0 (MOV) = v[0] = 2.0 + VOPD_LIT(VOPDOp.V_DUAL_FMAAK_F32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[0], v[1], v[0], literal=f2i(10.0)), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), 16.0, places=5, msg="fma(2.0, 3.0, 10.0) should be 16.0") + + def test_vopd_fmamk_f32(self): + """VOPD V_DUAL_FMAMK_F32: D = S0 * SIMM32 + S1 (literal multiplier). + + Tests that the 32-bit literal (SIMM32) is correctly used as the multiplier. 
+ fma(2.0, 5.0, 3.0) = 2*5 + 3 = 13.0 + """ + from extra.assembly.amd.test.hw.helpers import f2i, i2f + instructions = [ + v_mov_b32_e32(v[0], f2i(2.0)), # v[0] = 2.0 + v_mov_b32_e32(v[1], f2i(3.0)), # v[1] = 3.0 + # X: v[2] = fma(srcx0, SIMM32, vsrcx1) = v[0]*5.0+v[1] = 2*5+3 = 13 + # Y: v[3] = srcy0 (MOV) = v[0] = 2.0 + VOPD_LIT(VOPDOp.V_DUAL_FMAMK_F32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[0], v[1], v[0], literal=f2i(5.0)), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), 13.0, places=5, msg="fma(2.0, 5.0, 3.0) should be 13.0") + + +class TestVOPDMultilane(unittest.TestCase): + """Tests for VOPD with multiple lanes.""" + + def test_vopd_multilane_mov_add(self): + """VOPD MOV and ADD with multiple active lanes - no register conflict.""" + instructions = [ + v_mov_b32_e32(v[0], 5), + v_mov_b32_e32(v[1], 10), + # X: v[2] = 100 (constant), Y: v[1] = v[0] + v[1] = 5 + 10 = 15 + # vdsty_reg = (vdsty << 1) | ((vdstx.offset & 1) ^ 1) = (0 << 1) | ((258 & 1) ^ 1) = 0 | 1 = 1 + VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 100, v[0], v[2], v[1]), + ] + st = run_program(instructions, n_lanes=4) + for lane in range(4): + self.assertEqual(st.vgpr[lane][2], 100, f"Lane {lane}: v[2] should be 100") + self.assertEqual(st.vgpr[lane][1], 15, f"Lane {lane}: v[1] should be 15 (5+10)") + + +if __name__ == '__main__': + unittest.main()