From 0e282025ffadf59c0c5985206800374a80b6a269 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Fri, 2 Jan 2026 11:04:56 -0500
Subject: [PATCH] assembly/amd: split test_emu into hw tests (#13966)

* assembly/amd: split test_emu into hw tests

* hw tests

* bugfixes

* more tests and fix
---
 extra/assembly/amd/dsl.py                 |   15 +-
 extra/assembly/amd/pcode.py               |   16 +-
 extra/assembly/amd/test/hw/__init__.py    |    1 +
 extra/assembly/amd/test/hw/helpers.py     |  200 +
 extra/assembly/amd/test/hw/test_ds.py     |  629 +++
 extra/assembly/amd/test/hw/test_flat.py   |  363 ++
 extra/assembly/amd/test/hw/test_global.py |  364 ++
 extra/assembly/amd/test/hw/test_sop.py    |  205 +
 extra/assembly/amd/test/hw/test_vop1.py   | 1242 +++++
 extra/assembly/amd/test/hw/test_vop2.py   |  451 ++
 extra/assembly/amd/test/hw/test_vop3.py   | 2266 ++++++++
 extra/assembly/amd/test/hw/test_vop3p.py  |  538 ++
 extra/assembly/amd/test/hw/test_vopc.py   |  486 ++
 extra/assembly/amd/test/test_emu.py       | 5768 ---------------------
 14 files changed, 6772 insertions(+), 5772 deletions(-)
 create mode 100644 extra/assembly/amd/test/hw/__init__.py
 create mode 100644 extra/assembly/amd/test/hw/helpers.py
 create mode 100644 extra/assembly/amd/test/hw/test_ds.py
 create mode 100644 extra/assembly/amd/test/hw/test_flat.py
 create mode 100644 extra/assembly/amd/test/hw/test_global.py
 create mode 100644 extra/assembly/amd/test/hw/test_sop.py
 create mode 100644 extra/assembly/amd/test/hw/test_vop1.py
 create mode 100644 extra/assembly/amd/test/hw/test_vop2.py
 create mode 100644 extra/assembly/amd/test/hw/test_vop3.py
 create mode 100644 extra/assembly/amd/test/hw/test_vop3p.py
 create mode 100644 extra/assembly/amd/test/hw/test_vopc.py
 delete mode 100644 extra/assembly/amd/test/test_emu.py

diff --git a/extra/assembly/amd/dsl.py b/extra/assembly/amd/dsl.py
index 541acd1d8e..08b34bee27 100644
--- a/extra/assembly/amd/dsl.py
+++ b/extra/assembly/amd/dsl.py
@@ -13,12 +13,21 @@ MASK32, MASK64, MASK128 = 0xffffffff, 0xffffffffffffffff, (1 << 128) - 1
 _struct_f, _struct_I = struct.Struct("<f"), struct.Struct("<I")
 _struct_e, _struct_H = struct.Struct("<e"), struct.Struct("<H")
 _struct_d, _struct_Q = struct.Struct("<d"), struct.Struct("<Q")
-def _f32(i): return _struct_f.unpack(_struct_I.pack(i & MASK32))[0]
+def _f32(i):
+  """Reinterpret the low 32 bits of int `i` as a little-endian f32, flushing denormals to a signed zero."""
+  i = i & MASK32
+  # RDNA3 default mode is FTZ for f32; a denormal has exponent bits 23-30 == 0 and mantissa bits 0-22 != 0.
+  if (i & 0x7f800000) == 0 and (i & 0x007fffff) != 0: return -0.0 if i & 0x80000000 else 0.0  # keep sign, like _i32
+  return _struct_f.unpack(_struct_I.pack(i))[0]
 def _i32(f):
   if isinstance(f, int): f = float(f)
   if math.isnan(f): return 0xffc00000 if math.copysign(1.0, f) < 0 else 0x7fc00000
   if math.isinf(f): return 0x7f800000 if f > 0 else 0xff800000
-  try: return _struct_I.unpack(_struct_f.pack(f))[0]
+  try:
+    bits = _struct_I.unpack(_struct_f.pack(f))[0]
+    # RDNA3 default mode: flush f32 denormals to zero (FTZ)
+    if (bits & 0x7f800000) == 0 and (bits & 0x007fffff) != 0: return 0x80000000 if bits & 0x80000000 else 0
+    return bits
   except (OverflowError, struct.error): return 0x7f800000 if f > 0 else 0xff800000
 def _sext(v, b): return v - (1 << b) if v & (1 << (b - 1)) else v
 def _f16(i): return _struct_e.unpack(_struct_H.pack(i & 0xffff))[0]
@@ -333,6 +342,8 @@ class Inst:
   def __init__(self, *args, literal: int | None = None, **kwargs):
     self._values, self._literal = dict(self._defaults), None
     field_names = [n for n in self._fields if n != 'encoding']
+    # Map Python-friendly names to actual field names (abs_ -> abs; abs is a Python builtin name, not a reserved word)
+    if 'abs_' in kwargs: kwargs['abs'] = kwargs.pop('abs_')
     orig_args = dict(zip(field_names, args)) | kwargs
     self._values.update(orig_args)
     self._validate(orig_args)
diff --git a/extra/assembly/amd/pcode.py b/extra/assembly/amd/pcode.py
index 3e6b83c0e9..dc15851f52 100644
--- a/extra/assembly/amd/pcode.py
+++ b/extra/assembly/amd/pcode.py
@@ -35,7 +35,15 @@ def _gt_neg_zero(a, b): return (a > b) or (a == 0 and b == 0 and not math.copysi
 def _lt_neg_zero(a, b): return (a < b) or (a == 0 and b == 0 and math.copysign(1, a) < 0 and not math.copysign(1, b) < 0)
 def _fma(a, b, c): return a * b + c
 def _signext(v): return v
-def _fpop(fn): return lambda x: (x := float(x), x if math.isnan(x) or math.isinf(x) else float(fn(x)))[1]
+def _fpop(fn):
+  """Wrap rounding function `fn` so NaN/inf pass through unchanged and a zero result takes the input's sign."""
+  def wrapper(x):
+    x = float(x)
+    if not math.isfinite(x): return x
+    rounded = float(fn(x))
+    # IEEE 754 keeps the sign on a zero result: ceil(-0.0) = -0.0, ceil(-0.1) = -0.0
+    return math.copysign(0.0, x) if rounded == 0.0 else rounded
+  return wrapper
 trunc, floor, ceil = _fpop(math.trunc), _fpop(math.floor), _fpop(math.ceil)
 class _SafeFloat(float):
   """Float subclass that uses _div for division to handle 0/inf correctly."""
@@ -75,7 +83,11 @@ def _trig(fn, x):
   # V_SIN/COS_F32: hardware does frac on input cycles before computing
   if math.isinf(x) or math.isnan(x): return float("nan")
   frac_cycles = fract(x / (2 * math.pi))
-  return fn(frac_cycles * 2 * math.pi)
+  result = fn(frac_cycles * 2 * math.pi)
+  # Hardware returns exactly 0 for cos(π/2), sin(π), etc. due to lookup table
+  # Round very small results (below f32 precision) to exactly 0
+  if abs(result) < 1e-7: return 0.0
+  return result
 def sin(x): return _trig(math.sin, x)
 def cos(x): return _trig(math.cos, x)
 def pow(a, b):
diff --git a/extra/assembly/amd/test/hw/__init__.py b/extra/assembly/amd/test/hw/__init__.py
new file mode 100644
index 0000000000..bd94b7338d
--- /dev/null
+++ b/extra/assembly/amd/test/hw/__init__.py
@@ -0,0 +1 @@
+"""Hardware-validated emulator tests for RDNA3 instructions."""
diff --git a/extra/assembly/amd/test/hw/helpers.py b/extra/assembly/amd/test/hw/helpers.py
new file mode 100644
index 0000000000..221a7932f5
--- /dev/null
+++ b/extra/assembly/amd/test/hw/helpers.py
@@ -0,0 +1,200 @@
+"""Test infrastructure for hardware-validated RDNA3 emulator tests.
+
+Uses run_asm() with memory output, so tests can run on both emulator and real hardware.
+Set USE_HW=1 to run on both emulator and real hardware, comparing results.
+"""
+import ctypes, os, struct
+from extra.assembly.amd.autogen.rdna3.ins import *
+from extra.assembly.amd.dsl import RawImm
+from extra.assembly.amd.emu import WaveState, run_asm, set_valid_mem_ranges
+from extra.assembly.amd.pcode import _i32, _f32
+
+VCC = SrcEnum.VCC_LO  # For VOP3SD sdst field
+USE_HW = os.environ.get("USE_HW", "0") == "1"
+FLOAT_TOLERANCE = 1e-5
+
+# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc
+N_VGPRS, N_SGPRS, WAVE_SIZE = 16, 16, 32
+VGPR_BYTES = N_VGPRS * WAVE_SIZE * 4  # 16 regs * 32 lanes * 4 bytes = 2048
+SGPR_BYTES = N_SGPRS * 4  # 16 regs * 4 bytes = 64
+OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 8  # + vcc + scc
+
+# Float conversion helpers
+def f2i(f: float) -> int: return _i32(f)
+def i2f(i: int) -> float: return _f32(i)
+def f2i64(f: float) -> int: return struct.unpack('<Q', struct.pack('<d', f))[0]
+def i642f(i: int) -> float: return struct.unpack('<d', struct.pack('<Q', i))[0]
+
+def assemble(instructions: list) -> bytes:
+  return b''.join(inst.to_bytes() for inst in instructions)
+
+def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:  # n_lanes is not used in this function
+  """Return (prologue, epilogue): the prologue zeroes v0-v15, s0-s15 and VCC_LO; the epilogue stores them to memory."""
+  prologue = [
+    s_mov_b32(s[80], s[0]),  # keep incoming s0/s1 before they are zeroed (presumably the kernarg pointer - confirm)
+    s_mov_b32(s[81], s[1]),
+    v_mov_b32_e32(v[255], v[0]),  # keep incoming v0; the epilogue uses it as the per-lane index
+  ]
+  for i in range(N_VGPRS):
+    prologue.append(v_mov_b32_e32(v[i], 0))
+  for i in range(N_SGPRS):
+    prologue.append(s_mov_b32(s[i], 0))
+  prologue.append(s_mov_b32(s[SrcEnum.VCC_LO - 128], 0))
+
+  epilogue = [
+    s_mov_b32(s[90], SrcEnum.VCC_LO),  # copy VCC_LO aside; stored below in the vcc slot
+    s_cselect_b32(s[91], 1, 0),  # result is stored below in the slot parse_output reads as scc
+    s_load_b64(s[92:93], s[80], 0, soffset=SrcEnum.NULL),  # load 64 bits via saved s80; s92 is the saddr of every store below
+    s_waitcnt(lgkmcnt=0),
+    v_lshlrev_b32_e32(v[240], 2, v[255]),  # v240 = saved v0 << 2, i.e. a 4-byte stride per lane
+  ]
+  for i in range(N_VGPRS):
+    epilogue.append(global_store_b32(addr=v[240], data=v[i], saddr=s[92], offset=i * WAVE_SIZE * 4))  # row i of the VGPR area
+  epilogue.append(v_mov_b32_e32(v[241], 0))
+  epilogue.append(v_cmp_eq_u32_e32(v[255], v[241]))  # compare saved v0 against 0
+  epilogue.append(s_and_saveexec_b32(s[94], SrcEnum.VCC_LO))  # presumably limits the scalar stores to the lane with saved v0 == 0 - confirm
+  epilogue.append(v_mov_b32_e32(v[240], 0))  # scalar values are stored with a zero per-lane offset
+  for i in range(N_SGPRS):
+    epilogue.append(v_mov_b32_e32(v[243], s[i]))  # stage each SGPR in v243 so it can be stored with global_store_b32
+    epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92], offset=VGPR_BYTES + i * 4))
+  epilogue.append(v_mov_b32_e32(v[243], s[90]))
+  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92], offset=VGPR_BYTES + SGPR_BYTES))  # vcc slot
+  epilogue.append(v_mov_b32_e32(v[243], s[91]))
+  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92], offset=VGPR_BYTES + SGPR_BYTES + 4))  # scc slot
+  epilogue.append(s_mov_b32(s[SrcEnum.EXEC_LO - 128], s[94]))  # copy s94 back into EXEC_LO
+  epilogue.append(s_endpgm())
+  return prologue, epilogue
+
+def parse_output(out_buf: bytes, n_lanes: int) -> WaveState:
+  """Decode the register dump in out_buf (VGPR rows, then SGPRs, VCC, SCC) into a new WaveState."""
+  def dword(off: int) -> int: return struct.unpack_from('<I', out_buf, off)[0]
+  st = WaveState()
+  # VGPR area: register-major, WAVE_SIZE dwords per register; only lanes < n_lanes are decoded
+  for i in range(N_VGPRS):
+    for lane in range(n_lanes): st.vgpr[lane][i] = dword((i * WAVE_SIZE + lane) * 4)
+  # scalar area follows: N_SGPRS dwords, then VCC, then SCC
+  for i in range(N_SGPRS): st.sgpr[i] = dword(VGPR_BYTES + i * 4)
+  tail = VGPR_BYTES + SGPR_BYTES
+  st.vcc, st.scc = dword(tail), dword(tail + 4)
+  return st
+
+def run_program_emu(instructions: list, n_lanes: int = 1) -> WaveState:
+  """Execute prologue + instructions + epilogue through the emulator's run_asm and decode the dumped state."""
+  # ctypes arrays start out zero-filled, so no explicit initializer is needed
+  out_buf = (ctypes.c_uint8 * OUT_BYTES)()
+  out_addr = ctypes.addressof(out_buf)
+
+  pre, post = get_prologue_epilogue(n_lanes)
+  code = assemble(pre + instructions + post)
+  kernel_buf = (ctypes.c_char * len(code)).from_buffer_copy(code)
+
+  # the kernel argument block is a single 64-bit value: the address of out_buf
+  args = (ctypes.c_uint64 * 1)(out_addr)
+  args_ptr = ctypes.addressof(args)
+
+  set_valid_mem_ranges({(out_addr, OUT_BYTES), (args_ptr, 8)})
+  result = run_asm(ctypes.addressof(kernel_buf), len(code), 1, 1, 1, n_lanes, 1, 1, args_ptr)
+  assert result == 0, f"run_asm failed with {result}"
+  return parse_output(bytes(out_buf), n_lanes)
+
+def run_program_hw(instructions: list, n_lanes: int = 1) -> WaveState:
+  """Assemble prologue + instructions + epilogue, compile and launch them as kernel "test" on Device["AMD"], and decode the dump."""
+  from tinygrad.device import Device  # imported here rather than at module top, so only this function needs them
+  from tinygrad.runtime.ops_amd import AMDProgram
+  from tinygrad.runtime.support.compiler_amd import HIPCompiler
+  from tinygrad.helpers import flat_mv
+
+  dev = Device["AMD"]
+  compiler = HIPCompiler(dev.arch)
+
+  prologue, epilogue = get_prologue_epilogue(n_lanes)
+  code = assemble(prologue + instructions + epilogue)
+
+  byte_str = ', '.join(f'0x{b:02x}' for b in code)  # the encoded program as a comma-separated list for the .byte directive
+  asm_src = f""".text
+.globl test
+.p2align 8
+.type test,@function
+test:
+.byte {byte_str}
+
+.rodata
+.p2align 6
+.amdhsa_kernel test
+  .amdhsa_next_free_vgpr 256
+  .amdhsa_next_free_sgpr 96
+  .amdhsa_wavefront_size32 1
+  .amdhsa_user_sgpr_kernarg_segment_ptr 1
+  .amdhsa_kernarg_size 8
+  .amdhsa_group_segment_fixed_size 65536
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version:
+  - 1
+  - 0
+amdhsa.kernels:
+  - .name: test
+    .symbol: test.kd
+    .kernarg_segment_size: 8
+    .group_segment_fixed_size: 65536
+    .private_segment_fixed_size: 0
+    .kernarg_segment_align: 8
+    .wavefront_size: 32
+    .sgpr_count: 96
+    .vgpr_count: 256
+    .max_flat_workgroup_size: 1024
+...
+.end_amdgpu_metadata
+"""
+
+  lib = compiler.compile(asm_src)
+  prg = AMDProgram(dev, "test", lib)
+
+  out_gpu = dev.allocator.alloc(OUT_BYTES)  # NOTE(review): never freed or explicitly zeroed in this function - confirm that is acceptable
+  prg(out_gpu, global_size=(1, 1, 1), local_size=(n_lanes, 1, 1), wait=True)  # one workgroup of n_lanes threads
+
+  out_buf = bytearray(OUT_BYTES)
+  dev.allocator._copyout(flat_mv(memoryview(out_buf)), out_gpu)  # copy the device dump into host memory
+
+  return parse_output(bytes(out_buf), n_lanes)
+
+def compare_wave_states(emu_st: WaveState, hw_st: WaveState, n_lanes: int, n_vgprs: int = N_VGPRS) -> list[str]:
+  """Return one message per VGPR lane, SGPR, VCC or SCC value that differs between the two states."""
+  from math import isnan
+  diffs: list[str] = []
+  # vector registers: the first n_vgprs registers, lanes 0..n_lanes-1
+  for i in range(n_vgprs):
+    for lane in range(n_lanes):
+      emu_val, hw_val = emu_st.vgpr[lane][i], hw_st.vgpr[lane][i]
+      if emu_val == hw_val: continue
+      emu_f, hw_f = _f32(emu_val), _f32(hw_val)
+      # differing bit patterns that both decode to NaN are treated as equal
+      if isnan(emu_f) and isnan(hw_f): continue
+      diffs.append(f"v[{i}] lane {lane}: emu=0x{emu_val:08x} ({emu_f:.6g}) hw=0x{hw_val:08x} ({hw_f:.6g})")
+  # SGPRs, VCC and SCC are compared as raw integers
+  for i in range(N_SGPRS):
+    emu_val, hw_val = emu_st.sgpr[i], hw_st.sgpr[i]
+    if emu_val == hw_val: continue
+    diffs.append(f"s[{i}]: emu=0x{emu_val:08x} hw=0x{hw_val:08x}")
+  if emu_st.vcc != hw_st.vcc:
+    diffs.append(f"vcc: emu=0x{emu_st.vcc:08x} hw=0x{hw_st.vcc:08x}")
+  if emu_st.scc != hw_st.scc:
+    diffs.append(f"scc: emu={emu_st.scc} hw={hw_st.scc}")
+  return diffs
+
+def run_program(instructions: list, n_lanes: int = 1) -> WaveState:
+  """Run instructions on the emulator and return the captured WaveState.
+
+  With USE_HW=1 the program also runs on real hardware; the hardware state is returned instead,
+  and AssertionError is raised if the emulator and hardware states differ.
+  """
+  emu_st = run_program_emu(instructions, n_lanes)
+  if not USE_HW: return emu_st
+  hw_st = run_program_hw(instructions, n_lanes)
+  diffs = compare_wave_states(emu_st, hw_st, n_lanes)
+  # plain string literal: the header has no placeholders, so no f-prefix is needed
+  if diffs:
+    raise AssertionError("Emulator vs Hardware mismatch:\n" + "\n".join(diffs))
+  return hw_st
diff --git a/extra/assembly/amd/test/hw/test_ds.py b/extra/assembly/amd/test/hw/test_ds.py
new file mode 100644
index 0000000000..b58d22e768
--- /dev/null
+++ b/extra/assembly/amd/test/hw/test_ds.py
@@ -0,0 +1,629 @@
+"""Tests for DS instructions - data share (LDS) operations.
+
+Includes: ds_store_b32, ds_load_b32, ds_store_2addr_*, ds_load_2addr_*,
+          ds_add_*, ds_max_*, ds_min_*, ds_and_*, ds_or_*, ds_xor_*,
+          ds_inc_*, ds_dec_*, ds_cmpstore_*, ds_storexchg_*
+"""
+import unittest
+from extra.assembly.amd.test.hw.helpers import *
+
+class TestDS2Addr(unittest.TestCase):
+  """Tests for DS_*_2ADDR instructions (a store followed by a load of the same two slots)."""
+
+  def test_ds_store_load_2addr_b32(self):
+    """DS_STORE_2ADDR_B32 and DS_LOAD_2ADDR_B32 with offset * 4."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),  # v10 = 0, used as the DS address
+      s_mov_b32(s[0], 0xAAAAAAAA),
+      v_mov_b32_e32(v[0], s[0]),  # v0 = first dword to store
+      s_mov_b32(s[0], 0xBBBBBBBB),
+      v_mov_b32_e32(v[1], s[0]),  # v1 = second dword to store
+      DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=0, offset1=1),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[2], offset0=0, offset1=1),  # read both slots back into v2, v3
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA)  # st.vgpr is indexed [lane][register]
+    self.assertEqual(st.vgpr[0][3], 0xBBBBBBBB)
+
+  def test_ds_store_load_2addr_b64(self):
+    """DS_STORE_2ADDR_B64 and DS_LOAD_2ADDR_B64."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),  # v10 = 0, used as the DS address
+      s_mov_b32(s[0], 0xDEADBEEF),
+      v_mov_b32_e32(v[0], s[0]),  # v0, v1 = first 64-bit value (low dword, high dword)
+      s_mov_b32(s[0], 0xCAFEBABE),
+      v_mov_b32_e32(v[1], s[0]),
+      s_mov_b32(s[0], 0x12345678),
+      v_mov_b32_e32(v[2], s[0]),  # v2, v3 = second 64-bit value
+      s_mov_b32(s[0], 0x9ABCDEF0),
+      v_mov_b32_e32(v[3], s[0]),
+      DS(DSOp.DS_STORE_2ADDR_B64, addr=v[10], data0=v[0], data1=v[2], vdst=v[0], offset0=0, offset1=2),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_2ADDR_B64, addr=v[10], vdst=v[4], offset0=0, offset1=2),  # expected to fill v4..v7
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0xDEADBEEF)
+    self.assertEqual(st.vgpr[0][5], 0xCAFEBABE)
+    self.assertEqual(st.vgpr[0][6], 0x12345678)
+    self.assertEqual(st.vgpr[0][7], 0x9ABCDEF0)
+
+
+class TestDS2AddrMore(unittest.TestCase):
+  """Additional DS_*_2ADDR tests."""
+
+  def test_ds_store_load_2addr_b32_nonzero_offsets(self):
+    """DS_STORE_2ADDR_B32 with non-zero offsets (offset*4 scaling)."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 0x11111111),
+      v_mov_b32_e32(v[0], s[2]),
+      s_mov_b32(s[2], 0x22222222),
+      v_mov_b32_e32(v[1], s[2]),
+      DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=2, offset1=5),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[2], offset0=2, offset1=5),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0x11111111, "v2 should have value from offset 8 (2*4)")
+    self.assertEqual(st.vgpr[0][3], 0x22222222, "v3 should have value from offset 20 (5*4)")
+
+  def test_ds_2addr_b64_no_overlap(self):
+    """DS_LOAD_2ADDR_B64 with adjacent offsets should not overlap."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 0x11111111),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_mov_b32(s[2], 0x22222222),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=4),
+      s_mov_b32(s[2], 0x33333333),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=8),
+      s_mov_b32(s[2], 0x44444444),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=12),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_2ADDR_B64, addr=v[10], vdst=v[4], offset0=0, offset1=1),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should be 0x11111111")
+    self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should be 0x22222222")
+    self.assertEqual(st.vgpr[0][6], 0x33333333, "v6 should be 0x33333333")
+    self.assertEqual(st.vgpr[0][7], 0x44444444, "v7 should be 0x44444444")
+
+  def test_ds_load_2addr_b32_no_overwrite(self):
+    """DS_LOAD_2ADDR_B32 should only write 2 VGPRs."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 0xAAAAAAAA),
+      v_mov_b32_e32(v[0], s[2]),
+      s_mov_b32(s[2], 0xBBBBBBBB),
+      v_mov_b32_e32(v[1], s[2]),
+      DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=0, offset1=1),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 0xDEADBEEF),
+      v_mov_b32_e32(v[4], s[2]),  # Sentinel
+      DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[2], offset0=0, offset1=1),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA)
+    self.assertEqual(st.vgpr[0][3], 0xBBBBBBBB)
+    self.assertEqual(st.vgpr[0][4], 0xDEADBEEF, "v4 should be untouched")
+
+  def test_ds_load_b64_no_overwrite(self):
+    """DS_LOAD_B64 should only write 2 VGPRs."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 0xDEADBEEF),
+      v_mov_b32_e32(v[0], s[2]),
+      s_mov_b32(s[2], 0xCAFEBABE),
+      v_mov_b32_e32(v[1], s[2]),
+      ds_store_b64(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 0x12345678),
+      v_mov_b32_e32(v[4], s[2]),  # Sentinel
+      ds_load_b64(addr=v[10], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0xDEADBEEF)
+    self.assertEqual(st.vgpr[0][3], 0xCAFEBABE)
+    self.assertEqual(st.vgpr[0][4], 0x12345678, "v4 should be untouched")
+
+
+class TestDSAtomic(unittest.TestCase):
+  """Tests for DS atomic operations."""
+
+  def test_ds_max_rtn_u32(self):
+    """DS_MAX_RTN_U32: atomically store max and return old value."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 100),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 200),
+      v_mov_b32_e32(v[1], s[2]),
+      ds_max_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 100, "v2 should have old value (100)")
+    self.assertEqual(st.vgpr[0][3], 200, "v3 should have max(100, 200) = 200")
+
+  def test_ds_min_rtn_u32(self):
+    """DS_MIN_RTN_U32: atomically store min and return old value."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 200),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 100),
+      v_mov_b32_e32(v[1], s[2]),
+      ds_min_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 200)
+    self.assertEqual(st.vgpr[0][3], 100)
+
+  def test_ds_and_rtn_b32(self):
+    """DS_AND_RTN_B32: atomically AND and return old value."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 0xFF00FF00),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 0xFFFF0000),
+      v_mov_b32_e32(v[1], s[2]),
+      ds_and_rtn_b32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0xFF00FF00)
+    self.assertEqual(st.vgpr[0][3], 0xFF000000)
+
+  def test_ds_or_rtn_b32(self):
+    """DS_OR_RTN_B32: atomically OR and return old value."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 0x00FF0000),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 0x000000FF),
+      v_mov_b32_e32(v[1], s[2]),
+      ds_or_rtn_b32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0x00FF0000)
+    self.assertEqual(st.vgpr[0][3], 0x00FF00FF)
+
+  def test_ds_xor_rtn_b32(self):
+    """DS_XOR_RTN_B32: atomically XOR and return old value."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 0xAAAAAAAA),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 0xFFFFFFFF),
+      v_mov_b32_e32(v[1], s[2]),
+      ds_xor_rtn_b32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA)
+    self.assertEqual(st.vgpr[0][3], 0x55555555)
+
+  def test_ds_inc_rtn_u32(self):
+    """DS_INC_RTN_U32: increment with wrap."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 5),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 10),  # limit
+      v_mov_b32_e32(v[1], s[2]),
+      ds_inc_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 5)
+    self.assertEqual(st.vgpr[0][3], 6)
+
+  def test_ds_dec_rtn_u32(self):
+    """DS_DEC_RTN_U32: decrement with wrap."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 5),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 10),  # limit
+      v_mov_b32_e32(v[1], s[2]),
+      ds_dec_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 5)
+    self.assertEqual(st.vgpr[0][3], 4)
+
+  def test_ds_cmpstore_b32_match(self):
+    """DS_CMPSTORE_B32: conditional store when compare matches."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 100),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 200),
+      v_mov_b32_e32(v[1], s[2]),  # new value
+      s_mov_b32(s[2], 100),
+      v_mov_b32_e32(v[2], s[2]),  # compare = 100 (matches)
+      ds_cmpstore_b32(addr=v[10], data0=v[1], data1=v[2], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[4], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 200)
+
+  def test_ds_cmpstore_b32_no_match(self):
+    """DS_CMPSTORE_B32: no store when compare doesn't match."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 100),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 200),
+      v_mov_b32_e32(v[1], s[2]),  # new value
+      s_mov_b32(s[2], 50),
+      v_mov_b32_e32(v[2], s[2]),  # compare = 50 (doesn't match)
+      ds_cmpstore_b32(addr=v[10], data0=v[1], data1=v[2], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[4], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 100)
+
+  def test_ds_max_u32_no_rtn(self):
+    """DS_MAX_U32 (no RTN): atomically store max, no return value."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 100),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 200),
+      v_mov_b32_e32(v[1], s[2]),
+      ds_max_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][3], 200, "v3 should have max(100, 200) = 200")
+
+  def test_ds_add_u32_no_rtn_preserves_vdst(self):
+    """DS_ADD_U32 (no RTN) should NOT write to vdst."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 0xDEADBEEF),
+      v_mov_b32_e32(v[2], s[2]),  # sentinel
+      s_mov_b32(s[2], 100),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 50),
+      v_mov_b32_e32(v[1], s[2]),
+      ds_add_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0xDEADBEEF, "v2 should preserve sentinel")
+    self.assertEqual(st.vgpr[0][3], 150, "v3 should have 100 + 50 = 150")
+
+  def test_ds_add_rtn_u32_writes_vdst(self):
+    """DS_ADD_RTN_U32 should write old value to vdst."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 0xDEADBEEF),
+      v_mov_b32_e32(v[2], s[2]),  # sentinel
+      s_mov_b32(s[2], 100),
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 50),
+      v_mov_b32_e32(v[1], s[2]),
+      ds_add_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 100, "v2 should have old value (100)")
+    self.assertEqual(st.vgpr[0][3], 150, "v3 should have 100 + 50 = 150")
+
+  def test_ds_dec_rtn_u32_wrap(self):
+    """DS_DEC_RTN_U32: decrement wraps when value is 0 or > limit."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[2], 0),  # Start at 0
+      v_mov_b32_e32(v[0], s[2]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[2], 10),  # limit
+      v_mov_b32_e32(v[1], s[2]),
+      ds_dec_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0, "v2 should have old value (0)")
+    # When mem == 0 or mem > limit, result = limit
+    self.assertEqual(st.vgpr[0][3], 10, "v3 should wrap to limit (10)")
+
+
+class TestDSStorexchg(unittest.TestCase):
+  """Tests for DS_STOREXCHG instructions."""
+
+  def test_ds_storexchg_rtn_b32(self):
+    """DS_STOREXCHG_RTN_B32: exchange value and return old."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),  # v10 = 0, used as the DS address
+      s_mov_b32(s[0], 0xAAAAAAAA),
+      v_mov_b32_e32(v[0], s[0]),
+      ds_store_b32(addr=v[10], data0=v[0], offset0=0),  # seed the slot with 0xAAAAAAAA
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[0], 0xBBBBBBBB),
+      v_mov_b32_e32(v[1], s[0]),  # v1 = replacement value
+      DS(DSOp.DS_STOREXCHG_RTN_B32, addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),  # re-read the slot to observe the new contents
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA)  # v2: value that was in the slot before the exchange
+    self.assertEqual(st.vgpr[0][3], 0xBBBBBBBB)  # v3: value in the slot after the exchange
+
+
+class TestDSRegisterWidth(unittest.TestCase):
+  """Regression tests: DS loads should only write the correct number of VGPRs."""
+
+  def test_ds_load_b32_no_overwrite(self):
+    """DS_LOAD_B32 should only write 1 VGPR."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),  # v0 = 0, used as the DS address
+      s_mov_b32(s[0], 0xDEADBEEF),
+      v_mov_b32_e32(v[1], s[0]),  # v1 = value to store
+      s_mov_b32(s[0], 0x11111111),
+      v_mov_b32_e32(v[2], s[0]),  # sentinel in the register right after the load destination
+      ds_store_b32(addr=v[0], data0=v[1], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b32(addr=v[0], vdst=v[1], offset0=0),  # load back into v1; v2 must keep the sentinel
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0xDEADBEEF)
+    self.assertEqual(st.vgpr[0][2], 0x11111111, "v2 should be untouched")
+
+
+class TestDS2AddrStride64(unittest.TestCase):
+  """Tests for DS_*_2ADDR_STRIDE64 (offset * 256 for B32, offset * 512 for B64), plus non-stride DS_STOREXCHG cases."""
+
+  def test_ds_store_load_2addr_stride64_b32(self):
+    """DS_STORE/LOAD_2ADDR_STRIDE64_B32: each access lands at ADDR + offset*256."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[0], 0xAAAAAAAA),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[0], 0xBBBBBBBB),
+      v_mov_b32_e32(v[1], s[0]),
+      DS(DSOp.DS_STORE_2ADDR_STRIDE64_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=1, offset1=2),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B32, addr=v[10], vdst=v[2], offset0=1, offset1=2),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA, "v2 from addr 256")
+    self.assertEqual(st.vgpr[0][3], 0xBBBBBBBB, "v3 from addr 512")
+
+  def test_ds_store_load_2addr_stride64_b64(self):
+    """DS_STORE/LOAD_2ADDR_STRIDE64_B64: each 64-bit access lands at ADDR + offset*512."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[0], 0xDEADBEEF),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[0], 0xCAFEBABE),
+      v_mov_b32_e32(v[1], s[0]),
+      s_mov_b32(s[0], 0x12345678),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0x9ABCDEF0),
+      v_mov_b32_e32(v[3], s[0]),
+      DS(DSOp.DS_STORE_2ADDR_STRIDE64_B64, addr=v[10], data0=v[0], data1=v[2], vdst=v[0], offset0=1, offset1=2),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B64, addr=v[10], vdst=v[4], offset0=1, offset1=2),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0xDEADBEEF)
+    self.assertEqual(st.vgpr[0][5], 0xCAFEBABE)
+    self.assertEqual(st.vgpr[0][6], 0x12345678)
+    self.assertEqual(st.vgpr[0][7], 0x9ABCDEF0)
+
+  def test_ds_storexchg_2addr_rtn_b32(self):
+    """DS_STOREXCHG_2ADDR_RTN_B32 (non-stride64): exchange at two addresses, returning both old values."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[0], 0x11111111),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[0], 0x22222222),
+      v_mov_b32_e32(v[1], s[0]),
+      DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=0, offset1=1),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[0], 0xAAAAAAAA),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0xBBBBBBBB),
+      v_mov_b32_e32(v[3], s[0]),
+      DS(DSOp.DS_STOREXCHG_2ADDR_RTN_B32, addr=v[10], data0=v[2], data1=v[3], vdst=v[4], offset0=0, offset1=1),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[6], offset0=0, offset1=1),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0x11111111, "old val 0")
+    self.assertEqual(st.vgpr[0][5], 0x22222222, "old val 1")
+    self.assertEqual(st.vgpr[0][6], 0xAAAAAAAA, "new val 0")
+    self.assertEqual(st.vgpr[0][7], 0xBBBBBBBB, "new val 1")
+
+
+  def test_ds_storexchg_rtn_b64(self):
+    """DS_STOREXCHG_RTN_B64 (single address): exchange a 64-bit value and return the old one."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[0], 0xDEADBEEF),
+      v_mov_b32_e32(v[0], s[0]),   # initial low
+      s_mov_b32(s[0], 0xCAFEBABE),
+      v_mov_b32_e32(v[1], s[0]),   # initial high
+      DS(DSOp.DS_STORE_B64, addr=v[10], data0=v[0], vdst=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[0], 0x12345678),
+      v_mov_b32_e32(v[2], s[0]),   # new low
+      s_mov_b32(s[0], 0x9ABCDEF0),
+      v_mov_b32_e32(v[3], s[0]),   # new high
+      DS(DSOp.DS_STOREXCHG_RTN_B64, addr=v[10], data0=v[2], vdst=v[4], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_B64, addr=v[10], vdst=v[6], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0xDEADBEEF, "v4 should have old low dword")
+    self.assertEqual(st.vgpr[0][5], 0xCAFEBABE, "v5 should have old high dword")
+    self.assertEqual(st.vgpr[0][6], 0x12345678, "v6 should have new low dword")
+    self.assertEqual(st.vgpr[0][7], 0x9ABCDEF0, "v7 should have new high dword")
+
+  def test_ds_store_load_2addr_stride64_b64_roundtrip(self):
+    """DS_STORE_2ADDR_STRIDE64_B64 then DS_LOAD_2ADDR_STRIDE64_B64; data0 and data1 are the same pair (v0:v1)."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[0], 0x11111111),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[0], 0x22222222),
+      v_mov_b32_e32(v[1], s[0]),
+      DS(DSOp.DS_STORE_2ADDR_STRIDE64_B64, addr=v[10], data0=v[0], data1=v[0], vdst=v[0], offset0=1, offset1=2),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B64, addr=v[10], vdst=v[2], offset0=1, offset1=2),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0x11111111, "v2 should have val1 low")
+    self.assertEqual(st.vgpr[0][3], 0x22222222, "v3 should have val1 high")
+    self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have val2 low")
+    self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should have val2 high")
+
+  def test_ds_storexchg_2addr_stride64_rtn_b32(self):
+    """DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32: exchange at two addresses (offset*256), returning both old values."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[0], 0x11111111),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[0], 0x22222222),
+      v_mov_b32_e32(v[1], s[0]),
+      DS(DSOp.DS_STORE_2ADDR_STRIDE64_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=1, offset1=2),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[0], 0xAAAAAAAA),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0xBBBBBBBB),
+      v_mov_b32_e32(v[3], s[0]),
+      DS(DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32, addr=v[10], data0=v[2], data1=v[3], vdst=v[4], offset0=1, offset1=2),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B32, addr=v[10], vdst=v[6], offset0=1, offset1=2),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have old value")
+    self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should have old value")
+    self.assertEqual(st.vgpr[0][6], 0xAAAAAAAA, "v6 should have new value")
+    self.assertEqual(st.vgpr[0][7], 0xBBBBBBBB, "v7 should have new value")
+
+  def test_ds_storexchg_2addr_stride64_rtn_b64_returns_old(self):
+    """DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64: old values come back in v8..v11 (a range that includes the addr register v10)."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[0], 0x11111111),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[0], 0x22222222),
+      v_mov_b32_e32(v[1], s[0]),
+      DS(DSOp.DS_STORE_2ADDR_STRIDE64_B64, addr=v[10], data0=v[0], data1=v[0], vdst=v[0], offset0=1, offset1=2),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[0], 0xAAAAAAAA),
+      v_mov_b32_e32(v[6], s[0]),
+      s_mov_b32(s[0], 0xBBBBBBBB),
+      v_mov_b32_e32(v[7], s[0]),
+      DS(DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64, addr=v[10], data0=v[6], data1=v[6], vdst=v[8], offset0=1, offset1=2),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][8], 0x11111111, "v8 should have old val1 low")
+    self.assertEqual(st.vgpr[0][9], 0x22222222, "v9 should have old val1 high")
+    self.assertEqual(st.vgpr[0][10], 0x11111111, "v10 should have old val2 low")
+    self.assertEqual(st.vgpr[0][11], 0x22222222, "v11 should have old val2 high")
+
+
+class TestAtomicOrdering(unittest.TestCase):
+  """Tests for atomic operation return values and ordering."""
+
+  def test_ds_add_rtn_sequence(self):
+    """Two back-to-back DS_ADD_RTN_U32 ops each return the value present before their own add."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      v_mov_b32_e32(v[0], 100),
+      DS(DSOp.DS_STORE_B32, addr=v[10], data0=v[0], vdst=v[0], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      v_mov_b32_e32(v[1], 25),  # addend shared by both atomic adds
+      DS(DSOp.DS_ADD_RTN_U32, addr=v[10], data0=v[1], vdst=v[2], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_ADD_RTN_U32, addr=v[10], data0=v[1], vdst=v[3], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_B32, addr=v[10], vdst=v[4], offset0=0),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 100, "First add should return 100")
+    self.assertEqual(st.vgpr[0][3], 125, "Second add should return 125")
+    self.assertEqual(st.vgpr[0][4], 150, "Final value should be 150")
+
+
+if __name__ == '__main__':  # allow running this test module directly
+  unittest.main()
diff --git a/extra/assembly/amd/test/hw/test_flat.py b/extra/assembly/amd/test/hw/test_flat.py
new file mode 100644
index 0000000000..f962a134bf
--- /dev/null
+++ b/extra/assembly/amd/test/hw/test_flat.py
@@ -0,0 +1,363 @@
+"""Tests for FLAT instructions - flat memory operations.
+
+Includes: flat_load_*, flat_store_*, flat_atomic_*
+"""
+import unittest
+from extra.assembly.amd.test.hw.helpers import *
+
+class TestFlatAtomic(unittest.TestCase):
+  """Tests for FLAT atomic instructions; each check asserts only the returned pre-op value in vdst."""
+
+  def _make_test(self, setup_instrs, atomic_instr, check_fn, test_offset=2000):
+    """Run [load 64 bits via s[80] into v0:v1] + setup_instrs + atomic_instr, then call check_fn(state); test_offset is unused."""
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
+      s_waitcnt(lgkmcnt=0),
+      v_mov_b32_e32(v[0], s[2]),
+      v_mov_b32_e32(v[1], s[3]),
+    ] + setup_instrs + [atomic_instr, s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], 0),  # v0/v1/s2/s3 are zeroed last; presumably to keep address values out of the final state - TODO confirm
+      v_mov_b32_e32(v[1], 0),
+      s_mov_b32(s[2], 0),
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    check_fn(st)
+
+  def test_flat_atomic_add_u32(self):
+    """FLAT_ATOMIC_ADD_U32 adds to memory and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 100),
+      v_mov_b32_e32(v[2], s[0]),
+      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 50),
+      v_mov_b32_e32(v[3], s[0]),
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_ADD_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][4], 100)
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_flat_atomic_swap_b32(self):
+    """FLAT_ATOMIC_SWAP_B32 swaps memory value and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 0xAAAAAAAA),
+      v_mov_b32_e32(v[2], s[0]),
+      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 0xBBBBBBBB),
+      v_mov_b32_e32(v[3], s[0]),
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_SWAP_B32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA)
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_flat_atomic_and_b32(self):
+    """FLAT_ATOMIC_AND_B32 ANDs with memory and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 0xFF00FF00),
+      v_mov_b32_e32(v[2], s[0]),
+      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 0xFFFF0000),
+      v_mov_b32_e32(v[3], s[0]),
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_AND_B32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][4], 0xFF00FF00)
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_flat_atomic_or_b32(self):
+    """FLAT_ATOMIC_OR_B32 ORs with memory and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 0x00FF0000),
+      v_mov_b32_e32(v[2], s[0]),
+      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 0x0000FF00),
+      v_mov_b32_e32(v[3], s[0]),
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_OR_B32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][4], 0x00FF0000)
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_flat_atomic_inc_u32(self):
+    """FLAT_ATOMIC_INC_U32 increments and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 10),
+      v_mov_b32_e32(v[2], s[0]),
+      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 100),  # threshold
+      v_mov_b32_e32(v[3], s[0]),
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_INC_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][4], 10)
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_flat_atomic_dec_u32(self):
+    """FLAT_ATOMIC_DEC_U32 decrements and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 10),
+      v_mov_b32_e32(v[2], s[0]),
+      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 100),
+      v_mov_b32_e32(v[3], s[0]),
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_DEC_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][4], 10)
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_flat_atomic_sub_u32(self):
+    """FLAT_ATOMIC_SUB_U32 subtracts from memory and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 100),
+      v_mov_b32_e32(v[2], s[0]),
+      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 30),
+      v_mov_b32_e32(v[3], s[0]),  # sub 30
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_SUB_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][4], 100, "v4 should have old value (100)")
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_flat_atomic_xor_b32(self):
+    """FLAT_ATOMIC_XOR_B32 XORs with memory and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 0xAAAAAAAA),
+      v_mov_b32_e32(v[2], s[0]),
+      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 0xFFFFFFFF),
+      v_mov_b32_e32(v[3], s[0]),  # XOR mask
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_XOR_B32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA, "v4 should have old value")
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_flat_atomic_min_u32(self):
+    """FLAT_ATOMIC_MIN_U32 stores min and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 100),
+      v_mov_b32_e32(v[2], s[0]),
+      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 50),
+      v_mov_b32_e32(v[3], s[0]),  # compare value (smaller)
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_MIN_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][4], 100, "v4 should have old value (100)")
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_flat_atomic_max_u32(self):
+    """FLAT_ATOMIC_MAX_U32 stores max and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 50),
+      v_mov_b32_e32(v[2], s[0]),
+      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 100),
+      v_mov_b32_e32(v[3], s[0]),  # compare value (larger)
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_MAX_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][4], 50, "v4 should have old value (50)")
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_flat_atomic_inc_u64_returns_old_value(self):
+    """FLAT_ATOMIC_INC_U64 should return full 64-bit old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      # Store initial 64-bit value: 0xCAFEBABE_DEADBEEF
+      s_mov_b32(s[0], 0xDEADBEEF),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0xCAFEBABE),
+      v_mov_b32_e32(v[3], s[0]),
+      global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      # Threshold: 0xFFFFFFFF_FFFFFFFF
+      s_mov_b32(s[0], 0xFFFFFFFF),
+      v_mov_b32_e32(v[4], s[0]),
+      v_mov_b32_e32(v[5], s[0]),
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_INC_U64, addr=v[0], data=v[4], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][6], 0xDEADBEEF, "v6 should have old value low dword")
+      self.assertEqual(st.vgpr[0][7], 0xCAFEBABE, "v7 should have old value high dword")
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_flat_atomic_add_u64(self):
+    """FLAT_ATOMIC_ADD_U64 adds 64-bit value and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 0x11111111),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0x22222222),
+      v_mov_b32_e32(v[3], s[0]),
+      global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 0x00000001),  # add 1
+      v_mov_b32_e32(v[4], s[0]),
+      s_mov_b32(s[0], 0x00000000),
+      v_mov_b32_e32(v[5], s[0]),
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_ADD_U64, addr=v[0], data=v[4], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][6], 0x11111111, "v6 should have old value low")
+      self.assertEqual(st.vgpr[0][7], 0x22222222, "v7 should have old value high")
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_flat_atomic_swap_b64(self):
+    """FLAT_ATOMIC_SWAP_B64 swaps 64-bit value and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 0xAAAAAAAA),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0xBBBBBBBB),
+      v_mov_b32_e32(v[3], s[0]),
+      global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 0xCCCCCCCC),
+      v_mov_b32_e32(v[4], s[0]),
+      s_mov_b32(s[0], 0xDDDDDDDD),
+      v_mov_b32_e32(v[5], s[0]),
+    ]
+    atomic = FLAT(FLATOp.FLAT_ATOMIC_SWAP_B64, addr=v[0], data=v[4], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
+    def check(st):
+      self.assertEqual(st.vgpr[0][6], 0xAAAAAAAA, "v6 should have old value low")
+      self.assertEqual(st.vgpr[0][7], 0xBBBBBBBB, "v7 should have old value high")
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+
+class TestFlatLoad(unittest.TestCase):
+  """Tests for FLAT load instructions: data is written with global_store_* and read back with FLAT_LOAD_*."""
+
+  def test_flat_load_b32(self):
+    """FLAT_LOAD_B32 loads 32-bit value correctly."""
+    TEST_OFFSET = 2000
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
+      s_waitcnt(lgkmcnt=0),
+      v_mov_b32_e32(v[0], s[2]),
+      v_mov_b32_e32(v[1], s[3]),
+      s_mov_b32(s[0], 0xDEADBEEF),
+      v_mov_b32_e32(v[2], s[0]),
+      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      FLAT(FLATOp.FLAT_LOAD_B32, addr=v[0], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], 0),
+      v_mov_b32_e32(v[1], 0),
+      s_mov_b32(s[2], 0),
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0xDEADBEEF)
+
+  def test_flat_load_b64(self):
+    """FLAT_LOAD_B64 loads 64-bit value correctly."""
+    TEST_OFFSET = 2000
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
+      s_waitcnt(lgkmcnt=0),
+      v_mov_b32_e32(v[0], s[2]),
+      v_mov_b32_e32(v[1], s[3]),
+      s_mov_b32(s[0], 0xDEADBEEF),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0xCAFEBABE),
+      v_mov_b32_e32(v[3], s[0]),
+      global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      FLAT(FLATOp.FLAT_LOAD_B64, addr=v[0], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], 0),
+      v_mov_b32_e32(v[1], 0),
+      s_mov_b32(s[2], 0),
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0xDEADBEEF)
+    self.assertEqual(st.vgpr[0][5], 0xCAFEBABE)
+
+  def test_flat_load_b96(self):
+    """FLAT_LOAD_B96 loads 96-bit (3 dword) value correctly."""
+    TEST_OFFSET = 2000
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
+      s_waitcnt(lgkmcnt=0),
+      v_mov_b32_e32(v[0], s[2]),
+      v_mov_b32_e32(v[1], s[3]),
+      s_mov_b32(s[0], 0x11111111),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0x22222222),
+      v_mov_b32_e32(v[3], s[0]),
+      s_mov_b32(s[0], 0x33333333),
+      v_mov_b32_e32(v[4], s[0]),
+      global_store_b96(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      FLAT(FLATOp.FLAT_LOAD_B96, addr=v[0], vdst=v[5], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], 0),
+      v_mov_b32_e32(v[1], 0),
+      s_mov_b32(s[2], 0),
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][5], 0x11111111)
+    self.assertEqual(st.vgpr[0][6], 0x22222222)
+    self.assertEqual(st.vgpr[0][7], 0x33333333)
+
+  def test_flat_load_b128(self):
+    """FLAT_LOAD_B128 loads 128-bit (4 dword) value correctly."""
+    TEST_OFFSET = 2000
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
+      s_waitcnt(lgkmcnt=0),
+      v_mov_b32_e32(v[0], s[2]),
+      v_mov_b32_e32(v[1], s[3]),
+      s_mov_b32(s[0], 0x11111111),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0x22222222),
+      v_mov_b32_e32(v[3], s[0]),
+      s_mov_b32(s[0], 0x33333333),
+      v_mov_b32_e32(v[4], s[0]),
+      s_mov_b32(s[0], 0x44444444),
+      v_mov_b32_e32(v[5], s[0]),
+      global_store_b128(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      FLAT(FLATOp.FLAT_LOAD_B128, addr=v[0], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], 0),
+      v_mov_b32_e32(v[1], 0),
+      s_mov_b32(s[2], 0),
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][6], 0x11111111)
+    self.assertEqual(st.vgpr[0][7], 0x22222222)
+    self.assertEqual(st.vgpr[0][8], 0x33333333)
+    self.assertEqual(st.vgpr[0][9], 0x44444444)
+
+
+if __name__ == '__main__':  # allow running this test module directly
+  unittest.main()
diff --git a/extra/assembly/amd/test/hw/test_global.py b/extra/assembly/amd/test/hw/test_global.py
new file mode 100644
index 0000000000..8589eae4a2
--- /dev/null
+++ b/extra/assembly/amd/test/hw/test_global.py
@@ -0,0 +1,364 @@
+"""Tests for GLOBAL instructions - global memory operations.
+
+Includes: global_load_*, global_store_*, global_atomic_*, global_load_d16_*
+"""
+import unittest
+from extra.assembly.amd.test.hw.helpers import *
+
+class TestGlobalAtomic(unittest.TestCase):
+  """Tests for GLOBAL atomic instructions; each check asserts only the returned pre-op value in vdst."""
+
+  def _make_test(self, setup_instrs, atomic_instr, check_fn, test_offset=2000):
+    """Run [load 64 bits via s[80] into v0:v1] + setup_instrs + atomic_instr, then call check_fn(state); test_offset is unused."""
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
+      s_waitcnt(lgkmcnt=0),
+      v_mov_b32_e32(v[0], s[2]),
+      v_mov_b32_e32(v[1], s[3]),
+    ] + setup_instrs + [atomic_instr, s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], 0),  # v0/v1/s2/s3 are zeroed last; presumably to keep address values out of the final state - TODO confirm
+      v_mov_b32_e32(v[1], 0),
+      s_mov_b32(s[2], 0),
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    check_fn(st)
+
+  def test_global_atomic_add_u32(self):
+    """GLOBAL_ATOMIC_ADD_U32 adds to memory and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 100),
+      v_mov_b32_e32(v[2], s[0]),
+      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 50),
+      v_mov_b32_e32(v[3], s[0]),
+    ]
+    atomic = FLAT(GLOBALOp.GLOBAL_ATOMIC_ADD_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1, seg=2)
+    def check(st):
+      self.assertEqual(st.vgpr[0][4], 100)
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+  def test_global_atomic_add_u64(self):
+    """GLOBAL_ATOMIC_ADD_U64 adds 64-bit value and returns old value."""
+    TEST_OFFSET = 2000
+    setup = [
+      s_mov_b32(s[0], 0xFFFFFFFF),  # low dword all ones: adding 1 crosses into the high dword (sum itself is not asserted)
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0x00000000),
+      v_mov_b32_e32(v[3], s[0]),
+      global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[0], 0x00000001),
+      v_mov_b32_e32(v[4], s[0]),
+      s_mov_b32(s[0], 0x00000000),
+      v_mov_b32_e32(v[5], s[0]),
+    ]
+    atomic = FLAT(GLOBALOp.GLOBAL_ATOMIC_ADD_U64, addr=v[0], data=v[4], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1, seg=2)
+    def check(st):
+      self.assertEqual(st.vgpr[0][6], 0xFFFFFFFF)
+      self.assertEqual(st.vgpr[0][7], 0x00000000)
+    self._make_test(setup, atomic, check, TEST_OFFSET)
+
+
+class TestGlobalLoad(unittest.TestCase):
+  """Tests for GLOBAL load instructions: data is written with global_store_* and read back with GLOBAL_LOAD_*."""
+
+  def test_global_load_b96(self):
+    """GLOBAL_LOAD_B96 loads 96-bit (3 dword) value correctly."""
+    TEST_OFFSET = 2000
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
+      s_waitcnt(lgkmcnt=0),
+      v_mov_b32_e32(v[0], s[2]),
+      v_mov_b32_e32(v[1], s[3]),
+      s_mov_b32(s[0], 0xAAAAAAAA),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0xBBBBBBBB),
+      v_mov_b32_e32(v[3], s[0]),
+      s_mov_b32(s[0], 0xCCCCCCCC),
+      v_mov_b32_e32(v[4], s[0]),
+      global_store_b96(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      FLAT(GLOBALOp.GLOBAL_LOAD_B96, addr=v[0], vdst=v[5], saddr=SrcEnum.NULL, offset=TEST_OFFSET, seg=2),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], 0),
+      v_mov_b32_e32(v[1], 0),
+      s_mov_b32(s[2], 0),
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][5], 0xAAAAAAAA)
+    self.assertEqual(st.vgpr[0][6], 0xBBBBBBBB)
+    self.assertEqual(st.vgpr[0][7], 0xCCCCCCCC)
+
+  def test_global_load_b128(self):
+    """GLOBAL_LOAD_B128 loads 128-bit (4 dword) value correctly."""
+    TEST_OFFSET = 2000
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
+      s_waitcnt(lgkmcnt=0),
+      v_mov_b32_e32(v[0], s[2]),
+      v_mov_b32_e32(v[1], s[3]),
+      s_mov_b32(s[0], 0xDEADBEEF),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0xCAFEBABE),
+      v_mov_b32_e32(v[3], s[0]),
+      s_mov_b32(s[0], 0x12345678),
+      v_mov_b32_e32(v[4], s[0]),
+      s_mov_b32(s[0], 0x9ABCDEF0),
+      v_mov_b32_e32(v[5], s[0]),
+      global_store_b128(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      FLAT(GLOBALOp.GLOBAL_LOAD_B128, addr=v[0], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, seg=2),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], 0),
+      v_mov_b32_e32(v[1], 0),
+      s_mov_b32(s[2], 0),
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][6], 0xDEADBEEF)
+    self.assertEqual(st.vgpr[0][7], 0xCAFEBABE)
+    self.assertEqual(st.vgpr[0][8], 0x12345678)
+    self.assertEqual(st.vgpr[0][9], 0x9ABCDEF0)
+
+
+class TestGlobalStore(unittest.TestCase):
+  """Tests for GLOBAL store instructions."""
+
+  def test_global_store_b64_basic(self):
+    """GLOBAL_STORE_B64 stores 8 bytes from v[n:n+1] to memory; verified by loading them back with GLOBAL_LOAD_B64."""
+    TEST_OFFSET = 256
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[4], 0xDEADBEEF),
+      s_mov_b32(s[5], 0xCAFEBABE),
+      v_mov_b32_e32(v[2], s[4]),
+      v_mov_b32_e32(v[3], s[5]),
+      v_mov_b32_e32(v[0], 0),  # addr VGPR holds 0 here; the store and load below pass saddr=s[2] instead of SrcEnum.NULL
+      global_store_b64(addr=v[0], data=v[2], saddr=s[2], offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      FLAT(GLOBALOp.GLOBAL_LOAD_B64, addr=v[0], vdst=v[4], data=v[4], saddr=s[2], offset=TEST_OFFSET, seg=2),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], v[4]),  # copy the loaded dwords into v0/v1, which the asserts read
+      v_mov_b32_e32(v[1], v[5]),
+      s_mov_b32(s[2], 0),
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][0], 0xDEADBEEF)
+    self.assertEqual(st.vgpr[0][1], 0xCAFEBABE)
+
+
+class TestD16HiLoads(unittest.TestCase):
+  """Tests for D16_HI load instructions that load into high 16 bits."""
+
+  def test_global_load_d16_hi_b16_preserves_low_bits(self):
+    """GLOBAL_LOAD_D16_HI_B16 must preserve low 16 bits of destination."""
+    TEST_OFFSET = 256
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
+      s_waitcnt(lgkmcnt=0),
+      v_mov_b32_e32(v[0], s[2]),
+      v_mov_b32_e32(v[1], s[3]),
+      s_mov_b32(s[4], 0xCAFE),
+      v_mov_b32_e32(v[2], s[4]),
+      global_store_b16(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[4], 0x0000BEEF),
+      v_mov_b32_e32(v[3], s[4]),
+      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[0], vdst=v[3], data=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET, seg=2),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], v[3]),
+      v_mov_b32_e32(v[1], 0),
+      s_mov_b32(s[2], 0),
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][0]
+    self.assertEqual(result, 0xCAFEBEEF, f"Expected 0xCAFEBEEF, got 0x{result:08x}")
+
+  def test_global_load_d16_hi_b16_data_differs_from_vdst(self):
+    """GLOBAL_LOAD_D16_HI_B16 where data field differs from vdst."""
+    TEST_OFFSET = 256
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[4], 0xCAFE),
+      v_mov_b32_e32(v[2], s[4]),
+      v_mov_b32_e32(v[3], 0),
+      global_store_b16(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[4], 0x0000DEAD),
+      v_mov_b32_e32(v[0], s[4]),  # data field - should NOT affect result
+      v_mov_b32_e32(v[1], 0),     # vdst - low bits should be preserved
+      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[0], saddr=s[2], offset=TEST_OFFSET, seg=2),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], v[1]),
+      s_mov_b32(s[2], 0),
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][0]
+    self.assertEqual(result, 0xCAFE0000, f"Expected 0xCAFE0000, got 0x{result:08x}")
+
+  def test_global_load_d16_hi_u8_data_differs_from_vdst(self):
+    """GLOBAL_LOAD_D16_HI_U8 where data field differs from vdst."""
+    TEST_OFFSET = 256
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[4], 0xAB),
+      v_mov_b32_e32(v[2], s[4]),
+      v_mov_b32_e32(v[3], 0),
+      global_store_b8(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET),
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[4], 0x0000DEAD),
+      v_mov_b32_e32(v[4], s[4]),  # data field
+      s_mov_b32(s[4], 0x0000BEEF),
+      v_mov_b32_e32(v[5], s[4]),  # vdst
+      v_mov_b32_e32(v[3], 0),
+      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_U8, addr=v[3], vdst=v[5], data=v[4], saddr=s[2], offset=TEST_OFFSET, seg=2),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], v[5]),
+      s_mov_b32(s[2], 0),
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][0]
+    self.assertEqual(result, 0x00ABBEEF, f"Expected 0x00ABBEEF, got 0x{result:08x}")
+
+  def test_global_load_d16_hi_b16_same_addr_and_dst_zero_addr(self):
+    """GLOBAL_LOAD_D16_HI_B16 with same register for addr and vdst, addr value=0."""
+    TEST_OFFSET = 256  # byte offset shared by the seeding store and the load under test
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),  # NOTE(review): presumably fetches the test buffer base address into s[2:3] - confirm in helpers
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[4], 0xCAFE),
+      v_mov_b32_e32(v[2], s[4]),
+      v_mov_b32_e32(v[3], 0),
+      global_store_b16(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET),  # seed memory with the halfword 0xCAFE
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[1], 0),  # v[1] serves as addr, vdst and data at once; its low half is 0
+      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[1], saddr=s[2], offset=TEST_OFFSET, seg=2),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], v[1]),  # copy the result into v[0], which is what the assertion reads
+      s_mov_b32(s[2], 0),  # NOTE(review): s[2:3] are zeroed before the program ends - presumably so final state does not depend on the buffer address; confirm
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][0]
+    self.assertEqual(result, 0xCAFE0000, f"Expected 0xCAFE0000, got 0x{result:08x}")  # hi = loaded 0xCAFE, lo = the 0 already in v[1]
+
+  def test_global_load_d16_hi_b16_tril_exact_pattern(self):
+    """Exact pattern from tril() failure: data=v0 differs from vdst=v1."""
+    TEST_OFFSET = 256  # base byte offset; the load below reads at TEST_OFFSET+6
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),  # NOTE(review): presumably fetches the test buffer base address into s[2:3] - confirm in helpers
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[4], 0x01010101),
+      v_mov_b32_e32(v[10], s[4]),
+      v_mov_b32_e32(v[3], 0),
+      global_store_b32(addr=v[3], data=v[10], saddr=s[2], offset=TEST_OFFSET),  # fill bytes +0..+3 with 0x01
+      global_store_b32(addr=v[3], data=v[10], saddr=s[2], offset=TEST_OFFSET+4),  # fill bytes +4..+7 with 0x01
+      s_waitcnt(vmcnt=0),
+      # Set v[0] to 0x0101 (simulating prior u16 load result)
+      s_mov_b32(s[4], 0x0101),
+      v_mov_b32_e32(v[0], s[4]),
+      # Set v[1] to 0
+      v_mov_b32_e32(v[1], 0),
+      # Load using v[1] as addr AND vdst, but v[0] as data
+      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[0], saddr=s[2], offset=TEST_OFFSET+6, seg=2),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], v[1]),  # copy the result into v[0], which is what the assertion reads
+      s_mov_b32(s[2], 0),  # NOTE(review): s[2:3] are zeroed before the program ends - presumably so final state does not depend on the buffer address; confirm
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][0]
+    # Expected: hi=0x0101 (loaded), lo=0x0000 (from v1) -> 0x01010000
+    self.assertEqual(result, 0x01010000, f"Expected 0x01010000, got 0x{result:08x}")
+
+  def test_global_load_d16_hi_i8_data_differs_from_vdst(self):
+    """GLOBAL_LOAD_D16_HI_I8 where data field differs from vdst: the byte is sign-extended into vdst's high half, vdst's low half is kept."""
+    TEST_OFFSET = 256  # byte offset shared by the seeding store and the load under test
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),  # NOTE(review): presumably fetches the test buffer base address into s[2:3] - confirm in helpers
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[4], 0x80),  # negative signed byte = -128
+      v_mov_b32_e32(v[2], s[4]),
+      v_mov_b32_e32(v[3], 0),
+      global_store_b8(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET),  # seed memory with the byte 0x80
+      s_waitcnt(vmcnt=0),
+      s_mov_b32(s[4], 0x0000DEAD),
+      v_mov_b32_e32(v[4], s[4]),  # data field (0xDEAD must not show up in the result)
+      s_mov_b32(s[4], 0x0000BEEF),
+      v_mov_b32_e32(v[5], s[4]),  # vdst (low half 0xBEEF must be preserved)
+      v_mov_b32_e32(v[3], 0),
+      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_I8, addr=v[3], vdst=v[5], data=v[4], saddr=s[2], offset=TEST_OFFSET, seg=2),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], v[5]),  # copy the result into v[0], which is what the assertion reads
+      s_mov_b32(s[2], 0),  # NOTE(review): s[2:3] are zeroed before the program ends - presumably so final state does not depend on the buffer address; confirm
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][0]
+    # 0x80 sign-extended = 0xFF80, lo=0xBEEF -> 0xFF80BEEF
+    self.assertEqual(result, 0xFF80BEEF, f"Expected 0xFF80BEEF, got 0x{result:08x}")
+
+  def test_global_store_b64_tril_pattern(self):
+    """Test the exact pattern from tril() kernel that was failing: mixed-width loads, a repack, then a 64-bit store and read-back."""
+    TEST_OFFSET = 256  # base byte offset; input lives at +0..+8, the repacked output at +16
+    instructions = [
+      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),  # NOTE(review): presumably fetches the test buffer base address into s[2:3] - confirm in helpers
+      s_waitcnt(lgkmcnt=0),
+      s_mov_b32(s[4], 0x01010101),
+      v_mov_b32_e32(v[10], s[4]),
+      v_mov_b32_e32(v[11], s[4]),
+      s_mov_b32(s[4], 0x01),
+      v_mov_b32_e32(v[12], s[4]),
+      v_mov_b32_e32(v[0], 0),
+      global_store_b64(addr=v[0], data=v[10], saddr=s[2], offset=TEST_OFFSET),  # NOTE(review): presumably writes the v[10]/v[11] pair, i.e. eight 0x01 bytes - confirm
+      global_store_b8(addr=v[0], data=v[12], saddr=s[2], offset=TEST_OFFSET+8),  # ninth input byte, also 0x01
+      s_waitcnt(vmcnt=0),
+      # Reload the input with a mix of load widths and byte offsets
+      v_mov_b32_e32(v[2], 0),
+      v_mov_b32_e32(v[1], 0),
+      FLAT(GLOBALOp.GLOBAL_LOAD_U16, addr=v[2], vdst=v[0], data=v[0], saddr=s[2], offset=TEST_OFFSET+3, seg=2),
+      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[1], saddr=s[2], offset=TEST_OFFSET+6, seg=2),
+      FLAT(GLOBALOp.GLOBAL_LOAD_U8, addr=v[2], vdst=v[3], data=v[3], saddr=s[2], offset=TEST_OFFSET, seg=2),
+      FLAT(GLOBALOp.GLOBAL_LOAD_U8, addr=v[2], vdst=v[4], data=v[4], saddr=s[2], offset=TEST_OFFSET+8, seg=2),
+      s_waitcnt(vmcnt=0),
+      # Repack the loaded pieces into the v[0]/v[1] pair checked by the assertions below
+      v_and_b32_e32(v[5], 0xffff, v[0]),
+      v_lshlrev_b32_e32(v[0], 24, v[0]),
+      v_lshrrev_b32_e32(v[5], 8, v[5]),
+      v_or_b32_e32(v[0], v[3], v[0]),
+      v_or_b32_e32(v[1], v[5], v[1]),
+      # Store the repacked pair, then read it back with a single 64-bit load
+      global_store_b64(addr=v[2], data=v[0], saddr=s[2], offset=TEST_OFFSET+16),
+      s_waitcnt(vmcnt=0),
+
+      FLAT(GLOBALOp.GLOBAL_LOAD_B64, addr=v[2], vdst=v[6], data=v[6], saddr=s[2], offset=TEST_OFFSET+16, seg=2),
+      s_waitcnt(vmcnt=0),
+      v_mov_b32_e32(v[0], v[6]),  # low dword of the read-back value
+      v_mov_b32_e32(v[1], v[7]),  # high dword of the read-back value
+      s_mov_b32(s[2], 0),  # NOTE(review): s[2:3] are zeroed before the program ends - presumably so final state does not depend on the buffer address; confirm
+      s_mov_b32(s[3], 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+
+    v0 = st.vgpr[0][0]
+    v1 = st.vgpr[0][1]
+    self.assertEqual(v0, 0x01000001, f"v0: expected 0x01000001, got 0x{v0:08x}")
+    self.assertEqual(v1, 0x01010001, f"v1: expected 0x01010001, got 0x{v1:08x}")
+    # Bits 8..15 of the high dword are byte 5 of the 64-bit value; it must stay zero
+    byte5 = (v1 >> 8) & 0xff
+    self.assertEqual(byte5, 0x00, f"byte5: expected 0x00, got 0x{byte5:02x}")
+
+
+if __name__ == '__main__':  # allow running this test file directly as a script
+  unittest.main()
diff --git a/extra/assembly/amd/test/hw/test_sop.py b/extra/assembly/amd/test/hw/test_sop.py
new file mode 100644
index 0000000000..5dd34b2528
--- /dev/null
+++ b/extra/assembly/amd/test/hw/test_sop.py
@@ -0,0 +1,205 @@
+"""Tests for SOP instructions - scalar operations.
+
+Includes: s_add_u32, s_mov_b32, s_not_b32, s_quadmask_b32, s_wqm_b32,
+          s_cbranch_vccnz, s_cbranch_vccz, s_cmp_eq_u32, s_cselect_b32
+"""
+import unittest
+from extra.assembly.amd.test.hw.helpers import *
+
+class TestBasicScalar(unittest.TestCase):
+  """Tests for basic scalar operations (S_ADD_U32 result and its carry-out in SCC)."""
+
+  def test_s_add_u32(self):
+    """S_ADD_U32 adds two scalar values."""
+    instructions = [
+      s_mov_b32(s[0], 100),
+      s_mov_b32(s[1], 200),
+      s_add_u32(s[2], s[0], s[1]),  # 100 + 200
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.sgpr[2], 300)
+
+  def test_s_add_u32_carry(self):
+    """S_ADD_U32 sets SCC on overflow."""
+    instructions = [
+      s_mov_b32(s[0], 64),
+      s_not_b32(s[0], s[0]),  # ~64 = 0xffffffbf
+      s_mov_b32(s[1], 64),
+      s_add_u32(s[2], s[0], s[1]),  # 0xffffffbf + 64 = 0xffffffff
+      s_mov_b32(s[3], 1),
+      s_add_u32(s[4], s[2], s[3]),  # 0xffffffff + 1 = overflow
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.sgpr[4], 0)  # the 32-bit result wraps around to 0
+    self.assertEqual(st.scc, 1)  # the carry-out is reported through SCC
+
+
+class TestQuadmaskWqm(unittest.TestCase):
+  """Tests for S_QUADMASK_B32 (one result bit per 4-lane quad) and S_WQM_B32 (any set bit fills its whole quad)."""
+
+  def test_s_quadmask_b32_all_quads_active(self):
+    """S_QUADMASK_B32 with all quads active."""
+    instructions = [
+      s_mov_b32(s[0], 0xFFFFFFFF),  # All lanes active
+      s_quadmask_b32(s[1], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # Each quad (4 lanes) with any bit set -> 1 bit in result
+    # 32 lanes = 8 quads, all active -> 0xFF
+    self.assertEqual(st.sgpr[1], 0xFF)
+
+  def test_s_quadmask_b32_alternating_quads(self):
+    """S_QUADMASK_B32 with alternating quads active."""
+    instructions = [
+      s_mov_b32(s[0], 0x0F0F0F0F),  # Quads 0,2,4,6 active
+      s_quadmask_b32(s[1], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # Quads 0,2,4,6 have at least one bit -> 0b01010101 = 0x55
+    self.assertEqual(st.sgpr[1], 0x55)
+
+  def test_s_quadmask_b32_no_quads_active(self):
+    """S_QUADMASK_B32 with no quads active."""
+    instructions = [
+      s_mov_b32(s[0], 0),  # No lanes active
+      s_quadmask_b32(s[1], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.sgpr[1], 0)
+
+  def test_s_quadmask_b32_single_lane_per_quad(self):
+    """S_QUADMASK_B32 with single lane active in each quad."""
+    instructions = [
+      s_mov_b32(s[0], 0x11111111),  # Bit 0 of each nibble
+      s_quadmask_b32(s[1], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # All 8 quads have at least one lane -> 0xFF
+    self.assertEqual(st.sgpr[1], 0xFF)
+
+  def test_s_wqm_b32_all_active(self):
+    """S_WQM_B32 with all lanes active returns all 1s."""
+    instructions = [
+      s_mov_b32(s[0], 0xFFFFFFFF),  # All lanes active
+      s_wqm_b32(s[1], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.sgpr[1], 0xFFFFFFFF)
+
+  def test_s_wqm_b32_alternating_quads(self):
+    """S_WQM_B32 with single lane per quad expands to full quads (despite the test name, the input has one lane in every quad)."""
+    instructions = [
+      s_mov_b32(s[0], 0x11111111),  # One lane per quad
+      s_wqm_b32(s[1], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # Each quad with any bit expands to all 4 bits
+    self.assertEqual(st.sgpr[1], 0xFFFFFFFF)
+
+  def test_s_wqm_b32_zero(self):
+    """S_WQM_B32 with zero input returns zero."""
+    instructions = [
+      s_mov_b32(s[0], 0),  # No lanes active
+      s_wqm_b32(s[1], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.sgpr[1], 0)
+
+
+class TestBranch(unittest.TestCase):
+  """Tests for branch instructions (S_CBRANCH_VCCNZ / S_CBRANCH_VCCZ); v[0] == 42 means the branch fell through."""
+
+  def test_cbranch_vccnz_ignores_vcc_hi(self):
+    """S_CBRANCH_VCCNZ should only check VCC_LO in wave32."""
+    instructions = [
+      # Set VCC_LO = 0, VCC_HI = 1
+      s_mov_b32(s[SrcEnum.VCC_LO - 128], 0),  # NOTE(review): the "- 128" presumably turns the SrcEnum value into an s[] index - confirm in dsl
+      s_mov_b32(s[SrcEnum.VCC_HI - 128], 1),
+      v_mov_b32_e32(v[0], 0),  # sentinel: stays 0 only if the branch skips the write below
+      # If VCC_HI is incorrectly used, branch will be taken
+      s_cbranch_vccnz(1),  # Skip next instruction if VCC != 0
+      v_mov_b32_e32(v[0], 42),  # This should execute
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][0], 42, "Branch should NOT be taken (VCC_LO is 0)")
+
+  def test_cbranch_vccz_ignores_vcc_hi(self):
+    """S_CBRANCH_VCCZ should only check VCC_LO in wave32."""
+    instructions = [
+      # Set VCC_LO = 1, VCC_HI = 0
+      s_mov_b32(s[SrcEnum.VCC_LO - 128], 1),  # NOTE(review): the "- 128" presumably turns the SrcEnum value into an s[] index - confirm in dsl
+      s_mov_b32(s[SrcEnum.VCC_HI - 128], 0),
+      v_mov_b32_e32(v[0], 0),  # sentinel: stays 0 only if the branch skips the write below
+      # If VCC_HI is incorrectly used, branch will be taken
+      s_cbranch_vccz(1),  # Skip next instruction if VCC == 0
+      v_mov_b32_e32(v[0], 42),  # This should execute
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][0], 42, "Branch should NOT be taken (VCC_LO is 1)")
+
+  def test_cbranch_vccnz_branches_on_vcc_lo(self):
+    """S_CBRANCH_VCCNZ branches when VCC_LO is non-zero."""
+    instructions = [
+      s_mov_b32(s[SrcEnum.VCC_LO - 128], 1),  # NOTE(review): the "- 128" presumably turns the SrcEnum value into an s[] index - confirm in dsl
+      v_mov_b32_e32(v[0], 0),  # sentinel: must still be 0 after the taken branch
+      s_cbranch_vccnz(1),  # Skip next instruction if VCC != 0
+      v_mov_b32_e32(v[0], 42),  # This should be skipped
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][0], 0, "Branch should be taken (VCC_LO is 1)")
+
+
+class Test64BitLiterals(unittest.TestCase):
+  """Tests for 64-bit literal encoding in instructions."""
+
+  def test_64bit_literal_negative_encoding(self):
+    """64-bit literal -2^32 encodes correctly (the bit pattern is moved through two 32-bit halves and reassembled)."""
+    lit = -4294967296.0  # -2^32
+    lit_bits = f2i64(lit)  # NOTE(review): presumably the unsigned 64-bit pattern of the f64 value - confirm in helpers
+    instructions = [
+      s_mov_b32(s[0], lit_bits & 0xffffffff),  # low 32 bits
+      s_mov_b32(s[1], lit_bits >> 32),  # high 32 bits
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))  # rebuild the 64-bit pattern from v[0] (low) and v[1] (high)
+    self.assertAlmostEqual(result, -4294967296.0, places=5)
+
+  def test_64bit_literal_positive_encoding(self):
+    """64-bit instruction encodes large positive literals correctly (checked on the instruction object, nothing is executed)."""
+    large_val = 0x12345678
+    inst = v_add_f64(v[2], v[0], large_val)
+    self.assertIsNotNone(inst._literal, "Literal should be set")
+    actual_lit = (inst._literal >> 32) & 0xffffffff  # the test expects the value in the upper 32 bits of the stored literal
+    self.assertEqual(actual_lit, large_val, f"Literal should be {large_val:#x}, got {actual_lit:#x}")
+
+
+class TestSCCBehavior(unittest.TestCase):
+  """Tests for SCC condition code behavior (set by S_CMP_EQ_U32, consumed by S_CSELECT_B32)."""
+
+  def test_scc_from_s_cmp(self):
+    """SCC should be set by scalar compare."""
+    instructions = [
+      s_mov_b32(s[0], 10),
+      s_cmp_eq_u32(s[0], 10),  # 10 == 10
+      s_cselect_b32(s[1], 1, 0),  # the test expects 1 to be selected when the compare succeeded
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.sgpr[1], 1, "SCC should be true")
+    self.assertEqual(st.scc, 1)
+
+  def test_scc_clear(self):
+    """SCC should be cleared by failing compare."""
+    instructions = [
+      s_mov_b32(s[0], 10),
+      s_cmp_eq_u32(s[0], 20),  # 10 != 20
+      s_cselect_b32(s[1], 1, 0),  # the test expects 0 to be selected when the compare failed
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.sgpr[1], 0, "SCC should be false")
+    self.assertEqual(st.scc, 0)
+
+
+if __name__ == '__main__':  # allow running this test file directly as a script
+  unittest.main()
diff --git a/extra/assembly/amd/test/hw/test_vop1.py b/extra/assembly/amd/test/hw/test_vop1.py
new file mode 100644
index 0000000000..5e86fe47a5
--- /dev/null
+++ b/extra/assembly/amd/test/hw/test_vop1.py
@@ -0,0 +1,1242 @@
+"""Tests for VOP1 instructions - single operand vector operations.
+
+Includes: v_mov_b32, v_cvt_*, v_sin_f32, v_rcp_f32, v_exp_f32, v_rndne_f32,
+          v_floor_f32, v_trunc_f32, v_fract_f32, v_clz_i32_u32, v_ctz_i32_b32,
+          v_readfirstlane_b32
+"""
+import unittest
+from extra.assembly.amd.test.hw.helpers import *
+
+class TestMov(unittest.TestCase):
+  """Tests for V_MOV_B32 and the 16-bit V_MOV_B16 half-register writes."""
+
+  def test_v_mov_b32(self):
+    """V_MOV_B32 moves a value."""
+    instructions = [
+      s_mov_b32(s[0], 42),
+      v_mov_b32_e32(v[0], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][0], 42)  # st.vgpr is indexed [lane][register]
+
+  def test_v_mov_all_lanes(self):
+    """V_MOV_B32 sets all lanes to the same value."""
+    instructions = [
+      s_mov_b32(s[0], 42),
+      v_mov_b32_e32(v[0], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    for lane in range(4):  # every one of the 4 requested lanes must hold the value
+      self.assertEqual(st.vgpr[lane][0], 42)
+
+  def test_v_mov_b16_to_hi(self):
+    """V_MOV_B16 can write to high 16 bits with .h suffix."""
+    instructions = [
+      s_mov_b32(s[0], 0x0000DEAD),  # lo=0xDEAD, hi=0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b16_e32(v[0].h, 0x5678),  # Move 0x5678 to high half
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result_hi = (st.vgpr[0][0] >> 16) & 0xFFFF  # upper half: the newly written value
+    result_lo = st.vgpr[0][0] & 0xFFFF  # lower half: must be left untouched
+    self.assertEqual(result_hi, 0x5678, f"Expected hi=0x5678, got 0x{result_hi:04x}")
+    self.assertEqual(result_lo, 0xDEAD, f"Expected lo=0xDEAD (preserved), got 0x{result_lo:04x}")
+
+  def test_v_mov_b16_to_lo(self):
+    """V_MOV_B16 writes to low 16 bits by default."""
+    instructions = [
+      s_mov_b32(s[0], 0xBEEF0000),  # hi=0xBEEF, lo=0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b16_e32(v[0], 0x1234),  # Move to low half
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result_hi = (st.vgpr[0][0] >> 16) & 0xFFFF  # upper half: must be left untouched
+    result_lo = st.vgpr[0][0] & 0xFFFF  # lower half: the newly written value
+    self.assertEqual(result_lo, 0x1234, f"Expected lo=0x1234, got 0x{result_lo:04x}")
+    self.assertEqual(result_hi, 0xBEEF, f"Expected hi=0xBEEF (preserved), got 0x{result_hi:04x}")
+
+
+class TestTrigonometry(unittest.TestCase):
+  """Tests for trigonometric instructions; the expected values treat the V_SIN_F32 input as a count of full turns (input * 2*pi radians)."""
+
+  def test_v_sin_f32_small(self):
+    """V_SIN_F32 computes sin for small values."""
+    import math
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_sin_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][1])  # reinterpret the raw register bits as a float
+    expected = math.sin(1.0 * 2 * math.pi)  # one full turn, so approximately 0
+    self.assertAlmostEqual(result, expected, places=4)
+
+  def test_v_sin_f32_quarter(self):
+    """V_SIN_F32 at 0.25 cycles = sin(pi/2) = 1.0."""
+    instructions = [
+      s_mov_b32(s[0], f2i(0.25)),  # float bit pattern goes through an SGPR
+      v_mov_b32_e32(v[0], s[0]),
+      v_sin_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][1])  # reinterpret the raw register bits as a float
+    self.assertAlmostEqual(result, 1.0, places=4)
+
+  def test_v_sin_f32_large(self):
+    """V_SIN_F32 for large input value (132000.0)."""
+    import math
+    instructions = [
+      s_mov_b32(s[0], f2i(132000.0)),  # float bit pattern goes through an SGPR
+      v_mov_b32_e32(v[0], s[0]),
+      v_sin_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][1])  # reinterpret the raw register bits as a float
+    expected = math.sin(132000.0 * 2 * math.pi)  # a whole number of turns; looser tolerance (places=2) below
+    self.assertAlmostEqual(result, expected, places=2, msg=f"sin(132000) got {result}, expected ~{expected}")
+
+
+class TestRounding(unittest.TestCase):
+  """Tests for rounding instructions (V_RNDNE_F32, V_FLOOR_F32, V_TRUNC_F32, V_FRACT_F32)."""
+
+  def test_v_rndne_f32_half_even(self):
+    """V_RNDNE_F32 rounds to nearest even."""
+    instructions = [
+      s_mov_b32(s[0], f2i(2.5)),  # tie between 2 and 3
+      v_mov_b32_e32(v[0], s[0]),
+      v_rndne_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 2.0, places=5)  # the even neighbour is 2
+
+  def test_v_rndne_f32_half_odd(self):
+    """V_RNDNE_F32 rounds 3.5 to 4 (nearest even)."""
+    instructions = [
+      s_mov_b32(s[0], f2i(3.5)),  # tie between 3 and 4
+      v_mov_b32_e32(v[0], s[0]),
+      v_rndne_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 4.0, places=5)  # the even neighbour is 4
+
+  def test_v_rndne_f32_large(self):
+    """V_RNDNE_F32 with large value (like sin reduction uses)."""
+    val = 100000.0 * 0.15915494309189535  # the constant is approximately 1/(2*pi)
+    instructions = [
+      s_mov_b32(s[0], f2i(val)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_rndne_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    expected = round(val)  # Python's round() is used as the reference
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), expected, places=0)
+
+  def test_v_floor_f32(self):
+    """V_FLOOR_F32 floors to integer."""
+    instructions = [
+      s_mov_b32(s[0], f2i(3.7)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_floor_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 3.0, places=5)
+
+  def test_v_trunc_f32(self):
+    """V_TRUNC_F32 truncates toward zero."""
+    instructions = [
+      s_mov_b32(s[0], f2i(-3.7)),  # negative input separates truncation (-3) from floor (-4)
+      v_mov_b32_e32(v[0], s[0]),
+      v_trunc_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), -3.0, places=5)
+
+  def test_v_fract_f32(self):
+    """V_FRACT_F32 returns fractional part."""
+    instructions = [
+      s_mov_b32(s[0], f2i(3.75)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_fract_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 0.75, places=5)
+
+  def test_v_fract_f32_large(self):
+    """V_FRACT_F32 with large value - precision matters here."""
+    instructions = [
+      s_mov_b32(s[0], f2i(132000.25)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_fract_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][1])
+    self.assertGreaterEqual(result, 0.0)  # NOTE(review): only the [0, 1) range is checked, not the exact fraction - confirm that is intended
+    self.assertLess(result, 1.0)
+
+
+class TestConversion(unittest.TestCase):
+  """Tests for conversion instructions between f32 and 32-bit signed/unsigned integers."""
+
+  def test_v_cvt_i32_f32_positive(self):
+    """V_CVT_I32_F32 converts float to signed int."""
+    instructions = [
+      s_mov_b32(s[0], f2i(42.7)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_i32_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 42)  # the fraction is dropped, not rounded up to 43
+
+  def test_v_cvt_i32_f32_negative(self):
+    """V_CVT_I32_F32 converts negative float to signed int."""
+    instructions = [
+      s_mov_b32(s[0], f2i(-42.7)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_i32_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1] & 0xffffffff, (-42) & 0xffffffff)  # compare as 32-bit two's complement patterns
+
+  def test_v_cvt_i32_f32_large(self):
+    """V_CVT_I32_F32 with large float (used in sin for quadrant)."""
+    instructions = [
+      s_mov_b32(s[0], f2i(15915.0)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_i32_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 15915)
+
+  def test_v_cvt_f32_i32(self):
+    """V_CVT_F32_I32 converts signed int to float."""
+    instructions = [
+      s_mov_b32(s[0], 42),
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_f32_i32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 42.0, places=5)
+
+  def test_v_cvt_f32_u32(self):
+    """V_CVT_F32_U32 converts unsigned int to float."""
+    instructions = [
+      s_mov_b32(s[0], 0xffffffff),  # largest u32; a signed interpretation would give -1
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_f32_u32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 4294967296.0, places=-5)  # negative places: compare after rounding to the nearest 1e5
+
+
+class TestF16Conversions(unittest.TestCase):
+  """Tests for f16 conversion instructions (V_CVT_F16_F32 and V_CVT_I16_F16), including which 16-bit half is read or written."""
+
+  def test_v_cvt_f16_f32_basic(self):
+    """V_CVT_F16_F32 converts f32 to f16 in low 16 bits."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_cvt_f16_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1]
+    lo_bits = result & 0xffff  # only the low half is checked here
+    self.assertEqual(lo_bits, 0x3c00, f"Expected 0x3c00, got 0x{lo_bits:04x}")  # 0x3c00 is f16 1.0
+
+  def test_v_cvt_f16_f32_negative(self):
+    """V_CVT_F16_F32 converts negative f32 to f16."""
+    instructions = [
+      v_mov_b32_e32(v[0], -2.0),
+      v_cvt_f16_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1]
+    lo_bits = result & 0xffff  # only the low half is checked here
+    self.assertEqual(lo_bits, 0xc000, f"Expected 0xc000, got 0x{lo_bits:04x}")  # 0xc000 is f16 -2.0
+
+  def test_v_cvt_f16_f32_small(self):
+    """V_CVT_F16_F32 converts small f32 value."""
+    from extra.assembly.amd.pcode import f32_to_f16
+    instructions = [
+      v_mov_b32_e32(v[0], 0.5),
+      v_cvt_f16_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1]
+    lo_bits = result & 0xffff  # only the low half is checked here
+    expected = f32_to_f16(0.5)  # reference bits come from the pcode helper
+    self.assertEqual(lo_bits, expected, f"Expected 0x{expected:04x}, got 0x{lo_bits:04x}")
+
+  def test_v_cvt_f16_f32_preserves_high_bits(self):
+    """V_CVT_F16_F32 preserves high 16 bits of destination."""
+    instructions = [
+      s_mov_b32(s[0], 0xdead0000),  # marker in the destination's high half
+      v_mov_b32_e32(v[1], s[0]),
+      v_mov_b32_e32(v[0], 1.0),
+      v_cvt_f16_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1]
+    hi_bits = (result >> 16) & 0xffff  # should still hold the marker
+    lo_bits = result & 0xffff  # should hold the converted value
+    self.assertEqual(lo_bits, 0x3c00, f"Low bits should be 0x3c00, got 0x{lo_bits:04x}")
+    self.assertEqual(hi_bits, 0xdead, f"High bits should be preserved as 0xdead, got 0x{hi_bits:04x}")
+
+  def test_v_cvt_f16_f32_same_src_dst_preserves_high_bits(self):
+    """V_CVT_F16_F32 with same src/dst preserves high bits of source."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_cvt_f16_f32_e32(v[0], v[0]),  # source and destination are the same register
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][0]
+    self.assertEqual(result, 0x3f803c00, f"Expected 0x3f803c00, got 0x{result:08x}")  # hi 0x3f80 = top half of f32 1.0, lo 0x3c00 = f16 1.0
+
+  def test_v_cvt_f16_f32_reads_full_32bit_source(self):
+    """V_CVT_F16_F32 must read full 32-bit f32 source."""
+    from extra.assembly.amd.pcode import _f16
+    instructions = [
+      s_mov_b32(s[0], 0x3fc00000),  # f32 1.5
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_f16_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1]
+    lo_bits = result & 0xffff  # only the low half is checked here
+    self.assertEqual(lo_bits, 0x3e00, f"Expected f16(1.5)=0x3e00, got 0x{lo_bits:04x} ({_f16(lo_bits)})")
+
+  def test_v_cvt_i16_f16_zero(self):
+    """V_CVT_I16_F16 converts f16 zero to i16 zero."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),
+      v_cvt_i16_f16_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1] & 0xffff  # the i16 result is read from the low half
+    self.assertEqual(result, 0, f"Expected 0, got {result}")
+
+  def test_v_cvt_i16_f16_one(self):
+    """V_CVT_I16_F16 converts f16 1.0 to i16 1."""
+    instructions = [
+      s_mov_b32(s[0], 0x3c00),  # f16 1.0 in low bits
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_i16_f16_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1] & 0xffff  # the i16 result is read from the low half
+    self.assertEqual(result, 1, f"Expected 1, got {result}")
+
+  def test_v_cvt_i16_f16_negative(self):
+    """V_CVT_I16_F16 converts f16 -2.0 to i16 -2."""
+    instructions = [
+      s_mov_b32(s[0], 0xc000),  # f16 -2.0 in low bits
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_i16_f16_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1] & 0xffff  # the i16 result is read from the low half
+    self.assertEqual(result, (-2) & 0xffff, f"Expected 0xfffe (-2), got 0x{result:04x}")  # compare as a 16-bit two's complement pattern
+
+  def test_v_cvt_i16_f16_from_hi(self):
+    """V_CVT_I16_F16 can read from high 16 bits with opsel."""
+    instructions = [
+      s_mov_b32(s[0], 0x3c000000),  # f16 1.0 in HIGH bits, 0 in low
+      v_mov_b32_e32(v[0], s[0]),
+      VOP3(VOP3Op.V_CVT_I16_F16, vdst=v[1], src0=v[0], opsel=0b0001),  # opsel bit 0 is set so src0 is taken from the high half
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1] & 0xffff  # the i16 result is read from the low half
+    self.assertEqual(result, 1, f"Expected 1 from high bits, got {result}")
+
+
+class TestClz(unittest.TestCase):
+  """Tests for V_CLZ_I32_U32 - count leading zeros."""
+
+  def test_v_clz_i32_u32_zero(self):
+    """V_CLZ_I32_U32 of 0 returns -1 (all bits are 0)."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),
+      v_clz_i32_u32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF)  # -1 as a 32-bit pattern
+
+  def test_v_clz_i32_u32_one(self):
+    """V_CLZ_I32_U32 of 1 returns 31 (31 leading zeros)."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1),
+      v_clz_i32_u32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 31)
+
+  def test_v_clz_i32_u32_msb_set(self):
+    """V_CLZ_I32_U32 of 0x80000000 returns 0 (no leading zeros)."""
+    instructions = [
+      s_mov_b32(s[0], 0x80000000),  # only bit 31 set
+      v_mov_b32_e32(v[0], s[0]),
+      v_clz_i32_u32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0)
+
+  def test_v_clz_i32_u32_half(self):
+    """V_CLZ_I32_U32 of 0x8000 (bit 15) returns 16."""
+    instructions = [
+      s_mov_b32(s[0], 0x8000),  # only bit 15 set, so bits 31..16 are the 16 leading zeros
+      v_mov_b32_e32(v[0], s[0]),
+      v_clz_i32_u32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 16)
+
+  def test_v_clz_i32_u32_all_ones(self):
+    """V_CLZ_I32_U32 of 0xFFFFFFFF returns 0."""
+    instructions = [
+      s_mov_b32(s[0], 0xFFFFFFFF),  # every bit set
+      v_mov_b32_e32(v[0], s[0]),
+      v_clz_i32_u32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0)
+
+
+class TestCtz(unittest.TestCase):
+  """Tests for V_CTZ_I32_B32 - count trailing zeros."""
+
+  def test_v_ctz_i32_b32_zero(self):
+    """V_CTZ_I32_B32 of 0 returns -1 (all bits are 0)."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),
+      v_ctz_i32_b32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF)  # -1 as a 32-bit pattern
+
+  def test_v_ctz_i32_b32_one(self):
+    """V_CTZ_I32_B32 of 1 returns 0 (no trailing zeros)."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1),
+      v_ctz_i32_b32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0)
+
+  def test_v_ctz_i32_b32_msb_set(self):
+    """V_CTZ_I32_B32 of 0x80000000 returns 31."""
+    instructions = [
+      s_mov_b32(s[0], 0x80000000),  # only bit 31 set, so bits 30..0 are the 31 trailing zeros
+      v_mov_b32_e32(v[0], s[0]),
+      v_ctz_i32_b32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 31)
+
+  def test_v_ctz_i32_b32_half(self):
+    """V_CTZ_I32_B32 of 0x8000 (bit 15) returns 15."""
+    instructions = [
+      s_mov_b32(s[0], 0x8000),  # only bit 15 set, so bits 14..0 are the 15 trailing zeros
+      v_mov_b32_e32(v[0], s[0]),
+      v_ctz_i32_b32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 15)
+
+  def test_v_ctz_i32_b32_all_ones(self):
+    """V_CTZ_I32_B32 of 0xFFFFFFFF returns 0."""
+    instructions = [
+      s_mov_b32(s[0], 0xFFFFFFFF),  # every bit set
+      v_mov_b32_e32(v[0], s[0]),
+      v_ctz_i32_b32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0)
+
+
+class TestRcp(unittest.TestCase):
+  """Tests for V_RCP_F32 - reciprocal, including the infinity and zero inputs."""
+
+  def test_v_rcp_f32_normal(self):
+    """V_RCP_F32 of 2.0 returns 0.5."""
+    instructions = [
+      v_mov_b32_e32(v[0], 2.0),
+      v_rcp_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 0.5, places=5)
+
+  def test_v_rcp_f32_inf(self):
+    """V_RCP_F32 of +inf returns 0."""
+    instructions = [
+      s_mov_b32(s[0], 0x7f800000),  # f32 +inf bit pattern
+      v_mov_b32_e32(v[0], s[0]),
+      v_rcp_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(i2f(st.vgpr[0][1]), 0.0)
+
+  def test_v_rcp_f32_neg_inf(self):
+    """V_RCP_F32 of -inf returns -0."""
+    instructions = [
+      s_mov_b32(s[0], 0xff800000),  # f32 -inf bit pattern
+      v_mov_b32_e32(v[0], s[0]),
+      v_rcp_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][1])
+    self.assertEqual(result, 0.0)  # -0.0 == 0.0 in Python, so this alone cannot see the sign
+    self.assertEqual(st.vgpr[0][1], 0x80000000)  # the raw bits pin the negative zero
+
+  def test_v_rcp_f32_zero(self):
+    """V_RCP_F32 of 0 returns +inf."""
+    import math
+    instructions = [
+      v_mov_b32_e32(v[0], 0),
+      v_rcp_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(st.vgpr[0][1])))  # NOTE(review): isinf accepts either sign, so the "+" in the docstring is not enforced
+
+
+class TestExp(unittest.TestCase):
+  """Tests for V_EXP_F32 - base-2 exponential; only the magnitude of the result is bounded, not its exact value."""
+
+  def test_v_exp_f32_large_negative(self):
+    """V_EXP_F32 of large negative value (2^-100) returns very small number."""
+    instructions = [
+      s_mov_b32(s[0], f2i(-100.0)),  # exponent -100
+      v_mov_b32_e32(v[0], s[0]),
+      v_exp_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][1])
+    self.assertLess(result, 1e-20)  # upper bound only; no lower bound is asserted
+
+  def test_v_exp_f32_large_positive(self):
+    """V_EXP_F32 of large positive value (2^100) returns very large number."""
+    instructions = [
+      s_mov_b32(s[0], f2i(100.0)),  # exponent +100
+      v_mov_b32_e32(v[0], s[0]),
+      v_exp_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][1])
+    self.assertGreater(result, 1e20)  # lower bound only; no upper bound is asserted
+
+
+class TestReadFirstLane(unittest.TestCase):
+  """Tests for V_READFIRSTLANE_B32: a per-lane value is built, read into s[0], then copied back to every lane."""
+
+  def _readfirstlane(self, sdst_idx, vsrc):
+    """Helper to create V_READFIRSTLANE_B32 with SGPR destination (sdst_idx is placed in the vdst field via RawImm)."""
+    return VOP1(VOP1Op.V_READFIRSTLANE_B32, vdst=RawImm(sdst_idx), src0=vsrc)
+
+  def test_v_readfirstlane_b32_basic(self):
+    """V_READFIRSTLANE_B32 reads from the first active lane."""
+    instructions = [
+      v_lshlrev_b32_e32(v[0], 2, v[255]),  # NOTE(review): v[255] presumably holds the lane id on entry - confirm in helpers
+      v_add_nc_u32_e32(v[0], 1000, v[0]),  # v[0] = 1000 + (v[255] << 2)
+      self._readfirstlane(0, v[0]),
+      v_mov_b32_e32(v[1], s[0]),  # copy the scalar result to v[1] in every lane
+    ]
+    st = run_program(instructions, n_lanes=4)
+    for lane in range(4):  # all lanes must see the same value, 1000
+      self.assertEqual(st.vgpr[lane][1], 1000)
+
+  def test_v_readfirstlane_b32_different_vgpr(self):
+    """V_READFIRSTLANE_B32 reading from different VGPR index."""
+    instructions = [
+      v_lshlrev_b32_e32(v[7], 5, v[255]),  # NOTE(review): v[255] presumably holds the lane id on entry - confirm in helpers
+      v_add_nc_u32_e32(v[7], 200, v[7]),  # v[7] = 200 + (v[255] << 5)
+      self._readfirstlane(0, v[7]),
+      v_mov_b32_e32(v[8], s[0]),  # copy the scalar result to v[8] in every lane
+    ]
+    st = run_program(instructions, n_lanes=4)
+    for lane in range(4):  # all lanes must see the same value, 200
+      self.assertEqual(st.vgpr[lane][8], 200)
+
+
+class TestCvtF16Modifiers(unittest.TestCase):
+  """V_CVT_F32_F16 with VOP3 abs/neg source modifiers, plus an F32->F16 convert-and-pack check."""
+
+  def test_v_cvt_f32_f16_abs_negative(self):
+    """abs modifier applied to f16 -1.0 converts to f32 1.0."""
+    from extra.assembly.amd.pcode import f32_to_f16
+    neg_one_h = f32_to_f16(-1.0)  # 0xbc00
+    program = [
+      s_mov_b32(s[0], neg_one_h),
+      v_mov_b32_e32(v[1], s[0]),
+      v_cvt_f32_f16_e64(v[0], abs(v[1])),  # |(-1.0)| = 1.0
+    ]
+    state = run_program(program, n_lanes=1)
+    converted = i2f(state.vgpr[0][0])
+    self.assertAlmostEqual(converted, 1.0, places=5)
+
+  def test_v_cvt_f32_f16_abs_positive(self):
+    """abs modifier applied to f16 2.0 leaves the value at 2.0."""
+    from extra.assembly.amd.pcode import f32_to_f16
+    two_h = f32_to_f16(2.0)  # 0x4000
+    program = [
+      s_mov_b32(s[0], two_h),
+      v_mov_b32_e32(v[1], s[0]),
+      v_cvt_f32_f16_e64(v[0], abs(v[1])),  # |2.0| = 2.0
+    ]
+    state = run_program(program, n_lanes=1)
+    converted = i2f(state.vgpr[0][0])
+    self.assertAlmostEqual(converted, 2.0, places=5)
+
+  def test_v_cvt_f32_f16_neg_positive(self):
+    """neg modifier applied to f16 2.0 converts to f32 -2.0."""
+    from extra.assembly.amd.pcode import f32_to_f16
+    two_h = f32_to_f16(2.0)  # 0x4000
+    program = [
+      s_mov_b32(s[0], two_h),
+      v_mov_b32_e32(v[1], s[0]),
+      v_cvt_f32_f16_e64(v[0], -v[1]),  # -(2.0) = -2.0
+    ]
+    state = run_program(program, n_lanes=1)
+    converted = i2f(state.vgpr[0][0])
+    self.assertAlmostEqual(converted, -2.0, places=5)
+
+  def test_v_cvt_f32_f16_neg_negative(self):
+    """neg modifier applied to f16 -2.0 converts to f32 2.0 (signs cancel)."""
+    from extra.assembly.amd.pcode import f32_to_f16
+    neg_two_h = f32_to_f16(-2.0)  # 0xc000
+    program = [
+      s_mov_b32(s[0], neg_two_h),
+      v_mov_b32_e32(v[1], s[0]),
+      v_cvt_f32_f16_e64(v[0], -v[1]),  # -(-2.0) = 2.0
+    ]
+    state = run_program(program, n_lanes=1)
+    converted = i2f(state.vgpr[0][0])
+    self.assertAlmostEqual(converted, 2.0, places=5)
+
+  def test_v_cvt_f16_f32_then_pack_for_wmma(self):
+    """Convert f32 3.5 to f16, pack it into both halves of a dword, and decode both halves."""
+    from extra.assembly.amd.pcode import _f16
+    src_val = 3.5
+    program = [
+      s_mov_b32(s[0], f2i(src_val)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_f16_f32_e32(v[1], v[0]),
+      v_pack_b32_f16(v[2], v[1], v[1]),  # same f16 value in both operands
+    ]
+    state = run_program(program, n_lanes=1)
+    lo_half = _f16(state.vgpr[0][2] & 0xffff)
+    hi_half = _f16((state.vgpr[0][2] >> 16) & 0xffff)
+    self.assertAlmostEqual(lo_half, src_val, places=1)
+    self.assertAlmostEqual(hi_half, src_val, places=1)
+
+
+class TestConversionRounding(unittest.TestCase):
+  """Rounding and range behaviour of the float/int and f16/f32 conversion instructions."""
+
+  def test_cvt_f32_to_i32_round_toward_zero(self):
+    """V_CVT_I32_F32 truncates toward zero for both positive and negative inputs."""
+    instructions = [
+      v_mov_b32_e32(v[0], 2.9),
+      v_mov_b32_e32(v[1], -2.9),
+      v_cvt_i32_f32_e32(v[2], v[0]),
+      v_cvt_i32_f32_e32(v[3], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 2, "2.9 -> 2")
+    self.assertEqual(st.vgpr[0][3] & 0xFFFFFFFF, 0xFFFFFFFE, "-2.9 -> -2")  # two's-complement -2
+
+  def test_cvt_f32_to_u32_negative(self):
+    """V_CVT_U32_F32 of -1.0 is expected to produce 0."""
+    instructions = [
+      v_mov_b32_e32(v[0], -1.0),
+      v_cvt_u32_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0)
+
+  def test_rndne_f32_half_even(self):
+    """V_RNDNE_F32 rounds exact .5 ties to the nearest even integer."""
+    instructions = [
+      v_mov_b32_e32(v[0], 2.5),
+      v_mov_b32_e32(v[1], 3.5),
+      v_mov_b32_e32(v[2], 4.5),
+      v_rndne_f32_e32(v[3], v[0]),
+      v_rndne_f32_e32(v[4], v[1]),
+      v_rndne_f32_e32(v[5], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 2.0, places=5)  # 2.5 -> 2 (even)
+    self.assertAlmostEqual(i2f(st.vgpr[0][4]), 4.0, places=5)  # 3.5 -> 4 (even)
+    self.assertAlmostEqual(i2f(st.vgpr[0][5]), 4.0, places=5)  # 4.5 -> 4 (even)
+
+  def test_f16_to_f32_precision(self):
+    """V_CVT_F32_F16 of f16 1.5 gives f32 1.5."""
+    from extra.assembly.amd.pcode import f32_to_f16
+    f16_val = f32_to_f16(1.5)
+    instructions = [
+      s_mov_b32(s[0], f16_val),
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_f32_f16_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 1.5, places=5)
+
+  def test_f16_denormal_to_f32(self):
+    """The smallest f16 denormal converts to a small, strictly positive f32."""
+    # Bit pattern 0x0001 is 2**-24 (~5.96e-8) in IEEE 754 half precision, hence the 1e-6 bound.
+    f16_denorm = 0x0001  # Smallest positive f16 denormal
+    instructions = [
+      v_mov_b32_e32(v[0], f16_denorm),
+      v_cvt_f32_f16_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][1])
+    self.assertGreater(result, 0)
+    self.assertLess(result, 1e-6)
+
+
+class TestSqrt(unittest.TestCase):
+  """V_SQRT_F32 (square root): ordinary values and IEEE special cases."""
+
+  def test_v_sqrt_f32_normal(self):
+    """sqrt(4.0) == 2.0."""
+    program = [
+      v_mov_b32_e32(v[0], 4.0),
+      v_sqrt_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 2.0, places=5)
+
+  def test_v_sqrt_f32_one(self):
+    """sqrt(1.0) == 1.0."""
+    program = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_sqrt_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 1.0, places=5)
+
+  def test_v_sqrt_f32_zero(self):
+    """sqrt(0.0) == 0.0."""
+    program = [
+      v_mov_b32_e32(v[0], 0),
+      v_sqrt_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(i2f(state.vgpr[0][1]), 0.0)
+
+  def test_v_sqrt_f32_neg_zero(self):
+    """sqrt(-0.0) keeps the sign bit; the raw bit pattern is compared."""
+    program = [
+      s_mov_b32(s[0], 0x80000000),  # -0.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_sqrt_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(state.vgpr[0][1], 0x80000000)  # -0.0
+
+  def test_v_sqrt_f32_inf(self):
+    """sqrt(+inf) is +inf."""
+    import math
+    program = [
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_sqrt_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(state.vgpr[0][1])))
+    self.assertGreater(i2f(state.vgpr[0][1]), 0)
+
+  def test_v_sqrt_f32_negative(self):
+    """sqrt(-1.0) is NaN."""
+    import math
+    program = [
+      v_mov_b32_e32(v[0], -1.0),
+      v_sqrt_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(state.vgpr[0][1])))
+
+  def test_v_sqrt_f32_nan(self):
+    """sqrt(NaN) stays NaN."""
+    import math
+    program = [
+      s_mov_b32(s[0], 0x7fc00000),  # quiet NaN
+      v_mov_b32_e32(v[0], s[0]),
+      v_sqrt_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(state.vgpr[0][1])))
+
+  def test_v_sqrt_f32_small(self):
+    """sqrt(0.25) == 0.5."""
+    program = [
+      v_mov_b32_e32(v[0], 0.25),
+      v_sqrt_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 0.5, places=5)
+
+
+class TestRsq(unittest.TestCase):
+  """V_RSQ_F32 (1/sqrt(x)): ordinary values and IEEE special cases."""
+
+  def test_v_rsq_f32_normal(self):
+    """rsq(4.0) == 0.5."""
+    program = [
+      v_mov_b32_e32(v[0], 4.0),
+      v_rsq_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 0.5, places=5)
+
+  def test_v_rsq_f32_one(self):
+    """rsq(1.0) == 1.0."""
+    program = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_rsq_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 1.0, places=5)
+
+  def test_v_rsq_f32_zero(self):
+    """rsq(0) is +inf."""
+    import math
+    program = [
+      v_mov_b32_e32(v[0], 0),
+      v_rsq_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(state.vgpr[0][1])))
+    self.assertGreater(i2f(state.vgpr[0][1]), 0)
+
+  def test_v_rsq_f32_neg_zero(self):
+    """rsq(-0.0) is -inf."""
+    import math
+    program = [
+      s_mov_b32(s[0], 0x80000000),  # -0.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_rsq_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(state.vgpr[0][1])))
+    self.assertLess(i2f(state.vgpr[0][1]), 0)
+
+  def test_v_rsq_f32_inf(self):
+    """rsq(+inf) is 0."""
+    program = [
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_rsq_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(i2f(state.vgpr[0][1]), 0.0)
+
+  def test_v_rsq_f32_negative(self):
+    """rsq(-1.0) is NaN."""
+    import math
+    program = [
+      v_mov_b32_e32(v[0], -1.0),
+      v_rsq_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(state.vgpr[0][1])))
+
+  def test_v_rsq_f32_large(self):
+    """rsq(1e10) is approximately 1e-5."""
+    program = [
+      s_mov_b32(s[0], f2i(1e10)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_rsq_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    rsq_out = i2f(state.vgpr[0][1])
+    # 1/sqrt(1e10) ~= 1e-5
+    self.assertAlmostEqual(rsq_out, 1e-5, places=8)
+
+
+class TestLog(unittest.TestCase):
+  """V_LOG_F32 (base-2 logarithm): powers of two and IEEE special cases."""
+
+  def test_v_log_f32_one(self):
+    """log2(1.0) == 0.0."""
+    program = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_log_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 0.0, places=4)
+
+  def test_v_log_f32_two(self):
+    """log2(2.0) == 1.0."""
+    program = [
+      v_mov_b32_e32(v[0], 2.0),
+      v_log_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 1.0, places=4)
+
+  def test_v_log_f32_four(self):
+    """log2(4.0) == 2.0."""
+    program = [
+      v_mov_b32_e32(v[0], 4.0),
+      v_log_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 2.0, places=4)
+
+  def test_v_log_f32_half(self):
+    """log2(0.5) == -1.0."""
+    program = [
+      v_mov_b32_e32(v[0], 0.5),
+      v_log_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), -1.0, places=4)
+
+  def test_v_log_f32_zero(self):
+    """log2(0) is -inf."""
+    import math
+    program = [
+      v_mov_b32_e32(v[0], 0),
+      v_log_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(state.vgpr[0][1])))
+    self.assertLess(i2f(state.vgpr[0][1]), 0)
+
+  def test_v_log_f32_inf(self):
+    """log2(+inf) is +inf."""
+    import math
+    program = [
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_log_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(state.vgpr[0][1])))
+    self.assertGreater(i2f(state.vgpr[0][1]), 0)
+
+  def test_v_log_f32_negative(self):
+    """log2(-1.0) is NaN."""
+    import math
+    program = [
+      v_mov_b32_e32(v[0], -1.0),
+      v_log_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(state.vgpr[0][1])))
+
+
+class TestCos(unittest.TestCase):
+  """V_COS_F32: cosine whose argument is measured in cycles (1.0 == 2*pi radians)."""
+
+  def test_v_cos_f32_zero(self):
+    """0 cycles -> cos(0) = 1.0."""
+    program = [
+      v_mov_b32_e32(v[0], 0),
+      v_cos_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 1.0, places=4)
+
+  def test_v_cos_f32_quarter(self):
+    """0.25 cycles -> cos(pi/2) = 0.0."""
+    program = [
+      s_mov_b32(s[0], f2i(0.25)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_cos_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 0.0, places=4)
+
+  def test_v_cos_f32_half(self):
+    """0.5 cycles -> cos(pi) = -1.0."""
+    program = [
+      s_mov_b32(s[0], f2i(0.5)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_cos_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), -1.0, places=4)
+
+  def test_v_cos_f32_full(self):
+    """1.0 cycles -> cos(2*pi) = 1.0."""
+    program = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_cos_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 1.0, places=4)
+
+  def test_v_cos_f32_large(self):
+    """A large argument (132000 cycles) is compared against math.cos to two places."""
+    import math
+    cycles = 132000.0
+    program = [
+      s_mov_b32(s[0], f2i(cycles)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_cos_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    cos_out = i2f(state.vgpr[0][1])
+    reference = math.cos(cycles * 2 * math.pi)
+    self.assertAlmostEqual(cos_out, reference, places=2)
+
+
+class TestFractEdgeCases(unittest.TestCase):
+  """V_FRACT_F32 edge cases: negative inputs, whole numbers, zero, inf and NaN."""
+
+  def test_v_fract_f32_negative(self):
+    """fract(-1.25) is 0.75: -1.25 - floor(-1.25) = -1.25 + 2, so the result is in [0, 1)."""
+    program = [
+      s_mov_b32(s[0], f2i(-1.25)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_fract_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    frac = i2f(state.vgpr[0][1])
+    self.assertAlmostEqual(frac, 0.75, places=5)
+
+  def test_v_fract_f32_negative_small(self):
+    """fract(-0.25) is 0.75."""
+    program = [
+      s_mov_b32(s[0], f2i(-0.25)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_fract_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    frac = i2f(state.vgpr[0][1])
+    self.assertAlmostEqual(frac, 0.75, places=5)
+
+  def test_v_fract_f32_whole_number(self):
+    """fract(5.0) is 0.0."""
+    program = [
+      v_mov_b32_e32(v[0], 5.0),
+      v_fract_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    frac = i2f(state.vgpr[0][1])
+    self.assertAlmostEqual(frac, 0.0, places=5)
+
+  def test_v_fract_f32_negative_whole(self):
+    """fract(-5.0) is 0.0."""
+    program = [
+      v_mov_b32_e32(v[0], -5.0),
+      v_fract_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    frac = i2f(state.vgpr[0][1])
+    self.assertAlmostEqual(frac, 0.0, places=5)
+
+  def test_v_fract_f32_zero(self):
+    """fract(0.0) is 0.0."""
+    program = [
+      v_mov_b32_e32(v[0], 0),
+      v_fract_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(i2f(state.vgpr[0][1]), 0.0)
+
+  def test_v_fract_f32_inf(self):
+    """fract(+inf) is NaN."""
+    import math
+    program = [
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_fract_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(state.vgpr[0][1])))
+
+  def test_v_fract_f32_nan(self):
+    """fract(NaN) is NaN."""
+    import math
+    program = [
+      s_mov_b32(s[0], 0x7fc00000),  # quiet NaN
+      v_mov_b32_e32(v[0], s[0]),
+      v_fract_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(state.vgpr[0][1])))
+
+
+class TestF16EdgeCases(unittest.TestCase):
+  """f16 <-> f32 conversion edge cases: infinities, NaN, signed zero, overflow, underflow."""
+
+  def test_v_cvt_f32_f16_inf(self):
+    """f16 +inf widens to f32 +inf."""
+    import math
+    program = [
+      s_mov_b32(s[0], 0x7c00),  # f16 +inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_f32_f16_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(state.vgpr[0][1])))
+    self.assertGreater(i2f(state.vgpr[0][1]), 0)
+
+  def test_v_cvt_f32_f16_neg_inf(self):
+    """f16 -inf widens to f32 -inf."""
+    import math
+    program = [
+      s_mov_b32(s[0], 0xfc00),  # f16 -inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_f32_f16_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(state.vgpr[0][1])))
+    self.assertLess(i2f(state.vgpr[0][1]), 0)
+
+  def test_v_cvt_f32_f16_nan(self):
+    """f16 NaN widens to an f32 NaN."""
+    import math
+    program = [
+      s_mov_b32(s[0], 0x7e00),  # f16 quiet NaN
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_f32_f16_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(state.vgpr[0][1])))
+
+  def test_v_cvt_f32_f16_neg_zero(self):
+    """f16 -0.0 widens to f32 -0.0 (sign bit kept)."""
+    program = [
+      s_mov_b32(s[0], 0x8000),  # f16 -0.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_f32_f16_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(state.vgpr[0][1], 0x80000000)
+
+  def test_v_cvt_f16_f32_overflow(self):
+    """f32 100000.0 narrows to f16 +inf."""
+    program = [
+      s_mov_b32(s[0], f2i(100000.0)),  # too large for f16
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_f16_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    low_half = state.vgpr[0][1] & 0xffff
+    self.assertEqual(low_half, 0x7c00)  # f16 +inf
+
+  def test_v_cvt_f16_f32_underflow(self):
+    """f32 1e-10 narrows to f16 zero or a denormal."""
+    program = [
+      s_mov_b32(s[0], f2i(1e-10)),  # very small, below f16 range
+      v_mov_b32_e32(v[0], s[0]),
+      v_cvt_f16_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    low_half = state.vgpr[0][1] & 0xffff
+    # Anything below 0x0400 has a zero exponent field: zero or a denormal
+    self.assertLess(low_half, 0x0400)  # Less than smallest normal f16
+
+
+class TestExpEdgeCases(unittest.TestCase):
+  """V_EXP_F32 (2**x) edge cases: small integers, infinities and NaN."""
+
+  def test_v_exp_f32_zero(self):
+    """2**0.0 == 1.0."""
+    program = [
+      v_mov_b32_e32(v[0], 0),
+      v_exp_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 1.0, places=5)
+
+  def test_v_exp_f32_one(self):
+    """2**1.0 == 2.0."""
+    program = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_exp_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 2.0, places=5)
+
+  def test_v_exp_f32_neg_one(self):
+    """2**-1.0 == 0.5."""
+    program = [
+      v_mov_b32_e32(v[0], -1.0),
+      v_exp_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 0.5, places=5)
+
+  def test_v_exp_f32_inf(self):
+    """2**(+inf) is +inf."""
+    import math
+    program = [
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_exp_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(state.vgpr[0][1])))
+    self.assertGreater(i2f(state.vgpr[0][1]), 0)
+
+  def test_v_exp_f32_neg_inf(self):
+    """2**(-inf) is 0."""
+    program = [
+      s_mov_b32(s[0], 0xff800000),  # -inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_exp_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(i2f(state.vgpr[0][1]), 0.0)
+
+  def test_v_exp_f32_nan(self):
+    """2**NaN is NaN."""
+    import math
+    program = [
+      s_mov_b32(s[0], 0x7fc00000),  # quiet NaN
+      v_mov_b32_e32(v[0], s[0]),
+      v_exp_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(state.vgpr[0][1])))
+
+
+class TestFloorEdgeCases(unittest.TestCase):
+  """V_FLOOR_F32 edge cases: negative inputs, signed zero and magnitudes below one."""
+
+  def test_v_floor_f32_negative(self):
+    """floor(-2.3) == -3.0."""
+    program = [
+      s_mov_b32(s[0], f2i(-2.3)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_floor_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), -3.0, places=5)
+
+  def test_v_floor_f32_neg_zero(self):
+    """floor(-0.0) keeps the sign bit; the raw bit pattern is compared."""
+    program = [
+      s_mov_b32(s[0], 0x80000000),  # -0.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_floor_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(state.vgpr[0][1], 0x80000000)
+
+  def test_v_floor_f32_small_positive(self):
+    """floor(0.9) == 0.0."""
+    program = [
+      s_mov_b32(s[0], f2i(0.9)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_floor_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(i2f(state.vgpr[0][1]), 0.0)
+
+  def test_v_floor_f32_small_negative(self):
+    """floor(-0.9) == -1.0."""
+    program = [
+      s_mov_b32(s[0], f2i(-0.9)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_floor_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), -1.0, places=5)
+
+
+if __name__ == '__main__':  # run the suite when this file is executed directly
+  unittest.main()
diff --git a/extra/assembly/amd/test/hw/test_vop2.py b/extra/assembly/amd/test/hw/test_vop2.py
new file mode 100644
index 0000000000..a6bdee321c
--- /dev/null
+++ b/extra/assembly/amd/test/hw/test_vop2.py
@@ -0,0 +1,451 @@
+"""Tests for VOP2 instructions - two operand vector operations.
+
+Includes: v_add_f32, v_mul_f32, v_and_b32, v_or_b32, v_xor_b32,
+          v_lshrrev_b32, v_lshlrev_b32, v_fmac_f32, v_fmaak_f32, v_fmamk_f32,
+          v_add_nc_u32, v_cndmask_b32, v_add_f16, v_mul_f16
+"""
+import unittest
+from extra.assembly.amd.test.hw.helpers import *
+
+class TestBasicArithmetic(unittest.TestCase):
+  """Float add, multiply and fused multiply-add variants of VOP2."""
+
+  def test_v_add_f32(self):
+    """V_ADD_F32: 1.0 + 2.0 = 3.0."""
+    program = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 2.0),
+      v_add_f32_e32(v[2], v[0], v[1]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][2]), 3.0, places=5)
+
+  def test_v_mul_f32(self):
+    """V_MUL_F32: 2.0 * 4.0 = 8.0."""
+    program = [
+      v_mov_b32_e32(v[0], 2.0),
+      v_mov_b32_e32(v[1], 4.0),
+      v_mul_f32_e32(v[2], v[0], v[1]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][2]), 8.0, places=5)
+
+  def test_v_fmac_f32(self):
+    """V_FMAC_F32 accumulates into its destination: 1.0 + 2.0 * 4.0 = 9.0."""
+    program = [
+      v_mov_b32_e32(v[0], 2.0),
+      v_mov_b32_e32(v[1], 4.0),
+      v_mov_b32_e32(v[2], 1.0),  # accumulator start value
+      v_fmac_f32_e32(v[2], v[0], v[1]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][2]), 9.0, places=5)
+
+  def test_v_fmaak_f32(self):
+    """V_FMAAK_F32 computes a * b + K: 2.0 * 4.0 + 1.0 = 9.0."""
+    program = [
+      v_mov_b32_e32(v[0], 2.0),
+      v_mov_b32_e32(v[1], 4.0),
+      v_fmaak_f32_e32(v[2], v[0], v[1], 0x3f800000),  # K = f32 1.0 bit pattern
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][2]), 9.0, places=5)
+
+  def test_v_fmamk_f32_basic(self):
+    """V_FMAMK_F32 computes a * K + b: 2.0 * 4.0 + 1.0 = 9.0."""
+    program = [
+      v_mov_b32_e32(v[0], 2.0),
+      v_mov_b32_e32(v[1], 1.0),
+      v_fmamk_f32_e32(v[2], v[0], 0x40800000, v[1]),  # K = f32 4.0 bit pattern
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][2]), 9.0, places=5)
+
+  def test_v_fmamk_f32_small_constant(self):
+    """V_FMAMK_F32 with K = 0.5: 4.0 * 0.5 + 1.0 = 3.0."""
+    program = [
+      v_mov_b32_e32(v[0], 4.0),
+      v_mov_b32_e32(v[1], 1.0),
+      v_fmamk_f32_e32(v[2], v[0], f2i(0.5), v[1]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][2]), 3.0, places=5)
+
+
+class TestBitManipulation(unittest.TestCase):
+  """Bitwise and shift VOP2 instructions."""
+
+  def test_v_and_b32(self):
+    """V_AND_B32: 0x0f & 0xff = 0x0f."""
+    program = [
+      s_mov_b32(s[0], 0xff),
+      s_mov_b32(s[1], 0x0f),
+      v_mov_b32_e32(v[0], s[0]),
+      v_and_b32_e32(v[1], s[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(state.vgpr[0][1], 0x0f)
+
+  def test_v_and_b32_quadrant(self):
+    """V_AND_B32 with mask 3 keeps the low two bits (quadrant extraction, n & 3)."""
+    program = [
+      s_mov_b32(s[0], 15915),
+      v_mov_b32_e32(v[0], s[0]),
+      v_and_b32_e32(v[1], 3, v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(state.vgpr[0][1], 15915 & 3)
+
+  def test_v_lshrrev_b32(self):
+    """V_LSHRREV_B32: 0xff00 shifted right by 8 is 0xff."""
+    program = [
+      s_mov_b32(s[0], 0xff00),
+      v_mov_b32_e32(v[0], s[0]),
+      v_lshrrev_b32_e32(v[1], 8, v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(state.vgpr[0][1], 0xff)
+
+  def test_v_lshlrev_b32(self):
+    """V_LSHLREV_B32: 0xff shifted left by 8 is 0xff00."""
+    program = [
+      s_mov_b32(s[0], 0xff),
+      v_mov_b32_e32(v[0], s[0]),
+      v_lshlrev_b32_e32(v[1], 8, v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(state.vgpr[0][1], 0xff00)
+
+  def test_v_xor_b32(self):
+    """V_XOR_B32 with the sign-bit mask turns 1.0 into -1.0 (pattern used in sin for sign)."""
+    program = [
+      s_mov_b32(s[0], 0x80000000),
+      s_mov_b32(s[1], f2i(1.0)),
+      v_mov_b32_e32(v[0], s[1]),
+      v_xor_b32_e32(v[1], s[0], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), -1.0, places=5)
+
+  def test_v_xor_b32_sign_flip(self):
+    """V_XOR_B32 with the sign-bit mask turns -2.0 into 2.0."""
+    program = [
+      s_mov_b32(s[0], 0x80000000),
+      v_mov_b32_e32(v[0], -2.0),
+      v_xor_b32_e32(v[1], s[0], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 2.0, places=5)
+
+
+class TestSpecialValues(unittest.TestCase):
+  """Invalid f32 operations on special values must produce NaN."""
+
+  def test_v_mul_f32_zero_times_inf(self):
+    """V_MUL_F32: 0 multiplied by +inf is NaN."""
+    import math
+    program = [
+      v_mov_b32_e32(v[0], 0),
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      v_mov_b32_e32(v[1], s[0]),
+      v_mul_f32_e32(v[2], v[0], v[1]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(state.vgpr[0][2])))
+
+  def test_v_add_f32_inf_minus_inf(self):
+    """V_ADD_F32: +inf added to -inf is NaN."""
+    import math
+    program = [
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      s_mov_b32(s[1], 0xff800000),  # -inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_add_f32_e32(v[2], v[0], v[1]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(state.vgpr[0][2])))
+
+
+class TestF16Ops(unittest.TestCase):
+  """Half-precision VOP2 arithmetic; results are checked as raw f16 bit patterns."""
+
+  def test_v_add_f16_basic(self):
+    """V_ADD_F16: 1.0 + 2.0 = 3.0."""
+    program = [
+      s_mov_b32(s[0], 0x3c00),  # f16 1.0
+      s_mov_b32(s[1], 0x4000),  # f16 2.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_add_f16_e32(v[2], v[0], v[1]),
+    ]
+    state = run_program(program, n_lanes=1)
+    result = state.vgpr[0][2] & 0xffff
+    self.assertEqual(result, 0x4200, f"Expected 0x4200 (f16 3.0), got 0x{result:04x}")
+
+  def test_v_add_f16_negative(self):
+    """V_ADD_F16: 1.0 + (-2.0) = -1.0."""
+    program = [
+      s_mov_b32(s[0], 0x3c00),  # f16 1.0
+      s_mov_b32(s[1], 0xc000),  # f16 -2.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_add_f16_e32(v[2], v[0], v[1]),
+    ]
+    state = run_program(program, n_lanes=1)
+    result = state.vgpr[0][2] & 0xffff
+    self.assertEqual(result, 0xbc00, f"Expected 0xbc00 (f16 -1.0), got 0x{result:04x}")
+
+  def test_v_mul_f16_basic(self):
+    """V_MUL_F16: 2.0 * 3.0 = 6.0."""
+    program = [
+      s_mov_b32(s[0], 0x4000),  # f16 2.0
+      s_mov_b32(s[1], 0x4200),  # f16 3.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mul_f16_e32(v[2], v[0], v[1]),
+    ]
+    state = run_program(program, n_lanes=1)
+    result = state.vgpr[0][2] & 0xffff
+    self.assertEqual(result, 0x4600, f"Expected 0x4600 (f16 6.0), got 0x{result:04x}")
+
+  def test_v_mul_f16_by_zero(self):
+    """V_MUL_F16: 2.0 * 0 = 0.0."""
+    program = [
+      s_mov_b32(s[0], 0x4000),  # f16 2.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0),
+      v_mul_f16_e32(v[2], v[0], v[1]),
+    ]
+    state = run_program(program, n_lanes=1)
+    result = state.vgpr[0][2] & 0xffff
+    self.assertEqual(result, 0x0000, f"Expected 0x0000 (f16 0.0), got 0x{result:04x}")
+
+  def test_v_fmac_f16_basic(self):
+    """V_FMAC_F16 accumulates into its destination: 1.0 + 2.0 * 3.0 = 7.0."""
+    program = [
+      s_mov_b32(s[0], 0x4000),  # f16 2.0
+      s_mov_b32(s[1], 0x4200),  # f16 3.0
+      s_mov_b32(s[2], 0x3c00),  # f16 1.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], s[2]),
+      v_fmac_f16_e32(v[2], v[0], v[1]),
+    ]
+    state = run_program(program, n_lanes=1)
+    result = state.vgpr[0][2] & 0xffff
+    # 2.0 * 3.0 + 1.0 = 7.0, f16 7.0 = 0x4700
+    self.assertEqual(result, 0x4700, f"Expected 0x4700 (f16 7.0), got 0x{result:04x}")
+
+  def test_v_fmaak_f16_basic(self):
+    """V_FMAAK_F16 computes a * b + K: 2.0 * 3.0 + 1.0 = 7.0."""
+    program = [
+      s_mov_b32(s[0], 0x4000),  # f16 2.0
+      s_mov_b32(s[1], 0x4200),  # f16 3.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_fmaak_f16_e32(v[2], v[0], v[1], 0x3c00),  # + f16 1.0
+    ]
+    state = run_program(program, n_lanes=1)
+    result = state.vgpr[0][2] & 0xffff
+    # 2.0 * 3.0 + 1.0 = 7.0, f16 7.0 = 0x4700
+    self.assertEqual(result, 0x4700, f"Expected 0x4700 (f16 7.0), got 0x{result:04x}")
+
+
+class TestHiHalfOps(unittest.TestCase):
+  """Tests for VOP2 16-bit operations with hi-half operands."""
+
+  def test_v_add_f16_src0_hi_fold(self):
+    """V_ADD_F16 where both sources are v0 and opsel bit 0 picks the high half for src0."""
+    program = [
+      s_mov_b32(s[0], 0x40003c00),  # lo=f16(1.0), hi=f16(2.0)
+      v_mov_b32_e32(v[0], s[0]),
+      VOP3(VOP3Op.V_ADD_F16, vdst=v[1], src0=v[0], src1=v[0], opsel=0b0001),
+    ]
+    state = run_program(program, n_lanes=1)
+    result = state.vgpr[0][1] & 0xffff
+    self.assertEqual(result, 0x4200, f"Expected f16(3.0)=0x4200, got 0x{result:04x}")
+
+  def test_v_add_f16_src0_hi_different_reg(self):
+    """V_ADD_F16 adding v0's high half (2.0) to v1's low half (1.0)."""
+    program = [
+      s_mov_b32(s[0], 0x40000000),  # hi=f16(2.0), lo=0
+      s_mov_b32(s[1], 0x00003c00),  # hi=0, lo=f16(1.0)
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      VOP3(VOP3Op.V_ADD_F16, vdst=v[2], src0=v[0], src1=v[1], opsel=0b0001),
+    ]
+    state = run_program(program, n_lanes=1)
+    result = state.vgpr[0][2] & 0xffff
+    self.assertEqual(result, 0x4200, f"Expected f16(3.0)=0x4200, got 0x{result:04x}")
+
+  def test_v_mul_f16_src0_hi(self):
+    """V_MUL_F16 with src0 from high half."""
+    instructions = [
+      s_mov_b32(s[0], 0x40000000),  # hi=f16(2.0), lo=0
+      s_mov_b32(s[1], 0x00004200),  # hi=0, lo=f16(3.0)
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      VOP3(VOP3Op.V_MUL_F16, vdst=v[2], src0=v[0], src1=v[1], opsel=0b0001),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2] & 0xffff
+    self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}")
+
+  def test_v_mul_f16_hi_half(self):
+    """V_MUL_F16 reading from high half."""
+    instructions = [
+      s_mov_b32(s[0], 0x40003c00),  # lo=1.0, hi=2.0
+      v_mov_b32_e32(v[0], s[0]),
+      VOP3(VOP3Op.V_MUL_F16, vdst=v[1], src0=v[0], src1=v[0], opsel=0b0011),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1] & 0xffff
+    self.assertEqual(result, 0x4400, f"Expected f16(4.0)=0x4400, got 0x{result:04x}")
+
+  def test_v_fma_f16_hi_dest(self):
+    """V_FMA_F16 writing to high half with opsel.
+
+    Uses V_FMA_F16 (not V_FMAC_F16) because it has explicit src2 operand
+    which makes opsel handling clearer.
+    """
+    instructions = [
+      s_mov_b32(s[0], 0x3c000000),  # hi=f16(1.0), lo=0
+      s_mov_b32(s[1], 0x4000),      # f16(2.0) in lo
+      s_mov_b32(s[2], 0x4200),      # f16(3.0) in lo
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], s[2]),
+      # V_FMA_F16: dst = src0 * src1 + src2
+      # opsel=0b1100: bit2=src2 hi, bit3=dst hi
+      # So: v[0].hi = v[1].lo * v[2].lo + v[0].hi = 2.0 * 3.0 + 1.0 = 7.0
+      VOP3(VOP3Op.V_FMA_F16, vdst=v[0], src0=v[1], src1=v[2], src2=v[0], opsel=0b1100),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    hi = (st.vgpr[0][0] >> 16) & 0xffff
+    # 2.0 * 3.0 + 1.0 = 7.0, f16 7.0 = 0x4700
+    self.assertEqual(hi, 0x4700, f"Expected f16(7.0)=0x4700 in hi, got 0x{hi:04x}")
+
+  def test_v_add_f16_multilane(self):
+    """V_ADD_F16 with multiple lanes."""
+    instructions = [
+      s_mov_b32(s[0], 0x3c00),  # f16 1.0
+      s_mov_b32(s[1], 0x4000),  # f16 2.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_add_f16_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    for lane in range(4):
+      result = st.vgpr[lane][2] & 0xffff
+      self.assertEqual(result, 0x4200, f"Lane {lane}: expected 0x4200, got 0x{result:04x}")
+
+
+class TestCndmask(unittest.TestCase):
+  """Tests for V_CNDMASK_B16 (VCC-driven select, including opsel hi-half reads/writes)."""
+
+  def test_v_cndmask_b16_select_src0(self):
+    """V_CNDMASK_B16 selects src0 when VCC bit is 0."""
+    instructions = [
+      s_mov_b32(s[SrcEnum.VCC_LO - 128], 0),  # VCC = 0
+      s_mov_b32(s[0], 0x3c00),  # f16 1.0
+      s_mov_b32(s[1], 0x4000),  # f16 2.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_cndmask_b16(v[2], v[0], v[1], VCC),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2] & 0xffff
+    self.assertEqual(result, 0x3c00, f"Expected src0=0x3c00, got 0x{result:04x}")
+
+  def test_v_cndmask_b16_select_src1(self):
+    """V_CNDMASK_B16 selects src1 when VCC bit is 1."""
+    instructions = [
+      s_mov_b32(s[SrcEnum.VCC_LO - 128], 1),  # VCC = 1
+      s_mov_b32(s[0], 0x3c00),  # f16 1.0
+      s_mov_b32(s[1], 0x4000),  # f16 2.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_cndmask_b16(v[2], v[0], v[1], VCC),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2] & 0xffff
+    self.assertEqual(result, 0x4000, f"Expected src1=0x4000, got 0x{result:04x}")
+
+  def test_v_cndmask_b16_write_hi(self):
+    """V_CNDMASK_B16 can write to high 16 bits with opsel."""
+    instructions = [
+      s_mov_b32(s[0], 0x3c003800),  # src0: hi=1.0, lo=0.5
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], 0x4000c000),  # src1: hi=2.0, lo=-2.0
+      v_mov_b32_e32(v[1], s[1]),
+      s_mov_b32(s[2], 0xDEAD0000),  # v2 initial: hi=0xDEAD, lo=0
+      v_mov_b32_e32(v[2], s[2]),
+      s_mov_b32(s[SrcEnum.VCC_LO - 128], 0),  # vcc = 0, select src0
+      # opsel=0b1011: bit0=src0 hi, bit1=src1 hi, bit3=dst hi
+      VOP3(VOP3Op.V_CNDMASK_B16, vdst=v[2], src0=v[0], src1=v[1], src2=SrcEnum.VCC_LO, opsel=0b1011),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    hi = (st.vgpr[0][2] >> 16) & 0xffff
+    lo = st.vgpr[0][2] & 0xffff
+    # vcc=0 selects src0.h = 1.0 = 0x3c00, writes to hi
+    self.assertEqual(hi, 0x3c00, f"Expected hi=0x3c00 (1.0), got 0x{hi:04x}")
+    self.assertEqual(lo, 0x0000, f"Expected lo preserved as 0, got 0x{lo:04x}")
+
+
+class TestSpecialFloatValues(unittest.TestCase):
+  """Tests for special float value handling in VOP2 instructions."""
+
+  def test_neg_zero_add(self):
+    """-0.0 + 0.0 = +0.0 (IEEE 754)."""
+    neg_zero = 0x80000000
+    instructions = [
+      s_mov_b32(s[0], neg_zero),
+      v_mov_b32_e32(v[0], s[0]),
+      v_add_f32_e32(v[1], 0.0, v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0x00000000, "Should be +0.0")
+
+  def test_neg_zero_mul(self):
+    """-0.0 * -1.0 = +0.0."""
+    neg_zero = 0x80000000
+    instructions = [
+      s_mov_b32(s[0], neg_zero),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mul_f32_e32(v[1], -1.0, v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0x00000000, "Should be +0.0")
+
+  def test_inf_minus_inf(self):
+    """inf - (-inf) = inf, while inf + (-inf) = NaN."""
+    import math
+    pos_inf = 0x7f800000
+    neg_inf = 0xff800000
+    instructions = [
+      s_mov_b32(s[0], pos_inf),
+      s_mov_b32(s[1], neg_inf),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_sub_f32_e32(v[2], v[0], v[1]),  # inf - (-inf) = inf
+      v_add_f32_e32(v[3], v[0], v[1]),  # inf + (-inf) = NaN
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], pos_inf, "inf - (-inf) = inf")
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "inf + (-inf) = NaN")
+
+  def test_denormal_f32_mul_ftz(self):
+    """Denormal * normal - RDNA3 flushes denormals to zero (FTZ mode)."""
+    smallest_denorm = 0x00000001  # Smallest positive denormal
+    instructions = [
+      s_mov_b32(s[0], smallest_denorm),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mul_f32_e32(v[1], 2.0, v[0]),  # Denormal input gets flushed to 0
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0x00000000)
+
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/extra/assembly/amd/test/hw/test_vop3.py b/extra/assembly/amd/test/hw/test_vop3.py
new file mode 100644
index 0000000000..932b02e3df
--- /dev/null
+++ b/extra/assembly/amd/test/hw/test_vop3.py
@@ -0,0 +1,2266 @@
+"""Tests for VOP3 instructions - three operand vector operations.
+
+Includes: v_fma_f32, v_div_scale_f32, v_div_fmas_f32, v_div_fixup_f32,
+          v_alignbit_b32, v_bfe_i32, v_mad_u64_u32, v_readlane_b32, v_writelane_b32
+"""
+import unittest
+from extra.assembly.amd.test.hw.helpers import *
+
+class TestFMA(unittest.TestCase):
+  """Tests for FMA instructions."""
+
+  def test_v_fma_f32_basic(self):
+    """V_FMA_F32: a*b+c basic case."""
+    instructions = [
+      v_mov_b32_e32(v[0], 2.0),
+      v_mov_b32_e32(v[1], 4.0),
+      v_mov_b32_e32(v[2], 1.0),
+      v_fma_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 9.0, places=5)
+
+  def test_v_fma_f32_negative(self):
+    """V_FMA_F32 with negative multiplier."""
+    instructions = [
+      v_mov_b32_e32(v[0], -2.0),
+      v_mov_b32_e32(v[1], 4.0),
+      v_mov_b32_e32(v[2], 1.0),
+      v_fma_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), -7.0, places=5)
+
+  def test_v_fma_f32_with_sgpr(self):
+    """V_FMA_F32: using SGPR for non-inline constant."""
+    instructions = [
+      s_mov_b32(s[0], f2i(3.0)),
+      v_mov_b32_e32(v[0], 2.0),
+      v_mov_b32_e32(v[1], s[0]),
+      v_mov_b32_e32(v[2], 4.0),
+      v_fma_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 10.0, places=5)
+
+  def test_v_fma_f32_with_inf(self):
+    """V_FMA_F32: 1.0 * inf + 0 = inf."""
+    import math
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      s_mov_b32(s[0], 0x7f800000),
+      v_mov_b32_e32(v[1], s[0]),
+      v_mov_b32_e32(v[2], 0),
+      v_fma_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][3])
+    self.assertTrue(math.isinf(result) and result > 0)
+
+
+class TestDivScale(unittest.TestCase):
+  """Tests for V_DIV_SCALE_F32."""
+
+  def test_div_scale_f32_vcc_zero_single_lane(self):
+    """V_DIV_SCALE_F32 sets VCC=0 when no scaling needed."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 4.0),
+      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc, 0, "VCC should be 0 when no scaling needed")
+
+  def test_div_scale_f32_vcc_zero_multiple_lanes(self):
+    """V_DIV_SCALE_F32 sets VCC=0 for all lanes when no scaling needed."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 4.0),
+      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    self.assertEqual(st.vcc & 0xf, 0, "VCC should be 0 for all lanes")
+
+  def test_div_scale_f32_preserves_input(self):
+    """V_DIV_SCALE_F32 outputs S0 when no scaling needed."""
+    instructions = [
+      v_mov_b32_e32(v[0], 2.0),
+      v_mov_b32_e32(v[1], 4.0),
+      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 2.0, places=5)
+
+  def test_div_scale_f32_zero_denom_gives_nan(self):
+    """V_DIV_SCALE_F32: zero denominator -> NaN, VCC=1."""
+    import math
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 0.0),
+      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])), "Should be NaN for zero denom")
+    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for zero denom")
+
+  def test_div_scale_f32_zero_numer_gives_nan(self):
+    """V_DIV_SCALE_F32: zero numerator -> NaN, VCC=1."""
+    import math
+    instructions = [
+      v_mov_b32_e32(v[0], 0.0),
+      v_mov_b32_e32(v[1], 1.0),
+      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])), "Should be NaN for zero numer")
+    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for zero numer")
+
+  def test_div_scale_f32_large_exp_diff_scales_denom(self):
+    """V_DIV_SCALE_F32: exp(numer) - exp(denom) >= 96 -> scale denom, VCC=1."""
+    max_float = 0x7f7fffff  # 3.4028235e+38, exp=254
+    instructions = [
+      s_mov_b32(s[0], max_float),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 1.0),
+      v_div_scale_f32(v[2], VCC, v[1], v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 when scaling denom for large exp diff")
+    expected = 1.0 * (2.0 ** 64)
+    self.assertAlmostEqual(i2f(st.vgpr[0][2]), expected, delta=expected * 1e-6)
+
+  def test_div_scale_f32_denorm_denom(self):
+    """V_DIV_SCALE_F32: denormalized denominator -> NaN, VCC=1."""
+    import math
+    denorm = 0x00000001
+    instructions = [
+      s_mov_b32(s[0], denorm),
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], s[0]),
+      v_div_scale_f32(v[2], VCC, v[1], v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])), "Hardware returns NaN for denorm denom")
+    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for denorm denom")
+
+  def test_div_scale_f32_tiny_numer_exp_le_23(self):
+    """V_DIV_SCALE_F32: exponent(numer) <= 23 -> scale by 2^64, VCC=1."""
+    smallest_normal = 0x00800000
+    instructions = [
+      s_mov_b32(s[0], smallest_normal),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 1.0),
+      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    numer_f = i2f(smallest_normal)
+    expected = numer_f * (2.0 ** 64)
+    self.assertAlmostEqual(i2f(st.vgpr[0][2]), expected, delta=abs(expected) * 1e-5)
+    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 when scaling tiny numer")
+
+  def test_div_scale_f32_result_would_be_denorm(self):
+    """V_DIV_SCALE_F32: result would be denorm -> no scaling, VCC=1."""
+    large_denom = 0x7f000000  # 2^127
+    instructions = [
+      s_mov_b32(s[0], large_denom),
+      v_mov_b32_e32(v[0], 1.0),   # numer = 1.0 (S2)
+      v_mov_b32_e32(v[1], s[0]),  # denom = 2^127 (S1)
+      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 1.0, places=5)
+    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 when result would be denorm")
+
+
+class TestDivFmas(unittest.TestCase):
+  """Tests for V_DIV_FMAS_F32."""
+
+  def test_div_fmas_f32_no_scale(self):
+    """V_DIV_FMAS_F32: VCC=0 -> normal FMA."""
+    instructions = [
+      s_mov_b32(s[SrcEnum.VCC_LO - 128], 0),
+      v_mov_b32_e32(v[0], 2.0),
+      v_mov_b32_e32(v[1], 3.0),
+      v_mov_b32_e32(v[2], 1.0),
+      v_div_fmas_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 7.0, places=5)
+
+  def test_div_fmas_f32_scale_up(self):
+    """V_DIV_FMAS_F32: VCC=1 with S2 >= 2.0 -> scale by 2^+64."""
+    instructions = [
+      s_mov_b32(s[SrcEnum.VCC_LO - 128], 1),
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 1.0),
+      v_mov_b32_e32(v[2], 2.0),
+      v_div_fmas_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    expected = 3.0 * (2.0 ** 64)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), expected, delta=abs(expected) * 1e-6)
+
+  def test_div_fmas_f32_scale_down(self):
+    """V_DIV_FMAS_F32: VCC=1 with S2 < 2.0 -> scale by 2^-64."""
+    instructions = [
+      s_mov_b32(s[SrcEnum.VCC_LO - 128], 1),
+      v_mov_b32_e32(v[0], 2.0),
+      v_mov_b32_e32(v[1], 3.0),
+      v_mov_b32_e32(v[2], 1.0),
+      v_div_fmas_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    expected = 7.0 * (2.0 ** -64)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), expected, delta=abs(expected) * 1e-6)
+
+  def test_div_fmas_f32_per_lane_vcc(self):
+    """V_DIV_FMAS_F32: different VCC per lane with S2 < 2.0."""
+    instructions = [
+      s_mov_b32(s[SrcEnum.VCC_LO - 128], 0b0101),
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 1.0),
+      v_mov_b32_e32(v[2], 1.0),
+      v_div_fmas_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    scaled = 2.0 * (2.0 ** -64)
+    unscaled = 2.0
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), scaled, delta=abs(scaled) * 1e-6)
+    self.assertAlmostEqual(i2f(st.vgpr[1][3]), unscaled, places=5)
+    self.assertAlmostEqual(i2f(st.vgpr[2][3]), scaled, delta=abs(scaled) * 1e-6)
+    self.assertAlmostEqual(i2f(st.vgpr[3][3]), unscaled, places=5)
+
+
+class TestDivFixup(unittest.TestCase):
+  """Tests for V_DIV_FIXUP_F32."""
+
+  def test_div_fixup_f32_normal(self):
+    """V_DIV_FIXUP_F32: normal division passes through quotient."""
+    instructions = [
+      v_mov_b32_e32(v[0], 3.0),
+      v_mov_b32_e32(v[1], 2.0),
+      v_mov_b32_e32(v[2], 6.0),
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 3.0, places=5)
+
+  def test_div_fixup_f32_zero_div_zero(self):
+    """V_DIV_FIXUP_F32: 0/0 -> NaN."""
+    import math
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 0.0),
+      v_mov_b32_e32(v[2], 0.0),
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "0/0 should be NaN")
+
+  def test_div_fixup_f32_x_div_zero(self):
+    """V_DIV_FIXUP_F32: x/0 -> +/-inf based on sign (+1.0/+0.0 -> +inf)."""
+    pos_inf = 0x7f800000
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),  # S0 = quotient estimate
+      v_mov_b32_e32(v[1], 0.0),  # S1 = denominator = +0.0
+      v_mov_b32_e32(v[2], 1.0),  # S2 = numerator = +1.0
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][3], pos_inf, "+1.0/+0.0 should be +inf")
+
+  def test_div_fixup_f32_one_div_inf(self):
+    """V_DIV_FIXUP_F32: 1.0 / +inf = 0."""
+    instructions = [
+      s_mov_b32(s[0], 0),           # approximation (rcp of inf = 0)
+      s_mov_b32(s[1], 0x7f800000),  # denominator = +inf
+      s_mov_b32(s[2], f2i(1.0)),    # numerator = 1.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(i2f(st.vgpr[0][1]), 0.0)
+
+  def test_div_fixup_f32_inf_div_inf(self):
+    """V_DIV_FIXUP_F32: inf / inf = NaN."""
+    import math
+    instructions = [
+      s_mov_b32(s[0], 0),           # approximation
+      s_mov_b32(s[1], 0x7f800000),  # denominator = +inf
+      s_mov_b32(s[2], 0x7f800000),  # numerator = +inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][1])))
+
+  def test_div_fixup_f32_nan_numer(self):
+    """V_DIV_FIXUP_F32: NaN numerator -> quiet NaN."""
+    import math
+    nan = 0x7fc00000
+    instructions = [
+      s_mov_b32(s[0], nan),
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 1.0),
+      v_mov_b32_e32(v[2], s[0]),
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][3])))
+
+  def test_div_fixup_f32_nan_denom(self):
+    """V_DIV_FIXUP_F32: NaN denominator -> quiet NaN."""
+    import math
+    nan = 0x7fc00000
+    instructions = [
+      s_mov_b32(s[0], nan),
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], s[0]),
+      v_mov_b32_e32(v[2], 1.0),
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][3])))
+
+  def test_div_fixup_f32_neg_x_div_zero(self):
+    """V_DIV_FIXUP_F32: -x/0 -> -inf."""
+    import math
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 0.0),
+      v_mov_b32_e32(v[2], -1.0),
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(st.vgpr[0][3])))
+    self.assertLess(i2f(st.vgpr[0][3]), 0, "-1/0 should be -inf")
+
+  def test_div_fixup_f32_zero_div_x(self):
+    """V_DIV_FIXUP_F32: 0/x -> 0."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 2.0),
+      v_mov_b32_e32(v[2], 0.0),
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(i2f(st.vgpr[0][3]), 0.0)
+
+  def test_div_fixup_f32_x_div_inf(self):
+    """V_DIV_FIXUP_F32: x/inf -> 0."""
+    pos_inf = 0x7f800000
+    instructions = [
+      s_mov_b32(s[0], pos_inf),
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], s[0]),
+      v_mov_b32_e32(v[2], 1.0),
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(i2f(st.vgpr[0][3]), 0.0)
+
+  def test_div_fixup_f32_inf_div_x(self):
+    """V_DIV_FIXUP_F32: inf/x -> inf (+inf/+1.0 -> +inf)."""
+    # Numerator and denominator are both positive, so only +inf is acceptable (not -inf).
+    pos_inf = 0x7f800000
+    instructions = [
+      s_mov_b32(s[0], pos_inf),
+      v_mov_b32_e32(v[0], 1.0),   # S0 = quotient estimate
+      v_mov_b32_e32(v[1], 1.0),   # S1 = denominator = 1.0
+      v_mov_b32_e32(v[2], s[0]),  # S2 = numerator = +inf
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][3], pos_inf, "+inf/+1.0 should be +inf")
+
+  def test_div_fixup_f32_sign_propagation(self):
+    """V_DIV_FIXUP_F32: sign is XOR of numer and denom signs."""
+    instructions = [
+      v_mov_b32_e32(v[0], 3.0),
+      v_mov_b32_e32(v[1], -2.0),
+      v_mov_b32_e32(v[2], 6.0),
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), -3.0, places=5)
+
+  def test_div_fixup_f32_neg_neg(self):
+    """V_DIV_FIXUP_F32: neg/neg -> positive."""
+    instructions = [
+      v_mov_b32_e32(v[0], 3.0),
+      v_mov_b32_e32(v[1], -2.0),
+      v_mov_b32_e32(v[2], -6.0),
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 3.0, places=5)
+
+  def test_div_fixup_f32_nan_estimate_overflow(self):
+    """V_DIV_FIXUP_F32: NaN estimate returns overflow (inf)."""
+    import math
+    quiet_nan = 0x7fc00000
+    instructions = [
+      s_mov_b32(s[0], quiet_nan),
+      v_mov_b32_e32(v[0], s[0]),  # S0 = NaN (failed estimate)
+      v_mov_b32_e32(v[1], 1.0),   # S1 = denominator = 1.0
+      v_mov_b32_e32(v[2], 1.0),   # S2 = numerator = 1.0
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "NaN estimate should return inf")
+    self.assertEqual(st.vgpr[0][3], 0x7f800000, "Should be +inf (pos/pos)")
+
+  def test_div_fixup_f32_nan_estimate_sign(self):
+    """V_DIV_FIXUP_F32: NaN estimate with negative sign returns -inf."""
+    import math
+    quiet_nan = 0x7fc00000
+    instructions = [
+      s_mov_b32(s[0], quiet_nan),
+      v_mov_b32_e32(v[0], s[0]),  # S0 = NaN (failed estimate)
+      v_mov_b32_e32(v[1], -1.0),  # S1 = denominator = -1.0
+      v_mov_b32_e32(v[2], 1.0),   # S2 = numerator = 1.0
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "NaN estimate should return inf")
+    self.assertEqual(st.vgpr[0][3], 0xff800000, "Should be -inf (pos/neg)")
+
+  def test_v_div_fixup_f32_one_div_neg_inf(self):
+    """V_DIV_FIXUP_F32: 1/-inf = -0."""
+    neg_inf = 0xff800000
+    instructions = [
+      v_mov_b32_e32(v[0], 0.0),   # estimate (doesn't matter, will be overridden)
+      s_mov_b32(s[0], neg_inf),
+      v_mov_b32_e32(v[1], s[0]),  # denom = -inf
+      v_mov_b32_e32(v[2], 1.0),   # numer = 1.0
+      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][3], 0x80000000, "1/-inf should be -0")
+
+
+class TestAlignbit(unittest.TestCase):
+  """Tests for V_ALIGNBIT_B32."""
+
+  def test_v_alignbit_b32(self):
+    """V_ALIGNBIT_B32 extracts bits from concatenated sources."""
+    instructions = [
+      s_mov_b32(s[0], 0x12),
+      s_mov_b32(s[1], 0x34),
+      s_mov_b32(s[2], 4),
+      v_mov_b32_e32(v[0], s[2]),
+      v_alignbit_b32(v[1], s[0], s[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    expected = ((0x12 << 32) | 0x34) >> 4
+    self.assertEqual(st.vgpr[0][1], expected & 0xffffffff)
+
+
+class TestBfe(unittest.TestCase):
+  """Tests for V_BFE_I32."""
+
+  def test_v_bfe_i32_sign_extend(self):
+    """V_BFE_I32 sign extends based on MSB of extracted field."""
+    instructions = [
+      s_mov_b32(s[0], 0x0000007F),  # 0x7F = 0b1111111
+      v_mov_b32_e32(v[0], s[0]),
+      v_bfe_i32(v[1], v[0], 0, 7),  # Extract 7 bits from offset 0
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # 0x7F in 7 bits has bit 6 = 1 (the sign bit in 7-bit signed)
+    # So it represents -1 in 7-bit signed, sign-extended to 32 bits = 0xFFFFFFFF
+    self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF)
+
+  def test_v_bfe_i32_sign_extend_negative(self):
+    """V_BFE_I32 sign extends negative."""
+    instructions = [
+      s_mov_b32(s[0], 0x000000FF),  # -1 in 8 bits
+      v_mov_b32_e32(v[0], s[0]),
+      v_bfe_i32(v[1], v[0], 0, 8),  # Extract 8 bits from offset 0
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # 0xFF in 8 bits is -1, sign-extended to 32 bits = 0xFFFFFFFF
+    self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF)
+
+
+class TestMad64(unittest.TestCase):
+  """Tests for V_MAD_U64_U32."""
+
+  def test_v_mad_u64_u32_simple(self):
+    """V_MAD_U64_U32: D = S0 * S1 + S2 (64-bit result)."""
+    instructions = [
+      s_mov_b32(s[0], 3),
+      s_mov_b32(s[1], 4),
+      v_mov_b32_e32(v[2], 5),
+      v_mov_b32_e32(v[3], 0),
+      v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result_lo = st.vgpr[0][4]
+    result_hi = st.vgpr[0][5]
+    result = result_lo | (result_hi << 32)
+    self.assertEqual(result, 17)
+
+  def test_v_mad_u64_u32_large_mult(self):
+    """V_MAD_U64_U32 with large values that overflow 32 bits."""
+    instructions = [
+      s_mov_b32(s[0], 0x80000000),
+      s_mov_b32(s[1], 2),
+      v_mov_b32_e32(v[2], 0),
+      v_mov_b32_e32(v[3], 0),
+      v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result_lo = st.vgpr[0][4]
+    result_hi = st.vgpr[0][5]
+    result = result_lo | (result_hi << 32)
+    self.assertEqual(result, 0x100000000)
+
+
+class TestLaneOps(unittest.TestCase):
+  """Tests for lane operations (readlane, writelane)."""
+
+  def _readlane(self, sdst_idx, vsrc, lane_idx):
+    return VOP3(VOP3Op.V_READLANE_B32, vdst=RawImm(sdst_idx), src0=vsrc, src1=lane_idx)
+
+  def test_v_readlane_b32_basic(self):
+    """V_READLANE_B32 reads a value from a specific lane's VGPR."""
+    instructions = [
+      v_lshlrev_b32_e32(v[0], 1, v[255]),
+      v_lshlrev_b32_e32(v[1], 3, v[255]),
+      v_add_nc_u32_e32(v[0], v[0], v[1]),
+      self._readlane(0, v[0], 2),
+      v_mov_b32_e32(v[2], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    for lane in range(4):
+      self.assertEqual(st.vgpr[lane][2], 20)
+
+  def test_v_readlane_b32_lane_0(self):
+    """V_READLANE_B32 reading from lane 0."""
+    instructions = [
+      v_lshlrev_b32_e32(v[0], 2, v[255]),  # v0 = lane_id * 4
+      v_add_nc_u32_e32(v[0], 100, v[0]),   # v0 = 100 + lane_id * 4
+      self._readlane(0, v[0], 0),          # s0 = lane 0's v0 = 100
+      v_mov_b32_e32(v[1], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    for lane in range(4):
+      self.assertEqual(st.vgpr[lane][1], 100)
+
+  def test_v_readlane_b32_last_lane(self):
+    """V_READLANE_B32 reading from the last active lane (lane 3)."""
+    instructions = [
+      v_lshlrev_b32_e32(v[0], 2, v[255]),  # v0 = lane_id * 4
+      v_add_nc_u32_e32(v[0], 100, v[0]),   # v0 = 100 + lane_id * 4
+      self._readlane(0, v[0], 3),          # s0 = lane 3's v0 = 112
+      v_mov_b32_e32(v[1], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    for lane in range(4):
+      self.assertEqual(st.vgpr[lane][1], 112)
+
+  def test_v_readlane_b32_different_vgpr(self):
+    """V_READLANE_B32 reading from different VGPR indices."""
+    instructions = [
+      v_lshlrev_b32_e32(v[5], 3, v[255]),  # v5 = lane_id * 8
+      v_add_nc_u32_e32(v[5], 50, v[5]),    # v5 = 50 + lane_id * 8
+      self._readlane(0, v[5], 1),          # s0 = lane 1's v5 = 58
+      v_mov_b32_e32(v[6], s[0]),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    for lane in range(4):
+      self.assertEqual(st.vgpr[lane][6], 58)
+
+  def test_v_writelane_b32_basic(self):
+    """V_WRITELANE_B32 writes a scalar to a specific lane's VGPR."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),
+      s_mov_b32(s[0], 999),
+      v_writelane_b32(v[0], s[0], 2),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    for lane in range(4):
+      if lane == 2:
+        self.assertEqual(st.vgpr[lane][0], 999)
+      else:
+        self.assertEqual(st.vgpr[lane][0], 0)
+
+  def test_v_writelane_then_readlane(self):
+    """V_WRITELANE followed by V_READLANE to verify round-trip."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),
+      s_mov_b32(s[0], 0xdeadbeef),
+      v_writelane_b32(v[0], s[0], 1),      # Write to lane 1
+      self._readlane(1, v[0], 1),          # Read back from lane 1 into s1
+      v_mov_b32_e32(v[1], s[1]),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    for lane in range(4):
+      self.assertEqual(st.vgpr[lane][1], 0xdeadbeef)
+
+  def test_v_readlane_for_reduction(self):
+    """Simulate a wave reduction using readlane - common WMMA/reduction pattern."""
+    instructions = [
+      v_add_nc_u32_e32(v[0], 1, v[255]),   # v0 = lane_id + 1 (1, 2, 3, 4)
+      self._readlane(0, v[0], 0),          # s0 = 1
+      self._readlane(1, v[0], 1),          # s1 = 2
+      s_add_u32(s[0], s[0], s[1]),         # s0 = 3
+      self._readlane(1, v[0], 2),          # s1 = 3
+      s_add_u32(s[0], s[0], s[1]),         # s0 = 6
+      self._readlane(1, v[0], 3),          # s1 = 4
+      s_add_u32(s[0], s[0], s[1]),         # s0 = 10
+      v_mov_b32_e32(v[1], s[0]),           # Broadcast sum to all lanes
+    ]
+    st = run_program(instructions, n_lanes=4)
+    for lane in range(4):
+      self.assertEqual(st.vgpr[lane][1], 10, "Sum 1+2+3+4 should be 10")
+
+  def test_v_writelane_b32_different_vgpr(self):
+    """V_WRITELANE_B32 writes to a non-zero VGPR index.
+
+    Regression test for bug where vdst_idx was always 0 due to function signature
+    mismatch (_vars parameter shifted all arguments). This caused all WRITELANE
+    operations to write to v[0] regardless of the actual destination register.
+    """
+    instructions = [
+      v_mov_b32_e32(v[0], 0),              # Initialize v0 = 0
+      v_mov_b32_e32(v[5], 0),              # Initialize v5 = 0
+      s_mov_b32(s[0], 0x12345678),         # Value to write
+      v_writelane_b32(v[5], s[0], 1),      # Write to lane 1's v5 (NOT v0!)
+    ]
+    st = run_program(instructions, n_lanes=4)
+    # v[0] should remain 0 for all lanes (bug would have written here)
+    for lane in range(4):
+      self.assertEqual(st.vgpr[lane][0], 0, f"v[0] lane {lane} should be 0 (untouched)")
+    # v[5] should have the value only in lane 1
+    for lane in range(4):
+      if lane == 1:
+        self.assertEqual(st.vgpr[lane][5], 0x12345678, "v[5] lane 1 should have 0x12345678")
+      else:
+        self.assertEqual(st.vgpr[lane][5], 0, f"v[5] lane {lane} should be 0")
+
+  def test_v_writelane_b32_high_vgpr_index(self):
+    """V_WRITELANE_B32 writes to a high VGPR index (v[15]).
+
+    Tests that the vdst_idx is correctly passed through for larger register indices.
+    """
+    instructions = [
+      v_mov_b32_e32(v[0], 0),              # Initialize v0 = 0
+      v_mov_b32_e32(v[15], 0),             # Initialize v15 = 0
+      s_mov_b32(s[0], 0xCAFEBABE),         # Value to write
+      v_writelane_b32(v[15], s[0], 0),     # Write to lane 0's v15
+    ]
+    st = run_program(instructions, n_lanes=4)
+    # v[0] should remain 0 for all lanes
+    for lane in range(4):
+      self.assertEqual(st.vgpr[lane][0], 0, f"v[0] lane {lane} should be 0")
+    # v[15] should have the value only in lane 0
+    self.assertEqual(st.vgpr[0][15], 0xCAFEBABE, "v[15] lane 0 should have 0xCAFEBABE")
+    for lane in range(1, 4):
+      self.assertEqual(st.vgpr[lane][15], 0, f"v[15] lane {lane} should be 0")
+
+  def test_v_writelane_b32_multiple_writes_different_vgprs(self):
+    """V_WRITELANE_B32 into three distinct VGPRs, one lane each.
+
+    Mirrors the sparse_categorical_crossentropy pattern: values go into
+    different VGPR indices via writelane and are inspected afterwards.
+    """
+    program = [
+      # Clear v0 plus every destination register
+      v_mov_b32_e32(v[0], 0),
+      v_mov_b32_e32(v[3], 0),
+      v_mov_b32_e32(v[7], 0),
+      v_mov_b32_e32(v[10], 0),
+      # One (value, VGPR, lane) triple per writelane; s0 is reused as the source
+      s_mov_b32(s[0], 100),
+      v_writelane_b32(v[3], s[0], 0),      # 100 -> v3, lane 0
+      s_mov_b32(s[0], 200),
+      v_writelane_b32(v[7], s[0], 1),      # 200 -> v7, lane 1
+      s_mov_b32(s[0], 300),
+      v_writelane_b32(v[10], s[0], 2),     # 300 -> v10, lane 2
+    ]
+    state = run_program(program, n_lanes=4)
+
+    # v[0] is not a destination and must stay 0 in all lanes
+    for lane in range(4):
+      self.assertEqual(state.vgpr[lane][0], 0, f"v[0] lane {lane} should be 0")
+
+    # For each destination: the written lane has the value, the others stay 0
+    self.assertEqual(state.vgpr[0][3], 100, "v[3] lane 0 should be 100")
+    for lane in range(1, 4):
+      self.assertEqual(state.vgpr[lane][3], 0, f"v[3] lane {lane} should be 0")
+
+    self.assertEqual(state.vgpr[1][7], 200, "v[7] lane 1 should be 200")
+    for lane in (0, 2, 3):
+      self.assertEqual(state.vgpr[lane][7], 0, f"v[7] lane {lane} should be 0")
+
+    self.assertEqual(state.vgpr[2][10], 300, "v[10] lane 2 should be 300")
+    for lane in (0, 1, 3):
+      self.assertEqual(state.vgpr[lane][10], 0, f"v[10] lane {lane} should be 0")
+
+  def test_v_writelane_then_readlane_different_vgpr(self):
+    """Round-trip: V_WRITELANE into v[8], then V_READLANE from the same lane.
+
+    Regression test: when writelane wrongly targeted v[0], reading the intended
+    VGPR back yielded 0 rather than the value written. This is the exact
+    pattern that failed in sparse_categorical_crossentropy.
+    """
+    program = [
+      v_mov_b32_e32(v[0], 0),              # v0 cleared
+      v_mov_b32_e32(v[8], 0),              # v8 cleared
+      s_mov_b32(s[0], 0xABCD1234),
+      v_writelane_b32(v[8], s[0], 2),      # s0 -> v8, lane 2
+      self._readlane(1, v[8], 2),          # v8, lane 2 -> s1
+      v_mov_b32_e32(v[1], s[1]),           # copy s1 into v1 of every lane
+    ]
+    state = run_program(program, n_lanes=4)
+    # Every lane of v[1] must carry the value that went through v[8]
+    for lane in range(4):
+      self.assertEqual(state.vgpr[lane][1], 0xABCD1234,
+                       f"Lane {lane}: readlane should return 0xABCD1234, got 0x{state.vgpr[lane][1]:08x}")
+    # v[0] must be unchanged (the bug redirected the write here)
+    for lane in range(4):
+      self.assertEqual(state.vgpr[lane][0], 0, f"v[0] lane {lane} should be 0 (untouched)")
+
+  def test_v_writelane_b32_accumulate_pattern(self):
+    """V_WRITELANE_B32 filling successive lanes of one VGPR, then summing them.
+
+    Reduction-style usage: each step deposits its value in a different lane
+    of v[6]; the lanes are then read back with readlane and added up.
+    """
+    program = [
+      v_mov_b32_e32(v[6], 0),              # accumulator v6 cleared
+      # One writelane per "iteration", each to its own lane
+      s_mov_b32(s[0], 10),
+      v_writelane_b32(v[6], s[0], 0),      # 10 -> lane 0
+      s_mov_b32(s[0], 20),
+      v_writelane_b32(v[6], s[0], 1),      # 20 -> lane 1
+      s_mov_b32(s[0], 30),
+      v_writelane_b32(v[6], s[0], 2),      # 30 -> lane 2
+      s_mov_b32(s[0], 40),
+      v_writelane_b32(v[6], s[0], 3),      # 40 -> lane 3
+      # Read every lane back and keep a running total in s0
+      self._readlane(0, v[6], 0),          # s0 = 10
+      self._readlane(1, v[6], 1),          # s1 = 20
+      s_add_u32(s[0], s[0], s[1]),         # s0 = 10 + 20 = 30
+      self._readlane(1, v[6], 2),          # s1 = 30
+      s_add_u32(s[0], s[0], s[1]),         # s0 = 30 + 30 = 60
+      self._readlane(1, v[6], 3),          # s1 = 40
+      s_add_u32(s[0], s[0], s[1]),         # s0 = 60 + 40 = 100
+      v_mov_b32_e32(v[7], s[0]),           # total copied to v7 of every lane
+    ]
+    state = run_program(program, n_lanes=4)
+
+    # Lane i of v[6] must hold the i-th written value;
+    # the failure message names the lane and the value expected there
+    per_lane = (10, 20, 30, 40)
+    for lane, want in enumerate(per_lane):
+      self.assertEqual(state.vgpr[lane][6], want, f"v[6] lane {lane} should be {want}")
+
+    # All lanes of v[7] must see the total
+    for lane in range(4):
+      self.assertEqual(state.vgpr[lane][7], 100, f"Sum should be 100, got {state.vgpr[lane][7]}")
+
+
+class TestF16Modifiers(unittest.TestCase):
+  """F16 arithmetic exercised with inline float constants and abs/neg source modifiers."""
+
+  def test_v_fma_f16_inline_const_1_0(self):
+    """V_FMA_F16 computing a*b + 1.0, with 1.0 passed as an inline constant."""
+    from extra.assembly.amd.pcode import f32_to_f16, _f16
+    a_bits = f32_to_f16(0.325928)  # ~0x3537
+    b_bits = f32_to_f16(-0.486572)  # ~0xb7c9
+    program = [
+      s_mov_b32(s[0], a_bits),
+      v_mov_b32_e32(v[4], s[0]),
+      s_mov_b32(s[1], b_bits),
+      v_mov_b32_e32(v[6], s[1]),
+      v_fma_f16(v[4], v[4], v[6], 1.0),  # addend 1.0; v4 is source and destination
+    ]
+    state = run_program(program, n_lanes=1)
+    got = _f16(state.vgpr[0][4] & 0xffff)
+    want = 0.325928 * (-0.486572) + 1.0
+    self.assertAlmostEqual(got, want, delta=0.01)
+
+  def test_v_fma_f16_inline_const_0_5(self):
+    """V_FMA_F16 computing a*b + 0.5, with 0.5 passed as an inline constant."""
+    from extra.assembly.amd.pcode import f32_to_f16, _f16
+    a_bits = f32_to_f16(2.0)
+    b_bits = f32_to_f16(3.0)
+    program = [
+      s_mov_b32(s[0], a_bits),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], b_bits),
+      v_mov_b32_e32(v[1], s[1]),
+      v_fma_f16(v[2], v[0], v[1], 0.5),  # addend 0.5
+    ]
+    state = run_program(program, n_lanes=1)
+    got = _f16(state.vgpr[0][2] & 0xffff)
+    want = 2.0 * 3.0 + 0.5
+    self.assertAlmostEqual(got, want, delta=0.01)
+
+  def test_v_fma_f16_inline_const_neg_1_0(self):
+    """V_FMA_F16 computing a*b + (-1.0), with -1.0 passed as an inline constant."""
+    from extra.assembly.amd.pcode import f32_to_f16, _f16
+    a_bits = f32_to_f16(2.0)
+    b_bits = f32_to_f16(3.0)
+    program = [
+      s_mov_b32(s[0], a_bits),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], b_bits),
+      v_mov_b32_e32(v[1], s[1]),
+      v_fma_f16(v[2], v[0], v[1], -1.0),  # addend -1.0
+    ]
+    state = run_program(program, n_lanes=1)
+    got = _f16(state.vgpr[0][2] & 0xffff)
+    want = 2.0 * 3.0 + (-1.0)
+    self.assertAlmostEqual(got, want, delta=0.01)
+
+  def test_v_add_f16_abs_both(self):
+    """V_ADD_F16 where abs() is applied to each of the two operands."""
+    from extra.assembly.amd.pcode import f32_to_f16, _f16
+    neg2_bits = f32_to_f16(-2.0)
+    neg3_bits = f32_to_f16(-3.0)
+    program = [
+      s_mov_b32(s[0], neg2_bits),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], neg3_bits),
+      v_mov_b32_e32(v[1], s[1]),
+      v_add_f16_e64(v[2], abs(v[0]), abs(v[1])),  # |-2| + |-3| = 5
+    ]
+    state = run_program(program, n_lanes=1)
+    got = _f16(state.vgpr[0][2] & 0xffff)
+    self.assertAlmostEqual(got, 5.0, delta=0.01)
+
+  def test_v_mul_f16_neg_abs(self):
+    """V_MUL_F16 where the first operand is negated and the second wrapped in abs()."""
+    from extra.assembly.amd.pcode import f32_to_f16, _f16
+    two_bits = f32_to_f16(2.0)
+    neg3_bits = f32_to_f16(-3.0)
+    program = [
+      s_mov_b32(s[0], two_bits),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], neg3_bits),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mul_f16_e64(v[2], -v[0], abs(v[1])),  # -(2) * |-3| = -6
+    ]
+    state = run_program(program, n_lanes=1)
+    got = _f16(state.vgpr[0][2] & 0xffff)
+    self.assertAlmostEqual(got, -6.0, delta=0.01)
+
+  def test_v_fmac_f16_hi_dest(self):
+    """V_FMAC_F16 targeting the high half: dst.h = src0 * src1 + dst.h.
+
+    Reproduces the AMD_LLVM sin(0) case, in which V_FMAC_F16 writes v0.h.
+    """
+    from extra.assembly.amd.pcode import _f16
+    program = [
+      s_mov_b32(s[0], 0x38003c00),  # v0 = {hi=0.5, lo=1.0}
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], 0x38000000),  # v1 = {hi=0.5, lo=0.0}
+      v_mov_b32_e32(v[1], s[1]),
+      # v_fmac_f16 v0.h, literal(0.318...), v1.l: D.h = D.h + S0 * S1 = 0.5 + 0.318 * 0.0 = 0.5
+      VOP2(VOP2Op.V_FMAC_F16, vdst=RawImm(128), src0=RawImm(255), vsrc1=RawImm(1), literal=0x3518),
+    ]
+    state = run_program(program, n_lanes=1)
+    packed = state.vgpr[0][0]
+    result_hi = _f16((packed >> 16) & 0xffff)
+    result_lo = _f16(packed & 0xffff)
+    self.assertAlmostEqual(result_hi, 0.5, delta=0.01, msg=f"Expected hi=0.5, got {result_hi}")
+    self.assertAlmostEqual(result_lo, 1.0, delta=0.01, msg=f"Expected lo=1.0, got {result_lo}")
+
+
+class TestF16FmaMix(unittest.TestCase):
+  """Tests for V_FMA_MIX_F32 (this class covers the all-f32 source case)."""
+
+  def test_v_fma_mix_f32_all_f32(self):
+    """V_FMA_MIX_F32 with every opsel field zero: 2.0 * 3.0 + 1.0 = 7.0."""
+    program = [
+      s_mov_b32(s[0], f2i(2.0)),
+      v_mov_b32_e32(v[0], s[0]),  # v0 = 2.0
+      s_mov_b32(s[1], f2i(3.0)),
+      v_mov_b32_e32(v[1], s[1]),  # v1 = 3.0
+      s_mov_b32(s[2], f2i(1.0)),
+      v_mov_b32_e32(v[2], s[2]),  # v2 = 1.0
+      VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0),
+    ]
+    state = run_program(program, n_lanes=1)
+    got = i2f(state.vgpr[0][3])
+    self.assertAlmostEqual(got, 7.0, places=5)
+
+
+class TestF64Ops(unittest.TestCase):
+  """Tests for 64-bit float operations (each f64 is split across two 32-bit registers)."""
+
+  def test_v_add_f64_inline_constant(self):
+    """V_ADD_F64 with inline constant POS_ONE (1.0) as f64."""
+    one_f64 = f2i64(1.0)
+    instructions = [
+      s_mov_b32(s[0], one_f64 & 0xffffffff),  # low dword of 1.0
+      s_mov_b32(s[1], one_f64 >> 32),         # high dword of 1.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_add_f64(v[2:4], v[0:2], SrcEnum.POS_ONE),  # 1.0 + 1.0 = 2.0
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i642f(st.vgpr[0][2] | (st.vgpr[0][3] << 32))  # reassemble v2 (low) and v3 (high)
+    self.assertAlmostEqual(result, 2.0, places=5)
+
+  def test_v_mul_f64_basic(self):
+    """V_MUL_F64: 2.0 * 3.0 = 6.0."""
+    two_f64 = f2i64(2.0)
+    three_f64 = f2i64(3.0)
+    instructions = [
+      s_mov_b32(s[0], two_f64 & 0xffffffff),
+      s_mov_b32(s[1], two_f64 >> 32),
+      s_mov_b32(s[2], three_f64 & 0xffffffff),
+      s_mov_b32(s[3], three_f64 >> 32),
+      v_mov_b32_e32(v[0], s[0]),  # v[0:2] = 2.0
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], s[2]),  # v[2:4] = 3.0
+      v_mov_b32_e32(v[3], s[3]),
+      v_mul_f64(v[4:6], v[0:2], v[2:4]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32))
+    self.assertAlmostEqual(result, 6.0, places=10)
+
+  def test_v_cvt_i32_f64_writes_32bit_only(self):
+    """V_CVT_I32_F64 should only write 32 bits, not clobber vdst+1."""
+    val_bits = f2i64(-1.0)
+    instructions = [
+      s_mov_b32(s[0], val_bits & 0xffffffff),
+      s_mov_b32(s[1], val_bits >> 32),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      s_mov_b32(s[2], 0xDEADBEEF),
+      v_mov_b32_e32(v[3], s[2]),     # Canary in v3 (the register right after vdst)
+      v_cvt_i32_f64_e32(v[2], v[0:2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0xffffffff, "-1.0 converts to -1")
+    self.assertEqual(st.vgpr[0][3], 0xDEADBEEF, "v3 canary should not be clobbered")
+
+  def test_v_ldexp_f64_negative_exponent(self):
+    """V_LDEXP_F64 with negative exponent (-32)."""
+    val = -8.0
+    val_bits = f2i64(val)
+    expected = val * (2.0 ** -32)
+    instructions = [
+      s_mov_b32(s[0], val_bits & 0xffffffff),
+      s_mov_b32(s[1], val_bits >> 32),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_ldexp_f64(v[2:4], v[0:2], 0xffffffe0),  # -32 as a 32-bit two's-complement pattern
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i642f(st.vgpr[0][2] | (st.vgpr[0][3] << 32))
+    self.assertAlmostEqual(result, expected, places=15)
+
+  def test_v_frexp_mant_f64_range(self):
+    """V_FREXP_MANT_F64 should return mantissa in [0.5, 1.0) range (2.0 -> 0.5, exponent 2)."""
+    two_f64 = f2i64(2.0)
+    instructions = [
+      s_mov_b32(s[0], two_f64 & 0xffffffff),
+      s_mov_b32(s[1], two_f64 >> 32),
+      v_frexp_mant_f64_e32(v[0:2], s[0:2]),
+      v_frexp_exp_i32_f64_e32(v[2], s[0:2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    mant = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
+    exp = st.vgpr[0][2]
+    if exp >= 0x80000000: exp -= 0x100000000  # sign extend
+    self.assertAlmostEqual(mant, 0.5, places=10)
+    self.assertEqual(exp, 2)
+
+  def test_v_div_scale_f64_reads_64bit_sources(self):
+    """V_DIV_SCALE_F64 must read all sources as 64-bit values."""
+    import math
+    sqrt2_f64 = f2i64(1.4142135623730951)
+    one_f64 = f2i64(1.0)
+    instructions = [
+      s_mov_b32(s[0], sqrt2_f64 & 0xffffffff),
+      s_mov_b32(s[1], sqrt2_f64 >> 32),
+      v_mov_b32_e32(v[0], s[0]),  # v[0:2] = sqrt(2)
+      v_mov_b32_e32(v[1], s[1]),
+      s_mov_b32(s[2], one_f64 & 0xffffffff),
+      s_mov_b32(s[3], one_f64 >> 32),
+      v_mov_b32_e32(v[2], s[2]),  # v[2:4] = 1.0
+      v_mov_b32_e32(v[3], s[3]),
+      VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4], sdst=s[10], src0=v[0], src1=v[0], src2=v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32))
+    self.assertFalse(math.isnan(result), "Result should not be NaN")
+    self.assertAlmostEqual(result, 1.4142135623730951, places=10)
+
+  def test_f64_to_i64_conversion_sequence(self):
+    """Full f64->i64 conversion sequence with negative value (-8.0 must yield -8)."""
+    import struct
+    val = f2i64(-8.0)
+    neg_2pow32 = f2i64(-4294967296.0)  # -2^32 as f64 bits; its high dword is 0xC1F00000
+    instructions = [
+      s_mov_b32(s[0], val & 0xffffffff),
+      s_mov_b32(s[1], (val >> 32) & 0xffffffff),
+      v_trunc_f64_e32(v[0:2], s[0:2]),
+      v_ldexp_f64(v[2:4], v[0:2], 0xffffffe0),  # -32
+      v_floor_f64_e32(v[2:4], v[2:4]),
+      s_mov_b32(s[2], neg_2pow32 & 0xffffffff),
+      s_mov_b32(s[3], neg_2pow32 >> 32),
+      v_fma_f64(v[0:2], s[2:4], v[2:4], v[0:2]),  # v[0:2] = -2^32 * v[2:4] + v[0:2]
+      v_cvt_u32_f64_e32(v[4], v[0:2]),  # v4 is used as the low dword of the result
+      v_cvt_i32_f64_e32(v[5], v[2:4]),  # v5 is used as the high dword of the result
+    ]
+    st = run_program(instructions, n_lanes=1)
+    lo = st.vgpr[0][4]
+    hi = st.vgpr[0][5]
+    result = struct.unpack('<q', struct.pack('<II', lo, hi))[0]  # two dwords -> signed 64-bit
+    self.assertEqual(result, -8)
+
+  def test_v_trig_preop_f64_index0(self):
+    """V_TRIG_PREOP_F64 index=0: primary chunk of 2/PI."""
+    import math
+    two_over_pi = 2.0 / math.pi
+    instructions = [
+      s_mov_b32(s[0], 0x00000000),  # low bits of 1.0
+      s_mov_b32(s[1], 0x3ff00000),  # high bits of 1.0
+      v_trig_preop_f64(v[0], abs(s[0]), 0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
+    self.assertAlmostEqual(result, two_over_pi, places=10)
+
+  def test_v_trig_preop_f64_sum_equals_two_over_pi(self):
+    """V_TRIG_PREOP_F64: sum of chunks 0,1,2 should equal 2/PI."""
+    import math
+    two_over_pi = 2.0 / math.pi
+    instructions = [
+      s_mov_b32(s[0], 0x00000000),  # low bits of 1.0
+      s_mov_b32(s[1], 0x3ff00000),  # high bits of 1.0
+      v_trig_preop_f64(v[0], abs(s[0]), 0),  # chunk 0 -> v0/v1
+      v_trig_preop_f64(v[2], abs(s[0]), 1),  # chunk 1 -> v2/v3
+      v_trig_preop_f64(v[4], abs(s[0]), 2),  # chunk 2 -> v4/v5
+    ]
+    st = run_program(instructions, n_lanes=1)
+    p0 = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
+    p1 = i642f(st.vgpr[0][2] | (st.vgpr[0][3] << 32))
+    p2 = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32))
+    total = p0 + p1 + p2
+    self.assertAlmostEqual(total, two_over_pi, places=14)
+
+
+class TestMad64More(unittest.TestCase):
+  """Additional V_MAD_U64_U32 cases: a 64-bit addend and maximal 32-bit factors."""
+
+  def test_v_mad_u64_u32_with_add(self):
+    """V_MAD_U64_U32 adding a 64-bit value: 1000 * 1000 + 0x100000000."""
+    want = 1000 * 1000 + 0x100000000
+    program = [
+      s_mov_b32(s[0], 1000),
+      s_mov_b32(s[1], 1000),
+      v_mov_b32_e32(v[2], 0),  # addend, low dword
+      v_mov_b32_e32(v[3], 1),  # addend, high dword -> addend = 0x100000000
+      v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]),
+    ]
+    state = run_program(program, n_lanes=1)
+    lo = state.vgpr[0][4]
+    hi = state.vgpr[0][5]
+    got = lo | (hi << 32)
+    self.assertEqual(got, want)
+
+  def test_v_mad_u64_u32_max_values(self):
+    """V_MAD_U64_U32 multiplying 0xFFFFFFFF by 0xFFFFFFFF with a zero addend."""
+    want = 0xFFFFFFFF * 0xFFFFFFFF
+    program = [
+      s_mov_b32(s[0], 0xFFFFFFFF),
+      s_mov_b32(s[1], 0xFFFFFFFF),
+      v_mov_b32_e32(v[2], 0),
+      v_mov_b32_e32(v[3], 0),
+      v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]),
+    ]
+    state = run_program(program, n_lanes=1)
+    lo = state.vgpr[0][4]
+    hi = state.vgpr[0][5]
+    got = lo | (hi << 32)
+    self.assertEqual(got, want)
+
+
+class TestPermMore(unittest.TestCase):
+  """Additional V_PERM_B32 cases; v2 carries the four byte selectors in each test."""
+
+  def test_v_perm_b32_select_high_bytes(self):
+    """V_PERM_B32: selectors 4-7 pick the bytes of the first source (loaded from s0)."""
+    program = [
+      s_mov_b32(s[0], 0x03020100),
+      s_mov_b32(s[1], 0x07060504),
+      s_mov_b32(s[2], 0x04050607),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], s[2]),
+      v_perm_b32(v[3], v[0], v[1], v[2]),
+    ]
+    state = run_program(program, n_lanes=1)
+    # Expected output is the first source with its byte order reversed
+    self.assertEqual(state.vgpr[0][3], 0x00010203)
+
+  def test_v_perm_b32_constant_values(self):
+    """V_PERM_B32: selector 12 yields constant 0x00, selectors >= 13 yield 0xFF."""
+    program = [
+      s_mov_b32(s[0], 0x12345678),
+      s_mov_b32(s[1], 0xABCDEF01),
+      s_mov_b32(s[2], 0x0C0D0E0F),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], s[2]),
+      v_perm_b32(v[3], v[0], v[1], v[2]),
+    ]
+    state = run_program(program, n_lanes=1)
+    # Top byte (selector 0x0C) is 0x00; the other three bytes are 0xFF
+    self.assertEqual(state.vgpr[0][3], 0x00FFFFFF)
+
+  def test_v_perm_b32_sign_extend(self):
+    """V_PERM_B32: selectors 8-11, the sign-extension selectors."""
+    program = [
+      s_mov_b32(s[0], 0x00008000),
+      s_mov_b32(s[1], 0x80000080),
+      s_mov_b32(s[2], 0x08090A0B),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], s[2]),
+      v_perm_b32(v[3], v[0], v[1], v[2]),
+    ]
+    state = run_program(program, n_lanes=1)
+    # Each output byte is either 0x00 or 0xFF
+    self.assertEqual(state.vgpr[0][3], 0x00FFFF00)
+
+
+class TestF64LiteralOps(unittest.TestCase):
+  """64-bit float operations whose operands include a 32-bit literal encoding."""
+
+  def test_v_fma_f64_literal_neg_2pow32(self):
+    """V_FMA_F64 whose first source is the literal encoding of -2^32."""
+    neg41_bits = f2i64(-41.0)
+    neg1_bits = f2i64(-1.0)
+    literal = 0xC1F00000  # high 32 bits of f64 -2^32
+    program = [
+      s_mov_b32(s[0], neg41_bits & 0xffffffff),
+      s_mov_b32(s[1], (neg41_bits >> 32) & 0xffffffff),
+      v_mov_b32_e32(v[0], s[0]),  # v0/v1 = -41.0
+      v_mov_b32_e32(v[1], s[1]),
+      s_mov_b32(s[2], neg1_bits & 0xffffffff),
+      s_mov_b32(s[3], (neg1_bits >> 32) & 0xffffffff),
+      v_mov_b32_e32(v[2], s[2]),  # v2/v3 = -1.0
+      v_mov_b32_e32(v[3], s[3]),
+      VOP3(VOP3Op.V_FMA_F64, vdst=v[4], src0=RawImm(255), src1=v[2], src2=v[0], literal=literal),
+    ]
+    state = run_program(program, n_lanes=1)
+    got = i642f(state.vgpr[0][4] | (state.vgpr[0][5] << 32))
+    want = 4294967255.0  # 2^32 - 41
+    self.assertAlmostEqual(got, want, places=0)
+
+  def test_v_ldexp_f64_literal_neg32(self):
+    """V_LDEXP_F64 scaling -41.0 by an exponent operand of -32."""
+    want = -41.0 * (2.0 ** -32)
+    bits = f2i64(-41.0)
+    program = [
+      s_mov_b32(s[0], bits & 0xffffffff),
+      s_mov_b32(s[1], (bits >> 32) & 0xffffffff),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_ldexp_f64(v[2:4], v[0:2], 0xFFFFFFE0),  # -32
+    ]
+    state = run_program(program, n_lanes=1)
+    got = i642f(state.vgpr[0][2] | (state.vgpr[0][3] << 32))
+    self.assertAlmostEqual(got, want, places=15)
+
+
+class TestF64ToI64Conversion(unittest.TestCase):
+  """The multi-instruction f64 -> i64 conversion sequence, checked on several inputs."""
+
+  def _convert_f64_to_i64(self, val_f64):
+    """Return the instruction list converting val_f64; v4/v5 are read back as low/high dwords."""
+    bits = f2i64(val_f64)
+    literal = 0xC1F00000
+    # The FMA takes its first source from the literal field (src0=RawImm(255))
+    return [
+      s_mov_b32(s[0], bits & 0xffffffff),
+      s_mov_b32(s[1], (bits >> 32) & 0xffffffff),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_trunc_f64_e32(v[0:2], v[0:2]),
+      v_ldexp_f64(v[2:4], v[0:2], 0xFFFFFFE0),
+      v_floor_f64_e32(v[2:4], v[2:4]),
+      VOP3(VOP3Op.V_FMA_F64, vdst=v[0], src0=RawImm(255), src1=v[2], src2=v[0], literal=literal),
+      v_cvt_u32_f64_e32(v[4], v[0:2]),
+      v_cvt_i32_f64_e32(v[5], v[2:4]),
+    ]
+
+  def test_f64_to_i64_full_sequence(self):
+    """Conversion of a small negative value: -41.0 -> -41."""
+    import struct
+    program = self._convert_f64_to_i64(-41.0)
+    state = run_program(program, n_lanes=1)
+    # v4 = low dword, v5 = high dword; reinterpret the pair as one signed 64-bit integer
+    raw = struct.pack('<II', state.vgpr[0][4], state.vgpr[0][5])
+    (got,) = struct.unpack('<q', raw)
+    self.assertEqual(got, -41)
+
+  def test_f64_to_i64_large_negative(self):
+    """Conversion of a larger negative value: -1000000.0 -> -1000000."""
+    import struct
+    program = self._convert_f64_to_i64(-1000000.0)
+    state = run_program(program, n_lanes=1)
+    # v4 = low dword, v5 = high dword; reinterpret the pair as one signed 64-bit integer
+    raw = struct.pack('<II', state.vgpr[0][4], state.vgpr[0][5])
+    (got,) = struct.unpack('<q', raw)
+    self.assertEqual(got, -1000000)
+
+  def test_f64_to_i64_positive(self):
+    """Conversion of a positive value: 1000000.0 -> 1000000."""
+    import struct
+    program = self._convert_f64_to_i64(1000000.0)
+    state = run_program(program, n_lanes=1)
+    # v4 = low dword, v5 = high dword; reinterpret the pair as one signed 64-bit integer
+    raw = struct.pack('<II', state.vgpr[0][4], state.vgpr[0][5])
+    (got,) = struct.unpack('<q', raw)
+    self.assertEqual(got, 1000000)
+
+  def test_f64_to_i64_large_positive(self):
+    """Conversion of a value above 2^32: 5000000000.0 -> 5000000000."""
+    import struct
+    program = self._convert_f64_to_i64(5000000000.0)
+    state = run_program(program, n_lanes=1)
+    # v4 = low dword, v5 = high dword; reinterpret the pair as one signed 64-bit integer
+    raw = struct.pack('<II', state.vgpr[0][4], state.vgpr[0][5])
+    (got,) = struct.unpack('<q', raw)
+    self.assertEqual(got, 5000000000)
+
+
+class TestWMMAMore(unittest.TestCase):
+  """Additional WMMA coverage."""
+
+  def test_v_wmma_f32_16x16x16_f16_basic(self):
+    """V_WMMA_F32_16X16X16_F16 smoke test: at least one lane of v0 must end up non-zero."""
+    # s0 packs two f16 halves of 0x3c00 each
+    program = [s_mov_b32(s[0], 0x3c003c00)]
+    # v16..v31 all receive s0 (the instruction is given v16 and v24 as sources)
+    program.extend(v_mov_b32_e32(v[i], s[0]) for i in range(16, 32))
+    # v0..v7 are cleared; v0 is passed as both destination and third source
+    program.extend(v_mov_b32_e32(v[i], 0) for i in range(8))
+    program.append(v_wmma_f32_16x16x16_f16(v[0], v[16], v[24], v[0]))
+    state = run_program(program, n_lanes=32)
+    saw_nonzero = any(state.vgpr[lane][0] != 0 for lane in range(32))
+    self.assertTrue(saw_nonzero, "WMMA should produce non-zero output")
+
+
+class TestSinReduction(unittest.TestCase):
+  """Individual steps of the sin argument reduction, plus the combined sequence, for x = 1e5."""
+
+  def test_sin_reduction_step1_mul(self):
+    """Step 1: v1 = x * 1/(2*pi), computed with V_MUL_F32."""
+    import math
+    x = 100000.0
+    inv_two_pi = 1.0 / (2.0 * math.pi)
+    program = [
+      s_mov_b32(s[0], f2i(x)),
+      s_mov_b32(s[1], f2i(inv_two_pi)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mul_f32_e32(v[1], s[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    got = i2f(state.vgpr[0][1])
+    want = x * inv_two_pi
+    self.assertAlmostEqual(got, want, places=0)
+
+  def test_sin_reduction_step2_round(self):
+    """Step 2: V_RNDNE_F32 rounds the scaled value to the nearest integer."""
+    import math
+    x = 100000.0
+    inv_two_pi = 1.0 / (2.0 * math.pi)
+    scaled = x * inv_two_pi  # ~15915.49
+    program = [
+      s_mov_b32(s[0], f2i(scaled)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_rndne_f32_e32(v[1], v[0]),
+    ]
+    state = run_program(program, n_lanes=1)
+    got = i2f(state.vgpr[0][1])
+    want = round(scaled)
+    self.assertAlmostEqual(got, want, places=0)
+
+  def test_sin_reduction_step3_fma(self):
+    """Step 3: fma(-pi/2, n, x), i.e. x - n * (pi/2), for the fixed n = 15915."""
+    import math
+    x = 100000.0
+    n = 15915.0
+    neg_half_pi = -math.pi / 2.0
+    program = [
+      s_mov_b32(s[0], f2i(neg_half_pi)),
+      s_mov_b32(s[1], f2i(n)),
+      s_mov_b32(s[2], f2i(x)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], s[2]),
+      v_fma_f32(v[3], v[0], v[1], v[2]),
+    ]
+    state = run_program(program, n_lanes=1)
+    got = i2f(state.vgpr[0][3])
+    want = x + neg_half_pi * n
+    self.assertAlmostEqual(got, want, places=2)
+
+  def test_sin_1e5_full_reduction(self):
+    """All reduction steps chained for sin(1e5); checks the product, the rounding and the quadrant."""
+    import math
+    x = 100000.0
+    inv_two_pi = 1.0 / (2.0 * math.pi)
+    neg_half_pi = -math.pi / 2.0
+
+    program = [
+      s_mov_b32(s[0], f2i(x)),
+      s_mov_b32(s[1], f2i(inv_two_pi)),
+      s_mov_b32(s[2], f2i(neg_half_pi)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mul_f32_e32(v[1], s[1], v[0]),     # v1 = x * 1/(2*pi)
+      v_rndne_f32_e32(v[2], v[1]),         # v2 = rounded v1
+      v_fma_f32(v[3], s[2], v[2], v[0]),   # v3 = fma(-pi/2, v2, x); not asserted below
+      v_cvt_i32_f32_e32(v[4], v[2]),       # v4 = v2 as an integer
+      v_and_b32_e32(v[5], 3, v[4]),        # v5 = v4 & 3
+    ]
+    state = run_program(program, n_lanes=1)
+
+    got_mul = i2f(state.vgpr[0][1])
+    got_round = i2f(state.vgpr[0][2])
+    got_quadrant = state.vgpr[0][5]
+
+    want_mul = x * inv_two_pi
+    want_round = round(want_mul)
+    want_quadrant = int(want_round) & 3
+
+    self.assertAlmostEqual(got_mul, want_mul, places=0)
+    self.assertAlmostEqual(got_round, want_round, places=0)
+    self.assertEqual(got_quadrant, want_quadrant)
+
+
+class TestTrigPreop(unittest.TestCase):
+  """V_TRIG_PREOP_F64, which supplies chunks of 2/PI used in argument reduction."""
+
+  def test_trig_preop_f64_index0(self):
+    """Index 0 with input 1.0 returns the leading chunk, which approximates 2/PI."""
+    import math
+    want = 2.0 / math.pi
+    program = [
+      s_mov_b32(s[0], 0x00000000),  # 1.0 as f64, low dword
+      s_mov_b32(s[1], 0x3ff00000),  # 1.0 as f64, high dword
+      v_trig_preop_f64(v[0], abs(s[0]), 0),
+    ]
+    state = run_program(program, n_lanes=1)
+    got = i642f(state.vgpr[0][0] | (state.vgpr[0][1] << 32))
+    self.assertAlmostEqual(got, want, places=10)
+
+  def test_trig_preop_f64_index1(self):
+    """Index 1 returns the second chunk: non-zero, with magnitude below 1e-15."""
+    program = [
+      s_mov_b32(s[0], 0x00000000),
+      s_mov_b32(s[1], 0x3ff00000),
+      v_trig_preop_f64(v[0], abs(s[0]), 1),
+    ]
+    state = run_program(program, n_lanes=1)
+    magnitude = abs(i642f(state.vgpr[0][0] | (state.vgpr[0][1] << 32)))
+    self.assertLess(magnitude, 1e-15)
+    self.assertGreater(magnitude, 0)
+
+  def test_trig_preop_f64_index2(self):
+    """Index 2 returns the third chunk, with magnitude below 1e-30."""
+    program = [
+      s_mov_b32(s[0], 0x00000000),
+      s_mov_b32(s[1], 0x3ff00000),
+      v_trig_preop_f64(v[0], abs(s[0]), 2),
+    ]
+    state = run_program(program, n_lanes=1)
+    got = i642f(state.vgpr[0][0] | (state.vgpr[0][1] << 32))
+    self.assertLess(abs(got), 1e-30)
+
+  def test_trig_preop_f64_sum_equals_two_over_pi(self):
+    """Chunks 0, 1 and 2 added together reproduce 2/PI to 14 places."""
+    import math
+    want = 2.0 / math.pi
+    program = [
+      s_mov_b32(s[0], 0x00000000),
+      s_mov_b32(s[1], 0x3ff00000),
+      v_trig_preop_f64(v[0], abs(s[0]), 0),
+      v_trig_preop_f64(v[2], abs(s[0]), 1),
+      v_trig_preop_f64(v[4], abs(s[0]), 2),
+    ]
+    state = run_program(program, n_lanes=1)
+    # Chunk k is read from the register pair starting at v[2*k];
+    # the chunks are added in index order
+    chunks = [i642f(state.vgpr[0][r] | (state.vgpr[0][r + 1] << 32)) for r in (0, 2, 4)]
+    total = chunks[0] + chunks[1] + chunks[2]
+    self.assertAlmostEqual(total, want, places=14)
+
+  def test_trig_preop_f64_large_input(self):
+    """With input 2**60 the result must be finite (neither NaN nor infinity)."""
+    import math
+    big = 2.0 ** 60
+    big_bits = f2i64(big)
+    program = [
+      s_mov_b32(s[0], big_bits & 0xffffffff),
+      s_mov_b32(s[1], (big_bits >> 32) & 0xffffffff),
+      v_trig_preop_f64(v[0], abs(s[0]), 0),
+    ]
+    state = run_program(program, n_lanes=1)
+    got = i642f(state.vgpr[0][0] | (state.vgpr[0][1] << 32))
+    self.assertFalse(math.isnan(got))
+    self.assertFalse(math.isinf(got))
+
+
+class TestModifierInteractions(unittest.TestCase):
+  """How the abs/neg/clamp/omod modifiers behave alone and in combination."""
+
+  def test_neg_abs_combination(self):
+    """-|x| for x = -5.0 is -5.0 (NOTE(review): same as x, so dropping both modifiers also passes)."""
+    program = [
+      v_mov_b32_e32(v[0], -5.0),
+      VOP3(VOP3Op.V_MUL_F32, vdst=v[1], src0=1.0, src1=v[0], neg=0b10, abs_=0b10),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), -5.0, places=5)
+
+  def test_abs_neg_on_neg_zero(self):
+    """Signed zero: |(-0.0)| must be +0.0 and -|(-0.0)| must be -0.0, compared bit-for-bit."""
+    minus_zero = 0x80000000
+    program = [
+      s_mov_b32(s[0], minus_zero),
+      v_mov_b32_e32(v[0], s[0]),
+      VOP3(VOP3Op.V_MUL_F32, vdst=v[1], src0=1.0, src1=v[0], abs_=0b10),
+      VOP3(VOP3Op.V_MUL_F32, vdst=v[2], src0=1.0, src1=v[0], neg=0b10, abs_=0b10),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(state.vgpr[0][1], 0x00000000, "|(-0.0)| = +0.0")
+    self.assertEqual(state.vgpr[0][2], 0x80000000, "-|(-0.0)| = -0.0")
+
+  def test_clamp_with_nan(self):
+    """A NaN input with clamp=1 must still come out as NaN."""
+    import math
+    qnan = 0x7fc00000
+    program = [
+      s_mov_b32(s[0], qnan),
+      v_mov_b32_e32(v[0], s[0]),
+      VOP3(VOP3Op.V_ADD_F32, vdst=v[1], src0=v[0], src1=0.0, clamp=1),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(state.vgpr[0][1])))
+
+  def test_omod_ignored(self):
+    """OMOD is ignored on RDNA3 hardware: with omod=1, 3.0 + 1.0 still gives 4.0."""
+    program = [
+      v_mov_b32_e32(v[0], 3.0),
+      VOP3(VOP3Op.V_ADD_F32, vdst=v[1], src0=v[0], src1=1.0, omod=1),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertAlmostEqual(i2f(state.vgpr[0][1]), 4.0, places=5)
+
+  def test_nan_propagation(self):
+    """A NaN operand must propagate through V_FMA_F32."""
+    import math
+    qnan = 0x7fc00000
+    program = [
+      s_mov_b32(s[0], qnan),
+      v_mov_b32_e32(v[0], s[0]),
+      v_fma_f32(v[1], v[0], 1.0, 0.0),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(state.vgpr[0][1])), "fma(NaN, 1, 0) = NaN")
+
+
+class TestBitfieldEdges(unittest.TestCase):
+  """Edge cases of the bitfield extract (BFE) and insert (BFI) operations."""
+
+  def test_bfe_u32_max_width(self):
+    """V_BFE_U32 with offset 0 and width 31, the largest value a 5-bit width field holds."""
+    program = [
+      s_mov_b32(s[0], 0xDEADBEEF),
+      v_mov_b32_e32(v[0], s[0]),
+      v_bfe_u32(v[1], v[0], 0, 31),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(state.vgpr[0][1], 0x5EADBEEF)
+
+  def test_bfe_u32_zero_width(self):
+    """V_BFE_U32 with width 0 must give 0, here at offset 16 of an all-ones input."""
+    program = [
+      s_mov_b32(s[0], 0xFFFFFFFF),
+      v_mov_b32_e32(v[0], s[0]),
+      v_bfe_u32(v[1], v[0], 16, 0),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(state.vgpr[0][1], 0)
+
+  def test_bfe_i32_sign_extend(self):
+    """V_BFE_I32 sign-extends: the 4-bit field 0xF at offset 4 becomes 0xFFFFFFFF."""
+    program = [
+      s_mov_b32(s[0], 0x000000F0),
+      v_mov_b32_e32(v[0], s[0]),
+      v_bfe_i32(v[1], v[0], 4, 4),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(state.vgpr[0][1], 0xFFFFFFFF)
+
+  def test_bfi_b32_basic(self):
+    """V_BFI_B32: result has v1's bits where mask v0 is set and v2's bits elsewhere."""
+    program = [
+      s_mov_b32(s[0], 0x0000FFFF),
+      s_mov_b32(s[1], 0xAAAAAAAA),
+      s_mov_b32(s[2], 0x55555555),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], s[2]),
+      v_bfi_b32(v[3], v[0], v[1], v[2]),
+    ]
+    state = run_program(program, n_lanes=1)
+    self.assertEqual(state.vgpr[0][3], 0x5555AAAA)
+
+
+class TestCarryBorrow(unittest.TestCase):
+  """Tests for carry/borrow operations (VOP3SD)."""
+
+  def test_add_co_u32_no_carry(self):
+    """V_ADD_CO_U32 without carry."""
+    instructions = [
+      v_mov_b32_e32(v[0], 100),
+      v_mov_b32_e32(v[1], 50),
+      v_add_co_u32(v[2], VCC, v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 150)
+    self.assertEqual(st.vcc & 1, 0, "No carry")
+
+  def test_add_co_u32_with_carry(self):
+    """V_ADD_CO_U32 with carry."""
+    instructions = [
+      s_mov_b32(s[0], 0xFFFFFFFF),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 2),
+      v_add_co_u32(v[2], VCC, v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 1)
+    self.assertEqual(st.vcc & 1, 1, "Should have carry")
+
+  def test_sub_co_u32_no_borrow(self):
+    """V_SUB_CO_U32 without borrow."""
+    instructions = [
+      v_mov_b32_e32(v[0], 100),
+      v_mov_b32_e32(v[1], 50),
+      v_sub_co_u32(v[2], VCC, v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 50)
+    self.assertEqual(st.vcc & 1, 0, "No borrow")
+
+  def test_sub_co_u32_with_borrow(self):
+    """V_SUB_CO_U32 with borrow."""
+    instructions = [
+      v_mov_b32_e32(v[0], 50),
+      v_mov_b32_e32(v[1], 100),
+      v_sub_co_u32(v[2], VCC, v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0xFFFFFFCE)
+    self.assertEqual(st.vcc & 1, 1, "Should have borrow")
+
+  def test_addc_co_u32_chain(self):
+    """V_ADD_CO_CI_U32 chained addition (64-bit add via two 32-bit adds)."""
+    instructions = [
+      s_mov_b32(s[0], 0xFFFFFFFF),
+      s_mov_b32(s[1], 0x00000001),
+      s_mov_b32(s[2], 0x00000001),
+      s_mov_b32(s[3], 0x00000001),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], s[2]),
+      v_mov_b32_e32(v[3], s[3]),
+      v_add_co_u32(v[4], VCC, v[0], v[2]),
+      v_add_co_ci_u32_e32(v[5], VCC, v[1], v[3]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0x00000000, "lo result")
+    self.assertEqual(st.vgpr[0][5], 0x00000003, "hi result")
+
+
+class TestReadlane(unittest.TestCase):
+  """Tests for V_READLANE_B32 and related cross-lane operations."""
+
+  def test_lane_id_distinct(self):
+    """Each lane should have distinct lane_id in v255."""
+    instructions = [
+      v_mov_b32_e32(v[0], v[255]),
+    ]
+    st = run_program(instructions, n_lanes=32)
+    for lane in range(32):
+      self.assertEqual(st.vgpr[lane][0], lane)
+
+  def test_reduction_pattern(self):
+    """Test reduction using readlane."""
+    def _readlane(sdst_idx, vsrc, lane_idx):
+      return VOP3(VOP3Op.V_READLANE_B32, vdst=RawImm(sdst_idx), src0=vsrc, src1=lane_idx)
+
+    instructions = [
+      v_mov_b32_e32(v[0], v[255]),
+      _readlane(0, v[0], 0),
+      _readlane(1, v[0], 1),
+      _readlane(2, v[0], 2),
+      _readlane(3, v[0], 3),
+      s_add_u32(s[4], s[0], s[1]),
+      s_add_u32(s[4], s[4], s[2]),
+      s_add_u32(s[4], s[4], s[3]),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    self.assertEqual(st.sgpr[4], 6)
+
+
+class TestMed3(unittest.TestCase):
+  """Tests for V_MED3 - median of 3 values."""
+
+  def test_v_med3_f32_basic(self):
+    """V_MED3_F32: median of 1.0, 2.0, 3.0 is 2.0."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 2.0),
+      v_mov_b32_e32(v[2], 3.0),
+      v_med3_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 2.0, places=5)
+
+  def test_v_med3_f32_reversed(self):
+    """V_MED3_F32: median of 3.0, 2.0, 1.0 is still 2.0."""
+    instructions = [
+      v_mov_b32_e32(v[0], 3.0),
+      v_mov_b32_e32(v[1], 2.0),
+      v_mov_b32_e32(v[2], 1.0),
+      v_med3_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 2.0, places=5)
+
+  def test_v_med3_f32_two_equal(self):
+    """V_MED3_F32: median of 1.0, 3.0, 3.0 is 3.0."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 3.0),
+      v_mov_b32_e32(v[2], 3.0),
+      v_med3_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 3.0, places=5)
+
+  def test_v_med3_f32_all_equal(self):
+    """V_MED3_F32: median of 5.0, 5.0, 5.0 is 5.0."""
+    instructions = [
+      v_mov_b32_e32(v[0], 5.0),
+      v_mov_b32_e32(v[1], 5.0),
+      v_mov_b32_e32(v[2], 5.0),
+      v_med3_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 5.0, places=5)
+
+  def test_v_med3_f32_negative(self):
+    """V_MED3_F32: median of -1.0, 0.0, 1.0 is 0.0."""
+    instructions = [
+      v_mov_b32_e32(v[0], -1.0),
+      v_mov_b32_e32(v[1], 0.0),
+      v_mov_b32_e32(v[2], 1.0),
+      v_med3_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 0.0, places=5)
+
+  def test_v_med3_f32_with_nan(self):
+    """V_MED3_F32: NaN handling - returns min of non-NaN values."""
+    import math
+    instructions = [
+      s_mov_b32(s[0], 0x7fc00000),  # NaN
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 1.0),
+      v_mov_b32_e32(v[2], 2.0),
+      v_med3_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][3])
+    # With one NaN, result should be min of non-NaN values
+    self.assertAlmostEqual(result, 1.0, places=5)
+
+  def test_v_med3_i32_basic(self):
+    """V_MED3_I32: median of signed integers."""
+    instructions = [
+      s_mov_b32(s[0], (-5) & 0xFFFFFFFF),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0),
+      v_mov_b32_e32(v[2], 10),
+      v_med3_i32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][3], 0)
+
+  def test_v_med3_i32_all_negative(self):
+    """V_MED3_I32: median of -10, -5, -1 is -5."""
+    instructions = [
+      s_mov_b32(s[0], (-10) & 0xFFFFFFFF),
+      s_mov_b32(s[1], (-5) & 0xFFFFFFFF),
+      s_mov_b32(s[2], (-1) & 0xFFFFFFFF),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], s[2]),
+      v_med3_i32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][3], (-5) & 0xFFFFFFFF)
+
+  def test_v_med3_u32_basic(self):
+    """V_MED3_U32: median of unsigned integers."""
+    instructions = [
+      v_mov_b32_e32(v[0], 100),
+      v_mov_b32_e32(v[1], 200),
+      v_mov_b32_e32(v[2], 150),
+      v_med3_u32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][3], 150)
+
+  def test_v_med3_u32_large(self):
+    """V_MED3_U32: median with large unsigned values."""
+    instructions = [
+      s_mov_b32(s[0], 0xFFFFFFFF),
+      s_mov_b32(s[1], 0x80000000),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], 0),
+      v_med3_u32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][3], 0x80000000)
+
+
+class TestMinMax(unittest.TestCase):
+  """Tests for V_MIN/V_MAX with edge cases including NaN."""
+
+  def test_v_min_f32_basic(self):
+    """V_MIN_F32: min of 1.0 and 2.0 is 1.0."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 2.0),
+      v_min_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 1.0, places=5)
+
+  def test_v_max_f32_basic(self):
+    """V_MAX_F32: max of 1.0 and 2.0 is 2.0."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], 2.0),
+      v_max_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 2.0, places=5)
+
+  def test_v_min_f32_with_nan_first(self):
+    """V_MIN_F32: min(NaN, 1.0) returns 1.0 (IEEE 754-2008)."""
+    instructions = [
+      s_mov_b32(s[0], 0x7fc00000),  # NaN
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 1.0),
+      v_min_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 1.0, places=5)
+
+  def test_v_min_f32_with_nan_second(self):
+    """V_MIN_F32: min(1.0, NaN) returns 1.0."""
+    instructions = [
+      s_mov_b32(s[0], 0x7fc00000),  # NaN
+      v_mov_b32_e32(v[0], 1.0),
+      v_mov_b32_e32(v[1], s[0]),
+      v_min_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 1.0, places=5)
+
+  def test_v_max_f32_with_nan(self):
+    """V_MAX_F32: max(NaN, 1.0) returns 1.0."""
+    instructions = [
+      s_mov_b32(s[0], 0x7fc00000),  # NaN
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 1.0),
+      v_max_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 1.0, places=5)
+
+  def test_v_min_f32_neg_zero(self):
+    """V_MIN_F32: min(+0, -0) should return -0."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),          # +0
+      s_mov_b32(s[0], 0x80000000),     # -0
+      v_mov_b32_e32(v[1], s[0]),
+      v_min_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # -0 < +0 according to IEEE 754 totalOrder
+    self.assertEqual(st.vgpr[0][2], 0x80000000)
+
+  def test_v_max_f32_neg_zero(self):
+    """V_MAX_F32: max(+0, -0) should return +0."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),          # +0
+      s_mov_b32(s[0], 0x80000000),     # -0
+      v_mov_b32_e32(v[1], s[0]),
+      v_max_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0)
+
+  def test_v_min_i32_signed(self):
+    """V_MIN_I32: handles signed comparison correctly."""
+    instructions = [
+      s_mov_b32(s[0], (-5) & 0xFFFFFFFF),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 5),
+      v_min_i32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], (-5) & 0xFFFFFFFF)
+
+  def test_v_max_u32_large(self):
+    """V_MAX_U32: handles large unsigned values."""
+    instructions = [
+      s_mov_b32(s[0], 0xFFFFFFFF),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 100),
+      v_max_u32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0xFFFFFFFF)
+
+
+class TestCeil(unittest.TestCase):
+  """Tests for V_CEIL_F32."""
+
+  def test_v_ceil_f32_positive_frac(self):
+    """V_CEIL_F32: ceil(2.3) = 3.0."""
+    instructions = [
+      s_mov_b32(s[0], f2i(2.3)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_ceil_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 3.0, places=5)
+
+  def test_v_ceil_f32_negative_frac(self):
+    """V_CEIL_F32: ceil(-2.3) = -2.0."""
+    instructions = [
+      s_mov_b32(s[0], f2i(-2.3)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_ceil_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), -2.0, places=5)
+
+  def test_v_ceil_f32_whole(self):
+    """V_CEIL_F32: ceil(5.0) = 5.0."""
+    instructions = [
+      v_mov_b32_e32(v[0], 5.0),
+      v_ceil_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 5.0, places=5)
+
+  def test_v_ceil_f32_zero(self):
+    """V_CEIL_F32: ceil(0.0) = 0.0."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),
+      v_ceil_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(i2f(st.vgpr[0][1]), 0.0)
+
+  def test_v_ceil_f32_neg_zero(self):
+    """V_CEIL_F32: ceil(-0.0) = -0.0."""
+    instructions = [
+      s_mov_b32(s[0], 0x80000000),
+      v_mov_b32_e32(v[0], s[0]),
+      v_ceil_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0x80000000)
+
+  def test_v_ceil_f32_small_positive(self):
+    """V_CEIL_F32: ceil(0.1) = 1.0."""
+    instructions = [
+      s_mov_b32(s[0], f2i(0.1)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_ceil_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 1.0, places=5)
+
+  def test_v_ceil_f32_small_negative(self):
+    """V_CEIL_F32: ceil(-0.1) = -0.0 (compared numerically against 0.0, so the sign of zero is not checked)."""
+    instructions = [
+      s_mov_b32(s[0], f2i(-0.1)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_ceil_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][1])
+    self.assertEqual(result, 0.0)
+
+
+class TestAlignBit(unittest.TestCase):
+  """Tests for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32."""
+
+  def test_v_alignbit_b32_zero_shift(self):
+    """V_ALIGNBIT_B32: shift by 0 returns src1."""
+    instructions = [
+      s_mov_b32(s[0], 0x12345678),
+      s_mov_b32(s[1], 0xAABBCCDD),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], 0),
+      v_alignbit_b32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][3], 0xAABBCCDD)
+
+  def test_v_alignbit_b32_shift_8(self):
+    """V_ALIGNBIT_B32: shift by 8 bits."""
+    instructions = [
+      s_mov_b32(s[0], 0x12345678),
+      s_mov_b32(s[1], 0xAABBCCDD),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], 8),
+      v_alignbit_b32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # (0x12345678 << 24) | (0xAABBCCDD >> 8) = 0x78AABBCC
+    self.assertEqual(st.vgpr[0][3], 0x78AABBCC)
+
+  def test_v_alignbit_b32_shift_16(self):
+    """V_ALIGNBIT_B32: shift by 16 bits."""
+    instructions = [
+      s_mov_b32(s[0], 0x12345678),
+      s_mov_b32(s[1], 0xAABBCCDD),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], 16),
+      v_alignbit_b32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # (0x12345678 << 16) | (0xAABBCCDD >> 16) = 0x5678AABB
+    self.assertEqual(st.vgpr[0][3], 0x5678AABB)
+
+  def test_v_alignbit_b32_shift_32(self):
+    """V_ALIGNBIT_B32: shift by 32 wraps to shift 0 and returns src1."""
+    instructions = [
+      s_mov_b32(s[0], 0x12345678),
+      s_mov_b32(s[1], 0xAABBCCDD),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], 32),
+      v_alignbit_b32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # Hardware only uses low 5 bits of shift, so shift 32 = shift 0
+    self.assertEqual(st.vgpr[0][3], 0xAABBCCDD)
+
+  def test_v_alignbyte_b32_shift_1(self):
+    """V_ALIGNBYTE_B32: shift by 1 byte."""
+    instructions = [
+      s_mov_b32(s[0], 0x12345678),
+      s_mov_b32(s[1], 0xAABBCCDD),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], 1),
+      v_alignbyte_b32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # (0x12345678 << 24) | (0xAABBCCDD >> 8) = 0x78AABBCC
+    self.assertEqual(st.vgpr[0][3], 0x78AABBCC)
+
+  def test_v_alignbyte_b32_shift_3(self):
+    """V_ALIGNBYTE_B32: shift by 3 bytes."""
+    instructions = [
+      s_mov_b32(s[0], 0x12345678),
+      s_mov_b32(s[1], 0xAABBCCDD),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], 3),
+      v_alignbyte_b32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # (0x12345678 << 8) | (0xAABBCCDD >> 24) = 0x345678AA
+    self.assertEqual(st.vgpr[0][3], 0x345678AA)
+
+
+class TestShiftEdgeCases(unittest.TestCase):
+  """Tests for shift operations with edge cases."""
+
+  def test_v_lshlrev_b32_by_0(self):
+    """V_LSHLREV_B32: shift by 0 returns original."""
+    instructions = [
+      s_mov_b32(s[0], 0x12345678),
+      v_mov_b32_e32(v[0], s[0]),
+      v_lshlrev_b32_e32(v[1], 0, v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0x12345678)
+
+  def test_v_lshlrev_b32_by_31(self):
+    """V_LSHLREV_B32: shift by 31 bits."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1),
+      v_lshlrev_b32_e32(v[1], 31, v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0x80000000)
+
+  def test_v_lshlrev_b32_by_32(self):
+    """V_LSHLREV_B32: shift by 32 - only low 5 bits used."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1),
+      v_lshlrev_b32_e32(v[1], 32, v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # 32 & 0x1f = 0, so no shift
+    self.assertEqual(st.vgpr[0][1], 1)
+
+  def test_v_lshrrev_b32_by_32(self):
+    """V_LSHRREV_B32: shift by 32 - only low 5 bits used."""
+    instructions = [
+      s_mov_b32(s[0], 0x80000000),
+      v_mov_b32_e32(v[0], s[0]),
+      v_lshrrev_b32_e32(v[1], 32, v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # 32 & 0x1f = 0, so no shift
+    self.assertEqual(st.vgpr[0][1], 0x80000000)
+
+  def test_v_ashrrev_i32_negative(self):
+    """V_ASHRREV_I32: arithmetic shift preserves sign."""
+    instructions = [
+      s_mov_b32(s[0], 0x80000000),  # -2147483648
+      v_mov_b32_e32(v[0], s[0]),
+      v_ashrrev_i32_e32(v[1], 4, v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # Arithmetic right shift fills with sign bit
+    self.assertEqual(st.vgpr[0][1], 0xF8000000)
+
+  def test_v_ashrrev_i32_by_31(self):
+    """V_ASHRREV_I32: shift by 31 gives all 1s for negative."""
+    instructions = [
+      s_mov_b32(s[0], 0x80000000),
+      v_mov_b32_e32(v[0], s[0]),
+      v_ashrrev_i32_e32(v[1], 31, v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF)
+
+  def test_v_lshrrev_b32_by_31(self):
+    """V_LSHRREV_B32: logical shift by 31 gives 0 or 1."""
+    instructions = [
+      s_mov_b32(s[0], 0x80000000),
+      v_mov_b32_e32(v[0], s[0]),
+      v_lshrrev_b32_e32(v[1], 31, v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 1)
+
+
+class TestMulHiLo(unittest.TestCase):
+  """Tests for V_MUL_HI/V_MUL_LO operations."""
+
+  def test_v_mul_lo_u32_basic(self):
+    """V_MUL_LO_U32: low 32 bits of 32x32 multiply."""
+    instructions = [
+      v_mov_b32_e32(v[0], 100),
+      v_mov_b32_e32(v[1], 200),
+      v_mul_lo_u32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 20000)
+
+  def test_v_mul_lo_u32_overflow(self):
+    """V_MUL_LO_U32: result wraps on overflow."""
+    instructions = [
+      s_mov_b32(s[0], 0x10000),
+      s_mov_b32(s[1], 0x10000),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mul_lo_u32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # 0x10000 * 0x10000 = 0x100000000, low 32 bits = 0
+    self.assertEqual(st.vgpr[0][2], 0)
+
+  def test_v_mul_hi_u32_basic(self):
+    """V_MUL_HI_U32: high 32 bits of 32x32 multiply."""
+    instructions = [
+      s_mov_b32(s[0], 0x10000),
+      s_mov_b32(s[1], 0x10000),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mul_hi_u32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # 0x10000 * 0x10000 = 0x100000000, high 32 bits = 1
+    self.assertEqual(st.vgpr[0][2], 1)
+
+  def test_v_mul_hi_u32_large(self):
+    """V_MUL_HI_U32: large values."""
+    instructions = [
+      s_mov_b32(s[0], 0xFFFFFFFF),
+      s_mov_b32(s[1], 0xFFFFFFFF),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mul_hi_u32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # 0xFFFFFFFF * 0xFFFFFFFF = 0xFFFFFFFE00000001, high = 0xFFFFFFFE
+    self.assertEqual(st.vgpr[0][2], 0xFFFFFFFE)
+
+  def test_v_mul_hi_i32_positive(self):
+    """V_MUL_HI_I32: signed multiply with positive values."""
+    instructions = [
+      s_mov_b32(s[0], 0x10000),
+      s_mov_b32(s[1], 0x10000),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mul_hi_i32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 1)
+
+  def test_v_mul_hi_i32_negative(self):
+    """V_MUL_HI_I32: signed multiply with negative value."""
+    instructions = [
+      s_mov_b32(s[0], (-10000) & 0xFFFFFFFF),
+      s_mov_b32(s[1], 100000),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mul_hi_i32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # -10000 * 100000 = -1000000000, which fits in 32 bits
+    # high 32 bits should be -1 (0xFFFFFFFF) for negative numbers that fit
+    self.assertEqual(st.vgpr[0][2], 0xFFFFFFFF)
+
+  def test_v_mul_hi_i32_both_negative(self):
+    """V_MUL_HI_I32: both values negative."""
+    instructions = [
+      s_mov_b32(s[0], (-0x10000) & 0xFFFFFFFF),
+      s_mov_b32(s[1], (-0x10000) & 0xFFFFFFFF),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mul_hi_i32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # -0x10000 * -0x10000 = 0x100000000, high = 1
+    self.assertEqual(st.vgpr[0][2], 1)
+
+
+class TestMulF32EdgeCases(unittest.TestCase):
+  """Edge cases for V_MUL_F32."""
+
+  def test_v_mul_f32_inf_by_zero(self):
+    """V_MUL_F32: inf * 0 = NaN."""
+    import math
+    instructions = [
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0),
+      v_mul_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])))
+
+  def test_v_mul_f32_inf_by_inf(self):
+    """V_MUL_F32: inf * inf = inf."""
+    import math
+    instructions = [
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[0]),
+      v_mul_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isinf(i2f(st.vgpr[0][2])))
+
+  def test_v_mul_f32_neg_zero_by_pos(self):
+    """V_MUL_F32: -0 * positive = -0."""
+    instructions = [
+      s_mov_b32(s[0], 0x80000000),  # -0.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 1.0),
+      v_mul_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0x80000000)
+
+  def test_v_mul_f32_neg_zero_by_neg(self):
+    """V_MUL_F32: -0 * negative = +0."""
+    instructions = [
+      s_mov_b32(s[0], 0x80000000),  # -0.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], -1.0),
+      v_mul_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0)  # +0
+
+
+class TestAddF32EdgeCases(unittest.TestCase):
+  """Edge cases for V_ADD_F32."""
+
+  def test_v_add_f32_inf_minus_inf(self):
+    """V_ADD_F32: inf + (-inf) = NaN."""
+    import math
+    instructions = [
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      s_mov_b32(s[1], 0xff800000),  # -inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_add_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])))
+
+  def test_v_add_f32_pos_neg_zero(self):
+    """V_ADD_F32: +0 + (-0) = +0."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),
+      s_mov_b32(s[0], 0x80000000),  # -0.0
+      v_mov_b32_e32(v[1], s[0]),
+      v_add_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0)  # +0
+
+  def test_v_add_f32_neg_neg_zero(self):
+    """V_ADD_F32: -0 + (-0) = -0."""
+    instructions = [
+      s_mov_b32(s[0], 0x80000000),  # -0.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[0]),
+      v_add_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0x80000000)  # -0
+
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/extra/assembly/amd/test/hw/test_vop3p.py b/extra/assembly/amd/test/hw/test_vop3p.py
new file mode 100644
index 0000000000..5935b5abc2
--- /dev/null
+++ b/extra/assembly/amd/test/hw/test_vop3p.py
@@ -0,0 +1,538 @@
+"""Tests for VOP3P instructions - packed 16-bit vector operations.
+
+Includes: v_pk_add_f16, v_pk_mul_f16, v_pk_fma_f16, v_pack_b32_f16, v_wmma_*, v_dot2_*
+"""
+import unittest
+from extra.assembly.amd.test.hw.helpers import *
+
+class TestPackInstructions(unittest.TestCase):
+  """Tests for pack instructions."""
+
+  def test_v_pack_b32_f16(self):
+    """V_PACK_B32_F16 packs two f16 values into one 32-bit register."""
+    instructions = [
+      s_mov_b32(s[0], 0x3c00),  # f16 1.0
+      s_mov_b32(s[1], 0x4000),  # f16 2.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_pack_b32_f16(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2]
+    self.assertEqual(result, 0x40003c00, f"Expected 0x40003c00, got 0x{result:08x}")
+
+  def test_v_pack_b32_f16_opsel_hi_hi(self):
+    """V_PACK_B32_F16 with opsel to read high halves."""
+    inst = v_pack_b32_f16(v[2], v[0], v[1])
+    inst._values['opsel'] = 0b0011
+    instructions = [
+      s_mov_b32(s[0], 0x40003c00),  # hi=2.0, lo=1.0
+      s_mov_b32(s[1], 0x44004200),  # hi=4.0, lo=3.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      inst,
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2]
+    self.assertEqual(result, 0x44004000, f"Expected 0x44004000, got 0x{result:08x}")
+
+
+class TestPackMore(unittest.TestCase):
+  """Additional pack instruction tests."""
+
+  def test_v_pack_b32_f16_basic(self):
+    """V_PACK_B32_F16 packs two f16 values."""
+    instructions = [
+      s_mov_b32(s[0], 0x3c00),  # f16 1.0
+      s_mov_b32(s[1], 0x4000),  # f16 2.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_pack_b32_f16(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2]
+    self.assertEqual(result, 0x40003c00, f"Expected 0x40003c00, got 0x{result:08x}")
+
+  def test_v_pack_b32_f16_with_cvt(self):
+    """V_PACK_B32_F16 after V_CVT_F16_F32 conversions."""
+    instructions = [
+      s_mov_b32(s[0], 0x3f800000),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[0]),
+      v_cvt_f16_f32_e32(v[2], v[0]),
+      v_cvt_f16_f32_e32(v[3], v[1]),
+      v_pack_b32_f16(v[4], v[2], v[3]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][4]
+    self.assertEqual(result, 0x3c003c00, f"Expected 0x3c003c00, got 0x{result:08x}")
+
+  def test_v_pack_b32_f16_packed_sources(self):
+    """V_PACK_B32_F16 with packed f16 sources (reads lo halves)."""
+    instructions = [
+      s_mov_b32(s[0], 0x40003c00),  # hi=2.0, lo=1.0
+      s_mov_b32(s[1], 0x44004200),  # hi=4.0, lo=3.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_pack_b32_f16(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2]
+    # Expected: hi=v1.lo=0x4200 (3.0), lo=v0.lo=0x3c00 (1.0) -> 0x42003c00
+    self.assertEqual(result, 0x42003c00, f"Expected 0x42003c00, got 0x{result:08x}")
+
+  def test_v_pack_b32_f16_opsel_lo_hi(self):
+    """V_PACK_B32_F16 with opsel=0b0010 to read lo from src0, hi from src1."""
+    inst = v_pack_b32_f16(v[2], v[0], v[1])
+    inst._values['opsel'] = 0b0010
+    instructions = [
+      s_mov_b32(s[0], 0x40003c00),
+      s_mov_b32(s[1], 0x44004200),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      inst,
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2]
+    self.assertEqual(result, 0x44003c00, f"Expected 0x44003c00, got 0x{result:08x}")
+
+  def test_v_pack_b32_f16_opsel_hi_lo(self):
+    """V_PACK_B32_F16 with opsel=0b0001 to read hi from src0, lo from src1."""
+    inst = v_pack_b32_f16(v[2], v[0], v[1])
+    inst._values['opsel'] = 0b0001
+    instructions = [
+      s_mov_b32(s[0], 0x40003c00),
+      s_mov_b32(s[1], 0x44004200),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      inst,
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2]
+    self.assertEqual(result, 0x42004000, f"Expected 0x42004000, got 0x{result:08x}")
+
+  def test_v_pack_b32_f16_zeros(self):
+    """V_PACK_B32_F16 with zero values."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),
+      v_mov_b32_e32(v[1], 0),
+      v_pack_b32_f16(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0)
+
+  def test_v_pack_b32_f16_both_positive(self):
+    """V_PACK_B32_F16 with positive f16 values."""
+    instructions = [
+      s_mov_b32(s[0], 0x4200),  # f16 3.0
+      s_mov_b32(s[1], 0x4400),  # f16 4.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_pack_b32_f16(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2]
+    self.assertEqual(result, 0x44004200, f"Expected 0x44004200, got 0x{result:08x}")
+
+
+class TestFmaMix(unittest.TestCase):
+  """Tests for V_FMA_MIX_F32 and V_FMA_MIXLO_F16 (VOP3P encoding; every test checks src0 * src1 + src2)."""
+
+  def test_v_fma_mix_f32_all_f32_sources(self):
+    """V_FMA_MIX_F32 with all f32 sources."""
+    instructions = [
+      s_mov_b32(s[0], f2i(2.0)),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], f2i(3.0)),
+      v_mov_b32_e32(v[1], s[1]),
+      s_mov_b32(s[2], f2i(1.0)),
+      v_mov_b32_e32(v[2], s[2]),
+      VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][3])
+    self.assertAlmostEqual(result, 7.0, places=5)  # 2.0 * 3.0 + 1.0
+
+  def test_v_fma_mix_f32_src2_f16_lo(self):
+    """V_FMA_MIX_F32 with src2 as f16 from lo bits."""
+    from extra.assembly.amd.pcode import f32_to_f16
+    f16_2 = f32_to_f16(2.0)
+    instructions = [
+      s_mov_b32(s[0], f2i(1.0)),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], f2i(3.0)),
+      v_mov_b32_e32(v[1], s[1]),
+      s_mov_b32(s[2], f16_2),  # f16 encoding of 2.0, used as-is so it sits in the low bits of the register
+      v_mov_b32_e32(v[2], s[2]),
+      VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=1),  # opsel_hi2=1: src2 is f16
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][3])
+    self.assertAlmostEqual(result, 5.0, places=5)  # 1.0 * 3.0 + 2.0
+
+  def test_v_fma_mix_f32_src2_f16_hi(self):
+    """V_FMA_MIX_F32 with src2 as f16 from hi bits."""
+    from extra.assembly.amd.pcode import f32_to_f16
+    f16_2 = f32_to_f16(2.0)
+    val = (f16_2 << 16) | 0  # f16 2.0 shifted into the hi half, lo half left as zero
+    instructions = [
+      s_mov_b32(s[0], f2i(1.0)),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], f2i(3.0)),
+      v_mov_b32_e32(v[1], s[1]),
+      s_mov_b32(s[2], val),
+      v_mov_b32_e32(v[2], s[2]),
+      VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=4, opsel_hi=0, opsel_hi2=1),  # opsel=4: src2 from hi half
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][3])
+    self.assertAlmostEqual(result, 5.0, places=5)  # 1.0 * 3.0 + 2.0
+
+  def test_v_fma_mix_f32_with_abs(self):
+    """V_FMA_MIX_F32 with abs modifier on src2."""
+    instructions = [
+      s_mov_b32(s[0], f2i(2.0)),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], f2i(3.0)),
+      v_mov_b32_e32(v[1], s[1]),
+      s_mov_b32(s[2], f2i(-1.0)),
+      v_mov_b32_e32(v[2], s[2]),
+      VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0, neg_hi=4),  # neg_hi=4: abs(src2)
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][3])
+    self.assertAlmostEqual(result, 7.0, places=5)  # 2.0 * 3.0 + |-1.0|
+
+  def test_v_fma_mixlo_f16(self):
+    """V_FMA_MIXLO_F16 writes to low 16 bits of destination."""
+    from extra.assembly.amd.pcode import _f16
+    instructions = [
+      s_mov_b32(s[0], f2i(2.0)),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], f2i(3.0)),
+      v_mov_b32_e32(v[1], s[1]),
+      s_mov_b32(s[2], f2i(1.0)),
+      v_mov_b32_e32(v[2], s[2]),
+      s_mov_b32(s[3], 0xdead0000),  # sentinel in the hi half of the destination; checked below to be preserved
+      v_mov_b32_e32(v[3], s[3]),
+      VOP3P(VOP3POp.V_FMA_MIXLO_F16, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    lo = _f16(st.vgpr[0][3] & 0xffff)
+    hi = (st.vgpr[0][3] >> 16) & 0xffff
+    self.assertAlmostEqual(lo, 7.0, places=1)  # 2.0 * 3.0 + 1.0
+    self.assertEqual(hi, 0xdead, f"hi should be preserved, got 0x{hi:04x}")
+
+  def test_v_fma_mixlo_f16_all_f32_sources(self):
+    """V_FMA_MIXLO_F16 with all f32 sources."""
+    from extra.assembly.amd.pcode import _f16
+    instructions = [
+      s_mov_b32(s[0], f2i(1.0)),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], f2i(2.0)),
+      v_mov_b32_e32(v[1], s[1]),
+      s_mov_b32(s[2], f2i(3.0)),
+      v_mov_b32_e32(v[2], s[2]),
+      v_mov_b32_e32(v[3], 0),  # clear the destination before the write
+      VOP3P(VOP3POp.V_FMA_MIXLO_F16, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    lo = _f16(st.vgpr[0][3] & 0xffff)
+    # 1*2+3 = 5
+    self.assertAlmostEqual(lo, 5.0, places=1)
+
+  def test_v_fma_mixlo_f16_sin_case(self):
+    """V_FMA_MIXLO_F16 case from sin kernel."""
+    from extra.assembly.amd.pcode import _f16
+    instructions = [
+      s_mov_b32(s[0], 0x3f800000),  # f32 1.0
+      v_mov_b32_e32(v[3], s[0]),
+      s_mov_b32(s[1], 0xaf05a309),  # f32 tiny negative
+      s_mov_b32(s[6], s[1]),  # src1 of the FMA below is read from this SGPR
+      s_mov_b32(s[2], 0xc0490fdb),  # f32 -π
+      v_mov_b32_e32(v[5], s[2]),
+      s_mov_b32(s[3], 0x3f800000),
+      v_mov_b32_e32(v[3], s[3]),  # NOTE(review): v[3] already holds 1.0 from s[0] above; this second load looks redundant
+      VOP3P(VOP3POp.V_FMA_MIXLO_F16, vdst=v[3], src0=v[3], src1=s[6], src2=v[5], opsel=0, opsel_hi=0, opsel_hi2=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    lo = _f16(st.vgpr[0][3] & 0xffff)
+    self.assertAlmostEqual(lo, -3.14159, delta=0.01)  # 1.0 * tiny + (-π) ≈ -π
+
+
+class TestVOP3P(unittest.TestCase):
+  """Tests for VOP3P packed 16-bit operations (two f16 values per 32-bit register: lo = bits 15:0, hi = bits 31:16)."""
+
+  def test_v_pk_add_f16_basic(self):
+    """V_PK_ADD_F16 adds two packed f16 values."""
+    from extra.assembly.amd.pcode import _f16
+    instructions = [
+      s_mov_b32(s[0], 0x40003c00),  # hi=2.0, lo=1.0
+      s_mov_b32(s[1], 0x44004200),  # hi=4.0, lo=3.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_pk_add_f16(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2]
+    lo = _f16(result & 0xffff)
+    hi = _f16((result >> 16) & 0xffff)
+    self.assertAlmostEqual(lo, 4.0, places=2)  # 1.0 + 3.0
+    self.assertAlmostEqual(hi, 6.0, places=2)  # 2.0 + 4.0
+
+  def test_v_pk_mul_f16_basic(self):
+    """V_PK_MUL_F16 multiplies two packed f16 values."""
+    from extra.assembly.amd.pcode import _f16
+    instructions = [
+      s_mov_b32(s[0], 0x42004000),  # hi=3.0, lo=2.0
+      s_mov_b32(s[1], 0x45004400),  # hi=5.0, lo=4.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_pk_mul_f16(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2]
+    lo = _f16(result & 0xffff)
+    hi = _f16((result >> 16) & 0xffff)
+    self.assertAlmostEqual(lo, 8.0, places=1)   # 2.0 * 4.0
+    self.assertAlmostEqual(hi, 15.0, places=1)  # 3.0 * 5.0
+
+  def test_v_pk_fma_f16_basic(self):
+    """V_PK_FMA_F16: D = A * B + C for packed f16."""
+    from extra.assembly.amd.pcode import _f16
+    instructions = [
+      s_mov_b32(s[0], 0x42004000),  # A: hi=3.0, lo=2.0
+      s_mov_b32(s[1], 0x45004400),  # B: hi=5.0, lo=4.0
+      s_mov_b32(s[2], 0x3c003c00),  # C: hi=1.0, lo=1.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], s[2]),
+      v_pk_fma_f16(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][3]
+    lo = _f16(result & 0xffff)
+    hi = _f16((result >> 16) & 0xffff)
+    self.assertAlmostEqual(lo, 9.0, places=1)   # 2*4+1
+    self.assertAlmostEqual(hi, 16.0, places=0)  # 3*5+1
+
+  def test_v_pk_add_f16_with_inline_constant(self):
+    """V_PK_ADD_F16 with inline constant POS_ONE (1.0).
+    Inline constants for VOP3P are f16 values in the low 16 bits only.
+    hi half of inline constant is 0, so hi result = v0.hi + 0 = 1.0.
+    """
+    from extra.assembly.amd.pcode import _f16
+    instructions = [
+      s_mov_b32(s[0], 0x3c003c00),  # packed f16: hi=1.0, lo=1.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_pk_add_f16(v[1], v[0], SrcEnum.POS_ONE),  # Add inline constant 1.0
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1]
+    lo = _f16(result & 0xffff)
+    hi = _f16((result >> 16) & 0xffff)
+    # lo = 1.0 + 1.0 = 2.0, hi = 1.0 + 0.0 = 1.0 (inline const hi half is 0)
+    self.assertAlmostEqual(lo, 2.0, places=2)
+    self.assertAlmostEqual(hi, 1.0, places=2)
+
+  def test_v_pk_mul_f16_with_inline_constant(self):
+    """V_PK_MUL_F16 with inline constant POS_TWO (2.0).
+    Inline constant has value only in low 16 bits, hi is 0.
+    """
+    from extra.assembly.amd.pcode import _f16
+    # v0 = packed (lo=3.0, hi=4.0), multiply by POS_TWO
+    # lo = 3.0 * 2.0 = 6.0, hi = 4.0 * 0.0 = 0.0 (inline const hi is 0)
+    instructions = [
+      s_mov_b32(s[0], 0x44004200),  # packed f16: hi=4.0, lo=3.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_pk_mul_f16(v[1], v[0], SrcEnum.POS_TWO),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1]
+    lo = _f16(result & 0xffff)
+    hi = _f16((result >> 16) & 0xffff)
+    self.assertAlmostEqual(lo, 6.0, places=1)
+    self.assertAlmostEqual(hi, 0.0, places=1)
+
+
+class TestWMMA(unittest.TestCase):
+  """Tests for WMMA (Wave Matrix Multiply-Accumulate) instructions, run with a full 32-lane wave."""
+
+  def test_v_wmma_f32_16x16x16_f16_all_ones(self):
+    """V_WMMA_F32_16X16X16_F16 with all ones produces 16.0."""
+    instructions = []
+    instructions.append(s_mov_b32(s[0], 0x3c003c00))  # packed f16 1.0
+    for i in range(16, 32):  # v[16..31]: source operands (passed to the WMMA below starting at v[16] and v[24])
+      instructions.append(v_mov_b32_e32(v[i], s[0]))
+    for i in range(8):  # v[0..7]: accumulator and destination, zeroed
+      instructions.append(v_mov_b32_e32(v[i], 0))
+    instructions.append(v_wmma_f32_16x16x16_f16(v[0], v[16], v[24], v[0]))
+    st = run_program(instructions, n_lanes=32)
+    expected = f2i(16.0)  # compared bit-exactly, for every lane and each of the 8 result registers
+    for lane in range(32):
+      for reg in range(8):
+        result = st.vgpr[lane][reg]
+        self.assertEqual(result, expected, f"v[{reg}] lane {lane}: expected 16.0, got {i2f(result)}")
+
+  def test_v_wmma_f32_16x16x16_f16_with_accumulator(self):
+    """V_WMMA_F32_16X16X16_F16 with non-zero accumulator."""
+    instructions = []
+    instructions.append(s_mov_b32(s[0], 0x3c003c00))  # packed f16 1.0 in both halves
+    instructions.append(s_mov_b32(s[1], f2i(5.0)))  # f32 accumulator seed
+    for i in range(16, 32):  # v[16..31]: source operands, all ones
+      instructions.append(v_mov_b32_e32(v[i], s[0]))
+    for i in range(8):  # v[0..7]: accumulator and destination, preloaded with 5.0
+      instructions.append(v_mov_b32_e32(v[i], s[1]))
+    instructions.append(v_wmma_f32_16x16x16_f16(v[0], v[16], v[24], v[0]))
+    st = run_program(instructions, n_lanes=32)
+    expected = f2i(21.0)  # 16 + 5
+    for lane in range(32):
+      for reg in range(8):
+        result = st.vgpr[lane][reg]
+        self.assertEqual(result, expected, f"v[{reg}] lane {lane}: expected 21.0, got {i2f(result)}")
+
+
+class TestSpecialOps(unittest.TestCase):
+  """Tests for special operations (SAD, MSAD, PERM, DOT2)."""
+
+  def test_v_sad_u8_basic(self):
+    """V_SAD_U8 computes sum of absolute differences."""
+    instructions = [
+      s_mov_b32(s[0], 0x04030201),  # bytes: 1, 2, 3, 4
+      s_mov_b32(s[1], 0x05040302),  # bytes: 2, 3, 4, 5
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], 0),  # accumulator input = 0
+      v_sad_u8(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # |1-2| + |2-3| + |3-4| + |4-5| = 1 + 1 + 1 + 1 = 4
+    self.assertEqual(st.vgpr[0][3], 4)
+
+  def test_v_sad_u8_identical_bytes(self):
+    """V_SAD_U8 with identical inputs returns accumulator."""
+    instructions = [
+      s_mov_b32(s[0], 0x04030201),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], 10),
+      v_mov_b32_e32(v[2], s[1]),  # accumulator input = 10
+      v_sad_u8(v[3], v[0], v[0], v[2]),  # v[0] is used for both compared sources
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # Same inputs -> SAD = 0, result = accumulator = 10
+    self.assertEqual(st.vgpr[0][3], 10)
+
+  def test_v_sad_u16_basic(self):
+    """V_SAD_U16 computes sum of absolute differences of u16 pairs."""
+    instructions = [
+      s_mov_b32(s[0], 0x00030001),  # hi=3, lo=1
+      s_mov_b32(s[1], 0x00050002),  # hi=5, lo=2
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], 0),  # accumulator input = 0
+      v_sad_u16(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # |1-2| + |3-5| = 1 + 2 = 3
+    self.assertEqual(st.vgpr[0][3], 3)
+
+  def test_v_sad_u32_basic(self):
+    """V_SAD_U32 computes absolute difference of u32 values."""
+    instructions = [
+      s_mov_b32(s[0], 100),
+      s_mov_b32(s[1], 70),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], 0),  # accumulator input = 0
+      v_sad_u32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # |100-70| = 30
+    self.assertEqual(st.vgpr[0][3], 30)
+
+  def test_v_msad_u8_masked(self):
+    """V_MSAD_U8 masked SAD operation."""
+    instructions = [
+      s_mov_b32(s[0], 0x04030201),
+      s_mov_b32(s[1], 0x05040302),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], 0),
+      v_msad_u8(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # V_MSAD_U8 skips bytes where src0 is 0
+    # Since no bytes are 0 (in either source), nothing is masked here: result same as V_SAD_U8 = 4
+    self.assertEqual(st.vgpr[0][3], 4)
+
+  def test_v_perm_b32_select_bytes(self):
+    """V_PERM_B32 selects bytes from two sources.
+
+    V_PERM_B32 concatenates {S1, S0} as a 64-bit value with S1 in low 32 bits.
+    Selector byte values 0-3 select from S1, values 4-7 select from S0.
+    """
+    instructions = [
+      s_mov_b32(s[0], 0x44332211),  # src0: bytes 4-7 in 64-bit view
+      s_mov_b32(s[1], 0x88776655),  # src1: bytes 0-3 in 64-bit view
+      s_mov_b32(s[2], 0x07060504),  # select bytes 4,5,6,7 (from src0)
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_perm_b32(v[2], v[0], v[1], s[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][2], 0x44332211)  # selector picks all four src0 bytes in order, so the result equals src0
+
+  def test_v_dot2_f32_bf16_basic(self):
+    """V_DOT2_F32_BF16 computes dot product of bf16 pairs."""
+    # bf16 1.0 = 0x3f80, bf16 2.0 = 0x4000
+    instructions = [
+      s_mov_b32(s[0], 0x3f803f80),  # packed bf16: hi=1.0, lo=1.0
+      s_mov_b32(s[1], 0x40003f80),  # packed bf16: hi=2.0, lo=1.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_mov_b32_e32(v[2], 0),  # f32 accumulator input = 0
+      v_dot2_f32_bf16(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # 1.0*1.0 + 1.0*2.0 + 0 = 3.0
+    result = i2f(st.vgpr[0][3])
+    self.assertAlmostEqual(result, 3.0, places=4)
+
+
+class TestPackedMixedSigns(unittest.TestCase):
+  """Tests for packed f16 operations with mixed-sign and zero operands."""
+
+  def test_pk_add_f16_mixed_signs(self):
+    """V_PK_ADD_F16 with mixed positive/negative values."""
+    from extra.assembly.amd.pcode import _f16
+    instructions = [
+      s_mov_b32(s[0], 0xc0003c00),  # packed: hi=-2.0, lo=1.0
+      s_mov_b32(s[1], 0x3c003c00),  # packed: hi=1.0, lo=1.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_pk_add_f16(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2]
+    lo = _f16(result & 0xffff)
+    hi = _f16((result >> 16) & 0xffff)
+    self.assertAlmostEqual(lo, 2.0, places=2)   # 1.0 + 1.0
+    self.assertAlmostEqual(hi, -1.0, places=2)  # -2.0 + 1.0
+
+  def test_pk_mul_f16_zero(self):
+    """V_PK_MUL_F16 with zero."""
+    from extra.assembly.amd.pcode import _f16
+    instructions = [
+      s_mov_b32(s[0], 0x40004000),  # packed: 2.0, 2.0
+      s_mov_b32(s[1], 0x00000000),  # packed: 0.0, 0.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_pk_mul_f16(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][2]
+    self.assertEqual(result, 0x00000000, "2.0 * 0.0 should be 0.0")  # raw-bit check: both halves must be +0.0
+
+
+if __name__ == '__main__':  # run this file's tests when it is executed directly
+  unittest.main()
diff --git a/extra/assembly/amd/test/hw/test_vopc.py b/extra/assembly/amd/test/hw/test_vopc.py
new file mode 100644
index 0000000000..d27dde8b0f
--- /dev/null
+++ b/extra/assembly/amd/test/hw/test_vopc.py
@@ -0,0 +1,486 @@
+"""Tests for VOPC instructions - vector compare operations.
+
+Includes: v_cmp_class_f32, v_cmp_class_f16, v_cmp_eq_*, v_cmp_lt_*, v_cmp_gt_*
+"""
+import unittest
+from extra.assembly.amd.test.hw.helpers import *
+
+VCC = 106  # SGPR index for VCC_LO; passed as RawImm(VCC) in the vdst field of the VOP3 compares below
+
+class TestCmpClass(unittest.TestCase):
+  """Tests for V_CMP_CLASS_F32 float classification (v[0] = value, v[1] = class mask, result in VCC)."""
+
+  def test_cmp_class_quiet_nan(self):
+    """V_CMP_CLASS_F32 detects quiet NaN."""
+    quiet_nan = 0x7fc00000
+    instructions = [
+      s_mov_b32(s[0], quiet_nan),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0b0000000010),  # bit 1 = quiet NaN
+      v_cmp_class_f32_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect quiet NaN")
+
+  def test_cmp_class_signaling_nan(self):
+    """V_CMP_CLASS_F32 detects signaling NaN."""
+    signal_nan = 0x7f800001
+    instructions = [
+      s_mov_b32(s[0], signal_nan),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0b0000000001),  # bit 0 = signaling NaN
+      v_cmp_class_f32_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect signaling NaN")
+
+  def test_cmp_class_positive_inf(self):
+    """V_CMP_CLASS_F32 detects +inf."""
+    pos_inf = 0x7f800000
+    instructions = [
+      s_mov_b32(s[0], pos_inf),
+      s_mov_b32(s[1], 0b1000000000),  # bit 9 = +inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_cmp_class_f32_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect +inf")
+
+  def test_cmp_class_negative_inf(self):
+    """V_CMP_CLASS_F32 detects -inf."""
+    neg_inf = 0xff800000
+    instructions = [
+      s_mov_b32(s[0], neg_inf),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0b0000000100),  # bit 2 = -inf
+      v_cmp_class_f32_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect -inf")
+
+  def test_cmp_class_normal_positive(self):
+    """V_CMP_CLASS_F32 detects positive normal."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      s_mov_b32(s[1], 0b0100000000),  # bit 8 = positive normal
+      v_mov_b32_e32(v[1], s[1]),
+      v_cmp_class_f32_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect positive normal")
+
+  def test_cmp_class_normal_negative(self):
+    """V_CMP_CLASS_F32 detects negative normal."""
+    instructions = [
+      v_mov_b32_e32(v[0], -1.0),
+      v_mov_b32_e32(v[1], 0b0000001000),  # bit 3 = negative normal
+      v_cmp_class_f32_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect negative normal")
+
+  def test_cmp_class_quiet_nan_not_signaling(self):
+    """Quiet NaN does not match signaling NaN mask."""
+    quiet_nan = 0x7fc00000
+    instructions = [
+      s_mov_b32(s[0], quiet_nan),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0b0000000001),  # bit 0 = signaling NaN only
+      v_cmp_class_f32_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 0, "Quiet NaN should not match signaling mask")
+
+  def test_cmp_class_signaling_nan_not_quiet(self):
+    """Signaling NaN does not match quiet NaN mask."""
+    signal_nan = 0x7f800001
+    instructions = [
+      s_mov_b32(s[0], signal_nan),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0b0000000010),  # bit 1 = quiet NaN only
+      v_cmp_class_f32_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 0, "Signaling NaN should not match quiet mask")
+
+  def test_v_cmp_sets_vcc_bits(self):  # NOTE(review): not a CLASS test; same program as TestCmpInt.test_v_cmp_eq_u32
+    """V_CMP_EQ sets VCC bits based on per-lane comparison."""
+    instructions = [
+      s_mov_b32(s[0], 5),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[0]),
+      v_cmp_eq_u32_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    self.assertEqual(st.vcc & 0xf, 0xf, "All lanes should match")  # one VCC bit per active lane (4 lanes)
+
+
+class TestCmpClassF16(unittest.TestCase):
+  """Tests for V_CMP_CLASS_F16 float classification.
+
+  Class bit mapping:
+    bit 0 = signaling NaN
+    bit 1 = quiet NaN
+    bit 2 = -infinity
+    bit 3 = -normal
+    bit 4 = -denormal
+    bit 5 = -zero
+    bit 6 = +zero
+    bit 7 = +denormal
+    bit 8 = +normal
+    bit 9 = +infinity
+  """
+
+  def test_cmp_class_f16_positive_zero(self):
+    """V_CMP_CLASS_F16: +zero matches bit 6."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0x0000),  # f16 +0.0
+      v_mov_b32_e32(v[1], 0x40),     # bit 6 = +zero
+      v_cmp_class_f16_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect positive zero")
+
+  def test_cmp_class_f16_negative_zero(self):
+    """V_CMP_CLASS_F16: -zero matches bit 5."""
+    instructions = [
+      s_mov_b32(s[0], 0x8000),       # f16 -0.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0x20),     # bit 5 = -zero
+      v_cmp_class_f16_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect negative zero")
+
+  def test_cmp_class_f16_positive_normal(self):
+    """V_CMP_CLASS_F16: +1.0 (normal) matches bit 8."""
+    instructions = [
+      s_mov_b32(s[0], 0x3c00),       # f16 +1.0
+      s_mov_b32(s[1], 0x100),        # bit 8 = +normal
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_cmp_class_f16_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect positive normal")
+
+  def test_cmp_class_f16_negative_normal(self):
+    """V_CMP_CLASS_F16: -1.0 (normal) matches bit 3."""
+    instructions = [
+      s_mov_b32(s[0], 0xbc00),       # f16 -1.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0x08),     # bit 3 = -normal
+      v_cmp_class_f16_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect negative normal")
+
+  def test_cmp_class_f16_positive_infinity(self):
+    """V_CMP_CLASS_F16: +inf matches bit 9."""
+    instructions = [
+      s_mov_b32(s[0], 0x7c00),       # f16 +inf
+      s_mov_b32(s[1], 0x200),        # bit 9 = +inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_cmp_class_f16_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect positive infinity")
+
+  def test_cmp_class_f16_negative_infinity(self):
+    """V_CMP_CLASS_F16: -inf matches bit 2."""
+    instructions = [
+      s_mov_b32(s[0], 0xfc00),       # f16 -inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0x04),     # bit 2 = -inf
+      v_cmp_class_f16_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect negative infinity")
+
+  def test_cmp_class_f16_quiet_nan(self):
+    """V_CMP_CLASS_F16: quiet NaN matches bit 1."""
+    instructions = [
+      s_mov_b32(s[0], 0x7e00),       # f16 quiet NaN
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0x02),     # bit 1 = quiet NaN
+      v_cmp_class_f16_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect quiet NaN")
+
+  def test_cmp_class_f16_signaling_nan(self):
+    """V_CMP_CLASS_F16: signaling NaN matches bit 0."""
+    instructions = [
+      s_mov_b32(s[0], 0x7c01),       # f16 signaling NaN
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0x01),     # bit 0 = signaling NaN
+      v_cmp_class_f16_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect signaling NaN")
+
+  def test_cmp_class_f16_positive_denormal(self):
+    """V_CMP_CLASS_F16: positive denormal matches bit 7."""
+    instructions = [
+      v_mov_b32_e32(v[0], 1),        # f16 +denormal (0x0001)
+      v_mov_b32_e32(v[1], 0x80),     # bit 7 = +denormal
+      v_cmp_class_f16_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect positive denormal")
+
+  def test_cmp_class_f16_negative_denormal(self):
+    """V_CMP_CLASS_F16: negative denormal matches bit 4."""
+    instructions = [
+      s_mov_b32(s[0], 0x8001),       # f16 -denormal
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], 0x10),     # bit 4 = -denormal
+      v_cmp_class_f16_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Should detect negative denormal")
+
+  def test_cmp_class_f16_combined_mask_zeros(self):
+    """V_CMP_CLASS_F16: mask 0x60 covers both +zero and -zero."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),         # f16 +0.0
+      v_mov_b32_e32(v[1], 0x60),      # bits 5 and 6 (+-zero)
+      v_cmp_class_f16_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +zero with mask 0x60")
+
+  def test_cmp_class_f16_combined_mask_1f8(self):
+    """V_CMP_CLASS_F16: mask 0x1f8 covers -normal,-denorm,-zero,+zero,+denorm,+normal.
+
+    This is the exact mask used in the f16 sin kernel at PC=46.
+    """
+    instructions = [
+      v_mov_b32_e32(v[0], 0),         # f16 +0.0
+      s_mov_b32(s[0], 0x1f8),
+      v_mov_b32_e32(v[1], s[0]),      # mask 0x1f8
+      v_cmp_class_f16_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +zero with mask 0x1f8")
+
+  def test_cmp_class_f16_vop3_encoding(self):
+    """V_CMP_CLASS_F16 in VOP3 encoding (v_cmp_class_f16_e64)."""
+    instructions = [
+      v_mov_b32_e32(v[0], 0),         # f16 +0.0
+      s_mov_b32(s[0], 0x1f8),         # class mask
+      VOP3(VOP3Op.V_CMP_CLASS_F16, vdst=RawImm(VCC), src0=v[0], src1=s[0]),  # vdst is the raw VCC_LO index; mask comes from an SGPR
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +zero with VOP3 encoding")
+
+  def test_cmp_class_f16_vop3_normal_positive(self):
+    """V_CMP_CLASS_F16 VOP3 encoding with +1.0 (normal)."""
+    instructions = [
+      s_mov_b32(s[0], 0x3c00),        # f16 +1.0
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], 0x1f8),         # class mask
+      VOP3(VOP3Op.V_CMP_CLASS_F16, vdst=RawImm(VCC), src0=v[0], src1=s[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +1.0 (normal) with mask 0x1f8")
+
+  def test_cmp_class_f16_vop3_nan_fails_mask(self):
+    """V_CMP_CLASS_F16 VOP3: NaN should NOT match mask 0x1f8 (no NaN bits set)."""
+    instructions = [
+      s_mov_b32(s[0], 0x7e00),        # f16 quiet NaN
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], 0x1f8),         # class mask
+      VOP3(VOP3Op.V_CMP_CLASS_F16, vdst=RawImm(VCC), src0=v[0], src1=s[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 0, "VCC should be 0 for NaN with mask 0x1f8 (no NaN bits)")
+
+  def test_cmp_class_f16_vop3_inf_fails_mask(self):
+    """V_CMP_CLASS_F16 VOP3: +inf should NOT match mask 0x1f8 (no inf bits set)."""
+    instructions = [
+      s_mov_b32(s[0], 0x7c00),        # f16 +inf
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[1], 0x1f8),         # class mask
+      VOP3(VOP3Op.V_CMP_CLASS_F16, vdst=RawImm(VCC), src0=v[0], src1=s[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 0, "VCC should be 0 for +inf with mask 0x1f8 (no inf bits)")
+
+
+class TestCmpInt(unittest.TestCase):
+  """Tests for integer comparison operations (VOPC forms write VCC; the VOP3 forms here write an SGPR)."""
+
+  def test_v_cmp_eq_u32(self):
+    """V_CMP_EQ_U32 sets VCC bits based on per-lane comparison."""
+    instructions = [
+      s_mov_b32(s[0], 5),
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[0]),
+      v_cmp_eq_u32_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=4)
+    self.assertEqual(st.vcc & 0xf, 0xf, "All lanes should match")  # one VCC bit per active lane (4 lanes)
+
+  def test_cmp_eq_u16_opsel_lo_lo(self):
+    """V_CMP_EQ_U16 comparing lo halves."""
+    instructions = [
+      s_mov_b32(s[0], 0x12340005),  # lo=5, hi=0x1234
+      s_mov_b32(s[1], 0xABCD0005),  # lo=5, hi=0xABCD
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_cmp_eq_u16_e32(v[0], v[1]),  # hi halves differ on purpose; only the lo halves should be compared
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Lo halves should be equal")
+
+  def test_cmp_eq_u16_opsel_hi_hi(self):
+    """V_CMP_EQ_U16 comparing hi halves with VOP3 opsel.
+
+    VOPC doesn't have opsel, so we use VOP3 form for hi-half comparisons.
+    VOP3 compares write result to SGPR via vdst field.
+    """
+    instructions = [
+      s_mov_b32(s[2], 0x00051234),  # hi=5, lo=0x1234
+      v_mov_b32_e32(v[0], s[2]),
+      s_mov_b32(s[2], 0x0005ABCD),  # hi=5, lo=0xABCD
+      v_mov_b32_e32(v[1], s[2]),
+      # opsel=3 means compare hi halves, vdst=v[0] actually writes to s[0]
+      VOP3(VOP3Op.V_CMP_EQ_U16, vdst=v[0], src0=v[0], src1=v[1], opsel=3),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # Result is in sgpr[0], not vcc
+    self.assertEqual(st.sgpr[0] & 1, 1, "Hi halves should be equal: 5==5")
+
+  def test_cmp_eq_u16_opsel_hi_hi_equal(self):
+    """V_CMP_EQ_U16 VOP3 with opsel=3 compares hi halves (hi halves equal at 0x1234, lo halves 5 vs 9 differ)."""
+    instructions = [
+      s_mov_b32(s[2], 0x12340005),  # lo=5, hi=0x1234
+      v_mov_b32_e32(v[0], s[2]),
+      s_mov_b32(s[2], 0x12340009),  # lo=9, hi=0x1234
+      v_mov_b32_e32(v[1], s[2]),
+      VOP3(VOP3Op.V_CMP_EQ_U16, vdst=v[0], src0=v[0], src1=v[1], opsel=3),  # result is read back from st.sgpr[0] below
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.sgpr[0] & 1, 1, "hi==hi should be true: 0x1234==0x1234")
+
+  def test_cmp_gt_u16_opsel_hi(self):
+    """V_CMP_GT_U16 VOP3 with opsel=3 compares hi halves."""
+    instructions = [
+      s_mov_b32(s[2], 0x99990005),  # lo=5, hi=0x9999
+      v_mov_b32_e32(v[0], s[2]),
+      s_mov_b32(s[2], 0x12340005),  # lo=5, hi=0x1234
+      v_mov_b32_e32(v[1], s[2]),
+      VOP3(VOP3Op.V_CMP_GT_U16, vdst=v[0], src0=v[0], src1=v[1], opsel=3),  # lo halves are equal, so a lo-half compare would give 0
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.sgpr[0] & 1, 1, "hi>hi should be true: 0x9999>0x1234")
+
+
+class TestCmpFloat(unittest.TestCase):
+  """Tests for f16 float comparison operations, including hi-half operands (VOP3 opsel and the `.h` register form)."""
+
+  def test_v_cmp_lt_f16_vsrc1_hi(self):
+    """V_CMP_LT_F16 with both operands from high half using VOP3 opsel."""
+    instructions = [
+      s_mov_b32(s[2], 0x3c000000),  # hi=1.0 (f16), lo=0
+      v_mov_b32_e32(v[0], s[2]),
+      s_mov_b32(s[2], 0x40000000),  # hi=2.0 (f16), lo=0
+      v_mov_b32_e32(v[1], s[2]),
+      # opsel=3 means read hi halves for both src0 and src1
+      VOP3(VOP3Op.V_CMP_LT_F16, vdst=v[0], src0=v[0], src1=v[1], opsel=3),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # Result is in sgpr[0]
+    self.assertEqual(st.sgpr[0] & 1, 1, "1.0 < 2.0 should be true")
+
+  def test_v_cmp_gt_f16_vsrc1_hi(self):
+    """V_CMP_GT_F16 with both operands from high half using VOP3 opsel."""
+    instructions = [
+      s_mov_b32(s[2], 0x40000000),  # hi=2.0 (f16), lo=0
+      v_mov_b32_e32(v[0], s[2]),
+      s_mov_b32(s[2], 0x3c000000),  # hi=1.0 (f16), lo=0
+      v_mov_b32_e32(v[1], s[2]),
+      # opsel=3 means read hi halves for both src0 and src1
+      VOP3(VOP3Op.V_CMP_GT_F16, vdst=v[0], src0=v[0], src1=v[1], opsel=3),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # Result is in sgpr[0]
+    self.assertEqual(st.sgpr[0] & 1, 1, "2.0 > 1.0 should be true")
+
+  def test_v_cmp_eq_f16_vsrc1_hi_equal(self):
+    """v_cmp_eq_f16 with equal low and high halves."""
+    instructions = [
+      s_mov_b32(s[0], 0x42004200),  # hi=3.0 (0x4200), lo=3.0 (0x4200)
+      v_mov_b32_e32(v[0], s[0]),
+      v_cmp_eq_f16_e32(v[0], v[0].h),  # compares v0's lo half against v0's hi half
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Expected vcc=1 (3.0 == 3.0)")
+
+  def test_v_cmp_neq_f16_vsrc1_hi(self):
+    """v_cmp_neq_f16 with different low and high halves."""
+    instructions = [
+      s_mov_b32(s[0], 0x40003c00),  # hi=2.0 (0x4000), lo=1.0 (0x3c00)
+      v_mov_b32_e32(v[0], s[0]),
+      v_cmp_lg_f16_e32(v[0], v[0].h),  # LG (less-or-greater) is the not-equal compare exercised here
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 1, "Expected vcc=1 (1.0 != 2.0)")
+
+  def test_v_cmp_nge_f16_inf_self(self):
+    """v_cmp_nge_f16 comparing -inf with itself (NGE = not greater-or-equal).
+
+    Regression test: -inf >= -inf is true, so NGE must be false; likewise -inf < -inf is false (IEEE 754).
+    """
+    instructions = [
+      s_mov_b32(s[0], 0xFC00FC00),  # both halves = -inf (0xFC00)
+      v_mov_b32_e32(v[0], s[0]),
+      v_cmp_nge_f16_e32(v[0], v[0].h),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vcc & 1, 0, "Expected vcc=0 (-inf >= -inf)")
+
+  def test_v_cmp_f16_multilane(self):
+    """v_cmp_lt_f16 with vsrc1=v128 across multiple lanes."""
+    instructions = [
+      # Lane 0: v0 = 0x40003c00 (hi=2.0, lo=1.0) -> 1.0 < 2.0 = true
+      # Lane 1: v0 = 0x3c004000 (hi=1.0, lo=2.0) -> 2.0 < 1.0 = false
+      v_mov_b32_e32(v[0], 0x40003c00),  # default
+      v_cmp_eq_u32_e32(1, v[255]),  # vcc = (lane == 1)
+      v_cndmask_b32_e64(v[0], v[0], 0x3c004000, SrcEnum.VCC_LO),  # lanes with VCC set (lane 1) get the swapped pattern
+      v_cmp_lt_f16_e32(v[0], v[0].h),
+    ]
+    st = run_program(instructions, n_lanes=2)
+    self.assertEqual(st.vcc & 1, 1, "Lane 0: expected vcc=1 (1.0 < 2.0)")
+    self.assertEqual((st.vcc >> 1) & 1, 0, "Lane 1: expected vcc=0 (2.0 < 1.0)")
+
+
+class TestVCCBehavior(unittest.TestCase):
+  """Tests for VCC condition code behavior across a full 32-lane wave."""
+
+  def test_vcc_all_lanes_true(self):
+    """VCC should have all bits set when all lanes compare true."""
+    instructions = [
+      v_mov_b32_e32(v[0], 5),
+      v_mov_b32_e32(v[1], 5),
+      v_cmp_eq_u32_e32(v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=32)
+    self.assertEqual(st.vcc, 0xFFFFFFFF, "All 32 lanes should be true")
+
+  def test_vcc_lane_dependent(self):
+    """VCC should differ per lane based on lane_id comparison (v[255] is used as the lane id here)."""
+    instructions = [
+      v_mov_b32_e32(v[0], 16),
+      v_cmp_lt_u32_e32(v[255], v[0]),  # lanes 0-15 are < 16
+    ]
+    st = run_program(instructions, n_lanes=32)
+    self.assertEqual(st.vcc & 0xFFFF, 0xFFFF, "Lanes 0-15 should be true")
+    self.assertEqual(st.vcc >> 16, 0x0000, "Lanes 16-31 should be false")
+
+
+if __name__ == '__main__':  # run this file's tests when it is executed directly
+  unittest.main()
diff --git a/extra/assembly/amd/test/test_emu.py b/extra/assembly/amd/test/test_emu.py
deleted file mode 100644
index 66b5bb4d30..0000000000
--- a/extra/assembly/amd/test/test_emu.py
+++ /dev/null
@@ -1,5768 +0,0 @@
-#!/usr/bin/env python3
-"""Regression tests for the RDNA3 emulator instruction execution.
-Uses run_asm() with memory output, so tests can run on both emulator and real hardware.
-
-Set USE_HW=1 to run on both emulator and real hardware, comparing results.
-"""
-
-import ctypes, unittest, os, struct
-from extra.assembly.amd.autogen.rdna3.ins import *
-from extra.assembly.amd.dsl import RawImm
-from extra.assembly.amd.emu import WaveState, run_asm, set_valid_mem_ranges
-from extra.assembly.amd.pcode import _i32, _f32
-
-VCC = SrcEnum.VCC_LO  # For VOP3SD sdst field
-USE_HW = os.environ.get("USE_HW", "0") == "1"
-# Tolerance for float comparisons (in ULPs or absolute)
-FLOAT_TOLERANCE = 1e-5
-
-# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc
-# Each VGPR store writes 32 lanes (128 bytes), so vgpr[i] is at offset i*128
-N_VGPRS, N_SGPRS, WAVE_SIZE = 16, 16, 32
-VGPR_BYTES = N_VGPRS * WAVE_SIZE * 4  # 16 regs * 32 lanes * 4 bytes = 2048
-SGPR_BYTES = N_SGPRS * 4  # 16 regs * 4 bytes = 64
-OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 8  # + vcc + scc
-
-def f2i(f: float) -> int: return _i32(f)
-def i2f(i: int) -> float: return _f32(i)
-def f2i64(f: float) -> int: return struct.unpack('<Q', struct.pack('<d', f))[0]
-def i642f(i: int) -> float: return struct.unpack('<d', struct.pack('<Q', i))[0]
-
-def assemble(instructions: list) -> bytes:
-  return b''.join(inst.to_bytes() for inst in instructions)
-
-def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
-  """Generate prologue and epilogue instructions for state capture."""
-  # Prologue: save s[0:1] and v[0] before test clobbers them
-  # Use s[80:81] for args pointer (safe range, avoiding VCC=106-107 and staying under 100)
-  prologue = [
-    s_mov_b32(s[80], s[0]),
-    s_mov_b32(s[81], s[1]),
-    v_mov_b32_e32(v[255], v[0]),
-  ]
-  # Zero out test registers (v0-v15, s0-s15, vcc) so emu and hw start from same state
-  for i in range(N_VGPRS):
-    prologue.append(v_mov_b32_e32(v[i], 0))
-  for i in range(N_SGPRS):
-    prologue.append(s_mov_b32(s[i], 0))
-  prologue.append(s_mov_b32(s[SrcEnum.VCC_LO - 128], 0))  # zero VCC
-
-  # Epilogue: store wave state to memory
-  # Use s[90-99] for epilogue temps to stay in safe SGPR range (<100, avoiding VCC=106-107)
-  # s[90] = saved VCC, s[91] = saved SCC, s[92:93] = output addr, s[94] = saved EXEC
-  # Save VCC/SCC first before we clobber them
-  epilogue = [
-    s_mov_b32(s[90], SrcEnum.VCC_LO),  # save VCC
-    s_cselect_b32(s[91], 1, 0),  # save SCC
-    s_load_b64(s[92:93], s[80], 0, soffset=SrcEnum.NULL),
-    s_waitcnt(lgkmcnt=0),
-    v_lshlrev_b32_e32(v[240], 2, v[255]),  # v[240] = lane_id * 4
-  ]
-  # Store VGPRs: vgpr[i] at offset i*128 + lane_id*4
-  for i in range(N_VGPRS):
-    epilogue.append(global_store_b32(addr=v[240], data=v[i], saddr=s[92], offset=i * WAVE_SIZE * 4))
-  # Store SGPRs at VGPR_BYTES + i*4 (lane 0 only via exec mask)
-  epilogue.append(v_mov_b32_e32(v[241], 0))
-  epilogue.append(v_cmp_eq_u32_e32(v[255], v[241]))
-  epilogue.append(s_and_saveexec_b32(s[94], SrcEnum.VCC_LO))
-  epilogue.append(v_mov_b32_e32(v[240], 0))
-  for i in range(N_SGPRS):
-    epilogue.append(v_mov_b32_e32(v[243], s[i]))
-    epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92], offset=VGPR_BYTES + i * 4))
-  # Store saved VCC
-  epilogue.append(v_mov_b32_e32(v[243], s[90]))
-  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92], offset=VGPR_BYTES + SGPR_BYTES))
-  # Store saved SCC
-  epilogue.append(v_mov_b32_e32(v[243], s[91]))
-  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92], offset=VGPR_BYTES + SGPR_BYTES + 4))
-  epilogue.append(s_mov_b32(s[SrcEnum.EXEC_LO - 128], s[94]))  # restore exec
-  epilogue.append(s_endpgm())
-
-  return prologue, epilogue
-
-def parse_output(out_buf: bytes, n_lanes: int) -> WaveState:
-  """Parse output buffer into WaveState."""
-  st = WaveState()
-  for i in range(N_VGPRS):
-    for lane in range(n_lanes):
-      off = i * WAVE_SIZE * 4 + lane * 4
-      st.vgpr[lane][i] = struct.unpack_from('<I', out_buf, off)[0]
-  for i in range(N_SGPRS):
-    st.sgpr[i] = struct.unpack_from('<I', out_buf, VGPR_BYTES + i * 4)[0]
-  st.vcc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES)[0]
-  st.scc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 4)[0]
-  return st
-
-def run_program_emu(instructions: list, n_lanes: int = 1) -> WaveState:
-  """Run instructions via emulator run_asm, dump state to memory, return WaveState."""
-  out_buf = (ctypes.c_uint8 * OUT_BYTES)(*([0] * OUT_BYTES))
-  out_addr = ctypes.addressof(out_buf)
-
-  prologue, epilogue = get_prologue_epilogue(n_lanes)
-  code = assemble(prologue + instructions + epilogue)
-
-  args = (ctypes.c_uint64 * 1)(out_addr)
-  args_ptr = ctypes.addressof(args)
-  kernel_buf = (ctypes.c_char * len(code)).from_buffer_copy(code)
-  lib_ptr = ctypes.addressof(kernel_buf)
-
-  set_valid_mem_ranges({(out_addr, OUT_BYTES), (args_ptr, 8)})
-  result = run_asm(lib_ptr, len(code), 1, 1, 1, n_lanes, 1, 1, args_ptr)
-  assert result == 0, f"run_asm failed with {result}"
-
-  return parse_output(bytes(out_buf), n_lanes)
-
-def run_program_hw(instructions: list, n_lanes: int = 1) -> WaveState:
-  """Run instructions on real AMD hardware via HIPCompiler and AMDProgram."""
-  from tinygrad.device import Device
-  from tinygrad.runtime.ops_amd import AMDProgram
-  from tinygrad.runtime.support.compiler_amd import HIPCompiler
-  from tinygrad.helpers import flat_mv
-
-  dev = Device["AMD"]
-  compiler = HIPCompiler(dev.arch)
-
-  prologue, epilogue = get_prologue_epilogue(n_lanes)
-  code = assemble(prologue + instructions + epilogue)
-
-  # Create inline assembly source with .byte directives
-  byte_str = ', '.join(f'0x{b:02x}' for b in code)
-  asm_src = f""".text
-.globl test
-.p2align 8
-.type test,@function
-test:
-.byte {byte_str}
-
-.rodata
-.p2align 6
-.amdhsa_kernel test
-  .amdhsa_next_free_vgpr 256
-  .amdhsa_next_free_sgpr 96
-  .amdhsa_wavefront_size32 1
-  .amdhsa_user_sgpr_kernarg_segment_ptr 1
-  .amdhsa_kernarg_size 8
-  .amdhsa_group_segment_fixed_size 65536
-.end_amdhsa_kernel
-
-.amdgpu_metadata
----
-amdhsa.version:
-  - 1
-  - 0
-amdhsa.kernels:
-  - .name: test
-    .symbol: test.kd
-    .kernarg_segment_size: 8
-    .group_segment_fixed_size: 65536
-    .private_segment_fixed_size: 0
-    .kernarg_segment_align: 8
-    .wavefront_size: 32
-    .sgpr_count: 96
-    .vgpr_count: 256
-    .max_flat_workgroup_size: 1024
-...
-.end_amdgpu_metadata
-"""
-
-  lib = compiler.compile(asm_src)
-  prg = AMDProgram(dev, "test", lib)
-
-  # Allocate output buffer on GPU
-  out_gpu = dev.allocator.alloc(OUT_BYTES)
-
-  # Run the kernel
-  prg(out_gpu, global_size=(1, 1, 1), local_size=(n_lanes, 1, 1), wait=True)
-
-  # Copy result back
-  out_buf = bytearray(OUT_BYTES)
-  dev.allocator._copyout(flat_mv(memoryview(out_buf)), out_gpu)
-
-  return parse_output(bytes(out_buf), n_lanes)
-
-def compare_wave_states(emu_st: WaveState, hw_st: WaveState, n_lanes: int, n_vgprs: int = N_VGPRS) -> list[str]:
-  """Compare two WaveStates and return list of differences."""
-  import math
-  diffs = []
-  # Compare VGPRs - vgpr is list[lane][reg]
-  for i in range(n_vgprs):
-    for lane in range(n_lanes):
-      emu_val = emu_st.vgpr[lane][i]
-      hw_val = hw_st.vgpr[lane][i]
-      if emu_val != hw_val:
-        emu_f, hw_f = _f32(emu_val), _f32(hw_val)
-        # Handle NaN comparison
-        if math.isnan(emu_f) and math.isnan(hw_f):
-          continue
-        diffs.append(f"v[{i}] lane {lane}: emu=0x{emu_val:08x} ({emu_f:.6g}) hw=0x{hw_val:08x} ({hw_f:.6g})")
-  # Compare SGPRs - sgpr is list
-  for i in range(N_SGPRS):
-    emu_val = emu_st.sgpr[i]
-    hw_val = hw_st.sgpr[i]
-    if emu_val != hw_val:
-      diffs.append(f"s[{i}]: emu=0x{emu_val:08x} hw=0x{hw_val:08x}")
-  # Compare VCC
-  if emu_st.vcc != hw_st.vcc:
-    diffs.append(f"vcc: emu=0x{emu_st.vcc:08x} hw=0x{hw_st.vcc:08x}")
-  # Compare SCC
-  if emu_st.scc != hw_st.scc:
-    diffs.append(f"scc: emu={emu_st.scc} hw={hw_st.scc}")
-  return diffs
-
-def run_program(instructions: list, n_lanes: int = 1) -> WaveState:
-  """Run instructions and return WaveState.
-
-  If USE_HW=1, runs on both emulator and hardware, compares results, and raises if they differ.
-  Otherwise, runs only on emulator.
-  """
-  emu_st = run_program_emu(instructions, n_lanes)
-  if USE_HW:
-    hw_st = run_program_hw(instructions, n_lanes)
-    diffs = compare_wave_states(emu_st, hw_st, n_lanes)
-    if diffs:
-      raise AssertionError(f"Emulator vs Hardware mismatch:\n" + "\n".join(diffs))
-    return hw_st  # Return hardware result when both match
-  return emu_st
-
-
-class TestVDivScale(unittest.TestCase):
-  """Tests for V_DIV_SCALE_F32 edge cases.
-
-  V_DIV_SCALE_F32 is used in the Newton-Raphson division sequence to handle
-  denormals and near-overflow cases. It scales operands and sets VCC when
-  the final result needs to be unscaled.
-
-  Pseudocode cases:
-  1. Zero operands -> NaN
-  2. exp(S2) - exp(S1) >= 96 -> scale denom, VCC=1
-  3. S1 is denorm -> scale by 2^64
-  4. 1/S1 is f64 denorm AND S2/S1 is f32 denorm -> scale denom, VCC=1
-  5. 1/S1 is f64 denorm -> scale by 2^-64
-  6. S2/S1 is f32 denorm -> scale numer, VCC=1
-  7. exp(S2) <= 23 -> scale by 2^64 (tiny numerator)
-  """
-
-  def test_div_scale_f32_vcc_zero_single_lane(self):
-    """V_DIV_SCALE_F32 sets VCC=0 when no scaling needed."""
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),  # uses inline constant
-      v_mov_b32_e32(v[1], 4.0),  # uses inline constant
-      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc, 0, "VCC should be 0 when no scaling needed")
-
-  def test_div_scale_f32_vcc_zero_multiple_lanes(self):
-    """V_DIV_SCALE_F32 sets VCC=0 for all lanes when no scaling needed."""
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),
-      v_mov_b32_e32(v[1], 4.0),
-      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=4)
-    self.assertEqual(st.vcc & 0xf, 0, "VCC should be 0 for all lanes")
-
-  def test_div_scale_f32_preserves_input(self):
-    """V_DIV_SCALE_F32 outputs S0 when no scaling needed."""
-    instructions = [
-      v_mov_b32_e32(v[0], 2.0),  # numerator - use inline constant
-      v_mov_b32_e32(v[1], 4.0),  # denominator
-      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 2.0, places=5)
-
-  def test_div_scale_f32_zero_denom_gives_nan(self):
-    """V_DIV_SCALE_F32: zero denominator -> NaN, VCC=1."""
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),  # numerator
-      v_mov_b32_e32(v[1], 0.0),  # denominator = 0
-      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])), "Should be NaN for zero denom")
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for zero denom")
-
-  def test_div_scale_f32_zero_numer_gives_nan(self):
-    """V_DIV_SCALE_F32: zero numerator -> NaN, VCC=1."""
-    instructions = [
-      v_mov_b32_e32(v[0], 0.0),  # numerator = 0
-      v_mov_b32_e32(v[1], 1.0),  # denominator
-      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])), "Should be NaN for zero numer")
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for zero numer")
-
-  def test_div_scale_f32_large_exp_diff_scales_denom(self):
-    """V_DIV_SCALE_F32: exp(numer) - exp(denom) >= 96 -> scale denom, VCC=1."""
-    # Need exp difference >= 96. Use MAX_FLOAT / tiny_normal
-    # MAX_FLOAT exp=254, tiny_normal with exp <= 254-96=158
-    # Let's use exp=127 (1.0) for denom, exp=254 for numer -> diff = 127 (>96)
-    max_float = 0x7f7fffff  # 3.4028235e+38, exp=254
-    instructions = [
-      s_mov_b32(s[0], max_float),
-      v_mov_b32_e32(v[0], s[0]),  # numer = MAX_FLOAT (S2)
-      v_mov_b32_e32(v[1], 1.0),   # denom = 1.0 (S1), exp=127. diff = 254-127 = 127 >= 96
-      # S0=denom (what we're scaling), S1=denom, S2=numer
-      v_div_scale_f32(v[2], VCC, v[1], v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 when scaling denom for large exp diff")
-    # Result should be denom * 2^64
-    expected = 1.0 * (2.0 ** 64)
-    self.assertAlmostEqual(i2f(st.vgpr[0][2]), expected, delta=expected * 1e-6)
-
-  def test_div_scale_f32_denorm_denom(self):
-    """V_DIV_SCALE_F32: denormalized denominator -> NaN, VCC=1.
-
-    Hardware returns NaN when denominator is denormalized (different from PDF pseudocode).
-    """
-    # Smallest positive denorm: 0x00000001 = 1.4e-45
-    denorm = 0x00000001
-    instructions = [
-      s_mov_b32(s[0], denorm),
-      v_mov_b32_e32(v[0], 1.0),   # numer = 1.0 (S2)
-      v_mov_b32_e32(v[1], s[0]), # denom = denorm (S1)
-      # S0=denom, S1=denom, S2=numer -> scale denom
-      v_div_scale_f32(v[2], VCC, v[1], v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])), "Hardware returns NaN for denorm denom")
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for denorm denom")
-
-  def test_div_scale_f32_tiny_numer_exp_le_23(self):
-    """V_DIV_SCALE_F32: exponent(numer) <= 23 -> scale by 2^64, VCC=1."""
-    # exp <= 23 means exponent field is 0..23
-    # exp=23 corresponds to float value around 2^(23-127) = 2^-104 ≈ 4.9e-32
-    # Use exp=1 (smallest normal), which is 2^(1-127) = 2^-126 ≈ 1.18e-38
-    smallest_normal = 0x00800000  # exp=1, mantissa=0
-    instructions = [
-      s_mov_b32(s[0], smallest_normal),
-      v_mov_b32_e32(v[0], s[0]),  # numer = smallest_normal (S2), exp=1 <= 23
-      v_mov_b32_e32(v[1], 1.0),   # denom = 1.0 (S1)
-      # S0=numer, S1=denom, S2=numer -> scale numer
-      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # Numer scaled by 2^64, VCC=1 to indicate scaling was done
-    numer_f = i2f(smallest_normal)
-    expected = numer_f * (2.0 ** 64)
-    self.assertAlmostEqual(i2f(st.vgpr[0][2]), expected, delta=abs(expected) * 1e-5)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 when scaling tiny numer")
-
-  def test_div_scale_f32_result_would_be_denorm(self):
-    """V_DIV_SCALE_F32: result would be denorm -> no scaling applied, VCC=1.
-
-    When the result of numer/denom would be denormalized, hardware sets VCC=1
-    but does NOT scale the input (returns it unchanged). The scaling happens
-    elsewhere in the division sequence.
-    """
-    # If S2/S1 would be denorm, set VCC but don't scale
-    # Denorm result: exp < 1, i.e., |result| < 2^-126
-    # Use 1.0 / 2^127 ≈ 5.9e-39 (result would be denorm)
-    large_denom = 0x7f000000  # 2^127
-    instructions = [
-      s_mov_b32(s[0], large_denom),
-      v_mov_b32_e32(v[0], 1.0),   # numer = 1.0 (S2)
-      v_mov_b32_e32(v[1], s[0]), # denom = 2^127 (S1)
-      # S0=numer, S1=denom, S2=numer -> check if we need to scale numer
-      v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # Hardware returns input unchanged but sets VCC=1
-    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 1.0, places=5)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 when result would be denorm")
-
-
-class TestVDivFmas(unittest.TestCase):
-  """Tests for V_DIV_FMAS_F32 edge cases.
-
-  V_DIV_FMAS_F32 performs FMA with optional scaling based on VCC.
-  The scale direction depends on S2's exponent (the addend):
-  - If exponent(S2) > 127 (i.e., S2 >= 2.0): scale by 2^+64
-  - Otherwise: scale by 2^-64
-
-  NOTE: The PDF (page 449) incorrectly says just 2^32.
-  """
-
-  def test_div_fmas_f32_no_scale(self):
-    """V_DIV_FMAS_F32: VCC=0 -> normal FMA."""
-    instructions = [
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 0),  # VCC = 0
-      v_mov_b32_e32(v[0], 2.0),   # S0
-      v_mov_b32_e32(v[1], 3.0),   # S1
-      v_mov_b32_e32(v[2], 1.0),   # S2
-      v_div_fmas_f32(v[3], v[0], v[1], v[2]),  # 2*3+1 = 7
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 7.0, places=5)
-
-  def test_div_fmas_f32_scale_up(self):
-    """V_DIV_FMAS_F32: VCC=1 with S2 >= 2.0 -> scale by 2^+64."""
-    instructions = [
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 1),  # VCC = 1
-      v_mov_b32_e32(v[0], 1.0),   # S0
-      v_mov_b32_e32(v[1], 1.0),   # S1
-      v_mov_b32_e32(v[2], 2.0),   # S2 >= 2.0, so scale UP
-      v_div_fmas_f32(v[3], v[0], v[1], v[2]),  # 2^+64 * (1*1+2) = 2^+64 * 3
-    ]
-    st = run_program(instructions, n_lanes=1)
-    expected = 3.0 * (2.0 ** 64)
-    self.assertAlmostEqual(i2f(st.vgpr[0][3]), expected, delta=abs(expected) * 1e-6)
-
-  def test_div_fmas_f32_scale_down(self):
-    """V_DIV_FMAS_F32: VCC=1 with S2 < 2.0 -> scale by 2^-64."""
-    instructions = [
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 1),  # VCC = 1
-      v_mov_b32_e32(v[0], 2.0),   # S0
-      v_mov_b32_e32(v[1], 3.0),   # S1
-      v_mov_b32_e32(v[2], 1.0),   # S2 < 2.0, so scale DOWN
-      v_div_fmas_f32(v[3], v[0], v[1], v[2]),  # 2^-64 * (2*3+1) = 2^-64 * 7
-    ]
-    st = run_program(instructions, n_lanes=1)
-    expected = 7.0 * (2.0 ** -64)
-    self.assertAlmostEqual(i2f(st.vgpr[0][3]), expected, delta=abs(expected) * 1e-6)
-
-  def test_div_fmas_f32_per_lane_vcc(self):
-    """V_DIV_FMAS_F32: different VCC per lane with S2 < 2.0."""
-    instructions = [
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 0b0101),  # VCC: lanes 0,2 set
-      v_mov_b32_e32(v[0], 1.0),
-      v_mov_b32_e32(v[1], 1.0),
-      v_mov_b32_e32(v[2], 1.0),  # S2 < 2.0, so scale DOWN
-      v_div_fmas_f32(v[3], v[0], v[1], v[2]),  # fma(1,1,1) = 2, scaled = 2^-64 * 2
-    ]
-    st = run_program(instructions, n_lanes=4)
-    scaled = 2.0 * (2.0 ** -64)
-    unscaled = 2.0
-    self.assertAlmostEqual(i2f(st.vgpr[0][3]), scaled, delta=abs(scaled) * 1e-6)  # lane 0: VCC=1
-    self.assertAlmostEqual(i2f(st.vgpr[1][3]), unscaled, places=5)                 # lane 1: VCC=0
-    self.assertAlmostEqual(i2f(st.vgpr[2][3]), scaled, delta=abs(scaled) * 1e-6)  # lane 2: VCC=1
-    self.assertAlmostEqual(i2f(st.vgpr[3][3]), unscaled, places=5)                 # lane 3: VCC=0
-
-
-class TestVDivFixup(unittest.TestCase):
-  """Tests for V_DIV_FIXUP_F32 edge cases.
-
-  V_DIV_FIXUP_F32 is the final step of Newton-Raphson division.
-  It handles special cases: NaN, Inf, zero, overflow, underflow.
-
-  Args: S0=quotient from NR iteration, S1=denominator, S2=numerator
-  """
-
-  def test_div_fixup_f32_normal(self):
-    """V_DIV_FIXUP_F32: normal division passes through quotient."""
-    # 6.0 / 2.0 = 3.0
-    instructions = [
-      v_mov_b32_e32(v[0], 3.0),   # S0 = quotient
-      v_mov_b32_e32(v[1], 2.0),   # S1 = denominator
-      v_mov_b32_e32(v[2], 6.0),   # S2 = numerator
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 3.0, places=5)
-
-  def test_div_fixup_f32_nan_numer(self):
-    """V_DIV_FIXUP_F32: NaN numerator -> quiet NaN."""
-    nan = 0x7fc00000  # quiet NaN
-    instructions = [
-      s_mov_b32(s[0], nan),
-      v_mov_b32_e32(v[0], 1.0),   # S0 = quotient
-      v_mov_b32_e32(v[1], 1.0),   # S1 = denominator
-      v_mov_b32_e32(v[2], s[0]), # S2 = numerator = NaN
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "Should be NaN")
-
-  def test_div_fixup_f32_nan_denom(self):
-    """V_DIV_FIXUP_F32: NaN denominator -> quiet NaN."""
-    nan = 0x7fc00000  # quiet NaN
-    instructions = [
-      s_mov_b32(s[0], nan),
-      v_mov_b32_e32(v[0], 1.0),   # S0 = quotient
-      v_mov_b32_e32(v[1], s[0]), # S1 = denominator = NaN
-      v_mov_b32_e32(v[2], 1.0),   # S2 = numerator
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "Should be NaN")
-
-  def test_div_fixup_f32_zero_div_zero(self):
-    """V_DIV_FIXUP_F32: 0/0 -> NaN (0xffc00000)."""
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),   # S0 = quotient (doesn't matter)
-      v_mov_b32_e32(v[1], 0.0),   # S1 = denominator = 0
-      v_mov_b32_e32(v[2], 0.0),   # S2 = numerator = 0
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "0/0 should be NaN")
-
-  def test_div_fixup_f32_inf_div_inf(self):
-    """V_DIV_FIXUP_F32: inf/inf -> NaN."""
-    pos_inf = 0x7f800000
-    instructions = [
-      s_mov_b32(s[0], pos_inf),
-      v_mov_b32_e32(v[0], 1.0),   # S0 = quotient
-      v_mov_b32_e32(v[1], s[0]), # S1 = denominator = +inf
-      v_mov_b32_e32(v[2], s[0]), # S2 = numerator = +inf
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "inf/inf should be NaN")
-
-  def test_div_fixup_f32_x_div_zero(self):
-    """V_DIV_FIXUP_F32: x/0 -> +/-inf based on sign."""
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),   # S0 = quotient
-      v_mov_b32_e32(v[1], 0.0),   # S1 = denominator = 0
-      v_mov_b32_e32(v[2], 1.0),   # S2 = numerator = 1.0
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "x/0 should be inf")
-    self.assertGreater(i2f(st.vgpr[0][3]), 0, "1/0 should be +inf")
-
-  def test_div_fixup_f32_neg_x_div_zero(self):
-    """V_DIV_FIXUP_F32: -x/0 -> -inf."""
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),   # S0 = quotient
-      v_mov_b32_e32(v[1], 0.0),   # S1 = denominator = 0
-      v_mov_b32_e32(v[2], -1.0),  # S2 = numerator = -1.0
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "-x/0 should be inf")
-    self.assertLess(i2f(st.vgpr[0][3]), 0, "-1/0 should be -inf")
-
-  def test_div_fixup_f32_zero_div_x(self):
-    """V_DIV_FIXUP_F32: 0/x -> 0."""
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),   # S0 = quotient
-      v_mov_b32_e32(v[1], 2.0),   # S1 = denominator = 2.0
-      v_mov_b32_e32(v[2], 0.0),   # S2 = numerator = 0
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(i2f(st.vgpr[0][3]), 0.0, "0/x should be 0")
-
-  def test_div_fixup_f32_x_div_inf(self):
-    """V_DIV_FIXUP_F32: x/inf -> 0."""
-    pos_inf = 0x7f800000
-    instructions = [
-      s_mov_b32(s[0], pos_inf),
-      v_mov_b32_e32(v[0], 1.0),   # S0 = quotient
-      v_mov_b32_e32(v[1], s[0]), # S1 = denominator = +inf
-      v_mov_b32_e32(v[2], 1.0),   # S2 = numerator = 1.0
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(i2f(st.vgpr[0][3]), 0.0, "x/inf should be 0")
-
-  def test_div_fixup_f32_inf_div_x(self):
-    """V_DIV_FIXUP_F32: inf/x -> inf."""
-    pos_inf = 0x7f800000
-    instructions = [
-      s_mov_b32(s[0], pos_inf),
-      v_mov_b32_e32(v[0], 1.0),   # S0 = quotient
-      v_mov_b32_e32(v[1], 1.0),   # S1 = denominator = 1.0
-      v_mov_b32_e32(v[2], s[0]), # S2 = numerator = +inf
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "inf/x should be inf")
-
-  def test_div_fixup_f32_sign_propagation(self):
-    """V_DIV_FIXUP_F32: sign is XOR of numer and denom signs."""
-    instructions = [
-      v_mov_b32_e32(v[0], 3.0),   # S0 = |quotient|
-      v_mov_b32_e32(v[1], -2.0),  # S1 = denominator (negative)
-      v_mov_b32_e32(v[2], 6.0),   # S2 = numerator (positive)
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # pos / neg = neg
-    self.assertAlmostEqual(i2f(st.vgpr[0][3]), -3.0, places=5)
-
-  def test_div_fixup_f32_neg_neg(self):
-    """V_DIV_FIXUP_F32: neg/neg -> positive."""
-    instructions = [
-      v_mov_b32_e32(v[0], 3.0),   # S0 = |quotient|
-      v_mov_b32_e32(v[1], -2.0),  # S1 = denominator (negative)
-      v_mov_b32_e32(v[2], -6.0),  # S2 = numerator (negative)
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # neg / neg = pos
-    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 3.0, places=5)
-
-  def test_div_fixup_f32_nan_estimate_overflow(self):
-    """V_DIV_FIXUP_F32: NaN estimate returns overflow (inf).
-
-    PDF doesn't check isNAN(S0), but hardware returns OVERFLOW if S0 is NaN.
-    This happens when division fails (e.g., denorm denominator in V_DIV_SCALE).
-    """
-    quiet_nan = 0x7fc00000
-    instructions = [
-      s_mov_b32(s[0], quiet_nan),
-      v_mov_b32_e32(v[0], s[0]),  # S0 = NaN (failed estimate)
-      v_mov_b32_e32(v[1], 1.0),   # S1 = denominator = 1.0
-      v_mov_b32_e32(v[2], 1.0),   # S2 = numerator = 1.0
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "NaN estimate should return inf")
-    self.assertEqual(st.vgpr[0][3], 0x7f800000, "Should be +inf (pos/pos)")
-
-  def test_div_fixup_f32_nan_estimate_sign(self):
-    """V_DIV_FIXUP_F32: NaN estimate with negative sign returns -inf."""
-    quiet_nan = 0x7fc00000
-    instructions = [
-      s_mov_b32(s[0], quiet_nan),
-      v_mov_b32_e32(v[0], s[0]),  # S0 = NaN (failed estimate)
-      v_mov_b32_e32(v[1], -1.0),  # S1 = denominator = -1.0
-      v_mov_b32_e32(v[2], 1.0),   # S2 = numerator = 1.0
-      v_div_fixup_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "NaN estimate should return inf")
-    self.assertEqual(st.vgpr[0][3], 0xff800000, "Should be -inf (pos/neg)")
-
-
-class TestVCmpClass(unittest.TestCase):
-  """Tests for V_CMP_CLASS_F32 float classification."""
-
-  def test_cmp_class_quiet_nan(self):
-    """V_CMP_CLASS_F32 detects quiet NaN."""
-    quiet_nan = 0x7fc00000
-    instructions = [
-      s_mov_b32(s[0], quiet_nan),  # large int encodes as literal
-      v_mov_b32_e32(v[0], s[0]),  # value to classify
-      v_mov_b32_e32(v[1], 0b0000000010),  # bit 1 = quiet NaN (mask in VGPR for VOPC)
-      v_cmp_class_f32_e32(v[0], v[1]),  # VOPC: src0=value, vsrc1=mask, writes VCC
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "Should detect quiet NaN")
-
-  def test_cmp_class_signaling_nan(self):
-    """V_CMP_CLASS_F32 detects signaling NaN."""
-    signal_nan = 0x7f800001
-    instructions = [
-      s_mov_b32(s[0], signal_nan),  # large int encodes as literal
-      v_mov_b32_e32(v[0], s[0]),  # value to classify
-      v_mov_b32_e32(v[1], 0b0000000001),  # bit 0 = signaling NaN
-      v_cmp_class_f32_e32(v[0], v[1]),  # VOPC: src0=value, vsrc1=mask, writes VCC
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "Should detect signaling NaN")
-
-  def test_cmp_class_quiet_nan_not_signaling(self):
-    """Quiet NaN does not match signaling NaN mask."""
-    quiet_nan = 0x7fc00000
-    instructions = [
-      s_mov_b32(s[0], quiet_nan),  # large int encodes as literal
-      v_mov_b32_e32(v[0], s[0]),  # value to classify
-      v_mov_b32_e32(v[1], 0b0000000001),  # bit 0 = signaling NaN only
-      v_cmp_class_f32_e32(v[0], v[1]),  # VOPC: src0=value, vsrc1=mask, writes VCC
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 0, "Quiet NaN should not match signaling mask")
-
-  def test_cmp_class_signaling_nan_not_quiet(self):
-    """Signaling NaN does not match quiet NaN mask."""
-    signal_nan = 0x7f800001
-    instructions = [
-      s_mov_b32(s[0], signal_nan),  # large int encodes as literal
-      v_mov_b32_e32(v[0], s[0]),  # value to classify
-      v_mov_b32_e32(v[1], 0b0000000010),  # bit 1 = quiet NaN only
-      v_cmp_class_f32_e32(v[0], v[1]),  # VOPC: src0=value, vsrc1=mask, writes VCC
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 0, "Signaling NaN should not match quiet mask")
-
-  def test_cmp_class_positive_inf(self):
-    """V_CMP_CLASS_F32 detects +inf."""
-    pos_inf = 0x7f800000
-    instructions = [
-      s_mov_b32(s[0], pos_inf),  # large int encodes as literal
-      s_mov_b32(s[1], 0b1000000000),  # bit 9 = +inf (512 is outside inline range)
-      v_mov_b32_e32(v[0], s[0]),  # value to classify
-      v_mov_b32_e32(v[1], s[1]),  # mask in VGPR
-      v_cmp_class_f32_e32(v[0], v[1]),  # VOPC: src0=value, vsrc1=mask, writes VCC
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "Should detect +inf")
-
-  def test_cmp_class_negative_inf(self):
-    """V_CMP_CLASS_F32 detects -inf."""
-    neg_inf = 0xff800000
-    instructions = [
-      s_mov_b32(s[0], neg_inf),  # large int encodes as literal
-      v_mov_b32_e32(v[0], s[0]),  # value to classify
-      v_mov_b32_e32(v[1], 0b0000000100),  # bit 2 = -inf
-      v_cmp_class_f32_e32(v[0], v[1]),  # VOPC: src0=value, vsrc1=mask, writes VCC
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "Should detect -inf")
-
-  def test_cmp_class_normal_positive(self):
-    """V_CMP_CLASS_F32 detects positive normal."""
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),  # inline constant - value to classify
-      s_mov_b32(s[1], 0b0100000000),  # bit 8 = positive normal (256 is outside inline range)
-      v_mov_b32_e32(v[1], s[1]),  # mask in VGPR
-      v_cmp_class_f32_e32(v[0], v[1]),  # VOPC: src0=value, vsrc1=mask, writes VCC
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "Should detect positive normal")
-
-  def test_cmp_class_normal_negative(self):
-    """V_CMP_CLASS_F32 detects negative normal."""
-    instructions = [
-      v_mov_b32_e32(v[0], -1.0),  # inline constant - value to classify
-      v_mov_b32_e32(v[1], 0b0000001000),  # bit 3 = negative normal
-      v_cmp_class_f32_e32(v[0], v[1]),  # VOPC: src0=value, vsrc1=mask, writes VCC
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "Should detect negative normal")
-
-
-class TestBasicOps(unittest.TestCase):
-  """Basic instruction tests."""
-
-  def test_v_add_f32(self):
-    """V_ADD_F32 adds two floats."""
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),  # inline constant
-      v_mov_b32_e32(v[1], 2.0),  # inline constant
-      v_add_f32_e32(v[2], v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 3.0, places=5)
-
-  def test_v_mul_f32(self):
-    """V_MUL_F32 multiplies two floats."""
-    instructions = [
-      v_mov_b32_e32(v[0], 2.0),  # inline constant
-      v_mov_b32_e32(v[1], 4.0),  # inline constant
-      v_mul_f32_e32(v[2], v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 8.0, places=5)
-
-  def test_v_mov_b32(self):
-    """V_MOV_B32 moves a value."""
-    instructions = [
-      s_mov_b32(s[0], 42),
-      v_mov_b32_e32(v[0], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][0], 42)
-
-  def test_s_add_u32(self):
-    """S_ADD_U32 adds two scalar values."""
-    instructions = [
-      s_mov_b32(s[0], 100),
-      s_mov_b32(s[1], 200),
-      s_add_u32(s[2], s[0], s[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.sgpr[2], 300)
-
-  def test_s_add_u32_carry(self):
-    """S_ADD_U32 sets SCC on overflow."""
-    instructions = [
-      s_mov_b32(s[0], 64),  # use inline constant for max
-      s_not_b32(s[0], s[0]),  # s0 = ~64 = 0xffffffbf, close to max
-      s_mov_b32(s[1], 64),
-      s_add_u32(s[2], s[0], s[1]),  # 0xffffffbf + 64 = 0xffffffff
-      s_mov_b32(s[3], 1),
-      s_add_u32(s[4], s[2], s[3]),  # 0xffffffff + 1 = overflow
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.sgpr[4], 0)
-    self.assertEqual(st.scc, 1)
-
-  def test_v_alignbit_b32(self):
-    """V_ALIGNBIT_B32 extracts bits from concatenated sources."""
-    instructions = [
-      s_mov_b32(s[0], 0x12),  # small values as inline constants
-      s_mov_b32(s[1], 0x34),
-      s_mov_b32(s[2], 4),  # shift amount
-      v_mov_b32_e32(v[0], s[2]),
-      v_alignbit_b32(v[1], s[0], s[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # {0x12, 0x34} >> 4 = 0x0000001200000034 >> 4 = 0x20000003
-    expected = ((0x12 << 32) | 0x34) >> 4
-    self.assertEqual(st.vgpr[0][1], expected & 0xffffffff)
-
-
-class TestMultiLane(unittest.TestCase):
-  """Tests for multi-lane execution."""
-
-  def test_v_mov_all_lanes(self):
-    """V_MOV_B32 sets all lanes to the same value."""
-    instructions = [
-      s_mov_b32(s[0], 42),
-      v_mov_b32_e32(v[0], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=4)
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][0], 42)
-
-  def test_v_cmp_sets_vcc_bits(self):
-    """V_CMP_EQ sets VCC bits based on per-lane comparison."""
-    instructions = [
-      s_mov_b32(s[0], 5),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[0]),
-      v_cmp_eq_u32_e32(v[0], v[1]),  # VOPC: src0, vsrc1 - writes VCC implicitly
-    ]
-    st = run_program(instructions, n_lanes=4)
-    self.assertEqual(st.vcc & 0xf, 0xf, "All lanes should match")
-
-
-class TestLaneInstructions(unittest.TestCase):
-  """Tests for cross-lane instructions (readlane, writelane, readfirstlane).
-
-  These are critical for wave-level reductions and WMMA matrix operations.
-
-  Note: V_READLANE_B32 and V_READFIRSTLANE_B32 write to SGPR, but the VOP1/VOP3
-  encoding has a 'vdst' field. We use RawImm to encode SGPR indices directly.
-  """
-
-  def _readlane(self, sdst_idx, vsrc, lane_idx):
-    """Helper to create V_READLANE_B32 with SGPR destination."""
-    return VOP3(VOP3Op.V_READLANE_B32, vdst=RawImm(sdst_idx), src0=vsrc, src1=lane_idx)
-
-  def _readfirstlane(self, sdst_idx, vsrc):
-    """Helper to create V_READFIRSTLANE_B32 with SGPR destination."""
-    return VOP1(VOP1Op.V_READFIRSTLANE_B32, vdst=RawImm(sdst_idx), src0=vsrc)
-
-  def test_v_readlane_b32_basic(self):
-    """V_READLANE_B32 reads a value from a specific lane's VGPR."""
-    # v[255] = lane_id from prologue; compute v[0] = lane_id * 10
-    instructions = [
-      v_lshlrev_b32_e32(v[0], 1, v[255]),  # v0 = lane_id * 2
-      v_lshlrev_b32_e32(v[1], 3, v[255]),  # v1 = lane_id * 8
-      v_add_nc_u32_e32(v[0], v[0], v[1]),  # v0 = lane_id * 10
-      # Now read lane 2's value (should be 20) into s0
-      self._readlane(0, v[0], 2),          # s0 = v0 from lane 2 = 20
-      v_mov_b32_e32(v[2], s[0]),           # broadcast to all lanes
-    ]
-    st = run_program(instructions, n_lanes=4)
-    # All lanes should have the value 20 (lane 2's value)
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][2], 20, f"Lane {lane}: expected 20, got {st.vgpr[lane][2]}")
-
-  def test_v_readlane_b32_lane_0(self):
-    """V_READLANE_B32 reading from lane 0."""
-    instructions = [
-      v_lshlrev_b32_e32(v[0], 2, v[255]),  # v0 = lane_id * 4
-      v_add_nc_u32_e32(v[0], 100, v[0]),   # v0 = 100 + lane_id * 4
-      self._readlane(0, v[0], 0),          # s0 = lane 0's v0 = 100
-      v_mov_b32_e32(v[1], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=4)
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][1], 100)
-
-  def test_v_readlane_b32_last_lane(self):
-    """V_READLANE_B32 reading from the last active lane (lane 3 in 4-lane test)."""
-    instructions = [
-      v_lshlrev_b32_e32(v[0], 2, v[255]),  # v0 = lane_id * 4
-      v_add_nc_u32_e32(v[0], 100, v[0]),   # v0 = 100 + lane_id * 4
-      self._readlane(0, v[0], 3),          # s0 = lane 3's v0 = 112
-      v_mov_b32_e32(v[1], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=4)
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][1], 112)
-
-  def test_v_readlane_b32_different_vgpr(self):
-    """V_READLANE_B32 reading from different VGPR indices.
-
-    Regression test for bug where rd_lane was checked against VGPR values
-    instead of being used as an index (using 'in' operator on list instead
-    of checking if index is within bounds).
-    """
-    instructions = [
-      # Set up v[5] with per-lane values
-      v_lshlrev_b32_e32(v[5], 3, v[255]),  # v5 = lane_id * 8
-      v_add_nc_u32_e32(v[5], 50, v[5]),    # v5 = 50 + lane_id * 8
-      # Read lane 1's v[5] (should be 58)
-      self._readlane(0, v[5], 1),
-      v_mov_b32_e32(v[6], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=4)
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][6], 58, f"Lane {lane}: expected 58 from v[5] lane 1")
-
-  def test_v_readfirstlane_b32_basic(self):
-    """V_READFIRSTLANE_B32 reads from the first active lane."""
-    instructions = [
-      v_lshlrev_b32_e32(v[0], 2, v[255]),  # v0 = lane_id * 4
-      v_add_nc_u32_e32(v[0], 1000, v[0]),  # v0 = 1000 + lane_id * 4
-      self._readfirstlane(0, v[0]),        # s0 = first lane's v0 = 1000
-      v_mov_b32_e32(v[1], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=4)
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][1], 1000)
-
-  def test_v_readfirstlane_b32_different_vgpr(self):
-    """V_READFIRSTLANE_B32 reading from different VGPR index.
-
-    Regression test for bug where src0_idx bounds check was incorrect.
-    """
-    instructions = [
-      v_lshlrev_b32_e32(v[7], 5, v[255]),  # v7 = lane_id * 32
-      v_add_nc_u32_e32(v[7], 200, v[7]),   # v7 = 200 + lane_id * 32
-      self._readfirstlane(0, v[7]),        # s0 = first lane's v7 = 200
-      v_mov_b32_e32(v[8], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=4)
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][8], 200)
-
-  def test_v_writelane_b32_basic(self):
-    """V_WRITELANE_B32 writes a scalar to a specific lane's VGPR."""
-    instructions = [
-      v_mov_b32_e32(v[0], 0),              # Initialize v0 = 0 for all lanes
-      s_mov_b32(s[0], 999),                # Value to write
-      v_writelane_b32(v[0], s[0], 2),      # Write 999 to lane 2's v0
-    ]
-    st = run_program(instructions, n_lanes=4)
-    for lane in range(4):
-      if lane == 2:
-        self.assertEqual(st.vgpr[lane][0], 999, f"Lane 2 should have 999")
-      else:
-        self.assertEqual(st.vgpr[lane][0], 0, f"Lane {lane} should have 0")
-
-  def test_v_writelane_then_readlane(self):
-    """V_WRITELANE followed by V_READLANE to verify round-trip."""
-    instructions = [
-      v_mov_b32_e32(v[0], 0),
-      s_mov_b32(s[0], 0xdeadbeef),
-      v_writelane_b32(v[0], s[0], 1),      # Write to lane 1
-      self._readlane(1, v[0], 1),          # Read back from lane 1 into s1
-      v_mov_b32_e32(v[1], s[1]),
-    ]
-    st = run_program(instructions, n_lanes=4)
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][1], 0xdeadbeef)
-
-  def test_v_readlane_for_reduction(self):
-    """Simulate a wave reduction using readlane - common pattern in WMMA/reductions.
-
-    This pattern is used when reducing across lanes, e.g., for computing
-    the sum of all elements in a wave.
-    """
-    # Each lane computes lane_id + 1, then we sum lanes 0-3 using readlane
-    instructions = [
-      v_add_nc_u32_e32(v[0], 1, v[255]),   # v0 = lane_id + 1 (1, 2, 3, 4)
-      # Read all 4 lanes and sum in scalar registers
-      self._readlane(0, v[0], 0),          # s0 = 1
-      self._readlane(1, v[0], 1),          # s1 = 2
-      s_add_u32(s[0], s[0], s[1]),         # s0 = 3
-      self._readlane(1, v[0], 2),          # s1 = 3
-      s_add_u32(s[0], s[0], s[1]),         # s0 = 6
-      self._readlane(1, v[0], 3),          # s1 = 4
-      s_add_u32(s[0], s[0], s[1]),         # s0 = 10
-      v_mov_b32_e32(v[1], s[0]),           # Broadcast sum to all lanes
-    ]
-    st = run_program(instructions, n_lanes=4)
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][1], 10, f"Sum 1+2+3+4 should be 10")
-
-  def test_v_writelane_b32_different_vgpr(self):
-    """V_WRITELANE_B32 writes to a non-zero VGPR index.
-
-    Regression test for bug where vdst_idx was always 0 due to function signature
-    mismatch (_vars parameter shifted all arguments). This caused all WRITELANE
-    operations to write to v[0] regardless of the actual destination register.
-    """
-    instructions = [
-      v_mov_b32_e32(v[0], 0),              # Initialize v0 = 0
-      v_mov_b32_e32(v[5], 0),              # Initialize v5 = 0
-      s_mov_b32(s[0], 0x12345678),         # Value to write
-      v_writelane_b32(v[5], s[0], 1),      # Write to lane 1's v5 (NOT v0!)
-    ]
-    st = run_program(instructions, n_lanes=4)
-    # v[0] should remain 0 for all lanes (bug would have written here)
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][0], 0, f"v[0] lane {lane} should be 0 (untouched)")
-    # v[5] should have the value only in lane 1
-    for lane in range(4):
-      if lane == 1:
-        self.assertEqual(st.vgpr[lane][5], 0x12345678, f"v[5] lane 1 should have 0x12345678")
-      else:
-        self.assertEqual(st.vgpr[lane][5], 0, f"v[5] lane {lane} should be 0")
-
-  def test_v_writelane_b32_high_vgpr_index(self):
-    """V_WRITELANE_B32 writes to a high VGPR index (v[15]).
-
-    Tests that the vdst_idx is correctly passed through for larger register indices.
-    """
-    instructions = [
-      v_mov_b32_e32(v[0], 0),              # Initialize v0 = 0
-      v_mov_b32_e32(v[15], 0),             # Initialize v15 = 0
-      s_mov_b32(s[0], 0xCAFEBABE),         # Value to write
-      v_writelane_b32(v[15], s[0], 0),     # Write to lane 0's v15
-    ]
-    st = run_program(instructions, n_lanes=4)
-    # v[0] should remain 0 for all lanes
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][0], 0, f"v[0] lane {lane} should be 0")
-    # v[15] should have the value only in lane 0
-    self.assertEqual(st.vgpr[0][15], 0xCAFEBABE, "v[15] lane 0 should have 0xCAFEBABE")
-    for lane in range(1, 4):
-      self.assertEqual(st.vgpr[lane][15], 0, f"v[15] lane {lane} should be 0")
-
-  def test_v_writelane_b32_multiple_writes_different_vgprs(self):
-    """V_WRITELANE_B32 writes to multiple different VGPRs.
-
-    This is the pattern used in sparse_categorical_crossentropy where values
-    are written to different VGPR indices via writelane, then read back.
-    """
-    instructions = [
-      # Initialize all target VGPRs to 0
-      v_mov_b32_e32(v[0], 0),
-      v_mov_b32_e32(v[3], 0),
-      v_mov_b32_e32(v[7], 0),
-      v_mov_b32_e32(v[10], 0),
-      # Write different values to different VGPRs at different lanes
-      s_mov_b32(s[0], 100),
-      v_writelane_b32(v[3], s[0], 0),      # v[3] lane 0 = 100
-      s_mov_b32(s[0], 200),
-      v_writelane_b32(v[7], s[0], 1),      # v[7] lane 1 = 200
-      s_mov_b32(s[0], 300),
-      v_writelane_b32(v[10], s[0], 2),     # v[10] lane 2 = 300
-    ]
-    st = run_program(instructions, n_lanes=4)
-
-    # v[0] should remain 0 everywhere
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][0], 0, f"v[0] lane {lane} should be 0")
-
-    # Check each target VGPR
-    self.assertEqual(st.vgpr[0][3], 100, "v[3] lane 0 should be 100")
-    for lane in range(1, 4):
-      self.assertEqual(st.vgpr[lane][3], 0, f"v[3] lane {lane} should be 0")
-
-    self.assertEqual(st.vgpr[1][7], 200, "v[7] lane 1 should be 200")
-    for lane in [0, 2, 3]:
-      self.assertEqual(st.vgpr[lane][7], 0, f"v[7] lane {lane} should be 0")
-
-    self.assertEqual(st.vgpr[2][10], 300, "v[10] lane 2 should be 300")
-    for lane in [0, 1, 3]:
-      self.assertEqual(st.vgpr[lane][10], 0, f"v[10] lane {lane} should be 0")
-
-  def test_v_writelane_then_readlane_different_vgpr(self):
-    """V_WRITELANE followed by V_READLANE on a non-zero VGPR.
-
-    Regression test: the original bug caused writelane to always write to v[0],
-    so reading back from the intended VGPR would return 0 instead of the written value.
-    This is the exact pattern that failed in sparse_categorical_crossentropy.
-    """
-    instructions = [
-      v_mov_b32_e32(v[0], 0),              # Initialize v0 = 0
-      v_mov_b32_e32(v[8], 0),              # Initialize v8 = 0
-      s_mov_b32(s[0], 0xABCD1234),
-      v_writelane_b32(v[8], s[0], 2),      # Write to lane 2's v8
-      self._readlane(1, v[8], 2),          # Read back from lane 2's v8 into s1
-      v_mov_b32_e32(v[1], s[1]),           # Broadcast to all lanes
-    ]
-    st = run_program(instructions, n_lanes=4)
-    # The read value should be what we wrote
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][1], 0xABCD1234,
-                       f"Lane {lane}: readlane should return 0xABCD1234, got 0x{st.vgpr[lane][1]:08x}")
-    # v[0] should still be 0 (bug would have written here instead of v[8])
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][0], 0, f"v[0] lane {lane} should be 0 (untouched)")
-
-  def test_v_writelane_b32_accumulate_pattern(self):
-    """V_WRITELANE_B32 used to accumulate values across lanes into a single VGPR.
-
-    This pattern is used in reductions where each lane writes its result to
-    a different lane of the same VGPR, then the results are read back.
-    """
-    instructions = [
-      v_mov_b32_e32(v[6], 0),              # Initialize accumulator v6 = 0
-      # Each "iteration" writes to a different lane
-      s_mov_b32(s[0], 10),
-      v_writelane_b32(v[6], s[0], 0),      # lane 0 gets 10
-      s_mov_b32(s[0], 20),
-      v_writelane_b32(v[6], s[0], 1),      # lane 1 gets 20
-      s_mov_b32(s[0], 30),
-      v_writelane_b32(v[6], s[0], 2),      # lane 2 gets 30
-      s_mov_b32(s[0], 40),
-      v_writelane_b32(v[6], s[0], 3),      # lane 3 gets 40
-      # Now read them all back and sum
-      self._readlane(0, v[6], 0),          # s0 = 10
-      self._readlane(1, v[6], 1),          # s1 = 20
-      s_add_u32(s[0], s[0], s[1]),         # s0 = 30
-      self._readlane(1, v[6], 2),          # s1 = 30
-      s_add_u32(s[0], s[0], s[1]),         # s0 = 60
-      self._readlane(1, v[6], 3),          # s1 = 40
-      s_add_u32(s[0], s[0], s[1]),         # s0 = 100
-      v_mov_b32_e32(v[7], s[0]),           # Broadcast sum to all lanes
-    ]
-    st = run_program(instructions, n_lanes=4)
-
-    # Check that each lane of v[6] has the correct value
-    self.assertEqual(st.vgpr[0][6], 10, "v[6] lane 0 should be 10")
-    self.assertEqual(st.vgpr[1][6], 20, "v[6] lane 1 should be 20")
-    self.assertEqual(st.vgpr[2][6], 30, "v[6] lane 2 should be 30")
-    self.assertEqual(st.vgpr[3][6], 40, "v[6] lane 3 should be 40")
-
-    # Check the sum
-    for lane in range(4):
-      self.assertEqual(st.vgpr[lane][7], 100, f"Sum should be 100, got {st.vgpr[lane][7]}")
-
-
-class TestTrigonometry(unittest.TestCase):
-  """Tests for trigonometric instructions."""
-
-  def test_v_sin_f32_small(self):
-    """V_SIN_F32 computes sin for small values."""
-    import math
-    # sin(1.0) ≈ 0.8414709848
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),
-      v_sin_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][1])
-    expected = math.sin(1.0 * 2 * math.pi)  # V_SIN_F32 expects input in cycles (0-1 = 0-2π)
-    self.assertAlmostEqual(result, expected, places=4)
-
-  def test_v_sin_f32_quarter(self):
-    """V_SIN_F32 at 0.25 cycles = sin(π/2) = 1.0."""
-    instructions = [
-      s_mov_b32(s[0], f2i(0.25)),  # 0.25 is not an inline constant, use f2i
-      v_mov_b32_e32(v[0], s[0]),
-      v_sin_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][1])
-    self.assertAlmostEqual(result, 1.0, places=4)
-
-  def test_v_sin_f32_large(self):
-    """V_SIN_F32 for large input value (132000.0)."""
-    import math
-    # This is the failing case: sin(132000.0) should be ≈ 0.294
-    # V_SIN_F32 input is in cycles, so we need frac(132000.0) * 2π
-    instructions = [
-      s_mov_b32(s[0], f2i(132000.0)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_sin_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][1])
-    # frac(132000.0) = 0, so sin(0) = 0... but actually V_SIN_F32 does its own frac internally
-    # The expected value is sin(frac(132000.0) * 2π) where frac is done in the instruction
-    # For 132000.0, the hardware computes frac(132000.0) ≈ 0.046875 (due to precision)
-    # sin(0.046875 * 2π) ≈ 0.294
-    expected = math.sin(132000.0 * 2 * math.pi)
-    # Allow some tolerance due to precision differences
-    self.assertAlmostEqual(result, expected, places=2, msg=f"sin(132000) got {result}, expected ~{expected}")
-
-
-class TestFMA(unittest.TestCase):
-  """Tests for FMA instructions - key for OCML sin argument reduction."""
-
-  def test_v_fma_f32_basic(self):
-    """V_FMA_F32: a*b+c basic case using inline constants only."""
-    # Inline float constants: 0.5, -0.5, 1.0, -1.0, 2.0, -2.0, 4.0, -4.0
-    instructions = [
-      v_mov_b32_e32(v[0], 2.0),  # inline constant
-      v_mov_b32_e32(v[1], 4.0),  # inline constant
-      v_mov_b32_e32(v[2], 1.0),  # inline constant
-      v_fma_f32(v[3], v[0], v[1], v[2]),  # 2*4+1 = 9
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 9.0, places=5)
-
-  def test_v_fma_f32_negative(self):
-    """V_FMA_F32 with negative multiplier (used in sin reduction)."""
-    instructions = [
-      v_mov_b32_e32(v[0], -2.0),  # inline constant
-      v_mov_b32_e32(v[1], 4.0),   # inline constant
-      v_mov_b32_e32(v[2], 1.0),   # inline constant
-      v_fma_f32(v[3], v[0], v[1], v[2]),  # -2*4+1 = -7
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][3]), -7.0, places=5)
-
-  def test_v_fmac_f32(self):
-    """V_FMAC_F32: d = d + a*b using inline constants."""
-    instructions = [
-      v_mov_b32_e32(v[0], 2.0),  # inline constant
-      v_mov_b32_e32(v[1], 4.0),  # inline constant
-      v_mov_b32_e32(v[2], 1.0),  # inline constant
-      v_fmac_f32_e32(v[2], v[0], v[1]),  # v2 = v2 + v0*v1 = 1 + 2*4 = 9
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 9.0, places=5)
-
-  def test_v_fmaak_f32(self):
-    """V_FMAAK_F32: d = a * b + K using inline constants."""
-    instructions = [
-      v_mov_b32_e32(v[0], 2.0),  # inline constant
-      v_mov_b32_e32(v[1], 4.0),  # inline constant
-      v_fmaak_f32_e32(v[2], v[0], v[1], 0x3f800000),  # v2 = v0 * v1 + 1.0 = 2*4+1 = 9
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 9.0, places=5)
-
-  def test_v_fma_f32_with_sgpr(self):
-    """V_FMA_F32: using SGPR for non-inline constant."""
-    # Use SGPR to load 3.0 which is not an inline constant
-    instructions = [
-      s_mov_b32(s[0], f2i(3.0)),  # 3.0 via literal in SGPR
-      v_mov_b32_e32(v[0], 2.0),   # inline constant
-      v_mov_b32_e32(v[1], s[0]),  # 3.0 from SGPR
-      v_mov_b32_e32(v[2], 4.0),   # inline constant
-      v_fma_f32(v[3], v[0], v[1], v[2]),  # 2*3+4 = 10
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][3]), 10.0, places=5)
-
-
-class TestRounding(unittest.TestCase):
-  """Tests for rounding instructions - used in sin argument reduction."""
-
-  def test_v_rndne_f32_half_even(self):
-    """V_RNDNE_F32 rounds to nearest even."""
-    instructions = [
-      s_mov_b32(s[0], f2i(2.5)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_rndne_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 2.0, places=5)  # rounds to even
-
-  def test_v_rndne_f32_half_odd(self):
-    """V_RNDNE_F32 rounds 3.5 to 4 (nearest even)."""
-    instructions = [
-      s_mov_b32(s[0], f2i(3.5)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_rndne_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 4.0, places=5)
-
-  def test_v_rndne_f32_large(self):
-    """V_RNDNE_F32 with large value (like sin reduction uses)."""
-    # sin(1e5) reduction: 1e5 * (1/2pi) ≈ 15915.49...
-    val = 100000.0 * 0.15915494309189535  # 1/(2*pi)
-    instructions = [
-      s_mov_b32(s[0], f2i(val)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_rndne_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    expected = round(val)  # Python's round does banker's rounding
-    self.assertAlmostEqual(i2f(st.vgpr[0][1]), expected, places=0)
-
-  def test_v_floor_f32(self):
-    """V_FLOOR_F32 floors to integer."""
-    instructions = [
-      s_mov_b32(s[0], f2i(3.7)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_floor_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 3.0, places=5)
-
-  def test_v_trunc_f32(self):
-    """V_TRUNC_F32 truncates toward zero."""
-    instructions = [
-      s_mov_b32(s[0], f2i(-3.7)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_trunc_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][1]), -3.0, places=5)
-
-  def test_v_fract_f32(self):
-    """V_FRACT_F32 returns fractional part."""
-    instructions = [
-      s_mov_b32(s[0], f2i(3.75)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_fract_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 0.75, places=5)
-
-  def test_v_fract_f32_large(self):
-    """V_FRACT_F32 with large value - precision matters here."""
-    instructions = [
-      s_mov_b32(s[0], f2i(132000.25)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_fract_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][1])
-    # For large floats, fract precision degrades
-    self.assertGreaterEqual(result, 0.0)
-    self.assertLess(result, 1.0)
-
-
-class TestConversion(unittest.TestCase):
-  """Tests for conversion instructions."""
-
-  def test_v_cvt_i32_f32_positive(self):
-    """V_CVT_I32_F32 converts float to signed int."""
-    instructions = [
-      s_mov_b32(s[0], f2i(42.7)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_cvt_i32_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 42)
-
-  def test_v_cvt_i32_f32_negative(self):
-    """V_CVT_I32_F32 converts negative float to signed int."""
-    instructions = [
-      s_mov_b32(s[0], f2i(-42.7)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_cvt_i32_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # Result is signed, stored as unsigned
-    self.assertEqual(st.vgpr[0][1] & 0xffffffff, (-42) & 0xffffffff)
-
-  def test_v_cvt_i32_f32_large(self):
-    """V_CVT_I32_F32 with large float (used in sin for quadrant)."""
-    # sin reduction converts round(x * 1/2pi) to int for quadrant selection
-    instructions = [
-      s_mov_b32(s[0], f2i(15915.0)),  # ~1e5 / (2*pi)
-      v_mov_b32_e32(v[0], s[0]),
-      v_cvt_i32_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 15915)
-
-  def test_v_cvt_f32_i32(self):
-    """V_CVT_F32_I32 converts signed int to float."""
-    instructions = [
-      s_mov_b32(s[0], 42),
-      v_mov_b32_e32(v[0], s[0]),
-      v_cvt_f32_i32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 42.0, places=5)
-
-  def test_v_cvt_f32_u32(self):
-    """V_CVT_F32_U32 converts unsigned int to float."""
-    instructions = [
-      s_mov_b32(s[0], 0xffffffff),  # max u32
-      v_mov_b32_e32(v[0], s[0]),
-      v_cvt_f32_u32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 4294967296.0, places=-5)
-
-
-class TestBitManipulation(unittest.TestCase):
-  """Tests for bit manipulation - used in sin for quadrant selection."""
-
-  def test_v_and_b32(self):
-    """V_AND_B32 bitwise and."""
-    instructions = [
-      s_mov_b32(s[0], 0xff),
-      s_mov_b32(s[1], 0x0f),
-      v_mov_b32_e32(v[0], s[0]),
-      v_and_b32_e32(v[1], s[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 0x0f)
-
-  def test_v_and_b32_quadrant(self):
-    """V_AND_B32 for quadrant extraction (n & 3)."""
-    instructions = [
-      s_mov_b32(s[0], 15915),  # some large number
-      v_mov_b32_e32(v[0], s[0]),
-      v_and_b32_e32(v[1], 3, v[0]),  # n & 3 for quadrant
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 15915 & 3)
-
-  def test_v_lshrrev_b32(self):
-    """V_LSHRREV_B32 logical shift right."""
-    instructions = [
-      s_mov_b32(s[0], 0xff00),
-      v_mov_b32_e32(v[0], s[0]),
-      v_lshrrev_b32_e32(v[1], 8, v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 0xff)
-
-  def test_v_lshlrev_b32(self):
-    """V_LSHLREV_B32 logical shift left."""
-    instructions = [
-      s_mov_b32(s[0], 0xff),
-      v_mov_b32_e32(v[0], s[0]),
-      v_lshlrev_b32_e32(v[1], 8, v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 0xff00)
-
-  def test_v_xor_b32(self):
-    """V_XOR_B32 bitwise xor (used in sin for sign)."""
-    instructions = [
-      s_mov_b32(s[0], 0x80000000),  # sign bit
-      s_mov_b32(s[1], f2i(1.0)),
-      v_mov_b32_e32(v[0], s[1]),
-      v_xor_b32_e32(v[1], s[0], v[0]),  # flip sign
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][1]), -1.0, places=5)
-
-
-class TestOCMLSinSequence(unittest.TestCase):
-  """Test the specific instruction sequence used in OCML sin."""
-
-  def test_sin_reduction_step1_mul(self):
-    """First step: v12 = |x| * (1/2pi)."""
-    import math
-    one_over_2pi = 1.0 / (2.0 * math.pi)  # 0x3e22f983 in hex
-    x = 100000.0
-    instructions = [
-      s_mov_b32(s[0], f2i(x)),
-      s_mov_b32(s[1], f2i(one_over_2pi)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mul_f32_e32(v[1], s[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][1])
-    expected = x * one_over_2pi
-    self.assertAlmostEqual(result, expected, places=0)
-
-  def test_sin_reduction_step2_round(self):
-    """Second step: round to nearest integer."""
-    import math
-    one_over_2pi = 1.0 / (2.0 * math.pi)
-    x = 100000.0
-    val = x * one_over_2pi  # ~15915.49
-    instructions = [
-      s_mov_b32(s[0], f2i(val)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_rndne_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][1])
-    expected = round(val)
-    self.assertAlmostEqual(result, expected, places=0)
-
-  def test_sin_reduction_step3_fma(self):
-    """Third step: x - n * (pi/2) via FMA."""
-    import math
-    # This is where precision matters - the FMA does: |x| + (-pi/2) * n
-    neg_half_pi = -math.pi / 2.0  # 0xbfc90fda
-    x = 100000.0
-    n = 15915.0
-    instructions = [
-      s_mov_b32(s[0], f2i(neg_half_pi)),
-      s_mov_b32(s[1], f2i(n)),
-      s_mov_b32(s[2], f2i(x)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_mov_b32_e32(v[2], s[2]),
-      v_fma_f32(v[3], v[0], v[1], v[2]),  # x + (-pi/2) * n
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][3])
-    expected = x + neg_half_pi * n
-    # Allow some tolerance due to float precision
-    self.assertAlmostEqual(result, expected, places=2)
-
-  def test_sin_1e5_full_reduction(self):
-    """Full reduction sequence for sin(1e5)."""
-    import math
-    x = 100000.0
-    one_over_2pi = 1.0 / (2.0 * math.pi)
-    neg_half_pi = -math.pi / 2.0
-
-    instructions = [
-      # Load constants
-      s_mov_b32(s[0], f2i(x)),
-      s_mov_b32(s[1], f2i(one_over_2pi)),
-      s_mov_b32(s[2], f2i(neg_half_pi)),
-      # Step 1: v1 = x * (1/2pi)
-      v_mov_b32_e32(v[0], s[0]),
-      v_mul_f32_e32(v[1], s[1], v[0]),
-      # Step 2: v2 = round(v1)
-      v_rndne_f32_e32(v[2], v[1]),
-      # Step 3: v3 = x + (-pi/2) * round_val (FMA)
-      v_fma_f32(v[3], s[2], v[2], v[0]),
-      # Step 4: convert to int for quadrant
-      v_cvt_i32_f32_e32(v[4], v[2]),
-      # Step 5: quadrant = n & 3
-      v_and_b32_e32(v[5], 3, v[4]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-
-    # Check intermediate values
-    mul_result = i2f(st.vgpr[0][1])
-    round_result = i2f(st.vgpr[0][2])
-    reduced = i2f(st.vgpr[0][3])
-    quadrant = st.vgpr[0][5]
-
-    # Verify results match expected
-    expected_mul = x * one_over_2pi
-    expected_round = round(expected_mul)
-    expected_reduced = x + neg_half_pi * expected_round
-    expected_quadrant = int(expected_round) & 3
-
-    self.assertAlmostEqual(mul_result, expected_mul, places=0, msg=f"mul: got {mul_result}, expected {expected_mul}")
-    self.assertAlmostEqual(round_result, expected_round, places=0, msg=f"round: got {round_result}, expected {expected_round}")
-    self.assertEqual(quadrant, expected_quadrant, f"quadrant: got {quadrant}, expected {expected_quadrant}")
-
-
-class TestMad64(unittest.TestCase):
-  """Tests for V_MAD_U64_U32 - critical for OCML Payne-Hanek sin reduction."""
-
-  def test_v_mad_u64_u32_simple(self):
-    """V_MAD_U64_U32: D = S0 * S1 + S2 (64-bit result)."""
-    # 3 * 4 + 5 = 17
-    instructions = [
-      s_mov_b32(s[0], 3),
-      s_mov_b32(s[1], 4),
-      v_mov_b32_e32(v[2], 5),  # S2 lo
-      v_mov_b32_e32(v[3], 0),  # S2 hi
-      v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]),  # result in v[4:5]
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result_lo = st.vgpr[0][4]
-    result_hi = st.vgpr[0][5]
-    result = result_lo | (result_hi << 32)
-    self.assertEqual(result, 17)
-
-  def test_v_mad_u64_u32_large_mult(self):
-    """V_MAD_U64_U32 with large values that overflow 32 bits."""
-    # 0x80000000 * 2 + 0 = 0x100000000
-    instructions = [
-      s_mov_b32(s[0], 0x80000000),
-      s_mov_b32(s[1], 2),
-      v_mov_b32_e32(v[2], 0),
-      v_mov_b32_e32(v[3], 0),
-      v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result_lo = st.vgpr[0][4]
-    result_hi = st.vgpr[0][5]
-    result = result_lo | (result_hi << 32)
-    self.assertEqual(result, 0x100000000)
-
-  def test_v_mad_u64_u32_with_add(self):
-    """V_MAD_U64_U32 with 64-bit addend."""
-    # 1000 * 1000 + 0x100000000 = 1000000 + 0x100000000 = 0x1000F4240
-    instructions = [
-      s_mov_b32(s[0], 1000),
-      s_mov_b32(s[1], 1000),
-      v_mov_b32_e32(v[2], 0),  # S2 lo
-      v_mov_b32_e32(v[3], 1),  # S2 hi = 0x100000000
-      v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result_lo = st.vgpr[0][4]
-    result_hi = st.vgpr[0][5]
-    result = result_lo | (result_hi << 32)
-    expected = 1000 * 1000 + 0x100000000
-    self.assertEqual(result, expected)
-
-  def test_v_mad_u64_u32_max_values(self):
-    """V_MAD_U64_U32 with max u32 values."""
-    # 0xFFFFFFFF * 0xFFFFFFFF + 0 = 0xFFFFFFFE00000001
-    instructions = [
-      s_mov_b32(s[0], 0xFFFFFFFF),
-      s_mov_b32(s[1], 0xFFFFFFFF),
-      v_mov_b32_e32(v[2], 0),
-      v_mov_b32_e32(v[3], 0),
-      v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result_lo = st.vgpr[0][4]
-    result_hi = st.vgpr[0][5]
-    result = result_lo | (result_hi << 32)
-    expected = 0xFFFFFFFF * 0xFFFFFFFF
-    self.assertEqual(result, expected)
-
-
-class TestClz(unittest.TestCase):
-  """Tests for V_CLZ_I32_U32 - count leading zeros, used in Payne-Hanek."""
-
-  def test_v_clz_i32_u32_zero(self):
-    """V_CLZ_I32_U32 of 0 returns -1 (all bits are 0)."""
-    instructions = [
-      v_mov_b32_e32(v[0], 0),
-      v_clz_i32_u32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # -1 as unsigned 32-bit
-    self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF)
-
-  def test_v_clz_i32_u32_one(self):
-    """V_CLZ_I32_U32 of 1 returns 31 (31 leading zeros)."""
-    instructions = [
-      v_mov_b32_e32(v[0], 1),
-      v_clz_i32_u32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 31)
-
-  def test_v_clz_i32_u32_msb_set(self):
-    """V_CLZ_I32_U32 of 0x80000000 returns 0 (no leading zeros)."""
-    instructions = [
-      s_mov_b32(s[0], 0x80000000),
-      v_mov_b32_e32(v[0], s[0]),
-      v_clz_i32_u32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 0)
-
-  def test_v_clz_i32_u32_half(self):
-    """V_CLZ_I32_U32 of 0x8000 (bit 15) returns 16."""
-    instructions = [
-      s_mov_b32(s[0], 0x8000),
-      v_mov_b32_e32(v[0], s[0]),
-      v_clz_i32_u32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 16)
-
-  def test_v_clz_i32_u32_all_ones(self):
-    """V_CLZ_I32_U32 of 0xFFFFFFFF returns 0."""
-    instructions = [
-      s_mov_b32(s[0], 0xFFFFFFFF),
-      v_mov_b32_e32(v[0], s[0]),
-      v_clz_i32_u32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 0)
-
-
-class TestCtz(unittest.TestCase):
-  """Tests for V_CTZ_I32_B32 - count trailing zeros."""
-
-  def test_v_ctz_i32_b32_zero(self):
-    """V_CTZ_I32_B32 of 0 returns -1 (all bits are 0)."""
-    instructions = [
-      v_mov_b32_e32(v[0], 0),
-      v_ctz_i32_b32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF)
-
-  def test_v_ctz_i32_b32_one(self):
-    """V_CTZ_I32_B32 of 1 returns 0 (no trailing zeros)."""
-    instructions = [
-      v_mov_b32_e32(v[0], 1),
-      v_ctz_i32_b32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 0)
-
-  def test_v_ctz_i32_b32_msb_set(self):
-    """V_CTZ_I32_B32 of 0x80000000 returns 31."""
-    instructions = [
-      s_mov_b32(s[0], 0x80000000),
-      v_mov_b32_e32(v[0], s[0]),
-      v_ctz_i32_b32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 31)
-
-  def test_v_ctz_i32_b32_half(self):
-    """V_CTZ_I32_B32 of 0x8000 (bit 15) returns 15."""
-    instructions = [
-      s_mov_b32(s[0], 0x8000),
-      v_mov_b32_e32(v[0], s[0]),
-      v_ctz_i32_b32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 15)
-
-  def test_v_ctz_i32_b32_all_ones(self):
-    """V_CTZ_I32_B32 of 0xFFFFFFFF returns 0."""
-    instructions = [
-      s_mov_b32(s[0], 0xFFFFFFFF),
-      v_mov_b32_e32(v[0], s[0]),
-      v_ctz_i32_b32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 0)
-
-
-class TestDivision(unittest.TestCase):
-  """Tests for division instructions - V_RCP, V_DIV_SCALE, V_DIV_FMAS, V_DIV_FIXUP."""
-
-  def test_v_rcp_f32_normal(self):
-    """V_RCP_F32 of 2.0 returns 0.5."""
-    instructions = [
-      v_mov_b32_e32(v[0], 2.0),
-      v_rcp_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 0.5, places=5)
-
-  def test_v_rcp_f32_inf(self):
-    """V_RCP_F32 of +inf returns 0."""
-    instructions = [
-      s_mov_b32(s[0], 0x7f800000),  # +inf
-      v_mov_b32_e32(v[0], s[0]),
-      v_rcp_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(i2f(st.vgpr[0][1]), 0.0)
-
-  def test_v_rcp_f32_neg_inf(self):
-    """V_RCP_F32 of -inf returns -0."""
-    instructions = [
-      s_mov_b32(s[0], 0xff800000),  # -inf
-      v_mov_b32_e32(v[0], s[0]),
-      v_rcp_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][1])
-    self.assertEqual(result, 0.0)
-    # Check it's negative zero
-    self.assertEqual(st.vgpr[0][1], 0x80000000)
-
-  def test_v_rcp_f32_zero(self):
-    """V_RCP_F32 of 0 returns +inf."""
-    instructions = [
-      v_mov_b32_e32(v[0], 0),
-      v_rcp_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    import math
-    self.assertTrue(math.isinf(i2f(st.vgpr[0][1])))
-
-  def test_v_div_fixup_f32_normal(self):
-    """V_DIV_FIXUP_F32 normal division 1.0/2.0."""
-    # S0 = approximation (from rcp * scale), S1 = denominator, S2 = numerator
-    instructions = [
-      s_mov_b32(s[0], f2i(0.5)),   # approximation
-      s_mov_b32(s[1], f2i(2.0)),   # denominator
-      s_mov_b32(s[2], f2i(1.0)),   # numerator
-      v_mov_b32_e32(v[0], s[0]),
-      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertAlmostEqual(i2f(st.vgpr[0][1]), 0.5, places=5)
-
-  def test_v_div_fixup_f32_one_div_inf(self):
-    """V_DIV_FIXUP_F32: 1.0 / +inf = 0."""
-    # For x/inf: S0=approx(~0), S1=inf, S2=x
-    instructions = [
-      s_mov_b32(s[0], 0),           # approximation (rcp of inf = 0)
-      s_mov_b32(s[1], 0x7f800000),  # denominator = +inf
-      s_mov_b32(s[2], f2i(1.0)),    # numerator = 1.0
-      v_mov_b32_e32(v[0], s[0]),
-      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(i2f(st.vgpr[0][1]), 0.0)
-
-  def test_v_div_fixup_f32_one_div_neg_inf(self):
-    """V_DIV_FIXUP_F32: 1.0 / -inf = -0."""
-    instructions = [
-      s_mov_b32(s[0], 0x80000000),  # approximation (rcp of -inf = -0)
-      s_mov_b32(s[1], 0xff800000),  # denominator = -inf
-      s_mov_b32(s[2], f2i(1.0)),    # numerator = 1.0
-      v_mov_b32_e32(v[0], s[0]),
-      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 0x80000000)  # -0.0
-
-  def test_v_div_fixup_f32_inf_div_inf(self):
-    """V_DIV_FIXUP_F32: inf / inf = NaN."""
-    import math
-    instructions = [
-      s_mov_b32(s[0], 0),           # approximation
-      s_mov_b32(s[1], 0x7f800000),  # denominator = +inf
-      s_mov_b32(s[2], 0x7f800000),  # numerator = +inf
-      v_mov_b32_e32(v[0], s[0]),
-      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertTrue(math.isnan(i2f(st.vgpr[0][1])))
-
-  def test_v_div_fixup_f32_zero_div_zero(self):
-    """V_DIV_FIXUP_F32: 0 / 0 = NaN."""
-    import math
-    instructions = [
-      s_mov_b32(s[0], 0),  # approximation
-      s_mov_b32(s[1], 0),  # denominator = 0
-      s_mov_b32(s[2], 0),  # numerator = 0
-      v_mov_b32_e32(v[0], s[0]),
-      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertTrue(math.isnan(i2f(st.vgpr[0][1])))
-
-  def test_v_div_fixup_f32_x_div_zero(self):
-    """V_DIV_FIXUP_F32: 1.0 / 0 = +inf."""
-    import math
-    instructions = [
-      s_mov_b32(s[0], 0x7f800000),  # approximation (rcp of 0 = inf)
-      s_mov_b32(s[1], 0),           # denominator = 0
-      s_mov_b32(s[2], f2i(1.0)),    # numerator = 1.0
-      v_mov_b32_e32(v[0], s[0]),
-      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][1])
-    self.assertTrue(math.isinf(result) and result > 0)
-
-  def test_v_div_fixup_f32_neg_x_div_zero(self):
-    """V_DIV_FIXUP_F32: -1.0 / 0 = -inf."""
-    import math
-    instructions = [
-      s_mov_b32(s[0], 0xff800000),  # approximation (rcp of 0 = inf, with sign)
-      s_mov_b32(s[1], 0),           # denominator = 0
-      s_mov_b32(s[2], f2i(-1.0)),   # numerator = -1.0
-      v_mov_b32_e32(v[0], s[0]),
-      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][1])
-    self.assertTrue(math.isinf(result) and result < 0)
-
-
-class TestSpecialValues(unittest.TestCase):
-  """Tests for special float values - inf, nan, zero handling."""
-
-  def test_v_mul_f32_zero_times_inf(self):
-    """V_MUL_F32: 0 * inf = NaN."""
-    import math
-    instructions = [
-      v_mov_b32_e32(v[0], 0),
-      s_mov_b32(s[0], 0x7f800000),  # +inf
-      v_mov_b32_e32(v[1], s[0]),
-      v_mul_f32_e32(v[2], v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])))
-
-  def test_v_add_f32_inf_minus_inf(self):
-    """V_ADD_F32: inf + (-inf) = NaN."""
-    import math
-    instructions = [
-      s_mov_b32(s[0], 0x7f800000),  # +inf
-      s_mov_b32(s[1], 0xff800000),  # -inf
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_add_f32_e32(v[2], v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])))
-
-  def test_v_fma_f32_with_inf(self):
-    """V_FMA_F32: 1.0 * inf + 0 = inf."""
-    import math
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),
-      s_mov_b32(s[0], 0x7f800000),  # +inf
-      v_mov_b32_e32(v[1], s[0]),
-      v_mov_b32_e32(v[2], 0),
-      v_fma_f32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][3])
-    self.assertTrue(math.isinf(result) and result > 0)
-
-  def test_v_exp_f32_large_negative(self):
-    """V_EXP_F32 of large negative value (2^-100) returns very small number."""
-    instructions = [
-      s_mov_b32(s[0], f2i(-100.0)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_exp_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # V_EXP_F32 computes 2^x, so 2^-100 is ~7.9e-31 (very small but not 0)
-    result = i2f(st.vgpr[0][1])
-    self.assertLess(result, 1e-20)  # Just verify it's very small
-
-  def test_v_exp_f32_large_positive(self):
-    """V_EXP_F32 of large positive value (2^100) returns very large number."""
-    instructions = [
-      s_mov_b32(s[0], f2i(100.0)),
-      v_mov_b32_e32(v[0], s[0]),
-      v_exp_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # V_EXP_F32 computes 2^x, so 2^100 is ~1.27e30 (very large)
-    result = i2f(st.vgpr[0][1])
-    self.assertGreater(result, 1e20)  # Just verify it's very large
-
-
-class TestF16Conversions(unittest.TestCase):
-  """Tests for f16 conversion and packing instructions."""
-
-  def test_v_cvt_f16_f32_basic(self):
-    """V_CVT_F16_F32 converts f32 to f16 in low 16 bits."""
-    from extra.assembly.amd.pcode import _f16
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),  # f32 1.0 = 0x3f800000
-      v_cvt_f16_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1]
-    # f16 1.0 = 0x3c00, should be in low 16 bits
-    lo_bits = result & 0xffff
-    self.assertEqual(lo_bits, 0x3c00, f"Expected 0x3c00, got 0x{lo_bits:04x}")
-
-  def test_v_cvt_f16_f32_negative(self):
-    """V_CVT_F16_F32 converts negative f32 to f16."""
-    from extra.assembly.amd.pcode import _f16
-    instructions = [
-      v_mov_b32_e32(v[0], -2.0),  # f32 -2.0 = 0xc0000000
-      v_cvt_f16_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1]
-    lo_bits = result & 0xffff
-    # f16 -2.0 = 0xc000
-    self.assertEqual(lo_bits, 0xc000, f"Expected 0xc000, got 0x{lo_bits:04x}")
-
-  def test_v_cvt_f16_f32_small(self):
-    """V_CVT_F16_F32 converts small f32 value."""
-    from extra.assembly.amd.pcode import _f16, f32_to_f16
-    instructions = [
-      v_mov_b32_e32(v[0], 0.5),
-      v_cvt_f16_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1]
-    lo_bits = result & 0xffff
-    expected = f32_to_f16(0.5)  # Should be 0x3800
-    self.assertEqual(lo_bits, expected, f"Expected 0x{expected:04x}, got 0x{lo_bits:04x}")
-
-  def test_v_cvt_f16_f32_preserves_high_bits(self):
-    """V_CVT_F16_F32 preserves high 16 bits of destination.
-
-    Hardware verified: V_CVT_F16_F32 only writes to the low 16 bits of the
-    destination register, preserving the high 16 bits. This is important for
-    the common pattern of converting two f32 values and packing them.
-    """
-    instructions = [
-      s_mov_b32(s[0], 0xdead0000),  # Pre-fill with garbage in high bits
-      v_mov_b32_e32(v[1], s[0]),
-      v_mov_b32_e32(v[0], 1.0),
-      v_cvt_f16_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1]
-    hi_bits = (result >> 16) & 0xffff
-    lo_bits = result & 0xffff
-    self.assertEqual(lo_bits, 0x3c00, f"Low bits should be 0x3c00, got 0x{lo_bits:04x}")
-    self.assertEqual(hi_bits, 0xdead, f"High bits should be preserved as 0xdead, got 0x{hi_bits:04x}")
-
-  def test_v_cvt_f16_f32_same_src_dst_preserves_high_bits(self):
-    """V_CVT_F16_F32 with same src/dst preserves high bits of source.
-
-    Regression test: When converting v0 in-place (v_cvt_f16_f32 v0, v0),
-    the high 16 bits of the original f32 value are preserved in the result.
-    For f32 1.0 (0x3f800000), the result should be 0x3f803c00:
-    - Low 16 bits: 0x3c00 (f16 1.0)
-    - High 16 bits: 0x3f80 (preserved from original f32)
-    """
-    instructions = [
-      v_mov_b32_e32(v[0], 1.0),      # v0 = 0x3f800000
-      v_cvt_f16_f32_e32(v[0], v[0]), # convert v0 in-place
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][0]
-    # Hardware preserves high bits: 0x3f800000 -> 0x3f803c00
-    self.assertEqual(result, 0x3f803c00, f"Expected 0x3f803c00, got 0x{result:08x}")
-
-  def test_v_cvt_f16_f32_reads_full_32bit_source(self):
-    """V_CVT_F16_F32 must read full 32-bit f32 source, not just low 16 bits.
-
-    Regression test for a bug where V_CVT_F16_F32 was incorrectly treated as having
-    a 16-bit source because '_F16' is in the instruction name. The CVT naming convention
-    is V_CVT_DST_SRC, so V_CVT_F16_F32 has a 32-bit f32 source and 16-bit f16 destination.
-
-    The bug caused the emulator to only read the low 16 bits of the source register,
-    which would produce wrong results when the significant bits of the f32 value are
-    in the upper bits (as they are for most f32 values > 1.0 or < -1.0).
-    """
-    from extra.assembly.amd.pcode import _f16
-    # Use f32 value 1.5 = 0x3fc00000. If only low 16 bits (0x0000) are read, result is wrong.
-    # Correct f16 result: 0x3e00 (1.5 in half precision)
-    instructions = [
-      s_mov_b32(s[0], 0x3fc00000),  # f32 1.5
-      v_mov_b32_e32(v[0], s[0]),
-      v_cvt_f16_f32_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1]
-    lo_bits = result & 0xffff
-    # f16(1.5) = 0x3e00
-    self.assertEqual(lo_bits, 0x3e00, f"Expected f16(1.5)=0x3e00, got 0x{lo_bits:04x} ({_f16(lo_bits)})")
-
-  def test_v_cvt_f16_f32_then_pack_for_wmma(self):
-    """Regression test: f32->f16 conversion followed by pack for WMMA input.
-
-    This sequence is used in fused fp16 GEMM kernels where f32 data is loaded,
-    converted to f16, packed into pairs, and fed to WMMA instructions.
-
-    The bug was: V_CVT_F16_F32 was treated as having 16-bit source (because '_F16'
-    is in the name), causing it to read only low 16 bits of the f32 input.
-    This resulted in WMMA receiving zero inputs and producing zero outputs.
-    """
-    from extra.assembly.amd.pcode import _f16
-    # Simulate loading two f32 values and converting/packing for WMMA
-    # f32 1.5 = 0x3fc00000, f32 2.5 = 0x40200000
-    # After CVT: f16 1.5 = 0x3e00, f16 2.5 = 0x4100
-    # After PACK: 0x41003e00 (hi=2.5, lo=1.5)
-    instructions = [
-      s_mov_b32(s[0], 0x3fc00000),  # f32 1.5
-      s_mov_b32(s[1], 0x40200000),  # f32 2.5
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_cvt_f16_f32_e32(v[2], v[0]),  # v2 = f16(1.5) = 0x3e00
-      v_cvt_f16_f32_e32(v[3], v[1]),  # v3 = f16(2.5) = 0x4100
-      v_pack_b32_f16(v[4], v[2], v[3]),  # v4 = pack(v2, v3) = 0x41003e00
-    ]
-    st = run_program(instructions, n_lanes=1)
-
-    # Check intermediate CVT results
-    v2_lo = st.vgpr[0][2] & 0xffff
-    v3_lo = st.vgpr[0][3] & 0xffff
-    self.assertEqual(v2_lo, 0x3e00, f"v2 should be f16(1.5)=0x3e00, got 0x{v2_lo:04x} ({_f16(v2_lo)})")
-    self.assertEqual(v3_lo, 0x4100, f"v3 should be f16(2.5)=0x4100, got 0x{v3_lo:04x} ({_f16(v3_lo)})")
-
-    # Check packed result
-    result = st.vgpr[0][4]
-    self.assertEqual(result, 0x41003e00, f"Expected packed 0x41003e00, got 0x{result:08x}")
-
-  def test_v_pack_b32_f16_basic(self):
-    """V_PACK_B32_F16 packs two f16 values into one 32-bit register."""
-    from extra.assembly.amd.pcode import _f16
-    instructions = [
-      # First convert two f32 values to f16
-      v_mov_b32_e32(v[0], 1.0),   # Will become f16 0x3c00
-      v_mov_b32_e32(v[2], -2.0),  # Will become f16 0xc000
-      v_cvt_f16_f32_e32(v[1], v[0]),  # v1 low = 0x3c00
-      v_cvt_f16_f32_e32(v[3], v[2]),  # v3 low = 0xc000
-      # Now pack them: v4 = (v3.f16 << 16) | v1.f16
-      v_pack_b32_f16(v[4], v[1], v[3]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][4]
-    lo_bits = result & 0xffff
-    hi_bits = (result >> 16) & 0xffff
-    # Expected: lo=0x3c00 (1.0), hi=0xc000 (-2.0)
-    self.assertEqual(lo_bits, 0x3c00, f"Lo should be 0x3c00 (1.0), got 0x{lo_bits:04x} ({_f16(lo_bits)})")
-    self.assertEqual(hi_bits, 0xc000, f"Hi should be 0xc000 (-2.0), got 0x{hi_bits:04x} ({_f16(hi_bits)})")
-
-  def test_v_pack_b32_f16_both_positive(self):
-    """V_PACK_B32_F16 packs two positive f16 values."""
-    from extra.assembly.amd.pcode import _f16
-    instructions = [
-      v_mov_b32_e32(v[0], 0.5),   # f16 0x3800
-      v_mov_b32_e32(v[2], 2.0),   # f16 0x4000
-      v_cvt_f16_f32_e32(v[1], v[0]),
-      v_cvt_f16_f32_e32(v[3], v[2]),
-      v_pack_b32_f16(v[4], v[1], v[3]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][4]
-    lo_bits = result & 0xffff
-    hi_bits = (result >> 16) & 0xffff
-    self.assertEqual(lo_bits, 0x3800, f"Lo should be 0x3800 (0.5), got 0x{lo_bits:04x}")
-    self.assertEqual(hi_bits, 0x4000, f"Hi should be 0x4000 (2.0), got 0x{hi_bits:04x}")
-
-  def test_v_pack_b32_f16_zeros(self):
-    """V_PACK_B32_F16 packs two zero values."""
-    instructions = [
-      v_mov_b32_e32(v[0], 0),
-      v_mov_b32_e32(v[2], 0),
-      v_cvt_f16_f32_e32(v[1], v[0]),
-      v_cvt_f16_f32_e32(v[3], v[2]),
-      v_pack_b32_f16(v[4], v[1], v[3]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][4]
-    self.assertEqual(result, 0, f"Expected 0x00000000, got 0x{result:08x}")
-
-
-class TestPackInstructions(unittest.TestCase):
-  """Tests for pack instructions."""
-
-  def test_v_pack_b32_f16(self):
-    """V_PACK_B32_F16 packs two f16 values into one 32-bit register."""
-    instructions = []
-    # f16 1.0 = 0x3c00, f16 2.0 = 0x4000
-    instructions.append(s_mov_b32(s[0], 0x3c00))  # f16 1.0
-    instructions.append(s_mov_b32(s[1], 0x4000))  # f16 2.0
-    instructions.append(v_mov_b32_e32(v[0], s[0]))
-    instructions.append(v_mov_b32_e32(v[1], s[1]))
-    # Pack: v[2] = (v[1].f16 << 16) | v[0].f16
-    instructions.append(v_pack_b32_f16(v[2], v[0], v[1]))
-
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2]
-    # Expected: hi=0x4000 (2.0), lo=0x3c00 (1.0) -> 0x40003c00
-    self.assertEqual(result, 0x40003c00, f"Expected 0x40003c00, got 0x{result:08x}")
-
-  def test_v_pack_b32_f16_with_cvt(self):
-    """V_PACK_B32_F16 after V_CVT_F16_F32 conversions."""
-    instructions = []
-    # f32 1.0 = 0x3f800000
-    instructions.append(s_mov_b32(s[0], 0x3f800000))
-    instructions.append(v_mov_b32_e32(v[0], s[0]))  # f32 1.0
-    instructions.append(v_mov_b32_e32(v[1], s[0]))  # f32 1.0
-    # Convert to f16
-    instructions.append(v_cvt_f16_f32_e32(v[2], v[0]))  # v[2].f16 = 1.0
-    instructions.append(v_cvt_f16_f32_e32(v[3], v[1]))  # v[3].f16 = 1.0
-    # Pack
-    instructions.append(v_pack_b32_f16(v[4], v[2], v[3]))
-
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][4]
-    # Expected: 0x3c003c00 (two f16 1.0 values)
-    self.assertEqual(result, 0x3c003c00, f"Expected 0x3c003c00, got 0x{result:08x}")
-
-  def test_v_pack_b32_f16_packed_sources(self):
-    """V_PACK_B32_F16 with sources that have packed f16 pairs (both hi and lo used).
-    This mimics what happens in matmul kernels where VGPRs contain packed f16 data.
-    """
-    instructions = []
-    # v0 = 0x40003c00 (hi=f16 2.0, lo=f16 1.0)
-    # v1 = 0x44004200 (hi=f16 4.0, lo=f16 3.0)
-    # V_PACK_B32_F16 with default opsel=0 reads low halves from each source
-    # Result should be: hi=v1.lo=0x4200 (3.0), lo=v0.lo=0x3c00 (1.0) -> 0x42003c00
-    instructions.append(s_mov_b32(s[0], 0x40003c00))  # packed: hi=2.0, lo=1.0
-    instructions.append(s_mov_b32(s[1], 0x44004200))  # packed: hi=4.0, lo=3.0
-    instructions.append(v_mov_b32_e32(v[0], s[0]))
-    instructions.append(v_mov_b32_e32(v[1], s[1]))
-    instructions.append(v_pack_b32_f16(v[2], v[0], v[1]))
-
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2]
-    # Expected: hi=0x4200 (3.0), lo=0x3c00 (1.0) -> 0x42003c00
-    self.assertEqual(result, 0x42003c00, f"Expected 0x42003c00, got 0x{result:08x}")
-
-  def test_v_pack_b32_f16_opsel_hi_hi(self):
-    """V_PACK_B32_F16 with opsel=0b0011 to read high halves from both sources.
-    This is used when extracting the high f16 values from packed registers.
-    """
-    # v0 = 0x40003c00 (hi=f16 2.0, lo=f16 1.0)
-    # v1 = 0x44004200 (hi=f16 4.0, lo=f16 3.0)
-    # With opsel=0b0011: read hi from v0 (0x4000=2.0) and hi from v1 (0x4400=4.0)
-    # Result should be: hi=v1.hi=0x4400 (4.0), lo=v0.hi=0x4000 (2.0) -> 0x44004000
-    inst = v_pack_b32_f16(v[2], v[0], v[1])
-    inst._values['opsel'] = 0b0011  # opsel[0]=1 for src0 hi, opsel[1]=1 for src1 hi
-
-    instructions = [
-      s_mov_b32(s[0], 0x40003c00),  # packed: hi=2.0, lo=1.0
-      s_mov_b32(s[1], 0x44004200),  # packed: hi=4.0, lo=3.0
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      inst,
-    ]
-
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2]
-    # Expected: hi=0x4400 (4.0), lo=0x4000 (2.0) -> 0x44004000
-    self.assertEqual(result, 0x44004000, f"Expected 0x44004000, got 0x{result:08x}")
-
-  def test_v_pack_b32_f16_opsel_lo_hi(self):
-    """V_PACK_B32_F16 with opsel=0b0010 to read lo from src0, hi from src1."""
-    # v0 = 0x40003c00 (hi=f16 2.0, lo=f16 1.0)
-    # v1 = 0x44004200 (hi=f16 4.0, lo=f16 3.0)
-    # With opsel=0b0010: read lo from v0 (0x3c00=1.0), hi from v1 (0x4400=4.0)
-    # Result should be: hi=v1.hi=0x4400 (4.0), lo=v0.lo=0x3c00 (1.0) -> 0x44003c00
-    inst = v_pack_b32_f16(v[2], v[0], v[1])
-    inst._values['opsel'] = 0b0010  # opsel[0]=0 for src0 lo, opsel[1]=1 for src1 hi
-
-    instructions = [
-      s_mov_b32(s[0], 0x40003c00),
-      s_mov_b32(s[1], 0x44004200),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      inst,
-    ]
-
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2]
-    # Expected: hi=0x4400 (4.0), lo=0x3c00 (1.0) -> 0x44003c00
-    self.assertEqual(result, 0x44003c00, f"Expected 0x44003c00, got 0x{result:08x}")
-
-  def test_v_pack_b32_f16_opsel_hi_lo(self):
-    """V_PACK_B32_F16 with opsel=0b0001 to read hi from src0, lo from src1."""
-    # v0 = 0x40003c00 (hi=f16 2.0, lo=f16 1.0)
-    # v1 = 0x44004200 (hi=f16 4.0, lo=f16 3.0)
-    # With opsel=0b0001: read hi from v0 (0x4000=2.0), lo from v1 (0x4200=3.0)
-    # Result should be: hi=v1.lo=0x4200 (3.0), lo=v0.hi=0x4000 (2.0) -> 0x42004000
-    inst = v_pack_b32_f16(v[2], v[0], v[1])
-    inst._values['opsel'] = 0b0001  # opsel[0]=1 for src0 hi, opsel[1]=0 for src1 lo
-
-    instructions = [
-      s_mov_b32(s[0], 0x40003c00),
-      s_mov_b32(s[1], 0x44004200),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      inst,
-    ]
-
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2]
-    # Expected: hi=0x4200 (3.0), lo=0x4000 (2.0) -> 0x42004000
-    self.assertEqual(result, 0x42004000, f"Expected 0x42004000, got 0x{result:08x}")
-
-
-class TestWMMA(unittest.TestCase):
-  """Tests for WMMA (Wave Matrix Multiply-Accumulate) instructions."""
-
-  def test_v_wmma_f32_16x16x16_f16_basic(self):
-    """V_WMMA_F32_16X16X16_F16 basic test - verify emulator matches hardware."""
-    # WMMA does D = A @ B + C where A,B are 16x16 f16, C,D are 16x16 f32
-    # Use: A=v[16:23], B=v[24:31], C=D=v[0:7] (output in captured range v[0:15])
-    instructions = []
-
-    # f16 1.0 = 0x3c00, packed pair = 0x3c003c00
-    instructions.append(s_mov_b32(s[0], 0x3c003c00))
-
-    # Set A (v16-v23) and B (v24-v31) to all 1.0s
-    for i in range(16, 32):
-      instructions.append(v_mov_b32_e32(v[i], s[0]))
-
-    # Set C (v0-v7) to all 0s (will also be output D)
-    for i in range(8):
-      instructions.append(v_mov_b32_e32(v[i], 0))
-
-    # Execute WMMA: v[0:7] = A @ B + C
-    instructions.append(v_wmma_f32_16x16x16_f16(v[0], v[16], v[24], v[0]))
-
-    # Just run and compare - USE_HW=1 will verify emulator matches hardware
-    st = run_program(instructions, n_lanes=32)
-
-    # Verify at least some output is non-zero (actual values depend on WMMA layout)
-    # Output should be 16.0 (16 x 1.0 x 1.0) for each element
-    any_nonzero = any(st.vgpr[lane][0] != 0 for lane in range(32))
-    self.assertTrue(any_nonzero, "WMMA should produce non-zero output")
-
-  def test_v_wmma_f32_16x16x16_f16_all_ones(self):
-    """V_WMMA_F32_16X16X16_F16 with all ones should produce 16.0 for each output element.
-    This verifies the matrix multiply is computing the correct sum.
-    """
-    instructions = []
-
-    # f16 1.0 = 0x3c00, packed pair = 0x3c003c00
-    instructions.append(s_mov_b32(s[0], 0x3c003c00))
-
-    # Set A (v16-v23) and B (v24-v31) to all 1.0s
-    for i in range(16, 32):
-      instructions.append(v_mov_b32_e32(v[i], s[0]))
-
-    # Set C (v0-v7) to all 0s (will also be output D)
-    for i in range(8):
-      instructions.append(v_mov_b32_e32(v[i], 0))
-
-    # Execute WMMA: v[0:7] = A @ B + C
-    instructions.append(v_wmma_f32_16x16x16_f16(v[0], v[16], v[24], v[0]))
-
-    st = run_program(instructions, n_lanes=32)
-
-    # All output elements should be 16.0 (sum of 16 * 1.0 * 1.0)
-    expected = f2i(16.0)
-    for lane in range(32):
-      for reg in range(8):
-        result = st.vgpr[lane][reg]
-        self.assertEqual(result, expected, f"v[{reg}] lane {lane}: expected 0x{expected:08x} (16.0), got 0x{result:08x} ({i2f(result)})")
-
-  def test_v_wmma_f32_16x16x16_f16_with_accumulator(self):
-    """V_WMMA_F32_16X16X16_F16 with non-zero accumulator.
-    Verifies that C matrix is properly added to the product.
-    """
-    instructions = []
-
-    # f16 1.0 = 0x3c00, packed pair = 0x3c003c00
-    instructions.append(s_mov_b32(s[0], 0x3c003c00))
-    # f32 5.0 = 0x40a00000
-    instructions.append(s_mov_b32(s[1], f2i(5.0)))
-
-    # Set A (v16-v23) and B (v24-v31) to all 1.0s
-    for i in range(16, 32):
-      instructions.append(v_mov_b32_e32(v[i], s[0]))
-
-    # Set C (v0-v7) to all 5.0s
-    for i in range(8):
-      instructions.append(v_mov_b32_e32(v[i], s[1]))
-
-    # Execute WMMA: v[0:7] = A @ B + C = 16.0 + 5.0 = 21.0
-    instructions.append(v_wmma_f32_16x16x16_f16(v[0], v[16], v[24], v[0]))
-
-    st = run_program(instructions, n_lanes=32)
-
-    # All output elements should be 21.0 (16.0 + 5.0)
-    expected = f2i(21.0)
-    for lane in range(32):
-      for reg in range(8):
-        result = st.vgpr[lane][reg]
-        self.assertEqual(result, expected, f"v[{reg}] lane {lane}: expected 0x{expected:08x} (21.0), got 0x{result:08x} ({i2f(result)})")
-
-
-class TestVOP3P(unittest.TestCase):
-  """Tests for VOP3P packed 16-bit operations."""
-
-  def test_v_pk_add_f16_basic(self):
-    """V_PK_ADD_F16 adds two packed f16 values."""
-    from extra.assembly.amd.pcode import _f16
-    # v0 = packed (1.0, 2.0), v1 = packed (3.0, 4.0)
-    # Result should be packed (4.0, 6.0)
-    instructions = [
-      s_mov_b32(s[0], 0x40003c00),  # packed f16: hi=2.0, lo=1.0
-      s_mov_b32(s[1], 0x44004200),  # packed f16: hi=4.0, lo=3.0
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_pk_add_f16(v[2], v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2]
-    # Expected: lo=1.0+3.0=4.0 (0x4400), hi=2.0+4.0=6.0 (0x4600) -> 0x46004400
-    lo = _f16(result & 0xffff)
-    hi = _f16((result >> 16) & 0xffff)
-    self.assertAlmostEqual(lo, 4.0, places=2, msg=f"lo: expected 4.0, got {lo}")
-    self.assertAlmostEqual(hi, 6.0, places=2, msg=f"hi: expected 6.0, got {hi}")
-
-  def test_v_pk_add_f16_with_inline_constant(self):
-    """V_PK_ADD_F16 with inline constant POS_ONE (1.0).
-    Inline constants for VOP3P are f16 values in the low 16 bits only.
-    The opsel_hi bits (default=0b11) select lo half for hi result, so both halves use the constant.
-    """
-    from extra.assembly.amd.pcode import _f16
-    # v0 = packed (1.0, 1.0), add POS_ONE
-    # With default opsel_hi=0b11: both lo and hi results use lo half of src1 (the constant)
-    # But opsel_hi=1 means src1 hi comes from lo half - wait, let me check the actual encoding
-    # Default opsel_hi=3 means: bit0=1 (src0 hi from hi), bit1=1 (src1 hi from hi)
-    # Since inline constant has 0 in hi half, hi result = v0.hi + 0 = 1.0
-    instructions = [
-      s_mov_b32(s[0], 0x3c003c00),  # packed f16: hi=1.0, lo=1.0
-      v_mov_b32_e32(v[0], s[0]),
-      v_pk_add_f16(v[1], v[0], SrcEnum.POS_ONE),  # Add inline constant 1.0
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1]
-    lo = _f16(result & 0xffff)
-    hi = _f16((result >> 16) & 0xffff)
-    # lo = 1.0 + 1.0 = 2.0, hi = 1.0 + 0.0 = 1.0 (inline const hi half is 0)
-    self.assertAlmostEqual(lo, 2.0, places=2, msg=f"lo: expected 2.0, got {lo} (result=0x{result:08x})")
-    self.assertAlmostEqual(hi, 1.0, places=2, msg=f"hi: expected 1.0, got {hi} (result=0x{result:08x})")
-
-  def test_v_pk_mul_f16_basic(self):
-    """V_PK_MUL_F16 multiplies two packed f16 values."""
-    from extra.assembly.amd.pcode import _f16
-    # v0 = packed (2.0, 3.0), v1 = packed (4.0, 5.0)
-    # Result should be packed (8.0, 15.0)
-    instructions = [
-      s_mov_b32(s[0], 0x42004000),  # packed f16: hi=3.0, lo=2.0
-      s_mov_b32(s[1], 0x45004400),  # packed f16: hi=5.0, lo=4.0
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_pk_mul_f16(v[2], v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2]
-    lo = _f16(result & 0xffff)
-    hi = _f16((result >> 16) & 0xffff)
-    self.assertAlmostEqual(lo, 8.0, places=1, msg=f"lo: expected 8.0, got {lo}")
-    self.assertAlmostEqual(hi, 15.0, places=1, msg=f"hi: expected 15.0, got {hi}")
-
-  def test_v_pk_mul_f16_with_inline_constant(self):
-    """V_PK_MUL_F16 with inline constant POS_TWO (2.0).
-    Inline constant has value only in low 16 bits, hi is 0.
-    """
-    from extra.assembly.amd.pcode import _f16
-    # v0 = packed (3.0, 4.0), multiply by POS_TWO
-    # lo = 3.0 * 2.0 = 6.0, hi = 4.0 * 0.0 = 0.0 (inline const hi is 0)
-    instructions = [
-      s_mov_b32(s[0], 0x44004200),  # packed f16: hi=4.0, lo=3.0
-      v_mov_b32_e32(v[0], s[0]),
-      v_pk_mul_f16(v[1], v[0], SrcEnum.POS_TWO),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1]
-    lo = _f16(result & 0xffff)
-    hi = _f16((result >> 16) & 0xffff)
-    self.assertAlmostEqual(lo, 6.0, places=1, msg=f"lo: expected 6.0, got {lo}")
-    self.assertAlmostEqual(hi, 0.0, places=1, msg=f"hi: expected 0.0, got {hi}")
-
-  def test_v_pk_fma_f16_basic(self):
-    """V_PK_FMA_F16: D = A * B + C for packed f16."""
-    from extra.assembly.amd.pcode import _f16
-    # A = packed (2.0, 3.0), B = packed (4.0, 5.0), C = packed (1.0, 1.0)
-    # Result should be packed (2*4+1=9.0, 3*5+1=16.0)
-    instructions = [
-      s_mov_b32(s[0], 0x42004000),  # A: hi=3.0, lo=2.0
-      s_mov_b32(s[1], 0x45004400),  # B: hi=5.0, lo=4.0
-      s_mov_b32(s[2], 0x3c003c00),  # C: hi=1.0, lo=1.0
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_mov_b32_e32(v[2], s[2]),
-      v_pk_fma_f16(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][3]
-    lo = _f16(result & 0xffff)
-    hi = _f16((result >> 16) & 0xffff)
-    self.assertAlmostEqual(lo, 9.0, places=1, msg=f"lo: expected 9.0, got {lo}")
-    self.assertAlmostEqual(hi, 16.0, places=0, msg=f"hi: expected 16.0, got {hi}")
-
-
-class TestVFmaMix(unittest.TestCase):
-  """Tests for V_FMA_MIX_F32/F16 mixed-precision FMA instructions.
-
-  These instructions are critical for OCML sin/cos implementations.
-  opsel_hi[i] controls whether source i is f32 (0) or f16 from hi bits (1)
-  opsel[i] selects which half (lo=0, hi=1) when source is f16
-  """
-
-  def test_v_fma_mix_f32_all_f32(self):
-    """V_FMA_MIX_F32 with all f32 sources."""
-    instructions = [
-      s_mov_b32(s[0], f2i(2.0)),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], f2i(3.0)),
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[2], f2i(1.0)),
-      v_mov_b32_e32(v[2], s[2]),
-      # opsel_hi=0, opsel_hi2=0 means all sources are f32
-      VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][3])
-    self.assertAlmostEqual(result, 7.0, places=5, msg=f"2*3+1=7, got {result}")
-
-  def test_v_fma_mix_f32_src2_f16_lo(self):
-    """V_FMA_MIX_F32 with src2 as f16 from lo bits."""
-    from extra.assembly.amd.pcode import f32_to_f16
-    f16_2 = f32_to_f16(2.0)  # 0x4000
-    instructions = [
-      s_mov_b32(s[0], f2i(1.0)),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], f2i(3.0)),
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[2], f16_2),  # f16 2.0 in lo bits, 0 in hi bits
-      v_mov_b32_e32(v[2], s[2]),
-      # opsel_hi2=1 means src2 is f16, opsel[2]=0 means use lo half
-      VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=1),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][3])
-    self.assertAlmostEqual(result, 5.0, places=5, msg=f"1*3+2=5, got {result}")
-
-  def test_v_fma_mix_f32_src2_f16_hi(self):
-    """V_FMA_MIX_F32 with src2 as f16 from hi bits."""
-    from extra.assembly.amd.pcode import f32_to_f16
-    f16_2 = f32_to_f16(2.0)  # 0x4000
-    val = (f16_2 << 16) | 0  # hi = f16 2.0, lo = 0
-    instructions = [
-      s_mov_b32(s[0], f2i(1.0)),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], f2i(3.0)),
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[2], val),
-      v_mov_b32_e32(v[2], s[2]),
-      # opsel_hi2=1 means src2 is f16, opsel[2]=1 (bit 2 set, opsel=4) means use hi half
-      VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=4, opsel_hi=0, opsel_hi2=1),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][3])
-    self.assertAlmostEqual(result, 5.0, places=5, msg=f"1*3+2=5, got {result}")
-
-  def test_v_fma_mix_f32_with_abs(self):
-    """V_FMA_MIX_F32 with abs modifier on src2."""
-    instructions = [
-      s_mov_b32(s[0], f2i(2.0)),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], f2i(3.0)),
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[2], f2i(-1.0)),  # -1.0
-      v_mov_b32_e32(v[2], s[2]),
-      # neg_hi field is used for abs in V_FMA_MIX, abs bit 2 (0b100) for |src2|
-      VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0, neg_hi=4),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][3])
-    self.assertAlmostEqual(result, 7.0, places=5, msg=f"2*3+|-1|=7, got {result}")
-
-  def test_v_fma_mixlo_f16(self):
-    """V_FMA_MIXLO_F16 writes to low 16 bits of destination."""
-    from extra.assembly.amd.pcode import _f16
-    instructions = [
-      s_mov_b32(s[0], f2i(2.0)),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], f2i(3.0)),
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[2], f2i(1.0)),
-      v_mov_b32_e32(v[2], s[2]),
-      s_mov_b32(s[3], 0xdead0000),  # garbage in hi bits
-      v_mov_b32_e32(v[3], s[3]),
-      VOP3P(VOP3POp.V_FMA_MIXLO_F16, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    lo = _f16(st.vgpr[0][3] & 0xffff)
-    hi = (st.vgpr[0][3] >> 16) & 0xffff
-    self.assertAlmostEqual(lo, 7.0, places=1, msg=f"lo: 2*3+1=7, got {lo}")
-    self.assertEqual(hi, 0xdead, f"hi should be preserved, got 0x{hi:04x}")
-
-
-class TestF64Conversions(unittest.TestCase):
-  """Tests for 64-bit float operations and conversions."""
-
-  def test_v_add_f64_inline_constant(self):
-    """V_ADD_F64 with inline constant POS_ONE (1.0) as f64."""
-    one_f64 = f2i64(1.0)
-    instructions = [
-      s_mov_b32(s[0], one_f64 & 0xffffffff),
-      s_mov_b32(s[1], one_f64 >> 32),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_add_f64(v[2:4], v[0:2], SrcEnum.POS_ONE),  # 1.0 + 1.0 = 2.0
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i642f(st.vgpr[0][2] | (st.vgpr[0][3] << 32))
-    self.assertAlmostEqual(result, 2.0, places=5)
-
-  def test_v_ldexp_f64_negative_exponent(self):
-    """V_LDEXP_F64 with negative exponent (-32)."""
-    val = -8.0
-    val_bits = f2i64(val)
-    expected = -8.0 * (2.0 ** -32)  # -1.862645149230957e-09
-    instructions = [
-      s_mov_b32(s[0], val_bits & 0xffffffff),
-      s_mov_b32(s[1], val_bits >> 32),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_ldexp_f64(v[2:4], v[0:2], 0xffffffe0),  # -32
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i642f(st.vgpr[0][2] | (st.vgpr[0][3] << 32))
-    self.assertAlmostEqual(result, expected, places=15)
-
-  def test_f64_to_i64_conversion_sequence(self):
-    """Test the f64->i64 conversion sequence used by the compiler.
-
-    The compiler generates:
-      v_trunc_f64 -> v_ldexp_f64 (by -32) -> v_floor_f64 -> v_fma_f64 (by -2^32)
-      -> v_cvt_u32_f64 (low bits) -> v_cvt_i32_f64 (high bits)
-
-    The FMA computes: trunc + (-2^32) * floor = trunc - floor * 2^32
-    which gives the low 32 bits as a positive float (for proper u32 conversion).
-    """
-    val = -8.0
-    val_bits = f2i64(val)
-    lit = -4294967296.0  # -2^32 (note: NEGATIVE, so FMA does trunc - floor * 2^32)
-    lit_bits = f2i64(lit)
-
-    instructions = [
-      s_mov_b32(s[0], val_bits & 0xffffffff),
-      s_mov_b32(s[1], val_bits >> 32),
-      v_trunc_f64_e32(v[0:2], s[0:2]),
-      v_ldexp_f64(v[2:4], v[0:2], 0xffffffe0),  # -32
-      v_floor_f64_e32(v[2:4], v[2:4]),
-      s_mov_b32(s[2], lit_bits & 0xffffffff),
-      s_mov_b32(s[3], lit_bits >> 32),
-      v_fma_f64(v[0:2], s[2:4], v[2:4], v[0:2]),
-      v_cvt_u32_f64_e32(v[4], v[0:2]),
-      v_cvt_i32_f64_e32(v[5], v[2:4]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # v4 = low 32 bits, v5 = high 32 bits (sign extended)
-    lo = st.vgpr[0][4]
-    hi = st.vgpr[0][5]
-    # For -8: lo should be 0xfffffff8, hi should be 0xffffffff
-    result = struct.unpack('<q', struct.pack('<II', lo, hi))[0]
-    self.assertEqual(result, -8, f"Expected -8, got {result} (lo=0x{lo:08x}, hi=0x{hi:08x})")
-
-  def test_v_cvt_i32_f64_writes_32bit_only(self):
-    """V_CVT_I32_F64 should only write 32 bits, not 64.
-
-    Regression test: V_CVT_I32_F64 has a 64-bit source (f64) but 32-bit destination (i32).
-    The emulator was incorrectly writing 64 bits (clobbering vdst+1) because
-    is_64bit_op was True for any op ending in '_F64'.
-    """
-    # Pre-fill v3 with a canary value that should NOT be clobbered
-    val_bits = f2i64(-1.0)
-    instructions = [
-      s_mov_b32(s[0], val_bits & 0xffffffff),
-      s_mov_b32(s[1], val_bits >> 32),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[2], 0xDEADBEEF),  # Canary value
-      v_mov_b32_e32(v[3], s[2]),    # Put canary in v3
-      v_cvt_i32_f64_e32(v[2], v[0:2]),  # Convert -1.0 -> -1 (0xffffffff)
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2]
-    canary = st.vgpr[0][3]
-    # V_CVT_I32_F64 of -1.0 should produce 0xffffffff (-1)
-    self.assertEqual(result, 0xffffffff, f"Expected 0xffffffff (-1), got 0x{result:08x}")
-    # v3 should still contain the canary (not clobbered by 64-bit write)
-    self.assertEqual(canary, 0xDEADBEEF, f"v3 canary should be 0xDEADBEEF, got 0x{canary:08x} (clobbered!)")
-
-  def test_v_frexp_mant_f64_range(self):
-    """V_FREXP_MANT_F64 should return mantissa in [0.5, 1.0) range.
-
-    Regression test: The mantissa() helper was incorrectly multiplying by 2.0,
-    returning values in [1.0, 2.0) instead of the correct [0.5, 1.0) range.
-    """
-    # Test with 2.0: frexp(2.0) should give mantissa=0.5, exponent=2
-    two_f64 = f2i64(2.0)
-    instructions = [
-      s_mov_b32(s[0], two_f64 & 0xffffffff),
-      s_mov_b32(s[1], two_f64 >> 32),
-      v_frexp_mant_f64_e32(v[0:2], s[0:2]),
-      v_frexp_exp_i32_f64_e32(v[2], s[0:2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    mant = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
-    exp = st.vgpr[0][2]
-    if exp >= 0x80000000: exp -= 0x100000000  # sign extend
-    # frexp(2.0) = 0.5 * 2^2
-    self.assertAlmostEqual(mant, 0.5, places=10, msg=f"Expected mantissa 0.5, got {mant}")
-    self.assertEqual(exp, 2, f"Expected exponent 2, got {exp}")
-
-  def test_v_div_scale_f64_reads_64bit_sources(self):
-    """V_DIV_SCALE_F64 must read all sources as 64-bit values.
-
-    Regression test: VOP3SD was reading sources as 32-bit for V_DIV_SCALE_F64,
-    causing incorrect results when the low 32 bits happened to look like 0 or denorm.
-    """
-    # Set up v0:v1 = sqrt(2) ≈ 1.414, v2:v3 = 1.0
-    sqrt2_f64 = f2i64(1.4142135623730951)
-    one_f64 = f2i64(1.0)
-    instructions = [
-      s_mov_b32(s[0], sqrt2_f64 & 0xffffffff),
-      s_mov_b32(s[1], sqrt2_f64 >> 32),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[2], one_f64 & 0xffffffff),
-      s_mov_b32(s[3], one_f64 >> 32),
-      v_mov_b32_e32(v[2], s[2]),
-      v_mov_b32_e32(v[3], s[3]),
-      # V_DIV_SCALE_F64: src0=v0:v1, src1=v0:v1, src2=v2:v3
-      # For normal inputs, should pass through src0 unchanged
-      VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4], sdst=s[10], src0=v[0], src1=v[0], src2=v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32))
-    # For normal (non-denorm, non-edge-case) inputs, V_DIV_SCALE_F64 passes through src0
-    self.assertAlmostEqual(result, 1.4142135623730951, places=10,
-                           msg=f"Expected ~1.414, got {result} (may be nan if 64-bit sources not read correctly)")
-
-
-class TestNewPcodeHelpers(unittest.TestCase):
-  """Tests for newly added pcode helper functions (SAD, BYTE_PERMUTE, BF16)."""
-
-  def test_v_sad_u8_basic(self):
-    """V_SAD_U8: Sum of absolute differences of 4 bytes."""
-    # s0 = 0x05040302, s1 = 0x04030201, s2 = 10 -> diff = 1+1+1+1 = 4, result = 14
-    instructions = [
-      s_mov_b32(s[0], 0x05040302),
-      s_mov_b32(s[1], 0x04030201),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_mov_b32_e32(v[2], 10),
-      v_sad_u8(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][3]
-    self.assertEqual(result, 14, f"Expected 14, got {result}")
-
-  def test_v_sad_u8_identical_bytes(self):
-    """V_SAD_U8: When both operands are identical, SAD = 0 + accumulator."""
-    instructions = [
-      s_mov_b32(s[0], 0xDEADBEEF),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[0]),  # Same as v0
-      v_mov_b32_e32(v[2], 42),    # Accumulator
-      v_sad_u8(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][3]
-    self.assertEqual(result, 42, f"Expected 42, got {result}")
-
-  def test_v_sad_u16_basic(self):
-    """V_SAD_U16: Sum of absolute differences of 2 half-words."""
-    # s0 = 0x00020003, s1 = 0x00010001 -> diff = |2-1| + |3-1| = 1 + 2 = 3
-    instructions = [
-      s_mov_b32(s[0], 0x00020003),
-      s_mov_b32(s[1], 0x00010001),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_mov_b32_e32(v[2], 0),
-      v_sad_u16(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][3]
-    self.assertEqual(result, 3, f"Expected 3, got {result}")
-
-  def test_v_sad_u32_basic(self):
-    """V_SAD_U32: Absolute difference of 32-bit values."""
-    # s0 = 100, s1 = 30 -> diff = 70, s2 = 5 -> result = 75
-    instructions = [
-      v_mov_b32_e32(v[0], 100),
-      v_mov_b32_e32(v[1], 30),
-      v_mov_b32_e32(v[2], 5),
-      v_sad_u32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][3]
-    self.assertEqual(result, 75, f"Expected 75, got {result}")
-
-  def test_v_msad_u8_masked(self):
-    """V_MSAD_U8: Skip bytes where reference (s1) is 0."""
-    # s0 = 0x10101010, s1 = 0x00010001, s2 = 0
-    # Only bytes 0 and 2 of s1 are non-zero, so only those contribute
-    # diff = |0x10-0x01| + |0x10-0x01| = 15 + 15 = 30
-    instructions = [
-      s_mov_b32(s[0], 0x10101010),
-      s_mov_b32(s[1], 0x00010001),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_mov_b32_e32(v[2], 0),
-      v_msad_u8(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][3]
-    self.assertEqual(result, 30, f"Expected 30, got {result}")
-
-  def test_v_perm_b32_select_bytes(self):
-    """V_PERM_B32: Select bytes from combined {s0, s1}."""
-    # Combined = {S0, S1} where S1 is bytes 0-3, S0 is bytes 4-7
-    # s0 = 0x03020100 -> bytes 4-7 of combined
-    # s1 = 0x07060504 -> bytes 0-3 of combined
-    # Combined = 0x03020100_07060504
-    # selector = 0x00010203 -> select bytes 3,2,1,0 from combined = 0x04,0x05,0x06,0x07
-    instructions = [
-      s_mov_b32(s[0], 0x03020100),
-      s_mov_b32(s[1], 0x07060504),
-      s_mov_b32(s[2], 0x00010203),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_mov_b32_e32(v[2], s[2]),
-      v_perm_b32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][3]
-    self.assertEqual(result, 0x04050607, f"Expected 0x04050607, got 0x{result:08x}")
-
-  def test_v_perm_b32_select_high_bytes(self):
-    """V_PERM_B32: Select bytes from high word (s0)."""
-    # Combined = {S0, S1} where S1 is bytes 0-3, S0 is bytes 4-7
-    # s0 = 0x03020100 -> bytes 4-7 of combined
-    # s1 = 0x07060504 -> bytes 0-3 of combined
-    # selector = 0x04050607 -> select bytes 7,6,5,4 from combined = 0x00,0x01,0x02,0x03
-    instructions = [
-      s_mov_b32(s[0], 0x03020100),
-      s_mov_b32(s[1], 0x07060504),
-      s_mov_b32(s[2], 0x04050607),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_mov_b32_e32(v[2], s[2]),
-      v_perm_b32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][3]
-    self.assertEqual(result, 0x00010203, f"Expected 0x00010203, got 0x{result:08x}")
-
-  def test_v_perm_b32_constant_values(self):
-    """V_PERM_B32: Test constant 0x00 (sel=12) and 0xFF (sel>=13)."""
-    # selector = 0x0C0D0E0F -> bytes: 12=0x00, 13=0xFF, 14=0xFF, 15=0xFF
-    instructions = [
-      s_mov_b32(s[0], 0x12345678),
-      s_mov_b32(s[1], 0xABCDEF01),
-      s_mov_b32(s[2], 0x0C0D0E0F),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_mov_b32_e32(v[2], s[2]),
-      v_perm_b32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][3]
-    # byte 0: sel=0x0F >= 13 -> 0xFF
-    # byte 1: sel=0x0E >= 13 -> 0xFF
-    # byte 2: sel=0x0D >= 13 -> 0xFF
-    # byte 3: sel=0x0C = 12 -> 0x00
-    self.assertEqual(result, 0x00FFFFFF, f"Expected 0x00FFFFFF, got 0x{result:08x}")
-
-  def test_v_perm_b32_sign_extend(self):
-    """V_PERM_B32: Test sign extension selectors 8-11."""
-    # Combined = {S0, S1} where S1 is bytes 0-3, S0 is bytes 4-7
-    # s0 = 0x00008000 -> byte 5 (0x80) has sign bit set
-    # s1 = 0x80000080 -> bytes 1 (0x00) and 3 (0x80) have sign bits, byte 0 (0x80) has sign bit
-    # Combined = 0x00008000_80000080
-    # selector = 0x08090A0B -> sign of bytes 1,3,5,7
-    # byte 0: sel=0x0B -> sign of byte 7 (0x00) -> 0x00
-    # byte 1: sel=0x0A -> sign of byte 5 (0x80) -> 0xFF
-    # byte 2: sel=0x09 -> sign of byte 3 (0x80) -> 0xFF
-    # byte 3: sel=0x08 -> sign of byte 1 (0x00) -> 0x00
-    instructions = [
-      s_mov_b32(s[0], 0x00008000),
-      s_mov_b32(s[1], 0x80000080),
-      s_mov_b32(s[2], 0x08090A0B),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_mov_b32_e32(v[2], s[2]),
-      v_perm_b32(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][3]
-    self.assertEqual(result, 0x00FFFF00, f"Expected 0x00FFFF00, got 0x{result:08x}")
-
-  def test_v_dot2_f32_bf16_basic(self):
-    """V_DOT2_F32_BF16: Dot product of two bf16 pairs accumulated into f32."""
-    from extra.assembly.amd.pcode import _ibf16
-    # A = packed (2.0, 3.0) as bf16, B = packed (4.0, 5.0) as bf16
-    # Result = 2*4 + 3*5 + acc = 8 + 15 + 0 = 23.0
-    a_lo, a_hi = _ibf16(2.0), _ibf16(3.0)
-    b_lo, b_hi = _ibf16(4.0), _ibf16(5.0)
-    a_packed = (a_hi << 16) | a_lo
-    b_packed = (b_hi << 16) | b_lo
-    instructions = [
-      s_mov_b32(s[0], a_packed),
-      s_mov_b32(s[1], b_packed),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_mov_b32_e32(v[2], 0),  # accumulator = 0
-      v_dot2_f32_bf16(v[3], v[0], v[1], v[2]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][3])
-    self.assertAlmostEqual(result, 23.0, places=1, msg=f"Expected 23.0, got {result}")
-
-
-class TestQuadmaskWqm(unittest.TestCase):
-  """Tests for S_QUADMASK and S_WQM instructions."""
-
-  def test_s_quadmask_b32_all_quads_active(self):
-    """S_QUADMASK_B32: All quads have at least one active lane."""
-    # Input: 0xFFFFFFFF (all bits set) -> all 8 quads active -> result = 0xFF
-    instructions = [
-      s_mov_b32(s[0], 0xFFFFFFFF),
-      s_quadmask_b32(s[1], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.sgpr[1]
-    self.assertEqual(result, 0xFF, f"Expected 0xFF, got 0x{result:x}")
-    self.assertEqual(st.scc, 1, "SCC should be 1 (result != 0)")
-
-  def test_s_quadmask_b32_alternating_quads(self):
-    """S_QUADMASK_B32: Every other quad has lanes active."""
-    # Input: 0x0F0F0F0F -> quads 0,2,4,6 active (bits 0-3, 8-11, 16-19, 24-27)
-    # Result: bits 0,2,4,6 set = 0x55
-    instructions = [
-      s_mov_b32(s[0], 0x0F0F0F0F),
-      s_quadmask_b32(s[1], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.sgpr[1]
-    self.assertEqual(result, 0x55, f"Expected 0x55, got 0x{result:x}")
-
-  def test_s_quadmask_b32_no_quads_active(self):
-    """S_QUADMASK_B32: No quads have active lanes."""
-    instructions = [
-      s_mov_b32(s[0], 0),
-      s_quadmask_b32(s[1], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.sgpr[1]
-    self.assertEqual(result, 0, f"Expected 0, got 0x{result:x}")
-    self.assertEqual(st.scc, 0, "SCC should be 0 (result == 0)")
-
-  def test_s_quadmask_b32_single_lane_per_quad(self):
-    """S_QUADMASK_B32: Single lane active in each quad."""
-    # Input: 0x11111111 -> bit 0 of each nibble set -> all 8 quads active
-    instructions = [
-      s_mov_b32(s[0], 0x11111111),
-      s_quadmask_b32(s[1], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.sgpr[1]
-    self.assertEqual(result, 0xFF, f"Expected 0xFF, got 0x{result:x}")
-
-  def test_s_wqm_b32_all_active(self):
-    """S_WQM_B32: Whole quad mode - if any lane in quad is active, activate all."""
-    # Input: 0x11111111 -> one lane per quad -> output all quads fully active = 0xFFFFFFFF
-    instructions = [
-      s_mov_b32(s[0], 0x11111111),
-      s_wqm_b32(s[1], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.sgpr[1]
-    self.assertEqual(result, 0xFFFFFFFF, f"Expected 0xFFFFFFFF, got 0x{result:x}")
-    self.assertEqual(st.scc, 1, "SCC should be 1 (result != 0)")
-
-  def test_s_wqm_b32_alternating_quads(self):
-    """S_WQM_B32: Only some quads have active lanes."""
-    # Input: 0x0000000F -> only quad 0 has lanes -> output = 0x0000000F (quad 0 all active)
-    instructions = [
-      s_mov_b32(s[0], 0x00000001),  # single lane in quad 0
-      s_wqm_b32(s[1], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.sgpr[1]
-    self.assertEqual(result, 0x0000000F, f"Expected 0x0000000F, got 0x{result:x}")
-
-  def test_s_wqm_b32_zero(self):
-    """S_WQM_B32: No lanes active."""
-    instructions = [
-      s_mov_b32(s[0], 0),
-      s_wqm_b32(s[1], s[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.sgpr[1]
-    self.assertEqual(result, 0, f"Expected 0, got 0x{result:x}")
-    self.assertEqual(st.scc, 0, "SCC should be 0 (result == 0)")
-
-
-class TestVOP2_16bit_HiHalf(unittest.TestCase):
-  """Regression tests for VOP2 16-bit ops reading from high half of VGPR (v128+ encoding).
-
-  Bug: VOP2 16-bit ops like v_add_f16 with src0 as v128+ should read the HIGH 16 bits
-  of the corresponding VGPR (v128 = v0.hi, v129 = v1.hi, etc). The emulator was
-  incorrectly reading from VGPR v128+ instead of the high half of v0+.
-
-  Example: v_add_f16 v0, v128, v0 means v0.lo = v0.hi + v0.lo (fold packed result)
-  """
-
-  def test_v_add_f16_src0_hi_fold(self):
-    """v_add_f16 with src0=v128 (v0.hi) - fold packed f16 values.
-
-    This pattern is generated by LLVM for summing packed f16 results:
-    v_pk_mul_f16 produces [hi, lo] in v0, then v_add_f16 v0, v128, v0 sums them.
-    """
-    instructions = [
-      # v0 = packed f16: high=2.0 (0x4000), low=1.0 (0x3c00)
-      s_mov_b32(s[0], 0x40003c00),
-      v_mov_b32_e32(v[0], s[0]),
-      # v_add_f16 v1, v128, v0 means: v1.lo = v0.hi + v0.lo = 2.0 + 1.0 = 3.0
-      # v128 in src0 means "read high 16 bits of v0"
-      v_add_f16_e32(v[1], v[0].h, v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1] & 0xffff
-    self.assertEqual(result, 0x4200, f"Expected 3.0 (0x4200), got 0x{result:04x}")
-
-  def test_v_add_f16_src0_hi_different_reg(self):
-    """v_add_f16 with src0=v129 (v1.hi) reads high half of v1."""
-    instructions = [
-      s_mov_b32(s[0], 0x44004200),  # v1: high=4.0, low=3.0
-      v_mov_b32_e32(v[1], s[0]),
-      s_mov_b32(s[1], 0x3c00),      # v0: low=1.0
-      v_mov_b32_e32(v[0], s[1]),
-      # v_add_f16 v2, v129, v0 means: v2.lo = v1.hi + v0.lo = 4.0 + 1.0 = 5.0
-      v_add_f16_e32(v[2], v[1].h, v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2] & 0xffff
-    self.assertEqual(result, 0x4500, f"Expected 5.0 (0x4500), got 0x{result:04x}")
-
-  def test_v_mul_f16_src0_hi(self):
-    """v_mul_f16 with src0 from high half."""
-    instructions = [
-      s_mov_b32(s[0], 0x40003c00),  # v0: high=2.0, low=1.0
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x4200),      # v1: low=3.0
-      v_mov_b32_e32(v[1], s[1]),
-      # v_mul_f16 v2, v128, v1 means: v2.lo = v0.hi * v1.lo = 2.0 * 3.0 = 6.0
-      v_mul_f16_e32(v[2], v[0].h, v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2] & 0xffff
-    self.assertEqual(result, 0x4600, f"Expected 6.0 (0x4600), got 0x{result:04x}")
-
-  def test_v_add_f16_multilane(self):
-    """v_add_f16 with src0=v128 across multiple lanes."""
-    instructions = [
-      # Set up different packed values per lane using v_mov with lane-dependent values
-      # Lane 0: v0 = 0x40003c00 (hi=2.0, lo=1.0) -> sum = 3.0
-      # Lane 1: v0 = 0x44004200 (hi=4.0, lo=3.0) -> sum = 7.0
-      v_mov_b32_e32(v[0], 0x40003c00),  # default for all lanes
-      # Use v_cmp to select lane 1 (v255 = lane_id from prologue)
-      v_cmp_eq_u32_e32(1, v[255]),  # vcc = (lane == 1)
-      v_cndmask_b32_e64(v[0], v[0], 0x44004200, SrcEnum.VCC_LO),
-      # Now fold: v1.lo = v0.hi + v0.lo
-      v_add_f16_e32(v[1], v[0].h, v[0]),
-    ]
-    st = run_program(instructions, n_lanes=2)
-    # Lane 0: 2.0 + 1.0 = 3.0 (0x4200)
-    self.assertEqual(st.vgpr[0][1] & 0xffff, 0x4200, "Lane 0: expected 3.0")
-    # Lane 1: 4.0 + 3.0 = 7.0 (0x4700)
-    self.assertEqual(st.vgpr[1][1] & 0xffff, 0x4700, "Lane 1: expected 7.0")
-
-
-class TestVOPC_16bit_HiHalf(unittest.TestCase):
-  """Regression tests for VOPC 16-bit ops reading from high half of VGPR (v128+ encoding).
-
-  Bug: VOPC 16-bit ops like v_cmp_lt_f16 with vsrc1 as v128+ should read the HIGH 16 bits
-  of the corresponding VGPR. The emulator was incorrectly reading from VGPR v128+.
-
-  Example: v_cmp_nge_f16 vcc, v0, v128 compares v0.lo with v0.hi
-  """
-
-  def test_v_cmp_lt_f16_vsrc1_hi(self):
-    """v_cmp_lt_f16 comparing low half with high half of same register."""
-    instructions = [
-      # v0: high=2.0 (0x4000), low=1.0 (0x3c00)
-      s_mov_b32(s[0], 0x40003c00),
-      v_mov_b32_e32(v[0], s[0]),
-      # v_cmp_lt_f16 vcc, v0, v128 means: vcc = (v0.lo < v0.hi) = (1.0 < 2.0) = true
-      v_cmp_lt_f16_e32(v[0], v[0].h),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "Expected vcc=1 (1.0 < 2.0)")
-
-  def test_v_cmp_gt_f16_vsrc1_hi(self):
-    """v_cmp_gt_f16 with vsrc1 from high half."""
-    instructions = [
-      # v0: high=1.0 (0x3c00), low=2.0 (0x4000)
-      s_mov_b32(s[0], 0x3c004000),
-      v_mov_b32_e32(v[0], s[0]),
-      # v_cmp_gt_f16 vcc, v0, v128 means: vcc = (v0.lo > v0.hi) = (2.0 > 1.0) = true
-      v_cmp_gt_f16_e32(v[0], v[0].h),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "Expected vcc=1 (2.0 > 1.0)")
-
-  def test_v_cmp_eq_f16_vsrc1_hi_equal(self):
-    """v_cmp_eq_f16 with equal low and high halves."""
-    instructions = [
-      # v0: high=3.0 (0x4200), low=3.0 (0x4200)
-      s_mov_b32(s[0], 0x42004200),
-      v_mov_b32_e32(v[0], s[0]),
-      # v_cmp_eq_f16 vcc, v0, v128 means: vcc = (v0.lo == v0.hi) = (3.0 == 3.0) = true
-      v_cmp_eq_f16_e32(v[0], v[0].h),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "Expected vcc=1 (3.0 == 3.0)")
-
-  def test_v_cmp_neq_f16_vsrc1_hi(self):
-    """v_cmp_neq_f16 with different low and high halves."""
-    instructions = [
-      # v0: high=2.0 (0x4000), low=1.0 (0x3c00)
-      s_mov_b32(s[0], 0x40003c00),
-      v_mov_b32_e32(v[0], s[0]),
-      # v_cmp_neq_f16 vcc, v0, v128 means: vcc = (v0.lo != v0.hi) = (1.0 != 2.0) = true
-      v_cmp_lg_f16_e32(v[0], v[0].h),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "Expected vcc=1 (1.0 != 2.0)")
-
-  def test_v_cmp_nge_f16_inf_self(self):
-    """v_cmp_nge_f16 comparing -inf with itself (unordered less than).
-
-    Regression test: -inf < -inf should be false (IEEE 754).
-    The bug was VOPC 16-bit not handling v128+ encoding for vsrc1.
-    """
-    instructions = [
-      # v0: both halves = -inf (0xFC00)
-      s_mov_b32(s[0], 0xFC00FC00),
-      v_mov_b32_e32(v[0], s[0]),
-      # v_cmp_nge_f16 is "not greater or equal" which is equivalent to "unordered less than"
-      # -inf nge -inf should be false (since -inf >= -inf is true)
-      v_cmp_nge_f16_e32(v[0], v[0].h),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 0, "Expected vcc=0 (-inf >= -inf)")
-
-  def test_v_cmp_f16_multilane(self):
-    """v_cmp_lt_f16 with vsrc1=v128 across multiple lanes."""
-    instructions = [
-      # Lane 0: v0 = 0x40003c00 (hi=2.0, lo=1.0) -> 1.0 < 2.0 = true
-      # Lane 1: v0 = 0x3c004000 (hi=1.0, lo=2.0) -> 2.0 < 1.0 = false
-      v_mov_b32_e32(v[0], 0x40003c00),  # default
-      # Use v_cmp to select lane 1 (v255 = lane_id from prologue)
-      v_cmp_eq_u32_e32(1, v[255]),  # vcc = (lane == 1)
-      v_cndmask_b32_e64(v[0], v[0], 0x3c004000, SrcEnum.VCC_LO),
-      v_cmp_lt_f16_e32(v[0], v[0].h),
-    ]
-    st = run_program(instructions, n_lanes=2)
-    self.assertEqual(st.vcc & 1, 1, "Lane 0: expected vcc=1 (1.0 < 2.0)")
-    self.assertEqual((st.vcc >> 1) & 1, 0, "Lane 1: expected vcc=0 (2.0 < 1.0)")
-
-
-class TestF16SinKernelOps(unittest.TestCase):
-  """Tests for F16 instructions used in the sin kernel. Run with USE_HW=1 to compare emulator vs hardware."""
-
-  def test_v_cvt_i16_f16_zero(self):
-    """v_cvt_i16_f16: Convert f16 0.0 to i16 0."""
-    instructions = [
-      s_mov_b32(s[0], 0x00000000),  # f16 0.0 in low bits
-      v_mov_b32_e32(v[0], s[0]),
-      v_cvt_i16_f16_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1] & 0xFFFF
-    self.assertEqual(result, 0, f"Expected 0, got {result}")
-
-  def test_v_cvt_i16_f16_one(self):
-    """v_cvt_i16_f16: Convert f16 1.0 (0x3c00) to i16 1."""
-    instructions = [
-      s_mov_b32(s[0], 0x00003c00),  # f16 1.0 in low bits
-      v_mov_b32_e32(v[0], s[0]),
-      v_cvt_i16_f16_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1] & 0xFFFF
-    self.assertEqual(result, 1, f"Expected 1, got {result}")
-
-  def test_v_cvt_i16_f16_negative(self):
-    """v_cvt_i16_f16: Convert f16 -2.0 (0xc000) to i16 -2."""
-    instructions = [
-      s_mov_b32(s[0], 0x0000c000),  # f16 -2.0 in low bits
-      v_mov_b32_e32(v[0], s[0]),
-      v_cvt_i16_f16_e32(v[1], v[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1] & 0xFFFF
-    # -2 as signed 16-bit = 0xFFFE
-    self.assertEqual(result, 0xFFFE, f"Expected 0xFFFE (-2), got 0x{result:04x}")
-
-  def test_v_cvt_i16_f16_from_hi(self):
-    """v_cvt_i16_f16: Convert f16 from high half of register."""
-    instructions = [
-      s_mov_b32(s[0], 0x3c000000),  # f16 1.0 in HIGH bits, 0.0 in low
-      v_mov_b32_e32(v[0], s[0]),
-      v_cvt_i16_f16_e32(v[1], v[0].h),  # Read from high half
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1] & 0xFFFF
-    self.assertEqual(result, 1, f"Expected 1, got {result}")
-
-  def test_v_bfe_i32_sign_extend(self):
-    """v_bfe_i32: Extract 16 bits with sign extension."""
-    instructions = [
-      s_mov_b32(s[0], 0x80000001),  # low 16 bits = 0x0001
-      v_mov_b32_e32(v[0], s[0]),
-      v_bfe_i32(v[1], v[0], 0, 16),  # Extract bits 0-15 with sign extend
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1]
-    self.assertEqual(result, 1, f"Expected 1, got {result}")
-
-  def test_v_bfe_i32_sign_extend_negative(self):
-    """v_bfe_i32: Extract 16 bits with sign extension (negative value)."""
-    instructions = [
-      s_mov_b32(s[0], 0x0000FFFE),  # low 16 bits = 0xFFFE = -2 as i16
-      v_mov_b32_e32(v[0], s[0]),
-      v_bfe_i32(v[1], v[0], 0, 16),  # Extract bits 0-15 with sign extend
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1]
-    # -2 sign-extended to 32 bits = 0xFFFFFFFE
-    self.assertEqual(result, 0xFFFFFFFE, f"Expected 0xFFFFFFFE (-2), got 0x{result:08x}")
-
-  def test_v_cndmask_b16_select_src0(self):
-    """v_cndmask_b16: Select src0 when vcc=0."""
-    instructions = [
-      s_mov_b32(s[0], 0x3c003800),  # src0.h=1.0, src0.l=0.5
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x4000c000),  # src1.h=2.0, src1.l=-2.0
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 0),  # vcc = 0
-      v_cndmask_b16(v[2], v[0], v[1], SrcEnum.VCC_LO),  # Should select v0.l = 0.5
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2] & 0xFFFF
-    self.assertEqual(result, 0x3800, f"Expected 0x3800 (0.5), got 0x{result:04x}")
-
-  def test_v_cndmask_b16_select_src1(self):
-    """v_cndmask_b16: Select src1 when vcc=1."""
-    instructions = [
-      s_mov_b32(s[0], 0x3c003800),  # src0.h=1.0, src0.l=0.5
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x4000c000),  # src1.h=2.0, src1.l=-2.0
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 1),  # vcc = 1 for lane 0
-      v_cndmask_b16(v[2], v[0], v[1], SrcEnum.VCC_LO),  # Should select v1.l = -2.0
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2] & 0xFFFF
-    self.assertEqual(result, 0xc000, f"Expected 0xc000 (-2.0), got 0x{result:04x}")
-
-  def test_v_cndmask_b16_write_hi(self):
-    """v_cndmask_b16: Write to high half with opsel."""
-    instructions = [
-      s_mov_b32(s[0], 0x3c003800),  # src0: hi=1.0, lo=0.5
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x4000c000),  # src1: hi=2.0, lo=-2.0
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[2], 0xDEAD0000),  # v2 initial: hi=0xDEAD, lo=0
-      v_mov_b32_e32(v[2], s[2]),
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 0),  # vcc = 0
-      # opsel=8 means write to high half (bit 3 = dst hi)
-      # opsel=1 means read src0 from hi, opsel=2 means read src1 from hi
-      # v_cndmask_b16 v2.h, v0.h, v1.h, vcc -> select v0.h = 1.0
-      VOP3(VOP3Op.V_CNDMASK_B16, vdst=v[2], src0=v[0], src1=v[1], src2=SrcEnum.VCC_LO, opsel=0b1011),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result_hi = (st.vgpr[0][2] >> 16) & 0xFFFF
-    result_lo = st.vgpr[0][2] & 0xFFFF
-    self.assertEqual(result_hi, 0x3c00, f"Expected hi=0x3c00 (1.0), got 0x{result_hi:04x}")
-    self.assertEqual(result_lo, 0x0000, f"Expected lo preserved as 0, got 0x{result_lo:04x}")
-
-  def test_v_mul_f16_basic(self):
-    """v_mul_f16: 2.0 * 3.0 = 6.0."""
-    instructions = [
-      s_mov_b32(s[0], 0x00004000),  # f16 2.0 in low bits
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x00004200),  # f16 3.0 in low bits
-      v_mov_b32_e32(v[1], s[1]),
-      v_mul_f16_e32(v[2], v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2] & 0xFFFF
-    self.assertEqual(result, 0x4600, f"Expected 0x4600 (6.0), got 0x{result:04x}")
-
-  def test_v_mul_f16_by_zero(self):
-    """v_mul_f16: x * 0.0 = 0.0."""
-    instructions = [
-      s_mov_b32(s[0], 0x00003c00),  # f16 1.0
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x00000000),  # f16 0.0
-      v_mov_b32_e32(v[1], s[1]),
-      v_mul_f16_e32(v[2], v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2] & 0xFFFF
-    self.assertEqual(result, 0x0000, f"Expected 0x0000 (0.0), got 0x{result:04x}")
-
-  def test_v_mul_f16_hi_half(self):
-    """v_mul_f16: Multiply using high halves."""
-    instructions = [
-      s_mov_b32(s[0], 0x40000000),  # hi=2.0, lo=0.0
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x42000000),  # hi=3.0, lo=0.0
-      v_mov_b32_e32(v[1], s[1]),
-      v_mul_f16_e32(v[2].h, v[0].h, v[1].h),  # 2.0 * 3.0 = 6.0 in hi
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result_hi = (st.vgpr[0][2] >> 16) & 0xFFFF
-    self.assertEqual(result_hi, 0x4600, f"Expected hi=0x4600 (6.0), got 0x{result_hi:04x}")
-
-  def test_v_fmac_f16_basic(self):
-    """v_fmac_f16: dst = src0 * src1 + dst = 2.0 * 3.0 + 1.0 = 7.0."""
-    instructions = [
-      s_mov_b32(s[0], 0x00004000),  # f16 2.0
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x00004200),  # f16 3.0
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[2], 0x00003c00),  # f16 1.0 (accumulator)
-      v_mov_b32_e32(v[2], s[2]),
-      v_fmac_f16_e32(v[2], v[0], v[1]),  # v2 = v0 * v1 + v2
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2] & 0xFFFF
-    self.assertEqual(result, 0x4700, f"Expected 0x4700 (7.0), got 0x{result:04x}")
-
-  def test_v_fmac_f16_hi_dest(self):
-    """v_fmac_f16 with .h destination: dst.h = src0 * src1 + dst.h.
-
-    This tests the case from AMD_LLVM sin(0) where V_FMAC_F16 writes to v0.h.
-    The accumulator D should be read from v0.h, not v0.l.
-    """
-    from extra.assembly.amd.pcode import f32_to_f16, _f16
-    # Set up: v0 = {hi=0.5, lo=1.0}, src0 = 0.0 (literal), src1 = v1.l (any value)
-    # Expected: v0.h = 0.0 * v1.l + 0.5 = 0.5 (unchanged)
-    instructions = [
-      s_mov_b32(s[0], 0x38003c00),  # v0 = {hi=0.5, lo=1.0}
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x38000000),  # v1 = {hi=0.5, lo=0.0}
-      v_mov_b32_e32(v[1], s[1]),
-      # v_fmac_f16 v0.h, literal(0.318...), v1.l  (vdst=128 for .h)
-      # D = D + S0 * S1 = v0.h + 0.318 * 0.0 = 0.5 + 0 = 0.5
-      VOP2(VOP2Op.V_FMAC_F16, vdst=RawImm(128), src0=RawImm(255), vsrc1=RawImm(1), literal=0x3518),  # 0.318... * 0.0 + 0.5
-    ]
-    st = run_program(instructions, n_lanes=1)
-    v0 = st.vgpr[0][0]
-    result_hi = _f16((v0 >> 16) & 0xffff)
-    result_lo = _f16(v0 & 0xffff)
-    self.assertAlmostEqual(result_hi, 0.5, delta=0.01, msg=f"Expected v0.h=0.5, got {result_hi}")
-    self.assertAlmostEqual(result_lo, 1.0, delta=0.01, msg=f"Expected v0.l=1.0, got {result_lo}")
-
-  def test_v_add_f16_basic(self):
-    """v_add_f16: 1.0 + 2.0 = 3.0."""
-    instructions = [
-      s_mov_b32(s[0], 0x00003c00),  # f16 1.0
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x00004000),  # f16 2.0
-      v_mov_b32_e32(v[1], s[1]),
-      v_add_f16_e32(v[2], v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2] & 0xFFFF
-    self.assertEqual(result, 0x4200, f"Expected 0x4200 (3.0), got 0x{result:04x}")
-
-  def test_v_add_f16_negative(self):
-    """v_add_f16: 1.0 + (-1.5703125) = -0.5703125."""
-    # 0xbe48 is approximately -1.5703125 in f16
-    instructions = [
-      s_mov_b32(s[0], 0x00003c00),  # f16 1.0
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x0000be48),  # f16 -1.5703125
-      v_mov_b32_e32(v[1], s[1]),
-      v_add_f16_e32(v[2], v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2] & 0xFFFF
-    # 1.0 + (-1.5703125) = -0.5703125 which is approximately 0xb890
-    # Allow some tolerance - just check it's negative and close
-    from extra.assembly.amd.pcode import _f16
-    result_f = _f16(result)
-    expected = 1.0 - 1.5703125
-    self.assertAlmostEqual(result_f, expected, places=2, msg=f"Expected ~{expected}, got {result_f}")
-
-  def test_v_fmaak_f16_basic(self):
-    """v_fmaak_f16: dst = src0 * vsrc1 + K."""
-    # v_fmaak_f16 computes: D = S0 * S1 + K
-    # 2.0 * 3.0 + 1.0 = 7.0
-    instructions = [
-      s_mov_b32(s[0], 0x00004000),  # f16 2.0
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x00004200),  # f16 3.0
-      v_mov_b32_e32(v[1], s[1]),
-      v_fmaak_f16_e32(v[2], v[0], v[1], 0x3c00),  # v2 = v0 * v1 + 1.0
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][2] & 0xFFFF
-    self.assertEqual(result, 0x4700, f"Expected 0x4700 (7.0), got 0x{result:04x}")
-
-  def test_v_fmamk_f32_basic(self):
-    """v_fmamk_f32: dst = src0 * K + vsrc1."""
-    # v_fmamk_f32 computes: D = S0 * K + S1
-    # 2.0 * 3.0 + 1.0 = 7.0
-    instructions = [
-      s_mov_b32(s[0], f2i(2.0)),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], f2i(1.0)),  # accumulator
-      v_mov_b32_e32(v[1], s[1]),
-      v_fmamk_f32_e32(v[2], v[0], f2i(3.0), v[1]),  # v2 = v0 * 3.0 + v1
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][2])
-    self.assertAlmostEqual(result, 7.0, places=5, msg=f"Expected 7.0, got {result}")
-
-  def test_v_fmamk_f32_small_constant(self):
-    """v_fmamk_f32: Test with small constant like in sin kernel."""
-    # This mimics part of the sin kernel: 1.0 * (-1.13e-4) + (-3.1414795) ≈ -3.1415926
-    k_val = 0xb8ed5000  # approximately -0.0001131594 as f32
-    s1_val = f2i(-3.1414794921875)
-    instructions = [
-      s_mov_b32(s[0], f2i(1.0)),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], s1_val),
-      v_mov_b32_e32(v[1], s[1]),
-      v_fmamk_f32_e32(v[2], v[0], k_val, v[1]),  # v2 = 1.0 * K + v1
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][2])
-    k_f32 = i2f(k_val)
-    expected = 1.0 * k_f32 + (-3.1414794921875)
-    self.assertAlmostEqual(result, expected, places=5, msg=f"Expected {expected}, got {result}")
-
-  def test_v_mov_b16_to_hi(self):
-    """v_mov_b16: Move immediate to high half, preserving low."""
-    instructions = [
-      s_mov_b32(s[0], 0x0000DEAD),  # initial: lo=0xDEAD, hi=0
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b16_e32(v[0].h, 0x3800),  # Move 0.5 to high half
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result_hi = (st.vgpr[0][0] >> 16) & 0xFFFF
-    result_lo = st.vgpr[0][0] & 0xFFFF
-    self.assertEqual(result_hi, 0x3800, f"Expected hi=0x3800, got 0x{result_hi:04x}")
-    self.assertEqual(result_lo, 0xDEAD, f"Expected lo=0xDEAD (preserved), got 0x{result_lo:04x}")
-
-  def test_v_mov_b16_to_lo(self):
-    """v_mov_b16: Move immediate to low half, preserving high."""
-    instructions = [
-      s_mov_b32(s[0], 0xBEEF0000),  # initial: hi=0xBEEF, lo=0
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b16_e32(v[0], 0x3c00),  # Move 1.0 to low half
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result_hi = (st.vgpr[0][0] >> 16) & 0xFFFF
-    result_lo = st.vgpr[0][0] & 0xFFFF
-    self.assertEqual(result_lo, 0x3c00, f"Expected lo=0x3c00, got 0x{result_lo:04x}")
-    self.assertEqual(result_hi, 0xBEEF, f"Expected hi=0xBEEF (preserved), got 0x{result_hi:04x}")
-
-  def test_v_xor_b32_sign_flip(self):
-    """v_xor_b32: XOR with 0x8000 flips sign of f16 in low bits."""
-    # 0x4246 is approximately 3.13671875 in f16
-    # XOR with 0x8000 gives 0xC246 which is -3.13671875
-    instructions = [
-      s_mov_b32(s[0], 0x00004246),  # f16 3.13671875
-      v_mov_b32_e32(v[0], s[0]),
-      v_xor_b32_e32(v[1], 0x8000, v[0]),  # Flip sign bit of low half
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][1] & 0xFFFF
-    self.assertEqual(result, 0xC246, f"Expected 0xC246 (-3.137), got 0x{result:04x}")
-
-  def test_v_fma_mix_f32_all_f32_sources(self):
-    """v_fma_mix_f32: All sources as f32 (opsel_hi=0)."""
-    instructions = [
-      s_mov_b32(s[0], f2i(2.0)),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], f2i(3.0)),
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[2], f2i(1.0)),
-      v_mov_b32_e32(v[2], s[2]),
-      # opsel_hi=0,0,0 means all sources are f32
-      VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][3])
-    self.assertAlmostEqual(result, 7.0, places=5, msg=f"2*3+1=7, got {result}")
-
-  def test_v_fma_mixlo_f16_all_f32_sources(self):
-    """v_fma_mixlo_f16: All sources as f32, result to low f16."""
-    instructions = [
-      s_mov_b32(s[0], f2i(1.0)),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], f2i(-1.22e-10)),  # Very small
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[2], f2i(-3.1415927)),  # -pi
-      v_mov_b32_e32(v[2], s[2]),
-      s_mov_b32(s[3], 0xDEAD0000),  # Garbage in hi
-      v_mov_b32_e32(v[3], s[3]),
-      # 1.0 * (-1.22e-10) + (-3.1415927) ≈ -3.1415927
-      VOP3P(VOP3POp.V_FMA_MIXLO_F16, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    from extra.assembly.amd.pcode import _f16
-    result_lo = _f16(st.vgpr[0][3] & 0xFFFF)
-    result_hi = (st.vgpr[0][3] >> 16) & 0xFFFF
-    # Result should be approximately -pi
-    self.assertAlmostEqual(result_lo, -3.14, delta=0.01, msg=f"Expected ~-3.14, got {result_lo}")
-    self.assertEqual(result_hi, 0xDEAD, f"Expected hi preserved as 0xDEAD, got 0x{result_hi:04x}")
-
-
-class TestVCmpClassF16(unittest.TestCase):
-  """Tests for V_CMP_CLASS_F16 - critical for f16 sin/cos classification.
-
-  Class bit mapping:
-    bit 0 = signaling NaN
-    bit 1 = quiet NaN
-    bit 2 = -infinity
-    bit 3 = -normal
-    bit 4 = -denormal
-    bit 5 = -zero
-    bit 6 = +zero
-    bit 7 = +denormal
-    bit 8 = +normal
-    bit 9 = +infinity
-
-  This is crucial for the f16 sin kernel which uses v_cmp_class_f16 to detect
-  special values like +-0, +-inf, NaN and select appropriate outputs.
-  """
-
-  def test_cmp_class_f16_positive_zero(self):
-    """V_CMP_CLASS_F16: +zero should match bit 6."""
-    # f16 +0.0 = 0x0000
-    instructions = [
-      v_mov_b32_e32(v[0], 0),        # f16 +0.0 in low 16 bits
-      v_mov_b32_e32(v[1], 0x40),     # bit 6 only (+zero)
-      v_cmp_class_f16_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +zero with mask 0x40")
-
-  def test_cmp_class_f16_negative_zero(self):
-    """V_CMP_CLASS_F16: -zero should match bit 5."""
-    # f16 -0.0 = 0x8000
-    instructions = [
-      s_mov_b32(s[0], 0x8000),       # f16 -0.0
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], 0x20),     # bit 5 only (-zero)
-      v_cmp_class_f16_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for -zero with mask 0x20")
-
-  def test_cmp_class_f16_positive_normal(self):
-    """V_CMP_CLASS_F16: +1.0 (normal) should match bit 8."""
-    # f16 1.0 = 0x3c00
-    instructions = [
-      s_mov_b32(s[0], 0x3c00),       # f16 +1.0
-      s_mov_b32(s[1], 0x100),        # bit 8 (+normal)
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_cmp_class_f16_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +1.0 with mask 0x100 (+normal)")
-
-  def test_cmp_class_f16_negative_normal(self):
-    """V_CMP_CLASS_F16: -1.0 (normal) should match bit 3."""
-    # f16 -1.0 = 0xbc00
-    instructions = [
-      s_mov_b32(s[0], 0xbc00),       # f16 -1.0
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], 0x08),     # bit 3 (-normal)
-      v_cmp_class_f16_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for -1.0 with mask 0x08 (-normal)")
-
-  def test_cmp_class_f16_positive_infinity(self):
-    """V_CMP_CLASS_F16: +inf should match bit 9."""
-    # f16 +inf = 0x7c00
-    instructions = [
-      s_mov_b32(s[0], 0x7c00),       # f16 +inf
-      s_mov_b32(s[1], 0x200),        # bit 9 (+inf)
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_cmp_class_f16_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +inf with mask 0x200")
-
-  def test_cmp_class_f16_negative_infinity(self):
-    """V_CMP_CLASS_F16: -inf should match bit 2."""
-    # f16 -inf = 0xfc00
-    instructions = [
-      s_mov_b32(s[0], 0xfc00),       # f16 -inf
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], 0x04),     # bit 2 (-inf)
-      v_cmp_class_f16_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for -inf with mask 0x04")
-
-  def test_cmp_class_f16_quiet_nan(self):
-    """V_CMP_CLASS_F16: quiet NaN should match bit 1."""
-    # f16 quiet NaN = 0x7e00 (exponent all 1s, mantissa MSB set)
-    instructions = [
-      s_mov_b32(s[0], 0x7e00),       # f16 quiet NaN
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], 0x02),     # bit 1 (quiet NaN)
-      v_cmp_class_f16_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for quiet NaN with mask 0x02")
-
-  def test_cmp_class_f16_signaling_nan(self):
-    """V_CMP_CLASS_F16: signaling NaN should match bit 0."""
-    # f16 signaling NaN = 0x7c01 (exponent all 1s, mantissa MSB clear, other mantissa bits set)
-    instructions = [
-      s_mov_b32(s[0], 0x7c01),       # f16 signaling NaN
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], 0x01),     # bit 0 (signaling NaN)
-      v_cmp_class_f16_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for signaling NaN with mask 0x01")
-
-  def test_cmp_class_f16_positive_denormal(self):
-    """V_CMP_CLASS_F16: positive denormal should match bit 7."""
-    # f16 smallest positive denormal = 0x0001
-    instructions = [
-      v_mov_b32_e32(v[0], 1),        # f16 +denormal (0x0001)
-      v_mov_b32_e32(v[1], 0x80),     # bit 7 (+denormal)
-      v_cmp_class_f16_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +denormal with mask 0x80")
-
-  def test_cmp_class_f16_negative_denormal(self):
-    """V_CMP_CLASS_F16: negative denormal should match bit 4."""
-    # f16 smallest negative denormal = 0x8001
-    instructions = [
-      s_mov_b32(s[0], 0x8001),       # f16 -denormal
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], 0x10),     # bit 4 (-denormal)
-      v_cmp_class_f16_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for -denormal with mask 0x10")
-
-  def test_cmp_class_f16_combined_mask_zeros(self):
-    """V_CMP_CLASS_F16: mask 0x60 covers both +zero and -zero."""
-    # Test with +0.0
-    instructions = [
-      v_mov_b32_e32(v[0], 0),        # f16 +0.0
-      v_mov_b32_e32(v[1], 0x60),     # bits 5 and 6 (+-zero)
-      v_cmp_class_f16_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +zero with mask 0x60")
-
-  def test_cmp_class_f16_combined_mask_1f8(self):
-    """V_CMP_CLASS_F16: mask 0x1f8 covers -normal,-denorm,-zero,+zero,+denorm,+normal.
-
-    This is the exact mask used in the f16 sin kernel at PC=46:
-      v_cmp_class_f16_e64 vcc_lo, v1, 0x1f8
-
-    The kernel uses this to detect if the input is a "normal" finite value
-    (not NaN, not infinity). If the check fails (vcc=0), it selects NaN output.
-    """
-    # Test with +0.0 - should match via bit 6
-    instructions = [
-      v_mov_b32_e32(v[0], 0),           # f16 +0.0
-      s_mov_b32(s[0], 0x1f8),
-      v_mov_b32_e32(v[1], s[0]),        # mask 0x1f8
-      v_cmp_class_f16_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +zero with mask 0x1f8")
-
-  def test_cmp_class_f16_vop3_encoding(self):
-    """V_CMP_CLASS_F16 in VOP3 encoding (v_cmp_class_f16_e64).
-
-    This tests the exact instruction encoding used in the f16 sin kernel.
-    VOP3 encoding allows the result to go to any SGPR pair, not just VCC.
-    """
-    # v_cmp_class_f16_e64 vcc_lo, v0, 0x1f8
-    # Use SGPR to hold the mask since literals require special handling
-    instructions = [
-      v_mov_b32_e32(v[0], 0),           # f16 +0.0
-      s_mov_b32(s[0], 0x1f8),           # class mask
-      VOP3(VOP3Op.V_CMP_CLASS_F16, vdst=RawImm(VCC), src0=v[0], src1=s[0]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +zero with VOP3 encoding")
-
-  def test_cmp_class_f16_vop3_normal_positive(self):
-    """V_CMP_CLASS_F16 VOP3 encoding with +1.0 (normal)."""
-    # f16 1.0 = 0x3c00, should match bit 8 (+normal) in mask 0x1f8
-    instructions = [
-      s_mov_b32(s[0], 0x3c00),          # f16 +1.0
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x1f8),           # class mask
-      VOP3(VOP3Op.V_CMP_CLASS_F16, vdst=RawImm(VCC), src0=v[0], src1=s[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +1.0 (normal) with mask 0x1f8")
-
-  def test_cmp_class_f16_vop3_nan_fails_mask(self):
-    """V_CMP_CLASS_F16 VOP3: NaN should NOT match mask 0x1f8 (no NaN bits set)."""
-    # f16 quiet NaN = 0x7e00, should NOT match mask 0x1f8 (bits 3-8 only)
-    instructions = [
-      s_mov_b32(s[0], 0x7e00),          # f16 quiet NaN
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x1f8),           # class mask
-      VOP3(VOP3Op.V_CMP_CLASS_F16, vdst=RawImm(VCC), src0=v[0], src1=s[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 0, "VCC should be 0 for NaN with mask 0x1f8 (no NaN bits)")
-
-  def test_cmp_class_f16_vop3_inf_fails_mask(self):
-    """V_CMP_CLASS_F16 VOP3: +inf should NOT match mask 0x1f8 (no inf bits set)."""
-    # f16 +inf = 0x7c00, should NOT match mask 0x1f8 (bits 3-8 only)
-    instructions = [
-      s_mov_b32(s[0], 0x7c00),          # f16 +inf
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], 0x1f8),           # class mask
-      VOP3(VOP3Op.V_CMP_CLASS_F16, vdst=RawImm(VCC), src0=v[0], src1=s[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vcc & 1, 0, "VCC should be 0 for +inf with mask 0x1f8 (no inf bits)")
-
-
-class TestVOP3F16Modifiers(unittest.TestCase):
-  """Tests for VOP3 16-bit ops with abs/neg modifiers and inline constants.
-
-  VOP3 16-bit ops must:
-  1. Use f16 inline constants (not f32)
-  2. Apply abs/neg modifiers as f16 operations (toggle bit 15)
-
-  This is critical for sin/cos kernels that use v_cvt_f32_f16 with |abs|
-  and v_fma_f16 with inline constants.
-  """
-
-  def test_v_cvt_f32_f16_abs_negative(self):
-    """V_CVT_F32_F16 with |abs| on negative value."""
-    from extra.assembly.amd.pcode import f32_to_f16
-    f16_neg1 = f32_to_f16(-1.0)  # 0xbc00
-    instructions = [
-      s_mov_b32(s[0], f16_neg1),
-      v_mov_b32_e32(v[1], s[0]),
-      v_cvt_f32_f16_e64(v[0], abs(v[1])),  # |(-1.0)| = 1.0
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][0])
-    self.assertAlmostEqual(result, 1.0, places=5, msg=f"Expected 1.0, got {result}")
-
-  def test_v_cvt_f32_f16_abs_positive(self):
-    """V_CVT_F32_F16 with |abs| on positive value (should stay positive)."""
-    from extra.assembly.amd.pcode import f32_to_f16
-    f16_2 = f32_to_f16(2.0)  # 0x4000
-    instructions = [
-      s_mov_b32(s[0], f16_2),
-      v_mov_b32_e32(v[1], s[0]),
-      v_cvt_f32_f16_e64(v[0], abs(v[1])),  # |2.0| = 2.0
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][0])
-    self.assertAlmostEqual(result, 2.0, places=5, msg=f"Expected 2.0, got {result}")
-
-  def test_v_cvt_f32_f16_neg_positive(self):
-    """V_CVT_F32_F16 with neg on positive value."""
-    from extra.assembly.amd.pcode import f32_to_f16
-    f16_2 = f32_to_f16(2.0)  # 0x4000
-    instructions = [
-      s_mov_b32(s[0], f16_2),
-      v_mov_b32_e32(v[1], s[0]),
-      v_cvt_f32_f16_e64(v[0], -v[1]),  # -(2.0) = -2.0
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][0])
-    self.assertAlmostEqual(result, -2.0, places=5, msg=f"Expected -2.0, got {result}")
-
-  def test_v_cvt_f32_f16_neg_negative(self):
-    """V_CVT_F32_F16 with neg on negative value (double negative)."""
-    from extra.assembly.amd.pcode import f32_to_f16
-    f16_neg2 = f32_to_f16(-2.0)  # 0xc000
-    instructions = [
-      s_mov_b32(s[0], f16_neg2),
-      v_mov_b32_e32(v[1], s[0]),
-      v_cvt_f32_f16_e64(v[0], -v[1]),  # -(-2.0) = 2.0
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i2f(st.vgpr[0][0])
-    self.assertAlmostEqual(result, 2.0, places=5, msg=f"Expected 2.0, got {result}")
-
-  def test_v_fma_f16_inline_const_1_0(self):
-    """V_FMA_F16: a*b + 1.0 should use f16 inline constant."""
-    from extra.assembly.amd.pcode import f32_to_f16, _f16
-    # v4 = 0.3259 (f16), v6 = -0.4866 (f16), src2 = 1.0 inline
-    # Result: 0.3259 * (-0.4866) + 1.0 = 0.8413...
-    f16_a = f32_to_f16(0.325928)  # 0x3537
-    f16_b = f32_to_f16(-0.486572)  # 0xb7c9
-    instructions = [
-      s_mov_b32(s[0], f16_a),
-      v_mov_b32_e32(v[4], s[0]),
-      s_mov_b32(s[1], f16_b),
-      v_mov_b32_e32(v[6], s[1]),
-      v_fma_f16(v[4], v[4], v[6], 1.0),  # 1.0 is inline constant
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = _f16(st.vgpr[0][4] & 0xffff)
-    expected = 0.325928 * (-0.486572) + 1.0
-    self.assertAlmostEqual(result, expected, delta=0.01, msg=f"Expected ~{expected:.4f}, got {result}")
-
-  def test_v_fma_f16_inline_const_0_5(self):
-    """V_FMA_F16: a*b + 0.5 should use f16 inline constant."""
-    from extra.assembly.amd.pcode import f32_to_f16, _f16
-    f16_a = f32_to_f16(2.0)
-    f16_b = f32_to_f16(3.0)
-    instructions = [
-      s_mov_b32(s[0], f16_a),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], f16_b),
-      v_mov_b32_e32(v[1], s[1]),
-      v_fma_f16(v[2], v[0], v[1], 0.5),  # 0.5 is inline constant
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = _f16(st.vgpr[0][2] & 0xffff)
-    expected = 2.0 * 3.0 + 0.5
-    self.assertAlmostEqual(result, expected, delta=0.01, msg=f"Expected {expected}, got {result}")
-
-  def test_v_fma_f16_inline_const_neg_1_0(self):
-    """V_FMA_F16: a*b + (-1.0) should use f16 inline constant."""
-    from extra.assembly.amd.pcode import f32_to_f16, _f16
-    f16_a = f32_to_f16(2.0)
-    f16_b = f32_to_f16(3.0)
-    instructions = [
-      s_mov_b32(s[0], f16_a),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], f16_b),
-      v_mov_b32_e32(v[1], s[1]),
-      v_fma_f16(v[2], v[0], v[1], -1.0),  # -1.0 is inline constant
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = _f16(st.vgpr[0][2] & 0xffff)
-    expected = 2.0 * 3.0 + (-1.0)
-    self.assertAlmostEqual(result, expected, delta=0.01, msg=f"Expected {expected}, got {result}")
-
-  def test_v_add_f16_abs_both(self):
-    """V_ADD_F16 with abs on both operands."""
-    from extra.assembly.amd.pcode import f32_to_f16, _f16
-    f16_neg2 = f32_to_f16(-2.0)
-    f16_neg3 = f32_to_f16(-3.0)
-    instructions = [
-      s_mov_b32(s[0], f16_neg2),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], f16_neg3),
-      v_mov_b32_e32(v[1], s[1]),
-      v_add_f16_e64(v[2], abs(v[0]), abs(v[1])),  # |-2| + |-3| = 5
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = _f16(st.vgpr[0][2] & 0xffff)
-    self.assertAlmostEqual(result, 5.0, delta=0.01, msg=f"Expected 5.0, got {result}")
-
-  def test_v_mul_f16_neg_abs(self):
-    """V_MUL_F16 with neg on one operand and abs on another."""
-    from extra.assembly.amd.pcode import f32_to_f16, _f16
-    f16_2 = f32_to_f16(2.0)
-    f16_neg3 = f32_to_f16(-3.0)
-    instructions = [
-      s_mov_b32(s[0], f16_2),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[1], f16_neg3),
-      v_mov_b32_e32(v[1], s[1]),
-      v_mul_f16_e64(v[2], -v[0], abs(v[1])),  # -(2) * |-3| = -6
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = _f16(st.vgpr[0][2] & 0xffff)
-    self.assertAlmostEqual(result, -6.0, delta=0.01, msg=f"Expected -6.0, got {result}")
-
-
-class TestVFmaMixSinCase(unittest.TestCase):
-  """Tests for the specific V_FMA_MIXLO_F16 case that fails in AMD_LLVM sin(0) kernel."""
-
-  def test_v_fma_mixlo_f16_sin_case(self):
-    """V_FMA_MIXLO_F16 case from sin kernel at pc=0x14e.
-
-    This tests the specific operands that produce the wrong result:
-    - src0 = v3 = 0x3f800000 (f32 1.0)
-    - src1 = s6 = 0xaf05a309 (f32 tiny negative)
-    - src2 = v5 = 0xc0490fdb (f32 -π)
-    - Result should be approximately -π (tiny * 1.0 + -π ≈ -π)
-    """
-    from extra.assembly.amd.pcode import _f16
-    instructions = [
-      # Set up operands as in the sin kernel
-      s_mov_b32(s[0], 0x3f800000),  # f32 1.0
-      v_mov_b32_e32(v[3], s[0]),
-      s_mov_b32(s[1], 0xaf05a309),  # f32 tiny negative
-      s_mov_b32(s[6], s[1]),
-      s_mov_b32(s[2], 0xc0490fdb),  # f32 -π
-      v_mov_b32_e32(v[5], s[2]),
-      # Pre-fill v3 with expected hi bits
-      s_mov_b32(s[3], 0x3f800000),  # hi = f32 1.0 encoding (will be overwritten by opsel behavior)
-      v_mov_b32_e32(v[3], s[3]),
-      # V_FMA_MIXLO_F16: src0=v3 (259), src1=s6, src2=v5 (261), opsel=0, opsel_hi=0, opsel_hi2=0
-      VOP3P(VOP3POp.V_FMA_MIXLO_F16, vdst=v[3], src0=v[3], src1=s[6], src2=v[5], opsel=0, opsel_hi=0, opsel_hi2=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    lo = _f16(st.vgpr[0][3] & 0xffff)
-    # Result should be approximately -π = -3.14...
-    # f16 -π ≈ 0xc248 = -3.140625
-    self.assertAlmostEqual(lo, -3.14159, delta=0.01, msg=f"Expected ~-π, got {lo}")
-
-
-class TestVTrigPreopF64(unittest.TestCase):
-  """Tests for V_TRIG_PREOP_F64 instruction.
-
-  V_TRIG_PREOP_F64 extracts chunks of 2/PI for Payne-Hanek trig range reduction.
-  For input S0 (f64) and index S1 (0, 1, or 2), it returns a portion of 2/PI
-  scaled appropriately for computing |S0| * (2/PI) in extended precision.
-
-  The three chunks (index 0, 1, 2) when summed should equal 2/PI.
-  """
-
-  def test_trig_preop_f64_index0(self):
-    """V_TRIG_PREOP_F64 index=0: primary chunk of 2/PI."""
-    import math
-    two_over_pi = 2.0 / math.pi
-    instructions = [
-      # S0 = 1.0 (f64), S1 = 0 (index)
-      s_mov_b32(s[0], 0x00000000),  # low bits of 1.0
-      s_mov_b32(s[1], 0x3ff00000),  # high bits of 1.0
-      v_trig_preop_f64(v[0], abs(s[0]), 0),  # index 0
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
-    # For x=1.0, index=0 should give the main part of 2/PI
-    self.assertAlmostEqual(result, two_over_pi, places=10, msg=f"Expected ~{two_over_pi}, got {result}")
-
-  def test_trig_preop_f64_index1(self):
-    """V_TRIG_PREOP_F64 index=1: secondary chunk (extended precision bits)."""
-    instructions = [
-      s_mov_b32(s[0], 0x00000000),  # low bits of 1.0
-      s_mov_b32(s[1], 0x3ff00000),  # high bits of 1.0
-      v_trig_preop_f64(v[0], abs(s[0]), 1),  # index 1
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
-    # Index 1 gives the next 53 bits, should be very small (~1e-16)
-    self.assertLess(abs(result), 1e-15, msg=f"Expected tiny value, got {result}")
-    self.assertGreater(abs(result), 0, msg="Expected non-zero value")
-
-  def test_trig_preop_f64_index2(self):
-    """V_TRIG_PREOP_F64 index=2: tertiary chunk (more extended precision bits)."""
-    instructions = [
-      s_mov_b32(s[0], 0x00000000),  # low bits of 1.0
-      s_mov_b32(s[1], 0x3ff00000),  # high bits of 1.0
-      v_trig_preop_f64(v[0], abs(s[0]), 2),  # index 2
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
-    # Index 2 gives the next 53 bits after index 1, should be tiny (~1e-32)
-    self.assertLess(abs(result), 1e-30, msg=f"Expected very tiny value, got {result}")
-
-  def test_trig_preop_f64_sum_equals_two_over_pi(self):
-    """V_TRIG_PREOP_F64: sum of chunks 0,1,2 should equal 2/PI."""
-    import math
-    two_over_pi = 2.0 / math.pi
-    instructions = [
-      s_mov_b32(s[0], 0x00000000),  # low bits of 1.0
-      s_mov_b32(s[1], 0x3ff00000),  # high bits of 1.0
-      v_trig_preop_f64(v[0], abs(s[0]), 0),  # index 0 -> v[0:1]
-      v_trig_preop_f64(v[2], abs(s[0]), 1),  # index 1 -> v[2:3]
-      v_trig_preop_f64(v[4], abs(s[0]), 2),  # index 2 -> v[4:5]
-    ]
-    st = run_program(instructions, n_lanes=1)
-    p0 = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
-    p1 = i642f(st.vgpr[0][2] | (st.vgpr[0][3] << 32))
-    p2 = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32))
-    total = p0 + p1 + p2
-    self.assertAlmostEqual(total, two_over_pi, places=14, msg=f"Expected {two_over_pi}, got {total} (p0={p0}, p1={p1}, p2={p2})")
-
-  def test_trig_preop_f64_large_input(self):
-    """V_TRIG_PREOP_F64 with larger input should adjust shift based on exponent."""
-    import math
-    # For x=2.0, exponent(2.0)=1024 which is <= 1077, so no adjustment
-    # But let's test with x=2^60 where exponent > 1077
-    large_val = 2.0 ** 60  # exponent = 1083 > 1077
-    large_bits = f2i64(large_val)
-    instructions = [
-      s_mov_b32(s[0], large_bits & 0xffffffff),
-      s_mov_b32(s[1], (large_bits >> 32) & 0xffffffff),
-      v_trig_preop_f64(v[0], abs(s[0]), 0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
-    # Result should still be a valid float (not NaN or inf)
-    self.assertFalse(math.isnan(result), "Result should not be NaN")
-    self.assertFalse(math.isinf(result), "Result should not be inf")
-
-
-class Test64BitLiterals(unittest.TestCase):
-  """Regression tests for 64-bit instruction literal encoding.
-  Tests verify that Inst.to_bytes() correctly encodes 64-bit literals."""
-
-  def test_64bit_literal_negative_encoding(self):
-    """Verify 64-bit instruction encodes negative literals correctly.
-    Regression test: -33 should encode as 0xffffffdf in the literal field,
-    NOT as 0xffffffff (which would happen with incorrect sign extension)."""
-    neg_val = -33
-    expected_lit = neg_val & 0xffffffff  # 0xffffffdf
-    inst = v_add_f64(v[2], v[0], neg_val)
-    # Check the literal is stored correctly (in high 32 bits for 64-bit ops)
-    self.assertIsNotNone(inst._literal, "Literal should be set")
-    # Literal is stored as (lit32 << 32) for 64-bit ops
-    actual_lit = (inst._literal >> 32) & 0xffffffff
-    self.assertEqual(actual_lit, expected_lit, f"Literal should be {expected_lit:#x}, got {actual_lit:#x}")
-    # Also verify the encoded bytes
-    code = inst.to_bytes()
-    # Literal is last 4 bytes
-    lit_bytes = code[-4:]
-    lit_val = int.from_bytes(lit_bytes, 'little')
-    self.assertEqual(lit_val, expected_lit, f"Encoded literal should be {expected_lit:#x}, got {lit_val:#x}")
-
-  def test_64bit_literal_positive_encoding(self):
-    """Verify 64-bit instruction encodes large positive literals correctly."""
-    large_val = 0x12345678
-    inst = v_add_f64(v[2], v[0], large_val)
-    self.assertIsNotNone(inst._literal, "Literal should be set")
-    actual_lit = (inst._literal >> 32) & 0xffffffff
-    self.assertEqual(actual_lit, large_val, f"Literal should be {large_val:#x}, got {actual_lit:#x}")
-    # Verify encoded bytes
-    code = inst.to_bytes()
-    lit_bytes = code[-4:]
-    lit_val = int.from_bytes(lit_bytes, 'little')
-    self.assertEqual(lit_val, large_val, f"Encoded literal should be {large_val:#x}, got {lit_val:#x}")
-
-
-class TestWave32VCCBranch(unittest.TestCase):
-  """Regression tests for wave32 VCC branch behavior.
-  In wave32 mode, S_CBRANCH_VCCNZ/VCCZ should only check VCC_LO (lower 32 bits),
-  ignoring VCC_HI. Bug: emulator was checking full 64-bit VCC, causing incorrect
-  branches when VCC_LO=0 but VCC_HI!=0."""
-
-  def test_cbranch_vccnz_ignores_vcc_hi(self):
-    """S_CBRANCH_VCCNZ should NOT branch when VCC_LO=0, even if VCC_HI!=0.
-    This is the fix for test_avg_pool3d failure where the emulator incorrectly
-    branched due to stale VCC_HI bits."""
-    instructions = [
-      # Set VCC_HI to non-zero (simulating stale bits from previous ops)
-      s_mov_b32(s[SrcEnum.VCC_HI - 128], 0x80000000),  # VCC_HI = 0x80000000
-      # Set VCC_LO to zero (the condition we're testing)
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 0),  # VCC_LO = 0
-      # Now S_CBRANCH_VCCNZ should NOT branch since VCC_LO is 0
-      # If it doesn't branch, we'll set v0 = 1; if it branches, v0 stays 0
-      v_mov_b32_e32(v[0], 0),
-      s_cbranch_vccnz(2),  # Skip next instruction if VCC != 0
-      v_mov_b32_e32(v[0], 1),  # This should execute
-      s_nop(0),  # Jump target
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # v0 should be 1 because VCC_LO=0 means no branch
-    self.assertEqual(st.vgpr[0][0], 1, "Should NOT branch when VCC_LO=0 (VCC_HI ignored in wave32)")
-
-  def test_cbranch_vccz_ignores_vcc_hi(self):
-    """S_CBRANCH_VCCZ should branch when VCC_LO=0, regardless of VCC_HI."""
-    instructions = [
-      # Set VCC_HI to non-zero (simulating stale bits)
-      s_mov_b32(s[SrcEnum.VCC_HI - 128], 0x80000000),  # VCC_HI = 0x80000000
-      # Set VCC_LO to zero
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 0),  # VCC_LO = 0
-      # S_CBRANCH_VCCZ should branch since VCC_LO is 0
-      v_mov_b32_e32(v[0], 0),
-      s_cbranch_vccz(2),  # Skip next instruction if VCC == 0
-      v_mov_b32_e32(v[0], 1),  # This should NOT execute
-      s_nop(0),  # Jump target
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # v0 should be 0 because VCC_LO=0 means branch is taken
-    self.assertEqual(st.vgpr[0][0], 0, "Should branch when VCC_LO=0 (VCC_HI ignored in wave32)")
-
-  def test_cbranch_vccnz_branches_on_vcc_lo(self):
-    """S_CBRANCH_VCCNZ should branch when VCC_LO!=0."""
-    instructions = [
-      # Set VCC_LO to non-zero
-      s_mov_b32(s[SrcEnum.VCC_LO - 128], 1),  # VCC_LO = 1
-      s_mov_b32(s[SrcEnum.VCC_HI - 128], 0),  # VCC_HI = 0
-      v_mov_b32_e32(v[0], 0),
-      s_cbranch_vccnz(2),  # Skip next instruction if VCC != 0
-      v_mov_b32_e32(v[0], 1),  # This should NOT execute
-      s_nop(0),  # Jump target
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # v0 should be 0 because VCC_LO=1 means branch is taken
-    self.assertEqual(st.vgpr[0][0], 0, "Should branch when VCC_LO!=0")
-
-
-class TestVOP3VOPC16Bit(unittest.TestCase):
-  """Regression tests for VOP3-encoded VOPC 16-bit comparison instructions.
-  When VOPC comparisons are encoded in VOP3 format, they use opsel bits to select
-  which 16-bit half of each source to compare.
-  Bug: Emulator was ignoring opsel and using VGPR bit 7 encoding instead."""
-
-  def test_cmp_eq_u16_opsel_lo_lo(self):
-    """V_CMP_EQ_U16 VOP3 with opsel=0 compares lo halves."""
-    # v0 = 0x12340005 (lo=5, hi=0x1234)
-    # v1 = 0x56780005 (lo=5, hi=0x5678)
-    # opsel=0: compare lo halves -> 5 == 5 -> true
-    instructions = [
-      s_mov_b32(s[2], 0x12340005),
-      v_mov_b32_e32(v[0], s[2]),
-      s_mov_b32(s[2], 0x56780005),
-      v_mov_b32_e32(v[1], s[2]),
-      VOP3(VOP3Op.V_CMP_EQ_U16, vdst=v[0], src0=v[0], src1=v[1], opsel=0),  # dst=s0
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # s0 should have bit 0 set (comparison true for lane 0)
-    self.assertEqual(st.sgpr[0] & 1, 1, "lo==lo should be true: 5==5")
-
-  def test_cmp_eq_u16_opsel_hi_hi(self):
-    """V_CMP_EQ_U16 VOP3 with opsel=3 compares hi halves."""
-    # v0 = 0x12340005 (lo=5, hi=0x1234)
-    # v1 = 0x56780005 (lo=5, hi=0x5678)
-    # opsel=3 (bits 0 and 1 set): compare hi halves -> 0x1234 != 0x5678 -> false
-    instructions = [
-      s_mov_b32(s[2], 0x12340005),
-      v_mov_b32_e32(v[0], s[2]),
-      s_mov_b32(s[2], 0x56780005),
-      v_mov_b32_e32(v[1], s[2]),
-      VOP3(VOP3Op.V_CMP_EQ_U16, vdst=v[0], src0=v[0], src1=v[1], opsel=3),  # dst=s0, hi vs hi
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # s0 should have bit 0 clear (comparison false for lane 0)
-    self.assertEqual(st.sgpr[0] & 1, 0, "hi==hi should be false: 0x1234!=0x5678")
-
-  def test_cmp_eq_u16_opsel_hi_hi_equal(self):
-    """V_CMP_EQ_U16 VOP3 with opsel=3 compares hi halves (equal case)."""
-    # v0 = 0x12340005 (lo=5, hi=0x1234)
-    # v1 = 0x12340009 (lo=9, hi=0x1234)
-    # opsel=3: compare hi halves -> 0x1234 == 0x1234 -> true
-    instructions = [
-      s_mov_b32(s[2], 0x12340005),
-      v_mov_b32_e32(v[0], s[2]),
-      s_mov_b32(s[2], 0x12340009),
-      v_mov_b32_e32(v[1], s[2]),
-      VOP3(VOP3Op.V_CMP_EQ_U16, vdst=v[0], src0=v[0], src1=v[1], opsel=3),  # dst=s0, hi vs hi
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # s0 should have bit 0 set (comparison true for lane 0)
-    self.assertEqual(st.sgpr[0] & 1, 1, "hi==hi should be true: 0x1234==0x1234")
-
-  def test_cmp_gt_u16_opsel_hi(self):
-    """V_CMP_GT_U16 VOP3 with opsel=3 compares hi halves."""
-    # v0 = 0x99990005 (lo=5, hi=0x9999)
-    # v1 = 0x12340005 (lo=5, hi=0x1234)
-    # opsel=3: compare hi halves -> 0x9999 > 0x1234 -> true
-    instructions = [
-      s_mov_b32(s[2], 0x99990005),
-      v_mov_b32_e32(v[0], s[2]),
-      s_mov_b32(s[2], 0x12340005),
-      v_mov_b32_e32(v[1], s[2]),
-      VOP3(VOP3Op.V_CMP_GT_U16, vdst=v[0], src0=v[0], src1=v[1], opsel=3),  # dst=s0, hi vs hi
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # s0 should have bit 0 set (comparison true for lane 0)
-    self.assertEqual(st.sgpr[0] & 1, 1, "hi>hi should be true: 0x9999>0x1234")
-
-
-class Test64BitLiteralSources(unittest.TestCase):
-  """Regression tests for 64-bit instruction literal source handling.
-
-  For f64 operations, a 32-bit literal in the instruction stream represents the
-  HIGH 32 bits of the 64-bit value (low 32 bits are implicitly 0).
-
-  Bug: rsrc64() was returning the 32-bit literal as-is instead of shifting it
-  left by 32 bits. This caused V_FMA_F64 and V_LDEXP_F64 to use wrong values
-  when their source is a literal, breaking the f64->i64 conversion sequence.
-
-  The f64->i64 conversion sequence is:
-    v_trunc_f64 -> v_ldexp_f64 (by -32) -> v_floor_f64 -> v_fma_f64 (by -2^32)
-    -> v_cvt_u32_f64 (low bits) -> v_cvt_i32_f64 (high bits)
-
-  The V_FMA_F64 uses literal 0xC1F00000 which is the high 32 bits of f64 -2^32.
-  """
-
-  def test_v_fma_f64_literal_neg_2pow32(self):
-    """V_FMA_F64 with literal encoding of -2^32.
-
-    The f64 value -2^32 (-4294967296.0) has bits 0xC1F0000000000000.
-    The compiler encodes only the high 32 bits (0xC1F00000) as a literal.
-    The emulator must interpret this as 0xC1F00000_00000000.
-    """
-    # v[0:1] = -41.0 (trunc), v[2:3] = -1.0 (floor of -41/2^32)
-    # FMA: result = (-2^32) * (-1.0) + (-41.0) = 4294967296 - 41 = 4294967255.0
-    val_41 = f2i64(-41.0)
-    val_m1 = f2i64(-1.0)
-    # Literal 0xC1F00000 is high 32 bits of f64 -2^32
-    lit = 0xC1F00000
-    instructions = [
-      s_mov_b32(s[0], val_41 & 0xffffffff),
-      s_mov_b32(s[1], (val_41 >> 32) & 0xffffffff),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      s_mov_b32(s[2], val_m1 & 0xffffffff),
-      s_mov_b32(s[3], (val_m1 >> 32) & 0xffffffff),
-      v_mov_b32_e32(v[2], s[2]),
-      v_mov_b32_e32(v[3], s[3]),
-      # V_FMA_F64 v[4:5], literal, v[2:3], v[0:1]
-      # = (-2^32) * (-1.0) + (-41.0) = 4294967255.0
-      VOP3(VOP3Op.V_FMA_F64, vdst=v[4], src0=RawImm(255), src1=v[2], src2=v[0], literal=lit),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32))
-    expected = 4294967255.0  # 2^32 - 41
-    self.assertAlmostEqual(result, expected, places=0, msg=f"Expected {expected}, got {result}")
-
-  def test_v_ldexp_f64_literal_neg32(self):
-    """V_LDEXP_F64 with literal -32 for exponent.
-
-    V_LDEXP_F64 computes src0 * 2^src1 where src1 is an integer exponent.
-    The literal 0xFFFFFFE0 represents -32 as a 32-bit signed integer.
-    For V_LDEXP_F64, src1 is 32-bit (not 64-bit), so this is correct as-is.
-    """
-    val = f2i64(-41.0)
-    expected = -41.0 * (2.0 ** -32)  # -9.5367431640625e-09
-    instructions = [
-      s_mov_b32(s[0], val & 0xffffffff),
-      s_mov_b32(s[1], (val >> 32) & 0xffffffff),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      # V_LDEXP_F64 v[2:3], v[0:1], -32
-      v_ldexp_f64(v[2:4], v[0:2], 0xFFFFFFE0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = i642f(st.vgpr[0][2] | (st.vgpr[0][3] << 32))
-    self.assertAlmostEqual(result, expected, places=15, msg=f"Expected {expected}, got {result}")
-
-  def test_f64_to_i64_full_sequence(self):
-    """Full f64->i64 conversion sequence with negative value.
-
-    This is the exact sequence generated by the compiler for (long)(-41.0):
-      v_trunc_f64 v[0:1], v[0:1]
-      v_ldexp_f64 v[2:3], v[0:1], -32
-      v_floor_f64 v[2:3], v[2:3]
-      v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]  # -2^32
-      v_cvt_u32_f64 v0, v[0:1]
-      v_cvt_i32_f64 v1, v[2:3]
-
-    Result: v1:v0 = 0xFFFFFFFF:0xFFFFFFD7 = -41 as i64
-    """
-    val = f2i64(-41.0)
-    lit = 0xC1F00000  # high 32 bits of f64 -2^32
-    instructions = [
-      s_mov_b32(s[0], val & 0xffffffff),
-      s_mov_b32(s[1], (val >> 32) & 0xffffffff),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_trunc_f64_e32(v[0:2], v[0:2]),
-      v_ldexp_f64(v[2:4], v[0:2], 0xFFFFFFE0),  # -32
-      v_floor_f64_e32(v[2:4], v[2:4]),
-      VOP3(VOP3Op.V_FMA_F64, vdst=v[0], src0=RawImm(255), src1=v[2], src2=v[0], literal=lit),
-      v_cvt_u32_f64_e32(v[4], v[0:2]),
-      v_cvt_i32_f64_e32(v[5], v[2:4]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    lo = st.vgpr[0][4]
-    hi = st.vgpr[0][5]
-    result = struct.unpack('<q', struct.pack('<II', lo, hi))[0]
-    self.assertEqual(result, -41, f"Expected -41, got {result} (lo=0x{lo:08x}, hi=0x{hi:08x})")
-
-  def test_f64_to_i64_large_negative(self):
-    """f64->i64 conversion with larger negative value (-1000000).
-
-    Tests that the conversion sequence works for values that span both
-    high and low 32-bit parts of the result.
-    """
-    val = f2i64(-1000000.0)
-    lit = 0xC1F00000
-    instructions = [
-      s_mov_b32(s[0], val & 0xffffffff),
-      s_mov_b32(s[1], (val >> 32) & 0xffffffff),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_trunc_f64_e32(v[0:2], v[0:2]),
-      v_ldexp_f64(v[2:4], v[0:2], 0xFFFFFFE0),
-      v_floor_f64_e32(v[2:4], v[2:4]),
-      VOP3(VOP3Op.V_FMA_F64, vdst=v[0], src0=RawImm(255), src1=v[2], src2=v[0], literal=lit),
-      v_cvt_u32_f64_e32(v[4], v[0:2]),
-      v_cvt_i32_f64_e32(v[5], v[2:4]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    lo = st.vgpr[0][4]
-    hi = st.vgpr[0][5]
-    result = struct.unpack('<q', struct.pack('<II', lo, hi))[0]
-    self.assertEqual(result, -1000000, f"Expected -1000000, got {result}")
-
-  def test_f64_to_i64_positive(self):
-    """f64->i64 conversion with positive value (1000000)."""
-    val = f2i64(1000000.0)
-    lit = 0xC1F00000
-    instructions = [
-      s_mov_b32(s[0], val & 0xffffffff),
-      s_mov_b32(s[1], (val >> 32) & 0xffffffff),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_trunc_f64_e32(v[0:2], v[0:2]),
-      v_ldexp_f64(v[2:4], v[0:2], 0xFFFFFFE0),
-      v_floor_f64_e32(v[2:4], v[2:4]),
-      VOP3(VOP3Op.V_FMA_F64, vdst=v[0], src0=RawImm(255), src1=v[2], src2=v[0], literal=lit),
-      v_cvt_u32_f64_e32(v[4], v[0:2]),
-      v_cvt_i32_f64_e32(v[5], v[2:4]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    lo = st.vgpr[0][4]
-    hi = st.vgpr[0][5]
-    result = struct.unpack('<q', struct.pack('<II', lo, hi))[0]
-    self.assertEqual(result, 1000000, f"Expected 1000000, got {result}")
-
-  def test_f64_to_i64_large_positive(self):
-    """f64->i64 conversion with value > 2^32 (requires 64-bit result)."""
-    val = f2i64(5000000000.0)  # 5 billion, > 2^32
-    lit = 0xC1F00000
-    instructions = [
-      s_mov_b32(s[0], val & 0xffffffff),
-      s_mov_b32(s[1], (val >> 32) & 0xffffffff),
-      v_mov_b32_e32(v[0], s[0]),
-      v_mov_b32_e32(v[1], s[1]),
-      v_trunc_f64_e32(v[0:2], v[0:2]),
-      v_ldexp_f64(v[2:4], v[0:2], 0xFFFFFFE0),
-      v_floor_f64_e32(v[2:4], v[2:4]),
-      VOP3(VOP3Op.V_FMA_F64, vdst=v[0], src0=RawImm(255), src1=v[2], src2=v[0], literal=lit),
-      v_cvt_u32_f64_e32(v[4], v[0:2]),
-      v_cvt_i32_f64_e32(v[5], v[2:4]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    lo = st.vgpr[0][4]
-    hi = st.vgpr[0][5]
-    result = struct.unpack('<q', struct.pack('<II', lo, hi))[0]
-    self.assertEqual(result, 5000000000, f"Expected 5000000000, got {result}")
-
-
-class TestDS2Addr(unittest.TestCase):
-  """Regression tests for DS_LOAD_2ADDR and DS_STORE_2ADDR instructions.
-  These ops use offset scaling: offset * sizeof(data) for address calculation.
-  Bug: Emulator was using offset*4 for both B32 and B64, but B64 needs offset*8."""
-
-  def test_ds_store_load_2addr_b32(self):
-    """DS_STORE_2ADDR_B32 and DS_LOAD_2ADDR_B32 with offset scaling by 4."""
-    # Store 0x12345678 at offset0=0 (*4=0) and 0xDEADBEEF at offset1=1 (*4=4)
-    # Then load them back
-    instructions = [
-      v_mov_b32_e32(v[10], 0),  # addr base = 0
-      s_mov_b32(s[2], 0x12345678),
-      v_mov_b32_e32(v[0], s[2]),  # data0
-      s_mov_b32(s[2], 0xDEADBEEF),
-      v_mov_b32_e32(v[1], s[2]),  # data1
-      DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=0, offset1=1),
-      s_waitcnt(lgkmcnt=0),
-      DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[2], offset0=0, offset1=1),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 0x12345678, "v2 should have value from offset 0")
-    self.assertEqual(st.vgpr[0][3], 0xDEADBEEF, "v3 should have value from offset 4")
-
-  def test_ds_store_load_2addr_b32_nonzero_offsets(self):
-    """DS_STORE_2ADDR_B32 with non-zero offsets (offset*4 scaling)."""
-    # Store at offset0=2 (*4=8) and offset1=5 (*4=20)
-    instructions = [
-      v_mov_b32_e32(v[10], 0),  # addr base = 0
-      s_mov_b32(s[2], 0x11111111),
-      v_mov_b32_e32(v[0], s[2]),
-      s_mov_b32(s[2], 0x22222222),
-      v_mov_b32_e32(v[1], s[2]),
-      DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=2, offset1=5),
-      s_waitcnt(lgkmcnt=0),
-      DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[2], offset0=2, offset1=5),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 0x11111111, "v2 should have value from offset 8 (2*4)")
-    self.assertEqual(st.vgpr[0][3], 0x22222222, "v3 should have value from offset 20 (5*4)")
-
-  def test_ds_store_load_2addr_b64(self):
-    """DS_STORE_2ADDR_B64 and DS_LOAD_2ADDR_B64 with offset scaling by 8."""
-    # For B64: each value is 8 bytes (2 dwords), offsets scaled by 8
-    # Store 64-bit value at offset0=0 (*8=0) and another at offset1=1 (*8=8)
-    instructions = [
-      v_mov_b32_e32(v[10], 0),  # addr base = 0
-      # First 64-bit value: 0x123456789ABCDEF0
-      s_mov_b32(s[2], 0x9ABCDEF0),
-      v_mov_b32_e32(v[0], s[2]),  # low dword
-      s_mov_b32(s[2], 0x12345678),
-      v_mov_b32_e32(v[1], s[2]),  # high dword
-      # Second 64-bit value: 0xDEADBEEFCAFEBABE
-      s_mov_b32(s[2], 0xCAFEBABE),
-      v_mov_b32_e32(v[2], s[2]),  # low dword
-      s_mov_b32(s[2], 0xDEADBEEF),
-      v_mov_b32_e32(v[3], s[2]),  # high dword
-      DS(DSOp.DS_STORE_2ADDR_B64, addr=v[10], data0=v[0], data1=v[2], vdst=v[0], offset0=0, offset1=1),
-      s_waitcnt(lgkmcnt=0),
-      DS(DSOp.DS_LOAD_2ADDR_B64, addr=v[10], vdst=v[4], offset0=0, offset1=1),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # v4,v5 = first 64-bit value from offset 0
-    self.assertEqual(st.vgpr[0][4], 0x9ABCDEF0, "v4 should have low dword of first value")
-    self.assertEqual(st.vgpr[0][5], 0x12345678, "v5 should have high dword of first value")
-    # v6,v7 = second 64-bit value from offset 8 (1*8)
-    self.assertEqual(st.vgpr[0][6], 0xCAFEBABE, "v6 should have low dword of second value")
-    self.assertEqual(st.vgpr[0][7], 0xDEADBEEF, "v7 should have high dword of second value")
-
-  def test_ds_2addr_b64_no_overlap(self):
-    """DS_LOAD_2ADDR_B64 with adjacent offsets should not overlap.
-    Regression test: offset1=1 should access bytes 8-15, not overlap with offset0=0 (bytes 0-7)."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      # Store 4 distinct dwords at addresses 0,4,8,12 using regular DS_STORE
-      s_mov_b32(s[2], 0x11111111),
-      v_mov_b32_e32(v[0], s[2]),
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_mov_b32(s[2], 0x22222222),
-      v_mov_b32_e32(v[0], s[2]),
-      ds_store_b32(addr=v[10], data0=v[0], offset0=4),
-      s_mov_b32(s[2], 0x33333333),
-      v_mov_b32_e32(v[0], s[2]),
-      ds_store_b32(addr=v[10], data0=v[0], offset0=8),
-      s_mov_b32(s[2], 0x44444444),
-      v_mov_b32_e32(v[0], s[2]),
-      ds_store_b32(addr=v[10], data0=v[0], offset0=12),
-      s_waitcnt(lgkmcnt=0),
-      # Load with DS_LOAD_2ADDR_B64: offset0=0 should get 0-7, offset1=1 should get 8-15
-      DS(DSOp.DS_LOAD_2ADDR_B64, addr=v[10], vdst=v[4], offset0=0, offset1=1),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # v4,v5 from addr 0-7: 0x11111111, 0x22222222
-    self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should be 0x11111111")
-    self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should be 0x22222222")
-    # v6,v7 from addr 8-15: 0x33333333, 0x44444444
-    self.assertEqual(st.vgpr[0][6], 0x33333333, "v6 should be 0x33333333")
-    self.assertEqual(st.vgpr[0][7], 0x44444444, "v7 should be 0x44444444")
-
-
-class TestDSAtomic(unittest.TestCase):
-  """Tests for DS atomic instructions (add, max, min, and, or, xor, cmpstore, etc.)."""
-
-  def test_ds_max_rtn_u32(self):
-    """DS_MAX_RTN_U32: atomically store max(mem, data) and return old value."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),  # addr = 0
-      s_mov_b32(s[2], 100),
-      v_mov_b32_e32(v[0], s[2]),  # initial value = 100
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[2], 200),
-      v_mov_b32_e32(v[1], s[2]),  # data = 200 (greater than 100)
-      ds_max_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),  # read result
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 100, "v2 should have old value (100)")
-    self.assertEqual(st.vgpr[0][3], 200, "v3 should have max(100, 200) = 200")
-
-  def test_ds_max_u32_no_rtn(self):
-    """DS_MAX_U32 (no RTN): atomically store max, no return value."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[2], 100),
-      v_mov_b32_e32(v[0], s[2]),  # initial = 100
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[2], 200),
-      v_mov_b32_e32(v[1], s[2]),  # data = 200
-      ds_max_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][3], 200, "v3 should have max(100, 200) = 200")
-
-  def test_ds_add_u32_no_rtn_preserves_vdst(self):
-    """DS_ADD_U32 (no RTN) should NOT write to vdst - vdst should preserve sentinel value."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      # Set sentinel value in vdst
-      s_mov_b32(s[2], 0xDEADBEEF),
-      v_mov_b32_e32(v[2], s[2]),  # sentinel in v2
-      # Store initial value
-      s_mov_b32(s[2], 100),
-      v_mov_b32_e32(v[0], s[2]),
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      # Do non-RTN add (should NOT write to v2)
-      s_mov_b32(s[2], 50),
-      v_mov_b32_e32(v[1], s[2]),
-      ds_add_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      # Load result to verify add worked
-      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 0xDEADBEEF, "v2 should preserve sentinel (no RTN)")
-    self.assertEqual(st.vgpr[0][3], 150, "v3 should have 100 + 50 = 150")
-
-  def test_ds_add_rtn_u32_writes_vdst(self):
-    """DS_ADD_RTN_U32 should write old value to vdst."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      # Set sentinel value in vdst
-      s_mov_b32(s[2], 0xDEADBEEF),
-      v_mov_b32_e32(v[2], s[2]),  # sentinel in v2
-      # Store initial value
-      s_mov_b32(s[2], 100),
-      v_mov_b32_e32(v[0], s[2]),
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      # Do RTN add (SHOULD write old value to v2)
-      s_mov_b32(s[2], 50),
-      v_mov_b32_e32(v[1], s[2]),
-      ds_add_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      # Load result to verify add worked
-      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 100, "v2 should have old value (100)")
-    self.assertEqual(st.vgpr[0][3], 150, "v3 should have 100 + 50 = 150")
-
-  def test_ds_min_rtn_u32(self):
-    """DS_MIN_RTN_U32: atomically store min(mem, data) and return old value."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[2], 200),
-      v_mov_b32_e32(v[0], s[2]),  # initial = 200
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[2], 100),
-      v_mov_b32_e32(v[1], s[2]),  # data = 100
-      ds_min_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 200, "v2 should have old value (200)")
-    self.assertEqual(st.vgpr[0][3], 100, "v3 should have min(200, 100) = 100")
-
-  def test_ds_and_rtn_b32(self):
-    """DS_AND_RTN_B32: atomically AND mem with data and return old value."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[2], 0xFF00FF00),
-      v_mov_b32_e32(v[0], s[2]),  # initial = 0xFF00FF00
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[2], 0xFFFF0000),
-      v_mov_b32_e32(v[1], s[2]),  # data = 0xFFFF0000
-      ds_and_rtn_b32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 0xFF00FF00, "v2 should have old value")
-    self.assertEqual(st.vgpr[0][3], 0xFF000000, "v3 should have 0xFF00FF00 & 0xFFFF0000 = 0xFF000000")
-
-  def test_ds_or_rtn_b32(self):
-    """DS_OR_RTN_B32: atomically OR mem with data and return old value."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[2], 0x00FF0000),
-      v_mov_b32_e32(v[0], s[2]),  # initial = 0x00FF0000
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[2], 0x000000FF),
-      v_mov_b32_e32(v[1], s[2]),  # data = 0x000000FF
-      ds_or_rtn_b32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 0x00FF0000, "v2 should have old value")
-    self.assertEqual(st.vgpr[0][3], 0x00FF00FF, "v3 should have 0x00FF0000 | 0x000000FF = 0x00FF00FF")
-
-  def test_ds_xor_rtn_b32(self):
-    """DS_XOR_RTN_B32: atomically XOR mem with data and return old value."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[2], 0xAAAAAAAA),
-      v_mov_b32_e32(v[0], s[2]),  # initial = 0xAAAAAAAA
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[2], 0xFFFFFFFF),
-      v_mov_b32_e32(v[1], s[2]),  # data = 0xFFFFFFFF
-      ds_xor_rtn_b32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA, "v2 should have old value")
-    self.assertEqual(st.vgpr[0][3], 0x55555555, "v3 should have 0xAAAAAAAA ^ 0xFFFFFFFF = 0x55555555")
-
-  def test_ds_cmpstore_b32_match(self):
-    """DS_CMPSTORE_B32: conditional store when compare matches."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[2], 100),
-      v_mov_b32_e32(v[0], s[2]),  # initial = 100
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[2], 200),
-      v_mov_b32_e32(v[1], s[2]),  # new value = 200
-      s_mov_b32(s[2], 100),
-      v_mov_b32_e32(v[2], s[2]),  # compare = 100 (matches current)
-      ds_cmpstore_b32(addr=v[10], data0=v[1], data1=v[2], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[10], vdst=v[4], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][4], 200, "mem should be updated to 200 (compare matched)")
-
-  def test_ds_cmpstore_b32_no_match(self):
-    """DS_CMPSTORE_B32: no store when compare doesn't match."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[2], 100),
-      v_mov_b32_e32(v[0], s[2]),  # initial = 100
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[2], 200),
-      v_mov_b32_e32(v[1], s[2]),  # new value = 200
-      s_mov_b32(s[2], 50),
-      v_mov_b32_e32(v[2], s[2]),  # compare = 50 (doesn't match 100)
-      ds_cmpstore_b32(addr=v[10], data0=v[1], data1=v[2], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[10], vdst=v[4], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][4], 100, "mem should still be 100 (compare didn't match)")
-
-  def test_ds_inc_rtn_u32(self):
-    """DS_INC_RTN_U32: increment with wrap, return old value."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[2], 5),
-      v_mov_b32_e32(v[0], s[2]),  # initial = 5
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[2], 10),
-      v_mov_b32_e32(v[1], s[2]),  # limit = 10
-      ds_inc_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 5, "v2 should have old value (5)")
-    self.assertEqual(st.vgpr[0][3], 6, "v3 should have incremented value (6)")
-
-  def test_ds_dec_rtn_u32(self):
-    """DS_DEC_RTN_U32: decrement with wrap, return old value."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[2], 5),
-      v_mov_b32_e32(v[0], s[2]),  # initial = 5
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[2], 10),
-      v_mov_b32_e32(v[1], s[2]),  # limit = 10
-      ds_dec_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 5, "v2 should have old value (5)")
-    self.assertEqual(st.vgpr[0][3], 4, "v3 should have decremented value (4)")
-
-  def test_ds_dec_rtn_u32_wrap(self):
-    """DS_DEC_RTN_U32: wraps to limit when value is 0 or > limit."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[2], 0),
-      v_mov_b32_e32(v[0], s[2]),  # initial = 0
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[2], 10),
-      v_mov_b32_e32(v[1], s[2]),  # limit = 10
-      ds_dec_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 0, "v2 should have old value (0)")
-    self.assertEqual(st.vgpr[0][3], 10, "v3 should wrap to limit (10)")
-
-
-class TestDSRegisterWidth(unittest.TestCase):
-  """Regression tests: DS loads should only write the correct number of VGPRs."""
-
-  def test_ds_load_b32_no_overwrite(self):
-    """DS_LOAD_B32 should only write 1 VGPR, not overwrite subsequent registers."""
-    instructions = [
-      v_mov_b32_e32(v[0], 0),      # addr = 0
-      s_mov_b32(s[0], 0xDEADBEEF),
-      v_mov_b32_e32(v[1], s[0]),   # store value
-      s_mov_b32(s[0], 0x11111111),
-      v_mov_b32_e32(v[2], s[0]),   # sentinel
-      s_mov_b32(s[0], 0x22222222),
-      v_mov_b32_e32(v[3], s[0]),   # sentinel
-      s_mov_b32(s[0], 0x33333333),
-      v_mov_b32_e32(v[4], s[0]),   # sentinel
-      ds_store_b32(addr=v[0], data0=v[1], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[0], vdst=v[1], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][1], 0xDEADBEEF, "v1 should have loaded value")
-    self.assertEqual(st.vgpr[0][2], 0x11111111, "v2 should be untouched")
-    self.assertEqual(st.vgpr[0][3], 0x22222222, "v3 should be untouched")
-    self.assertEqual(st.vgpr[0][4], 0x33333333, "v4 should be untouched")
-
-  def test_ds_load_b64_no_overwrite(self):
-    """DS_LOAD_B64 should only write 2 VGPRs, not overwrite subsequent registers."""
-    instructions = [
-      v_mov_b32_e32(v[0], 0),      # addr = 0
-      s_mov_b32(s[0], 0xDEADBEEF),
-      v_mov_b32_e32(v[1], s[0]),   # low dword
-      s_mov_b32(s[0], 0xCAFEBABE),
-      v_mov_b32_e32(v[2], s[0]),   # high dword
-      s_mov_b32(s[0], 0x11111111),
-      v_mov_b32_e32(v[5], s[0]),   # sentinel
-      s_mov_b32(s[0], 0x22222222),
-      v_mov_b32_e32(v[6], s[0]),   # sentinel
-      DS(DSOp.DS_STORE_B64, addr=v[0], data0=v[1], vdst=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      DS(DSOp.DS_LOAD_B64, addr=v[0], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][3], 0xDEADBEEF, "v3 should have low dword")
-    self.assertEqual(st.vgpr[0][4], 0xCAFEBABE, "v4 should have high dword")
-    self.assertEqual(st.vgpr[0][5], 0x11111111, "v5 should be untouched")
-    self.assertEqual(st.vgpr[0][6], 0x22222222, "v6 should be untouched")
-
-  def test_ds_load_2addr_b32_no_overwrite(self):
-    """DS_LOAD_2ADDR_B32 should only write 2 VGPRs, not overwrite subsequent registers."""
-    instructions = [
-      v_mov_b32_e32(v[0], 0),      # addr = 0
-      s_mov_b32(s[0], 0xAAAAAAAA),
-      v_mov_b32_e32(v[1], s[0]),   # first value
-      s_mov_b32(s[0], 0xBBBBBBBB),
-      v_mov_b32_e32(v[2], s[0]),   # second value
-      s_mov_b32(s[0], 0x11111111),
-      v_mov_b32_e32(v[5], s[0]),   # sentinel
-      s_mov_b32(s[0], 0x22222222),
-      v_mov_b32_e32(v[6], s[0]),   # sentinel
-      DS(DSOp.DS_STORE_2ADDR_B32, addr=v[0], data0=v[1], data1=v[2], vdst=v[0], offset0=0, offset1=1),
-      s_waitcnt(lgkmcnt=0),
-      DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[0], vdst=v[3], offset0=0, offset1=1),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][3], 0xAAAAAAAA, "v3 should have first value")
-    self.assertEqual(st.vgpr[0][4], 0xBBBBBBBB, "v4 should have second value")
-    self.assertEqual(st.vgpr[0][5], 0x11111111, "v5 should be untouched")
-    self.assertEqual(st.vgpr[0][6], 0x22222222, "v6 should be untouched")
-
-
-class TestDS2AddrStride64(unittest.TestCase):
-  """Tests for DS_*_2ADDR_STRIDE64 instructions (offset * 256 for B32, offset * 512 for B64)."""
-
-  def test_ds_store_load_2addr_stride64_b32(self):
-    """DS_STORE_2ADDR_STRIDE64_B32: stores at ADDR + offset*256."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),     # base addr = 0
-      s_mov_b32(s[0], 0xAAAAAAAA),
-      v_mov_b32_e32(v[0], s[0]),   # first value
-      s_mov_b32(s[0], 0xBBBBBBBB),
-      v_mov_b32_e32(v[1], s[0]),   # second value
-      # Store with STRIDE64: offset0=1 -> addr 256, offset1=2 -> addr 512
-      DS(DSOp.DS_STORE_2ADDR_STRIDE64_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=1, offset1=2),
-      s_waitcnt(lgkmcnt=0),
-      # Load back using STRIDE64
-      DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B32, addr=v[10], vdst=v[2], offset0=1, offset1=2),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA, "v2 should have value from addr 256")
-    self.assertEqual(st.vgpr[0][3], 0xBBBBBBBB, "v3 should have value from addr 512")
-
-  def test_ds_store_load_2addr_stride64_b64(self):
-    """DS_STORE_2ADDR_STRIDE64_B64: stores at ADDR + offset*512."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),     # base addr = 0
-      s_mov_b32(s[0], 0xDEADBEEF),
-      v_mov_b32_e32(v[0], s[0]),   # first value low
-      s_mov_b32(s[0], 0xCAFEBABE),
-      v_mov_b32_e32(v[1], s[0]),   # first value high
-      s_mov_b32(s[0], 0x12345678),
-      v_mov_b32_e32(v[2], s[0]),   # second value low
-      s_mov_b32(s[0], 0x9ABCDEF0),
-      v_mov_b32_e32(v[3], s[0]),   # second value high
-      # Store with STRIDE64: offset0=1 -> addr 512, offset1=2 -> addr 1024
-      DS(DSOp.DS_STORE_2ADDR_STRIDE64_B64, addr=v[10], data0=v[0], data1=v[2], vdst=v[0], offset0=1, offset1=2),
-      s_waitcnt(lgkmcnt=0),
-      # Load back using STRIDE64
-      DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B64, addr=v[10], vdst=v[4], offset0=1, offset1=2),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][4], 0xDEADBEEF, "v4 should have first low dword")
-    self.assertEqual(st.vgpr[0][5], 0xCAFEBABE, "v5 should have first high dword")
-    self.assertEqual(st.vgpr[0][6], 0x12345678, "v6 should have second low dword")
-    self.assertEqual(st.vgpr[0][7], 0x9ABCDEF0, "v7 should have second high dword")
-
-
-class TestDSStorexchg(unittest.TestCase):
-  """Tests for DS_STOREXCHG (exchange) instructions."""
-
-  def test_ds_storexchg_rtn_b32(self):
-    """DS_STOREXCHG_RTN_B32: exchange value and return old."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[0], 0xAAAAAAAA),
-      v_mov_b32_e32(v[0], s[0]),   # initial value
-      ds_store_b32(addr=v[10], data0=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[0], 0xBBBBBBBB),
-      v_mov_b32_e32(v[1], s[0]),   # new value
-      DS(DSOp.DS_STOREXCHG_RTN_B32, addr=v[10], data0=v[1], vdst=v[2], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      ds_load_b32(addr=v[10], vdst=v[3], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA, "v2 should have old value")
-    self.assertEqual(st.vgpr[0][3], 0xBBBBBBBB, "memory should have new value")
-
-  def test_ds_storexchg_2addr_rtn_b32(self):
-    """DS_STOREXCHG_2ADDR_RTN_B32: exchange at two addresses (offset*4)."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[0], 0x11111111),
-      v_mov_b32_e32(v[0], s[0]),   # initial at offset0
-      s_mov_b32(s[0], 0x22222222),
-      v_mov_b32_e32(v[1], s[0]),   # initial at offset1
-      # Store initial values at offset 0 and 4 (offset0=0, offset1=1, each *4)
-      DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=0, offset1=1),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[0], 0xAAAAAAAA),
-      v_mov_b32_e32(v[2], s[0]),   # new value for offset0
-      s_mov_b32(s[0], 0xBBBBBBBB),
-      v_mov_b32_e32(v[3], s[0]),   # new value for offset1
-      # Exchange: write new values, return old
-      DS(DSOp.DS_STOREXCHG_2ADDR_RTN_B32, addr=v[10], data0=v[2], data1=v[3], vdst=v[4], offset0=0, offset1=1),
-      s_waitcnt(lgkmcnt=0),
-      # Load back to verify new values
-      DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[6], offset0=0, offset1=1),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # Return value: v4=old[0], v5=old[1]
-    self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have old value from offset0")
-    self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should have old value from offset1")
-    # Memory should have new values
-    self.assertEqual(st.vgpr[0][6], 0xAAAAAAAA, "v6 should have new value at offset0")
-    self.assertEqual(st.vgpr[0][7], 0xBBBBBBBB, "v7 should have new value at offset1")
-
-  def test_ds_storexchg_2addr_stride64_rtn_b32(self):
-    """DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32: exchange at two addresses (offset*256)."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[0], 0x11111111),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[0], 0x22222222),
-      v_mov_b32_e32(v[1], s[0]),
-      # Store initial values at offset*256
-      DS(DSOp.DS_STORE_2ADDR_STRIDE64_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=1, offset1=2),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[0], 0xAAAAAAAA),
-      v_mov_b32_e32(v[2], s[0]),
-      s_mov_b32(s[0], 0xBBBBBBBB),
-      v_mov_b32_e32(v[3], s[0]),
-      # Exchange
-      DS(DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32, addr=v[10], data0=v[2], data1=v[3], vdst=v[4], offset0=1, offset1=2),
-      s_waitcnt(lgkmcnt=0),
-      # Load back
-      DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B32, addr=v[10], vdst=v[6], offset0=1, offset1=2),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have old value")
-    self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should have old value")
-    self.assertEqual(st.vgpr[0][6], 0xAAAAAAAA, "v6 should have new value")
-    self.assertEqual(st.vgpr[0][7], 0xBBBBBBBB, "v7 should have new value")
-
-  def test_ds_storexchg_rtn_b64(self):
-    """DS_STOREXCHG_RTN_B64: exchange 64-bit value and return old."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[0], 0xDEADBEEF),
-      v_mov_b32_e32(v[0], s[0]),   # initial low
-      s_mov_b32(s[0], 0xCAFEBABE),
-      v_mov_b32_e32(v[1], s[0]),   # initial high
-      DS(DSOp.DS_STORE_B64, addr=v[10], data0=v[0], vdst=v[0], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      s_mov_b32(s[0], 0x12345678),
-      v_mov_b32_e32(v[2], s[0]),   # new low
-      s_mov_b32(s[0], 0x9ABCDEF0),
-      v_mov_b32_e32(v[3], s[0]),   # new high
-      DS(DSOp.DS_STOREXCHG_RTN_B64, addr=v[10], data0=v[2], vdst=v[4], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-      DS(DSOp.DS_LOAD_B64, addr=v[10], vdst=v[6], offset0=0),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][4], 0xDEADBEEF, "v4 should have old low dword")
-    self.assertEqual(st.vgpr[0][5], 0xCAFEBABE, "v5 should have old high dword")
-    self.assertEqual(st.vgpr[0][6], 0x12345678, "v6 should have new low dword")
-    self.assertEqual(st.vgpr[0][7], 0x9ABCDEF0, "v7 should have new high dword")
-
-  def test_ds_store_load_2addr_stride64_b64_roundtrip(self):
-    """DS_STORE_2ADDR_STRIDE64_B64 followed by DS_LOAD_2ADDR_STRIDE64_B64 works correctly."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      s_mov_b32(s[0], 0x11111111),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[0], 0x22222222),
-      v_mov_b32_e32(v[1], s[0]),
-      DS(DSOp.DS_STORE_2ADDR_STRIDE64_B64, addr=v[10], data0=v[0], data1=v[0], vdst=v[0], offset0=1, offset1=2),
-      s_waitcnt(lgkmcnt=0),
-      DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B64, addr=v[10], vdst=v[2], offset0=1, offset1=2),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][2], 0x11111111, "v2 should have val1 low")
-    self.assertEqual(st.vgpr[0][3], 0x22222222, "v3 should have val1 high")
-    self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have val2 low")
-    self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should have val2 high")
-
-  def test_ds_storexchg_2addr_stride64_rtn_b64_returns_old(self):
-    """DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64: returns old values correctly."""
-    instructions = [
-      v_mov_b32_e32(v[10], 0),
-      # Store initial values
-      s_mov_b32(s[0], 0x11111111),
-      v_mov_b32_e32(v[0], s[0]),
-      s_mov_b32(s[0], 0x22222222),
-      v_mov_b32_e32(v[1], s[0]),
-      DS(DSOp.DS_STORE_2ADDR_STRIDE64_B64, addr=v[10], data0=v[0], data1=v[0], vdst=v[0], offset0=1, offset1=2),
-      s_waitcnt(lgkmcnt=0),
-      # Exchange with new values
-      s_mov_b32(s[0], 0xAAAAAAAA),
-      v_mov_b32_e32(v[6], s[0]),
-      s_mov_b32(s[0], 0xBBBBBBBB),
-      v_mov_b32_e32(v[7], s[0]),
-      DS(DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64, addr=v[10], data0=v[6], data1=v[6], vdst=v[8], offset0=1, offset1=2),
-      s_waitcnt(lgkmcnt=0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    # Return: v8-v11 = old values (4 dwords for 2x64-bit)
-    self.assertEqual(st.vgpr[0][8], 0x11111111, "v8 should have old val1 low")
-    self.assertEqual(st.vgpr[0][9], 0x22222222, "v9 should have old val1 high")
-    self.assertEqual(st.vgpr[0][10], 0x11111111, "v10 should have old val2 low")
-    self.assertEqual(st.vgpr[0][11], 0x22222222, "v11 should have old val2 high")
-
-class TestFLATAtomic(unittest.TestCase):
-  """Tests for FLAT and GLOBAL atomic instructions."""
-
-  # Helper to set up address in v[0:1] and clear after test
-  def _make_test(self, setup_instrs, atomic_instr, check_fn, test_offset=2000):
-    """Helper to create atomic test instructions."""
-    instructions = [
-      # Load output buffer address from args (saved in s[80:81] by prologue)
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      v_mov_b32_e32(v[0], s[2]),  # addr low
-      v_mov_b32_e32(v[1], s[3]),  # addr high
-    ] + setup_instrs + [atomic_instr, s_waitcnt(vmcnt=0),
-      # Clear address registers that differ between emu/hw
-      v_mov_b32_e32(v[0], 0),
-      v_mov_b32_e32(v[1], 0),
-      s_mov_b32(s[2], 0),
-      s_mov_b32(s[3], 0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    check_fn(st)
-
-  def test_flat_atomic_inc_u64_returns_old_value(self):
-    """FLAT_ATOMIC_INC_U64 should return full 64-bit old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      # Store initial 64-bit value: 0xCAFEBABE_DEADBEEF
-      s_mov_b32(s[0], 0xDEADBEEF),
-      v_mov_b32_e32(v[2], s[0]),
-      s_mov_b32(s[0], 0xCAFEBABE),
-      v_mov_b32_e32(v[3], s[0]),
-      global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      # Threshold: 0xFFFFFFFF_FFFFFFFF
-      s_mov_b32(s[0], 0xFFFFFFFF),
-      v_mov_b32_e32(v[4], s[0]),
-      v_mov_b32_e32(v[5], s[0]),
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_INC_U64, addr=v[0], data=v[4], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][6], 0xDEADBEEF, "v6 should have old value low dword")
-      self.assertEqual(st.vgpr[0][7], 0xCAFEBABE, "v7 should have old value high dword")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_atomic_add_u32(self):
-    """FLAT_ATOMIC_ADD_U32 adds to memory and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 100),
-      v_mov_b32_e32(v[2], s[0]),
-      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 50),
-      v_mov_b32_e32(v[3], s[0]),  # add 50
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_ADD_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 100, "v4 should have old value (100)")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_atomic_sub_u32(self):
-    """FLAT_ATOMIC_SUB_U32 subtracts from memory and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 100),
-      v_mov_b32_e32(v[2], s[0]),
-      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 30),
-      v_mov_b32_e32(v[3], s[0]),  # sub 30
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_SUB_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 100, "v4 should have old value (100)")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_atomic_swap_b32(self):
-    """FLAT_ATOMIC_SWAP_B32 swaps memory value and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0xAAAAAAAA),
-      v_mov_b32_e32(v[2], s[0]),
-      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 0xBBBBBBBB),
-      v_mov_b32_e32(v[3], s[0]),  # new value
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_SWAP_B32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA, "v4 should have old value")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_atomic_and_b32(self):
-    """FLAT_ATOMIC_AND_B32 ANDs with memory and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0xFF00FF00),
-      v_mov_b32_e32(v[2], s[0]),
-      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 0xFFFF0000),
-      v_mov_b32_e32(v[3], s[0]),  # AND mask
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_AND_B32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 0xFF00FF00, "v4 should have old value")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_atomic_or_b32(self):
-    """FLAT_ATOMIC_OR_B32 ORs with memory and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0x00FF0000),
-      v_mov_b32_e32(v[2], s[0]),
-      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 0x0000FF00),
-      v_mov_b32_e32(v[3], s[0]),  # OR mask
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_OR_B32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 0x00FF0000, "v4 should have old value")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_atomic_xor_b32(self):
-    """FLAT_ATOMIC_XOR_B32 XORs with memory and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0xAAAAAAAA),
-      v_mov_b32_e32(v[2], s[0]),
-      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 0xFFFFFFFF),
-      v_mov_b32_e32(v[3], s[0]),  # XOR mask
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_XOR_B32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA, "v4 should have old value")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_atomic_min_u32(self):
-    """FLAT_ATOMIC_MIN_U32 stores min and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 100),
-      v_mov_b32_e32(v[2], s[0]),
-      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 50),
-      v_mov_b32_e32(v[3], s[0]),  # compare value (smaller)
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_MIN_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 100, "v4 should have old value (100)")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_atomic_max_u32(self):
-    """FLAT_ATOMIC_MAX_U32 stores max and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 50),
-      v_mov_b32_e32(v[2], s[0]),
-      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 100),
-      v_mov_b32_e32(v[3], s[0]),  # compare value (larger)
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_MAX_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 50, "v4 should have old value (50)")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_atomic_inc_u32(self):
-    """FLAT_ATOMIC_INC_U32 increments and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 10),
-      v_mov_b32_e32(v[2], s[0]),
-      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 100),  # threshold
-      v_mov_b32_e32(v[3], s[0]),
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_INC_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 10, "v4 should have old value (10)")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_atomic_dec_u32(self):
-    """FLAT_ATOMIC_DEC_U32 decrements and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 10),
-      v_mov_b32_e32(v[2], s[0]),
-      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 100),  # threshold
-      v_mov_b32_e32(v[3], s[0]),
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_DEC_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 10, "v4 should have old value (10)")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_atomic_add_u64(self):
-    """FLAT_ATOMIC_ADD_U64 adds 64-bit value and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0x11111111),
-      v_mov_b32_e32(v[2], s[0]),
-      s_mov_b32(s[0], 0x22222222),
-      v_mov_b32_e32(v[3], s[0]),
-      global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 0x00000001),  # add 1
-      v_mov_b32_e32(v[4], s[0]),
-      s_mov_b32(s[0], 0x00000000),
-      v_mov_b32_e32(v[5], s[0]),
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_ADD_U64, addr=v[0], data=v[4], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][6], 0x11111111, "v6 should have old value low")
-      self.assertEqual(st.vgpr[0][7], 0x22222222, "v7 should have old value high")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_atomic_swap_b64(self):
-    """FLAT_ATOMIC_SWAP_B64 swaps 64-bit value and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0xAAAAAAAA),
-      v_mov_b32_e32(v[2], s[0]),
-      s_mov_b32(s[0], 0xBBBBBBBB),
-      v_mov_b32_e32(v[3], s[0]),
-      global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 0xCCCCCCCC),
-      v_mov_b32_e32(v[4], s[0]),
-      s_mov_b32(s[0], 0xDDDDDDDD),
-      v_mov_b32_e32(v[5], s[0]),
-    ]
-    atomic = FLAT(FLATOp.FLAT_ATOMIC_SWAP_B64, addr=v[0], data=v[4], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1)
-    def check(st):
-      self.assertEqual(st.vgpr[0][6], 0xAAAAAAAA, "v6 should have old value low")
-      self.assertEqual(st.vgpr[0][7], 0xBBBBBBBB, "v7 should have old value high")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_global_atomic_add_u32(self):
-    """GLOBAL_ATOMIC_ADD_U32 adds to memory and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 100),
-      v_mov_b32_e32(v[2], s[0]),
-      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      s_mov_b32(s[0], 50),
-      v_mov_b32_e32(v[3], s[0]),
-    ]
-    atomic = FLAT(GLOBALOp.GLOBAL_ATOMIC_ADD_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1, seg=2)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 100, "v4 should have old value (100)")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_global_atomic_add_u64(self):
-    """GLOBAL_ATOMIC_ADD_U64 adds 64-bit value and returns old value."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0xFFFFFFFF),
-      v_mov_b32_e32(v[2], s[0]),
-      s_mov_b32(s[0], 0x00000000),
-      v_mov_b32_e32(v[3], s[0]),
-      global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      # Add 1 to cause carry
-      s_mov_b32(s[0], 0x00000001),
-      v_mov_b32_e32(v[4], s[0]),
-      s_mov_b32(s[0], 0x00000000),
-      v_mov_b32_e32(v[5], s[0]),
-    ]
-    atomic = FLAT(GLOBALOp.GLOBAL_ATOMIC_ADD_U64, addr=v[0], data=v[4], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1, seg=2)
-    def check(st):
-      self.assertEqual(st.vgpr[0][6], 0xFFFFFFFF, "v6 should have old value low")
-      self.assertEqual(st.vgpr[0][7], 0x00000000, "v7 should have old value high")
-    self._make_test(setup, atomic, check, TEST_OFFSET)
-
-  def test_flat_load_b32(self):
-    """FLAT_LOAD_B32 loads 32-bit value correctly."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0xDEADBEEF),
-      v_mov_b32_e32(v[2], s[0]),
-      global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-    ]
-    load = FLAT(FLATOp.FLAT_LOAD_B32, addr=v[0], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 0xDEADBEEF, "v4 should have loaded value")
-    instructions = [
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      v_mov_b32_e32(v[0], s[2]),
-      v_mov_b32_e32(v[1], s[3]),
-    ] + setup + [load, s_waitcnt(vmcnt=0),
-      v_mov_b32_e32(v[0], 0),
-      v_mov_b32_e32(v[1], 0),
-      s_mov_b32(s[2], 0),
-      s_mov_b32(s[3], 0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    check(st)
-
-  def test_flat_load_b64(self):
-    """FLAT_LOAD_B64 loads 64-bit value correctly."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0xDEADBEEF),
-      v_mov_b32_e32(v[2], s[0]),
-      s_mov_b32(s[0], 0xCAFEBABE),
-      v_mov_b32_e32(v[3], s[0]),
-      global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-    ]
-    load = FLAT(FLATOp.FLAT_LOAD_B64, addr=v[0], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET)
-    def check(st):
-      self.assertEqual(st.vgpr[0][4], 0xDEADBEEF, "v4 should have loaded low dword")
-      self.assertEqual(st.vgpr[0][5], 0xCAFEBABE, "v5 should have loaded high dword")
-    instructions = [
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      v_mov_b32_e32(v[0], s[2]),
-      v_mov_b32_e32(v[1], s[3]),
-    ] + setup + [load, s_waitcnt(vmcnt=0),
-      v_mov_b32_e32(v[0], 0),
-      v_mov_b32_e32(v[1], 0),
-      s_mov_b32(s[2], 0),
-      s_mov_b32(s[3], 0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    check(st)
-
-  def test_flat_load_b96(self):
-    """FLAT_LOAD_B96 loads 96-bit (3 dword) value correctly."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0x11111111),
-      v_mov_b32_e32(v[2], s[0]),
-      s_mov_b32(s[0], 0x22222222),
-      v_mov_b32_e32(v[3], s[0]),
-      s_mov_b32(s[0], 0x33333333),
-      v_mov_b32_e32(v[4], s[0]),
-      global_store_b96(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-    ]
-    load = FLAT(FLATOp.FLAT_LOAD_B96, addr=v[0], vdst=v[5], saddr=SrcEnum.NULL, offset=TEST_OFFSET)
-    def check(st):
-      self.assertEqual(st.vgpr[0][5], 0x11111111, "v5 should have dword 0")
-      self.assertEqual(st.vgpr[0][6], 0x22222222, "v6 should have dword 1")
-      self.assertEqual(st.vgpr[0][7], 0x33333333, "v7 should have dword 2")
-    instructions = [
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      v_mov_b32_e32(v[0], s[2]),
-      v_mov_b32_e32(v[1], s[3]),
-    ] + setup + [load, s_waitcnt(vmcnt=0),
-      v_mov_b32_e32(v[0], 0),
-      v_mov_b32_e32(v[1], 0),
-      s_mov_b32(s[2], 0),
-      s_mov_b32(s[3], 0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    check(st)
-
-  def test_flat_load_b128(self):
-    """FLAT_LOAD_B128 loads 128-bit (4 dword) value correctly."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0x11111111),
-      v_mov_b32_e32(v[2], s[0]),
-      s_mov_b32(s[0], 0x22222222),
-      v_mov_b32_e32(v[3], s[0]),
-      s_mov_b32(s[0], 0x33333333),
-      v_mov_b32_e32(v[4], s[0]),
-      s_mov_b32(s[0], 0x44444444),
-      v_mov_b32_e32(v[5], s[0]),
-      global_store_b128(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-    ]
-    load = FLAT(FLATOp.FLAT_LOAD_B128, addr=v[0], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET)
-    def check(st):
-      self.assertEqual(st.vgpr[0][6], 0x11111111, "v6 should have dword 0")
-      self.assertEqual(st.vgpr[0][7], 0x22222222, "v7 should have dword 1")
-      self.assertEqual(st.vgpr[0][8], 0x33333333, "v8 should have dword 2")
-      self.assertEqual(st.vgpr[0][9], 0x44444444, "v9 should have dword 3")
-    instructions = [
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      v_mov_b32_e32(v[0], s[2]),
-      v_mov_b32_e32(v[1], s[3]),
-    ] + setup + [load, s_waitcnt(vmcnt=0),
-      v_mov_b32_e32(v[0], 0),
-      v_mov_b32_e32(v[1], 0),
-      s_mov_b32(s[2], 0),
-      s_mov_b32(s[3], 0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    check(st)
-
-  def test_global_load_b96(self):
-    """GLOBAL_LOAD_B96 loads 96-bit value correctly."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0xAAAAAAAA),
-      v_mov_b32_e32(v[2], s[0]),
-      s_mov_b32(s[0], 0xBBBBBBBB),
-      v_mov_b32_e32(v[3], s[0]),
-      s_mov_b32(s[0], 0xCCCCCCCC),
-      v_mov_b32_e32(v[4], s[0]),
-      global_store_b96(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-    ]
-    load = FLAT(GLOBALOp.GLOBAL_LOAD_B96, addr=v[0], vdst=v[5], saddr=SrcEnum.NULL, offset=TEST_OFFSET, seg=2)
-    def check(st):
-      self.assertEqual(st.vgpr[0][5], 0xAAAAAAAA, "v5 should have dword 0")
-      self.assertEqual(st.vgpr[0][6], 0xBBBBBBBB, "v6 should have dword 1")
-      self.assertEqual(st.vgpr[0][7], 0xCCCCCCCC, "v7 should have dword 2")
-    instructions = [
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      v_mov_b32_e32(v[0], s[2]),
-      v_mov_b32_e32(v[1], s[3]),
-    ] + setup + [load, s_waitcnt(vmcnt=0),
-      v_mov_b32_e32(v[0], 0),
-      v_mov_b32_e32(v[1], 0),
-      s_mov_b32(s[2], 0),
-      s_mov_b32(s[3], 0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    check(st)
-
-  def test_global_load_b128(self):
-    """GLOBAL_LOAD_B128 loads 128-bit value correctly."""
-    TEST_OFFSET = 2000
-    setup = [
-      s_mov_b32(s[0], 0xDEADBEEF),
-      v_mov_b32_e32(v[2], s[0]),
-      s_mov_b32(s[0], 0xCAFEBABE),
-      v_mov_b32_e32(v[3], s[0]),
-      s_mov_b32(s[0], 0x12345678),
-      v_mov_b32_e32(v[4], s[0]),
-      s_mov_b32(s[0], 0x9ABCDEF0),
-      v_mov_b32_e32(v[5], s[0]),
-      global_store_b128(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-    ]
-    load = FLAT(GLOBALOp.GLOBAL_LOAD_B128, addr=v[0], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, seg=2)
-    def check(st):
-      self.assertEqual(st.vgpr[0][6], 0xDEADBEEF, "v6 should have dword 0")
-      self.assertEqual(st.vgpr[0][7], 0xCAFEBABE, "v7 should have dword 1")
-      self.assertEqual(st.vgpr[0][8], 0x12345678, "v8 should have dword 2")
-      self.assertEqual(st.vgpr[0][9], 0x9ABCDEF0, "v9 should have dword 3")
-    instructions = [
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      v_mov_b32_e32(v[0], s[2]),
-      v_mov_b32_e32(v[1], s[3]),
-    ] + setup + [load, s_waitcnt(vmcnt=0),
-      v_mov_b32_e32(v[0], 0),
-      v_mov_b32_e32(v[1], 0),
-      s_mov_b32(s[2], 0),
-      s_mov_b32(s[3], 0),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    check(st)
-
-
-class TestGlobalStoreB64(unittest.TestCase):
-  """Tests for global_store_b64 instruction."""
-
-  def test_global_store_b64_basic(self):
-    """GLOBAL_STORE_B64 stores 8 bytes from v[n:n+1] to memory."""
-    TEST_OFFSET = 256
-
-    instructions = [
-      # Get output buffer address into s[2:3]
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      # Set up v[2:3] with known values
-      s_mov_b32(s[4], 0xDEADBEEF),
-      s_mov_b32(s[5], 0xCAFEBABE),
-      v_mov_b32_e32(v[2], s[4]),  # v2 = 0xDEADBEEF (low dword)
-      v_mov_b32_e32(v[3], s[5]),  # v3 = 0xCAFEBABE (high dword)
-      # Set up address
-      v_mov_b32_e32(v[0], 0),
-      # Store 64 bits
-      global_store_b64(addr=v[0], data=v[2], saddr=s[2], offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      # Load it back as two 32-bit values
-      FLAT(GLOBALOp.GLOBAL_LOAD_B64, addr=v[0], vdst=v[4], data=v[4], saddr=s[2], offset=TEST_OFFSET, seg=2),
-      s_waitcnt(vmcnt=0),
-      # Copy to v[0:1] for capture
-      v_mov_b32_e32(v[0], v[4]),
-      v_mov_b32_e32(v[1], v[5]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    self.assertEqual(st.vgpr[0][0], 0xDEADBEEF, f"Low dword: expected 0xDEADBEEF, got 0x{st.vgpr[0][0]:08x}")
-    self.assertEqual(st.vgpr[0][1], 0xCAFEBABE, f"High dword: expected 0xCAFEBABE, got 0x{st.vgpr[0][1]:08x}")
-
-  def test_global_store_b64_tril_pattern(self):
-    """Test the exact pattern from tril() kernel that was failing.
-
-    The kernel does:
-    - global_load_u16 v0, v2, s[2:3] offset:3  (loads bytes 3,4)
-    - global_load_d16_hi_b16 v1, v1, s[2:3] offset:6 (loads bytes 6,7 into v1 hi16)
-    - global_load_u8 v3, v2, s[2:3]  (loads byte 0)
-    - global_load_u8 v4, v2, s[2:3] offset:8 (loads byte 8)
-    - v_and_b32 v5, 0xffff, v0
-    - v_lshlrev_b32 v0, 24, v0
-    - v_lshrrev_b32 v5, 8, v5
-    - v_or_b32 v0, v3, v0
-    - v_or_b32 v1, v5, v1
-    - global_store_b64 v2, v[0:1], s[0:1]  (stores 8 bytes)
-
-    For input all 0x01, the output at byte 5 should be 0x00.
-    """
-    TEST_OFFSET = 256
-
-    instructions = [
-      # Get output buffer address into s[2:3]
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      # Store input data: 9 bytes of 0x01
-      s_mov_b32(s[4], 0x01010101),
-      v_mov_b32_e32(v[10], s[4]),
-      v_mov_b32_e32(v[11], s[4]),
-      s_mov_b32(s[4], 0x01),
-      v_mov_b32_e32(v[12], s[4]),
-      v_mov_b32_e32(v[0], 0),
-      global_store_b64(addr=v[0], data=v[10], saddr=s[2], offset=TEST_OFFSET),
-      global_store_b8(addr=v[0], data=v[12], saddr=s[2], offset=TEST_OFFSET+8),
-      s_waitcnt(vmcnt=0),
-
-      # Now execute the tril pattern
-      v_mov_b32_e32(v[2], 0),
-      v_mov_b32_e32(v[1], 0),
-      # Load bytes 3,4 as u16
-      FLAT(GLOBALOp.GLOBAL_LOAD_U16, addr=v[2], vdst=v[0], data=v[0], saddr=s[2], offset=TEST_OFFSET+3, seg=2),
-      # Load bytes 6,7 into v1 hi16
-      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[1], saddr=s[2], offset=TEST_OFFSET+6, seg=2),
-      # Load byte 0
-      FLAT(GLOBALOp.GLOBAL_LOAD_U8, addr=v[2], vdst=v[3], data=v[3], saddr=s[2], offset=TEST_OFFSET, seg=2),
-      # Load byte 8
-      FLAT(GLOBALOp.GLOBAL_LOAD_U8, addr=v[2], vdst=v[4], data=v[4], saddr=s[2], offset=TEST_OFFSET+8, seg=2),
-      s_waitcnt(vmcnt=0),
-
-      # Bit manipulation
-      v_and_b32_e32(v[5], 0xffff, v[0]),  # v5 = v0 & 0xffff = 0x0101
-      v_lshlrev_b32_e32(v[0], 24, v[0]),  # v0 = v0 << 24 = 0x01000000
-      v_lshrrev_b32_e32(v[5], 8, v[5]),   # v5 = v5 >> 8 = 0x01
-      v_or_b32_e32(v[0], v[3], v[0]),     # v0 = v3 | v0 = 0x01000001
-      v_or_b32_e32(v[1], v[5], v[1]),     # v1 = v5 | v1
-
-      # Store to different location so we can read it back
-      global_store_b64(addr=v[2], data=v[0], saddr=s[2], offset=TEST_OFFSET+16),
-      s_waitcnt(vmcnt=0),
-
-      # Load back to check
-      FLAT(GLOBALOp.GLOBAL_LOAD_B64, addr=v[2], vdst=v[6], data=v[6], saddr=s[2], offset=TEST_OFFSET+16, seg=2),
-      s_waitcnt(vmcnt=0),
-      v_mov_b32_e32(v[0], v[6]),
-      v_mov_b32_e32(v[1], v[7]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-
-    # v0 should be 0x01000001 (bytes 0,1,2,3 = 01,00,00,01)
-    # v1 should be 0x01010001 (bytes 4,5,6,7 = 01,00,01,01)
-    v0 = st.vgpr[0][0]
-    v1 = st.vgpr[0][1]
-    self.assertEqual(v0, 0x01000001, f"v0: expected 0x01000001, got 0x{v0:08x}")
-    self.assertEqual(v1, 0x01010001, f"v1: expected 0x01010001, got 0x{v1:08x}")
-
-    # Check individual bytes
-    byte5 = (v1 >> 8) & 0xff  # This is the bug - should be 0x00
-    self.assertEqual(byte5, 0x00, f"byte5 (position 1,2): expected 0x00, got 0x{byte5:02x}")
-
-
-class TestD16HiLoads(unittest.TestCase):
-  """Tests for D16_HI load instructions that load into high 16 bits, preserving low 16 bits."""
-
-  def test_global_load_d16_hi_b16_preserves_low_bits(self):
-    """GLOBAL_LOAD_D16_HI_B16 must preserve low 16 bits of destination.
-
-    Regression test for tril() bug where position (1,2) was incorrectly True.
-    The bug was that D16_HI loads were not preserving the low 16 bits of the
-    destination register.
-    """
-    # Set up: store 0xCAFE at some memory location, then load it into high 16 bits
-    # of a register that has 0xBEEF in low 16 bits. Result should be 0xCAFEBEEF.
-    TEST_OFFSET = 256
-
-    instructions = [
-      # Get output buffer address into s[2:3]
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      # Set up address in v[0:1]
-      v_mov_b32_e32(v[0], s[2]),
-      v_mov_b32_e32(v[1], s[3]),
-      # Store 0xCAFE0000 at TEST_OFFSET (we'll load the low 16 bits as b16)
-      s_mov_b32(s[4], 0xCAFE),
-      v_mov_b32_e32(v[2], s[4]),
-      global_store_b16(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      # Set destination register v[3] to have 0xBEEF in low 16 bits
-      s_mov_b32(s[4], 0x0000BEEF),
-      v_mov_b32_e32(v[3], s[4]),
-      # Load 16 bits from memory into HIGH 16 bits of v[3], preserving low 16 bits
-      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[0], vdst=v[3], data=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET, seg=2),
-      s_waitcnt(vmcnt=0),
-      # Copy result to v[0] for capture
-      v_mov_b32_e32(v[0], v[3]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][0]
-    # Expected: hi=0xCAFE (from memory), lo=0xBEEF (preserved) -> 0xCAFEBEEF
-    self.assertEqual(result, 0xCAFEBEEF, f"Expected 0xCAFEBEEF, got 0x{result:08x}")
-
-  def test_global_load_d16_hi_b16_same_addr_and_dst_zero_addr(self):
-    """GLOBAL_LOAD_D16_HI_B16 with same register for addr and vdst, addr value=0.
-
-    This is the exact pattern from tril() that was failing:
-      global_load_d16_hi_b16 v1, v1, s[2:3] offset:6
-
-    Where v1=0 is used as both the address offset and destination.
-    After the load, low 16 bits should remain 0, high 16 bits should have loaded data.
-    """
-    TEST_OFFSET = 256
-
-    instructions = [
-      # Get output buffer address into s[2:3]
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      # Store 0xCAFE at TEST_OFFSET
-      s_mov_b32(s[4], 0xCAFE),
-      v_mov_b32_e32(v[2], s[4]),
-      v_mov_b32_e32(v[3], 0),  # addr offset = 0
-      global_store_b16(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      # Set v[1] to 0 (addr offset = 0, and this is what low 16 bits should stay as)
-      v_mov_b32_e32(v[1], 0),
-      # Load using v[1] as both addr and destination
-      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[1], saddr=s[2], offset=TEST_OFFSET, seg=2),
-      s_waitcnt(vmcnt=0),
-      # Copy result to v[0] for capture
-      v_mov_b32_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][0]
-    # Expected: hi=0xCAFE (from memory), lo=0x0000 (preserved) -> 0xCAFE0000
-    self.assertEqual(result, 0xCAFE0000, f"Expected 0xCAFE0000, got 0x{result:08x}")
-
-  def test_global_load_d16_hi_b16_data_differs_from_vdst(self):
-    """GLOBAL_LOAD_D16_HI_B16 where data field differs from vdst.
-
-    This is the ACTUAL pattern from tril() assembly:
-      global_load_d16_hi_b16 v1, v1, s[2:3] offset:6
-
-    The instruction encoding has:
-      vdst = v1 (destination register)
-      addr = v1 (address offset register)
-      data = v0 (data field - typically unused for loads but still encoded)
-
-    The bug: emulator was reading VDATA from inst.data (v0) instead of inst.vdst (v1),
-    so low 16 bits of v0 were preserved instead of low 16 bits of v1.
-    """
-    TEST_OFFSET = 256
-
-    instructions = [
-      # Get output buffer address into s[2:3]
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      # Store 0xCAFE at TEST_OFFSET
-      s_mov_b32(s[4], 0xCAFE),
-      v_mov_b32_e32(v[2], s[4]),
-      v_mov_b32_e32(v[3], 0),
-      global_store_b16(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      # Set v[0] to a DIFFERENT value (0xDEAD) - this is the data field
-      # The bug would incorrectly preserve v[0]'s low bits instead of v[1]'s
-      s_mov_b32(s[4], 0x0000DEAD),
-      v_mov_b32_e32(v[0], s[4]),
-      # Set v[1] to 0 (this is vdst, whose low bits should be preserved)
-      v_mov_b32_e32(v[1], 0),
-      # Load using v[1] as addr AND vdst, but v[0] as data field
-      # Correct behavior: hi=0xCAFE (loaded), lo=0x0000 (from v1) -> 0xCAFE0000
-      # Bug behavior: hi=0xCAFE (loaded), lo=0xDEAD (from v0) -> 0xCAFEDEAD
-      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[0], saddr=s[2], offset=TEST_OFFSET, seg=2),
-      s_waitcnt(vmcnt=0),
-      # Copy result to v[0] for capture
-      v_mov_b32_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][0]
-    # Expected: hi=0xCAFE (from memory), lo=0x0000 (preserved from vdst v1) -> 0xCAFE0000
-    # Bug would give: 0xCAFEDEAD (low bits from data field v0)
-    self.assertEqual(result, 0xCAFE0000, f"Expected 0xCAFE0000, got 0x{result:08x}")
-
-  def test_global_load_d16_hi_b16_tril_exact_pattern(self):
-    """Exact pattern from tril() failure: data=v0 differs from vdst=v1, with v1 having non-zero low bits initially.
-
-    Assembly from tril():
-      v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0
-      global_load_u16 v0, v2, s[2:3] offset:3        ; v0 = 0x0101 (loads 16 bits)
-      global_load_d16_hi_b16 v1, v1, s[2:3] offset:6 ; vdst=v1, addr=v1, data=v0
-      ...
-      v_or_b32_e32 v1, v5, v1
-
-    The bug: since data=v0=0x0101 and vdst=v1=0, the emulator incorrectly
-    preserved v0's low bits (0x0101) instead of v1's low bits (0x0000).
-    Result: v1 = 0x01010101 instead of 0x01010000
-    """
-    TEST_OFFSET = 256
-
-    instructions = [
-      # Get output buffer address into s[2:3]
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      # Store test data: 0x0101 at offset, 0x0101 at offset+3
-      s_mov_b32(s[4], 0x0101),
-      v_mov_b32_e32(v[2], s[4]),
-      v_mov_b32_e32(v[3], 0),
-      global_store_b16(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET),
-      global_store_b16(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET + 3),
-      s_waitcnt(vmcnt=0),
-      # Replicate tril() pattern:
-      # v2 = 0, v1 = 0
-      v_mov_b32_e32(v[2], 0),
-      v_mov_b32_e32(v[1], 0),
-      # global_load_u16 v0, v2, s[2:3] offset:3  -> v0 gets 0x0101
-      FLAT(GLOBALOp.GLOBAL_LOAD_U16, addr=v[2], vdst=v[0], data=v[0], saddr=s[2], offset=TEST_OFFSET, seg=2),
-      s_waitcnt(vmcnt=0),
-      # global_load_d16_hi_b16 v1, v1, s[2:3] offset:6  -> vdst=v1, addr=v1, data=v0
-      # This should load 0x0101 into high 16 bits of v1, preserving low 16 bits (0x0000)
-      # Result should be 0x01010000, NOT 0x01010101
-      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[0], saddr=s[2], offset=TEST_OFFSET + 3, seg=2),
-      s_waitcnt(vmcnt=0),
-      # Copy v1 to v[0] for capture
-      v_mov_b32_e32(v[0], v[1]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][0]
-    # Expected: hi=0x0101 (from memory), lo=0x0000 (preserved from vdst v1) -> 0x01010000
-    # Bug would give: 0x01010101 (low bits from data field v0)
-    self.assertEqual(result, 0x01010000, f"Expected 0x01010000, got 0x{result:08x}")
-
-  def test_global_load_d16_hi_u8_data_differs_from_vdst(self):
-    """GLOBAL_LOAD_D16_HI_U8 where data field differs from vdst.
-
-    Similar to B16 test but loads unsigned 8 bits into high 16 bits.
-    The bug: emulator reads VDATA from inst.data instead of inst.vdst.
-    """
-    TEST_OFFSET = 256
-
-    instructions = [
-      # Get output buffer address into s[2:3]
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      # Store 0xAB at TEST_OFFSET (single byte)
-      s_mov_b32(s[4], 0xAB),
-      v_mov_b32_e32(v[2], s[4]),
-      v_mov_b32_e32(v[3], 0),
-      global_store_b8(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      # Set v[4] to 0xDEAD (data field - should NOT affect result)
-      s_mov_b32(s[4], 0x0000DEAD),
-      v_mov_b32_e32(v[4], s[4]),
-      # Set v[5] to 0xBEEF (vdst - low bits should be preserved)
-      s_mov_b32(s[4], 0x0000BEEF),
-      v_mov_b32_e32(v[5], s[4]),
-      # v[3] = 0 for address offset
-      v_mov_b32_e32(v[3], 0),
-      # Load 8 bits into high 16 bits of v[5], preserving low 16 bits
-      # Correct: hi=0x00AB (zero-extended), lo=0xBEEF -> 0x00ABBEEF
-      # Bug: hi=0x00AB, lo=0xDEAD (from v4) -> 0x00ABDEAD
-      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_U8, addr=v[3], vdst=v[5], data=v[4], saddr=s[2], offset=TEST_OFFSET, seg=2),
-      s_waitcnt(vmcnt=0),
-      v_mov_b32_e32(v[0], v[5]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][0]
-    self.assertEqual(result, 0x00ABBEEF, f"Expected 0x00ABBEEF, got 0x{result:08x}")
-
-  def test_global_load_d16_hi_i8_data_differs_from_vdst(self):
-    """GLOBAL_LOAD_D16_HI_I8 where data field differs from vdst.
-
-    Loads signed 8 bits (sign-extended to 16 bits) into high 16 bits.
-    The bug: emulator reads VDATA from inst.data instead of inst.vdst.
-    """
-    TEST_OFFSET = 256
-
-    instructions = [
-      # Get output buffer address into s[2:3]
-      s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL),
-      s_waitcnt(lgkmcnt=0),
-      # Store 0x80 at TEST_OFFSET (negative signed byte = -128)
-      s_mov_b32(s[4], 0x80),
-      v_mov_b32_e32(v[2], s[4]),
-      v_mov_b32_e32(v[3], 0),
-      global_store_b8(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET),
-      s_waitcnt(vmcnt=0),
-      # Set v[4] to 0xDEAD (data field - should NOT affect result)
-      s_mov_b32(s[4], 0x0000DEAD),
-      v_mov_b32_e32(v[4], s[4]),
-      # Set v[5] to 0xBEEF (vdst - low bits should be preserved)
-      s_mov_b32(s[4], 0x0000BEEF),
-      v_mov_b32_e32(v[5], s[4]),
-      # v[3] = 0 for address offset
-      v_mov_b32_e32(v[3], 0),
-      # Load signed 8 bits into high 16 bits of v[5], preserving low 16 bits
-      # 0x80 sign-extended to 16 bits = 0xFF80
-      # Correct: hi=0xFF80, lo=0xBEEF -> 0xFF80BEEF
-      # Bug: hi=0xFF80, lo=0xDEAD (from v4) -> 0xFF80DEAD
-      FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_I8, addr=v[3], vdst=v[5], data=v[4], saddr=s[2], offset=TEST_OFFSET, seg=2),
-      s_waitcnt(vmcnt=0),
-      v_mov_b32_e32(v[0], v[5]),
-    ]
-    st = run_program(instructions, n_lanes=1)
-    result = st.vgpr[0][0]
-    self.assertEqual(result, 0xFF80BEEF, f"Expected 0xFF80BEEF, got 0x{result:08x}")
-
-
-if __name__ == '__main__':
-  unittest.main()