diff --git a/test/amd/hw/helpers.py b/test/amd/hw/helpers.py
index ef7a7cc6ed..d58052d461 100644
--- a/test/amd/hw/helpers.py
+++ b/test/amd/hw/helpers.py
@@ -60,11 +60,12 @@ def skip_unless_gfx(min_major: int, min_minor: int = 0, reason: str = ""):
     return test_func
   return decorator
 
-# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc, exec
+# Output buffer layout: vgpr[N_VGPRS][n_lanes], sgpr[N_SGPRS], vcc, scc, exec
 N_VGPRS, N_SGPRS, WAVE_SIZE = 16, 16, 32
-VGPR_BYTES = N_VGPRS * WAVE_SIZE * 4  # 16 regs * 32 lanes * 4 bytes = 2048
 SGPR_BYTES = N_SGPRS * 4  # 16 regs * 4 bytes = 64
-OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 12  # + vcc + scc + exec
+_VGPR_REGION = N_VGPRS * WAVE_SIZE * 4  # minimum vgpr region size (tests may use as scratch)
+def _out_bytes(n_lanes: int) -> int: return max(N_VGPRS * n_lanes * 4, _VGPR_REGION) + SGPR_BYTES + 12
+OUT_BYTES = _out_bytes(WAVE_SIZE)  # default for single-wave (backward compat)
 
 # Float conversion helpers
 def f2i(f: float) -> int: return _i32(f)
@@ -77,8 +78,8 @@ def assemble(instructions: list) -> bytes:
 
 # Simple WaveState class for test output parsing (mirrors test/mockgpu/amd/emu.py interface for tests)
 class WaveState:
-  def __init__(self):
-    self.vgpr = [[0] * 256 for _ in range(32)]  # vgpr[lane][reg]
+  def __init__(self, n_lanes: int = 32):
+    self.vgpr = [[0] * 256 for _ in range(n_lanes)]  # vgpr[lane][reg]
     self.sgpr = [0] * 128
     self.vcc = 0
     self.scc = 0
@@ -102,49 +103,53 @@ def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
     # Save EXEC early (before we modify it for VGPR stores)
     s_mov_b32(s[95], EXEC_LO),
     # Restore EXEC to all active lanes for VGPR stores (test may have modified EXEC)
-    s_mov_b32(EXEC_LO, (1 << n_lanes) - 1),
+    s_mov_b32(EXEC_LO, (1 << min(n_lanes, WAVE_SIZE)) - 1),
     s_load_b64(s[92:93], s[80:81], 0, soffset=NULL),
     s_waitcnt(0),  # simm16=0 waits for all
     v_lshlrev_b32_e32(v[240], 2, v[255]),
   ]
+  vgpr_bytes = N_VGPRS * n_lanes * 4
   for i in range(N_VGPRS):
-    epilogue.append(global_store_b32(addr=v[240], data=v[i], saddr=s[92:93], offset=i * WAVE_SIZE * 4))
+    epilogue.append(global_store_b32(addr=v[240], data=v[i], saddr=s[92:93], offset=i * n_lanes * 4))
   epilogue.append(v_mov_b32_e32(v[241], 0))
   epilogue.append(v_cmp_eq_u32_e32(v[255], v[241]))
   epilogue.append(s_and_saveexec_b32(s[94], VCC_LO))
-  epilogue.append(v_mov_b32_e32(v[240], 0))
+  # Scalar stores: only thread 0. Use v[240]=vgpr_bytes as base offset so immediate offsets stay small.
+  epilogue.append(v_mov_b32_e32(v[240], vgpr_bytes))
   for i in range(N_SGPRS):
     epilogue.append(v_mov_b32_e32(v[243], s[i]))
-    epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + i * 4))
+    epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=i * 4))
   epilogue.append(v_mov_b32_e32(v[243], s[90]))
-  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES))
+  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=SGPR_BYTES))
   epilogue.append(v_mov_b32_e32(v[243], s[91]))
-  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES + 4))
+  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=SGPR_BYTES + 4))
   # Store EXEC (saved earlier in s[95])
   epilogue.append(v_mov_b32_e32(v[243], s[95]))
-  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES + 8))
+  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=SGPR_BYTES + 8))
   epilogue.append(s_mov_b32(EXEC_LO, s[94]))
   epilogue.append(s_endpgm())
   return prologue, epilogue
 
 def parse_output(out_buf: bytes, n_lanes: int) -> WaveState:
   """Parse output buffer into WaveState."""
-  st = WaveState()
+  vgpr_bytes = N_VGPRS * n_lanes * 4
+  st = WaveState(n_lanes)
   for i in range(N_VGPRS):
     for lane in range(n_lanes):
-      off = i * WAVE_SIZE * 4 + lane * 4
+      off = i * n_lanes * 4 + lane * 4
       st.vgpr[lane][i] = struct.unpack_from('<I', out_buf, off)[0]
   for i in range(N_SGPRS):
-    st.sgpr[i] = struct.unpack_from('<I', out_buf, VGPR_BYTES + i * 4)[0]
-  st.vcc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES)[0]
-  st.scc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 4)[0]
-  st.exec = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 8)[0]
+    st.sgpr[i] = struct.unpack_from('<I', out_buf, vgpr_bytes + i * 4)[0]
+  st.vcc = struct.unpack_from('<I', out_buf, vgpr_bytes + SGPR_BYTES)[0]
+  st.scc = struct.unpack_from('<I', out_buf, vgpr_bytes + SGPR_BYTES + 4)[0]
+  st.exec = struct.unpack_from('<I', out_buf, vgpr_bytes + SGPR_BYTES + 8)[0]
   return st
 
 def run_program(instructions: list, n_lanes: int = 32) -> WaveState:
   """Run instructions via emulator run_asm, dump state to memory, return WaveState."""
-  out_buf = (ctypes.c_uint8 * OUT_BYTES)(*([0] * OUT_BYTES))
+  buf_sz = _out_bytes(n_lanes)
+  out_buf = (ctypes.c_uint8 * buf_sz)(*([0] * buf_sz))
   out_addr = ctypes.addressof(out_buf)
 
   prologue, epilogue = get_prologue_epilogue(n_lanes)
@@ -220,11 +225,12 @@ amdhsa.kernels:
 
   lib = compiler.compile(asm_src)
   prg = AMDProgram(dev, "test", lib)  # type: ignore[arg-type]
 
-  out_gpu = dev.allocator.alloc(OUT_BYTES)
+  buf_sz = _out_bytes(n_lanes)
+  out_gpu = dev.allocator.alloc(buf_sz)
   assert out_gpu.va_addr % 16 == 0, f"buffer not 16-byte aligned: 0x{out_gpu.va_addr:x}"
   prg(out_gpu, global_size=(1, 1, 1), local_size=(n_lanes, 1, 1), wait=True)
-  out_buf = bytearray(OUT_BYTES)
+  out_buf = bytearray(buf_sz)
   dev.allocator._copyout(flat_mv(memoryview(out_buf)), out_gpu)
 
   return parse_output(bytes(out_buf), n_lanes)
diff --git a/test/amd/hw/test_sop.py b/test/amd/hw/test_sop.py
index cd82f2bbe8..7202d18344 100644
--- a/test/amd/hw/test_sop.py
+++ b/test/amd/hw/test_sop.py
@@ -932,5 +932,76 @@ class Test64BitSOPLiterals(unittest.TestCase):
     self.assertEqual(st.vgpr[0][1], 0)  # zero-extended, not sign-extended
 
 
+class TestBarrier(unittest.TestCase):
+  """Tests for s_barrier — workgroup synchronization across wavefronts."""
+
+  def test_barrier_cross_wave_lds(self):
+    """Wave 0 writes to LDS, s_barrier, wave 1 reads — verifies cross-wave synchronization.
+
+    64 threads (2 waves of 32). Each thread writes (tid+1) to LDS[tid*4], then after
+    s_barrier, reads LDS[(tid^32)*4] — the value written by the other wave. Without barrier
+    support, wave 1 would read stale/zero LDS values.
+ """ + instructions = [ + # v[255] = tid (saved by prologue), copy to v[1] + v_mov_b32_e32(v[1], v[255]), + # v[2] = tid + 1 + v_add_nc_u32_e32(v[2], 1, v[1]), + # v[3] = tid * 4 + v_lshlrev_b32_e32(v[3], 2, v[1]), + # Store (tid+1) to LDS[tid*4] + ds_store_b32(addr=v[3], data0=v[2]), + s_waitcnt(lgkmcnt=0), + s_barrier(), + # Read from the other wave's slot: LDS[(tid^32)*4] + v_xor_b32_e32(v[4], 32, v[1]), + v_lshlrev_b32_e32(v[5], 2, v[4]), + ds_load_b32(addr=v[5], vdst=v[0]), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=64) + for tid in range(64): + self.assertEqual(st.vgpr[tid][0], (tid ^ 32) + 1, f"tid={tid}") + + def test_barrier_two_phases(self): + """Two barriers with three phases — tests multiple barriers in sequence. + + Phase 1: all threads write (tid+100) to LDS[tid*4], barrier. + Phase 2: all threads read other wave's value, add 1000, write to LDS[(tid+64)*4], barrier. + Phase 3: all threads read the other wave's phase-2 output into v[0]. + """ + instructions = [ + # v[255] = tid (saved by prologue), copy to v[1] + v_mov_b32_e32(v[1], v[255]), + # v[2] = tid + 100 + v_add_nc_u32_e32(v[2], 100, v[1]), + # v[3] = tid * 4 + v_lshlrev_b32_e32(v[3], 2, v[1]), + # Phase 1: write (tid+100) to LDS[tid*4] + ds_store_b32(addr=v[3], data0=v[2]), + s_waitcnt(lgkmcnt=0), + s_barrier(), + # Phase 2: read from other wave, add 1000, write to separate LDS region + v_xor_b32_e32(v[4], 32, v[1]), + v_lshlrev_b32_e32(v[5], 2, v[4]), + ds_load_b32(addr=v[5], vdst=v[6]), + s_waitcnt(lgkmcnt=0), + v_add_nc_u32_e32(v[7], 0x3e8, v[6]), + v_add_nc_u32_e32(v[8], 64, v[1]), + v_lshlrev_b32_e32(v[9], 2, v[8]), + ds_store_b32(addr=v[9], data0=v[7]), + s_waitcnt(lgkmcnt=0), + s_barrier(), + # Phase 3: read other wave's phase-2 output into v[0] + v_add_nc_u32_e32(v[10], 64, v[4]), + v_lshlrev_b32_e32(v[11], 2, v[10]), + ds_load_b32(addr=v[11], vdst=v[0]), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=64) + for tid in range(64): + self.assertEqual(st.vgpr[tid][0], tid + 100 + 1000, f"tid={tid}") + + if __name__ == '__main__': unittest.main()