diff --git a/test/amd/hw/helpers.py b/test/amd/hw/helpers.py
index ef7a7cc6ed..d58052d461 100644
--- a/test/amd/hw/helpers.py
+++ b/test/amd/hw/helpers.py
@@ -60,11 +60,12 @@ def skip_unless_gfx(min_major: int, min_minor: int = 0, reason: str = ""):
     return test_func
   return decorator
 
-# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc, exec
+# Output buffer layout: vgpr[N_VGPRS][n_lanes], sgpr[N_SGPRS], vcc, scc, exec
 N_VGPRS, N_SGPRS, WAVE_SIZE = 16, 16, 32
-VGPR_BYTES = N_VGPRS * WAVE_SIZE * 4  # 16 regs * 32 lanes * 4 bytes = 2048
 SGPR_BYTES = N_SGPRS * 4  # 16 regs * 4 bytes = 64
-OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 12  # + vcc + scc + exec
+_VGPR_REGION = N_VGPRS * WAVE_SIZE * 4  # minimum vgpr region size (tests may use as scratch)
+def _out_bytes(n_lanes: int) -> int: return max(N_VGPRS * n_lanes * 4, _VGPR_REGION) + SGPR_BYTES + 12
+OUT_BYTES = _out_bytes(WAVE_SIZE)  # default for single-wave (backward compat)
 
 # Float conversion helpers
 def f2i(f: float) -> int: return _i32(f)
@@ -77,8 +78,8 @@ def assemble(instructions: list) -> bytes:
 
 # Simple WaveState class for test output parsing (mirrors test/mockgpu/amd/emu.py interface for tests)
 class WaveState:
-  def __init__(self):
-    self.vgpr = [[0] * 256 for _ in range(32)]  # vgpr[lane][reg]
+  def __init__(self, n_lanes: int = 32):
+    self.vgpr = [[0] * 256 for _ in range(n_lanes)]  # vgpr[lane][reg]
     self.sgpr = [0] * 128
     self.vcc = 0
     self.scc = 0
@@ -102,49 +103,53 @@ def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
     # Save EXEC early (before we modify it for VGPR stores)
     s_mov_b32(s[95], EXEC_LO),
     # Restore EXEC to all active lanes for VGPR stores (test may have modified EXEC)
-    s_mov_b32(EXEC_LO, (1 << n_lanes) - 1),
+    s_mov_b32(EXEC_LO, (1 << min(n_lanes, WAVE_SIZE)) - 1),
     s_load_b64(s[92:93], s[80:81], 0, soffset=NULL),
     s_waitcnt(0),  # simm16=0 waits for all
     v_lshlrev_b32_e32(v[240], 2, v[255]),
   ]
+  vgpr_bytes = N_VGPRS * n_lanes * 4
   for i in range(N_VGPRS):
-    epilogue.append(global_store_b32(addr=v[240], data=v[i], saddr=s[92:93], offset=i * WAVE_SIZE * 4))
+    epilogue.append(global_store_b32(addr=v[240], data=v[i], saddr=s[92:93], offset=i * n_lanes * 4))
   epilogue.append(v_mov_b32_e32(v[241], 0))
   epilogue.append(v_cmp_eq_u32_e32(v[255], v[241]))
   epilogue.append(s_and_saveexec_b32(s[94], VCC_LO))
-  epilogue.append(v_mov_b32_e32(v[240], 0))
+  # Scalar stores: only thread 0. Use v[240]=vgpr_bytes as base offset so immediate offsets stay small.
+  epilogue.append(v_mov_b32_e32(v[240], vgpr_bytes))
   for i in range(N_SGPRS):
     epilogue.append(v_mov_b32_e32(v[243], s[i]))
-    epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + i * 4))
+    epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=i * 4))
   epilogue.append(v_mov_b32_e32(v[243], s[90]))
-  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES))
+  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=SGPR_BYTES))
   epilogue.append(v_mov_b32_e32(v[243], s[91]))
-  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES + 4))
+  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=SGPR_BYTES + 4))
   # Store EXEC (saved earlier in s[95])
   epilogue.append(v_mov_b32_e32(v[243], s[95]))
-  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES + 8))
+  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=SGPR_BYTES + 8))
   epilogue.append(s_mov_b32(EXEC_LO, s[94]))
   epilogue.append(s_endpgm())
   return prologue, epilogue
 
 def parse_output(out_buf: bytes, n_lanes: int) -> WaveState:
   """Parse output buffer into WaveState."""
-  st = WaveState()
+  vgpr_bytes = N_VGPRS * n_lanes * 4
+  st = WaveState(n_lanes)
   for i in range(N_VGPRS):
     for lane in range(n_lanes):
-      off = i * WAVE_SIZE * 4 + lane * 4
+      off = i * n_lanes * 4 + lane * 4
       st.vgpr[lane][i] = struct.unpack_from('<I', out_buf, off)[0]
   for i in range(N_SGPRS):
-    st.sgpr[i] = struct.unpack_from('<I', out_buf, VGPR_BYTES + i * 4)[0]
-  st.vcc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES)[0]
-  st.scc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 4)[0]
-  st.exec = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 8)[0]
+    st.sgpr[i] = struct.unpack_from('<I', out_buf, vgpr_bytes + i * 4)[0]
+  st.vcc = struct.unpack_from('<I', out_buf, vgpr_bytes + SGPR_BYTES)[0]
+  st.scc = struct.unpack_from('<I', out_buf, vgpr_bytes + SGPR_BYTES + 4)[0]
+  st.exec = struct.unpack_from('<I', out_buf, vgpr_bytes + SGPR_BYTES + 8)[0]
   return st
 
 def run_program(instructions: list, n_lanes: int = 32) -> WaveState:
   """Run instructions via emulator run_asm, dump state to memory, return WaveState."""
-  out_buf = (ctypes.c_uint8 * OUT_BYTES)(*([0] * OUT_BYTES))
+  buf_sz = _out_bytes(n_lanes)
+  out_buf = (ctypes.c_uint8 * buf_sz)(*([0] * buf_sz))
   out_addr = ctypes.addressof(out_buf)
 
   prologue, epilogue = get_prologue_epilogue(n_lanes)
@@ -220,11 +225,12 @@ amdhsa.kernels:
 
   lib = compiler.compile(asm_src)
   prg = AMDProgram(dev, "test", lib)  # type: ignore[arg-type]
 
-  out_gpu = dev.allocator.alloc(OUT_BYTES)
+  buf_sz = _out_bytes(n_lanes)
+  out_gpu = dev.allocator.alloc(buf_sz)
   assert out_gpu.va_addr % 16 == 0, f"buffer not 16-byte aligned: 0x{out_gpu.va_addr:x}"
   prg(out_gpu, global_size=(1, 1, 1), local_size=(n_lanes, 1, 1), wait=True)
-  out_buf = bytearray(OUT_BYTES)
+  out_buf = bytearray(buf_sz)
   dev.allocator._copyout(flat_mv(memoryview(out_buf)), out_gpu)
 
   return parse_output(bytes(out_buf), n_lanes)
diff --git a/test/amd/hw/test_sop.py b/test/amd/hw/test_sop.py
index cd82f2bbe8..7202d18344 100644
--- a/test/amd/hw/test_sop.py
+++ b/test/amd/hw/test_sop.py
@@ -932,5 +932,76 @@ class Test64BitSOPLiterals(unittest.TestCase):
     self.assertEqual(st.vgpr[0][1], 0)  # zero-extended, not sign-extended
 
 
+class TestBarrier(unittest.TestCase):
+  """Tests for s_barrier — workgroup synchronization across wavefronts."""
+
+  def test_barrier_cross_wave_lds(self):
+    """Wave 0 writes to LDS, s_barrier, wave 1 reads — verifies cross-wave synchronization.
+
+    64 threads (2 waves of 32). Each thread writes (tid+1) to LDS[tid*4], then after
+    s_barrier, reads LDS[(tid^32)*4] — the value written by the other wave. Without barrier
+    support, wave 1 would read stale/zero LDS values.
+ """ + instructions = [ + # v[255] = tid (saved by prologue), copy to v[1] + v_mov_b32_e32(v[1], v[255]), + # v[2] = tid + 1 + v_add_nc_u32_e32(v[2], 1, v[1]), + # v[3] = tid * 4 + v_lshlrev_b32_e32(v[3], 2, v[1]), + # Store (tid+1) to LDS[tid*4] + ds_store_b32(addr=v[3], data0=v[2]), + s_waitcnt(lgkmcnt=0), + s_barrier(), + # Read from the other wave's slot: LDS[(tid^32)*4] + v_xor_b32_e32(v[4], 32, v[1]), + v_lshlrev_b32_e32(v[5], 2, v[4]), + ds_load_b32(addr=v[5], vdst=v[0]), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=64) + for tid in range(64): + self.assertEqual(st.vgpr[tid][0], (tid ^ 32) + 1, f"tid={tid}") + + def test_barrier_two_phases(self): + """Two barriers with three phases — tests multiple barriers in sequence. + + Phase 1: all threads write (tid+100) to LDS[tid*4], barrier. + Phase 2: all threads read other wave's value, add 1000, write to LDS[(tid+64)*4], barrier. + Phase 3: all threads read the other wave's phase-2 output into v[0]. + """ + instructions = [ + # v[255] = tid (saved by prologue), copy to v[1] + v_mov_b32_e32(v[1], v[255]), + # v[2] = tid + 100 + v_add_nc_u32_e32(v[2], 100, v[1]), + # v[3] = tid * 4 + v_lshlrev_b32_e32(v[3], 2, v[1]), + # Phase 1: write (tid+100) to LDS[tid*4] + ds_store_b32(addr=v[3], data0=v[2]), + s_waitcnt(lgkmcnt=0), + s_barrier(), + # Phase 2: read from other wave, add 1000, write to separate LDS region + v_xor_b32_e32(v[4], 32, v[1]), + v_lshlrev_b32_e32(v[5], 2, v[4]), + ds_load_b32(addr=v[5], vdst=v[6]), + s_waitcnt(lgkmcnt=0), + v_add_nc_u32_e32(v[7], 0x3e8, v[6]), + v_add_nc_u32_e32(v[8], 64, v[1]), + v_lshlrev_b32_e32(v[9], 2, v[8]), + ds_store_b32(addr=v[9], data0=v[7]), + s_waitcnt(lgkmcnt=0), + s_barrier(), + # Phase 3: read other wave's phase-2 output into v[0] + v_add_nc_u32_e32(v[10], 64, v[4]), + v_lshlrev_b32_e32(v[11], 2, v[10]), + ds_load_b32(addr=v[11], vdst=v[0]), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=64) + for tid in range(64): + self.assertEqual(st.vgpr[tid][0], tid + 100 + 1000, f"tid={tid}") + + if __name__ == '__main__': unittest.main()