From 1baefed530276fb132aa1a283fdc72cf8b660871 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Wed, 21 Jan 2026 08:53:54 +0900 Subject: [PATCH] assembly/amd: add hw tests from ucode branch (#14259) * assembly/amd: add hw tests from ucode branch * fix is per lane --- extra/assembly/amd/pcode.py | 12 +- extra/assembly/amd/test/hw/helpers.py | 46 +- extra/assembly/amd/test/hw/test_ds.py | 50 ++ extra/assembly/amd/test/hw/test_global.py | 163 +++++++ extra/assembly/amd/test/hw/test_sop.py | 242 ++++++++++ extra/assembly/amd/test/hw/test_vop1.py | 292 +++++++++++- extra/assembly/amd/test/hw/test_vop2.py | 414 +++++++++++++++++ extra/assembly/amd/test/hw/test_vop3.py | 531 +++++++++++++++++++++- extra/assembly/amd/test/hw/test_vop3p.py | 79 +++- extra/assembly/amd/test/hw/test_vopc.py | 355 +++++++++++++++ extra/assembly/amd/test/hw/test_vopd.py | 161 +++++++ 11 files changed, 2304 insertions(+), 41 deletions(-) create mode 100644 extra/assembly/amd/test/hw/test_vopd.py diff --git a/extra/assembly/amd/pcode.py b/extra/assembly/amd/pcode.py index bd98f82ff1..d1249b3e22 100644 --- a/extra/assembly/amd/pcode.py +++ b/extra/assembly/amd/pcode.py @@ -653,17 +653,17 @@ def _apply_pseudocode_fixes(op_name: str, code: str) -> str: code = code.replace('D0.f64 = 2.0 ** 64 * fma(S0.f64, S1.f64, S2.f64)', 'D0.f64 = (2.0 ** 128 if exponent(S2.f64) > 1023 else 2.0 ** -128) * fma(S0.f64, S1.f64, S2.f64)') if op_name == 'V_DIV_SCALE_F32': - code = code.replace('D0.f32 = float("nan")', 'VCC = Reg(0x1); D0.f32 = float("nan")') + code = code.replace('D0.f32 = float("nan")', 'VCC = Reg(1 << laneId); D0.f32 = float("nan")') code = code.replace('elif S1.f32 == DENORM.f32:\n D0.f32 = ldexp(S0.f32, 64)', 'elif False:\n pass') code += '\nif S1.f32 == DENORM.f32:\n D0.f32 = float("nan")' - code = code.replace('elif exponent(S2.f32) <= 23:\n D0.f32 = ldexp(S0.f32, 64)', 'elif exponent(S2.f32) <= 23:\n VCC = Reg(0x1); D0.f32 = ldexp(S0.f32, 64)') - code = code.replace('elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(0x1)\n if S0.f32 == S2.f32:\n D0.f32 = ldexp(S0.f32, 64)', 'elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(0x1)') + code = code.replace('elif exponent(S2.f32) <= 23:\n D0.f32 = ldexp(S0.f32, 64)', 'elif exponent(S2.f32) <= 23:\n VCC = Reg(1 << laneId); D0.f32 = ldexp(S0.f32, 64)') + code = code.replace('elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(0x1)\n if S0.f32 == S2.f32:\n D0.f32 = ldexp(S0.f32, 64)', 'elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(1 << laneId)') if op_name == 'V_DIV_SCALE_F64': - code = code.replace('D0.f64 = float("nan")', 'VCC = Reg(0x1); D0.f64 = float("nan")') + code = code.replace('D0.f64 = float("nan")', 'VCC = Reg(1 << laneId); D0.f64 = float("nan")') code = code.replace('elif S1.f64 == DENORM.f64:\n D0.f64 = ldexp(S0.f64, 128)', 'elif False:\n pass') code += '\nif S1.f64 == DENORM.f64:\n D0.f64 = float("nan")' - code = code.replace('elif exponent(S2.f64) <= 52:\n D0.f64 = ldexp(S0.f64, 128)', 'elif exponent(S2.f64) <= 52:\n VCC = Reg(0x1); D0.f64 = ldexp(S0.f64, 128)') - code = code.replace('elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(0x1)\n if S0.f64 == S2.f64:\n D0.f64 = ldexp(S0.f64, 128)', 'elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(0x1)') + code = code.replace('elif exponent(S2.f64) <= 52:\n D0.f64 = ldexp(S0.f64, 128)', 'elif exponent(S2.f64) <= 52:\n VCC = Reg(1 << laneId); D0.f64 = ldexp(S0.f64, 128)') + code = code.replace('elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(0x1)\n if S0.f64 == S2.f64:\n D0.f64 = 
ldexp(S0.f64, 128)', 'elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(1 << laneId)')
   if op_name == 'V_DIV_FIXUP_F32':
     code = code.replace('D0.f32 = ((-abs(S0.f32)) if (sign_out) else (abs(S0.f32)))', 'D0.f32 = ((-OVERFLOW_F32) if (sign_out) else (OVERFLOW_F32)) if isNAN(S0.f32) else ((-abs(S0.f32)) if (sign_out) else (abs(S0.f32)))')
diff --git a/extra/assembly/amd/test/hw/helpers.py b/extra/assembly/amd/test/hw/helpers.py
index 4e40417ad4..efb8f6e893 100644
--- a/extra/assembly/amd/test/hw/helpers.py
+++ b/extra/assembly/amd/test/hw/helpers.py
@@ -1,14 +1,25 @@
 """Test infrastructure for hardware-validated RDNA3 emulator tests.
 Uses run_asm() with memory output, so tests can run on both emulator and real hardware.
-Set USE_HW=1 to run on both emulator and real hardware, comparing results.
+Set USE_HW=1 to run on both emulator and hardware, comparing results.
 """
-import ctypes, os, struct
+import ctypes, math, os, struct
 from extra.assembly.amd.autogen.rdna3.ins import *
-from extra.assembly.amd.emu import WaveState, run_asm, set_valid_mem_ranges
+from extra.assembly.amd.emu import run_asm
 from extra.assembly.amd.dsl import NULL, SCC, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, M0
-from extra.assembly.amd.pcode import _i32, _f32
+
+def _i32(f: float) -> int: return struct.unpack('<i', struct.pack('<f', f))[0]
+def _f32(i: int) -> float: return struct.unpack('<f', struct.pack('<i', i))[0]
+def _f16(i: int) -> float: return struct.unpack('<e', struct.pack('<H', i & 0xffff))[0]
+def f32_to_f16(f) -> int:
+  f = float(f)
+  if math.isnan(f): return 0x7e00
+  if math.isinf(f): return 0x7c00 if f > 0 else 0xfc00
+  try: return struct.unpack('<H', struct.pack('<e', f))[0]
+  except OverflowError: return 0x7c00 if f > 0 else 0xfc00

 # For backwards compatibility with tests using SrcEnum.NULL etc.
 class SrcEnum:
@@ -32,11 +43,11 @@ VCC = VCC_LO  # For VOP3SD sdst field (VCC_LO is exported from dsl)
 USE_HW = os.environ.get("USE_HW", "0") == "1"
 FLOAT_TOLERANCE = 1e-5

-# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc
+# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc, exec
 N_VGPRS, N_SGPRS, WAVE_SIZE = 16, 16, 32
 VGPR_BYTES = N_VGPRS * WAVE_SIZE * 4  # 16 regs * 32 lanes * 4 bytes = 2048
 SGPR_BYTES = N_SGPRS * 4  # 16 regs * 4 bytes = 64
-OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 8  # + vcc + scc
+OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 12  # + vcc + scc + exec

 # Float conversion helpers
 def f2i(f: float) -> int: return _i32(f)
@@ -47,6 +58,14 @@ def i642f(i: int) -> float: return struct.unpack('<d', struct.pack('<Q', i))[0]
 def assemble(instructions: list) -> bytes: return b''.join(inst.to_bytes() for inst in instructions)

+# Simple WaveState class for test output parsing (mirrors emu.py interface for tests)
+class WaveState:
+  def __init__(self):
+    self.vgpr = [[0] * 256 for _ in range(32)]  # vgpr[lane][reg]
+    self.sgpr = [0] * 128
+    self.vcc = 0
+    self.scc = 0
+
 def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
   """Generate prologue and epilogue instructions for state capture."""
   prologue = [
@@ -63,6 +82,10 @@ def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
   epilogue = [
     s_mov_b32(s[90], VCC_LO),
     s_cselect_b32(s[91], 1, 0),
+    # Save EXEC early (before we modify it for VGPR stores)
+    s_mov_b32(s[95], EXEC_LO),
+    # Restore EXEC to all active lanes for VGPR stores (test may have modified EXEC)
+    s_mov_b32(EXEC_LO, (1 << n_lanes) - 1),
     s_load_b64(s[92:93], s[80:81], 0, soffset=NULL),
     s_waitcnt(0),  # simm16=0 waits for all
     v_lshlrev_b32_e32(v[240], 2, v[255]),
@@ -80,6 +103,9 @@ def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
   epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES))
   epilogue.append(v_mov_b32_e32(v[243], s[91]))
   epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES + 4))
+  # Store EXEC (saved earlier in s[95])
+  epilogue.append(v_mov_b32_e32(v[243], s[95]))
+  epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES + 8))
   epilogue.append(s_mov_b32(EXEC_LO, s[94]))
   epilogue.append(s_endpgm())
   return prologue, epilogue
@@ -95,6 +121,8 @@ def parse_output(out_buf: bytes, n_lanes: int) -> WaveState:
     st.sgpr[i] = struct.unpack_from('<I', out_buf, VGPR_BYTES + i*4)[0]
   st.vcc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES)[0]
   st.scc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 4)[0]
+  # exec (saved by the epilogue before the VGPR stores)
+  st.exec = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 8)[0]
   return st

 def run_program(instructions: list, n_lanes: int = 1) -> WaveState:
@@ -110,9 +138,9 @@ def run_program_emu(instructions: list, n_lanes: int = 1) -> WaveState:
   kernel_buf = (ctypes.c_char * len(code)).from_buffer_copy(code)
   lib_ptr = ctypes.addressof(kernel_buf)

-  set_valid_mem_ranges({(out_addr, OUT_BYTES), (args_ptr, 8)})
   # rsrc2: USER_SGPR_COUNT=2, ENABLE_SGPR_WORKGROUP_ID_X/Y/Z=1, LDS_SIZE=128 (64KB)
   rsrc2 = 0x19c | (128 << 15)
+  scratch_size = 0x10000  # 64KB per lane, matches .amdhsa_private_segment_fixed_size in run_program_hw
   result = run_asm(lib_ptr, len(code), 1, 1, 1, n_lanes, 1, 1, args_ptr, rsrc2)
   assert result == 0, f"run_asm failed with {result}"
@@ -148,6 +176,8 @@ test:
   .amdhsa_user_sgpr_kernarg_segment_ptr 1
   .amdhsa_kernarg_size 8
   .amdhsa_group_segment_fixed_size 65536
+  .amdhsa_private_segment_fixed_size 65536
+  .amdhsa_enable_private_segment 1
 .end_amdhsa_kernel

 .amdgpu_metadata
@@ -160,7 +190,7 @@ amdhsa.kernels:
     .symbol: test.kd
     .kernarg_segment_size: 8
     .group_segment_fixed_size: 65536
-    .private_segment_fixed_size: 0
+    .private_segment_fixed_size: 65536
     .kernarg_segment_align: 8
     .wavefront_size: 32
     .sgpr_count: 96
diff --git a/extra/assembly/amd/test/hw/test_ds.py b/extra/assembly/amd/test/hw/test_ds.py
index 220984d2d9..2783be043c 100644
--- a/extra/assembly/amd/test/hw/test_ds.py
+++ b/extra/assembly/amd/test/hw/test_ds.py
@@ -138,6 +138,56 @@ class TestDS2AddrMore(unittest.TestCase):
     self.assertEqual(st.vgpr[0][4], 0x12345678, "v4 should be untouched")

+class TestDSB128(unittest.TestCase):
+  """Tests for DS_STORE_B128 and DS_LOAD_B128 (128-bit / 4 dwords)."""
+
+  def test_ds_store_load_b128(self):
+    """DS_STORE_B128 stores 4 VGPRs, DS_LOAD_B128 loads them back."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[0], 0x11111111),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[0], 0x22222222),
+      v_mov_b32_e32(v[1], s[0]),
+      s_mov_b32(s[0], 0x33333333),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0x44444444),
+      v_mov_b32_e32(v[3], s[0]),
+      ds_store_b128(addr=v[10], data0=v[0:3]),
+      s_waitcnt(lgkmcnt=0),
+      ds_load_b128(addr=v[10], vdst=v[4:7]),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have first dword")
+    self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should have second dword")
+    self.assertEqual(st.vgpr[0][6], 0x33333333, "v6 should have third dword")
+    self.assertEqual(st.vgpr[0][7], 0x44444444, "v7 should have fourth dword")
+
+  def test_ds_store_b128_with_offset(self):
+    """DS_STORE_B128 with non-zero offset."""
+    instructions = [
+      v_mov_b32_e32(v[10], 0),
+      s_mov_b32(s[0], 0xAAAAAAAA),
+      v_mov_b32_e32(v[0], s[0]),
+      s_mov_b32(s[0], 0xBBBBBBBB),
+      v_mov_b32_e32(v[1], s[0]),
+      s_mov_b32(s[0], 0xCCCCCCCC),
+      v_mov_b32_e32(v[2], s[0]),
+      s_mov_b32(s[0], 0xDDDDDDDD),
+      v_mov_b32_e32(v[3], s[0]),
+      DS(DSOp.DS_STORE_B128, addr=v[10], data0=v[0:3], offset0=16),
+      s_waitcnt(lgkmcnt=0),
+      DS(DSOp.DS_LOAD_B128, addr=v[10], vdst=v[4:7], offset0=16),
+      s_waitcnt(lgkmcnt=0),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA)
+    self.assertEqual(st.vgpr[0][5], 0xBBBBBBBB)
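+    # offset0 is a plain byte offset added to the LDS address in v[10], so the four
+    # dwords written at bytes 16..31 read back unchanged (dwords 2-3 checked below)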
+ self.assertEqual(st.vgpr[0][6], 0xCCCCCCCC) + self.assertEqual(st.vgpr[0][7], 0xDDDDDDDD) + + class TestDSAtomic(unittest.TestCase): """Tests for DS atomic operations.""" diff --git a/extra/assembly/amd/test/hw/test_global.py b/extra/assembly/amd/test/hw/test_global.py index 20d5478162..edefc64f62 100644 --- a/extra/assembly/amd/test/hw/test_global.py +++ b/extra/assembly/amd/test/hw/test_global.py @@ -128,6 +128,169 @@ class TestGlobalLoad(unittest.TestCase): class TestGlobalStore(unittest.TestCase): """Tests for GLOBAL store instructions.""" + def test_global_store_b8_basic(self): + """GLOBAL_STORE_B8 stores a single byte from VDATA[7:0].""" + TEST_OFFSET = 256 + instructions = [ + s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + # First store 0xDEADBEEF to memory + s_mov_b32(s[4], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Now store single byte 0x42 to same address (should only change byte 0) + v_mov_b32_e32(v[2], 0x42), + global_store_b8(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Read back and check + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[3]), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + # Only byte 0 should change from 0xEF to 0x42 + self.assertEqual(st.vgpr[0][0], 0xDEADBE42, "Only byte 0 should be modified") + + def test_global_store_b8_byte1(self): + """GLOBAL_STORE_B8 at offset+1 stores to byte 1.""" + TEST_OFFSET = 256 + instructions = [ + s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[4], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[2], 0x42), + global_store_b8(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+1), + s_waitcnt(vmcnt=0), + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[3]), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xDEAD42EF, "Only byte 1 should be modified") + + def test_global_store_b16_basic(self): + """GLOBAL_STORE_B16 stores a 16-bit value from VDATA[15:0].""" + TEST_OFFSET = 256 + instructions = [ + s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[4], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[4], 0xCAFE), + v_mov_b32_e32(v[2], s[4]), + global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[3]), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xDEADCAFE, "Only lower 16 bits should be modified") + + def test_global_store_b16_high_half(self): + """GLOBAL_STORE_B16 at offset+2 stores to high 16 bits.""" + TEST_OFFSET = 256 + instructions = [ + s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL), + 
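+      # s[80:81] holds the kernarg pointer set up by the shared test prologue; the
+      # s_load_b64 above fetches the output-buffer address into s[2:3], and the
+      # s_waitcnt below ensures the load has landed before the stores use it as saddr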
s_waitcnt(lgkmcnt=0), + s_mov_b32(s[4], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[4], 0xCAFE), + v_mov_b32_e32(v[2], s[4]), + global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+2), + s_waitcnt(vmcnt=0), + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[3]), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xCAFEBEEF, "Only upper 16 bits should be modified") + + def test_global_store_b16_byte_offset_1(self): + """GLOBAL_STORE_B16 at byte offset 1 stores bytes 1-2 within the same word.""" + TEST_OFFSET = 256 + instructions = [ + s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[4], 0xDDCCBBAA), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Store 0xBEEF at byte offset 1 (bytes 1-2) + s_mov_b32(s[4], 0xBEEF), + v_mov_b32_e32(v[2], s[4]), + global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+1), + s_waitcnt(vmcnt=0), + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[3]), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + # Bytes 1-2 should be 0xBEEF (0xEF at byte 1, 0xBE at byte 2) + # Original: 0xDDCCBBAA -> bytes [AA, BB, CC, DD] + # After: 0xDDBEEFAA -> bytes [AA, EF, BE, DD] + self.assertEqual(st.vgpr[0][0], 0xDDBEEFAA, "Bytes 1-2 should be 0xBEEF") + + def test_global_store_b16_cross_word_boundary(self): + """GLOBAL_STORE_B16 at byte offset 3 crosses word boundary (byte 3 of word N, byte 0 of word N+1).""" + TEST_OFFSET = 256 + instructions = [ + s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + # Initialize two consecutive words + s_mov_b32(s[4], 0xDDCCBBAA), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET), + s_mov_b32(s[4], 0x44332211), + v_mov_b32_e32(v[2], s[4]), + global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+4), + s_waitcnt(vmcnt=0), + # Store 0xBEEF at byte offset 3 (crosses word boundary) + # Low byte (0xEF) goes to byte 3 of first word + # High byte (0xBE) goes to byte 0 of second word + s_mov_b32(s[4], 0xBEEF), + v_mov_b32_e32(v[2], s[4]), + global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+3), + s_waitcnt(vmcnt=0), + # Load back both words + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET), + GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[4], data=v[4], saddr=s[2:3], offset=TEST_OFFSET+4), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[3]), + v_mov_b32_e32(v[1], v[4]), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + # First word: 0xDDCCBBAA -> 0xEFCCBBAA (byte 3 becomes 0xEF) + # Second word: 0x44332211 -> 0x443322BE (byte 0 becomes 0xBE) + self.assertEqual(st.vgpr[0][0], 0xEFCCBBAA, "Byte 3 of first word should be 0xEF") + self.assertEqual(st.vgpr[0][1], 0x443322BE, "Byte 0 of second word should be 0xBE") + def test_global_store_b64_basic(self): """GLOBAL_STORE_B64 stores 8 bytes 
from v[n:n+1] to memory.""" TEST_OFFSET = 256 diff --git a/extra/assembly/amd/test/hw/test_sop.py b/extra/assembly/amd/test/hw/test_sop.py index 1e7e79b438..62ba1f9120 100644 --- a/extra/assembly/amd/test/hw/test_sop.py +++ b/extra/assembly/amd/test/hw/test_sop.py @@ -62,6 +62,28 @@ class TestBasicScalar(unittest.TestCase): st = run_program(instructions, n_lanes=1) self.assertEqual(st.sgpr[1], 0x80000000) + def test_s_fmamk_f32(self): + """S_FMAMK_F32: D = S0 * literal + S1.""" + # 2.0 * 3.0 + 1.0 = 7.0 + instructions = [ + s_mov_b32(s[0], f2i(2.0)), + s_mov_b32(s[1], f2i(1.0)), + s_fmamk_f32(s[2], s[0], s[1], literal=f2i(3.0)), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], f2i(7.0)) + + def test_s_fmamk_f32_negative(self): + """S_FMAMK_F32 with negative values.""" + # -2.0 * 4.0 + 10.0 = 2.0 + instructions = [ + s_mov_b32(s[0], f2i(-2.0)), + s_mov_b32(s[1], f2i(10.0)), + s_fmamk_f32(s[2], s[0], s[1], literal=f2i(4.0)), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], f2i(2.0)) + class TestQuadmaskWqm(unittest.TestCase): """Tests for S_QUADMASK_B32 and S_WQM_B32.""" @@ -298,6 +320,56 @@ class TestSignedArithmetic(unittest.TestCase): st = run_program(instructions, n_lanes=1) self.assertEqual(st.sgpr[2], 2) + def test_s_mul_hi_u32_max(self): + """S_MUL_HI_U32: 0xFFFFFFFF * 0xFFFFFFFF.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), + s_mov_b32(s[1], 0xFFFFFFFF), + s_mul_hi_u32(s[2], s[0], s[1]), # (0xFFFFFFFF * 0xFFFFFFFF) >> 32 = 0xFFFFFFFE + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], 0xFFFFFFFE) + + def test_s_mul_hi_i32_positive(self): + """S_MUL_HI_I32: positive * positive.""" + instructions = [ + s_mov_b32(s[0], 0x40000000), # 2^30 + s_mov_b32(s[1], 4), + s_mul_hi_i32(s[2], s[0], s[1]), # (2^30 * 4) >> 32 = 1 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], 1) + + def test_s_mul_hi_i32_neg_times_neg(self): + """S_MUL_HI_I32: (-1) * (-1) = 1, high bits = 0.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), # -1 + s_mov_b32(s[1], 0xFFFFFFFF), # -1 + s_mul_hi_i32(s[2], s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], 0) + + def test_s_mul_hi_i32_neg_times_pos(self): + """S_MUL_HI_I32: (-1) * 2 = -2, high bits = -1 (sign extension).""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), # -1 + s_mov_b32(s[1], 2), + s_mul_hi_i32(s[2], s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], 0xFFFFFFFF) # -1 sign extends + + def test_s_mul_hi_i32_min_int(self): + """S_MUL_HI_I32: MIN_INT * 2 = -2^32, high = -1.""" + instructions = [ + s_mov_b32(s[0], 0x80000000), # -2^31 (MIN_INT) + s_mov_b32(s[1], 2), + s_mul_hi_i32(s[2], s[0], s[1]), # (-2^31 * 2) >> 32 = -1 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], 0xFFFFFFFF) + def test_s_mul_i32(self): """S_MUL_I32: signed multiply low 32 bits.""" instructions = [ @@ -329,6 +401,176 @@ class TestSignedArithmetic(unittest.TestCase): self.assertEqual(st.sgpr[7], ((dividend * 2) + 1) & 0xFFFFFFFF) +class TestBitSet(unittest.TestCase): + """Tests for S_BITSET0_B32 and S_BITSET1_B32 instructions.""" + + def test_s_bitset1_b32_set_bit0(self): + """S_BITSET1_B32: set bit 0 in destination.""" + instructions = [ + s_mov_b32(s[0], 0), # start with 0 + s_mov_b32(s[1], 0), # bit position = 0 + s_bitset1_b32(s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[0], 1, "Bit 0 should be set") 
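# A minimal reference model of what these bitset tests pin down (a sketch for reading
# the tests, not the emulator's actual implementation): both ops index with the low
# 5 bits of S0, i.e. the bit position is taken mod 32.
def ref_s_bitset1_b32(d: int, s0: int) -> int: return (d | (1 << (s0 & 31))) & 0xFFFFFFFF
def ref_s_bitset0_b32(d: int, s0: int) -> int: return d & ~(1 << (s0 & 31)) & 0xFFFFFFFF
assert ref_s_bitset1_b32(0x00000000, 37) == 0x20          # 37 & 31 = 5
assert ref_s_bitset0_b32(0xFFFFFFFF, 31) == 0x7FFFFFFF    # clears only bit 31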
+ + def test_s_bitset1_b32_set_bit31(self): + """S_BITSET1_B32: set bit 31 in destination.""" + instructions = [ + s_mov_b32(s[0], 0), # start with 0 + s_mov_b32(s[1], 31), # bit position = 31 + s_bitset1_b32(s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[0], 0x80000000, "Bit 31 should be set") + + def test_s_bitset1_b32_preserves_other_bits(self): + """S_BITSET1_B32: preserves bits not being set.""" + instructions = [ + s_mov_b32(s[0], 0xFF00FF00), # existing pattern + s_mov_b32(s[1], 0), # bit position = 0 + s_bitset1_b32(s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[0], 0xFF00FF01, "Should set bit 0 while preserving others") + + def test_s_bitset0_b32_clear_bit0(self): + """S_BITSET0_B32: clear bit 0 in destination.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), # start with all bits set + s_mov_b32(s[1], 0), # bit position = 0 + s_bitset0_b32(s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[0], 0xFFFFFFFE, "Bit 0 should be cleared") + + def test_s_bitset0_b32_clear_bit31(self): + """S_BITSET0_B32: clear bit 31 in destination.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), # start with all bits set + s_mov_b32(s[1], 31), # bit position = 31 + s_bitset0_b32(s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[0], 0x7FFFFFFF, "Bit 31 should be cleared") + + def test_s_bitset1_b32_uses_low5_bits(self): + """S_BITSET1_B32: only uses low 5 bits of position (mod 32).""" + instructions = [ + s_mov_b32(s[0], 0), + s_mov_b32(s[1], 32 + 5), # position = 37, but mod 32 = 5 + s_bitset1_b32(s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[0], 0x20, "Bit 5 should be set (37 mod 32 = 5)") + + +class TestBfeI64(unittest.TestCase): + """Tests for S_BFE_I64 - 64-bit bit field extract with sign extension. + + Regression tests for sign extension bug where 32-bit masks were incorrectly + used for 64-bit operations, causing the high 32 bits to not be sign-extended. + """ + + def test_s_bfe_i64_positive_no_sign_extend(self): + """S_BFE_I64: positive value (1) in 16 bits should not sign extend.""" + # S1 encodes: [22:16] = width, [5:0] = offset + # width=16, offset=0 -> S1 = (16 << 16) | 0 = 0x100000 + instructions = [ + s_mov_b32(s[0], 1), # S0 lo = 1 + s_mov_b32(s[1], 0), # S0 hi = 0 + s_mov_b32(s[2], 0x100000), # width=16, offset=0 + s_bfe_i64(s[4:5], s[0:1], s[2]), + v_mov_b32_e32(v[0], s[4]), + v_mov_b32_e32(v[1], s[5]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 1, "lo should be 1") + self.assertEqual(st.vgpr[0][1], 0, "hi should be 0 (no sign extend)") + + def test_s_bfe_i64_negative_sign_extend(self): + """S_BFE_I64: 0xFFFF (-1 in 16 bits) should sign extend to 64 bits. + + This is the main regression test - before the fix, hi was 0 instead of 0xFFFFFFFF. 
+ """ + instructions = [ + s_mov_b32(s[0], 0xFFFF), # S0 lo = -1 in 16 bits + s_mov_b32(s[1], 0), # S0 hi = 0 + s_mov_b32(s[2], 0x100000), # width=16, offset=0 + s_bfe_i64(s[4:5], s[0:1], s[2]), + v_mov_b32_e32(v[0], s[4]), + v_mov_b32_e32(v[1], s[5]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xFFFFFFFF, "lo should be 0xFFFFFFFF") + self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)") + + def test_s_bfe_i64_8bit_negative_sign_extend(self): + """S_BFE_I64: 0xFF (-1 in 8 bits) should sign extend to 64 bits.""" + # width=8, offset=0 -> S1 = (8 << 16) | 0 = 0x80000 + instructions = [ + s_mov_b32(s[0], 0xFF), # S0 lo = -1 in 8 bits + s_mov_b32(s[1], 0), # S0 hi = 0 + s_mov_b32(s[2], 0x80000), # width=8, offset=0 + s_bfe_i64(s[4:5], s[0:1], s[2]), + v_mov_b32_e32(v[0], s[4]), + v_mov_b32_e32(v[1], s[5]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xFFFFFFFF, "lo should be 0xFFFFFFFF") + self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)") + + def test_s_bfe_i64_8bit_positive(self): + """S_BFE_I64: 0x7F (127 in 8 bits) should not sign extend.""" + # width=8, offset=0 -> S1 = (8 << 16) | 0 = 0x80000 + instructions = [ + s_mov_b32(s[0], 0x7F), # S0 lo = 127 in 8 bits (MSB=0) + s_mov_b32(s[1], 0), # S0 hi = 0 + s_mov_b32(s[2], 0x80000), # width=8, offset=0 + s_bfe_i64(s[4:5], s[0:1], s[2]), + v_mov_b32_e32(v[0], s[4]), + v_mov_b32_e32(v[1], s[5]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0x7F, "lo should be 0x7F") + self.assertEqual(st.vgpr[0][1], 0, "hi should be 0 (no sign extend)") + + def test_s_bfe_i64_with_offset(self): + """S_BFE_I64: extract from non-zero bit offset with sign extension.""" + # Extract 16 bits starting at bit 8: value 0xFF00 >> 8 = 0xFF = -1 in 8 bits? No wait... 
+ # Let's put 0x8000FF00: extract 16 bits at offset 8 = 0x00FF (positive) + # Put 0xFF00_0000: extract 16 bits at offset 16 = 0xFF00 = -256 in signed 16-bit + instructions = [ + s_mov_b32(s[0], 0xFF000000), # bits [31:24] = 0xFF, [23:16] = 0x00 + s_mov_b32(s[1], 0), + # width=16, offset=16 -> S1 = (16 << 16) | 16 = 0x100010 + s_mov_b32(s[2], 0x100010), + s_bfe_i64(s[4:5], s[0:1], s[2]), + v_mov_b32_e32(v[0], s[4]), + v_mov_b32_e32(v[1], s[5]), + ] + st = run_program(instructions, n_lanes=1) + # Extract bits [31:16] = 0xFF00, sign bit is bit 15 of extracted = bit 31 of original = 1 + # So result should be sign-extended 0xFF00 -> 0xFFFFFF00 in lo, 0xFFFFFFFF in hi + self.assertEqual(st.vgpr[0][0], 0xFFFFFF00, "lo should be sign-extended 0xFF00") + self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)") + + def test_s_bfe_i64_32bit_negative(self): + """S_BFE_I64: extract 32 bits with sign extension.""" + # width=32, offset=0 -> S1 = (32 << 16) | 0 = 0x200000 + instructions = [ + s_mov_b32(s[0], 0x80000000), # MIN_INT32 = -2^31 + s_mov_b32(s[1], 0), + s_mov_b32(s[2], 0x200000), # width=32, offset=0 + s_bfe_i64(s[4:5], s[0:1], s[2]), + v_mov_b32_e32(v[0], s[4]), + v_mov_b32_e32(v[1], s[5]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0x80000000, "lo should be 0x80000000") + self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)") + + class Test64BitCompare(unittest.TestCase): """Tests for 64-bit scalar compare instructions.""" diff --git a/extra/assembly/amd/test/hw/test_vop1.py b/extra/assembly/amd/test/hw/test_vop1.py index a1042e72b5..215a2a2fbe 100644 --- a/extra/assembly/amd/test/hw/test_vop1.py +++ b/extra/assembly/amd/test/hw/test_vop1.py @@ -255,7 +255,7 @@ class TestF16Conversions(unittest.TestCase): def test_v_cvt_f16_f32_small(self): """V_CVT_F16_F32 converts small f32 value.""" - from extra.assembly.amd.pcode import f32_to_f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16 instructions = [ v_mov_b32_e32(v[0], 0.5), v_cvt_f16_f32_e32(v[1], v[0]), @@ -293,7 +293,7 @@ class TestF16Conversions(unittest.TestCase): def test_v_cvt_f16_f32_reads_full_32bit_source(self): """V_CVT_F16_F32 must read full 32-bit f32 source.""" - from extra.assembly.amd.pcode import _f16 + from extra.assembly.amd.test.hw.helpers import _f16 instructions = [ s_mov_b32(s[0], 0x3fc00000), # f32 1.5 v_mov_b32_e32(v[0], s[0]), @@ -348,6 +348,142 @@ class TestF16Conversions(unittest.TestCase): self.assertEqual(result, 1, f"Expected 1 from high bits, got {result}") +class TestF64Conversions(unittest.TestCase): + """Tests for f64 conversion instructions. 
Regression tests for f32_to_f64/f64_to_f32.""" + + def test_v_cvt_f64_f32_one(self): + """V_CVT_F64_F32 converts f32 1.0 to f64.""" + instructions = [ + s_mov_b32(s[0], f2i(1.0)), + v_mov_b32_e32(v[0], s[0]), + v_cvt_f64_f32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertAlmostEqual(result, 1.0, places=10) + + def test_v_cvt_f64_f32_negative(self): + """V_CVT_F64_F32 converts f32 -2.5 to f64.""" + instructions = [ + s_mov_b32(s[0], f2i(-2.5)), + v_mov_b32_e32(v[0], s[0]), + v_cvt_f64_f32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertAlmostEqual(result, -2.5, places=10) + + def test_v_cvt_f64_f32_pi(self): + """V_CVT_F64_F32 converts f32 pi to f64.""" + import math + instructions = [ + s_mov_b32(s[0], f2i(3.14159265)), + v_mov_b32_e32(v[0], s[0]), + v_cvt_f64_f32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertAlmostEqual(result, 3.14159265, places=5) + + def test_v_cvt_f64_f32_zero(self): + """V_CVT_F64_F32 converts f32 0.0 to f64.""" + instructions = [ + v_mov_b32_e32(v[0], 0), + v_cvt_f64_f32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertEqual(result, 0.0) + + def test_v_cvt_f32_f64_one(self): + """V_CVT_F32_F64 converts f64 1.0 to f32.""" + f64_bits = f2i64(1.0) + lo, hi = f64_bits & 0xFFFFFFFF, (f64_bits >> 32) & 0xFFFFFFFF + instructions = [ + s_mov_b32(s[0], lo), + s_mov_b32(s[1], hi), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_cvt_f32_f64_e32(v[2], v[0:1]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][2]) + self.assertAlmostEqual(result, 1.0, places=5) + + def test_v_cvt_f32_f64_negative(self): + """V_CVT_F32_F64 converts f64 -3.5 to f32.""" + f64_bits = f2i64(-3.5) + lo, hi = f64_bits & 0xFFFFFFFF, (f64_bits >> 32) & 0xFFFFFFFF + instructions = [ + s_mov_b32(s[0], lo), + s_mov_b32(s[1], hi), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_cvt_f32_f64_e32(v[2], v[0:1]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][2]) + self.assertAlmostEqual(result, -3.5, places=5) + + def test_v_cvt_f32_f64_large(self): + """V_CVT_F32_F64 converts large f64 to f32.""" + f64_bits = f2i64(123456.789) + lo, hi = f64_bits & 0xFFFFFFFF, (f64_bits >> 32) & 0xFFFFFFFF + instructions = [ + s_mov_b32(s[0], lo), + s_mov_b32(s[1], hi), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_cvt_f32_f64_e32(v[2], v[0:1]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][2]) + self.assertAlmostEqual(result, 123456.789, places=0) + + def test_v_cvt_f64_i32_positive(self): + """V_CVT_F64_I32 converts positive i32 to f64.""" + instructions = [ + s_mov_b32(s[0], 42), + v_mov_b32_e32(v[0], s[0]), + v_cvt_f64_i32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertAlmostEqual(result, 42.0, places=10) + + def test_v_cvt_f64_i32_negative(self): + """V_CVT_F64_I32 converts negative i32 to f64.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), # -1 as i32 + v_mov_b32_e32(v[0], s[0]), + v_cvt_f64_i32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertAlmostEqual(result, -1.0, 
places=10) + + def test_v_cvt_f64_u32_large(self): + """V_CVT_F64_U32 converts large u32 to f64.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), # max u32 + v_mov_b32_e32(v[0], s[0]), + v_cvt_f64_u32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertAlmostEqual(result, 4294967295.0, places=0) + + def test_v_cvt_f64_u32_zero(self): + """V_CVT_F64_U32 converts 0 to f64.""" + instructions = [ + v_mov_b32_e32(v[0], 0), + v_cvt_f64_u32_e32(v[2:3], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2]) + self.assertEqual(result, 0.0) + + class TestClz(unittest.TestCase): """Tests for V_CLZ_I32_U32 - count leading zeros.""" @@ -560,7 +696,7 @@ class TestCvtF16Modifiers(unittest.TestCase): def test_v_cvt_f32_f16_abs_negative(self): """V_CVT_F32_F16 with |abs| on negative value.""" - from extra.assembly.amd.pcode import f32_to_f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16 f16_neg1 = f32_to_f16(-1.0) # 0xbc00 instructions = [ s_mov_b32(s[0], f16_neg1), @@ -573,7 +709,7 @@ class TestCvtF16Modifiers(unittest.TestCase): def test_v_cvt_f32_f16_abs_positive(self): """V_CVT_F32_F16 with |abs| on positive value (should stay positive).""" - from extra.assembly.amd.pcode import f32_to_f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16 f16_2 = f32_to_f16(2.0) # 0x4000 instructions = [ s_mov_b32(s[0], f16_2), @@ -586,7 +722,7 @@ class TestCvtF16Modifiers(unittest.TestCase): def test_v_cvt_f32_f16_neg_positive(self): """V_CVT_F32_F16 with neg on positive value.""" - from extra.assembly.amd.pcode import f32_to_f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16 f16_2 = f32_to_f16(2.0) # 0x4000 instructions = [ s_mov_b32(s[0], f16_2), @@ -599,7 +735,7 @@ class TestCvtF16Modifiers(unittest.TestCase): def test_v_cvt_f32_f16_neg_negative(self): """V_CVT_F32_F16 with neg on negative value (double negative).""" - from extra.assembly.amd.pcode import f32_to_f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16 f16_neg2 = f32_to_f16(-2.0) # 0xc000 instructions = [ s_mov_b32(s[0], f16_neg2), @@ -612,7 +748,7 @@ class TestCvtF16Modifiers(unittest.TestCase): def test_v_cvt_f16_f32_then_pack_for_wmma(self): """CVT F32->F16 followed by pack (common WMMA pattern).""" - from extra.assembly.amd.pcode import _f16 + from extra.assembly.amd.test.hw.helpers import _f16 f32_val = 3.5 instructions = [ s_mov_b32(s[0], f2i(f32_val)), @@ -668,7 +804,7 @@ class TestConversionRounding(unittest.TestCase): def test_f16_to_f32_precision(self): """F16 to F32 conversion precision.""" - from extra.assembly.amd.pcode import f32_to_f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16 f16_val = f32_to_f16(1.5) instructions = [ s_mov_b32(s[0], f16_val), @@ -680,7 +816,7 @@ class TestConversionRounding(unittest.TestCase): def test_f16_denormal_to_f32(self): """F16 denormal converts to small positive f32.""" - from extra.assembly.amd.pcode import _f16 + from extra.assembly.amd.test.hw.helpers import _f16 f16_denorm = 0x0001 # Smallest positive f16 denormal instructions = [ v_mov_b32_e32(v[0], f16_denorm), @@ -1238,5 +1374,143 @@ class TestFloorEdgeCases(unittest.TestCase): self.assertAlmostEqual(i2f(st.vgpr[0][1]), -1.0, places=5) +class TestVop1F16HiHalf(unittest.TestCase): + """Regression tests for VOP1 f16 hi-half source operand handling. 
+ + For 16-bit VOP1 operations, when src0 is in the range v[128]+ (offset >= 384), + the hardware reads from the high 16 bits of v[src0-128]. The emulator must + extract bits [31:16] from the actual VGPR. + """ + + def test_v_cvt_f32_f16_src_hi_half(self): + """V_CVT_F32_F16 with source from hi-half (v[128]+). + + When src0 >= v[128], it reads from the high 16 bits of v[src0-128]. + This is critical for global_load_d16_hi_b16 + v_cvt_f32_f16 patterns. + + Regression test for: VOP1 f16 src0 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0) + s_mov_b32(s[0], 0x40003c00), + v_mov_b32_e32(v[0], s[0]), + # v_cvt_f32_f16 v[1], v[128] (reads hi half of v[0]) + # Should convert f16(2.0) to f32(2.0) + v_cvt_f32_f16_e32(v[1], v[128]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][1]) + self.assertAlmostEqual(result, 2.0, places=5, msg=f"Expected f32(2.0), got {result}") + + def test_v_cvt_f32_f16_src_lo_vs_hi(self): + """V_CVT_F32_F16 comparing lo and hi half reads. + + v[0] has different values in lo and hi halves. + v_cvt_f32_f16 v[1], v[0] should read lo (1.0) + v_cvt_f32_f16 v[2], v[128] should read hi (2.0) + + Regression test for: VOP1 f16 src0 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0) + s_mov_b32(s[0], 0x40003c00), + v_mov_b32_e32(v[0], s[0]), + # Read from lo half + v_cvt_f32_f16_e32(v[1], v[0]), + # Read from hi half + v_cvt_f32_f16_e32(v[2], v[128]), + ] + st = run_program(instructions, n_lanes=1) + result_lo = i2f(st.vgpr[0][1]) + result_hi = i2f(st.vgpr[0][2]) + self.assertAlmostEqual(result_lo, 1.0, places=5, msg=f"Expected f32(1.0) from lo, got {result_lo}") + self.assertAlmostEqual(result_hi, 2.0, places=5, msg=f"Expected f32(2.0) from hi, got {result_hi}") + + def test_v_cvt_i16_f16_src_hi_half(self): + """V_CVT_I16_F16 with source from hi-half. + + Regression test for: VOP1 f16 src0 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0xc000_3c00: hi=f16(-2.0), lo=f16(1.0) + s_mov_b32(s[0], 0xc0003c00), + v_mov_b32_e32(v[0], s[0]), + # v_cvt_i16_f16 v[1], v[128] (reads hi half of v[0]) + # Should convert f16(-2.0) to i16(-2) + v_cvt_i16_f16_e32(v[1], v[128]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xffff + expected = (-2) & 0xffff + self.assertEqual(result, expected, f"Expected i16(-2)=0x{expected:04x}, got 0x{result:04x}") + + def test_v_mov_b16_src_hi_half(self): + """V_MOV_B16 with source from hi-half. + + Regression test for: VOP1 f16 src0 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0xBEEF_DEAD: hi=0xBEEF, lo=0xDEAD + s_mov_b32(s[0], 0xBEEFDEAD), + v_mov_b32_e32(v[0], s[0]), + # v[1] = 0x0000_0000 initially + v_mov_b32_e32(v[1], 0), + # v_mov_b16 v[1], v[128] (reads hi half of v[0]) + # Should move 0xBEEF to v[1].lo + v_mov_b16_e32(v[1], v[128]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xffff + self.assertEqual(result, 0xBEEF, f"Expected 0xBEEF from hi half, got 0x{result:04x}") + + +class TestReciprocalF16(unittest.TestCase): + """Tests for V_RCP_F16 - reciprocal in half precision. + + The pcode uses a 16-bit float literal: D0.f16 = 16'1.0 / S0.f16 + This tests that the sized float literal (16'1.0) is correctly parsed. + """ + + def test_v_rcp_f16_one(self): + """V_RCP_F16: 1/1.0 = 1.0""" + import struct + def f16_to_bits(f): return struct.unpack('= 384 (v[128]+) wasn't extracting hi 16 bits + 2. 
VOP2 vdst >= 384 (v[128]+) wasn't preserving lo 16 bits + """ + + def test_v_add_f16_e32_vsrc1_hi_half(self): + """V_ADD_F16_E32 with vsrc1 from hi-half (v[128]+). + + When vsrc1 >= 384 (representing v[128]+), the hardware reads from the hi 16 bits + of v[vsrc1-128]. The emulator must extract bits [31:16] from the actual VGPR. + + Regression test for: VOP2 f16 vsrc1 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0) + s_mov_b32(s[0], 0x40003c00), + v_mov_b32_e32(v[0], s[0]), + # v_add_f16_e32 v[1], v[0], v[128] (vsrc1=v[128] reads hi of v[0]) + # In VOP2 encoding, vsrc1=384 means v[128], which maps to v[0].hi + # v[1] = v[0].lo + v[0].hi = 1.0 + 2.0 = 3.0 + VOP2(VOP2Op.V_ADD_F16, vdst=v[1], src0=v[0], vsrc1=v[128]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xffff + # 1.0 + 2.0 = 3.0, f16 3.0 = 0x4200 + self.assertEqual(result, 0x4200, f"Expected f16(3.0)=0x4200, got 0x{result:04x}") + + def test_v_mul_f16_e32_vsrc1_hi_half(self): + """V_MUL_F16_E32 with vsrc1 from hi-half. + + Regression test for: VOP2 f16 vsrc1 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4200_4000: hi=f16(3.0), lo=f16(2.0) + s_mov_b32(s[0], 0x42004000), + v_mov_b32_e32(v[0], s[0]), + # v_mul_f16_e32 v[1], v[0], v[128] (vsrc1=v[128] reads hi of v[0]) + # v[1] = v[0].lo * v[0].hi = 2.0 * 3.0 = 6.0 + VOP2(VOP2Op.V_MUL_F16, vdst=v[1], src0=v[0], vsrc1=v[128]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xffff + # 2.0 * 3.0 = 6.0, f16 6.0 = 0x4600 + self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}") + + def test_v_add_f16_e32_vdst_hi_half(self): + """V_ADD_F16_E32 writing to hi-half destination (v[128]+). + + When vdst >= 384 (representing v[128]+), the hardware writes to bits [31:16] + of v[vdst-128] while preserving bits [15:0]. The emulator must merge the result. + + Regression test for: VOP2 f16 vdst hi-half write bug. + """ + instructions = [ + # v[0] = 0x0000_BEEF: lo has marker value + s_mov_b32(s[0], 0x0000BEEF), + v_mov_b32_e32(v[0], s[0]), + # v[1] = f16(1.0), v[2] = f16(2.0) + s_mov_b32(s[1], 0x3c00), + s_mov_b32(s[2], 0x4000), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + # v_add_f16_e32 v[128], v[1], v[2] (vdst=v[128] writes hi of v[0]) + # v[0].hi = 1.0 + 2.0 = 3.0, v[0].lo preserved = 0xBEEF + VOP2(VOP2Op.V_ADD_F16, vdst=v[128], src0=v[1], vsrc1=v[2]), + ] + st = run_program(instructions, n_lanes=1) + hi = (st.vgpr[0][0] >> 16) & 0xffff + lo = st.vgpr[0][0] & 0xffff + # hi = 3.0 = 0x4200, lo preserved = 0xBEEF + self.assertEqual(hi, 0x4200, f"Expected hi=f16(3.0)=0x4200, got 0x{hi:04x}") + self.assertEqual(lo, 0xBEEF, f"Expected lo preserved=0xBEEF, got 0x{lo:04x}") + + def test_v_mul_f16_e32_vdst_hi_half(self): + """V_MUL_F16_E32 writing to hi-half destination. + + Regression test for: VOP2 f16 vdst hi-half write bug. 
+ """ + instructions = [ + # v[0] = 0x0000_DEAD: lo has marker value + s_mov_b32(s[0], 0x0000DEAD), + v_mov_b32_e32(v[0], s[0]), + # v[1] = f16(2.0), v[2] = f16(4.0) + s_mov_b32(s[1], 0x4000), + s_mov_b32(s[2], 0x4400), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + # v_mul_f16_e32 v[128], v[1], v[2] (vdst=v[128] writes hi of v[0]) + # v[0].hi = 2.0 * 4.0 = 8.0, v[0].lo preserved = 0xDEAD + VOP2(VOP2Op.V_MUL_F16, vdst=v[128], src0=v[1], vsrc1=v[2]), + ] + st = run_program(instructions, n_lanes=1) + hi = (st.vgpr[0][0] >> 16) & 0xffff + lo = st.vgpr[0][0] & 0xffff + # hi = 8.0 = 0x4800, lo preserved = 0xDEAD + self.assertEqual(hi, 0x4800, f"Expected hi=f16(8.0)=0x4800, got 0x{hi:04x}") + self.assertEqual(lo, 0xDEAD, f"Expected lo preserved=0xDEAD, got 0x{lo:04x}") + + def test_v_add_f16_e32_both_hi_half(self): + """V_ADD_F16_E32 with both vsrc1 and vdst as hi-half (different underlying regs). + + Tests the combination of both fixes: reading vsrc1 from hi-half AND + writing result to hi-half destination, using different underlying VGPRs. + + Regression test for: VOP2 f16 hi-half bugs (combined). + """ + instructions = [ + # v[0] = 0x4000_xxxx: hi=f16(2.0) for vsrc1 + s_mov_b32(s[0], 0x40000000), + v_mov_b32_e32(v[0], s[0]), + # v[1] = 0x0000_3c00: lo=f16(1.0) for src0 + s_mov_b32(s[1], 0x00003c00), + v_mov_b32_e32(v[1], s[1]), + # v[2] = 0x0000_CAFE: lo=marker for vdst preservation + s_mov_b32(s[2], 0x0000CAFE), + v_mov_b32_e32(v[2], s[2]), + # v_add_f16_e32 v[130], v[1], v[128] + # src0 = v[1].lo = 1.0 + # vsrc1 = v[128] reads v[0].hi = 2.0 + # result = 1.0 + 2.0 = 3.0 + # vdst = v[130] writes to v[2].hi, preserving v[2].lo + VOP2(VOP2Op.V_ADD_F16, vdst=v[130], src0=v[1], vsrc1=v[128]), + ] + st = run_program(instructions, n_lanes=1) + hi = (st.vgpr[0][2] >> 16) & 0xffff + lo = st.vgpr[0][2] & 0xffff + # hi = 3.0 = 0x4200, lo preserved = 0xCAFE + self.assertEqual(hi, 0x4200, f"Expected hi=f16(3.0)=0x4200, got 0x{hi:04x}") + self.assertEqual(lo, 0xCAFE, f"Expected lo preserved=0xCAFE, got 0x{lo:04x}") + + def test_v_fmac_f16_e32_vsrc1_hi_half(self): + """V_FMAC_F16_E32 with vsrc1 from hi-half. + + V_FMAC_F16: vdst = vdst + src0 * vsrc1 + + Regression test for: VOP2 f16 vsrc1 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0) + s_mov_b32(s[0], 0x40003c00), + v_mov_b32_e32(v[0], s[0]), + # v[1] = f16(3.0) = 0x4200 + s_mov_b32(s[1], 0x4200), + v_mov_b32_e32(v[1], s[1]), + # v_fmac_f16_e32 v[1], v[0], v[128] + # vdst = v[1] = 3.0 + v[0].lo * v[0].hi = 3.0 + 1.0 * 2.0 = 5.0 + VOP2(VOP2Op.V_FMAC_F16, vdst=v[1], src0=v[0], vsrc1=v[128]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xffff + # 3.0 + 1.0 * 2.0 = 5.0, f16 5.0 = 0x4500 + self.assertEqual(result, 0x4500, f"Expected f16(5.0)=0x4500, got 0x{result:04x}") + + def test_v_fmac_f16_e32_vdst_hi_half(self): + """V_FMAC_F16_E32 writing to hi-half destination. + + V_FMAC_F16: vdst.h = vdst.h + src0 * vsrc1 + + When vdst is v[128]+, the accumulator D0 must also read from the hi-half. + This tests the bug where D0 was read from lo-half instead of hi-half. + + Regression test for: VOP2 FMAC hi-half D0 accumulator read bug. 
+ """ + instructions = [ + # v[0] = 0x3800_DEAD: hi=f16(0.5), lo=marker (0xDEAD) + s_mov_b32(s[0], 0x3800DEAD), + v_mov_b32_e32(v[0], s[0]), + # v[1] = f16(2.0) = 0x4000 + s_mov_b32(s[1], 0x4000), + v_mov_b32_e32(v[1], s[1]), + # v[2] = f16(3.0) = 0x4200 + s_mov_b32(s[2], 0x4200), + v_mov_b32_e32(v[2], s[2]), + # v_fmac_f16_e32 v[128], v[1], v[2] + # vdst = v[128] means v[0].hi + # D0 = v[0].hi = 0.5 + # result = D0 + src0 * vsrc1 = 0.5 + 2.0 * 3.0 = 6.5 + # v[0].hi = 6.5, v[0].lo preserved = 0xDEAD + VOP2(VOP2Op.V_FMAC_F16, vdst=v[128], src0=v[1], vsrc1=v[2]), + ] + st = run_program(instructions, n_lanes=1) + hi = (st.vgpr[0][0] >> 16) & 0xffff + lo = st.vgpr[0][0] & 0xffff + # hi = 6.5 = 0x4680, lo preserved = 0xDEAD + self.assertEqual(hi, 0x4680, f"Expected hi=f16(6.5)=0x4680, got 0x{hi:04x}") + self.assertEqual(lo, 0xDEAD, f"Expected lo preserved=0xDEAD, got 0x{lo:04x}") + + def test_v_mul_f16_e32_src0_hi_half(self): + """V_MUL_F16_E32 with src0 from hi-half (src0 >= v[128]). + + When src0 >= 384 (representing v[128]+), the hardware reads from the hi 16 bits + of v[src0-128]. The emulator must extract bits [31:16] from the actual VGPR. + + Regression test for: VOP2 f16 src0 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0) + s_mov_b32(s[0], 0x40003c00), + v_mov_b32_e32(v[0], s[0]), + # v[1] = f16(3.0) = 0x4200 + s_mov_b32(s[1], 0x4200), + v_mov_b32_e32(v[1], s[1]), + # v_mul_f16_e32 v[2], v[128], v[1] + # src0 = v[128] reads from v[0].hi = 2.0 + # result = 2.0 * 3.0 = 6.0 + VOP2(VOP2Op.V_MUL_F16, vdst=v[2], src0=v[128], vsrc1=v[1]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][2] & 0xffff + # 2.0 * 3.0 = 6.0, f16 6.0 = 0x4600 + self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}") + + def test_v_add_f16_e32_src0_hi_half(self): + """V_ADD_F16_E32 with src0 from hi-half (src0 >= v[128]). + + Regression test for: VOP2 f16 src0 hi-half extraction bug. + """ + instructions = [ + # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0) + s_mov_b32(s[0], 0x40003c00), + v_mov_b32_e32(v[0], s[0]), + # v[1] = f16(5.0) = 0x4500 + s_mov_b32(s[1], 0x4500), + v_mov_b32_e32(v[1], s[1]), + # v_add_f16_e32 v[2], v[128], v[1] + # src0 = v[128] reads from v[0].hi = 2.0 + # result = 2.0 + 5.0 = 7.0 + VOP2(VOP2Op.V_ADD_F16, vdst=v[2], src0=v[128], vsrc1=v[1]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][2] & 0xffff + # 2.0 + 5.0 = 7.0, f16 7.0 = 0x4700 + self.assertEqual(result, 0x4700, f"Expected f16(7.0)=0x4700, got 0x{result:04x}") + + +class TestF16InlineConstants(unittest.TestCase): + """Regression tests for VOP2 F16 inline float constants. + + For 16-bit VOP2 operations (v_add_f16, v_mul_f16, etc.), inline float constants + like 1.0, 2.0 must use F16 encoding (0x3c00, 0x4000) not F32 encoding (0x3f800000). + + The emulator's rsrc() function needs bits=16 to select F16_INLINE constants. + + Regression test for: VOP2 16-bit inline constant using F32 instead of F16. 
+ """ + + def test_v_add_f16_inline_constant_1_0(self): + """V_ADD_F16_E32 with inline constant 1.0 should use F16 encoding.""" + instructions = [ + s_mov_b32(s[0], 0x3c00), # f16 1.0 + v_mov_b32_e32(v[0], s[0]), + # v_add_f16_e32 v[1], 1.0, v[0] -- 1.0 must be F16 0x3c00, not F32 0x3f800000 + v_add_f16_e32(v[1], 1.0, v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xFFFF + # 1.0 + 1.0 = 2.0, f16 2.0 = 0x4000 + self.assertEqual(result, 0x4000, f"Expected f16(2.0)=0x4000, got 0x{result:04x}") + + def test_v_add_f16_inline_constant_2_0(self): + """V_ADD_F16_E32 with inline constant 2.0.""" + instructions = [ + s_mov_b32(s[0], 0x4200), # f16 3.0 + v_mov_b32_e32(v[0], s[0]), + v_add_f16_e32(v[1], 2.0, v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xFFFF + # 2.0 + 3.0 = 5.0, f16 5.0 = 0x4500 + self.assertEqual(result, 0x4500, f"Expected f16(5.0)=0x4500, got 0x{result:04x}") + + def test_v_mul_f16_inline_constant(self): + """V_MUL_F16_E32 with inline constant 2.0.""" + instructions = [ + s_mov_b32(s[0], 0x4200), # f16 3.0 + v_mov_b32_e32(v[0], s[0]), + v_mul_f16_e32(v[1], 2.0, v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] & 0xFFFF + # 2.0 * 3.0 = 6.0, f16 6.0 = 0x4600 + self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}") + + class TestCndmask(unittest.TestCase): """Tests for V_CNDMASK_B32 and V_CNDMASK_B16.""" @@ -447,5 +734,132 @@ class TestSpecialFloatValues(unittest.TestCase): self.assertEqual(st.vgpr[0][1], 0x00000000) +class TestCarryOps(unittest.TestCase): + """Tests for VOP2 carry instructions (v_add_co_ci_u32, v_sub_co_ci_u32, v_subrev_co_ci_u32).""" + + def test_v_subrev_co_ci_u32_no_borrow(self): + """V_SUBREV_CO_CI_U32: D0 = S1 - S0 - VCC_IN, when VCC_IN=0.""" + instructions = [ + s_mov_b32(VCC_LO, 0), # VCC = 0 (no borrow in) + v_mov_b32_e32(v[0], 5), # S0 = 5 + v_mov_b32_e32(v[1], 10), # S1 = 10 + v_subrev_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 10 - 5 - 0 = 5 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 5) + self.assertEqual(st.vcc, 0) # No borrow out + + def test_v_subrev_co_ci_u32_with_borrow(self): + """V_SUBREV_CO_CI_U32: D0 = S1 - S0 - VCC_IN, when VCC_IN=1.""" + instructions = [ + s_mov_b32(VCC_LO, 1), # VCC = 1 (borrow in) + v_mov_b32_e32(v[0], 5), # S0 = 5 + v_mov_b32_e32(v[1], 10), # S1 = 10 + v_subrev_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 10 - 5 - 1 = 4 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 4) + self.assertEqual(st.vcc, 0) # No borrow out + + def test_v_subrev_co_ci_u32_generates_borrow(self): + """V_SUBREV_CO_CI_U32: generates borrow when S0 + VCC_IN > S1.""" + instructions = [ + s_mov_b32(VCC_LO, 0), # VCC = 0 + v_mov_b32_e32(v[0], 10), # S0 = 10 + v_mov_b32_e32(v[1], 5), # S1 = 5 + v_subrev_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 5 - 10 - 0 = -5 (underflow) + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0xFFFFFFFB) # -5 as unsigned + self.assertEqual(st.vcc, 1) # Borrow out + + def test_v_add_co_ci_u32_no_carry(self): + """V_ADD_CO_CI_U32: D0 = S0 + S1 + VCC_IN, when VCC_IN=0.""" + instructions = [ + s_mov_b32(VCC_LO, 0), # VCC = 0 (no carry in) + v_mov_b32_e32(v[0], 5), # S0 = 5 + v_mov_b32_e32(v[1], 10), # S1 = 10 + v_add_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 5 + 10 + 0 = 15 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 15) + self.assertEqual(st.vcc, 0) # No carry out + + def 
test_v_add_co_ci_u32_with_carry(self): + """V_ADD_CO_CI_U32: D0 = S0 + S1 + VCC_IN, when VCC_IN=1.""" + instructions = [ + s_mov_b32(VCC_LO, 1), # VCC = 1 (carry in) + v_mov_b32_e32(v[0], 5), # S0 = 5 + v_mov_b32_e32(v[1], 10), # S1 = 10 + v_add_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 5 + 10 + 1 = 16 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 16) + self.assertEqual(st.vcc, 0) # No carry out + + def test_v_add_co_ci_u32_generates_carry(self): + """V_ADD_CO_CI_U32: generates carry when overflow occurs.""" + instructions = [ + s_mov_b32(VCC_LO, 1), # VCC = 1 (carry in) + s_mov_b32(s[0], 0xFFFFFFFF), # max u32 + v_mov_b32_e32(v[0], s[0]), # S0 = 0xFFFFFFFF + v_mov_b32_e32(v[1], 0), # S1 = 0 + v_add_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 0xFFFFFFFF + 0 + 1 = 0 (overflow) + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0) # Overflowed to 0 + self.assertEqual(st.vcc, 1) # Carry out + + def test_v_sub_co_ci_u32_no_borrow(self): + """V_SUB_CO_CI_U32: D0 = S0 - S1 - VCC_IN, when VCC_IN=0.""" + instructions = [ + s_mov_b32(VCC_LO, 0), # VCC = 0 (no borrow in) + v_mov_b32_e32(v[0], 10), # S0 = 10 + v_mov_b32_e32(v[1], 5), # S1 = 5 + v_sub_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 10 - 5 - 0 = 5 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 5) + self.assertEqual(st.vcc, 0) # No borrow out + + def test_v_sub_co_ci_u32_vop3sd_separate_carry_regs(self): + """VOP3SD V_SUB_CO_CI_U32: carry-in from src2, carry-out to sdst (separate registers). + + This tests the VOP3SD encoding where src2 specifies the carry-in register + independently from sdst (carry-out). The bug was reading carry-in from sdst + instead of src2. + + Computation: D0 = S0 - S1 - carry_in = 0 - 0 - 1 = -1 = 0xFFFFFFFF + """ + instructions = [ + s_mov_b32(s[6], 1), # carry-in = 1 (in s[6]) + s_mov_b32(s[10], 0), # carry-out dest = 0 initially (in s[10]) + # VOP3SD: v_sub_co_ci_u32(vdst, sdst, src0, src1, src2) + # src2 is carry-in (s[6]=1), sdst is carry-out (s[10]) + v_sub_co_ci_u32(v[0], s[10], 0, 0, s[6]), # D0 = 0 - 0 - 1 = -1 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xFFFFFFFF) # -1 as unsigned + self.assertEqual(st.sgpr[10], 1) # Borrow out to s[10] + + def test_v_add_co_ci_u32_vop3sd_separate_carry_regs(self): + """VOP3SD V_ADD_CO_CI_U32: carry-in from src2, carry-out to sdst (separate registers). + + This tests the VOP3SD encoding where src2 specifies the carry-in register + independently from sdst (carry-out). + + Computation: D0 = S0 + S1 + carry_in = 5 + 10 + 1 = 16 + """ + instructions = [ + s_mov_b32(s[6], 1), # carry-in = 1 (in s[6]) + s_mov_b32(s[10], 0), # carry-out dest = 0 initially (in s[10]) + # VOP3SD: v_add_co_ci_u32(vdst, sdst, src0, src1, src2) + v_add_co_ci_u32(v[0], s[10], 5, 10, s[6]), # D0 = 5 + 10 + 1 = 16 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 16) + self.assertEqual(st.sgpr[10], 0) # No carry out + + if __name__ == '__main__': unittest.main() diff --git a/extra/assembly/amd/test/hw/test_vop3.py b/extra/assembly/amd/test/hw/test_vop3.py index 7797312edb..9a4bbfa3a9 100644 --- a/extra/assembly/amd/test/hw/test_vop3.py +++ b/extra/assembly/amd/test/hw/test_vop3.py @@ -58,6 +58,95 @@ class TestFMA(unittest.TestCase): self.assertTrue(math.isinf(result) and result > 0) +class TestFmacE64(unittest.TestCase): + """Regression tests for V_FMAC_F32 VOP3 encoding (e64). 
+ + V_FMAC_F32: D0 = D0 + S0 * S1 (fused multiply-add with accumulator) + + The VOP3 encoding needs to read D0 from the destination register as the + accumulator input, not just write to it. + + Regression test for: VOP3 FMAC missing D0 accumulator bug. + """ + + def test_v_fmac_f32_e64_basic(self): + """V_FMAC_F32_E64: basic accumulate test.""" + instructions = [ + v_mov_b32_e32(v[0], 2.0), # S0 = 2.0 + v_mov_b32_e32(v[1], 3.0), # S1 = 3.0 + v_mov_b32_e32(v[2], 1.0), # D0 (accumulator) = 1.0 + # v_fmac_f32_e64 v[2], v[0], v[1] + # D0 = D0 + S0 * S1 = 1.0 + 2.0 * 3.0 = 7.0 + v_fmac_f32_e64(v[2], v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), 7.0, places=5) + + def test_v_fmac_f32_e64_with_sgpr_sources(self): + """V_FMAC_F32_E64 with SGPR sources (common in AMD_LLVM output). + + This tests the exact pattern that was failing: v_fmac_f32_e64(v[0], s[4], 0) + where src0 is SGPR and src1 is inline constant 0. + + Regression test for: VOP3 FMAC missing D0 accumulator bug. + """ + instructions = [ + s_mov_b32(s[4], f2i(2.0)), # S0 = 2.0 in SGPR + v_mov_b32_e32(v[0], 5.0), # D0 (accumulator) = 5.0 + # v_fmac_f32_e64 v[0], s[4], 0 + # D0 = D0 + S0 * S1 = 5.0 + 2.0 * 0.0 = 5.0 + v_fmac_f32_e64(v[0], s[4], 0), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][0]), 5.0, places=5) + + def test_v_fmac_f32_e64_with_two_sgprs(self): + """V_FMAC_F32_E64 with two SGPR sources. + + Tests pattern: v_fmac_f32_e64(v[0], s[a], s[b]) + + Regression test for: VOP3 FMAC missing D0 accumulator bug. + """ + instructions = [ + s_mov_b32(s[10], f2i(3.0)), # S0 = 3.0 + s_mov_b32(s[12], f2i(4.0)), # S1 = 4.0 + v_mov_b32_e32(v[9], 2.0), # D0 (accumulator) = 2.0 + # v_fmac_f32_e64 v[9], s[10], s[12] + # D0 = D0 + S0 * S1 = 2.0 + 3.0 * 4.0 = 14.0 + v_fmac_f32_e64(v[9], s[10], s[12]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][9]), 14.0, places=5) + + def test_v_fmac_f32_e64_accumulates_correctly(self): + """V_FMAC_F32_E64 accumulates multiple times.""" + instructions = [ + v_mov_b32_e32(v[0], 0.0), # D0 = 0.0 + v_mov_b32_e32(v[1], 1.0), # S0 = 1.0 + v_mov_b32_e32(v[2], 2.0), # S1 = 2.0 + # First: D0 = 0.0 + 1.0 * 2.0 = 2.0 + v_fmac_f32_e64(v[0], v[1], v[2]), + # Second: D0 = 2.0 + 1.0 * 2.0 = 4.0 + v_fmac_f32_e64(v[0], v[1], v[2]), + # Third: D0 = 4.0 + 1.0 * 2.0 = 6.0 + v_fmac_f32_e64(v[0], v[1], v[2]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][0]), 6.0, places=5) + + def test_v_fmac_f32_e64_negative_accumulator(self): + """V_FMAC_F32_E64 with negative accumulator.""" + instructions = [ + v_mov_b32_e32(v[0], 2.0), # S0 = 2.0 + v_mov_b32_e32(v[1], 3.0), # S1 = 3.0 + v_mov_b32_e32(v[2], -10.0), # D0 (accumulator) = -10.0 + # D0 = -10.0 + 2.0 * 3.0 = -4.0 + v_fmac_f32_e64(v[2], v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), -4.0, places=5) + + class TestDivScale(unittest.TestCase): """Tests for V_DIV_SCALE_F32.""" @@ -768,7 +857,7 @@ class TestF16Modifiers(unittest.TestCase): def test_v_fma_f16_inline_const_1_0(self): """V_FMA_F16: a*b + 1.0 should use f16 inline constant.""" - from extra.assembly.amd.pcode import f32_to_f16, _f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16 f16_a = f32_to_f16(0.325928) # ~0x3537 f16_b = f32_to_f16(-0.486572) # ~0xb7c9 instructions = [ @@ -785,7 +874,7 @@ class TestF16Modifiers(unittest.TestCase): def 
test_v_fma_f16_inline_const_0_5(self): """V_FMA_F16: a*b + 0.5 should use f16 inline constant.""" - from extra.assembly.amd.pcode import f32_to_f16, _f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16 f16_a = f32_to_f16(2.0) f16_b = f32_to_f16(3.0) instructions = [ @@ -802,7 +891,7 @@ class TestF16Modifiers(unittest.TestCase): def test_v_fma_f16_inline_const_neg_1_0(self): """V_FMA_F16: a*b + (-1.0) should use f16 inline constant.""" - from extra.assembly.amd.pcode import f32_to_f16, _f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16 f16_a = f32_to_f16(2.0) f16_b = f32_to_f16(3.0) instructions = [ @@ -819,7 +908,7 @@ class TestF16Modifiers(unittest.TestCase): def test_v_add_f16_abs_both(self): """V_ADD_F16 with abs on both operands.""" - from extra.assembly.amd.pcode import f32_to_f16, _f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16 f16_neg2 = f32_to_f16(-2.0) f16_neg3 = f32_to_f16(-3.0) instructions = [ @@ -835,7 +924,7 @@ class TestF16Modifiers(unittest.TestCase): def test_v_mul_f16_neg_abs(self): """V_MUL_F16 with neg on one operand and abs on another.""" - from extra.assembly.amd.pcode import f32_to_f16, _f16 + from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16 f16_2 = f32_to_f16(2.0) f16_neg3 = f32_to_f16(-3.0) instructions = [ @@ -854,7 +943,7 @@ class TestF16Modifiers(unittest.TestCase): This tests the case from AMD_LLVM sin(0) where V_FMAC_F16 writes to v0.h. """ - from extra.assembly.amd.pcode import _f16 + from extra.assembly.amd.test.hw.helpers import _f16 instructions = [ s_mov_b32(s[0], 0x38003c00), # v0 = {hi=0.5, lo=1.0} v_mov_b32_e32(v[0], s[0]), @@ -1621,6 +1710,27 @@ class TestCarryBorrow(unittest.TestCase): self.assertEqual(st.vgpr[0][4], 0x00000000, "lo result") self.assertEqual(st.vgpr[0][5], 0x00000003, "hi result") + def test_add_co_u32_same_dst_src(self): + """V_ADD_CO_U32 where dst is same as src - VCC must use original src value.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), + v_mov_b32_e32(v[0], s[0]), + v_add_co_u32(v[0], VCC, v[0], 1), # v[0] = v[0] + 1, VCC should be set from overflow + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0, "0xFFFFFFFF + 1 = 0") + self.assertEqual(st.vcc & 1, 1, "Should have carry from 0xFFFFFFFF + 1") + + def test_add_co_u32_same_dst_src_no_carry(self): + """V_ADD_CO_U32 where dst is same as src - no carry case.""" + instructions = [ + v_mov_b32_e32(v[0], 100), + v_add_co_u32(v[0], VCC, v[0], 1), # v[0] = v[0] + 1 + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 101, "100 + 1 = 101") + self.assertEqual(st.vcc & 1, 0, "No carry from 100 + 1") + class TestReadlane(unittest.TestCase): """Tests for V_READLANE_B32 and related cross-lane operations.""" @@ -2292,5 +2402,414 @@ class TestAddF32EdgeCases(unittest.TestCase): self.assertEqual(st.vgpr[0][2], 0x80000000) # -0 +class TestDivScaleF64(unittest.TestCase): + """Tests for V_DIV_SCALE_F64 - critical for tan() and division. + + These tests verify that VCC bits are set independently per lane, + which is essential for correct multi-lane f64 division operations. 
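+
+    A minimal sketch of the per-lane update being verified (names here are
+    illustrative, not the emulator's real API):
+
+        for lane in active_lanes:
+            if needs_scaling(lane):          # e.g. zero or denormal denominator
+                vcc |= (1 << lane)           # set only this lane's bit
+            else:
+                vcc &= ~(1 << lane)          # never clobber other lanes' bits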
+ """ + + def test_div_scale_f64_basic_no_scaling(self): + """V_DIV_SCALE_F64: normal values with no scaling needed.""" + sqrt2 = f2i64(1.4142135623730951) + one = f2i64(1.0) + instructions = [ + s_mov_b32(s[0], sqrt2 & 0xffffffff), + s_mov_b32(s[1], sqrt2 >> 32), + s_mov_b32(s[2], one & 0xffffffff), + s_mov_b32(s[3], one >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + v_mov_b32_e32(v[3], s[3]), + VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[0:1], src2=v[2:3]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32)) + self.assertAlmostEqual(result, 1.4142135623730951, places=10) + self.assertEqual(st.vcc & 1, 0, "VCC should be 0 when no scaling needed") + + def test_div_scale_f64_vcc_per_lane_uniform_input(self): + """V_DIV_SCALE_F64: VCC bits should be set independently per lane (uniform input). + + This is a regression test for the bug where VCC = 0x0LL was setting the whole + 64-bit VCC register instead of just the current lane's bit. With uniform input + all lanes should get VCC=0. + """ + val = f2i64(2.0) + instructions = [ + s_mov_b32(s[0], val & 0xffffffff), + s_mov_b32(s[1], val >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[2:3], sdst=VCC, src0=v[0:1], src1=v[0:1], src2=v[0:1]), + ] + st = run_program(instructions, n_lanes=4) + # All lanes should have VCC=0 for normal values + self.assertEqual(st.vcc & 0xf, 0, "All lanes should have VCC=0 for normal values") + # All lanes should have same result + for lane in range(4): + result = i642f(st.vgpr[lane][2] | (st.vgpr[lane][3] << 32)) + self.assertAlmostEqual(result, 2.0, places=10, msg=f"Lane {lane} result mismatch") + + def test_div_scale_f64_vcc_per_lane_varying_input(self): + """V_DIV_SCALE_F64: VCC bits set per-lane with different inputs per lane. + + This test uses different inputs per lane to verify that VCC is tracked + independently. This catches the bug where the emulator was setting VCC + for all lanes to the same value. + """ + import math + # Use lane-varying input: lane 0 gets 2.0, lane 1 gets 3.0, etc. 
+ # All normal values should result in VCC=0 for each lane + instructions = [ + # Set up per-lane values using lane_id + v_cvt_f64_i32_e32(v[0:1], v[255]), # v0:1 = f64(lane_id) + v_add_f64(v[0:1], v[0:1], SrcEnum.POS_TWO), # v0:1 = lane_id + 2.0 + VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[2:3], sdst=VCC, src0=v[0:1], src1=v[0:1], src2=v[0:1]), + ] + st = run_program(instructions, n_lanes=4) + # All lanes should have VCC=0 (no scaling needed for 2.0, 3.0, 4.0, 5.0) + self.assertEqual(st.vcc & 0xf, 0, "All lanes should have VCC=0 for normal values") + # Verify each lane has correct result + for lane in range(4): + expected = float(lane) + 2.0 + result = i642f(st.vgpr[lane][2] | (st.vgpr[lane][3] << 32)) + self.assertAlmostEqual(result, expected, places=10, msg=f"Lane {lane}: expected {expected}, got {result}") + + def test_div_scale_f64_zero_denom_sets_vcc(self): + """V_DIV_SCALE_F64: zero denominator -> NaN, VCC=1.""" + import math + one = f2i64(1.0) + zero = f2i64(0.0) + instructions = [ + s_mov_b32(s[0], one & 0xffffffff), + s_mov_b32(s[1], one >> 32), + s_mov_b32(s[2], zero & 0xffffffff), + s_mov_b32(s[3], zero >> 32), + v_mov_b32_e32(v[0], s[0]), # numer = 1.0 + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), # denom = 0.0 + v_mov_b32_e32(v[3], s[3]), + VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[2:3], src2=v[0:1]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32)) + self.assertTrue(math.isnan(result), "Should be NaN for zero denom") + self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for zero denom") + + def test_div_scale_f64_mixed_vcc_per_lane(self): + """V_DIV_SCALE_F64: some lanes need scaling, others don't. + + This is the key test for the tan() bug - it verifies that VCC is set + correctly for each lane independently when some lanes need scaling and + others don't. 
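+
+        Expected mask for this setup (a sketch): only the zero-denominator
+        lanes raise their bit, so
+
+            expected_vcc = (1 << 1) | (1 << 3)   # == 0b1010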
+ """ + import math + # Lane 0: normal value (VCC=0), Lane 1: zero denom (VCC=1) + # Lane 2: normal value (VCC=0), Lane 3: zero denom (VCC=1) + normal = f2i64(2.0) + zero = f2i64(0.0) + instructions = [ + # Set up numer = 2.0 for all lanes + s_mov_b32(s[0], normal & 0xffffffff), + s_mov_b32(s[1], normal >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + # Set up denom: lane 0,2 get 2.0, lane 1,3 get 0.0 + s_mov_b32(s[2], zero & 0xffffffff), + s_mov_b32(s[3], zero >> 32), + v_mov_b32_e32(v[2], s[0]), # default to 2.0 + v_mov_b32_e32(v[3], s[1]), + # Override lanes 1 and 3 with 0.0 using writelane + v_writelane_b32(v[2], s[2], 1), + v_writelane_b32(v[3], s[3], 1), + v_writelane_b32(v[2], s[2], 3), + v_writelane_b32(v[3], s[3], 3), + VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[2:3], src2=v[0:1]), + ] + st = run_program(instructions, n_lanes=4) + # Lanes 0,2 should have VCC=0 (normal), lanes 1,3 should have VCC=1 (zero denom) + self.assertEqual(st.vcc & 0b0001, 0, "Lane 0 VCC should be 0") + self.assertEqual(st.vcc & 0b0010, 0b0010, "Lane 1 VCC should be 1") + self.assertEqual(st.vcc & 0b0100, 0, "Lane 2 VCC should be 0") + self.assertEqual(st.vcc & 0b1000, 0b1000, "Lane 3 VCC should be 1") + + # Check results + for lane in [0, 2]: + result = i642f(st.vgpr[lane][4] | (st.vgpr[lane][5] << 32)) + self.assertAlmostEqual(result, 2.0, places=10, msg=f"Lane {lane} should be 2.0") + for lane in [1, 3]: + result = i642f(st.vgpr[lane][4] | (st.vgpr[lane][5] << 32)) + self.assertTrue(math.isnan(result), f"Lane {lane} should be NaN") + + +class TestDivFmasF64(unittest.TestCase): + """Tests for V_DIV_FMAS_F64 - scaling FMA for f64 division. + + These tests verify that V_DIV_FMAS applies the correct scaling + based on VCC per lane, which is essential for correct tan() results. 
+ """ + + def test_div_fmas_f64_no_scale_vcc0(self): + """V_DIV_FMAS_F64: VCC=0 -> normal FMA, no scaling.""" + a = f2i64(2.0) + b = f2i64(3.0) + c = f2i64(1.0) + instructions = [ + s_mov_b32(VCC_LO, 0), + s_mov_b32(s[0], a & 0xffffffff), + s_mov_b32(s[1], a >> 32), + s_mov_b32(s[2], b & 0xffffffff), + s_mov_b32(s[3], b >> 32), + s_mov_b32(s[4], c & 0xffffffff), + s_mov_b32(s[5], c >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + v_mov_b32_e32(v[3], s[3]), + v_mov_b32_e32(v[4], s[4]), + v_mov_b32_e32(v[5], s[5]), + v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f(st.vgpr[0][6] | (st.vgpr[0][7] << 32)) + expected = 2.0 * 3.0 + 1.0 # = 7.0 + self.assertAlmostEqual(result, expected, places=10) + + def test_div_fmas_f64_scale_up_vcc1_large_s2(self): + """V_DIV_FMAS_F64: VCC=1 with S2 exponent > 1023 -> scale by 2^+128.""" + a = f2i64(1.0) + b = f2i64(1.0) + c = f2i64(2.0) # exponent = 1024 > 1023, so scale UP + instructions = [ + s_mov_b32(VCC_LO, 1), + s_mov_b32(s[0], a & 0xffffffff), + s_mov_b32(s[1], a >> 32), + s_mov_b32(s[2], b & 0xffffffff), + s_mov_b32(s[3], b >> 32), + s_mov_b32(s[4], c & 0xffffffff), + s_mov_b32(s[5], c >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + v_mov_b32_e32(v[3], s[3]), + v_mov_b32_e32(v[4], s[4]), + v_mov_b32_e32(v[5], s[5]), + v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f(st.vgpr[0][6] | (st.vgpr[0][7] << 32)) + expected = (1.0 * 1.0 + 2.0) * (2.0 ** 128) # = 3.0 * 2^128 + self.assertAlmostEqual(result, expected, delta=abs(expected) * 1e-10) + + def test_div_fmas_f64_scale_down_vcc1_small_s2(self): + """V_DIV_FMAS_F64: VCC=1 with S2 exponent <= 1023 -> scale by 2^-128.""" + a = f2i64(2.0) + b = f2i64(3.0) + c = f2i64(1.0) # exponent = 1023, so scale DOWN + instructions = [ + s_mov_b32(VCC_LO, 1), + s_mov_b32(s[0], a & 0xffffffff), + s_mov_b32(s[1], a >> 32), + s_mov_b32(s[2], b & 0xffffffff), + s_mov_b32(s[3], b >> 32), + s_mov_b32(s[4], c & 0xffffffff), + s_mov_b32(s[5], c >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + v_mov_b32_e32(v[3], s[3]), + v_mov_b32_e32(v[4], s[4]), + v_mov_b32_e32(v[5], s[5]), + v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]), + ] + st = run_program(instructions, n_lanes=1) + result = i642f(st.vgpr[0][6] | (st.vgpr[0][7] << 32)) + expected = (2.0 * 3.0 + 1.0) * (2.0 ** -128) # = 7.0 * 2^-128 + self.assertAlmostEqual(result, expected, delta=abs(expected) * 1e-10) + + def test_div_fmas_f64_per_lane_vcc_varying(self): + """V_DIV_FMAS_F64: different VCC per lane applies different scaling. + + This is the key test for the tan() bug - verifies that scaling is + applied per-lane based on VCC bits, not uniformly. 
+ """ + a = f2i64(1.0) + b = f2i64(1.0) + c = f2i64(1.0) # exponent = 1023, so when VCC=1 it scales DOWN + instructions = [ + # VCC = 0b0101: lanes 0,2 scale, lanes 1,3 don't + s_mov_b32(VCC_LO, 0b0101), + s_mov_b32(s[0], a & 0xffffffff), + s_mov_b32(s[1], a >> 32), + s_mov_b32(s[2], b & 0xffffffff), + s_mov_b32(s[3], b >> 32), + s_mov_b32(s[4], c & 0xffffffff), + s_mov_b32(s[5], c >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + v_mov_b32_e32(v[3], s[3]), + v_mov_b32_e32(v[4], s[4]), + v_mov_b32_e32(v[5], s[5]), + v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]), + ] + st = run_program(instructions, n_lanes=4) + + scaled = (1.0 * 1.0 + 1.0) * (2.0 ** -128) # = 2.0 * 2^-128 + unscaled = 1.0 * 1.0 + 1.0 # = 2.0 + + # Lane 0: VCC=1, scale + result0 = i642f(st.vgpr[0][6] | (st.vgpr[0][7] << 32)) + self.assertAlmostEqual(result0, scaled, delta=abs(scaled) * 1e-10, msg="Lane 0 should be scaled") + + # Lane 1: VCC=0, no scale + result1 = i642f(st.vgpr[1][6] | (st.vgpr[1][7] << 32)) + self.assertAlmostEqual(result1, unscaled, places=10, msg="Lane 1 should be unscaled") + + # Lane 2: VCC=1, scale + result2 = i642f(st.vgpr[2][6] | (st.vgpr[2][7] << 32)) + self.assertAlmostEqual(result2, scaled, delta=abs(scaled) * 1e-10, msg="Lane 2 should be scaled") + + # Lane 3: VCC=0, no scale + result3 = i642f(st.vgpr[3][6] | (st.vgpr[3][7] << 32)) + self.assertAlmostEqual(result3, unscaled, places=10, msg="Lane 3 should be unscaled") + + +class TestDivScaleFmasF64Integration(unittest.TestCase): + """Integration tests for V_DIV_SCALE_F64 + V_DIV_FMAS_F64. + + These tests verify the full division sequence used by tan() works + correctly with multiple lanes having different values. + """ + + def test_div_scale_then_fmas_multi_lane_tan_pattern(self): + """Test the pattern used by tan(): DIV_SCALE sets VCC, DIV_FMAS uses it. + + This is the exact bug scenario: tan([2.0, 3.0, 4.0]) was failing because + VCC from DIV_SCALE was being set incorrectly for all lanes. 
+ """ + import math + # Set up values like tan() would: different values per lane + instructions = [ + # Create per-lane values: 2.0, 3.0, 4.0, 5.0 + v_cvt_f64_i32_e32(v[0:1], v[255]), # v0:1 = f64(lane_id) + v_add_f64(v[0:1], v[0:1], SrcEnum.POS_TWO), # numer = lane_id + 2.0 + # denom = 1.0 for all lanes (uniform) + v_mov_b32_e32(v[2], f2i64(1.0) & 0xffffffff), + v_mov_b32_e32(v[3], f2i64(1.0) >> 32), + # V_DIV_SCALE_F64: sets VCC per lane + VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[2:3], src2=v[0:1]), + # Copy scaled numer for FMA + v_mov_b32_e32(v[6], v[4]), + v_mov_b32_e32(v[7], v[5]), + # V_DIV_FMAS_F64: uses VCC to apply scaling + v_div_fmas_f64(v[8:9], v[6:7], v[2:3], v[4:5]), + ] + st = run_program(instructions, n_lanes=4) + + # All lanes should have VCC=0 (no scaling needed for normal values) + self.assertEqual(st.vcc & 0xf, 0, "All lanes should have VCC=0 for normal values") + + # Verify each lane has correct intermediate value + for lane in range(4): + expected_numer = float(lane) + 2.0 + # With VCC=0, DIV_FMAS should just do FMA with no scaling + result = i642f(st.vgpr[lane][8] | (st.vgpr[lane][9] << 32)) + # The FMA result should be: scaled_numer * denom + scaled_numer = 2*scaled_numer + expected = expected_numer * 1.0 + expected_numer # Simple FMA for this test setup + self.assertAlmostEqual(result, expected, places=8, + msg=f"Lane {lane}: expected {expected}, got {result}") + + +class TestVOP3VOPC(unittest.TestCase): + """Tests for VOP3-encoded VOPC instructions (comparisons with scalar dest).""" + + def test_v_cmp_ge_f32_e64_nan(self): + """V_CMP_GE_F32_E64: |NaN| >= |0.0| should be FALSE (NaN comparisons always false).""" + from extra.assembly.amd.autogen.rdna3.ins import VOP3_SDST + instructions = [ + s_mov_b32(s[0], 0xffc00000), # NaN + s_mov_b32(s[1], 0x00000000), # 0.0 + v_mov_b32_e32(v[5], s[0]), + v_mov_b32_e32(v[3], s[1]), + VOP3_SDST(VOP3Op.V_CMP_GE_F32, vdst=s[5], src0=v[5], src1=v[3], abs_=3), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[5], 0) # NaN comparison is always FALSE + + +class TestMin3Max3Unsigned(unittest.TestCase): + """Regression tests for V_MIN3/V_MAX3 with unsigned integer types. + + The emulator's _minmax_reduce used UOp.minimum() which implements min(a,b) as + -max(-a,-b). This is broken for unsigned types because negation (mul by -1) + doesn't preserve ordering: for uint16, -0 = 0 but -5 = 65531, so + max(-0, -5) = max(0, 65531) = 65531, and -65531 = 5, giving min(0,5) = 5 (wrong!). + + Fix: use comparison-based min/max for unsigned types: min(a,b) = (abool cast.""" + instructions = [ + v_mov_b32_e32(v[1], 0), + v_cmp_eq_u32_e32(1, v[255]), # vcc = (lane == 1) + v_cndmask_b32_e64(v[1], v[1], 1, VCC_LO), # v1[lane1] = 1 + v_cmp_ne_u32_e32(0, v[1]), # vcc = (0 != v1) + v_cndmask_b32_e64(v[0], 0, 1, VCC_LO), # v0 = vcc ? 
1 : 0 + ] + st = run_program(instructions, n_lanes=2) + self.assertEqual(st.vgpr[0][0], 0, "lane 0: 0 != 0 should be false") + self.assertEqual(st.vgpr[1][0], 1, "lane 1: 0 != 1 should be true") + self.assertEqual(st.vcc & 0x3, 0x2, "VCC should be 0b10") + + def test_v_cmp_ne_u32_all_nonzero(self): + """V_CMP_NE_U32: all lanes have nonzero values.""" + instructions = [ + v_mov_b32_e32(v[1], 5), + v_cmp_ne_u32_e32(0, v[1]), + ] + st = run_program(instructions, n_lanes=4) + self.assertEqual(st.vcc & 0xf, 0xf, "All lanes should be != 0") + def test_cmp_eq_u16_opsel_lo_lo(self): """V_CMP_EQ_U16 comparing lo halves.""" instructions = [ @@ -448,6 +471,242 @@ class TestCmpFloat(unittest.TestCase): self.assertEqual((st.vcc >> 1) & 1, 0, "Lane 1: expected vcc=0 (2.0 < 1.0)") +class TestVOP3VOPCModifiers(unittest.TestCase): + """Tests for VOP3 VOPC with abs/neg modifiers.""" + + def test_v_cmp_ge_f32_abs_both(self): + """v_cmp_ge_f32 with abs on both sources: abs(0.0) >= abs(-1.0) = false. + + Regression test: int16 mod operation uses v_cmp_ge_f32 with abs modifiers. + """ + instructions = [ + v_mov_b32_e32(v[0], 0.0), + v_mov_b32_e32(v[1], -1.0), + # abs=0b11 means abs(src0) and abs(src1) + v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "abs(0.0) >= abs(-1.0) should be false") + + def test_v_cmp_ge_f32_abs_negative_divisor(self): + """v_cmp_ge_f32 with abs: remainder check for negative divisor. + + Tests the exact comparison used in int16 mod: abs(rem_f) >= abs(div_f). + For 1 % -1: rem_f = 0.0, div_f = -1.0, so abs(0.0) >= abs(-1.0) = false. + """ + instructions = [ + v_mov_b32_e32(v[0], 0.0), # remainder as float + v_mov_b32_e32(v[1], -1.0), # divisor as float + v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "abs(0.0) >= abs(-1.0) should be false") + + def test_v_cmp_ge_f32_abs_small_remainder(self): + """v_cmp_ge_f32 with abs: abs(-0.5) >= abs(-3.0) = false.""" + instructions = [ + v_mov_b32_e32(v[0], -0.5), + v_mov_b32_e32(v[1], -3.0), + v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "abs(-0.5) >= abs(-3.0) should be false") + + def test_v_cmp_ge_f32_abs_equal(self): + """v_cmp_ge_f32 with abs: abs(-1.0) >= abs(1.0) = true.""" + instructions = [ + v_mov_b32_e32(v[0], -1.0), + v_mov_b32_e32(v[1], 1.0), + v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "abs(-1.0) >= abs(1.0) should be true") + + +class TestVOP3VOPC64Bit(unittest.TestCase): + """Tests for VOP3 VOPC with 64-bit operands.""" + + def test_v_cmp_lt_f64_basic(self): + """v_cmp_lt_f64: 0.0 < 1.0 = true.""" + zero_f64 = f2i64(0.0) + one_f64 = f2i64(1.0) + instructions = [ + s_mov_b32(s[0], zero_f64 & 0xffffffff), + s_mov_b32(s[1], zero_f64 >> 32), + s_mov_b32(s[2], one_f64 & 0xffffffff), + s_mov_b32(s[3], one_f64 >> 32), + v_cmp_lt_f64_e64(VCC_LO, s[0:1], s[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "0.0 < 1.0 should be true") + + def test_v_cmp_lt_f64_negative(self): + """v_cmp_lt_f64: -1.0 < 0.0 = true.""" + neg_one_f64 = f2i64(-1.0) + zero_f64 = f2i64(0.0) + instructions = [ + s_mov_b32(s[0], neg_one_f64 & 0xffffffff), + s_mov_b32(s[1], neg_one_f64 >> 32), + s_mov_b32(s[2], zero_f64 & 0xffffffff), + s_mov_b32(s[3], zero_f64 >> 32), + v_cmp_lt_f64_e64(VCC_LO, 
s[0:1], s[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "-1.0 < 0.0 should be true") + + def test_v_cmp_lt_i64_signed(self): + """v_cmp_lt_i64: 0 < -1 (signed) = false.""" + instructions = [ + s_mov_b32(s[0], 0), + s_mov_b32(s[1], 0), # s[0:1] = 0 + s_mov_b32(s[2], 0xffffffff), + s_mov_b32(s[3], 0xffffffff), # s[2:3] = -1 + v_cmp_lt_i64_e64(VCC_LO, s[0:1], s[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "0 < -1 (signed) should be false") + + def test_v_cmp_lt_u64_unsigned(self): + """v_cmp_lt_u64: 0 < 0xFFFFFFFFFFFFFFFF (unsigned) = true.""" + instructions = [ + s_mov_b32(s[0], 0), + s_mov_b32(s[1], 0), # s[0:1] = 0 + s_mov_b32(s[2], 0xffffffff), + s_mov_b32(s[3], 0xffffffff), # s[2:3] = max uint64 + v_cmp_lt_u64_e64(VCC_LO, s[0:1], s[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "0 < max_uint64 should be true") + + +class TestVOPCF64(unittest.TestCase): + """Tests for VOPC (E32 encoding) with 64-bit float operands. Regression test for f64 compare bug.""" + + def test_v_cmp_lt_f64_e32_true(self): + """v_cmp_lt_f64_e32: 2.0 < 3.0 = true.""" + lo0, hi0 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32 + lo1, hi1 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32 + instructions = [ + s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0), + s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1), + v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]), + v_cmp_lt_f64_e32(v[0:1], v[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "2.0 < 3.0 should be true") + + def test_v_cmp_lt_f64_e32_false(self): + """v_cmp_lt_f64_e32: 3.0 < 2.0 = false.""" + lo0, hi0 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32 + lo1, hi1 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32 + instructions = [ + s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0), + s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1), + v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]), + v_cmp_lt_f64_e32(v[0:1], v[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "3.0 < 2.0 should be false") + + def test_v_cmp_nlt_f64_e32_true(self): + """v_cmp_nlt_f64_e32: !(3.0 < 2.0) = true.""" + lo0, hi0 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32 + lo1, hi1 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32 + instructions = [ + s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0), + s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1), + v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]), + v_cmp_nlt_f64_e32(v[0:1], v[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "!(3.0 < 2.0) should be true") + + def test_v_cmp_nlt_f64_e32_false(self): + """v_cmp_nlt_f64_e32: !(2.0 < 3.0) = false.""" + lo0, hi0 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32 + lo1, hi1 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32 + instructions = [ + s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0), + s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1), + v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]), + v_cmp_nlt_f64_e32(v[0:1], v[2:3]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "!(2.0 < 3.0) should be false") + + +class TestCmpxExec(unittest.TestCase): + """Tests for V_CMPX instructions that modify EXEC mask.""" + + def test_v_cmpx_ngt_f32_e64_all_true(self): + 
"""V_CMPX_NGT_F32_E64: all lanes pass (literal <= all values).""" + # 131072.0 = 0x48000000 + # All values > 131072, so !(131072 > val) = true for all + instructions = [ + s_mov_b32(EXEC_LO, 0x7), # 3 lanes active + v_mov_b32_e32(v[0], f2i(200000.0)), # lane 0 + v_cmp_eq_u32_e32(1, v[255]), + v_cndmask_b32_e64(v[1], v[0], f2i(300000.0), VCC_LO), # lane 1 + v_cmp_eq_u32_e32(2, v[255]), + v_cndmask_b32_e64(v[1], v[1], f2i(400000.0), VCC_LO), # lane 2 + # Now v[1] has: lane0=200000, lane1=300000, lane2=400000 + # Compare: !(131072.0 > v[1]) i.e., 131072.0 <= v[1] + v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[1]), + ] + st = run_program(instructions, n_lanes=3) + # All values > 131072, so all lanes should remain active + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x7, "All 3 lanes should remain active") + + def test_v_cmpx_ngt_f32_e64_some_false(self): + """V_CMPX_NGT_F32_E64: some lanes fail (literal > some values).""" + instructions = [ + s_mov_b32(EXEC_LO, 0x7), # 3 lanes active + v_mov_b32_e32(v[0], f2i(100000.0)), # lane 0: 131072 > 100000 = true, so !(true) = false + v_cmp_eq_u32_e32(1, v[255]), + v_cndmask_b32_e64(v[1], v[0], f2i(200000.0), VCC_LO), # lane 1: 131072 > 200000 = false, so !(false) = true + v_cmp_eq_u32_e32(2, v[255]), + v_cndmask_b32_e64(v[1], v[1], f2i(150000.0), VCC_LO), # lane 2: 131072 > 150000 = false, so !(false) = true + v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[1]), + ] + st = run_program(instructions, n_lanes=3) + # lane 0: fail (100000 < 131072), lanes 1,2: pass + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x6, "Lanes 1,2 should be active, lane 0 inactive") + + def test_v_cmpx_ngt_f32_e64_all_false(self): + """V_CMPX_NGT_F32_E64: all lanes fail (literal > all values).""" + instructions = [ + s_mov_b32(EXEC_LO, 0x7), # 3 lanes active + v_mov_b32_e32(v[0], f2i(100.0)), # all lanes have 100.0 + # 131072 > 100 = true, so !(true) = false for all + v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[0]), + ] + st = run_program(instructions, n_lanes=3) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x0, "All lanes should be inactive") + + def test_v_cmpx_ngt_f32_e64_large_values(self): + """V_CMPX_NGT_F32_E64: test with values that trigger Payne-Hanek in sin(). + + This is a regression test for the sin(859240.0) bug. + Values 859240, 1000000, 100594688 should all pass !(131072 > val). + """ + instructions = [ + s_mov_b32(EXEC_LO, 0x7), # 3 lanes active + v_mov_b32_e32(v[0], f2i(859240.0)), # lane 0 + v_cmp_eq_u32_e32(1, v[255]), + v_cndmask_b32_e64(v[1], v[0], f2i(1000000.0), VCC_LO), # lane 1 + v_cmp_eq_u32_e32(2, v[255]), + v_cndmask_b32_e64(v[1], v[1], f2i(100594688.0), VCC_LO), # lane 2 + v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[1]), + ] + st = run_program(instructions, n_lanes=3) + # All values > 131072, so !(131072 > val) = true for all + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x7, "All 3 lanes should remain active") + + class TestVCCBehavior(unittest.TestCase): """Tests for VCC condition code behavior.""" @@ -472,5 +731,101 @@ class TestVCCBehavior(unittest.TestCase): self.assertEqual(st.vcc >> 16, 0x0000, "Lanes 16-31 should be false") +class TestCmpxPartialWavefront(unittest.TestCase): + """Tests for V_CMPX with partial wavefronts (fewer than 32 active lanes). + + Regression tests for bug where v_cmpx incorrectly set EXEC bits for inactive + lanes when the wavefront had fewer than 32 lanes. This caused garbage data + from uninitialized lanes to corrupt memory writes. 
+ """ + + def test_v_cmpx_eq_u32_partial_wave_3_lanes(self): + """V_CMPX_EQ_U32 with 3 active lanes should only affect those 3 lanes. + + With n_lanes=3, initial EXEC=0x7. After v_cmpx comparing lane_id == 1, + only lane 1 should pass, so EXEC should become 0x2 (not have bits 3-31 set). + """ + instructions = [ + v_cmpx_eq_u32_e32(1, v[255]), # EXEC = lanes where lane_id == 1 + ] + st = run_program(instructions, n_lanes=3) + # Only lane 1 should be active (bit 1 set) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x2, + "Only lane 1 should be active after v_cmpx_eq_u32 with 3 lanes") + + def test_v_cmpx_eq_u32_partial_wave_5_lanes(self): + """V_CMPX_EQ_U32 with 5 active lanes.""" + instructions = [ + v_cmpx_eq_u32_e32(3, v[255]), # EXEC = lanes where lane_id == 3 + ] + st = run_program(instructions, n_lanes=5) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x8, + "Only lane 3 should be active after v_cmpx_eq_u32 with 5 lanes") + + def test_v_cmpx_lt_u32_partial_wave(self): + """V_CMPX_LT_U32 with partial wavefront.""" + # VOPC: src0 < vsrc1, so we need v_cmpx_gt_u32 to get lane_id < 2 + instructions = [ + v_cmpx_gt_u32_e32(2, v[255]), # EXEC = lanes where 2 > lane_id (i.e., lane_id < 2) + ] + st = run_program(instructions, n_lanes=4) + # Lanes 0,1 should be active (bits 0,1 set = 0x3) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x3, + "Only lanes 0,1 should be active after v_cmpx_gt_u32(2, lane_id) with 4 lanes") + + def test_v_cmpx_ge_u32_partial_wave(self): + """V_CMPX_GE_U32 with partial wavefront.""" + # VOPC: src0 >= vsrc1, so v_cmpx_le_u32(1, lane_id) gives lane_id >= 2? No. + # v_cmpx_le_u32(src0, vsrc1) = src0 <= vsrc1 = 1 <= lane_id + instructions = [ + v_cmpx_le_u32_e32(2, v[255]), # EXEC = lanes where 2 <= lane_id (i.e., lane_id >= 2) + ] + st = run_program(instructions, n_lanes=4) + # Lanes 2,3 should be active (bits 2,3 set = 0xC) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0xC, + "Only lanes 2,3 should be active after v_cmpx_le_u32(2, lane_id) with 4 lanes") + + def test_v_cmpx_ne_u32_partial_wave_all_pass(self): + """V_CMPX_NE_U32 where all active lanes pass.""" + instructions = [ + v_cmpx_ne_u32_e32(99, v[255]), # EXEC = lanes where lane_id != 99 + ] + st = run_program(instructions, n_lanes=3) + # All 3 lanes should remain active (bits 0,1,2 set = 0x7) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x7, + "All 3 lanes should remain active when all pass") + + def test_v_cmpx_eq_u32_partial_wave_none_pass(self): + """V_CMPX_EQ_U32 where no active lanes pass.""" + instructions = [ + v_cmpx_eq_u32_e32(99, v[255]), # EXEC = lanes where lane_id == 99 + ] + st = run_program(instructions, n_lanes=3) + # No lanes should be active + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x0, + "No lanes should be active when none pass") + + def test_v_cmpx_f32_partial_wave(self): + """V_CMPX_GT_F32 with partial wavefront - float comparison.""" + instructions = [ + v_cvt_f32_u32_e32(v[0], v[255]), # v[0] = float(lane_id) + v_mov_b32_e32(v[1], f2i(0.5)), # v[1] = 0.5 + v_cmpx_gt_f32_e32(v[0], v[1]), # EXEC = lanes where v[0] > 0.5 + ] + st = run_program(instructions, n_lanes=4) + # Lanes 1,2,3 have values > 0.5, lane 0 has 0.0 + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0xE, + "Lanes 1,2,3 should be active (float > 0.5)") + + def test_v_cmpx_e64_partial_wave(self): + """V_CMPX_EQ_U32_E64 (VOP3 encoding) with partial wavefront.""" + instructions = [ + v_cmpx_eq_u32_e64(EXEC_LO, v[255], 2), # EXEC = lanes where lane_id 
== 2 + ] + st = run_program(instructions, n_lanes=4) + self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x4, + "Only lane 2 should be active after v_cmpx_eq_u32_e64") + + if __name__ == '__main__': unittest.main() diff --git a/extra/assembly/amd/test/hw/test_vopd.py b/extra/assembly/amd/test/hw/test_vopd.py new file mode 100644 index 0000000000..15c67ba448 --- /dev/null +++ b/extra/assembly/amd/test/hw/test_vopd.py @@ -0,0 +1,161 @@ +"""Tests for VOPD instructions - dual-issue vector operations. + +VOPD executes two operations simultaneously. Key behavior: +- Both ops read their sources BEFORE either writes (dual-issue semantics) +- This means if X writes to a register that Y reads, Y sees the OLD value +- Op X can use ops 0-15 (FMAC, MUL, ADD, MOV, etc.) +- Op Y can use ops 0-18 (includes ADD_NC_U32, LSHLREV, AND) +""" +import unittest +from extra.assembly.amd.test.hw.helpers import run_program, run_program_emu, run_program_hw, compare_wave_states, \ + v, s, v_mov_b32_e32, s_mov_b32 +from extra.assembly.amd.autogen.rdna3.ins import VOPD, VOPD_LIT, VOPDOp + +class TestVOPDBasic(unittest.TestCase): + """Basic VOPD functionality tests.""" + + def test_vopd_dual_mov(self): + """VOPD with two MOV operations to different registers.""" + instructions = [ + v_mov_b32_e32(v[0], 0x12345678), + v_mov_b32_e32(v[1], 0xDEADBEEF), + # X: v[2] = v[0], Y: v[3] = v[1] + VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[1], v[0], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0x12345678) + self.assertEqual(st.vgpr[0][3], 0xDEADBEEF) + + def test_vopd_mov_and_add(self): + """VOPD with MOV (X) and ADD_NC_U32 (Y) - ADD_NC_U32 can only be Y op.""" + instructions = [ + v_mov_b32_e32(v[0], 10), + v_mov_b32_e32(v[1], 5), + # X: v[2] = 100 (literal), Y: v[3] = v[0] + v[1] = 15 + VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[3], 100, v[0], v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 100) + self.assertEqual(st.vgpr[0][3], 15) + + +class TestVOPDReadBeforeWrite(unittest.TestCase): + """Tests for VOPD dual-issue read-before-write semantics. + + In VOPD, both X and Y operations read their sources BEFORE either writes. + This is critical when X's destination is Y's source. + """ + + def test_vopd_x_writes_y_reads_same_reg(self): + """VOPD where X writes to a register that Y reads. + + X: v[2] = 0 (overwrites v[2]) + Y: v[1] = v[2] + v[0] (srcy0=v[2], vsrcy1=v[0]) + + If reads happen before writes: v[1] = OLD_v[2] + v[0] = 0xFFFFFFFF + 1 = 0 + If writes happen before reads: v[1] = 0 + v[0] = 0 + 1 = 1 + + Hardware does reads-before-writes, so v[1] should be 0. 
+ """ + instructions = [ + v_mov_b32_e32(v[0], 1), # v[0] = 1 + v_mov_b32_e32(v[1], 0x99999999), # v[1] = placeholder (will be overwritten) + v_mov_b32_e32(v[2], 0xFFFFFFFF), # v[2] = 0xFFFFFFFF + # X: v[2] = 0 (literal), srcx0=0, vsrcx1=v[0] (unused for MOV) + # Y: v[1] = srcy0 + vsrcy1 = v[2] + v[0] (should read OLD v[2] = 0xFFFFFFFF) + # vdsty encoding: (vdsty << 1) | ((vdstx & 1) ^ 1) where vdsty field = 0, vdstx = v[2] + # So vdsty_reg = (0 << 1) | ((2 & 1) ^ 1) = 0 | 1 = 1 = v[1] + VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 0, v[2], v[0], v[0]), + ] + st = run_program(instructions, n_lanes=1) + # X should have written 0 to v[2] + self.assertEqual(st.vgpr[0][2], 0, "X should write 0 to v[2]") + # Y should have read OLD v[2] (0xFFFFFFFF) and added v[0] (1) + # 0xFFFFFFFF + 1 = 0 (wrap around) + self.assertEqual(st.vgpr[0][1], 0, "Y should read OLD v[2]=0xFFFFFFFF, compute 0xFFFFFFFF+1=0") + + def test_vopd_x_writes_y_reads_same_reg_v2(self): + """VOPD where X writes to a register that Y reads - cleaner test case. + + X: v[2] = 0 (MOV) + Y: v[1] = v[2] + v[2] (ADD_NC_U32 with both sources from v[2]) + + If reads happen before writes: v[1] = OLD_v[2] + OLD_v[2] = 100 + 100 = 200 + If writes happen before reads: v[1] = 0 + 0 = 0 + + Hardware does reads-before-writes, so v[1] should be 200. + """ + instructions = [ + v_mov_b32_e32(v[0], 0x88888888), # v[0] = unused placeholder + v_mov_b32_e32(v[1], 0x99999999), # v[1] = placeholder (will be overwritten) + v_mov_b32_e32(v[2], 100), # v[2] = 100 + # X: v[2] = 0 (literal) + # Y: v[1] = srcy0 + vsrcy1 = v[2] + v[2] (should read OLD v[2] = 100) + VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 0, v[2], v[0], v[2]), + ] + st = run_program(instructions, n_lanes=1) + # X should have written 0 to v[2] + self.assertEqual(st.vgpr[0][2], 0, "X should write 0 to v[2]") + # Y should have read OLD v[2] (100) twice and added them + self.assertEqual(st.vgpr[0][1], 200, "Y should read OLD v[2]=100 twice, compute 100+100=200") + + +class TestVOPDLiterals(unittest.TestCase): + """Tests for VOPD instructions that use SIMM32 literals (FMAAK, FMAMK).""" + + def test_vopd_fmaak_f32(self): + """VOPD V_DUAL_FMAAK_F32: D = S0 * S1 + SIMM32 (literal addend). + + Tests that the 32-bit literal (SIMM32) is correctly passed to the instruction. + fma(2.0, 3.0, 10.0) = 2*3 + 10 = 16.0 + """ + from extra.assembly.amd.test.hw.helpers import f2i, i2f + instructions = [ + v_mov_b32_e32(v[0], f2i(2.0)), # v[0] = 2.0 + v_mov_b32_e32(v[1], f2i(3.0)), # v[1] = 3.0 + # VOPD args: opx, opy, vdstx, vdsty, srcx0, srcy0, vsrcx1, vsrcy1 + # X: v[2] = fma(srcx0, vsrcx1, SIMM32) = v[0]*v[1]+10.0 = 2*3+10 = 16 + # Y: v[3] = srcy0 (MOV) = v[0] = 2.0 + VOPD_LIT(VOPDOp.V_DUAL_FMAAK_F32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[0], v[1], v[0], literal=f2i(10.0)), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), 16.0, places=5, msg="fma(2.0, 3.0, 10.0) should be 16.0") + + def test_vopd_fmamk_f32(self): + """VOPD V_DUAL_FMAMK_F32: D = S0 * SIMM32 + S1 (literal multiplier). + + Tests that the 32-bit literal (SIMM32) is correctly used as the multiplier. 
+ fma(2.0, 5.0, 3.0) = 2*5 + 3 = 13.0 + """ + from extra.assembly.amd.test.hw.helpers import f2i, i2f + instructions = [ + v_mov_b32_e32(v[0], f2i(2.0)), # v[0] = 2.0 + v_mov_b32_e32(v[1], f2i(3.0)), # v[1] = 3.0 + # X: v[2] = fma(srcx0, SIMM32, vsrcx1) = v[0]*5.0+v[1] = 2*5+3 = 13 + # Y: v[3] = srcy0 (MOV) = v[0] = 2.0 + VOPD_LIT(VOPDOp.V_DUAL_FMAMK_F32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[0], v[1], v[0], literal=f2i(5.0)), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), 13.0, places=5, msg="fma(2.0, 5.0, 3.0) should be 13.0") + + +class TestVOPDMultilane(unittest.TestCase): + """Tests for VOPD with multiple lanes.""" + + def test_vopd_multilane_mov_add(self): + """VOPD MOV and ADD with multiple active lanes - no register conflict.""" + instructions = [ + v_mov_b32_e32(v[0], 5), + v_mov_b32_e32(v[1], 10), + # X: v[2] = 100 (constant), Y: v[1] = v[0] + v[1] = 5 + 10 = 15 + # vdsty_reg = (vdsty << 1) | ((vdstx.offset & 1) ^ 1) = (0 << 1) | ((258 & 1) ^ 1) = 0 | 1 = 1 + VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 100, v[0], v[2], v[1]), + ] + st = run_program(instructions, n_lanes=4) + for lane in range(4): + self.assertEqual(st.vgpr[lane][2], 100, f"Lane {lane}: v[2] should be 100") + self.assertEqual(st.vgpr[lane][1], 15, f"Lane {lane}: v[1] should be 15 (5+10)") + + +if __name__ == '__main__': + unittest.main()