assembly/amd: add hw tests from ucode branch (#14259)

* assembly/amd: add hw tests from ucode branch

* fix is per lane
This commit is contained in:
George Hotz
2026-01-21 08:53:54 +09:00
committed by GitHub
parent ba90e1b52e
commit 1baefed530
11 changed files with 2304 additions and 41 deletions

View File

@@ -653,17 +653,17 @@ def _apply_pseudocode_fixes(op_name: str, code: str) -> str:
code = code.replace('D0.f64 = 2.0 ** 64 * fma(S0.f64, S1.f64, S2.f64)',
'D0.f64 = (2.0 ** 128 if exponent(S2.f64) > 1023 else 2.0 ** -128) * fma(S0.f64, S1.f64, S2.f64)')
if op_name == 'V_DIV_SCALE_F32':
code = code.replace('D0.f32 = float("nan")', 'VCC = Reg(0x1); D0.f32 = float("nan")')
code = code.replace('D0.f32 = float("nan")', 'VCC = Reg(1 << laneId); D0.f32 = float("nan")')
code = code.replace('elif S1.f32 == DENORM.f32:\n D0.f32 = ldexp(S0.f32, 64)', 'elif False:\n pass')
code += '\nif S1.f32 == DENORM.f32:\n D0.f32 = float("nan")'
code = code.replace('elif exponent(S2.f32) <= 23:\n D0.f32 = ldexp(S0.f32, 64)', 'elif exponent(S2.f32) <= 23:\n VCC = Reg(0x1); D0.f32 = ldexp(S0.f32, 64)')
code = code.replace('elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(0x1)\n if S0.f32 == S2.f32:\n D0.f32 = ldexp(S0.f32, 64)', 'elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(0x1)')
code = code.replace('elif exponent(S2.f32) <= 23:\n D0.f32 = ldexp(S0.f32, 64)', 'elif exponent(S2.f32) <= 23:\n VCC = Reg(1 << laneId); D0.f32 = ldexp(S0.f32, 64)')
code = code.replace('elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(0x1)\n if S0.f32 == S2.f32:\n D0.f32 = ldexp(S0.f32, 64)', 'elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(1 << laneId)')
if op_name == 'V_DIV_SCALE_F64':
code = code.replace('D0.f64 = float("nan")', 'VCC = Reg(0x1); D0.f64 = float("nan")')
code = code.replace('D0.f64 = float("nan")', 'VCC = Reg(1 << laneId); D0.f64 = float("nan")')
code = code.replace('elif S1.f64 == DENORM.f64:\n D0.f64 = ldexp(S0.f64, 128)', 'elif False:\n pass')
code += '\nif S1.f64 == DENORM.f64:\n D0.f64 = float("nan")'
code = code.replace('elif exponent(S2.f64) <= 52:\n D0.f64 = ldexp(S0.f64, 128)', 'elif exponent(S2.f64) <= 52:\n VCC = Reg(0x1); D0.f64 = ldexp(S0.f64, 128)')
code = code.replace('elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(0x1)\n if S0.f64 == S2.f64:\n D0.f64 = ldexp(S0.f64, 128)', 'elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(0x1)')
code = code.replace('elif exponent(S2.f64) <= 52:\n D0.f64 = ldexp(S0.f64, 128)', 'elif exponent(S2.f64) <= 52:\n VCC = Reg(1 << laneId); D0.f64 = ldexp(S0.f64, 128)')
code = code.replace('elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(0x1)\n if S0.f64 == S2.f64:\n D0.f64 = ldexp(S0.f64, 128)', 'elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(1 << laneId)')
if op_name == 'V_DIV_FIXUP_F32':
code = code.replace('D0.f32 = ((-abs(S0.f32)) if (sign_out) else (abs(S0.f32)))',
'D0.f32 = ((-OVERFLOW_F32) if (sign_out) else (OVERFLOW_F32)) if isNAN(S0.f32) else ((-abs(S0.f32)) if (sign_out) else (abs(S0.f32)))')

View File

@@ -1,14 +1,25 @@
"""Test infrastructure for hardware-validated RDNA3 emulator tests.
Uses run_asm() with memory output, so tests can run on both emulator and real hardware.
Set USE_HW=1 to run on both emulator and real hardware, comparing results.
Set USE_HW=1 to run on both emulator and hardware, comparing results.
"""
import ctypes, os, struct
import ctypes, math, os, struct
from extra.assembly.amd.autogen.rdna3.ins import *
from extra.assembly.amd.emu import WaveState, run_asm, set_valid_mem_ranges
from extra.assembly.amd.emu import run_asm
from extra.assembly.amd.dsl import NULL, SCC, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, M0
from extra.assembly.amd.pcode import _i32, _f32
def _i32(f: float) -> int: return struct.unpack('<I', struct.pack('<f', f))[0]
def _f32(i: int) -> float: return struct.unpack('<f', struct.pack('<I', i & 0xFFFFFFFF))[0]
# f16 conversion helpers
def _f16(i: int) -> float: return struct.unpack('<e', struct.pack('<H', i & 0xFFFF))[0]
def f32_to_f16(f: float) -> int:
  """Convert a float to its IEEE-754 binary16 bit pattern.

  NaN maps to the canonical quiet NaN (0x7e00); infinities and values whose
  magnitude exceeds the f16 range saturate to +/- infinity (0x7c00 / 0xfc00).
  """
  val = float(f)
  if math.isnan(val): return 0x7e00
  if not math.isinf(val):
    try: return struct.unpack('<H', struct.pack('<e', val))[0]
    except OverflowError: pass  # magnitude beyond f16 range: fall through to +/-inf
  return 0x7c00 if val > 0 else 0xfc00
# For backwards compatibility with tests using SrcEnum.NULL etc.
class SrcEnum:
@@ -32,11 +43,11 @@ VCC = VCC_LO # For VOP3SD sdst field (VCC_LO is exported from dsl)
USE_HW = os.environ.get("USE_HW", "0") == "1"
FLOAT_TOLERANCE = 1e-5
# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc
# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc, exec
N_VGPRS, N_SGPRS, WAVE_SIZE = 16, 16, 32
VGPR_BYTES = N_VGPRS * WAVE_SIZE * 4 # 16 regs * 32 lanes * 4 bytes = 2048
SGPR_BYTES = N_SGPRS * 4 # 16 regs * 4 bytes = 64
OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 8 # + vcc + scc
OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 12 # + vcc + scc + exec
# Float conversion helpers
def f2i(f: float) -> int: return _i32(f)
@@ -47,6 +58,14 @@ def i642f(i: int) -> float: return struct.unpack('<d', struct.pack('<Q', i))[0]
def assemble(instructions: list) -> bytes:
return b''.join(inst.to_bytes() for inst in instructions)
# Simple WaveState class for test output parsing (mirrors emu.py interface for tests)
class WaveState:
  """Minimal wave-state container used to parse test output buffers.

  Mirrors the emu.py WaveState interface for tests: ``vgpr`` is indexed
  ``[lane][reg]`` (32 lanes x 256 registers), ``sgpr`` has 128 entries,
  and ``vcc``/``scc`` hold the captured condition-code values.
  """
  def __init__(self):
    # vgpr[lane][reg] — each lane gets its own independent register list
    self.vgpr = [[0 for _ in range(256)] for _ in range(32)]
    self.sgpr = [0 for _ in range(128)]
    self.vcc, self.scc = 0, 0
def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
"""Generate prologue and epilogue instructions for state capture."""
prologue = [
@@ -63,6 +82,10 @@ def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
epilogue = [
s_mov_b32(s[90], VCC_LO),
s_cselect_b32(s[91], 1, 0),
# Save EXEC early (before we modify it for VGPR stores)
s_mov_b32(s[95], EXEC_LO),
# Restore EXEC to all active lanes for VGPR stores (test may have modified EXEC)
s_mov_b32(EXEC_LO, (1 << n_lanes) - 1),
s_load_b64(s[92:93], s[80:81], 0, soffset=NULL),
s_waitcnt(0), # simm16=0 waits for all
v_lshlrev_b32_e32(v[240], 2, v[255]),
@@ -80,6 +103,9 @@ def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES))
epilogue.append(v_mov_b32_e32(v[243], s[91]))
epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES + 4))
# Store EXEC (saved earlier in s[95])
epilogue.append(v_mov_b32_e32(v[243], s[95]))
epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES + 8))
epilogue.append(s_mov_b32(EXEC_LO, s[94]))
epilogue.append(s_endpgm())
return prologue, epilogue
@@ -95,6 +121,8 @@ def parse_output(out_buf: bytes, n_lanes: int) -> WaveState:
st.sgpr[i] = struct.unpack_from('<I', out_buf, VGPR_BYTES + i * 4)[0]
st.vcc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES)[0]
st.scc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 4)[0]
# Store EXEC in its proper location (index 126)
st.sgpr[EXEC_LO.offset] = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 8)[0]
return st
def run_program_emu(instructions: list, n_lanes: int = 1) -> WaveState:
@@ -110,9 +138,9 @@ def run_program_emu(instructions: list, n_lanes: int = 1) -> WaveState:
kernel_buf = (ctypes.c_char * len(code)).from_buffer_copy(code)
lib_ptr = ctypes.addressof(kernel_buf)
set_valid_mem_ranges({(out_addr, OUT_BYTES), (args_ptr, 8)})
# rsrc2: USER_SGPR_COUNT=2, ENABLE_SGPR_WORKGROUP_ID_X/Y/Z=1, LDS_SIZE=128 (64KB)
rsrc2 = 0x19c | (128 << 15)
scratch_size = 0x10000 # 64KB per lane, matches .amdhsa_private_segment_fixed_size in run_program_hw
result = run_asm(lib_ptr, len(code), 1, 1, 1, n_lanes, 1, 1, args_ptr, rsrc2)
assert result == 0, f"run_asm failed with {result}"
@@ -148,6 +176,8 @@ test:
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_kernarg_size 8
.amdhsa_group_segment_fixed_size 65536
.amdhsa_private_segment_fixed_size 65536
.amdhsa_enable_private_segment 1
.end_amdhsa_kernel
.amdgpu_metadata
@@ -160,7 +190,7 @@ amdhsa.kernels:
.symbol: test.kd
.kernarg_segment_size: 8
.group_segment_fixed_size: 65536
.private_segment_fixed_size: 0
.private_segment_fixed_size: 65536
.kernarg_segment_align: 8
.wavefront_size: 32
.sgpr_count: 96

View File

@@ -138,6 +138,56 @@ class TestDS2AddrMore(unittest.TestCase):
self.assertEqual(st.vgpr[0][4], 0x12345678, "v4 should be untouched")
class TestDSB128(unittest.TestCase):
  """Tests for DS_STORE_B128 and DS_LOAD_B128 (128-bit / 4 dwords)."""
  def test_ds_store_load_b128(self):
    """DS_STORE_B128 stores 4 VGPRs, DS_LOAD_B128 loads them back."""
    instructions = [
      v_mov_b32_e32(v[10], 0),  # v10 = LDS byte address 0
      # 32-bit literals are staged through an SGPR, then moved into each VGPR
      s_mov_b32(s[0], 0x11111111),
      v_mov_b32_e32(v[0], s[0]),
      s_mov_b32(s[0], 0x22222222),
      v_mov_b32_e32(v[1], s[0]),
      s_mov_b32(s[0], 0x33333333),
      v_mov_b32_e32(v[2], s[0]),
      s_mov_b32(s[0], 0x44444444),
      v_mov_b32_e32(v[3], s[0]),
      ds_store_b128(addr=v[10], data0=v[0:3]),  # store v0..v3 (128 bits) to LDS
      s_waitcnt(lgkmcnt=0),  # wait for the LDS store before reading back
      ds_load_b128(addr=v[10], vdst=v[4:7]),  # load the 4 dwords back into v4..v7
      s_waitcnt(lgkmcnt=0),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have first dword")
    self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should have second dword")
    self.assertEqual(st.vgpr[0][6], 0x33333333, "v6 should have third dword")
    self.assertEqual(st.vgpr[0][7], 0x44444444, "v7 should have fourth dword")
  def test_ds_store_b128_with_offset(self):
    """DS_STORE_B128 with non-zero offset."""
    instructions = [
      v_mov_b32_e32(v[10], 0),  # base LDS address 0; offset0 field supplies +16
      s_mov_b32(s[0], 0xAAAAAAAA),
      v_mov_b32_e32(v[0], s[0]),
      s_mov_b32(s[0], 0xBBBBBBBB),
      v_mov_b32_e32(v[1], s[0]),
      s_mov_b32(s[0], 0xCCCCCCCC),
      v_mov_b32_e32(v[2], s[0]),
      s_mov_b32(s[0], 0xDDDDDDDD),
      v_mov_b32_e32(v[3], s[0]),
      # Raw DS encoding is used here so the offset0 instruction field can be set
      DS(DSOp.DS_STORE_B128, addr=v[10], data0=v[0:3], offset0=16),
      s_waitcnt(lgkmcnt=0),
      DS(DSOp.DS_LOAD_B128, addr=v[10], vdst=v[4:7], offset0=16),
      s_waitcnt(lgkmcnt=0),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA)
    self.assertEqual(st.vgpr[0][5], 0xBBBBBBBB)
    self.assertEqual(st.vgpr[0][6], 0xCCCCCCCC)
    self.assertEqual(st.vgpr[0][7], 0xDDDDDDDD)
class TestDSAtomic(unittest.TestCase):
"""Tests for DS atomic operations."""

View File

@@ -128,6 +128,169 @@ class TestGlobalLoad(unittest.TestCase):
class TestGlobalStore(unittest.TestCase):
"""Tests for GLOBAL store instructions."""
def test_global_store_b8_basic(self):
"""GLOBAL_STORE_B8 stores a single byte from VDATA[7:0]."""
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
# First store 0xDEADBEEF to memory
s_mov_b32(s[4], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
# Now store single byte 0x42 to same address (should only change byte 0)
v_mov_b32_e32(v[2], 0x42),
global_store_b8(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
# Read back and check
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
]
st = run_program(instructions, n_lanes=1)
# Only byte 0 should change from 0xEF to 0x42
self.assertEqual(st.vgpr[0][0], 0xDEADBE42, "Only byte 0 should be modified")
def test_global_store_b8_byte1(self):
"""GLOBAL_STORE_B8 at offset+1 stores to byte 1."""
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_mov_b32(s[4], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
v_mov_b32_e32(v[2], 0x42),
global_store_b8(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+1),
s_waitcnt(vmcnt=0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][0], 0xDEAD42EF, "Only byte 1 should be modified")
def test_global_store_b16_basic(self):
"""GLOBAL_STORE_B16 stores a 16-bit value from VDATA[15:0]."""
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_mov_b32(s[4], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_mov_b32(s[4], 0xCAFE),
v_mov_b32_e32(v[2], s[4]),
global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][0], 0xDEADCAFE, "Only lower 16 bits should be modified")
def test_global_store_b16_high_half(self):
"""GLOBAL_STORE_B16 at offset+2 stores to high 16 bits."""
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_mov_b32(s[4], 0xDEADBEEF),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
s_mov_b32(s[4], 0xCAFE),
v_mov_b32_e32(v[2], s[4]),
global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+2),
s_waitcnt(vmcnt=0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][0], 0xCAFEBEEF, "Only upper 16 bits should be modified")
def test_global_store_b16_byte_offset_1(self):
"""GLOBAL_STORE_B16 at byte offset 1 stores bytes 1-2 within the same word."""
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
s_mov_b32(s[4], 0xDDCCBBAA),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
# Store 0xBEEF at byte offset 1 (bytes 1-2)
s_mov_b32(s[4], 0xBEEF),
v_mov_b32_e32(v[2], s[4]),
global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+1),
s_waitcnt(vmcnt=0),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
s_waitcnt(vmcnt=0),
v_mov_b32_e32(v[0], v[3]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
]
st = run_program(instructions, n_lanes=1)
# Bytes 1-2 should be 0xBEEF (0xEF at byte 1, 0xBE at byte 2)
# Original: 0xDDCCBBAA -> bytes [AA, BB, CC, DD]
# After: 0xDDBEEFAA -> bytes [AA, EF, BE, DD]
self.assertEqual(st.vgpr[0][0], 0xDDBEEFAA, "Bytes 1-2 should be 0xBEEF")
def test_global_store_b16_cross_word_boundary(self):
"""GLOBAL_STORE_B16 at byte offset 3 crosses word boundary (byte 3 of word N, byte 0 of word N+1)."""
TEST_OFFSET = 256
instructions = [
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
s_waitcnt(lgkmcnt=0),
# Initialize two consecutive words
s_mov_b32(s[4], 0xDDCCBBAA),
v_mov_b32_e32(v[2], s[4]),
v_mov_b32_e32(v[0], 0),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
s_mov_b32(s[4], 0x44332211),
v_mov_b32_e32(v[2], s[4]),
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+4),
s_waitcnt(vmcnt=0),
# Store 0xBEEF at byte offset 3 (crosses word boundary)
# Low byte (0xEF) goes to byte 3 of first word
# High byte (0xBE) goes to byte 0 of second word
s_mov_b32(s[4], 0xBEEF),
v_mov_b32_e32(v[2], s[4]),
global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+3),
s_waitcnt(vmcnt=0),
# Load back both words
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[4], data=v[4], saddr=s[2:3], offset=TEST_OFFSET+4),
s_waitcnt(vmcnt=0),
v_mov_b32_e32(v[0], v[3]),
v_mov_b32_e32(v[1], v[4]),
s_mov_b32(s[2], 0),
s_mov_b32(s[3], 0),
]
st = run_program(instructions, n_lanes=1)
# First word: 0xDDCCBBAA -> 0xEFCCBBAA (byte 3 becomes 0xEF)
# Second word: 0x44332211 -> 0x443322BE (byte 0 becomes 0xBE)
self.assertEqual(st.vgpr[0][0], 0xEFCCBBAA, "Byte 3 of first word should be 0xEF")
self.assertEqual(st.vgpr[0][1], 0x443322BE, "Byte 0 of second word should be 0xBE")
def test_global_store_b64_basic(self):
"""GLOBAL_STORE_B64 stores 8 bytes from v[n:n+1] to memory."""
TEST_OFFSET = 256

View File

@@ -62,6 +62,28 @@ class TestBasicScalar(unittest.TestCase):
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.sgpr[1], 0x80000000)
def test_s_fmamk_f32(self):
"""S_FMAMK_F32: D = S0 * literal + S1."""
# 2.0 * 3.0 + 1.0 = 7.0
instructions = [
s_mov_b32(s[0], f2i(2.0)),
s_mov_b32(s[1], f2i(1.0)),
s_fmamk_f32(s[2], s[0], s[1], literal=f2i(3.0)),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.sgpr[2], f2i(7.0))
def test_s_fmamk_f32_negative(self):
"""S_FMAMK_F32 with negative values."""
# -2.0 * 4.0 + 10.0 = 2.0
instructions = [
s_mov_b32(s[0], f2i(-2.0)),
s_mov_b32(s[1], f2i(10.0)),
s_fmamk_f32(s[2], s[0], s[1], literal=f2i(4.0)),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.sgpr[2], f2i(2.0))
class TestQuadmaskWqm(unittest.TestCase):
"""Tests for S_QUADMASK_B32 and S_WQM_B32."""
@@ -298,6 +320,56 @@ class TestSignedArithmetic(unittest.TestCase):
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.sgpr[2], 2)
def test_s_mul_hi_u32_max(self):
"""S_MUL_HI_U32: 0xFFFFFFFF * 0xFFFFFFFF."""
instructions = [
s_mov_b32(s[0], 0xFFFFFFFF),
s_mov_b32(s[1], 0xFFFFFFFF),
s_mul_hi_u32(s[2], s[0], s[1]), # (0xFFFFFFFF * 0xFFFFFFFF) >> 32 = 0xFFFFFFFE
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.sgpr[2], 0xFFFFFFFE)
def test_s_mul_hi_i32_positive(self):
"""S_MUL_HI_I32: positive * positive."""
instructions = [
s_mov_b32(s[0], 0x40000000), # 2^30
s_mov_b32(s[1], 4),
s_mul_hi_i32(s[2], s[0], s[1]), # (2^30 * 4) >> 32 = 1
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.sgpr[2], 1)
def test_s_mul_hi_i32_neg_times_neg(self):
"""S_MUL_HI_I32: (-1) * (-1) = 1, high bits = 0."""
instructions = [
s_mov_b32(s[0], 0xFFFFFFFF), # -1
s_mov_b32(s[1], 0xFFFFFFFF), # -1
s_mul_hi_i32(s[2], s[0], s[1]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.sgpr[2], 0)
def test_s_mul_hi_i32_neg_times_pos(self):
"""S_MUL_HI_I32: (-1) * 2 = -2, high bits = -1 (sign extension)."""
instructions = [
s_mov_b32(s[0], 0xFFFFFFFF), # -1
s_mov_b32(s[1], 2),
s_mul_hi_i32(s[2], s[0], s[1]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.sgpr[2], 0xFFFFFFFF) # -1 sign extends
def test_s_mul_hi_i32_min_int(self):
"""S_MUL_HI_I32: MIN_INT * 2 = -2^32, high = -1."""
instructions = [
s_mov_b32(s[0], 0x80000000), # -2^31 (MIN_INT)
s_mov_b32(s[1], 2),
s_mul_hi_i32(s[2], s[0], s[1]), # (-2^31 * 2) >> 32 = -1
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.sgpr[2], 0xFFFFFFFF)
def test_s_mul_i32(self):
"""S_MUL_I32: signed multiply low 32 bits."""
instructions = [
@@ -329,6 +401,176 @@ class TestSignedArithmetic(unittest.TestCase):
self.assertEqual(st.sgpr[7], ((dividend * 2) + 1) & 0xFFFFFFFF)
class TestBitSet(unittest.TestCase):
  """Tests for S_BITSET0_B32 and S_BITSET1_B32 instructions.

  S_BITSET1_B32 sets (and S_BITSET0_B32 clears) a single bit of the destination
  SGPR; the bit index comes from the source operand, taken modulo 32 (see the
  low-5-bits test below).
  """
  def test_s_bitset1_b32_set_bit0(self):
    """S_BITSET1_B32: set bit 0 in destination."""
    instructions = [
      s_mov_b32(s[0], 0),  # start with 0
      s_mov_b32(s[1], 0),  # bit position = 0
      s_bitset1_b32(s[0], s[1]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.sgpr[0], 1, "Bit 0 should be set")
  def test_s_bitset1_b32_set_bit31(self):
    """S_BITSET1_B32: set bit 31 in destination."""
    instructions = [
      s_mov_b32(s[0], 0),  # start with 0
      s_mov_b32(s[1], 31),  # bit position = 31
      s_bitset1_b32(s[0], s[1]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.sgpr[0], 0x80000000, "Bit 31 should be set")
  def test_s_bitset1_b32_preserves_other_bits(self):
    """S_BITSET1_B32: preserves bits not being set."""
    instructions = [
      s_mov_b32(s[0], 0xFF00FF00),  # existing pattern
      s_mov_b32(s[1], 0),  # bit position = 0
      s_bitset1_b32(s[0], s[1]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.sgpr[0], 0xFF00FF01, "Should set bit 0 while preserving others")
  def test_s_bitset0_b32_clear_bit0(self):
    """S_BITSET0_B32: clear bit 0 in destination."""
    instructions = [
      s_mov_b32(s[0], 0xFFFFFFFF),  # start with all bits set
      s_mov_b32(s[1], 0),  # bit position = 0
      s_bitset0_b32(s[0], s[1]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.sgpr[0], 0xFFFFFFFE, "Bit 0 should be cleared")
  def test_s_bitset0_b32_clear_bit31(self):
    """S_BITSET0_B32: clear bit 31 in destination."""
    instructions = [
      s_mov_b32(s[0], 0xFFFFFFFF),  # start with all bits set
      s_mov_b32(s[1], 31),  # bit position = 31
      s_bitset0_b32(s[0], s[1]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.sgpr[0], 0x7FFFFFFF, "Bit 31 should be cleared")
  def test_s_bitset1_b32_uses_low5_bits(self):
    """S_BITSET1_B32: only uses low 5 bits of position (mod 32)."""
    instructions = [
      s_mov_b32(s[0], 0),
      s_mov_b32(s[1], 32 + 5),  # position = 37, but mod 32 = 5
      s_bitset1_b32(s[0], s[1]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.sgpr[0], 0x20, "Bit 5 should be set (37 mod 32 = 5)")
class TestBfeI64(unittest.TestCase):
"""Tests for S_BFE_I64 - 64-bit bit field extract with sign extension.
Regression tests for sign extension bug where 32-bit masks were incorrectly
used for 64-bit operations, causing the high 32 bits to not be sign-extended.
"""
def test_s_bfe_i64_positive_no_sign_extend(self):
"""S_BFE_I64: positive value (1) in 16 bits should not sign extend."""
# S1 encodes: [22:16] = width, [5:0] = offset
# width=16, offset=0 -> S1 = (16 << 16) | 0 = 0x100000
instructions = [
s_mov_b32(s[0], 1), # S0 lo = 1
s_mov_b32(s[1], 0), # S0 hi = 0
s_mov_b32(s[2], 0x100000), # width=16, offset=0
s_bfe_i64(s[4:5], s[0:1], s[2]),
v_mov_b32_e32(v[0], s[4]),
v_mov_b32_e32(v[1], s[5]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][0], 1, "lo should be 1")
self.assertEqual(st.vgpr[0][1], 0, "hi should be 0 (no sign extend)")
def test_s_bfe_i64_negative_sign_extend(self):
"""S_BFE_I64: 0xFFFF (-1 in 16 bits) should sign extend to 64 bits.
This is the main regression test - before the fix, hi was 0 instead of 0xFFFFFFFF.
"""
instructions = [
s_mov_b32(s[0], 0xFFFF), # S0 lo = -1 in 16 bits
s_mov_b32(s[1], 0), # S0 hi = 0
s_mov_b32(s[2], 0x100000), # width=16, offset=0
s_bfe_i64(s[4:5], s[0:1], s[2]),
v_mov_b32_e32(v[0], s[4]),
v_mov_b32_e32(v[1], s[5]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][0], 0xFFFFFFFF, "lo should be 0xFFFFFFFF")
self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)")
def test_s_bfe_i64_8bit_negative_sign_extend(self):
"""S_BFE_I64: 0xFF (-1 in 8 bits) should sign extend to 64 bits."""
# width=8, offset=0 -> S1 = (8 << 16) | 0 = 0x80000
instructions = [
s_mov_b32(s[0], 0xFF), # S0 lo = -1 in 8 bits
s_mov_b32(s[1], 0), # S0 hi = 0
s_mov_b32(s[2], 0x80000), # width=8, offset=0
s_bfe_i64(s[4:5], s[0:1], s[2]),
v_mov_b32_e32(v[0], s[4]),
v_mov_b32_e32(v[1], s[5]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][0], 0xFFFFFFFF, "lo should be 0xFFFFFFFF")
self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)")
def test_s_bfe_i64_8bit_positive(self):
"""S_BFE_I64: 0x7F (127 in 8 bits) should not sign extend."""
# width=8, offset=0 -> S1 = (8 << 16) | 0 = 0x80000
instructions = [
s_mov_b32(s[0], 0x7F), # S0 lo = 127 in 8 bits (MSB=0)
s_mov_b32(s[1], 0), # S0 hi = 0
s_mov_b32(s[2], 0x80000), # width=8, offset=0
s_bfe_i64(s[4:5], s[0:1], s[2]),
v_mov_b32_e32(v[0], s[4]),
v_mov_b32_e32(v[1], s[5]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][0], 0x7F, "lo should be 0x7F")
self.assertEqual(st.vgpr[0][1], 0, "hi should be 0 (no sign extend)")
def test_s_bfe_i64_with_offset(self):
"""S_BFE_I64: extract from non-zero bit offset with sign extension."""
# Extract 16 bits starting at bit 8: value 0xFF00 >> 8 = 0xFF = -1 in 8 bits? No wait...
# Let's put 0x8000FF00: extract 16 bits at offset 8 = 0x00FF (positive)
# Put 0xFF00_0000: extract 16 bits at offset 16 = 0xFF00 = -256 in signed 16-bit
instructions = [
s_mov_b32(s[0], 0xFF000000), # bits [31:24] = 0xFF, [23:16] = 0x00
s_mov_b32(s[1], 0),
# width=16, offset=16 -> S1 = (16 << 16) | 16 = 0x100010
s_mov_b32(s[2], 0x100010),
s_bfe_i64(s[4:5], s[0:1], s[2]),
v_mov_b32_e32(v[0], s[4]),
v_mov_b32_e32(v[1], s[5]),
]
st = run_program(instructions, n_lanes=1)
# Extract bits [31:16] = 0xFF00, sign bit is bit 15 of extracted = bit 31 of original = 1
# So result should be sign-extended 0xFF00 -> 0xFFFFFF00 in lo, 0xFFFFFFFF in hi
self.assertEqual(st.vgpr[0][0], 0xFFFFFF00, "lo should be sign-extended 0xFF00")
self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)")
def test_s_bfe_i64_32bit_negative(self):
"""S_BFE_I64: extract 32 bits with sign extension."""
# width=32, offset=0 -> S1 = (32 << 16) | 0 = 0x200000
instructions = [
s_mov_b32(s[0], 0x80000000), # MIN_INT32 = -2^31
s_mov_b32(s[1], 0),
s_mov_b32(s[2], 0x200000), # width=32, offset=0
s_bfe_i64(s[4:5], s[0:1], s[2]),
v_mov_b32_e32(v[0], s[4]),
v_mov_b32_e32(v[1], s[5]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][0], 0x80000000, "lo should be 0x80000000")
self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)")
class Test64BitCompare(unittest.TestCase):
"""Tests for 64-bit scalar compare instructions."""

View File

@@ -255,7 +255,7 @@ class TestF16Conversions(unittest.TestCase):
def test_v_cvt_f16_f32_small(self):
"""V_CVT_F16_F32 converts small f32 value."""
from extra.assembly.amd.pcode import f32_to_f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16
instructions = [
v_mov_b32_e32(v[0], 0.5),
v_cvt_f16_f32_e32(v[1], v[0]),
@@ -293,7 +293,7 @@ class TestF16Conversions(unittest.TestCase):
def test_v_cvt_f16_f32_reads_full_32bit_source(self):
"""V_CVT_F16_F32 must read full 32-bit f32 source."""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
instructions = [
s_mov_b32(s[0], 0x3fc00000), # f32 1.5
v_mov_b32_e32(v[0], s[0]),
@@ -348,6 +348,142 @@ class TestF16Conversions(unittest.TestCase):
self.assertEqual(result, 1, f"Expected 1 from high bits, got {result}")
class TestF64Conversions(unittest.TestCase):
"""Tests for f64 conversion instructions. Regression tests for f32_to_f64/f64_to_f32."""
def test_v_cvt_f64_f32_one(self):
"""V_CVT_F64_F32 converts f32 1.0 to f64."""
instructions = [
s_mov_b32(s[0], f2i(1.0)),
v_mov_b32_e32(v[0], s[0]),
v_cvt_f64_f32_e32(v[2:3], v[0]),
]
st = run_program(instructions, n_lanes=1)
result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2])
self.assertAlmostEqual(result, 1.0, places=10)
def test_v_cvt_f64_f32_negative(self):
"""V_CVT_F64_F32 converts f32 -2.5 to f64."""
instructions = [
s_mov_b32(s[0], f2i(-2.5)),
v_mov_b32_e32(v[0], s[0]),
v_cvt_f64_f32_e32(v[2:3], v[0]),
]
st = run_program(instructions, n_lanes=1)
result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2])
self.assertAlmostEqual(result, -2.5, places=10)
def test_v_cvt_f64_f32_pi(self):
"""V_CVT_F64_F32 converts f32 pi to f64."""
import math
instructions = [
s_mov_b32(s[0], f2i(3.14159265)),
v_mov_b32_e32(v[0], s[0]),
v_cvt_f64_f32_e32(v[2:3], v[0]),
]
st = run_program(instructions, n_lanes=1)
result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2])
self.assertAlmostEqual(result, 3.14159265, places=5)
def test_v_cvt_f64_f32_zero(self):
"""V_CVT_F64_F32 converts f32 0.0 to f64."""
instructions = [
v_mov_b32_e32(v[0], 0),
v_cvt_f64_f32_e32(v[2:3], v[0]),
]
st = run_program(instructions, n_lanes=1)
result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2])
self.assertEqual(result, 0.0)
def test_v_cvt_f32_f64_one(self):
"""V_CVT_F32_F64 converts f64 1.0 to f32."""
f64_bits = f2i64(1.0)
lo, hi = f64_bits & 0xFFFFFFFF, (f64_bits >> 32) & 0xFFFFFFFF
instructions = [
s_mov_b32(s[0], lo),
s_mov_b32(s[1], hi),
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_cvt_f32_f64_e32(v[2], v[0:1]),
]
st = run_program(instructions, n_lanes=1)
result = i2f(st.vgpr[0][2])
self.assertAlmostEqual(result, 1.0, places=5)
def test_v_cvt_f32_f64_negative(self):
"""V_CVT_F32_F64 converts f64 -3.5 to f32."""
f64_bits = f2i64(-3.5)
lo, hi = f64_bits & 0xFFFFFFFF, (f64_bits >> 32) & 0xFFFFFFFF
instructions = [
s_mov_b32(s[0], lo),
s_mov_b32(s[1], hi),
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_cvt_f32_f64_e32(v[2], v[0:1]),
]
st = run_program(instructions, n_lanes=1)
result = i2f(st.vgpr[0][2])
self.assertAlmostEqual(result, -3.5, places=5)
def test_v_cvt_f32_f64_large(self):
"""V_CVT_F32_F64 converts large f64 to f32."""
f64_bits = f2i64(123456.789)
lo, hi = f64_bits & 0xFFFFFFFF, (f64_bits >> 32) & 0xFFFFFFFF
instructions = [
s_mov_b32(s[0], lo),
s_mov_b32(s[1], hi),
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_cvt_f32_f64_e32(v[2], v[0:1]),
]
st = run_program(instructions, n_lanes=1)
result = i2f(st.vgpr[0][2])
self.assertAlmostEqual(result, 123456.789, places=0)
def test_v_cvt_f64_i32_positive(self):
"""V_CVT_F64_I32 converts positive i32 to f64."""
instructions = [
s_mov_b32(s[0], 42),
v_mov_b32_e32(v[0], s[0]),
v_cvt_f64_i32_e32(v[2:3], v[0]),
]
st = run_program(instructions, n_lanes=1)
result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2])
self.assertAlmostEqual(result, 42.0, places=10)
def test_v_cvt_f64_i32_negative(self):
"""V_CVT_F64_I32 converts negative i32 to f64."""
instructions = [
s_mov_b32(s[0], 0xFFFFFFFF), # -1 as i32
v_mov_b32_e32(v[0], s[0]),
v_cvt_f64_i32_e32(v[2:3], v[0]),
]
st = run_program(instructions, n_lanes=1)
result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2])
self.assertAlmostEqual(result, -1.0, places=10)
def test_v_cvt_f64_u32_large(self):
"""V_CVT_F64_U32 converts large u32 to f64."""
instructions = [
s_mov_b32(s[0], 0xFFFFFFFF), # max u32
v_mov_b32_e32(v[0], s[0]),
v_cvt_f64_u32_e32(v[2:3], v[0]),
]
st = run_program(instructions, n_lanes=1)
result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2])
self.assertAlmostEqual(result, 4294967295.0, places=0)
def test_v_cvt_f64_u32_zero(self):
"""V_CVT_F64_U32 converts 0 to f64."""
instructions = [
v_mov_b32_e32(v[0], 0),
v_cvt_f64_u32_e32(v[2:3], v[0]),
]
st = run_program(instructions, n_lanes=1)
result = i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2])
self.assertEqual(result, 0.0)
class TestClz(unittest.TestCase):
"""Tests for V_CLZ_I32_U32 - count leading zeros."""
@@ -560,7 +696,7 @@ class TestCvtF16Modifiers(unittest.TestCase):
def test_v_cvt_f32_f16_abs_negative(self):
"""V_CVT_F32_F16 with |abs| on negative value."""
from extra.assembly.amd.pcode import f32_to_f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16
f16_neg1 = f32_to_f16(-1.0) # 0xbc00
instructions = [
s_mov_b32(s[0], f16_neg1),
@@ -573,7 +709,7 @@ class TestCvtF16Modifiers(unittest.TestCase):
def test_v_cvt_f32_f16_abs_positive(self):
"""V_CVT_F32_F16 with |abs| on positive value (should stay positive)."""
from extra.assembly.amd.pcode import f32_to_f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16
f16_2 = f32_to_f16(2.0) # 0x4000
instructions = [
s_mov_b32(s[0], f16_2),
@@ -586,7 +722,7 @@ class TestCvtF16Modifiers(unittest.TestCase):
def test_v_cvt_f32_f16_neg_positive(self):
"""V_CVT_F32_F16 with neg on positive value."""
from extra.assembly.amd.pcode import f32_to_f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16
f16_2 = f32_to_f16(2.0) # 0x4000
instructions = [
s_mov_b32(s[0], f16_2),
@@ -599,7 +735,7 @@ class TestCvtF16Modifiers(unittest.TestCase):
def test_v_cvt_f32_f16_neg_negative(self):
"""V_CVT_F32_F16 with neg on negative value (double negative)."""
from extra.assembly.amd.pcode import f32_to_f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16
f16_neg2 = f32_to_f16(-2.0) # 0xc000
instructions = [
s_mov_b32(s[0], f16_neg2),
@@ -612,7 +748,7 @@ class TestCvtF16Modifiers(unittest.TestCase):
def test_v_cvt_f16_f32_then_pack_for_wmma(self):
"""CVT F32->F16 followed by pack (common WMMA pattern)."""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
f32_val = 3.5
instructions = [
s_mov_b32(s[0], f2i(f32_val)),
@@ -668,7 +804,7 @@ class TestConversionRounding(unittest.TestCase):
def test_f16_to_f32_precision(self):
"""F16 to F32 conversion precision."""
from extra.assembly.amd.pcode import f32_to_f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16
f16_val = f32_to_f16(1.5)
instructions = [
s_mov_b32(s[0], f16_val),
@@ -680,7 +816,7 @@ class TestConversionRounding(unittest.TestCase):
def test_f16_denormal_to_f32(self):
"""F16 denormal converts to small positive f32."""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
f16_denorm = 0x0001 # Smallest positive f16 denormal
instructions = [
v_mov_b32_e32(v[0], f16_denorm),
@@ -1238,5 +1374,143 @@ class TestFloorEdgeCases(unittest.TestCase):
self.assertAlmostEqual(i2f(st.vgpr[0][1]), -1.0, places=5)
class TestVop1F16HiHalf(unittest.TestCase):
  """Regression tests for VOP1 f16 hi-half source operand handling.
  For 16-bit VOP1 operations, when src0 is in the range v[128]+ (offset >= 384),
  the hardware reads from the high 16 bits of v[src0-128]. The emulator must
  extract bits [31:16] from the actual VGPR.
  """
  # NOTE: in the 16-bit encodings below, v[128+k] aliases the hi half of v[k];
  # one lane is enough since the hi/lo split is per-register, not per-lane.
  def test_v_cvt_f32_f16_src_hi_half(self):
    """V_CVT_F32_F16 with source from hi-half (v[128]+).
    When src0 >= v[128], it reads from the high 16 bits of v[src0-128].
    This is critical for global_load_d16_hi_b16 + v_cvt_f32_f16 patterns.
    Regression test for: VOP1 f16 src0 hi-half extraction bug.
    """
    instructions = [
      # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0)
      s_mov_b32(s[0], 0x40003c00),
      v_mov_b32_e32(v[0], s[0]),
      # v_cvt_f32_f16 v[1], v[128] (reads hi half of v[0])
      # Should convert f16(2.0) to f32(2.0)
      v_cvt_f32_f16_e32(v[1], v[128]),
    ]
    st = run_program(instructions, n_lanes=1)
    result = i2f(st.vgpr[0][1])
    self.assertAlmostEqual(result, 2.0, places=5, msg=f"Expected f32(2.0), got {result}")
  def test_v_cvt_f32_f16_src_lo_vs_hi(self):
    """V_CVT_F32_F16 comparing lo and hi half reads.
    v[0] has different values in lo and hi halves.
    v_cvt_f32_f16 v[1], v[0] should read lo (1.0)
    v_cvt_f32_f16 v[2], v[128] should read hi (2.0)
    Regression test for: VOP1 f16 src0 hi-half extraction bug.
    """
    instructions = [
      # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0)
      s_mov_b32(s[0], 0x40003c00),
      v_mov_b32_e32(v[0], s[0]),
      # Read from lo half
      v_cvt_f32_f16_e32(v[1], v[0]),
      # Read from hi half
      v_cvt_f32_f16_e32(v[2], v[128]),
    ]
    st = run_program(instructions, n_lanes=1)
    result_lo = i2f(st.vgpr[0][1])
    result_hi = i2f(st.vgpr[0][2])
    self.assertAlmostEqual(result_lo, 1.0, places=5, msg=f"Expected f32(1.0) from lo, got {result_lo}")
    self.assertAlmostEqual(result_hi, 2.0, places=5, msg=f"Expected f32(2.0) from hi, got {result_hi}")
  def test_v_cvt_i16_f16_src_hi_half(self):
    """V_CVT_I16_F16 with source from hi-half.
    Regression test for: VOP1 f16 src0 hi-half extraction bug.
    """
    instructions = [
      # v[0] = 0xc000_3c00: hi=f16(-2.0), lo=f16(1.0)
      s_mov_b32(s[0], 0xc0003c00),
      v_mov_b32_e32(v[0], s[0]),
      # v_cvt_i16_f16 v[1], v[128] (reads hi half of v[0])
      # Should convert f16(-2.0) to i16(-2)
      v_cvt_i16_f16_e32(v[1], v[128]),
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][1] & 0xffff
    # compare in two's-complement form since the register holds raw bits
    expected = (-2) & 0xffff
    self.assertEqual(result, expected, f"Expected i16(-2)=0x{expected:04x}, got 0x{result:04x}")
  def test_v_mov_b16_src_hi_half(self):
    """V_MOV_B16 with source from hi-half.
    Regression test for: VOP1 f16 src0 hi-half extraction bug.
    """
    instructions = [
      # v[0] = 0xBEEF_DEAD: hi=0xBEEF, lo=0xDEAD
      s_mov_b32(s[0], 0xBEEFDEAD),
      v_mov_b32_e32(v[0], s[0]),
      # v[1] = 0x0000_0000 initially
      v_mov_b32_e32(v[1], 0),
      # v_mov_b16 v[1], v[128] (reads hi half of v[0])
      # Should move 0xBEEF to v[1].lo
      v_mov_b16_e32(v[1], v[128]),
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][1] & 0xffff
    self.assertEqual(result, 0xBEEF, f"Expected 0xBEEF from hi half, got 0x{result:04x}")
class TestReciprocalF16(unittest.TestCase):
"""Tests for V_RCP_F16 - reciprocal in half precision.
The pcode uses a 16-bit float literal: D0.f16 = 16'1.0 / S0.f16
This tests that the sized float literal (16'1.0) is correctly parsed.
"""
def test_v_rcp_f16_one(self):
"""V_RCP_F16: 1/1.0 = 1.0"""
import struct
def f16_to_bits(f): return struct.unpack('<H', struct.pack('<e', f))[0]
def bits_to_f16(b): return struct.unpack('<e', struct.pack('<H', b))[0]
instructions = [
# Load f16 1.0 into low 16 bits of v[0]
v_mov_b32_e32(v[0], f16_to_bits(1.0)),
v_rcp_f16_e32(v[1], v[0]),
]
st = run_program(instructions, n_lanes=1)
result = bits_to_f16(st.vgpr[0][1] & 0xFFFF)
self.assertAlmostEqual(result, 1.0, places=2, msg="1/1.0 should be 1.0")
def test_v_rcp_f16_two(self):
"""V_RCP_F16: 1/2.0 = 0.5"""
import struct
def f16_to_bits(f): return struct.unpack('<H', struct.pack('<e', f))[0]
def bits_to_f16(b): return struct.unpack('<e', struct.pack('<H', b))[0]
instructions = [
v_mov_b32_e32(v[0], f16_to_bits(2.0)),
v_rcp_f16_e32(v[1], v[0]),
]
st = run_program(instructions, n_lanes=1)
result = bits_to_f16(st.vgpr[0][1] & 0xFFFF)
self.assertAlmostEqual(result, 0.5, places=2, msg="1/2.0 should be 0.5")
def test_v_rcp_f16_four(self):
"""V_RCP_F16: 1/4.0 = 0.25"""
import struct
def f16_to_bits(f): return struct.unpack('<H', struct.pack('<e', f))[0]
def bits_to_f16(b): return struct.unpack('<e', struct.pack('<H', b))[0]
instructions = [
v_mov_b32_e32(v[0], f16_to_bits(4.0)),
v_rcp_f16_e32(v[1], v[0]),
]
st = run_program(instructions, n_lanes=1)
result = bits_to_f16(st.vgpr[0][1] & 0xFFFF)
self.assertAlmostEqual(result, 0.25, places=2, msg="1/4.0 should be 0.25")
if __name__ == '__main__':
  unittest.main()  # run every test in this file when executed directly

View File

@@ -341,6 +341,293 @@ class TestHiHalfOps(unittest.TestCase):
self.assertEqual(result, 0x4200, f"Lane {lane}: expected 0x4200, got 0x{result:04x}")
class TestVop2F16HiHalf(unittest.TestCase):
  """Regression tests for VOP2 f16 hi-half operand handling.
  These test the bugs where:
  1. VOP2 vsrc1 >= 384 (v[128]+) wasn't extracting hi 16 bits
  2. VOP2 vdst >= 384 (v[128]+) wasn't preserving lo 16 bits
  """
  # NOTE: in the 16-bit encodings below, v[128+k] aliases the hi half of v[k].
  def test_v_add_f16_e32_vsrc1_hi_half(self):
    """V_ADD_F16_E32 with vsrc1 from hi-half (v[128]+).
    When vsrc1 >= 384 (representing v[128]+), the hardware reads from the hi 16 bits
    of v[vsrc1-128]. The emulator must extract bits [31:16] from the actual VGPR.
    Regression test for: VOP2 f16 vsrc1 hi-half extraction bug.
    """
    instructions = [
      # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0)
      s_mov_b32(s[0], 0x40003c00),
      v_mov_b32_e32(v[0], s[0]),
      # v_add_f16_e32 v[1], v[0], v[128] (vsrc1=v[128] reads hi of v[0])
      # In VOP2 encoding, vsrc1=384 means v[128], which maps to v[0].hi
      # v[1] = v[0].lo + v[0].hi = 1.0 + 2.0 = 3.0
      VOP2(VOP2Op.V_ADD_F16, vdst=v[1], src0=v[0], vsrc1=v[128]),
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][1] & 0xffff
    # 1.0 + 2.0 = 3.0, f16 3.0 = 0x4200
    self.assertEqual(result, 0x4200, f"Expected f16(3.0)=0x4200, got 0x{result:04x}")
  def test_v_mul_f16_e32_vsrc1_hi_half(self):
    """V_MUL_F16_E32 with vsrc1 from hi-half.
    Regression test for: VOP2 f16 vsrc1 hi-half extraction bug.
    """
    instructions = [
      # v[0] = 0x4200_4000: hi=f16(3.0), lo=f16(2.0)
      s_mov_b32(s[0], 0x42004000),
      v_mov_b32_e32(v[0], s[0]),
      # v_mul_f16_e32 v[1], v[0], v[128] (vsrc1=v[128] reads hi of v[0])
      # v[1] = v[0].lo * v[0].hi = 2.0 * 3.0 = 6.0
      VOP2(VOP2Op.V_MUL_F16, vdst=v[1], src0=v[0], vsrc1=v[128]),
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][1] & 0xffff
    # 2.0 * 3.0 = 6.0, f16 6.0 = 0x4600
    self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}")
  def test_v_add_f16_e32_vdst_hi_half(self):
    """V_ADD_F16_E32 writing to hi-half destination (v[128]+).
    When vdst >= 384 (representing v[128]+), the hardware writes to bits [31:16]
    of v[vdst-128] while preserving bits [15:0]. The emulator must merge the result.
    Regression test for: VOP2 f16 vdst hi-half write bug.
    """
    instructions = [
      # v[0] = 0x0000_BEEF: lo has marker value
      s_mov_b32(s[0], 0x0000BEEF),
      v_mov_b32_e32(v[0], s[0]),
      # v[1] = f16(1.0), v[2] = f16(2.0)
      s_mov_b32(s[1], 0x3c00),
      s_mov_b32(s[2], 0x4000),
      v_mov_b32_e32(v[1], s[1]),
      v_mov_b32_e32(v[2], s[2]),
      # v_add_f16_e32 v[128], v[1], v[2] (vdst=v[128] writes hi of v[0])
      # v[0].hi = 1.0 + 2.0 = 3.0, v[0].lo preserved = 0xBEEF
      VOP2(VOP2Op.V_ADD_F16, vdst=v[128], src0=v[1], vsrc1=v[2]),
    ]
    st = run_program(instructions, n_lanes=1)
    hi = (st.vgpr[0][0] >> 16) & 0xffff
    lo = st.vgpr[0][0] & 0xffff
    # hi = 3.0 = 0x4200, lo preserved = 0xBEEF
    self.assertEqual(hi, 0x4200, f"Expected hi=f16(3.0)=0x4200, got 0x{hi:04x}")
    self.assertEqual(lo, 0xBEEF, f"Expected lo preserved=0xBEEF, got 0x{lo:04x}")
  def test_v_mul_f16_e32_vdst_hi_half(self):
    """V_MUL_F16_E32 writing to hi-half destination.
    Regression test for: VOP2 f16 vdst hi-half write bug.
    """
    instructions = [
      # v[0] = 0x0000_DEAD: lo has marker value
      s_mov_b32(s[0], 0x0000DEAD),
      v_mov_b32_e32(v[0], s[0]),
      # v[1] = f16(2.0), v[2] = f16(4.0)
      s_mov_b32(s[1], 0x4000),
      s_mov_b32(s[2], 0x4400),
      v_mov_b32_e32(v[1], s[1]),
      v_mov_b32_e32(v[2], s[2]),
      # v_mul_f16_e32 v[128], v[1], v[2] (vdst=v[128] writes hi of v[0])
      # v[0].hi = 2.0 * 4.0 = 8.0, v[0].lo preserved = 0xDEAD
      VOP2(VOP2Op.V_MUL_F16, vdst=v[128], src0=v[1], vsrc1=v[2]),
    ]
    st = run_program(instructions, n_lanes=1)
    hi = (st.vgpr[0][0] >> 16) & 0xffff
    lo = st.vgpr[0][0] & 0xffff
    # hi = 8.0 = 0x4800, lo preserved = 0xDEAD
    self.assertEqual(hi, 0x4800, f"Expected hi=f16(8.0)=0x4800, got 0x{hi:04x}")
    self.assertEqual(lo, 0xDEAD, f"Expected lo preserved=0xDEAD, got 0x{lo:04x}")
  def test_v_add_f16_e32_both_hi_half(self):
    """V_ADD_F16_E32 with both vsrc1 and vdst as hi-half (different underlying regs).
    Tests the combination of both fixes: reading vsrc1 from hi-half AND
    writing result to hi-half destination, using different underlying VGPRs.
    Regression test for: VOP2 f16 hi-half bugs (combined).
    """
    instructions = [
      # v[0] = 0x4000_xxxx: hi=f16(2.0) for vsrc1
      s_mov_b32(s[0], 0x40000000),
      v_mov_b32_e32(v[0], s[0]),
      # v[1] = 0x0000_3c00: lo=f16(1.0) for src0
      s_mov_b32(s[1], 0x00003c00),
      v_mov_b32_e32(v[1], s[1]),
      # v[2] = 0x0000_CAFE: lo=marker for vdst preservation
      s_mov_b32(s[2], 0x0000CAFE),
      v_mov_b32_e32(v[2], s[2]),
      # v_add_f16_e32 v[130], v[1], v[128]
      # src0 = v[1].lo = 1.0
      # vsrc1 = v[128] reads v[0].hi = 2.0
      # result = 1.0 + 2.0 = 3.0
      # vdst = v[130] writes to v[2].hi, preserving v[2].lo
      VOP2(VOP2Op.V_ADD_F16, vdst=v[130], src0=v[1], vsrc1=v[128]),
    ]
    st = run_program(instructions, n_lanes=1)
    hi = (st.vgpr[0][2] >> 16) & 0xffff
    lo = st.vgpr[0][2] & 0xffff
    # hi = 3.0 = 0x4200, lo preserved = 0xCAFE
    self.assertEqual(hi, 0x4200, f"Expected hi=f16(3.0)=0x4200, got 0x{hi:04x}")
    self.assertEqual(lo, 0xCAFE, f"Expected lo preserved=0xCAFE, got 0x{lo:04x}")
  def test_v_fmac_f16_e32_vsrc1_hi_half(self):
    """V_FMAC_F16_E32 with vsrc1 from hi-half.
    V_FMAC_F16: vdst = vdst + src0 * vsrc1
    Regression test for: VOP2 f16 vsrc1 hi-half extraction bug.
    """
    instructions = [
      # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0)
      s_mov_b32(s[0], 0x40003c00),
      v_mov_b32_e32(v[0], s[0]),
      # v[1] = f16(3.0) = 0x4200
      s_mov_b32(s[1], 0x4200),
      v_mov_b32_e32(v[1], s[1]),
      # v_fmac_f16_e32 v[1], v[0], v[128]
      # vdst = v[1] = 3.0 + v[0].lo * v[0].hi = 3.0 + 1.0 * 2.0 = 5.0
      VOP2(VOP2Op.V_FMAC_F16, vdst=v[1], src0=v[0], vsrc1=v[128]),
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][1] & 0xffff
    # 3.0 + 1.0 * 2.0 = 5.0, f16 5.0 = 0x4500
    self.assertEqual(result, 0x4500, f"Expected f16(5.0)=0x4500, got 0x{result:04x}")
  def test_v_fmac_f16_e32_vdst_hi_half(self):
    """V_FMAC_F16_E32 writing to hi-half destination.
    V_FMAC_F16: vdst.h = vdst.h + src0 * vsrc1
    When vdst is v[128]+, the accumulator D0 must also read from the hi-half.
    This tests the bug where D0 was read from lo-half instead of hi-half.
    Regression test for: VOP2 FMAC hi-half D0 accumulator read bug.
    """
    instructions = [
      # v[0] = 0x3800_DEAD: hi=f16(0.5), lo=marker (0xDEAD)
      s_mov_b32(s[0], 0x3800DEAD),
      v_mov_b32_e32(v[0], s[0]),
      # v[1] = f16(2.0) = 0x4000
      s_mov_b32(s[1], 0x4000),
      v_mov_b32_e32(v[1], s[1]),
      # v[2] = f16(3.0) = 0x4200
      s_mov_b32(s[2], 0x4200),
      v_mov_b32_e32(v[2], s[2]),
      # v_fmac_f16_e32 v[128], v[1], v[2]
      # vdst = v[128] means v[0].hi
      # D0 = v[0].hi = 0.5
      # result = D0 + src0 * vsrc1 = 0.5 + 2.0 * 3.0 = 6.5
      # v[0].hi = 6.5, v[0].lo preserved = 0xDEAD
      VOP2(VOP2Op.V_FMAC_F16, vdst=v[128], src0=v[1], vsrc1=v[2]),
    ]
    st = run_program(instructions, n_lanes=1)
    hi = (st.vgpr[0][0] >> 16) & 0xffff
    lo = st.vgpr[0][0] & 0xffff
    # hi = 6.5 = 0x4680, lo preserved = 0xDEAD
    self.assertEqual(hi, 0x4680, f"Expected hi=f16(6.5)=0x4680, got 0x{hi:04x}")
    self.assertEqual(lo, 0xDEAD, f"Expected lo preserved=0xDEAD, got 0x{lo:04x}")
  def test_v_mul_f16_e32_src0_hi_half(self):
    """V_MUL_F16_E32 with src0 from hi-half (src0 >= v[128]).
    When src0 >= 384 (representing v[128]+), the hardware reads from the hi 16 bits
    of v[src0-128]. The emulator must extract bits [31:16] from the actual VGPR.
    Regression test for: VOP2 f16 src0 hi-half extraction bug.
    """
    instructions = [
      # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0)
      s_mov_b32(s[0], 0x40003c00),
      v_mov_b32_e32(v[0], s[0]),
      # v[1] = f16(3.0) = 0x4200
      s_mov_b32(s[1], 0x4200),
      v_mov_b32_e32(v[1], s[1]),
      # v_mul_f16_e32 v[2], v[128], v[1]
      # src0 = v[128] reads from v[0].hi = 2.0
      # result = 2.0 * 3.0 = 6.0
      VOP2(VOP2Op.V_MUL_F16, vdst=v[2], src0=v[128], vsrc1=v[1]),
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][2] & 0xffff
    # 2.0 * 3.0 = 6.0, f16 6.0 = 0x4600
    self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}")
  def test_v_add_f16_e32_src0_hi_half(self):
    """V_ADD_F16_E32 with src0 from hi-half (src0 >= v[128]).
    Regression test for: VOP2 f16 src0 hi-half extraction bug.
    """
    instructions = [
      # v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0)
      s_mov_b32(s[0], 0x40003c00),
      v_mov_b32_e32(v[0], s[0]),
      # v[1] = f16(5.0) = 0x4500
      s_mov_b32(s[1], 0x4500),
      v_mov_b32_e32(v[1], s[1]),
      # v_add_f16_e32 v[2], v[128], v[1]
      # src0 = v[128] reads from v[0].hi = 2.0
      # result = 2.0 + 5.0 = 7.0
      VOP2(VOP2Op.V_ADD_F16, vdst=v[2], src0=v[128], vsrc1=v[1]),
    ]
    st = run_program(instructions, n_lanes=1)
    result = st.vgpr[0][2] & 0xffff
    # 2.0 + 5.0 = 7.0, f16 7.0 = 0x4700
    self.assertEqual(result, 0x4700, f"Expected f16(7.0)=0x4700, got 0x{result:04x}")
class TestF16InlineConstants(unittest.TestCase):
  """Regression tests for VOP2 F16 inline float constants.
  A 16-bit VOP2 op (v_add_f16, v_mul_f16, ...) must resolve an inline float
  like 1.0 or 2.0 to its F16 bit pattern (0x3c00, 0x4000), never the F32
  pattern (0x3f800000). The emulator's rsrc() needs bits=16 to select the
  F16_INLINE constants; these tests pin that selection.
  Regression test for: VOP2 16-bit inline constant using F32 instead of F16.
  """
  def _run_lo16(self, program):
    """Execute `program` on a single lane and return the low 16 bits of v[1]."""
    st = run_program(program, n_lanes=1)
    return st.vgpr[0][1] & 0xFFFF

  def test_v_add_f16_inline_constant_1_0(self):
    """V_ADD_F16_E32 with inline constant 1.0 should use F16 encoding."""
    result = self._run_lo16([
      s_mov_b32(s[0], 0x3c00),  # f16 1.0
      v_mov_b32_e32(v[0], s[0]),
      # v_add_f16_e32 v[1], 1.0, v[0] -- 1.0 must be F16 0x3c00, not F32 0x3f800000
      v_add_f16_e32(v[1], 1.0, v[0]),
    ])
    # 1.0 + 1.0 = 2.0, f16 2.0 = 0x4000
    self.assertEqual(result, 0x4000, f"Expected f16(2.0)=0x4000, got 0x{result:04x}")

  def test_v_add_f16_inline_constant_2_0(self):
    """V_ADD_F16_E32 with inline constant 2.0."""
    result = self._run_lo16([
      s_mov_b32(s[0], 0x4200),  # f16 3.0
      v_mov_b32_e32(v[0], s[0]),
      v_add_f16_e32(v[1], 2.0, v[0]),
    ])
    # 2.0 + 3.0 = 5.0, f16 5.0 = 0x4500
    self.assertEqual(result, 0x4500, f"Expected f16(5.0)=0x4500, got 0x{result:04x}")

  def test_v_mul_f16_inline_constant(self):
    """V_MUL_F16_E32 with inline constant 2.0."""
    result = self._run_lo16([
      s_mov_b32(s[0], 0x4200),  # f16 3.0
      v_mov_b32_e32(v[0], s[0]),
      v_mul_f16_e32(v[1], 2.0, v[0]),
    ])
    # 2.0 * 3.0 = 6.0, f16 6.0 = 0x4600
    self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}")
class TestCndmask(unittest.TestCase):
"""Tests for V_CNDMASK_B32 and V_CNDMASK_B16."""
@@ -447,5 +734,132 @@ class TestSpecialFloatValues(unittest.TestCase):
self.assertEqual(st.vgpr[0][1], 0x00000000)
class TestCarryOps(unittest.TestCase):
  """Tests for VOP2 carry instructions (v_add_co_ci_u32, v_sub_co_ci_u32, v_subrev_co_ci_u32)."""
  # NOTE: the _e32 forms implicitly use VCC for both carry-in and carry-out;
  # the VOP3SD tests below use separate registers for each.
  def test_v_subrev_co_ci_u32_no_borrow(self):
    """V_SUBREV_CO_CI_U32: D0 = S1 - S0 - VCC_IN, when VCC_IN=0."""
    instructions = [
      s_mov_b32(VCC_LO, 0),  # VCC = 0 (no borrow in)
      v_mov_b32_e32(v[0], 5),  # S0 = 5
      v_mov_b32_e32(v[1], 10),  # S1 = 10
      v_subrev_co_ci_u32_e32(v[2], v[0], v[1]),  # D0 = 10 - 5 - 0 = 5
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.vgpr[0][2], 5)
    self.assertEqual(st.vcc, 0)  # No borrow out
  def test_v_subrev_co_ci_u32_with_borrow(self):
    """V_SUBREV_CO_CI_U32: D0 = S1 - S0 - VCC_IN, when VCC_IN=1."""
    instructions = [
      s_mov_b32(VCC_LO, 1),  # VCC = 1 (borrow in)
      v_mov_b32_e32(v[0], 5),  # S0 = 5
      v_mov_b32_e32(v[1], 10),  # S1 = 10
      v_subrev_co_ci_u32_e32(v[2], v[0], v[1]),  # D0 = 10 - 5 - 1 = 4
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.vgpr[0][2], 4)
    self.assertEqual(st.vcc, 0)  # No borrow out
  def test_v_subrev_co_ci_u32_generates_borrow(self):
    """V_SUBREV_CO_CI_U32: generates borrow when S0 + VCC_IN > S1."""
    instructions = [
      s_mov_b32(VCC_LO, 0),  # VCC = 0
      v_mov_b32_e32(v[0], 10),  # S0 = 10
      v_mov_b32_e32(v[1], 5),  # S1 = 5
      v_subrev_co_ci_u32_e32(v[2], v[0], v[1]),  # D0 = 5 - 10 - 0 = -5 (underflow)
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.vgpr[0][2], 0xFFFFFFFB)  # -5 as unsigned
    self.assertEqual(st.vcc, 1)  # Borrow out
  def test_v_add_co_ci_u32_no_carry(self):
    """V_ADD_CO_CI_U32: D0 = S0 + S1 + VCC_IN, when VCC_IN=0."""
    instructions = [
      s_mov_b32(VCC_LO, 0),  # VCC = 0 (no carry in)
      v_mov_b32_e32(v[0], 5),  # S0 = 5
      v_mov_b32_e32(v[1], 10),  # S1 = 10
      v_add_co_ci_u32_e32(v[2], v[0], v[1]),  # D0 = 5 + 10 + 0 = 15
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.vgpr[0][2], 15)
    self.assertEqual(st.vcc, 0)  # No carry out
  def test_v_add_co_ci_u32_with_carry(self):
    """V_ADD_CO_CI_U32: D0 = S0 + S1 + VCC_IN, when VCC_IN=1."""
    instructions = [
      s_mov_b32(VCC_LO, 1),  # VCC = 1 (carry in)
      v_mov_b32_e32(v[0], 5),  # S0 = 5
      v_mov_b32_e32(v[1], 10),  # S1 = 10
      v_add_co_ci_u32_e32(v[2], v[0], v[1]),  # D0 = 5 + 10 + 1 = 16
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.vgpr[0][2], 16)
    self.assertEqual(st.vcc, 0)  # No carry out
  def test_v_add_co_ci_u32_generates_carry(self):
    """V_ADD_CO_CI_U32: generates carry when overflow occurs."""
    instructions = [
      s_mov_b32(VCC_LO, 1),  # VCC = 1 (carry in)
      s_mov_b32(s[0], 0xFFFFFFFF),  # max u32
      v_mov_b32_e32(v[0], s[0]),  # S0 = 0xFFFFFFFF
      v_mov_b32_e32(v[1], 0),  # S1 = 0
      v_add_co_ci_u32_e32(v[2], v[0], v[1]),  # D0 = 0xFFFFFFFF + 0 + 1 = 0 (overflow)
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.vgpr[0][2], 0)  # Overflowed to 0
    self.assertEqual(st.vcc, 1)  # Carry out
  def test_v_sub_co_ci_u32_no_borrow(self):
    """V_SUB_CO_CI_U32: D0 = S0 - S1 - VCC_IN, when VCC_IN=0."""
    instructions = [
      s_mov_b32(VCC_LO, 0),  # VCC = 0 (no borrow in)
      v_mov_b32_e32(v[0], 10),  # S0 = 10
      v_mov_b32_e32(v[1], 5),  # S1 = 5
      v_sub_co_ci_u32_e32(v[2], v[0], v[1]),  # D0 = 10 - 5 - 0 = 5
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.vgpr[0][2], 5)
    self.assertEqual(st.vcc, 0)  # No borrow out
  def test_v_sub_co_ci_u32_vop3sd_separate_carry_regs(self):
    """VOP3SD V_SUB_CO_CI_U32: carry-in from src2, carry-out to sdst (separate registers).
    This tests the VOP3SD encoding where src2 specifies the carry-in register
    independently from sdst (carry-out). The bug was reading carry-in from sdst
    instead of src2.
    Computation: D0 = S0 - S1 - carry_in = 0 - 0 - 1 = -1 = 0xFFFFFFFF
    """
    instructions = [
      s_mov_b32(s[6], 1),  # carry-in = 1 (in s[6])
      s_mov_b32(s[10], 0),  # carry-out dest = 0 initially (in s[10])
      # VOP3SD: v_sub_co_ci_u32(vdst, sdst, src0, src1, src2)
      # src2 is carry-in (s[6]=1), sdst is carry-out (s[10])
      v_sub_co_ci_u32(v[0], s[10], 0, 0, s[6]),  # D0 = 0 - 0 - 1 = -1
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.vgpr[0][0], 0xFFFFFFFF)  # -1 as unsigned
    self.assertEqual(st.sgpr[10], 1)  # Borrow out to s[10]
  def test_v_add_co_ci_u32_vop3sd_separate_carry_regs(self):
    """VOP3SD V_ADD_CO_CI_U32: carry-in from src2, carry-out to sdst (separate registers).
    This tests the VOP3SD encoding where src2 specifies the carry-in register
    independently from sdst (carry-out).
    Computation: D0 = S0 + S1 + carry_in = 5 + 10 + 1 = 16
    """
    instructions = [
      s_mov_b32(s[6], 1),  # carry-in = 1 (in s[6])
      s_mov_b32(s[10], 0),  # carry-out dest = 0 initially (in s[10])
      # VOP3SD: v_add_co_ci_u32(vdst, sdst, src0, src1, src2)
      v_add_co_ci_u32(v[0], s[10], 5, 10, s[6]),  # D0 = 5 + 10 + 1 = 16
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.vgpr[0][0], 16)
    self.assertEqual(st.sgpr[10], 0)  # No carry out
if __name__ == '__main__':
  unittest.main()  # run every test in this file when executed directly

View File

@@ -58,6 +58,95 @@ class TestFMA(unittest.TestCase):
self.assertTrue(math.isinf(result) and result > 0)
class TestFmacE64(unittest.TestCase):
  """Regression tests for V_FMAC_F32 VOP3 encoding (e64).
  V_FMAC_F32: D0 = D0 + S0 * S1 (fused multiply-add with accumulator)
  The VOP3 encoding needs to read D0 from the destination register as the
  accumulator input, not just write to it.
  Regression test for: VOP3 FMAC missing D0 accumulator bug.
  """
  def test_v_fmac_f32_e64_basic(self):
    """V_FMAC_F32_E64: basic accumulate test."""
    instructions = [
      v_mov_b32_e32(v[0], 2.0),  # S0 = 2.0
      v_mov_b32_e32(v[1], 3.0),  # S1 = 3.0
      v_mov_b32_e32(v[2], 1.0),  # D0 (accumulator) = 1.0
      # v_fmac_f32_e64 v[2], v[0], v[1]
      # D0 = D0 + S0 * S1 = 1.0 + 2.0 * 3.0 = 7.0
      v_fmac_f32_e64(v[2], v[0], v[1]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 7.0, places=5)
  def test_v_fmac_f32_e64_with_sgpr_sources(self):
    """V_FMAC_F32_E64 with SGPR sources (common in AMD_LLVM output).
    This tests the exact pattern that was failing: v_fmac_f32_e64(v[0], s[4], 0)
    where src0 is SGPR and src1 is inline constant 0.
    Regression test for: VOP3 FMAC missing D0 accumulator bug.
    """
    instructions = [
      s_mov_b32(s[4], f2i(2.0)),  # S0 = 2.0 in SGPR
      v_mov_b32_e32(v[0], 5.0),  # D0 (accumulator) = 5.0
      # v_fmac_f32_e64 v[0], s[4], 0
      # D0 = D0 + S0 * S1 = 5.0 + 2.0 * 0.0 = 5.0
      v_fmac_f32_e64(v[0], s[4], 0),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(i2f(st.vgpr[0][0]), 5.0, places=5)
  def test_v_fmac_f32_e64_with_two_sgprs(self):
    """V_FMAC_F32_E64 with two SGPR sources.
    Tests pattern: v_fmac_f32_e64(v[0], s[a], s[b])
    Regression test for: VOP3 FMAC missing D0 accumulator bug.
    """
    instructions = [
      s_mov_b32(s[10], f2i(3.0)),  # S0 = 3.0
      s_mov_b32(s[12], f2i(4.0)),  # S1 = 4.0
      v_mov_b32_e32(v[9], 2.0),  # D0 (accumulator) = 2.0
      # v_fmac_f32_e64 v[9], s[10], s[12]
      # D0 = D0 + S0 * S1 = 2.0 + 3.0 * 4.0 = 14.0
      v_fmac_f32_e64(v[9], s[10], s[12]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(i2f(st.vgpr[0][9]), 14.0, places=5)
  def test_v_fmac_f32_e64_accumulates_correctly(self):
    """V_FMAC_F32_E64 accumulates multiple times."""
    instructions = [
      v_mov_b32_e32(v[0], 0.0),  # D0 = 0.0
      v_mov_b32_e32(v[1], 1.0),  # S0 = 1.0
      v_mov_b32_e32(v[2], 2.0),  # S1 = 2.0
      # First: D0 = 0.0 + 1.0 * 2.0 = 2.0
      v_fmac_f32_e64(v[0], v[1], v[2]),
      # Second: D0 = 2.0 + 1.0 * 2.0 = 4.0
      v_fmac_f32_e64(v[0], v[1], v[2]),
      # Third: D0 = 4.0 + 1.0 * 2.0 = 6.0
      v_fmac_f32_e64(v[0], v[1], v[2]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(i2f(st.vgpr[0][0]), 6.0, places=5)
  def test_v_fmac_f32_e64_negative_accumulator(self):
    """V_FMAC_F32_E64 with negative accumulator."""
    instructions = [
      v_mov_b32_e32(v[0], 2.0),  # S0 = 2.0
      v_mov_b32_e32(v[1], 3.0),  # S1 = 3.0
      v_mov_b32_e32(v[2], -10.0),  # D0 (accumulator) = -10.0
      # D0 = -10.0 + 2.0 * 3.0 = -4.0
      v_fmac_f32_e64(v[2], v[0], v[1]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(i2f(st.vgpr[0][2]), -4.0, places=5)
class TestDivScale(unittest.TestCase):
"""Tests for V_DIV_SCALE_F32."""
@@ -768,7 +857,7 @@ class TestF16Modifiers(unittest.TestCase):
def test_v_fma_f16_inline_const_1_0(self):
"""V_FMA_F16: a*b + 1.0 should use f16 inline constant."""
from extra.assembly.amd.pcode import f32_to_f16, _f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16
f16_a = f32_to_f16(0.325928) # ~0x3537
f16_b = f32_to_f16(-0.486572) # ~0xb7c9
instructions = [
@@ -785,7 +874,7 @@ class TestF16Modifiers(unittest.TestCase):
def test_v_fma_f16_inline_const_0_5(self):
"""V_FMA_F16: a*b + 0.5 should use f16 inline constant."""
from extra.assembly.amd.pcode import f32_to_f16, _f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16
f16_a = f32_to_f16(2.0)
f16_b = f32_to_f16(3.0)
instructions = [
@@ -802,7 +891,7 @@ class TestF16Modifiers(unittest.TestCase):
def test_v_fma_f16_inline_const_neg_1_0(self):
"""V_FMA_F16: a*b + (-1.0) should use f16 inline constant."""
from extra.assembly.amd.pcode import f32_to_f16, _f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16
f16_a = f32_to_f16(2.0)
f16_b = f32_to_f16(3.0)
instructions = [
@@ -819,7 +908,7 @@ class TestF16Modifiers(unittest.TestCase):
def test_v_add_f16_abs_both(self):
"""V_ADD_F16 with abs on both operands."""
from extra.assembly.amd.pcode import f32_to_f16, _f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16
f16_neg2 = f32_to_f16(-2.0)
f16_neg3 = f32_to_f16(-3.0)
instructions = [
@@ -835,7 +924,7 @@ class TestF16Modifiers(unittest.TestCase):
def test_v_mul_f16_neg_abs(self):
"""V_MUL_F16 with neg on one operand and abs on another."""
from extra.assembly.amd.pcode import f32_to_f16, _f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16
f16_2 = f32_to_f16(2.0)
f16_neg3 = f32_to_f16(-3.0)
instructions = [
@@ -854,7 +943,7 @@ class TestF16Modifiers(unittest.TestCase):
This tests the case from AMD_LLVM sin(0) where V_FMAC_F16 writes to v0.h.
"""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
instructions = [
s_mov_b32(s[0], 0x38003c00), # v0 = {hi=0.5, lo=1.0}
v_mov_b32_e32(v[0], s[0]),
@@ -1621,6 +1710,27 @@ class TestCarryBorrow(unittest.TestCase):
self.assertEqual(st.vgpr[0][4], 0x00000000, "lo result")
self.assertEqual(st.vgpr[0][5], 0x00000003, "hi result")
def test_add_co_u32_same_dst_src(self):
  """V_ADD_CO_U32 where dst is same as src - VCC must use original src value."""
  program = [
    s_mov_b32(s[0], 0xFFFFFFFF),
    v_mov_b32_e32(v[0], s[0]),
    # dst aliases src: the carry must be computed from v[0]'s pre-write value
    v_add_co_u32(v[0], VCC, v[0], 1),
  ]
  state = run_program(program, n_lanes=1)
  self.assertEqual(state.vgpr[0][0], 0, "0xFFFFFFFF + 1 = 0")
  self.assertEqual(state.vcc & 1, 1, "Should have carry from 0xFFFFFFFF + 1")
def test_add_co_u32_same_dst_src_no_carry(self):
  """V_ADD_CO_U32 where dst is same as src - no carry case."""
  program = [
    v_mov_b32_e32(v[0], 100),
    # dst aliases src; increment stays within u32 range so VCC must stay clear
    v_add_co_u32(v[0], VCC, v[0], 1),
  ]
  state = run_program(program, n_lanes=1)
  self.assertEqual(state.vgpr[0][0], 101, "100 + 1 = 101")
  self.assertEqual(state.vcc & 1, 0, "No carry from 100 + 1")
class TestReadlane(unittest.TestCase):
"""Tests for V_READLANE_B32 and related cross-lane operations."""
@@ -2292,5 +2402,414 @@ class TestAddF32EdgeCases(unittest.TestCase):
self.assertEqual(st.vgpr[0][2], 0x80000000) # -0
class TestDivScaleF64(unittest.TestCase):
  """Tests for V_DIV_SCALE_F64 - critical for tan() and division.
  These tests verify that VCC bits are set independently per lane,
  which is essential for correct multi-lane f64 division operations.
  """
  def test_div_scale_f64_basic_no_scaling(self):
    """V_DIV_SCALE_F64: normal values with no scaling needed."""
    sqrt2 = f2i64(1.4142135623730951)
    one = f2i64(1.0)
    instructions = [
      # f64 operands are split across SGPR pairs (lo word, hi word)
      s_mov_b32(s[0], sqrt2 & 0xffffffff),
      s_mov_b32(s[1], sqrt2 >> 32),
      s_mov_b32(s[2], one & 0xffffffff),
      s_mov_b32(s[3], one >> 32),
      v_mov_b32_e32(v[0], s[0]),
      v_mov_b32_e32(v[1], s[1]),
      v_mov_b32_e32(v[2], s[2]),
      v_mov_b32_e32(v[3], s[3]),
      VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[0:1], src2=v[2:3]),
    ]
    st = run_program(instructions, n_lanes=1)
    result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32))
    self.assertAlmostEqual(result, 1.4142135623730951, places=10)
    self.assertEqual(st.vcc & 1, 0, "VCC should be 0 when no scaling needed")
  def test_div_scale_f64_vcc_per_lane_uniform_input(self):
    """V_DIV_SCALE_F64: VCC bits should be set independently per lane (uniform input).
    This is a regression test for the bug where VCC = 0x0LL was setting the whole
    64-bit VCC register instead of just the current lane's bit. With uniform input
    all lanes should get VCC=0.
    """
    val = f2i64(2.0)
    instructions = [
      s_mov_b32(s[0], val & 0xffffffff),
      s_mov_b32(s[1], val >> 32),
      v_mov_b32_e32(v[0], s[0]),
      v_mov_b32_e32(v[1], s[1]),
      VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[2:3], sdst=VCC, src0=v[0:1], src1=v[0:1], src2=v[0:1]),
    ]
    st = run_program(instructions, n_lanes=4)
    # All lanes should have VCC=0 for normal values
    self.assertEqual(st.vcc & 0xf, 0, "All lanes should have VCC=0 for normal values")
    # All lanes should have same result
    for lane in range(4):
      result = i642f(st.vgpr[lane][2] | (st.vgpr[lane][3] << 32))
      self.assertAlmostEqual(result, 2.0, places=10, msg=f"Lane {lane} result mismatch")
  def test_div_scale_f64_vcc_per_lane_varying_input(self):
    """V_DIV_SCALE_F64: VCC bits set per-lane with different inputs per lane.
    This test uses different inputs per lane to verify that VCC is tracked
    independently. This catches the bug where the emulator was setting VCC
    for all lanes to the same value.
    """
    # Use lane-varying input: lane 0 gets 2.0, lane 1 gets 3.0, etc.
    # All normal values should result in VCC=0 for each lane
    instructions = [
      # Set up per-lane values using lane_id
      v_cvt_f64_i32_e32(v[0:1], v[255]),  # v0:1 = f64(lane_id)
      v_add_f64(v[0:1], v[0:1], SrcEnum.POS_TWO),  # v0:1 = lane_id + 2.0
      VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[2:3], sdst=VCC, src0=v[0:1], src1=v[0:1], src2=v[0:1]),
    ]
    st = run_program(instructions, n_lanes=4)
    # All lanes should have VCC=0 (no scaling needed for 2.0, 3.0, 4.0, 5.0)
    self.assertEqual(st.vcc & 0xf, 0, "All lanes should have VCC=0 for normal values")
    # Verify each lane has correct result
    for lane in range(4):
      expected = float(lane) + 2.0
      result = i642f(st.vgpr[lane][2] | (st.vgpr[lane][3] << 32))
      self.assertAlmostEqual(result, expected, places=10, msg=f"Lane {lane}: expected {expected}, got {result}")
  def test_div_scale_f64_zero_denom_sets_vcc(self):
    """V_DIV_SCALE_F64: zero denominator -> NaN, VCC=1."""
    import math
    one = f2i64(1.0)
    zero = f2i64(0.0)
    instructions = [
      s_mov_b32(s[0], one & 0xffffffff),
      s_mov_b32(s[1], one >> 32),
      s_mov_b32(s[2], zero & 0xffffffff),
      s_mov_b32(s[3], zero >> 32),
      v_mov_b32_e32(v[0], s[0]),  # numer = 1.0
      v_mov_b32_e32(v[1], s[1]),
      v_mov_b32_e32(v[2], s[2]),  # denom = 0.0
      v_mov_b32_e32(v[3], s[3]),
      VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[2:3], src2=v[0:1]),
    ]
    st = run_program(instructions, n_lanes=1)
    result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32))
    self.assertTrue(math.isnan(result), "Should be NaN for zero denom")
    self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for zero denom")
  def test_div_scale_f64_mixed_vcc_per_lane(self):
    """V_DIV_SCALE_F64: some lanes need scaling, others don't.
    This is the key test for the tan() bug - it verifies that VCC is set
    correctly for each lane independently when some lanes need scaling and
    others don't.
    """
    import math
    # Lane 0: normal value (VCC=0), Lane 1: zero denom (VCC=1)
    # Lane 2: normal value (VCC=0), Lane 3: zero denom (VCC=1)
    normal = f2i64(2.0)
    zero = f2i64(0.0)
    instructions = [
      # Set up numer = 2.0 for all lanes
      s_mov_b32(s[0], normal & 0xffffffff),
      s_mov_b32(s[1], normal >> 32),
      v_mov_b32_e32(v[0], s[0]),
      v_mov_b32_e32(v[1], s[1]),
      # Set up denom: lane 0,2 get 2.0, lane 1,3 get 0.0
      s_mov_b32(s[2], zero & 0xffffffff),
      s_mov_b32(s[3], zero >> 32),
      v_mov_b32_e32(v[2], s[0]),  # default to 2.0
      v_mov_b32_e32(v[3], s[1]),
      # Override lanes 1 and 3 with 0.0 using writelane
      v_writelane_b32(v[2], s[2], 1),
      v_writelane_b32(v[3], s[3], 1),
      v_writelane_b32(v[2], s[2], 3),
      v_writelane_b32(v[3], s[3], 3),
      VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[2:3], src2=v[0:1]),
    ]
    st = run_program(instructions, n_lanes=4)
    # Lanes 0,2 should have VCC=0 (normal), lanes 1,3 should have VCC=1 (zero denom)
    self.assertEqual(st.vcc & 0b0001, 0, "Lane 0 VCC should be 0")
    self.assertEqual(st.vcc & 0b0010, 0b0010, "Lane 1 VCC should be 1")
    self.assertEqual(st.vcc & 0b0100, 0, "Lane 2 VCC should be 0")
    self.assertEqual(st.vcc & 0b1000, 0b1000, "Lane 3 VCC should be 1")
    # Check results
    for lane in [0, 2]:
      result = i642f(st.vgpr[lane][4] | (st.vgpr[lane][5] << 32))
      self.assertAlmostEqual(result, 2.0, places=10, msg=f"Lane {lane} should be 2.0")
    for lane in [1, 3]:
      result = i642f(st.vgpr[lane][4] | (st.vgpr[lane][5] << 32))
      self.assertTrue(math.isnan(result), f"Lane {lane} should be NaN")
class TestDivFmasF64(unittest.TestCase):
  """Tests for V_DIV_FMAS_F64 - scaling FMA for f64 division.

  Verifies that V_DIV_FMAS applies the correct scaling based on VCC per
  lane, which is essential for correct tan() results.
  """
  def _run_fmas(self, vcc, a, b, c, n_lanes=1):
    """Set VCC_LO, load f64 a/b/c into v[0:1]/v[2:3]/v[4:5], run v_div_fmas_f64 into v[6:7]."""
    instructions = [s_mov_b32(VCC_LO, vcc)]
    for i, val in enumerate((a, b, c)):
      bits = f2i64(val)
      instructions += [s_mov_b32(s[2 * i], bits & 0xffffffff), s_mov_b32(s[2 * i + 1], bits >> 32)]
    instructions += [v_mov_b32_e32(v[i], s[i]) for i in range(6)]
    instructions.append(v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]))
    return run_program(instructions, n_lanes=n_lanes)
  def _result(self, st, lane=0):
    """Reassemble the f64 result from v[6:7] of the given lane."""
    return i642f(st.vgpr[lane][6] | (st.vgpr[lane][7] << 32))
  def test_div_fmas_f64_no_scale_vcc0(self):
    """V_DIV_FMAS_F64: VCC=0 -> normal FMA, no scaling."""
    st = self._run_fmas(0, 2.0, 3.0, 1.0)
    self.assertAlmostEqual(self._result(st), 2.0 * 3.0 + 1.0, places=10)  # plain fma = 7.0
  def test_div_fmas_f64_scale_up_vcc1_large_s2(self):
    """V_DIV_FMAS_F64: VCC=1 with S2 exponent > 1023 -> scale by 2^+128."""
    st = self._run_fmas(1, 1.0, 1.0, 2.0)  # exponent(2.0) = 1024 > 1023, so scale UP
    expected = (1.0 * 1.0 + 2.0) * (2.0 ** 128)  # = 3.0 * 2^128
    self.assertAlmostEqual(self._result(st), expected, delta=abs(expected) * 1e-10)
  def test_div_fmas_f64_scale_down_vcc1_small_s2(self):
    """V_DIV_FMAS_F64: VCC=1 with S2 exponent <= 1023 -> scale by 2^-128."""
    st = self._run_fmas(1, 2.0, 3.0, 1.0)  # exponent(1.0) = 1023, so scale DOWN
    expected = (2.0 * 3.0 + 1.0) * (2.0 ** -128)  # = 7.0 * 2^-128
    self.assertAlmostEqual(self._result(st), expected, delta=abs(expected) * 1e-10)
  def test_div_fmas_f64_per_lane_vcc_varying(self):
    """V_DIV_FMAS_F64: different VCC per lane applies different scaling.

    Key test for the tan() bug - scaling must follow each lane's own VCC
    bit rather than being applied uniformly across the wave.
    """
    # VCC = 0b0101: lanes 0,2 scale, lanes 1,3 don't; exponent(1.0) = 1023 so scaled lanes go DOWN
    st = self._run_fmas(0b0101, 1.0, 1.0, 1.0, n_lanes=4)
    scaled = (1.0 * 1.0 + 1.0) * (2.0 ** -128)  # = 2.0 * 2^-128
    unscaled = 1.0 * 1.0 + 1.0                  # = 2.0
    for lane in range(4):
      if lane % 2 == 0:  # VCC bit set -> scaled
        self.assertAlmostEqual(self._result(st, lane), scaled, delta=abs(scaled) * 1e-10, msg=f"Lane {lane} should be scaled")
      else:              # VCC bit clear -> unscaled
        self.assertAlmostEqual(self._result(st, lane), unscaled, places=10, msg=f"Lane {lane} should be unscaled")
class TestDivScaleFmasF64Integration(unittest.TestCase):
  """Integration tests for V_DIV_SCALE_F64 + V_DIV_FMAS_F64.

  Exercises the full division sequence used by tan() with multiple lanes
  holding different values.
  """
  def test_div_scale_then_fmas_multi_lane_tan_pattern(self):
    """Test the pattern used by tan(): DIV_SCALE sets VCC, DIV_FMAS uses it.

    Exact bug scenario: tan([2.0, 3.0, 4.0]) failed because VCC coming out
    of DIV_SCALE was being set incorrectly for all lanes.
    """
    import math
    one_bits = f2i64(1.0)
    instructions = [
      # per-lane numerators: 2.0, 3.0, 4.0, 5.0
      v_cvt_f64_i32_e32(v[0:1], v[255]),           # v0:1 = f64(lane_id)
      v_add_f64(v[0:1], v[0:1], SrcEnum.POS_TWO),  # numer = lane_id + 2.0
      # uniform denominator of 1.0 in v[2:3]
      v_mov_b32_e32(v[2], one_bits & 0xffffffff),
      v_mov_b32_e32(v[3], one_bits >> 32),
      # V_DIV_SCALE_F64 computes the per-lane VCC
      VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[2:3], src2=v[0:1]),
      # duplicate the scaled numerator for the FMA input
      v_mov_b32_e32(v[6], v[4]),
      v_mov_b32_e32(v[7], v[5]),
      # V_DIV_FMAS_F64 consumes VCC to decide whether to rescale
      v_div_fmas_f64(v[8:9], v[6:7], v[2:3], v[4:5]),
    ]
    st = run_program(instructions, n_lanes=4)
    # Normal values never need scaling, so every VCC bit stays clear.
    self.assertEqual(st.vcc & 0xf, 0, "All lanes should have VCC=0 for normal values")
    for lane in range(4):
      # With VCC=0 the FMA is unscaled: numer * 1.0 + numer = 2 * numer.
      expected = (float(lane) + 2.0) * 2.0
      result = i642f(st.vgpr[lane][8] | (st.vgpr[lane][9] << 32))
      self.assertAlmostEqual(result, expected, places=8,
                             msg=f"Lane {lane}: expected {expected}, got {result}")
class TestVOP3VOPC(unittest.TestCase):
  """Tests for VOP3-encoded VOPC instructions (comparisons with scalar dest)."""
  def test_v_cmp_ge_f32_e64_nan(self):
    """V_CMP_GE_F32_E64: |NaN| >= |0.0| should be FALSE (NaN comparisons always false)."""
    from extra.assembly.amd.autogen.rdna3.ins import VOP3_SDST
    nan_bits, zero_bits = 0xffc00000, 0x00000000
    instructions = [
      s_mov_b32(s[0], nan_bits),
      s_mov_b32(s[1], zero_bits),
      v_mov_b32_e32(v[5], s[0]),
      v_mov_b32_e32(v[3], s[1]),
      # abs_=3 takes |.| of both sources; any ordered compare with NaN is false
      VOP3_SDST(VOP3Op.V_CMP_GE_F32, vdst=s[5], src0=v[5], src1=v[3], abs_=3),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(st.sgpr[5], 0)  # NaN comparison is always FALSE
class TestMin3Max3Unsigned(unittest.TestCase):
  """Regression tests for V_MIN3/V_MAX3 with unsigned integer types.

  The emulator's _minmax_reduce used UOp.minimum(), which builds min(a,b)
  as -max(-a,-b).  Negation (mul by -1) does not preserve unsigned
  ordering: for uint16, -0 = 0 but -5 = 65531, so max(-0,-5) = 65531 and
  -65531 = 5, yielding min(0,5) = 5 (wrong!).
  Fix: use comparison-based min/max for unsigned types: min(a,b) = (a<b)?a:b
  """
  def _run3(self, op, x, y, z):
    """Load x,y,z into s[0:2], mirror x into v[0], then run op(v[1], v[0], s[1], s[2])."""
    prog = [s_mov_b32(s[i], val) for i, val in enumerate((x, y, z))]
    prog.append(v_mov_b32_e32(v[0], s[0]))
    prog.append(op(v[1], v[0], s[1], s[2]))
    return run_program(prog, n_lanes=1)
  def test_v_min3_u16_with_zero(self):
    """V_MIN3_U16: min3(0, 3, 5) should return 0, not a wrong value."""
    st = self._run3(v_min3_u16, 0, 3, 5)
    self.assertEqual(st.vgpr[0][1] & 0xFFFF, 0)
  def test_v_min3_u16_all_nonzero(self):
    """V_MIN3_U16: min3(2, 5, 3) should return 2."""
    st = self._run3(v_min3_u16, 2, 5, 3)
    self.assertEqual(st.vgpr[0][1] & 0xFFFF, 2)
  def test_v_min3_u32_with_zero(self):
    """V_MIN3_U32: min3(0, 100, 50) should return 0."""
    st = self._run3(v_min3_u32, 0, 100, 50)
    self.assertEqual(st.vgpr[0][1], 0)
  def test_v_max3_u16_basic(self):
    """V_MAX3_U16: max3(0, 3, 5) should return 5."""
    st = self._run3(v_max3_u16, 0, 3, 5)
    self.assertEqual(st.vgpr[0][1] & 0xFFFF, 5)
  def test_v_min_u16_two_operand(self):
    """V_MIN_U16 (two operand): min(0, 5) should return 0."""
    prog = [
      s_mov_b32(s[0], 0),
      s_mov_b32(s[1], 5),
      v_mov_b32_e32(v[0], s[0]),
      v_min_u16(v[1], v[0], s[1]),
    ]
    st = run_program(prog, n_lanes=1)
    self.assertEqual(st.vgpr[0][1] & 0xFFFF, 0)
# Allow running this test module directly (`python <file>`) as well as via pytest.
if __name__ == '__main__':
  unittest.main()

View File

@@ -149,7 +149,7 @@ class TestFmaMix(unittest.TestCase):
def test_v_fma_mix_f32_src2_f16_lo(self):
"""V_FMA_MIX_F32 with src2 as f16 from lo bits."""
from extra.assembly.amd.pcode import f32_to_f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16
f16_2 = f32_to_f16(2.0)
instructions = [
s_mov_b32(s[0], f2i(1.0)),
@@ -166,7 +166,7 @@ class TestFmaMix(unittest.TestCase):
def test_v_fma_mix_f32_src2_f16_hi(self):
"""V_FMA_MIX_F32 with src2 as f16 from hi bits."""
from extra.assembly.amd.pcode import f32_to_f16
from extra.assembly.amd.test.hw.helpers import f32_to_f16
f16_2 = f32_to_f16(2.0)
val = (f16_2 << 16) | 0
instructions = [
@@ -197,9 +197,64 @@ class TestFmaMix(unittest.TestCase):
result = i2f(st.vgpr[0][3])
self.assertAlmostEqual(result, 7.0, places=5)
def test_v_fma_mix_f32_with_abs_f16_src2_lo(self):
"""V_FMA_MIX_F32 with abs modifier on f16 src2 (lo half). Regression test for sin(1.0) bug."""
from extra.assembly.amd.test.hw.helpers import f32_to_f16
f16_neg1 = f32_to_f16(-1.0) # 0xbc00
instructions = [
s_mov_b32(s[0], f2i(0.0)), # src0 = 0.0 (f32)
v_mov_b32_e32(v[0], s[0]),
s_mov_b32(s[1], f2i(1.0)), # src1 = 1.0 (f32)
v_mov_b32_e32(v[1], s[1]),
s_mov_b32(s[2], f16_neg1), # src2 = -1.0 (f16 in lo)
v_mov_b32_e32(v[2], s[2]),
# 0*1 + abs(-1.0) = 1.0; neg_hi=4 means abs on src2, opsel_hi2=1 means src2 is f16
VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=1, neg_hi=4),
]
st = run_program(instructions, n_lanes=1)
result = i2f(st.vgpr[0][3])
self.assertAlmostEqual(result, 1.0, places=5)
def test_v_fma_mix_f32_with_neg_f16_src2_lo(self):
"""V_FMA_MIX_F32 with neg modifier on f16 src2 (lo half)."""
from extra.assembly.amd.test.hw.helpers import f32_to_f16
f16_1 = f32_to_f16(1.0) # 0x3c00
instructions = [
s_mov_b32(s[0], f2i(0.0)), # src0 = 0.0 (f32)
v_mov_b32_e32(v[0], s[0]),
s_mov_b32(s[1], f2i(1.0)), # src1 = 1.0 (f32)
v_mov_b32_e32(v[1], s[1]),
s_mov_b32(s[2], f16_1), # src2 = 1.0 (f16 in lo)
v_mov_b32_e32(v[2], s[2]),
# 0*1 + neg(1.0) = -1.0; neg=4 means neg on src2, opsel_hi2=1 means src2 is f16
VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=1, neg=4),
]
st = run_program(instructions, n_lanes=1)
result = i2f(st.vgpr[0][3])
self.assertAlmostEqual(result, -1.0, places=5)
def test_v_fma_mix_f32_with_abs_f16_src2_hi(self):
"""V_FMA_MIX_F32 with abs modifier on f16 src2 (hi half)."""
from extra.assembly.amd.test.hw.helpers import f32_to_f16
f16_neg1 = f32_to_f16(-1.0) # 0xbc00
val = (f16_neg1 << 16) | 0 # -1.0 in hi, 0 in lo
instructions = [
s_mov_b32(s[0], f2i(0.0)),
v_mov_b32_e32(v[0], s[0]),
s_mov_b32(s[1], f2i(1.0)),
v_mov_b32_e32(v[1], s[1]),
s_mov_b32(s[2], val),
v_mov_b32_e32(v[2], s[2]),
# opsel=4 selects hi half of src2; neg_hi=4 means abs on src2
VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=4, opsel_hi=0, opsel_hi2=1, neg_hi=4),
]
st = run_program(instructions, n_lanes=1)
result = i2f(st.vgpr[0][3])
self.assertAlmostEqual(result, 1.0, places=5)
def test_v_fma_mixlo_f16(self):
"""V_FMA_MIXLO_F16 writes to low 16 bits of destination."""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
instructions = [
s_mov_b32(s[0], f2i(2.0)),
v_mov_b32_e32(v[0], s[0]),
@@ -219,7 +274,7 @@ class TestFmaMix(unittest.TestCase):
def test_v_fma_mixlo_f16_all_f32_sources(self):
"""V_FMA_MIXLO_F16 with all f32 sources."""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
instructions = [
s_mov_b32(s[0], f2i(1.0)),
v_mov_b32_e32(v[0], s[0]),
@@ -237,7 +292,7 @@ class TestFmaMix(unittest.TestCase):
def test_v_fma_mixlo_f16_sin_case(self):
"""V_FMA_MIXLO_F16 case from sin kernel."""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
instructions = [
s_mov_b32(s[0], 0x3f800000), # f32 1.0
v_mov_b32_e32(v[3], s[0]),
@@ -259,7 +314,7 @@ class TestVOP3P(unittest.TestCase):
def test_v_pk_add_f16_basic(self):
"""V_PK_ADD_F16 adds two packed f16 values."""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
instructions = [
s_mov_b32(s[0], 0x40003c00), # hi=2.0, lo=1.0
s_mov_b32(s[1], 0x44004200), # hi=4.0, lo=3.0
@@ -276,7 +331,7 @@ class TestVOP3P(unittest.TestCase):
def test_v_pk_mul_f16_basic(self):
"""V_PK_MUL_F16 multiplies two packed f16 values."""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
instructions = [
s_mov_b32(s[0], 0x42004000), # hi=3.0, lo=2.0
s_mov_b32(s[1], 0x45004400), # hi=5.0, lo=4.0
@@ -293,7 +348,7 @@ class TestVOP3P(unittest.TestCase):
def test_v_pk_fma_f16_basic(self):
"""V_PK_FMA_F16: D = A * B + C for packed f16."""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
instructions = [
s_mov_b32(s[0], 0x42004000), # A: hi=3.0, lo=2.0
s_mov_b32(s[1], 0x45004400), # B: hi=5.0, lo=4.0
@@ -315,7 +370,7 @@ class TestVOP3P(unittest.TestCase):
Inline constants for VOP3P are f16 values in the low 16 bits only.
hi half of inline constant is 0, so hi result = v0.hi + 0 = 1.0.
"""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
instructions = [
s_mov_b32(s[0], 0x3c003c00), # packed f16: hi=1.0, lo=1.0
v_mov_b32_e32(v[0], s[0]),
@@ -333,7 +388,7 @@ class TestVOP3P(unittest.TestCase):
"""V_PK_MUL_F16 with inline constant POS_TWO (2.0).
Inline constant has value only in low 16 bits, hi is 0.
"""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
# v0 = packed (3.0, 4.0), multiply by POS_TWO
# lo = 3.0 * 2.0 = 6.0, hi = 4.0 * 0.0 = 0.0 (inline const hi is 0)
instructions = [
@@ -498,7 +553,7 @@ class TestPackedMixedSigns(unittest.TestCase):
def test_pk_add_f16_mixed_signs(self):
"""V_PK_ADD_F16 with mixed positive/negative values."""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
instructions = [
s_mov_b32(s[0], 0xc0003c00), # packed: hi=-2.0, lo=1.0
s_mov_b32(s[1], 0x3c003c00), # packed: hi=1.0, lo=1.0
@@ -515,7 +570,7 @@ class TestPackedMixedSigns(unittest.TestCase):
def test_pk_mul_f16_zero(self):
"""V_PK_MUL_F16 with zero."""
from extra.assembly.amd.pcode import _f16
from extra.assembly.amd.test.hw.helpers import _f16
instructions = [
s_mov_b32(s[0], 0x40004000), # packed: 2.0, 2.0
s_mov_b32(s[1], 0x00000000), # packed: 0.0, 0.0

View File

@@ -324,6 +324,29 @@ class TestCmpInt(unittest.TestCase):
st = run_program(instructions, n_lanes=4)
self.assertEqual(st.vcc & 0xf, 0xf, "All lanes should match")
def test_v_cmp_ne_u32_with_zero(self):
"""V_CMP_NE_U32: compare with zero, used for int->bool cast."""
instructions = [
v_mov_b32_e32(v[1], 0),
v_cmp_eq_u32_e32(1, v[255]), # vcc = (lane == 1)
v_cndmask_b32_e64(v[1], v[1], 1, VCC_LO), # v1[lane1] = 1
v_cmp_ne_u32_e32(0, v[1]), # vcc = (0 != v1)
v_cndmask_b32_e64(v[0], 0, 1, VCC_LO), # v0 = vcc ? 1 : 0
]
st = run_program(instructions, n_lanes=2)
self.assertEqual(st.vgpr[0][0], 0, "lane 0: 0 != 0 should be false")
self.assertEqual(st.vgpr[1][0], 1, "lane 1: 0 != 1 should be true")
self.assertEqual(st.vcc & 0x3, 0x2, "VCC should be 0b10")
def test_v_cmp_ne_u32_all_nonzero(self):
"""V_CMP_NE_U32: all lanes have nonzero values."""
instructions = [
v_mov_b32_e32(v[1], 5),
v_cmp_ne_u32_e32(0, v[1]),
]
st = run_program(instructions, n_lanes=4)
self.assertEqual(st.vcc & 0xf, 0xf, "All lanes should be != 0")
def test_cmp_eq_u16_opsel_lo_lo(self):
"""V_CMP_EQ_U16 comparing lo halves."""
instructions = [
@@ -448,6 +471,242 @@ class TestCmpFloat(unittest.TestCase):
self.assertEqual((st.vcc >> 1) & 1, 0, "Lane 1: expected vcc=0 (2.0 < 1.0)")
class TestVOP3VOPCModifiers(unittest.TestCase):
"""Tests for VOP3 VOPC with abs/neg modifiers."""
def test_v_cmp_ge_f32_abs_both(self):
"""v_cmp_ge_f32 with abs on both sources: abs(0.0) >= abs(-1.0) = false.
Regression test: int16 mod operation uses v_cmp_ge_f32 with abs modifiers.
"""
instructions = [
v_mov_b32_e32(v[0], 0.0),
v_mov_b32_e32(v[1], -1.0),
# abs=0b11 means abs(src0) and abs(src1)
v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vcc & 1, 0, "abs(0.0) >= abs(-1.0) should be false")
def test_v_cmp_ge_f32_abs_negative_divisor(self):
"""v_cmp_ge_f32 with abs: remainder check for negative divisor.
Tests the exact comparison used in int16 mod: abs(rem_f) >= abs(div_f).
For 1 % -1: rem_f = 0.0, div_f = -1.0, so abs(0.0) >= abs(-1.0) = false.
"""
instructions = [
v_mov_b32_e32(v[0], 0.0), # remainder as float
v_mov_b32_e32(v[1], -1.0), # divisor as float
v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vcc & 1, 0, "abs(0.0) >= abs(-1.0) should be false")
def test_v_cmp_ge_f32_abs_small_remainder(self):
"""v_cmp_ge_f32 with abs: abs(-0.5) >= abs(-3.0) = false."""
instructions = [
v_mov_b32_e32(v[0], -0.5),
v_mov_b32_e32(v[1], -3.0),
v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vcc & 1, 0, "abs(-0.5) >= abs(-3.0) should be false")
def test_v_cmp_ge_f32_abs_equal(self):
"""v_cmp_ge_f32 with abs: abs(-1.0) >= abs(1.0) = true."""
instructions = [
v_mov_b32_e32(v[0], -1.0),
v_mov_b32_e32(v[1], 1.0),
v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vcc & 1, 1, "abs(-1.0) >= abs(1.0) should be true")
class TestVOP3VOPC64Bit(unittest.TestCase):
"""Tests for VOP3 VOPC with 64-bit operands."""
def test_v_cmp_lt_f64_basic(self):
"""v_cmp_lt_f64: 0.0 < 1.0 = true."""
zero_f64 = f2i64(0.0)
one_f64 = f2i64(1.0)
instructions = [
s_mov_b32(s[0], zero_f64 & 0xffffffff),
s_mov_b32(s[1], zero_f64 >> 32),
s_mov_b32(s[2], one_f64 & 0xffffffff),
s_mov_b32(s[3], one_f64 >> 32),
v_cmp_lt_f64_e64(VCC_LO, s[0:1], s[2:3]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vcc & 1, 1, "0.0 < 1.0 should be true")
def test_v_cmp_lt_f64_negative(self):
"""v_cmp_lt_f64: -1.0 < 0.0 = true."""
neg_one_f64 = f2i64(-1.0)
zero_f64 = f2i64(0.0)
instructions = [
s_mov_b32(s[0], neg_one_f64 & 0xffffffff),
s_mov_b32(s[1], neg_one_f64 >> 32),
s_mov_b32(s[2], zero_f64 & 0xffffffff),
s_mov_b32(s[3], zero_f64 >> 32),
v_cmp_lt_f64_e64(VCC_LO, s[0:1], s[2:3]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vcc & 1, 1, "-1.0 < 0.0 should be true")
def test_v_cmp_lt_i64_signed(self):
"""v_cmp_lt_i64: 0 < -1 (signed) = false."""
instructions = [
s_mov_b32(s[0], 0),
s_mov_b32(s[1], 0), # s[0:1] = 0
s_mov_b32(s[2], 0xffffffff),
s_mov_b32(s[3], 0xffffffff), # s[2:3] = -1
v_cmp_lt_i64_e64(VCC_LO, s[0:1], s[2:3]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vcc & 1, 0, "0 < -1 (signed) should be false")
def test_v_cmp_lt_u64_unsigned(self):
"""v_cmp_lt_u64: 0 < 0xFFFFFFFFFFFFFFFF (unsigned) = true."""
instructions = [
s_mov_b32(s[0], 0),
s_mov_b32(s[1], 0), # s[0:1] = 0
s_mov_b32(s[2], 0xffffffff),
s_mov_b32(s[3], 0xffffffff), # s[2:3] = max uint64
v_cmp_lt_u64_e64(VCC_LO, s[0:1], s[2:3]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vcc & 1, 1, "0 < max_uint64 should be true")
class TestVOPCF64(unittest.TestCase):
"""Tests for VOPC (E32 encoding) with 64-bit float operands. Regression test for f64 compare bug."""
def test_v_cmp_lt_f64_e32_true(self):
"""v_cmp_lt_f64_e32: 2.0 < 3.0 = true."""
lo0, hi0 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32
lo1, hi1 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32
instructions = [
s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0),
s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1),
v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]),
v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]),
v_cmp_lt_f64_e32(v[0:1], v[2:3]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vcc & 1, 1, "2.0 < 3.0 should be true")
def test_v_cmp_lt_f64_e32_false(self):
"""v_cmp_lt_f64_e32: 3.0 < 2.0 = false."""
lo0, hi0 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32
lo1, hi1 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32
instructions = [
s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0),
s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1),
v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]),
v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]),
v_cmp_lt_f64_e32(v[0:1], v[2:3]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vcc & 1, 0, "3.0 < 2.0 should be false")
def test_v_cmp_nlt_f64_e32_true(self):
"""v_cmp_nlt_f64_e32: !(3.0 < 2.0) = true."""
lo0, hi0 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32
lo1, hi1 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32
instructions = [
s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0),
s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1),
v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]),
v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]),
v_cmp_nlt_f64_e32(v[0:1], v[2:3]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vcc & 1, 1, "!(3.0 < 2.0) should be true")
def test_v_cmp_nlt_f64_e32_false(self):
"""v_cmp_nlt_f64_e32: !(2.0 < 3.0) = false."""
lo0, hi0 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32
lo1, hi1 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32
instructions = [
s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0),
s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1),
v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]),
v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]),
v_cmp_nlt_f64_e32(v[0:1], v[2:3]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vcc & 1, 0, "!(2.0 < 3.0) should be false")
class TestCmpxExec(unittest.TestCase):
"""Tests for V_CMPX instructions that modify EXEC mask."""
def test_v_cmpx_ngt_f32_e64_all_true(self):
"""V_CMPX_NGT_F32_E64: all lanes pass (literal <= all values)."""
# 131072.0 = 0x48000000
# All values > 131072, so !(131072 > val) = true for all
instructions = [
s_mov_b32(EXEC_LO, 0x7), # 3 lanes active
v_mov_b32_e32(v[0], f2i(200000.0)), # lane 0
v_cmp_eq_u32_e32(1, v[255]),
v_cndmask_b32_e64(v[1], v[0], f2i(300000.0), VCC_LO), # lane 1
v_cmp_eq_u32_e32(2, v[255]),
v_cndmask_b32_e64(v[1], v[1], f2i(400000.0), VCC_LO), # lane 2
# Now v[1] has: lane0=200000, lane1=300000, lane2=400000
# Compare: !(131072.0 > v[1]) i.e., 131072.0 <= v[1]
v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[1]),
]
st = run_program(instructions, n_lanes=3)
# All values > 131072, so all lanes should remain active
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x7, "All 3 lanes should remain active")
def test_v_cmpx_ngt_f32_e64_some_false(self):
"""V_CMPX_NGT_F32_E64: some lanes fail (literal > some values)."""
instructions = [
s_mov_b32(EXEC_LO, 0x7), # 3 lanes active
v_mov_b32_e32(v[0], f2i(100000.0)), # lane 0: 131072 > 100000 = true, so !(true) = false
v_cmp_eq_u32_e32(1, v[255]),
v_cndmask_b32_e64(v[1], v[0], f2i(200000.0), VCC_LO), # lane 1: 131072 > 200000 = false, so !(false) = true
v_cmp_eq_u32_e32(2, v[255]),
v_cndmask_b32_e64(v[1], v[1], f2i(150000.0), VCC_LO), # lane 2: 131072 > 150000 = false, so !(false) = true
v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[1]),
]
st = run_program(instructions, n_lanes=3)
# lane 0: fail (100000 < 131072), lanes 1,2: pass
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x6, "Lanes 1,2 should be active, lane 0 inactive")
def test_v_cmpx_ngt_f32_e64_all_false(self):
"""V_CMPX_NGT_F32_E64: all lanes fail (literal > all values)."""
instructions = [
s_mov_b32(EXEC_LO, 0x7), # 3 lanes active
v_mov_b32_e32(v[0], f2i(100.0)), # all lanes have 100.0
# 131072 > 100 = true, so !(true) = false for all
v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[0]),
]
st = run_program(instructions, n_lanes=3)
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x0, "All lanes should be inactive")
def test_v_cmpx_ngt_f32_e64_large_values(self):
"""V_CMPX_NGT_F32_E64: test with values that trigger Payne-Hanek in sin().
This is a regression test for the sin(859240.0) bug.
Values 859240, 1000000, 100594688 should all pass !(131072 > val).
"""
instructions = [
s_mov_b32(EXEC_LO, 0x7), # 3 lanes active
v_mov_b32_e32(v[0], f2i(859240.0)), # lane 0
v_cmp_eq_u32_e32(1, v[255]),
v_cndmask_b32_e64(v[1], v[0], f2i(1000000.0), VCC_LO), # lane 1
v_cmp_eq_u32_e32(2, v[255]),
v_cndmask_b32_e64(v[1], v[1], f2i(100594688.0), VCC_LO), # lane 2
v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[1]),
]
st = run_program(instructions, n_lanes=3)
# All values > 131072, so !(131072 > val) = true for all
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x7, "All 3 lanes should remain active")
class TestVCCBehavior(unittest.TestCase):
"""Tests for VCC condition code behavior."""
@@ -472,5 +731,101 @@ class TestVCCBehavior(unittest.TestCase):
self.assertEqual(st.vcc >> 16, 0x0000, "Lanes 16-31 should be false")
class TestCmpxPartialWavefront(unittest.TestCase):
"""Tests for V_CMPX with partial wavefronts (fewer than 32 active lanes).
Regression tests for bug where v_cmpx incorrectly set EXEC bits for inactive
lanes when the wavefront had fewer than 32 lanes. This caused garbage data
from uninitialized lanes to corrupt memory writes.
"""
def test_v_cmpx_eq_u32_partial_wave_3_lanes(self):
"""V_CMPX_EQ_U32 with 3 active lanes should only affect those 3 lanes.
With n_lanes=3, initial EXEC=0x7. After v_cmpx comparing lane_id == 1,
only lane 1 should pass, so EXEC should become 0x2 (not have bits 3-31 set).
"""
instructions = [
v_cmpx_eq_u32_e32(1, v[255]), # EXEC = lanes where lane_id == 1
]
st = run_program(instructions, n_lanes=3)
# Only lane 1 should be active (bit 1 set)
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x2,
"Only lane 1 should be active after v_cmpx_eq_u32 with 3 lanes")
def test_v_cmpx_eq_u32_partial_wave_5_lanes(self):
"""V_CMPX_EQ_U32 with 5 active lanes."""
instructions = [
v_cmpx_eq_u32_e32(3, v[255]), # EXEC = lanes where lane_id == 3
]
st = run_program(instructions, n_lanes=5)
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x8,
"Only lane 3 should be active after v_cmpx_eq_u32 with 5 lanes")
def test_v_cmpx_lt_u32_partial_wave(self):
"""V_CMPX_LT_U32 with partial wavefront."""
# VOPC: src0 < vsrc1, so we need v_cmpx_gt_u32 to get lane_id < 2
instructions = [
v_cmpx_gt_u32_e32(2, v[255]), # EXEC = lanes where 2 > lane_id (i.e., lane_id < 2)
]
st = run_program(instructions, n_lanes=4)
# Lanes 0,1 should be active (bits 0,1 set = 0x3)
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x3,
"Only lanes 0,1 should be active after v_cmpx_gt_u32(2, lane_id) with 4 lanes")
def test_v_cmpx_ge_u32_partial_wave(self):
"""V_CMPX_GE_U32 with partial wavefront."""
# VOPC: src0 >= vsrc1, so v_cmpx_le_u32(1, lane_id) gives lane_id >= 2? No.
# v_cmpx_le_u32(src0, vsrc1) = src0 <= vsrc1 = 1 <= lane_id
instructions = [
v_cmpx_le_u32_e32(2, v[255]), # EXEC = lanes where 2 <= lane_id (i.e., lane_id >= 2)
]
st = run_program(instructions, n_lanes=4)
# Lanes 2,3 should be active (bits 2,3 set = 0xC)
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0xC,
"Only lanes 2,3 should be active after v_cmpx_le_u32(2, lane_id) with 4 lanes")
def test_v_cmpx_ne_u32_partial_wave_all_pass(self):
"""V_CMPX_NE_U32 where all active lanes pass."""
instructions = [
v_cmpx_ne_u32_e32(99, v[255]), # EXEC = lanes where lane_id != 99
]
st = run_program(instructions, n_lanes=3)
# All 3 lanes should remain active (bits 0,1,2 set = 0x7)
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x7,
"All 3 lanes should remain active when all pass")
def test_v_cmpx_eq_u32_partial_wave_none_pass(self):
"""V_CMPX_EQ_U32 where no active lanes pass."""
instructions = [
v_cmpx_eq_u32_e32(99, v[255]), # EXEC = lanes where lane_id == 99
]
st = run_program(instructions, n_lanes=3)
# No lanes should be active
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x0,
"No lanes should be active when none pass")
def test_v_cmpx_f32_partial_wave(self):
"""V_CMPX_GT_F32 with partial wavefront - float comparison."""
instructions = [
v_cvt_f32_u32_e32(v[0], v[255]), # v[0] = float(lane_id)
v_mov_b32_e32(v[1], f2i(0.5)), # v[1] = 0.5
v_cmpx_gt_f32_e32(v[0], v[1]), # EXEC = lanes where v[0] > 0.5
]
st = run_program(instructions, n_lanes=4)
# Lanes 1,2,3 have values > 0.5, lane 0 has 0.0
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0xE,
"Lanes 1,2,3 should be active (float > 0.5)")
def test_v_cmpx_e64_partial_wave(self):
"""V_CMPX_EQ_U32_E64 (VOP3 encoding) with partial wavefront."""
instructions = [
v_cmpx_eq_u32_e64(EXEC_LO, v[255], 2), # EXEC = lanes where lane_id == 2
]
st = run_program(instructions, n_lanes=4)
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x4,
"Only lane 2 should be active after v_cmpx_eq_u32_e64")
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,161 @@
"""Tests for VOPD instructions - dual-issue vector operations.
VOPD executes two operations simultaneously. Key behavior:
- Both ops read their sources BEFORE either writes (dual-issue semantics)
- This means if X writes to a register that Y reads, Y sees the OLD value
- Op X can use ops 0-15 (FMAC, MUL, ADD, MOV, etc.)
- Op Y can use ops 0-18 (includes ADD_NC_U32, LSHLREV, AND)
"""
import unittest
from extra.assembly.amd.test.hw.helpers import run_program, run_program_emu, run_program_hw, compare_wave_states, \
v, s, v_mov_b32_e32, s_mov_b32
from extra.assembly.amd.autogen.rdna3.ins import VOPD, VOPD_LIT, VOPDOp
class TestVOPDBasic(unittest.TestCase):
    """Basic VOPD functionality tests."""

    def test_vopd_dual_mov(self):
        """VOPD with two MOV operations to different registers."""
        prog = [
            v_mov_b32_e32(v[0], 0x12345678),
            v_mov_b32_e32(v[1], 0xDEADBEEF),
            # Dual-issue: X copies v[0] -> v[2] while Y copies v[1] -> v[3].
            VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[1], v[0], v[0]),
        ]
        state = run_program(prog, n_lanes=1)
        self.assertEqual(state.vgpr[0][2], 0x12345678)
        self.assertEqual(state.vgpr[0][3], 0xDEADBEEF)

    def test_vopd_mov_and_add(self):
        """VOPD with MOV (X) and ADD_NC_U32 (Y) - ADD_NC_U32 can only be Y op."""
        prog = [
            v_mov_b32_e32(v[0], 10),
            v_mov_b32_e32(v[1], 5),
            # X: v[2] = literal 100; Y: v[3] = v[0] + v[1] = 15.
            VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[3], 100, v[0], v[0], v[1]),
        ]
        state = run_program(prog, n_lanes=1)
        self.assertEqual(state.vgpr[0][2], 100)
        self.assertEqual(state.vgpr[0][3], 15)
class TestVOPDReadBeforeWrite(unittest.TestCase):
    """Tests for VOPD dual-issue read-before-write semantics.

    In VOPD, both X and Y operations read their sources BEFORE either writes.
    This is critical when X's destination is Y's source.
    """

    def test_vopd_x_writes_y_reads_same_reg(self):
        """VOPD where X writes to a register that Y reads.

        X: v[2] = 0 (overwrites v[2])
        Y: v[1] = v[2] + v[0] (srcy0=v[2], vsrcy1=v[0])
        Reads-before-writes: v[1] = OLD_v[2] + v[0] = 0xFFFFFFFF + 1 = 0.
        A writes-first machine would instead give 0 + 1 = 1.
        """
        prog = [
            v_mov_b32_e32(v[0], 1),           # addend
            v_mov_b32_e32(v[1], 0x99999999),  # placeholder, overwritten by Y
            v_mov_b32_e32(v[2], 0xFFFFFFFF),  # the OLD value Y must observe
            # X: MOV literal 0 into v[2] (srcx0=0, vsrcx1 unused for MOV).
            # Y: ADD_NC_U32 of (old) v[2] and v[0]. Y's real destination is
            # (vdsty << 1) | ((vdstx & 1) ^ 1) = (0 << 1) | ((2 & 1) ^ 1) = v[1].
            VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 0, v[2], v[0], v[0]),
        ]
        state = run_program(prog, n_lanes=1)
        self.assertEqual(state.vgpr[0][2], 0, "X should write 0 to v[2]")
        # 0xFFFFFFFF + 1 wraps around to 0 in 32 bits.
        self.assertEqual(state.vgpr[0][1], 0, "Y should read OLD v[2]=0xFFFFFFFF, compute 0xFFFFFFFF+1=0")

    def test_vopd_x_writes_y_reads_same_reg_v2(self):
        """VOPD where X writes to a register that Y reads - cleaner test case.

        X: v[2] = 0 (MOV)
        Y: v[1] = v[2] + v[2] (ADD_NC_U32 with both sources from v[2])
        Reads-before-writes: v[1] = OLD_v[2] + OLD_v[2] = 100 + 100 = 200.
        A writes-first machine would instead give 0 + 0 = 0.
        """
        prog = [
            v_mov_b32_e32(v[0], 0x88888888),  # unused placeholder
            v_mov_b32_e32(v[1], 0x99999999),  # placeholder, overwritten by Y
            v_mov_b32_e32(v[2], 100),         # the OLD value Y must observe twice
            # X: MOV literal 0 into v[2]; Y: both ADD sources come from v[2].
            VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 0, v[2], v[0], v[2]),
        ]
        state = run_program(prog, n_lanes=1)
        self.assertEqual(state.vgpr[0][2], 0, "X should write 0 to v[2]")
        self.assertEqual(state.vgpr[0][1], 200, "Y should read OLD v[2]=100 twice, compute 100+100=200")
class TestVOPDLiterals(unittest.TestCase):
    """Tests for VOPD instructions that use SIMM32 literals (FMAAK, FMAMK).

    Fix: `f2i`/`i2f` were imported function-scope in BOTH test methods;
    they now come from the module-level helpers import alongside the other
    helper names, removing the duplicated imports.
    """

    def test_vopd_fmaak_f32(self):
        """VOPD V_DUAL_FMAAK_F32: D = S0 * S1 + SIMM32 (literal addend).

        Tests that the 32-bit literal (SIMM32) is correctly passed to the
        instruction: fma(2.0, 3.0, 10.0) = 2*3 + 10 = 16.0.
        """
        instructions = [
            v_mov_b32_e32(v[0], f2i(2.0)),  # v[0] = 2.0
            v_mov_b32_e32(v[1], f2i(3.0)),  # v[1] = 3.0
            # VOPD args: opx, opy, vdstx, vdsty, srcx0, srcy0, vsrcx1, vsrcy1
            # X: v[2] = fma(srcx0, vsrcx1, SIMM32) = v[0]*v[1]+10.0 = 2*3+10 = 16
            # Y: v[3] = srcy0 (MOV) = v[0] = 2.0
            VOPD_LIT(VOPDOp.V_DUAL_FMAAK_F32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[0], v[1], v[0], literal=f2i(10.0)),
        ]
        st = run_program(instructions, n_lanes=1)
        self.assertAlmostEqual(i2f(st.vgpr[0][2]), 16.0, places=5, msg="fma(2.0, 3.0, 10.0) should be 16.0")

    def test_vopd_fmamk_f32(self):
        """VOPD V_DUAL_FMAMK_F32: D = S0 * SIMM32 + S1 (literal multiplier).

        Tests that the 32-bit literal (SIMM32) is correctly used as the
        multiplier: fma(2.0, 5.0, 3.0) = 2*5 + 3 = 13.0.
        """
        instructions = [
            v_mov_b32_e32(v[0], f2i(2.0)),  # v[0] = 2.0
            v_mov_b32_e32(v[1], f2i(3.0)),  # v[1] = 3.0
            # X: v[2] = fma(srcx0, SIMM32, vsrcx1) = v[0]*5.0+v[1] = 2*5+3 = 13
            # Y: v[3] = srcy0 (MOV) = v[0] = 2.0
            VOPD_LIT(VOPDOp.V_DUAL_FMAMK_F32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[0], v[1], v[0], literal=f2i(5.0)),
        ]
        st = run_program(instructions, n_lanes=1)
        self.assertAlmostEqual(i2f(st.vgpr[0][2]), 13.0, places=5, msg="fma(2.0, 5.0, 3.0) should be 13.0")
class TestVOPDMultilane(unittest.TestCase):
    """Tests for VOPD with multiple lanes."""

    def test_vopd_multilane_mov_add(self):
        """VOPD MOV and ADD with multiple active lanes - no register conflict."""
        prog = [
            v_mov_b32_e32(v[0], 5),
            v_mov_b32_e32(v[1], 10),
            # X: v[2] = literal 100; Y: adds v[0] + v[1] = 5 + 10 = 15.
            # Y destination decodes as (vdsty << 1) | ((vdstx.offset & 1) ^ 1)
            # = (0 << 1) | ((258 & 1) ^ 1) = 1, i.e. v[1].
            VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 100, v[0], v[2], v[1]),
        ]
        state = run_program(prog, n_lanes=4)
        # Every active lane executes both halves of the dual op.
        for lane in range(4):
            self.assertEqual(state.vgpr[lane][2], 100, f"Lane {lane}: v[2] should be 100")
            self.assertEqual(state.vgpr[lane][1], 15, f"Lane {lane}: v[1] should be 15 (5+10)")
# Allow running this test module directly from the command line.
if __name__ == "__main__":
    unittest.main()