mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
assembly/amd: add hw tests from ucode branch (#14259)
* assembly/amd: add hw tests from ucode branch * fix is per lane
This commit is contained in:
@@ -653,17 +653,17 @@ def _apply_pseudocode_fixes(op_name: str, code: str) -> str:
|
||||
code = code.replace('D0.f64 = 2.0 ** 64 * fma(S0.f64, S1.f64, S2.f64)',
|
||||
'D0.f64 = (2.0 ** 128 if exponent(S2.f64) > 1023 else 2.0 ** -128) * fma(S0.f64, S1.f64, S2.f64)')
|
||||
if op_name == 'V_DIV_SCALE_F32':
|
||||
code = code.replace('D0.f32 = float("nan")', 'VCC = Reg(0x1); D0.f32 = float("nan")')
|
||||
code = code.replace('D0.f32 = float("nan")', 'VCC = Reg(1 << laneId); D0.f32 = float("nan")')
|
||||
code = code.replace('elif S1.f32 == DENORM.f32:\n D0.f32 = ldexp(S0.f32, 64)', 'elif False:\n pass')
|
||||
code += '\nif S1.f32 == DENORM.f32:\n D0.f32 = float("nan")'
|
||||
code = code.replace('elif exponent(S2.f32) <= 23:\n D0.f32 = ldexp(S0.f32, 64)', 'elif exponent(S2.f32) <= 23:\n VCC = Reg(0x1); D0.f32 = ldexp(S0.f32, 64)')
|
||||
code = code.replace('elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(0x1)\n if S0.f32 == S2.f32:\n D0.f32 = ldexp(S0.f32, 64)', 'elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(0x1)')
|
||||
code = code.replace('elif exponent(S2.f32) <= 23:\n D0.f32 = ldexp(S0.f32, 64)', 'elif exponent(S2.f32) <= 23:\n VCC = Reg(1 << laneId); D0.f32 = ldexp(S0.f32, 64)')
|
||||
code = code.replace('elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(0x1)\n if S0.f32 == S2.f32:\n D0.f32 = ldexp(S0.f32, 64)', 'elif S2.f32 / S1.f32 == DENORM.f32:\n VCC = Reg(1 << laneId)')
|
||||
if op_name == 'V_DIV_SCALE_F64':
|
||||
code = code.replace('D0.f64 = float("nan")', 'VCC = Reg(0x1); D0.f64 = float("nan")')
|
||||
code = code.replace('D0.f64 = float("nan")', 'VCC = Reg(1 << laneId); D0.f64 = float("nan")')
|
||||
code = code.replace('elif S1.f64 == DENORM.f64:\n D0.f64 = ldexp(S0.f64, 128)', 'elif False:\n pass')
|
||||
code += '\nif S1.f64 == DENORM.f64:\n D0.f64 = float("nan")'
|
||||
code = code.replace('elif exponent(S2.f64) <= 52:\n D0.f64 = ldexp(S0.f64, 128)', 'elif exponent(S2.f64) <= 52:\n VCC = Reg(0x1); D0.f64 = ldexp(S0.f64, 128)')
|
||||
code = code.replace('elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(0x1)\n if S0.f64 == S2.f64:\n D0.f64 = ldexp(S0.f64, 128)', 'elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(0x1)')
|
||||
code = code.replace('elif exponent(S2.f64) <= 52:\n D0.f64 = ldexp(S0.f64, 128)', 'elif exponent(S2.f64) <= 52:\n VCC = Reg(1 << laneId); D0.f64 = ldexp(S0.f64, 128)')
|
||||
code = code.replace('elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(0x1)\n if S0.f64 == S2.f64:\n D0.f64 = ldexp(S0.f64, 128)', 'elif S2.f64 / S1.f64 == DENORM.f64:\n VCC = Reg(1 << laneId)')
|
||||
if op_name == 'V_DIV_FIXUP_F32':
|
||||
code = code.replace('D0.f32 = ((-abs(S0.f32)) if (sign_out) else (abs(S0.f32)))',
|
||||
'D0.f32 = ((-OVERFLOW_F32) if (sign_out) else (OVERFLOW_F32)) if isNAN(S0.f32) else ((-abs(S0.f32)) if (sign_out) else (abs(S0.f32)))')
|
||||
|
||||
@@ -1,14 +1,25 @@
|
||||
"""Test infrastructure for hardware-validated RDNA3 emulator tests.
|
||||
|
||||
Uses run_asm() with memory output, so tests can run on both emulator and real hardware.
|
||||
Set USE_HW=1 to run on both emulator and real hardware, comparing results.
|
||||
Set USE_HW=1 to run on both emulator and hardware, comparing results.
|
||||
"""
|
||||
import ctypes, os, struct
|
||||
import ctypes, math, os, struct
|
||||
from extra.assembly.amd.autogen.rdna3.ins import *
|
||||
|
||||
from extra.assembly.amd.emu import WaveState, run_asm, set_valid_mem_ranges
|
||||
from extra.assembly.amd.emu import run_asm
|
||||
from extra.assembly.amd.dsl import NULL, SCC, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, M0
|
||||
from extra.assembly.amd.pcode import _i32, _f32
|
||||
|
||||
def _i32(f: float) -> int: return struct.unpack('<I', struct.pack('<f', f))[0]
|
||||
def _f32(i: int) -> float: return struct.unpack('<f', struct.pack('<I', i & 0xFFFFFFFF))[0]
|
||||
|
||||
# f16 conversion helpers
|
||||
def _f16(i: int) -> float: return struct.unpack('<e', struct.pack('<H', i & 0xFFFF))[0]
|
||||
def f32_to_f16(f: float) -> int:
|
||||
f = float(f)
|
||||
if math.isnan(f): return 0x7e00
|
||||
if math.isinf(f): return 0x7c00 if f > 0 else 0xfc00
|
||||
try: return struct.unpack('<H', struct.pack('<e', f))[0]
|
||||
except OverflowError: return 0x7c00 if f > 0 else 0xfc00
|
||||
|
||||
# For backwards compatibility with tests using SrcEnum.NULL etc.
|
||||
class SrcEnum:
|
||||
@@ -32,11 +43,11 @@ VCC = VCC_LO # For VOP3SD sdst field (VCC_LO is exported from dsl)
|
||||
USE_HW = os.environ.get("USE_HW", "0") == "1"
|
||||
FLOAT_TOLERANCE = 1e-5
|
||||
|
||||
# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc
|
||||
# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc, exec
|
||||
N_VGPRS, N_SGPRS, WAVE_SIZE = 16, 16, 32
|
||||
VGPR_BYTES = N_VGPRS * WAVE_SIZE * 4 # 16 regs * 32 lanes * 4 bytes = 2048
|
||||
SGPR_BYTES = N_SGPRS * 4 # 16 regs * 4 bytes = 64
|
||||
OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 8 # + vcc + scc
|
||||
OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 12 # + vcc + scc + exec
|
||||
|
||||
# Float conversion helpers
|
||||
def f2i(f: float) -> int: return _i32(f)
|
||||
@@ -47,6 +58,14 @@ def i642f(i: int) -> float: return struct.unpack('<d', struct.pack('<Q', i))[0]
|
||||
def assemble(instructions: list) -> bytes:
|
||||
return b''.join(inst.to_bytes() for inst in instructions)
|
||||
|
||||
# Simple WaveState class for test output parsing (mirrors emu.py interface for tests)
|
||||
class WaveState:
|
||||
def __init__(self):
|
||||
self.vgpr = [[0] * 256 for _ in range(32)] # vgpr[lane][reg]
|
||||
self.sgpr = [0] * 128
|
||||
self.vcc = 0
|
||||
self.scc = 0
|
||||
|
||||
def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
|
||||
"""Generate prologue and epilogue instructions for state capture."""
|
||||
prologue = [
|
||||
@@ -63,6 +82,10 @@ def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
|
||||
epilogue = [
|
||||
s_mov_b32(s[90], VCC_LO),
|
||||
s_cselect_b32(s[91], 1, 0),
|
||||
# Save EXEC early (before we modify it for VGPR stores)
|
||||
s_mov_b32(s[95], EXEC_LO),
|
||||
# Restore EXEC to all active lanes for VGPR stores (test may have modified EXEC)
|
||||
s_mov_b32(EXEC_LO, (1 << n_lanes) - 1),
|
||||
s_load_b64(s[92:93], s[80:81], 0, soffset=NULL),
|
||||
s_waitcnt(0), # simm16=0 waits for all
|
||||
v_lshlrev_b32_e32(v[240], 2, v[255]),
|
||||
@@ -80,6 +103,9 @@ def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
|
||||
epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES))
|
||||
epilogue.append(v_mov_b32_e32(v[243], s[91]))
|
||||
epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES + 4))
|
||||
# Store EXEC (saved earlier in s[95])
|
||||
epilogue.append(v_mov_b32_e32(v[243], s[95]))
|
||||
epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92:93], offset=VGPR_BYTES + SGPR_BYTES + 8))
|
||||
epilogue.append(s_mov_b32(EXEC_LO, s[94]))
|
||||
epilogue.append(s_endpgm())
|
||||
return prologue, epilogue
|
||||
@@ -95,6 +121,8 @@ def parse_output(out_buf: bytes, n_lanes: int) -> WaveState:
|
||||
st.sgpr[i] = struct.unpack_from('<I', out_buf, VGPR_BYTES + i * 4)[0]
|
||||
st.vcc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES)[0]
|
||||
st.scc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 4)[0]
|
||||
# Store EXEC in its proper location (index 126)
|
||||
st.sgpr[EXEC_LO.offset] = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 8)[0]
|
||||
return st
|
||||
|
||||
def run_program_emu(instructions: list, n_lanes: int = 1) -> WaveState:
|
||||
@@ -110,9 +138,9 @@ def run_program_emu(instructions: list, n_lanes: int = 1) -> WaveState:
|
||||
kernel_buf = (ctypes.c_char * len(code)).from_buffer_copy(code)
|
||||
lib_ptr = ctypes.addressof(kernel_buf)
|
||||
|
||||
set_valid_mem_ranges({(out_addr, OUT_BYTES), (args_ptr, 8)})
|
||||
# rsrc2: USER_SGPR_COUNT=2, ENABLE_SGPR_WORKGROUP_ID_X/Y/Z=1, LDS_SIZE=128 (64KB)
|
||||
rsrc2 = 0x19c | (128 << 15)
|
||||
scratch_size = 0x10000 # 64KB per lane, matches .amdhsa_private_segment_fixed_size in run_program_hw
|
||||
result = run_asm(lib_ptr, len(code), 1, 1, 1, n_lanes, 1, 1, args_ptr, rsrc2)
|
||||
assert result == 0, f"run_asm failed with {result}"
|
||||
|
||||
@@ -148,6 +176,8 @@ test:
|
||||
.amdhsa_user_sgpr_kernarg_segment_ptr 1
|
||||
.amdhsa_kernarg_size 8
|
||||
.amdhsa_group_segment_fixed_size 65536
|
||||
.amdhsa_private_segment_fixed_size 65536
|
||||
.amdhsa_enable_private_segment 1
|
||||
.end_amdhsa_kernel
|
||||
|
||||
.amdgpu_metadata
|
||||
@@ -160,7 +190,7 @@ amdhsa.kernels:
|
||||
.symbol: test.kd
|
||||
.kernarg_segment_size: 8
|
||||
.group_segment_fixed_size: 65536
|
||||
.private_segment_fixed_size: 0
|
||||
.private_segment_fixed_size: 65536
|
||||
.kernarg_segment_align: 8
|
||||
.wavefront_size: 32
|
||||
.sgpr_count: 96
|
||||
|
||||
@@ -138,6 +138,56 @@ class TestDS2AddrMore(unittest.TestCase):
|
||||
self.assertEqual(st.vgpr[0][4], 0x12345678, "v4 should be untouched")
|
||||
|
||||
|
||||
class TestDSB128(unittest.TestCase):
|
||||
"""Tests for DS_STORE_B128 and DS_LOAD_B128 (128-bit / 4 dwords)."""
|
||||
|
||||
def test_ds_store_load_b128(self):
|
||||
"""DS_STORE_B128 stores 4 VGPRs, DS_LOAD_B128 loads them back."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[10], 0),
|
||||
s_mov_b32(s[0], 0x11111111),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
s_mov_b32(s[0], 0x22222222),
|
||||
v_mov_b32_e32(v[1], s[0]),
|
||||
s_mov_b32(s[0], 0x33333333),
|
||||
v_mov_b32_e32(v[2], s[0]),
|
||||
s_mov_b32(s[0], 0x44444444),
|
||||
v_mov_b32_e32(v[3], s[0]),
|
||||
ds_store_b128(addr=v[10], data0=v[0:3]),
|
||||
s_waitcnt(lgkmcnt=0),
|
||||
ds_load_b128(addr=v[10], vdst=v[4:7]),
|
||||
s_waitcnt(lgkmcnt=0),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have first dword")
|
||||
self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should have second dword")
|
||||
self.assertEqual(st.vgpr[0][6], 0x33333333, "v6 should have third dword")
|
||||
self.assertEqual(st.vgpr[0][7], 0x44444444, "v7 should have fourth dword")
|
||||
|
||||
def test_ds_store_b128_with_offset(self):
|
||||
"""DS_STORE_B128 with non-zero offset."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[10], 0),
|
||||
s_mov_b32(s[0], 0xAAAAAAAA),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
s_mov_b32(s[0], 0xBBBBBBBB),
|
||||
v_mov_b32_e32(v[1], s[0]),
|
||||
s_mov_b32(s[0], 0xCCCCCCCC),
|
||||
v_mov_b32_e32(v[2], s[0]),
|
||||
s_mov_b32(s[0], 0xDDDDDDDD),
|
||||
v_mov_b32_e32(v[3], s[0]),
|
||||
DS(DSOp.DS_STORE_B128, addr=v[10], data0=v[0:3], offset0=16),
|
||||
s_waitcnt(lgkmcnt=0),
|
||||
DS(DSOp.DS_LOAD_B128, addr=v[10], vdst=v[4:7], offset0=16),
|
||||
s_waitcnt(lgkmcnt=0),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA)
|
||||
self.assertEqual(st.vgpr[0][5], 0xBBBBBBBB)
|
||||
self.assertEqual(st.vgpr[0][6], 0xCCCCCCCC)
|
||||
self.assertEqual(st.vgpr[0][7], 0xDDDDDDDD)
|
||||
|
||||
|
||||
class TestDSAtomic(unittest.TestCase):
|
||||
"""Tests for DS atomic operations."""
|
||||
|
||||
|
||||
@@ -128,6 +128,169 @@ class TestGlobalLoad(unittest.TestCase):
|
||||
class TestGlobalStore(unittest.TestCase):
|
||||
"""Tests for GLOBAL store instructions."""
|
||||
|
||||
def test_global_store_b8_basic(self):
|
||||
"""GLOBAL_STORE_B8 stores a single byte from VDATA[7:0]."""
|
||||
TEST_OFFSET = 256
|
||||
instructions = [
|
||||
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
|
||||
s_waitcnt(lgkmcnt=0),
|
||||
# First store 0xDEADBEEF to memory
|
||||
s_mov_b32(s[4], 0xDEADBEEF),
|
||||
v_mov_b32_e32(v[2], s[4]),
|
||||
v_mov_b32_e32(v[0], 0),
|
||||
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_waitcnt(vmcnt=0),
|
||||
# Now store single byte 0x42 to same address (should only change byte 0)
|
||||
v_mov_b32_e32(v[2], 0x42),
|
||||
global_store_b8(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_waitcnt(vmcnt=0),
|
||||
# Read back and check
|
||||
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_waitcnt(vmcnt=0),
|
||||
v_mov_b32_e32(v[0], v[3]),
|
||||
s_mov_b32(s[2], 0),
|
||||
s_mov_b32(s[3], 0),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
# Only byte 0 should change from 0xEF to 0x42
|
||||
self.assertEqual(st.vgpr[0][0], 0xDEADBE42, "Only byte 0 should be modified")
|
||||
|
||||
def test_global_store_b8_byte1(self):
|
||||
"""GLOBAL_STORE_B8 at offset+1 stores to byte 1."""
|
||||
TEST_OFFSET = 256
|
||||
instructions = [
|
||||
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
|
||||
s_waitcnt(lgkmcnt=0),
|
||||
s_mov_b32(s[4], 0xDEADBEEF),
|
||||
v_mov_b32_e32(v[2], s[4]),
|
||||
v_mov_b32_e32(v[0], 0),
|
||||
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_waitcnt(vmcnt=0),
|
||||
v_mov_b32_e32(v[2], 0x42),
|
||||
global_store_b8(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+1),
|
||||
s_waitcnt(vmcnt=0),
|
||||
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_waitcnt(vmcnt=0),
|
||||
v_mov_b32_e32(v[0], v[3]),
|
||||
s_mov_b32(s[2], 0),
|
||||
s_mov_b32(s[3], 0),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][0], 0xDEAD42EF, "Only byte 1 should be modified")
|
||||
|
||||
def test_global_store_b16_basic(self):
|
||||
"""GLOBAL_STORE_B16 stores a 16-bit value from VDATA[15:0]."""
|
||||
TEST_OFFSET = 256
|
||||
instructions = [
|
||||
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
|
||||
s_waitcnt(lgkmcnt=0),
|
||||
s_mov_b32(s[4], 0xDEADBEEF),
|
||||
v_mov_b32_e32(v[2], s[4]),
|
||||
v_mov_b32_e32(v[0], 0),
|
||||
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_waitcnt(vmcnt=0),
|
||||
s_mov_b32(s[4], 0xCAFE),
|
||||
v_mov_b32_e32(v[2], s[4]),
|
||||
global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_waitcnt(vmcnt=0),
|
||||
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_waitcnt(vmcnt=0),
|
||||
v_mov_b32_e32(v[0], v[3]),
|
||||
s_mov_b32(s[2], 0),
|
||||
s_mov_b32(s[3], 0),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][0], 0xDEADCAFE, "Only lower 16 bits should be modified")
|
||||
|
||||
def test_global_store_b16_high_half(self):
|
||||
"""GLOBAL_STORE_B16 at offset+2 stores to high 16 bits."""
|
||||
TEST_OFFSET = 256
|
||||
instructions = [
|
||||
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
|
||||
s_waitcnt(lgkmcnt=0),
|
||||
s_mov_b32(s[4], 0xDEADBEEF),
|
||||
v_mov_b32_e32(v[2], s[4]),
|
||||
v_mov_b32_e32(v[0], 0),
|
||||
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_waitcnt(vmcnt=0),
|
||||
s_mov_b32(s[4], 0xCAFE),
|
||||
v_mov_b32_e32(v[2], s[4]),
|
||||
global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+2),
|
||||
s_waitcnt(vmcnt=0),
|
||||
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_waitcnt(vmcnt=0),
|
||||
v_mov_b32_e32(v[0], v[3]),
|
||||
s_mov_b32(s[2], 0),
|
||||
s_mov_b32(s[3], 0),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][0], 0xCAFEBEEF, "Only upper 16 bits should be modified")
|
||||
|
||||
def test_global_store_b16_byte_offset_1(self):
|
||||
"""GLOBAL_STORE_B16 at byte offset 1 stores bytes 1-2 within the same word."""
|
||||
TEST_OFFSET = 256
|
||||
instructions = [
|
||||
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
|
||||
s_waitcnt(lgkmcnt=0),
|
||||
s_mov_b32(s[4], 0xDDCCBBAA),
|
||||
v_mov_b32_e32(v[2], s[4]),
|
||||
v_mov_b32_e32(v[0], 0),
|
||||
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_waitcnt(vmcnt=0),
|
||||
# Store 0xBEEF at byte offset 1 (bytes 1-2)
|
||||
s_mov_b32(s[4], 0xBEEF),
|
||||
v_mov_b32_e32(v[2], s[4]),
|
||||
global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+1),
|
||||
s_waitcnt(vmcnt=0),
|
||||
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_waitcnt(vmcnt=0),
|
||||
v_mov_b32_e32(v[0], v[3]),
|
||||
s_mov_b32(s[2], 0),
|
||||
s_mov_b32(s[3], 0),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
# Bytes 1-2 should be 0xBEEF (0xEF at byte 1, 0xBE at byte 2)
|
||||
# Original: 0xDDCCBBAA -> bytes [AA, BB, CC, DD]
|
||||
# After: 0xDDBEEFAA -> bytes [AA, EF, BE, DD]
|
||||
self.assertEqual(st.vgpr[0][0], 0xDDBEEFAA, "Bytes 1-2 should be 0xBEEF")
|
||||
|
||||
def test_global_store_b16_cross_word_boundary(self):
|
||||
"""GLOBAL_STORE_B16 at byte offset 3 crosses word boundary (byte 3 of word N, byte 0 of word N+1)."""
|
||||
TEST_OFFSET = 256
|
||||
instructions = [
|
||||
s_load_b64(s[2:3], s[80:81], 0, soffset=SrcEnum.NULL),
|
||||
s_waitcnt(lgkmcnt=0),
|
||||
# Initialize two consecutive words
|
||||
s_mov_b32(s[4], 0xDDCCBBAA),
|
||||
v_mov_b32_e32(v[2], s[4]),
|
||||
v_mov_b32_e32(v[0], 0),
|
||||
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
s_mov_b32(s[4], 0x44332211),
|
||||
v_mov_b32_e32(v[2], s[4]),
|
||||
global_store_b32(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+4),
|
||||
s_waitcnt(vmcnt=0),
|
||||
# Store 0xBEEF at byte offset 3 (crosses word boundary)
|
||||
# Low byte (0xEF) goes to byte 3 of first word
|
||||
# High byte (0xBE) goes to byte 0 of second word
|
||||
s_mov_b32(s[4], 0xBEEF),
|
||||
v_mov_b32_e32(v[2], s[4]),
|
||||
global_store_b16(addr=v[0], data=v[2], saddr=s[2:3], offset=TEST_OFFSET+3),
|
||||
s_waitcnt(vmcnt=0),
|
||||
# Load back both words
|
||||
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[3], data=v[3], saddr=s[2:3], offset=TEST_OFFSET),
|
||||
GLOBAL(GLOBALOp.GLOBAL_LOAD_B32, addr=v[0], vdst=v[4], data=v[4], saddr=s[2:3], offset=TEST_OFFSET+4),
|
||||
s_waitcnt(vmcnt=0),
|
||||
v_mov_b32_e32(v[0], v[3]),
|
||||
v_mov_b32_e32(v[1], v[4]),
|
||||
s_mov_b32(s[2], 0),
|
||||
s_mov_b32(s[3], 0),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
# First word: 0xDDCCBBAA -> 0xEFCCBBAA (byte 3 becomes 0xEF)
|
||||
# Second word: 0x44332211 -> 0x443322BE (byte 0 becomes 0xBE)
|
||||
self.assertEqual(st.vgpr[0][0], 0xEFCCBBAA, "Byte 3 of first word should be 0xEF")
|
||||
self.assertEqual(st.vgpr[0][1], 0x443322BE, "Byte 0 of second word should be 0xBE")
|
||||
|
||||
def test_global_store_b64_basic(self):
|
||||
"""GLOBAL_STORE_B64 stores 8 bytes from v[n:n+1] to memory."""
|
||||
TEST_OFFSET = 256
|
||||
|
||||
@@ -62,6 +62,28 @@ class TestBasicScalar(unittest.TestCase):
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[1], 0x80000000)
|
||||
|
||||
def test_s_fmamk_f32(self):
|
||||
"""S_FMAMK_F32: D = S0 * literal + S1."""
|
||||
# 2.0 * 3.0 + 1.0 = 7.0
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f2i(2.0)),
|
||||
s_mov_b32(s[1], f2i(1.0)),
|
||||
s_fmamk_f32(s[2], s[0], s[1], literal=f2i(3.0)),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[2], f2i(7.0))
|
||||
|
||||
def test_s_fmamk_f32_negative(self):
|
||||
"""S_FMAMK_F32 with negative values."""
|
||||
# -2.0 * 4.0 + 10.0 = 2.0
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f2i(-2.0)),
|
||||
s_mov_b32(s[1], f2i(10.0)),
|
||||
s_fmamk_f32(s[2], s[0], s[1], literal=f2i(4.0)),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[2], f2i(2.0))
|
||||
|
||||
|
||||
class TestQuadmaskWqm(unittest.TestCase):
|
||||
"""Tests for S_QUADMASK_B32 and S_WQM_B32."""
|
||||
@@ -298,6 +320,56 @@ class TestSignedArithmetic(unittest.TestCase):
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[2], 2)
|
||||
|
||||
def test_s_mul_hi_u32_max(self):
|
||||
"""S_MUL_HI_U32: 0xFFFFFFFF * 0xFFFFFFFF."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xFFFFFFFF),
|
||||
s_mov_b32(s[1], 0xFFFFFFFF),
|
||||
s_mul_hi_u32(s[2], s[0], s[1]), # (0xFFFFFFFF * 0xFFFFFFFF) >> 32 = 0xFFFFFFFE
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[2], 0xFFFFFFFE)
|
||||
|
||||
def test_s_mul_hi_i32_positive(self):
|
||||
"""S_MUL_HI_I32: positive * positive."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x40000000), # 2^30
|
||||
s_mov_b32(s[1], 4),
|
||||
s_mul_hi_i32(s[2], s[0], s[1]), # (2^30 * 4) >> 32 = 1
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[2], 1)
|
||||
|
||||
def test_s_mul_hi_i32_neg_times_neg(self):
|
||||
"""S_MUL_HI_I32: (-1) * (-1) = 1, high bits = 0."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xFFFFFFFF), # -1
|
||||
s_mov_b32(s[1], 0xFFFFFFFF), # -1
|
||||
s_mul_hi_i32(s[2], s[0], s[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[2], 0)
|
||||
|
||||
def test_s_mul_hi_i32_neg_times_pos(self):
|
||||
"""S_MUL_HI_I32: (-1) * 2 = -2, high bits = -1 (sign extension)."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xFFFFFFFF), # -1
|
||||
s_mov_b32(s[1], 2),
|
||||
s_mul_hi_i32(s[2], s[0], s[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[2], 0xFFFFFFFF) # -1 sign extends
|
||||
|
||||
def test_s_mul_hi_i32_min_int(self):
|
||||
"""S_MUL_HI_I32: MIN_INT * 2 = -2^32, high = -1."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x80000000), # -2^31 (MIN_INT)
|
||||
s_mov_b32(s[1], 2),
|
||||
s_mul_hi_i32(s[2], s[0], s[1]), # (-2^31 * 2) >> 32 = -1
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[2], 0xFFFFFFFF)
|
||||
|
||||
def test_s_mul_i32(self):
|
||||
"""S_MUL_I32: signed multiply low 32 bits."""
|
||||
instructions = [
|
||||
@@ -329,6 +401,176 @@ class TestSignedArithmetic(unittest.TestCase):
|
||||
self.assertEqual(st.sgpr[7], ((dividend * 2) + 1) & 0xFFFFFFFF)
|
||||
|
||||
|
||||
class TestBitSet(unittest.TestCase):
|
||||
"""Tests for S_BITSET0_B32 and S_BITSET1_B32 instructions."""
|
||||
|
||||
def test_s_bitset1_b32_set_bit0(self):
|
||||
"""S_BITSET1_B32: set bit 0 in destination."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0), # start with 0
|
||||
s_mov_b32(s[1], 0), # bit position = 0
|
||||
s_bitset1_b32(s[0], s[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[0], 1, "Bit 0 should be set")
|
||||
|
||||
def test_s_bitset1_b32_set_bit31(self):
|
||||
"""S_BITSET1_B32: set bit 31 in destination."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0), # start with 0
|
||||
s_mov_b32(s[1], 31), # bit position = 31
|
||||
s_bitset1_b32(s[0], s[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[0], 0x80000000, "Bit 31 should be set")
|
||||
|
||||
def test_s_bitset1_b32_preserves_other_bits(self):
|
||||
"""S_BITSET1_B32: preserves bits not being set."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xFF00FF00), # existing pattern
|
||||
s_mov_b32(s[1], 0), # bit position = 0
|
||||
s_bitset1_b32(s[0], s[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[0], 0xFF00FF01, "Should set bit 0 while preserving others")
|
||||
|
||||
def test_s_bitset0_b32_clear_bit0(self):
|
||||
"""S_BITSET0_B32: clear bit 0 in destination."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xFFFFFFFF), # start with all bits set
|
||||
s_mov_b32(s[1], 0), # bit position = 0
|
||||
s_bitset0_b32(s[0], s[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[0], 0xFFFFFFFE, "Bit 0 should be cleared")
|
||||
|
||||
def test_s_bitset0_b32_clear_bit31(self):
|
||||
"""S_BITSET0_B32: clear bit 31 in destination."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xFFFFFFFF), # start with all bits set
|
||||
s_mov_b32(s[1], 31), # bit position = 31
|
||||
s_bitset0_b32(s[0], s[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[0], 0x7FFFFFFF, "Bit 31 should be cleared")
|
||||
|
||||
def test_s_bitset1_b32_uses_low5_bits(self):
|
||||
"""S_BITSET1_B32: only uses low 5 bits of position (mod 32)."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0),
|
||||
s_mov_b32(s[1], 32 + 5), # position = 37, but mod 32 = 5
|
||||
s_bitset1_b32(s[0], s[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[0], 0x20, "Bit 5 should be set (37 mod 32 = 5)")
|
||||
|
||||
|
||||
class TestBfeI64(unittest.TestCase):
|
||||
"""Tests for S_BFE_I64 - 64-bit bit field extract with sign extension.
|
||||
|
||||
Regression tests for sign extension bug where 32-bit masks were incorrectly
|
||||
used for 64-bit operations, causing the high 32 bits to not be sign-extended.
|
||||
"""
|
||||
|
||||
def test_s_bfe_i64_positive_no_sign_extend(self):
|
||||
"""S_BFE_I64: positive value (1) in 16 bits should not sign extend."""
|
||||
# S1 encodes: [22:16] = width, [5:0] = offset
|
||||
# width=16, offset=0 -> S1 = (16 << 16) | 0 = 0x100000
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 1), # S0 lo = 1
|
||||
s_mov_b32(s[1], 0), # S0 hi = 0
|
||||
s_mov_b32(s[2], 0x100000), # width=16, offset=0
|
||||
s_bfe_i64(s[4:5], s[0:1], s[2]),
|
||||
v_mov_b32_e32(v[0], s[4]),
|
||||
v_mov_b32_e32(v[1], s[5]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][0], 1, "lo should be 1")
|
||||
self.assertEqual(st.vgpr[0][1], 0, "hi should be 0 (no sign extend)")
|
||||
|
||||
def test_s_bfe_i64_negative_sign_extend(self):
|
||||
"""S_BFE_I64: 0xFFFF (-1 in 16 bits) should sign extend to 64 bits.
|
||||
|
||||
This is the main regression test - before the fix, hi was 0 instead of 0xFFFFFFFF.
|
||||
"""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xFFFF), # S0 lo = -1 in 16 bits
|
||||
s_mov_b32(s[1], 0), # S0 hi = 0
|
||||
s_mov_b32(s[2], 0x100000), # width=16, offset=0
|
||||
s_bfe_i64(s[4:5], s[0:1], s[2]),
|
||||
v_mov_b32_e32(v[0], s[4]),
|
||||
v_mov_b32_e32(v[1], s[5]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][0], 0xFFFFFFFF, "lo should be 0xFFFFFFFF")
|
||||
self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)")
|
||||
|
||||
def test_s_bfe_i64_8bit_negative_sign_extend(self):
|
||||
"""S_BFE_I64: 0xFF (-1 in 8 bits) should sign extend to 64 bits."""
|
||||
# width=8, offset=0 -> S1 = (8 << 16) | 0 = 0x80000
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xFF), # S0 lo = -1 in 8 bits
|
||||
s_mov_b32(s[1], 0), # S0 hi = 0
|
||||
s_mov_b32(s[2], 0x80000), # width=8, offset=0
|
||||
s_bfe_i64(s[4:5], s[0:1], s[2]),
|
||||
v_mov_b32_e32(v[0], s[4]),
|
||||
v_mov_b32_e32(v[1], s[5]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][0], 0xFFFFFFFF, "lo should be 0xFFFFFFFF")
|
||||
self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)")
|
||||
|
||||
def test_s_bfe_i64_8bit_positive(self):
|
||||
"""S_BFE_I64: 0x7F (127 in 8 bits) should not sign extend."""
|
||||
# width=8, offset=0 -> S1 = (8 << 16) | 0 = 0x80000
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x7F), # S0 lo = 127 in 8 bits (MSB=0)
|
||||
s_mov_b32(s[1], 0), # S0 hi = 0
|
||||
s_mov_b32(s[2], 0x80000), # width=8, offset=0
|
||||
s_bfe_i64(s[4:5], s[0:1], s[2]),
|
||||
v_mov_b32_e32(v[0], s[4]),
|
||||
v_mov_b32_e32(v[1], s[5]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][0], 0x7F, "lo should be 0x7F")
|
||||
self.assertEqual(st.vgpr[0][1], 0, "hi should be 0 (no sign extend)")
|
||||
|
||||
def test_s_bfe_i64_with_offset(self):
|
||||
"""S_BFE_I64: extract from non-zero bit offset with sign extension."""
|
||||
# Extract 16 bits starting at bit 8: value 0xFF00 >> 8 = 0xFF = -1 in 8 bits? No wait...
|
||||
# Let's put 0x8000FF00: extract 16 bits at offset 8 = 0x00FF (positive)
|
||||
# Put 0xFF00_0000: extract 16 bits at offset 16 = 0xFF00 = -256 in signed 16-bit
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xFF000000), # bits [31:24] = 0xFF, [23:16] = 0x00
|
||||
s_mov_b32(s[1], 0),
|
||||
# width=16, offset=16 -> S1 = (16 << 16) | 16 = 0x100010
|
||||
s_mov_b32(s[2], 0x100010),
|
||||
s_bfe_i64(s[4:5], s[0:1], s[2]),
|
||||
v_mov_b32_e32(v[0], s[4]),
|
||||
v_mov_b32_e32(v[1], s[5]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
# Extract bits [31:16] = 0xFF00, sign bit is bit 15 of extracted = bit 31 of original = 1
|
||||
# So result should be sign-extended 0xFF00 -> 0xFFFFFF00 in lo, 0xFFFFFFFF in hi
|
||||
self.assertEqual(st.vgpr[0][0], 0xFFFFFF00, "lo should be sign-extended 0xFF00")
|
||||
self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)")
|
||||
|
||||
def test_s_bfe_i64_32bit_negative(self):
|
||||
"""S_BFE_I64: extract 32 bits with sign extension."""
|
||||
# width=32, offset=0 -> S1 = (32 << 16) | 0 = 0x200000
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x80000000), # MIN_INT32 = -2^31
|
||||
s_mov_b32(s[1], 0),
|
||||
s_mov_b32(s[2], 0x200000), # width=32, offset=0
|
||||
s_bfe_i64(s[4:5], s[0:1], s[2]),
|
||||
v_mov_b32_e32(v[0], s[4]),
|
||||
v_mov_b32_e32(v[1], s[5]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][0], 0x80000000, "lo should be 0x80000000")
|
||||
self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF, "hi should be 0xFFFFFFFF (sign extended)")
|
||||
|
||||
|
||||
class Test64BitCompare(unittest.TestCase):
|
||||
"""Tests for 64-bit scalar compare instructions."""
|
||||
|
||||
|
||||
@@ -255,7 +255,7 @@ class TestF16Conversions(unittest.TestCase):
|
||||
|
||||
def test_v_cvt_f16_f32_small(self):
|
||||
"""V_CVT_F16_F32 converts small f32 value."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 0.5),
|
||||
v_cvt_f16_f32_e32(v[1], v[0]),
|
||||
@@ -293,7 +293,7 @@ class TestF16Conversions(unittest.TestCase):
|
||||
|
||||
def test_v_cvt_f16_f32_reads_full_32bit_source(self):
|
||||
"""V_CVT_F16_F32 must read full 32-bit f32 source."""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x3fc00000), # f32 1.5
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
@@ -348,6 +348,142 @@ class TestF16Conversions(unittest.TestCase):
|
||||
self.assertEqual(result, 1, f"Expected 1 from high bits, got {result}")
|
||||
|
||||
|
||||
class TestF64Conversions(unittest.TestCase):
  """Tests for f64 conversion instructions. Regression tests for f32_to_f64/f64_to_f32.

  Each test moves a source value into a VGPR, executes a single conversion
  instruction on one lane, and checks the result. 64-bit results are written to
  the destination pair v[2:3]: v[2] holds the low dword, v[3] the high dword.
  """

  def _read_f64(self, st):
    """Reassemble the f64 result of lane 0 from v[2] (lo dword) and v[3] (hi dword)."""
    return i642f((st.vgpr[0][3] << 32) | st.vgpr[0][2])

  def test_v_cvt_f64_f32_one(self):
    """V_CVT_F64_F32 converts f32 1.0 to f64."""
    instructions = [
      s_mov_b32(s[0], f2i(1.0)),
      v_mov_b32_e32(v[0], s[0]),
      v_cvt_f64_f32_e32(v[2:3], v[0]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(self._read_f64(st), 1.0, places=10)

  def test_v_cvt_f64_f32_negative(self):
    """V_CVT_F64_F32 converts f32 -2.5 to f64."""
    instructions = [
      s_mov_b32(s[0], f2i(-2.5)),
      v_mov_b32_e32(v[0], s[0]),
      v_cvt_f64_f32_e32(v[2:3], v[0]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(self._read_f64(st), -2.5, places=10)

  def test_v_cvt_f64_f32_pi(self):
    """V_CVT_F64_F32 converts f32 pi to f64."""
    # places=5: the source only carries f32 precision, so only ~7 significant
    # decimal digits survive the round-trip through f32.
    instructions = [
      s_mov_b32(s[0], f2i(3.14159265)),
      v_mov_b32_e32(v[0], s[0]),
      v_cvt_f64_f32_e32(v[2:3], v[0]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(self._read_f64(st), 3.14159265, places=5)

  def test_v_cvt_f64_f32_zero(self):
    """V_CVT_F64_F32 converts f32 0.0 to f64."""
    instructions = [
      v_mov_b32_e32(v[0], 0),
      v_cvt_f64_f32_e32(v[2:3], v[0]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(self._read_f64(st), 0.0)

  def test_v_cvt_f32_f64_one(self):
    """V_CVT_F32_F64 converts f64 1.0 to f32."""
    f64_bits = f2i64(1.0)
    lo, hi = f64_bits & 0xFFFFFFFF, (f64_bits >> 32) & 0xFFFFFFFF
    instructions = [
      s_mov_b32(s[0], lo),
      s_mov_b32(s[1], hi),
      v_mov_b32_e32(v[0], s[0]),
      v_mov_b32_e32(v[1], s[1]),
      v_cvt_f32_f64_e32(v[2], v[0:1]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 1.0, places=5)

  def test_v_cvt_f32_f64_negative(self):
    """V_CVT_F32_F64 converts f64 -3.5 to f32."""
    f64_bits = f2i64(-3.5)
    lo, hi = f64_bits & 0xFFFFFFFF, (f64_bits >> 32) & 0xFFFFFFFF
    instructions = [
      s_mov_b32(s[0], lo),
      s_mov_b32(s[1], hi),
      v_mov_b32_e32(v[0], s[0]),
      v_mov_b32_e32(v[1], s[1]),
      v_cvt_f32_f64_e32(v[2], v[0:1]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(i2f(st.vgpr[0][2]), -3.5, places=5)

  def test_v_cvt_f32_f64_large(self):
    """V_CVT_F32_F64 converts large f64 to f32."""
    f64_bits = f2i64(123456.789)
    lo, hi = f64_bits & 0xFFFFFFFF, (f64_bits >> 32) & 0xFFFFFFFF
    instructions = [
      s_mov_b32(s[0], lo),
      s_mov_b32(s[1], hi),
      v_mov_b32_e32(v[0], s[0]),
      v_mov_b32_e32(v[1], s[1]),
      v_cvt_f32_f64_e32(v[2], v[0:1]),
    ]
    st = run_program(instructions, n_lanes=1)
    # places=0: f32 cannot represent 123456.789 exactly; only integer-level
    # agreement is expected after narrowing.
    self.assertAlmostEqual(i2f(st.vgpr[0][2]), 123456.789, places=0)

  def test_v_cvt_f64_i32_positive(self):
    """V_CVT_F64_I32 converts positive i32 to f64."""
    instructions = [
      s_mov_b32(s[0], 42),
      v_mov_b32_e32(v[0], s[0]),
      v_cvt_f64_i32_e32(v[2:3], v[0]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(self._read_f64(st), 42.0, places=10)

  def test_v_cvt_f64_i32_negative(self):
    """V_CVT_F64_I32 converts negative i32 to f64."""
    instructions = [
      s_mov_b32(s[0], 0xFFFFFFFF),  # -1 as i32
      v_mov_b32_e32(v[0], s[0]),
      v_cvt_f64_i32_e32(v[2:3], v[0]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(self._read_f64(st), -1.0, places=10)

  def test_v_cvt_f64_u32_large(self):
    """V_CVT_F64_U32 converts large u32 to f64."""
    instructions = [
      s_mov_b32(s[0], 0xFFFFFFFF),  # max u32; distinguishes u32 from i32 semantics
      v_mov_b32_e32(v[0], s[0]),
      v_cvt_f64_u32_e32(v[2:3], v[0]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertAlmostEqual(self._read_f64(st), 4294967295.0, places=0)

  def test_v_cvt_f64_u32_zero(self):
    """V_CVT_F64_U32 converts 0 to f64."""
    instructions = [
      v_mov_b32_e32(v[0], 0),
      v_cvt_f64_u32_e32(v[2:3], v[0]),
    ]
    st = run_program(instructions, n_lanes=1)
    self.assertEqual(self._read_f64(st), 0.0)
|
||||
|
||||
|
||||
class TestClz(unittest.TestCase):
|
||||
"""Tests for V_CLZ_I32_U32 - count leading zeros."""
|
||||
|
||||
@@ -560,7 +696,7 @@ class TestCvtF16Modifiers(unittest.TestCase):
|
||||
|
||||
def test_v_cvt_f32_f16_abs_negative(self):
|
||||
"""V_CVT_F32_F16 with |abs| on negative value."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16
|
||||
f16_neg1 = f32_to_f16(-1.0) # 0xbc00
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f16_neg1),
|
||||
@@ -573,7 +709,7 @@ class TestCvtF16Modifiers(unittest.TestCase):
|
||||
|
||||
def test_v_cvt_f32_f16_abs_positive(self):
|
||||
"""V_CVT_F32_F16 with |abs| on positive value (should stay positive)."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16
|
||||
f16_2 = f32_to_f16(2.0) # 0x4000
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f16_2),
|
||||
@@ -586,7 +722,7 @@ class TestCvtF16Modifiers(unittest.TestCase):
|
||||
|
||||
def test_v_cvt_f32_f16_neg_positive(self):
|
||||
"""V_CVT_F32_F16 with neg on positive value."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16
|
||||
f16_2 = f32_to_f16(2.0) # 0x4000
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f16_2),
|
||||
@@ -599,7 +735,7 @@ class TestCvtF16Modifiers(unittest.TestCase):
|
||||
|
||||
def test_v_cvt_f32_f16_neg_negative(self):
|
||||
"""V_CVT_F32_F16 with neg on negative value (double negative)."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16
|
||||
f16_neg2 = f32_to_f16(-2.0) # 0xc000
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f16_neg2),
|
||||
@@ -612,7 +748,7 @@ class TestCvtF16Modifiers(unittest.TestCase):
|
||||
|
||||
def test_v_cvt_f16_f32_then_pack_for_wmma(self):
|
||||
"""CVT F32->F16 followed by pack (common WMMA pattern)."""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
f32_val = 3.5
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f2i(f32_val)),
|
||||
@@ -668,7 +804,7 @@ class TestConversionRounding(unittest.TestCase):
|
||||
|
||||
def test_f16_to_f32_precision(self):
|
||||
"""F16 to F32 conversion precision."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16
|
||||
f16_val = f32_to_f16(1.5)
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f16_val),
|
||||
@@ -680,7 +816,7 @@ class TestConversionRounding(unittest.TestCase):
|
||||
|
||||
def test_f16_denormal_to_f32(self):
|
||||
"""F16 denormal converts to small positive f32."""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
f16_denorm = 0x0001 # Smallest positive f16 denormal
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], f16_denorm),
|
||||
@@ -1238,5 +1374,143 @@ class TestFloorEdgeCases(unittest.TestCase):
|
||||
self.assertAlmostEqual(i2f(st.vgpr[0][1]), -1.0, places=5)
|
||||
|
||||
|
||||
class TestVop1F16HiHalf(unittest.TestCase):
  """Regression tests for VOP1 f16 hi-half source operand handling.

  For 16-bit VOP1 operations, when src0 is in the range v[128]+ (offset >= 384),
  the hardware reads from the high 16 bits of v[src0-128]. The emulator must
  extract bits [31:16] from the actual VGPR.
  """

  def test_v_cvt_f32_f16_src_hi_half(self):
    """V_CVT_F32_F16 with source from hi-half (v[128]+).

    When src0 >= v[128], it reads from the high 16 bits of v[src0-128].
    This is critical for global_load_d16_hi_b16 + v_cvt_f32_f16 patterns.

    Regression test for: VOP1 f16 src0 hi-half extraction bug.
    """
    program = [
      s_mov_b32(s[0], 0x40003c00),        # v[0]: hi half = f16(2.0), lo half = f16(1.0)
      v_mov_b32_e32(v[0], s[0]),
      # src0 = v[128] addresses the hi half of v[0]; expect f32(2.0) out
      v_cvt_f32_f16_e32(v[1], v[128]),
    ]
    state = run_program(program, n_lanes=1)
    converted = i2f(state.vgpr[0][1])
    self.assertAlmostEqual(converted, 2.0, places=5, msg=f"Expected f32(2.0), got {converted}")

  def test_v_cvt_f32_f16_src_lo_vs_hi(self):
    """V_CVT_F32_F16 comparing lo and hi half reads.

    v[0] carries distinct values in its two halves; the lo-addressed convert
    must see 1.0 and the hi-addressed convert must see 2.0.

    Regression test for: VOP1 f16 src0 hi-half extraction bug.
    """
    program = [
      s_mov_b32(s[0], 0x40003c00),        # v[0]: hi = f16(2.0), lo = f16(1.0)
      v_mov_b32_e32(v[0], s[0]),
      v_cvt_f32_f16_e32(v[1], v[0]),      # lo-half read
      v_cvt_f32_f16_e32(v[2], v[128]),    # hi-half read
    ]
    state = run_program(program, n_lanes=1)
    from_lo = i2f(state.vgpr[0][1])
    from_hi = i2f(state.vgpr[0][2])
    self.assertAlmostEqual(from_lo, 1.0, places=5, msg=f"Expected f32(1.0) from lo, got {from_lo}")
    self.assertAlmostEqual(from_hi, 2.0, places=5, msg=f"Expected f32(2.0) from hi, got {from_hi}")

  def test_v_cvt_i16_f16_src_hi_half(self):
    """V_CVT_I16_F16 with source from hi-half.

    Regression test for: VOP1 f16 src0 hi-half extraction bug.
    """
    program = [
      s_mov_b32(s[0], 0xc0003c00),        # v[0]: hi = f16(-2.0), lo = f16(1.0)
      v_mov_b32_e32(v[0], s[0]),
      # hi-half read: f16(-2.0) should convert to i16(-2)
      v_cvt_i16_f16_e32(v[1], v[128]),
    ]
    state = run_program(program, n_lanes=1)
    got = state.vgpr[0][1] & 0xffff
    want = (-2) & 0xffff
    self.assertEqual(got, want, f"Expected i16(-2)=0x{want:04x}, got 0x{got:04x}")

  def test_v_mov_b16_src_hi_half(self):
    """V_MOV_B16 with source from hi-half.

    Regression test for: VOP1 f16 src0 hi-half extraction bug.
    """
    program = [
      s_mov_b32(s[0], 0xBEEFDEAD),        # v[0]: hi = 0xBEEF, lo = 0xDEAD
      v_mov_b32_e32(v[0], s[0]),
      v_mov_b32_e32(v[1], 0),             # clear the destination first
      # hi-half read: 0xBEEF should land in v[1].lo
      v_mov_b16_e32(v[1], v[128]),
    ]
    state = run_program(program, n_lanes=1)
    got = state.vgpr[0][1] & 0xffff
    self.assertEqual(got, 0xBEEF, f"Expected 0xBEEF from hi half, got 0x{got:04x}")
|
||||
|
||||
|
||||
class TestReciprocalF16(unittest.TestCase):
|
||||
"""Tests for V_RCP_F16 - reciprocal in half precision.
|
||||
|
||||
The pcode uses a 16-bit float literal: D0.f16 = 16'1.0 / S0.f16
|
||||
This tests that the sized float literal (16'1.0) is correctly parsed.
|
||||
"""
|
||||
|
||||
def test_v_rcp_f16_one(self):
|
||||
"""V_RCP_F16: 1/1.0 = 1.0"""
|
||||
import struct
|
||||
def f16_to_bits(f): return struct.unpack('<H', struct.pack('<e', f))[0]
|
||||
def bits_to_f16(b): return struct.unpack('<e', struct.pack('<H', b))[0]
|
||||
instructions = [
|
||||
# Load f16 1.0 into low 16 bits of v[0]
|
||||
v_mov_b32_e32(v[0], f16_to_bits(1.0)),
|
||||
v_rcp_f16_e32(v[1], v[0]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
result = bits_to_f16(st.vgpr[0][1] & 0xFFFF)
|
||||
self.assertAlmostEqual(result, 1.0, places=2, msg="1/1.0 should be 1.0")
|
||||
|
||||
def test_v_rcp_f16_two(self):
|
||||
"""V_RCP_F16: 1/2.0 = 0.5"""
|
||||
import struct
|
||||
def f16_to_bits(f): return struct.unpack('<H', struct.pack('<e', f))[0]
|
||||
def bits_to_f16(b): return struct.unpack('<e', struct.pack('<H', b))[0]
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], f16_to_bits(2.0)),
|
||||
v_rcp_f16_e32(v[1], v[0]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
result = bits_to_f16(st.vgpr[0][1] & 0xFFFF)
|
||||
self.assertAlmostEqual(result, 0.5, places=2, msg="1/2.0 should be 0.5")
|
||||
|
||||
def test_v_rcp_f16_four(self):
|
||||
"""V_RCP_F16: 1/4.0 = 0.25"""
|
||||
import struct
|
||||
def f16_to_bits(f): return struct.unpack('<H', struct.pack('<e', f))[0]
|
||||
def bits_to_f16(b): return struct.unpack('<e', struct.pack('<H', b))[0]
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], f16_to_bits(4.0)),
|
||||
v_rcp_f16_e32(v[1], v[0]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
result = bits_to_f16(st.vgpr[0][1] & 0xFFFF)
|
||||
self.assertAlmostEqual(result, 0.25, places=2, msg="1/4.0 should be 0.25")
|
||||
|
||||
|
||||
if __name__ == '__main__': unittest.main()
|
||||
|
||||
@@ -341,6 +341,293 @@ class TestHiHalfOps(unittest.TestCase):
|
||||
self.assertEqual(result, 0x4200, f"Lane {lane}: expected 0x4200, got 0x{result:04x}")
|
||||
|
||||
|
||||
class TestVop2F16HiHalf(unittest.TestCase):
  """Regression tests for VOP2 f16 hi-half operand handling.

  These test the bugs where:
  1. VOP2 vsrc1 >= 384 (v[128]+) wasn't extracting hi 16 bits
  2. VOP2 vdst >= 384 (v[128]+) wasn't preserving lo 16 bits
  """

  @staticmethod
  def _halves(word):
    """Split a 32-bit VGPR value into its (lo, hi) 16-bit halves."""
    return word & 0xffff, (word >> 16) & 0xffff

  def test_v_add_f16_e32_vsrc1_hi_half(self):
    """V_ADD_F16_E32 with vsrc1 from hi-half (v[128]+).

    When vsrc1 >= 384 (representing v[128]+), the hardware reads from the hi 16 bits
    of v[vsrc1-128]. The emulator must extract bits [31:16] from the actual VGPR.

    Regression test for: VOP2 f16 vsrc1 hi-half extraction bug.
    """
    program = [
      s_mov_b32(s[0], 0x40003c00),        # v[0]: hi = f16(2.0), lo = f16(1.0)
      v_mov_b32_e32(v[0], s[0]),
      # vsrc1=v[128] maps to v[0].hi, so v[1] = v[0].lo + v[0].hi = 1.0 + 2.0
      VOP2(VOP2Op.V_ADD_F16, vdst=v[1], src0=v[0], vsrc1=v[128]),
    ]
    state = run_program(program, n_lanes=1)
    got, _ = self._halves(state.vgpr[0][1])
    # f16(3.0) encodes as 0x4200
    self.assertEqual(got, 0x4200, f"Expected f16(3.0)=0x4200, got 0x{got:04x}")

  def test_v_mul_f16_e32_vsrc1_hi_half(self):
    """V_MUL_F16_E32 with vsrc1 from hi-half.

    Regression test for: VOP2 f16 vsrc1 hi-half extraction bug.
    """
    program = [
      s_mov_b32(s[0], 0x42004000),        # v[0]: hi = f16(3.0), lo = f16(2.0)
      v_mov_b32_e32(v[0], s[0]),
      # vsrc1=v[128] maps to v[0].hi, so v[1] = 2.0 * 3.0 = 6.0
      VOP2(VOP2Op.V_MUL_F16, vdst=v[1], src0=v[0], vsrc1=v[128]),
    ]
    state = run_program(program, n_lanes=1)
    got, _ = self._halves(state.vgpr[0][1])
    # f16(6.0) encodes as 0x4600
    self.assertEqual(got, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{got:04x}")

  def test_v_add_f16_e32_vdst_hi_half(self):
    """V_ADD_F16_E32 writing to hi-half destination (v[128]+).

    When vdst >= 384 (representing v[128]+), the hardware writes to bits [31:16]
    of v[vdst-128] while preserving bits [15:0]. The emulator must merge the result.

    Regression test for: VOP2 f16 vdst hi-half write bug.
    """
    program = [
      s_mov_b32(s[0], 0x0000BEEF),        # marker in v[0].lo; must survive the write
      v_mov_b32_e32(v[0], s[0]),
      s_mov_b32(s[1], 0x3c00),            # f16(1.0)
      s_mov_b32(s[2], 0x4000),            # f16(2.0)
      v_mov_b32_e32(v[1], s[1]),
      v_mov_b32_e32(v[2], s[2]),
      # vdst=v[128] targets v[0].hi: hi <- 1.0 + 2.0 = 3.0, lo untouched
      VOP2(VOP2Op.V_ADD_F16, vdst=v[128], src0=v[1], vsrc1=v[2]),
    ]
    state = run_program(program, n_lanes=1)
    lo, hi = self._halves(state.vgpr[0][0])
    self.assertEqual(hi, 0x4200, f"Expected hi=f16(3.0)=0x4200, got 0x{hi:04x}")
    self.assertEqual(lo, 0xBEEF, f"Expected lo preserved=0xBEEF, got 0x{lo:04x}")

  def test_v_mul_f16_e32_vdst_hi_half(self):
    """V_MUL_F16_E32 writing to hi-half destination.

    Regression test for: VOP2 f16 vdst hi-half write bug.
    """
    program = [
      s_mov_b32(s[0], 0x0000DEAD),        # marker in v[0].lo; must survive the write
      v_mov_b32_e32(v[0], s[0]),
      s_mov_b32(s[1], 0x4000),            # f16(2.0)
      s_mov_b32(s[2], 0x4400),            # f16(4.0)
      v_mov_b32_e32(v[1], s[1]),
      v_mov_b32_e32(v[2], s[2]),
      # vdst=v[128] targets v[0].hi: hi <- 2.0 * 4.0 = 8.0, lo untouched
      VOP2(VOP2Op.V_MUL_F16, vdst=v[128], src0=v[1], vsrc1=v[2]),
    ]
    state = run_program(program, n_lanes=1)
    lo, hi = self._halves(state.vgpr[0][0])
    self.assertEqual(hi, 0x4800, f"Expected hi=f16(8.0)=0x4800, got 0x{hi:04x}")
    self.assertEqual(lo, 0xDEAD, f"Expected lo preserved=0xDEAD, got 0x{lo:04x}")

  def test_v_add_f16_e32_both_hi_half(self):
    """V_ADD_F16_E32 with both vsrc1 and vdst as hi-half (different underlying regs).

    Tests the combination of both fixes: reading vsrc1 from hi-half AND
    writing result to hi-half destination, using different underlying VGPRs.

    Regression test for: VOP2 f16 hi-half bugs (combined).
    """
    program = [
      s_mov_b32(s[0], 0x40000000),        # v[0].hi = f16(2.0) for vsrc1
      v_mov_b32_e32(v[0], s[0]),
      s_mov_b32(s[1], 0x00003c00),        # v[1].lo = f16(1.0) for src0
      v_mov_b32_e32(v[1], s[1]),
      s_mov_b32(s[2], 0x0000CAFE),        # v[2].lo = marker for vdst preservation
      v_mov_b32_e32(v[2], s[2]),
      # src0 = v[1].lo (1.0), vsrc1 = v[128] -> v[0].hi (2.0),
      # vdst = v[130] -> v[2].hi; lo half of v[2] must survive
      VOP2(VOP2Op.V_ADD_F16, vdst=v[130], src0=v[1], vsrc1=v[128]),
    ]
    state = run_program(program, n_lanes=1)
    lo, hi = self._halves(state.vgpr[0][2])
    self.assertEqual(hi, 0x4200, f"Expected hi=f16(3.0)=0x4200, got 0x{hi:04x}")
    self.assertEqual(lo, 0xCAFE, f"Expected lo preserved=0xCAFE, got 0x{lo:04x}")

  def test_v_fmac_f16_e32_vsrc1_hi_half(self):
    """V_FMAC_F16_E32 with vsrc1 from hi-half.

    V_FMAC_F16: vdst = vdst + src0 * vsrc1

    Regression test for: VOP2 f16 vsrc1 hi-half extraction bug.
    """
    program = [
      s_mov_b32(s[0], 0x40003c00),        # v[0]: hi = f16(2.0), lo = f16(1.0)
      v_mov_b32_e32(v[0], s[0]),
      s_mov_b32(s[1], 0x4200),            # accumulator v[1] = f16(3.0)
      v_mov_b32_e32(v[1], s[1]),
      # v[1] = 3.0 + v[0].lo * v[0].hi = 3.0 + 1.0 * 2.0 = 5.0
      VOP2(VOP2Op.V_FMAC_F16, vdst=v[1], src0=v[0], vsrc1=v[128]),
    ]
    state = run_program(program, n_lanes=1)
    got, _ = self._halves(state.vgpr[0][1])
    # f16(5.0) encodes as 0x4500
    self.assertEqual(got, 0x4500, f"Expected f16(5.0)=0x4500, got 0x{got:04x}")

  def test_v_fmac_f16_e32_vdst_hi_half(self):
    """V_FMAC_F16_E32 writing to hi-half destination.

    V_FMAC_F16: vdst.h = vdst.h + src0 * vsrc1

    When vdst is v[128]+, the accumulator D0 must also read from the hi-half.
    This tests the bug where D0 was read from lo-half instead of hi-half.

    Regression test for: VOP2 FMAC hi-half D0 accumulator read bug.
    """
    program = [
      s_mov_b32(s[0], 0x3800DEAD),        # v[0]: hi = f16(0.5) accumulator, lo = 0xDEAD marker
      v_mov_b32_e32(v[0], s[0]),
      s_mov_b32(s[1], 0x4000),            # f16(2.0)
      v_mov_b32_e32(v[1], s[1]),
      s_mov_b32(s[2], 0x4200),            # f16(3.0)
      v_mov_b32_e32(v[2], s[2]),
      # vdst=v[128] -> v[0].hi; D0 must be read from v[0].hi (0.5):
      # v[0].hi = 0.5 + 2.0 * 3.0 = 6.5, v[0].lo preserved
      VOP2(VOP2Op.V_FMAC_F16, vdst=v[128], src0=v[1], vsrc1=v[2]),
    ]
    state = run_program(program, n_lanes=1)
    lo, hi = self._halves(state.vgpr[0][0])
    self.assertEqual(hi, 0x4680, f"Expected hi=f16(6.5)=0x4680, got 0x{hi:04x}")
    self.assertEqual(lo, 0xDEAD, f"Expected lo preserved=0xDEAD, got 0x{lo:04x}")

  def test_v_mul_f16_e32_src0_hi_half(self):
    """V_MUL_F16_E32 with src0 from hi-half (src0 >= v[128]).

    When src0 >= 384 (representing v[128]+), the hardware reads from the hi 16 bits
    of v[src0-128]. The emulator must extract bits [31:16] from the actual VGPR.

    Regression test for: VOP2 f16 src0 hi-half extraction bug.
    """
    program = [
      s_mov_b32(s[0], 0x40003c00),        # v[0]: hi = f16(2.0), lo = f16(1.0)
      v_mov_b32_e32(v[0], s[0]),
      s_mov_b32(s[1], 0x4200),            # f16(3.0)
      v_mov_b32_e32(v[1], s[1]),
      # src0=v[128] -> v[0].hi (2.0): v[2] = 2.0 * 3.0 = 6.0
      VOP2(VOP2Op.V_MUL_F16, vdst=v[2], src0=v[128], vsrc1=v[1]),
    ]
    state = run_program(program, n_lanes=1)
    got, _ = self._halves(state.vgpr[0][2])
    # f16(6.0) encodes as 0x4600
    self.assertEqual(got, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{got:04x}")

  def test_v_add_f16_e32_src0_hi_half(self):
    """V_ADD_F16_E32 with src0 from hi-half (src0 >= v[128]).

    Regression test for: VOP2 f16 src0 hi-half extraction bug.
    """
    program = [
      s_mov_b32(s[0], 0x40003c00),        # v[0]: hi = f16(2.0), lo = f16(1.0)
      v_mov_b32_e32(v[0], s[0]),
      s_mov_b32(s[1], 0x4500),            # f16(5.0)
      v_mov_b32_e32(v[1], s[1]),
      # src0=v[128] -> v[0].hi (2.0): v[2] = 2.0 + 5.0 = 7.0
      VOP2(VOP2Op.V_ADD_F16, vdst=v[2], src0=v[128], vsrc1=v[1]),
    ]
    state = run_program(program, n_lanes=1)
    got, _ = self._halves(state.vgpr[0][2])
    # f16(7.0) encodes as 0x4700
    self.assertEqual(got, 0x4700, f"Expected f16(7.0)=0x4700, got 0x{got:04x}")
|
||||
|
||||
|
||||
class TestF16InlineConstants(unittest.TestCase):
  """Regression tests for VOP2 F16 inline float constants.

  For 16-bit VOP2 operations (v_add_f16, v_mul_f16, etc.), inline float constants
  like 1.0, 2.0 must use F16 encoding (0x3c00, 0x4000) not F32 encoding (0x3f800000).

  The emulator's rsrc() function needs bits=16 to select F16_INLINE constants.

  Regression test for: VOP2 16-bit inline constant using F32 instead of F16.
  """

  def test_v_add_f16_inline_constant_1_0(self):
    """V_ADD_F16_E32 with inline constant 1.0 should use F16 encoding."""
    program = [
      s_mov_b32(s[0], 0x3c00),            # f16 1.0
      v_mov_b32_e32(v[0], s[0]),
      # the literal 1.0 must encode as F16 0x3c00, not F32 0x3f800000
      v_add_f16_e32(v[1], 1.0, v[0]),
    ]
    state = run_program(program, n_lanes=1)
    got = state.vgpr[0][1] & 0xFFFF
    # 1.0 + 1.0 = 2.0 -> f16 encoding 0x4000
    self.assertEqual(got, 0x4000, f"Expected f16(2.0)=0x4000, got 0x{got:04x}")

  def test_v_add_f16_inline_constant_2_0(self):
    """V_ADD_F16_E32 with inline constant 2.0."""
    program = [
      s_mov_b32(s[0], 0x4200),            # f16 3.0
      v_mov_b32_e32(v[0], s[0]),
      v_add_f16_e32(v[1], 2.0, v[0]),
    ]
    state = run_program(program, n_lanes=1)
    got = state.vgpr[0][1] & 0xFFFF
    # 2.0 + 3.0 = 5.0 -> f16 encoding 0x4500
    self.assertEqual(got, 0x4500, f"Expected f16(5.0)=0x4500, got 0x{got:04x}")

  def test_v_mul_f16_inline_constant(self):
    """V_MUL_F16_E32 with inline constant 2.0."""
    program = [
      s_mov_b32(s[0], 0x4200),            # f16 3.0
      v_mov_b32_e32(v[0], s[0]),
      v_mul_f16_e32(v[1], 2.0, v[0]),
    ]
    state = run_program(program, n_lanes=1)
    got = state.vgpr[0][1] & 0xFFFF
    # 2.0 * 3.0 = 6.0 -> f16 encoding 0x4600
    self.assertEqual(got, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{got:04x}")
|
||||
|
||||
|
||||
class TestCndmask(unittest.TestCase):
|
||||
"""Tests for V_CNDMASK_B32 and V_CNDMASK_B16."""
|
||||
|
||||
@@ -447,5 +734,132 @@ class TestSpecialFloatValues(unittest.TestCase):
|
||||
self.assertEqual(st.vgpr[0][1], 0x00000000)
|
||||
|
||||
|
||||
class TestCarryOps(unittest.TestCase):
  """Tests for VOP2 carry instructions (v_add_co_ci_u32, v_sub_co_ci_u32, v_subrev_co_ci_u32)."""

  def test_v_subrev_co_ci_u32_no_borrow(self):
    """V_SUBREV_CO_CI_U32: D0 = S1 - S0 - VCC_IN, when VCC_IN=0."""
    program = [
      s_mov_b32(VCC_LO, 0),                     # clear borrow-in
      v_mov_b32_e32(v[0], 5),                   # S0
      v_mov_b32_e32(v[1], 10),                  # S1
      v_subrev_co_ci_u32_e32(v[2], v[0], v[1]), # 10 - 5 - 0 = 5
    ]
    state = run_program(program, n_lanes=1)
    self.assertEqual(state.vgpr[0][2], 5)
    self.assertEqual(state.vcc, 0)              # no borrow produced

  def test_v_subrev_co_ci_u32_with_borrow(self):
    """V_SUBREV_CO_CI_U32: D0 = S1 - S0 - VCC_IN, when VCC_IN=1."""
    program = [
      s_mov_b32(VCC_LO, 1),                     # borrow-in set
      v_mov_b32_e32(v[0], 5),                   # S0
      v_mov_b32_e32(v[1], 10),                  # S1
      v_subrev_co_ci_u32_e32(v[2], v[0], v[1]), # 10 - 5 - 1 = 4
    ]
    state = run_program(program, n_lanes=1)
    self.assertEqual(state.vgpr[0][2], 4)
    self.assertEqual(state.vcc, 0)              # no borrow produced

  def test_v_subrev_co_ci_u32_generates_borrow(self):
    """V_SUBREV_CO_CI_U32: generates borrow when S0 + VCC_IN > S1."""
    program = [
      s_mov_b32(VCC_LO, 0),
      v_mov_b32_e32(v[0], 10),                  # S0
      v_mov_b32_e32(v[1], 5),                   # S1
      v_subrev_co_ci_u32_e32(v[2], v[0], v[1]), # 5 - 10 - 0 underflows
    ]
    state = run_program(program, n_lanes=1)
    self.assertEqual(state.vgpr[0][2], 0xFFFFFFFB)  # -5 wrapped to u32
    self.assertEqual(state.vcc, 1)                  # borrow produced

  def test_v_add_co_ci_u32_no_carry(self):
    """V_ADD_CO_CI_U32: D0 = S0 + S1 + VCC_IN, when VCC_IN=0."""
    program = [
      s_mov_b32(VCC_LO, 0),                     # clear carry-in
      v_mov_b32_e32(v[0], 5),                   # S0
      v_mov_b32_e32(v[1], 10),                  # S1
      v_add_co_ci_u32_e32(v[2], v[0], v[1]),    # 5 + 10 + 0 = 15
    ]
    state = run_program(program, n_lanes=1)
    self.assertEqual(state.vgpr[0][2], 15)
    self.assertEqual(state.vcc, 0)              # no carry produced

  def test_v_add_co_ci_u32_with_carry(self):
    """V_ADD_CO_CI_U32: D0 = S0 + S1 + VCC_IN, when VCC_IN=1."""
    program = [
      s_mov_b32(VCC_LO, 1),                     # carry-in set
      v_mov_b32_e32(v[0], 5),                   # S0
      v_mov_b32_e32(v[1], 10),                  # S1
      v_add_co_ci_u32_e32(v[2], v[0], v[1]),    # 5 + 10 + 1 = 16
    ]
    state = run_program(program, n_lanes=1)
    self.assertEqual(state.vgpr[0][2], 16)
    self.assertEqual(state.vcc, 0)              # no carry produced

  def test_v_add_co_ci_u32_generates_carry(self):
    """V_ADD_CO_CI_U32: generates carry when overflow occurs."""
    program = [
      s_mov_b32(VCC_LO, 1),                     # carry-in set
      s_mov_b32(s[0], 0xFFFFFFFF),              # max u32
      v_mov_b32_e32(v[0], s[0]),                # S0
      v_mov_b32_e32(v[1], 0),                   # S1
      v_add_co_ci_u32_e32(v[2], v[0], v[1]),    # 0xFFFFFFFF + 0 + 1 wraps to 0
    ]
    state = run_program(program, n_lanes=1)
    self.assertEqual(state.vgpr[0][2], 0)       # wrapped result
    self.assertEqual(state.vcc, 1)              # carry produced

  def test_v_sub_co_ci_u32_no_borrow(self):
    """V_SUB_CO_CI_U32: D0 = S0 - S1 - VCC_IN, when VCC_IN=0."""
    program = [
      s_mov_b32(VCC_LO, 0),                     # clear borrow-in
      v_mov_b32_e32(v[0], 10),                  # S0
      v_mov_b32_e32(v[1], 5),                   # S1
      v_sub_co_ci_u32_e32(v[2], v[0], v[1]),    # 10 - 5 - 0 = 5
    ]
    state = run_program(program, n_lanes=1)
    self.assertEqual(state.vgpr[0][2], 5)
    self.assertEqual(state.vcc, 0)              # no borrow produced

  def test_v_sub_co_ci_u32_vop3sd_separate_carry_regs(self):
    """VOP3SD V_SUB_CO_CI_U32: carry-in from src2, carry-out to sdst (separate registers).

    This tests the VOP3SD encoding where src2 specifies the carry-in register
    independently from sdst (carry-out). The bug was reading carry-in from sdst
    instead of src2.

    Computation: D0 = S0 - S1 - carry_in = 0 - 0 - 1 = -1 = 0xFFFFFFFF
    """
    program = [
      s_mov_b32(s[6], 1),                       # carry-in register
      s_mov_b32(s[10], 0),                      # carry-out register, starts clear
      # v_sub_co_ci_u32(vdst, sdst, src0, src1, src2): src2 carries in, sdst carries out
      v_sub_co_ci_u32(v[0], s[10], 0, 0, s[6]),
    ]
    state = run_program(program, n_lanes=1)
    self.assertEqual(state.vgpr[0][0], 0xFFFFFFFF)  # -1 wrapped to u32
    self.assertEqual(state.sgpr[10], 1)             # borrow lands in s[10]

  def test_v_add_co_ci_u32_vop3sd_separate_carry_regs(self):
    """VOP3SD V_ADD_CO_CI_U32: carry-in from src2, carry-out to sdst (separate registers).

    This tests the VOP3SD encoding where src2 specifies the carry-in register
    independently from sdst (carry-out).

    Computation: D0 = S0 + S1 + carry_in = 5 + 10 + 1 = 16
    """
    program = [
      s_mov_b32(s[6], 1),                       # carry-in register
      s_mov_b32(s[10], 0),                      # carry-out register, starts clear
      # v_add_co_ci_u32(vdst, sdst, src0, src1, src2)
      v_add_co_ci_u32(v[0], s[10], 5, 10, s[6]),
    ]
    state = run_program(program, n_lanes=1)
    self.assertEqual(state.vgpr[0][0], 16)
    self.assertEqual(state.sgpr[10], 0)         # no carry produced
|
||||
|
||||
|
||||
if __name__ == '__main__': unittest.main()
|
||||
|
||||
@@ -58,6 +58,95 @@ class TestFMA(unittest.TestCase):
|
||||
self.assertTrue(math.isinf(result) and result > 0)
|
||||
|
||||
|
||||
class TestFmacE64(unittest.TestCase):
|
||||
"""Regression tests for V_FMAC_F32 VOP3 encoding (e64).
|
||||
|
||||
V_FMAC_F32: D0 = D0 + S0 * S1 (fused multiply-add with accumulator)
|
||||
|
||||
The VOP3 encoding needs to read D0 from the destination register as the
|
||||
accumulator input, not just write to it.
|
||||
|
||||
Regression test for: VOP3 FMAC missing D0 accumulator bug.
|
||||
"""
|
||||
|
||||
def test_v_fmac_f32_e64_basic(self):
|
||||
"""V_FMAC_F32_E64: basic accumulate test."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 2.0), # S0 = 2.0
|
||||
v_mov_b32_e32(v[1], 3.0), # S1 = 3.0
|
||||
v_mov_b32_e32(v[2], 1.0), # D0 (accumulator) = 1.0
|
||||
# v_fmac_f32_e64 v[2], v[0], v[1]
|
||||
# D0 = D0 + S0 * S1 = 1.0 + 2.0 * 3.0 = 7.0
|
||||
v_fmac_f32_e64(v[2], v[0], v[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 7.0, places=5)
|
||||
|
||||
def test_v_fmac_f32_e64_with_sgpr_sources(self):
|
||||
"""V_FMAC_F32_E64 with SGPR sources (common in AMD_LLVM output).
|
||||
|
||||
This tests the exact pattern that was failing: v_fmac_f32_e64(v[0], s[4], 0)
|
||||
where src0 is SGPR and src1 is inline constant 0.
|
||||
|
||||
Regression test for: VOP3 FMAC missing D0 accumulator bug.
|
||||
"""
|
||||
instructions = [
|
||||
s_mov_b32(s[4], f2i(2.0)), # S0 = 2.0 in SGPR
|
||||
v_mov_b32_e32(v[0], 5.0), # D0 (accumulator) = 5.0
|
||||
# v_fmac_f32_e64 v[0], s[4], 0
|
||||
# D0 = D0 + S0 * S1 = 5.0 + 2.0 * 0.0 = 5.0
|
||||
v_fmac_f32_e64(v[0], s[4], 0),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertAlmostEqual(i2f(st.vgpr[0][0]), 5.0, places=5)
|
||||
|
||||
def test_v_fmac_f32_e64_with_two_sgprs(self):
|
||||
"""V_FMAC_F32_E64 with two SGPR sources.
|
||||
|
||||
Tests pattern: v_fmac_f32_e64(v[0], s[a], s[b])
|
||||
|
||||
Regression test for: VOP3 FMAC missing D0 accumulator bug.
|
||||
"""
|
||||
instructions = [
|
||||
s_mov_b32(s[10], f2i(3.0)), # S0 = 3.0
|
||||
s_mov_b32(s[12], f2i(4.0)), # S1 = 4.0
|
||||
v_mov_b32_e32(v[9], 2.0), # D0 (accumulator) = 2.0
|
||||
# v_fmac_f32_e64 v[9], s[10], s[12]
|
||||
# D0 = D0 + S0 * S1 = 2.0 + 3.0 * 4.0 = 14.0
|
||||
v_fmac_f32_e64(v[9], s[10], s[12]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertAlmostEqual(i2f(st.vgpr[0][9]), 14.0, places=5)
|
||||
|
||||
def test_v_fmac_f32_e64_accumulates_correctly(self):
|
||||
"""V_FMAC_F32_E64 accumulates multiple times."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 0.0), # D0 = 0.0
|
||||
v_mov_b32_e32(v[1], 1.0), # S0 = 1.0
|
||||
v_mov_b32_e32(v[2], 2.0), # S1 = 2.0
|
||||
# First: D0 = 0.0 + 1.0 * 2.0 = 2.0
|
||||
v_fmac_f32_e64(v[0], v[1], v[2]),
|
||||
# Second: D0 = 2.0 + 1.0 * 2.0 = 4.0
|
||||
v_fmac_f32_e64(v[0], v[1], v[2]),
|
||||
# Third: D0 = 4.0 + 1.0 * 2.0 = 6.0
|
||||
v_fmac_f32_e64(v[0], v[1], v[2]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertAlmostEqual(i2f(st.vgpr[0][0]), 6.0, places=5)
|
||||
|
||||
def test_v_fmac_f32_e64_negative_accumulator(self):
|
||||
"""V_FMAC_F32_E64 with negative accumulator."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 2.0), # S0 = 2.0
|
||||
v_mov_b32_e32(v[1], 3.0), # S1 = 3.0
|
||||
v_mov_b32_e32(v[2], -10.0), # D0 (accumulator) = -10.0
|
||||
# D0 = -10.0 + 2.0 * 3.0 = -4.0
|
||||
v_fmac_f32_e64(v[2], v[0], v[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertAlmostEqual(i2f(st.vgpr[0][2]), -4.0, places=5)
|
||||
|
||||
|
||||
class TestDivScale(unittest.TestCase):
|
||||
"""Tests for V_DIV_SCALE_F32."""
|
||||
|
||||
@@ -768,7 +857,7 @@ class TestF16Modifiers(unittest.TestCase):
|
||||
|
||||
def test_v_fma_f16_inline_const_1_0(self):
|
||||
"""V_FMA_F16: a*b + 1.0 should use f16 inline constant."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16, _f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16
|
||||
f16_a = f32_to_f16(0.325928) # ~0x3537
|
||||
f16_b = f32_to_f16(-0.486572) # ~0xb7c9
|
||||
instructions = [
|
||||
@@ -785,7 +874,7 @@ class TestF16Modifiers(unittest.TestCase):
|
||||
|
||||
def test_v_fma_f16_inline_const_0_5(self):
|
||||
"""V_FMA_F16: a*b + 0.5 should use f16 inline constant."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16, _f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16
|
||||
f16_a = f32_to_f16(2.0)
|
||||
f16_b = f32_to_f16(3.0)
|
||||
instructions = [
|
||||
@@ -802,7 +891,7 @@ class TestF16Modifiers(unittest.TestCase):
|
||||
|
||||
def test_v_fma_f16_inline_const_neg_1_0(self):
|
||||
"""V_FMA_F16: a*b + (-1.0) should use f16 inline constant."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16, _f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16
|
||||
f16_a = f32_to_f16(2.0)
|
||||
f16_b = f32_to_f16(3.0)
|
||||
instructions = [
|
||||
@@ -819,7 +908,7 @@ class TestF16Modifiers(unittest.TestCase):
|
||||
|
||||
def test_v_add_f16_abs_both(self):
|
||||
"""V_ADD_F16 with abs on both operands."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16, _f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16
|
||||
f16_neg2 = f32_to_f16(-2.0)
|
||||
f16_neg3 = f32_to_f16(-3.0)
|
||||
instructions = [
|
||||
@@ -835,7 +924,7 @@ class TestF16Modifiers(unittest.TestCase):
|
||||
|
||||
def test_v_mul_f16_neg_abs(self):
|
||||
"""V_MUL_F16 with neg on one operand and abs on another."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16, _f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16, _f16
|
||||
f16_2 = f32_to_f16(2.0)
|
||||
f16_neg3 = f32_to_f16(-3.0)
|
||||
instructions = [
|
||||
@@ -854,7 +943,7 @@ class TestF16Modifiers(unittest.TestCase):
|
||||
|
||||
This tests the case from AMD_LLVM sin(0) where V_FMAC_F16 writes to v0.h.
|
||||
"""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x38003c00), # v0 = {hi=0.5, lo=1.0}
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
@@ -1621,6 +1710,27 @@ class TestCarryBorrow(unittest.TestCase):
|
||||
self.assertEqual(st.vgpr[0][4], 0x00000000, "lo result")
|
||||
self.assertEqual(st.vgpr[0][5], 0x00000003, "hi result")
|
||||
|
||||
def test_add_co_u32_same_dst_src(self):
|
||||
"""V_ADD_CO_U32 where dst is same as src - VCC must use original src value."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xFFFFFFFF),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_add_co_u32(v[0], VCC, v[0], 1), # v[0] = v[0] + 1, VCC should be set from overflow
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][0], 0, "0xFFFFFFFF + 1 = 0")
|
||||
self.assertEqual(st.vcc & 1, 1, "Should have carry from 0xFFFFFFFF + 1")
|
||||
|
||||
def test_add_co_u32_same_dst_src_no_carry(self):
|
||||
"""V_ADD_CO_U32 where dst is same as src - no carry case."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 100),
|
||||
v_add_co_u32(v[0], VCC, v[0], 1), # v[0] = v[0] + 1
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][0], 101, "100 + 1 = 101")
|
||||
self.assertEqual(st.vcc & 1, 0, "No carry from 100 + 1")
|
||||
|
||||
|
||||
class TestReadlane(unittest.TestCase):
|
||||
"""Tests for V_READLANE_B32 and related cross-lane operations."""
|
||||
@@ -2292,5 +2402,414 @@ class TestAddF32EdgeCases(unittest.TestCase):
|
||||
self.assertEqual(st.vgpr[0][2], 0x80000000) # -0
|
||||
|
||||
|
||||
class TestDivScaleF64(unittest.TestCase):
|
||||
"""Tests for V_DIV_SCALE_F64 - critical for tan() and division.
|
||||
|
||||
These tests verify that VCC bits are set independently per lane,
|
||||
which is essential for correct multi-lane f64 division operations.
|
||||
"""
|
||||
|
||||
def test_div_scale_f64_basic_no_scaling(self):
|
||||
"""V_DIV_SCALE_F64: normal values with no scaling needed."""
|
||||
sqrt2 = f2i64(1.4142135623730951)
|
||||
one = f2i64(1.0)
|
||||
instructions = [
|
||||
s_mov_b32(s[0], sqrt2 & 0xffffffff),
|
||||
s_mov_b32(s[1], sqrt2 >> 32),
|
||||
s_mov_b32(s[2], one & 0xffffffff),
|
||||
s_mov_b32(s[3], one >> 32),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_mov_b32_e32(v[1], s[1]),
|
||||
v_mov_b32_e32(v[2], s[2]),
|
||||
v_mov_b32_e32(v[3], s[3]),
|
||||
VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[0:1], src2=v[2:3]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32))
|
||||
self.assertAlmostEqual(result, 1.4142135623730951, places=10)
|
||||
self.assertEqual(st.vcc & 1, 0, "VCC should be 0 when no scaling needed")
|
||||
|
||||
def test_div_scale_f64_vcc_per_lane_uniform_input(self):
|
||||
"""V_DIV_SCALE_F64: VCC bits should be set independently per lane (uniform input).
|
||||
|
||||
This is a regression test for the bug where VCC = 0x0LL was setting the whole
|
||||
64-bit VCC register instead of just the current lane's bit. With uniform input
|
||||
all lanes should get VCC=0.
|
||||
"""
|
||||
val = f2i64(2.0)
|
||||
instructions = [
|
||||
s_mov_b32(s[0], val & 0xffffffff),
|
||||
s_mov_b32(s[1], val >> 32),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_mov_b32_e32(v[1], s[1]),
|
||||
VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[2:3], sdst=VCC, src0=v[0:1], src1=v[0:1], src2=v[0:1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=4)
|
||||
# All lanes should have VCC=0 for normal values
|
||||
self.assertEqual(st.vcc & 0xf, 0, "All lanes should have VCC=0 for normal values")
|
||||
# All lanes should have same result
|
||||
for lane in range(4):
|
||||
result = i642f(st.vgpr[lane][2] | (st.vgpr[lane][3] << 32))
|
||||
self.assertAlmostEqual(result, 2.0, places=10, msg=f"Lane {lane} result mismatch")
|
||||
|
||||
def test_div_scale_f64_vcc_per_lane_varying_input(self):
|
||||
"""V_DIV_SCALE_F64: VCC bits set per-lane with different inputs per lane.
|
||||
|
||||
This test uses different inputs per lane to verify that VCC is tracked
|
||||
independently. This catches the bug where the emulator was setting VCC
|
||||
for all lanes to the same value.
|
||||
"""
|
||||
import math
|
||||
# Use lane-varying input: lane 0 gets 2.0, lane 1 gets 3.0, etc.
|
||||
# All normal values should result in VCC=0 for each lane
|
||||
instructions = [
|
||||
# Set up per-lane values using lane_id
|
||||
v_cvt_f64_i32_e32(v[0:1], v[255]), # v0:1 = f64(lane_id)
|
||||
v_add_f64(v[0:1], v[0:1], SrcEnum.POS_TWO), # v0:1 = lane_id + 2.0
|
||||
VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[2:3], sdst=VCC, src0=v[0:1], src1=v[0:1], src2=v[0:1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=4)
|
||||
# All lanes should have VCC=0 (no scaling needed for 2.0, 3.0, 4.0, 5.0)
|
||||
self.assertEqual(st.vcc & 0xf, 0, "All lanes should have VCC=0 for normal values")
|
||||
# Verify each lane has correct result
|
||||
for lane in range(4):
|
||||
expected = float(lane) + 2.0
|
||||
result = i642f(st.vgpr[lane][2] | (st.vgpr[lane][3] << 32))
|
||||
self.assertAlmostEqual(result, expected, places=10, msg=f"Lane {lane}: expected {expected}, got {result}")
|
||||
|
||||
def test_div_scale_f64_zero_denom_sets_vcc(self):
|
||||
"""V_DIV_SCALE_F64: zero denominator -> NaN, VCC=1."""
|
||||
import math
|
||||
one = f2i64(1.0)
|
||||
zero = f2i64(0.0)
|
||||
instructions = [
|
||||
s_mov_b32(s[0], one & 0xffffffff),
|
||||
s_mov_b32(s[1], one >> 32),
|
||||
s_mov_b32(s[2], zero & 0xffffffff),
|
||||
s_mov_b32(s[3], zero >> 32),
|
||||
v_mov_b32_e32(v[0], s[0]), # numer = 1.0
|
||||
v_mov_b32_e32(v[1], s[1]),
|
||||
v_mov_b32_e32(v[2], s[2]), # denom = 0.0
|
||||
v_mov_b32_e32(v[3], s[3]),
|
||||
VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[2:3], src2=v[0:1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32))
|
||||
self.assertTrue(math.isnan(result), "Should be NaN for zero denom")
|
||||
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for zero denom")
|
||||
|
||||
def test_div_scale_f64_mixed_vcc_per_lane(self):
|
||||
"""V_DIV_SCALE_F64: some lanes need scaling, others don't.
|
||||
|
||||
This is the key test for the tan() bug - it verifies that VCC is set
|
||||
correctly for each lane independently when some lanes need scaling and
|
||||
others don't.
|
||||
"""
|
||||
import math
|
||||
# Lane 0: normal value (VCC=0), Lane 1: zero denom (VCC=1)
|
||||
# Lane 2: normal value (VCC=0), Lane 3: zero denom (VCC=1)
|
||||
normal = f2i64(2.0)
|
||||
zero = f2i64(0.0)
|
||||
instructions = [
|
||||
# Set up numer = 2.0 for all lanes
|
||||
s_mov_b32(s[0], normal & 0xffffffff),
|
||||
s_mov_b32(s[1], normal >> 32),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_mov_b32_e32(v[1], s[1]),
|
||||
# Set up denom: lane 0,2 get 2.0, lane 1,3 get 0.0
|
||||
s_mov_b32(s[2], zero & 0xffffffff),
|
||||
s_mov_b32(s[3], zero >> 32),
|
||||
v_mov_b32_e32(v[2], s[0]), # default to 2.0
|
||||
v_mov_b32_e32(v[3], s[1]),
|
||||
# Override lanes 1 and 3 with 0.0 using writelane
|
||||
v_writelane_b32(v[2], s[2], 1),
|
||||
v_writelane_b32(v[3], s[3], 1),
|
||||
v_writelane_b32(v[2], s[2], 3),
|
||||
v_writelane_b32(v[3], s[3], 3),
|
||||
VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[2:3], src2=v[0:1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=4)
|
||||
# Lanes 0,2 should have VCC=0 (normal), lanes 1,3 should have VCC=1 (zero denom)
|
||||
self.assertEqual(st.vcc & 0b0001, 0, "Lane 0 VCC should be 0")
|
||||
self.assertEqual(st.vcc & 0b0010, 0b0010, "Lane 1 VCC should be 1")
|
||||
self.assertEqual(st.vcc & 0b0100, 0, "Lane 2 VCC should be 0")
|
||||
self.assertEqual(st.vcc & 0b1000, 0b1000, "Lane 3 VCC should be 1")
|
||||
|
||||
# Check results
|
||||
for lane in [0, 2]:
|
||||
result = i642f(st.vgpr[lane][4] | (st.vgpr[lane][5] << 32))
|
||||
self.assertAlmostEqual(result, 2.0, places=10, msg=f"Lane {lane} should be 2.0")
|
||||
for lane in [1, 3]:
|
||||
result = i642f(st.vgpr[lane][4] | (st.vgpr[lane][5] << 32))
|
||||
self.assertTrue(math.isnan(result), f"Lane {lane} should be NaN")
|
||||
|
||||
|
||||
class TestDivFmasF64(unittest.TestCase):
|
||||
"""Tests for V_DIV_FMAS_F64 - scaling FMA for f64 division.
|
||||
|
||||
These tests verify that V_DIV_FMAS applies the correct scaling
|
||||
based on VCC per lane, which is essential for correct tan() results.
|
||||
"""
|
||||
|
||||
def test_div_fmas_f64_no_scale_vcc0(self):
|
||||
"""V_DIV_FMAS_F64: VCC=0 -> normal FMA, no scaling."""
|
||||
a = f2i64(2.0)
|
||||
b = f2i64(3.0)
|
||||
c = f2i64(1.0)
|
||||
instructions = [
|
||||
s_mov_b32(VCC_LO, 0),
|
||||
s_mov_b32(s[0], a & 0xffffffff),
|
||||
s_mov_b32(s[1], a >> 32),
|
||||
s_mov_b32(s[2], b & 0xffffffff),
|
||||
s_mov_b32(s[3], b >> 32),
|
||||
s_mov_b32(s[4], c & 0xffffffff),
|
||||
s_mov_b32(s[5], c >> 32),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_mov_b32_e32(v[1], s[1]),
|
||||
v_mov_b32_e32(v[2], s[2]),
|
||||
v_mov_b32_e32(v[3], s[3]),
|
||||
v_mov_b32_e32(v[4], s[4]),
|
||||
v_mov_b32_e32(v[5], s[5]),
|
||||
v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
result = i642f(st.vgpr[0][6] | (st.vgpr[0][7] << 32))
|
||||
expected = 2.0 * 3.0 + 1.0 # = 7.0
|
||||
self.assertAlmostEqual(result, expected, places=10)
|
||||
|
||||
def test_div_fmas_f64_scale_up_vcc1_large_s2(self):
|
||||
"""V_DIV_FMAS_F64: VCC=1 with S2 exponent > 1023 -> scale by 2^+128."""
|
||||
a = f2i64(1.0)
|
||||
b = f2i64(1.0)
|
||||
c = f2i64(2.0) # exponent = 1024 > 1023, so scale UP
|
||||
instructions = [
|
||||
s_mov_b32(VCC_LO, 1),
|
||||
s_mov_b32(s[0], a & 0xffffffff),
|
||||
s_mov_b32(s[1], a >> 32),
|
||||
s_mov_b32(s[2], b & 0xffffffff),
|
||||
s_mov_b32(s[3], b >> 32),
|
||||
s_mov_b32(s[4], c & 0xffffffff),
|
||||
s_mov_b32(s[5], c >> 32),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_mov_b32_e32(v[1], s[1]),
|
||||
v_mov_b32_e32(v[2], s[2]),
|
||||
v_mov_b32_e32(v[3], s[3]),
|
||||
v_mov_b32_e32(v[4], s[4]),
|
||||
v_mov_b32_e32(v[5], s[5]),
|
||||
v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
result = i642f(st.vgpr[0][6] | (st.vgpr[0][7] << 32))
|
||||
expected = (1.0 * 1.0 + 2.0) * (2.0 ** 128) # = 3.0 * 2^128
|
||||
self.assertAlmostEqual(result, expected, delta=abs(expected) * 1e-10)
|
||||
|
||||
def test_div_fmas_f64_scale_down_vcc1_small_s2(self):
|
||||
"""V_DIV_FMAS_F64: VCC=1 with S2 exponent <= 1023 -> scale by 2^-128."""
|
||||
a = f2i64(2.0)
|
||||
b = f2i64(3.0)
|
||||
c = f2i64(1.0) # exponent = 1023, so scale DOWN
|
||||
instructions = [
|
||||
s_mov_b32(VCC_LO, 1),
|
||||
s_mov_b32(s[0], a & 0xffffffff),
|
||||
s_mov_b32(s[1], a >> 32),
|
||||
s_mov_b32(s[2], b & 0xffffffff),
|
||||
s_mov_b32(s[3], b >> 32),
|
||||
s_mov_b32(s[4], c & 0xffffffff),
|
||||
s_mov_b32(s[5], c >> 32),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_mov_b32_e32(v[1], s[1]),
|
||||
v_mov_b32_e32(v[2], s[2]),
|
||||
v_mov_b32_e32(v[3], s[3]),
|
||||
v_mov_b32_e32(v[4], s[4]),
|
||||
v_mov_b32_e32(v[5], s[5]),
|
||||
v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
result = i642f(st.vgpr[0][6] | (st.vgpr[0][7] << 32))
|
||||
expected = (2.0 * 3.0 + 1.0) * (2.0 ** -128) # = 7.0 * 2^-128
|
||||
self.assertAlmostEqual(result, expected, delta=abs(expected) * 1e-10)
|
||||
|
||||
def test_div_fmas_f64_per_lane_vcc_varying(self):
|
||||
"""V_DIV_FMAS_F64: different VCC per lane applies different scaling.
|
||||
|
||||
This is the key test for the tan() bug - verifies that scaling is
|
||||
applied per-lane based on VCC bits, not uniformly.
|
||||
"""
|
||||
a = f2i64(1.0)
|
||||
b = f2i64(1.0)
|
||||
c = f2i64(1.0) # exponent = 1023, so when VCC=1 it scales DOWN
|
||||
instructions = [
|
||||
# VCC = 0b0101: lanes 0,2 scale, lanes 1,3 don't
|
||||
s_mov_b32(VCC_LO, 0b0101),
|
||||
s_mov_b32(s[0], a & 0xffffffff),
|
||||
s_mov_b32(s[1], a >> 32),
|
||||
s_mov_b32(s[2], b & 0xffffffff),
|
||||
s_mov_b32(s[3], b >> 32),
|
||||
s_mov_b32(s[4], c & 0xffffffff),
|
||||
s_mov_b32(s[5], c >> 32),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_mov_b32_e32(v[1], s[1]),
|
||||
v_mov_b32_e32(v[2], s[2]),
|
||||
v_mov_b32_e32(v[3], s[3]),
|
||||
v_mov_b32_e32(v[4], s[4]),
|
||||
v_mov_b32_e32(v[5], s[5]),
|
||||
v_div_fmas_f64(v[6:7], v[0:1], v[2:3], v[4:5]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=4)
|
||||
|
||||
scaled = (1.0 * 1.0 + 1.0) * (2.0 ** -128) # = 2.0 * 2^-128
|
||||
unscaled = 1.0 * 1.0 + 1.0 # = 2.0
|
||||
|
||||
# Lane 0: VCC=1, scale
|
||||
result0 = i642f(st.vgpr[0][6] | (st.vgpr[0][7] << 32))
|
||||
self.assertAlmostEqual(result0, scaled, delta=abs(scaled) * 1e-10, msg="Lane 0 should be scaled")
|
||||
|
||||
# Lane 1: VCC=0, no scale
|
||||
result1 = i642f(st.vgpr[1][6] | (st.vgpr[1][7] << 32))
|
||||
self.assertAlmostEqual(result1, unscaled, places=10, msg="Lane 1 should be unscaled")
|
||||
|
||||
# Lane 2: VCC=1, scale
|
||||
result2 = i642f(st.vgpr[2][6] | (st.vgpr[2][7] << 32))
|
||||
self.assertAlmostEqual(result2, scaled, delta=abs(scaled) * 1e-10, msg="Lane 2 should be scaled")
|
||||
|
||||
# Lane 3: VCC=0, no scale
|
||||
result3 = i642f(st.vgpr[3][6] | (st.vgpr[3][7] << 32))
|
||||
self.assertAlmostEqual(result3, unscaled, places=10, msg="Lane 3 should be unscaled")
|
||||
|
||||
|
||||
class TestDivScaleFmasF64Integration(unittest.TestCase):
|
||||
"""Integration tests for V_DIV_SCALE_F64 + V_DIV_FMAS_F64.
|
||||
|
||||
These tests verify the full division sequence used by tan() works
|
||||
correctly with multiple lanes having different values.
|
||||
"""
|
||||
|
||||
def test_div_scale_then_fmas_multi_lane_tan_pattern(self):
|
||||
"""Test the pattern used by tan(): DIV_SCALE sets VCC, DIV_FMAS uses it.
|
||||
|
||||
This is the exact bug scenario: tan([2.0, 3.0, 4.0]) was failing because
|
||||
VCC from DIV_SCALE was being set incorrectly for all lanes.
|
||||
"""
|
||||
import math
|
||||
# Set up values like tan() would: different values per lane
|
||||
instructions = [
|
||||
# Create per-lane values: 2.0, 3.0, 4.0, 5.0
|
||||
v_cvt_f64_i32_e32(v[0:1], v[255]), # v0:1 = f64(lane_id)
|
||||
v_add_f64(v[0:1], v[0:1], SrcEnum.POS_TWO), # numer = lane_id + 2.0
|
||||
# denom = 1.0 for all lanes (uniform)
|
||||
v_mov_b32_e32(v[2], f2i64(1.0) & 0xffffffff),
|
||||
v_mov_b32_e32(v[3], f2i64(1.0) >> 32),
|
||||
# V_DIV_SCALE_F64: sets VCC per lane
|
||||
VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4:5], sdst=VCC, src0=v[0:1], src1=v[2:3], src2=v[0:1]),
|
||||
# Copy scaled numer for FMA
|
||||
v_mov_b32_e32(v[6], v[4]),
|
||||
v_mov_b32_e32(v[7], v[5]),
|
||||
# V_DIV_FMAS_F64: uses VCC to apply scaling
|
||||
v_div_fmas_f64(v[8:9], v[6:7], v[2:3], v[4:5]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=4)
|
||||
|
||||
# All lanes should have VCC=0 (no scaling needed for normal values)
|
||||
self.assertEqual(st.vcc & 0xf, 0, "All lanes should have VCC=0 for normal values")
|
||||
|
||||
# Verify each lane has correct intermediate value
|
||||
for lane in range(4):
|
||||
expected_numer = float(lane) + 2.0
|
||||
# With VCC=0, DIV_FMAS should just do FMA with no scaling
|
||||
result = i642f(st.vgpr[lane][8] | (st.vgpr[lane][9] << 32))
|
||||
# The FMA result should be: scaled_numer * denom + scaled_numer = 2*scaled_numer
|
||||
expected = expected_numer * 1.0 + expected_numer # Simple FMA for this test setup
|
||||
self.assertAlmostEqual(result, expected, places=8,
|
||||
msg=f"Lane {lane}: expected {expected}, got {result}")
|
||||
|
||||
|
||||
class TestVOP3VOPC(unittest.TestCase):
|
||||
"""Tests for VOP3-encoded VOPC instructions (comparisons with scalar dest)."""
|
||||
|
||||
def test_v_cmp_ge_f32_e64_nan(self):
|
||||
"""V_CMP_GE_F32_E64: |NaN| >= |0.0| should be FALSE (NaN comparisons always false)."""
|
||||
from extra.assembly.amd.autogen.rdna3.ins import VOP3_SDST
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xffc00000), # NaN
|
||||
s_mov_b32(s[1], 0x00000000), # 0.0
|
||||
v_mov_b32_e32(v[5], s[0]),
|
||||
v_mov_b32_e32(v[3], s[1]),
|
||||
VOP3_SDST(VOP3Op.V_CMP_GE_F32, vdst=s[5], src0=v[5], src1=v[3], abs_=3),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.sgpr[5], 0) # NaN comparison is always FALSE
|
||||
|
||||
|
||||
class TestMin3Max3Unsigned(unittest.TestCase):
|
||||
"""Regression tests for V_MIN3/V_MAX3 with unsigned integer types.
|
||||
|
||||
The emulator's _minmax_reduce used UOp.minimum() which implements min(a,b) as
|
||||
-max(-a,-b). This is broken for unsigned types because negation (mul by -1)
|
||||
doesn't preserve ordering: for uint16, -0 = 0 but -5 = 65531, so
|
||||
max(-0, -5) = max(0, 65531) = 65531, and -65531 = 5, giving min(0,5) = 5 (wrong!).
|
||||
|
||||
Fix: use comparison-based min/max for unsigned types: min(a,b) = (a<b)?a:b
|
||||
"""
|
||||
|
||||
def test_v_min3_u16_with_zero(self):
|
||||
"""V_MIN3_U16: min3(0, 3, 5) should return 0, not a wrong value."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0), # 0
|
||||
s_mov_b32(s[1], 3), # 3
|
||||
s_mov_b32(s[2], 5), # 5
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_min3_u16(v[1], v[0], s[1], s[2]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][1] & 0xFFFF, 0)
|
||||
|
||||
def test_v_min3_u16_all_nonzero(self):
|
||||
"""V_MIN3_U16: min3(2, 5, 3) should return 2."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 2),
|
||||
s_mov_b32(s[1], 5),
|
||||
s_mov_b32(s[2], 3),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_min3_u16(v[1], v[0], s[1], s[2]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][1] & 0xFFFF, 2)
|
||||
|
||||
def test_v_min3_u32_with_zero(self):
|
||||
"""V_MIN3_U32: min3(0, 100, 50) should return 0."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0),
|
||||
s_mov_b32(s[1], 100),
|
||||
s_mov_b32(s[2], 50),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_min3_u32(v[1], v[0], s[1], s[2]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][1], 0)
|
||||
|
||||
def test_v_max3_u16_basic(self):
|
||||
"""V_MAX3_U16: max3(0, 3, 5) should return 5."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0),
|
||||
s_mov_b32(s[1], 3),
|
||||
s_mov_b32(s[2], 5),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_max3_u16(v[1], v[0], s[1], s[2]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][1] & 0xFFFF, 5)
|
||||
|
||||
def test_v_min_u16_two_operand(self):
|
||||
"""V_MIN_U16 (two operand): min(0, 5) should return 0."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0),
|
||||
s_mov_b32(s[1], 5),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
v_min_u16(v[1], v[0], s[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][1] & 0xFFFF, 0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
@@ -149,7 +149,7 @@ class TestFmaMix(unittest.TestCase):
|
||||
|
||||
def test_v_fma_mix_f32_src2_f16_lo(self):
|
||||
"""V_FMA_MIX_F32 with src2 as f16 from lo bits."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16
|
||||
f16_2 = f32_to_f16(2.0)
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f2i(1.0)),
|
||||
@@ -166,7 +166,7 @@ class TestFmaMix(unittest.TestCase):
|
||||
|
||||
def test_v_fma_mix_f32_src2_f16_hi(self):
|
||||
"""V_FMA_MIX_F32 with src2 as f16 from hi bits."""
|
||||
from extra.assembly.amd.pcode import f32_to_f16
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16
|
||||
f16_2 = f32_to_f16(2.0)
|
||||
val = (f16_2 << 16) | 0
|
||||
instructions = [
|
||||
@@ -197,9 +197,64 @@ class TestFmaMix(unittest.TestCase):
|
||||
result = i2f(st.vgpr[0][3])
|
||||
self.assertAlmostEqual(result, 7.0, places=5)
|
||||
|
||||
def test_v_fma_mix_f32_with_abs_f16_src2_lo(self):
|
||||
"""V_FMA_MIX_F32 with abs modifier on f16 src2 (lo half). Regression test for sin(1.0) bug."""
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16
|
||||
f16_neg1 = f32_to_f16(-1.0) # 0xbc00
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f2i(0.0)), # src0 = 0.0 (f32)
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
s_mov_b32(s[1], f2i(1.0)), # src1 = 1.0 (f32)
|
||||
v_mov_b32_e32(v[1], s[1]),
|
||||
s_mov_b32(s[2], f16_neg1), # src2 = -1.0 (f16 in lo)
|
||||
v_mov_b32_e32(v[2], s[2]),
|
||||
# 0*1 + abs(-1.0) = 1.0; neg_hi=4 means abs on src2, opsel_hi2=1 means src2 is f16
|
||||
VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=1, neg_hi=4),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
result = i2f(st.vgpr[0][3])
|
||||
self.assertAlmostEqual(result, 1.0, places=5)
|
||||
|
||||
def test_v_fma_mix_f32_with_neg_f16_src2_lo(self):
|
||||
"""V_FMA_MIX_F32 with neg modifier on f16 src2 (lo half)."""
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16
|
||||
f16_1 = f32_to_f16(1.0) # 0x3c00
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f2i(0.0)), # src0 = 0.0 (f32)
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
s_mov_b32(s[1], f2i(1.0)), # src1 = 1.0 (f32)
|
||||
v_mov_b32_e32(v[1], s[1]),
|
||||
s_mov_b32(s[2], f16_1), # src2 = 1.0 (f16 in lo)
|
||||
v_mov_b32_e32(v[2], s[2]),
|
||||
# 0*1 + neg(1.0) = -1.0; neg=4 means neg on src2, opsel_hi2=1 means src2 is f16
|
||||
VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=1, neg=4),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
result = i2f(st.vgpr[0][3])
|
||||
self.assertAlmostEqual(result, -1.0, places=5)
|
||||
|
||||
def test_v_fma_mix_f32_with_abs_f16_src2_hi(self):
|
||||
"""V_FMA_MIX_F32 with abs modifier on f16 src2 (hi half)."""
|
||||
from extra.assembly.amd.test.hw.helpers import f32_to_f16
|
||||
f16_neg1 = f32_to_f16(-1.0) # 0xbc00
|
||||
val = (f16_neg1 << 16) | 0 # -1.0 in hi, 0 in lo
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f2i(0.0)),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
s_mov_b32(s[1], f2i(1.0)),
|
||||
v_mov_b32_e32(v[1], s[1]),
|
||||
s_mov_b32(s[2], val),
|
||||
v_mov_b32_e32(v[2], s[2]),
|
||||
# opsel=4 selects hi half of src2; neg_hi=4 means abs on src2
|
||||
VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=4, opsel_hi=0, opsel_hi2=1, neg_hi=4),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
result = i2f(st.vgpr[0][3])
|
||||
self.assertAlmostEqual(result, 1.0, places=5)
|
||||
|
||||
def test_v_fma_mixlo_f16(self):
|
||||
"""V_FMA_MIXLO_F16 writes to low 16 bits of destination."""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f2i(2.0)),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
@@ -219,7 +274,7 @@ class TestFmaMix(unittest.TestCase):
|
||||
|
||||
def test_v_fma_mixlo_f16_all_f32_sources(self):
|
||||
"""V_FMA_MIXLO_F16 with all f32 sources."""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
instructions = [
|
||||
s_mov_b32(s[0], f2i(1.0)),
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
@@ -237,7 +292,7 @@ class TestFmaMix(unittest.TestCase):
|
||||
|
||||
def test_v_fma_mixlo_f16_sin_case(self):
|
||||
"""V_FMA_MIXLO_F16 case from sin kernel."""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x3f800000), # f32 1.0
|
||||
v_mov_b32_e32(v[3], s[0]),
|
||||
@@ -259,7 +314,7 @@ class TestVOP3P(unittest.TestCase):
|
||||
|
||||
def test_v_pk_add_f16_basic(self):
|
||||
"""V_PK_ADD_F16 adds two packed f16 values."""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x40003c00), # hi=2.0, lo=1.0
|
||||
s_mov_b32(s[1], 0x44004200), # hi=4.0, lo=3.0
|
||||
@@ -276,7 +331,7 @@ class TestVOP3P(unittest.TestCase):
|
||||
|
||||
def test_v_pk_mul_f16_basic(self):
|
||||
"""V_PK_MUL_F16 multiplies two packed f16 values."""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x42004000), # hi=3.0, lo=2.0
|
||||
s_mov_b32(s[1], 0x45004400), # hi=5.0, lo=4.0
|
||||
@@ -293,7 +348,7 @@ class TestVOP3P(unittest.TestCase):
|
||||
|
||||
def test_v_pk_fma_f16_basic(self):
|
||||
"""V_PK_FMA_F16: D = A * B + C for packed f16."""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x42004000), # A: hi=3.0, lo=2.0
|
||||
s_mov_b32(s[1], 0x45004400), # B: hi=5.0, lo=4.0
|
||||
@@ -315,7 +370,7 @@ class TestVOP3P(unittest.TestCase):
|
||||
Inline constants for VOP3P are f16 values in the low 16 bits only.
|
||||
hi half of inline constant is 0, so hi result = v0.hi + 0 = 1.0.
|
||||
"""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x3c003c00), # packed f16: hi=1.0, lo=1.0
|
||||
v_mov_b32_e32(v[0], s[0]),
|
||||
@@ -333,7 +388,7 @@ class TestVOP3P(unittest.TestCase):
|
||||
"""V_PK_MUL_F16 with inline constant POS_TWO (2.0).
|
||||
Inline constant has value only in low 16 bits, hi is 0.
|
||||
"""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
# v0 = packed (3.0, 4.0), multiply by POS_TWO
|
||||
# lo = 3.0 * 2.0 = 6.0, hi = 4.0 * 0.0 = 0.0 (inline const hi is 0)
|
||||
instructions = [
|
||||
@@ -498,7 +553,7 @@ class TestPackedMixedSigns(unittest.TestCase):
|
||||
|
||||
def test_pk_add_f16_mixed_signs(self):
|
||||
"""V_PK_ADD_F16 with mixed positive/negative values."""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xc0003c00), # packed: hi=-2.0, lo=1.0
|
||||
s_mov_b32(s[1], 0x3c003c00), # packed: hi=1.0, lo=1.0
|
||||
@@ -515,7 +570,7 @@ class TestPackedMixedSigns(unittest.TestCase):
|
||||
|
||||
def test_pk_mul_f16_zero(self):
|
||||
"""V_PK_MUL_F16 with zero."""
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
from extra.assembly.amd.test.hw.helpers import _f16
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0x40004000), # packed: 2.0, 2.0
|
||||
s_mov_b32(s[1], 0x00000000), # packed: 0.0, 0.0
|
||||
|
||||
@@ -324,6 +324,29 @@ class TestCmpInt(unittest.TestCase):
|
||||
st = run_program(instructions, n_lanes=4)
|
||||
self.assertEqual(st.vcc & 0xf, 0xf, "All lanes should match")
|
||||
|
||||
def test_v_cmp_ne_u32_with_zero(self):
|
||||
"""V_CMP_NE_U32: compare with zero, used for int->bool cast."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[1], 0),
|
||||
v_cmp_eq_u32_e32(1, v[255]), # vcc = (lane == 1)
|
||||
v_cndmask_b32_e64(v[1], v[1], 1, VCC_LO), # v1[lane1] = 1
|
||||
v_cmp_ne_u32_e32(0, v[1]), # vcc = (0 != v1)
|
||||
v_cndmask_b32_e64(v[0], 0, 1, VCC_LO), # v0 = vcc ? 1 : 0
|
||||
]
|
||||
st = run_program(instructions, n_lanes=2)
|
||||
self.assertEqual(st.vgpr[0][0], 0, "lane 0: 0 != 0 should be false")
|
||||
self.assertEqual(st.vgpr[1][0], 1, "lane 1: 0 != 1 should be true")
|
||||
self.assertEqual(st.vcc & 0x3, 0x2, "VCC should be 0b10")
|
||||
|
||||
def test_v_cmp_ne_u32_all_nonzero(self):
|
||||
"""V_CMP_NE_U32: all lanes have nonzero values."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[1], 5),
|
||||
v_cmp_ne_u32_e32(0, v[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=4)
|
||||
self.assertEqual(st.vcc & 0xf, 0xf, "All lanes should be != 0")
|
||||
|
||||
def test_cmp_eq_u16_opsel_lo_lo(self):
|
||||
"""V_CMP_EQ_U16 comparing lo halves."""
|
||||
instructions = [
|
||||
@@ -448,6 +471,242 @@ class TestCmpFloat(unittest.TestCase):
|
||||
self.assertEqual((st.vcc >> 1) & 1, 0, "Lane 1: expected vcc=0 (2.0 < 1.0)")
|
||||
|
||||
|
||||
class TestVOP3VOPCModifiers(unittest.TestCase):
|
||||
"""Tests for VOP3 VOPC with abs/neg modifiers."""
|
||||
|
||||
def test_v_cmp_ge_f32_abs_both(self):
|
||||
"""v_cmp_ge_f32 with abs on both sources: abs(0.0) >= abs(-1.0) = false.
|
||||
|
||||
Regression test: int16 mod operation uses v_cmp_ge_f32 with abs modifiers.
|
||||
"""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 0.0),
|
||||
v_mov_b32_e32(v[1], -1.0),
|
||||
# abs=0b11 means abs(src0) and abs(src1)
|
||||
v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vcc & 1, 0, "abs(0.0) >= abs(-1.0) should be false")
|
||||
|
||||
def test_v_cmp_ge_f32_abs_negative_divisor(self):
|
||||
"""v_cmp_ge_f32 with abs: remainder check for negative divisor.
|
||||
|
||||
Tests the exact comparison used in int16 mod: abs(rem_f) >= abs(div_f).
|
||||
For 1 % -1: rem_f = 0.0, div_f = -1.0, so abs(0.0) >= abs(-1.0) = false.
|
||||
"""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 0.0), # remainder as float
|
||||
v_mov_b32_e32(v[1], -1.0), # divisor as float
|
||||
v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vcc & 1, 0, "abs(0.0) >= abs(-1.0) should be false")
|
||||
|
||||
def test_v_cmp_ge_f32_abs_small_remainder(self):
|
||||
"""v_cmp_ge_f32 with abs: abs(-0.5) >= abs(-3.0) = false."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], -0.5),
|
||||
v_mov_b32_e32(v[1], -3.0),
|
||||
v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vcc & 1, 0, "abs(-0.5) >= abs(-3.0) should be false")
|
||||
|
||||
def test_v_cmp_ge_f32_abs_equal(self):
|
||||
"""v_cmp_ge_f32 with abs: abs(-1.0) >= abs(1.0) = true."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], -1.0),
|
||||
v_mov_b32_e32(v[1], 1.0),
|
||||
v_cmp_ge_f32_e64(VCC_LO, v[0], v[1], abs=0b11),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vcc & 1, 1, "abs(-1.0) >= abs(1.0) should be true")
|
||||
|
||||
|
||||
class TestVOP3VOPC64Bit(unittest.TestCase):
|
||||
"""Tests for VOP3 VOPC with 64-bit operands."""
|
||||
|
||||
def test_v_cmp_lt_f64_basic(self):
|
||||
"""v_cmp_lt_f64: 0.0 < 1.0 = true."""
|
||||
zero_f64 = f2i64(0.0)
|
||||
one_f64 = f2i64(1.0)
|
||||
instructions = [
|
||||
s_mov_b32(s[0], zero_f64 & 0xffffffff),
|
||||
s_mov_b32(s[1], zero_f64 >> 32),
|
||||
s_mov_b32(s[2], one_f64 & 0xffffffff),
|
||||
s_mov_b32(s[3], one_f64 >> 32),
|
||||
v_cmp_lt_f64_e64(VCC_LO, s[0:1], s[2:3]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vcc & 1, 1, "0.0 < 1.0 should be true")
|
||||
|
||||
def test_v_cmp_lt_f64_negative(self):
|
||||
"""v_cmp_lt_f64: -1.0 < 0.0 = true."""
|
||||
neg_one_f64 = f2i64(-1.0)
|
||||
zero_f64 = f2i64(0.0)
|
||||
instructions = [
|
||||
s_mov_b32(s[0], neg_one_f64 & 0xffffffff),
|
||||
s_mov_b32(s[1], neg_one_f64 >> 32),
|
||||
s_mov_b32(s[2], zero_f64 & 0xffffffff),
|
||||
s_mov_b32(s[3], zero_f64 >> 32),
|
||||
v_cmp_lt_f64_e64(VCC_LO, s[0:1], s[2:3]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vcc & 1, 1, "-1.0 < 0.0 should be true")
|
||||
|
||||
def test_v_cmp_lt_i64_signed(self):
|
||||
"""v_cmp_lt_i64: 0 < -1 (signed) = false."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0),
|
||||
s_mov_b32(s[1], 0), # s[0:1] = 0
|
||||
s_mov_b32(s[2], 0xffffffff),
|
||||
s_mov_b32(s[3], 0xffffffff), # s[2:3] = -1
|
||||
v_cmp_lt_i64_e64(VCC_LO, s[0:1], s[2:3]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vcc & 1, 0, "0 < -1 (signed) should be false")
|
||||
|
||||
def test_v_cmp_lt_u64_unsigned(self):
|
||||
"""v_cmp_lt_u64: 0 < 0xFFFFFFFFFFFFFFFF (unsigned) = true."""
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0),
|
||||
s_mov_b32(s[1], 0), # s[0:1] = 0
|
||||
s_mov_b32(s[2], 0xffffffff),
|
||||
s_mov_b32(s[3], 0xffffffff), # s[2:3] = max uint64
|
||||
v_cmp_lt_u64_e64(VCC_LO, s[0:1], s[2:3]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vcc & 1, 1, "0 < max_uint64 should be true")
|
||||
|
||||
|
||||
class TestVOPCF64(unittest.TestCase):
|
||||
"""Tests for VOPC (E32 encoding) with 64-bit float operands. Regression test for f64 compare bug."""
|
||||
|
||||
def test_v_cmp_lt_f64_e32_true(self):
|
||||
"""v_cmp_lt_f64_e32: 2.0 < 3.0 = true."""
|
||||
lo0, hi0 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32
|
||||
lo1, hi1 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32
|
||||
instructions = [
|
||||
s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0),
|
||||
s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1),
|
||||
v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]),
|
||||
v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]),
|
||||
v_cmp_lt_f64_e32(v[0:1], v[2:3]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vcc & 1, 1, "2.0 < 3.0 should be true")
|
||||
|
||||
def test_v_cmp_lt_f64_e32_false(self):
|
||||
"""v_cmp_lt_f64_e32: 3.0 < 2.0 = false."""
|
||||
lo0, hi0 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32
|
||||
lo1, hi1 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32
|
||||
instructions = [
|
||||
s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0),
|
||||
s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1),
|
||||
v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]),
|
||||
v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]),
|
||||
v_cmp_lt_f64_e32(v[0:1], v[2:3]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vcc & 1, 0, "3.0 < 2.0 should be false")
|
||||
|
||||
def test_v_cmp_nlt_f64_e32_true(self):
|
||||
"""v_cmp_nlt_f64_e32: !(3.0 < 2.0) = true."""
|
||||
lo0, hi0 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32
|
||||
lo1, hi1 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32
|
||||
instructions = [
|
||||
s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0),
|
||||
s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1),
|
||||
v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]),
|
||||
v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]),
|
||||
v_cmp_nlt_f64_e32(v[0:1], v[2:3]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vcc & 1, 1, "!(3.0 < 2.0) should be true")
|
||||
|
||||
def test_v_cmp_nlt_f64_e32_false(self):
|
||||
"""v_cmp_nlt_f64_e32: !(2.0 < 3.0) = false."""
|
||||
lo0, hi0 = f2i64(2.0) & 0xffffffff, f2i64(2.0) >> 32
|
||||
lo1, hi1 = f2i64(3.0) & 0xffffffff, f2i64(3.0) >> 32
|
||||
instructions = [
|
||||
s_mov_b32(s[0], lo0), s_mov_b32(s[1], hi0),
|
||||
s_mov_b32(s[2], lo1), s_mov_b32(s[3], hi1),
|
||||
v_mov_b32_e32(v[0], s[0]), v_mov_b32_e32(v[1], s[1]),
|
||||
v_mov_b32_e32(v[2], s[2]), v_mov_b32_e32(v[3], s[3]),
|
||||
v_cmp_nlt_f64_e32(v[0:1], v[2:3]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vcc & 1, 0, "!(2.0 < 3.0) should be false")
|
||||
|
||||
|
||||
class TestCmpxExec(unittest.TestCase):
|
||||
"""Tests for V_CMPX instructions that modify EXEC mask."""
|
||||
|
||||
def test_v_cmpx_ngt_f32_e64_all_true(self):
|
||||
"""V_CMPX_NGT_F32_E64: all lanes pass (literal <= all values)."""
|
||||
# 131072.0 = 0x48000000
|
||||
# All values > 131072, so !(131072 > val) = true for all
|
||||
instructions = [
|
||||
s_mov_b32(EXEC_LO, 0x7), # 3 lanes active
|
||||
v_mov_b32_e32(v[0], f2i(200000.0)), # lane 0
|
||||
v_cmp_eq_u32_e32(1, v[255]),
|
||||
v_cndmask_b32_e64(v[1], v[0], f2i(300000.0), VCC_LO), # lane 1
|
||||
v_cmp_eq_u32_e32(2, v[255]),
|
||||
v_cndmask_b32_e64(v[1], v[1], f2i(400000.0), VCC_LO), # lane 2
|
||||
# Now v[1] has: lane0=200000, lane1=300000, lane2=400000
|
||||
# Compare: !(131072.0 > v[1]) i.e., 131072.0 <= v[1]
|
||||
v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=3)
|
||||
# All values > 131072, so all lanes should remain active
|
||||
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x7, "All 3 lanes should remain active")
|
||||
|
||||
def test_v_cmpx_ngt_f32_e64_some_false(self):
|
||||
"""V_CMPX_NGT_F32_E64: some lanes fail (literal > some values)."""
|
||||
instructions = [
|
||||
s_mov_b32(EXEC_LO, 0x7), # 3 lanes active
|
||||
v_mov_b32_e32(v[0], f2i(100000.0)), # lane 0: 131072 > 100000 = true, so !(true) = false
|
||||
v_cmp_eq_u32_e32(1, v[255]),
|
||||
v_cndmask_b32_e64(v[1], v[0], f2i(200000.0), VCC_LO), # lane 1: 131072 > 200000 = false, so !(false) = true
|
||||
v_cmp_eq_u32_e32(2, v[255]),
|
||||
v_cndmask_b32_e64(v[1], v[1], f2i(150000.0), VCC_LO), # lane 2: 131072 > 150000 = false, so !(false) = true
|
||||
v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=3)
|
||||
# lane 0: fail (100000 < 131072), lanes 1,2: pass
|
||||
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x6, "Lanes 1,2 should be active, lane 0 inactive")
|
||||
|
||||
def test_v_cmpx_ngt_f32_e64_all_false(self):
|
||||
"""V_CMPX_NGT_F32_E64: all lanes fail (literal > all values)."""
|
||||
instructions = [
|
||||
s_mov_b32(EXEC_LO, 0x7), # 3 lanes active
|
||||
v_mov_b32_e32(v[0], f2i(100.0)), # all lanes have 100.0
|
||||
# 131072 > 100 = true, so !(true) = false for all
|
||||
v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[0]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=3)
|
||||
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x0, "All lanes should be inactive")
|
||||
|
||||
def test_v_cmpx_ngt_f32_e64_large_values(self):
|
||||
"""V_CMPX_NGT_F32_E64: test with values that trigger Payne-Hanek in sin().
|
||||
|
||||
This is a regression test for the sin(859240.0) bug.
|
||||
Values 859240, 1000000, 100594688 should all pass !(131072 > val).
|
||||
"""
|
||||
instructions = [
|
||||
s_mov_b32(EXEC_LO, 0x7), # 3 lanes active
|
||||
v_mov_b32_e32(v[0], f2i(859240.0)), # lane 0
|
||||
v_cmp_eq_u32_e32(1, v[255]),
|
||||
v_cndmask_b32_e64(v[1], v[0], f2i(1000000.0), VCC_LO), # lane 1
|
||||
v_cmp_eq_u32_e32(2, v[255]),
|
||||
v_cndmask_b32_e64(v[1], v[1], f2i(100594688.0), VCC_LO), # lane 2
|
||||
v_cmpx_ngt_f32_e64(EXEC_LO, f2i(131072.0), v[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=3)
|
||||
# All values > 131072, so !(131072 > val) = true for all
|
||||
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0x7, 0x7, "All 3 lanes should remain active")
|
||||
|
||||
|
||||
class TestVCCBehavior(unittest.TestCase):
|
||||
"""Tests for VCC condition code behavior."""
|
||||
|
||||
@@ -472,5 +731,101 @@ class TestVCCBehavior(unittest.TestCase):
|
||||
self.assertEqual(st.vcc >> 16, 0x0000, "Lanes 16-31 should be false")
|
||||
|
||||
|
||||
class TestCmpxPartialWavefront(unittest.TestCase):
|
||||
"""Tests for V_CMPX with partial wavefronts (fewer than 32 active lanes).
|
||||
|
||||
Regression tests for bug where v_cmpx incorrectly set EXEC bits for inactive
|
||||
lanes when the wavefront had fewer than 32 lanes. This caused garbage data
|
||||
from uninitialized lanes to corrupt memory writes.
|
||||
"""
|
||||
|
||||
def test_v_cmpx_eq_u32_partial_wave_3_lanes(self):
|
||||
"""V_CMPX_EQ_U32 with 3 active lanes should only affect those 3 lanes.
|
||||
|
||||
With n_lanes=3, initial EXEC=0x7. After v_cmpx comparing lane_id == 1,
|
||||
only lane 1 should pass, so EXEC should become 0x2 (not have bits 3-31 set).
|
||||
"""
|
||||
instructions = [
|
||||
v_cmpx_eq_u32_e32(1, v[255]), # EXEC = lanes where lane_id == 1
|
||||
]
|
||||
st = run_program(instructions, n_lanes=3)
|
||||
# Only lane 1 should be active (bit 1 set)
|
||||
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x2,
|
||||
"Only lane 1 should be active after v_cmpx_eq_u32 with 3 lanes")
|
||||
|
||||
def test_v_cmpx_eq_u32_partial_wave_5_lanes(self):
|
||||
"""V_CMPX_EQ_U32 with 5 active lanes."""
|
||||
instructions = [
|
||||
v_cmpx_eq_u32_e32(3, v[255]), # EXEC = lanes where lane_id == 3
|
||||
]
|
||||
st = run_program(instructions, n_lanes=5)
|
||||
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x8,
|
||||
"Only lane 3 should be active after v_cmpx_eq_u32 with 5 lanes")
|
||||
|
||||
def test_v_cmpx_lt_u32_partial_wave(self):
|
||||
"""V_CMPX_LT_U32 with partial wavefront."""
|
||||
# VOPC: src0 < vsrc1, so we need v_cmpx_gt_u32 to get lane_id < 2
|
||||
instructions = [
|
||||
v_cmpx_gt_u32_e32(2, v[255]), # EXEC = lanes where 2 > lane_id (i.e., lane_id < 2)
|
||||
]
|
||||
st = run_program(instructions, n_lanes=4)
|
||||
# Lanes 0,1 should be active (bits 0,1 set = 0x3)
|
||||
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x3,
|
||||
"Only lanes 0,1 should be active after v_cmpx_gt_u32(2, lane_id) with 4 lanes")
|
||||
|
||||
def test_v_cmpx_ge_u32_partial_wave(self):
|
||||
"""V_CMPX_GE_U32 with partial wavefront."""
|
||||
# VOPC: src0 >= vsrc1, so v_cmpx_le_u32(1, lane_id) gives lane_id >= 2? No.
|
||||
# v_cmpx_le_u32(src0, vsrc1) = src0 <= vsrc1 = 1 <= lane_id
|
||||
instructions = [
|
||||
v_cmpx_le_u32_e32(2, v[255]), # EXEC = lanes where 2 <= lane_id (i.e., lane_id >= 2)
|
||||
]
|
||||
st = run_program(instructions, n_lanes=4)
|
||||
# Lanes 2,3 should be active (bits 2,3 set = 0xC)
|
||||
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0xC,
|
||||
"Only lanes 2,3 should be active after v_cmpx_le_u32(2, lane_id) with 4 lanes")
|
||||
|
||||
def test_v_cmpx_ne_u32_partial_wave_all_pass(self):
|
||||
"""V_CMPX_NE_U32 where all active lanes pass."""
|
||||
instructions = [
|
||||
v_cmpx_ne_u32_e32(99, v[255]), # EXEC = lanes where lane_id != 99
|
||||
]
|
||||
st = run_program(instructions, n_lanes=3)
|
||||
# All 3 lanes should remain active (bits 0,1,2 set = 0x7)
|
||||
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x7,
|
||||
"All 3 lanes should remain active when all pass")
|
||||
|
||||
def test_v_cmpx_eq_u32_partial_wave_none_pass(self):
|
||||
"""V_CMPX_EQ_U32 where no active lanes pass."""
|
||||
instructions = [
|
||||
v_cmpx_eq_u32_e32(99, v[255]), # EXEC = lanes where lane_id == 99
|
||||
]
|
||||
st = run_program(instructions, n_lanes=3)
|
||||
# No lanes should be active
|
||||
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x0,
|
||||
"No lanes should be active when none pass")
|
||||
|
||||
def test_v_cmpx_f32_partial_wave(self):
|
||||
"""V_CMPX_GT_F32 with partial wavefront - float comparison."""
|
||||
instructions = [
|
||||
v_cvt_f32_u32_e32(v[0], v[255]), # v[0] = float(lane_id)
|
||||
v_mov_b32_e32(v[1], f2i(0.5)), # v[1] = 0.5
|
||||
v_cmpx_gt_f32_e32(v[0], v[1]), # EXEC = lanes where v[0] > 0.5
|
||||
]
|
||||
st = run_program(instructions, n_lanes=4)
|
||||
# Lanes 1,2,3 have values > 0.5, lane 0 has 0.0
|
||||
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0xE,
|
||||
"Lanes 1,2,3 should be active (float > 0.5)")
|
||||
|
||||
def test_v_cmpx_e64_partial_wave(self):
|
||||
"""V_CMPX_EQ_U32_E64 (VOP3 encoding) with partial wavefront."""
|
||||
instructions = [
|
||||
v_cmpx_eq_u32_e64(EXEC_LO, v[255], 2), # EXEC = lanes where lane_id == 2
|
||||
]
|
||||
st = run_program(instructions, n_lanes=4)
|
||||
self.assertEqual(st.sgpr[EXEC_LO.offset] & 0xFFFFFFFF, 0x4,
|
||||
"Only lane 2 should be active after v_cmpx_eq_u32_e64")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
161
extra/assembly/amd/test/hw/test_vopd.py
Normal file
161
extra/assembly/amd/test/hw/test_vopd.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""Tests for VOPD instructions - dual-issue vector operations.
|
||||
|
||||
VOPD executes two operations simultaneously. Key behavior:
|
||||
- Both ops read their sources BEFORE either writes (dual-issue semantics)
|
||||
- This means if X writes to a register that Y reads, Y sees the OLD value
|
||||
- Op X can use ops 0-15 (FMAC, MUL, ADD, MOV, etc.)
|
||||
- Op Y can use ops 0-18 (includes ADD_NC_U32, LSHLREV, AND)
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import run_program, run_program_emu, run_program_hw, compare_wave_states, \
|
||||
v, s, v_mov_b32_e32, s_mov_b32
|
||||
from extra.assembly.amd.autogen.rdna3.ins import VOPD, VOPD_LIT, VOPDOp
|
||||
|
||||
class TestVOPDBasic(unittest.TestCase):
|
||||
"""Basic VOPD functionality tests."""
|
||||
|
||||
def test_vopd_dual_mov(self):
|
||||
"""VOPD with two MOV operations to different registers."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 0x12345678),
|
||||
v_mov_b32_e32(v[1], 0xDEADBEEF),
|
||||
# X: v[2] = v[0], Y: v[3] = v[1]
|
||||
VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[1], v[0], v[0]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][2], 0x12345678)
|
||||
self.assertEqual(st.vgpr[0][3], 0xDEADBEEF)
|
||||
|
||||
def test_vopd_mov_and_add(self):
|
||||
"""VOPD with MOV (X) and ADD_NC_U32 (Y) - ADD_NC_U32 can only be Y op."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 10),
|
||||
v_mov_b32_e32(v[1], 5),
|
||||
# X: v[2] = 100 (literal), Y: v[3] = v[0] + v[1] = 15
|
||||
VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[3], 100, v[0], v[0], v[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertEqual(st.vgpr[0][2], 100)
|
||||
self.assertEqual(st.vgpr[0][3], 15)
|
||||
|
||||
|
||||
class TestVOPDReadBeforeWrite(unittest.TestCase):
|
||||
"""Tests for VOPD dual-issue read-before-write semantics.
|
||||
|
||||
In VOPD, both X and Y operations read their sources BEFORE either writes.
|
||||
This is critical when X's destination is Y's source.
|
||||
"""
|
||||
|
||||
def test_vopd_x_writes_y_reads_same_reg(self):
|
||||
"""VOPD where X writes to a register that Y reads.
|
||||
|
||||
X: v[2] = 0 (overwrites v[2])
|
||||
Y: v[1] = v[2] + v[0] (srcy0=v[2], vsrcy1=v[0])
|
||||
|
||||
If reads happen before writes: v[1] = OLD_v[2] + v[0] = 0xFFFFFFFF + 1 = 0
|
||||
If writes happen before reads: v[1] = 0 + v[0] = 0 + 1 = 1
|
||||
|
||||
Hardware does reads-before-writes, so v[1] should be 0.
|
||||
"""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 1), # v[0] = 1
|
||||
v_mov_b32_e32(v[1], 0x99999999), # v[1] = placeholder (will be overwritten)
|
||||
v_mov_b32_e32(v[2], 0xFFFFFFFF), # v[2] = 0xFFFFFFFF
|
||||
# X: v[2] = 0 (literal), srcx0=0, vsrcx1=v[0] (unused for MOV)
|
||||
# Y: v[1] = srcy0 + vsrcy1 = v[2] + v[0] (should read OLD v[2] = 0xFFFFFFFF)
|
||||
# vdsty encoding: (vdsty << 1) | ((vdstx & 1) ^ 1) where vdsty field = 0, vdstx = v[2]
|
||||
# So vdsty_reg = (0 << 1) | ((2 & 1) ^ 1) = 0 | 1 = 1 = v[1]
|
||||
VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 0, v[2], v[0], v[0]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
# X should have written 0 to v[2]
|
||||
self.assertEqual(st.vgpr[0][2], 0, "X should write 0 to v[2]")
|
||||
# Y should have read OLD v[2] (0xFFFFFFFF) and added v[0] (1)
|
||||
# 0xFFFFFFFF + 1 = 0 (wrap around)
|
||||
self.assertEqual(st.vgpr[0][1], 0, "Y should read OLD v[2]=0xFFFFFFFF, compute 0xFFFFFFFF+1=0")
|
||||
|
||||
def test_vopd_x_writes_y_reads_same_reg_v2(self):
|
||||
"""VOPD where X writes to a register that Y reads - cleaner test case.
|
||||
|
||||
X: v[2] = 0 (MOV)
|
||||
Y: v[1] = v[2] + v[2] (ADD_NC_U32 with both sources from v[2])
|
||||
|
||||
If reads happen before writes: v[1] = OLD_v[2] + OLD_v[2] = 100 + 100 = 200
|
||||
If writes happen before reads: v[1] = 0 + 0 = 0
|
||||
|
||||
Hardware does reads-before-writes, so v[1] should be 200.
|
||||
"""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 0x88888888), # v[0] = unused placeholder
|
||||
v_mov_b32_e32(v[1], 0x99999999), # v[1] = placeholder (will be overwritten)
|
||||
v_mov_b32_e32(v[2], 100), # v[2] = 100
|
||||
# X: v[2] = 0 (literal)
|
||||
# Y: v[1] = srcy0 + vsrcy1 = v[2] + v[2] (should read OLD v[2] = 100)
|
||||
VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 0, v[2], v[0], v[2]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
# X should have written 0 to v[2]
|
||||
self.assertEqual(st.vgpr[0][2], 0, "X should write 0 to v[2]")
|
||||
# Y should have read OLD v[2] (100) twice and added them
|
||||
self.assertEqual(st.vgpr[0][1], 200, "Y should read OLD v[2]=100 twice, compute 100+100=200")
|
||||
|
||||
|
||||
class TestVOPDLiterals(unittest.TestCase):
|
||||
"""Tests for VOPD instructions that use SIMM32 literals (FMAAK, FMAMK)."""
|
||||
|
||||
def test_vopd_fmaak_f32(self):
|
||||
"""VOPD V_DUAL_FMAAK_F32: D = S0 * S1 + SIMM32 (literal addend).
|
||||
|
||||
Tests that the 32-bit literal (SIMM32) is correctly passed to the instruction.
|
||||
fma(2.0, 3.0, 10.0) = 2*3 + 10 = 16.0
|
||||
"""
|
||||
from extra.assembly.amd.test.hw.helpers import f2i, i2f
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], f2i(2.0)), # v[0] = 2.0
|
||||
v_mov_b32_e32(v[1], f2i(3.0)), # v[1] = 3.0
|
||||
# VOPD args: opx, opy, vdstx, vdsty, srcx0, srcy0, vsrcx1, vsrcy1
|
||||
# X: v[2] = fma(srcx0, vsrcx1, SIMM32) = v[0]*v[1]+10.0 = 2*3+10 = 16
|
||||
# Y: v[3] = srcy0 (MOV) = v[0] = 2.0
|
||||
VOPD_LIT(VOPDOp.V_DUAL_FMAAK_F32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[0], v[1], v[0], literal=f2i(10.0)),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 16.0, places=5, msg="fma(2.0, 3.0, 10.0) should be 16.0")
|
||||
|
||||
def test_vopd_fmamk_f32(self):
|
||||
"""VOPD V_DUAL_FMAMK_F32: D = S0 * SIMM32 + S1 (literal multiplier).
|
||||
|
||||
Tests that the 32-bit literal (SIMM32) is correctly used as the multiplier.
|
||||
fma(2.0, 5.0, 3.0) = 2*5 + 3 = 13.0
|
||||
"""
|
||||
from extra.assembly.amd.test.hw.helpers import f2i, i2f
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], f2i(2.0)), # v[0] = 2.0
|
||||
v_mov_b32_e32(v[1], f2i(3.0)), # v[1] = 3.0
|
||||
# X: v[2] = fma(srcx0, SIMM32, vsrcx1) = v[0]*5.0+v[1] = 2*5+3 = 13
|
||||
# Y: v[3] = srcy0 (MOV) = v[0] = 2.0
|
||||
VOPD_LIT(VOPDOp.V_DUAL_FMAMK_F32, VOPDOp.V_DUAL_MOV_B32, v[2], v[3], v[0], v[0], v[1], v[0], literal=f2i(5.0)),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=1)
|
||||
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 13.0, places=5, msg="fma(2.0, 5.0, 3.0) should be 13.0")
|
||||
|
||||
|
||||
class TestVOPDMultilane(unittest.TestCase):
|
||||
"""Tests for VOPD with multiple lanes."""
|
||||
|
||||
def test_vopd_multilane_mov_add(self):
|
||||
"""VOPD MOV and ADD with multiple active lanes - no register conflict."""
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 5),
|
||||
v_mov_b32_e32(v[1], 10),
|
||||
# X: v[2] = 100 (constant), Y: v[1] = v[0] + v[1] = 5 + 10 = 15
|
||||
# vdsty_reg = (vdsty << 1) | ((vdstx.offset & 1) ^ 1) = (0 << 1) | ((258 & 1) ^ 1) = 0 | 1 = 1
|
||||
VOPD(VOPDOp.V_DUAL_MOV_B32, VOPDOp.V_DUAL_ADD_NC_U32, v[2], v[0], 100, v[0], v[2], v[1]),
|
||||
]
|
||||
st = run_program(instructions, n_lanes=4)
|
||||
for lane in range(4):
|
||||
self.assertEqual(st.vgpr[lane][2], 100, f"Lane {lane}: v[2] should be 100")
|
||||
self.assertEqual(st.vgpr[lane][1], 15, f"Lane {lane}: v[1] should be 15 (5+10)")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user