#!/usr/bin/env python3
|
|
"""Regression tests for the RDNA3 emulator instruction execution.
|
|
Uses run_asm() with memory output, so tests can run on both emulator and real hardware.
|
|
|
|
Set USE_HW=1 to run on both emulator and real hardware, comparing results.
|
|
"""
|
|
|
|
import ctypes, unittest, os, struct
|
|
from extra.assembly.amd.autogen.rdna3 import *
|
|
from extra.assembly.amd.dsl import RawImm
|
|
from extra.assembly.amd.emu import WaveState, run_asm, set_valid_mem_ranges
|
|
from extra.assembly.amd.pcode import _i32, _f32
|
|
|
|
VCC = SrcEnum.VCC_LO # For VOP3SD sdst field
|
|
USE_HW = os.environ.get("USE_HW", "0") == "1"
|
|
# Tolerance for float comparisons (in ULPs or absolute)
|
|
FLOAT_TOLERANCE = 1e-5
|
|
|
|
# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc
|
|
# Each VGPR store writes 32 lanes (128 bytes), so vgpr[i] is at offset i*128
|
|
N_VGPRS, N_SGPRS, WAVE_SIZE = 16, 16, 32
|
|
VGPR_BYTES = N_VGPRS * WAVE_SIZE * 4 # 16 regs * 32 lanes * 4 bytes = 2048
|
|
SGPR_BYTES = N_SGPRS * 4 # 16 regs * 4 bytes = 64
|
|
OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 8 # + vcc + scc
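
# Worked example of the layout arithmetic above (illustrative sanity check only):
# v[3] for lane 7 lands at 3*128 + 7*4 = 412, s[5] at VGPR_BYTES + 5*4 = 2068,
# VCC at byte 2112 and SCC at byte 2116, for 2120 output bytes in total.
assert VGPR_BYTES == 2048 and SGPR_BYTES == 64 and OUT_BYTES == 2120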
|
|
|
|
def f2i(f: float) -> int: return _i32(f)
|
|
def i2f(i: int) -> float: return _f32(i)
|
|
def f2i64(f: float) -> int: return struct.unpack('<Q', struct.pack('<d', f))[0]
|
|
def i642f(i: int) -> float: return struct.unpack('<d', struct.pack('<Q', i))[0]
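
# Quick illustration of the bit-cast helpers (standard IEEE-754 encodings): f2i(1.0) is
# 0x3f800000 for f32, and the f64 pair round-trips exactly:
assert f2i64(1.0) == 0x3ff0000000000000 and i642f(0x3ff0000000000000) == 1.0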
|
|
|
|
def assemble(instructions: list) -> bytes:
|
|
return b''.join(inst.to_bytes() for inst in instructions)
|
|
|
|
def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]:
|
|
"""Generate prologue and epilogue instructions for state capture."""
|
|
# Prologue: save s[0:1] and v[0] before test clobbers them
|
|
# Use s[80:81] for args pointer (safe range, avoiding VCC=106-107 and staying under 100)
|
|
prologue = [
|
|
s_mov_b32(s[80], s[0]),
|
|
s_mov_b32(s[81], s[1]),
|
|
v_mov_b32_e32(v[255], v[0]),
|
|
]
|
|
# Zero out test registers (v0-v15, s0-s15, vcc) so emu and hw start from same state
|
|
for i in range(N_VGPRS):
|
|
prologue.append(v_mov_b32_e32(v[i], 0))
|
|
for i in range(N_SGPRS):
|
|
prologue.append(s_mov_b32(s[i], 0))
|
|
prologue.append(s_mov_b32(s[SrcEnum.VCC_LO - 128], 0)) # zero VCC
|
|
|
|
# Epilogue: store wave state to memory
|
|
# Use s[90-99] for epilogue temps to stay in safe SGPR range (<100, avoiding VCC=106-107)
|
|
# s[90] = saved VCC, s[91] = saved SCC, s[92:93] = output addr, s[94] = saved EXEC
|
|
# Save VCC/SCC first before we clobber them
|
|
epilogue = [
|
|
s_mov_b32(s[90], SrcEnum.VCC_LO), # save VCC
|
|
s_cselect_b32(s[91], 1, 0), # save SCC
|
|
s_load_b64(s[92:93], s[80], 0, soffset=SrcEnum.NULL),
|
|
s_waitcnt(lgkmcnt=0),
|
|
v_lshlrev_b32_e32(v[240], 2, v[255]), # v[240] = lane_id * 4
|
|
]
|
|
# Store VGPRs: vgpr[i] at offset i*128 + lane_id*4
|
|
for i in range(N_VGPRS):
|
|
epilogue.append(global_store_b32(addr=v[240], data=v[i], saddr=s[92], offset=i * WAVE_SIZE * 4))
|
|
# Store SGPRs at VGPR_BYTES + i*4 (lane 0 only via exec mask)
|
|
epilogue.append(v_mov_b32_e32(v[241], 0))
|
|
epilogue.append(v_cmp_eq_u32_e32(v[255], v[241]))
|
|
epilogue.append(s_and_saveexec_b32(s[94], SrcEnum.VCC_LO))
|
|
epilogue.append(v_mov_b32_e32(v[240], 0))
|
|
for i in range(N_SGPRS):
|
|
epilogue.append(v_mov_b32_e32(v[243], s[i]))
|
|
epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92], offset=VGPR_BYTES + i * 4))
|
|
# Store saved VCC
|
|
epilogue.append(v_mov_b32_e32(v[243], s[90]))
|
|
epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92], offset=VGPR_BYTES + SGPR_BYTES))
|
|
# Store saved SCC
|
|
epilogue.append(v_mov_b32_e32(v[243], s[91]))
|
|
epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92], offset=VGPR_BYTES + SGPR_BYTES + 4))
|
|
epilogue.append(s_mov_b32(s[SrcEnum.EXEC_LO - 128], s[94])) # restore exec
|
|
epilogue.append(s_endpgm())
|
|
|
|
return prologue, epilogue
|
|
|
|
def parse_output(out_buf: bytes, n_lanes: int) -> WaveState:
|
|
"""Parse output buffer into WaveState."""
|
|
st = WaveState()
|
|
for i in range(N_VGPRS):
|
|
for lane in range(n_lanes):
|
|
off = i * WAVE_SIZE * 4 + lane * 4
|
|
st.vgpr[lane][i] = struct.unpack_from('<I', out_buf, off)[0]
|
|
for i in range(N_SGPRS):
|
|
st.sgpr[i] = struct.unpack_from('<I', out_buf, VGPR_BYTES + i * 4)[0]
|
|
st.vcc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES)[0]
|
|
st.scc = struct.unpack_from('<I', out_buf, VGPR_BYTES + SGPR_BYTES + 4)[0]
|
|
return st
|
|
|
|
def run_program_emu(instructions: list, n_lanes: int = 1) -> WaveState:
|
|
"""Run instructions via emulator run_asm, dump state to memory, return WaveState."""
|
|
out_buf = (ctypes.c_uint8 * OUT_BYTES)(*([0] * OUT_BYTES))
|
|
out_addr = ctypes.addressof(out_buf)
|
|
|
|
prologue, epilogue = get_prologue_epilogue(n_lanes)
|
|
code = assemble(prologue + instructions + epilogue)
|
|
|
|
args = (ctypes.c_uint64 * 1)(out_addr)
|
|
args_ptr = ctypes.addressof(args)
|
|
kernel_buf = (ctypes.c_char * len(code)).from_buffer_copy(code)
|
|
lib_ptr = ctypes.addressof(kernel_buf)
|
|
|
|
set_valid_mem_ranges({(out_addr, OUT_BYTES), (args_ptr, 8)})
|
|
result = run_asm(lib_ptr, len(code), 1, 1, 1, n_lanes, 1, 1, args_ptr)
|
|
assert result == 0, f"run_asm failed with {result}"
|
|
|
|
return parse_output(bytes(out_buf), n_lanes)
|
|
|
|
def run_program_hw(instructions: list, n_lanes: int = 1) -> WaveState:
|
|
"""Run instructions on real AMD hardware via HIPCompiler and AMDProgram."""
|
|
from tinygrad.device import Device
|
|
from tinygrad.runtime.ops_amd import AMDProgram
|
|
from tinygrad.runtime.support.compiler_amd import HIPCompiler
|
|
from tinygrad.helpers import flat_mv
|
|
|
|
dev = Device["AMD"]
|
|
compiler = HIPCompiler(dev.arch)
|
|
|
|
prologue, epilogue = get_prologue_epilogue(n_lanes)
|
|
code = assemble(prologue + instructions + epilogue)
|
|
|
|
# Create inline assembly source with .byte directives
|
|
byte_str = ', '.join(f'0x{b:02x}' for b in code)
|
|
asm_src = f""".text
|
|
.globl test
|
|
.p2align 8
|
|
.type test,@function
|
|
test:
|
|
.byte {byte_str}
|
|
|
|
.rodata
|
|
.p2align 6
|
|
.amdhsa_kernel test
|
|
.amdhsa_next_free_vgpr 256
|
|
.amdhsa_next_free_sgpr 96
|
|
.amdhsa_wavefront_size32 1
|
|
.amdhsa_user_sgpr_kernarg_segment_ptr 1
|
|
.amdhsa_kernarg_size 8
|
|
.end_amdhsa_kernel
|
|
|
|
.amdgpu_metadata
|
|
---
|
|
amdhsa.version:
|
|
- 1
|
|
- 0
|
|
amdhsa.kernels:
|
|
- .name: test
|
|
.symbol: test.kd
|
|
.kernarg_segment_size: 8
|
|
.group_segment_fixed_size: 0
|
|
.private_segment_fixed_size: 0
|
|
.kernarg_segment_align: 8
|
|
.wavefront_size: 32
|
|
.sgpr_count: 96
|
|
.vgpr_count: 256
|
|
.max_flat_workgroup_size: 1024
|
|
...
|
|
.end_amdgpu_metadata
|
|
"""
|
|
|
|
lib = compiler.compile(asm_src)
|
|
prg = AMDProgram(dev, "test", lib)
|
|
|
|
# Allocate output buffer on GPU
|
|
out_gpu = dev.allocator.alloc(OUT_BYTES)
|
|
|
|
# Run the kernel
|
|
prg(out_gpu, global_size=(1, 1, 1), local_size=(n_lanes, 1, 1), wait=True)
|
|
|
|
# Copy result back
|
|
out_buf = bytearray(OUT_BYTES)
|
|
dev.allocator._copyout(flat_mv(memoryview(out_buf)), out_gpu)
|
|
|
|
return parse_output(bytes(out_buf), n_lanes)
|
|
|
|
def compare_wave_states(emu_st: WaveState, hw_st: WaveState, n_lanes: int, n_vgprs: int = N_VGPRS) -> list[str]:
|
|
"""Compare two WaveStates and return list of differences."""
|
|
import math
|
|
diffs = []
|
|
# Compare VGPRs - vgpr is list[lane][reg]
|
|
for i in range(n_vgprs):
|
|
for lane in range(n_lanes):
|
|
emu_val = emu_st.vgpr[lane][i]
|
|
hw_val = hw_st.vgpr[lane][i]
|
|
if emu_val != hw_val:
|
|
emu_f, hw_f = _f32(emu_val), _f32(hw_val)
|
|
# Handle NaN comparison
|
|
if math.isnan(emu_f) and math.isnan(hw_f):
|
|
continue
|
|
diffs.append(f"v[{i}] lane {lane}: emu=0x{emu_val:08x} ({emu_f:.6g}) hw=0x{hw_val:08x} ({hw_f:.6g})")
|
|
# Compare SGPRs - sgpr is list
|
|
for i in range(N_SGPRS):
|
|
emu_val = emu_st.sgpr[i]
|
|
hw_val = hw_st.sgpr[i]
|
|
if emu_val != hw_val:
|
|
diffs.append(f"s[{i}]: emu=0x{emu_val:08x} hw=0x{hw_val:08x}")
|
|
# Compare VCC
|
|
if emu_st.vcc != hw_st.vcc:
|
|
diffs.append(f"vcc: emu=0x{emu_st.vcc:08x} hw=0x{hw_st.vcc:08x}")
|
|
# Compare SCC
|
|
if emu_st.scc != hw_st.scc:
|
|
diffs.append(f"scc: emu={emu_st.scc} hw={hw_st.scc}")
|
|
return diffs
|
|
|
|
def run_program(instructions: list, n_lanes: int = 1) -> WaveState:
|
|
"""Run instructions and return WaveState.
|
|
|
|
If USE_HW=1, runs on both emulator and hardware, compares results, and raises if they differ.
|
|
Otherwise, runs only on emulator.
|
|
"""
|
|
emu_st = run_program_emu(instructions, n_lanes)
|
|
if USE_HW:
|
|
hw_st = run_program_hw(instructions, n_lanes)
|
|
diffs = compare_wave_states(emu_st, hw_st, n_lanes)
|
|
if diffs:
|
|
raise AssertionError("Emulator vs Hardware mismatch:\n" + "\n".join(diffs))
|
|
return hw_st # Return hardware result when both match
|
|
return emu_st
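
# Rough Python summary of the V_DIV_SCALE_F32 behaviour the tests below check. This is a
# sketch distilled from the test expectations in TestVDivScale, not an authoritative
# hardware model; exponent() reads the raw 8-bit f32 exponent field.
def _div_scale_f32_expected(s0: float, s1_denom: float, s2_numer: float) -> tuple[float, int]:
  import math
  def exponent(x: float) -> int: return (f2i(x) >> 23) & 0xff
  if s1_denom == 0.0 or s2_numer == 0.0: return math.nan, 1           # zero operand -> NaN, VCC=1
  if exponent(s1_denom) == 0: return math.nan, 1                      # denorm denominator -> NaN, VCC=1
  if exponent(s2_numer) - exponent(s1_denom) >= 96: return s0 * 2.0**64, 1  # huge exp diff -> scale, VCC=1
  if exponent(s2_numer) <= 23: return s0 * 2.0**64, 1                 # tiny numerator -> scale, VCC=1
  if abs(s2_numer / s1_denom) < 2.0**-126: return s0, 1               # result would be denorm -> VCC=1, no scale
  return s0, 0                                                        # no scaling needed -> pass S0 through, VCC=0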
|
|
|
|
|
|
class TestVDivScale(unittest.TestCase):
|
|
"""Tests for V_DIV_SCALE_F32 edge cases.
|
|
|
|
V_DIV_SCALE_F32 is used in the Newton-Raphson division sequence to handle
|
|
denormals and near-overflow cases. It scales operands and sets VCC when
|
|
the final result needs to be unscaled.
|
|
|
|
Pseudocode cases:
|
|
1. Zero operands -> NaN
|
|
2. exp(S2) - exp(S1) >= 96 -> scale denom, VCC=1
|
|
3. S1 is denorm -> scale by 2^64
|
|
4. 1/S1 is f64 denorm AND S2/S1 is f32 denorm -> scale denom, VCC=1
|
|
5. 1/S1 is f64 denorm -> scale by 2^-64
|
|
6. S2/S1 is f32 denorm -> scale numer, VCC=1
|
|
7. exp(S2) <= 23 -> scale by 2^64 (tiny numerator)
|
|
"""
|
|
|
|
def test_div_scale_f32_vcc_zero_single_lane(self):
|
|
"""V_DIV_SCALE_F32 sets VCC=0 when no scaling needed."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0), # uses inline constant
|
|
v_mov_b32_e32(v[1], 4.0), # uses inline constant
|
|
v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc, 0, "VCC should be 0 when no scaling needed")
|
|
|
|
def test_div_scale_f32_vcc_zero_multiple_lanes(self):
|
|
"""V_DIV_SCALE_F32 sets VCC=0 for all lanes when no scaling needed."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0),
|
|
v_mov_b32_e32(v[1], 4.0),
|
|
v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
self.assertEqual(st.vcc & 0xf, 0, "VCC should be 0 for all lanes")
|
|
|
|
def test_div_scale_f32_preserves_input(self):
|
|
"""V_DIV_SCALE_F32 outputs S0 when no scaling needed."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 2.0), # numerator - use inline constant
|
|
v_mov_b32_e32(v[1], 4.0), # denominator
|
|
v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 2.0, places=5)
|
|
|
|
def test_div_scale_f32_zero_denom_gives_nan(self):
|
|
"""V_DIV_SCALE_F32: zero denominator -> NaN, VCC=1."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0), # numerator
|
|
v_mov_b32_e32(v[1], 0.0), # denominator = 0
|
|
v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isnan(i2f(st.vgpr[0][2])), "Should be NaN for zero denom")
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for zero denom")
|
|
|
|
def test_div_scale_f32_zero_numer_gives_nan(self):
|
|
"""V_DIV_SCALE_F32: zero numerator -> NaN, VCC=1."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0.0), # numerator = 0
|
|
v_mov_b32_e32(v[1], 1.0), # denominator
|
|
v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isnan(i2f(st.vgpr[0][2])), "Should be NaN for zero numer")
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for zero numer")
|
|
|
|
def test_div_scale_f32_large_exp_diff_scales_denom(self):
|
|
"""V_DIV_SCALE_F32: exp(numer) - exp(denom) >= 96 -> scale denom, VCC=1."""
|
|
# Need exp(numer) - exp(denom) >= 96. With numer = MAX_FLOAT (exp=254), any
# denominator with exp <= 254 - 96 = 158 qualifies.
# Use denom = 1.0 (exp=127) and numer = MAX_FLOAT (exp=254): diff = 127 >= 96.
|
|
max_float = 0x7f7fffff # 3.4028235e+38, exp=254
|
|
instructions = [
|
|
s_mov_b32(s[0], max_float),
|
|
v_mov_b32_e32(v[0], s[0]), # numer = MAX_FLOAT (S2)
|
|
v_mov_b32_e32(v[1], 1.0), # denom = 1.0 (S1), exp=127. diff = 254-127 = 127 >= 96
|
|
# S0=denom (what we're scaling), S1=denom, S2=numer
|
|
v_div_scale_f32(v[2], VCC, v[1], v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 when scaling denom for large exp diff")
|
|
# Result should be denom * 2^64
|
|
expected = 1.0 * (2.0 ** 64)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][2]), expected, delta=expected * 1e-6)
|
|
|
|
def test_div_scale_f32_denorm_denom(self):
|
|
"""V_DIV_SCALE_F32: denormalized denominator -> NaN, VCC=1.
|
|
|
|
Hardware returns NaN when denominator is denormalized (different from PDF pseudocode).
|
|
"""
|
|
# Smallest positive denorm: 0x00000001 = 1.4e-45
|
|
denorm = 0x00000001
|
|
instructions = [
|
|
s_mov_b32(s[0], denorm),
|
|
v_mov_b32_e32(v[0], 1.0), # numer = 1.0 (S2)
|
|
v_mov_b32_e32(v[1], s[0]), # denom = denorm (S1)
|
|
# S0=denom, S1=denom, S2=numer -> scale denom
|
|
v_div_scale_f32(v[2], VCC, v[1], v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isnan(i2f(st.vgpr[0][2])), "Hardware returns NaN for denorm denom")
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for denorm denom")
|
|
|
|
def test_div_scale_f32_tiny_numer_exp_le_23(self):
|
|
"""V_DIV_SCALE_F32: exponent(numer) <= 23 -> scale by 2^64, VCC=1."""
|
|
# exp <= 23 means exponent field is 0..23
|
|
# exp=23 corresponds to float value around 2^(23-127) = 2^-104 ≈ 4.9e-32
|
|
# Use exp=1 (smallest normal), which is 2^(1-127) = 2^-126 ≈ 1.18e-38
|
|
smallest_normal = 0x00800000 # exp=1, mantissa=0
|
|
instructions = [
|
|
s_mov_b32(s[0], smallest_normal),
|
|
v_mov_b32_e32(v[0], s[0]), # numer = smallest_normal (S2), exp=1 <= 23
|
|
v_mov_b32_e32(v[1], 1.0), # denom = 1.0 (S1)
|
|
# S0=numer, S1=denom, S2=numer -> scale numer
|
|
v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# Numer scaled by 2^64, VCC=1 to indicate scaling was done
|
|
numer_f = i2f(smallest_normal)
|
|
expected = numer_f * (2.0 ** 64)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][2]), expected, delta=abs(expected) * 1e-5)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 when scaling tiny numer")
|
|
|
|
def test_div_scale_f32_result_would_be_denorm(self):
|
|
"""V_DIV_SCALE_F32: result would be denorm -> no scaling applied, VCC=1.
|
|
|
|
When the result of numer/denom would be denormalized, hardware sets VCC=1
|
|
but does NOT scale the input (returns it unchanged). The scaling happens
|
|
elsewhere in the division sequence.
|
|
"""
|
|
# If S2/S1 would be denorm, set VCC but don't scale
|
|
# Denorm result: exp < 1, i.e., |result| < 2^-126
|
|
# Use 1.0 / 2^127 ≈ 5.9e-39 (result would be denorm)
|
|
large_denom = 0x7f000000 # 2^127
|
|
instructions = [
|
|
s_mov_b32(s[0], large_denom),
|
|
v_mov_b32_e32(v[0], 1.0), # numer = 1.0 (S2)
|
|
v_mov_b32_e32(v[1], s[0]), # denom = 2^127 (S1)
|
|
# S0=numer, S1=denom, S2=numer -> check if we need to scale numer
|
|
v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# Hardware returns input unchanged but sets VCC=1
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 1.0, places=5)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 when result would be denorm")
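
# Minimal sketch of the V_DIV_FMAS_F32 scaling rule described in TestVDivFmas below.
# Assumption: plain Python floats stand in for the f32 fma, and vcc_bit is the lane's VCC bit.
def _div_fmas_f32_sketch(s0: float, s1: float, s2: float, vcc_bit: int) -> float:
  r = s0 * s1 + s2
  if not vcc_bit: return r
  return r * 2.0**64 if abs(s2) >= 2.0 else r * 2.0**-64  # exponent(S2) > 127 means |S2| >= 2.0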
|
|
|
|
|
|
class TestVDivFmas(unittest.TestCase):
|
|
"""Tests for V_DIV_FMAS_F32 edge cases.
|
|
|
|
V_DIV_FMAS_F32 performs FMA with optional scaling based on VCC.
|
|
The scale direction depends on S2's exponent (the addend):
|
|
- If exponent(S2) > 127 (i.e., S2 >= 2.0): scale by 2^+64
|
|
- Otherwise: scale by 2^-64
|
|
|
|
NOTE: The PDF (page 449) incorrectly says just 2^32.
|
|
"""
|
|
|
|
def test_div_fmas_f32_no_scale(self):
|
|
"""V_DIV_FMAS_F32: VCC=0 -> normal FMA."""
|
|
instructions = [
|
|
s_mov_b32(s[SrcEnum.VCC_LO - 128], 0), # VCC = 0
|
|
v_mov_b32_e32(v[0], 2.0), # S0
|
|
v_mov_b32_e32(v[1], 3.0), # S1
|
|
v_mov_b32_e32(v[2], 1.0), # S2
|
|
v_div_fmas_f32(v[3], v[0], v[1], v[2]), # 2*3+1 = 7
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][3]), 7.0, places=5)
|
|
|
|
def test_div_fmas_f32_scale_up(self):
|
|
"""V_DIV_FMAS_F32: VCC=1 with S2 >= 2.0 -> scale by 2^+64."""
|
|
instructions = [
|
|
s_mov_b32(s[SrcEnum.VCC_LO - 128], 1), # VCC = 1
|
|
v_mov_b32_e32(v[0], 1.0), # S0
|
|
v_mov_b32_e32(v[1], 1.0), # S1
|
|
v_mov_b32_e32(v[2], 2.0), # S2 >= 2.0, so scale UP
|
|
v_div_fmas_f32(v[3], v[0], v[1], v[2]), # 2^+64 * (1*1+2) = 2^+64 * 3
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
expected = 3.0 * (2.0 ** 64)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][3]), expected, delta=abs(expected) * 1e-6)
|
|
|
|
def test_div_fmas_f32_scale_down(self):
|
|
"""V_DIV_FMAS_F32: VCC=1 with S2 < 2.0 -> scale by 2^-64."""
|
|
instructions = [
|
|
s_mov_b32(s[SrcEnum.VCC_LO - 128], 1), # VCC = 1
|
|
v_mov_b32_e32(v[0], 2.0), # S0
|
|
v_mov_b32_e32(v[1], 3.0), # S1
|
|
v_mov_b32_e32(v[2], 1.0), # S2 < 2.0, so scale DOWN
|
|
v_div_fmas_f32(v[3], v[0], v[1], v[2]), # 2^-64 * (2*3+1) = 2^-64 * 7
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
expected = 7.0 * (2.0 ** -64)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][3]), expected, delta=abs(expected) * 1e-6)
|
|
|
|
def test_div_fmas_f32_per_lane_vcc(self):
|
|
"""V_DIV_FMAS_F32: different VCC per lane with S2 < 2.0."""
|
|
instructions = [
|
|
s_mov_b32(s[SrcEnum.VCC_LO - 128], 0b0101), # VCC: lanes 0,2 set
|
|
v_mov_b32_e32(v[0], 1.0),
|
|
v_mov_b32_e32(v[1], 1.0),
|
|
v_mov_b32_e32(v[2], 1.0), # S2 < 2.0, so scale DOWN
|
|
v_div_fmas_f32(v[3], v[0], v[1], v[2]), # fma(1,1,1) = 2, scaled = 2^-64 * 2
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
scaled = 2.0 * (2.0 ** -64)
|
|
unscaled = 2.0
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][3]), scaled, delta=abs(scaled) * 1e-6) # lane 0: VCC=1
|
|
self.assertAlmostEqual(i2f(st.vgpr[1][3]), unscaled, places=5) # lane 1: VCC=0
|
|
self.assertAlmostEqual(i2f(st.vgpr[2][3]), scaled, delta=abs(scaled) * 1e-6) # lane 2: VCC=1
|
|
self.assertAlmostEqual(i2f(st.vgpr[3][3]), unscaled, places=5) # lane 3: VCC=0
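
# Sketch of the V_DIV_FIXUP_F32 special cases exercised by TestVDivFixup below, distilled
# from the test expectations (not an authoritative hardware model; the normal path simply
# passes |S0| through with the numerator^denominator sign).
def _div_fixup_f32_sketch(s0_quot: float, s1_denom: float, s2_numer: float) -> float:
  import math
  sign = -1.0 if (s2_numer < 0) != (s1_denom < 0) else 1.0
  if math.isnan(s2_numer) or math.isnan(s1_denom): return math.nan
  if s2_numer == 0.0 and s1_denom == 0.0: return math.nan            # 0/0
  if math.isinf(s2_numer) and math.isinf(s1_denom): return math.nan  # inf/inf
  if s1_denom == 0.0: return sign * math.inf                         # x/0 -> +/-inf
  if s2_numer == 0.0 or math.isinf(s1_denom): return sign * 0.0      # 0/x and x/inf -> 0
  if math.isinf(s2_numer): return sign * math.inf                    # inf/x -> inf
  if math.isnan(s0_quot): return sign * math.inf                     # failed NR estimate -> overflow
  return sign * abs(s0_quot)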
|
|
|
|
|
|
class TestVDivFixup(unittest.TestCase):
|
|
"""Tests for V_DIV_FIXUP_F32 edge cases.
|
|
|
|
V_DIV_FIXUP_F32 is the final step of Newton-Raphson division.
|
|
It handles special cases: NaN, Inf, zero, overflow, underflow.
|
|
|
|
Args: S0=quotient from NR iteration, S1=denominator, S2=numerator
|
|
"""
|
|
|
|
def test_div_fixup_f32_normal(self):
|
|
"""V_DIV_FIXUP_F32: normal division passes through quotient."""
|
|
# 6.0 / 2.0 = 3.0
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 3.0), # S0 = quotient
|
|
v_mov_b32_e32(v[1], 2.0), # S1 = denominator
|
|
v_mov_b32_e32(v[2], 6.0), # S2 = numerator
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][3]), 3.0, places=5)
|
|
|
|
def test_div_fixup_f32_nan_numer(self):
|
|
"""V_DIV_FIXUP_F32: NaN numerator -> quiet NaN."""
|
|
nan = 0x7fc00000 # quiet NaN
|
|
instructions = [
|
|
s_mov_b32(s[0], nan),
|
|
v_mov_b32_e32(v[0], 1.0), # S0 = quotient
|
|
v_mov_b32_e32(v[1], 1.0), # S1 = denominator
|
|
v_mov_b32_e32(v[2], s[0]), # S2 = numerator = NaN
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "Should be NaN")
|
|
|
|
def test_div_fixup_f32_nan_denom(self):
|
|
"""V_DIV_FIXUP_F32: NaN denominator -> quiet NaN."""
|
|
nan = 0x7fc00000 # quiet NaN
|
|
instructions = [
|
|
s_mov_b32(s[0], nan),
|
|
v_mov_b32_e32(v[0], 1.0), # S0 = quotient
|
|
v_mov_b32_e32(v[1], s[0]), # S1 = denominator = NaN
|
|
v_mov_b32_e32(v[2], 1.0), # S2 = numerator
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "Should be NaN")
|
|
|
|
def test_div_fixup_f32_zero_div_zero(self):
|
|
"""V_DIV_FIXUP_F32: 0/0 -> NaN (0xffc00000)."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0), # S0 = quotient (doesn't matter)
|
|
v_mov_b32_e32(v[1], 0.0), # S1 = denominator = 0
|
|
v_mov_b32_e32(v[2], 0.0), # S2 = numerator = 0
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "0/0 should be NaN")
|
|
|
|
def test_div_fixup_f32_inf_div_inf(self):
|
|
"""V_DIV_FIXUP_F32: inf/inf -> NaN."""
|
|
pos_inf = 0x7f800000
|
|
instructions = [
|
|
s_mov_b32(s[0], pos_inf),
|
|
v_mov_b32_e32(v[0], 1.0), # S0 = quotient
|
|
v_mov_b32_e32(v[1], s[0]), # S1 = denominator = +inf
|
|
v_mov_b32_e32(v[2], s[0]), # S2 = numerator = +inf
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "inf/inf should be NaN")
|
|
|
|
def test_div_fixup_f32_x_div_zero(self):
|
|
"""V_DIV_FIXUP_F32: x/0 -> +/-inf based on sign."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0), # S0 = quotient
|
|
v_mov_b32_e32(v[1], 0.0), # S1 = denominator = 0
|
|
v_mov_b32_e32(v[2], 1.0), # S2 = numerator = 1.0
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "x/0 should be inf")
|
|
self.assertGreater(i2f(st.vgpr[0][3]), 0, "1/0 should be +inf")
|
|
|
|
def test_div_fixup_f32_neg_x_div_zero(self):
|
|
"""V_DIV_FIXUP_F32: -x/0 -> -inf."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0), # S0 = quotient
|
|
v_mov_b32_e32(v[1], 0.0), # S1 = denominator = 0
|
|
v_mov_b32_e32(v[2], -1.0), # S2 = numerator = -1.0
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "-x/0 should be inf")
|
|
self.assertLess(i2f(st.vgpr[0][3]), 0, "-1/0 should be -inf")
|
|
|
|
def test_div_fixup_f32_zero_div_x(self):
|
|
"""V_DIV_FIXUP_F32: 0/x -> 0."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0), # S0 = quotient
|
|
v_mov_b32_e32(v[1], 2.0), # S1 = denominator = 2.0
|
|
v_mov_b32_e32(v[2], 0.0), # S2 = numerator = 0
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(i2f(st.vgpr[0][3]), 0.0, "0/x should be 0")
|
|
|
|
def test_div_fixup_f32_x_div_inf(self):
|
|
"""V_DIV_FIXUP_F32: x/inf -> 0."""
|
|
pos_inf = 0x7f800000
|
|
instructions = [
|
|
s_mov_b32(s[0], pos_inf),
|
|
v_mov_b32_e32(v[0], 1.0), # S0 = quotient
|
|
v_mov_b32_e32(v[1], s[0]), # S1 = denominator = +inf
|
|
v_mov_b32_e32(v[2], 1.0), # S2 = numerator = 1.0
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(i2f(st.vgpr[0][3]), 0.0, "x/inf should be 0")
|
|
|
|
def test_div_fixup_f32_inf_div_x(self):
|
|
"""V_DIV_FIXUP_F32: inf/x -> inf."""
|
|
pos_inf = 0x7f800000
|
|
instructions = [
|
|
s_mov_b32(s[0], pos_inf),
|
|
v_mov_b32_e32(v[0], 1.0), # S0 = quotient
|
|
v_mov_b32_e32(v[1], 1.0), # S1 = denominator = 1.0
|
|
v_mov_b32_e32(v[2], s[0]), # S2 = numerator = +inf
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "inf/x should be inf")
|
|
|
|
def test_div_fixup_f32_sign_propagation(self):
|
|
"""V_DIV_FIXUP_F32: sign is XOR of numer and denom signs."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 3.0), # S0 = |quotient|
|
|
v_mov_b32_e32(v[1], -2.0), # S1 = denominator (negative)
|
|
v_mov_b32_e32(v[2], 6.0), # S2 = numerator (positive)
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# pos / neg = neg
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][3]), -3.0, places=5)
|
|
|
|
def test_div_fixup_f32_neg_neg(self):
|
|
"""V_DIV_FIXUP_F32: neg/neg -> positive."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 3.0), # S0 = |quotient|
|
|
v_mov_b32_e32(v[1], -2.0), # S1 = denominator (negative)
|
|
v_mov_b32_e32(v[2], -6.0), # S2 = numerator (negative)
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# neg / neg = pos
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][3]), 3.0, places=5)
|
|
|
|
def test_div_fixup_f32_nan_estimate_overflow(self):
|
|
"""V_DIV_FIXUP_F32: NaN estimate returns overflow (inf).
|
|
|
|
PDF doesn't check isNAN(S0), but hardware returns OVERFLOW if S0 is NaN.
|
|
This happens when division fails (e.g., denorm denominator in V_DIV_SCALE).
|
|
"""
|
|
quiet_nan = 0x7fc00000
|
|
instructions = [
|
|
s_mov_b32(s[0], quiet_nan),
|
|
v_mov_b32_e32(v[0], s[0]), # S0 = NaN (failed estimate)
|
|
v_mov_b32_e32(v[1], 1.0), # S1 = denominator = 1.0
|
|
v_mov_b32_e32(v[2], 1.0), # S2 = numerator = 1.0
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "NaN estimate should return inf")
|
|
self.assertEqual(st.vgpr[0][3], 0x7f800000, "Should be +inf (pos/pos)")
|
|
|
|
def test_div_fixup_f32_nan_estimate_sign(self):
|
|
"""V_DIV_FIXUP_F32: NaN estimate with negative sign returns -inf."""
|
|
quiet_nan = 0x7fc00000
|
|
instructions = [
|
|
s_mov_b32(s[0], quiet_nan),
|
|
v_mov_b32_e32(v[0], s[0]), # S0 = NaN (failed estimate)
|
|
v_mov_b32_e32(v[1], -1.0), # S1 = denominator = -1.0
|
|
v_mov_b32_e32(v[2], 1.0), # S2 = numerator = 1.0
|
|
v_div_fixup_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isinf(i2f(st.vgpr[0][3])), "NaN estimate should return inf")
|
|
self.assertEqual(st.vgpr[0][3], 0xff800000, "Should be -inf (pos/neg)")
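
# V_CMP_CLASS_F32 mask bits referenced by TestVCmpClass below, collected in one place.
# Bits 0-3 and 8-9 come straight from the tests; bits 4-7 follow the usual RDNA
# class-mask layout (denormals and zeros) and are not exercised here.
CLASS_MASK_BITS = {
  0: "signaling NaN", 1: "quiet NaN", 2: "-inf", 3: "negative normal",
  4: "negative denormal", 5: "-0.0", 6: "+0.0", 7: "positive denormal",
  8: "positive normal", 9: "+inf",
}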
|
|
|
|
|
|
class TestVCmpClass(unittest.TestCase):
|
|
"""Tests for V_CMP_CLASS_F32 float classification."""
|
|
|
|
def test_cmp_class_quiet_nan(self):
|
|
"""V_CMP_CLASS_F32 detects quiet NaN."""
|
|
quiet_nan = 0x7fc00000
|
|
instructions = [
|
|
s_mov_b32(s[0], quiet_nan), # large int encodes as literal
|
|
v_mov_b32_e32(v[0], s[0]), # value to classify
|
|
v_mov_b32_e32(v[1], 0b0000000010), # bit 1 = quiet NaN (mask in VGPR for VOPC)
|
|
v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "Should detect quiet NaN")
|
|
|
|
def test_cmp_class_signaling_nan(self):
|
|
"""V_CMP_CLASS_F32 detects signaling NaN."""
|
|
signal_nan = 0x7f800001
|
|
instructions = [
|
|
s_mov_b32(s[0], signal_nan), # large int encodes as literal
|
|
v_mov_b32_e32(v[0], s[0]), # value to classify
|
|
v_mov_b32_e32(v[1], 0b0000000001), # bit 0 = signaling NaN
|
|
v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "Should detect signaling NaN")
|
|
|
|
def test_cmp_class_quiet_nan_not_signaling(self):
|
|
"""Quiet NaN does not match signaling NaN mask."""
|
|
quiet_nan = 0x7fc00000
|
|
instructions = [
|
|
s_mov_b32(s[0], quiet_nan), # large int encodes as literal
|
|
v_mov_b32_e32(v[0], s[0]), # value to classify
|
|
v_mov_b32_e32(v[1], 0b0000000001), # bit 0 = signaling NaN only
|
|
v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 0, "Quiet NaN should not match signaling mask")
|
|
|
|
def test_cmp_class_signaling_nan_not_quiet(self):
|
|
"""Signaling NaN does not match quiet NaN mask."""
|
|
signal_nan = 0x7f800001
|
|
instructions = [
|
|
s_mov_b32(s[0], signal_nan), # large int encodes as literal
|
|
v_mov_b32_e32(v[0], s[0]), # value to classify
|
|
v_mov_b32_e32(v[1], 0b0000000010), # bit 1 = quiet NaN only
|
|
v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 0, "Signaling NaN should not match quiet mask")
|
|
|
|
def test_cmp_class_positive_inf(self):
|
|
"""V_CMP_CLASS_F32 detects +inf."""
|
|
pos_inf = 0x7f800000
|
|
instructions = [
|
|
s_mov_b32(s[0], pos_inf), # large int encodes as literal
|
|
s_mov_b32(s[1], 0b1000000000), # bit 9 = +inf (512 is outside inline range)
|
|
v_mov_b32_e32(v[0], s[0]), # value to classify
|
|
v_mov_b32_e32(v[1], s[1]), # mask in VGPR
|
|
v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "Should detect +inf")
|
|
|
|
def test_cmp_class_negative_inf(self):
|
|
"""V_CMP_CLASS_F32 detects -inf."""
|
|
neg_inf = 0xff800000
|
|
instructions = [
|
|
s_mov_b32(s[0], neg_inf), # large int encodes as literal
|
|
v_mov_b32_e32(v[0], s[0]), # value to classify
|
|
v_mov_b32_e32(v[1], 0b0000000100), # bit 2 = -inf
|
|
v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "Should detect -inf")
|
|
|
|
def test_cmp_class_normal_positive(self):
|
|
"""V_CMP_CLASS_F32 detects positive normal."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0), # inline constant - value to classify
|
|
s_mov_b32(s[1], 0b0100000000), # bit 8 = positive normal (256 is outside inline range)
|
|
v_mov_b32_e32(v[1], s[1]), # mask in VGPR
|
|
v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "Should detect positive normal")
|
|
|
|
def test_cmp_class_normal_negative(self):
|
|
"""V_CMP_CLASS_F32 detects negative normal."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], -1.0), # inline constant - value to classify
|
|
v_mov_b32_e32(v[1], 0b0000001000), # bit 3 = negative normal
|
|
v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "Should detect negative normal")
|
|
|
|
|
|
class TestBasicOps(unittest.TestCase):
|
|
"""Basic instruction tests."""
|
|
|
|
def test_v_add_f32(self):
|
|
"""V_ADD_F32 adds two floats."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0), # inline constant
|
|
v_mov_b32_e32(v[1], 2.0), # inline constant
|
|
v_add_f32_e32(v[2], v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 3.0, places=5)
|
|
|
|
def test_v_mul_f32(self):
|
|
"""V_MUL_F32 multiplies two floats."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 2.0), # inline constant
|
|
v_mov_b32_e32(v[1], 4.0), # inline constant
|
|
v_mul_f32_e32(v[2], v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 8.0, places=5)
|
|
|
|
def test_v_mov_b32(self):
|
|
"""V_MOV_B32 moves a value."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 42),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][0], 42)
|
|
|
|
def test_s_add_u32(self):
|
|
"""S_ADD_U32 adds two scalar values."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 100),
|
|
s_mov_b32(s[1], 200),
|
|
s_add_u32(s[2], s[0], s[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.sgpr[2], 300)
|
|
|
|
def test_s_add_u32_carry(self):
|
|
"""S_ADD_U32 sets SCC on overflow."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 64), # use inline constant for max
|
|
s_not_b32(s[0], s[0]), # s0 = ~64 = 0xffffffbf, close to max
|
|
s_mov_b32(s[1], 64),
|
|
s_add_u32(s[2], s[0], s[1]), # 0xffffffbf + 64 = 0xffffffff
|
|
s_mov_b32(s[3], 1),
|
|
s_add_u32(s[4], s[2], s[3]), # 0xffffffff + 1 = overflow
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.sgpr[4], 0)
|
|
self.assertEqual(st.scc, 1)
|
|
|
|
def test_v_alignbit_b32(self):
|
|
"""V_ALIGNBIT_B32 extracts bits from concatenated sources."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x12), # small values as inline constants
|
|
s_mov_b32(s[1], 0x34),
|
|
s_mov_b32(s[2], 4), # shift amount
|
|
v_mov_b32_e32(v[0], s[2]),
|
|
v_alignbit_b32(v[1], s[0], s[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# {0x12, 0x34} = 0x0000001200000034; >> 4 = 0x0000000120000003; low 32 bits = 0x20000003
|
|
expected = ((0x12 << 32) | 0x34) >> 4
|
|
self.assertEqual(st.vgpr[0][1], expected & 0xffffffff)
|
|
|
|
|
|
class TestMultiLane(unittest.TestCase):
|
|
"""Tests for multi-lane execution."""
|
|
|
|
def test_v_mov_all_lanes(self):
|
|
"""V_MOV_B32 sets all lanes to the same value."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 42),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
for lane in range(4):
|
|
self.assertEqual(st.vgpr[lane][0], 42)
|
|
|
|
def test_v_cmp_sets_vcc_bits(self):
|
|
"""V_CMP_EQ sets VCC bits based on per-lane comparison."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 5),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[0]),
|
|
v_cmp_eq_u32_e32(v[0], v[1]), # VOPC: src0, vsrc1 - writes VCC implicitly
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
self.assertEqual(st.vcc & 0xf, 0xf, "All lanes should match")
|
|
|
|
|
|
class TestLaneInstructions(unittest.TestCase):
|
|
"""Tests for cross-lane instructions (readlane, writelane, readfirstlane).
|
|
|
|
These are critical for wave-level reductions and WMMA matrix operations.
|
|
|
|
Note: V_READLANE_B32 and V_READFIRSTLANE_B32 write to SGPR, but the VOP1/VOP3
|
|
encoding has a 'vdst' field. We use RawImm to encode SGPR indices directly.
|
|
"""
|
|
|
|
def _readlane(self, sdst_idx, vsrc, lane_idx):
|
|
"""Helper to create V_READLANE_B32 with SGPR destination."""
|
|
return VOP3(VOP3Op.V_READLANE_B32, vdst=RawImm(sdst_idx), src0=vsrc, src1=lane_idx)
|
|
|
|
def _readfirstlane(self, sdst_idx, vsrc):
|
|
"""Helper to create V_READFIRSTLANE_B32 with SGPR destination."""
|
|
return VOP1(VOP1Op.V_READFIRSTLANE_B32, vdst=RawImm(sdst_idx), src0=vsrc)
|
|
|
|
def test_v_readlane_b32_basic(self):
|
|
"""V_READLANE_B32 reads a value from a specific lane's VGPR."""
|
|
# v[255] = lane_id from prologue; compute v[0] = lane_id * 10
|
|
instructions = [
|
|
v_lshlrev_b32_e32(v[0], 1, v[255]), # v0 = lane_id * 2
|
|
v_lshlrev_b32_e32(v[1], 3, v[255]), # v1 = lane_id * 8
|
|
v_add_nc_u32_e32(v[0], v[0], v[1]), # v0 = lane_id * 10
|
|
# Now read lane 2's value (should be 20) into s0
|
|
self._readlane(0, v[0], 2), # s0 = v0 from lane 2 = 20
|
|
v_mov_b32_e32(v[2], s[0]), # broadcast to all lanes
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
# All lanes should have the value 20 (lane 2's value)
|
|
for lane in range(4):
|
|
self.assertEqual(st.vgpr[lane][2], 20, f"Lane {lane}: expected 20, got {st.vgpr[lane][2]}")
|
|
|
|
def test_v_readlane_b32_lane_0(self):
|
|
"""V_READLANE_B32 reading from lane 0."""
|
|
instructions = [
|
|
v_lshlrev_b32_e32(v[0], 2, v[255]), # v0 = lane_id * 4
|
|
v_add_nc_u32_e32(v[0], 100, v[0]), # v0 = 100 + lane_id * 4
|
|
self._readlane(0, v[0], 0), # s0 = lane 0's v0 = 100
|
|
v_mov_b32_e32(v[1], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
for lane in range(4):
|
|
self.assertEqual(st.vgpr[lane][1], 100)
|
|
|
|
def test_v_readlane_b32_last_lane(self):
|
|
"""V_READLANE_B32 reading from the last active lane (lane 3 in 4-lane test)."""
|
|
instructions = [
|
|
v_lshlrev_b32_e32(v[0], 2, v[255]), # v0 = lane_id * 4
|
|
v_add_nc_u32_e32(v[0], 100, v[0]), # v0 = 100 + lane_id * 4
|
|
self._readlane(0, v[0], 3), # s0 = lane 3's v0 = 112
|
|
v_mov_b32_e32(v[1], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
for lane in range(4):
|
|
self.assertEqual(st.vgpr[lane][1], 112)
|
|
|
|
def test_v_readlane_b32_different_vgpr(self):
|
|
"""V_READLANE_B32 reading from different VGPR indices.
|
|
|
|
Regression test for a bug where rd_lane was treated as a value to search for in the
VGPR list (via the 'in' operator) instead of being used as a lane index with a bounds check.
|
|
"""
|
|
instructions = [
|
|
# Set up v[5] with per-lane values
|
|
v_lshlrev_b32_e32(v[5], 3, v[255]), # v5 = lane_id * 8
|
|
v_add_nc_u32_e32(v[5], 50, v[5]), # v5 = 50 + lane_id * 8
|
|
# Read lane 1's v[5] (should be 58)
|
|
self._readlane(0, v[5], 1),
|
|
v_mov_b32_e32(v[6], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
for lane in range(4):
|
|
self.assertEqual(st.vgpr[lane][6], 58, f"Lane {lane}: expected 58 from v[5] lane 1")
|
|
|
|
def test_v_readfirstlane_b32_basic(self):
|
|
"""V_READFIRSTLANE_B32 reads from the first active lane."""
|
|
instructions = [
|
|
v_lshlrev_b32_e32(v[0], 2, v[255]), # v0 = lane_id * 4
|
|
v_add_nc_u32_e32(v[0], 1000, v[0]), # v0 = 1000 + lane_id * 4
|
|
self._readfirstlane(0, v[0]), # s0 = first lane's v0 = 1000
|
|
v_mov_b32_e32(v[1], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
for lane in range(4):
|
|
self.assertEqual(st.vgpr[lane][1], 1000)
|
|
|
|
def test_v_readfirstlane_b32_different_vgpr(self):
|
|
"""V_READFIRSTLANE_B32 reading from different VGPR index.
|
|
|
|
Regression test for bug where src0_idx bounds check was incorrect.
|
|
"""
|
|
instructions = [
|
|
v_lshlrev_b32_e32(v[7], 5, v[255]), # v7 = lane_id * 32
|
|
v_add_nc_u32_e32(v[7], 200, v[7]), # v7 = 200 + lane_id * 32
|
|
self._readfirstlane(0, v[7]), # s0 = first lane's v7 = 200
|
|
v_mov_b32_e32(v[8], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
for lane in range(4):
|
|
self.assertEqual(st.vgpr[lane][8], 200)
|
|
|
|
def test_v_writelane_b32_basic(self):
|
|
"""V_WRITELANE_B32 writes a scalar to a specific lane's VGPR."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0), # Initialize v0 = 0 for all lanes
|
|
s_mov_b32(s[0], 999), # Value to write
|
|
v_writelane_b32(v[0], s[0], 2), # Write 999 to lane 2's v0
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
for lane in range(4):
|
|
if lane == 2:
|
|
self.assertEqual(st.vgpr[lane][0], 999, f"Lane 2 should have 999")
|
|
else:
|
|
self.assertEqual(st.vgpr[lane][0], 0, f"Lane {lane} should have 0")
|
|
|
|
def test_v_writelane_then_readlane(self):
|
|
"""V_WRITELANE followed by V_READLANE to verify round-trip."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0),
|
|
s_mov_b32(s[0], 0xdeadbeef),
|
|
v_writelane_b32(v[0], s[0], 1), # Write to lane 1
|
|
self._readlane(1, v[0], 1), # Read back from lane 1 into s1
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
for lane in range(4):
|
|
self.assertEqual(st.vgpr[lane][1], 0xdeadbeef)
|
|
|
|
def test_v_readlane_for_reduction(self):
|
|
"""Simulate a wave reduction using readlane - common pattern in WMMA/reductions.
|
|
|
|
This pattern is used when reducing across lanes, e.g., for computing
|
|
the sum of all elements in a wave.
|
|
"""
|
|
# Each lane computes lane_id + 1, then we sum lanes 0-3 using readlane
|
|
instructions = [
|
|
v_add_nc_u32_e32(v[0], 1, v[255]), # v0 = lane_id + 1 (1, 2, 3, 4)
|
|
# Read all 4 lanes and sum in scalar registers
|
|
self._readlane(0, v[0], 0), # s0 = 1
|
|
self._readlane(1, v[0], 1), # s1 = 2
|
|
s_add_u32(s[0], s[0], s[1]), # s0 = 3
|
|
self._readlane(1, v[0], 2), # s1 = 3
|
|
s_add_u32(s[0], s[0], s[1]), # s0 = 6
|
|
self._readlane(1, v[0], 3), # s1 = 4
|
|
s_add_u32(s[0], s[0], s[1]), # s0 = 10
|
|
v_mov_b32_e32(v[1], s[0]), # Broadcast sum to all lanes
|
|
]
|
|
st = run_program(instructions, n_lanes=4)
|
|
for lane in range(4):
|
|
self.assertEqual(st.vgpr[lane][1], 10, f"Sum 1+2+3+4 should be 10")
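
# V_SIN_F32 (tested in TestTrigonometry below) takes its input in cycles rather than
# radians and applies frac() internally. A plain-Python stand-in, double precision so
# only approximate relative to the f32 hardware result:
def _sin_cycles(x: float) -> float:
  import math
  return math.sin(math.modf(x)[0] * 2.0 * math.pi)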
|
|
|
|
|
|
class TestTrigonometry(unittest.TestCase):
|
|
"""Tests for trigonometric instructions."""
|
|
|
|
def test_v_sin_f32_small(self):
|
|
"""V_SIN_F32 computes sin for small values."""
|
|
import math
|
|
# V_SIN_F32 takes its input in cycles, so 1.0 is one full period: expect ≈ sin(2π) = 0
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0),
|
|
v_sin_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][1])
|
|
expected = math.sin(1.0 * 2 * math.pi) # V_SIN_F32 expects input in cycles (0-1 = 0-2π)
|
|
self.assertAlmostEqual(result, expected, places=4)
|
|
|
|
def test_v_sin_f32_quarter(self):
|
|
"""V_SIN_F32 at 0.25 cycles = sin(π/2) = 1.0."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(0.25)), # 0.25 is not an inline constant, use f2i
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_sin_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][1])
|
|
self.assertAlmostEqual(result, 1.0, places=4)
|
|
|
|
def test_v_sin_f32_large(self):
|
|
"""V_SIN_F32 for large input value (132000.0)."""
|
|
import math
|
|
# 132000.0 comes from a failing OCML sin case (sin(132000.0 radians) ≈ 0.294).
# V_SIN_F32 takes its input in cycles, so here it computes sin(frac(132000.0) * 2π).
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(132000.0)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_sin_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][1])
|
|
# V_SIN_F32 applies frac() to its input internally. frac(132000.0) is exactly 0
# (132000.0 is an integer), so the result should be ≈ sin(0) = 0. The Python
# reference below also evaluates to ≈ 0 up to double-precision rounding of 2π,
# and the loose tolerance absorbs any remaining precision differences.
|
|
expected = math.sin(132000.0 * 2 * math.pi)
|
|
# Allow some tolerance due to precision differences
|
|
self.assertAlmostEqual(result, expected, places=2, msg=f"sin(132000) got {result}, expected ~{expected}")
|
|
|
|
|
|
class TestFMA(unittest.TestCase):
|
|
"""Tests for FMA instructions - key for OCML sin argument reduction."""
|
|
|
|
def test_v_fma_f32_basic(self):
|
|
"""V_FMA_F32: a*b+c basic case using inline constants only."""
|
|
# Inline float constants: 0.5, -0.5, 1.0, -1.0, 2.0, -2.0, 4.0, -4.0
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 2.0), # inline constant
|
|
v_mov_b32_e32(v[1], 4.0), # inline constant
|
|
v_mov_b32_e32(v[2], 1.0), # inline constant
|
|
v_fma_f32(v[3], v[0], v[1], v[2]), # 2*4+1 = 9
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][3]), 9.0, places=5)
|
|
|
|
def test_v_fma_f32_negative(self):
|
|
"""V_FMA_F32 with negative multiplier (used in sin reduction)."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], -2.0), # inline constant
|
|
v_mov_b32_e32(v[1], 4.0), # inline constant
|
|
v_mov_b32_e32(v[2], 1.0), # inline constant
|
|
v_fma_f32(v[3], v[0], v[1], v[2]), # -2*4+1 = -7
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][3]), -7.0, places=5)
|
|
|
|
def test_v_fmac_f32(self):
|
|
"""V_FMAC_F32: d = d + a*b using inline constants."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 2.0), # inline constant
|
|
v_mov_b32_e32(v[1], 4.0), # inline constant
|
|
v_mov_b32_e32(v[2], 1.0), # inline constant
|
|
v_fmac_f32_e32(v[2], v[0], v[1]), # v2 = v2 + v0*v1 = 1 + 2*4 = 9
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 9.0, places=5)
|
|
|
|
def test_v_fmaak_f32(self):
|
|
"""V_FMAAK_F32: d = a * b + K using inline constants."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 2.0), # inline constant
|
|
v_mov_b32_e32(v[1], 4.0), # inline constant
|
|
v_fmaak_f32_e32(v[2], v[0], v[1], 0x3f800000), # v2 = v0 * v1 + 1.0 = 2*4+1 = 9
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 9.0, places=5)
|
|
|
|
def test_v_fma_f32_with_sgpr(self):
|
|
"""V_FMA_F32: using SGPR for non-inline constant."""
|
|
# Use SGPR to load 3.0 which is not an inline constant
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(3.0)), # 3.0 via literal in SGPR
|
|
v_mov_b32_e32(v[0], 2.0), # inline constant
|
|
v_mov_b32_e32(v[1], s[0]), # 3.0 from SGPR
|
|
v_mov_b32_e32(v[2], 4.0), # inline constant
|
|
v_fma_f32(v[3], v[0], v[1], v[2]), # 2*3+4 = 10
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][3]), 10.0, places=5)
|
|
|
|
|
|
class TestRounding(unittest.TestCase):
|
|
"""Tests for rounding instructions - used in sin argument reduction."""
|
|
|
|
def test_v_rndne_f32_half_even(self):
|
|
"""V_RNDNE_F32 rounds to nearest even."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(2.5)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_rndne_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][1]), 2.0, places=5) # rounds to even
|
|
|
|
def test_v_rndne_f32_half_odd(self):
|
|
"""V_RNDNE_F32 rounds 3.5 to 4 (nearest even)."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(3.5)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_rndne_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][1]), 4.0, places=5)
|
|
|
|
def test_v_rndne_f32_large(self):
|
|
"""V_RNDNE_F32 with large value (like sin reduction uses)."""
|
|
# sin(1e5) reduction: 1e5 * (1/2pi) ≈ 15915.49...
|
|
val = 100000.0 * 0.15915494309189535 # 1/(2*pi)
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(val)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_rndne_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
expected = round(val) # Python's round does banker's rounding
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][1]), expected, places=0)
|
|
|
|
def test_v_floor_f32(self):
|
|
"""V_FLOOR_F32 floors to integer."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(3.7)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_floor_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][1]), 3.0, places=5)
|
|
|
|
def test_v_trunc_f32(self):
|
|
"""V_TRUNC_F32 truncates toward zero."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(-3.7)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_trunc_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][1]), -3.0, places=5)
|
|
|
|
def test_v_fract_f32(self):
|
|
"""V_FRACT_F32 returns fractional part."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(3.75)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_fract_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][1]), 0.75, places=5)
|
|
|
|
def test_v_fract_f32_large(self):
|
|
"""V_FRACT_F32 with large value - precision matters here."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(132000.25)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_fract_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][1])
|
|
# For large floats, fract precision degrades
|
|
self.assertGreaterEqual(result, 0.0)
|
|
self.assertLess(result, 1.0)
|
|
|
|
|
|
class TestConversion(unittest.TestCase):
|
|
"""Tests for conversion instructions."""
|
|
|
|
def test_v_cvt_i32_f32_positive(self):
|
|
"""V_CVT_I32_F32 converts float to signed int."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(42.7)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_cvt_i32_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 42)
|
|
|
|
def test_v_cvt_i32_f32_negative(self):
|
|
"""V_CVT_I32_F32 converts negative float to signed int."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(-42.7)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_cvt_i32_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# Result is signed, stored as unsigned
|
|
self.assertEqual(st.vgpr[0][1] & 0xffffffff, (-42) & 0xffffffff)
|
|
|
|
def test_v_cvt_i32_f32_large(self):
|
|
"""V_CVT_I32_F32 with large float (used in sin for quadrant)."""
|
|
# sin reduction converts round(x * 1/2pi) to int for quadrant selection
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(15915.0)), # ~1e5 / (2*pi)
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_cvt_i32_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 15915)
|
|
|
|
def test_v_cvt_f32_i32(self):
|
|
"""V_CVT_F32_I32 converts signed int to float."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 42),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_cvt_f32_i32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][1]), 42.0, places=5)
|
|
|
|
def test_v_cvt_f32_u32(self):
|
|
"""V_CVT_F32_U32 converts unsigned int to float."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xffffffff), # max u32
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_cvt_f32_u32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][1]), 4294967296.0, places=-5)
|
|
|
|
|
|
class TestBitManipulation(unittest.TestCase):
|
|
"""Tests for bit manipulation - used in sin for quadrant selection."""
|
|
|
|
def test_v_and_b32(self):
|
|
"""V_AND_B32 bitwise and."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xff),
|
|
s_mov_b32(s[1], 0x0f),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_and_b32_e32(v[1], s[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 0x0f)
|
|
|
|
def test_v_and_b32_quadrant(self):
|
|
"""V_AND_B32 for quadrant extraction (n & 3)."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 15915), # some large number
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_and_b32_e32(v[1], 3, v[0]), # n & 3 for quadrant
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 15915 & 3)
|
|
|
|
def test_v_lshrrev_b32(self):
|
|
"""V_LSHRREV_B32 logical shift right."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xff00),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_lshrrev_b32_e32(v[1], 8, v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 0xff)
|
|
|
|
def test_v_lshlrev_b32(self):
|
|
"""V_LSHLREV_B32 logical shift left."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xff),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_lshlrev_b32_e32(v[1], 8, v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 0xff00)
|
|
|
|
def test_v_xor_b32(self):
|
|
"""V_XOR_B32 bitwise xor (used in sin for sign)."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x80000000), # sign bit
|
|
s_mov_b32(s[1], f2i(1.0)),
|
|
v_mov_b32_e32(v[0], s[1]),
|
|
v_xor_b32_e32(v[1], s[0], v[0]), # flip sign
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][1]), -1.0, places=5)
|
|
|
|
|
|
class TestOCMLSinSequence(unittest.TestCase):
|
|
"""Test the specific instruction sequence used in OCML sin."""
|
|
|
|
def test_sin_reduction_step1_mul(self):
|
|
"""First step: v12 = |x| * (1/2pi)."""
|
|
import math
|
|
one_over_2pi = 1.0 / (2.0 * math.pi) # 0x3e22f983 in hex
|
|
x = 100000.0
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(x)),
|
|
s_mov_b32(s[1], f2i(one_over_2pi)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mul_f32_e32(v[1], s[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][1])
|
|
expected = x * one_over_2pi
|
|
self.assertAlmostEqual(result, expected, places=0)
|
|
|
|
def test_sin_reduction_step2_round(self):
|
|
"""Second step: round to nearest integer."""
|
|
import math
|
|
one_over_2pi = 1.0 / (2.0 * math.pi)
|
|
x = 100000.0
|
|
val = x * one_over_2pi # ~15915.49
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(val)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_rndne_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][1])
|
|
expected = round(val)
|
|
self.assertAlmostEqual(result, expected, places=0)
|
|
|
|
def test_sin_reduction_step3_fma(self):
|
|
"""Third step: x - n * (pi/2) via FMA."""
|
|
import math
|
|
# This is where precision matters - the FMA does: |x| + (-pi/2) * n
|
|
neg_half_pi = -math.pi / 2.0 # 0xbfc90fdb as f32
|
|
x = 100000.0
|
|
n = 15915.0
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(neg_half_pi)),
|
|
s_mov_b32(s[1], f2i(n)),
|
|
s_mov_b32(s[2], f2i(x)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
v_fma_f32(v[3], v[0], v[1], v[2]), # x + (-pi/2) * n
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][3])
|
|
expected = x + neg_half_pi * n
|
|
# Allow some tolerance due to float precision
|
|
self.assertAlmostEqual(result, expected, places=2)
|
|
|
|
def test_sin_1e5_full_reduction(self):
|
|
"""Full reduction sequence for sin(1e5)."""
|
|
import math
|
|
x = 100000.0
|
|
one_over_2pi = 1.0 / (2.0 * math.pi)
|
|
neg_half_pi = -math.pi / 2.0
|
|
|
|
instructions = [
|
|
# Load constants
|
|
s_mov_b32(s[0], f2i(x)),
|
|
s_mov_b32(s[1], f2i(one_over_2pi)),
|
|
s_mov_b32(s[2], f2i(neg_half_pi)),
|
|
# Step 1: v1 = x * (1/2pi)
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mul_f32_e32(v[1], s[1], v[0]),
|
|
# Step 2: v2 = round(v1)
|
|
v_rndne_f32_e32(v[2], v[1]),
|
|
# Step 3: v3 = x + (-pi/2) * round_val (FMA)
|
|
v_fma_f32(v[3], s[2], v[2], v[0]),
|
|
# Step 4: convert to int for quadrant
|
|
v_cvt_i32_f32_e32(v[4], v[2]),
|
|
# Step 5: quadrant = n & 3
|
|
v_and_b32_e32(v[5], 3, v[4]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
|
|
# Check intermediate values
|
|
mul_result = i2f(st.vgpr[0][1])
|
|
round_result = i2f(st.vgpr[0][2])
|
|
reduced = i2f(st.vgpr[0][3])
|
|
quadrant = st.vgpr[0][5]
|
|
|
|
# Verify results match expected
|
|
expected_mul = x * one_over_2pi
|
|
expected_round = round(expected_mul)
|
|
expected_reduced = x + neg_half_pi * expected_round
|
|
expected_quadrant = int(expected_round) & 3
|
|
|
|
self.assertAlmostEqual(mul_result, expected_mul, places=0, msg=f"mul: got {mul_result}, expected {expected_mul}")
|
|
self.assertAlmostEqual(round_result, expected_round, places=0, msg=f"round: got {round_result}, expected {expected_round}")
|
|
self.assertEqual(quadrant, expected_quadrant, f"quadrant: got {quadrant}, expected {expected_quadrant}")
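
# Reference for V_MAD_U64_U32 as tested by TestMad64 below: a full 32x32 -> 64-bit multiply
# plus a 64-bit addend, wrapped to 64 bits (sketch of D = S0 * S1 + S2 from the docstring).
def _mad_u64_u32(s0: int, s1: int, s2: int) -> int: return (s0 * s1 + s2) & 0xFFFFFFFFFFFFFFFF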
|
|
|
|
|
|
class TestMad64(unittest.TestCase):
|
|
"""Tests for V_MAD_U64_U32 - critical for OCML Payne-Hanek sin reduction."""
|
|
|
|
def test_v_mad_u64_u32_simple(self):
|
|
"""V_MAD_U64_U32: D = S0 * S1 + S2 (64-bit result)."""
|
|
# 3 * 4 + 5 = 17
|
|
instructions = [
|
|
s_mov_b32(s[0], 3),
|
|
s_mov_b32(s[1], 4),
|
|
v_mov_b32_e32(v[2], 5), # S2 lo
|
|
v_mov_b32_e32(v[3], 0), # S2 hi
|
|
v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]), # result in v[4:5]
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result_lo = st.vgpr[0][4]
|
|
result_hi = st.vgpr[0][5]
|
|
result = result_lo | (result_hi << 32)
|
|
self.assertEqual(result, 17)
|
|
|
|
def test_v_mad_u64_u32_large_mult(self):
|
|
"""V_MAD_U64_U32 with large values that overflow 32 bits."""
|
|
# 0x80000000 * 2 + 0 = 0x100000000
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x80000000),
|
|
s_mov_b32(s[1], 2),
|
|
v_mov_b32_e32(v[2], 0),
|
|
v_mov_b32_e32(v[3], 0),
|
|
v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result_lo = st.vgpr[0][4]
|
|
result_hi = st.vgpr[0][5]
|
|
result = result_lo | (result_hi << 32)
|
|
self.assertEqual(result, 0x100000000)
|
|
|
|
def test_v_mad_u64_u32_with_add(self):
|
|
"""V_MAD_U64_U32 with 64-bit addend."""
|
|
# 1000 * 1000 + 0x100000000 = 1000000 + 0x100000000 = 0x1000F4240
|
|
instructions = [
|
|
s_mov_b32(s[0], 1000),
|
|
s_mov_b32(s[1], 1000),
|
|
v_mov_b32_e32(v[2], 0), # S2 lo
|
|
v_mov_b32_e32(v[3], 1), # S2 hi = 0x100000000
|
|
v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result_lo = st.vgpr[0][4]
|
|
result_hi = st.vgpr[0][5]
|
|
result = result_lo | (result_hi << 32)
|
|
expected = 1000 * 1000 + 0x100000000
|
|
self.assertEqual(result, expected)
|
|
|
|
def test_v_mad_u64_u32_max_values(self):
|
|
"""V_MAD_U64_U32 with max u32 values."""
|
|
# 0xFFFFFFFF * 0xFFFFFFFF + 0 = 0xFFFFFFFE00000001
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xFFFFFFFF),
|
|
s_mov_b32(s[1], 0xFFFFFFFF),
|
|
v_mov_b32_e32(v[2], 0),
|
|
v_mov_b32_e32(v[3], 0),
|
|
v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result_lo = st.vgpr[0][4]
|
|
result_hi = st.vgpr[0][5]
|
|
result = result_lo | (result_hi << 32)
|
|
expected = 0xFFFFFFFF * 0xFFFFFFFF
|
|
self.assertEqual(result, expected)
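
# Pure-Python equivalents of the count-leading/trailing-zeros behaviour checked by TestClz
# and TestCtz below (both return -1, stored as 0xFFFFFFFF, when the input is 0):
def _clz32(x: int) -> int: return 32 - x.bit_length() if x else -1
def _ctz32(x: int) -> int: return (x & -x).bit_length() - 1 if x else -1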
|
|
|
|
|
|
class TestClz(unittest.TestCase):
|
|
"""Tests for V_CLZ_I32_U32 - count leading zeros, used in Payne-Hanek."""
|
|
|
|
def test_v_clz_i32_u32_zero(self):
|
|
"""V_CLZ_I32_U32 of 0 returns -1 (all bits are 0)."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0),
|
|
v_clz_i32_u32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# -1 as unsigned 32-bit
|
|
self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF)
|
|
|
|
def test_v_clz_i32_u32_one(self):
|
|
"""V_CLZ_I32_U32 of 1 returns 31 (31 leading zeros)."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1),
|
|
v_clz_i32_u32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 31)
|
|
|
|
def test_v_clz_i32_u32_msb_set(self):
|
|
"""V_CLZ_I32_U32 of 0x80000000 returns 0 (no leading zeros)."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x80000000),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_clz_i32_u32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 0)
|
|
|
|
def test_v_clz_i32_u32_half(self):
|
|
"""V_CLZ_I32_U32 of 0x8000 (bit 15) returns 16."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x8000),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_clz_i32_u32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 16)
|
|
|
|
def test_v_clz_i32_u32_all_ones(self):
|
|
"""V_CLZ_I32_U32 of 0xFFFFFFFF returns 0."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xFFFFFFFF),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_clz_i32_u32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 0)
|
|
|
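# Illustrative only: the CLZ convention exercised above, as a pure-Python sketch.
# _ref_clz_u32 is hypothetical (not an emulator helper): 0 maps to -1 (stored as 0xFFFFFFFF
# in the destination register), any other value maps to its count of leading zero bits.
def _ref_clz_u32(x: int) -> int:
  if x == 0: return -1
  return 32 - (x & 0xFFFFFFFF).bit_length()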
|
|
|
class TestCtz(unittest.TestCase):
|
|
"""Tests for V_CTZ_I32_B32 - count trailing zeros."""
|
|
|
|
def test_v_ctz_i32_b32_zero(self):
|
|
"""V_CTZ_I32_B32 of 0 returns -1 (all bits are 0)."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0),
|
|
v_ctz_i32_b32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF)
|
|
|
|
def test_v_ctz_i32_b32_one(self):
|
|
"""V_CTZ_I32_B32 of 1 returns 0 (no trailing zeros)."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1),
|
|
v_ctz_i32_b32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 0)
|
|
|
|
def test_v_ctz_i32_b32_msb_set(self):
|
|
"""V_CTZ_I32_B32 of 0x80000000 returns 31."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x80000000),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_ctz_i32_b32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 31)
|
|
|
|
def test_v_ctz_i32_b32_half(self):
|
|
"""V_CTZ_I32_B32 of 0x8000 (bit 15) returns 15."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x8000),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_ctz_i32_b32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 15)
|
|
|
|
def test_v_ctz_i32_b32_all_ones(self):
|
|
"""V_CTZ_I32_B32 of 0xFFFFFFFF returns 0."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xFFFFFFFF),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_ctz_i32_b32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 0)
|
|
|
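# Illustrative only: the matching CTZ convention, again as a hypothetical pure-Python sketch
# (_ref_ctz_u32 is not an emulator helper): 0 maps to -1, otherwise count trailing zero bits.
def _ref_ctz_u32(x: int) -> int:
  if x == 0: return -1
  x &= 0xFFFFFFFF
  return (x & -x).bit_length() - 1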
|
|
|
class TestDivision(unittest.TestCase):
|
|
"""Tests for division instructions - V_RCP, V_DIV_SCALE, V_DIV_FMAS, V_DIV_FIXUP."""
|
|
|
|
def test_v_rcp_f32_normal(self):
|
|
"""V_RCP_F32 of 2.0 returns 0.5."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 2.0),
|
|
v_rcp_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][1]), 0.5, places=5)
|
|
|
|
def test_v_rcp_f32_inf(self):
|
|
"""V_RCP_F32 of +inf returns 0."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x7f800000), # +inf
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_rcp_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(i2f(st.vgpr[0][1]), 0.0)
|
|
|
|
def test_v_rcp_f32_neg_inf(self):
|
|
"""V_RCP_F32 of -inf returns -0."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xff800000), # -inf
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_rcp_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][1])
|
|
self.assertEqual(result, 0.0)
|
|
# Check it's negative zero
|
|
self.assertEqual(st.vgpr[0][1], 0x80000000)
|
|
|
|
def test_v_rcp_f32_zero(self):
|
|
"""V_RCP_F32 of 0 returns +inf."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0),
|
|
v_rcp_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
import math
|
|
self.assertTrue(math.isinf(i2f(st.vgpr[0][1])))
|
|
|
|
def test_v_div_fixup_f32_normal(self):
|
|
"""V_DIV_FIXUP_F32 normal division 1.0/2.0."""
|
|
# S0 = approximation (from rcp * scale), S1 = denominator, S2 = numerator
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(0.5)), # approximation
|
|
s_mov_b32(s[1], f2i(2.0)), # denominator
|
|
s_mov_b32(s[2], f2i(1.0)), # numerator
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_div_fixup_f32(v[1], v[0], s[1], s[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertAlmostEqual(i2f(st.vgpr[0][1]), 0.5, places=5)
|
|
|
|
def test_v_div_fixup_f32_one_div_inf(self):
|
|
"""V_DIV_FIXUP_F32: 1.0 / +inf = 0."""
|
|
# For x/inf: S0=approx(~0), S1=inf, S2=x
|
|
instructions = [
|
|
s_mov_b32(s[0], 0), # approximation (rcp of inf = 0)
|
|
s_mov_b32(s[1], 0x7f800000), # denominator = +inf
|
|
s_mov_b32(s[2], f2i(1.0)), # numerator = 1.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_div_fixup_f32(v[1], v[0], s[1], s[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(i2f(st.vgpr[0][1]), 0.0)
|
|
|
|
def test_v_div_fixup_f32_one_div_neg_inf(self):
|
|
"""V_DIV_FIXUP_F32: 1.0 / -inf = -0."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x80000000), # approximation (rcp of -inf = -0)
|
|
s_mov_b32(s[1], 0xff800000), # denominator = -inf
|
|
s_mov_b32(s[2], f2i(1.0)), # numerator = 1.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_div_fixup_f32(v[1], v[0], s[1], s[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][1], 0x80000000) # -0.0
|
|
|
|
def test_v_div_fixup_f32_inf_div_inf(self):
|
|
"""V_DIV_FIXUP_F32: inf / inf = NaN."""
|
|
import math
|
|
instructions = [
|
|
s_mov_b32(s[0], 0), # approximation
|
|
s_mov_b32(s[1], 0x7f800000), # denominator = +inf
|
|
s_mov_b32(s[2], 0x7f800000), # numerator = +inf
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_div_fixup_f32(v[1], v[0], s[1], s[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertTrue(math.isnan(i2f(st.vgpr[0][1])))
|
|
|
|
def test_v_div_fixup_f32_zero_div_zero(self):
|
|
"""V_DIV_FIXUP_F32: 0 / 0 = NaN."""
|
|
import math
|
|
instructions = [
|
|
s_mov_b32(s[0], 0), # approximation
|
|
s_mov_b32(s[1], 0), # denominator = 0
|
|
s_mov_b32(s[2], 0), # numerator = 0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_div_fixup_f32(v[1], v[0], s[1], s[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertTrue(math.isnan(i2f(st.vgpr[0][1])))
|
|
|
|
def test_v_div_fixup_f32_x_div_zero(self):
|
|
"""V_DIV_FIXUP_F32: 1.0 / 0 = +inf."""
|
|
import math
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x7f800000), # approximation (rcp of 0 = inf)
|
|
s_mov_b32(s[1], 0), # denominator = 0
|
|
s_mov_b32(s[2], f2i(1.0)), # numerator = 1.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_div_fixup_f32(v[1], v[0], s[1], s[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][1])
|
|
self.assertTrue(math.isinf(result) and result > 0)
|
|
|
|
def test_v_div_fixup_f32_neg_x_div_zero(self):
|
|
"""V_DIV_FIXUP_F32: -1.0 / 0 = -inf."""
|
|
import math
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xff800000), # approximation (rcp of 0 = inf, with sign)
|
|
s_mov_b32(s[1], 0), # denominator = 0
|
|
s_mov_b32(s[2], f2i(-1.0)), # numerator = -1.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_div_fixup_f32(v[1], v[0], s[1], s[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][1])
|
|
self.assertTrue(math.isinf(result) and result < 0)
|
|
|
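# Illustrative only: a rough pure-Python sketch of the V_DIV_FIXUP_F32 special cases the tests
# above rely on (NaN for 0/0 and inf/inf, signed zero for x/inf, signed inf for x/0).
# _ref_div_fixup_f32 is hypothetical and drops the quotient-approximation operand that the real
# instruction consumes for normal inputs.
def _ref_div_fixup_f32(num: float, den: float) -> float:
  import math
  if (num == 0.0 and den == 0.0) or (math.isinf(num) and math.isinf(den)): return math.nan
  sign = math.copysign(1.0, num) * math.copysign(1.0, den)
  if math.isinf(den): return math.copysign(0.0, sign)
  if den == 0.0: return math.copysign(math.inf, sign)
  return num / den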
|
|
|
class TestSpecialValues(unittest.TestCase):
|
|
"""Tests for special float values - inf, nan, zero handling."""
|
|
|
|
def test_v_mul_f32_zero_times_inf(self):
|
|
"""V_MUL_F32: 0 * inf = NaN."""
|
|
import math
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0),
|
|
s_mov_b32(s[0], 0x7f800000), # +inf
|
|
v_mov_b32_e32(v[1], s[0]),
|
|
v_mul_f32_e32(v[2], v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertTrue(math.isnan(i2f(st.vgpr[0][2])))
|
|
|
|
def test_v_add_f32_inf_minus_inf(self):
|
|
"""V_ADD_F32: inf + (-inf) = NaN."""
|
|
import math
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x7f800000), # +inf
|
|
s_mov_b32(s[1], 0xff800000), # -inf
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_add_f32_e32(v[2], v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertTrue(math.isnan(i2f(st.vgpr[0][2])))
|
|
|
|
def test_v_fma_f32_with_inf(self):
|
|
"""V_FMA_F32: 1.0 * inf + 0 = inf."""
|
|
import math
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0),
|
|
s_mov_b32(s[0], 0x7f800000), # +inf
|
|
v_mov_b32_e32(v[1], s[0]),
|
|
v_mov_b32_e32(v[2], 0),
|
|
v_fma_f32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][3])
|
|
self.assertTrue(math.isinf(result) and result > 0)
|
|
|
|
def test_v_exp_f32_large_negative(self):
|
|
"""V_EXP_F32 of large negative value (2^-100) returns very small number."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(-100.0)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_exp_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# V_EXP_F32 computes 2^x, so 2^-100 is ~7.9e-31 (very small but not 0)
|
|
result = i2f(st.vgpr[0][1])
|
|
self.assertLess(result, 1e-20) # Just verify it's very small
|
|
|
|
def test_v_exp_f32_large_positive(self):
|
|
"""V_EXP_F32 of large positive value (2^100) returns very large number."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(100.0)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_exp_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# V_EXP_F32 computes 2^x, so 2^100 is ~1.27e30 (very large)
|
|
result = i2f(st.vgpr[0][1])
|
|
self.assertGreater(result, 1e20) # Just verify it's very large
|
|
|
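# Illustrative only: V_EXP_F32 in the tests above is a base-2 exponential, so the reference is
# 2**x rather than math.exp(x). _ref_exp2_f32 is a hypothetical helper, not the emulator's
# implementation.
def _ref_exp2_f32(x: float) -> float:
  return 2.0 ** x  # e.g. 2**-100 ~= 7.9e-31, 2**100 ~= 1.27e30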
|
|
|
class TestF16Conversions(unittest.TestCase):
|
|
"""Tests for f16 conversion and packing instructions."""
|
|
|
|
def test_v_cvt_f16_f32_basic(self):
|
|
"""V_CVT_F16_F32 converts f32 to f16 in low 16 bits."""
|
|
from extra.assembly.amd.pcode import _f16
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0), # f32 1.0 = 0x3f800000
|
|
v_cvt_f16_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1]
|
|
# f16 1.0 = 0x3c00, should be in low 16 bits
|
|
lo_bits = result & 0xffff
|
|
self.assertEqual(lo_bits, 0x3c00, f"Expected 0x3c00, got 0x{lo_bits:04x}")
|
|
|
|
def test_v_cvt_f16_f32_negative(self):
|
|
"""V_CVT_F16_F32 converts negative f32 to f16."""
|
|
from extra.assembly.amd.pcode import _f16
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], -2.0), # f32 -2.0 = 0xc0000000
|
|
v_cvt_f16_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1]
|
|
lo_bits = result & 0xffff
|
|
# f16 -2.0 = 0xc000
|
|
self.assertEqual(lo_bits, 0xc000, f"Expected 0xc000, got 0x{lo_bits:04x}")
|
|
|
|
def test_v_cvt_f16_f32_small(self):
|
|
"""V_CVT_F16_F32 converts small f32 value."""
|
|
from extra.assembly.amd.pcode import _f16, f32_to_f16
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0.5),
|
|
v_cvt_f16_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1]
|
|
lo_bits = result & 0xffff
|
|
expected = f32_to_f16(0.5) # Should be 0x3800
|
|
self.assertEqual(lo_bits, expected, f"Expected 0x{expected:04x}, got 0x{lo_bits:04x}")
|
|
|
|
def test_v_cvt_f16_f32_preserves_high_bits(self):
|
|
"""V_CVT_F16_F32 preserves high 16 bits of destination.
|
|
|
|
Hardware verified: V_CVT_F16_F32 only writes to the low 16 bits of the
|
|
destination register, preserving the high 16 bits. This is important for
|
|
the common pattern of converting two f32 values and packing them.
|
|
"""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xdead0000), # Pre-fill with garbage in high bits
|
|
v_mov_b32_e32(v[1], s[0]),
|
|
v_mov_b32_e32(v[0], 1.0),
|
|
v_cvt_f16_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1]
|
|
hi_bits = (result >> 16) & 0xffff
|
|
lo_bits = result & 0xffff
|
|
self.assertEqual(lo_bits, 0x3c00, f"Low bits should be 0x3c00, got 0x{lo_bits:04x}")
|
|
self.assertEqual(hi_bits, 0xdead, f"High bits should be preserved as 0xdead, got 0x{hi_bits:04x}")
|
|
|
|
def test_v_cvt_f16_f32_same_src_dst_preserves_high_bits(self):
|
|
"""V_CVT_F16_F32 with same src/dst preserves high bits of source.
|
|
|
|
Regression test: When converting v0 in-place (v_cvt_f16_f32 v0, v0),
|
|
the high 16 bits of the original f32 value are preserved in the result.
|
|
For f32 1.0 (0x3f800000), the result should be 0x3f803c00:
|
|
- Low 16 bits: 0x3c00 (f16 1.0)
|
|
- High 16 bits: 0x3f80 (preserved from original f32)
|
|
"""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1.0), # v0 = 0x3f800000
|
|
v_cvt_f16_f32_e32(v[0], v[0]), # convert v0 in-place
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][0]
|
|
# Hardware preserves high bits: 0x3f800000 -> 0x3f803c00
|
|
self.assertEqual(result, 0x3f803c00, f"Expected 0x3f803c00, got 0x{result:08x}")
|
|
|
|
def test_v_cvt_f16_f32_reads_full_32bit_source(self):
|
|
"""V_CVT_F16_F32 must read full 32-bit f32 source, not just low 16 bits.
|
|
|
|
Regression test for a bug where V_CVT_F16_F32 was incorrectly treated as having
|
|
a 16-bit source because '_F16' is in the instruction name. The CVT naming convention
|
|
is V_CVT_DST_SRC, so V_CVT_F16_F32 has a 32-bit f32 source and 16-bit f16 destination.
|
|
|
|
The bug caused the emulator to only read the low 16 bits of the source register,
|
|
which would produce wrong results when the significant bits of the f32 value are
|
|
in the upper bits (as they are for most f32 values > 1.0 or < -1.0).
|
|
"""
|
|
from extra.assembly.amd.pcode import _f16
|
|
# Use f32 value 1.5 = 0x3fc00000. If only low 16 bits (0x0000) are read, result is wrong.
|
|
# Correct f16 result: 0x3e00 (1.5 in half precision)
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x3fc00000), # f32 1.5
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_cvt_f16_f32_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1]
|
|
lo_bits = result & 0xffff
|
|
# f16(1.5) = 0x3e00
|
|
self.assertEqual(lo_bits, 0x3e00, f"Expected f16(1.5)=0x3e00, got 0x{lo_bits:04x} ({_f16(lo_bits)})")
|
|
|
|
def test_v_cvt_f16_f32_then_pack_for_wmma(self):
|
|
"""Regression test: f32->f16 conversion followed by pack for WMMA input.
|
|
|
|
This sequence is used in fused fp16 GEMM kernels where f32 data is loaded,
|
|
converted to f16, packed into pairs, and fed to WMMA instructions.
|
|
|
|
The bug was: V_CVT_F16_F32 was treated as having 16-bit source (because '_F16'
|
|
is in the name), causing it to read only low 16 bits of the f32 input.
|
|
This resulted in WMMA receiving zero inputs and producing zero outputs.
|
|
"""
|
|
from extra.assembly.amd.pcode import _f16
|
|
# Simulate loading two f32 values and converting/packing for WMMA
|
|
# f32 1.5 = 0x3fc00000, f32 2.5 = 0x40200000
|
|
# After CVT: f16 1.5 = 0x3e00, f16 2.5 = 0x4100
|
|
# After PACK: 0x41003e00 (hi=2.5, lo=1.5)
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x3fc00000), # f32 1.5
|
|
s_mov_b32(s[1], 0x40200000), # f32 2.5
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_cvt_f16_f32_e32(v[2], v[0]), # v2 = f16(1.5) = 0x3e00
|
|
v_cvt_f16_f32_e32(v[3], v[1]), # v3 = f16(2.5) = 0x4100
|
|
v_pack_b32_f16(v[4], v[2], v[3]), # v4 = pack(v2, v3) = 0x41003e00
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
|
|
# Check intermediate CVT results
|
|
v2_lo = st.vgpr[0][2] & 0xffff
|
|
v3_lo = st.vgpr[0][3] & 0xffff
|
|
self.assertEqual(v2_lo, 0x3e00, f"v2 should be f16(1.5)=0x3e00, got 0x{v2_lo:04x} ({_f16(v2_lo)})")
|
|
self.assertEqual(v3_lo, 0x4100, f"v3 should be f16(2.5)=0x4100, got 0x{v3_lo:04x} ({_f16(v3_lo)})")
|
|
|
|
# Check packed result
|
|
result = st.vgpr[0][4]
|
|
self.assertEqual(result, 0x41003e00, f"Expected packed 0x41003e00, got 0x{result:08x}")
|
|
|
|
def test_v_pack_b32_f16_basic(self):
|
|
"""V_PACK_B32_F16 packs two f16 values into one 32-bit register."""
|
|
from extra.assembly.amd.pcode import _f16
|
|
instructions = [
|
|
# First convert two f32 values to f16
|
|
v_mov_b32_e32(v[0], 1.0), # Will become f16 0x3c00
|
|
v_mov_b32_e32(v[2], -2.0), # Will become f16 0xc000
|
|
v_cvt_f16_f32_e32(v[1], v[0]), # v1 low = 0x3c00
|
|
v_cvt_f16_f32_e32(v[3], v[2]), # v3 low = 0xc000
|
|
# Now pack them: v4 = (v3.f16 << 16) | v1.f16
|
|
v_pack_b32_f16(v[4], v[1], v[3]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][4]
|
|
lo_bits = result & 0xffff
|
|
hi_bits = (result >> 16) & 0xffff
|
|
# Expected: lo=0x3c00 (1.0), hi=0xc000 (-2.0)
|
|
self.assertEqual(lo_bits, 0x3c00, f"Lo should be 0x3c00 (1.0), got 0x{lo_bits:04x} ({_f16(lo_bits)})")
|
|
self.assertEqual(hi_bits, 0xc000, f"Hi should be 0xc000 (-2.0), got 0x{hi_bits:04x} ({_f16(hi_bits)})")
|
|
|
|
def test_v_pack_b32_f16_both_positive(self):
|
|
"""V_PACK_B32_F16 packs two positive f16 values."""
|
|
from extra.assembly.amd.pcode import _f16
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0.5), # f16 0x3800
|
|
v_mov_b32_e32(v[2], 2.0), # f16 0x4000
|
|
v_cvt_f16_f32_e32(v[1], v[0]),
|
|
v_cvt_f16_f32_e32(v[3], v[2]),
|
|
v_pack_b32_f16(v[4], v[1], v[3]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][4]
|
|
lo_bits = result & 0xffff
|
|
hi_bits = (result >> 16) & 0xffff
|
|
self.assertEqual(lo_bits, 0x3800, f"Lo should be 0x3800 (0.5), got 0x{lo_bits:04x}")
|
|
self.assertEqual(hi_bits, 0x4000, f"Hi should be 0x4000 (2.0), got 0x{hi_bits:04x}")
|
|
|
|
def test_v_pack_b32_f16_zeros(self):
|
|
"""V_PACK_B32_F16 packs two zero values."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0),
|
|
v_mov_b32_e32(v[2], 0),
|
|
v_cvt_f16_f32_e32(v[1], v[0]),
|
|
v_cvt_f16_f32_e32(v[3], v[2]),
|
|
v_pack_b32_f16(v[4], v[1], v[3]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][4]
|
|
self.assertEqual(result, 0, f"Expected 0x00000000, got 0x{result:08x}")
|
|
|
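# Illustrative only: an independent way to compute the f16 bit patterns quoted in the tests
# above (0x3c00 for 1.0, 0x3e00 for 1.5, ...), using struct's half-precision codec. This is a
# cross-check sketch, not a replacement for pcode's f32_to_f16, and it does not model the
# instruction's high-bit-preserving destination behaviour.
def _f16_bits(x: float) -> int:
  return struct.unpack('<H', struct.pack('<e', x))[0]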
|
|
|
class TestPackInstructions(unittest.TestCase):
|
|
"""Tests for pack instructions."""
|
|
|
|
def test_v_pack_b32_f16(self):
|
|
"""V_PACK_B32_F16 packs two f16 values into one 32-bit register."""
|
|
instructions = []
|
|
# f16 1.0 = 0x3c00, f16 2.0 = 0x4000
|
|
instructions.append(s_mov_b32(s[0], 0x3c00)) # f16 1.0
|
|
instructions.append(s_mov_b32(s[1], 0x4000)) # f16 2.0
|
|
instructions.append(v_mov_b32_e32(v[0], s[0]))
|
|
instructions.append(v_mov_b32_e32(v[1], s[1]))
|
|
# Pack: v[2] = (v[1].f16 << 16) | v[0].f16
|
|
instructions.append(v_pack_b32_f16(v[2], v[0], v[1]))
|
|
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2]
|
|
# Expected: hi=0x4000 (2.0), lo=0x3c00 (1.0) -> 0x40003c00
|
|
self.assertEqual(result, 0x40003c00, f"Expected 0x40003c00, got 0x{result:08x}")
|
|
|
|
def test_v_pack_b32_f16_with_cvt(self):
|
|
"""V_PACK_B32_F16 after V_CVT_F16_F32 conversions."""
|
|
instructions = []
|
|
# f32 1.0 = 0x3f800000
|
|
instructions.append(s_mov_b32(s[0], 0x3f800000))
|
|
instructions.append(v_mov_b32_e32(v[0], s[0])) # f32 1.0
|
|
instructions.append(v_mov_b32_e32(v[1], s[0])) # f32 1.0
|
|
# Convert to f16
|
|
instructions.append(v_cvt_f16_f32_e32(v[2], v[0])) # v[2].f16 = 1.0
|
|
instructions.append(v_cvt_f16_f32_e32(v[3], v[1])) # v[3].f16 = 1.0
|
|
# Pack
|
|
instructions.append(v_pack_b32_f16(v[4], v[2], v[3]))
|
|
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][4]
|
|
# Expected: 0x3c003c00 (two f16 1.0 values)
|
|
self.assertEqual(result, 0x3c003c00, f"Expected 0x3c003c00, got 0x{result:08x}")
|
|
|
|
def test_v_pack_b32_f16_packed_sources(self):
|
|
"""V_PACK_B32_F16 with sources that have packed f16 pairs (both hi and lo used).
|
|
This mimics what happens in matmul kernels where VGPRs contain packed f16 data.
|
|
"""
|
|
instructions = []
|
|
# v0 = 0x40003c00 (hi=f16 2.0, lo=f16 1.0)
|
|
# v1 = 0x44004200 (hi=f16 4.0, lo=f16 3.0)
|
|
# V_PACK_B32_F16 with default opsel=0 reads low halves from each source
|
|
# Result should be: hi=v1.lo=0x4200 (3.0), lo=v0.lo=0x3c00 (1.0) -> 0x42003c00
|
|
instructions.append(s_mov_b32(s[0], 0x40003c00)) # packed: hi=2.0, lo=1.0
|
|
instructions.append(s_mov_b32(s[1], 0x44004200)) # packed: hi=4.0, lo=3.0
|
|
instructions.append(v_mov_b32_e32(v[0], s[0]))
|
|
instructions.append(v_mov_b32_e32(v[1], s[1]))
|
|
instructions.append(v_pack_b32_f16(v[2], v[0], v[1]))
|
|
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2]
|
|
# Expected: hi=0x4200 (3.0), lo=0x3c00 (1.0) -> 0x42003c00
|
|
self.assertEqual(result, 0x42003c00, f"Expected 0x42003c00, got 0x{result:08x}")
|
|
|
|
def test_v_pack_b32_f16_opsel_hi_hi(self):
|
|
"""V_PACK_B32_F16 with opsel=0b0011 to read high halves from both sources.
|
|
This is used when extracting the high f16 values from packed registers.
|
|
"""
|
|
# v0 = 0x40003c00 (hi=f16 2.0, lo=f16 1.0)
|
|
# v1 = 0x44004200 (hi=f16 4.0, lo=f16 3.0)
|
|
# With opsel=0b0011: read hi from v0 (0x4000=2.0) and hi from v1 (0x4400=4.0)
|
|
# Result should be: hi=v1.hi=0x4400 (4.0), lo=v0.hi=0x4000 (2.0) -> 0x44004000
|
|
inst = v_pack_b32_f16(v[2], v[0], v[1])
|
|
inst._values['opsel'] = 0b0011 # opsel[0]=1 for src0 hi, opsel[1]=1 for src1 hi
|
|
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x40003c00), # packed: hi=2.0, lo=1.0
|
|
s_mov_b32(s[1], 0x44004200), # packed: hi=4.0, lo=3.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
inst,
|
|
]
|
|
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2]
|
|
# Expected: hi=0x4400 (4.0), lo=0x4000 (2.0) -> 0x44004000
|
|
self.assertEqual(result, 0x44004000, f"Expected 0x44004000, got 0x{result:08x}")
|
|
|
|
def test_v_pack_b32_f16_opsel_lo_hi(self):
|
|
"""V_PACK_B32_F16 with opsel=0b0010 to read lo from src0, hi from src1."""
|
|
# v0 = 0x40003c00 (hi=f16 2.0, lo=f16 1.0)
|
|
# v1 = 0x44004200 (hi=f16 4.0, lo=f16 3.0)
|
|
# With opsel=0b0010: read lo from v0 (0x3c00=1.0), hi from v1 (0x4400=4.0)
|
|
# Result should be: hi=v1.hi=0x4400 (4.0), lo=v0.lo=0x3c00 (1.0) -> 0x44003c00
|
|
inst = v_pack_b32_f16(v[2], v[0], v[1])
|
|
inst._values['opsel'] = 0b0010 # opsel[0]=0 for src0 lo, opsel[1]=1 for src1 hi
|
|
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x40003c00),
|
|
s_mov_b32(s[1], 0x44004200),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
inst,
|
|
]
|
|
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2]
|
|
# Expected: hi=0x4400 (4.0), lo=0x3c00 (1.0) -> 0x44003c00
|
|
self.assertEqual(result, 0x44003c00, f"Expected 0x44003c00, got 0x{result:08x}")
|
|
|
|
def test_v_pack_b32_f16_opsel_hi_lo(self):
|
|
"""V_PACK_B32_F16 with opsel=0b0001 to read hi from src0, lo from src1."""
|
|
# v0 = 0x40003c00 (hi=f16 2.0, lo=f16 1.0)
|
|
# v1 = 0x44004200 (hi=f16 4.0, lo=f16 3.0)
|
|
# With opsel=0b0001: read hi from v0 (0x4000=2.0), lo from v1 (0x4200=3.0)
|
|
# Result should be: hi=v1.lo=0x4200 (3.0), lo=v0.hi=0x4000 (2.0) -> 0x42004000
|
|
inst = v_pack_b32_f16(v[2], v[0], v[1])
|
|
inst._values['opsel'] = 0b0001 # opsel[0]=1 for src0 hi, opsel[1]=0 for src1 lo
|
|
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x40003c00),
|
|
s_mov_b32(s[1], 0x44004200),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
inst,
|
|
]
|
|
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2]
|
|
# Expected: hi=0x4200 (3.0), lo=0x4000 (2.0) -> 0x42004000
|
|
self.assertEqual(result, 0x42004000, f"Expected 0x42004000, got 0x{result:08x}")
|
|
|
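# Illustrative only: how the V_PACK_B32_F16 opsel variants above combine the 16-bit halves.
# _ref_pack_b32_f16 is a hypothetical model, not the emulator: opsel bit 0 picks the hi half of
# src0, opsel bit 1 picks the hi half of src1, and the result is {selected src1 half, selected src0 half}.
def _ref_pack_b32_f16(src0: int, src1: int, opsel: int = 0) -> int:
  lo = (src0 >> 16) & 0xFFFF if opsel & 1 else src0 & 0xFFFF
  hi = (src1 >> 16) & 0xFFFF if opsel & 2 else src1 & 0xFFFF
  return (hi << 16) | lo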
|
|
|
class TestWMMA(unittest.TestCase):
|
|
"""Tests for WMMA (Wave Matrix Multiply-Accumulate) instructions."""
|
|
|
|
def test_v_wmma_f32_16x16x16_f16_basic(self):
|
|
"""V_WMMA_F32_16X16X16_F16 basic test - verify emulator matches hardware."""
|
|
# WMMA does D = A @ B + C where A,B are 16x16 f16, C,D are 16x16 f32
|
|
# Use: A=v[16:23], B=v[24:31], C=D=v[0:7] (output in captured range v[0:15])
|
|
instructions = []
|
|
|
|
# f16 1.0 = 0x3c00, packed pair = 0x3c003c00
|
|
instructions.append(s_mov_b32(s[0], 0x3c003c00))
|
|
|
|
# Set A (v16-v23) and B (v24-v31) to all 1.0s
|
|
for i in range(16, 32):
|
|
instructions.append(v_mov_b32_e32(v[i], s[0]))
|
|
|
|
# Set C (v0-v7) to all 0s (will also be output D)
|
|
for i in range(8):
|
|
instructions.append(v_mov_b32_e32(v[i], 0))
|
|
|
|
# Execute WMMA: v[0:7] = A @ B + C
|
|
instructions.append(v_wmma_f32_16x16x16_f16(v[0], v[16], v[24], v[0]))
|
|
|
|
# Just run and compare - USE_HW=1 will verify emulator matches hardware
|
|
st = run_program(instructions, n_lanes=32)
|
|
|
|
# Verify at least some output is non-zero (actual values depend on WMMA layout)
|
|
# Output should be 16.0 (16 x 1.0 x 1.0) for each element
|
|
any_nonzero = any(st.vgpr[lane][0] != 0 for lane in range(32))
|
|
self.assertTrue(any_nonzero, "WMMA should produce non-zero output")
|
|
|
|
def test_v_wmma_f32_16x16x16_f16_all_ones(self):
|
|
"""V_WMMA_F32_16X16X16_F16 with all ones should produce 16.0 for each output element.
|
|
This verifies the matrix multiply is computing the correct sum.
|
|
"""
|
|
instructions = []
|
|
|
|
# f16 1.0 = 0x3c00, packed pair = 0x3c003c00
|
|
instructions.append(s_mov_b32(s[0], 0x3c003c00))
|
|
|
|
# Set A (v16-v23) and B (v24-v31) to all 1.0s
|
|
for i in range(16, 32):
|
|
instructions.append(v_mov_b32_e32(v[i], s[0]))
|
|
|
|
# Set C (v0-v7) to all 0s (will also be output D)
|
|
for i in range(8):
|
|
instructions.append(v_mov_b32_e32(v[i], 0))
|
|
|
|
# Execute WMMA: v[0:7] = A @ B + C
|
|
instructions.append(v_wmma_f32_16x16x16_f16(v[0], v[16], v[24], v[0]))
|
|
|
|
st = run_program(instructions, n_lanes=32)
|
|
|
|
# All output elements should be 16.0 (sum of 16 * 1.0 * 1.0)
|
|
expected = f2i(16.0)
|
|
for lane in range(32):
|
|
for reg in range(8):
|
|
result = st.vgpr[lane][reg]
|
|
self.assertEqual(result, expected, f"v[{reg}] lane {lane}: expected 0x{expected:08x} (16.0), got 0x{result:08x} ({i2f(result)})")
|
|
|
|
def test_v_wmma_f32_16x16x16_f16_with_accumulator(self):
|
|
"""V_WMMA_F32_16X16X16_F16 with non-zero accumulator.
|
|
Verifies that C matrix is properly added to the product.
|
|
"""
|
|
instructions = []
|
|
|
|
# f16 1.0 = 0x3c00, packed pair = 0x3c003c00
|
|
instructions.append(s_mov_b32(s[0], 0x3c003c00))
|
|
# f32 5.0 = 0x40a00000
|
|
instructions.append(s_mov_b32(s[1], f2i(5.0)))
|
|
|
|
# Set A (v16-v23) and B (v24-v31) to all 1.0s
|
|
for i in range(16, 32):
|
|
instructions.append(v_mov_b32_e32(v[i], s[0]))
|
|
|
|
# Set C (v0-v7) to all 5.0s
|
|
for i in range(8):
|
|
instructions.append(v_mov_b32_e32(v[i], s[1]))
|
|
|
|
# Execute WMMA: v[0:7] = A @ B + C = 16.0 + 5.0 = 21.0
|
|
instructions.append(v_wmma_f32_16x16x16_f16(v[0], v[16], v[24], v[0]))
|
|
|
|
st = run_program(instructions, n_lanes=32)
|
|
|
|
# All output elements should be 21.0 (16.0 + 5.0)
|
|
expected = f2i(21.0)
|
|
for lane in range(32):
|
|
for reg in range(8):
|
|
result = st.vgpr[lane][reg]
|
|
self.assertEqual(result, expected, f"v[{reg}] lane {lane}: expected 0x{expected:08x} (21.0), got 0x{result:08x} ({i2f(result)})")
|
|
|
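# Illustrative only: the arithmetic the WMMA tests above verify, written as a plain Python
# matrix multiply-accumulate over unpacked 16x16 lists. _ref_wmma_16x16x16 is hypothetical and
# ignores the per-lane register packing of the real instruction; with all-ones A and B every
# output element is 16.0 + C[i][j], which is what the assertions check.
def _ref_wmma_16x16x16(A, B, C):
  return [[sum(A[i][k] * B[k][j] for k in range(16)) + C[i][j] for j in range(16)] for i in range(16)]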
|
|
|
class TestVOP3P(unittest.TestCase):
|
|
"""Tests for VOP3P packed 16-bit operations."""
|
|
|
|
def test_v_pk_add_f16_basic(self):
|
|
"""V_PK_ADD_F16 adds two packed f16 values."""
|
|
from extra.assembly.amd.pcode import _f16
|
|
# v0 = packed (1.0, 2.0), v1 = packed (3.0, 4.0)
|
|
# Result should be packed (4.0, 6.0)
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x40003c00), # packed f16: hi=2.0, lo=1.0
|
|
s_mov_b32(s[1], 0x44004200), # packed f16: hi=4.0, lo=3.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_pk_add_f16(v[2], v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2]
|
|
# Expected: lo=1.0+3.0=4.0 (0x4400), hi=2.0+4.0=6.0 (0x4600) -> 0x46004400
|
|
lo = _f16(result & 0xffff)
|
|
hi = _f16((result >> 16) & 0xffff)
|
|
self.assertAlmostEqual(lo, 4.0, places=2, msg=f"lo: expected 4.0, got {lo}")
|
|
self.assertAlmostEqual(hi, 6.0, places=2, msg=f"hi: expected 6.0, got {hi}")
|
|
|
|
def test_v_pk_add_f16_with_inline_constant(self):
|
|
"""V_PK_ADD_F16 with inline constant POS_ONE (1.0).
|
|
Inline constants for VOP3P are f16 values in the low 16 bits only.
|
|
The opsel_hi bits (default=0b11) select lo half for hi result, so both halves use the constant.
|
|
"""
|
|
from extra.assembly.amd.pcode import _f16
|
|
# v0 = packed (1.0, 1.0), add POS_ONE
|
|
# With default opsel_hi=0b11: both lo and hi results use lo half of src1 (the constant)
|
|
# But opsel_hi=1 means src1 hi comes from lo half - wait, let me check the actual encoding
|
|
# Default opsel_hi=3 means: bit0=1 (src0 hi from hi), bit1=1 (src1 hi from hi)
|
|
# Since inline constant has 0 in hi half, hi result = v0.hi + 0 = 1.0
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x3c003c00), # packed f16: hi=1.0, lo=1.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_pk_add_f16(v[1], v[0], SrcEnum.POS_ONE), # Add inline constant 1.0
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1]
|
|
lo = _f16(result & 0xffff)
|
|
hi = _f16((result >> 16) & 0xffff)
|
|
# lo = 1.0 + 1.0 = 2.0, hi = 1.0 + 0.0 = 1.0 (inline const hi half is 0)
|
|
self.assertAlmostEqual(lo, 2.0, places=2, msg=f"lo: expected 2.0, got {lo} (result=0x{result:08x})")
|
|
self.assertAlmostEqual(hi, 1.0, places=2, msg=f"hi: expected 1.0, got {hi} (result=0x{result:08x})")
|
|
|
|
def test_v_pk_mul_f16_basic(self):
|
|
"""V_PK_MUL_F16 multiplies two packed f16 values."""
|
|
from extra.assembly.amd.pcode import _f16
|
|
# v0 = packed (2.0, 3.0), v1 = packed (4.0, 5.0)
|
|
# Result should be packed (8.0, 15.0)
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x42004000), # packed f16: hi=3.0, lo=2.0
|
|
s_mov_b32(s[1], 0x45004400), # packed f16: hi=5.0, lo=4.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_pk_mul_f16(v[2], v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2]
|
|
lo = _f16(result & 0xffff)
|
|
hi = _f16((result >> 16) & 0xffff)
|
|
self.assertAlmostEqual(lo, 8.0, places=1, msg=f"lo: expected 8.0, got {lo}")
|
|
self.assertAlmostEqual(hi, 15.0, places=1, msg=f"hi: expected 15.0, got {hi}")
|
|
|
|
def test_v_pk_mul_f16_with_inline_constant(self):
|
|
"""V_PK_MUL_F16 with inline constant POS_TWO (2.0).
|
|
Inline constant has value only in low 16 bits, hi is 0.
|
|
"""
|
|
from extra.assembly.amd.pcode import _f16
|
|
# v0 = packed (3.0, 4.0), multiply by POS_TWO
|
|
# lo = 3.0 * 2.0 = 6.0, hi = 4.0 * 0.0 = 0.0 (inline const hi is 0)
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x44004200), # packed f16: hi=4.0, lo=3.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_pk_mul_f16(v[1], v[0], SrcEnum.POS_TWO),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1]
|
|
lo = _f16(result & 0xffff)
|
|
hi = _f16((result >> 16) & 0xffff)
|
|
self.assertAlmostEqual(lo, 6.0, places=1, msg=f"lo: expected 6.0, got {lo}")
|
|
self.assertAlmostEqual(hi, 0.0, places=1, msg=f"hi: expected 0.0, got {hi}")
|
|
|
|
def test_v_pk_fma_f16_basic(self):
|
|
"""V_PK_FMA_F16: D = A * B + C for packed f16."""
|
|
from extra.assembly.amd.pcode import _f16
|
|
# A = packed (2.0, 3.0), B = packed (4.0, 5.0), C = packed (1.0, 1.0)
|
|
# Result should be packed (2*4+1=9.0, 3*5+1=16.0)
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x42004000), # A: hi=3.0, lo=2.0
|
|
s_mov_b32(s[1], 0x45004400), # B: hi=5.0, lo=4.0
|
|
s_mov_b32(s[2], 0x3c003c00), # C: hi=1.0, lo=1.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
v_pk_fma_f16(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][3]
|
|
lo = _f16(result & 0xffff)
|
|
hi = _f16((result >> 16) & 0xffff)
|
|
self.assertAlmostEqual(lo, 9.0, places=1, msg=f"lo: expected 9.0, got {lo}")
|
|
self.assertAlmostEqual(hi, 16.0, places=0, msg=f"hi: expected 16.0, got {hi}")
|
|
|
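# Illustrative only: a sketch of the packed-f16 add verified above, using struct's half-precision
# codec. _ref_pk_add_f16 is hypothetical: it models only the default opsel behaviour (lo+lo,
# hi+hi) and will raise OverflowError instead of producing f16 inf on overflow.
def _ref_pk_add_f16(a: int, b: int) -> int:
  def _h(bits): return struct.unpack('<e', struct.pack('<H', bits))[0]
  def _b(val): return struct.unpack('<H', struct.pack('<e', val))[0]
  lo = _b(_h(a & 0xFFFF) + _h(b & 0xFFFF))
  hi = _b(_h((a >> 16) & 0xFFFF) + _h((b >> 16) & 0xFFFF))
  return (hi << 16) | lo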
|
|
|
class TestVFmaMix(unittest.TestCase):
|
|
"""Tests for V_FMA_MIX_F32/F16 mixed-precision FMA instructions.
|
|
|
|
These instructions are critical for OCML sin/cos implementations.
|
|
opsel_hi[i] controls whether source i is read as f32 (0) or as a 16-bit f16 value (1)
|
|
opsel[i] selects which half (lo=0, hi=1) when source is f16
|
|
"""
|
|
|
|
def test_v_fma_mix_f32_all_f32(self):
|
|
"""V_FMA_MIX_F32 with all f32 sources."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(2.0)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], f2i(3.0)),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[2], f2i(1.0)),
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
# opsel_hi=0, opsel_hi2=0 means all sources are f32
|
|
VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][3])
|
|
self.assertAlmostEqual(result, 7.0, places=5, msg=f"2*3+1=7, got {result}")
|
|
|
|
def test_v_fma_mix_f32_src2_f16_lo(self):
|
|
"""V_FMA_MIX_F32 with src2 as f16 from lo bits."""
|
|
from extra.assembly.amd.pcode import f32_to_f16
|
|
f16_2 = f32_to_f16(2.0) # 0x4000
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(1.0)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], f2i(3.0)),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[2], f16_2), # f16 2.0 in lo bits, 0 in hi bits
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
# opsel_hi2=1 means src2 is f16, opsel[2]=0 means use lo half
|
|
VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=1),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][3])
|
|
self.assertAlmostEqual(result, 5.0, places=5, msg=f"1*3+2=5, got {result}")
|
|
|
|
def test_v_fma_mix_f32_src2_f16_hi(self):
|
|
"""V_FMA_MIX_F32 with src2 as f16 from hi bits."""
|
|
from extra.assembly.amd.pcode import f32_to_f16
|
|
f16_2 = f32_to_f16(2.0) # 0x4000
|
|
val = (f16_2 << 16) | 0 # hi = f16 2.0, lo = 0
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(1.0)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], f2i(3.0)),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[2], val),
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
# opsel_hi2=1 means src2 is f16, opsel[2]=1 (bit 2 set, opsel=4) means use hi half
|
|
VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=4, opsel_hi=0, opsel_hi2=1),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][3])
|
|
self.assertAlmostEqual(result, 5.0, places=5, msg=f"1*3+2=5, got {result}")
|
|
|
|
def test_v_fma_mix_f32_with_abs(self):
|
|
"""V_FMA_MIX_F32 with abs modifier on src2."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(2.0)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], f2i(3.0)),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[2], f2i(-1.0)), # -1.0
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
# neg_hi field is used for abs in V_FMA_MIX, abs bit 2 (0b100) for |src2|
|
|
VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0, neg_hi=4),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][3])
|
|
self.assertAlmostEqual(result, 7.0, places=5, msg=f"2*3+|-1|=7, got {result}")
|
|
|
|
def test_v_fma_mixlo_f16(self):
|
|
"""V_FMA_MIXLO_F16 writes to low 16 bits of destination."""
|
|
from extra.assembly.amd.pcode import _f16
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(2.0)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], f2i(3.0)),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[2], f2i(1.0)),
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
s_mov_b32(s[3], 0xdead0000), # garbage in hi bits
|
|
v_mov_b32_e32(v[3], s[3]),
|
|
VOP3P(VOP3POp.V_FMA_MIXLO_F16, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
lo = _f16(st.vgpr[0][3] & 0xffff)
|
|
hi = (st.vgpr[0][3] >> 16) & 0xffff
|
|
self.assertAlmostEqual(lo, 7.0, places=1, msg=f"lo: 2*3+1=7, got {lo}")
|
|
self.assertEqual(hi, 0xdead, f"hi should be preserved, got 0x{hi:04x}")
|
|
|
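# Illustrative only: the per-source selection rule the V_FMA_MIX tests above depend on, as a
# hypothetical helper (_ref_fma_mix_src is not part of the emulator). When the source's opsel_hi
# bit is 0 the full 32-bit value is read as f32; when it is 1, a 16-bit half is read as f16,
# with the matching opsel bit choosing hi (1) or lo (0).
def _ref_fma_mix_src(reg: int, opsel_hi_bit: int, opsel_bit: int) -> float:
  if not opsel_hi_bit: return struct.unpack('<f', struct.pack('<I', reg & 0xFFFFFFFF))[0]
  half = (reg >> 16) & 0xFFFF if opsel_bit else reg & 0xFFFF
  return struct.unpack('<e', struct.pack('<H', half))[0]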
|
|
|
class TestF64Conversions(unittest.TestCase):
|
|
"""Tests for 64-bit float operations and conversions."""
|
|
|
|
def test_v_add_f64_inline_constant(self):
|
|
"""V_ADD_F64 with inline constant POS_ONE (1.0) as f64."""
|
|
one_f64 = f2i64(1.0)
|
|
instructions = [
|
|
s_mov_b32(s[0], one_f64 & 0xffffffff),
|
|
s_mov_b32(s[1], one_f64 >> 32),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_add_f64(v[2:4], v[0:2], SrcEnum.POS_ONE), # 1.0 + 1.0 = 2.0
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i642f(st.vgpr[0][2] | (st.vgpr[0][3] << 32))
|
|
self.assertAlmostEqual(result, 2.0, places=5)
|
|
|
|
def test_v_ldexp_f64_negative_exponent(self):
|
|
"""V_LDEXP_F64 with negative exponent (-32)."""
|
|
val = -8.0
|
|
val_bits = f2i64(val)
|
|
expected = -8.0 * (2.0 ** -32) # -1.862645149230957e-09
|
|
instructions = [
|
|
s_mov_b32(s[0], val_bits & 0xffffffff),
|
|
s_mov_b32(s[1], val_bits >> 32),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_ldexp_f64(v[2:4], v[0:2], 0xffffffe0), # -32
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i642f(st.vgpr[0][2] | (st.vgpr[0][3] << 32))
|
|
self.assertAlmostEqual(result, expected, places=15)
|
|
|
|
def test_f64_to_i64_conversion_sequence(self):
|
|
"""Test the f64->i64 conversion sequence used by the compiler.
|
|
|
|
The compiler generates:
|
|
v_trunc_f64 -> v_ldexp_f64 (by -32) -> v_floor_f64 -> v_fma_f64 (by -2^32)
|
|
-> v_cvt_u32_f64 (low bits) -> v_cvt_i32_f64 (high bits)
|
|
|
|
The FMA computes: trunc + (-2^32) * floor = trunc - floor * 2^32
|
|
which gives the low 32 bits as a positive float (for proper u32 conversion).
|
|
"""
|
|
val = -8.0
|
|
val_bits = f2i64(val)
|
|
lit = -4294967296.0 # -2^32 (note: NEGATIVE, so FMA does trunc - floor * 2^32)
|
|
lit_bits = f2i64(lit)
|
|
|
|
instructions = [
|
|
s_mov_b32(s[0], val_bits & 0xffffffff),
|
|
s_mov_b32(s[1], val_bits >> 32),
|
|
v_trunc_f64_e32(v[0:2], s[0:2]),
|
|
v_ldexp_f64(v[2:4], v[0:2], 0xffffffe0), # -32
|
|
v_floor_f64_e32(v[2:4], v[2:4]),
|
|
s_mov_b32(s[2], lit_bits & 0xffffffff),
|
|
s_mov_b32(s[3], lit_bits >> 32),
|
|
v_fma_f64(v[0:2], s[2:4], v[2:4], v[0:2]),
|
|
v_cvt_u32_f64_e32(v[4], v[0:2]),
|
|
v_cvt_i32_f64_e32(v[5], v[2:4]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# v4 = low 32 bits, v5 = high 32 bits (sign extended)
|
|
lo = st.vgpr[0][4]
|
|
hi = st.vgpr[0][5]
|
|
# For -8: lo should be 0xfffffff8, hi should be 0xffffffff
|
|
result = struct.unpack('<q', struct.pack('<II', lo, hi))[0]
|
|
self.assertEqual(result, -8, f"Expected -8, got {result} (lo=0x{lo:08x}, hi=0x{hi:08x})")
|
|
|
|
def test_v_cvt_i32_f64_writes_32bit_only(self):
|
|
"""V_CVT_I32_F64 should only write 32 bits, not 64.
|
|
|
|
Regression test: V_CVT_I32_F64 has a 64-bit source (f64) but 32-bit destination (i32).
|
|
The emulator was incorrectly writing 64 bits (clobbering vdst+1) because
|
|
is_64bit_op was True for any op ending in '_F64'.
|
|
"""
|
|
# Pre-fill v3 with a canary value that should NOT be clobbered
|
|
val_bits = f2i64(-1.0)
|
|
instructions = [
|
|
s_mov_b32(s[0], val_bits & 0xffffffff),
|
|
s_mov_b32(s[1], val_bits >> 32),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[2], 0xDEADBEEF), # Canary value
|
|
v_mov_b32_e32(v[3], s[2]), # Put canary in v3
|
|
v_cvt_i32_f64_e32(v[2], v[0:2]), # Convert -1.0 -> -1 (0xffffffff)
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2]
|
|
canary = st.vgpr[0][3]
|
|
# V_CVT_I32_F64 of -1.0 should produce 0xffffffff (-1)
|
|
self.assertEqual(result, 0xffffffff, f"Expected 0xffffffff (-1), got 0x{result:08x}")
|
|
# v3 should still contain the canary (not clobbered by 64-bit write)
|
|
self.assertEqual(canary, 0xDEADBEEF, f"v3 canary should be 0xDEADBEEF, got 0x{canary:08x} (clobbered!)")
|
|
|
|
def test_v_frexp_mant_f64_range(self):
|
|
"""V_FREXP_MANT_F64 should return mantissa in [0.5, 1.0) range.
|
|
|
|
Regression test: The mantissa() helper was incorrectly multiplying by 2.0,
|
|
returning values in [1.0, 2.0) instead of the correct [0.5, 1.0) range.
|
|
"""
|
|
# Test with 2.0: frexp(2.0) should give mantissa=0.5, exponent=2
|
|
two_f64 = f2i64(2.0)
|
|
instructions = [
|
|
s_mov_b32(s[0], two_f64 & 0xffffffff),
|
|
s_mov_b32(s[1], two_f64 >> 32),
|
|
v_frexp_mant_f64_e32(v[0:2], s[0:2]),
|
|
v_frexp_exp_i32_f64_e32(v[2], s[0:2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
mant = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
|
|
exp = st.vgpr[0][2]
|
|
if exp >= 0x80000000: exp -= 0x100000000 # sign extend
|
|
# frexp(2.0) = 0.5 * 2^2
|
|
self.assertAlmostEqual(mant, 0.5, places=10, msg=f"Expected mantissa 0.5, got {mant}")
|
|
self.assertEqual(exp, 2, f"Expected exponent 2, got {exp}")
|
|
|
|
def test_v_div_scale_f64_reads_64bit_sources(self):
|
|
"""V_DIV_SCALE_F64 must read all sources as 64-bit values.
|
|
|
|
Regression test: VOP3SD was reading sources as 32-bit for V_DIV_SCALE_F64,
|
|
causing incorrect results when the low 32 bits happened to look like 0 or denorm.
|
|
"""
|
|
# Set up v0:v1 = sqrt(2) ≈ 1.414, v2:v3 = 1.0
|
|
sqrt2_f64 = f2i64(1.4142135623730951)
|
|
one_f64 = f2i64(1.0)
|
|
instructions = [
|
|
s_mov_b32(s[0], sqrt2_f64 & 0xffffffff),
|
|
s_mov_b32(s[1], sqrt2_f64 >> 32),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[2], one_f64 & 0xffffffff),
|
|
s_mov_b32(s[3], one_f64 >> 32),
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
v_mov_b32_e32(v[3], s[3]),
|
|
# V_DIV_SCALE_F64: src0=v0:v1, src1=v0:v1, src2=v2:v3
|
|
# For normal inputs, should pass through src0 unchanged
|
|
VOP3SD(VOP3SDOp.V_DIV_SCALE_F64, vdst=v[4], sdst=s[10], src0=v[0], src1=v[0], src2=v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32))
|
|
# For normal (non-denorm, non-edge-case) inputs, V_DIV_SCALE_F64 passes through src0
|
|
self.assertAlmostEqual(result, 1.4142135623730951, places=10,
|
|
msg=f"Expected ~1.414, got {result} (may be nan if 64-bit sources not read correctly)")
|
|
|
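# Illustrative only: the f64 -> i64 split that test_f64_to_i64_conversion_sequence above walks
# through, done with plain Python floats. _ref_f64_to_i64_parts is hypothetical; it mirrors the
# trunc / ldexp(-32) / floor / fma(-2^32) / cvt steps and returns (lo_u32, hi_i32).
def _ref_f64_to_i64_parts(x: float) -> tuple[int, int]:
  import math
  t = float(math.trunc(x))              # v_trunc_f64
  hi = math.floor(math.ldexp(t, -32))   # v_ldexp_f64 by -32, then v_floor_f64
  lo = t + (-4294967296.0) * hi         # v_fma_f64: t - hi * 2^32, lands in [0, 2^32)
  return int(lo) & 0xFFFFFFFF, int(hi)  # v_cvt_u32_f64 and v_cvt_i32_f64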
|
|
|
class TestNewPcodeHelpers(unittest.TestCase):
|
|
"""Tests for newly added pcode helper functions (SAD, BYTE_PERMUTE, BF16)."""
|
|
|
|
def test_v_sad_u8_basic(self):
|
|
"""V_SAD_U8: Sum of absolute differences of 4 bytes."""
|
|
# s0 = 0x05040302, s1 = 0x04030201, s2 = 10 -> diff = 1+1+1+1 = 4, result = 14
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x05040302),
|
|
s_mov_b32(s[1], 0x04030201),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mov_b32_e32(v[2], 10),
|
|
v_sad_u8(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][3]
|
|
self.assertEqual(result, 14, f"Expected 14, got {result}")
|
|
|
|
def test_v_sad_u8_identical_bytes(self):
|
|
"""V_SAD_U8: When both operands are identical, SAD = 0 + accumulator."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xDEADBEEF),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[0]), # Same as v0
|
|
v_mov_b32_e32(v[2], 42), # Accumulator
|
|
v_sad_u8(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][3]
|
|
self.assertEqual(result, 42, f"Expected 42, got {result}")
|
|
|
|
def test_v_sad_u16_basic(self):
|
|
"""V_SAD_U16: Sum of absolute differences of 2 half-words."""
|
|
# s0 = 0x00020003, s1 = 0x00010001 -> diff = |2-1| + |3-1| = 1 + 2 = 3
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00020003),
|
|
s_mov_b32(s[1], 0x00010001),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mov_b32_e32(v[2], 0),
|
|
v_sad_u16(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][3]
|
|
self.assertEqual(result, 3, f"Expected 3, got {result}")
|
|
|
|
def test_v_sad_u32_basic(self):
|
|
"""V_SAD_U32: Absolute difference of 32-bit values."""
|
|
# s0 = 100, s1 = 30 -> diff = 70, s2 = 5 -> result = 75
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 100),
|
|
v_mov_b32_e32(v[1], 30),
|
|
v_mov_b32_e32(v[2], 5),
|
|
v_sad_u32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][3]
|
|
self.assertEqual(result, 75, f"Expected 75, got {result}")
|
|
|
|
def test_v_msad_u8_masked(self):
|
|
"""V_MSAD_U8: Skip bytes where reference (s1) is 0."""
|
|
# s0 = 0x10101010, s1 = 0x00010001, s2 = 0
|
|
# Only bytes 0 and 2 of s1 are non-zero, so only those contribute
|
|
# diff = |0x10-0x01| + |0x10-0x01| = 15 + 15 = 30
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x10101010),
|
|
s_mov_b32(s[1], 0x00010001),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mov_b32_e32(v[2], 0),
|
|
v_msad_u8(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][3]
|
|
self.assertEqual(result, 30, f"Expected 30, got {result}")
|
|
|
|
def test_v_perm_b32_select_bytes(self):
|
|
"""V_PERM_B32: Select bytes from combined {s0, s1}."""
|
|
# Combined = {S0, S1} where S1 is bytes 0-3, S0 is bytes 4-7
|
|
# s0 = 0x03020100 -> bytes 4-7 of combined
|
|
# s1 = 0x07060504 -> bytes 0-3 of combined
|
|
# Combined = 0x03020100_07060504
|
|
# selector = 0x00010203 -> select bytes 3,2,1,0 from combined = 0x04,0x05,0x06,0x07
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x03020100),
|
|
s_mov_b32(s[1], 0x07060504),
|
|
s_mov_b32(s[2], 0x00010203),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
v_perm_b32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][3]
|
|
self.assertEqual(result, 0x04050607, f"Expected 0x04050607, got 0x{result:08x}")
|
|
|
|
def test_v_perm_b32_select_high_bytes(self):
|
|
"""V_PERM_B32: Select bytes from high word (s0)."""
|
|
# Combined = {S0, S1} where S1 is bytes 0-3, S0 is bytes 4-7
|
|
# s0 = 0x03020100 -> bytes 4-7 of combined
|
|
# s1 = 0x07060504 -> bytes 0-3 of combined
|
|
# selector = 0x04050607 -> select bytes 7,6,5,4 from combined = 0x00,0x01,0x02,0x03
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x03020100),
|
|
s_mov_b32(s[1], 0x07060504),
|
|
s_mov_b32(s[2], 0x04050607),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
v_perm_b32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][3]
|
|
self.assertEqual(result, 0x00010203, f"Expected 0x00010203, got 0x{result:08x}")
|
|
|
|
def test_v_perm_b32_constant_values(self):
|
|
"""V_PERM_B32: Test constant 0x00 (sel=12) and 0xFF (sel>=13)."""
|
|
# selector = 0x0C0D0E0F -> bytes: 12=0x00, 13=0xFF, 14=0xFF, 15=0xFF
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x12345678),
|
|
s_mov_b32(s[1], 0xABCDEF01),
|
|
s_mov_b32(s[2], 0x0C0D0E0F),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
v_perm_b32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][3]
|
|
# byte 0: sel=0x0F >= 13 -> 0xFF
|
|
# byte 1: sel=0x0E >= 13 -> 0xFF
|
|
# byte 2: sel=0x0D >= 13 -> 0xFF
|
|
# byte 3: sel=0x0C = 12 -> 0x00
|
|
self.assertEqual(result, 0x00FFFFFF, f"Expected 0x00FFFFFF, got 0x{result:08x}")
|
|
|
|
def test_v_perm_b32_sign_extend(self):
|
|
"""V_PERM_B32: Test sign extension selectors 8-11."""
|
|
# Combined = {S0, S1} where S1 is bytes 0-3, S0 is bytes 4-7
|
|
# s0 = 0x00008000 -> byte 5 (0x80) has sign bit set
|
|
# s1 = 0x80000080 -> bytes 0 (0x80) and 3 (0x80) have their sign bits set; byte 1 (0x00) does not
|
|
# Combined = 0x00008000_80000080
|
|
# selector = 0x08090A0B -> sign of bytes 1,3,5,7
|
|
# byte 0: sel=0x0B -> sign of byte 7 (0x00) -> 0x00
|
|
# byte 1: sel=0x0A -> sign of byte 5 (0x80) -> 0xFF
|
|
# byte 2: sel=0x09 -> sign of byte 3 (0x80) -> 0xFF
|
|
# byte 3: sel=0x08 -> sign of byte 1 (0x00) -> 0x00
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00008000),
|
|
s_mov_b32(s[1], 0x80000080),
|
|
s_mov_b32(s[2], 0x08090A0B),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
v_perm_b32(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][3]
|
|
self.assertEqual(result, 0x00FFFF00, f"Expected 0x00FFFF00, got 0x{result:08x}")
|
|
|
|
def test_v_dot2_f32_bf16_basic(self):
|
|
"""V_DOT2_F32_BF16: Dot product of two bf16 pairs accumulated into f32."""
|
|
from extra.assembly.amd.pcode import _ibf16
|
|
# A = packed (2.0, 3.0) as bf16, B = packed (4.0, 5.0) as bf16
|
|
# Result = 2*4 + 3*5 + acc = 8 + 15 + 0 = 23.0
|
|
a_lo, a_hi = _ibf16(2.0), _ibf16(3.0)
|
|
b_lo, b_hi = _ibf16(4.0), _ibf16(5.0)
|
|
a_packed = (a_hi << 16) | a_lo
|
|
b_packed = (b_hi << 16) | b_lo
|
|
instructions = [
|
|
s_mov_b32(s[0], a_packed),
|
|
s_mov_b32(s[1], b_packed),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mov_b32_e32(v[2], 0), # accumulator = 0
|
|
v_dot2_f32_bf16(v[3], v[0], v[1], v[2]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][3])
|
|
self.assertAlmostEqual(result, 23.0, places=1, msg=f"Expected 23.0, got {result}")
|
|
|
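# Illustrative only: a pure-Python model of the V_PERM_B32 selector rules exercised above.
# _ref_perm_b32 is hypothetical: bytes 0-3 of the combined value come from S1 and bytes 4-7 from
# S0; selectors 0-7 pick a byte, 8-11 broadcast the sign bit of bytes 1/3/5/7, 12 yields 0x00,
# and anything >= 13 yields 0xFF.
def _ref_perm_b32(s0: int, s1: int, sel: int) -> int:
  combined = ((s0 & 0xFFFFFFFF) << 32) | (s1 & 0xFFFFFFFF)
  out = 0
  for i in range(4):
    k = (sel >> (8 * i)) & 0xFF
    if k <= 7: byte = (combined >> (8 * k)) & 0xFF
    elif k <= 11: byte = 0xFF if (combined >> (8 * (2 * (k - 8) + 1) + 7)) & 1 else 0x00
    elif k == 12: byte = 0x00
    else: byte = 0xFF
    out |= byte << (8 * i)
  return out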
|
|
|
class TestQuadmaskWqm(unittest.TestCase):
|
|
"""Tests for S_QUADMASK and S_WQM instructions."""
|
|
|
|
def test_s_quadmask_b32_all_quads_active(self):
|
|
"""S_QUADMASK_B32: All quads have at least one active lane."""
|
|
# Input: 0xFFFFFFFF (all bits set) -> all 8 quads active -> result = 0xFF
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xFFFFFFFF),
|
|
s_quadmask_b32(s[1], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.sgpr[1]
|
|
self.assertEqual(result, 0xFF, f"Expected 0xFF, got 0x{result:x}")
|
|
self.assertEqual(st.scc, 1, "SCC should be 1 (result != 0)")
|
|
|
|
def test_s_quadmask_b32_alternating_quads(self):
|
|
"""S_QUADMASK_B32: Every other quad has lanes active."""
|
|
# Input: 0x0F0F0F0F -> quads 0,2,4,6 active (bits 0-3, 8-11, 16-19, 24-27)
|
|
# Result: bits 0,2,4,6 set = 0x55
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x0F0F0F0F),
|
|
s_quadmask_b32(s[1], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.sgpr[1]
|
|
self.assertEqual(result, 0x55, f"Expected 0x55, got 0x{result:x}")
|
|
|
|
def test_s_quadmask_b32_no_quads_active(self):
|
|
"""S_QUADMASK_B32: No quads have active lanes."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0),
|
|
s_quadmask_b32(s[1], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.sgpr[1]
|
|
self.assertEqual(result, 0, f"Expected 0, got 0x{result:x}")
|
|
self.assertEqual(st.scc, 0, "SCC should be 0 (result == 0)")
|
|
|
|
def test_s_quadmask_b32_single_lane_per_quad(self):
|
|
"""S_QUADMASK_B32: Single lane active in each quad."""
|
|
# Input: 0x11111111 -> bit 0 of each nibble set -> all 8 quads active
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x11111111),
|
|
s_quadmask_b32(s[1], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.sgpr[1]
|
|
self.assertEqual(result, 0xFF, f"Expected 0xFF, got 0x{result:x}")
|
|
|
|
def test_s_wqm_b32_all_active(self):
|
|
"""S_WQM_B32: Whole quad mode - if any lane in quad is active, activate all."""
|
|
# Input: 0x11111111 -> one lane per quad -> output all quads fully active = 0xFFFFFFFF
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x11111111),
|
|
s_wqm_b32(s[1], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.sgpr[1]
|
|
self.assertEqual(result, 0xFFFFFFFF, f"Expected 0xFFFFFFFF, got 0x{result:x}")
|
|
self.assertEqual(st.scc, 1, "SCC should be 1 (result != 0)")
|
|
|
|
def test_s_wqm_b32_alternating_quads(self):
|
|
"""S_WQM_B32: Only some quads have active lanes."""
|
|
# Input: 0x00000001 -> only quad 0 has an active lane -> output = 0x0000000F (quad 0 fully active)
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00000001), # single lane in quad 0
|
|
s_wqm_b32(s[1], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.sgpr[1]
|
|
self.assertEqual(result, 0x0000000F, f"Expected 0x0000000F, got 0x{result:x}")
|
|
|
|
def test_s_wqm_b32_zero(self):
|
|
"""S_WQM_B32: No lanes active."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0),
|
|
s_wqm_b32(s[1], s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.sgpr[1]
|
|
self.assertEqual(result, 0, f"Expected 0, got 0x{result:x}")
|
|
self.assertEqual(st.scc, 0, "SCC should be 0 (result == 0)")
|
|
|
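# Illustrative only: pure-Python sketches of the quad reductions tested above; both helpers are
# hypothetical, not emulator code. _ref_quadmask_b32 sets one output bit per 4-lane quad that has
# any input bit set, and _ref_wqm_b32 expands every partially-active quad to a fully-active one.
def _ref_quadmask_b32(mask: int) -> int:
  return sum(1 << q for q in range(8) if (mask >> (4 * q)) & 0xF)
def _ref_wqm_b32(mask: int) -> int:
  return sum(0xF << (4 * q) for q in range(8) if (mask >> (4 * q)) & 0xF)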
|
|
|
class TestVOP2_16bit_HiHalf(unittest.TestCase):
|
|
"""Regression tests for VOP2 16-bit ops reading from high half of VGPR (v128+ encoding).
|
|
|
|
Bug: VOP2 16-bit ops like v_add_f16 with src0 as v128+ should read the HIGH 16 bits
|
|
of the corresponding VGPR (v128 = v0.hi, v129 = v1.hi, etc). The emulator was
|
|
incorrectly reading from VGPR v128+ instead of the high half of v0+.
|
|
|
|
Example: v_add_f16 v0, v128, v0 means v0.lo = v0.hi + v0.lo (fold packed result)
|
|
"""
|
|
|
|
def test_v_add_f16_src0_hi_fold(self):
|
|
"""v_add_f16 with src0=v128 (v0.hi) - fold packed f16 values.
|
|
|
|
This pattern is generated by LLVM for summing packed f16 results:
|
|
v_pk_mul_f16 produces [hi, lo] in v0, then v_add_f16 v0, v128, v0 sums them.
|
|
"""
|
|
instructions = [
|
|
# v0 = packed f16: high=2.0 (0x4000), low=1.0 (0x3c00)
|
|
s_mov_b32(s[0], 0x40003c00),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
# v_add_f16 v1, v128, v0 means: v1.lo = v0.hi + v0.lo = 2.0 + 1.0 = 3.0
|
|
# v128 in src0 means "read high 16 bits of v0"
|
|
v_add_f16_e32(v[1], v[0].h, v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1] & 0xffff
|
|
self.assertEqual(result, 0x4200, f"Expected 3.0 (0x4200), got 0x{result:04x}")
|
|
|
|
def test_v_add_f16_src0_hi_different_reg(self):
|
|
"""v_add_f16 with src0=v129 (v1.hi) reads high half of v1."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x44004200), # v1: high=4.0, low=3.0
|
|
v_mov_b32_e32(v[1], s[0]),
|
|
s_mov_b32(s[1], 0x3c00), # v0: low=1.0
|
|
v_mov_b32_e32(v[0], s[1]),
|
|
# v_add_f16 v2, v129, v0 means: v2.lo = v1.hi + v0.lo = 4.0 + 1.0 = 5.0
|
|
v_add_f16_e32(v[2], v[1].h, v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2] & 0xffff
|
|
self.assertEqual(result, 0x4500, f"Expected 5.0 (0x4500), got 0x{result:04x}")
|
|
|
|
def test_v_mul_f16_src0_hi(self):
|
|
"""v_mul_f16 with src0 from high half."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x40003c00), # v0: high=2.0, low=1.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x4200), # v1: low=3.0
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
# v_mul_f16 v2, v128, v1 means: v2.lo = v0.hi * v1.lo = 2.0 * 3.0 = 6.0
|
|
v_mul_f16_e32(v[2], v[0].h, v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2] & 0xffff
|
|
self.assertEqual(result, 0x4600, f"Expected 6.0 (0x4600), got 0x{result:04x}")
|
|
|
|
def test_v_add_f16_multilane(self):
|
|
"""v_add_f16 with src0=v128 across multiple lanes."""
|
|
instructions = [
|
|
# Set up different packed values per lane using v_mov with lane-dependent values
|
|
# Lane 0: v0 = 0x40003c00 (hi=2.0, lo=1.0) -> sum = 3.0
|
|
# Lane 1: v0 = 0x44004200 (hi=4.0, lo=3.0) -> sum = 7.0
|
|
v_mov_b32_e32(v[0], 0x40003c00), # default for all lanes
|
|
# Use v_cmp to select lane 1 (v255 = lane_id from prologue)
|
|
v_cmp_eq_u32_e32(1, v[255]), # vcc = (lane == 1)
|
|
v_cndmask_b32_e64(v[0], v[0], 0x44004200, SrcEnum.VCC_LO),
|
|
# Now fold: v1.lo = v0.hi + v0.lo
|
|
v_add_f16_e32(v[1], v[0].h, v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=2)
|
|
# Lane 0: 2.0 + 1.0 = 3.0 (0x4200)
|
|
self.assertEqual(st.vgpr[0][1] & 0xffff, 0x4200, "Lane 0: expected 3.0")
|
|
# Lane 1: 4.0 + 3.0 = 7.0 (0x4700)
|
|
self.assertEqual(st.vgpr[1][1] & 0xffff, 0x4700, "Lane 1: expected 7.0")
|
|
|
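# Illustrative only: what the v128+ source encoding in the 16-bit VOP2/VOPC tests above and below
# boils down to when reading an operand. _ref_read_f16_src is a hypothetical helper: a VGPR index
# of 128 or more selects the HIGH 16 bits of vgpr[index - 128]; otherwise the low 16 bits of
# vgpr[index] are used.
def _ref_read_f16_src(vgpr: list[int], index: int) -> int:
  if index >= 128: return (vgpr[index - 128] >> 16) & 0xFFFF
  return vgpr[index] & 0xFFFF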
|
|
|
class TestVOPC_16bit_HiHalf(unittest.TestCase):
|
|
"""Regression tests for VOPC 16-bit ops reading from high half of VGPR (v128+ encoding).
|
|
|
|
Bug: VOPC 16-bit ops like v_cmp_lt_f16 with vsrc1 as v128+ should read the HIGH 16 bits
|
|
of the corresponding VGPR. The emulator was incorrectly reading from VGPR v128+.
|
|
|
|
Example: v_cmp_nge_f16 vcc, v0, v128 compares v0.lo with v0.hi
|
|
"""
|
|
|
|
def test_v_cmp_lt_f16_vsrc1_hi(self):
|
|
"""v_cmp_lt_f16 comparing low half with high half of same register."""
|
|
instructions = [
|
|
# v0: high=2.0 (0x4000), low=1.0 (0x3c00)
|
|
s_mov_b32(s[0], 0x40003c00),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
# v_cmp_lt_f16 vcc, v0, v128 means: vcc = (v0.lo < v0.hi) = (1.0 < 2.0) = true
|
|
v_cmp_lt_f16_e32(v[0], v[0].h),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "Expected vcc=1 (1.0 < 2.0)")
|
|
|
|
def test_v_cmp_gt_f16_vsrc1_hi(self):
|
|
"""v_cmp_gt_f16 with vsrc1 from high half."""
|
|
instructions = [
|
|
# v0: high=1.0 (0x3c00), low=2.0 (0x4000)
|
|
s_mov_b32(s[0], 0x3c004000),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
# v_cmp_gt_f16 vcc, v0, v128 means: vcc = (v0.lo > v0.hi) = (2.0 > 1.0) = true
|
|
v_cmp_gt_f16_e32(v[0], v[0].h),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "Expected vcc=1 (2.0 > 1.0)")
|
|
|
|
def test_v_cmp_eq_f16_vsrc1_hi_equal(self):
|
|
"""v_cmp_eq_f16 with equal low and high halves."""
|
|
instructions = [
|
|
# v0: high=3.0 (0x4200), low=3.0 (0x4200)
|
|
s_mov_b32(s[0], 0x42004200),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
# v_cmp_eq_f16 vcc, v0, v128 means: vcc = (v0.lo == v0.hi) = (3.0 == 3.0) = true
|
|
v_cmp_eq_f16_e32(v[0], v[0].h),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "Expected vcc=1 (3.0 == 3.0)")
|
|
|
|
def test_v_cmp_neq_f16_vsrc1_hi(self):
|
|
"""v_cmp_neq_f16 with different low and high halves."""
|
|
instructions = [
|
|
# v0: high=2.0 (0x4000), low=1.0 (0x3c00)
|
|
s_mov_b32(s[0], 0x40003c00),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
# v_cmp_neq_f16 vcc, v0, v128 means: vcc = (v0.lo != v0.hi) = (1.0 != 2.0) = true
|
|
v_cmp_lg_f16_e32(v[0], v[0].h),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "Expected vcc=1 (1.0 != 2.0)")
|
|
|
|
def test_v_cmp_nge_f16_inf_self(self):
|
|
"""v_cmp_nge_f16 comparing -inf with itself (unordered less than).
|
|
|
|
Regression test: -inf < -inf should be false (IEEE 754).
|
|
The bug was VOPC 16-bit not handling v128+ encoding for vsrc1.
|
|
"""
|
|
instructions = [
|
|
# v0: both halves = -inf (0xFC00)
|
|
s_mov_b32(s[0], 0xFC00FC00),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
# v_cmp_nge_f16 is "not greater or equal": true when S0 < S1 or the compare is unordered (NaN)
|
|
# -inf nge -inf should be false (since -inf >= -inf is true)
|
|
v_cmp_nge_f16_e32(v[0], v[0].h),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 0, "Expected vcc=0 (-inf >= -inf)")
|
|
|
|
def test_v_cmp_f16_multilane(self):
|
|
"""v_cmp_lt_f16 with vsrc1=v128 across multiple lanes."""
|
|
instructions = [
|
|
# Lane 0: v0 = 0x40003c00 (hi=2.0, lo=1.0) -> 1.0 < 2.0 = true
|
|
# Lane 1: v0 = 0x3c004000 (hi=1.0, lo=2.0) -> 2.0 < 1.0 = false
|
|
v_mov_b32_e32(v[0], 0x40003c00), # default
|
|
# Use v_cmp to select lane 1 (v255 = lane_id from prologue)
|
|
v_cmp_eq_u32_e32(1, v[255]), # vcc = (lane == 1)
|
|
v_cndmask_b32_e64(v[0], v[0], 0x3c004000, SrcEnum.VCC_LO),
|
|
v_cmp_lt_f16_e32(v[0], v[0].h),
|
|
]
|
|
st = run_program(instructions, n_lanes=2)
|
|
self.assertEqual(st.vcc & 1, 1, "Lane 0: expected vcc=1 (1.0 < 2.0)")
|
|
self.assertEqual((st.vcc >> 1) & 1, 0, "Lane 1: expected vcc=0 (2.0 < 1.0)")
|
|
|
|
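# The f16 hex constants used throughout these tests (0x3c00 = 1.0, 0x4000 = 2.0, 0x4200 = 3.0,
# ...) can be sanity-checked with a small stand-alone decoder. A minimal sketch assuming the
# IEEE-754 binary16 layout, independent of the repo's _f16 helper:
def f16_bits_to_float(h: int) -> float:
  s, e, m = (h >> 15) & 1, (h >> 10) & 0x1f, h & 0x3ff
  if e == 0x1f: return float('nan') if m else (float('-inf') if s else float('inf'))
  val = m * 2.0**-24 if e == 0 else (1 + m / 1024.0) * 2.0**(e - 15)  # subnormal vs normal
  return -val if s else val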
|
|
class TestF16SinKernelOps(unittest.TestCase):
|
|
"""Tests for F16 instructions used in the sin kernel. Run with USE_HW=1 to compare emulator vs hardware."""
|
|
|
|
def test_v_cvt_i16_f16_zero(self):
|
|
"""v_cvt_i16_f16: Convert f16 0.0 to i16 0."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00000000), # f16 0.0 in low bits
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_cvt_i16_f16_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1] & 0xFFFF
|
|
self.assertEqual(result, 0, f"Expected 0, got {result}")
|
|
|
|
def test_v_cvt_i16_f16_one(self):
|
|
"""v_cvt_i16_f16: Convert f16 1.0 (0x3c00) to i16 1."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00003c00), # f16 1.0 in low bits
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_cvt_i16_f16_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1] & 0xFFFF
|
|
self.assertEqual(result, 1, f"Expected 1, got {result}")
|
|
|
|
def test_v_cvt_i16_f16_negative(self):
|
|
"""v_cvt_i16_f16: Convert f16 -2.0 (0xc000) to i16 -2."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x0000c000), # f16 -2.0 in low bits
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_cvt_i16_f16_e32(v[1], v[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1] & 0xFFFF
|
|
# -2 as signed 16-bit = 0xFFFE
|
|
self.assertEqual(result, 0xFFFE, f"Expected 0xFFFE (-2), got 0x{result:04x}")
|
|
|
|
def test_v_cvt_i16_f16_from_hi(self):
|
|
"""v_cvt_i16_f16: Convert f16 from high half of register."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x3c000000), # f16 1.0 in HIGH bits, 0.0 in low
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_cvt_i16_f16_e32(v[1], v[0].h), # Read from high half
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1] & 0xFFFF
|
|
self.assertEqual(result, 1, f"Expected 1, got {result}")
|
|
|
|
def test_v_bfe_i32_sign_extend(self):
|
|
"""v_bfe_i32: Extract 16 bits with sign extension."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x80000001), # low 16 bits = 0x0001
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_bfe_i32(v[1], v[0], 0, 16), # Extract bits 0-15 with sign extend
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1]
|
|
self.assertEqual(result, 1, f"Expected 1, got {result}")
|
|
|
|
def test_v_bfe_i32_sign_extend_negative(self):
|
|
"""v_bfe_i32: Extract 16 bits with sign extension (negative value)."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x0000FFFE), # low 16 bits = 0xFFFE = -2 as i16
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_bfe_i32(v[1], v[0], 0, 16), # Extract bits 0-15 with sign extend
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1]
|
|
# -2 sign-extended to 32 bits = 0xFFFFFFFE
|
|
self.assertEqual(result, 0xFFFFFFFE, f"Expected 0xFFFFFFFE (-2), got 0x{result:08x}")
|
|
|
|
def test_v_cndmask_b16_select_src0(self):
|
|
"""v_cndmask_b16: Select src0 when vcc=0."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x3c003800), # src0.h=1.0, src0.l=0.5
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x4000c000), # src1.h=2.0, src1.l=-2.0
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[SrcEnum.VCC_LO - 128], 0), # vcc = 0
|
|
v_cndmask_b16(v[2], v[0], v[1], SrcEnum.VCC_LO), # Should select v0.l = 0.5
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2] & 0xFFFF
|
|
self.assertEqual(result, 0x3800, f"Expected 0x3800 (0.5), got 0x{result:04x}")
|
|
|
|
def test_v_cndmask_b16_select_src1(self):
|
|
"""v_cndmask_b16: Select src1 when vcc=1."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x3c003800), # src0.h=1.0, src0.l=0.5
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x4000c000), # src1.h=2.0, src1.l=-2.0
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[SrcEnum.VCC_LO - 128], 1), # vcc = 1 for lane 0
|
|
v_cndmask_b16(v[2], v[0], v[1], SrcEnum.VCC_LO), # Should select v1.l = -2.0
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2] & 0xFFFF
|
|
self.assertEqual(result, 0xc000, f"Expected 0xc000 (-2.0), got 0x{result:04x}")
|
|
|
|
def test_v_cndmask_b16_write_hi(self):
|
|
"""v_cndmask_b16: Write to high half with opsel."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x3c003800), # src0: hi=1.0, lo=0.5
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x4000c000), # src1: hi=2.0, lo=-2.0
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[2], 0xDEAD0000), # v2 initial: hi=0xDEAD, lo=0
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
s_mov_b32(s[SrcEnum.VCC_LO - 128], 0), # vcc = 0
|
|
# opsel=8 means write to high half (bit 3 = dst hi)
|
|
# opsel=1 means read src0 from hi, opsel=2 means read src1 from hi
|
|
# v_cndmask_b16 v2.h, v0.h, v1.h, vcc -> select v0.h = 1.0
|
|
VOP3(VOP3Op.V_CNDMASK_B16, vdst=v[2], src0=v[0], src1=v[1], src2=SrcEnum.VCC_LO, opsel=0b1011),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result_hi = (st.vgpr[0][2] >> 16) & 0xFFFF
|
|
result_lo = st.vgpr[0][2] & 0xFFFF
|
|
self.assertEqual(result_hi, 0x3c00, f"Expected hi=0x3c00 (1.0), got 0x{result_hi:04x}")
|
|
self.assertEqual(result_lo, 0x0000, f"Expected lo preserved as 0, got 0x{result_lo:04x}")
|
|
|
|
def test_v_mul_f16_basic(self):
|
|
"""v_mul_f16: 2.0 * 3.0 = 6.0."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00004000), # f16 2.0 in low bits
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x00004200), # f16 3.0 in low bits
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mul_f16_e32(v[2], v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2] & 0xFFFF
|
|
self.assertEqual(result, 0x4600, f"Expected 0x4600 (6.0), got 0x{result:04x}")
|
|
|
|
def test_v_mul_f16_by_zero(self):
|
|
"""v_mul_f16: x * 0.0 = 0.0."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00003c00), # f16 1.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x00000000), # f16 0.0
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mul_f16_e32(v[2], v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2] & 0xFFFF
|
|
self.assertEqual(result, 0x0000, f"Expected 0x0000 (0.0), got 0x{result:04x}")
|
|
|
|
def test_v_mul_f16_hi_half(self):
|
|
"""v_mul_f16: Multiply using high halves."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x40000000), # hi=2.0, lo=0.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x42000000), # hi=3.0, lo=0.0
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mul_f16_e32(v[2].h, v[0].h, v[1].h), # 2.0 * 3.0 = 6.0 in hi
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result_hi = (st.vgpr[0][2] >> 16) & 0xFFFF
|
|
self.assertEqual(result_hi, 0x4600, f"Expected hi=0x4600 (6.0), got 0x{result_hi:04x}")
|
|
|
|
def test_v_fmac_f16_basic(self):
|
|
"""v_fmac_f16: dst = src0 * src1 + dst = 2.0 * 3.0 + 1.0 = 7.0."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00004000), # f16 2.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x00004200), # f16 3.0
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[2], 0x00003c00), # f16 1.0 (accumulator)
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
v_fmac_f16_e32(v[2], v[0], v[1]), # v2 = v0 * v1 + v2
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2] & 0xFFFF
|
|
self.assertEqual(result, 0x4700, f"Expected 0x4700 (7.0), got 0x{result:04x}")
|
|
|
|
def test_v_fmac_f16_hi_dest(self):
|
|
"""v_fmac_f16 with .h destination: dst.h = src0 * src1 + dst.h.
|
|
|
|
This tests the case from AMD_LLVM sin(0) where V_FMAC_F16 writes to v0.h.
|
|
The accumulator D should be read from v0.h, not v0.l.
|
|
"""
|
|
from extra.assembly.amd.pcode import f32_to_f16, _f16
|
|
# Set up: v0 = {hi=0.5, lo=1.0}, src0 = 0.318... (32-bit literal), src1 = v1.l = 0.0
|
|
# Expected: v0.h = 0.318 * 0.0 + 0.5 = 0.5 (unchanged), v0.l stays 1.0
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x38003c00), # v0 = {hi=0.5, lo=1.0}
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x38000000), # v1 = {hi=0.5, lo=0.0}
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
# v_fmac_f16 v0.h, literal(0.318...), v1.l (vdst=128 for .h)
|
|
# D = D + S0 * S1 = v0.h + 0.318 * 0.0 = 0.5 + 0 = 0.5
|
|
VOP2(VOP2Op.V_FMAC_F16, vdst=RawImm(128), src0=RawImm(255), vsrc1=RawImm(1), literal=0x3518), # 0.318... * 0.0 + 0.5
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
v0 = st.vgpr[0][0]
|
|
result_hi = _f16((v0 >> 16) & 0xffff)
|
|
result_lo = _f16(v0 & 0xffff)
|
|
self.assertAlmostEqual(result_hi, 0.5, delta=0.01, msg=f"Expected v0.h=0.5, got {result_hi}")
|
|
self.assertAlmostEqual(result_lo, 1.0, delta=0.01, msg=f"Expected v0.l=1.0, got {result_lo}")
|
|
|
|
def test_v_add_f16_basic(self):
|
|
"""v_add_f16: 1.0 + 2.0 = 3.0."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00003c00), # f16 1.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x00004000), # f16 2.0
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_add_f16_e32(v[2], v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2] & 0xFFFF
|
|
self.assertEqual(result, 0x4200, f"Expected 0x4200 (3.0), got 0x{result:04x}")
|
|
|
|
def test_v_add_f16_negative(self):
|
|
"""v_add_f16: 1.0 + (-1.5703125) = -0.5703125."""
|
|
# 0xbe48 is approximately -1.5703125 in f16
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00003c00), # f16 1.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x0000be48), # f16 -1.5703125
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_add_f16_e32(v[2], v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2] & 0xFFFF
|
|
# 1.0 + (-1.5703125) = -0.5703125 which is approximately 0xb890
|
|
# Allow some tolerance - just check it's negative and close
|
|
from extra.assembly.amd.pcode import _f16
|
|
result_f = _f16(result)
|
|
expected = 1.0 - 1.5703125
|
|
self.assertAlmostEqual(result_f, expected, places=2, msg=f"Expected ~{expected}, got {result_f}")
|
|
|
|
def test_v_fmaak_f16_basic(self):
|
|
"""v_fmaak_f16: dst = src0 * vsrc1 + K."""
|
|
# v_fmaak_f16 computes: D = S0 * S1 + K
|
|
# 2.0 * 3.0 + 1.0 = 7.0
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00004000), # f16 2.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x00004200), # f16 3.0
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_fmaak_f16_e32(v[2], v[0], v[1], 0x3c00), # v2 = v0 * v1 + 1.0
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][2] & 0xFFFF
|
|
self.assertEqual(result, 0x4700, f"Expected 0x4700 (7.0), got 0x{result:04x}")
|
|
|
|
def test_v_fmamk_f32_basic(self):
|
|
"""v_fmamk_f32: dst = src0 * K + vsrc1."""
|
|
# v_fmamk_f32 computes: D = S0 * K + S1
|
|
# 2.0 * 3.0 + 1.0 = 7.0
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(2.0)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], f2i(1.0)), # accumulator
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_fmamk_f32_e32(v[2], v[0], f2i(3.0), v[1]), # v2 = v0 * 3.0 + v1
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][2])
|
|
self.assertAlmostEqual(result, 7.0, places=5, msg=f"Expected 7.0, got {result}")
|
|
|
|
def test_v_fmamk_f32_small_constant(self):
|
|
"""v_fmamk_f32: Test with small constant like in sin kernel."""
|
|
# This mimics part of the sin kernel: 1.0 * (-1.13e-4) + (-3.1414795) ≈ -3.1415926
|
|
k_val = 0xb8ed5000 # approximately -0.0001131594 as f32
|
|
s1_val = f2i(-3.1414794921875)
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(1.0)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], s1_val),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_fmamk_f32_e32(v[2], v[0], k_val, v[1]), # v2 = 1.0 * K + v1
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][2])
|
|
k_f32 = i2f(k_val)
|
|
expected = 1.0 * k_f32 + (-3.1414794921875)
|
|
self.assertAlmostEqual(result, expected, places=5, msg=f"Expected {expected}, got {result}")
|
|
|
|
def test_v_mov_b16_to_hi(self):
|
|
"""v_mov_b16: Move immediate to high half, preserving low."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x0000DEAD), # initial: lo=0xDEAD, hi=0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b16_e32(v[0].h, 0x3800), # Move 0.5 to high half
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result_hi = (st.vgpr[0][0] >> 16) & 0xFFFF
|
|
result_lo = st.vgpr[0][0] & 0xFFFF
|
|
self.assertEqual(result_hi, 0x3800, f"Expected hi=0x3800, got 0x{result_hi:04x}")
|
|
self.assertEqual(result_lo, 0xDEAD, f"Expected lo=0xDEAD (preserved), got 0x{result_lo:04x}")
|
|
|
|
def test_v_mov_b16_to_lo(self):
|
|
"""v_mov_b16: Move immediate to low half, preserving high."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xBEEF0000), # initial: hi=0xBEEF, lo=0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b16_e32(v[0], 0x3c00), # Move 1.0 to low half
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result_hi = (st.vgpr[0][0] >> 16) & 0xFFFF
|
|
result_lo = st.vgpr[0][0] & 0xFFFF
|
|
self.assertEqual(result_lo, 0x3c00, f"Expected lo=0x3c00, got 0x{result_lo:04x}")
|
|
self.assertEqual(result_hi, 0xBEEF, f"Expected hi=0xBEEF (preserved), got 0x{result_hi:04x}")
|
|
|
|
def test_v_xor_b32_sign_flip(self):
|
|
"""v_xor_b32: XOR with 0x8000 flips sign of f16 in low bits."""
|
|
# 0x4246 is approximately 3.13671875 in f16
|
|
# XOR with 0x8000 gives 0xC246 which is -3.13671875
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00004246), # f16 3.13671875
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_xor_b32_e32(v[1], 0x8000, v[0]), # Flip sign bit of low half
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = st.vgpr[0][1] & 0xFFFF
|
|
self.assertEqual(result, 0xC246, f"Expected 0xC246 (-3.137), got 0x{result:04x}")
|
|
|
|
def test_v_fma_mix_f32_all_f32_sources(self):
|
|
"""v_fma_mix_f32: All sources as f32 (opsel_hi=0)."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(2.0)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], f2i(3.0)),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[2], f2i(1.0)),
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
# opsel_hi=0,0,0 means all sources are f32
|
|
VOP3P(VOP3POp.V_FMA_MIX_F32, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][3])
|
|
self.assertAlmostEqual(result, 7.0, places=5, msg=f"2*3+1=7, got {result}")
|
|
|
|
def test_v_fma_mixlo_f16_all_f32_sources(self):
|
|
"""v_fma_mixlo_f16: All sources as f32, result to low f16."""
|
|
instructions = [
|
|
s_mov_b32(s[0], f2i(1.0)),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], f2i(-1.22e-10)), # Very small
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
s_mov_b32(s[2], f2i(-3.1415927)), # -pi
|
|
v_mov_b32_e32(v[2], s[2]),
|
|
s_mov_b32(s[3], 0xDEAD0000), # Garbage in hi
|
|
v_mov_b32_e32(v[3], s[3]),
|
|
# 1.0 * (-1.22e-10) + (-3.1415927) ≈ -3.1415927
|
|
VOP3P(VOP3POp.V_FMA_MIXLO_F16, vdst=v[3], src0=v[0], src1=v[1], src2=v[2], opsel=0, opsel_hi=0, opsel_hi2=0),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
from extra.assembly.amd.pcode import _f16
|
|
result_lo = _f16(st.vgpr[0][3] & 0xFFFF)
|
|
result_hi = (st.vgpr[0][3] >> 16) & 0xFFFF
|
|
# Result should be approximately -pi
|
|
self.assertAlmostEqual(result_lo, -3.14, delta=0.01, msg=f"Expected ~-3.14, got {result_lo}")
|
|
self.assertEqual(result_hi, 0xDEAD, f"Expected hi preserved as 0xDEAD, got 0x{result_hi:04x}")
|
|
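# The two literal-carrying FMA forms exercised in the tests above differ only in where the
# 32-bit constant K enters the computation. A minimal reference in plain Python floats (the
# f16 tests above tolerate the extra rounding):
def ref_fmaak(s0: float, s1: float, k: float) -> float:
  return s0 * s1 + k  # V_FMAAK_*: K is the addend
def ref_fmamk(s0: float, k: float, s1: float) -> float:
  return s0 * k + s1  # V_FMAMK_*: K is the multiplier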
|
|
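# A pure-Python model of the classification that the next class exercises, following the bit
# mapping listed in its docstring (assumption: signaling vs. quiet NaN is distinguished by the
# mantissa MSB). Illustrative reference only, not the emulator's implementation.
def ref_cmp_class_f16(bits: int, mask: int) -> bool:
  s, e, m = (bits >> 15) & 1, (bits >> 10) & 0x1f, bits & 0x3ff
  if e == 0x1f and m: cls = 1 << 1 if (m & 0x200) else 1 << 0   # quiet vs signaling NaN
  elif e == 0x1f: cls = 1 << (2 if s else 9)                    # +-infinity
  elif e == 0 and m == 0: cls = 1 << (5 if s else 6)            # +-zero
  elif e == 0: cls = 1 << (4 if s else 7)                       # +-denormal
  else: cls = 1 << (3 if s else 8)                              # +-normal
  return bool(cls & mask)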
|
|
class TestVCmpClassF16(unittest.TestCase):
|
|
"""Tests for V_CMP_CLASS_F16 - critical for f16 sin/cos classification.
|
|
|
|
Class bit mapping:
|
|
bit 0 = signaling NaN
|
|
bit 1 = quiet NaN
|
|
bit 2 = -infinity
|
|
bit 3 = -normal
|
|
bit 4 = -denormal
|
|
bit 5 = -zero
|
|
bit 6 = +zero
|
|
bit 7 = +denormal
|
|
bit 8 = +normal
|
|
bit 9 = +infinity
|
|
|
|
This is crucial for the f16 sin kernel which uses v_cmp_class_f16 to detect
|
|
special values like +-0, +-inf, NaN and select appropriate outputs.
|
|
"""
|
|
|
|
def test_cmp_class_f16_positive_zero(self):
|
|
"""V_CMP_CLASS_F16: +zero should match bit 6."""
|
|
# f16 +0.0 = 0x0000
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0), # f16 +0.0 in low 16 bits
|
|
v_mov_b32_e32(v[1], 0x40), # bit 6 only (+zero)
|
|
v_cmp_class_f16_e32(v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +zero with mask 0x40")
|
|
|
|
def test_cmp_class_f16_negative_zero(self):
|
|
"""V_CMP_CLASS_F16: -zero should match bit 5."""
|
|
# f16 -0.0 = 0x8000
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x8000), # f16 -0.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], 0x20), # bit 5 only (-zero)
|
|
v_cmp_class_f16_e32(v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for -zero with mask 0x20")
|
|
|
|
def test_cmp_class_f16_positive_normal(self):
|
|
"""V_CMP_CLASS_F16: +1.0 (normal) should match bit 8."""
|
|
# f16 1.0 = 0x3c00
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x3c00), # f16 +1.0
|
|
s_mov_b32(s[1], 0x100), # bit 8 (+normal)
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_cmp_class_f16_e32(v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +1.0 with mask 0x100 (+normal)")
|
|
|
|
def test_cmp_class_f16_negative_normal(self):
|
|
"""V_CMP_CLASS_F16: -1.0 (normal) should match bit 3."""
|
|
# f16 -1.0 = 0xbc00
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xbc00), # f16 -1.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], 0x08), # bit 3 (-normal)
|
|
v_cmp_class_f16_e32(v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for -1.0 with mask 0x08 (-normal)")
|
|
|
|
def test_cmp_class_f16_positive_infinity(self):
|
|
"""V_CMP_CLASS_F16: +inf should match bit 9."""
|
|
# f16 +inf = 0x7c00
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x7c00), # f16 +inf
|
|
s_mov_b32(s[1], 0x200), # bit 9 (+inf)
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_cmp_class_f16_e32(v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +inf with mask 0x200")
|
|
|
|
def test_cmp_class_f16_negative_infinity(self):
|
|
"""V_CMP_CLASS_F16: -inf should match bit 2."""
|
|
# f16 -inf = 0xfc00
|
|
instructions = [
|
|
s_mov_b32(s[0], 0xfc00), # f16 -inf
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], 0x04), # bit 2 (-inf)
|
|
v_cmp_class_f16_e32(v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for -inf with mask 0x04")
|
|
|
|
def test_cmp_class_f16_quiet_nan(self):
|
|
"""V_CMP_CLASS_F16: quiet NaN should match bit 1."""
|
|
# f16 quiet NaN = 0x7e00 (exponent all 1s, mantissa MSB set)
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x7e00), # f16 quiet NaN
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], 0x02), # bit 1 (quiet NaN)
|
|
v_cmp_class_f16_e32(v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for quiet NaN with mask 0x02")
|
|
|
|
def test_cmp_class_f16_signaling_nan(self):
|
|
"""V_CMP_CLASS_F16: signaling NaN should match bit 0."""
|
|
# f16 signaling NaN = 0x7c01 (exponent all 1s, mantissa MSB clear, other mantissa bits set)
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x7c01), # f16 signaling NaN
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], 0x01), # bit 0 (signaling NaN)
|
|
v_cmp_class_f16_e32(v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for signaling NaN with mask 0x01")
|
|
|
|
def test_cmp_class_f16_positive_denormal(self):
|
|
"""V_CMP_CLASS_F16: positive denormal should match bit 7."""
|
|
# f16 smallest positive denormal = 0x0001
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 1), # f16 +denormal (0x0001)
|
|
v_mov_b32_e32(v[1], 0x80), # bit 7 (+denormal)
|
|
v_cmp_class_f16_e32(v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +denormal with mask 0x80")
|
|
|
|
def test_cmp_class_f16_negative_denormal(self):
|
|
"""V_CMP_CLASS_F16: negative denormal should match bit 4."""
|
|
# f16 smallest negative denormal = 0x8001
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x8001), # f16 -denormal
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
v_mov_b32_e32(v[1], 0x10), # bit 4 (-denormal)
|
|
v_cmp_class_f16_e32(v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for -denormal with mask 0x10")
|
|
|
|
def test_cmp_class_f16_combined_mask_zeros(self):
|
|
"""V_CMP_CLASS_F16: mask 0x60 covers both +zero and -zero."""
|
|
# Test with +0.0
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0), # f16 +0.0
|
|
v_mov_b32_e32(v[1], 0x60), # bits 5 and 6 (+-zero)
|
|
v_cmp_class_f16_e32(v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +zero with mask 0x60")
|
|
|
|
def test_cmp_class_f16_combined_mask_1f8(self):
|
|
"""V_CMP_CLASS_F16: mask 0x1f8 covers -normal,-denorm,-zero,+zero,+denorm,+normal.
|
|
|
|
This is the exact mask used in the f16 sin kernel at PC=46:
|
|
v_cmp_class_f16_e64 vcc_lo, v1, 0x1f8
|
|
|
|
The kernel uses this to detect if the input is a "normal" finite value
|
|
(not NaN, not infinity). If the check fails (vcc=0), it selects NaN output.
|
|
"""
|
|
# Test with +0.0 - should match via bit 6
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0), # f16 +0.0
|
|
s_mov_b32(s[0], 0x1f8),
|
|
v_mov_b32_e32(v[1], s[0]), # mask 0x1f8
|
|
v_cmp_class_f16_e32(v[0], v[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +zero with mask 0x1f8")
|
|
|
|
def test_cmp_class_f16_vop3_encoding(self):
|
|
"""V_CMP_CLASS_F16 in VOP3 encoding (v_cmp_class_f16_e64).
|
|
|
|
This tests the exact instruction encoding used in the f16 sin kernel.
|
|
VOP3 encoding allows the result to go to any SGPR pair, not just VCC.
|
|
"""
|
|
# v_cmp_class_f16_e64 vcc_lo, v0, 0x1f8
|
|
# Use SGPR to hold the mask since literals require special handling
|
|
instructions = [
|
|
v_mov_b32_e32(v[0], 0), # f16 +0.0
|
|
s_mov_b32(s[0], 0x1f8), # class mask
|
|
VOP3(VOP3Op.V_CMP_CLASS_F16, vdst=RawImm(VCC), src0=v[0], src1=s[0]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +zero with VOP3 encoding")
|
|
|
|
def test_cmp_class_f16_vop3_normal_positive(self):
|
|
"""V_CMP_CLASS_F16 VOP3 encoding with +1.0 (normal)."""
|
|
# f16 1.0 = 0x3c00, should match bit 8 (+normal) in mask 0x1f8
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x3c00), # f16 +1.0
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x1f8), # class mask
|
|
VOP3(VOP3Op.V_CMP_CLASS_F16, vdst=RawImm(VCC), src0=v[0], src1=s[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 1, "VCC should be 1 for +1.0 (normal) with mask 0x1f8")
|
|
|
|
def test_cmp_class_f16_vop3_nan_fails_mask(self):
|
|
"""V_CMP_CLASS_F16 VOP3: NaN should NOT match mask 0x1f8 (no NaN bits set)."""
|
|
# f16 quiet NaN = 0x7e00, should NOT match mask 0x1f8 (bits 3-8 only)
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x7e00), # f16 quiet NaN
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x1f8), # class mask
|
|
VOP3(VOP3Op.V_CMP_CLASS_F16, vdst=RawImm(VCC), src0=v[0], src1=s[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 0, "VCC should be 0 for NaN with mask 0x1f8 (no NaN bits)")
|
|
|
|
def test_cmp_class_f16_vop3_inf_fails_mask(self):
|
|
"""V_CMP_CLASS_F16 VOP3: +inf should NOT match mask 0x1f8 (no inf bits set)."""
|
|
# f16 +inf = 0x7c00, should NOT match mask 0x1f8 (bits 3-8 only)
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x7c00), # f16 +inf
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], 0x1f8), # class mask
|
|
VOP3(VOP3Op.V_CMP_CLASS_F16, vdst=RawImm(VCC), src0=v[0], src1=s[1]),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vcc & 1, 0, "VCC should be 0 for +inf with mask 0x1f8 (no inf bits)")
|
|
|
|
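# The modifier rule that the next class pins down acts purely on the f16 sign bit: abs clears
# bit 15, neg toggles it, with abs applied before neg. A minimal sketch of that bit-level view
# (not the emulator's code path):
def apply_f16_src_mods(bits: int, neg: bool = False, absolute: bool = False) -> int:
  if absolute: bits &= 0x7fff  # drop the sign
  if neg: bits ^= 0x8000       # then flip it
  return bits & 0xffff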
|
|
class TestVOP3F16Modifiers(unittest.TestCase):
|
|
"""Tests for VOP3 16-bit ops with abs/neg modifiers and inline constants.
|
|
|
|
VOP3 16-bit ops must:
|
|
1. Use f16 inline constants (not f32)
|
|
2. Apply abs/neg modifiers as f16 operations (toggle bit 15)
|
|
|
|
This is critical for sin/cos kernels that use v_cvt_f32_f16 with |abs|
|
|
and v_fma_f16 with inline constants.
|
|
"""
|
|
|
|
def test_v_cvt_f32_f16_abs_negative(self):
|
|
"""V_CVT_F32_F16 with |abs| on negative value."""
|
|
from extra.assembly.amd.pcode import f32_to_f16
|
|
f16_neg1 = f32_to_f16(-1.0) # 0xbc00
|
|
instructions = [
|
|
s_mov_b32(s[0], f16_neg1),
|
|
v_mov_b32_e32(v[1], s[0]),
|
|
v_cvt_f32_f16_e64(v[0], abs(v[1])), # |(-1.0)| = 1.0
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][0])
|
|
self.assertAlmostEqual(result, 1.0, places=5, msg=f"Expected 1.0, got {result}")
|
|
|
|
def test_v_cvt_f32_f16_abs_positive(self):
|
|
"""V_CVT_F32_F16 with |abs| on positive value (should stay positive)."""
|
|
from extra.assembly.amd.pcode import f32_to_f16
|
|
f16_2 = f32_to_f16(2.0) # 0x4000
|
|
instructions = [
|
|
s_mov_b32(s[0], f16_2),
|
|
v_mov_b32_e32(v[1], s[0]),
|
|
v_cvt_f32_f16_e64(v[0], abs(v[1])), # |2.0| = 2.0
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][0])
|
|
self.assertAlmostEqual(result, 2.0, places=5, msg=f"Expected 2.0, got {result}")
|
|
|
|
def test_v_cvt_f32_f16_neg_positive(self):
|
|
"""V_CVT_F32_F16 with neg on positive value."""
|
|
from extra.assembly.amd.pcode import f32_to_f16
|
|
f16_2 = f32_to_f16(2.0) # 0x4000
|
|
instructions = [
|
|
s_mov_b32(s[0], f16_2),
|
|
v_mov_b32_e32(v[1], s[0]),
|
|
v_cvt_f32_f16_e64(v[0], -v[1]), # -(2.0) = -2.0
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][0])
|
|
self.assertAlmostEqual(result, -2.0, places=5, msg=f"Expected -2.0, got {result}")
|
|
|
|
def test_v_cvt_f32_f16_neg_negative(self):
|
|
"""V_CVT_F32_F16 with neg on negative value (double negative)."""
|
|
from extra.assembly.amd.pcode import f32_to_f16
|
|
f16_neg2 = f32_to_f16(-2.0) # 0xc000
|
|
instructions = [
|
|
s_mov_b32(s[0], f16_neg2),
|
|
v_mov_b32_e32(v[1], s[0]),
|
|
v_cvt_f32_f16_e64(v[0], -v[1]), # -(-2.0) = 2.0
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i2f(st.vgpr[0][0])
|
|
self.assertAlmostEqual(result, 2.0, places=5, msg=f"Expected 2.0, got {result}")
|
|
|
|
def test_v_fma_f16_inline_const_1_0(self):
|
|
"""V_FMA_F16: a*b + 1.0 should use f16 inline constant."""
|
|
from extra.assembly.amd.pcode import f32_to_f16, _f16
|
|
# v4 = 0.3259 (f16), v6 = -0.4866 (f16), src2 = 1.0 inline
|
|
# Result: 0.3259 * (-0.4866) + 1.0 = 0.8413...
|
|
f16_a = f32_to_f16(0.325928) # 0x3537
|
|
f16_b = f32_to_f16(-0.486572) # 0xb7c9
|
|
instructions = [
|
|
s_mov_b32(s[0], f16_a),
|
|
v_mov_b32_e32(v[4], s[0]),
|
|
s_mov_b32(s[1], f16_b),
|
|
v_mov_b32_e32(v[6], s[1]),
|
|
v_fma_f16(v[4], v[4], v[6], 1.0), # 1.0 is inline constant
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = _f16(st.vgpr[0][4] & 0xffff)
|
|
expected = 0.325928 * (-0.486572) + 1.0
|
|
self.assertAlmostEqual(result, expected, delta=0.01, msg=f"Expected ~{expected:.4f}, got {result}")
|
|
|
|
def test_v_fma_f16_inline_const_0_5(self):
|
|
"""V_FMA_F16: a*b + 0.5 should use f16 inline constant."""
|
|
from extra.assembly.amd.pcode import f32_to_f16, _f16
|
|
f16_a = f32_to_f16(2.0)
|
|
f16_b = f32_to_f16(3.0)
|
|
instructions = [
|
|
s_mov_b32(s[0], f16_a),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], f16_b),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_fma_f16(v[2], v[0], v[1], 0.5), # 0.5 is inline constant
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = _f16(st.vgpr[0][2] & 0xffff)
|
|
expected = 2.0 * 3.0 + 0.5
|
|
self.assertAlmostEqual(result, expected, delta=0.01, msg=f"Expected {expected}, got {result}")
|
|
|
|
def test_v_fma_f16_inline_const_neg_1_0(self):
|
|
"""V_FMA_F16: a*b + (-1.0) should use f16 inline constant."""
|
|
from extra.assembly.amd.pcode import f32_to_f16, _f16
|
|
f16_a = f32_to_f16(2.0)
|
|
f16_b = f32_to_f16(3.0)
|
|
instructions = [
|
|
s_mov_b32(s[0], f16_a),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], f16_b),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_fma_f16(v[2], v[0], v[1], -1.0), # -1.0 is inline constant
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = _f16(st.vgpr[0][2] & 0xffff)
|
|
expected = 2.0 * 3.0 + (-1.0)
|
|
self.assertAlmostEqual(result, expected, delta=0.01, msg=f"Expected {expected}, got {result}")
|
|
|
|
def test_v_add_f16_abs_both(self):
|
|
"""V_ADD_F16 with abs on both operands."""
|
|
from extra.assembly.amd.pcode import f32_to_f16, _f16
|
|
f16_neg2 = f32_to_f16(-2.0)
|
|
f16_neg3 = f32_to_f16(-3.0)
|
|
instructions = [
|
|
s_mov_b32(s[0], f16_neg2),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], f16_neg3),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_add_f16_e64(v[2], abs(v[0]), abs(v[1])), # |-2| + |-3| = 5
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = _f16(st.vgpr[0][2] & 0xffff)
|
|
self.assertAlmostEqual(result, 5.0, delta=0.01, msg=f"Expected 5.0, got {result}")
|
|
|
|
def test_v_mul_f16_neg_abs(self):
|
|
"""V_MUL_F16 with neg on one operand and abs on another."""
|
|
from extra.assembly.amd.pcode import f32_to_f16, _f16
|
|
f16_2 = f32_to_f16(2.0)
|
|
f16_neg3 = f32_to_f16(-3.0)
|
|
instructions = [
|
|
s_mov_b32(s[0], f16_2),
|
|
v_mov_b32_e32(v[0], s[0]),
|
|
s_mov_b32(s[1], f16_neg3),
|
|
v_mov_b32_e32(v[1], s[1]),
|
|
v_mul_f16_e64(v[2], -v[0], abs(v[1])), # -(2) * |-3| = -6
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = _f16(st.vgpr[0][2] & 0xffff)
|
|
self.assertAlmostEqual(result, -6.0, delta=0.01, msg=f"Expected -6.0, got {result}")
|
|
|
|
|
|
|
|
|
|
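# A compact model of V_FMA_MIXLO_F16 with all-f32 sources (opsel_hi=0), as exercised above and
# in the class below: compute a*b+c as a float, convert to an f16 bit pattern, and merge it into
# the LOW 16 bits of the destination while preserving the high half. Python's double arithmetic
# stands in for the f32 FMA here, which is close enough for the tolerances these tests use.
from extra.assembly.amd.pcode import f32_to_f16  # converter already used by several tests above
def ref_fma_mixlo_f16(a: float, b: float, c: float, old_vdst: int) -> int:
  return (old_vdst & 0xffff0000) | (f32_to_f16(a * b + c) & 0xffff)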
|
|
class TestVFmaMixSinCase(unittest.TestCase):
|
|
"""Tests for the specific V_FMA_MIXLO_F16 case that fails in AMD_LLVM sin(0) kernel."""
|
|
|
|
def test_v_fma_mixlo_f16_sin_case(self):
|
|
"""V_FMA_MIXLO_F16 case from sin kernel at pc=0x14e.
|
|
|
|
This tests the specific operands that produce the wrong result:
|
|
- src0 = v3 = 0x3f800000 (f32 1.0)
|
|
- src1 = s6 = 0xaf05a309 (f32 tiny negative)
|
|
- src2 = v5 = 0xc0490fdb (f32 -π)
|
|
- Result should be approximately -π (tiny * 1.0 + -π ≈ -π)
|
|
"""
|
|
from extra.assembly.amd.pcode import _f16
|
|
instructions = [
|
|
# Set up operands as in the sin kernel
|
|
s_mov_b32(s[0], 0x3f800000), # f32 1.0
|
|
v_mov_b32_e32(v[3], s[0]),
|
|
s_mov_b32(s[1], 0xaf05a309), # f32 tiny negative
|
|
s_mov_b32(s[6], s[1]),
|
|
s_mov_b32(s[2], 0xc0490fdb), # f32 -π
|
|
v_mov_b32_e32(v[5], s[2]),
|
|
# Pre-fill v3 with expected hi bits
|
|
s_mov_b32(s[3], 0x3f800000), # v3 = f32 1.0 again; MIXLO rewrites only the low 16 bits, so the 0x3f80 high half should survive
|
|
v_mov_b32_e32(v[3], s[3]),
|
|
# V_FMA_MIXLO_F16: src0=v3 (259), src1=s6, src2=v5 (261), opsel=0, opsel_hi=0, opsel_hi2=0
|
|
VOP3P(VOP3POp.V_FMA_MIXLO_F16, vdst=v[3], src0=v[3], src1=s[6], src2=v[5], opsel=0, opsel_hi=0, opsel_hi2=0),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
lo = _f16(st.vgpr[0][3] & 0xffff)
|
|
# Result should be approximately -π = -3.14...
|
|
# f16 -π ≈ 0xc248 = -3.140625
|
|
self.assertAlmostEqual(lo, -3.14159, delta=0.01, msg=f"Expected ~-π, got {lo}")
|
|
|
|
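# The property that the next class asserts, stated as a tiny check: for a small input the three
# chunks returned for index 0, 1, 2 are non-overlapping slices of 2/PI whose plain double-precision
# sum reconstructs 2/PI (illustrative only; a real Payne-Hanek reduction combines the x*chunk
# partial products more carefully).
import math
def chunks_reconstruct_two_over_pi(p0: float, p1: float, p2: float, tol: float = 1e-14) -> bool:
  return abs((p0 + p1 + p2) - 2.0 / math.pi) < tol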
|
|
class TestVTrigPreopF64(unittest.TestCase):
|
|
"""Tests for V_TRIG_PREOP_F64 instruction.
|
|
|
|
V_TRIG_PREOP_F64 extracts chunks of 2/PI for Payne-Hanek trig range reduction.
|
|
For input S0 (f64) and index S1 (0, 1, or 2), it returns a portion of 2/PI
|
|
scaled appropriately for computing |S0| * (2/PI) in extended precision.
|
|
|
|
The three chunks (index 0, 1, 2) when summed should equal 2/PI.
|
|
"""
|
|
|
|
def test_trig_preop_f64_index0(self):
|
|
"""V_TRIG_PREOP_F64 index=0: primary chunk of 2/PI."""
|
|
import math
|
|
two_over_pi = 2.0 / math.pi
|
|
instructions = [
|
|
# S0 = 1.0 (f64), S1 = 0 (index)
|
|
s_mov_b32(s[0], 0x00000000), # low bits of 1.0
|
|
s_mov_b32(s[1], 0x3ff00000), # high bits of 1.0
|
|
v_trig_preop_f64(v[0], abs(s[0]), 0), # index 0
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
|
|
# For x=1.0, index=0 should give the main part of 2/PI
|
|
self.assertAlmostEqual(result, two_over_pi, places=10, msg=f"Expected ~{two_over_pi}, got {result}")
|
|
|
|
def test_trig_preop_f64_index1(self):
|
|
"""V_TRIG_PREOP_F64 index=1: secondary chunk (extended precision bits)."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00000000), # low bits of 1.0
|
|
s_mov_b32(s[1], 0x3ff00000), # high bits of 1.0
|
|
v_trig_preop_f64(v[0], abs(s[0]), 1), # index 1
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
|
|
# Index 1 gives the next 53 bits, should be very small (~1e-16)
|
|
self.assertLess(abs(result), 1e-15, msg=f"Expected tiny value, got {result}")
|
|
self.assertGreater(abs(result), 0, msg="Expected non-zero value")
|
|
|
|
def test_trig_preop_f64_index2(self):
|
|
"""V_TRIG_PREOP_F64 index=2: tertiary chunk (more extended precision bits)."""
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00000000), # low bits of 1.0
|
|
s_mov_b32(s[1], 0x3ff00000), # high bits of 1.0
|
|
v_trig_preop_f64(v[0], abs(s[0]), 2), # index 2
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
|
|
# Index 2 gives the next 53 bits after index 1, should be tiny (~1e-32)
|
|
self.assertLess(abs(result), 1e-30, msg=f"Expected very tiny value, got {result}")
|
|
|
|
def test_trig_preop_f64_sum_equals_two_over_pi(self):
|
|
"""V_TRIG_PREOP_F64: sum of chunks 0,1,2 should equal 2/PI."""
|
|
import math
|
|
two_over_pi = 2.0 / math.pi
|
|
instructions = [
|
|
s_mov_b32(s[0], 0x00000000), # low bits of 1.0
|
|
s_mov_b32(s[1], 0x3ff00000), # high bits of 1.0
|
|
v_trig_preop_f64(v[0], abs(s[0]), 0), # index 0 -> v[0:1]
|
|
v_trig_preop_f64(v[2], abs(s[0]), 1), # index 1 -> v[2:3]
|
|
v_trig_preop_f64(v[4], abs(s[0]), 2), # index 2 -> v[4:5]
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
p0 = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
|
|
p1 = i642f(st.vgpr[0][2] | (st.vgpr[0][3] << 32))
|
|
p2 = i642f(st.vgpr[0][4] | (st.vgpr[0][5] << 32))
|
|
total = p0 + p1 + p2
|
|
self.assertAlmostEqual(total, two_over_pi, places=14, msg=f"Expected {two_over_pi}, got {total} (p0={p0}, p1={p1}, p2={p2})")
|
|
|
|
def test_trig_preop_f64_large_input(self):
|
|
"""V_TRIG_PREOP_F64 with larger input should adjust shift based on exponent."""
|
|
import math
|
|
# For x=2.0, exponent(2.0)=1024 which is <= 1077, so no adjustment
|
|
# But let's test with x=2^60 where exponent > 1077
|
|
large_val = 2.0 ** 60 # exponent = 1083 > 1077
|
|
large_bits = f2i64(large_val)
|
|
instructions = [
|
|
s_mov_b32(s[0], large_bits & 0xffffffff),
|
|
s_mov_b32(s[1], (large_bits >> 32) & 0xffffffff),
|
|
v_trig_preop_f64(v[0], abs(s[0]), 0),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
result = i642f(st.vgpr[0][0] | (st.vgpr[0][1] << 32))
|
|
# Result should still be a valid float (not NaN or inf)
|
|
self.assertFalse(math.isnan(result), "Result should not be NaN")
|
|
self.assertFalse(math.isinf(result), "Result should not be inf")
|
|
|
|
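# The encoding rule the next class checks, as a stand-alone sketch mirroring the comments in its
# tests: a 64-bit VALU op carries a single 32-bit literal which supplies the HIGH dword of the
# operand, so a negative Python int must be masked to 32 bits rather than sign-extended to 64.
def encode_64bit_literal(value: int) -> int:
  lit32 = value & 0xffffffff  # e.g. -33 -> 0xffffffdf
  return lit32 << 32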
|
|
class Test64BitLiterals(unittest.TestCase):
|
|
"""Regression tests for 64-bit instruction literal encoding.
|
|
Tests verify that Inst.to_bytes() correctly encodes 64-bit literals."""
|
|
|
|
def test_64bit_literal_negative_encoding(self):
|
|
"""Verify 64-bit instruction encodes negative literals correctly.
|
|
Regression test: -33 should encode as 0xffffffdf in the literal field,
|
|
NOT as 0xffffffff (which would happen with incorrect sign extension)."""
|
|
neg_val = -33
|
|
expected_lit = neg_val & 0xffffffff # 0xffffffdf
|
|
inst = v_add_f64(v[2], v[0], neg_val)
|
|
# Check the literal is stored correctly (in high 32 bits for 64-bit ops)
|
|
self.assertIsNotNone(inst._literal, "Literal should be set")
|
|
# Literal is stored as (lit32 << 32) for 64-bit ops
|
|
actual_lit = (inst._literal >> 32) & 0xffffffff
|
|
self.assertEqual(actual_lit, expected_lit, f"Literal should be {expected_lit:#x}, got {actual_lit:#x}")
|
|
# Also verify the encoded bytes
|
|
code = inst.to_bytes()
|
|
# Literal is last 4 bytes
|
|
lit_bytes = code[-4:]
|
|
lit_val = int.from_bytes(lit_bytes, 'little')
|
|
self.assertEqual(lit_val, expected_lit, f"Encoded literal should be {expected_lit:#x}, got {lit_val:#x}")
|
|
|
|
def test_64bit_literal_positive_encoding(self):
|
|
"""Verify 64-bit instruction encodes large positive literals correctly."""
|
|
large_val = 0x12345678
|
|
inst = v_add_f64(v[2], v[0], large_val)
|
|
self.assertIsNotNone(inst._literal, "Literal should be set")
|
|
actual_lit = (inst._literal >> 32) & 0xffffffff
|
|
self.assertEqual(actual_lit, large_val, f"Literal should be {large_val:#x}, got {actual_lit:#x}")
|
|
# Verify encoded bytes
|
|
code = inst.to_bytes()
|
|
lit_bytes = code[-4:]
|
|
lit_val = int.from_bytes(lit_bytes, 'little')
|
|
self.assertEqual(lit_val, large_val, f"Encoded literal should be {large_val:#x}, got {lit_val:#x}")
|
|
|
|
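# The branch condition the next class pins down, as a one-liner: in wave32 only VCC_LO
# participates, so stale VCC_HI bits must never influence S_CBRANCH_VCCNZ / S_CBRANCH_VCCZ.
def vccnz_taken(vcc: int, wave32: bool = True) -> bool:
  return (vcc & (0xffffffff if wave32 else 0xffffffffffffffff)) != 0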
|
|
class TestWave32VCCBranch(unittest.TestCase):
|
|
"""Regression tests for wave32 VCC branch behavior.
|
|
In wave32 mode, S_CBRANCH_VCCNZ/VCCZ should only check VCC_LO (lower 32 bits),
|
|
ignoring VCC_HI. Bug: emulator was checking full 64-bit VCC, causing incorrect
|
|
branches when VCC_LO=0 but VCC_HI!=0."""
|
|
|
|
def test_cbranch_vccnz_ignores_vcc_hi(self):
|
|
"""S_CBRANCH_VCCNZ should NOT branch when VCC_LO=0, even if VCC_HI!=0.
|
|
This is the fix for test_avg_pool3d failure where the emulator incorrectly
|
|
branched due to stale VCC_HI bits."""
|
|
instructions = [
|
|
# Set VCC_HI to non-zero (simulating stale bits from previous ops)
|
|
s_mov_b32(s[SrcEnum.VCC_HI - 128], 0x80000000), # VCC_HI = 0x80000000
|
|
# Set VCC_LO to zero (the condition we're testing)
|
|
s_mov_b32(s[SrcEnum.VCC_LO - 128], 0), # VCC_LO = 0
|
|
# Now S_CBRANCH_VCCNZ should NOT branch since VCC_LO is 0
|
|
# If it doesn't branch, we'll set v0 = 1; if it branches, v0 stays 0
|
|
v_mov_b32_e32(v[0], 0),
|
|
s_cbranch_vccnz(2), # Skip next instruction if VCC != 0
|
|
v_mov_b32_e32(v[0], 1), # This should execute
|
|
s_nop(0), # Jump target
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# v0 should be 1 because VCC_LO=0 means no branch
|
|
self.assertEqual(st.vgpr[0][0], 1, "Should NOT branch when VCC_LO=0 (VCC_HI ignored in wave32)")
|
|
|
|
def test_cbranch_vccz_ignores_vcc_hi(self):
|
|
"""S_CBRANCH_VCCZ should branch when VCC_LO=0, regardless of VCC_HI."""
|
|
instructions = [
|
|
# Set VCC_HI to non-zero (simulating stale bits)
|
|
s_mov_b32(s[SrcEnum.VCC_HI - 128], 0x80000000), # VCC_HI = 0x80000000
|
|
# Set VCC_LO to zero
|
|
s_mov_b32(s[SrcEnum.VCC_LO - 128], 0), # VCC_LO = 0
|
|
# S_CBRANCH_VCCZ should branch since VCC_LO is 0
|
|
v_mov_b32_e32(v[0], 0),
|
|
s_cbranch_vccz(2), # Skip next instruction if VCC == 0
|
|
v_mov_b32_e32(v[0], 1), # This should NOT execute
|
|
s_nop(0), # Jump target
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# v0 should be 0 because VCC_LO=0 means branch is taken
|
|
self.assertEqual(st.vgpr[0][0], 0, "Should branch when VCC_LO=0 (VCC_HI ignored in wave32)")
|
|
|
|
def test_cbranch_vccnz_branches_on_vcc_lo(self):
|
|
"""S_CBRANCH_VCCNZ should branch when VCC_LO!=0."""
|
|
instructions = [
|
|
# Set VCC_LO to non-zero
|
|
s_mov_b32(s[SrcEnum.VCC_LO - 128], 1), # VCC_LO = 1
|
|
s_mov_b32(s[SrcEnum.VCC_HI - 128], 0), # VCC_HI = 0
|
|
v_mov_b32_e32(v[0], 0),
|
|
s_cbranch_vccnz(2), # Skip next instruction if VCC != 0
|
|
v_mov_b32_e32(v[0], 1), # This should NOT execute
|
|
s_nop(0), # Jump target
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# v0 should be 0 because VCC_LO=1 means branch is taken
|
|
self.assertEqual(st.vgpr[0][0], 0, "Should branch when VCC_LO!=0")
|
|
|
|
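# A minimal model of the opsel handling the next class tests (equality case): opsel bit 0 selects
# the high half of src0, bit 1 the high half of src1, and the chosen 16-bit halves are compared.
# Illustrative reference, not the emulator's decoder.
def vop3_cmp_eq_u16(src0: int, src1: int, opsel: int) -> bool:
  a = (src0 >> 16) & 0xffff if opsel & 1 else src0 & 0xffff
  b = (src1 >> 16) & 0xffff if opsel & 2 else src1 & 0xffff
  return a == b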
|
|
class TestVOP3VOPC16Bit(unittest.TestCase):
|
|
"""Regression tests for VOP3-encoded VOPC 16-bit comparison instructions.
|
|
When VOPC comparisons are encoded in VOP3 format, they use opsel bits to select
|
|
which 16-bit half of each source to compare.
|
|
Bug: the emulator ignored opsel and fell back to the e32-style bit-7 (v128+) source encoding."""
|
|
|
|
def test_cmp_eq_u16_opsel_lo_lo(self):
|
|
"""V_CMP_EQ_U16 VOP3 with opsel=0 compares lo halves."""
|
|
# v0 = 0x12340005 (lo=5, hi=0x1234)
|
|
# v1 = 0x56780005 (lo=5, hi=0x5678)
|
|
# opsel=0: compare lo halves -> 5 == 5 -> true
|
|
instructions = [
|
|
s_mov_b32(s[2], 0x12340005),
|
|
v_mov_b32_e32(v[0], s[2]),
|
|
s_mov_b32(s[2], 0x56780005),
|
|
v_mov_b32_e32(v[1], s[2]),
|
|
VOP3(VOP3Op.V_CMP_EQ_U16, vdst=v[0], src0=v[0], src1=v[1], opsel=0), # dst=s0
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# s0 should have bit 0 set (comparison true for lane 0)
|
|
self.assertEqual(st.sgpr[0] & 1, 1, "lo==lo should be true: 5==5")
|
|
|
|
def test_cmp_eq_u16_opsel_hi_hi(self):
|
|
"""V_CMP_EQ_U16 VOP3 with opsel=3 compares hi halves."""
|
|
# v0 = 0x12340005 (lo=5, hi=0x1234)
|
|
# v1 = 0x56780005 (lo=5, hi=0x5678)
|
|
# opsel=3 (bits 0 and 1 set): compare hi halves -> 0x1234 != 0x5678 -> false
|
|
instructions = [
|
|
s_mov_b32(s[2], 0x12340005),
|
|
v_mov_b32_e32(v[0], s[2]),
|
|
s_mov_b32(s[2], 0x56780005),
|
|
v_mov_b32_e32(v[1], s[2]),
|
|
VOP3(VOP3Op.V_CMP_EQ_U16, vdst=v[0], src0=v[0], src1=v[1], opsel=3), # dst=s0, hi vs hi
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# s0 should have bit 0 clear (comparison false for lane 0)
|
|
self.assertEqual(st.sgpr[0] & 1, 0, "hi==hi should be false: 0x1234!=0x5678")
|
|
|
|
def test_cmp_eq_u16_opsel_hi_hi_equal(self):
|
|
"""V_CMP_EQ_U16 VOP3 with opsel=3 compares hi halves (equal case)."""
|
|
# v0 = 0x12340005 (lo=5, hi=0x1234)
|
|
# v1 = 0x12340009 (lo=9, hi=0x1234)
|
|
# opsel=3: compare hi halves -> 0x1234 == 0x1234 -> true
|
|
instructions = [
|
|
s_mov_b32(s[2], 0x12340005),
|
|
v_mov_b32_e32(v[0], s[2]),
|
|
s_mov_b32(s[2], 0x12340009),
|
|
v_mov_b32_e32(v[1], s[2]),
|
|
VOP3(VOP3Op.V_CMP_EQ_U16, vdst=v[0], src0=v[0], src1=v[1], opsel=3), # dst=s0, hi vs hi
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# s0 should have bit 0 set (comparison true for lane 0)
|
|
self.assertEqual(st.sgpr[0] & 1, 1, "hi==hi should be true: 0x1234==0x1234")
|
|
|
|
def test_cmp_gt_u16_opsel_hi(self):
|
|
"""V_CMP_GT_U16 VOP3 with opsel=3 compares hi halves."""
|
|
# v0 = 0x99990005 (lo=5, hi=0x9999)
|
|
# v1 = 0x12340005 (lo=5, hi=0x1234)
|
|
# opsel=3: compare hi halves -> 0x9999 > 0x1234 -> true
|
|
instructions = [
|
|
s_mov_b32(s[2], 0x99990005),
|
|
v_mov_b32_e32(v[0], s[2]),
|
|
s_mov_b32(s[2], 0x12340005),
|
|
v_mov_b32_e32(v[1], s[2]),
|
|
VOP3(VOP3Op.V_CMP_GT_U16, vdst=v[0], src0=v[0], src1=v[1], opsel=3), # dst=s0, hi vs hi
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# s0 should have bit 0 set (comparison true for lane 0)
|
|
self.assertEqual(st.sgpr[0] & 1, 1, "hi>hi should be true: 0x9999>0x1234")
|
|
|
|
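# The addressing rule the next class relies on, written out: both offsets of the 2ADDR forms are
# scaled by the element size (4 bytes for B32, 8 for B64) before being added to the byte address
# supplied in the VGPR.
def ds_2addr_addresses(base: int, offset0: int, offset1: int, element_bytes: int) -> tuple[int, int]:
  return base + offset0 * element_bytes, base + offset1 * element_bytes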
|
|
class TestDS2Addr(unittest.TestCase):
|
|
"""Regression tests for DS_LOAD_2ADDR and DS_STORE_2ADDR instructions.
|
|
These ops use offset scaling: offset * sizeof(data) for address calculation.
|
|
Bug: Emulator was using offset*4 for both B32 and B64, but B64 needs offset*8."""
|
|
|
|
def test_ds_store_load_2addr_b32(self):
|
|
"""DS_STORE_2ADDR_B32 and DS_LOAD_2ADDR_B32 with offset scaling by 4."""
|
|
# Store 0x12345678 at offset0=0 (*4=0) and 0xDEADBEEF at offset1=1 (*4=4)
|
|
# Then load them back
|
|
instructions = [
|
|
v_mov_b32_e32(v[10], 0), # addr base = 0
|
|
s_mov_b32(s[2], 0x12345678),
|
|
v_mov_b32_e32(v[0], s[2]), # data0
|
|
s_mov_b32(s[2], 0xDEADBEEF),
|
|
v_mov_b32_e32(v[1], s[2]), # data1
|
|
DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=0, offset1=1),
|
|
s_waitcnt(lgkmcnt=0),
|
|
DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[2], offset0=0, offset1=1),
|
|
s_waitcnt(lgkmcnt=0),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][2], 0x12345678, "v2 should have value from offset 0")
|
|
self.assertEqual(st.vgpr[0][3], 0xDEADBEEF, "v3 should have value from offset 4")
|
|
|
|
def test_ds_store_load_2addr_b32_nonzero_offsets(self):
|
|
"""DS_STORE_2ADDR_B32 with non-zero offsets (offset*4 scaling)."""
|
|
# Store at offset0=2 (*4=8) and offset1=5 (*4=20)
|
|
instructions = [
|
|
v_mov_b32_e32(v[10], 0), # addr base = 0
|
|
s_mov_b32(s[2], 0x11111111),
|
|
v_mov_b32_e32(v[0], s[2]),
|
|
s_mov_b32(s[2], 0x22222222),
|
|
v_mov_b32_e32(v[1], s[2]),
|
|
DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=2, offset1=5),
|
|
s_waitcnt(lgkmcnt=0),
|
|
DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[2], offset0=2, offset1=5),
|
|
s_waitcnt(lgkmcnt=0),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
self.assertEqual(st.vgpr[0][2], 0x11111111, "v2 should have value from offset 8 (2*4)")
|
|
self.assertEqual(st.vgpr[0][3], 0x22222222, "v3 should have value from offset 20 (5*4)")
|
|
|
|
def test_ds_store_load_2addr_b64(self):
|
|
"""DS_STORE_2ADDR_B64 and DS_LOAD_2ADDR_B64 with offset scaling by 8."""
|
|
# For B64: each value is 8 bytes (2 dwords), offsets scaled by 8
|
|
# Store 64-bit value at offset0=0 (*8=0) and another at offset1=1 (*8=8)
|
|
instructions = [
|
|
v_mov_b32_e32(v[10], 0), # addr base = 0
|
|
# First 64-bit value: 0x123456789ABCDEF0
|
|
s_mov_b32(s[2], 0x9ABCDEF0),
|
|
v_mov_b32_e32(v[0], s[2]), # low dword
|
|
s_mov_b32(s[2], 0x12345678),
|
|
v_mov_b32_e32(v[1], s[2]), # high dword
|
|
# Second 64-bit value: 0xDEADBEEFCAFEBABE
|
|
s_mov_b32(s[2], 0xCAFEBABE),
|
|
v_mov_b32_e32(v[2], s[2]), # low dword
|
|
s_mov_b32(s[2], 0xDEADBEEF),
|
|
v_mov_b32_e32(v[3], s[2]), # high dword
|
|
DS(DSOp.DS_STORE_2ADDR_B64, addr=v[10], data0=v[0], data1=v[2], vdst=v[0], offset0=0, offset1=1),
|
|
s_waitcnt(lgkmcnt=0),
|
|
DS(DSOp.DS_LOAD_2ADDR_B64, addr=v[10], vdst=v[4], offset0=0, offset1=1),
|
|
s_waitcnt(lgkmcnt=0),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# v4,v5 = first 64-bit value from offset 0
|
|
self.assertEqual(st.vgpr[0][4], 0x9ABCDEF0, "v4 should have low dword of first value")
|
|
self.assertEqual(st.vgpr[0][5], 0x12345678, "v5 should have high dword of first value")
|
|
# v6,v7 = second 64-bit value from offset 8 (1*8)
|
|
self.assertEqual(st.vgpr[0][6], 0xCAFEBABE, "v6 should have low dword of second value")
|
|
self.assertEqual(st.vgpr[0][7], 0xDEADBEEF, "v7 should have high dword of second value")
|
|
|
|
def test_ds_2addr_b64_no_overlap(self):
|
|
"""DS_LOAD_2ADDR_B64 with adjacent offsets should not overlap.
|
|
Regression test: offset1=1 should access bytes 8-15, not overlap with offset0=0 (bytes 0-7)."""
|
|
instructions = [
|
|
v_mov_b32_e32(v[10], 0),
|
|
# Store 4 distinct dwords at addresses 0,4,8,12 using regular DS_STORE
|
|
s_mov_b32(s[2], 0x11111111),
|
|
v_mov_b32_e32(v[0], s[2]),
|
|
ds_store_b32(addr=v[10], data0=v[0], offset0=0),
|
|
s_mov_b32(s[2], 0x22222222),
|
|
v_mov_b32_e32(v[0], s[2]),
|
|
ds_store_b32(addr=v[10], data0=v[0], offset0=4),
|
|
s_mov_b32(s[2], 0x33333333),
|
|
v_mov_b32_e32(v[0], s[2]),
|
|
ds_store_b32(addr=v[10], data0=v[0], offset0=8),
|
|
s_mov_b32(s[2], 0x44444444),
|
|
v_mov_b32_e32(v[0], s[2]),
|
|
ds_store_b32(addr=v[10], data0=v[0], offset0=12),
|
|
s_waitcnt(lgkmcnt=0),
|
|
# Load with DS_LOAD_2ADDR_B64: offset0=0 should get 0-7, offset1=1 should get 8-15
|
|
DS(DSOp.DS_LOAD_2ADDR_B64, addr=v[10], vdst=v[4], offset0=0, offset1=1),
|
|
s_waitcnt(lgkmcnt=0),
|
|
]
|
|
st = run_program(instructions, n_lanes=1)
|
|
# v4,v5 from addr 0-7: 0x11111111, 0x22222222
|
|
self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should be 0x11111111")
|
|
self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should be 0x22222222")
|
|
# v6,v7 from addr 8-15: 0x33333333, 0x44444444
|
|
self.assertEqual(st.vgpr[0][6], 0x33333333, "v6 should be 0x33333333")
|
|
self.assertEqual(st.vgpr[0][7], 0x44444444, "v7 should be 0x44444444")
|