diff --git a/CLAUDE.md b/CLAUDE.md index 17af1c2074..e0825c8329 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -79,7 +79,7 @@ VIZ=1 python -c "from tinygrad import Tensor; Tensor.ones(10).sum().realize()" ## Common Environment Variables -- `DEBUG=1-4` - Increasing verbosity +- `DEBUG=1-7` - Increasing verbosity (7 shows assembly output) - `VIZ=1` - Enable graph visualization - `SPEC=1` - Enable UOp spec verification - `NOOPT=1` - Disable optimizations @@ -100,6 +100,14 @@ VIZ=1 python -c "from tinygrad import Tensor; Tensor.ones(10).sum().realize()" - Run tests before proposing commits - Test with `SPEC=2` when modifying UOp-related code +## Auto-generated Files (DO NOT EDIT) + +The following files are auto-generated and should never be edited manually: +- `extra/assembly/rdna3/autogen/gen_pcode.py` - Generated by `python -m extra.assembly.rdna3.pcode` +- `extra/assembly/rdna3/autogen/__init__.py` - Generated from AMD ISA definitions + +To add missing instruction implementations, add them to `extra/assembly/rdna3/emu.py` instead. + ## Style Notes - 2-space indentation, 150 char line limit diff --git a/extra/assembly/rdna3/alu.py b/extra/assembly/rdna3/alu.py deleted file mode 100644 index b58b741c5f..0000000000 --- a/extra/assembly/rdna3/alu.py +++ /dev/null @@ -1,254 +0,0 @@ -# Pure combinational ALU functions for RDNA3 emulation -from __future__ import annotations -import struct, math -from typing import Callable -from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, VOP1Op, VOP2Op, VOP3Op - -# Format base offsets for unified opcode space -SOP2_BASE, SOP1_BASE, SOPC_BASE, SOPK_BASE = 0x000, 0x100, 0x200, 0x300 -VOP2_BASE, VOP1_BASE = 0x100, 0x180 - -# Float conversion helpers -_I, _f, _H, _e = struct.Struct('<I'), struct.Struct('<f'), struct.Struct('<H'), struct.Struct('<e') -def f32(i: int) -> float: return _f.unpack(_I.pack(i & 0xffffffff))[0] -def i32(f: float) -> int: - if math.isinf(f): return 0x7f800000 if f > 0 else 0xff800000 - try: return _I.unpack(_f.pack(f))[0] - except (OverflowError, struct.error): return 0x7f800000 if f > 0 else 0xff800000 -def f16(i: int) -> float: return _e.unpack(_H.pack(i & 0xffff))[0] -def i16(f: float) -> int: - if math.isinf(f): return 0x7c00 if f > 0 else 0xfc00 - try: return _H.unpack(_e.pack(f))[0] - except (OverflowError, struct.error): return 0x7c00 if f > 0 else 0xfc00 -def sext(v: int, b: int) -> int: return v - (1 << b) if v & (1 << (b-1)) else v -def clz(x: int) -> int: return 32 - x.bit_length() if x else 32 -def cls(x: int) -> int: x &= 0xffffffff; return 31 if x in (0, 0xffffffff) else clz(~x & 0xffffffff if x >> 31 else x) - 1 -def _cvt_i32_f32(v): return (0x7fffffff if v > 0 else 0x80000000) if math.isinf(v) else (0 if math.isnan(v) else max(-0x80000000, min(0x7fffffff, int(v))) & 0xffffffff) -def _cvt_u32_f32(v): return (0xffffffff if v > 0 else 0) if math.isinf(v) else (0 if math.isnan(v) or v < 0 else min(0xffffffff, int(v))) - -# SALU: op -> fn(s0, s1, scc_in) -> (result, scc_out) -SALU: dict[int, Callable] = { - # SOP2 - SOP2_BASE + SOP2Op.S_ADD_U32: lambda a, b, scc: ((a + b) & 0xffffffff, int((a + b) >= 0x100000000)), - SOP2_BASE + SOP2Op.S_SUB_U32: lambda a, b, scc: ((a - b) & 0xffffffff, int(b > a)), - SOP2_BASE + SOP2Op.S_ADDC_U32: lambda a, b, scc: ((r := a + b + scc) & 0xffffffff, int(r >= 0x100000000)), - SOP2_BASE + SOP2Op.S_SUBB_U32: lambda a, b, scc: ((a - b - scc) & 0xffffffff, int((b + scc) > a)), - SOP2_BASE + SOP2Op.S_ADD_I32: lambda a, b, scc: ((r := sext(a, 32) + sext(b, 32)) & 0xffffffff, int(((a >> 31) == (b >> 31)) and ((a >> 31) != ((r >> 31) & 1)))), - SOP2_BASE + SOP2Op.S_SUB_I32:
lambda a, b, scc: ((r := sext(a, 32) - sext(b, 32)) & 0xffffffff, int(((a >> 31) != (b >> 31)) and ((a >> 31) != ((r >> 31) & 1)))), - SOP2_BASE + SOP2Op.S_AND_B32: lambda a, b, scc: ((r := a & b), int(r != 0)), - SOP2_BASE + SOP2Op.S_OR_B32: lambda a, b, scc: ((r := a | b), int(r != 0)), - SOP2_BASE + SOP2Op.S_XOR_B32: lambda a, b, scc: ((r := a ^ b), int(r != 0)), - SOP2_BASE + SOP2Op.S_AND_NOT1_B32: lambda a, b, scc: ((r := a & (~b & 0xffffffff)), int(r != 0)), - SOP2_BASE + SOP2Op.S_OR_NOT1_B32: lambda a, b, scc: ((r := a | (~b & 0xffffffff)), int(r != 0)), - SOP2_BASE + SOP2Op.S_LSHL_B32: lambda a, b, scc: ((r := (a << (b & 0x1f)) & 0xffffffff), int(r != 0)), - SOP2_BASE + SOP2Op.S_LSHR_B32: lambda a, b, scc: ((r := a >> (b & 0x1f)), int(r != 0)), - SOP2_BASE + SOP2Op.S_ASHR_I32: lambda a, b, scc: ((r := sext(a, 32) >> (b & 0x1f)) & 0xffffffff, int(r != 0)), - SOP2_BASE + SOP2Op.S_MUL_I32: lambda a, b, scc: ((sext(a, 32) * sext(b, 32)) & 0xffffffff, scc), - SOP2_BASE + SOP2Op.S_MUL_HI_U32: lambda a, b, scc: (((a * b) >> 32) & 0xffffffff, scc), - SOP2_BASE + SOP2Op.S_MUL_HI_I32: lambda a, b, scc: (((sext(a, 32) * sext(b, 32)) >> 32) & 0xffffffff, scc), - SOP2_BASE + SOP2Op.S_MIN_I32: lambda a, b, scc: (a, 1) if sext(a, 32) < sext(b, 32) else (b, 0), - SOP2_BASE + SOP2Op.S_MIN_U32: lambda a, b, scc: (a, 1) if a < b else (b, 0), - SOP2_BASE + SOP2Op.S_MAX_I32: lambda a, b, scc: (a, 1) if sext(a, 32) > sext(b, 32) else (b, 0), - SOP2_BASE + SOP2Op.S_MAX_U32: lambda a, b, scc: (a, 1) if a > b else (b, 0), - SOP2_BASE + SOP2Op.S_CSELECT_B32: lambda a, b, scc: (a if scc else b, scc), - SOP2_BASE + SOP2Op.S_BFE_U32: lambda a, b, scc: ((r := ((a >> (b & 0x1f)) & ((1 << ((b >> 16) & 0x7f)) - 1)) if (b >> 16) & 0x7f else 0), int(r != 0)), - SOP2_BASE + SOP2Op.S_BFE_I32: lambda a, b, scc: ((r := sext((a >> (b & 0x1f)) & ((1 << w) - 1), w) & 0xffffffff if (w := (b >> 16) & 0x7f) else 0), int(r != 0)), - SOP2_BASE + SOP2Op.S_PACK_LL_B32_B16: lambda a, b, scc: ((a & 0xffff) | ((b & 0xffff) << 16), scc), - SOP2_BASE + SOP2Op.S_PACK_LH_B32_B16: lambda a, b, scc: ((a & 0xffff) | (b & 0xffff0000), scc), - SOP2_BASE + SOP2Op.S_PACK_HH_B32_B16: lambda a, b, scc: (((a >> 16) & 0xffff) | (b & 0xffff0000), scc), - SOP2_BASE + SOP2Op.S_PACK_HL_B32_B16: lambda a, b, scc: (((a >> 16) & 0xffff) | ((b & 0xffff) << 16), scc), - SOP2_BASE + SOP2Op.S_ADD_F32: lambda a, b, scc: (i32(f32(a) + f32(b)), scc), - SOP2_BASE + SOP2Op.S_SUB_F32: lambda a, b, scc: (i32(f32(a) - f32(b)), scc), - SOP2_BASE + SOP2Op.S_MUL_F32: lambda a, b, scc: (i32(f32(a) * f32(b)), scc), - # SOP1 - SOP1_BASE + SOP1Op.S_MOV_B32: lambda a, b, scc: (a, scc), - SOP1_BASE + SOP1Op.S_NOT_B32: lambda a, b, scc: ((r := (~a) & 0xffffffff), int(r != 0)), - SOP1_BASE + SOP1Op.S_BREV_B32: lambda a, b, scc: (int(f'{a & 0xffffffff:032b}'[::-1], 2), scc), - SOP1_BASE + SOP1Op.S_CLZ_I32_U32: lambda a, b, scc: (clz(a), scc), - SOP1_BASE + SOP1Op.S_CLS_I32: lambda a, b, scc: (cls(a), scc), - SOP1_BASE + SOP1Op.S_SEXT_I32_I8: lambda a, b, scc: (sext(a & 0xff, 8) & 0xffffffff, scc), - SOP1_BASE + SOP1Op.S_SEXT_I32_I16: lambda a, b, scc: (sext(a & 0xffff, 16) & 0xffffffff, scc), - SOP1_BASE + SOP1Op.S_ABS_I32: lambda a, b, scc: ((r := abs(sext(a, 32)) & 0xffffffff), int(r != 0)), - SOP1_BASE + SOP1Op.S_CVT_F32_I32: lambda a, b, scc: (i32(float(sext(a, 32))), scc), - SOP1_BASE + SOP1Op.S_CVT_F32_U32: lambda a, b, scc: (i32(float(a)), scc), - SOP1_BASE + SOP1Op.S_CVT_I32_F32: lambda a, b, scc: (_cvt_i32_f32(f32(a)), scc), - SOP1_BASE + SOP1Op.S_CVT_U32_F32: lambda 
a, b, scc: (_cvt_u32_f32(f32(a)), scc), - SOP1_BASE + SOP1Op.S_CEIL_F32: lambda a, b, scc: (i32(math.ceil(f32(a))), scc), - SOP1_BASE + SOP1Op.S_FLOOR_F32: lambda a, b, scc: (i32(math.floor(f32(a))), scc), - SOP1_BASE + SOP1Op.S_TRUNC_F32: lambda a, b, scc: (i32(math.trunc(f32(a))), scc), - SOP1_BASE + SOP1Op.S_RNDNE_F32: lambda a, b, scc: (i32(round(f32(a))), scc), - SOP1_BASE + SOP1Op.S_CVT_F16_F32: lambda a, b, scc: (i16(f32(a)), scc), - SOP1_BASE + SOP1Op.S_CVT_F32_F16: lambda a, b, scc: (i32(f16(a)), scc), - # SOPC - SOPC_BASE + SOPCOp.S_CMP_EQ_I32: lambda a, b, scc: (0, int(sext(a, 32) == sext(b, 32))), - SOPC_BASE + SOPCOp.S_CMP_LG_I32: lambda a, b, scc: (0, int(sext(a, 32) != sext(b, 32))), - SOPC_BASE + SOPCOp.S_CMP_GT_I32: lambda a, b, scc: (0, int(sext(a, 32) > sext(b, 32))), - SOPC_BASE + SOPCOp.S_CMP_GE_I32: lambda a, b, scc: (0, int(sext(a, 32) >= sext(b, 32))), - SOPC_BASE + SOPCOp.S_CMP_LT_I32: lambda a, b, scc: (0, int(sext(a, 32) < sext(b, 32))), - SOPC_BASE + SOPCOp.S_CMP_LE_I32: lambda a, b, scc: (0, int(sext(a, 32) <= sext(b, 32))), - SOPC_BASE + SOPCOp.S_CMP_EQ_U32: lambda a, b, scc: (0, int(a == b)), - SOPC_BASE + SOPCOp.S_CMP_LG_U32: lambda a, b, scc: (0, int(a != b)), - SOPC_BASE + SOPCOp.S_CMP_GT_U32: lambda a, b, scc: (0, int(a > b)), - SOPC_BASE + SOPCOp.S_CMP_GE_U32: lambda a, b, scc: (0, int(a >= b)), - SOPC_BASE + SOPCOp.S_CMP_LT_U32: lambda a, b, scc: (0, int(a < b)), - SOPC_BASE + SOPCOp.S_CMP_LE_U32: lambda a, b, scc: (0, int(a <= b)), - SOPC_BASE + SOPCOp.S_BITCMP0_B32: lambda a, b, scc: (0, int((a & (1 << (b & 0x1f))) == 0)), - SOPC_BASE + SOPCOp.S_BITCMP1_B32: lambda a, b, scc: (0, int((a & (1 << (b & 0x1f))) != 0)), - # SOPK - SOPK_BASE + SOPKOp.S_MOVK_I32: lambda a, b, scc: (sext(b, 16) & 0xffffffff, scc), - SOPK_BASE + SOPKOp.S_CMOVK_I32: lambda a, b, scc: ((sext(b, 16) & 0xffffffff) if scc else a, scc), - SOPK_BASE + SOPKOp.S_ADDK_I32: lambda a, b, scc: ((r := sext(a, 32) + sext(b, 16)) & 0xffffffff, int(((a >> 31) == ((b >> 15) & 1)) and ((a >> 31) != ((r >> 31) & 1)))), - SOPK_BASE + SOPKOp.S_MULK_I32: lambda a, b, scc: ((sext(a, 32) * sext(b, 16)) & 0xffffffff, scc), - SOPK_BASE + SOPKOp.S_CMPK_EQ_I32: lambda a, b, scc: (0, int(sext(a, 32) == sext(b, 16))), - SOPK_BASE + SOPKOp.S_CMPK_LG_I32: lambda a, b, scc: (0, int(sext(a, 32) != sext(b, 16))), - SOPK_BASE + SOPKOp.S_CMPK_GT_I32: lambda a, b, scc: (0, int(sext(a, 32) > sext(b, 16))), - SOPK_BASE + SOPKOp.S_CMPK_GE_I32: lambda a, b, scc: (0, int(sext(a, 32) >= sext(b, 16))), - SOPK_BASE + SOPKOp.S_CMPK_LT_I32: lambda a, b, scc: (0, int(sext(a, 32) < sext(b, 16))), - SOPK_BASE + SOPKOp.S_CMPK_LE_I32: lambda a, b, scc: (0, int(sext(a, 32) <= sext(b, 16))), - SOPK_BASE + SOPKOp.S_CMPK_EQ_U32: lambda a, b, scc: (0, int(a == (b & 0xffff))), - SOPK_BASE + SOPKOp.S_CMPK_LG_U32: lambda a, b, scc: (0, int(a != (b & 0xffff))), - SOPK_BASE + SOPKOp.S_CMPK_GT_U32: lambda a, b, scc: (0, int(a > (b & 0xffff))), - SOPK_BASE + SOPKOp.S_CMPK_GE_U32: lambda a, b, scc: (0, int(a >= (b & 0xffff))), - SOPK_BASE + SOPKOp.S_CMPK_LT_U32: lambda a, b, scc: (0, int(a < (b & 0xffff))), - SOPK_BASE + SOPKOp.S_CMPK_LE_U32: lambda a, b, scc: (0, int(a <= (b & 0xffff))), -} - -# VALU: op -> fn(s0, s1, s2) -> result -VALU: dict[int, Callable] = { - # VOP2 - VOP2_BASE + VOP2Op.V_ADD_F32: lambda a, b, c: i32(f32(a) + f32(b)), - VOP2_BASE + VOP2Op.V_SUB_F32: lambda a, b, c: i32(f32(a) - f32(b)), - VOP2_BASE + VOP2Op.V_SUBREV_F32: lambda a, b, c: i32(f32(b) - f32(a)), - VOP2_BASE + VOP2Op.V_MUL_F32: lambda a, b, c: i32(f32(a) * 
f32(b)), - VOP2_BASE + VOP2Op.V_MIN_F32: lambda a, b, c: i32(min(f32(a), f32(b))), - VOP2_BASE + VOP2Op.V_MAX_F32: lambda a, b, c: i32(max(f32(a), f32(b))), - VOP2_BASE + VOP2Op.V_ADD_NC_U32: lambda a, b, c: (a + b) & 0xffffffff, - VOP2_BASE + VOP2Op.V_SUB_NC_U32: lambda a, b, c: (a - b) & 0xffffffff, - VOP2_BASE + VOP2Op.V_SUBREV_NC_U32: lambda a, b, c: (b - a) & 0xffffffff, - VOP2_BASE + VOP2Op.V_AND_B32: lambda a, b, c: a & b, - VOP2_BASE + VOP2Op.V_OR_B32: lambda a, b, c: a | b, - VOP2_BASE + VOP2Op.V_XOR_B32: lambda a, b, c: a ^ b, - VOP2_BASE + VOP2Op.V_XNOR_B32: lambda a, b, c: (~(a ^ b)) & 0xffffffff, - VOP2_BASE + VOP2Op.V_LSHLREV_B32: lambda a, b, c: (b << (a & 0x1f)) & 0xffffffff, - VOP2_BASE + VOP2Op.V_LSHRREV_B32: lambda a, b, c: b >> (a & 0x1f), - VOP2_BASE + VOP2Op.V_ASHRREV_I32: lambda a, b, c: (sext(b, 32) >> (a & 0x1f)) & 0xffffffff, - VOP2_BASE + VOP2Op.V_MIN_I32: lambda a, b, c: a if sext(a, 32) < sext(b, 32) else b, - VOP2_BASE + VOP2Op.V_MAX_I32: lambda a, b, c: a if sext(a, 32) > sext(b, 32) else b, - VOP2_BASE + VOP2Op.V_MIN_U32: lambda a, b, c: min(a, b), - VOP2_BASE + VOP2Op.V_MAX_U32: lambda a, b, c: max(a, b), - VOP2_BASE + VOP2Op.V_MUL_I32_I24: lambda a, b, c: (sext(a & 0xffffff, 24) * sext(b & 0xffffff, 24)) & 0xffffffff, - VOP2_BASE + VOP2Op.V_MUL_HI_I32_I24: lambda a, b, c: ((sext(a & 0xffffff, 24) * sext(b & 0xffffff, 24)) >> 32) & 0xffffffff, - VOP2_BASE + VOP2Op.V_MUL_U32_U24: lambda a, b, c: ((a & 0xffffff) * (b & 0xffffff)) & 0xffffffff, - VOP2_BASE + VOP2Op.V_MUL_HI_U32_U24: lambda a, b, c: (((a & 0xffffff) * (b & 0xffffff)) >> 32) & 0xffffffff, - VOP2_BASE + VOP2Op.V_CVT_PK_RTZ_F16_F32: lambda a, b, c: i16(f32(a)) | (i16(f32(b)) << 16), - VOP2_BASE + VOP2Op.V_LDEXP_F16: lambda a, b, c: i16(math.ldexp(f16(a), sext(b, 32))), - VOP2_BASE + VOP2Op.V_ADD_F16: lambda a, b, c: i16(f16(a) + f16(b)), - VOP2_BASE + VOP2Op.V_SUB_F16: lambda a, b, c: i16(f16(a) - f16(b)), - VOP2_BASE + VOP2Op.V_MUL_F16: lambda a, b, c: i16(f16(a) * f16(b)), - VOP2_BASE + VOP2Op.V_MIN_F16: lambda a, b, c: i16(min(f16(a), f16(b))), - VOP2_BASE + VOP2Op.V_MAX_F16: lambda a, b, c: i16(max(f16(a), f16(b))), - # VOP1 - VOP1_BASE + VOP1Op.V_MOV_B32: lambda a, b, c: a, - VOP1_BASE + VOP1Op.V_NOT_B32: lambda a, b, c: (~a) & 0xffffffff, - VOP1_BASE + VOP1Op.V_BFREV_B32: lambda a, b, c: int(f'{a & 0xffffffff:032b}'[::-1], 2), - VOP1_BASE + VOP1Op.V_CLZ_I32_U32: lambda a, b, c: clz(a), - VOP1_BASE + VOP1Op.V_CLS_I32: lambda a, b, c: cls(a), - VOP1_BASE + VOP1Op.V_CVT_F32_I32: lambda a, b, c: i32(float(sext(a, 32))), - VOP1_BASE + VOP1Op.V_CVT_F32_U32: lambda a, b, c: i32(float(a)), - VOP1_BASE + VOP1Op.V_CVT_I32_F32: lambda a, b, c: _cvt_i32_f32(f32(a)), - VOP1_BASE + VOP1Op.V_CVT_U32_F32: lambda a, b, c: _cvt_u32_f32(f32(a)), - VOP1_BASE + VOP1Op.V_CVT_F16_F32: lambda a, b, c: i16(f32(a)), - VOP1_BASE + VOP1Op.V_CVT_F32_F16: lambda a, b, c: i32(f16(a)), - VOP1_BASE + VOP1Op.V_RCP_F32: lambda a, b, c: i32(1.0 / f32(a) if f32(a) != 0 else math.copysign(float('inf'), f32(a))), - VOP1_BASE + VOP1Op.V_RCP_IFLAG_F32: lambda a, b, c: i32(1.0 / f32(a) if f32(a) != 0 else math.copysign(float('inf'), f32(a))), - VOP1_BASE + VOP1Op.V_RSQ_F32: lambda a, b, c: i32(1.0 / math.sqrt(f32(a)) if f32(a) > 0 else (float('nan') if f32(a) < 0 else float('inf'))), - VOP1_BASE + VOP1Op.V_SQRT_F32: lambda a, b, c: i32(math.sqrt(f32(a)) if f32(a) >= 0 else float('nan')), - VOP1_BASE + VOP1Op.V_LOG_F32: lambda a, b, c: i32(math.log2(f32(a)) if f32(a) > 0 else (float('-inf') if f32(a) == 0 else float('nan'))), - 
VOP1_BASE + VOP1Op.V_EXP_F32: lambda a, b, c: i32(float('inf') if f32(a) > 128 else (0.0 if f32(a) < -150 else math.pow(2.0, f32(a)))), - VOP1_BASE + VOP1Op.V_SIN_F32: lambda a, b, c: i32(math.sin(f32(a) * 2 * math.pi)), - VOP1_BASE + VOP1Op.V_COS_F32: lambda a, b, c: i32(math.cos(f32(a) * 2 * math.pi)), - VOP1_BASE + VOP1Op.V_FLOOR_F32: lambda a, b, c: i32(math.floor(f32(a))), - VOP1_BASE + VOP1Op.V_CEIL_F32: lambda a, b, c: i32(math.ceil(f32(a))), - VOP1_BASE + VOP1Op.V_TRUNC_F32: lambda a, b, c: i32(math.trunc(f32(a))), - VOP1_BASE + VOP1Op.V_RNDNE_F32: lambda a, b, c: i32(round(f32(a))), - VOP1_BASE + VOP1Op.V_FRACT_F32: lambda a, b, c: i32((v := f32(a)) - math.floor(v)), - VOP1_BASE + VOP1Op.V_CVT_F32_UBYTE0: lambda a, b, c: i32(float(a & 0xff)), - VOP1_BASE + VOP1Op.V_CVT_F32_UBYTE1: lambda a, b, c: i32(float((a >> 8) & 0xff)), - VOP1_BASE + VOP1Op.V_CVT_F32_UBYTE2: lambda a, b, c: i32(float((a >> 16) & 0xff)), - VOP1_BASE + VOP1Op.V_CVT_F32_UBYTE3: lambda a, b, c: i32(float((a >> 24) & 0xff)), - VOP1_BASE + VOP1Op.V_FREXP_MANT_F32: lambda a, b, c: i32(math.frexp(v)[0] if (v := f32(a)) != 0 else 0.0), - VOP1_BASE + VOP1Op.V_FREXP_EXP_I32_F32: lambda a, b, c: (math.frexp(v)[1] if (v := f32(a)) != 0 else 0) & 0xffffffff, - # VOP3 - VOP3Op.V_FMA_F32: lambda a, b, c: i32(f32(a) * f32(b) + f32(c)), - VOP3Op.V_DIV_FMAS_F32: lambda a, b, c: i32(f32(a) * f32(b) + f32(c)), - VOP3Op.V_ADD3_U32: lambda a, b, c: (a + b + c) & 0xffffffff, - VOP3Op.V_LSHL_ADD_U32: lambda a, b, c: ((a << (b & 0x1f)) + c) & 0xffffffff, - VOP3Op.V_ADD_LSHL_U32: lambda a, b, c: ((a + b) << (c & 0x1f)) & 0xffffffff, - VOP3Op.V_XOR3_B32: lambda a, b, c: a ^ b ^ c, - VOP3Op.V_OR3_B32: lambda a, b, c: a | b | c, - VOP3Op.V_AND_OR_B32: lambda a, b, c: (a & b) | c, - VOP3Op.V_LSHL_OR_B32: lambda a, b, c: ((a << (b & 0x1f)) | c) & 0xffffffff, - VOP3Op.V_XAD_U32: lambda a, b, c: ((a ^ b) + c) & 0xffffffff, - VOP3Op.V_MAD_U32_U24: lambda a, b, c: ((a & 0xffffff) * (b & 0xffffff) + c) & 0xffffffff, - VOP3Op.V_MAD_I32_I24: lambda a, b, c: (sext(a & 0xffffff, 24) * sext(b & 0xffffff, 24) + sext(c, 32)) & 0xffffffff, - VOP3Op.V_BFE_U32: lambda a, b, c: (a >> (b & 0x1f)) & ((1 << (c & 0x1f)) - 1) if c & 0x1f else 0, - VOP3Op.V_BFE_I32: lambda a, b, c: sext((a >> (b & 0x1f)) & ((1 << w) - 1), w) & 0xffffffff if (w := c & 0x1f) else 0, - VOP3Op.V_ALIGNBIT_B32: lambda a, b, c: (((a << 32) | b) >> (c & 0x1f)) & 0xffffffff, - VOP3Op.V_MUL_LO_U32: lambda a, b, c: (a * b) & 0xffffffff, - VOP3Op.V_MUL_HI_U32: lambda a, b, c: ((a * b) >> 32) & 0xffffffff, - VOP3Op.V_MUL_HI_I32: lambda a, b, c: ((sext(a, 32) * sext(b, 32)) >> 32) & 0xffffffff, - VOP3Op.V_LDEXP_F32: lambda a, b, c: i32(math.ldexp(f32(a), sext(b, 32))), - VOP3Op.V_DIV_FIXUP_F32: lambda a, b, c: i32(math.copysign(float('inf'), f32(c)) if f32(b) == 0.0 else f32(c) / f32(b)), - VOP3Op.V_PACK_B32_F16: lambda a, b, c: (a & 0xffff) | ((b & 0xffff) << 16), - VOP3Op.V_CVT_PK_RTZ_F16_F32: lambda a, b, c: i16(f32(a)) | (i16(f32(b)) << 16), - VOP3Op.V_LSHLREV_B16: lambda a, b, c: ((b & 0xffff) << (a & 0xf)) & 0xffff, - VOP3Op.V_LSHRREV_B16: lambda a, b, c: (b & 0xffff) >> (a & 0xf), - VOP3Op.V_ASHRREV_I16: lambda a, b, c: (sext(b & 0xffff, 16) >> (a & 0xf)) & 0xffff, - VOP3Op.V_ADD_NC_U16: lambda a, b, c: ((a & 0xffff) + (b & 0xffff)) & 0xffff, - VOP3Op.V_SUB_NC_U16: lambda a, b, c: ((a & 0xffff) - (b & 0xffff)) & 0xffff, - VOP3Op.V_MUL_LO_U16: lambda a, b, c: ((a & 0xffff) * (b & 0xffff)) & 0xffff, - VOP3Op.V_MIN_U16: lambda a, b, c: min(a & 0xffff, b & 0xffff), - VOP3Op.V_MAX_U16: 
lambda a, b, c: max(a & 0xffff, b & 0xffff), - VOP3Op.V_MIN_I16: lambda a, b, c: (a & 0xffff) if sext(a & 0xffff, 16) < sext(b & 0xffff, 16) else (b & 0xffff), - VOP3Op.V_MAX_I16: lambda a, b, c: (a & 0xffff) if sext(a & 0xffff, 16) > sext(b & 0xffff, 16) else (b & 0xffff), - VOP3Op.V_MAD_U16: lambda a, b, c: ((a & 0xffff) * (b & 0xffff) + (c & 0xffff)) & 0xffff, - VOP3Op.V_MAD_I16: lambda a, b, c: (sext(a & 0xffff, 16) * sext(b & 0xffff, 16) + sext(c & 0xffff, 16)) & 0xffff, - VOP3Op.V_FMA_F16: lambda a, b, c: i16(f16(a) * f16(b) + f16(c)), - VOP3Op.V_MIN3_I32: lambda a, b, c: sorted([sext(a, 32), sext(b, 32), sext(c, 32)])[0] & 0xffffffff, - VOP3Op.V_MAX3_I32: lambda a, b, c: sorted([sext(a, 32), sext(b, 32), sext(c, 32)])[2] & 0xffffffff, - VOP3Op.V_MED3_I32: lambda a, b, c: sorted([sext(a, 32), sext(b, 32), sext(c, 32)])[1] & 0xffffffff, - VOP3Op.V_MIN3_F16: lambda a, b, c: i16(min(f16(a), f16(b), f16(c))), - VOP3Op.V_MAX3_F16: lambda a, b, c: i16(max(f16(a), f16(b), f16(c))), - VOP3Op.V_MED3_F16: lambda a, b, c: i16(sorted([f16(a), f16(b), f16(c)])[1]), - VOP3Op.V_MIN3_U16: lambda a, b, c: min(a & 0xffff, b & 0xffff, c & 0xffff), - VOP3Op.V_MAX3_U16: lambda a, b, c: max(a & 0xffff, b & 0xffff, c & 0xffff), - VOP3Op.V_MED3_U16: lambda a, b, c: sorted([a & 0xffff, b & 0xffff, c & 0xffff])[1], - VOP3Op.V_MIN3_I16: lambda a, b, c: sorted([sext(a & 0xffff, 16), sext(b & 0xffff, 16), sext(c & 0xffff, 16)])[0] & 0xffff, - VOP3Op.V_MAX3_I16: lambda a, b, c: sorted([sext(a & 0xffff, 16), sext(b & 0xffff, 16), sext(c & 0xffff, 16)])[2] & 0xffff, - VOP3Op.V_MED3_I16: lambda a, b, c: sorted([sext(a & 0xffff, 16), sext(b & 0xffff, 16), sext(c & 0xffff, 16)])[1] & 0xffff, -} - -def _cmp8(a, b): return [False, a < b, a == b, a <= b, a > b, a != b, a >= b, True] -def _cmp6(a, b): return [a < b, a == b, a <= b, a > b, a != b, a >= b] - -def vopc(op: int, s0: int, s1: int, s0_hi: int = 0, s1_hi: int = 0) -> int: - base = op & 0x7f - if 16 <= base <= 31: # F32 - f0, f1, cmp, nan = f32(s0), f32(s1), base - 16, math.isnan(f32(s0)) or math.isnan(f32(s1)) - return int([False, f0<f1, f0==f1, f0<=f1, f0>f1, f0!=f1, f0>=f1, not nan, nan, f0<f1 or nan, f0==f1 or nan, f0<=f1 or nan, f0>f1 or nan, f0!=f1 or nan, f0>=f1 or nan, True][cmp]) - if 49 <= base <= 54: return int(_cmp6(sext(s0 & 0xffff, 16), sext(s1 & 0xffff, 16))[base - 49]) # I16 - if 57 <= base <= 62: return int(_cmp6(s0 & 0xffff, s1 & 0xffff)[base - 57]) # U16 - if 64 <= base <= 79: # I32/U32 - cmp = (base - 64) % 8 - return int(_cmp8(sext(s0, 32), sext(s1, 32))[cmp] if base < 72 else _cmp8(s0, s1)[cmp]) - if 80 <= base <= 95: # I64/U64 - s0_64, s1_64 = s0 | (s0_hi << 32), s1 | (s1_hi << 32) - return int(_cmp8(sext(s0_64, 64), sext(s1_64, 64))[(base - 80) % 8] if base < 88 else _cmp8(s0_64, s1_64)[(base - 80) % 8]) - if base == 126: # CLASS_F32 - f, mask = f32(s0), s1 - if math.isnan(f): return int(bool(mask & 0x3)) - if math.isinf(f): return int(bool(mask & (0x4 if f < 0 else 0x200))) - if f == 0.0: return int(bool(mask & (0x20 if (s0 >> 31) & 1 else 0x40))) - exp, sign = (s0 >> 23) & 0xff, (s0 >> 31) & 1 - return int(bool(mask & ((0x10 if sign else 0x80) if exp == 0 else (0x8 if sign else 0x100)))) - raise NotImplementedError(f"VOPC op {op} (base {base})") diff --git a/extra/assembly/rdna3/autogen/__init__.py b/extra/assembly/rdna3/autogen/__init__.py index 88b7b81c27..c6c040ee52 100644 --- a/extra/assembly/rdna3/autogen/__init__.py +++ b/extra/assembly/rdna3/autogen/__init__.py @@ -2638,16 +2638,16 @@ v_add_nc_u32_e32 = functools.partial(VOP2, VOP2Op.V_ADD_NC_U32) v_sub_nc_u32_e32 = functools.partial(VOP2,
VOP2Op.V_SUB_NC_U32) v_subrev_nc_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUBREV_NC_U32) v_fmac_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAC_F32) -v_fmamk_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAMK_F32) -v_fmaak_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAAK_F32) +def v_fmamk_f32_e32(vdst, src0, K, vsrc1): return VOP2(VOP2Op.V_FMAMK_F32, vdst, src0, vsrc1, literal=K) +def v_fmaak_f32_e32(vdst, src0, vsrc1, K): return VOP2(VOP2Op.V_FMAAK_F32, vdst, src0, vsrc1, literal=K) v_cvt_pk_rtz_f16_f32_e32 = functools.partial(VOP2, VOP2Op.V_CVT_PK_RTZ_F16_F32) v_add_f16_e32 = functools.partial(VOP2, VOP2Op.V_ADD_F16) v_sub_f16_e32 = functools.partial(VOP2, VOP2Op.V_SUB_F16) v_subrev_f16_e32 = functools.partial(VOP2, VOP2Op.V_SUBREV_F16) v_mul_f16_e32 = functools.partial(VOP2, VOP2Op.V_MUL_F16) v_fmac_f16_e32 = functools.partial(VOP2, VOP2Op.V_FMAC_F16) -v_fmamk_f16_e32 = functools.partial(VOP2, VOP2Op.V_FMAMK_F16) -v_fmaak_f16_e32 = functools.partial(VOP2, VOP2Op.V_FMAAK_F16) +def v_fmamk_f16_e32(vdst, src0, K, vsrc1): return VOP2(VOP2Op.V_FMAMK_F16, vdst, src0, vsrc1, literal=K) +def v_fmaak_f16_e32(vdst, src0, vsrc1, K): return VOP2(VOP2Op.V_FMAAK_F16, vdst, src0, vsrc1, literal=K) v_max_f16_e32 = functools.partial(VOP2, VOP2Op.V_MAX_F16) v_min_f16_e32 = functools.partial(VOP2, VOP2Op.V_MIN_F16) v_ldexp_f16_e32 = functools.partial(VOP2, VOP2Op.V_LDEXP_F16) diff --git a/extra/assembly/rdna3/autogen/gen_pcode.py b/extra/assembly/rdna3/autogen/gen_pcode.py new file mode 100644 index 0000000000..b9d1bbb025 --- /dev/null +++ b/extra/assembly/rdna3/autogen/gen_pcode.py @@ -0,0 +1,16706 @@ +# autogenerated by pcode.py - do not edit +# to regenerate: python -m extra.assembly.rdna3.pcode +# ruff: noqa: E501,F405,F403 +from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp +from extra.assembly.rdna3.pcode import * + +def _SOP1Op_S_MOV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.b32 = S0.b32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.b32 = S0.b32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_MOV_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.b64 = S0.b64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.b64 = S0.b64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_CMOV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if SCC then + # 
D0.b32 = S0.b32 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if SCC: + D0.b32 = S0.b32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CMOV_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if SCC then + # D0.b64 = S0.b64 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if SCC: + D0.b64 = S0.b64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_BREV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32[31 : 0] = S0.u32[0 : 31] + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32[31 : 0] = S0.u32[0 : 31] + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_BREV_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[63 : 0] = S0.u64[0 : 63] + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[63 : 0] = S0.u64[0 : 63] + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_CTZ_I32_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = -1; + # // Set if no ones are found + # for i in 0 : 31 do + # // Search from LSB + # if S0.u32[i] == 1'1U then + # tmp = i; + # endif + # endfor; + # D0.i32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, 
SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(-1) + for i in range(0, int(31)+1): + if S0.u32[i] == 1: + tmp = Reg(i) + D0.i32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CTZ_I32_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = -1; + # // Set if no ones are found + # for i in 0 : 63 do + # // Search from LSB + # if S0.u64[i] == 1'1U then + # tmp = i; + # endif + # endfor; + # D0.i32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(-1) + for i in range(0, int(63)+1): + if S0.u64[i] == 1: + tmp = Reg(i) + D0.i32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CLZ_I32_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = -1; + # // Set if no ones are found + # for i in 0 : 31 do + # // Search from MSB + # if S0.u32[31 - i] == 1'1U then + # tmp = i; + # endif + # endfor; + # D0.i32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(-1) + for i in range(0, int(31)+1): + if S0.u32[31 - i] == 1: + tmp = Reg(i) + D0.i32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CLZ_I32_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = -1; + # // Set if no ones are found + # for i in 0 : 63 do + # // Search from MSB + # if S0.u64[63 - i] == 1'1U then + # tmp = i; + # endif + # endfor; + # D0.i32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(-1) + for i in range(0, int(63)+1): + if S0.u64[63 - i] == 1: + tmp = Reg(i) + D0.i32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CLS_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = -1; + # // Set if all bits are the same + # for i in 1 : 31 
do + # // Search from MSB + # if S0.u32[31 - i] != S0.u32[31] then + # tmp = i; + # endif + # endfor; + # D0.i32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(-1) + for i in range(1, int(31)+1): + if S0.u32[31 - i] != S0.u32[31]: + tmp = Reg(i) + D0.i32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CLS_I32_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = -1; + # // Set if all bits are the same + # for i in 1 : 63 do + # // Search from MSB + # if S0.u64[63 - i] != S0.u64[63] then + # tmp = i; + # endif + # endfor; + # D0.i32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(-1) + for i in range(1, int(63)+1): + if S0.u64[63 - i] != S0.u64[63]: + tmp = Reg(i) + D0.i32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_SEXT_I32_I8(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I(signext(S0.i8)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (signext(S0.i8)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_SEXT_I32_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I(signext(S0.i16)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (signext(S0.i16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_BITSET0_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32[S0.u32[4 : 0]] = 1'0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), 
Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32[S0.u32[4 : 0]] = 0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_BITSET0_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[S0.u32[5 : 0]] = 1'0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[S0.u32[5 : 0]] = 0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_BITSET1_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32[S0.u32[4 : 0]] = 1'1U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32[S0.u32[4 : 0]] = 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_BITSET1_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[S0.u32[5 : 0]] = 1'1U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[S0.u32[5 : 0]] = 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_BITREPLICATE_B64_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = S0.u32; + # for i in 0 : 31 do + # D0.u64[i * 2] = tmp[i]; + # D0.u64[i * 2 + 1] = tmp[i] + # endfor + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S0.u32) + for i in range(0, 
int(31)+1): + D0.u64[i * 2] = tmp[i] + D0.u64[i * 2 + 1] = tmp[i] + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_ABS_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = S0.i32 < 0 ? -S0.i32 : S0.i32; + # SCC = D0.i32 != 0 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = ((-S0.i32) if (S0.i32 < 0) else (S0.i32)) + SCC = Reg(D0.i32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_BCNT0_I32_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = 0; + # for i in 0 : 31 do + # tmp += S0.u32[i] == 1'0U ? 1 : 0 + # endfor; + # D0.i32 = tmp; + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(0) + for i in range(0, int(31)+1): + tmp += ((1) if (S0.u32[i] == 0) else (0)) + D0.i32 = tmp + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_BCNT0_I32_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = 0; + # for i in 0 : 63 do + # tmp += S0.u64[i] == 1'0U ? 1 : 0 + # endfor; + # D0.i32 = tmp; + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(0) + for i in range(0, int(63)+1): + tmp += ((1) if (S0.u64[i] == 0) else (0)) + D0.i32 = tmp + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_BCNT1_I32_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = 0; + # for i in 0 : 31 do + # tmp += S0.u32[i] == 1'1U ? 
1 : 0 + # endfor; + # D0.i32 = tmp; + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(0) + for i in range(0, int(31)+1): + tmp += ((1) if (S0.u32[i] == 1) else (0)) + D0.i32 = tmp + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_BCNT1_I32_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = 0; + # for i in 0 : 63 do + # tmp += S0.u64[i] == 1'1U ? 1 : 0 + # endfor; + # D0.i32 = tmp; + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(0) + for i in range(0, int(63)+1): + tmp += ((1) if (S0.u64[i] == 1) else (0)) + D0.i32 = tmp + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_NOT_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ~S0.u32; + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ~S0.u32 + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_NOT_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = ~S0.u64; + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = ~S0.u64 + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_AND_SAVEEXEC_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise AND on the scalar input and the EXEC mask, 
store the calculated result into the EXEC mask, + # set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar + # saveexec = EXEC.u32; + # EXEC.u32 = (S0.u32 & EXEC.u32); + # D0.u32 = saveexec.u32; + # SCC = EXEC.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u32) + EXEC.u32 = (S0.u32 & EXEC.u32) + D0.u32 = saveexec.u32 + SCC = Reg(EXEC.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_AND_SAVEEXEC_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise AND on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, + # set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar + # saveexec = EXEC.u64; + # EXEC.u64 = (S0.u64 & EXEC.u64); + # D0.u64 = saveexec.u64; + # SCC = EXEC.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u64) + EXEC.u64 = (S0.u64 & EXEC.u64) + D0.u64 = saveexec.u64 + SCC = Reg(EXEC.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_OR_SAVEEXEC_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise OR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, set + # SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar destination + # saveexec = EXEC.u32; + # EXEC.u32 = (S0.u32 | EXEC.u32); + # D0.u32 = saveexec.u32; + # SCC = EXEC.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u32) + EXEC.u32 = (S0.u32 | EXEC.u32) + D0.u32 = saveexec.u32 + SCC = Reg(EXEC.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_OR_SAVEEXEC_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise OR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, set + 
# SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar destination + # saveexec = EXEC.u64; + # EXEC.u64 = (S0.u64 | EXEC.u64); + # D0.u64 = saveexec.u64; + # SCC = EXEC.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u64) + EXEC.u64 = (S0.u64 | EXEC.u64) + D0.u64 = saveexec.u64 + SCC = Reg(EXEC.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_XOR_SAVEEXEC_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise XOR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, + # set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar + # saveexec = EXEC.u32; + # EXEC.u32 = (S0.u32 ^ EXEC.u32); + # D0.u32 = saveexec.u32; + # SCC = EXEC.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u32) + EXEC.u32 = (S0.u32 ^ EXEC.u32) + D0.u32 = saveexec.u32 + SCC = Reg(EXEC.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_XOR_SAVEEXEC_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise XOR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, + # set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar + # saveexec = EXEC.u64; + # EXEC.u64 = (S0.u64 ^ EXEC.u64); + # D0.u64 = saveexec.u64; + # SCC = EXEC.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u64) + EXEC.u64 = (S0.u64 ^ EXEC.u64) + D0.u64 = saveexec.u64 + SCC = Reg(EXEC.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_NAND_SAVEEXEC_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise NAND on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, + # set SCC iff the 
calculated result is nonzero and store the original value of the EXEC mask into the scalar + # saveexec = EXEC.u32; + # EXEC.u32 = ~(S0.u32 & EXEC.u32); + # D0.u32 = saveexec.u32; + # SCC = EXEC.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u32) + EXEC.u32 = ~(S0.u32 & EXEC.u32) + D0.u32 = saveexec.u32 + SCC = Reg(EXEC.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_NAND_SAVEEXEC_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise NAND on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, + # set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar + # saveexec = EXEC.u64; + # EXEC.u64 = ~(S0.u64 & EXEC.u64); + # D0.u64 = saveexec.u64; + # SCC = EXEC.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u64) + EXEC.u64 = ~(S0.u64 & EXEC.u64) + D0.u64 = saveexec.u64 + SCC = Reg(EXEC.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_NOR_SAVEEXEC_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise NOR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, + # set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar + # saveexec = EXEC.u32; + # EXEC.u32 = ~(S0.u32 | EXEC.u32); + # D0.u32 = saveexec.u32; + # SCC = EXEC.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u32) + EXEC.u32 = ~(S0.u32 | EXEC.u32) + D0.u32 = saveexec.u32 + SCC = Reg(EXEC.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_NOR_SAVEEXEC_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise NOR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, + # set SCC iff the calculated result is nonzero and store the original 
value of the EXEC mask into the scalar + # saveexec = EXEC.u64; + # EXEC.u64 = ~(S0.u64 | EXEC.u64); + # D0.u64 = saveexec.u64; + # SCC = EXEC.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u64) + EXEC.u64 = ~(S0.u64 | EXEC.u64) + D0.u64 = saveexec.u64 + SCC = Reg(EXEC.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_XNOR_SAVEEXEC_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise XNOR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, + # set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar + # saveexec = EXEC.u32; + # EXEC.u32 = ~(S0.u32 ^ EXEC.u32); + # D0.u32 = saveexec.u32; + # SCC = EXEC.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u32) + EXEC.u32 = ~(S0.u32 ^ EXEC.u32) + D0.u32 = saveexec.u32 + SCC = Reg(EXEC.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_XNOR_SAVEEXEC_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise XNOR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, + # set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar + # saveexec = EXEC.u64; + # EXEC.u64 = ~(S0.u64 ^ EXEC.u64); + # D0.u64 = saveexec.u64; + # SCC = EXEC.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u64) + EXEC.u64 = ~(S0.u64 ^ EXEC.u64) + D0.u64 = saveexec.u64 + SCC = Reg(EXEC.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_AND_NOT0_SAVEEXEC_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise AND on the EXEC mask and the negation of the scalar input, store the calculated result into + # the EXEC mask, set SCC iff the calculated result is nonzero and store the original 
value of the EXEC mask into + # saveexec = EXEC.u32; + # EXEC.u32 = (~S0.u32 & EXEC.u32); + # D0.u32 = saveexec.u32; + # SCC = EXEC.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u32) + EXEC.u32 = (~S0.u32 & EXEC.u32) + D0.u32 = saveexec.u32 + SCC = Reg(EXEC.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_AND_NOT0_SAVEEXEC_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise AND on the EXEC mask and the negation of the scalar input, store the calculated result into + # the EXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into + # saveexec = EXEC.u64; + # EXEC.u64 = (~S0.u64 & EXEC.u64); + # D0.u64 = saveexec.u64; + # SCC = EXEC.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u64) + EXEC.u64 = (~S0.u64 & EXEC.u64) + D0.u64 = saveexec.u64 + SCC = Reg(EXEC.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_OR_NOT0_SAVEEXEC_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise OR on the EXEC mask and the negation of the scalar input, store the calculated result into the + # EXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the + # saveexec = EXEC.u32; + # EXEC.u32 = (~S0.u32 | EXEC.u32); + # D0.u32 = saveexec.u32; + # SCC = EXEC.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u32) + EXEC.u32 = (~S0.u32 | EXEC.u32) + D0.u32 = saveexec.u32 + SCC = Reg(EXEC.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_OR_NOT0_SAVEEXEC_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise OR on the EXEC mask and the negation of the scalar input, store the calculated result into the + # EXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask 
into the + # saveexec = EXEC.u64; + # EXEC.u64 = (~S0.u64 | EXEC.u64); + # D0.u64 = saveexec.u64; + # SCC = EXEC.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u64) + EXEC.u64 = (~S0.u64 | EXEC.u64) + D0.u64 = saveexec.u64 + SCC = Reg(EXEC.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_AND_NOT1_SAVEEXEC_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise AND on the scalar input and the negation of the EXEC mask, store the calculated result into + # the EXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into + # saveexec = EXEC.u32; + # EXEC.u32 = (S0.u32 & ~EXEC.u32); + # D0.u32 = saveexec.u32; + # SCC = EXEC.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u32) + EXEC.u32 = (S0.u32 & ~EXEC.u32) + D0.u32 = saveexec.u32 + SCC = Reg(EXEC.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_AND_NOT1_SAVEEXEC_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise AND on the scalar input and the negation of the EXEC mask, store the calculated result into + # the EXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into + # saveexec = EXEC.u64; + # EXEC.u64 = (S0.u64 & ~EXEC.u64); + # D0.u64 = saveexec.u64; + # SCC = EXEC.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u64) + EXEC.u64 = (S0.u64 & ~EXEC.u64) + D0.u64 = saveexec.u64 + SCC = Reg(EXEC.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_OR_NOT1_SAVEEXEC_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise OR on the scalar input and the negation of the EXEC mask, store the calculated result into the + # EXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC 
mask into the + # saveexec = EXEC.u32; + # EXEC.u32 = (S0.u32 | ~EXEC.u32); + # D0.u32 = saveexec.u32; + # SCC = EXEC.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u32) + EXEC.u32 = (S0.u32 | ~EXEC.u32) + D0.u32 = saveexec.u32 + SCC = Reg(EXEC.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_OR_NOT1_SAVEEXEC_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise OR on the scalar input and the negation of the EXEC mask, store the calculated result into the + # EXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the + # saveexec = EXEC.u64; + # EXEC.u64 = (S0.u64 | ~EXEC.u64); + # D0.u64 = saveexec.u64; + # SCC = EXEC.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + saveexec = Reg(EXEC.u64) + EXEC.u64 = (S0.u64 | ~EXEC.u64) + D0.u64 = saveexec.u64 + SCC = Reg(EXEC.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_AND_NOT0_WREXEC_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise AND on the EXEC mask and the negation of the scalar input, store the calculated result into + # Unlike the SAVEEXEC series of opcodes, the value written to destination SGPRs is the result of the bitwise-op + # result. EXEC and the destination SGPRs have the same value at the end of this instruction. 
This instruction is + # EXEC.u32 = (~S0.u32 & EXEC.u32); + # D0.u32 = EXEC.u32; + # SCC = EXEC.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u32 = (~S0.u32 & EXEC.u32) + D0.u32 = EXEC.u32 + SCC = Reg(EXEC.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_AND_NOT0_WREXEC_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise AND on the EXEC mask and the negation of the scalar input, store the calculated result into + # Unlike the SAVEEXEC series of opcodes, the value written to destination SGPRs is the result of the bitwise-op + # result. EXEC and the destination SGPRs have the same value at the end of this instruction. This instruction is + # EXEC.u64 = (~S0.u64 & EXEC.u64); + # D0.u64 = EXEC.u64; + # SCC = EXEC.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64 = (~S0.u64 & EXEC.u64) + D0.u64 = EXEC.u64 + SCC = Reg(EXEC.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_AND_NOT1_WREXEC_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise AND on the scalar input and the negation of the EXEC mask, store the calculated result into + # Unlike the SAVEEXEC series of opcodes, the value written to destination SGPRs is the result of the bitwise-op + # result. EXEC and the destination SGPRs have the same value at the end of this instruction. 
This instruction is + # EXEC.u32 = (S0.u32 & ~EXEC.u32); + # D0.u32 = EXEC.u32; + # SCC = EXEC.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u32 = (S0.u32 & ~EXEC.u32) + D0.u32 = EXEC.u32 + SCC = Reg(EXEC.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_AND_NOT1_WREXEC_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Calculate bitwise AND on the scalar input and the negation of the EXEC mask, store the calculated result into + # Unlike the SAVEEXEC series of opcodes, the value written to destination SGPRs is the result of the bitwise-op + # result. EXEC and the destination SGPRs have the same value at the end of this instruction. This instruction is + # EXEC.u64 = (S0.u64 & ~EXEC.u64); + # D0.u64 = EXEC.u64; + # SCC = EXEC.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64 = (S0.u64 & ~EXEC.u64) + D0.u64 = EXEC.u64 + SCC = Reg(EXEC.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP1Op_S_SENDMSG_RTN_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # If SDST is VCC then VCCZ is undefined. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_SENDMSG_RTN_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # If SDST is VCC then VCCZ is undefined. 
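+ # Note: as in S_SENDMSG_RTN_B32 above, the compiled pseudocode section below is empty. The value returned by a SENDMSG_RTN + # comes from the message unit, not from the ALU inputs, so this helper leaves D0 at the passed-in d0 and passes SCC and + # EXEC through unchanged.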
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CEIL_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = trunc(S0.f32); + # if ((S0.f32 > 0.0F) && (S0.f32 != D0.f32)) then + # D0.f32 += 1.0F + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = trunc(S0.f32) + if ((S0.f32 > 0.0) and (S0.f32 != D0.f32)): + D0.f32 += 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_FLOOR_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = trunc(S0.f32); + # if ((S0.f32 < 0.0F) && (S0.f32 != D0.f32)) then + # D0.f32 += -1.0F + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = trunc(S0.f32) + if ((S0.f32 < 0.0) and (S0.f32 != D0.f32)): + D0.f32 += -1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_TRUNC_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = trunc(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = trunc(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_RNDNE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = floor(S0.f32 + 0.5F); + # if (isEven(64'F(floor(S0.f32))) && (fract(S0.f32) == 0.5F)) then + # D0.f32 -= 1.0F + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), 
SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = floor(S0.f32 + 0.5) + if (isEven(F(floor(S0.f32))) and (fract(S0.f32) == 0.5)): + D0.f32 -= 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CVT_F32_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = i32_to_f32(S0.i32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = i32_to_f32(S0.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CVT_F32_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = u32_to_f32(S0.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = u32_to_f32(S0.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CVT_I32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = f32_to_i32(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = f32_to_i32(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CVT_U32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = f32_to_u32(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = f32_to_u32(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = 
EXEC._val + return result + +def _SOP1Op_S_CVT_F16_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = f32_to_f16(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = f32_to_f16(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CVT_F32_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = f16_to_f32(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = f16_to_f32(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CVT_HI_F32_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = f16_to_f32(S0[31 : 16].f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = f16_to_f32(S0[31 : 16].f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_CEIL_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = trunc(S0.f16); + # if ((S0.f16 > 16'0.0) && (S0.f16 != D0.f16)) then + # D0.f16 += 16'1.0 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = trunc(S0.f16) + if ((S0.f16 > 0.0) and (S0.f16 != D0.f16)): + D0.f16 += 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_FLOOR_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = trunc(S0.f16); + # if ((S0.f16 < 16'0.0) && (S0.f16 != D0.f16)) then + # D0.f16 += -16'1.0 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = 
Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = trunc(S0.f16) + if ((S0.f16 < 0.0) and (S0.f16 != D0.f16)): + D0.f16 += -1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_TRUNC_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = trunc(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = trunc(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP1Op_S_RNDNE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = floor(S0.f16 + 16'0.5); + # if (isEven(64'F(floor(S0.f16))) && (fract(S0.f16) == 16'0.5)) then + # D0.f16 -= 16'1.0 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = floor(S0.f16 + 0.5) + if (isEven(F(floor(S0.f16))) and (fract(S0.f16) == 0.5)): + D0.f16 -= 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +SOP1Op_FUNCTIONS = { + SOP1Op.S_MOV_B32: _SOP1Op_S_MOV_B32, + SOP1Op.S_MOV_B64: _SOP1Op_S_MOV_B64, + SOP1Op.S_CMOV_B32: _SOP1Op_S_CMOV_B32, + SOP1Op.S_CMOV_B64: _SOP1Op_S_CMOV_B64, + SOP1Op.S_BREV_B32: _SOP1Op_S_BREV_B32, + SOP1Op.S_BREV_B64: _SOP1Op_S_BREV_B64, + SOP1Op.S_CTZ_I32_B32: _SOP1Op_S_CTZ_I32_B32, + SOP1Op.S_CTZ_I32_B64: _SOP1Op_S_CTZ_I32_B64, + SOP1Op.S_CLZ_I32_U32: _SOP1Op_S_CLZ_I32_U32, + SOP1Op.S_CLZ_I32_U64: _SOP1Op_S_CLZ_I32_U64, + SOP1Op.S_CLS_I32: _SOP1Op_S_CLS_I32, + SOP1Op.S_CLS_I32_I64: _SOP1Op_S_CLS_I32_I64, + SOP1Op.S_SEXT_I32_I8: _SOP1Op_S_SEXT_I32_I8, + SOP1Op.S_SEXT_I32_I16: _SOP1Op_S_SEXT_I32_I16, + SOP1Op.S_BITSET0_B32: _SOP1Op_S_BITSET0_B32, + SOP1Op.S_BITSET0_B64: _SOP1Op_S_BITSET0_B64, + SOP1Op.S_BITSET1_B32: _SOP1Op_S_BITSET1_B32, + SOP1Op.S_BITSET1_B64: _SOP1Op_S_BITSET1_B64, + SOP1Op.S_BITREPLICATE_B64_B32: _SOP1Op_S_BITREPLICATE_B64_B32, + SOP1Op.S_ABS_I32: _SOP1Op_S_ABS_I32, + SOP1Op.S_BCNT0_I32_B32: _SOP1Op_S_BCNT0_I32_B32, + SOP1Op.S_BCNT0_I32_B64: _SOP1Op_S_BCNT0_I32_B64, + SOP1Op.S_BCNT1_I32_B32: _SOP1Op_S_BCNT1_I32_B32, + SOP1Op.S_BCNT1_I32_B64: _SOP1Op_S_BCNT1_I32_B64, + SOP1Op.S_NOT_B32: _SOP1Op_S_NOT_B32, + SOP1Op.S_NOT_B64: _SOP1Op_S_NOT_B64, + SOP1Op.S_AND_SAVEEXEC_B32: _SOP1Op_S_AND_SAVEEXEC_B32, + 
SOP1Op.S_AND_SAVEEXEC_B64: _SOP1Op_S_AND_SAVEEXEC_B64, + SOP1Op.S_OR_SAVEEXEC_B32: _SOP1Op_S_OR_SAVEEXEC_B32, + SOP1Op.S_OR_SAVEEXEC_B64: _SOP1Op_S_OR_SAVEEXEC_B64, + SOP1Op.S_XOR_SAVEEXEC_B32: _SOP1Op_S_XOR_SAVEEXEC_B32, + SOP1Op.S_XOR_SAVEEXEC_B64: _SOP1Op_S_XOR_SAVEEXEC_B64, + SOP1Op.S_NAND_SAVEEXEC_B32: _SOP1Op_S_NAND_SAVEEXEC_B32, + SOP1Op.S_NAND_SAVEEXEC_B64: _SOP1Op_S_NAND_SAVEEXEC_B64, + SOP1Op.S_NOR_SAVEEXEC_B32: _SOP1Op_S_NOR_SAVEEXEC_B32, + SOP1Op.S_NOR_SAVEEXEC_B64: _SOP1Op_S_NOR_SAVEEXEC_B64, + SOP1Op.S_XNOR_SAVEEXEC_B32: _SOP1Op_S_XNOR_SAVEEXEC_B32, + SOP1Op.S_XNOR_SAVEEXEC_B64: _SOP1Op_S_XNOR_SAVEEXEC_B64, + SOP1Op.S_AND_NOT0_SAVEEXEC_B32: _SOP1Op_S_AND_NOT0_SAVEEXEC_B32, + SOP1Op.S_AND_NOT0_SAVEEXEC_B64: _SOP1Op_S_AND_NOT0_SAVEEXEC_B64, + SOP1Op.S_OR_NOT0_SAVEEXEC_B32: _SOP1Op_S_OR_NOT0_SAVEEXEC_B32, + SOP1Op.S_OR_NOT0_SAVEEXEC_B64: _SOP1Op_S_OR_NOT0_SAVEEXEC_B64, + SOP1Op.S_AND_NOT1_SAVEEXEC_B32: _SOP1Op_S_AND_NOT1_SAVEEXEC_B32, + SOP1Op.S_AND_NOT1_SAVEEXEC_B64: _SOP1Op_S_AND_NOT1_SAVEEXEC_B64, + SOP1Op.S_OR_NOT1_SAVEEXEC_B32: _SOP1Op_S_OR_NOT1_SAVEEXEC_B32, + SOP1Op.S_OR_NOT1_SAVEEXEC_B64: _SOP1Op_S_OR_NOT1_SAVEEXEC_B64, + SOP1Op.S_AND_NOT0_WREXEC_B32: _SOP1Op_S_AND_NOT0_WREXEC_B32, + SOP1Op.S_AND_NOT0_WREXEC_B64: _SOP1Op_S_AND_NOT0_WREXEC_B64, + SOP1Op.S_AND_NOT1_WREXEC_B32: _SOP1Op_S_AND_NOT1_WREXEC_B32, + SOP1Op.S_AND_NOT1_WREXEC_B64: _SOP1Op_S_AND_NOT1_WREXEC_B64, + SOP1Op.S_SENDMSG_RTN_B32: _SOP1Op_S_SENDMSG_RTN_B32, + SOP1Op.S_SENDMSG_RTN_B64: _SOP1Op_S_SENDMSG_RTN_B64, + SOP1Op.S_CEIL_F32: _SOP1Op_S_CEIL_F32, + SOP1Op.S_FLOOR_F32: _SOP1Op_S_FLOOR_F32, + SOP1Op.S_TRUNC_F32: _SOP1Op_S_TRUNC_F32, + SOP1Op.S_RNDNE_F32: _SOP1Op_S_RNDNE_F32, + SOP1Op.S_CVT_F32_I32: _SOP1Op_S_CVT_F32_I32, + SOP1Op.S_CVT_F32_U32: _SOP1Op_S_CVT_F32_U32, + SOP1Op.S_CVT_I32_F32: _SOP1Op_S_CVT_I32_F32, + SOP1Op.S_CVT_U32_F32: _SOP1Op_S_CVT_U32_F32, + SOP1Op.S_CVT_F16_F32: _SOP1Op_S_CVT_F16_F32, + SOP1Op.S_CVT_F32_F16: _SOP1Op_S_CVT_F32_F16, + SOP1Op.S_CVT_HI_F32_F16: _SOP1Op_S_CVT_HI_F32_F16, + SOP1Op.S_CEIL_F16: _SOP1Op_S_CEIL_F16, + SOP1Op.S_FLOOR_F16: _SOP1Op_S_FLOOR_F16, + SOP1Op.S_TRUNC_F16: _SOP1Op_S_TRUNC_F16, + SOP1Op.S_RNDNE_F16: _SOP1Op_S_RNDNE_F16, +} + +def _SOP2Op_S_ADD_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = 64'U(S0.u32) + 64'U(S1.u32); + # SCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U; + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg((S0.u32) + (S1.u32)) + SCC = Reg(((1) if (tmp >= 0x100000000) else (0))) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_SUB_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = S0.u32 - S1.u32; + # SCC = S1.u32 > S0.u32 ? 
1'1U : 1'0U; + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S0.u32 - S1.u32) + SCC = Reg(((1) if (S1.u32 > S0.u32) else (0))) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_ADD_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = S0.i32 + S1.i32; + # SCC = ((S0.u32[31] == S1.u32[31]) && (S0.u32[31] != tmp.u32[31])); + # D0.i32 = tmp.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S0.i32 + S1.i32) + SCC = Reg(((S0.u32[31] == S1.u32[31]) and (S0.u32[31] != tmp.u32[31]))) + D0.i32 = tmp.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_SUB_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = S0.i32 - S1.i32; + # SCC = ((S0.u32[31] != S1.u32[31]) && (S0.u32[31] != tmp.u32[31])); + # D0.i32 = tmp.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S0.i32 - S1.i32) + SCC = Reg(((S0.u32[31] != S1.u32[31]) and (S0.u32[31] != tmp.u32[31]))) + D0.i32 = tmp.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_ADDC_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = 64'U(S0.u32) + 64'U(S1.u32) + SCC.u64; + # SCC = tmp >= 0x100000000ULL ? 
1'1U : 1'0U; + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg((S0.u32) + (S1.u32) + SCC.u64) + SCC = Reg(((1) if (tmp >= 0x100000000) else (0))) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_SUBB_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = S0.u32 - S1.u32 - SCC.u32; + # SCC = 64'U(S1.u32) + SCC.u64 > 64'U(S0.u32) ? 1'1U : 1'0U; + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S0.u32 - S1.u32 - SCC.u32) + SCC = Reg(((1) if ((S1.u32) + SCC.u64 > (S0.u32)) else (0))) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_ABSDIFF_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = S0.i32 - S1.i32; + # if D0.i32 < 0 then + # D0.i32 = -D0.i32 + # endif; + # SCC = D0.i32 != 0 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = S0.i32 - S1.i32 + if D0.i32 < 0: + D0.i32 = -D0.i32 + SCC = Reg(D0.i32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_LSHL_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 << S1[4 : 0].u32); + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 << S1[4 : 0].u32) + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_LSHL_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = (S0.u64 << S1[5 : 0].u32); + # 
SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = (S0.u64 << S1[5 : 0].u32) + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_LSHR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 >> S1[4 : 0].u32); + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 >> S1[4 : 0].u32) + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_LSHR_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = (S0.u64 >> S1[5 : 0].u32); + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = (S0.u64 >> S1[5 : 0].u32) + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_ASHR_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I(signext(S0.i32) >> S1[4 : 0].u32); + # SCC = D0.i32 != 0 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (signext(S0.i32) >> S1[4 : 0].u32) + SCC = Reg(D0.i32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_ASHR_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i64 = (signext(S0.i64) >> S1[5 : 0].u32); + # SCC = D0.i64 != 0LL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = 
SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i64 = (signext(S0.i64) >> S1[5 : 0].u32) + SCC = Reg(D0.i64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_LSHL1_ADD_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = (64'U(S0.u32) << 1U) + 64'U(S1.u32); + # SCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U; + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(((S0.u32) << 1) + (S1.u32)) + SCC = Reg(((1) if (tmp >= 0x100000000) else (0))) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_LSHL2_ADD_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = (64'U(S0.u32) << 2U) + 64'U(S1.u32); + # SCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U; + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(((S0.u32) << 2) + (S1.u32)) + SCC = Reg(((1) if (tmp >= 0x100000000) else (0))) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_LSHL3_ADD_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = (64'U(S0.u32) << 3U) + 64'U(S1.u32); + # SCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U; + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(((S0.u32) << 3) + (S1.u32)) + SCC = Reg(((1) if (tmp >= 0x100000000) else (0))) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_LSHL4_ADD_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = (64'U(S0.u32) << 4U) + 64'U(S1.u32); + # SCC = tmp >= 0x100000000ULL ? 
1'1U : 1'0U; + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(((S0.u32) << 4) + (S1.u32)) + SCC = Reg(((1) if (tmp >= 0x100000000) else (0))) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_MIN_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.i32 < S1.i32; + # D0.i32 = SCC ? S0.i32 : S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.i32 < S1.i32) + D0.i32 = ((S0.i32) if (SCC) else (S1.i32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_MIN_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u32 < S1.u32; + # D0.u32 = SCC ? S0.u32 : S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u32 < S1.u32) + D0.u32 = ((S0.u32) if (SCC) else (S1.u32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_MAX_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.i32 >= S1.i32; + # D0.i32 = SCC ? S0.i32 : S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.i32 >= S1.i32) + D0.i32 = ((S0.i32) if (SCC) else (S1.i32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_MAX_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u32 >= S1.u32; + # D0.u32 = SCC ? 
S0.u32 : S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u32 >= S1.u32) + D0.u32 = ((S0.u32) if (SCC) else (S1.u32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_AND_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 & S1.u32); + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 & S1.u32) + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_AND_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = (S0.u64 & S1.u64); + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = (S0.u64 & S1.u64) + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_OR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 | S1.u32); + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 | S1.u32) + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_OR_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = (S0.u64 | S1.u64); + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, 
SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = (S0.u64 | S1.u64) + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_XOR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 ^ S1.u32); + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 ^ S1.u32) + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_XOR_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = (S0.u64 ^ S1.u64); + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = (S0.u64 ^ S1.u64) + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_NAND_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ~(S0.u32 & S1.u32); + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ~(S0.u32 & S1.u32) + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_NAND_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = ~(S0.u64 & S1.u64); + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = ~(S0.u64 & S1.u64) + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: 
result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_NOR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ~(S0.u32 | S1.u32); + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ~(S0.u32 | S1.u32) + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_NOR_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = ~(S0.u64 | S1.u64); + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = ~(S0.u64 | S1.u64) + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_XNOR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ~(S0.u32 ^ S1.u32); + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ~(S0.u32 ^ S1.u32) + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_XNOR_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = ~(S0.u64 ^ S1.u64); + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = ~(S0.u64 ^ S1.u64) + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_AND_NOT1_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + 
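# Note: AND_NOT1 is RDNA3's rename of the older S_ANDN2: D0 = S0 & ~S1 clears in S0 every bit that is set in S1, and, + # like the other SOP2 bitwise ops, SCC is set iff any bits survive. +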
# D0.u32 = (S0.u32 & ~S1.u32); + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 & ~S1.u32) + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_AND_NOT1_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = (S0.u64 & ~S1.u64); + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = (S0.u64 & ~S1.u64) + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_OR_NOT1_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 | ~S1.u32); + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 | ~S1.u32) + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_OR_NOT1_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = (S0.u64 | ~S1.u64); + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = (S0.u64 | ~S1.u64) + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_BFE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ((S0.u32 >> S1[4 : 0].u32) & ((1U << S1[22 : 16].u32) - 1U)); + # SCC = D0.u32 != 0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = 
SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S0.u32 >> S1[4 : 0].u32) & ((1 << S1[22 : 16].u32) - 1)) + SCC = Reg(D0.u32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_BFE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp.i32 = ((S0.i32 >> S1[4 : 0].u32) & ((1 << S1[22 : 16].u32) - 1)); + # D0.i32 = signext_from_bit(tmp.i32, S1[22 : 16].u32); + # SCC = D0.i32 != 0 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp.i32 = ((S0.i32 >> S1[4 : 0].u32) & ((1 << S1[22 : 16].u32) - 1)) + D0.i32 = signext_from_bit(tmp.i32, S1[22 : 16].u32) + SCC = Reg(D0.i32 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_BFE_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = ((S0.u64 >> S1[5 : 0].u32) & ((1ULL << S1[22 : 16].u32) - 1ULL)); + # SCC = D0.u64 != 0ULL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = ((S0.u64 >> S1[5 : 0].u32) & ((1 << S1[22 : 16].u32) - 1)) + SCC = Reg(D0.u64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_BFE_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp.i64 = ((S0.i64 >> S1[5 : 0].u32) & ((1LL << S1[22 : 16].u32) - 1LL)); + # D0.i64 = signext_from_bit(tmp.i64, S1[22 : 16].u32); + # SCC = D0.i64 != 0LL + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp.i64 = ((S0.i64 >> S1[5 : 0].u32) & ((1 << S1[22 : 16].u32) - 1)) + D0.i64 = signext_from_bit(tmp.i64, S1[22 : 16].u32) + SCC = Reg(D0.i64 != 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_BFM_B32(s0, s1, s2, d0, scc, vcc, lane, 
exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (((1U << S0[4 : 0].u32) - 1U) << S1[4 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (((1 << S0[4 : 0].u32) - 1) << S1[4 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_BFM_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = (((1ULL << S0[5 : 0].u32) - 1ULL) << S1[5 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = (((1 << S0[5 : 0].u32) - 1) << S1[5 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_MUL_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = S0.i32 * S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = S0.i32 * S1.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_MUL_HI_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = 32'U((64'U(S0.u32) * 64'U(S1.u32)) >> 32U) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (((S0.u32) * (S1.u32)) >> 32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_MUL_HI_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I((64'I(S0.i32) * 64'I(S1.i32)) >> 32U) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), 
Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (((S0.i32) * (S1.i32)) >> 32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_CSELECT_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = SCC ? S0.u32 : S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S0.u32) if (SCC) else (S1.u32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_CSELECT_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = SCC ? S0.u64 : S1.u64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = ((S0.u64) if (SCC) else (S1.u64)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _SOP2Op_S_PACK_LL_B32_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0 = { S1[15 : 0].u16, S0[15 : 0].u16 } + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0 = Reg(_pack(S1[15 : 0].u16, S0[15 : 0].u16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_PACK_LH_B32_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0 = { S1[31 : 16].u16, S0[15 : 0].u16 } + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0 = Reg(_pack(S1[31 : 16].u16, S0[15 : 0].u16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: 
result['exec'] = EXEC._val + return result + +def _SOP2Op_S_PACK_HH_B32_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0 = { S1[31 : 16].u16, S0[31 : 16].u16 } + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0 = Reg(_pack(S1[31 : 16].u16, S0[31 : 16].u16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_PACK_HL_B32_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0 = { S1[15 : 0].u16, S0[31 : 16].u16 } + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0 = Reg(_pack(S1[15 : 0].u16, S0[31 : 16].u16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_ADD_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S0.f32 + S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S0.f32 + S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_SUB_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S0.f32 - S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S0.f32 - S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_MIN_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where -0.0 < +0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(64'F(S0.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S0.f32))) + # elsif isSignalNAN(64'F(S1.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32))) + # elsif 
isQuietNAN(64'F(S1.f32)) then + # D0.f32 = S0.f32 + # elsif isQuietNAN(64'F(S0.f32)) then + # D0.f32 = S1.f32 + # elsif LT_NEG_ZERO(S0.f32, S1.f32) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f32 = S0.f32 + # else + # D0.f32 = S1.f32 + # endif + # else + # if isNAN(64'F(S1.f32)) then + # D0.f32 = S0.f32 + # elsif isNAN(64'F(S0.f32)) then + # D0.f32 = S1.f32 + # elsif LT_NEG_ZERO(S0.f32, S1.f32) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f32 = S0.f32 + # else + # D0.f32 = S1.f32 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(F(S0.f32)): + D0.f32 = F(cvtToQuietNAN(F(S0.f32))) + elif isSignalNAN(F(S1.f32)): + D0.f32 = F(cvtToQuietNAN(F(S1.f32))) + elif isQuietNAN(F(S1.f32)): + D0.f32 = S0.f32 + elif isQuietNAN(F(S0.f32)): + D0.f32 = S1.f32 + elif LT_NEG_ZERO(S0.f32, S1.f32): + D0.f32 = S0.f32 + else: + D0.f32 = S1.f32 + else: + if isNAN(F(S1.f32)): + D0.f32 = S0.f32 + elif isNAN(F(S0.f32)): + D0.f32 = S1.f32 + elif LT_NEG_ZERO(S0.f32, S1.f32): + D0.f32 = S0.f32 + else: + D0.f32 = S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_MAX_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where +0.0 > -0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(64'F(S0.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S0.f32))) + # elsif isSignalNAN(64'F(S1.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32))) + # elsif isQuietNAN(64'F(S1.f32)) then + # D0.f32 = S0.f32 + # elsif isQuietNAN(64'F(S0.f32)) then + # D0.f32 = S1.f32 + # elsif GT_NEG_ZERO(S0.f32, S1.f32) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f32 = S0.f32 + # else + # D0.f32 = S1.f32 + # endif + # else + # if isNAN(64'F(S1.f32)) then + # D0.f32 = S0.f32 + # elsif isNAN(64'F(S0.f32)) then + # D0.f32 = S1.f32 + # elsif GT_NEG_ZERO(S0.f32, S1.f32) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f32 = S0.f32 + # else + # D0.f32 = S1.f32 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(F(S0.f32)): + D0.f32 = F(cvtToQuietNAN(F(S0.f32))) + elif isSignalNAN(F(S1.f32)): + D0.f32 = F(cvtToQuietNAN(F(S1.f32))) + elif isQuietNAN(F(S1.f32)): + D0.f32 = S0.f32 + elif isQuietNAN(F(S0.f32)): + D0.f32 = S1.f32 + elif GT_NEG_ZERO(S0.f32, S1.f32): + D0.f32 = S0.f32 + else: + D0.f32 = S1.f32 + else: + if isNAN(F(S1.f32)): + D0.f32 = S0.f32 + elif isNAN(F(S0.f32)): + D0.f32 = S1.f32 + elif GT_NEG_ZERO(S0.f32, S1.f32): + D0.f32 = S0.f32 + else: + 
D0.f32 = S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_MUL_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S0.f32 * S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S0.f32 * S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_FMAAK_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = fma(S0.f32, S1.f32, SIMM32.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = fma(S0.f32, S1.f32, SIMM32.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_FMAMK_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = fma(S0.f32, SIMM32.f32, S1.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = fma(S0.f32, SIMM32.f32, S1.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_FMAC_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = fma(S0.f32, S1.f32, D0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = fma(S0.f32, S1.f32, D0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_CVT_PK_RTZ_F16_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # prev_mode = ROUND_MODE; + # tmp[15 : 0].f16 = f32_to_f16(S0.f32); + # tmp[31 : 16].f16 
= f32_to_f16(S1.f32); + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + prev_mode = ROUND_MODE + tmp[15 : 0].f16 = f32_to_f16(S0.f32) + tmp[31 : 16].f16 = f32_to_f16(S1.f32) + D0 = Reg(tmp._val)  # assumed fix: the ISA pseudocode ends with D0 = tmp, but the generated block never wrote D0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_ADD_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 + S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 + S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_SUB_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 - S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 - S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_MIN_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where -0.0 < +0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(64'F(S0.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S0.f16))) + # elsif isSignalNAN(64'F(S1.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16))) + # elsif isQuietNAN(64'F(S1.f16)) then + # D0.f16 = S0.f16 + # elsif isQuietNAN(64'F(S0.f16)) then + # D0.f16 = S1.f16 + # elsif LT_NEG_ZERO(S0.f16, S1.f16) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f16 = S0.f16 + # else + # D0.f16 = S1.f16 + # endif + # else + # if isNAN(64'F(S1.f16)) then + # D0.f16 = S0.f16 + # elsif isNAN(64'F(S0.f16)) then + # D0.f16 = S1.f16 + # elsif LT_NEG_ZERO(S0.f16, S1.f16) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f16 = S0.f16 + # else + # D0.f16 = S1.f16 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + 
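# LT_NEG_ZERO(a, b) is the ordering helper the generated code assumes: like <, except -0.0 < +0.0 is true (under IEEE 754 compares, -0.0 == +0.0). +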
# --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(F(S0.f16)): + D0.f16 = F(cvtToQuietNAN(F(S0.f16))) + elif isSignalNAN(F(S1.f16)): + D0.f16 = F(cvtToQuietNAN(F(S1.f16))) + elif isQuietNAN(F(S1.f16)): + D0.f16 = S0.f16 + elif isQuietNAN(F(S0.f16)): + D0.f16 = S1.f16 + elif LT_NEG_ZERO(S0.f16, S1.f16): + D0.f16 = S0.f16 + else: + D0.f16 = S1.f16 + else: + if isNAN(F(S1.f16)): + D0.f16 = S0.f16 + elif isNAN(F(S0.f16)): + D0.f16 = S1.f16 + elif LT_NEG_ZERO(S0.f16, S1.f16): + D0.f16 = S0.f16 + else: + D0.f16 = S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_MAX_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where +0.0 > -0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(64'F(S0.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S0.f16))) + # elsif isSignalNAN(64'F(S1.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16))) + # elsif isQuietNAN(64'F(S1.f16)) then + # D0.f16 = S0.f16 + # elsif isQuietNAN(64'F(S0.f16)) then + # D0.f16 = S1.f16 + # elsif GT_NEG_ZERO(S0.f16, S1.f16) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f16 = S0.f16 + # else + # D0.f16 = S1.f16 + # endif + # else + # if isNAN(64'F(S1.f16)) then + # D0.f16 = S0.f16 + # elsif isNAN(64'F(S0.f16)) then + # D0.f16 = S1.f16 + # elsif GT_NEG_ZERO(S0.f16, S1.f16) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f16 = S0.f16 + # else + # D0.f16 = S1.f16 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(F(S0.f16)): + D0.f16 = F(cvtToQuietNAN(F(S0.f16))) + elif isSignalNAN(F(S1.f16)): + D0.f16 = F(cvtToQuietNAN(F(S1.f16))) + elif isQuietNAN(F(S1.f16)): + D0.f16 = S0.f16 + elif isQuietNAN(F(S0.f16)): + D0.f16 = S1.f16 + elif GT_NEG_ZERO(S0.f16, S1.f16): + D0.f16 = S0.f16 + else: + D0.f16 = S1.f16 + else: + if isNAN(F(S1.f16)): + D0.f16 = S0.f16 + elif isNAN(F(S0.f16)): + D0.f16 = S1.f16 + elif GT_NEG_ZERO(S0.f16, S1.f16): + D0.f16 = S0.f16 + else: + D0.f16 = S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOP2Op_S_MUL_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 * S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 * S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: 
result['exec'] = EXEC._val + return result + +def _SOP2Op_S_FMAC_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = fma(S0.f16, S1.f16, D0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = fma(S0.f16, S1.f16, D0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +SOP2Op_FUNCTIONS = { + SOP2Op.S_ADD_U32: _SOP2Op_S_ADD_U32, + SOP2Op.S_SUB_U32: _SOP2Op_S_SUB_U32, + SOP2Op.S_ADD_I32: _SOP2Op_S_ADD_I32, + SOP2Op.S_SUB_I32: _SOP2Op_S_SUB_I32, + SOP2Op.S_ADDC_U32: _SOP2Op_S_ADDC_U32, + SOP2Op.S_SUBB_U32: _SOP2Op_S_SUBB_U32, + SOP2Op.S_ABSDIFF_I32: _SOP2Op_S_ABSDIFF_I32, + SOP2Op.S_LSHL_B32: _SOP2Op_S_LSHL_B32, + SOP2Op.S_LSHL_B64: _SOP2Op_S_LSHL_B64, + SOP2Op.S_LSHR_B32: _SOP2Op_S_LSHR_B32, + SOP2Op.S_LSHR_B64: _SOP2Op_S_LSHR_B64, + SOP2Op.S_ASHR_I32: _SOP2Op_S_ASHR_I32, + SOP2Op.S_ASHR_I64: _SOP2Op_S_ASHR_I64, + SOP2Op.S_LSHL1_ADD_U32: _SOP2Op_S_LSHL1_ADD_U32, + SOP2Op.S_LSHL2_ADD_U32: _SOP2Op_S_LSHL2_ADD_U32, + SOP2Op.S_LSHL3_ADD_U32: _SOP2Op_S_LSHL3_ADD_U32, + SOP2Op.S_LSHL4_ADD_U32: _SOP2Op_S_LSHL4_ADD_U32, + SOP2Op.S_MIN_I32: _SOP2Op_S_MIN_I32, + SOP2Op.S_MIN_U32: _SOP2Op_S_MIN_U32, + SOP2Op.S_MAX_I32: _SOP2Op_S_MAX_I32, + SOP2Op.S_MAX_U32: _SOP2Op_S_MAX_U32, + SOP2Op.S_AND_B32: _SOP2Op_S_AND_B32, + SOP2Op.S_AND_B64: _SOP2Op_S_AND_B64, + SOP2Op.S_OR_B32: _SOP2Op_S_OR_B32, + SOP2Op.S_OR_B64: _SOP2Op_S_OR_B64, + SOP2Op.S_XOR_B32: _SOP2Op_S_XOR_B32, + SOP2Op.S_XOR_B64: _SOP2Op_S_XOR_B64, + SOP2Op.S_NAND_B32: _SOP2Op_S_NAND_B32, + SOP2Op.S_NAND_B64: _SOP2Op_S_NAND_B64, + SOP2Op.S_NOR_B32: _SOP2Op_S_NOR_B32, + SOP2Op.S_NOR_B64: _SOP2Op_S_NOR_B64, + SOP2Op.S_XNOR_B32: _SOP2Op_S_XNOR_B32, + SOP2Op.S_XNOR_B64: _SOP2Op_S_XNOR_B64, + SOP2Op.S_AND_NOT1_B32: _SOP2Op_S_AND_NOT1_B32, + SOP2Op.S_AND_NOT1_B64: _SOP2Op_S_AND_NOT1_B64, + SOP2Op.S_OR_NOT1_B32: _SOP2Op_S_OR_NOT1_B32, + SOP2Op.S_OR_NOT1_B64: _SOP2Op_S_OR_NOT1_B64, + SOP2Op.S_BFE_U32: _SOP2Op_S_BFE_U32, + SOP2Op.S_BFE_I32: _SOP2Op_S_BFE_I32, + SOP2Op.S_BFE_U64: _SOP2Op_S_BFE_U64, + SOP2Op.S_BFE_I64: _SOP2Op_S_BFE_I64, + SOP2Op.S_BFM_B32: _SOP2Op_S_BFM_B32, + SOP2Op.S_BFM_B64: _SOP2Op_S_BFM_B64, + SOP2Op.S_MUL_I32: _SOP2Op_S_MUL_I32, + SOP2Op.S_MUL_HI_U32: _SOP2Op_S_MUL_HI_U32, + SOP2Op.S_MUL_HI_I32: _SOP2Op_S_MUL_HI_I32, + SOP2Op.S_CSELECT_B32: _SOP2Op_S_CSELECT_B32, + SOP2Op.S_CSELECT_B64: _SOP2Op_S_CSELECT_B64, + SOP2Op.S_PACK_LL_B32_B16: _SOP2Op_S_PACK_LL_B32_B16, + SOP2Op.S_PACK_LH_B32_B16: _SOP2Op_S_PACK_LH_B32_B16, + SOP2Op.S_PACK_HH_B32_B16: _SOP2Op_S_PACK_HH_B32_B16, + SOP2Op.S_PACK_HL_B32_B16: _SOP2Op_S_PACK_HL_B32_B16, + SOP2Op.S_ADD_F32: _SOP2Op_S_ADD_F32, + SOP2Op.S_SUB_F32: _SOP2Op_S_SUB_F32, + SOP2Op.S_MIN_F32: _SOP2Op_S_MIN_F32, + SOP2Op.S_MAX_F32: _SOP2Op_S_MAX_F32, + SOP2Op.S_MUL_F32: _SOP2Op_S_MUL_F32, + SOP2Op.S_FMAAK_F32: _SOP2Op_S_FMAAK_F32, + SOP2Op.S_FMAMK_F32: _SOP2Op_S_FMAMK_F32, + SOP2Op.S_FMAC_F32: _SOP2Op_S_FMAC_F32, + SOP2Op.S_CVT_PK_RTZ_F16_F32: _SOP2Op_S_CVT_PK_RTZ_F16_F32, + SOP2Op.S_ADD_F16: _SOP2Op_S_ADD_F16, + SOP2Op.S_SUB_F16: _SOP2Op_S_SUB_F16, + 
SOP2Op.S_MIN_F16: _SOP2Op_S_MIN_F16, + SOP2Op.S_MAX_F16: _SOP2Op_S_MAX_F16, + SOP2Op.S_MUL_F16: _SOP2Op_S_MUL_F16, + SOP2Op.S_FMAC_F16: _SOP2Op_S_FMAC_F16, +} + +def _SOPCOp_S_CMP_EQ_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.i32 == S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.i32 == S1.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LG_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.i32 <> S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.i32 != S1.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_GT_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.i32 > S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.i32 > S1.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_GE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.i32 >= S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.i32 >= S1.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LT_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.i32 < S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), 
Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.i32 < S1.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.i32 <= S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.i32 <= S1.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_EQ_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u32 == S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u32 == S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LG_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u32 <> S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u32 != S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_GT_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u32 > S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u32 > S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_GE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = 
S0.u32 >= S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u32 >= S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LT_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u32 < S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u32 < S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u32 <= S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u32 <= S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_BITCMP0_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u32[S1.u32[4 : 0]] == 1'0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u32[S1.u32[4 : 0]] == 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_BITCMP1_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u32[S1.u32[4 : 0]] == 1'1U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u32[S1.u32[4 : 0]] == 1) + # --- end pseudocode --- + result = {'d0': D0._val, 
'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_BITCMP0_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u64[S1.u32[5 : 0]] == 1'0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u64[S1.u32[5 : 0]] == 0) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_BITCMP1_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u64[S1.u32[5 : 0]] == 1'1U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u64[S1.u32[5 : 0]] == 1) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_EQ_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u64 == S1.u64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u64 == S1.u64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LG_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.u64 <> S1.u64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.u64 != S1.u64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.f32 < S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), 
SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.f32 < S1.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.f16 < S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.f16 < S1.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_EQ_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.f32 == S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.f32 == S1.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_EQ_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.f16 == S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.f16 == S1.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.f32 <= S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.f32 <= S1.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, 
literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.f16 <= S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.f16 <= S1.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_GT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.f32 > S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.f32 > S1.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_GT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.f16 > S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.f16 > S1.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LG_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.f32 <> S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.f32 < S1.f32 or S0.f32 > S1.f32)  # '<>' is an ordered compare: false if either input is NaN (plain != would be true) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_LG_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.f16 <> S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.f16 < S1.f16 or S0.f16 > S1.f16)  # '<>' is an ordered compare: false if either input is NaN + # --- end pseudocode --- + result = {'d0': 
D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_GE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.f32 >= S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.f32 >= S1.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_GE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = S0.f16 >= S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(S0.f16 >= S1.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_O_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = (!isNAN(64'F(S0.f32)) && !isNAN(64'F(S1.f32))) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(( not isNAN(F(S0.f32)) and not isNAN(F(S1.f32)))) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_O_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = (!isNAN(64'F(S0.f16)) && !isNAN(64'F(S1.f16))) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg(( not isNAN(F(S0.f16)) and not isNAN(F(S1.f16)))) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_U_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = (isNAN(64'F(S0.f32)) || isNAN(64'F(S1.f32))) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), 
Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg((isNAN(F(S0.f32)) or isNAN(F(S1.f32)))) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_U_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = (isNAN(64'F(S0.f16)) || isNAN(64'F(S1.f16))) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg((isNAN(F(S0.f16)) or isNAN(F(S1.f16)))) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_NGE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = !(S0.f32 >= S1.f32); + # // With NAN inputs this is not the same operation as < + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg( not (S0.f32 >= S1.f32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_NGE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = !(S0.f16 >= S1.f16); + # // With NAN inputs this is not the same operation as < + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg( not (S0.f16 >= S1.f16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_NLG_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = !(S0.f32 <> S1.f32); + # // With NAN inputs this is not the same operation as == + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, 
VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg( not (S0.f32 != S1.f32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_NLG_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = !(S0.f16 <> S1.f16); + # // With NAN inputs this is not the same operation as == + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg( not (S0.f16 != S1.f16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_NGT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = !(S0.f32 > S1.f32); + # // With NAN inputs this is not the same operation as <= + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg( not (S0.f32 > S1.f32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_NGT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = !(S0.f16 > S1.f16); + # // With NAN inputs this is not the same operation as <= + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg( not (S0.f16 > S1.f16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_NLE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = !(S0.f32 <= S1.f32); + # // With NAN inputs this is not the same operation as > + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg( not (S0.f32 <= S1.f32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> 
lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_NLE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = !(S0.f16 <= S1.f16); + # // With NAN inputs this is not the same operation as > + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg( not (S0.f16 <= S1.f16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_NEQ_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = !(S0.f32 == S1.f32); + # // With NAN inputs this is not the same operation as != + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg( not (S0.f32 == S1.f32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_NEQ_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = !(S0.f16 == S1.f16); + # // With NAN inputs this is not the same operation as != + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg( not (S0.f16 == S1.f16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_NLT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = !(S0.f32 < S1.f32); + # // With NAN inputs this is not the same operation as >= + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg( not (S0.f32 < S1.f32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _SOPCOp_S_CMP_NLT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # SCC = !(S0.f16 < S1.f16); + # // 
With NAN inputs this is not the same operation as >= + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + SCC = Reg( not (S0.f16 < S1.f16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +SOPCOp_FUNCTIONS = { + SOPCOp.S_CMP_EQ_I32: _SOPCOp_S_CMP_EQ_I32, + SOPCOp.S_CMP_LG_I32: _SOPCOp_S_CMP_LG_I32, + SOPCOp.S_CMP_GT_I32: _SOPCOp_S_CMP_GT_I32, + SOPCOp.S_CMP_GE_I32: _SOPCOp_S_CMP_GE_I32, + SOPCOp.S_CMP_LT_I32: _SOPCOp_S_CMP_LT_I32, + SOPCOp.S_CMP_LE_I32: _SOPCOp_S_CMP_LE_I32, + SOPCOp.S_CMP_EQ_U32: _SOPCOp_S_CMP_EQ_U32, + SOPCOp.S_CMP_LG_U32: _SOPCOp_S_CMP_LG_U32, + SOPCOp.S_CMP_GT_U32: _SOPCOp_S_CMP_GT_U32, + SOPCOp.S_CMP_GE_U32: _SOPCOp_S_CMP_GE_U32, + SOPCOp.S_CMP_LT_U32: _SOPCOp_S_CMP_LT_U32, + SOPCOp.S_CMP_LE_U32: _SOPCOp_S_CMP_LE_U32, + SOPCOp.S_BITCMP0_B32: _SOPCOp_S_BITCMP0_B32, + SOPCOp.S_BITCMP1_B32: _SOPCOp_S_BITCMP1_B32, + SOPCOp.S_BITCMP0_B64: _SOPCOp_S_BITCMP0_B64, + SOPCOp.S_BITCMP1_B64: _SOPCOp_S_BITCMP1_B64, + SOPCOp.S_CMP_EQ_U64: _SOPCOp_S_CMP_EQ_U64, + SOPCOp.S_CMP_LG_U64: _SOPCOp_S_CMP_LG_U64, + SOPCOp.S_CMP_LT_F32: _SOPCOp_S_CMP_LT_F32, + SOPCOp.S_CMP_LT_F16: _SOPCOp_S_CMP_LT_F16, + SOPCOp.S_CMP_EQ_F32: _SOPCOp_S_CMP_EQ_F32, + SOPCOp.S_CMP_EQ_F16: _SOPCOp_S_CMP_EQ_F16, + SOPCOp.S_CMP_LE_F32: _SOPCOp_S_CMP_LE_F32, + SOPCOp.S_CMP_LE_F16: _SOPCOp_S_CMP_LE_F16, + SOPCOp.S_CMP_GT_F32: _SOPCOp_S_CMP_GT_F32, + SOPCOp.S_CMP_GT_F16: _SOPCOp_S_CMP_GT_F16, + SOPCOp.S_CMP_LG_F32: _SOPCOp_S_CMP_LG_F32, + SOPCOp.S_CMP_LG_F16: _SOPCOp_S_CMP_LG_F16, + SOPCOp.S_CMP_GE_F32: _SOPCOp_S_CMP_GE_F32, + SOPCOp.S_CMP_GE_F16: _SOPCOp_S_CMP_GE_F16, + SOPCOp.S_CMP_O_F32: _SOPCOp_S_CMP_O_F32, + SOPCOp.S_CMP_O_F16: _SOPCOp_S_CMP_O_F16, + SOPCOp.S_CMP_U_F32: _SOPCOp_S_CMP_U_F32, + SOPCOp.S_CMP_U_F16: _SOPCOp_S_CMP_U_F16, + SOPCOp.S_CMP_NGE_F32: _SOPCOp_S_CMP_NGE_F32, + SOPCOp.S_CMP_NGE_F16: _SOPCOp_S_CMP_NGE_F16, + SOPCOp.S_CMP_NLG_F32: _SOPCOp_S_CMP_NLG_F32, + SOPCOp.S_CMP_NLG_F16: _SOPCOp_S_CMP_NLG_F16, + SOPCOp.S_CMP_NGT_F32: _SOPCOp_S_CMP_NGT_F32, + SOPCOp.S_CMP_NGT_F16: _SOPCOp_S_CMP_NGT_F16, + SOPCOp.S_CMP_NLE_F32: _SOPCOp_S_CMP_NLE_F32, + SOPCOp.S_CMP_NLE_F16: _SOPCOp_S_CMP_NLE_F16, + SOPCOp.S_CMP_NEQ_F32: _SOPCOp_S_CMP_NEQ_F32, + SOPCOp.S_CMP_NEQ_F16: _SOPCOp_S_CMP_NEQ_F16, + SOPCOp.S_CMP_NLT_F32: _SOPCOp_S_CMP_NLT_F32, + SOPCOp.S_CMP_NLT_F16: _SOPCOp_S_CMP_NLT_F16, +} + +def _SOPKOp_S_MOVK_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I(signext(SIMM16.i16)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (signext(SIMM16.i16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + 
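+
+# Every handler in this file follows the same shape: the raw operand words are
+# wrapped in Reg/SliceProxy so the compiled pseudocode can use typed views
+# (.f32, .u32, bit slices), and the returned dict always carries 'd0' and
+# 'scc', adding 'vcc_lane'/'exec' only when the handler changed VCC or EXEC.
+# Illustrative call (made-up values, not part of the generated output):
+#   out = _SOPKOp_S_MOVK_I32(0, 0, 0, 0, 0, 0, 0, 1, 0x8000, None, {})
+#   out['d0']   # the sign-extended 16-bit immediate as a 32-bit value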
+
+def _SOPKOp_S_VERSION(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # // Do nothing - for use by tools only
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMOVK_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # if SCC then
+  # D0.i32 = 32'I(signext(SIMM16.i16))
+  # endif
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  if SCC:
+    D0.i32 = (signext(SIMM16.i16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMPK_EQ_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = 64'I(S0.i32) == signext(SIMM16.i16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  SCC = Reg((S0.i32) == signext(SIMM16.i16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMPK_LG_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = 64'I(S0.i32) != signext(SIMM16.i16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  SCC = Reg((S0.i32) != signext(SIMM16.i16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMPK_GT_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = 64'I(S0.i32) > signext(SIMM16.i16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  SCC = Reg((S0.i32) > signext(SIMM16.i16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMPK_GE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = 64'I(S0.i32) >= signext(SIMM16.i16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  SCC = Reg((S0.i32) >= signext(SIMM16.i16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMPK_LT_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = 64'I(S0.i32) < signext(SIMM16.i16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  SCC = Reg((S0.i32) < signext(SIMM16.i16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMPK_LE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = 64'I(S0.i32) <= signext(SIMM16.i16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  SCC = Reg((S0.i32) <= signext(SIMM16.i16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMPK_EQ_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32 == 32'U(SIMM16.u16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32 == (SIMM16.u16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMPK_LG_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32 != 32'U(SIMM16.u16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32 != (SIMM16.u16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMPK_GT_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32 > 32'U(SIMM16.u16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32 > (SIMM16.u16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMPK_GE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32 >= 32'U(SIMM16.u16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32 >= (SIMM16.u16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMPK_LT_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32 < 32'U(SIMM16.u16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32 < (SIMM16.u16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_CMPK_LE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32 <= 32'U(SIMM16.u16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32 <= (SIMM16.u16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_ADDK_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp = D0.i32;
+  # D0.i32 = 32'I(64'I(D0.i32) + signext(SIMM16.i16));
+  # SCC = ((tmp[31] == SIMM16.i16[15]) && (tmp[31] != D0.i32[31]));
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp = Reg(D0.i32)
+  D0.i32 = ((D0.i32) + signext(SIMM16.i16))
+  SCC = Reg(((tmp[31] == SIMM16.i16[15]) and (tmp[31] != D0.i32[31])))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPKOp_S_MULK_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.i32 = 32'I(64'I(D0.i32) * signext(SIMM16.i16))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.i32 = ((D0.i32) * signext(SIMM16.i16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+SOPKOp_FUNCTIONS = {
+  SOPKOp.S_MOVK_I32: _SOPKOp_S_MOVK_I32,
+  SOPKOp.S_VERSION: _SOPKOp_S_VERSION,
+  SOPKOp.S_CMOVK_I32: _SOPKOp_S_CMOVK_I32,
+  SOPKOp.S_CMPK_EQ_I32: _SOPKOp_S_CMPK_EQ_I32,
+  SOPKOp.S_CMPK_LG_I32: _SOPKOp_S_CMPK_LG_I32,
+  SOPKOp.S_CMPK_GT_I32: _SOPKOp_S_CMPK_GT_I32,
+  SOPKOp.S_CMPK_GE_I32: _SOPKOp_S_CMPK_GE_I32,
+  SOPKOp.S_CMPK_LT_I32: _SOPKOp_S_CMPK_LT_I32,
+  SOPKOp.S_CMPK_LE_I32: _SOPKOp_S_CMPK_LE_I32,
+  SOPKOp.S_CMPK_EQ_U32: _SOPKOp_S_CMPK_EQ_U32,
+  SOPKOp.S_CMPK_LG_U32: _SOPKOp_S_CMPK_LG_U32,
+  SOPKOp.S_CMPK_GT_U32: _SOPKOp_S_CMPK_GT_U32,
+  SOPKOp.S_CMPK_GE_U32: _SOPKOp_S_CMPK_GE_U32,
+  SOPKOp.S_CMPK_LT_U32: _SOPKOp_S_CMPK_LT_U32,
+  SOPKOp.S_CMPK_LE_U32: _SOPKOp_S_CMPK_LE_U32,
+  SOPKOp.S_ADDK_I32: _SOPKOp_S_ADDK_I32,
+  SOPKOp.S_MULK_I32: _SOPKOp_S_MULK_I32,
+}
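+
+# Dispatch sketch (illustrative; the consuming emulator and its variable names
+# are assumptions, not part of this generated file): look the opcode up in the
+# table and call the handler with the decoded operands.
+#   fn = SOPKOp_FUNCTIONS[SOPKOp.S_ADDK_I32]
+#   out = fn(s0, 0, 0, d0, scc, vcc, lane, exec_mask, simm16, VGPR, {})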
+
+def _SOPPOp_S_NOP(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # for i in 0U : SIMM16.u16[3 : 0].u32 do
+  # endfor
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  for i in range(0, int(SIMM16.u16[3 : 0].u32)+1):
+    pass
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPPOp_S_DELAY_ALU(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # instruction may be omitted. For wave64 the compiler may not know the status of the EXEC mask and hence
+  # // 1 cycle delay here
+  # // 2 cycles delay here
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _SOPPOp_S_TRAP(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # // PC passed into trap handler points to S_TRAP itself,
+  # // trap base address
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+SOPPOp_FUNCTIONS = {
+  SOPPOp.S_NOP: _SOPPOp_S_NOP,
+  SOPPOp.S_DELAY_ALU: _SOPPOp_S_DELAY_ALU,
+  SOPPOp.S_TRAP: _SOPPOp_S_TRAP,
+}
+
+def _VOP1Op_V_MOV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.b32 = S0.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.b32 = S0.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_READFIRSTLANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # declare lane : 32'U;
+  # if WAVE64 then
+  # // 64 lanes
+  # if EXEC == 0x0LL then
+  # lane = 0U;
+  # // Force lane 0 if all lanes are disabled
+  # else
+  # lane = 32'U(s_ff1_i32_b64(EXEC));
+  # // Lowest active lane
+  # endif
+  # else
+  # // 32 lanes
+  # if EXEC_LO.i32 == 0 then
+  # lane = 0U;
+  # // Force lane 0 if all lanes are disabled
+  # else
+  # lane = 32'U(s_ff1_i32_b32(EXEC_LO));
+  # // Lowest active lane
+  # endif
+  # endif;
+  # D0.b32 = VGPR[lane][SRC0.u32]
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  if WAVE64:
+    if EXEC == 0x0:
+      lane = 0
+    else:
+      lane = (s_ff1_i32_b64(EXEC))
+  else:
+    if EXEC_LO.i32 == 0:
+      lane = 0
+    else:
+      lane = (s_ff1_i32_b32(EXEC_LO))
+  D0.b32 = VGPR[lane][SRC0.u32]
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_I32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.i32 = f64_to_i32(S0.f64)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.i32 = f64_to_i32(S0.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_F64_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = i32_to_f64(S0.i32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f64 = i32_to_f64(S0.i32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_CVT_F32_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = i32_to_f32(S0.i32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = i32_to_f32(S0.i32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_F32_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = u32_to_f32(S0.u32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = u32_to_f32(S0.u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
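+
+# Handlers with a 64-bit destination (the f64 results above and below) also set
+# result['d0_64'] = True, presumably so the caller writes back both 32-bit
+# halves of the destination register pair.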
+
+def _VOP1Op_V_CVT_U32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u32 = f32_to_u32(S0.f32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u32 = f32_to_u32(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_I32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.i32 = f32_to_i32(S0.f32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.i32 = f32_to_i32(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_F16_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f16 = f32_to_f16(S0.f32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f16 = f32_to_f16(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_F32_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = f16_to_f32(S0.f16)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = f16_to_f32(S0.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_NEAREST_I32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.i32 = f32_to_i32(floor(S0.f32 + 0.5F))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.i32 = f32_to_i32(floor(S0.f32 + 0.5))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_FLOOR_I32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.i32 = f32_to_i32(floor(S0.f32))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.i32 = f32_to_i32(floor(S0.f32))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_F32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = f64_to_f32(S0.f64)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = f64_to_f32(S0.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_F64_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = f32_to_f64(S0.f32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f64 = f32_to_f64(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_CVT_F32_UBYTE0(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = u32_to_f32(S0[7 : 0].u32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = u32_to_f32(S0[7 : 0].u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_F32_UBYTE1(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = u32_to_f32(S0[15 : 8].u32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = u32_to_f32(S0[15 : 8].u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_F32_UBYTE2(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = u32_to_f32(S0[23 : 16].u32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = u32_to_f32(S0[23 : 16].u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_F32_UBYTE3(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = u32_to_f32(S0[31 : 24].u32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = u32_to_f32(S0[31 : 24].u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_U32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u32 = f64_to_u32(S0.f64)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u32 = f64_to_u32(S0.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_F64_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = u32_to_f64(S0.u32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f64 = u32_to_f64(S0.u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_TRUNC_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = trunc(S0.f64)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f64 = trunc(S0.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_CEIL_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = trunc(S0.f64);
+  # if ((S0.f64 > 0.0) && (S0.f64 != D0.f64)) then
+  # D0.f64 += 1.0
+  # endif
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f64 = trunc(S0.f64)
+  if ((S0.f64 > 0.0) and (S0.f64 != D0.f64)):
+    D0.f64 += 1.0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_RNDNE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = floor(S0.f64 + 0.5);
+  # if (isEven(floor(S0.f64)) && (fract(S0.f64) == 0.5)) then
+  # D0.f64 -= 1.0
+  # endif
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f64 = floor(S0.f64 + 0.5)
+  if (isEven(floor(S0.f64)) and (fract(S0.f64) == 0.5)):
+    D0.f64 -= 1.0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_FLOOR_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = trunc(S0.f64);
+  # if ((S0.f64 < 0.0) && (S0.f64 != D0.f64)) then
+  # D0.f64 += -1.0
+  # endif
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f64 = trunc(S0.f64)
+  if ((S0.f64 < 0.0) and (S0.f64 != D0.f64)):
+    D0.f64 += -1.0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  return result
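+
+# Worked check of the V_RNDNE_* round-half-to-even pattern above: for S0 = 2.5,
+# floor(2.5 + 0.5) = 3.0, but floor(2.5) = 2.0 is even and fract(2.5) = 0.5, so
+# 1.0 is subtracted and the result is 2.0; for S0 = 1.5, floor(1.5) = 1.0 is
+# odd, nothing is subtracted, and the result is also 2.0 (ties go to even).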
+
+def _VOP1Op_V_MOV_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.b16 = S0.b16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.b16 = S0.b16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_FRACT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = S0.f32 + -floor(S0.f32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = S0.f32 + -floor(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_TRUNC_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = trunc(S0.f32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = trunc(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CEIL_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = trunc(S0.f32);
+  # if ((S0.f32 > 0.0F) && (S0.f32 != D0.f32)) then
+  # D0.f32 += 1.0F
+  # endif
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = trunc(S0.f32)
+  if ((S0.f32 > 0.0) and (S0.f32 != D0.f32)):
+    D0.f32 += 1.0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_RNDNE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = floor(S0.f32 + 0.5F);
+  # if (isEven(64'F(floor(S0.f32))) && (fract(S0.f32) == 0.5F)) then
+  # D0.f32 -= 1.0F
+  # endif
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = floor(S0.f32 + 0.5)
+  if (isEven(F(floor(S0.f32))) and (fract(S0.f32) == 0.5)):
+    D0.f32 -= 1.0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_FLOOR_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = trunc(S0.f32);
+  # if ((S0.f32 < 0.0F) && (S0.f32 != D0.f32)) then
+  # D0.f32 += -1.0F
+  # endif
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = trunc(S0.f32)
+  if ((S0.f32 < 0.0) and (S0.f32 != D0.f32)):
+    D0.f32 += -1.0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_EXP_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = pow(2.0F, S0.f32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = pow(2.0, S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_LOG_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = log2(S0.f32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = log2(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_RCP_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = 1.0F / S0.f32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = 1.0 / S0.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_RCP_IFLAG_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = 1.0F / S0.f32;
+  # // Can only raise integer DIV_BY_ZERO exception
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = 1.0 / S0.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_RSQ_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = 1.0F / sqrt(S0.f32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = 1.0 / sqrt(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_RCP_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = 1.0 / S0.f64
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f64 = 1.0 / S0.f64
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_RSQ_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = 1.0 / sqrt(S0.f64)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f64 = 1.0 / sqrt(S0.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_SQRT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = sqrt(S0.f32)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = sqrt(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_SQRT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = sqrt(S0.f64)
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f64 = sqrt(S0.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_SIN_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = sin(S0.f32 * 32'F(PI * 2.0))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = sin(S0.f32 * F(PI * 2.0))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_COS_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = cos(S0.f32 * 32'F(PI * 2.0))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.f32 = cos(S0.f32 * F(PI * 2.0))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_NOT_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u32 = ~S0.u32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u32
= ~S0.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_BFREV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32[31 : 0] = S0.u32[0 : 31] + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32[31 : 0] = S0.u32[0 : 31] + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_CLZ_I32_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = -1; + # // Set if no ones are found + # for i in 0 : 31 do + # // Search from MSB + # if S0.u32[31 - i] == 1'1U then + # D0.i32 = i; + # endif + # endfor + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = -1 + for i in range(0, int(31)+1): + if S0.u32[31 - i] == 1: + D0.i32 = i; break # Stop at first 1 bit found + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_CTZ_I32_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = -1; + # // Set if no ones are found + # for i in 0 : 31 do + # // Search from LSB + # if S0.u32[i] == 1'1U then + # D0.i32 = i; + # endif + # endfor + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = -1 + for i in range(0, int(31)+1): + if S0.u32[i] == 1: + D0.i32 = i; break # Stop at first 1 bit found + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_CLS_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = -1; + # // Set if all bits are the same + # for i in 1 : 31 do + # // Search from MSB + # if S0.i32[31 - i] != S0.i32[31] then + # D0.i32 = i; + # endif + # endfor + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, 
VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = -1 + for i in range(1, int(31)+1): + if S0.i32[31 - i] != S0.i32[31]: + D0.i32 = i; break # Stop at first bit that differs from the sign bit (mirrors CLZ/CTZ above) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_FREXP_EXP_I32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((S0.f64 == +INF) || (S0.f64 == -INF) || isNAN(S0.f64)) then + # D0.i32 = 0 + # else + # D0.i32 = exponent(S0.f64) - 1023 + 1 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((S0.f64 == INF) or (S0.f64 == (-INF)) or isNAN(S0.f64)): + D0.i32 = 0 + else: + D0.i32 = exponent(S0.f64) - 1023 + 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_FREXP_MANT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((S0.f64 == +INF) || (S0.f64 == -INF) || isNAN(S0.f64)) then + # D0.f64 = S0.f64 + # else + # D0.f64 = mantissa(S0.f64) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((S0.f64 == INF) or (S0.f64 == (-INF)) or isNAN(S0.f64)): + D0.f64 = S0.f64 + else: + D0.f64 = mantissa(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP1Op_V_FRACT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = S0.f64 + -floor(S0.f64) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = S0.f64 + -floor(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP1Op_V_FREXP_EXP_I32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f32) == +INF) || (64'F(S0.f32) == -INF) || isNAN(64'F(S0.f32))) then + # D0.i32 = 0 + # else + # D0.i32 = exponent(S0.f32) - 127 + 1 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI
= SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f32) == INF) or (F(S0.f32) == (-INF)) or isNAN(F(S0.f32))): + D0.i32 = 0 + else: + D0.i32 = exponent(S0.f32) - 127 + 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_FREXP_MANT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f32) == +INF) || (64'F(S0.f32) == -INF) || isNAN(64'F(S0.f32))) then + # D0.f32 = S0.f32 + # else + # D0.f32 = mantissa(S0.f32) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f32) == INF) or (F(S0.f32) == (-INF)) or isNAN(F(S0.f32))): + D0.f32 = S0.f32 + else: + D0.f32 = mantissa(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_MOVRELS_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # addr = SRC0.u32; + # // Raw value from instruction + # D0.b32 = VGPR[laneId][addr].b32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + addr = SRC0.u32 + D0.b32 = VGPR[laneId][addr].b32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_CVT_F16_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = u16_to_f16(S0.u16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = u16_to_f16(S0.u16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_CVT_F16_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = i16_to_f16(S0.i16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + 
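# Reg wraps the raw integer inputs so the compiled pseudocode below can use typed views such as .i16 and .f16 +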
laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = i16_to_f16(S0.i16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_CVT_U16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = f16_to_u16(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = f16_to_u16(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_CVT_I16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i16 = f16_to_i16(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i16 = f16_to_i16(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_RCP_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = 16'1.0 / S0.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = 1.0 / S0.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_SQRT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = sqrt(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = sqrt(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_RSQ_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = 16'1.0 / 
sqrt(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = 1.0 / sqrt(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_LOG_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = log2(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = log2(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_EXP_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = pow(16'2.0, S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = pow(2.0, S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_FREXP_MANT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f16) == +INF) || (64'F(S0.f16) == -INF) || isNAN(64'F(S0.f16))) then + # D0.f16 = S0.f16 + # else + # D0.f16 = mantissa(S0.f16) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f16) == INF) or (F(S0.f16) == (-INF)) or isNAN(F(S0.f16))): + D0.f16 = S0.f16 + else: + D0.f16 = mantissa(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_FREXP_EXP_I16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f16) == +INF) || (64'F(S0.f16) == -INF) || isNAN(64'F(S0.f16))) then + # D0.i16 = 16'0 + # else + # D0.i16 = 16'I(exponent(S0.f16) - 15 + 1) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = 
SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f16) == INF) or (F(S0.f16) == (-INF)) or isNAN(F(S0.f16))): + D0.i16 = 0 + else: + D0.i16 = (exponent(S0.f16) - 15 + 1) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_FLOOR_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = trunc(S0.f16); + # if ((S0.f16 < 16'0.0) && (S0.f16 != D0.f16)) then + # D0.f16 += -16'1.0 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = trunc(S0.f16) + if ((S0.f16 < 0.0) and (S0.f16 != D0.f16)): + D0.f16 += -1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_CEIL_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = trunc(S0.f16); + # if ((S0.f16 > 16'0.0) && (S0.f16 != D0.f16)) then + # D0.f16 += 16'1.0 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = trunc(S0.f16) + if ((S0.f16 > 0.0) and (S0.f16 != D0.f16)): + D0.f16 += 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_TRUNC_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = trunc(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = trunc(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_RNDNE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = floor(S0.f16 + 16'0.5); + # if (isEven(64'F(floor(S0.f16))) && (fract(S0.f16) == 16'0.5)) then + # D0.f16 -= 16'1.0 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), 
SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = floor(S0.f16 + 0.5) + if (isEven(F(floor(S0.f16))) and (fract(S0.f16) == 0.5)): + D0.f16 -= 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_FRACT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 + -floor(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 + -floor(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_SIN_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = sin(S0.f16 * 16'F(PI * 2.0)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = sin(S0.f16 * F(PI * 2.0)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_COS_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = cos(S0.f16 * 16'F(PI * 2.0)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = cos(S0.f16 * F(PI * 2.0)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_CVT_NORM_I16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i16 = f16_to_snorm(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i16 = f16_to_snorm(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + 
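# exec is reported back only when the instruction actually modified the mask +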
if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_CVT_NORM_U16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = f16_to_unorm(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = f16_to_unorm(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_SWAP_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = D0.b32; + # D0.b32 = S0.b32; + # S0.b32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(D0.b32) + D0.b32 = S0.b32 + S0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_SWAP_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = D0.b16; + # D0.b16 = S0.b16; + # S0.b16 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(D0.b16) + D0.b16 = S0.b16 + S0.b16 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_NOT_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = ~S0.u16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = ~S0.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_CVT_I32_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I(signext(S0.i16)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, 
saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (signext(S0.i16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP1Op_V_CVT_U32_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0 = { 16'0, S0.u16 } + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0 = Reg(_pack(0, S0.u16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +VOP1Op_FUNCTIONS = { + VOP1Op.V_MOV_B32: _VOP1Op_V_MOV_B32, + VOP1Op.V_READFIRSTLANE_B32: _VOP1Op_V_READFIRSTLANE_B32, + VOP1Op.V_CVT_I32_F64: _VOP1Op_V_CVT_I32_F64, + VOP1Op.V_CVT_F64_I32: _VOP1Op_V_CVT_F64_I32, + VOP1Op.V_CVT_F32_I32: _VOP1Op_V_CVT_F32_I32, + VOP1Op.V_CVT_F32_U32: _VOP1Op_V_CVT_F32_U32, + VOP1Op.V_CVT_U32_F32: _VOP1Op_V_CVT_U32_F32, + VOP1Op.V_CVT_I32_F32: _VOP1Op_V_CVT_I32_F32, + VOP1Op.V_CVT_F16_F32: _VOP1Op_V_CVT_F16_F32, + VOP1Op.V_CVT_F32_F16: _VOP1Op_V_CVT_F32_F16, + VOP1Op.V_CVT_NEAREST_I32_F32: _VOP1Op_V_CVT_NEAREST_I32_F32, + VOP1Op.V_CVT_FLOOR_I32_F32: _VOP1Op_V_CVT_FLOOR_I32_F32, + VOP1Op.V_CVT_F32_F64: _VOP1Op_V_CVT_F32_F64, + VOP1Op.V_CVT_F64_F32: _VOP1Op_V_CVT_F64_F32, + VOP1Op.V_CVT_F32_UBYTE0: _VOP1Op_V_CVT_F32_UBYTE0, + VOP1Op.V_CVT_F32_UBYTE1: _VOP1Op_V_CVT_F32_UBYTE1, + VOP1Op.V_CVT_F32_UBYTE2: _VOP1Op_V_CVT_F32_UBYTE2, + VOP1Op.V_CVT_F32_UBYTE3: _VOP1Op_V_CVT_F32_UBYTE3, + VOP1Op.V_CVT_U32_F64: _VOP1Op_V_CVT_U32_F64, + VOP1Op.V_CVT_F64_U32: _VOP1Op_V_CVT_F64_U32, + VOP1Op.V_TRUNC_F64: _VOP1Op_V_TRUNC_F64, + VOP1Op.V_CEIL_F64: _VOP1Op_V_CEIL_F64, + VOP1Op.V_RNDNE_F64: _VOP1Op_V_RNDNE_F64, + VOP1Op.V_FLOOR_F64: _VOP1Op_V_FLOOR_F64, + VOP1Op.V_MOV_B16: _VOP1Op_V_MOV_B16, + VOP1Op.V_FRACT_F32: _VOP1Op_V_FRACT_F32, + VOP1Op.V_TRUNC_F32: _VOP1Op_V_TRUNC_F32, + VOP1Op.V_CEIL_F32: _VOP1Op_V_CEIL_F32, + VOP1Op.V_RNDNE_F32: _VOP1Op_V_RNDNE_F32, + VOP1Op.V_FLOOR_F32: _VOP1Op_V_FLOOR_F32, + VOP1Op.V_EXP_F32: _VOP1Op_V_EXP_F32, + VOP1Op.V_LOG_F32: _VOP1Op_V_LOG_F32, + VOP1Op.V_RCP_F32: _VOP1Op_V_RCP_F32, + VOP1Op.V_RCP_IFLAG_F32: _VOP1Op_V_RCP_IFLAG_F32, + VOP1Op.V_RSQ_F32: _VOP1Op_V_RSQ_F32, + VOP1Op.V_RCP_F64: _VOP1Op_V_RCP_F64, + VOP1Op.V_RSQ_F64: _VOP1Op_V_RSQ_F64, + VOP1Op.V_SQRT_F32: _VOP1Op_V_SQRT_F32, + VOP1Op.V_SQRT_F64: _VOP1Op_V_SQRT_F64, + VOP1Op.V_SIN_F32: _VOP1Op_V_SIN_F32, + VOP1Op.V_COS_F32: _VOP1Op_V_COS_F32, + VOP1Op.V_NOT_B32: _VOP1Op_V_NOT_B32, + VOP1Op.V_BFREV_B32: _VOP1Op_V_BFREV_B32, + VOP1Op.V_CLZ_I32_U32: _VOP1Op_V_CLZ_I32_U32, + VOP1Op.V_CTZ_I32_B32: _VOP1Op_V_CTZ_I32_B32, + VOP1Op.V_CLS_I32: _VOP1Op_V_CLS_I32, + VOP1Op.V_FREXP_EXP_I32_F64: _VOP1Op_V_FREXP_EXP_I32_F64, + VOP1Op.V_FREXP_MANT_F64: _VOP1Op_V_FREXP_MANT_F64, + VOP1Op.V_FRACT_F64: _VOP1Op_V_FRACT_F64, + VOP1Op.V_FREXP_EXP_I32_F32: _VOP1Op_V_FREXP_EXP_I32_F32, + VOP1Op.V_FREXP_MANT_F32: _VOP1Op_V_FREXP_MANT_F32, + 
VOP1Op.V_MOVRELS_B32: _VOP1Op_V_MOVRELS_B32, + VOP1Op.V_CVT_F16_U16: _VOP1Op_V_CVT_F16_U16, + VOP1Op.V_CVT_F16_I16: _VOP1Op_V_CVT_F16_I16, + VOP1Op.V_CVT_U16_F16: _VOP1Op_V_CVT_U16_F16, + VOP1Op.V_CVT_I16_F16: _VOP1Op_V_CVT_I16_F16, + VOP1Op.V_RCP_F16: _VOP1Op_V_RCP_F16, + VOP1Op.V_SQRT_F16: _VOP1Op_V_SQRT_F16, + VOP1Op.V_RSQ_F16: _VOP1Op_V_RSQ_F16, + VOP1Op.V_LOG_F16: _VOP1Op_V_LOG_F16, + VOP1Op.V_EXP_F16: _VOP1Op_V_EXP_F16, + VOP1Op.V_FREXP_MANT_F16: _VOP1Op_V_FREXP_MANT_F16, + VOP1Op.V_FREXP_EXP_I16_F16: _VOP1Op_V_FREXP_EXP_I16_F16, + VOP1Op.V_FLOOR_F16: _VOP1Op_V_FLOOR_F16, + VOP1Op.V_CEIL_F16: _VOP1Op_V_CEIL_F16, + VOP1Op.V_TRUNC_F16: _VOP1Op_V_TRUNC_F16, + VOP1Op.V_RNDNE_F16: _VOP1Op_V_RNDNE_F16, + VOP1Op.V_FRACT_F16: _VOP1Op_V_FRACT_F16, + VOP1Op.V_SIN_F16: _VOP1Op_V_SIN_F16, + VOP1Op.V_COS_F16: _VOP1Op_V_COS_F16, + VOP1Op.V_CVT_NORM_I16_F16: _VOP1Op_V_CVT_NORM_I16_F16, + VOP1Op.V_CVT_NORM_U16_F16: _VOP1Op_V_CVT_NORM_U16_F16, + VOP1Op.V_SWAP_B32: _VOP1Op_V_SWAP_B32, + VOP1Op.V_SWAP_B16: _VOP1Op_V_SWAP_B16, + VOP1Op.V_NOT_B16: _VOP1Op_V_NOT_B16, + VOP1Op.V_CVT_I32_I16: _VOP1Op_V_CVT_I32_I16, + VOP1Op.V_CVT_U32_U16: _VOP1Op_V_CVT_U32_U16, +} + +def _VOP2Op_V_CNDMASK_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = VCC.u64[laneId] ? S1.u32 : S0.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S1.u32) if (VCC.u64[laneId]) else (S0.u32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_DOT2ACC_F32_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = D0.f32; + # tmp += f16_to_f32(S0[15 : 0].f16) * f16_to_f32(S1[15 : 0].f16); + # tmp += f16_to_f32(S0[31 : 16].f16) * f16_to_f32(S1[31 : 16].f16); + # D0.f32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(D0.f32) + tmp += f16_to_f32(S0[15 : 0].f16) * f16_to_f32(S1[15 : 0].f16) + tmp += f16_to_f32(S0[31 : 16].f16) * f16_to_f32(S1[31 : 16].f16) + D0.f32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_ADD_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S0.f32 + S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S0.f32 + 
S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_SUB_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S0.f32 - S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S0.f32 - S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_SUBREV_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S1.f32 - S0.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S1.f32 - S0.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_FMAC_DX9_ZERO_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f32) == 0.0) || (64'F(S1.f32) == 0.0)) then + # // DX9 rules, 0.0 * x = 0.0 + # D0.f32 = S2.f32 + # else + # D0.f32 = fma(S0.f32, S1.f32, D0.f32) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f32) == 0.0) or (F(S1.f32) == 0.0)): + D0.f32 = S2.f32 + else: + D0.f32 = fma(S0.f32, S1.f32, D0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MUL_DX9_ZERO_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f32) == 0.0) || (64'F(S1.f32) == 0.0)) then + # // DX9 rules, 0.0 * x = 0.0 + # D0.f32 = 0.0F + # else + # D0.f32 = S0.f32 * S1.f32 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f32) == 0.0) or (F(S1.f32) == 0.0)): + D0.f32 = 0.0 + else: + D0.f32 = S0.f32 * S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + 
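# the per-lane VCC bit is reported back below only when the instruction changed VCC +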
if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MUL_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S0.f32 * S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S0.f32 * S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MUL_I32_I24(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I(S0.i24) * 32'I(S1.i24) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (S0.i24) * (S1.i24) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MUL_HI_I32_I24(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I((64'I(S0.i24) * 64'I(S1.i24)) >> 32U) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (((S0.i24) * (S1.i24)) >> 32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MUL_U32_U24(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = 32'U(S0.u24) * 32'U(S1.u24) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u24) * (S1.u24) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MUL_HI_U32_U24(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = 32'U((64'U(S0.u24) * 64'U(S1.u24)) >> 32U) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, 
EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (((S0.u24) * (S1.u24)) >> 32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MIN_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where -0.0 < +0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(64'F(S0.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S0.f32))) + # elsif isSignalNAN(64'F(S1.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32))) + # elsif isQuietNAN(64'F(S1.f32)) then + # D0.f32 = S0.f32 + # elsif isQuietNAN(64'F(S0.f32)) then + # D0.f32 = S1.f32 + # elsif LT_NEG_ZERO(S0.f32, S1.f32) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f32 = S0.f32 + # else + # D0.f32 = S1.f32 + # endif + # else + # if isNAN(64'F(S1.f32)) then + # D0.f32 = S0.f32 + # elsif isNAN(64'F(S0.f32)) then + # D0.f32 = S1.f32 + # elsif LT_NEG_ZERO(S0.f32, S1.f32) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f32 = S0.f32 + # else + # D0.f32 = S1.f32 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(F(S0.f32)): + D0.f32 = F(cvtToQuietNAN(F(S0.f32))) + elif isSignalNAN(F(S1.f32)): + D0.f32 = F(cvtToQuietNAN(F(S1.f32))) + elif isQuietNAN(F(S1.f32)): + D0.f32 = S0.f32 + elif isQuietNAN(F(S0.f32)): + D0.f32 = S1.f32 + elif LT_NEG_ZERO(S0.f32, S1.f32): + D0.f32 = S0.f32 + else: + D0.f32 = S1.f32 + else: + if isNAN(F(S1.f32)): + D0.f32 = S0.f32 + elif isNAN(F(S0.f32)): + D0.f32 = S1.f32 + elif LT_NEG_ZERO(S0.f32, S1.f32): + D0.f32 = S0.f32 + else: + D0.f32 = S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MAX_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where +0.0 > -0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(64'F(S0.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S0.f32))) + # elsif isSignalNAN(64'F(S1.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32))) + # elsif isQuietNAN(64'F(S1.f32)) then + # D0.f32 = S0.f32 + # elsif isQuietNAN(64'F(S0.f32)) then + # D0.f32 = S1.f32 + # elsif GT_NEG_ZERO(S0.f32, S1.f32) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f32 = S0.f32 + # else + # D0.f32 = S1.f32 + # endif + # else + # if isNAN(64'F(S1.f32)) then + # D0.f32 = S0.f32 + # elsif isNAN(64'F(S0.f32)) then + # D0.f32 = S1.f32 + # elsif GT_NEG_ZERO(S0.f32, S1.f32) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f32 = S0.f32 + # else + # D0.f32 = S1.f32 + # endif + # endif; + # // Inequalities in the above pseudocode behave 
differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(F(S0.f32)): + D0.f32 = F(cvtToQuietNAN(F(S0.f32))) + elif isSignalNAN(F(S1.f32)): + D0.f32 = F(cvtToQuietNAN(F(S1.f32))) + elif isQuietNAN(F(S1.f32)): + D0.f32 = S0.f32 + elif isQuietNAN(F(S0.f32)): + D0.f32 = S1.f32 + elif GT_NEG_ZERO(S0.f32, S1.f32): + D0.f32 = S0.f32 + else: + D0.f32 = S1.f32 + else: + if isNAN(F(S1.f32)): + D0.f32 = S0.f32 + elif isNAN(F(S0.f32)): + D0.f32 = S1.f32 + elif GT_NEG_ZERO(S0.f32, S1.f32): + D0.f32 = S0.f32 + else: + D0.f32 = S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MIN_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = S0.i32 < S1.i32 ? S0.i32 : S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = ((S0.i32) if (S0.i32 < S1.i32) else (S1.i32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MAX_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = S0.i32 >= S1.i32 ? S0.i32 : S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = ((S0.i32) if (S0.i32 >= S1.i32) else (S1.i32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MIN_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = S0.u32 < S1.u32 ? 
S0.u32 : S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S0.u32) if (S0.u32 < S1.u32) else (S1.u32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MAX_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = S0.u32 >= S1.u32 ? S0.u32 : S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S0.u32) if (S0.u32 >= S1.u32) else (S1.u32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_LSHLREV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S1.u32 << S0[4 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S1.u32 << S0[4 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_LSHRREV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S1.u32 >> S0[4 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S1.u32 >> S0[4 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_ASHRREV_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = (S1.i32 >> S0[4 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = 
(S1.i32 >> S0[4 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_AND_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 & S1.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 & S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_OR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 | S1.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 | S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_XOR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 ^ S1.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 ^ S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_XNOR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ~(S0.u32 ^ S1.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ~(S0.u32 ^ S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_ADD_CO_CI_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = 64'U(S0.u32) + 64'U(S1.u32) + VCC.u64[laneId].u64; + # VCC.u64[laneId] = tmp >= 0x100000000ULL ? 
1'1U : 1'0U; + # // VCC is an UNSIGNED overflow/carry-out for V_ADD_CO_CI_U32. + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg((S0.u32) + (S1.u32) + VCC.u64[laneId]) + VCC.u64[laneId] = ((1) if (tmp >= 0x100000000) else (0)) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_SUB_CO_CI_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = S0.u32 - S1.u32 - VCC.u64[laneId].u32; + # VCC.u64[laneId] = 64'U(S1.u32) + VCC.u64[laneId].u64 > 64'U(S0.u32) ? 1'1U : 1'0U; + # // VCC is an UNSIGNED overflow/carry-out for V_SUB_CO_CI_U32. + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S0.u32 - S1.u32 - VCC.u64[laneId]) + VCC.u64[laneId] = ((1) if ((S1.u32) + VCC.u64[laneId] > (S0.u32)) else (0)) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_SUBREV_CO_CI_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = S1.u32 - S0.u32 - VCC.u64[laneId].u32; + # VCC.u64[laneId] = 64'U(S0.u32) + VCC.u64[laneId].u64 > 64'U(S1.u32) ? 1'1U : 1'0U; + # // VCC is an UNSIGNED overflow/carry-out for V_SUBREV_CO_CI_U32. 
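+ # worked borrow example (assumed values): S0=1, S1=0, carry-in=0 -> tmp = S1 - S0 wraps to 0xffffffff and the lane's VCC bit is set, since S0 + carry-in (1) > S1 (0)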
+ # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S1.u32 - S0.u32 - VCC.u64[laneId]) + VCC.u64[laneId] = ((1) if ((S0.u32) + VCC.u64[laneId] > (S1.u32)) else (0)) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_ADD_NC_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = S0.u32 + S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = S0.u32 + S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_SUB_NC_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = S0.u32 - S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = S0.u32 - S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_SUBREV_NC_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = S1.u32 - S0.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = S1.u32 - S0.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_FMAC_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = fma(S0.f32, S1.f32, D0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + 
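# note: fma(a, b, c) is a fused multiply-add, a*b + c with a single rounding of the final sum; D0 serves as both the accumulator input and the destination +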
D0.f32 = fma(S0.f32, S1.f32, D0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_FMAMK_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = fma(S0.f32, SIMM32.f32, S1.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = fma(S0.f32, SIMM32.f32, S1.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_FMAAK_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = fma(S0.f32, S1.f32, SIMM32.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = fma(S0.f32, S1.f32, SIMM32.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_CVT_PK_RTZ_F16_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # prev_mode = ROUND_MODE; + # tmp[15 : 0].f16 = f32_to_f16(S0.f32); + # tmp[31 : 16].f16 = f32_to_f16(S1.f32); + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + prev_mode = ROUND_MODE + tmp[15 : 0].f16 = f32_to_f16(S0.f32) + tmp[31 : 16].f16 = f32_to_f16(S1.f32) + D0.u32 = tmp.u32  # assumed completion: store the packed pair, per the ISA doc's trailing 'D0 = tmp'; without it the op writes no result + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_ADD_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 + S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 + S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_SUB_F16(s0, s1, s2, d0, scc, 
vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 - S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 - S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_SUBREV_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S1.f16 - S0.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S1.f16 - S0.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MUL_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 * S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 * S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_FMAC_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = fma(S0.f16, S1.f16, D0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = fma(S0.f16, S1.f16, D0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_FMAMK_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = fma(S0.f16, SIMM32.f16, S1.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = fma(S0.f16, 
SIMM32.f16, S1.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_FMAAK_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = fma(S0.f16, S1.f16, SIMM32.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = fma(S0.f16, S1.f16, SIMM32.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MAX_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where +0.0 > -0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(64'F(S0.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S0.f16))) + # elsif isSignalNAN(64'F(S1.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16))) + # elsif isQuietNAN(64'F(S1.f16)) then + # D0.f16 = S0.f16 + # elsif isQuietNAN(64'F(S0.f16)) then + # D0.f16 = S1.f16 + # elsif GT_NEG_ZERO(S0.f16, S1.f16) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f16 = S0.f16 + # else + # D0.f16 = S1.f16 + # endif + # else + # if isNAN(64'F(S1.f16)) then + # D0.f16 = S0.f16 + # elsif isNAN(64'F(S0.f16)) then + # D0.f16 = S1.f16 + # elsif GT_NEG_ZERO(S0.f16, S1.f16) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f16 = S0.f16 + # else + # D0.f16 = S1.f16 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(F(S0.f16)): + D0.f16 = F(cvtToQuietNAN(F(S0.f16))) + elif isSignalNAN(F(S1.f16)): + D0.f16 = F(cvtToQuietNAN(F(S1.f16))) + elif isQuietNAN(F(S1.f16)): + D0.f16 = S0.f16 + elif isQuietNAN(F(S0.f16)): + D0.f16 = S1.f16 + elif GT_NEG_ZERO(S0.f16, S1.f16): + D0.f16 = S0.f16 + else: + D0.f16 = S1.f16 + else: + if isNAN(F(S1.f16)): + D0.f16 = S0.f16 + elif isNAN(F(S0.f16)): + D0.f16 = S1.f16 + elif GT_NEG_ZERO(S0.f16, S1.f16): + D0.f16 = S0.f16 + else: + D0.f16 = S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_MIN_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where -0.0 < +0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(64'F(S0.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S0.f16))) + # elsif isSignalNAN(64'F(S1.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16))) + # elsif isQuietNAN(64'F(S1.f16)) then + # D0.f16 = S0.f16 + # 
elsif isQuietNAN(64'F(S0.f16)) then + # D0.f16 = S1.f16 + # elsif LT_NEG_ZERO(S0.f16, S1.f16) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f16 = S0.f16 + # else + # D0.f16 = S1.f16 + # endif + # else + # if isNAN(64'F(S1.f16)) then + # D0.f16 = S0.f16 + # elsif isNAN(64'F(S0.f16)) then + # D0.f16 = S1.f16 + # elsif LT_NEG_ZERO(S0.f16, S1.f16) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f16 = S0.f16 + # else + # D0.f16 = S1.f16 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(F(S0.f16)): + D0.f16 = F(cvtToQuietNAN(F(S0.f16))) + elif isSignalNAN(F(S1.f16)): + D0.f16 = F(cvtToQuietNAN(F(S1.f16))) + elif isQuietNAN(F(S1.f16)): + D0.f16 = S0.f16 + elif isQuietNAN(F(S0.f16)): + D0.f16 = S1.f16 + elif LT_NEG_ZERO(S0.f16, S1.f16): + D0.f16 = S0.f16 + else: + D0.f16 = S1.f16 + else: + if isNAN(F(S1.f16)): + D0.f16 = S0.f16 + elif isNAN(F(S0.f16)): + D0.f16 = S1.f16 + elif LT_NEG_ZERO(S0.f16, S1.f16): + D0.f16 = S0.f16 + else: + D0.f16 = S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_LDEXP_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 * 16'F(2.0F ** 32'I(S1.i16)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 * F(2.0 ** (S1.i16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP2Op_V_PK_FMAC_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0[31 : 16].f16 = fma(S0[31 : 16].f16, S1[31 : 16].f16, D0[31 : 16].f16); + # D0[15 : 0].f16 = fma(S0[15 : 0].f16, S1[15 : 0].f16, D0[15 : 0].f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0[31 : 16].f16 = fma(S0[31 : 16].f16, S1[31 : 16].f16, D0[31 : 16].f16) + D0[15 : 0].f16 = fma(S0[15 : 0].f16, S1[15 : 0].f16, D0[15 : 0].f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +VOP2Op_FUNCTIONS = { + VOP2Op.V_CNDMASK_B32: _VOP2Op_V_CNDMASK_B32, + VOP2Op.V_DOT2ACC_F32_F16: _VOP2Op_V_DOT2ACC_F32_F16, 
+ VOP2Op.V_ADD_F32: _VOP2Op_V_ADD_F32, + VOP2Op.V_SUB_F32: _VOP2Op_V_SUB_F32, + VOP2Op.V_SUBREV_F32: _VOP2Op_V_SUBREV_F32, + VOP2Op.V_FMAC_DX9_ZERO_F32: _VOP2Op_V_FMAC_DX9_ZERO_F32, + VOP2Op.V_MUL_DX9_ZERO_F32: _VOP2Op_V_MUL_DX9_ZERO_F32, + VOP2Op.V_MUL_F32: _VOP2Op_V_MUL_F32, + VOP2Op.V_MUL_I32_I24: _VOP2Op_V_MUL_I32_I24, + VOP2Op.V_MUL_HI_I32_I24: _VOP2Op_V_MUL_HI_I32_I24, + VOP2Op.V_MUL_U32_U24: _VOP2Op_V_MUL_U32_U24, + VOP2Op.V_MUL_HI_U32_U24: _VOP2Op_V_MUL_HI_U32_U24, + VOP2Op.V_MIN_F32: _VOP2Op_V_MIN_F32, + VOP2Op.V_MAX_F32: _VOP2Op_V_MAX_F32, + VOP2Op.V_MIN_I32: _VOP2Op_V_MIN_I32, + VOP2Op.V_MAX_I32: _VOP2Op_V_MAX_I32, + VOP2Op.V_MIN_U32: _VOP2Op_V_MIN_U32, + VOP2Op.V_MAX_U32: _VOP2Op_V_MAX_U32, + VOP2Op.V_LSHLREV_B32: _VOP2Op_V_LSHLREV_B32, + VOP2Op.V_LSHRREV_B32: _VOP2Op_V_LSHRREV_B32, + VOP2Op.V_ASHRREV_I32: _VOP2Op_V_ASHRREV_I32, + VOP2Op.V_AND_B32: _VOP2Op_V_AND_B32, + VOP2Op.V_OR_B32: _VOP2Op_V_OR_B32, + VOP2Op.V_XOR_B32: _VOP2Op_V_XOR_B32, + VOP2Op.V_XNOR_B32: _VOP2Op_V_XNOR_B32, + VOP2Op.V_ADD_CO_CI_U32: _VOP2Op_V_ADD_CO_CI_U32, + VOP2Op.V_SUB_CO_CI_U32: _VOP2Op_V_SUB_CO_CI_U32, + VOP2Op.V_SUBREV_CO_CI_U32: _VOP2Op_V_SUBREV_CO_CI_U32, + VOP2Op.V_ADD_NC_U32: _VOP2Op_V_ADD_NC_U32, + VOP2Op.V_SUB_NC_U32: _VOP2Op_V_SUB_NC_U32, + VOP2Op.V_SUBREV_NC_U32: _VOP2Op_V_SUBREV_NC_U32, + VOP2Op.V_FMAC_F32: _VOP2Op_V_FMAC_F32, + VOP2Op.V_FMAMK_F32: _VOP2Op_V_FMAMK_F32, + VOP2Op.V_FMAAK_F32: _VOP2Op_V_FMAAK_F32, + VOP2Op.V_CVT_PK_RTZ_F16_F32: _VOP2Op_V_CVT_PK_RTZ_F16_F32, + VOP2Op.V_ADD_F16: _VOP2Op_V_ADD_F16, + VOP2Op.V_SUB_F16: _VOP2Op_V_SUB_F16, + VOP2Op.V_SUBREV_F16: _VOP2Op_V_SUBREV_F16, + VOP2Op.V_MUL_F16: _VOP2Op_V_MUL_F16, + VOP2Op.V_FMAC_F16: _VOP2Op_V_FMAC_F16, + VOP2Op.V_FMAMK_F16: _VOP2Op_V_FMAMK_F16, + VOP2Op.V_FMAAK_F16: _VOP2Op_V_FMAAK_F16, + VOP2Op.V_MAX_F16: _VOP2Op_V_MAX_F16, + VOP2Op.V_MIN_F16: _VOP2Op_V_MIN_F16, + VOP2Op.V_LDEXP_F16: _VOP2Op_V_LDEXP_F16, + VOP2Op.V_PK_FMAC_F16: _VOP2Op_V_PK_FMAC_F16, +} + +def _VOP3Op_V_MOV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.b32 = S0.b32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.b32 = S0.b32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_READFIRSTLANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare lane : 32'U; + # if WAVE64 then + # // 64 lanes + # if EXEC == 0x0LL then + # lane = 0U; + # // Force lane 0 if all lanes are disabled + # else + # lane = 32'U(s_ff1_i32_b64(EXEC)); + # // Lowest active lane + # endif + # else + # // 32 lanes + # if EXEC_LO.i32 == 0 then + # lane = 0U; + # // Force lane 0 if all lanes are disabled + # else + # lane = 32'U(s_ff1_i32_b32(EXEC_LO)); + # // Lowest active lane + # endif + # endif; + # D0.b32 = VGPR[lane][SRC0.u32] + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, 
SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE64: + if EXEC == 0x0: + lane = 0 + else: + lane = (s_ff1_i32_b64(EXEC)) + else: + if EXEC_LO.i32 == 0: + lane = 0 + else: + lane = (s_ff1_i32_b32(EXEC_LO)) + D0.b32 = VGPR[lane][SRC0.u32] + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_I32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = f64_to_i32(S0.f64) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = f64_to_i32(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_F64_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = i32_to_f64(S0.i32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = i32_to_f64(S0.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_CVT_F32_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = i32_to_f32(S0.i32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = i32_to_f32(S0.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_F32_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = u32_to_f32(S0.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = u32_to_f32(S0.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != 
exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_U32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = f32_to_u32(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = f32_to_u32(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_I32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = f32_to_i32(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = f32_to_i32(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_F16_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = f32_to_f16(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = f32_to_f16(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_F32_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = f16_to_f32(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = f16_to_f32(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_NEAREST_I32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = f32_to_i32(floor(S0.f32 + 0.5F)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = 
Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = f32_to_i32(floor(S0.f32 + 0.5)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_FLOOR_I32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = f32_to_i32(floor(S0.f32)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = f32_to_i32(floor(S0.f32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_F32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = f64_to_f32(S0.f64) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = f64_to_f32(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_F64_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = f32_to_f64(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = f32_to_f64(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_CVT_F32_UBYTE0(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = u32_to_f32(S0[7 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = u32_to_f32(S0[7 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_F32_UBYTE1(s0, s1, s2, d0, scc, vcc, lane, exec_mask, 
literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = u32_to_f32(S0[15 : 8].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = u32_to_f32(S0[15 : 8].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_F32_UBYTE2(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = u32_to_f32(S0[23 : 16].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = u32_to_f32(S0[23 : 16].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_F32_UBYTE3(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = u32_to_f32(S0[31 : 24].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = u32_to_f32(S0[31 : 24].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_U32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = f64_to_u32(S0.f64) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = f64_to_u32(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_F64_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = u32_to_f64(S0.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled 
pseudocode --- + D0.f64 = u32_to_f64(S0.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_TRUNC_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = trunc(S0.f64) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = trunc(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_CEIL_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = trunc(S0.f64); + # if ((S0.f64 > 0.0) && (S0.f64 != D0.f64)) then + # D0.f64 += 1.0 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = trunc(S0.f64) + if ((S0.f64 > 0.0) and (S0.f64 != D0.f64)): + D0.f64 += 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_RNDNE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = floor(S0.f64 + 0.5); + # if (isEven(floor(S0.f64)) && (fract(S0.f64) == 0.5)) then + # D0.f64 -= 1.0 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = floor(S0.f64 + 0.5) + if (isEven(floor(S0.f64)) and (fract(S0.f64) == 0.5)): + D0.f64 -= 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_FLOOR_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = trunc(S0.f64); + # if ((S0.f64 < 0.0) && (S0.f64 != D0.f64)) then + # D0.f64 += -1.0 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = trunc(S0.f64) + 
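# trunc() rounds toward zero, so a negative non-integer lands one above its floor (e.g. trunc(-2.5) = -2.0); the test below subtracts 1.0 to recover floor(-2.5) = -3.0 +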
if ((S0.f64 < 0.0) and (S0.f64 != D0.f64)): + D0.f64 += -1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_MOV_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.b16 = S0.b16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.b16 = S0.b16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FRACT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S0.f32 + -floor(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S0.f32 + -floor(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_TRUNC_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = trunc(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = trunc(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CEIL_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = trunc(S0.f32); + # if ((S0.f32 > 0.0F) && (S0.f32 != D0.f32)) then + # D0.f32 += 1.0F + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = trunc(S0.f32) + if ((S0.f32 > 0.0) and (S0.f32 != D0.f32)): + D0.f32 += 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_RNDNE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, 
src0_idx=0, vdst_idx=0): + # D0.f32 = floor(S0.f32 + 0.5F); + # if (isEven(64'F(floor(S0.f32))) && (fract(S0.f32) == 0.5F)) then + # D0.f32 -= 1.0F + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = floor(S0.f32 + 0.5) + if (isEven(F(floor(S0.f32))) and (fract(S0.f32) == 0.5)): + D0.f32 -= 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FLOOR_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = trunc(S0.f32); + # if ((S0.f32 < 0.0F) && (S0.f32 != D0.f32)) then + # D0.f32 += -1.0F + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = trunc(S0.f32) + if ((S0.f32 < 0.0) and (S0.f32 != D0.f32)): + D0.f32 += -1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_EXP_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = pow(2.0F, S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = pow(2.0, S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_LOG_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = log2(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = log2(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_RCP_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = 1.0F / S0.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 
0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = 1.0 / S0.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_RCP_IFLAG_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = 1.0F / S0.f32; + # // Can only raise integer DIV_BY_ZERO exception + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = 1.0 / S0.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_RSQ_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = 1.0F / sqrt(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = 1.0 / sqrt(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_RCP_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = 1.0 / S0.f64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = 1.0 / S0.f64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_RSQ_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = 1.0 / sqrt(S0.f64) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = 1.0 / sqrt(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = 
True + return result + +def _VOP3Op_V_SQRT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = sqrt(S0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = sqrt(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SQRT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = sqrt(S0.f64) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = sqrt(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_SIN_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = sin(S0.f32 * 32'F(PI * 2.0)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = sin(S0.f32 * F(PI * 2.0)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_COS_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = cos(S0.f32 * 32'F(PI * 2.0)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = cos(S0.f32 * F(PI * 2.0)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_NOT_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ~S0.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = 
Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ~S0.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_BFREV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32[31 : 0] = S0.u32[0 : 31] + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32[31 : 0] = S0.u32[0 : 31] + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CLZ_I32_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = -1; + # // Set if no ones are found + # for i in 0 : 31 do + # // Search from MSB + # if S0.u32[31 - i] == 1'1U then + # D0.i32 = i; + # endif + # endfor + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = -1 + for i in range(0, int(31)+1): + if S0.u32[31 - i] == 1: + D0.i32 = i; break # Stop at first 1 bit found + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CTZ_I32_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = -1; + # // Set if no ones are found + # for i in 0 : 31 do + # // Search from LSB + # if S0.u32[i] == 1'1U then + # D0.i32 = i; + # endif + # endfor + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = -1 + for i in range(0, int(31)+1): + if S0.u32[i] == 1: + D0.i32 = i; break # Stop at first 1 bit found + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CLS_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = -1; + # // Set if all bits are the same + # for i in 1 : 31 do + # // Search from MSB + # if S0.i32[31 - i] != S0.i32[31] then + # D0.i32 = i; + # endif + # endfor + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) 
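+ # count-leading-sign-bits: the scan below yields the MSB-relative index of the first bit that differs from sign bit 31 (e.g. an assumed input 0xFFFF0000 gives D0 = 16), and D0 stays -1 when all 32 bits match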
+ laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = -1 + for i in range(1, int(31)+1): + if S0.i32[31 - i] != S0.i32[31]: + D0.i32 = i; break # Stop at first bit that differs from the sign bit + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FREXP_EXP_I32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((S0.f64 == +INF) || (S0.f64 == -INF) || isNAN(S0.f64)) then + # D0.i32 = 0 + # else + # D0.i32 = exponent(S0.f64) - 1023 + 1 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((S0.f64 == INF) or (S0.f64 == (-INF)) or isNAN(S0.f64)): + D0.i32 = 0 + else: + D0.i32 = exponent(S0.f64) - 1023 + 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FREXP_MANT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((S0.f64 == +INF) || (S0.f64 == -INF) || isNAN(S0.f64)) then + # D0.f64 = S0.f64 + # else + # D0.f64 = mantissa(S0.f64) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((S0.f64 == INF) or (S0.f64 == (-INF)) or isNAN(S0.f64)): + D0.f64 = S0.f64 + else: + D0.f64 = mantissa(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_FRACT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = S0.f64 + -floor(S0.f64) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = S0.f64 + -floor(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_FREXP_EXP_I32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f32) == +INF) || (64'F(S0.f32) == -INF) || isNAN(64'F(S0.f32))) then + # D0.i32 = 0 + # else + # D0.i32 = exponent(S0.f32) - 127 + 1 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) +
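The V_FREXP_* pair mirrors C's frexp. Assuming the ISA's exponent()/mantissa() normalize to [0.5, 1.0), which the `- 1023 + 1` rebias implies, math.frexp reproduces both outputs; a sketch with hypothetical helper names:

```python
import math

def v_frexp_f64(s0: float) -> tuple[float, int]:
  # Inf/NaN pass through with a zero exponent, per the pseudocode above
  if math.isinf(s0) or math.isnan(s0): return s0, 0
  mant, exp = math.frexp(s0)   # s0 == mant * 2**exp, mant in [0.5, 1.0)
  return mant, exp

assert v_frexp_f64(8.0) == (0.5, 4)          # exponent(8.0) - 1023 + 1 == 4
assert v_frexp_f64(float('inf'))[1] == 0
```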
SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f32) == INF) or (F(S0.f32) == (-INF)) or isNAN(F(S0.f32))): + D0.i32 = 0 + else: + D0.i32 = exponent(S0.f32) - 127 + 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FREXP_MANT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f32) == +INF) || (64'F(S0.f32) == -INF) || isNAN(64'F(S0.f32))) then + # D0.f32 = S0.f32 + # else + # D0.f32 = mantissa(S0.f32) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f32) == INF) or (F(S0.f32) == (-INF)) or isNAN(F(S0.f32))): + D0.f32 = S0.f32 + else: + D0.f32 = mantissa(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MOVRELS_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # addr = SRC0.u32; + # // Raw value from instruction + # D0.b32 = VGPR[laneId][addr].b32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + addr = SRC0.u32 + D0.b32 = VGPR[laneId][addr].b32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_F16_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = u16_to_f16(S0.u16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = u16_to_f16(S0.u16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_F16_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = i16_to_f16(S0.i16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 
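u16_to_f16 and its relatives convert between raw 16-bit patterns and binary16 values. Python's struct format 'e' can express the same round trip; a hedged sketch (the NaN/negative/saturation policy here is an assumption, not read out of the generated file):

```python
import struct

def u16_to_f16_bits(v: int) -> int:
  # nearest binary16 to an unsigned 16-bit integer, returned as raw bits
  # (values above 65504 would additionally need explicit saturation)
  return struct.unpack('<H', struct.pack('<e', float(v & 0xffff)))[0]

def f16_bits_to_u16(bits: int) -> int:
  f = struct.unpack('<e', struct.pack('<H', bits & 0xffff))[0]
  if f != f or f < 0: return 0          # assumed: NaN and negatives clamp to 0
  return min(0xffff, int(f))            # assumed: saturate at u16 max

assert u16_to_f16_bits(2) == 0x4000     # 2.0 in binary16
assert f16_bits_to_u16(0x4000) == 2
```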
0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = i16_to_f16(S0.i16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_U16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = f16_to_u16(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = f16_to_u16(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_I16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i16 = f16_to_i16(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i16 = f16_to_i16(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_RCP_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = 16'1.0 / S0.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = 1.0 / S0.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SQRT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = sqrt(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = sqrt(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_RSQ_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, 
literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = 16'1.0 / sqrt(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = 1.0 / sqrt(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_LOG_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = log2(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = log2(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_EXP_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = pow(16'2.0, S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = pow(2.0, S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FREXP_MANT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f16) == +INF) || (64'F(S0.f16) == -INF) || isNAN(64'F(S0.f16))) then + # D0.f16 = S0.f16 + # else + # D0.f16 = mantissa(S0.f16) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f16) == INF) or (F(S0.f16) == (-INF)) or isNAN(F(S0.f16))): + D0.f16 = S0.f16 + else: + D0.f16 = mantissa(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FREXP_EXP_I16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f16) == +INF) || (64'F(S0.f16) == -INF) || isNAN(64'F(S0.f16))) then + # D0.i16 = 16'0 + # else + # D0.i16 = 16'I(exponent(S0.f16) - 15 + 1) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, 
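Note that V_EXP and V_LOG are base-2 operations, not base-e. A minimal sketch, assuming each f16 result is rounded to binary16 on writeback:

```python
import math, struct

def to_f16(x: float) -> float:
  # round a Python float to the nearest binary16 value
  return struct.unpack('<e', struct.pack('<e', x))[0]

def v_exp_f16(x: float) -> float: return to_f16(2.0 ** x)      # base 2, not e
def v_log_f16(x: float) -> float: return to_f16(math.log2(x))  # log base 2

assert v_exp_f16(3.0) == 8.0 and v_log_f16(8.0) == 3.0
```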
VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f16) == INF) or (F(S0.f16) == (-INF)) or isNAN(F(S0.f16))): + D0.i16 = 0 + else: + D0.i16 = (exponent(S0.f16) - 15 + 1) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FLOOR_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = trunc(S0.f16); + # if ((S0.f16 < 16'0.0) && (S0.f16 != D0.f16)) then + # D0.f16 += -16'1.0 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = trunc(S0.f16) + if ((S0.f16 < 0.0) and (S0.f16 != D0.f16)): + D0.f16 += -1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CEIL_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = trunc(S0.f16); + # if ((S0.f16 > 16'0.0) && (S0.f16 != D0.f16)) then + # D0.f16 += 16'1.0 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = trunc(S0.f16) + if ((S0.f16 > 0.0) and (S0.f16 != D0.f16)): + D0.f16 += 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_TRUNC_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = trunc(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = trunc(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_RNDNE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = floor(S0.f16 + 16'0.5); + # if (isEven(64'F(floor(S0.f16))) && (fract(S0.f16) == 16'0.5)) then + # D0.f16 -= 16'1.0 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), 
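V_FLOOR_F16/V_CEIL_F16 build floor and ceil from trunc plus a ±1.0 correction. A quick self-check that the floor construction agrees with math.floor:

```python
import math

def floor_via_trunc(x: float) -> float:
  d = math.trunc(x)                 # round toward zero
  if x < 0.0 and x != d: d -= 1.0   # step down for negative non-integers
  return float(d)

for v in (-1.5, -1.0, -0.25, 0.0, 0.25, 1.5):
  assert floor_via_trunc(v) == math.floor(v)
```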
Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = floor(S0.f16 + 0.5) + if (isEven(F(floor(S0.f16))) and (fract(S0.f16) == 0.5)): + D0.f16 -= 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FRACT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 + -floor(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 + -floor(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SIN_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = sin(S0.f16 * 16'F(PI * 2.0)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = sin(S0.f16 * F(PI * 2.0)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_COS_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = cos(S0.f16 * 16'F(PI * 2.0)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = cos(S0.f16 * F(PI * 2.0)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_NORM_I16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i16 = f16_to_snorm(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i16 = f16_to_snorm(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 
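Two things worth flagging in this stretch: V_RNDNE's floor(x + 0.5)-then-adjust construction implements round-half-to-even, and V_SIN_F16/V_COS_F16 take their argument in revolutions (hence the 2π pre-multiply). A check that the RNDNE construction matches Python's banker's rounding:

```python
import math

def rndne(x: float) -> float:
  d = math.floor(x + 0.5)
  # step back when x sits exactly on a .5 boundary above an even integer
  if math.floor(x) % 2 == 0 and x - math.floor(x) == 0.5: d -= 1
  return float(d)

for v in (0.5, 1.5, 2.5, 3.5, -0.5, -1.5):
  assert rndne(v) == round(v)   # round() is round-half-to-even
```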
1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_NORM_U16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = f16_to_unorm(S0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = f16_to_unorm(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_NOT_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = ~S0.u16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = ~S0.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_I32_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I(signext(S0.i16)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (signext(S0.i16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_U32_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0 = { 16'0, S0.u16 } + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0 = Reg(_pack(0, S0.u16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CNDMASK_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = VCC.u64[laneId] ? 
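signext(S0.i16) in V_CVT_I32_I16 widens a 16-bit two's-complement value, while V_CVT_U32_U16 zero-extends instead. An illustrative stand-alone version of the sign-extend helper:

```python
def signext16(v: int) -> int:
  v &= 0xffff
  return v - 0x10000 if v & 0x8000 else v   # propagate bit 15 as the sign

assert signext16(0xfffe) == -2 and signext16(0x7fff) == 32767
assert signext16(0xfffe) & 0xffffffff == 0xfffffffe   # as stored in D0.i32
```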
S1.u32 : S0.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S1.u32) if (VCC.u64[laneId]) else (S0.u32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ADD_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S0.f32 + S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S0.f32 + S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SUB_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S0.f32 - S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S0.f32 - S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SUBREV_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S1.f32 - S0.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S1.f32 - S0.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FMAC_DX9_ZERO_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f32) == 0.0) || (64'F(S1.f32) == 0.0)) then + # // DX9 rules, 0.0 * x = 0.0 + # D0.f32 = S2.f32 + # else + # D0.f32 = fma(S0.f32, S1.f32, D0.f32) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), 
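V_CNDMASK_B32 is the per-lane select: lane n reads bit n of VCC and picks S1 when set, else S0. A whole-wave sketch (hypothetical helper, one list entry per lane):

```python
def cndmask_wave(s0: list[int], s1: list[int], vcc: int) -> list[int]:
  # bit n of vcc chooses s1[n] over s0[n], independently per lane
  return [s1[n] if (vcc >> n) & 1 else s0[n] for n in range(len(s0))]

assert cndmask_wave([0, 0, 0, 0], [1, 1, 1, 1], 0b0101) == [1, 0, 1, 0]
```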
Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f32) == 0.0) or (F(S1.f32) == 0.0)): + D0.f32 = S2.f32 + else: + D0.f32 = fma(S0.f32, S1.f32, D0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MUL_DX9_ZERO_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f32) == 0.0) || (64'F(S1.f32) == 0.0)) then + # // DX9 rules, 0.0 * x = 0.0 + # D0.f32 = 0.0F + # else + # D0.f32 = S0.f32 * S1.f32 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f32) == 0.0) or (F(S1.f32) == 0.0)): + D0.f32 = 0.0 + else: + D0.f32 = S0.f32 * S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MUL_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S0.f32 * S1.f32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S0.f32 * S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MUL_I32_I24(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I(S0.i24) * 32'I(S1.i24) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (S0.i24) * (S1.i24) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MUL_HI_I32_I24(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I((64'I(S0.i24) * 64'I(S1.i24)) >> 32U) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (((S0.i24) * (S1.i24)) >> 32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: 
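The DX9_ZERO variants encode the legacy rule that a zero factor forces a zero product, even when the other factor is Inf or NaN; IEEE multiplication gives NaN there. A sketch of the contrast:

```python
import math

def mul_dx9_zero(a: float, b: float) -> float:
  if a == 0.0 or b == 0.0: return 0.0   # DX9: 0.0 * x = 0.0, no exceptions
  return a * b

assert math.isnan(0.0 * math.inf)           # IEEE: 0 * Inf = NaN
assert mul_dx9_zero(0.0, math.inf) == 0.0   # DX9 rule
```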
result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MUL_U32_U24(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = 32'U(S0.u24) * 32'U(S1.u24) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u24) * (S1.u24) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MUL_HI_U32_U24(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = 32'U((64'U(S0.u24) * 64'U(S1.u24)) >> 32U) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (((S0.u24) * (S1.u24)) >> 32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MIN_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where -0.0 < +0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(64'F(S0.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S0.f32))) + # elsif isSignalNAN(64'F(S1.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32))) + # elsif isQuietNAN(64'F(S1.f32)) then + # D0.f32 = S0.f32 + # elsif isQuietNAN(64'F(S0.f32)) then + # D0.f32 = S1.f32 + # elsif LT_NEG_ZERO(S0.f32, S1.f32) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f32 = S0.f32 + # else + # D0.f32 = S1.f32 + # endif + # else + # if isNAN(64'F(S1.f32)) then + # D0.f32 = S0.f32 + # elsif isNAN(64'F(S0.f32)) then + # D0.f32 = S1.f32 + # elsif LT_NEG_ZERO(S0.f32, S1.f32) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f32 = S0.f32 + # else + # D0.f32 = S1.f32 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(F(S0.f32)): + D0.f32 = F(cvtToQuietNAN(F(S0.f32))) + elif isSignalNAN(F(S1.f32)): + D0.f32 = F(cvtToQuietNAN(F(S1.f32))) + elif isQuietNAN(F(S1.f32)): + D0.f32 = S0.f32 + elif isQuietNAN(F(S0.f32)): + D0.f32 = S1.f32 + elif LT_NEG_ZERO(S0.f32, S1.f32): + D0.f32 = S0.f32 + else: + D0.f32 = S1.f32 + else: + if isNAN(F(S1.f32)): + D0.f32 = S0.f32 + elif isNAN(F(S0.f32)): + D0.f32 = S1.f32 + elif LT_NEG_ZERO(S0.f32, S1.f32): + D0.f32 = S0.f32 + else: + 
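The *_I24/*_U24 multiplies use only the low 24 bits of each source, sign-extending first for the i24 forms. Illustrative helpers (names are hypothetical):

```python
def sext24(v: int) -> int:
  v &= 0xffffff
  return v - 0x1000000 if v & 0x800000 else v   # bit 23 is the sign

def v_mul_i32_i24(a: int, b: int) -> int:
  return (sext24(a) * sext24(b)) & 0xffffffff

assert v_mul_i32_i24(0xffffff, 3) == 0xfffffffd   # (-1) * 3 == -3
```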
D0.f32 = S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAX_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where +0.0 > -0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(64'F(S0.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S0.f32))) + # elsif isSignalNAN(64'F(S1.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32))) + # elsif isQuietNAN(64'F(S1.f32)) then + # D0.f32 = S0.f32 + # elsif isQuietNAN(64'F(S0.f32)) then + # D0.f32 = S1.f32 + # elsif GT_NEG_ZERO(S0.f32, S1.f32) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f32 = S0.f32 + # else + # D0.f32 = S1.f32 + # endif + # else + # if isNAN(64'F(S1.f32)) then + # D0.f32 = S0.f32 + # elsif isNAN(64'F(S0.f32)) then + # D0.f32 = S1.f32 + # elsif GT_NEG_ZERO(S0.f32, S1.f32) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f32 = S0.f32 + # else + # D0.f32 = S1.f32 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(F(S0.f32)): + D0.f32 = F(cvtToQuietNAN(F(S0.f32))) + elif isSignalNAN(F(S1.f32)): + D0.f32 = F(cvtToQuietNAN(F(S1.f32))) + elif isQuietNAN(F(S1.f32)): + D0.f32 = S0.f32 + elif isQuietNAN(F(S0.f32)): + D0.f32 = S1.f32 + elif GT_NEG_ZERO(S0.f32, S1.f32): + D0.f32 = S0.f32 + else: + D0.f32 = S1.f32 + else: + if isNAN(F(S1.f32)): + D0.f32 = S0.f32 + elif isNAN(F(S0.f32)): + D0.f32 = S1.f32 + elif GT_NEG_ZERO(S0.f32, S1.f32): + D0.f32 = S0.f32 + else: + D0.f32 = S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MIN_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = S0.i32 < S1.i32 ? S0.i32 : S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = ((S0.i32) if (S0.i32 < S1.i32) else (S1.i32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAX_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = S0.i32 >= S1.i32 ? 
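LT_NEG_ZERO/GT_NEG_ZERO in the min/max pseudocode order -0.0 strictly below +0.0, which an ordinary float comparison does not. A sketch using copysign to recover the sign of zero:

```python
import math

def lt_neg_zero(a: float, b: float) -> bool:
  if a == b == 0.0:   # equal under IEEE, but the signs may differ
    return math.copysign(1.0, a) < math.copysign(1.0, b)
  return a < b

assert not (-0.0 < 0.0)         # IEEE comparison treats them as equal
assert lt_neg_zero(-0.0, 0.0)   # RDNA3 min/max ordering: -0 < +0
```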
S0.i32 : S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = ((S0.i32) if (S0.i32 >= S1.i32) else (S1.i32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MIN_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = S0.u32 < S1.u32 ? S0.u32 : S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S0.u32) if (S0.u32 < S1.u32) else (S1.u32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAX_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = S0.u32 >= S1.u32 ? S0.u32 : S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S0.u32) if (S0.u32 >= S1.u32) else (S1.u32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_LSHLREV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S1.u32 << S0[4 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S1.u32 << S0[4 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_LSHRREV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S1.u32 >> S0[4 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled 
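The *REV shifts take the shift amount from S0[4:0] and the value from S1, i.e. the operands are reversed relative to the mnemonic. Sketch:

```python
def v_lshlrev_b32(s0: int, s1: int) -> int:
  return (s1 << (s0 & 31)) & 0xffffffff        # S0 supplies the count

def v_ashrrev_i32(s0: int, s1: int) -> int:
  if s1 & 0x80000000: s1 -= 0x100000000        # reinterpret S1 as i32
  return (s1 >> (s0 & 31)) & 0xffffffff        # Python's >> is arithmetic

assert v_lshlrev_b32(4, 1) == 0x10
assert v_ashrrev_i32(31, 0x80000000) == 0xffffffff   # sign bit smears down
```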
pseudocode --- + D0.u32 = (S1.u32 >> S0[4 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ASHRREV_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = (S1.i32 >> S0[4 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (S1.i32 >> S0[4 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_AND_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 & S1.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 & S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_OR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 | S1.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 | S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_XOR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 ^ S1.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 ^ S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_XNOR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ~(S0.u32 ^ S1.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = 
Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ~(S0.u32 ^ S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ADD_NC_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = S0.u32 + S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = S0.u32 + S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SUB_NC_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = S0.u32 - S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = S0.u32 - S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SUBREV_NC_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = S1.u32 - S0.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = S1.u32 - S0.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FMAC_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = fma(S0.f32, S1.f32, D0.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = fma(S0.f32, S1.f32, D0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = 
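The _NC ("no carry") integer adds wrap modulo 2**32 and write no carry-out anywhere; note their result dicts above only ever carry 'd0'. Trivial sketch:

```python
def v_add_nc_u32(a: int, b: int) -> int:
  return (a + b) & 0xffffffff   # wraps; the carry is simply discarded

assert v_add_nc_u32(0xffffffff, 2) == 1
```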
EXEC._val + return result + +def _VOP3Op_V_CVT_PK_RTZ_F16_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # prev_mode = ROUND_MODE; + # tmp[15 : 0].f16 = f32_to_f16(S0.f32); + # tmp[31 : 16].f16 = f32_to_f16(S1.f32); + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + prev_mode = ROUND_MODE + tmp[15 : 0].f16 = f32_to_f16(S0.f32) + tmp[31 : 16].f16 = f32_to_f16(S1.f32) + D0.u32 = tmp.u32 # write the packed halves to D0 (the quoted pseudocode is truncated) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ADD_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 + S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 + S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SUB_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 - S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 - S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SUBREV_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S1.f16 - S0.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S1.f16 - S0.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MUL_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 * S1.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) +
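V_CVT_PK_RTZ_F16_F32 converts with round-toward-zero regardless of the current rounding mode. RTZ for binary16 is plain truncation of the f32 significand; a sketch covering normal values only (specials and subnormals deliberately left unhandled):

```python
import struct

def f32_to_f16_rtz(f: float) -> int:
  bits = struct.unpack('<I', struct.pack('<f', f))[0]
  sign, exp, mant = bits >> 31, (bits >> 23) & 0xff, bits & 0x7fffff
  e16 = exp - 127 + 15                       # rebias f32 -> f16 exponent
  if not 0 < e16 < 31: raise NotImplementedError("specials not sketched")
  return (sign << 15) | (e16 << 10) | (mant >> 13)   # dropping bits == RTZ

assert f32_to_f16_rtz(1.0 + 3 * 2**-12) == 0x3c00   # truncates down to 1.0
# round-to-nearest would have produced 0x3c01 for this input
```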
tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 * S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FMAC_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = fma(S0.f16, S1.f16, D0.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = fma(S0.f16, S1.f16, D0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAX_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where +0.0 > -0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(64'F(S0.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S0.f16))) + # elsif isSignalNAN(64'F(S1.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16))) + # elsif isQuietNAN(64'F(S1.f16)) then + # D0.f16 = S0.f16 + # elsif isQuietNAN(64'F(S0.f16)) then + # D0.f16 = S1.f16 + # elsif GT_NEG_ZERO(S0.f16, S1.f16) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f16 = S0.f16 + # else + # D0.f16 = S1.f16 + # endif + # else + # if isNAN(64'F(S1.f16)) then + # D0.f16 = S0.f16 + # elsif isNAN(64'F(S0.f16)) then + # D0.f16 = S1.f16 + # elsif GT_NEG_ZERO(S0.f16, S1.f16) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f16 = S0.f16 + # else + # D0.f16 = S1.f16 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(F(S0.f16)): + D0.f16 = F(cvtToQuietNAN(F(S0.f16))) + elif isSignalNAN(F(S1.f16)): + D0.f16 = F(cvtToQuietNAN(F(S1.f16))) + elif isQuietNAN(F(S1.f16)): + D0.f16 = S0.f16 + elif isQuietNAN(F(S0.f16)): + D0.f16 = S1.f16 + elif GT_NEG_ZERO(S0.f16, S1.f16): + D0.f16 = S0.f16 + else: + D0.f16 = S1.f16 + else: + if isNAN(F(S1.f16)): + D0.f16 = S0.f16 + elif isNAN(F(S0.f16)): + D0.f16 = S1.f16 + elif GT_NEG_ZERO(S0.f16, S1.f16): + D0.f16 = S0.f16 + else: + D0.f16 = S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MIN_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where -0.0 < +0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(64'F(S0.f16)) then + # D0.f16 = 
16'F(cvtToQuietNAN(64'F(S0.f16))) + # elsif isSignalNAN(64'F(S1.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16))) + # elsif isQuietNAN(64'F(S1.f16)) then + # D0.f16 = S0.f16 + # elsif isQuietNAN(64'F(S0.f16)) then + # D0.f16 = S1.f16 + # elsif LT_NEG_ZERO(S0.f16, S1.f16) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f16 = S0.f16 + # else + # D0.f16 = S1.f16 + # endif + # else + # if isNAN(64'F(S1.f16)) then + # D0.f16 = S0.f16 + # elsif isNAN(64'F(S0.f16)) then + # D0.f16 = S1.f16 + # elsif LT_NEG_ZERO(S0.f16, S1.f16) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f16 = S0.f16 + # else + # D0.f16 = S1.f16 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(F(S0.f16)): + D0.f16 = F(cvtToQuietNAN(F(S0.f16))) + elif isSignalNAN(F(S1.f16)): + D0.f16 = F(cvtToQuietNAN(F(S1.f16))) + elif isQuietNAN(F(S1.f16)): + D0.f16 = S0.f16 + elif isQuietNAN(F(S0.f16)): + D0.f16 = S1.f16 + elif LT_NEG_ZERO(S0.f16, S1.f16): + D0.f16 = S0.f16 + else: + D0.f16 = S1.f16 + else: + if isNAN(F(S1.f16)): + D0.f16 = S0.f16 + elif isNAN(F(S0.f16)): + D0.f16 = S1.f16 + elif LT_NEG_ZERO(S0.f16, S1.f16): + D0.f16 = S0.f16 + else: + D0.f16 = S1.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_LDEXP_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = S0.f16 * 16'F(2.0F ** 32'I(S1.i16)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = S0.f16 * F(2.0 ** (S1.i16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FMA_DX9_ZERO_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((64'F(S0.f32) == 0.0) || (64'F(S1.f32) == 0.0)) then + # // DX9 rules, 0.0 * x = 0.0 + # D0.f32 = S2.f32 + # else + # D0.f32 = fma(S0.f32, S1.f32, S2.f32) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((F(S0.f32) == 0.0) or (F(S1.f32) == 0.0)): + D0.f32 = S2.f32 + else: + D0.f32 = fma(S0.f32, S1.f32, S2.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val 
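V_LDEXP_F16 is S0 * 2**S1, i.e. math.ldexp followed by a binary16 rounding. A sketch under that assumption:

```python
import math, struct

def v_ldexp_f16(s0: float, s1: int) -> float:
  x = math.ldexp(s0, s1)   # exact scaling by a power of two
  return struct.unpack('<e', struct.pack('<e', x))[0]

assert v_ldexp_f16(1.5, 4) == 24.0
assert v_ldexp_f16(1.0, -30) == 0.0   # underflows past the f16 subnormals
```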
!= exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAD_I32_I24(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I(S0.i24) * 32'I(S1.i24) + S2.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (S0.i24) * (S1.i24) + S2.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAD_U32_U24(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = 32'U(S0.u24) * 32'U(S1.u24) + S2.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u24) * (S1.u24) + S2.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CUBEID_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Set D0.f = cubemap face ID ({0.0, 1.0, ..., 5.0}). + # // XYZ coordinate is given in (S0.f, S1.f, S2.f). + # // S0.f = x + # // S1.f = y + # // S2.f = z + # if ((abs(S2.f32) >= abs(S0.f32)) && (abs(S2.f32) >= abs(S1.f32))) then + # if S2.f32 < 0.0F then + # D0.f32 = 5.0F + # else + # D0.f32 = 4.0F + # endif + # elsif abs(S1.f32) >= abs(S0.f32) then + # if S1.f32 < 0.0F then + # D0.f32 = 3.0F + # else + # D0.f32 = 2.0F + # endif + # else + # if S0.f32 < 0.0F then + # D0.f32 = 1.0F + # else + # D0.f32 = 0.0F + # endif + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((abs(S2.f32) >= abs(S0.f32)) and (abs(S2.f32) >= abs(S1.f32))): + if S2.f32 < 0.0: + D0.f32 = 5.0 + else: + D0.f32 = 4.0 + elif abs(S1.f32) >= abs(S0.f32): + if S1.f32 < 0.0: + D0.f32 = 3.0 + else: + D0.f32 = 2.0 + else: + if S0.f32 < 0.0: + D0.f32 = 1.0 + else: + D0.f32 = 0.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CUBESC_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // D0.f = cubemap S coordinate. + # // XYZ coordinate is given in (S0.f, S1.f, S2.f). 
+ # // S0.f = x + # // S1.f = y + # // S2.f = z + # if ((abs(S2.f32) >= abs(S0.f32)) && (abs(S2.f32) >= abs(S1.f32))) then + # if S2.f32 < 0.0F then + # D0.f32 = -S0.f32 + # else + # D0.f32 = S0.f32 + # endif + # elsif abs(S1.f32) >= abs(S0.f32) then + # D0.f32 = S0.f32 + # else + # if S0.f32 < 0.0F then + # D0.f32 = S2.f32 + # else + # D0.f32 = -S2.f32 + # endif + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((abs(S2.f32) >= abs(S0.f32)) and (abs(S2.f32) >= abs(S1.f32))): + if S2.f32 < 0.0: + D0.f32 = -S0.f32 + else: + D0.f32 = S0.f32 + elif abs(S1.f32) >= abs(S0.f32): + D0.f32 = S0.f32 + else: + if S0.f32 < 0.0: + D0.f32 = S2.f32 + else: + D0.f32 = -S2.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CUBETC_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // D0.f = cubemap T coordinate. + # // XYZ coordinate is given in (S0.f, S1.f, S2.f). + # // S0.f = x + # // S1.f = y + # // S2.f = z + # if ((abs(S2.f32) >= abs(S0.f32)) && (abs(S2.f32) >= abs(S1.f32))) then + # D0.f32 = -S1.f32 + # elsif abs(S1.f32) >= abs(S0.f32) then + # if S1.f32 < 0.0F then + # D0.f32 = -S2.f32 + # else + # D0.f32 = S2.f32 + # endif + # else + # D0.f32 = -S1.f32 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((abs(S2.f32) >= abs(S0.f32)) and (abs(S2.f32) >= abs(S1.f32))): + D0.f32 = -S1.f32 + elif abs(S1.f32) >= abs(S0.f32): + if S1.f32 < 0.0: + D0.f32 = -S2.f32 + else: + D0.f32 = S2.f32 + else: + D0.f32 = -S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CUBEMA_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // D0.f = 2.0 * cubemap major axis. + # // XYZ coordinate is given in (S0.f, S1.f, S2.f). 
+ # // S0.f = x + # // S1.f = y + # // S2.f = z + # if ((abs(S2.f32) >= abs(S0.f32)) && (abs(S2.f32) >= abs(S1.f32))) then + # D0.f32 = S2.f32 * 2.0F + # elsif abs(S1.f32) >= abs(S0.f32) then + # D0.f32 = S1.f32 * 2.0F + # else + # D0.f32 = S0.f32 * 2.0F + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((abs(S2.f32) >= abs(S0.f32)) and (abs(S2.f32) >= abs(S1.f32))): + D0.f32 = S2.f32 * 2.0 + elif abs(S1.f32) >= abs(S0.f32): + D0.f32 = S1.f32 * 2.0 + else: + D0.f32 = S0.f32 * 2.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_BFE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ((S0.u32 >> S1[4 : 0].u32) & ((1U << S2[4 : 0].u32) - 1U)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S0.u32 >> S1[4 : 0].u32) & ((1 << S2[4 : 0].u32) - 1)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_BFE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp.i32 = ((S0.i32 >> S1[4 : 0].u32) & ((1 << S2[4 : 0].u32) - 1)); + # D0.i32 = signext_from_bit(tmp.i32, S2[4 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp.i32 = ((S0.i32 >> S1[4 : 0].u32) & ((1 << S2[4 : 0].u32) - 1)) + D0.i32 = signext_from_bit(tmp.i32, S2[4 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_BFI_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ((S0.u32 & S1.u32) | (~S0.u32 & S2.u32)) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S0.u32 & S1.u32) | (~S0.u32 & S2.u32)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if 
EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FMA_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = fma(S0.f32, S1.f32, S2.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = fma(S0.f32, S1.f32, S2.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FMA_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = fma(S0.f64, S1.f64, S2.f64) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = fma(S0.f64, S1.f64, S2.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_LERP_U8(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = ((S0.u32[31 : 24] + S1.u32[31 : 24] + S2.u32[24].u8) >> 1U << 24U); + # tmp += ((S0.u32[23 : 16] + S1.u32[23 : 16] + S2.u32[16].u8) >> 1U << 16U); + # tmp += ((S0.u32[15 : 8] + S1.u32[15 : 8] + S2.u32[8].u8) >> 1U << 8U); + # tmp += ((S0.u32[7 : 0] + S1.u32[7 : 0] + S2.u32[0].u8) >> 1U); + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(((S0.u32[31 : 24] + S1.u32[31 : 24] + S2.u32[24].u8) >> 1 << 24)) + tmp += ((S0.u32[23 : 16] + S1.u32[23 : 16] + S2.u32[16].u8) >> 1 << 16) + tmp += ((S0.u32[15 : 8] + S1.u32[15 : 8] + S2.u32[8].u8) >> 1 << 8) + tmp += ((S0.u32[7 : 0] + S1.u32[7 : 0] + S2.u32[0].u8) >> 1) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ALIGNBIT_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = 32'U(({ S0.u32, S1.u32 } >> S2.u32[4 : 0].u32) & 0xffffffffLL) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = 
((_pack32(S0.u32, S1.u32) >> S2.u32[4 : 0].u32) & 0xffffffff) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ALIGNBYTE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = 32'U(({ S0.u32, S1.u32 } >> (S2.u32[1 : 0].u32 * 8U)) & 0xffffffffLL) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((_pack32(S0.u32, S1.u32) >> (S2.u32[1 : 0].u32 * 8)) & 0xffffffff) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MULLIT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if ((S1.f32 == -MAX_FLOAT_F32) || (64'F(S1.f32) == -INF) || isNAN(64'F(S1.f32)) || (S2.f32 <= 0.0F) || + # isNAN(64'F(S2.f32))) then + # D0.f32 = -MAX_FLOAT_F32 + # else + # D0.f32 = S0.f32 * S1.f32 + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if ((S1.f32 == -MAX_FLOAT_F32) or (F(S1.f32) == (-INF)) or isNAN(F(S1.f32)) or (S2.f32 <= 0.0) or isNAN(F(S2.f32))): + D0.f32 = -MAX_FLOAT_F32 + else: + D0.f32 = S0.f32 * S1.f32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MIN3_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MIN3_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = v_min_i32(v_min_i32(S0.i32, S1.i32), S2.i32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), 
Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = v_min_i32(v_min_i32(S0.i32, S1.i32), S2.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MIN3_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = v_min_u32(v_min_u32(S0.u32, S1.u32), S2.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = v_min_u32(v_min_u32(S0.u32, S1.u32), S2.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAX3_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = v_max_f32(v_max_f32(S0.f32, S1.f32), S2.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = v_max_f32(v_max_f32(S0.f32, S1.f32), S2.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAX3_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = v_max_i32(v_max_i32(S0.i32, S1.i32), S2.i32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = v_max_i32(v_max_i32(S0.i32, S1.i32), S2.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAX3_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = v_max_u32(v_max_u32(S0.u32, S1.u32), S2.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = v_max_u32(v_max_u32(S0.u32, S1.u32), S2.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + 
return result + +def _VOP3Op_V_MED3_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if (isNAN(64'F(S0.f32)) || isNAN(64'F(S1.f32)) || isNAN(64'F(S2.f32))) then + # D0.f32 = v_min3_f32(S0.f32, S1.f32, S2.f32) + # elsif v_max3_f32(S0.f32, S1.f32, S2.f32) == S0.f32 then + # D0.f32 = v_max_f32(S1.f32, S2.f32) + # elsif v_max3_f32(S0.f32, S1.f32, S2.f32) == S1.f32 then + # D0.f32 = v_max_f32(S0.f32, S2.f32) + # else + # D0.f32 = v_max_f32(S0.f32, S1.f32) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if (isNAN(F(S0.f32)) or isNAN(F(S1.f32)) or isNAN(F(S2.f32))): + D0.f32 = v_min3_f32(S0.f32, S1.f32, S2.f32) + elif v_max3_f32(S0.f32, S1.f32, S2.f32) == S0.f32: + D0.f32 = v_max_f32(S1.f32, S2.f32) + elif v_max3_f32(S0.f32, S1.f32, S2.f32) == S1.f32: + D0.f32 = v_max_f32(S0.f32, S2.f32) + else: + D0.f32 = v_max_f32(S0.f32, S1.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MED3_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if v_max3_i32(S0.i32, S1.i32, S2.i32) == S0.i32 then + # D0.i32 = v_max_i32(S1.i32, S2.i32) + # elsif v_max3_i32(S0.i32, S1.i32, S2.i32) == S1.i32 then + # D0.i32 = v_max_i32(S0.i32, S2.i32) + # else + # D0.i32 = v_max_i32(S0.i32, S1.i32) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if v_max3_i32(S0.i32, S1.i32, S2.i32) == S0.i32: + D0.i32 = v_max_i32(S1.i32, S2.i32) + elif v_max3_i32(S0.i32, S1.i32, S2.i32) == S1.i32: + D0.i32 = v_max_i32(S0.i32, S2.i32) + else: + D0.i32 = v_max_i32(S0.i32, S1.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MED3_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if v_max3_u32(S0.u32, S1.u32, S2.u32) == S0.u32 then + # D0.u32 = v_max_u32(S1.u32, S2.u32) + # elsif v_max3_u32(S0.u32, S1.u32, S2.u32) == S1.u32 then + # D0.u32 = v_max_u32(S0.u32, S2.u32) + # else + # D0.u32 = v_max_u32(S0.u32, S1.u32) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if v_max3_u32(S0.u32, S1.u32, S2.u32) == S0.u32: + D0.u32 = v_max_u32(S1.u32, S2.u32) + elif v_max3_u32(S0.u32, S1.u32, S2.u32) == S1.u32: + D0.u32 = v_max_u32(S0.u32, S2.u32) + else: + D0.u32 = 
v_max_u32(S0.u32, S1.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SAD_U8(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // UNSIGNED comparison + # tmp = S2.u32; + # tmp += 32'U(ABSDIFF(S0.u32[7 : 0], S1.u32[7 : 0])); + # tmp += 32'U(ABSDIFF(S0.u32[15 : 8], S1.u32[15 : 8])); + # tmp += 32'U(ABSDIFF(S0.u32[23 : 16], S1.u32[23 : 16])); + # tmp += 32'U(ABSDIFF(S0.u32[31 : 24], S1.u32[31 : 24])); + # D0.u32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S2.u32) + tmp += (ABSDIFF(S0.u32[7 : 0], S1.u32[7 : 0])) + tmp += (ABSDIFF(S0.u32[15 : 8], S1.u32[15 : 8])) + tmp += (ABSDIFF(S0.u32[23 : 16], S1.u32[23 : 16])) + tmp += (ABSDIFF(S0.u32[31 : 24], S1.u32[31 : 24])) + D0.u32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SAD_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // UNSIGNED comparison + # tmp = S2.u32; + # tmp += ABSDIFF(S0[15 : 0].u16, S1[15 : 0].u16); + # tmp += ABSDIFF(S0[31 : 16].u16, S1[31 : 16].u16); + # D0.u32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S2.u32) + tmp += ABSDIFF(S0[15 : 0].u16, S1[15 : 0].u16) + tmp += ABSDIFF(S0[31 : 16].u16, S1[31 : 16].u16) + D0.u32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SAD_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // UNSIGNED comparison + # D0.u32 = ABSDIFF(S0.u32, S1.u32) + S2.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ABSDIFF(S0.u32, S1.u32) + S2.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_PK_U8_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = (S2.u32 & 32'U(~(0xff << (S1.u32[1 : 0].u32 * 8U)))); + # tmp = (tmp | ((32'U(f32_to_u8(S0.f32)) & 255U) << (S1.u32[1 : 0].u32 * 
8U))); + # D0.u32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg((S2.u32 & (~(0xff << (S1.u32[1 : 0].u32 * 8))))) + tmp = Reg((tmp | (((f32_to_u8(S0.f32)) & 255) << (S1.u32[1 : 0].u32 * 8)))) + D0.u32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_DIV_FIXUP_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # sign_out = (sign(S1.f32) ^ sign(S2.f32)); + # if isNAN(64'F(S2.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S2.f32))) + # elsif isNAN(64'F(S1.f32)) then + # D0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32))) + # elsif ((64'F(S1.f32) == 0.0) && (64'F(S2.f32) == 0.0)) then + # // 0/0 + # D0.f32 = 32'F(0xffc00000) + # elsif ((64'F(abs(S1.f32)) == +INF) && (64'F(abs(S2.f32)) == +INF)) then + # // inf/inf + # D0.f32 = 32'F(0xffc00000) + # elsif ((64'F(S1.f32) == 0.0) || (64'F(abs(S2.f32)) == +INF)) then + # // x/0, or inf/y + # D0.f32 = sign_out ? -INF.f32 : +INF.f32 + # elsif ((64'F(abs(S1.f32)) == +INF) || (64'F(S2.f32) == 0.0)) then + # // x/inf, 0/y + # D0.f32 = sign_out ? -0.0F : 0.0F + # elsif exponent(S2.f32) - exponent(S1.f32) < -150 then + # D0.f32 = sign_out ? -UNDERFLOW_F32 : UNDERFLOW_F32 + # elsif exponent(S1.f32) == 255 then + # D0.f32 = sign_out ? -OVERFLOW_F32 : OVERFLOW_F32 + # else + # D0.f32 = sign_out ? 
-abs(S0.f32) : abs(S0.f32) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + sign_out = (sign(S1.f32) ^ sign(S2.f32)) + if isNAN(F(S2.f32)): + D0.f32 = F(cvtToQuietNAN(F(S2.f32))) + elif isNAN(F(S1.f32)): + D0.f32 = F(cvtToQuietNAN(F(S1.f32))) + elif ((F(S1.f32) == 0.0) and (F(S2.f32) == 0.0)): + D0.f32 = F(0xffc00000) + elif ((F(abs(S1.f32)) == INF) and (F(abs(S2.f32)) == INF)): + D0.f32 = F(0xffc00000) + elif ((F(S1.f32) == 0.0) or (F(abs(S2.f32)) == INF)): + D0.f32 = (((-INF).f32) if (sign_out) else (INF.f32)) + elif ((F(abs(S1.f32)) == INF) or (F(S2.f32) == 0.0)): + D0.f32 = ((-0.0) if (sign_out) else (0.0)) + elif exponent(S2.f32) - exponent(S1.f32) < -150: + D0.f32 = ((-UNDERFLOW_F32) if (sign_out) else (UNDERFLOW_F32)) + elif exponent(S1.f32) == 255: + D0.f32 = ((-OVERFLOW_F32) if (sign_out) else (OVERFLOW_F32)) + else: + D0.f32 = ((-abs(S0.f32)) if (sign_out) else (abs(S0.f32))) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_DIV_FIXUP_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # sign_out = (sign(S1.f64) ^ sign(S2.f64)); + # if isNAN(S2.f64) then + # D0.f64 = cvtToQuietNAN(S2.f64) + # elsif isNAN(S1.f64) then + # D0.f64 = cvtToQuietNAN(S1.f64) + # elsif ((S1.f64 == 0.0) && (S2.f64 == 0.0)) then + # // 0/0 + # D0.f64 = 64'F(0xfff8000000000000LL) + # elsif ((abs(S1.f64) == +INF) && (abs(S2.f64) == +INF)) then + # // inf/inf + # D0.f64 = 64'F(0xfff8000000000000LL) + # elsif ((S1.f64 == 0.0) || (abs(S2.f64) == +INF)) then + # // x/0, or inf/y + # D0.f64 = sign_out ? -INF : +INF + # elsif ((abs(S1.f64) == +INF) || (S2.f64 == 0.0)) then + # // x/inf, 0/y + # D0.f64 = sign_out ? -0.0 : 0.0 + # elsif exponent(S2.f64) - exponent(S1.f64) < -1075 then + # D0.f64 = sign_out ? -UNDERFLOW_F64 : UNDERFLOW_F64 + # elsif exponent(S1.f64) == 2047 then + # D0.f64 = sign_out ? -OVERFLOW_F64 : OVERFLOW_F64 + # else + # D0.f64 = sign_out ? 
-abs(S0.f64) : abs(S0.f64) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + sign_out = (sign(S1.f64) ^ sign(S2.f64)) + if isNAN(S2.f64): + D0.f64 = cvtToQuietNAN(S2.f64) + elif isNAN(S1.f64): + D0.f64 = cvtToQuietNAN(S1.f64) + elif ((S1.f64 == 0.0) and (S2.f64 == 0.0)): + D0.f64 = F(0xfff8000000000000) + elif ((abs(S1.f64) == INF) and (abs(S2.f64) == INF)): + D0.f64 = F(0xfff8000000000000) + elif ((S1.f64 == 0.0) or (abs(S2.f64) == INF)): + D0.f64 = (((-INF)) if (sign_out) else (INF)) + elif ((abs(S1.f64) == INF) or (S2.f64 == 0.0)): + D0.f64 = ((-0.0) if (sign_out) else (0.0)) + elif exponent(S2.f64) - exponent(S1.f64) < -1075: + D0.f64 = ((-UNDERFLOW_F64) if (sign_out) else (UNDERFLOW_F64)) + elif exponent(S1.f64) == 2047: + D0.f64 = ((-OVERFLOW_F64) if (sign_out) else (OVERFLOW_F64)) + else: + D0.f64 = ((-abs(S0.f64)) if (sign_out) else (abs(S0.f64))) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_DIV_FMAS_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if VCC.u64[laneId] then + # D0.f32 = 2.0F ** 32 * fma(S0.f32, S1.f32, S2.f32) + # else + # D0.f32 = fma(S0.f32, S1.f32, S2.f32) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if VCC.u64[laneId]: + D0.f32 = 2.0 ** 32 * fma(S0.f32, S1.f32, S2.f32) + else: + D0.f32 = fma(S0.f32, S1.f32, S2.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_DIV_FMAS_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if VCC.u64[laneId] then + # D0.f64 = 2.0 ** 64 * fma(S0.f64, S1.f64, S2.f64) + # else + # D0.f64 = fma(S0.f64, S1.f64, S2.f64) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if VCC.u64[laneId]: + D0.f64 = 2.0 ** 64 * fma(S0.f64, S1.f64, S2.f64) + else: + D0.f64 = fma(S0.f64, S1.f64, S2.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_MSAD_U8(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // UNSIGNED comparison + # tmp = S2.u32; + 
# tmp += S1.u32[7 : 0] == 8'0U ? 0U : 32'U(ABSDIFF(S0.u32[7 : 0], S1.u32[7 : 0])); + # tmp += S1.u32[15 : 8] == 8'0U ? 0U : 32'U(ABSDIFF(S0.u32[15 : 8], S1.u32[15 : 8])); + # tmp += S1.u32[23 : 16] == 8'0U ? 0U : 32'U(ABSDIFF(S0.u32[23 : 16], S1.u32[23 : 16])); + # tmp += S1.u32[31 : 24] == 8'0U ? 0U : 32'U(ABSDIFF(S0.u32[31 : 24], S1.u32[31 : 24])); + # D0.u32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S2.u32) + tmp += ((0) if (S1.u32[7 : 0] == 0) else ((ABSDIFF(S0.u32[7 : 0], S1.u32[7 : 0])))) + tmp += ((0) if (S1.u32[15 : 8] == 0) else ((ABSDIFF(S0.u32[15 : 8], S1.u32[15 : 8])))) + tmp += ((0) if (S1.u32[23 : 16] == 0) else ((ABSDIFF(S0.u32[23 : 16], S1.u32[23 : 16])))) + tmp += ((0) if (S1.u32[31 : 24] == 0) else ((ABSDIFF(S0.u32[31 : 24], S1.u32[31 : 24])))) + D0.u32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_XOR3_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 ^ S1.u32 ^ S2.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 ^ S1.u32 ^ S2.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAD_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = S0.u16 * S1.u16 + S2.u16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = S0.u16 * S1.u16 + S2.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_XAD_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 ^ S1.u32) + S2.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 ^ S1.u32) + S2.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != 
vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_LSHL_ADD_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 << S1.u32[4 : 0].u32) + S2.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 << S1.u32[4 : 0].u32) + S2.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ADD_LSHL_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ((S0.u32 + S1.u32) << S2.u32[4 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S0.u32 + S1.u32) << S2.u32[4 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_FMA_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = fma(S0.f16, S1.f16, S2.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = fma(S0.f16, S1.f16, S2.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MIN3_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = v_min_f16(v_min_f16(S0.f16, S1.f16), S2.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = v_min_f16(v_min_f16(S0.f16, S1.f16), S2.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MIN3_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i16 = v_min_i16(v_min_i16(S0.i16, S1.i16), S2.i16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), 
Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i16 = v_min_i16(v_min_i16(S0.i16, S1.i16), S2.i16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MIN3_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = v_min_u16(v_min_u16(S0.u16, S1.u16), S2.u16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = v_min_u16(v_min_u16(S0.u16, S1.u16), S2.u16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAX3_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = v_max_f16(v_max_f16(S0.f16, S1.f16), S2.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = v_max_f16(v_max_f16(S0.f16, S1.f16), S2.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAX3_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i16 = v_max_i16(v_max_i16(S0.i16, S1.i16), S2.i16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i16 = v_max_i16(v_max_i16(S0.i16, S1.i16), S2.i16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAX3_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = v_max_u16(v_max_u16(S0.u16, S1.u16), S2.u16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- 
compiled pseudocode --- + D0.u16 = v_max_u16(v_max_u16(S0.u16, S1.u16), S2.u16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MED3_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if (isNAN(64'F(S0.f16)) || isNAN(64'F(S1.f16)) || isNAN(64'F(S2.f16))) then + # D0.f16 = v_min3_f16(S0.f16, S1.f16, S2.f16) + # elsif v_max3_f16(S0.f16, S1.f16, S2.f16) == S0.f16 then + # D0.f16 = v_max_f16(S1.f16, S2.f16) + # elsif v_max3_f16(S0.f16, S1.f16, S2.f16) == S1.f16 then + # D0.f16 = v_max_f16(S0.f16, S2.f16) + # else + # D0.f16 = v_max_f16(S0.f16, S1.f16) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if (isNAN(F(S0.f16)) or isNAN(F(S1.f16)) or isNAN(F(S2.f16))): + D0.f16 = v_min3_f16(S0.f16, S1.f16, S2.f16) + elif v_max3_f16(S0.f16, S1.f16, S2.f16) == S0.f16: + D0.f16 = v_max_f16(S1.f16, S2.f16) + elif v_max3_f16(S0.f16, S1.f16, S2.f16) == S1.f16: + D0.f16 = v_max_f16(S0.f16, S2.f16) + else: + D0.f16 = v_max_f16(S0.f16, S1.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MED3_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if v_max3_i16(S0.i16, S1.i16, S2.i16) == S0.i16 then + # D0.i16 = v_max_i16(S1.i16, S2.i16) + # elsif v_max3_i16(S0.i16, S1.i16, S2.i16) == S1.i16 then + # D0.i16 = v_max_i16(S0.i16, S2.i16) + # else + # D0.i16 = v_max_i16(S0.i16, S1.i16) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if v_max3_i16(S0.i16, S1.i16, S2.i16) == S0.i16: + D0.i16 = v_max_i16(S1.i16, S2.i16) + elif v_max3_i16(S0.i16, S1.i16, S2.i16) == S1.i16: + D0.i16 = v_max_i16(S0.i16, S2.i16) + else: + D0.i16 = v_max_i16(S0.i16, S1.i16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MED3_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # if v_max3_u16(S0.u16, S1.u16, S2.u16) == S0.u16 then + # D0.u16 = v_max_u16(S1.u16, S2.u16) + # elsif v_max3_u16(S0.u16, S1.u16, S2.u16) == S1.u16 then + # D0.u16 = v_max_u16(S0.u16, S2.u16) + # else + # D0.u16 = v_max_u16(S0.u16, S1.u16) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + 
SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if v_max3_u16(S0.u16, S1.u16, S2.u16) == S0.u16: + D0.u16 = v_max_u16(S1.u16, S2.u16) + elif v_max3_u16(S0.u16, S1.u16, S2.u16) == S1.u16: + D0.u16 = v_max_u16(S0.u16, S2.u16) + else: + D0.u16 = v_max_u16(S0.u16, S1.u16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAD_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i16 = S0.i16 * S1.i16 + S2.i16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i16 = S0.i16 * S1.i16 + S2.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_DIV_FIXUP_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # sign_out = (sign(S1.f16) ^ sign(S2.f16)); + # if isNAN(64'F(S2.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S2.f16))) + # elsif isNAN(64'F(S1.f16)) then + # D0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16))) + # elsif ((64'F(S1.f16) == 0.0) && (64'F(S2.f16) == 0.0)) then + # // 0/0 + # D0.f16 = 16'F(0xfe00) + # elsif ((64'F(abs(S1.f16)) == +INF) && (64'F(abs(S2.f16)) == +INF)) then + # // inf/inf + # D0.f16 = 16'F(0xfe00) + # elsif ((64'F(S1.f16) == 0.0) || (64'F(abs(S2.f16)) == +INF)) then + # // x/0, or inf/y + # D0.f16 = sign_out ? -INF.f16 : +INF.f16 + # elsif ((64'F(abs(S1.f16)) == +INF) || (64'F(S2.f16) == 0.0)) then + # // x/inf, 0/y + # D0.f16 = sign_out ? -16'0.0 : 16'0.0 + # else + # D0.f16 = sign_out ? 
-abs(S0.f16) : abs(S0.f16) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + sign_out = (sign(S1.f16) ^ sign(S2.f16)) + if isNAN(F(S2.f16)): + D0.f16 = F(cvtToQuietNAN(F(S2.f16))) + elif isNAN(F(S1.f16)): + D0.f16 = F(cvtToQuietNAN(F(S1.f16))) + elif ((F(S1.f16) == 0.0) and (F(S2.f16) == 0.0)): + D0.f16 = F(0xfe00) + elif ((F(abs(S1.f16)) == INF) and (F(abs(S2.f16)) == INF)): + D0.f16 = F(0xfe00) + elif ((F(S1.f16) == 0.0) or (F(abs(S2.f16)) == INF)): + D0.f16 = (((-INF).f16) if (sign_out) else (INF.f16)) + elif ((F(abs(S1.f16)) == INF) or (F(S2.f16) == 0.0)): + D0.f16 = ((-0.0) if (sign_out) else (0.0)) + else: + D0.f16 = ((-abs(S0.f16)) if (sign_out) else (abs(S0.f16))) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ADD3_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = S0.u32 + S1.u32 + S2.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = S0.u32 + S1.u32 + S2.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_LSHL_OR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ((S0.u32 << S1.u32[4 : 0].u32) | S2.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S0.u32 << S1.u32[4 : 0].u32) | S2.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_AND_OR_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = ((S0.u32 & S1.u32) | S2.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = ((S0.u32 & S1.u32) | S2.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: 
result['exec'] = EXEC._val + return result + +def _VOP3Op_V_OR3_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (S0.u32 | S1.u32 | S2.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u32 | S1.u32 | S2.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAD_U32_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = 32'U(S0.u16) * 32'U(S1.u16) + S2.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (S0.u16) * (S1.u16) + S2.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAD_I32_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I(S0.i16) * 32'I(S1.i16) + S2.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (S0.i16) * (S1.i16) + S2.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CNDMASK_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = VCC.u64[laneId] ? 
S1.u16 : S0.u16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = ((S1.u16) if (VCC.u64[laneId]) else (S0.u16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAXMIN_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = v_min_f32(v_max_f32(S0.f32, S1.f32), S2.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = v_min_f32(v_max_f32(S0.f32, S1.f32), S2.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MINMAX_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = v_max_f32(v_min_f32(S0.f32, S1.f32), S2.f32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = v_max_f32(v_min_f32(S0.f32, S1.f32), S2.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAXMIN_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = v_min_f16(v_max_f16(S0.f16, S1.f16), S2.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = v_min_f16(v_max_f16(S0.f16, S1.f16), S2.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MINMAX_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = v_max_f16(v_min_f16(S0.f16, S1.f16), S2.f16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = 
Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f16 = v_max_f16(v_min_f16(S0.f16, S1.f16), S2.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAXMIN_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = v_min_u32(v_max_u32(S0.u32, S1.u32), S2.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = v_min_u32(v_max_u32(S0.u32, S1.u32), S2.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MINMAX_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = v_max_u32(v_min_u32(S0.u32, S1.u32), S2.u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = v_max_u32(v_min_u32(S0.u32, S1.u32), S2.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAXMIN_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = v_min_i32(v_max_i32(S0.i32, S1.i32), S2.i32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = v_min_i32(v_max_i32(S0.i32, S1.i32), S2.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MINMAX_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = v_max_i32(v_min_i32(S0.i32, S1.i32), S2.i32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = v_max_i32(v_min_i32(S0.i32, S1.i32), S2.i32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> 
lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_DOT2_F16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = S2.f16; + # tmp += S0[15 : 0].f16 * S1[15 : 0].f16; + # tmp += S0[31 : 16].f16 * S1[31 : 16].f16; + # D0.f16 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S2.f16) + tmp += S0[15 : 0].f16 * S1[15 : 0].f16 + tmp += S0[31 : 16].f16 * S1[31 : 16].f16 + D0.f16 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ADD_NC_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = S0.u16 + S1.u16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = S0.u16 + S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SUB_NC_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = S0.u16 - S1.u16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = S0.u16 - S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MUL_LO_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = S0.u16 * S1.u16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = S0.u16 * S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_PK_I16_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[31 : 16] = 16'B(v_cvt_i16_f32(S1.f32)); + # tmp[15 : 0] = 16'B(v_cvt_i16_f32(S0.f32)); + S0, S1, S2, 
D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp[31 : 16] = (v_cvt_i16_f32(S1.f32)) + tmp[15 : 0] = (v_cvt_i16_f32(S0.f32)) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_PK_U16_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[31 : 16] = 16'B(v_cvt_u16_f32(S1.f32)); + # tmp[15 : 0] = 16'B(v_cvt_u16_f32(S0.f32)); + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp[31 : 16] = (v_cvt_u16_f32(S1.f32)) + tmp[15 : 0] = (v_cvt_u16_f32(S0.f32)) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAX_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = S0.u16 >= S1.u16 ? S0.u16 : S1.u16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = ((S0.u16) if (S0.u16 >= S1.u16) else (S1.u16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MAX_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i16 = S0.i16 >= S1.i16 ? S0.i16 : S1.i16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i16 = ((S0.i16) if (S0.i16 >= S1.i16) else (S1.i16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MIN_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = S0.u16 < S1.u16 ? 
S0.u16 : S1.u16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = ((S0.u16) if (S0.u16 < S1.u16) else (S1.u16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MIN_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i16 = S0.i16 < S1.i16 ? S0.i16 : S1.i16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i16 = ((S0.i16) if (S0.i16 < S1.i16) else (S1.i16)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ADD_NC_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i16 = S0.i16 + S1.i16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i16 = S0.i16 + S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SUB_NC_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i16 = S0.i16 - S1.i16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i16 = S0.i16 - S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_PACK_B32_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0[31 : 16].f16 = S1.f16; + # D0[15 : 0].f16 = S0.f16 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0[31 : 16].f16 = S1.f16 + D0[15 
: 0].f16 = S0.f16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_PK_NORM_I16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].i16 = f16_to_snorm(S0.f16); + # tmp[31 : 16].i16 = f16_to_snorm(S1.f16); + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp[15 : 0].i16 = f16_to_snorm(S0.f16) + tmp[31 : 16].i16 = f16_to_snorm(S1.f16) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_PK_NORM_U16_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].u16 = f16_to_unorm(S0.f16); + # tmp[31 : 16].u16 = f16_to_unorm(S1.f16); + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp[15 : 0].u16 = f16_to_unorm(S0.f16) + tmp[31 : 16].u16 = f16_to_unorm(S1.f16) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_LDEXP_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = S0.f32 * 2.0F ** S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f32 = S0.f32 * 2.0 ** S1.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_BFM_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = (((1U << S0[4 : 0].u32) - 1U) << S1[4 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (((1 << S0[4 : 0].u32) - 1) << S1[4 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) 
& 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_BCNT_U32_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = S1.u32; + # for i in 0 : 31 do + # tmp += S0[i].u32; + # // count i'th bit + # endfor; + # D0.u32 = tmp + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S1.u32) + for i in range(0, int(31)+1): + tmp += S0[i].u32 + D0.u32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_PK_NORM_I16_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].i16 = f32_to_snorm(S0.f32); + # tmp[31 : 16].i16 = f32_to_snorm(S1.f32); + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp[15 : 0].i16 = f32_to_snorm(S0.f32) + tmp[31 : 16].i16 = f32_to_snorm(S1.f32) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_PK_NORM_U16_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].u16 = f32_to_unorm(S0.f32); + # tmp[31 : 16].u16 = f32_to_unorm(S1.f32); + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp[15 : 0].u16 = f32_to_unorm(S0.f32) + tmp[31 : 16].u16 = f32_to_unorm(S1.f32) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_PK_U16_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].u16 = u32_to_u16(S0.u32); + # tmp[31 : 16].u16 = u32_to_u16(S1.u32); + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp[15 : 0].u16 = u32_to_u16(S0.u32) + tmp[31 : 16].u16 = u32_to_u16(S1.u32) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if 
VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_CVT_PK_I16_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].i16 = i32_to_i16(S0.i32); + # tmp[31 : 16].i16 = i32_to_i16(S1.i32); + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp[15 : 0].i16 = i32_to_i16(S0.i32) + tmp[31 : 16].i16 = i32_to_i16(S1.i32) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_SUB_NC_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = S0.i32 - S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = S0.i32 - S1.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ADD_NC_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = S0.i32 + S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = S0.i32 + S1.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ADD_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = S0.f64 + S1.f64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = S0.f64 + S1.f64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_MUL_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = S0.f64 * S1.f64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = 
Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = S0.f64 * S1.f64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_MIN_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where -0.0 < +0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(S0.f64) then + # D0.f64 = cvtToQuietNAN(S0.f64) + # elsif isSignalNAN(S1.f64) then + # D0.f64 = cvtToQuietNAN(S1.f64) + # elsif isQuietNAN(S1.f64) then + # D0.f64 = S0.f64 + # elsif isQuietNAN(S0.f64) then + # D0.f64 = S1.f64 + # elsif LT_NEG_ZERO(S0.f64, S1.f64) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f64 = S0.f64 + # else + # D0.f64 = S1.f64 + # endif + # else + # if isNAN(S1.f64) then + # D0.f64 = S0.f64 + # elsif isNAN(S0.f64) then + # D0.f64 = S1.f64 + # elsif LT_NEG_ZERO(S0.f64, S1.f64) then + # // NOTE: -0<+0 is TRUE in this comparison + # D0.f64 = S0.f64 + # else + # D0.f64 = S1.f64 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(S0.f64): + D0.f64 = cvtToQuietNAN(S0.f64) + elif isSignalNAN(S1.f64): + D0.f64 = cvtToQuietNAN(S1.f64) + elif isQuietNAN(S1.f64): + D0.f64 = S0.f64 + elif isQuietNAN(S0.f64): + D0.f64 = S1.f64 + elif LT_NEG_ZERO(S0.f64, S1.f64): + D0.f64 = S0.f64 + else: + D0.f64 = S1.f64 + else: + if isNAN(S1.f64): + D0.f64 = S0.f64 + elif isNAN(S0.f64): + D0.f64 = S1.f64 + elif LT_NEG_ZERO(S0.f64, S1.f64): + D0.f64 = S0.f64 + else: + D0.f64 = S1.f64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_MAX_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # // Version of comparison where +0.0 > -0.0, differs from IEEE + # if WAVE_MODE.IEEE then + # if isSignalNAN(S0.f64) then + # D0.f64 = cvtToQuietNAN(S0.f64) + # elsif isSignalNAN(S1.f64) then + # D0.f64 = cvtToQuietNAN(S1.f64) + # elsif isQuietNAN(S1.f64) then + # D0.f64 = S0.f64 + # elsif isQuietNAN(S0.f64) then + # D0.f64 = S1.f64 + # elsif GT_NEG_ZERO(S0.f64, S1.f64) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f64 = S0.f64 + # else + # D0.f64 = S1.f64 + # endif + # else + # if isNAN(S1.f64) then + # D0.f64 = S0.f64 + # elsif isNAN(S0.f64) then + # D0.f64 = S1.f64 + # elsif GT_NEG_ZERO(S0.f64, S1.f64) then + # // NOTE: +0>-0 is TRUE in this comparison + # D0.f64 = S0.f64 + # else + # D0.f64 = S1.f64 + # endif + # endif; + # // Inequalities in the above pseudocode behave differently from IEEE + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), 
Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE_MODE.IEEE: + if isSignalNAN(S0.f64): + D0.f64 = cvtToQuietNAN(S0.f64) + elif isSignalNAN(S1.f64): + D0.f64 = cvtToQuietNAN(S1.f64) + elif isQuietNAN(S1.f64): + D0.f64 = S0.f64 + elif isQuietNAN(S0.f64): + D0.f64 = S1.f64 + elif GT_NEG_ZERO(S0.f64, S1.f64): + D0.f64 = S0.f64 + else: + D0.f64 = S1.f64 + else: + if isNAN(S1.f64): + D0.f64 = S0.f64 + elif isNAN(S0.f64): + D0.f64 = S1.f64 + elif GT_NEG_ZERO(S0.f64, S1.f64): + D0.f64 = S0.f64 + else: + D0.f64 = S1.f64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_LDEXP_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = S0.f64 * 2.0 ** S1.i32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.f64 = S0.f64 * 2.0 ** S1.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_MUL_LO_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = S0.u32 * S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = S0.u32 * S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MUL_HI_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = 32'U((64'U(S0.u32) * 64'U(S1.u32)) >> 32U) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u32 = (((S0.u32) * (S1.u32)) >> 32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_MUL_HI_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = 32'I((64'I(S0.i32) * 
64'I(S1.i32)) >> 32U) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i32 = (((S0.i32) * (S1.i32)) >> 32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_LSHLREV_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = (S1.u16 << S0[3 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = (S1.u16 << S0[3 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_LSHRREV_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = (S1.u16 >> S0[3 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = (S1.u16 >> S0[3 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_ASHRREV_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i16 = (S1.i16 >> S0[3 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i16 = (S1.i16 >> S0[3 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_LSHLREV_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = (S1.u64 << S0[5 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = (S1.u64 << S0[5 : 0].u32) + # --- 
end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_LSHRREV_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64 = (S1.u64 >> S0[5 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64 = (S1.u64 >> S0[5 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_ASHRREV_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i64 = (S1.i64 >> S0[5 : 0].u32) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.i64 = (S1.i64 >> S0[5 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['d0_64'] = True + return result + +def _VOP3Op_V_READLANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare lane : 32'U; + # if WAVE32 then + # lane = S1.u32[4 : 0].u32; + # // Lane select for wave32 + # else + # lane = S1.u32[5 : 0].u32; + # // Lane select for wave64 + # endif; + # D0.b32 = VGPR[lane][SRC0.u32] + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if WAVE32: + lane = S1.u32[4 : 0].u32 + else: + lane = S1.u32[5 : 0].u32 + D0.b32 = VGPR[lane][SRC0.u32] + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_AND_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = (S0.u16 & S1.u16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = (S0.u16 & S1.u16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if 
EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_OR_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = (S0.u16 | S1.u16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = (S0.u16 | S1.u16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3Op_V_XOR_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u16 = (S0.u16 ^ S1.u16) + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u16 = (S0.u16 ^ S1.u16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +VOP3Op_FUNCTIONS = { + VOP3Op.V_MOV_B32: _VOP3Op_V_MOV_B32, + VOP3Op.V_READFIRSTLANE_B32: _VOP3Op_V_READFIRSTLANE_B32, + VOP3Op.V_CVT_I32_F64: _VOP3Op_V_CVT_I32_F64, + VOP3Op.V_CVT_F64_I32: _VOP3Op_V_CVT_F64_I32, + VOP3Op.V_CVT_F32_I32: _VOP3Op_V_CVT_F32_I32, + VOP3Op.V_CVT_F32_U32: _VOP3Op_V_CVT_F32_U32, + VOP3Op.V_CVT_U32_F32: _VOP3Op_V_CVT_U32_F32, + VOP3Op.V_CVT_I32_F32: _VOP3Op_V_CVT_I32_F32, + VOP3Op.V_CVT_F16_F32: _VOP3Op_V_CVT_F16_F32, + VOP3Op.V_CVT_F32_F16: _VOP3Op_V_CVT_F32_F16, + VOP3Op.V_CVT_NEAREST_I32_F32: _VOP3Op_V_CVT_NEAREST_I32_F32, + VOP3Op.V_CVT_FLOOR_I32_F32: _VOP3Op_V_CVT_FLOOR_I32_F32, + VOP3Op.V_CVT_F32_F64: _VOP3Op_V_CVT_F32_F64, + VOP3Op.V_CVT_F64_F32: _VOP3Op_V_CVT_F64_F32, + VOP3Op.V_CVT_F32_UBYTE0: _VOP3Op_V_CVT_F32_UBYTE0, + VOP3Op.V_CVT_F32_UBYTE1: _VOP3Op_V_CVT_F32_UBYTE1, + VOP3Op.V_CVT_F32_UBYTE2: _VOP3Op_V_CVT_F32_UBYTE2, + VOP3Op.V_CVT_F32_UBYTE3: _VOP3Op_V_CVT_F32_UBYTE3, + VOP3Op.V_CVT_U32_F64: _VOP3Op_V_CVT_U32_F64, + VOP3Op.V_CVT_F64_U32: _VOP3Op_V_CVT_F64_U32, + VOP3Op.V_TRUNC_F64: _VOP3Op_V_TRUNC_F64, + VOP3Op.V_CEIL_F64: _VOP3Op_V_CEIL_F64, + VOP3Op.V_RNDNE_F64: _VOP3Op_V_RNDNE_F64, + VOP3Op.V_FLOOR_F64: _VOP3Op_V_FLOOR_F64, + VOP3Op.V_MOV_B16: _VOP3Op_V_MOV_B16, + VOP3Op.V_FRACT_F32: _VOP3Op_V_FRACT_F32, + VOP3Op.V_TRUNC_F32: _VOP3Op_V_TRUNC_F32, + VOP3Op.V_CEIL_F32: _VOP3Op_V_CEIL_F32, + VOP3Op.V_RNDNE_F32: _VOP3Op_V_RNDNE_F32, + VOP3Op.V_FLOOR_F32: _VOP3Op_V_FLOOR_F32, + VOP3Op.V_EXP_F32: _VOP3Op_V_EXP_F32, + VOP3Op.V_LOG_F32: _VOP3Op_V_LOG_F32, + VOP3Op.V_RCP_F32: _VOP3Op_V_RCP_F32, + VOP3Op.V_RCP_IFLAG_F32: _VOP3Op_V_RCP_IFLAG_F32, + VOP3Op.V_RSQ_F32: _VOP3Op_V_RSQ_F32, + VOP3Op.V_RCP_F64: _VOP3Op_V_RCP_F64, + VOP3Op.V_RSQ_F64: _VOP3Op_V_RSQ_F64, + VOP3Op.V_SQRT_F32: _VOP3Op_V_SQRT_F32, + VOP3Op.V_SQRT_F64: _VOP3Op_V_SQRT_F64, + VOP3Op.V_SIN_F32: _VOP3Op_V_SIN_F32, + VOP3Op.V_COS_F32: _VOP3Op_V_COS_F32, + VOP3Op.V_NOT_B32: _VOP3Op_V_NOT_B32, + VOP3Op.V_BFREV_B32: _VOP3Op_V_BFREV_B32, + 
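# --- bit scan (CLZ/CTZ/CLS), frexp, and 16-bit unary conversion entries follow --- + 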
VOP3Op.V_CLZ_I32_U32: _VOP3Op_V_CLZ_I32_U32, + VOP3Op.V_CTZ_I32_B32: _VOP3Op_V_CTZ_I32_B32, + VOP3Op.V_CLS_I32: _VOP3Op_V_CLS_I32, + VOP3Op.V_FREXP_EXP_I32_F64: _VOP3Op_V_FREXP_EXP_I32_F64, + VOP3Op.V_FREXP_MANT_F64: _VOP3Op_V_FREXP_MANT_F64, + VOP3Op.V_FRACT_F64: _VOP3Op_V_FRACT_F64, + VOP3Op.V_FREXP_EXP_I32_F32: _VOP3Op_V_FREXP_EXP_I32_F32, + VOP3Op.V_FREXP_MANT_F32: _VOP3Op_V_FREXP_MANT_F32, + VOP3Op.V_MOVRELS_B32: _VOP3Op_V_MOVRELS_B32, + VOP3Op.V_CVT_F16_U16: _VOP3Op_V_CVT_F16_U16, + VOP3Op.V_CVT_F16_I16: _VOP3Op_V_CVT_F16_I16, + VOP3Op.V_CVT_U16_F16: _VOP3Op_V_CVT_U16_F16, + VOP3Op.V_CVT_I16_F16: _VOP3Op_V_CVT_I16_F16, + VOP3Op.V_RCP_F16: _VOP3Op_V_RCP_F16, + VOP3Op.V_SQRT_F16: _VOP3Op_V_SQRT_F16, + VOP3Op.V_RSQ_F16: _VOP3Op_V_RSQ_F16, + VOP3Op.V_LOG_F16: _VOP3Op_V_LOG_F16, + VOP3Op.V_EXP_F16: _VOP3Op_V_EXP_F16, + VOP3Op.V_FREXP_MANT_F16: _VOP3Op_V_FREXP_MANT_F16, + VOP3Op.V_FREXP_EXP_I16_F16: _VOP3Op_V_FREXP_EXP_I16_F16, + VOP3Op.V_FLOOR_F16: _VOP3Op_V_FLOOR_F16, + VOP3Op.V_CEIL_F16: _VOP3Op_V_CEIL_F16, + VOP3Op.V_TRUNC_F16: _VOP3Op_V_TRUNC_F16, + VOP3Op.V_RNDNE_F16: _VOP3Op_V_RNDNE_F16, + VOP3Op.V_FRACT_F16: _VOP3Op_V_FRACT_F16, + VOP3Op.V_SIN_F16: _VOP3Op_V_SIN_F16, + VOP3Op.V_COS_F16: _VOP3Op_V_COS_F16, + VOP3Op.V_CVT_NORM_I16_F16: _VOP3Op_V_CVT_NORM_I16_F16, + VOP3Op.V_CVT_NORM_U16_F16: _VOP3Op_V_CVT_NORM_U16_F16, + VOP3Op.V_NOT_B16: _VOP3Op_V_NOT_B16, + VOP3Op.V_CVT_I32_I16: _VOP3Op_V_CVT_I32_I16, + VOP3Op.V_CVT_U32_U16: _VOP3Op_V_CVT_U32_U16, + VOP3Op.V_CNDMASK_B32: _VOP3Op_V_CNDMASK_B32, + VOP3Op.V_ADD_F32: _VOP3Op_V_ADD_F32, + VOP3Op.V_SUB_F32: _VOP3Op_V_SUB_F32, + VOP3Op.V_SUBREV_F32: _VOP3Op_V_SUBREV_F32, + VOP3Op.V_FMAC_DX9_ZERO_F32: _VOP3Op_V_FMAC_DX9_ZERO_F32, + VOP3Op.V_MUL_DX9_ZERO_F32: _VOP3Op_V_MUL_DX9_ZERO_F32, + VOP3Op.V_MUL_F32: _VOP3Op_V_MUL_F32, + VOP3Op.V_MUL_I32_I24: _VOP3Op_V_MUL_I32_I24, + VOP3Op.V_MUL_HI_I32_I24: _VOP3Op_V_MUL_HI_I32_I24, + VOP3Op.V_MUL_U32_U24: _VOP3Op_V_MUL_U32_U24, + VOP3Op.V_MUL_HI_U32_U24: _VOP3Op_V_MUL_HI_U32_U24, + VOP3Op.V_MIN_F32: _VOP3Op_V_MIN_F32, + VOP3Op.V_MAX_F32: _VOP3Op_V_MAX_F32, + VOP3Op.V_MIN_I32: _VOP3Op_V_MIN_I32, + VOP3Op.V_MAX_I32: _VOP3Op_V_MAX_I32, + VOP3Op.V_MIN_U32: _VOP3Op_V_MIN_U32, + VOP3Op.V_MAX_U32: _VOP3Op_V_MAX_U32, + VOP3Op.V_LSHLREV_B32: _VOP3Op_V_LSHLREV_B32, + VOP3Op.V_LSHRREV_B32: _VOP3Op_V_LSHRREV_B32, + VOP3Op.V_ASHRREV_I32: _VOP3Op_V_ASHRREV_I32, + VOP3Op.V_AND_B32: _VOP3Op_V_AND_B32, + VOP3Op.V_OR_B32: _VOP3Op_V_OR_B32, + VOP3Op.V_XOR_B32: _VOP3Op_V_XOR_B32, + VOP3Op.V_XNOR_B32: _VOP3Op_V_XNOR_B32, + VOP3Op.V_ADD_NC_U32: _VOP3Op_V_ADD_NC_U32, + VOP3Op.V_SUB_NC_U32: _VOP3Op_V_SUB_NC_U32, + VOP3Op.V_SUBREV_NC_U32: _VOP3Op_V_SUBREV_NC_U32, + VOP3Op.V_FMAC_F32: _VOP3Op_V_FMAC_F32, + VOP3Op.V_CVT_PK_RTZ_F16_F32: _VOP3Op_V_CVT_PK_RTZ_F16_F32, + VOP3Op.V_ADD_F16: _VOP3Op_V_ADD_F16, + VOP3Op.V_SUB_F16: _VOP3Op_V_SUB_F16, + VOP3Op.V_SUBREV_F16: _VOP3Op_V_SUBREV_F16, + VOP3Op.V_MUL_F16: _VOP3Op_V_MUL_F16, + VOP3Op.V_FMAC_F16: _VOP3Op_V_FMAC_F16, + VOP3Op.V_MAX_F16: _VOP3Op_V_MAX_F16, + VOP3Op.V_MIN_F16: _VOP3Op_V_MIN_F16, + VOP3Op.V_LDEXP_F16: _VOP3Op_V_LDEXP_F16, + VOP3Op.V_FMA_DX9_ZERO_F32: _VOP3Op_V_FMA_DX9_ZERO_F32, + VOP3Op.V_MAD_I32_I24: _VOP3Op_V_MAD_I32_I24, + VOP3Op.V_MAD_U32_U24: _VOP3Op_V_MAD_U32_U24, + VOP3Op.V_CUBEID_F32: _VOP3Op_V_CUBEID_F32, + VOP3Op.V_CUBESC_F32: _VOP3Op_V_CUBESC_F32, + VOP3Op.V_CUBETC_F32: _VOP3Op_V_CUBETC_F32, + VOP3Op.V_CUBEMA_F32: _VOP3Op_V_CUBEMA_F32, + VOP3Op.V_BFE_U32: _VOP3Op_V_BFE_U32, + VOP3Op.V_BFE_I32: _VOP3Op_V_BFE_I32, + VOP3Op.V_BFI_B32: _VOP3Op_V_BFI_B32, + 
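# --- FMA, byte/bit alignment, min3/max3/med3, SAD, and division-helper entries follow --- + 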
VOP3Op.V_FMA_F32: _VOP3Op_V_FMA_F32, + VOP3Op.V_FMA_F64: _VOP3Op_V_FMA_F64, + VOP3Op.V_LERP_U8: _VOP3Op_V_LERP_U8, + VOP3Op.V_ALIGNBIT_B32: _VOP3Op_V_ALIGNBIT_B32, + VOP3Op.V_ALIGNBYTE_B32: _VOP3Op_V_ALIGNBYTE_B32, + VOP3Op.V_MULLIT_F32: _VOP3Op_V_MULLIT_F32, + VOP3Op.V_MIN3_F32: _VOP3Op_V_MIN3_F32, + VOP3Op.V_MIN3_I32: _VOP3Op_V_MIN3_I32, + VOP3Op.V_MIN3_U32: _VOP3Op_V_MIN3_U32, + VOP3Op.V_MAX3_F32: _VOP3Op_V_MAX3_F32, + VOP3Op.V_MAX3_I32: _VOP3Op_V_MAX3_I32, + VOP3Op.V_MAX3_U32: _VOP3Op_V_MAX3_U32, + VOP3Op.V_MED3_F32: _VOP3Op_V_MED3_F32, + VOP3Op.V_MED3_I32: _VOP3Op_V_MED3_I32, + VOP3Op.V_MED3_U32: _VOP3Op_V_MED3_U32, + VOP3Op.V_SAD_U8: _VOP3Op_V_SAD_U8, + VOP3Op.V_SAD_U16: _VOP3Op_V_SAD_U16, + VOP3Op.V_SAD_U32: _VOP3Op_V_SAD_U32, + VOP3Op.V_CVT_PK_U8_F32: _VOP3Op_V_CVT_PK_U8_F32, + VOP3Op.V_DIV_FIXUP_F32: _VOP3Op_V_DIV_FIXUP_F32, + VOP3Op.V_DIV_FIXUP_F64: _VOP3Op_V_DIV_FIXUP_F64, + VOP3Op.V_DIV_FMAS_F32: _VOP3Op_V_DIV_FMAS_F32, + VOP3Op.V_DIV_FMAS_F64: _VOP3Op_V_DIV_FMAS_F64, + VOP3Op.V_MSAD_U8: _VOP3Op_V_MSAD_U8, + VOP3Op.V_XOR3_B32: _VOP3Op_V_XOR3_B32, + VOP3Op.V_MAD_U16: _VOP3Op_V_MAD_U16, + VOP3Op.V_XAD_U32: _VOP3Op_V_XAD_U32, + VOP3Op.V_LSHL_ADD_U32: _VOP3Op_V_LSHL_ADD_U32, + VOP3Op.V_ADD_LSHL_U32: _VOP3Op_V_ADD_LSHL_U32, + VOP3Op.V_FMA_F16: _VOP3Op_V_FMA_F16, + VOP3Op.V_MIN3_F16: _VOP3Op_V_MIN3_F16, + VOP3Op.V_MIN3_I16: _VOP3Op_V_MIN3_I16, + VOP3Op.V_MIN3_U16: _VOP3Op_V_MIN3_U16, + VOP3Op.V_MAX3_F16: _VOP3Op_V_MAX3_F16, + VOP3Op.V_MAX3_I16: _VOP3Op_V_MAX3_I16, + VOP3Op.V_MAX3_U16: _VOP3Op_V_MAX3_U16, + VOP3Op.V_MED3_F16: _VOP3Op_V_MED3_F16, + VOP3Op.V_MED3_I16: _VOP3Op_V_MED3_I16, + VOP3Op.V_MED3_U16: _VOP3Op_V_MED3_U16, + VOP3Op.V_MAD_I16: _VOP3Op_V_MAD_I16, + VOP3Op.V_DIV_FIXUP_F16: _VOP3Op_V_DIV_FIXUP_F16, + VOP3Op.V_ADD3_U32: _VOP3Op_V_ADD3_U32, + VOP3Op.V_LSHL_OR_B32: _VOP3Op_V_LSHL_OR_B32, + VOP3Op.V_AND_OR_B32: _VOP3Op_V_AND_OR_B32, + VOP3Op.V_OR3_B32: _VOP3Op_V_OR3_B32, + VOP3Op.V_MAD_U32_U16: _VOP3Op_V_MAD_U32_U16, + VOP3Op.V_MAD_I32_I16: _VOP3Op_V_MAD_I32_I16, + VOP3Op.V_CNDMASK_B16: _VOP3Op_V_CNDMASK_B16, + VOP3Op.V_MAXMIN_F32: _VOP3Op_V_MAXMIN_F32, + VOP3Op.V_MINMAX_F32: _VOP3Op_V_MINMAX_F32, + VOP3Op.V_MAXMIN_F16: _VOP3Op_V_MAXMIN_F16, + VOP3Op.V_MINMAX_F16: _VOP3Op_V_MINMAX_F16, + VOP3Op.V_MAXMIN_U32: _VOP3Op_V_MAXMIN_U32, + VOP3Op.V_MINMAX_U32: _VOP3Op_V_MINMAX_U32, + VOP3Op.V_MAXMIN_I32: _VOP3Op_V_MAXMIN_I32, + VOP3Op.V_MINMAX_I32: _VOP3Op_V_MINMAX_I32, + VOP3Op.V_DOT2_F16_F16: _VOP3Op_V_DOT2_F16_F16, + VOP3Op.V_ADD_NC_U16: _VOP3Op_V_ADD_NC_U16, + VOP3Op.V_SUB_NC_U16: _VOP3Op_V_SUB_NC_U16, + VOP3Op.V_MUL_LO_U16: _VOP3Op_V_MUL_LO_U16, + VOP3Op.V_CVT_PK_I16_F32: _VOP3Op_V_CVT_PK_I16_F32, + VOP3Op.V_CVT_PK_U16_F32: _VOP3Op_V_CVT_PK_U16_F32, + VOP3Op.V_MAX_U16: _VOP3Op_V_MAX_U16, + VOP3Op.V_MAX_I16: _VOP3Op_V_MAX_I16, + VOP3Op.V_MIN_U16: _VOP3Op_V_MIN_U16, + VOP3Op.V_MIN_I16: _VOP3Op_V_MIN_I16, + VOP3Op.V_ADD_NC_I16: _VOP3Op_V_ADD_NC_I16, + VOP3Op.V_SUB_NC_I16: _VOP3Op_V_SUB_NC_I16, + VOP3Op.V_PACK_B32_F16: _VOP3Op_V_PACK_B32_F16, + VOP3Op.V_CVT_PK_NORM_I16_F16: _VOP3Op_V_CVT_PK_NORM_I16_F16, + VOP3Op.V_CVT_PK_NORM_U16_F16: _VOP3Op_V_CVT_PK_NORM_U16_F16, + VOP3Op.V_LDEXP_F32: _VOP3Op_V_LDEXP_F32, + VOP3Op.V_BFM_B32: _VOP3Op_V_BFM_B32, + VOP3Op.V_BCNT_U32_B32: _VOP3Op_V_BCNT_U32_B32, + VOP3Op.V_CVT_PK_NORM_I16_F32: _VOP3Op_V_CVT_PK_NORM_I16_F32, + VOP3Op.V_CVT_PK_NORM_U16_F32: _VOP3Op_V_CVT_PK_NORM_U16_F32, + VOP3Op.V_CVT_PK_U16_U32: _VOP3Op_V_CVT_PK_U16_U32, + VOP3Op.V_CVT_PK_I16_I32: _VOP3Op_V_CVT_PK_I16_I32, + VOP3Op.V_SUB_NC_I32: _VOP3Op_V_SUB_NC_I32, + 
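# A handler can be exercised straight from this table; a minimal lookup-and-call + # sketch (argument values illustrative, not the emulator's real driver loop): + #   fn = VOP3Op_FUNCTIONS[VOP3Op.V_ADD3_U32] + #   out = fn(s0=1, s1=2, s2=3, d0=0, scc=0, vcc=0, lane=0, exec_mask=1, literal=0, VGPR=None, _vars={}) + #   assert out['d0'] == 6 + 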
VOP3Op.V_ADD_NC_I32: _VOP3Op_V_ADD_NC_I32, + VOP3Op.V_ADD_F64: _VOP3Op_V_ADD_F64, + VOP3Op.V_MUL_F64: _VOP3Op_V_MUL_F64, + VOP3Op.V_MIN_F64: _VOP3Op_V_MIN_F64, + VOP3Op.V_MAX_F64: _VOP3Op_V_MAX_F64, + VOP3Op.V_LDEXP_F64: _VOP3Op_V_LDEXP_F64, + VOP3Op.V_MUL_LO_U32: _VOP3Op_V_MUL_LO_U32, + VOP3Op.V_MUL_HI_U32: _VOP3Op_V_MUL_HI_U32, + VOP3Op.V_MUL_HI_I32: _VOP3Op_V_MUL_HI_I32, + VOP3Op.V_LSHLREV_B16: _VOP3Op_V_LSHLREV_B16, + VOP3Op.V_LSHRREV_B16: _VOP3Op_V_LSHRREV_B16, + VOP3Op.V_ASHRREV_I16: _VOP3Op_V_ASHRREV_I16, + VOP3Op.V_LSHLREV_B64: _VOP3Op_V_LSHLREV_B64, + VOP3Op.V_LSHRREV_B64: _VOP3Op_V_LSHRREV_B64, + VOP3Op.V_ASHRREV_I64: _VOP3Op_V_ASHRREV_I64, + VOP3Op.V_READLANE_B32: _VOP3Op_V_READLANE_B32, + VOP3Op.V_AND_B16: _VOP3Op_V_AND_B16, + VOP3Op.V_OR_B16: _VOP3Op_V_OR_B16, + VOP3Op.V_XOR_B16: _VOP3Op_V_XOR_B16, +} + +def _VOP3SDOp_V_ADD_CO_CI_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = 64'U(S0.u32) + 64'U(S1.u32) + VCC.u64[laneId].u64; + # VCC.u64[laneId] = tmp >= 0x100000000ULL ? 1'1U : 1'0U; + # // VCC is an UNSIGNED overflow/carry-out for V_ADD_CO_CI_U32. + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg((S0.u32) + (S1.u32) + VCC.u64[laneId]) + VCC.u64[laneId] = ((1) if (tmp >= 0x100000000) else (0)) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3SDOp_V_SUB_CO_CI_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = S0.u32 - S1.u32 - VCC.u64[laneId].u32; + # VCC.u64[laneId] = 64'U(S1.u32) + VCC.u64[laneId].u64 > 64'U(S0.u32) ? 1'1U : 1'0U; + # // VCC is an UNSIGNED overflow/carry-out for V_SUB_CO_CI_U32. + # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S0.u32 - S1.u32 - VCC.u64[laneId]) + VCC.u64[laneId] = ((1) if ((S1.u32) + VCC.u64[laneId] > (S0.u32)) else (0)) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3SDOp_V_SUBREV_CO_CI_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp = S1.u32 - S0.u32 - VCC.u64[laneId].u32; + # VCC.u64[laneId] = 64'U(S0.u32) + VCC.u64[laneId].u64 > 64'U(S1.u32) ? 1'1U : 1'0U; + # // VCC is an UNSIGNED overflow/carry-out for V_SUB_CO_CI_U32. 
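+ # Worked carry-chain example (values illustrative): with S0 = 0xffffffff, S1 = 0 + # and the lane's incoming VCC bit set, tmp = 0x100000000, so D0 wraps to 0 and the + # carry-out is 1 again; a 64-bit add is composed as V_ADD_CO_U32 on the low half + # followed by V_ADD_CO_CI_U32 on the high half.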
+ # D0.u32 = tmp.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + tmp = Reg(S1.u32 - S0.u32 - VCC.u64[laneId]) + VCC.u64[laneId] = ((1) if ((S0.u32) + VCC.u64[laneId] > (S1.u32)) else (0)) + D0.u32 = tmp.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3SDOp_V_DIV_SCALE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # VCC = 0x0LL; + # if ((64'F(S2.f32) == 0.0) || (64'F(S1.f32) == 0.0)) then + # D0.f32 = NAN.f32 + # elsif exponent(S2.f32) - exponent(S1.f32) >= 96 then + # // N/D near MAX_FLOAT_F32 + # VCC = 0x1LL; + # if S0.f32 == S1.f32 then + # // Only scale the denominator + # D0.f32 = ldexp(S0.f32, 64) + # endif + # elsif S1.f32 == DENORM.f32 then + # D0.f32 = ldexp(S0.f32, 64) + # elsif ((1.0 / 64'F(S1.f32) == DENORM.f64) && (S2.f32 / S1.f32 == DENORM.f32)) then + # VCC = 0x1LL; + # if S0.f32 == S1.f32 then + # // Only scale the denominator + # D0.f32 = ldexp(S0.f32, 64) + # endif + # elsif 1.0 / 64'F(S1.f32) == DENORM.f64 then + # D0.f32 = ldexp(S0.f32, -64) + # elsif S2.f32 / S1.f32 == DENORM.f32 then + # VCC = 0x1LL; + # if S0.f32 == S2.f32 then + # // Only scale the numerator + # D0.f32 = ldexp(S0.f32, 64) + # endif + # elsif exponent(S2.f32) <= 23 then + # // Numerator is tiny + # D0.f32 = ldexp(S0.f32, 64) + # endif + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(s0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + VCC = Reg(0x0) + if ((F(S2.f32) == 0.0) or (F(S1.f32) == 0.0)): + D0.f32 = float("nan") + elif exponent(S2.f32) - exponent(S1.f32) >= 96: + VCC = Reg(0x1) + if S0.f32 == S1.f32: + D0.f32 = ldexp(S0.f32, 64) + elif S1.f32 == DENORM.f32: + D0.f32 = ldexp(S0.f32, 64) + elif ((1.0 / F(S1.f32) == DENORM.f64) and (S2.f32 / S1.f32 == DENORM.f32)): + VCC = Reg(0x1) + if S0.f32 == S1.f32: + D0.f32 = ldexp(S0.f32, 64) + elif 1.0 / F(S1.f32) == DENORM.f64: + D0.f32 = ldexp(S0.f32, -64) + elif S2.f32 / S1.f32 == DENORM.f32: + VCC = Reg(0x1) + if S0.f32 == S2.f32: + D0.f32 = ldexp(S0.f32, 64) + elif exponent(S2.f32) <= 23: + D0.f32 = ldexp(S0.f32, 64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + return result + +def _VOP3SDOp_V_DIV_SCALE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # VCC = 0x0LL; + # if ((S2.f64 == 0.0) || (S1.f64 == 0.0)) then + # D0.f64 = NAN.f64 + # elsif exponent(S2.f64) - exponent(S1.f64) >= 768 then + # // N/D near MAX_FLOAT_F64 + # VCC = 0x1LL; + # if S0.f64 == S1.f64 then + # // Only scale the denominator + # D0.f64 = ldexp(S0.f64, 128) + # endif + # elsif S1.f64 == DENORM.f64 then + # D0.f64 = ldexp(S0.f64, 128) + # elsif ((1.0 / S1.f64 == DENORM.f64) && (S2.f64 / S1.f64 == 
+def _VOP3SDOp_V_DIV_SCALE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # VCC = 0x0LL;
+  # if ((S2.f64 == 0.0) || (S1.f64 == 0.0)) then
+  #   D0.f64 = NAN.f64
+  # elsif exponent(S2.f64) - exponent(S1.f64) >= 768 then
+  #   // N/D near MAX_FLOAT_F64
+  #   VCC = 0x1LL;
+  #   if S0.f64 == S1.f64 then
+  #     // Only scale the denominator
+  #     D0.f64 = ldexp(S0.f64, 128)
+  #   endif
+  # elsif S1.f64 == DENORM.f64 then
+  #   D0.f64 = ldexp(S0.f64, 128)
+  # elsif ((1.0 / S1.f64 == DENORM.f64) && (S2.f64 / S1.f64 == DENORM.f64)) then
+  #   VCC = 0x1LL;
+  #   if S0.f64 == S1.f64 then
+  #     // Only scale the denominator
+  #     D0.f64 = ldexp(S0.f64, 128)
+  #   endif
+  # elsif 1.0 / S1.f64 == DENORM.f64 then
+  #   D0.f64 = ldexp(S0.f64, -128)
+  # elsif S2.f64 / S1.f64 == DENORM.f64 then
+  #   VCC = 0x1LL;
+  #   if S0.f64 == S2.f64 then
+  #     // Only scale the numerator
+  #     D0.f64 = ldexp(S0.f64, 128)
+  #   endif
+  # elsif exponent(S2.f64) <= 53 then
+  #   // Numerator is tiny
+  #   D0.f64 = ldexp(S0.f64, 128)
+  # endif
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(s0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  VCC = Reg(0x0)
+  if ((S2.f64 == 0.0) or (S1.f64 == 0.0)):
+    D0.f64 = float("nan")
+  elif exponent(S2.f64) - exponent(S1.f64) >= 768:
+    VCC = Reg(0x1)
+    if S0.f64 == S1.f64:
+      D0.f64 = ldexp(S0.f64, 128)
+  elif S1.f64 == DENORM.f64:
+    D0.f64 = ldexp(S0.f64, 128)
+  elif ((1.0 / S1.f64 == DENORM.f64) and (S2.f64 / S1.f64 == DENORM.f64)):
+    VCC = Reg(0x1)
+    if S0.f64 == S1.f64:
+      D0.f64 = ldexp(S0.f64, 128)
+  elif 1.0 / S1.f64 == DENORM.f64:
+    D0.f64 = ldexp(S0.f64, -128)
+  elif S2.f64 / S1.f64 == DENORM.f64:
+    VCC = Reg(0x1)
+    if S0.f64 == S2.f64:
+      D0.f64 = ldexp(S0.f64, 128)
+  elif exponent(S2.f64) <= 53:
+    D0.f64 = ldexp(S0.f64, 128)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  return result
+
+def _VOP3SDOp_V_MAD_U64_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # { D1.u1, D0.u64 } = 65'B(65'U(S0.u32) * 65'U(S1.u32) + 65'U(S2.u64))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  _full = ((S0.u32) * (S1.u32) + (S2.u64))
+  D0.u64 = int(_full) & 0xffffffffffffffff
+  D1 = Reg((int(_full) >> 64) & 1)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  result['d1'] = D1._val & 1
+  return result
+
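+# Worked example for the 65-bit mad above (a sketch; the VGPR/_vars arguments are
+# unused by this handler):
+#   r = _VOP3SDOp_V_MAD_U64_U32(0xffffffff, 0xffffffff, 0xffffffffffffffff, 0, 0, 0, 0, 1, 0, None, {})
+#   -> r['d0'] == 0xfffffffe00000000 and r['d1'] == 1   (the 65th bit lands in D1)
+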
+def _VOP3SDOp_V_MAD_I64_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # { D1.i1, D0.i64 } = 65'B(65'I(S0.i32) * 65'I(S1.i32) + 65'I(S2.i64))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  _full = ((S0.i32) * (S1.i32) + (S2.i64))
+  D0.u64 = int(_full) & 0xffffffffffffffff
+  D1 = Reg((int(_full) >> 64) & 1)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['d0_64'] = True
+  result['d1'] = D1._val & 1
+  return result
+
+def _VOP3SDOp_V_ADD_CO_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp = 64'U(S0.u32) + 64'U(S1.u32);
+  # VCC.u64[laneId] = tmp >= 0x100000000ULL ? 1'1U : 1'0U;
+  # // VCC is an UNSIGNED overflow/carry-out for V_ADD_CO_CI_U32.
+  # D0.u32 = tmp.u32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp = Reg((S0.u32) + (S1.u32))
+  VCC.u64[laneId] = ((1) if (tmp >= 0x100000000) else (0))
+  D0.u32 = tmp.u32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3SDOp_V_SUB_CO_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp = S0.u32 - S1.u32;
+  # VCC.u64[laneId] = S1.u32 > S0.u32 ? 1'1U : 1'0U;
+  # // VCC is an UNSIGNED overflow/carry-out for V_SUB_CO_CI_U32.
+  # D0.u32 = tmp.u32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp = Reg(S0.u32 - S1.u32)
+  VCC.u64[laneId] = ((1) if (S1.u32 > S0.u32) else (0))
+  D0.u32 = tmp.u32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3SDOp_V_SUBREV_CO_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp = S1.u32 - S0.u32;
+  # VCC.u64[laneId] = S0.u32 > S1.u32 ? 1'1U : 1'0U;
+  # // VCC is an UNSIGNED overflow/carry-out for V_SUB_CO_CI_U32.
+  # D0.u32 = tmp.u32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp = Reg(S1.u32 - S0.u32)
+  VCC.u64[laneId] = ((1) if (S0.u32 > S1.u32) else (0))
+  D0.u32 = tmp.u32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+VOP3SDOp_FUNCTIONS = {
+  VOP3SDOp.V_ADD_CO_CI_U32: _VOP3SDOp_V_ADD_CO_CI_U32,
+  VOP3SDOp.V_SUB_CO_CI_U32: _VOP3SDOp_V_SUB_CO_CI_U32,
+  VOP3SDOp.V_SUBREV_CO_CI_U32: _VOP3SDOp_V_SUBREV_CO_CI_U32,
+  VOP3SDOp.V_DIV_SCALE_F32: _VOP3SDOp_V_DIV_SCALE_F32,
+  VOP3SDOp.V_DIV_SCALE_F64: _VOP3SDOp_V_DIV_SCALE_F64,
+  VOP3SDOp.V_MAD_U64_U32: _VOP3SDOp_V_MAD_U64_U32,
+  VOP3SDOp.V_MAD_I64_I32: _VOP3SDOp_V_MAD_I64_I32,
+  VOP3SDOp.V_ADD_CO_U32: _VOP3SDOp_V_ADD_CO_U32,
+  VOP3SDOp.V_SUB_CO_U32: _VOP3SDOp_V_SUB_CO_U32,
+  VOP3SDOp.V_SUBREV_CO_U32: _VOP3SDOp_V_SUBREV_CO_U32,
+}
+
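+# The VOP3P handlers below operate on the two packed 16-bit halves of each 32-bit
+# source independently and repack them. A quick check (assuming the half-word
+# slice writes behave as elsewhere in this file):
+#   _VOP3POp_V_PK_ADD_I16(0x00020001, 0x00030004, 0, 0, 0, 0, 0, 1, 0, None, {})['d0']
+#   -> 0x00050005   (2+3 in the high half, 1+4 in the low half)
+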
+def _VOP3POp_V_PK_MAD_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].i16 = S0[31 : 16].i16 * S1[31 : 16].i16 + S2[31 : 16].i16;
+  # tmp[15 : 0].i16 = S0[15 : 0].i16 * S1[15 : 0].i16 + S2[15 : 0].i16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].i16 = S0[31 : 16].i16 * S1[31 : 16].i16 + S2[31 : 16].i16
+  tmp[15 : 0].i16 = S0[15 : 0].i16 * S1[15 : 0].i16 + S2[15 : 0].i16
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_MUL_LO_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].u16 = S0[31 : 16].u16 * S1[31 : 16].u16;
+  # tmp[15 : 0].u16 = S0[15 : 0].u16 * S1[15 : 0].u16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].u16 = S0[31 : 16].u16 * S1[31 : 16].u16
+  tmp[15 : 0].u16 = S0[15 : 0].u16 * S1[15 : 0].u16
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_ADD_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].i16 = S0[31 : 16].i16 + S1[31 : 16].i16;
+  # tmp[15 : 0].i16 = S0[15 : 0].i16 + S1[15 : 0].i16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].i16 = S0[31 : 16].i16 + S1[31 : 16].i16
+  tmp[15 : 0].i16 = S0[15 : 0].i16 + S1[15 : 0].i16
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_SUB_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].i16 = S0[31 : 16].i16 - S1[31 : 16].i16;
+  # tmp[15 : 0].i16 = S0[15 : 0].i16 - S1[15 : 0].i16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].i16 = S0[31 : 16].i16 - S1[31 : 16].i16
+  tmp[15 : 0].i16 = S0[15 : 0].i16 - S1[15 : 0].i16
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_LSHLREV_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].u16 = (S1[31 : 16].u16 << S0.u32[19 : 16].u32);
+  # tmp[15 : 0].u16 = (S1[15 : 0].u16 << S0.u32[3 : 0].u32);
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].u16 = (S1[31 : 16].u16 << S0.u32[19 : 16].u32)
+  tmp[15 : 0].u16 = (S1[15 : 0].u16 << S0.u32[3 : 0].u32)
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_LSHRREV_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].u16 = (S1[31 : 16].u16 >> S0.u32[19 : 16].u32);
+  # tmp[15 : 0].u16 = (S1[15 : 0].u16 >> S0.u32[3 : 0].u32);
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].u16 = (S1[31 : 16].u16 >> S0.u32[19 : 16].u32)
+  tmp[15 : 0].u16 = (S1[15 : 0].u16 >> S0.u32[3 : 0].u32)
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_ASHRREV_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].i16 = (S1[31 : 16].i16 >> S0.u32[19 : 16].u32);
+  # tmp[15 : 0].i16 = (S1[15 : 0].i16 >> S0.u32[3 : 0].u32);
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].i16 = (S1[31 : 16].i16 >> S0.u32[19 : 16].u32)
+  tmp[15 : 0].i16 = (S1[15 : 0].i16 >> S0.u32[3 : 0].u32)
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_MAX_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].i16 = S0[31 : 16].i16 >= S1[31 : 16].i16 ? S0[31 : 16].i16 : S1[31 : 16].i16;
+  # tmp[15 : 0].i16 = S0[15 : 0].i16 >= S1[15 : 0].i16 ? S0[15 : 0].i16 : S1[15 : 0].i16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].i16 = ((S0[31 : 16].i16) if (S0[31 : 16].i16 >= S1[31 : 16].i16) else (S1[31 : 16].i16))
+  tmp[15 : 0].i16 = ((S0[15 : 0].i16) if (S0[15 : 0].i16 >= S1[15 : 0].i16) else (S1[15 : 0].i16))
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_MIN_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].i16 = S0[31 : 16].i16 < S1[31 : 16].i16 ? S0[31 : 16].i16 : S1[31 : 16].i16;
+  # tmp[15 : 0].i16 = S0[15 : 0].i16 < S1[15 : 0].i16 ? S0[15 : 0].i16 : S1[15 : 0].i16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].i16 = ((S0[31 : 16].i16) if (S0[31 : 16].i16 < S1[31 : 16].i16) else (S1[31 : 16].i16))
+  tmp[15 : 0].i16 = ((S0[15 : 0].i16) if (S0[15 : 0].i16 < S1[15 : 0].i16) else (S1[15 : 0].i16))
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_MAD_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].u16 = S0[31 : 16].u16 * S1[31 : 16].u16 + S2[31 : 16].u16;
+  # tmp[15 : 0].u16 = S0[15 : 0].u16 * S1[15 : 0].u16 + S2[15 : 0].u16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].u16 = S0[31 : 16].u16 * S1[31 : 16].u16 + S2[31 : 16].u16
+  tmp[15 : 0].u16 = S0[15 : 0].u16 * S1[15 : 0].u16 + S2[15 : 0].u16
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_ADD_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].u16 = S0[31 : 16].u16 + S1[31 : 16].u16;
+  # tmp[15 : 0].u16 = S0[15 : 0].u16 + S1[15 : 0].u16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].u16 = S0[31 : 16].u16 + S1[31 : 16].u16
+  tmp[15 : 0].u16 = S0[15 : 0].u16 + S1[15 : 0].u16
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_SUB_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].u16 = S0[31 : 16].u16 - S1[31 : 16].u16;
+  # tmp[15 : 0].u16 = S0[15 : 0].u16 - S1[15 : 0].u16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].u16 = S0[31 : 16].u16 - S1[31 : 16].u16
+  tmp[15 : 0].u16 = S0[15 : 0].u16 - S1[15 : 0].u16
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_MAX_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].u16 = S0[31 : 16].u16 >= S1[31 : 16].u16 ? S0[31 : 16].u16 : S1[31 : 16].u16;
+  # tmp[15 : 0].u16 = S0[15 : 0].u16 >= S1[15 : 0].u16 ? S0[15 : 0].u16 : S1[15 : 0].u16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].u16 = ((S0[31 : 16].u16) if (S0[31 : 16].u16 >= S1[31 : 16].u16) else (S1[31 : 16].u16))
+  tmp[15 : 0].u16 = ((S0[15 : 0].u16) if (S0[15 : 0].u16 >= S1[15 : 0].u16) else (S1[15 : 0].u16))
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_MIN_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].u16 = S0[31 : 16].u16 < S1[31 : 16].u16 ? S0[31 : 16].u16 : S1[31 : 16].u16;
+  # tmp[15 : 0].u16 = S0[15 : 0].u16 < S1[15 : 0].u16 ? S0[15 : 0].u16 : S1[15 : 0].u16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].u16 = ((S0[31 : 16].u16) if (S0[31 : 16].u16 < S1[31 : 16].u16) else (S1[31 : 16].u16))
+  tmp[15 : 0].u16 = ((S0[15 : 0].u16) if (S0[15 : 0].u16 < S1[15 : 0].u16) else (S1[15 : 0].u16))
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
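+# Signedness matters per half-word: for inputs 0x8000 and 0x0001, V_PK_MAX_I16
+# picks 0x0001 (0x8000 is -32768 as i16) while V_PK_MAX_U16 picks 0x8000.
+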
+def _VOP3POp_V_PK_FMA_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # declare tmp : 32'B;
+  # tmp[31 : 16].f16 = fma(S0[31 : 16].f16, S1[31 : 16].f16, S2[31 : 16].f16);
+  # tmp[15 : 0].f16 = fma(S0[15 : 0].f16, S1[15 : 0].f16, S2[15 : 0].f16);
+  # D0.b32 = tmp
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].f16 = fma(S0[31 : 16].f16, S1[31 : 16].f16, S2[31 : 16].f16)
+  tmp[15 : 0].f16 = fma(S0[15 : 0].f16, S1[15 : 0].f16, S2[15 : 0].f16)
+  D0.b32 = tmp
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_ADD_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].f16 = S0[31 : 16].f16 + S1[31 : 16].f16;
+  # tmp[15 : 0].f16 = S0[15 : 0].f16 + S1[15 : 0].f16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].f16 = S0[31 : 16].f16 + S1[31 : 16].f16
+  tmp[15 : 0].f16 = S0[15 : 0].f16 + S1[15 : 0].f16
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_MUL_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].f16 = S0[31 : 16].f16 * S1[31 : 16].f16;
+  # tmp[15 : 0].f16 = S0[15 : 0].f16 * S1[15 : 0].f16;
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].f16 = S0[31 : 16].f16 * S1[31 : 16].f16
+  tmp[15 : 0].f16 = S0[15 : 0].f16 * S1[15 : 0].f16
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_MIN_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].f16 = v_min_f16(S0[31 : 16].f16, S1[31 : 16].f16);
+  # tmp[15 : 0].f16 = v_min_f16(S0[15 : 0].f16, S1[15 : 0].f16);
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].f16 = v_min_f16(S0[31 : 16].f16, S1[31 : 16].f16)
+  tmp[15 : 0].f16 = v_min_f16(S0[15 : 0].f16, S1[15 : 0].f16)
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_PK_MAX_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp[31 : 16].f16 = v_max_f16(S0[31 : 16].f16, S1[31 : 16].f16);
+  # tmp[15 : 0].f16 = v_max_f16(S0[15 : 0].f16, S1[15 : 0].f16);
+  # D0.b32 = tmp.b32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp[31 : 16].f16 = v_max_f16(S0[31 : 16].f16, S1[31 : 16].f16)
+  tmp[15 : 0].f16 = v_max_f16(S0[15 : 0].f16, S1[15 : 0].f16)
+  D0.b32 = tmp.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3POp_V_DOT2_F32_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp = S2.f32;
+  # tmp += f16_to_f32(S0[15 : 0].f16) * f16_to_f32(S1[15 : 0].f16);
+  # tmp += f16_to_f32(S0[31 : 16].f16) * f16_to_f32(S1[31 : 16].f16);
+  # D0.f32 = tmp
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  tmp = Reg(S2.f32)
+  tmp += f16_to_f32(S0[15 : 0].f16) * f16_to_f32(S1[15 : 0].f16)
+  tmp += f16_to_f32(S0[31 : 16].f16) * f16_to_f32(S1[31 : 16].f16)
+  D0.f32 = tmp
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+VOP3POp_FUNCTIONS = {
+  VOP3POp.V_PK_MAD_I16: _VOP3POp_V_PK_MAD_I16,
+  VOP3POp.V_PK_MUL_LO_U16: _VOP3POp_V_PK_MUL_LO_U16,
+  VOP3POp.V_PK_ADD_I16: _VOP3POp_V_PK_ADD_I16,
+  VOP3POp.V_PK_SUB_I16: _VOP3POp_V_PK_SUB_I16,
+  VOP3POp.V_PK_LSHLREV_B16: _VOP3POp_V_PK_LSHLREV_B16,
+  VOP3POp.V_PK_LSHRREV_B16: _VOP3POp_V_PK_LSHRREV_B16,
+  VOP3POp.V_PK_ASHRREV_I16: _VOP3POp_V_PK_ASHRREV_I16,
+  VOP3POp.V_PK_MAX_I16: _VOP3POp_V_PK_MAX_I16,
+  VOP3POp.V_PK_MIN_I16: _VOP3POp_V_PK_MIN_I16,
+  VOP3POp.V_PK_MAD_U16: _VOP3POp_V_PK_MAD_U16,
+  VOP3POp.V_PK_ADD_U16: _VOP3POp_V_PK_ADD_U16,
+  VOP3POp.V_PK_SUB_U16: _VOP3POp_V_PK_SUB_U16,
+  VOP3POp.V_PK_MAX_U16: _VOP3POp_V_PK_MAX_U16,
+  VOP3POp.V_PK_MIN_U16: _VOP3POp_V_PK_MIN_U16,
+  VOP3POp.V_PK_FMA_F16: _VOP3POp_V_PK_FMA_F16,
+  VOP3POp.V_PK_ADD_F16: _VOP3POp_V_PK_ADD_F16,
+  VOP3POp.V_PK_MUL_F16: _VOP3POp_V_PK_MUL_F16,
+  VOP3POp.V_PK_MIN_F16: _VOP3POp_V_PK_MIN_F16,
+  VOP3POp.V_PK_MAX_F16: _VOP3POp_V_PK_MAX_F16,
+  VOP3POp.V_DOT2_F32_F16: _VOP3POp_V_DOT2_F32_F16,
+}
+
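+# The VOPC handlers below return the full 64-bit lane mask in 'd0' (flagged with
+# 'd0_64') and this lane's bit in 'vcc_lane'; presumably the emulator merges that
+# bit into VCC (or the SDST of a VOP3-encoded compare) under the EXEC mask.
+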
+def _VOPCOp_V_CMP_F_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 0. Store the result into VCC or a scalar register.
+  # D0.u64[laneId] = 1'0U;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = 0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_LT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = S0.f16 < S1.f16;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.f16 < S1.f16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_EQ_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = S0.f16 == S1.f16;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.f16 == S1.f16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
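+# Quick check with raw f16 bit patterns (1.0 = 0x3c00, 2.0 = 0x4000):
+#   _VOPCOp_V_CMP_LT_F16(0x3c00, 0x4000, 0, 0, 0, 0, 5, 1 << 5, 0, None, {})['vcc_lane'] == 1
+# i.e. lane 5 sets bit 5 of the compare mask.
+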
+def _VOPCOp_V_CMP_LE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u64[laneId] = S0.f16 <= S1.f16;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.f16 <= S1.f16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_GT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = S0.f16 > S1.f16;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.f16 > S1.f16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_LG_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u64[laneId] = S0.f16 <> S1.f16;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.f16 != S1.f16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_GE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u64[laneId] = S0.f16 >= S1.f16;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.f16 >= S1.f16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_O_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is orderable to the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = (!isNAN(64'F(S0.f16)) && !isNAN(64'F(S1.f16)));
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = ( not isNAN(F(S0.f16)) and not isNAN(F(S1.f16)))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_U_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is not orderable to the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = (isNAN(64'F(S0.f16)) || isNAN(64'F(S1.f16)));
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = (isNAN(F(S0.f16)) or isNAN(F(S1.f16)))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
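+# With a NaN input (f16 NaN pattern 0x7e00), V_CMP_O returns 0 and V_CMP_U returns
+# 1 for the lane: the ordered/unordered pair partitions NaN cases between them.
+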
+def _VOPCOp_V_CMP_NGE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u64[laneId] = !(S0.f16 >= S1.f16);
+  # // With NAN inputs this is not the same operation as <
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = not (S0.f16 >= S1.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_NLG_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u64[laneId] = !(S0.f16 <> S1.f16);
+  # // With NAN inputs this is not the same operation as ==
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = not (S0.f16 != S1.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_NGT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is not greater than the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = !(S0.f16 > S1.f16);
+  # // With NAN inputs this is not the same operation as <=
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = not (S0.f16 > S1.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
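+# The NGE/NLG/NGT/NLE/NEQ/NLT forms are IEEE negations of the base compares, not
+# operand swaps: with S0 = NaN, V_CMP_GE_F16 gives 0 while V_CMP_NLT_F16 gives 1,
+# because not(NaN < x) is true while (NaN >= x) is false.
+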
+def _VOPCOp_V_CMP_NLE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u64[laneId] = !(S0.f16 <= S1.f16);
+  # // With NAN inputs this is not the same operation as >
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = not (S0.f16 <= S1.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_NEQ_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = !(S0.f16 == S1.f16);
+  # // With NAN inputs this is not the same operation as !=
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = not (S0.f16 == S1.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_NLT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is not less than the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = !(S0.f16 < S1.f16);
+  # // With NAN inputs this is not the same operation as >=
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = not (S0.f16 < S1.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_T_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1. Store the result into VCC or a scalar register.
+  # D0.u64[laneId] = 1'1U;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = 1
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_F_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 0. Store the result into VCC or a scalar register.
+  # D0.u64[laneId] = 1'0U;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = 0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_LT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = S0.f32 < S1.f32;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.f32 < S1.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
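+# Same pattern at 32 bits (1.0f = 0x3f800000, 2.0f = 0x40000000):
+#   _VOPCOp_V_CMP_LT_F32(0x3f800000, 0x40000000, 0, 0, 0, 0, 0, 1, 0, None, {})['vcc_lane'] == 1
+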
+def _VOPCOp_V_CMP_EQ_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = S0.f32 == S1.f32;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.f32 == S1.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_LE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u64[laneId] = S0.f32 <= S1.f32;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.f32 <= S1.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_GT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = S0.f32 > S1.f32;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.f32 > S1.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_LG_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u64[laneId] = S0.f32 <> S1.f32;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.f32 != S1.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_GE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u64[laneId] = S0.f32 >= S1.f32;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.f32 >= S1.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_O_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is orderable to the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = (!isNAN(64'F(S0.f32)) && !isNAN(64'F(S1.f32)));
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = ( not isNAN(F(S0.f32)) and not isNAN(F(S1.f32)))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_U_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is not orderable to the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = (isNAN(64'F(S0.f32)) || isNAN(64'F(S1.f32)));
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = (isNAN(F(S0.f32)) or isNAN(F(S1.f32)))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_NGE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u64[laneId] = !(S0.f32 >= S1.f32);
+  # // With NAN inputs this is not the same operation as <
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = not (S0.f32 >= S1.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_NLG_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u64[laneId] = !(S0.f32 <> S1.f32);
+  # // With NAN inputs this is not the same operation as ==
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = not (S0.f32 != S1.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_NGT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is not greater than the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = !(S0.f32 > S1.f32);
+  # // With NAN inputs this is not the same operation as <=
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = not (S0.f32 > S1.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_NLE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u64[laneId] = !(S0.f32 <= S1.f32);
+  # // With NAN inputs this is not the same operation as >
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = not (S0.f32 <= S1.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_NEQ_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into
+  # VCC or a scalar register.
+  # D0.u64[laneId] = !(S0.f32 == S1.f32);
+  # // With NAN inputs this is not the same operation as !=
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = not (S0.f32 == S1.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = not (S0.f32 < S1.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_T_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1. Store the result into VCC or a scalar register. + # D0.u64[laneId] = 1'1U; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_F_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 0. Store the result into VCC or a scalar register. + # D0.u64[laneId] = 1'0U; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = 0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.f64 < S1.f64; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.f64 < S1.f64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_EQ_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.f64 == S1.f64; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.f64 == S1.f64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.f64 <= S1.f64; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.f64 <= S1.f64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC + # D0.u64[laneId] = S0.f64 > S1.f64; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.f64 > S1.f64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LG_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.f64 <> S1.f64; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.f64 != S1.f64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.f64 >= S1.f64; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.f64 >= S1.f64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_O_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is orderable to the second input. Store the result into VCC + # D0.u64[laneId] = (!isNAN(S0.f64) && !isNAN(S1.f64)); + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = ( not isNAN(S0.f64) and not isNAN(S1.f64)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_U_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # VCC or a scalar register. + # D0.u64[laneId] = (isNAN(S0.f64) || isNAN(S1.f64)); + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = (isNAN(S0.f64) or isNAN(S1.f64)) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_NGE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = !(S0.f64 >= S1.f64); + # // With NAN inputs this is not the same operation as < + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = not (S0.f64 >= S1.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_NLG_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = !(S0.f64 <> S1.f64); + # // With NAN inputs this is not the same operation as == + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = not (S0.f64 != S1.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_NGT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # VCC or a scalar register. + # D0.u64[laneId] = !(S0.f64 > S1.f64); + # // With NAN inputs this is not the same operation as <= + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = not (S0.f64 > S1.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_NLE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = !(S0.f64 <= S1.f64); + # // With NAN inputs this is not the same operation as > + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = not (S0.f64 <= S1.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_NEQ_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC + # D0.u64[laneId] = !(S0.f64 == S1.f64); + # // With NAN inputs this is not the same operation as != + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = not (S0.f64 == S1.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_NLT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is not less than the second input. Store the result into VCC + # D0.u64[laneId] = !(S0.f64 < S1.f64); + # // With NAN inputs this is not the same operation as >= + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = not (S0.f64 < S1.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_T_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1. Store the result into VCC or a scalar register. + # D0.u64[laneId] = 1'1U; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LT_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.i16 < S1.i16; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i16 < S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_EQ_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.i16 == S1.i16; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i16 == S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LE_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.i16 <= S1.i16; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i16 <= S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GT_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC + # D0.u64[laneId] = S0.i16 > S1.i16; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i16 > S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_NE_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC + # D0.u64[laneId] = S0.i16 <> S1.i16; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i16 != S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GE_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.i16 >= S1.i16; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i16 >= S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LT_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.u16 < S1.u16; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u16 < S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_EQ_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.u16 == S1.u16; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u16 == S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LE_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.u16 <= S1.u16; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u16 <= S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GT_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC + # D0.u64[laneId] = S0.u16 > S1.u16; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u16 > S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_NE_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC + # D0.u64[laneId] = S0.u16 <> S1.u16; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u16 != S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GE_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.u16 >= S1.u16; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u16 >= S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_F_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 0. Store the result into VCC or a scalar register. + # D0.u64[laneId] = 1'0U; + # // D0 = VCC in VOPC encoding. 
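The I16/U16 pairs above differ only in how the same low 16 register bits are read. Assuming the `Reg` helper exposes `.u16` as the zero-extended and `.i16` as the sign-extended low half (consistent with how the compare bodies use them), the distinction reduces to plain bit arithmetic, as in this sketch:

```python
def as_u16(v: int) -> int:
  return v & 0xffff

def as_i16(v: int) -> int:
  u = v & 0xffff
  return u - 0x10000 if u & 0x8000 else u

a, b = 0xffff, 0x0001               # identical raw 16-bit payloads...
assert not (as_u16(a) < as_u16(b))  # V_CMP_LT_U16: 65535 < 1 -> 0
assert as_i16(a) < as_i16(b)        # V_CMP_LT_I16:    -1 < 1 -> 1
```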
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = 0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LT_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.i32 < S1.i32; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i32 < S1.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_EQ_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.i32 == S1.i32; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i32 == S1.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.i32 <= S1.i32; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i32 <= S1.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GT_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC + # D0.u64[laneId] = S0.i32 > S1.i32; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i32 > S1.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_NE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC + # D0.u64[laneId] = S0.i32 <> S1.i32; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i32 != S1.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.i32 >= S1.i32; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i32 >= S1.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_T_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1. Store the result into VCC or a scalar register. + # D0.u64[laneId] = 1'1U; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_F_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 0. Store the result into VCC or a scalar register. + # D0.u64[laneId] = 1'0U; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = 0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LT_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.u32 < S1.u32; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u32 < S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_EQ_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.u32 == S1.u32; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u32 == S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.u32 <= S1.u32; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u32 <= S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GT_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC + # D0.u64[laneId] = S0.u32 > S1.u32; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u32 > S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_NE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC + # D0.u64[laneId] = S0.u32 <> S1.u32; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u32 != S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.u32 >= S1.u32; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u32 >= S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_T_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1. Store the result into VCC or a scalar register. + # D0.u64[laneId] = 1'1U; + # // D0 = VCC in VOPC encoding. 
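Every helper in this file returns the same result dict; for VOPC the `d0_64`/`vcc_lane` keys signal that D0 is really a 64-bit lane mask from which the caller takes one bit per lane. A sketch of how a driver loop might fold those per-lane bits into a VCC-style mask while honoring EXEC follows; this is an illustration only, not the actual dispatch loop in `emu.py`:

```python
def gather_vopc(cmp_fn, s0, s1, exec_mask, wave=32):
  # cmp_fn is any per-lane predicate, e.g. v_cmp_nge_f32 from the sketch above
  vcc = 0
  for lane in range(wave):
    if (exec_mask >> lane) & 1 and cmp_fn(s0[lane], s1[lane]):
      vcc |= 1 << lane  # inactive lanes leave their bit at 0
  return vcc

assert gather_vopc(lambda a, b: a < b, [0, 5, 9], [1, 1, 1], 0b111) == 0b001
```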
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_F_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 0. Store the result into VCC or a scalar register. + # D0.u64[laneId] = 1'0U; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = 0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LT_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.i64 < S1.i64; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i64 < S1.i64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_EQ_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.i64 == S1.i64; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i64 == S1.i64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LE_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.i64 <= S1.i64; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i64 <= S1.i64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GT_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC + # D0.u64[laneId] = S0.i64 > S1.i64; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i64 > S1.i64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_NE_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC + # D0.u64[laneId] = S0.i64 <> S1.i64; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i64 != S1.i64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GE_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.i64 >= S1.i64; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.i64 >= S1.i64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_T_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1. Store the result into VCC or a scalar register. + # D0.u64[laneId] = 1'1U; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_F_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 0. Store the result into VCC or a scalar register. + # D0.u64[laneId] = 1'0U; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = 0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LT_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.u64 < S1.u64; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u64 < S1.u64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_EQ_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a + # D0.u64[laneId] = S0.u64 == S1.u64; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u64 == S1.u64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_LE_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.u64 <= S1.u64; + # // D0 = VCC in VOPC encoding. 
+ S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u64 <= S1.u64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GT_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC + # D0.u64[laneId] = S0.u64 > S1.u64; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u64 > S1.u64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_NE_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC + # D0.u64[laneId] = S0.u64 <> S1.u64; + # // D0 = VCC in VOPC encoding. + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + D0.u64[laneId] = S0.u64 != S1.u64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + if EXEC._val != exec_mask: result['exec'] = EXEC._val + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMP_GE_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u64[laneId] = S0.u64 >= S1.u64; + # // D0 = VCC in VOPC encoding. 
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = S0.u64 >= S1.u64
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_T_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1. Store the result into VCC or a scalar register.
+  # D0.u64[laneId] = 1'1U;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  D0.u64[laneId] = 1
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_CLASS_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # half-precision float, and set the per-lane condition code to the result. Store the result into VCC or a scalar
+  # S1.u[0] value is a signaling NAN.
+  # S1.u[1] value is a quiet NAN.
+  # S1.u[2] value is negative infinity.
+  # S1.u[3] value is a negative normal value.
+  # S1.u[4] value is a negative denormal value.
+  # S1.u[5] value is negative zero.
+  # S1.u[6] value is positive zero.
+  # S1.u[7] value is a positive denormal value.
+  # S1.u[8] value is a positive normal value.
+  # S1.u[9] value is positive infinity.
+  # declare result : 1'U;
+  # if isSignalNAN(64'F(S0.f16)) then
+  # result = S1.u32[0]
+  # elsif isQuietNAN(64'F(S0.f16)) then
+  # result = S1.u32[1]
+  # elsif exponent(S0.f16) == 31 then
+  # // +-INF
+  # result = S1.u32[sign(S0.f16) ? 2 : 9]
+  # elsif exponent(S0.f16) > 0 then
+  # // +-normal value
+  # result = S1.u32[sign(S0.f16) ? 3 : 8]
+  # elsif 64'F(abs(S0.f16)) > 0.0 then
+  # // +-denormal value
+  # result = S1.u32[sign(S0.f16) ? 4 : 7]
+  # else
+  # // +-0.0
+  # result = S1.u32[sign(S0.f16) ? 5 : 6]
+  # endif;
+  # D0.u64[laneId] = result;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  if isSignalNAN(F(S0.f16)):
+    result = S1.u32[0]
+  elif isQuietNAN(F(S0.f16)):
+    result = S1.u32[1]
+  elif exponent(S0.f16) == 31:
+    result = S1.u32[((2) if (sign(S0.f16)) else (9))]
+  elif exponent(S0.f16) > 0:
+    result = S1.u32[((3) if (sign(S0.f16)) else (8))]
+  elif F(abs(S0.f16)) > 0.0:
+    result = S1.u32[((4) if (sign(S0.f16)) else (7))]
+  else:
+    result = S1.u32[((5) if (sign(S0.f16)) else (6))]
+  D0.u64[laneId] = result
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_CLASS_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # single-precision float, and set the per-lane condition code to the result. Store the result into VCC or a scalar
+  # S1.u[0] value is a signaling NAN.
+  # S1.u[1] value is a quiet NAN.
+  # S1.u[2] value is negative infinity.
+  # S1.u[3] value is a negative normal value.
+  # S1.u[4] value is a negative denormal value.
+  # S1.u[5] value is negative zero.
+  # S1.u[6] value is positive zero.
+  # S1.u[7] value is a positive denormal value.
+  # S1.u[8] value is a positive normal value.
+  # S1.u[9] value is positive infinity.
+  # declare result : 1'U;
+  # if isSignalNAN(64'F(S0.f32)) then
+  # result = S1.u32[0]
+  # elsif isQuietNAN(64'F(S0.f32)) then
+  # result = S1.u32[1]
+  # elsif exponent(S0.f32) == 255 then
+  # // +-INF
+  # result = S1.u32[sign(S0.f32) ? 2 : 9]
+  # elsif exponent(S0.f32) > 0 then
+  # // +-normal value
+  # result = S1.u32[sign(S0.f32) ? 3 : 8]
+  # elsif 64'F(abs(S0.f32)) > 0.0 then
+  # // +-denormal value
+  # result = S1.u32[sign(S0.f32) ? 4 : 7]
+  # else
+  # // +-0.0
+  # result = S1.u32[sign(S0.f32) ? 5 : 6]
+  # endif;
+  # D0.u64[laneId] = result;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  if isSignalNAN(F(S0.f32)):
+    result = S1.u32[0]
+  elif isQuietNAN(F(S0.f32)):
+    result = S1.u32[1]
+  elif exponent(S0.f32) == 255:
+    result = S1.u32[((2) if (sign(S0.f32)) else (9))]
+  elif exponent(S0.f32) > 0:
+    result = S1.u32[((3) if (sign(S0.f32)) else (8))]
+  elif F(abs(S0.f32)) > 0.0:
+    result = S1.u32[((4) if (sign(S0.f32)) else (7))]
+  else:
+    result = S1.u32[((5) if (sign(S0.f32)) else (6))]
+  D0.u64[laneId] = result
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMP_CLASS_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # double-precision float, and set the per-lane condition code to the result. Store the result into VCC or a scalar
+  # S1.u[0] value is a signaling NAN.
+  # S1.u[1] value is a quiet NAN.
+  # S1.u[2] value is negative infinity.
+  # S1.u[3] value is a negative normal value.
+  # S1.u[4] value is a negative denormal value.
+  # S1.u[5] value is negative zero.
+  # S1.u[6] value is positive zero.
+  # S1.u[7] value is a positive denormal value.
+  # S1.u[8] value is a positive normal value.
+  # S1.u[9] value is positive infinity.
+  # declare result : 1'U;
+  # if isSignalNAN(S0.f64) then
+  # result = S1.u32[0]
+  # elsif isQuietNAN(S0.f64) then
+  # result = S1.u32[1]
+  # elsif exponent(S0.f64) == 2047 then
+  # // +-INF
+  # result = S1.u32[sign(S0.f64) ? 2 : 9]
+  # elsif exponent(S0.f64) > 0 then
+  # // +-normal value
+  # result = S1.u32[sign(S0.f64) ? 3 : 8]
+  # elsif abs(S0.f64) > 0.0 then
+  # // +-denormal value
+  # result = S1.u32[sign(S0.f64) ? 4 : 7]
+  # else
+  # // +-0.0
+  # result = S1.u32[sign(S0.f64) ? 5 : 6]
+  # endif;
+  # D0.u64[laneId] = result;
+  # // D0 = VCC in VOPC encoding.
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  if isSignalNAN(S0.f64):
+    result = S1.u32[0]
+  elif isQuietNAN(S0.f64):
+    result = S1.u32[1]
+  elif exponent(S0.f64) == 2047:
+    result = S1.u32[((2) if (sign(S0.f64)) else (9))]
+  elif exponent(S0.f64) > 0:
+    result = S1.u32[((3) if (sign(S0.f64)) else (8))]
+  elif abs(S0.f64) > 0.0:
+    result = S1.u32[((4) if (sign(S0.f64)) else (7))]
+  else:
+    result = S1.u32[((5) if (sign(S0.f64)) else (6))]
+  D0.u64[laneId] = result
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  result['vcc_lane'] = (D0._val >> lane) & 1
+  result['d0_64'] = True
+  return result
+
+def _VOPCOp_V_CMPX_F_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = 1'0U
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = 0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f16 < S1.f16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f16 < S1.f16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_EQ_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC
+  # EXEC.u64[laneId] = S0.f16 == S1.f16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f16 == S1.f16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f16 <= S1.f16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f16 <= S1.f16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_GT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f16 > S1.f16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f16 > S1.f16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LG_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f16 <> S1.f16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f16 != S1.f16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_GE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f16 >= S1.f16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f16 >= S1.f16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_O_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = (!isNAN(64'F(S0.f16)) && !isNAN(64'F(S1.f16)))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = ( not isNAN(F(S0.f16)) and not isNAN(F(S1.f16)))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_U_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = (isNAN(64'F(S0.f16)) || isNAN(64'F(S1.f16)))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = (isNAN(F(S0.f16)) or isNAN(F(S1.f16)))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NGE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f16 >= S1.f16);
+  # // With NAN inputs this is not the same operation as <
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f16 >= S1.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NLG_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f16 <> S1.f16);
+  # // With NAN inputs this is not the same operation as ==
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f16 != S1.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
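The V_CMP_CLASS_F16/F32/F64 helpers above treat S1 as a 10-bit class mask indexed by the kind of float in S0. As a standalone sketch of the same classification for single precision (editorial illustration only, not part of the autogen file; `f32_class_bit` is a hypothetical name):

import math, struct

def f32_class_bit(x: float) -> int:
  # Map a float to the S1 mask bit index used by V_CMP_CLASS_F32:
  # 0/1 = signaling/quiet NaN, 2..5 = -inf/-normal/-denormal/-0, 6..9 = +0/+denormal/+normal/+inf.
  bits = struct.unpack('<I', struct.pack('<f', x))[0]
  sign, exp, mant = bits >> 31, (bits >> 23) & 0xff, bits & 0x7fffff
  if exp == 255 and mant: return 1 if (mant >> 22) & 1 else 0  # quiet NaN has the top mantissa bit set
  if exp == 255: return 2 if sign else 9  # +-INF
  if exp > 0: return 3 if sign else 8     # +-normal value
  if mant: return 4 if sign else 7        # +-denormal value
  return 5 if sign else 6                 # +-0.0

assert f32_class_bit(-math.inf) == 2 and f32_class_bit(1.0) == 8 and f32_class_bit(-0.0) == 5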
+
+def _VOPCOp_V_CMPX_NGT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f16 > S1.f16);
+  # // With NAN inputs this is not the same operation as <=
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f16 > S1.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NLE_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f16 <= S1.f16);
+  # // With NAN inputs this is not the same operation as >
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f16 <= S1.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NEQ_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f16 == S1.f16);
+  # // With NAN inputs this is not the same operation as !=
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f16 == S1.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NLT_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f16 < S1.f16);
+  # // With NAN inputs this is not the same operation as >=
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f16 < S1.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_T_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = 1'1U
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = 1
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_F_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = 1'0U
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = 0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f32 < S1.f32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f32 < S1.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_EQ_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC
+  # EXEC.u64[laneId] = S0.f32 == S1.f32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f32 == S1.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f32 <= S1.f32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f32 <= S1.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_GT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f32 > S1.f32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f32 > S1.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LG_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f32 <> S1.f32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f32 != S1.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_GE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f32 >= S1.f32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f32 >= S1.f32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_O_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = (!isNAN(64'F(S0.f32)) && !isNAN(64'F(S1.f32)))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = ( not isNAN(F(S0.f32)) and not isNAN(F(S1.f32)))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_U_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = (isNAN(64'F(S0.f32)) || isNAN(64'F(S1.f32)))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = (isNAN(F(S0.f32)) or isNAN(F(S1.f32)))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NGE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f32 >= S1.f32);
+  # // With NAN inputs this is not the same operation as <
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f32 >= S1.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NLG_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f32 <> S1.f32);
+  # // With NAN inputs this is not the same operation as ==
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f32 != S1.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
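The NGE/NLG/NGT/NLE/NEQ/NLT variants differ from their complements only when a NaN is involved, which is why the generated code negates the comparison rather than flipping the operator. A two-line illustration in plain Python (editorial, not from the autogen file):

nan = float('nan')
assert (nan < 1.0) is False         # an ordered compare like V_CMPX_LT clears the lane bit for NaN
assert (not (nan >= 1.0)) is True   # V_CMPX_NGE sets it: not the same operation as <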
+
+def _VOPCOp_V_CMPX_NGT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f32 > S1.f32);
+  # // With NAN inputs this is not the same operation as <=
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f32 > S1.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NLE_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f32 <= S1.f32);
+  # // With NAN inputs this is not the same operation as >
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f32 <= S1.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NEQ_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f32 == S1.f32);
+  # // With NAN inputs this is not the same operation as !=
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f32 == S1.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NLT_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f32 < S1.f32);
+  # // With NAN inputs this is not the same operation as >=
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f32 < S1.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_T_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = 1'1U
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = 1
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_F_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = 1'0U
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = 0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f64 < S1.f64
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f64 < S1.f64
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_EQ_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC
+  # EXEC.u64[laneId] = S0.f64 == S1.f64
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f64 == S1.f64
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f64 <= S1.f64
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f64 <= S1.f64
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_GT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f64 > S1.f64
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f64 > S1.f64
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LG_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f64 <> S1.f64
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f64 != S1.f64
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_GE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.f64 >= S1.f64
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.f64 >= S1.f64
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_O_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = (!isNAN(S0.f64) && !isNAN(S1.f64))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = ( not isNAN(S0.f64) and not isNAN(S1.f64))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_U_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = (isNAN(S0.f64) || isNAN(S1.f64))
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = (isNAN(S0.f64) or isNAN(S1.f64))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NGE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f64 >= S1.f64);
+  # // With NAN inputs this is not the same operation as <
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f64 >= S1.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NLG_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f64 <> S1.f64);
+  # // With NAN inputs this is not the same operation as ==
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f64 != S1.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NGT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f64 > S1.f64);
+  # // With NAN inputs this is not the same operation as <=
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f64 > S1.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NLE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f64 <= S1.f64);
+  # // With NAN inputs this is not the same operation as >
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f64 <= S1.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NEQ_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f64 == S1.f64);
+  # // With NAN inputs this is not the same operation as !=
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f64 == S1.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NLT_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = !(S0.f64 < S1.f64);
+  # // With NAN inputs this is not the same operation as >=
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = not (S0.f64 < S1.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_T_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = 1'1U
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = 1
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LT_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.i16 < S1.i16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.i16 < S1.i16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_EQ_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC
+  # EXEC.u64[laneId] = S0.i16 == S1.i16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.i16 == S1.i16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LE_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.i16 <= S1.i16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.i16 <= S1.i16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_GT_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.i16 > S1.i16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.i16 > S1.i16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NE_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.i16 <> S1.i16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.i16 != S1.i16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_GE_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.i16 >= S1.i16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.i16 >= S1.i16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LT_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.u16 < S1.u16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.u16 < S1.u16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_EQ_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC
+  # EXEC.u64[laneId] = S0.u16 == S1.u16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.u16 == S1.u16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LE_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.u16 <= S1.u16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.u16 <= S1.u16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_GT_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.u16 > S1.u16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.u16 > S1.u16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NE_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.u16 <> S1.u16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.u16 != S1.u16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_GE_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.u16 >= S1.u16
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.u16 >= S1.u16
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_F_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = 1'0U
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = 0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LT_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.i32 < S1.i32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.i32 < S1.i32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_EQ_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC
+  # EXEC.u64[laneId] = S0.i32 == S1.i32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.i32 == S1.i32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_LE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.i32 <= S1.i32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.i32 <= S1.i32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_GT_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.i32 > S1.i32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.i32 > S1.i32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_NE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.i32 <> S1.i32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.i32 != S1.i32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1
+  result['exec_lane'] = (EXEC._val >> lane) & 1
+  return result
+
+def _VOPCOp_V_CMPX_GE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # EXEC.u64[laneId] = S0.i32 >= S1.i32
+  S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)
+  SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)
+  EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)
+  tmp, saveexec = Reg(0), Reg(exec_mask)
+  laneId = lane
+  SIMM16, SIMM32 = Reg(literal), Reg(literal)
+  SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)
+  # --- compiled pseudocode ---
+  EXEC.u64[laneId] = S0.i32
>= S1.i32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_T_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = 1'1U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_F_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = 1'0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = 0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_LT_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.u32 < S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.u32 < S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_EQ_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is equal to the second input. 
Store the result into the EXEC + # EXEC.u64[laneId] = S0.u32 == S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.u32 == S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_LE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.u32 <= S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.u32 <= S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_GT_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.u32 > S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.u32 > S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_NE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.u32 <> S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.u32 != S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_GE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.u32 >= S1.u32 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.u32 
>= S1.u32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_T_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = 1'1U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_F_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = 1'0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = 0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_LT_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.i64 < S1.i64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.i64 < S1.i64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_EQ_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is equal to the second input. 
Store the result into the EXEC + # EXEC.u64[laneId] = S0.i64 == S1.i64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.i64 == S1.i64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_LE_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.i64 <= S1.i64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.i64 <= S1.i64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_GT_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.i64 > S1.i64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.i64 > S1.i64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_NE_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.i64 <> S1.i64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.i64 != S1.i64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_GE_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.i64 >= S1.i64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.i64 
>= S1.i64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_T_I64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = 1'1U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_F_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = 1'0U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = 0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_LT_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.u64 < S1.u64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.u64 < S1.u64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_EQ_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is equal to the second input. 
Store the result into the EXEC + # EXEC.u64[laneId] = S0.u64 == S1.u64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.u64 == S1.u64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_LE_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.u64 <= S1.u64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.u64 <= S1.u64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_GT_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.u64 > S1.u64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.u64 > S1.u64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_NE_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.u64 <> S1.u64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.u64 != S1.u64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_GE_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = S0.u64 >= S1.u64 + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = S0.u64 
>= S1.u64 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_T_U64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = 1'1U + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + EXEC.u64[laneId] = 1 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_CLASS_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # S1.u[0] value is a signaling NAN. + # S1.u[1] value is a quiet NAN. + # S1.u[2] value is negative infinity. + # S1.u[3] value is a negative normal value. + # S1.u[4] value is a negative denormal value. + # S1.u[5] value is negative zero. + # S1.u[6] value is positive zero. + # S1.u[7] value is a positive denormal value. + # S1.u[8] value is a positive normal value. + # S1.u[9] value is positive infinity. + # declare result : 1'U; + # if isSignalNAN(64'F(S0.f16)) then + # result = S1.u32[0] + # elsif isQuietNAN(64'F(S0.f16)) then + # result = S1.u32[1] + # elsif exponent(S0.f16) == 31 then + # // +-INF + # result = S1.u32[sign(S0.f16) ? 2 : 9] + # elsif exponent(S0.f16) > 0 then + # // +-normal value + # result = S1.u32[sign(S0.f16) ? 3 : 8] + # elsif 64'F(abs(S0.f16)) > 0.0 then + # // +-denormal value + # result = S1.u32[sign(S0.f16) ? 4 : 7] + # else + # // +-0.0 + # result = S1.u32[sign(S0.f16) ? 5 : 6] + # endif; + # EXEC.u64[laneId] = result + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if isSignalNAN(F(S0.f16)): + result = S1.u32[0] + elif isQuietNAN(F(S0.f16)): + result = S1.u32[1] + elif exponent(S0.f16) == 31: + result = S1.u32[((2) if (sign(S0.f16)) else (9))] + elif exponent(S0.f16) > 0: + result = S1.u32[((3) if (sign(S0.f16)) else (8))] + elif F(abs(S0.f16)) > 0.0: + result = S1.u32[((4) if (sign(S0.f16)) else (7))] + else: + result = S1.u32[((5) if (sign(S0.f16)) else (6))] + EXEC.u64[laneId] = result + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_CLASS_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # S1.u[0] value is a signaling NAN. + # S1.u[1] value is a quiet NAN. + # S1.u[2] value is negative infinity. + # S1.u[3] value is a negative normal value. + # S1.u[4] value is a negative denormal value. + # S1.u[5] value is negative zero. + # S1.u[6] value is positive zero. + # S1.u[7] value is a positive denormal value. + # S1.u[8] value is a positive normal value. 
+ # S1.u[9] value is positive infinity. + # declare result : 1'U; + # if isSignalNAN(64'F(S0.f32)) then + # result = S1.u32[0] + # elsif isQuietNAN(64'F(S0.f32)) then + # result = S1.u32[1] + # elsif exponent(S0.f32) == 255 then + # // +-INF + # result = S1.u32[sign(S0.f32) ? 2 : 9] + # elsif exponent(S0.f32) > 0 then + # // +-normal value + # result = S1.u32[sign(S0.f32) ? 3 : 8] + # elsif 64'F(abs(S0.f32)) > 0.0 then + # // +-denormal value + # result = S1.u32[sign(S0.f32) ? 4 : 7] + # else + # // +-0.0 + # result = S1.u32[sign(S0.f32) ? 5 : 6] + # endif; + # EXEC.u64[laneId] = result + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if isSignalNAN(F(S0.f32)): + result = S1.u32[0] + elif isQuietNAN(F(S0.f32)): + result = S1.u32[1] + elif exponent(S0.f32) == 255: + result = S1.u32[((2) if (sign(S0.f32)) else (9))] + elif exponent(S0.f32) > 0: + result = S1.u32[((3) if (sign(S0.f32)) else (8))] + elif F(abs(S0.f32)) > 0.0: + result = S1.u32[((4) if (sign(S0.f32)) else (7))] + else: + result = S1.u32[((5) if (sign(S0.f32)) else (6))] + EXEC.u64[laneId] = result + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +def _VOPCOp_V_CMPX_CLASS_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # S1.u[0] value is a signaling NAN. + # S1.u[1] value is a quiet NAN. + # S1.u[2] value is negative infinity. + # S1.u[3] value is a negative normal value. + # S1.u[4] value is a negative denormal value. + # S1.u[5] value is negative zero. + # S1.u[6] value is positive zero. + # S1.u[7] value is a positive denormal value. + # S1.u[8] value is a positive normal value. + # S1.u[9] value is positive infinity. + # declare result : 1'U; + # if isSignalNAN(S0.f64) then + # result = S1.u32[0] + # elsif isQuietNAN(S0.f64) then + # result = S1.u32[1] + # elsif exponent(S0.f64) == 2047 then + # // +-INF + # result = S1.u32[sign(S0.f64) ? 2 : 9] + # elsif exponent(S0.f64) > 0 then + # // +-normal value + # result = S1.u32[sign(S0.f64) ? 3 : 8] + # elsif abs(S0.f64) > 0.0 then + # // +-denormal value + # result = S1.u32[sign(S0.f64) ? 4 : 7] + # else + # // +-0.0 + # result = S1.u32[sign(S0.f64) ? 
5 : 6] + # endif; + # EXEC.u64[laneId] = result + S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0) + SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32) + tmp, saveexec = Reg(0), Reg(exec_mask) + laneId = lane + SIMM16, SIMM32 = Reg(literal), Reg(literal) + SRC0, VDST = Reg(src0_idx), Reg(vdst_idx) + # --- compiled pseudocode --- + if isSignalNAN(S0.f64): + result = S1.u32[0] + elif isQuietNAN(S0.f64): + result = S1.u32[1] + elif exponent(S0.f64) == 2047: + result = S1.u32[((2) if (sign(S0.f64)) else (9))] + elif exponent(S0.f64) > 0: + result = S1.u32[((3) if (sign(S0.f64)) else (8))] + elif abs(S0.f64) > 0.0: + result = S1.u32[((4) if (sign(S0.f64)) else (7))] + else: + result = S1.u32[((5) if (sign(S0.f64)) else (6))] + EXEC.u64[laneId] = result + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': SCC._val & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + return result + +VOPCOp_FUNCTIONS = { + VOPCOp.V_CMP_F_F16: _VOPCOp_V_CMP_F_F16, + VOPCOp.V_CMP_LT_F16: _VOPCOp_V_CMP_LT_F16, + VOPCOp.V_CMP_EQ_F16: _VOPCOp_V_CMP_EQ_F16, + VOPCOp.V_CMP_LE_F16: _VOPCOp_V_CMP_LE_F16, + VOPCOp.V_CMP_GT_F16: _VOPCOp_V_CMP_GT_F16, + VOPCOp.V_CMP_LG_F16: _VOPCOp_V_CMP_LG_F16, + VOPCOp.V_CMP_GE_F16: _VOPCOp_V_CMP_GE_F16, + VOPCOp.V_CMP_O_F16: _VOPCOp_V_CMP_O_F16, + VOPCOp.V_CMP_U_F16: _VOPCOp_V_CMP_U_F16, + VOPCOp.V_CMP_NGE_F16: _VOPCOp_V_CMP_NGE_F16, + VOPCOp.V_CMP_NLG_F16: _VOPCOp_V_CMP_NLG_F16, + VOPCOp.V_CMP_NGT_F16: _VOPCOp_V_CMP_NGT_F16, + VOPCOp.V_CMP_NLE_F16: _VOPCOp_V_CMP_NLE_F16, + VOPCOp.V_CMP_NEQ_F16: _VOPCOp_V_CMP_NEQ_F16, + VOPCOp.V_CMP_NLT_F16: _VOPCOp_V_CMP_NLT_F16, + VOPCOp.V_CMP_T_F16: _VOPCOp_V_CMP_T_F16, + VOPCOp.V_CMP_F_F32: _VOPCOp_V_CMP_F_F32, + VOPCOp.V_CMP_LT_F32: _VOPCOp_V_CMP_LT_F32, + VOPCOp.V_CMP_EQ_F32: _VOPCOp_V_CMP_EQ_F32, + VOPCOp.V_CMP_LE_F32: _VOPCOp_V_CMP_LE_F32, + VOPCOp.V_CMP_GT_F32: _VOPCOp_V_CMP_GT_F32, + VOPCOp.V_CMP_LG_F32: _VOPCOp_V_CMP_LG_F32, + VOPCOp.V_CMP_GE_F32: _VOPCOp_V_CMP_GE_F32, + VOPCOp.V_CMP_O_F32: _VOPCOp_V_CMP_O_F32, + VOPCOp.V_CMP_U_F32: _VOPCOp_V_CMP_U_F32, + VOPCOp.V_CMP_NGE_F32: _VOPCOp_V_CMP_NGE_F32, + VOPCOp.V_CMP_NLG_F32: _VOPCOp_V_CMP_NLG_F32, + VOPCOp.V_CMP_NGT_F32: _VOPCOp_V_CMP_NGT_F32, + VOPCOp.V_CMP_NLE_F32: _VOPCOp_V_CMP_NLE_F32, + VOPCOp.V_CMP_NEQ_F32: _VOPCOp_V_CMP_NEQ_F32, + VOPCOp.V_CMP_NLT_F32: _VOPCOp_V_CMP_NLT_F32, + VOPCOp.V_CMP_T_F32: _VOPCOp_V_CMP_T_F32, + VOPCOp.V_CMP_F_F64: _VOPCOp_V_CMP_F_F64, + VOPCOp.V_CMP_LT_F64: _VOPCOp_V_CMP_LT_F64, + VOPCOp.V_CMP_EQ_F64: _VOPCOp_V_CMP_EQ_F64, + VOPCOp.V_CMP_LE_F64: _VOPCOp_V_CMP_LE_F64, + VOPCOp.V_CMP_GT_F64: _VOPCOp_V_CMP_GT_F64, + VOPCOp.V_CMP_LG_F64: _VOPCOp_V_CMP_LG_F64, + VOPCOp.V_CMP_GE_F64: _VOPCOp_V_CMP_GE_F64, + VOPCOp.V_CMP_O_F64: _VOPCOp_V_CMP_O_F64, + VOPCOp.V_CMP_U_F64: _VOPCOp_V_CMP_U_F64, + VOPCOp.V_CMP_NGE_F64: _VOPCOp_V_CMP_NGE_F64, + VOPCOp.V_CMP_NLG_F64: _VOPCOp_V_CMP_NLG_F64, + VOPCOp.V_CMP_NGT_F64: _VOPCOp_V_CMP_NGT_F64, + VOPCOp.V_CMP_NLE_F64: _VOPCOp_V_CMP_NLE_F64, + VOPCOp.V_CMP_NEQ_F64: _VOPCOp_V_CMP_NEQ_F64, + VOPCOp.V_CMP_NLT_F64: _VOPCOp_V_CMP_NLT_F64, + VOPCOp.V_CMP_T_F64: _VOPCOp_V_CMP_T_F64, + VOPCOp.V_CMP_LT_I16: _VOPCOp_V_CMP_LT_I16, + VOPCOp.V_CMP_EQ_I16: _VOPCOp_V_CMP_EQ_I16, + VOPCOp.V_CMP_LE_I16: _VOPCOp_V_CMP_LE_I16, + VOPCOp.V_CMP_GT_I16: _VOPCOp_V_CMP_GT_I16, + VOPCOp.V_CMP_NE_I16: _VOPCOp_V_CMP_NE_I16, + VOPCOp.V_CMP_GE_I16: _VOPCOp_V_CMP_GE_I16, + VOPCOp.V_CMP_LT_U16: 
_VOPCOp_V_CMP_LT_U16, + VOPCOp.V_CMP_EQ_U16: _VOPCOp_V_CMP_EQ_U16, + VOPCOp.V_CMP_LE_U16: _VOPCOp_V_CMP_LE_U16, + VOPCOp.V_CMP_GT_U16: _VOPCOp_V_CMP_GT_U16, + VOPCOp.V_CMP_NE_U16: _VOPCOp_V_CMP_NE_U16, + VOPCOp.V_CMP_GE_U16: _VOPCOp_V_CMP_GE_U16, + VOPCOp.V_CMP_F_I32: _VOPCOp_V_CMP_F_I32, + VOPCOp.V_CMP_LT_I32: _VOPCOp_V_CMP_LT_I32, + VOPCOp.V_CMP_EQ_I32: _VOPCOp_V_CMP_EQ_I32, + VOPCOp.V_CMP_LE_I32: _VOPCOp_V_CMP_LE_I32, + VOPCOp.V_CMP_GT_I32: _VOPCOp_V_CMP_GT_I32, + VOPCOp.V_CMP_NE_I32: _VOPCOp_V_CMP_NE_I32, + VOPCOp.V_CMP_GE_I32: _VOPCOp_V_CMP_GE_I32, + VOPCOp.V_CMP_T_I32: _VOPCOp_V_CMP_T_I32, + VOPCOp.V_CMP_F_U32: _VOPCOp_V_CMP_F_U32, + VOPCOp.V_CMP_LT_U32: _VOPCOp_V_CMP_LT_U32, + VOPCOp.V_CMP_EQ_U32: _VOPCOp_V_CMP_EQ_U32, + VOPCOp.V_CMP_LE_U32: _VOPCOp_V_CMP_LE_U32, + VOPCOp.V_CMP_GT_U32: _VOPCOp_V_CMP_GT_U32, + VOPCOp.V_CMP_NE_U32: _VOPCOp_V_CMP_NE_U32, + VOPCOp.V_CMP_GE_U32: _VOPCOp_V_CMP_GE_U32, + VOPCOp.V_CMP_T_U32: _VOPCOp_V_CMP_T_U32, + VOPCOp.V_CMP_F_I64: _VOPCOp_V_CMP_F_I64, + VOPCOp.V_CMP_LT_I64: _VOPCOp_V_CMP_LT_I64, + VOPCOp.V_CMP_EQ_I64: _VOPCOp_V_CMP_EQ_I64, + VOPCOp.V_CMP_LE_I64: _VOPCOp_V_CMP_LE_I64, + VOPCOp.V_CMP_GT_I64: _VOPCOp_V_CMP_GT_I64, + VOPCOp.V_CMP_NE_I64: _VOPCOp_V_CMP_NE_I64, + VOPCOp.V_CMP_GE_I64: _VOPCOp_V_CMP_GE_I64, + VOPCOp.V_CMP_T_I64: _VOPCOp_V_CMP_T_I64, + VOPCOp.V_CMP_F_U64: _VOPCOp_V_CMP_F_U64, + VOPCOp.V_CMP_LT_U64: _VOPCOp_V_CMP_LT_U64, + VOPCOp.V_CMP_EQ_U64: _VOPCOp_V_CMP_EQ_U64, + VOPCOp.V_CMP_LE_U64: _VOPCOp_V_CMP_LE_U64, + VOPCOp.V_CMP_GT_U64: _VOPCOp_V_CMP_GT_U64, + VOPCOp.V_CMP_NE_U64: _VOPCOp_V_CMP_NE_U64, + VOPCOp.V_CMP_GE_U64: _VOPCOp_V_CMP_GE_U64, + VOPCOp.V_CMP_T_U64: _VOPCOp_V_CMP_T_U64, + VOPCOp.V_CMP_CLASS_F16: _VOPCOp_V_CMP_CLASS_F16, + VOPCOp.V_CMP_CLASS_F32: _VOPCOp_V_CMP_CLASS_F32, + VOPCOp.V_CMP_CLASS_F64: _VOPCOp_V_CMP_CLASS_F64, + VOPCOp.V_CMPX_F_F16: _VOPCOp_V_CMPX_F_F16, + VOPCOp.V_CMPX_LT_F16: _VOPCOp_V_CMPX_LT_F16, + VOPCOp.V_CMPX_EQ_F16: _VOPCOp_V_CMPX_EQ_F16, + VOPCOp.V_CMPX_LE_F16: _VOPCOp_V_CMPX_LE_F16, + VOPCOp.V_CMPX_GT_F16: _VOPCOp_V_CMPX_GT_F16, + VOPCOp.V_CMPX_LG_F16: _VOPCOp_V_CMPX_LG_F16, + VOPCOp.V_CMPX_GE_F16: _VOPCOp_V_CMPX_GE_F16, + VOPCOp.V_CMPX_O_F16: _VOPCOp_V_CMPX_O_F16, + VOPCOp.V_CMPX_U_F16: _VOPCOp_V_CMPX_U_F16, + VOPCOp.V_CMPX_NGE_F16: _VOPCOp_V_CMPX_NGE_F16, + VOPCOp.V_CMPX_NLG_F16: _VOPCOp_V_CMPX_NLG_F16, + VOPCOp.V_CMPX_NGT_F16: _VOPCOp_V_CMPX_NGT_F16, + VOPCOp.V_CMPX_NLE_F16: _VOPCOp_V_CMPX_NLE_F16, + VOPCOp.V_CMPX_NEQ_F16: _VOPCOp_V_CMPX_NEQ_F16, + VOPCOp.V_CMPX_NLT_F16: _VOPCOp_V_CMPX_NLT_F16, + VOPCOp.V_CMPX_T_F16: _VOPCOp_V_CMPX_T_F16, + VOPCOp.V_CMPX_F_F32: _VOPCOp_V_CMPX_F_F32, + VOPCOp.V_CMPX_LT_F32: _VOPCOp_V_CMPX_LT_F32, + VOPCOp.V_CMPX_EQ_F32: _VOPCOp_V_CMPX_EQ_F32, + VOPCOp.V_CMPX_LE_F32: _VOPCOp_V_CMPX_LE_F32, + VOPCOp.V_CMPX_GT_F32: _VOPCOp_V_CMPX_GT_F32, + VOPCOp.V_CMPX_LG_F32: _VOPCOp_V_CMPX_LG_F32, + VOPCOp.V_CMPX_GE_F32: _VOPCOp_V_CMPX_GE_F32, + VOPCOp.V_CMPX_O_F32: _VOPCOp_V_CMPX_O_F32, + VOPCOp.V_CMPX_U_F32: _VOPCOp_V_CMPX_U_F32, + VOPCOp.V_CMPX_NGE_F32: _VOPCOp_V_CMPX_NGE_F32, + VOPCOp.V_CMPX_NLG_F32: _VOPCOp_V_CMPX_NLG_F32, + VOPCOp.V_CMPX_NGT_F32: _VOPCOp_V_CMPX_NGT_F32, + VOPCOp.V_CMPX_NLE_F32: _VOPCOp_V_CMPX_NLE_F32, + VOPCOp.V_CMPX_NEQ_F32: _VOPCOp_V_CMPX_NEQ_F32, + VOPCOp.V_CMPX_NLT_F32: _VOPCOp_V_CMPX_NLT_F32, + VOPCOp.V_CMPX_T_F32: _VOPCOp_V_CMPX_T_F32, + VOPCOp.V_CMPX_F_F64: _VOPCOp_V_CMPX_F_F64, + VOPCOp.V_CMPX_LT_F64: _VOPCOp_V_CMPX_LT_F64, + VOPCOp.V_CMPX_EQ_F64: _VOPCOp_V_CMPX_EQ_F64, + VOPCOp.V_CMPX_LE_F64: _VOPCOp_V_CMPX_LE_F64, + VOPCOp.V_CMPX_GT_F64: 
_VOPCOp_V_CMPX_GT_F64, + VOPCOp.V_CMPX_LG_F64: _VOPCOp_V_CMPX_LG_F64, + VOPCOp.V_CMPX_GE_F64: _VOPCOp_V_CMPX_GE_F64, + VOPCOp.V_CMPX_O_F64: _VOPCOp_V_CMPX_O_F64, + VOPCOp.V_CMPX_U_F64: _VOPCOp_V_CMPX_U_F64, + VOPCOp.V_CMPX_NGE_F64: _VOPCOp_V_CMPX_NGE_F64, + VOPCOp.V_CMPX_NLG_F64: _VOPCOp_V_CMPX_NLG_F64, + VOPCOp.V_CMPX_NGT_F64: _VOPCOp_V_CMPX_NGT_F64, + VOPCOp.V_CMPX_NLE_F64: _VOPCOp_V_CMPX_NLE_F64, + VOPCOp.V_CMPX_NEQ_F64: _VOPCOp_V_CMPX_NEQ_F64, + VOPCOp.V_CMPX_NLT_F64: _VOPCOp_V_CMPX_NLT_F64, + VOPCOp.V_CMPX_T_F64: _VOPCOp_V_CMPX_T_F64, + VOPCOp.V_CMPX_LT_I16: _VOPCOp_V_CMPX_LT_I16, + VOPCOp.V_CMPX_EQ_I16: _VOPCOp_V_CMPX_EQ_I16, + VOPCOp.V_CMPX_LE_I16: _VOPCOp_V_CMPX_LE_I16, + VOPCOp.V_CMPX_GT_I16: _VOPCOp_V_CMPX_GT_I16, + VOPCOp.V_CMPX_NE_I16: _VOPCOp_V_CMPX_NE_I16, + VOPCOp.V_CMPX_GE_I16: _VOPCOp_V_CMPX_GE_I16, + VOPCOp.V_CMPX_LT_U16: _VOPCOp_V_CMPX_LT_U16, + VOPCOp.V_CMPX_EQ_U16: _VOPCOp_V_CMPX_EQ_U16, + VOPCOp.V_CMPX_LE_U16: _VOPCOp_V_CMPX_LE_U16, + VOPCOp.V_CMPX_GT_U16: _VOPCOp_V_CMPX_GT_U16, + VOPCOp.V_CMPX_NE_U16: _VOPCOp_V_CMPX_NE_U16, + VOPCOp.V_CMPX_GE_U16: _VOPCOp_V_CMPX_GE_U16, + VOPCOp.V_CMPX_F_I32: _VOPCOp_V_CMPX_F_I32, + VOPCOp.V_CMPX_LT_I32: _VOPCOp_V_CMPX_LT_I32, + VOPCOp.V_CMPX_EQ_I32: _VOPCOp_V_CMPX_EQ_I32, + VOPCOp.V_CMPX_LE_I32: _VOPCOp_V_CMPX_LE_I32, + VOPCOp.V_CMPX_GT_I32: _VOPCOp_V_CMPX_GT_I32, + VOPCOp.V_CMPX_NE_I32: _VOPCOp_V_CMPX_NE_I32, + VOPCOp.V_CMPX_GE_I32: _VOPCOp_V_CMPX_GE_I32, + VOPCOp.V_CMPX_T_I32: _VOPCOp_V_CMPX_T_I32, + VOPCOp.V_CMPX_F_U32: _VOPCOp_V_CMPX_F_U32, + VOPCOp.V_CMPX_LT_U32: _VOPCOp_V_CMPX_LT_U32, + VOPCOp.V_CMPX_EQ_U32: _VOPCOp_V_CMPX_EQ_U32, + VOPCOp.V_CMPX_LE_U32: _VOPCOp_V_CMPX_LE_U32, + VOPCOp.V_CMPX_GT_U32: _VOPCOp_V_CMPX_GT_U32, + VOPCOp.V_CMPX_NE_U32: _VOPCOp_V_CMPX_NE_U32, + VOPCOp.V_CMPX_GE_U32: _VOPCOp_V_CMPX_GE_U32, + VOPCOp.V_CMPX_T_U32: _VOPCOp_V_CMPX_T_U32, + VOPCOp.V_CMPX_F_I64: _VOPCOp_V_CMPX_F_I64, + VOPCOp.V_CMPX_LT_I64: _VOPCOp_V_CMPX_LT_I64, + VOPCOp.V_CMPX_EQ_I64: _VOPCOp_V_CMPX_EQ_I64, + VOPCOp.V_CMPX_LE_I64: _VOPCOp_V_CMPX_LE_I64, + VOPCOp.V_CMPX_GT_I64: _VOPCOp_V_CMPX_GT_I64, + VOPCOp.V_CMPX_NE_I64: _VOPCOp_V_CMPX_NE_I64, + VOPCOp.V_CMPX_GE_I64: _VOPCOp_V_CMPX_GE_I64, + VOPCOp.V_CMPX_T_I64: _VOPCOp_V_CMPX_T_I64, + VOPCOp.V_CMPX_F_U64: _VOPCOp_V_CMPX_F_U64, + VOPCOp.V_CMPX_LT_U64: _VOPCOp_V_CMPX_LT_U64, + VOPCOp.V_CMPX_EQ_U64: _VOPCOp_V_CMPX_EQ_U64, + VOPCOp.V_CMPX_LE_U64: _VOPCOp_V_CMPX_LE_U64, + VOPCOp.V_CMPX_GT_U64: _VOPCOp_V_CMPX_GT_U64, + VOPCOp.V_CMPX_NE_U64: _VOPCOp_V_CMPX_NE_U64, + VOPCOp.V_CMPX_GE_U64: _VOPCOp_V_CMPX_GE_U64, + VOPCOp.V_CMPX_T_U64: _VOPCOp_V_CMPX_T_U64, + VOPCOp.V_CMPX_CLASS_F16: _VOPCOp_V_CMPX_CLASS_F16, + VOPCOp.V_CMPX_CLASS_F32: _VOPCOp_V_CMPX_CLASS_F32, + VOPCOp.V_CMPX_CLASS_F64: _VOPCOp_V_CMPX_CLASS_F64, +} + + +# Manually implemented lane instructions (require special vgpr_write handling) +def _VOP3Op_V_WRITELANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # VGPR[lane][VDST] = S0.b32 - writes s0 to specified lane's VGPR + wr_lane = s1 & 0x1f # lane select (5 bits for wave32) + return {'d0': d0, 'scc': scc, 'vgpr_write': (wr_lane, vdst_idx, s0 & 0xffffffff)} + +def _VOP3Op_V_READLANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0 = VGPR[lane][SRC0] - reads from specified lane's VGPR + rd_lane = s1 & 0x1f # lane select (5 bits for wave32) + val = VGPR[rd_lane][src0_idx] if VGPR is not None and rd_lane < len(VGPR) and src0_idx < len(VGPR[rd_lane]) else s0 + return {'d0': val & 
0xffffffff, 'scc': scc}
+
+def _VOP1Op_V_READFIRSTLANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0 = VGPR[first_active_lane][SRC0] - reads from first active lane
+  first_lane = 0
+  for i in range(32):
+    if exec_mask & (1 << i):
+      first_lane = i
+      break
+  val = VGPR[first_lane][src0_idx] if VGPR is not None and first_lane < len(VGPR) and src0_idx < len(VGPR[first_lane]) else s0
+  return {'d0': val & 0xffffffff, 'scc': scc}
+
+COMPILED_FUNCTIONS = {
+  SOP1Op: SOP1Op_FUNCTIONS,
+  SOP2Op: SOP2Op_FUNCTIONS,
+  SOPCOp: SOPCOp_FUNCTIONS,
+  SOPKOp: SOPKOp_FUNCTIONS,
+  SOPPOp: SOPPOp_FUNCTIONS,
+  VOP1Op: VOP1Op_FUNCTIONS,
+  VOP2Op: VOP2Op_FUNCTIONS,
+  VOP3Op: VOP3Op_FUNCTIONS,
+  VOP3SDOp: VOP3SDOp_FUNCTIONS,
+  VOP3POp: VOP3POp_FUNCTIONS,
+  VOPCOp: VOPCOp_FUNCTIONS,
+}
+
+# Add lane instructions to their respective dicts
+VOP3Op_FUNCTIONS[VOP3Op.V_WRITELANE_B32] = _VOP3Op_V_WRITELANE_B32
+VOP3Op_FUNCTIONS[VOP3Op.V_READLANE_B32] = _VOP3Op_V_READLANE_B32
+VOP1Op_FUNCTIONS[VOP1Op.V_READFIRSTLANE_B32] = _VOP1Op_V_READFIRSTLANE_B32
+
+def get_compiled_functions(): return COMPILED_FUNCTIONS
\ No newline at end of file
diff --git a/extra/assembly/rdna3/emu.py b/extra/assembly/rdna3/emu.py
index 39f9c7e564..465b0e4858 100644
--- a/extra/assembly/rdna3/emu.py
+++ b/extra/assembly/rdna3/emu.py
@@ -1,31 +1,56 @@
-# RDNA3 emulator - pure Python implementation for testing
+# RDNA3 emulator - executes compiled pseudocode from AMD ISA PDF
 from __future__ import annotations
-import ctypes, struct, math
-from typing import Callable
-from extra.assembly.rdna3.lib import Inst, Inst32, Inst64, RawImm
-
-Program = dict[int, Inst] # pc (word offset) -> instruction
+import ctypes, os
+from extra.assembly.rdna3.lib import Inst, RawImm
+from extra.assembly.rdna3.pcode import _f32, _i32, _sext, _f16, _i16, _f64, _i64
+from extra.assembly.rdna3.autogen.gen_pcode import get_compiled_functions
 from extra.assembly.rdna3.autogen import (
   SOP1, SOP2, SOPC, SOPK, SOPP, SMEM, VOP1, VOP2, VOP3, VOP3SD, VOP3P, VOPC, DS, FLAT, VOPD, SrcEnum,
   SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, SMEMOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp, DSOp, FLATOp, GLOBALOp, VOPDOp
 )
-from extra.assembly.rdna3.alu import (
-  f32, i32, f16, i16, sext, vopc, SALU, VALU,
-  SOP1_BASE, SOP2_BASE, SOPC_BASE, SOPK_BASE, VOP1_BASE, VOP2_BASE
-)
+Program = dict[int, Inst]
 
 WAVE_SIZE, SGPR_COUNT, VGPR_COUNT = 32, 128, 256
-VCC_LO, VCC_HI, NULL, M0, EXEC_LO, EXEC_HI, SCC = SrcEnum.VCC_LO, SrcEnum.VCC_HI, SrcEnum.NULL, SrcEnum.M0, SrcEnum.EXEC_LO, SrcEnum.EXEC_HI, SrcEnum.SCC
-# Pre-computed inline constant table for src operands 128-254 (index = src - 128)
+VCC_LO, VCC_HI, NULL, EXEC_LO, EXEC_HI, SCC = SrcEnum.VCC_LO, SrcEnum.VCC_HI, SrcEnum.NULL, SrcEnum.EXEC_LO, SrcEnum.EXEC_HI, SrcEnum.SCC
+
+# VOP3 ops that use 64-bit operands (and thus 64-bit literals when src is 255)
+# Exception: V_LDEXP_F64 has 32-bit integer src1, so literal should NOT be 64-bit when src1=255
+_VOP3_64BIT_OPS = {op.value for op in VOP3Op if op.name.endswith(('_F64', '_B64', '_I64', '_U64'))}
+# Ops where src1 is 32-bit (exponent/shift amount) even though the op name suggests 64-bit
+_VOP3_64BIT_OPS_32BIT_SRC1 = {VOP3Op.V_LDEXP_F64.value}
+
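The placement rule these two sets drive shows up in `decode_program` further down: when a 64-bit VOP3 op takes a 32-bit literal, the literal supplies only the high dword of the 64-bit value, with the low dword zero. A self-contained sketch of that convention using plain `struct` (the `place_literal` helper is hypothetical, for illustration only, not the emulator's API):

```python
import struct

def place_literal(lit32: int, is_64bit: bool) -> int:
  # 64-bit ops: the literal occupies the HIGH 32 bits, low 32 bits are zero
  return (lit32 << 32) if is_64bit else lit32

# 2.5 as f64 is 0x4004000000000000 - its low dword is zero, so the high
# dword 0x40040000 is all a 32-bit literal needs to carry
hi = struct.unpack('<Q', struct.pack('<d', 2.5))[0] >> 32
assert struct.unpack('<d', struct.pack('<Q', place_literal(hi, True)))[0] == 2.5
```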
+# Inline constants for src operands 128-254 (f32 format for most instructions)
 _INLINE_CONSTS = [0] * 127
-for _i in range(65): _INLINE_CONSTS[_i] = _i # 128-192 -> 0-64
-for _i in range(1, 17): _INLINE_CONSTS[64 + _i] = ((-_i) & 0xffffffff) # 193-208 -> -1 to -16
+for _i in range(65): _INLINE_CONSTS[_i] = _i
+for _i in range(1, 17): _INLINE_CONSTS[64 + _i] = ((-_i) & 0xffffffff)
 for _k, _v in {SrcEnum.POS_HALF: 0x3f000000, SrcEnum.NEG_HALF: 0xbf000000, SrcEnum.POS_ONE: 0x3f800000, SrcEnum.NEG_ONE: 0xbf800000, SrcEnum.POS_TWO: 0x40000000, SrcEnum.NEG_TWO: 0xc0000000, SrcEnum.POS_FOUR: 0x40800000, SrcEnum.NEG_FOUR: 0xc0800000, SrcEnum.INV_2PI: 0x3e22f983}.items(): _INLINE_CONSTS[_k - 128] = _v
+# Inline constants for VOP3P packed f16 operations (f16 value in low 16 bits only, high 16 bits are 0)
+# Hardware does NOT replicate the constant - opsel_hi controls which half is used for the hi result
+_INLINE_CONSTS_F16 = [0] * 127
+for _i in range(65): _INLINE_CONSTS_F16[_i] = _i # Integer constants in low 16 bits only
+for _i in range(1, 17): _INLINE_CONSTS_F16[64 + _i] = (-_i) & 0xffff # Negative integers in low 16 bits
+for _k, _v in {SrcEnum.POS_HALF: 0x3800, SrcEnum.NEG_HALF: 0xb800, SrcEnum.POS_ONE: 0x3c00, SrcEnum.NEG_ONE: 0xbc00,
+               SrcEnum.POS_TWO: 0x4000, SrcEnum.NEG_TWO: 0xc000, SrcEnum.POS_FOUR: 0x4400, SrcEnum.NEG_FOUR: 0xc400,
+               SrcEnum.INV_2PI: 0x3118}.items(): _INLINE_CONSTS_F16[_k - 128] = _v # f16 values in low 16 bits
+
+# Inline constants for 64-bit operations (f64 format)
+# Integer constants 0-64 are zero-extended to 64 bits; -1 to -16 are sign-extended
+# Float constants are the f64 representation of the value
+import struct as _struct
+_INLINE_CONSTS_F64 = [0] * 127
+for _i in range(65): _INLINE_CONSTS_F64[_i] = _i # Integer constants 0-64 zero-extended
+for _i in range(1, 17): _INLINE_CONSTS_F64[64 + _i] = ((-_i) & 0xffffffffffffffff) # -1 to -16 sign-extended
+for _k, _v in {SrcEnum.POS_HALF: 0.5, SrcEnum.NEG_HALF: -0.5, SrcEnum.POS_ONE: 1.0, SrcEnum.NEG_ONE: -1.0,
+               SrcEnum.POS_TWO: 2.0, SrcEnum.NEG_TWO: -2.0, SrcEnum.POS_FOUR: 4.0, SrcEnum.NEG_FOUR: -4.0,
+               SrcEnum.INV_2PI: 0.15915494309189535}.items():
+  _INLINE_CONSTS_F64[_k - 128] = _struct.unpack('<Q', _struct.pack('<d', _v))[0]
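The three tables above hold the same nine float constants in their f32, f16, and f64 bit patterns. A standalone check of a few representative encodings (illustrative snippet, independent of the emulator):

```python
import math, struct

# f16: 1.0 -> 0x3c00, -2.0 -> 0xc000 (values in the VOP3P table above)
assert struct.unpack('<H', struct.pack('<e', 1.0))[0] == 0x3c00
assert struct.unpack('<H', struct.pack('<e', -2.0))[0] == 0xc000
# f32: 1/(2*pi) rounds to 0x3e22f983 (SrcEnum.INV_2PI)
assert struct.unpack('<I', struct.pack('<f', 1 / (2 * math.pi)))[0] == 0x3e22f983
# f64: 0.5 -> 0x3fe0000000000000, the bit pattern the F64 table stores
assert struct.unpack('<Q', struct.pack('<d', 0.5))[0] == 0x3fe0000000000000
```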
 
-def set_valid_mem_ranges(ranges: list[tuple[int, int]]) -> None: global _valid_mem_ranges; _valid_mem_ranges = list(ranges)
+def set_valid_mem_ranges(ranges: set[tuple[int, int]]) -> None: _valid_mem_ranges.clear(); _valid_mem_ranges.extend(ranges)
 def _mem_valid(addr: int, size: int) -> bool:
   for s, z in _valid_mem_ranges:
     if s <= addr and addr + size <= s + z: return True
@@ -35,46 +60,60 @@ def mem_read(addr: int, size: int) -> int: return _ctypes_at(addr, size).value i
 def mem_write(addr: int, size: int, val: int) -> None:
   if _mem_valid(addr, size): _ctypes_at(addr, size).value = val
-# Memory op tables - (cnt, sz, sign) for loads, (cnt, sz) for stores
+# Memory op tables (not pseudocode - these are format descriptions)
 def _mem_ops(ops, suffix_map): return {getattr(e, f"{p}_{s}"): v for e in ops for s, v in suffix_map.items() for p in [e.__name__.replace("Op", "")]}
 _LOAD_MAP = {'LOAD_B32': (1,4,0), 'LOAD_B64': (2,4,0), 'LOAD_B96': (3,4,0), 'LOAD_B128': (4,4,0), 'LOAD_U8': (1,1,0), 'LOAD_I8': (1,1,1), 'LOAD_U16': (1,2,0), 'LOAD_I16': (1,2,1)}
 _STORE_MAP = {'STORE_B32': (1,4), 'STORE_B64': (2,4), 'STORE_B96': (3,4), 'STORE_B128': (4,4), 'STORE_B8': (1,1), 'STORE_B16': (1,2)}
-FLAT_LOAD = _mem_ops([GLOBALOp, FLATOp], _LOAD_MAP)
-FLAT_STORE = _mem_ops([GLOBALOp, FLATOp], _STORE_MAP)
-DS_LOAD: dict[int, tuple[int,int,int]] = {DSOp.DS_LOAD_B32: (1,4,0), DSOp.DS_LOAD_B64: (2,4,0), DSOp.DS_LOAD_B128: (4,4,0), DSOp.DS_LOAD_U8: (1,1,0), DSOp.DS_LOAD_I8: (1,1,1), DSOp.DS_LOAD_U16: (1,2,0), DSOp.DS_LOAD_I16: (1,2,1)}
-DS_STORE: dict[int, tuple[int,int]] = {DSOp.DS_STORE_B32: (1,4), DSOp.DS_STORE_B64: (2,4), DSOp.DS_STORE_B128: (4,4), DSOp.DS_STORE_B8: (1,1), DSOp.DS_STORE_B16: (1,2)}
-FLAT_D16_LO = {getattr(e, f"{e.__name__.replace('Op', '')}_{s}"): v for e in [FLATOp, GLOBALOp] for s, v in [('LOAD_D16_U8', (1, 0)), ('LOAD_D16_I8', (1, 1)), ('LOAD_D16_B16', (2, 0))]}
-FLAT_D16_HI = {getattr(e, f"{e.__name__.replace('Op', '')}_{s}"): v for e in [FLATOp, GLOBALOp] for s, v in [('LOAD_D16_HI_U8', (1, 0)), ('LOAD_D16_HI_I8', (1, 1)), ('LOAD_D16_HI_B16', (2, 0))]}
-FLAT_D16_STORE = {getattr(e, f"{e.__name__.replace('Op', '')}_{s}"): v for e in [FLATOp, GLOBALOp] for s, v in [('STORE_D16_HI_B8', 1), ('STORE_D16_HI_B16', 2)]}
-SMEM_LOAD: dict[int, int] = {SMEMOp.S_LOAD_B32: 1, SMEMOp.S_LOAD_B64: 2, SMEMOp.S_LOAD_B128: 4, SMEMOp.S_LOAD_B256: 8, SMEMOp.S_LOAD_B512: 16}
-SOPK_WAIT = {SOPKOp.S_WAITCNT_VSCNT, SOPKOp.S_WAITCNT_VMCNT, SOPKOp.S_WAITCNT_EXPCNT, SOPKOp.S_WAITCNT_LGKMCNT}
+FLAT_LOAD, FLAT_STORE = _mem_ops([GLOBALOp, FLATOp], _LOAD_MAP), _mem_ops([GLOBALOp, FLATOp], _STORE_MAP)
+# D16 ops: load/store 16-bit to lower or upper half of VGPR. Format: (size, sign, hi) where hi=1 means upper 16 bits
+_D16_LOAD_MAP = {'LOAD_D16_U8': (1,0,0), 'LOAD_D16_I8': (1,1,0), 'LOAD_D16_B16': (2,0,0),
+                 'LOAD_D16_HI_U8': (1,0,1), 'LOAD_D16_HI_I8': (1,1,1), 'LOAD_D16_HI_B16': (2,0,1)}
+_D16_STORE_MAP = {'STORE_D16_HI_B8': (1,1), 'STORE_D16_HI_B16': (2,1)} # (size, hi)
+FLAT_D16_LOAD = _mem_ops([GLOBALOp, FLATOp], _D16_LOAD_MAP)
+FLAT_D16_STORE = _mem_ops([GLOBALOp, FLATOp], _D16_STORE_MAP)
+DS_LOAD = {DSOp.DS_LOAD_B32: (1,4,0), DSOp.DS_LOAD_B64: (2,4,0), DSOp.DS_LOAD_B128: (4,4,0), DSOp.DS_LOAD_U8: (1,1,0), DSOp.DS_LOAD_I8: (1,1,1), DSOp.DS_LOAD_U16: (1,2,0), DSOp.DS_LOAD_I16: (1,2,1)}
+DS_STORE = {DSOp.DS_STORE_B32: (1,4), DSOp.DS_STORE_B64: (2,4), DSOp.DS_STORE_B128: (4,4), DSOp.DS_STORE_B8: (1,1), DSOp.DS_STORE_B16: (1,2)}
+SMEM_LOAD = {SMEMOp.S_LOAD_B32: 1, SMEMOp.S_LOAD_B64: 2, SMEMOp.S_LOAD_B128: 4, SMEMOp.S_LOAD_B256: 8, SMEMOp.S_LOAD_B512: 16}
+
+# VOPD op -> VOP3 op mapping (VOPD is dual-issue of VOP1/VOP2 ops, use VOP3 enums for pseudocode lookup)
+_VOPD_TO_VOP = {
+  VOPDOp.V_DUAL_FMAC_F32: VOP3Op.V_FMAC_F32, VOPDOp.V_DUAL_FMAAK_F32: VOP2Op.V_FMAAK_F32, VOPDOp.V_DUAL_FMAMK_F32: VOP2Op.V_FMAMK_F32,
+  VOPDOp.V_DUAL_MUL_F32: VOP3Op.V_MUL_F32, VOPDOp.V_DUAL_ADD_F32: VOP3Op.V_ADD_F32, VOPDOp.V_DUAL_SUB_F32: VOP3Op.V_SUB_F32,
+  VOPDOp.V_DUAL_SUBREV_F32: VOP3Op.V_SUBREV_F32, VOPDOp.V_DUAL_MUL_DX9_ZERO_F32: VOP3Op.V_MUL_DX9_ZERO_F32,
+  VOPDOp.V_DUAL_MOV_B32: VOP3Op.V_MOV_B32, VOPDOp.V_DUAL_CNDMASK_B32: VOP3Op.V_CNDMASK_B32,
+  VOPDOp.V_DUAL_MAX_F32: VOP3Op.V_MAX_F32, VOPDOp.V_DUAL_MIN_F32: VOP3Op.V_MIN_F32,
+  VOPDOp.V_DUAL_ADD_NC_U32: VOP3Op.V_ADD_NC_U32, VOPDOp.V_DUAL_LSHLREV_B32: VOP3Op.V_LSHLREV_B32, VOPDOp.V_DUAL_AND_B32: VOP3Op.V_AND_B32,
+}
+
+# Compiled pseudocode functions (lazy loaded)
+_COMPILED: dict | None = None
+
+def _get_compiled() -> dict:
+  global _COMPILED
+  if _COMPILED is None: _COMPILED = get_compiled_functions()
+  return _COMPILED
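With the cache in place, executing an instruction costs two dict lookups: first the op's enum class, then the member. A toy model of that dispatch shape (all names below are stand-ins, not the emulator's API):

```python
from enum import IntEnum

class ToySOP2(IntEnum): S_ADD_U32 = 0

# mirrors COMPILED_FUNCTIONS' shape: {enum class: {op: handler}}
TOY_TABLE = {ToySOP2: {ToySOP2.S_ADD_U32: lambda a, b: (a + b) & 0xffffffff}}

def toy_dispatch(op_cls, op, *args):
  fn = TOY_TABLE.get(op_cls, {}).get(op)
  if fn is None: raise NotImplementedError(f"{op.name} not in table")
  return fn(*args)

assert toy_dispatch(ToySOP2, ToySOP2.S_ADD_U32, 0xffffffff, 1) == 0  # 32-bit wraparound
```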
 
 class WaveState:
   __slots__ = ('sgpr', 'vgpr', 'scc', 'pc', 'literal', '_pend_sgpr')
   def __init__(self):
     self.sgpr, self.vgpr = [0] * SGPR_COUNT, [[0] * VGPR_COUNT for _ in range(WAVE_SIZE)]
-    self.sgpr[EXEC_LO] = 0xffffffff # wave32: all lanes active
-    self.scc = self.pc = self.literal = 0
-    self._pend_sgpr = {}
+    self.sgpr[EXEC_LO], self.scc, self.pc, self.literal, self._pend_sgpr = 0xffffffff, 0, 0, 0, {}
 
   @property
   def vcc(self) -> int: return self.sgpr[VCC_LO] | (self.sgpr[VCC_HI] << 32)
   @vcc.setter
-  def vcc(self, v: int) -> None: self.sgpr[VCC_LO] = v & 0xffffffff; self.sgpr[VCC_HI] = (v >> 32) & 0xffffffff
+  def vcc(self, v: int): self.sgpr[VCC_LO], self.sgpr[VCC_HI] = v & 0xffffffff, (v >> 32) & 0xffffffff
 
   @property
   def exec_mask(self) -> int: return self.sgpr[EXEC_LO] | (self.sgpr[EXEC_HI] << 32)
   @exec_mask.setter
-  def exec_mask(self, v: int) -> None: self.sgpr[EXEC_LO] = v & 0xffffffff; self.sgpr[EXEC_HI] = (v >> 32) & 0xffffffff
+  def exec_mask(self, v: int): self.sgpr[EXEC_LO], self.sgpr[EXEC_HI] = v & 0xffffffff, (v >> 32) & 0xffffffff
 
-  def rsgpr(self, i: int) -> int:
-    if i == NULL: return 0
-    if i == SCC: return self.scc
-    return self.sgpr[i] if i < SGPR_COUNT else 0
-  def wsgpr(self, i: int, v: int) -> None:
+  def rsgpr(self, i: int) -> int: return 0 if i == NULL else self.scc if i == SCC else self.sgpr[i] if i < SGPR_COUNT else 0
+  def wsgpr(self, i: int, v: int):
     if i < SGPR_COUNT and i != NULL: self.sgpr[i] = v & 0xffffffff
   def rsgpr64(self, i: int) -> int: return self.rsgpr(i) | (self.rsgpr(i+1) << 32)
-  def wsgpr64(self, i: int, v: int) -> None: self.wsgpr(i, v & 0xffffffff); self.wsgpr(i+1, (v >> 32) & 0xffffffff)
+  def wsgpr64(self, i: int, v: int): self.wsgpr(i, v & 0xffffffff); self.wsgpr(i+1, (v >> 32) & 0xffffffff)
 
   def rsrc(self, v: int, lane: int) -> int:
     if v < SGPR_COUNT: return self.sgpr[v]
@@ -83,17 +122,29 @@ class WaveState:
     if v == 255: return self.literal
     return self.vgpr[lane][v - 256] if v <= 511 else 0
 
+  def rsrc_f16(self, v: int, lane: int) -> int:
+    """Read source operand for VOP3P packed f16 operations. Uses f16 inline constants."""
+    if v < SGPR_COUNT: return self.sgpr[v]
+    if v == SCC: return self.scc
+    if v < 255: return _INLINE_CONSTS_F16[v - 128]
+    if v == 255: return self.literal
+    return self.vgpr[lane][v - 256] if v <= 511 else 0
+
   def rsrc64(self, v: int, lane: int) -> int:
+    """Read 64-bit source operand. For inline constants, returns 64-bit representation."""
+    # Inline constants 128-254 need special handling for 64-bit ops
+    if 128 <= v < 255: return _INLINE_CONSTS_F64[v - 128]
+    if v == 255: return self.literal # 32-bit literal, caller handles extension
     return self.rsrc(v, lane) | ((self.rsrc(v+1, lane) if v < VCC_LO or 256 <= v <= 511 else 0) << 32)
 
-  def pend_sgpr_lane(self, reg: int, lane: int, val: int) -> None:
+  def pend_sgpr_lane(self, reg: int, lane: int, val: int):
     if reg not in self._pend_sgpr: self._pend_sgpr[reg] = 0
     if val: self._pend_sgpr[reg] |= (1 << lane)
-
-  def commit_pends(self) -> None:
+  def commit_pends(self):
     for reg, val in self._pend_sgpr.items(): self.sgpr[reg] = val
     self._pend_sgpr.clear()
 
+# Instruction decode
 def decode_format(word: int) -> tuple[type[Inst] | None, bool]:
   hi2 = (word >> 30) & 0x3
   if hi2 == 0b11:
@@ -119,362 +170,519 @@ def decode_program(data: bytes) -> Program:
     inst_class, is_64 = decode_format(word)
     if inst_class is None: i += 4; continue
     base_size = 8 if is_64 else 4
-    inst = inst_class.from_bytes(data[i:i+base_size])
+    # Pass enough data for potential 64-bit literal (base + 8 bytes max)
+    inst = inst_class.from_bytes(data[i:i+base_size+8])
     for name, val in inst._values.items(): setattr(inst, name, _unwrap(val))
-    has_literal = any(getattr(inst, fld, None) == 255 for fld in ('src0', 'src1', 'src2', 'ssrc0', 'ssrc1', 'srcx0', 'srcy0'))
-    if inst_class == VOP2 and inst.op in (44, 45, 55, 56): has_literal = True
-    if inst_class == VOPD and (inst.opx in (1, 2) or inst.opy in (1, 2)): has_literal = True
-    if inst_class == SOP2 and inst.op in (69, 70): has_literal = True
-    if has_literal: inst._literal = int.from_bytes(data[i+base_size:i+base_size+4], 'little')
-    inst._words = inst.size() // 4 # cache 
size for step_wave + # from_bytes already handles literal reading - only need fallback for cases it doesn't handle + if inst._literal is None: + has_literal = any(getattr(inst, fld, None) == 255 for fld in ('src0', 'src1', 'src2', 'ssrc0', 'ssrc1', 'srcx0', 'srcy0')) + if inst_class == VOP2 and inst.op in (44, 45, 55, 56): has_literal = True + if inst_class == VOPD and (inst.opx in (1, 2) or inst.opy in (1, 2)): has_literal = True + if inst_class == SOP2 and inst.op in (69, 70): has_literal = True + if has_literal: + # For 64-bit ops, the 32-bit literal is placed in HIGH 32 bits (low 32 bits = 0) + # Exception: some ops have mixed src sizes (e.g., V_LDEXP_F64 has 32-bit src1) + op_val = inst._values.get('op') + if hasattr(op_val, 'value'): op_val = op_val.value + is_64bit = inst_class is VOP3 and op_val in _VOP3_64BIT_OPS + # Don't treat literal as 64-bit if the op has 32-bit src1 and src1 is the literal + if is_64bit and op_val in _VOP3_64BIT_OPS_32BIT_SRC1 and getattr(inst, 'src1', None) == 255: + is_64bit = False + lit32 = int.from_bytes(data[i+base_size:i+base_size+4], 'little') + inst._literal = (lit32 << 32) if is_64bit else lit32 + inst._words = inst.size() // 4 result[i // 4] = inst i += inst._words * 4 return result # ═══════════════════════════════════════════════════════════════════════════════ -# SCALAR EXECUTION +# EXECUTION - All ALU ops use pseudocode from PDF # ═══════════════════════════════════════════════════════════════════════════════ -def exec_sop1(st: WaveState, inst: SOP1) -> int: - s0, op = st.rsrc(inst.ssrc0, 0), inst.op - # 64-bit and special ops handled inline - if op == SOP1Op.S_MOV_B64: st.wsgpr64(inst.sdst, st.rsrc64(inst.ssrc0, 0)); return 0 - if op == SOP1Op.S_NOT_B64: r = (~st.rsrc64(inst.ssrc0, 0)) & 0xffffffffffffffff; st.wsgpr64(inst.sdst, r); st.scc = int(r != 0); return 0 - if op == SOP1Op.S_BITSET0_B32: st.wsgpr(inst.sdst, st.rsgpr(inst.sdst) & ~(1 << (s0 & 0x1f))); return 0 - if op == SOP1Op.S_BITSET1_B32: st.wsgpr(inst.sdst, st.rsgpr(inst.sdst) | (1 << (s0 & 0x1f))); return 0 - if op == SOP1Op.S_AND_SAVEEXEC_B32: old = st.exec_mask & 0xffffffff; st.exec_mask = s0 & old; st.scc = int(st.exec_mask != 0); st.wsgpr(inst.sdst, old); return 0 - if op == SOP1Op.S_OR_SAVEEXEC_B32: old = st.exec_mask & 0xffffffff; st.exec_mask = s0 | old; st.scc = int(st.exec_mask != 0); st.wsgpr(inst.sdst, old); return 0 - if op == SOP1Op.S_AND_NOT1_SAVEEXEC_B32: old = st.exec_mask & 0xffffffff; st.exec_mask = s0 & (~old & 0xffffffff); st.scc = int(st.exec_mask != 0); st.wsgpr(inst.sdst, old); return 0 - if op == SOP1Op.S_GETPC_B64: return -3 - if op == SOP1Op.S_SETPC_B64: return -4 - if op == SOP1Op.S_SWAPPC_B64: return -5 - if (fn := SALU.get(SOP1_BASE + op)) is None: raise NotImplementedError(f"SOP1 op {op}") - r, scc = fn(s0, 0, st.scc); st.wsgpr(inst.sdst, r); st.scc = scc; return 0 -_SOP2_64: dict[int, Callable[[int, int], int]] = {SOP2Op.S_AND_B64: lambda a, b: a & b, SOP2Op.S_OR_B64: lambda a, b: a | b, SOP2Op.S_XOR_B64: lambda a, b: a ^ b} -def exec_sop2(st: WaveState, inst: SOP2) -> int: - s0, s1, op = st.rsrc(inst.ssrc0, 0), st.rsrc(inst.ssrc1, 0), inst.op - # 64-bit ops handled inline - if op == SOP2Op.S_LSHL_B64: r = (st.rsrc64(inst.ssrc0, 0) << (s1 & 0x3f)) & 0xffffffffffffffff; st.wsgpr64(inst.sdst, r); st.scc = int(r != 0); return 0 - if op == SOP2Op.S_LSHR_B64: r = st.rsrc64(inst.ssrc0, 0) >> (s1 & 0x3f); st.wsgpr64(inst.sdst, r); st.scc = int(r != 0); return 0 - if op == SOP2Op.S_ASHR_I64: r = sext(st.rsrc64(inst.ssrc0, 0), 64) >> (s1 & 0x3f); 
st.wsgpr64(inst.sdst, r & 0xffffffffffffffff); st.scc = int(r != 0); return 0 - if (fn := _SOP2_64.get(op)): r = fn(st.rsrc64(inst.ssrc0, 0), st.rsrc64(inst.ssrc1, 0)); st.wsgpr64(inst.sdst, r); st.scc = int(r != 0); return 0 - if op == SOP2Op.S_CSELECT_B64: st.wsgpr64(inst.sdst, st.rsrc64(inst.ssrc0, 0) if st.scc else st.rsrc64(inst.ssrc1, 0)); return 0 - if op == SOP2Op.S_FMAC_F32: st.wsgpr(inst.sdst, i32(f32(st.rsgpr(inst.sdst)) + f32(s0) * f32(s1))); return 0 - if op == SOP2Op.S_FMAAK_F32: st.wsgpr(inst.sdst, i32(f32(s0) * f32(s1) + f32(inst._literal or 0))); return 0 - if op == SOP2Op.S_FMAMK_F32: st.wsgpr(inst.sdst, i32(f32(s0) * f32(inst._literal or 0) + f32(s1))); return 0 - if (fn := SALU.get(SOP2_BASE + op)) is None: raise NotImplementedError(f"SOP2 op {op}") - r, scc = fn(s0, s1, st.scc); st.wsgpr(inst.sdst, r); st.scc = scc; return 0 +def exec_scalar(st: WaveState, inst: Inst) -> int: + """Execute scalar instruction. Returns PC delta or negative for special cases.""" + compiled = _get_compiled() + inst_type = type(inst) -def exec_sopc(st: WaveState, inst: SOPC) -> int: - s0, s1, op = st.rsrc(inst.ssrc0, 0), st.rsrc(inst.ssrc1, 0), inst.op - if op == SOPCOp.S_CMP_EQ_U64: st.scc = int(st.rsrc64(inst.ssrc0, 0) == st.rsrc64(inst.ssrc1, 0)); return 0 - if op == SOPCOp.S_CMP_LG_U64: st.scc = int(st.rsrc64(inst.ssrc0, 0) != st.rsrc64(inst.ssrc1, 0)); return 0 - if (fn := SALU.get(SOPC_BASE + op)) is None: raise NotImplementedError(f"SOPC op {op}") - st.scc = fn(s0, s1, st.scc)[1]; return 0 + # SOPP: control flow (not ALU) + if inst_type is SOPP: + op = inst.op + if op == SOPPOp.S_ENDPGM: return -1 + if op == SOPPOp.S_BARRIER: return -2 + if op == SOPPOp.S_BRANCH: return _sext(inst.simm16, 16) + if op == SOPPOp.S_CBRANCH_SCC0: return _sext(inst.simm16, 16) if st.scc == 0 else 0 + if op == SOPPOp.S_CBRANCH_SCC1: return _sext(inst.simm16, 16) if st.scc == 1 else 0 + if op == SOPPOp.S_CBRANCH_VCCZ: return _sext(inst.simm16, 16) if (st.vcc & 0xffffffff) == 0 else 0 + if op == SOPPOp.S_CBRANCH_VCCNZ: return _sext(inst.simm16, 16) if (st.vcc & 0xffffffff) != 0 else 0 + if op == SOPPOp.S_CBRANCH_EXECZ: return _sext(inst.simm16, 16) if st.exec_mask == 0 else 0 + if op == SOPPOp.S_CBRANCH_EXECNZ: return _sext(inst.simm16, 16) if st.exec_mask != 0 else 0 + # Valid SOPP range is 0-61 (max defined opcode); anything above is invalid + if op > 61: raise NotImplementedError(f"Invalid SOPP opcode {op}") + return 0 # waits, hints, nops -_SOPK_CMP = frozenset((SOPKOp.S_CMPK_EQ_I32, SOPKOp.S_CMPK_LG_I32, SOPKOp.S_CMPK_GT_I32, SOPKOp.S_CMPK_GE_I32, - SOPKOp.S_CMPK_LT_I32, SOPKOp.S_CMPK_LE_I32, SOPKOp.S_CMPK_EQ_U32, SOPKOp.S_CMPK_LG_U32, - SOPKOp.S_CMPK_GT_U32, SOPKOp.S_CMPK_GE_U32, SOPKOp.S_CMPK_LT_U32, SOPKOp.S_CMPK_LE_U32)) -def exec_sopk(st: WaveState, inst: SOPK) -> int: - simm, s0, op = inst.simm16, st.rsgpr(inst.sdst), inst.op - if op in SOPK_WAIT: return 0 - if (fn := SALU.get(SOPK_BASE + op)) is None: raise NotImplementedError(f"SOPK op {op}") - r, scc = fn(s0, simm, st.scc) - if op not in _SOPK_CMP: st.wsgpr(inst.sdst, r) - st.scc = scc; return 0 + # SMEM: memory loads (not ALU) + if inst_type is SMEM: + addr = st.rsgpr64(inst.sbase * 2) + _sext(inst.offset, 21) + if inst.soffset not in (NULL, 0x7f): addr += st.rsrc(inst.soffset, 0) + if (cnt := SMEM_LOAD.get(inst.op)) is None: raise NotImplementedError(f"SMEM op {inst.op}") + for i in range(cnt): st.wsgpr(inst.sdata + i, mem_read((addr + i * 4) & 0xffffffffffffffff, 4)) + return 0 -def exec_sopp(st: WaveState, inst: SOPP) -> int: - if inst.op == 
SOPPOp.S_ENDPGM: return -1
-  if inst.op == SOPPOp.S_BARRIER: return -2
-  if inst.op == SOPPOp.S_BRANCH: return sext(inst.simm16, 16)
-  if inst.op == SOPPOp.S_CBRANCH_SCC0: return sext(inst.simm16, 16) if st.scc == 0 else 0
-  if inst.op == SOPPOp.S_CBRANCH_SCC1: return sext(inst.simm16, 16) if st.scc == 1 else 0
-  # In wave32 mode, only VCC_LO is used for lane masks; VCC_HI is a free SGPR
-  if inst.op == SOPPOp.S_CBRANCH_VCCZ: return sext(inst.simm16, 16) if (st.vcc & 0xffffffff) == 0 else 0
-  if inst.op == SOPPOp.S_CBRANCH_VCCNZ: return sext(inst.simm16, 16) if (st.vcc & 0xffffffff) != 0 else 0
-  if inst.op == SOPPOp.S_CBRANCH_EXECZ: return sext(inst.simm16, 16) if st.exec_mask == 0 else 0
-  if inst.op == SOPPOp.S_CBRANCH_EXECNZ: return sext(inst.simm16, 16) if st.exec_mask != 0 else 0
-  # Scheduling hints and wait instructions are no-ops in emulation
-  if inst.op <= 31: return 0  # S_NOP, S_CLAUSE, S_DELAY_ALU, S_WAITCNT, etc.
-  # S_WAKEUP(52), S_SETPRIO(53), S_SENDMSG(54), S_SENDMSGHALT(55), perf counters, S_ICACHE_INV(60) are no-ops
-  if inst.op in (52, 53, 54, 55, 56, 57, 60): return 0
-  raise NotImplementedError(f"SOPP op {inst.op}")
+  # SOP1: special handling for ops not in pseudocode
+  if inst_type is SOP1:
+    op = SOP1Op(inst.op)
+    # S_GETPC_B64: write the byte address of the NEXT instruction (st.pc counts 32-bit words, the architectural PC is in bytes)
+    if op == SOP1Op.S_GETPC_B64:
+      pc_bytes = (st.pc + 1) * 4  # +1: SOP1 is always 1 word, and hardware returns the following instruction's address
+      st.wsgpr64(inst.sdst, pc_bytes)
+      return 0
+    # S_SETPC_B64: Set program counter to source value (indirect jump)
+    # Returns delta such that st.pc + inst_words + delta = target_words
+    if op == SOP1Op.S_SETPC_B64:
+      target_bytes = st.rsrc64(inst.ssrc0, 0)
+      target_words = target_bytes // 4
+      inst_words = 1  # SOP1 is always 1 word
+      return target_words - st.pc - inst_words
-def exec_smem(st: WaveState, inst: SMEM) -> int:
-  addr = st.rsgpr64(inst.sbase * 2) + sext(inst.offset, 21)
-  if inst.soffset not in (NULL, 0x7f): addr += st.rsrc(inst.soffset, 0)
-  if (cnt := SMEM_LOAD.get(inst.op)) is None: raise NotImplementedError(f"SMEM op {inst.op}")
-  for i in range(cnt): st.wsgpr(inst.sdata + i, mem_read((addr + i * 4) & 0xffffffffffffffff, 4))
+  # Get op enum and lookup compiled function
+  if inst_type is SOP1: op_cls, ssrc0, sdst = SOP1Op, inst.ssrc0, inst.sdst
+  elif inst_type is SOP2: op_cls, ssrc0, sdst = SOP2Op, inst.ssrc0, inst.sdst
+  elif inst_type is SOPC: op_cls, ssrc0, sdst = SOPCOp, inst.ssrc0, None
+  elif inst_type is SOPK: op_cls, ssrc0, sdst = SOPKOp, inst.sdst, inst.sdst  # sdst is both src and dst
+  else: raise NotImplementedError(f"Unknown scalar type {inst_type}")
+
+  op = op_cls(inst.op)
+  fn = compiled.get(op_cls, {}).get(op)
+  if fn is None: raise NotImplementedError(f"{op.name} not in pseudocode")
+
+  # Build context - handle 64-bit ops that need 64-bit source reads
+  # 64-bit source ops: name ends with _B64, _I64, _U64 or contains _U64, _I64 before last underscore
+  is_64bit_s0 = op.name.endswith(('_B64', '_I64', '_U64')) or '_U64_' in op.name or '_I64_' in op.name
+  is_64bit_s0s1 = op_cls is SOPCOp and op in (SOPCOp.S_CMP_EQ_U64, SOPCOp.S_CMP_LG_U64)
+  s0 = st.rsrc64(ssrc0, 0) if is_64bit_s0 or is_64bit_s0s1 else (st.rsrc(ssrc0, 0) if inst_type != SOPK else st.rsgpr(inst.sdst))
+  is_64bit_sop2 = is_64bit_s0 and inst_type is SOP2
+  s1 = st.rsrc64(inst.ssrc1, 0) if (is_64bit_sop2 or is_64bit_s0s1) else (st.rsrc(inst.ssrc1, 0) if inst_type in (SOP2, SOPC) else inst.simm16 if inst_type is SOPK else 0)
+  d0 = st.rsgpr64(sdst) if (is_64bit_s0 or is_64bit_s0s1) and sdst is not None else (st.rsgpr(sdst) if sdst is not None else 0)
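+  # Calling convention shared with exec_vector: compiled pseudocode functions take
+  # (s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, vgpr, state) positionally and
+  # return a dict of changed state; e.g. S_ADD_U32 comes back as {'d0': sum & 0xffffffff,
+  # 'scc': carry}, which the apply step below writes to inst.sdst and st.scc.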
+  exec_mask = st.exec_mask
+  literal = inst.simm16 if inst_type is SOPK else st.literal
+
+  # Execute compiled function
+  result = fn(s0, s1, 0, d0, st.scc, st.vcc, 0, exec_mask, literal, None, {})
+
+  # Apply results
+  if sdst is not None:
+    if result.get('d0_64'):
+      st.wsgpr64(sdst, result['d0'])
+    else:
+      st.wsgpr(sdst, result['d0'])
+  if 'scc' in result: st.scc = result['scc']
+  if 'exec' in result: st.exec_mask = result['exec']
+  if 'pc_delta' in result: return result['pc_delta']
   return 0

-# ═══════════════════════════════════════════════════════════════════════════════
-# VECTOR EXECUTION
-# ═══════════════════════════════════════════════════════════════════════════════
-def f64(hi: int, lo: int) -> float: return struct.unpack('<d', struct.pack('<Q', ((hi & 0xffffffff) << 32) | (lo & 0xffffffff)))[0]
-def i64_parts(f: float) -> tuple[int, int]:
-  if math.isnan(f): val = 0x7ff8000000000000
-  elif math.isinf(f): val = 0x7ff0000000000000 if f > 0 else 0xfff0000000000000
-  else: val = struct.unpack('<Q', struct.pack('<d', f))[0]
-  return val & 0xffffffff, (val >> 32) & 0xffffffff
+def exec_vector(st: WaveState, inst: Inst, lane: int, lds: bytearray | None = None) -> None:
+  """Execute vector instruction for one lane."""
+  compiled = _get_compiled()
+  inst_type, V = type(inst), st.vgpr[lane]
-def exec_vop1(st: WaveState, inst: VOP1, lane: int) -> None:
-  if inst.op == VOP1Op.V_NOP: return
-  V, s0 = st.vgpr[lane], st.rsrc(inst.src0, lane)
-  if inst.op == VOP1Op.V_READFIRSTLANE_B32:
-    first = (st.exec_mask & -st.exec_mask).bit_length() - 1 if st.exec_mask else 0
-    st.wsgpr(inst.vdst, st.rsrc(inst.src0, first) if inst.src0 >= 256 else s0); return
-  # F64 ops handled inline
-  if inst.op == VOP1Op.V_CVT_F64_F32: V[inst.vdst], V[inst.vdst+1] = i64_parts(float(f32(s0))); return
-  if inst.op == VOP1Op.V_CVT_F64_I32: V[inst.vdst], V[inst.vdst+1] = i64_parts(float(sext(s0, 32))); return
-  if inst.op == VOP1Op.V_CVT_F64_U32: V[inst.vdst], V[inst.vdst+1] = i64_parts(float(s0)); return
-  if inst.op in (VOP1Op.V_CVT_F32_F64, VOP1Op.V_CVT_I32_F64, VOP1Op.V_CVT_U32_F64):
-    src = inst.src0 - 256 if inst.src0 >= 256 else inst.src0
-    lo, hi = (V[src], V[src+1]) if inst.src0 >= 256 else (st.sgpr[src], st.sgpr[src+1])
-    v = f64(hi, lo)
-    if inst.op == VOP1Op.V_CVT_F32_F64: V[inst.vdst] = i32(v)
-    elif inst.op == VOP1Op.V_CVT_I32_F64: V[inst.vdst] = (max(-0x80000000, min(0x7fffffff, int(v))) & 0xffffffff) if math.isfinite(v) else 0
-    else: V[inst.vdst] = max(0, min(0xffffffff, int(v))) if math.isfinite(v) and v == v else 0
+  # Memory ops (not ALU pseudocode)
+  if inst_type is FLAT:
+    op, addr_reg, data_reg, vdst, offset, saddr = inst.op, inst.addr, inst.data, inst.vdst, _sext(inst.offset, 13), inst.saddr
+    addr = V[addr_reg] | (V[addr_reg+1] << 32)
+    addr = (st.rsgpr64(saddr) + V[addr_reg] + offset) & 0xffffffffffffffff if saddr not in (NULL, 0x7f) else (addr + offset) & 0xffffffffffffffff
+    if op in FLAT_LOAD:
+      cnt, sz, sign = FLAT_LOAD[op]
+      for i in range(cnt): val = mem_read(addr + i * sz, sz); V[vdst + i] = _sext(val, sz * 8) & 0xffffffff if sign else val
+    elif op in FLAT_STORE:
+      cnt, sz = FLAT_STORE[op]
+      for i in range(cnt): mem_write(addr + i * sz, sz, V[data_reg + i] & ((1 << (sz * 8)) - 1))
+    elif op in FLAT_D16_LOAD:
+      sz, sign, hi = FLAT_D16_LOAD[op]
+      val = mem_read(addr, sz)
+      if sign: val = _sext(val, sz * 8) & 0xffff
+      if hi: V[vdst] = (V[vdst] & 0xffff) | (val << 16)  # upper 16 bits
+      else: V[vdst] = (V[vdst] & 0xffff0000) | (val & 0xffff)  # lower 16 bits
+    elif op in FLAT_D16_STORE:
+      sz, hi = FLAT_D16_STORE[op]
+      val = (V[data_reg] >> 16) & 0xffff if hi else V[data_reg] & 0xffff
+      mem_write(addr, sz, val & ((1 << (sz * 8)) - 1))
+    else: raise NotImplementedError(f"FLAT op {op}")
     return
-  if (fn := VALU.get(VOP1_BASE + inst.op)): V[inst.vdst] = fn(s0, 0, 0); return
-  raise NotImplementedError(f"VOP1 op {inst.op}")
-def exec_vop2(st: WaveState, inst: VOP2, lane: int) -> None:
-  V, s0, s1, op = st.vgpr[lane], st.rsrc(inst.src0, lane), st.vgpr[lane][inst.vsrc1], inst.op
-  if op == VOP2Op.V_CNDMASK_B32: V[inst.vdst] = s1 if (st.vcc >> lane) & 1 else s0; return
-  if op == VOP2Op.V_FMAC_F32: V[inst.vdst] = i32(f32(s0)*f32(s1)+f32(V[inst.vdst])); return
-  if op == VOP2Op.V_FMAMK_F32: V[inst.vdst] = i32(f32(s0)*f32(st.literal)+f32(s1)); return
-  if op == VOP2Op.V_FMAAK_F32: V[inst.vdst] = i32(f32(s0)*f32(s1)+f32(st.literal)); return
-  if op == VOP2Op.V_FMAC_F16: V[inst.vdst] = (V[inst.vdst] & 0xffff0000) | i16(f16(s0)*f16(s1)+f16(V[inst.vdst])); return
-  if op == VOP2Op.V_FMAMK_F16: V[inst.vdst] = (V[inst.vdst] & 0xffff0000) | i16(f16(s0)*f16(st.literal)+f16(s1)); return
-  if op == VOP2Op.V_FMAAK_F16: V[inst.vdst] = (V[inst.vdst] & 0xffff0000) | i16(f16(s0)*f16(s1)+f16(st.literal)); return
-  if op == VOP2Op.V_PK_FMAC_F16:
-    lo = i16(f16(s0 & 0xffff) * f16(s1 & 0xffff) + f16(V[inst.vdst] & 0xffff))
-    hi = i16(f16((s0 >> 16) & 0xffff) * f16((s1 >> 16) & 0xffff) + f16((V[inst.vdst] >> 16) & 0xffff))
-    V[inst.vdst] = lo | (hi << 16); return
-  if op == VOP2Op.V_ADD_CO_CI_U32: r = s0+s1+((st.vcc>>lane)&1); st.pend_sgpr_lane(VCC_LO, lane, r >= 0x100000000); V[inst.vdst] = r & 0xffffffff; return
-  if op == VOP2Op.V_SUB_CO_CI_U32: b = (st.vcc>>lane)&1; st.pend_sgpr_lane(VCC_LO, lane, s1+b > s0); V[inst.vdst] = (s0-s1-b) & 0xffffffff; return
-  if (fn := VALU.get(VOP2_BASE + op)): V[inst.vdst] = fn(s0, s1, 0); return
-  raise NotImplementedError(f"VOP2 op {op}")
+  if inst_type is DS:
+    op, addr, vdst = inst.op, (V[inst.addr] + inst.offset0) & 0xffff, inst.vdst
+    if op in DS_LOAD:
+      cnt, sz, sign = DS_LOAD[op]
+      for i in range(cnt): val = int.from_bytes(lds[addr+i*sz:addr+i*sz+sz], 'little'); V[vdst + i] = _sext(val, sz * 8) & 0xffffffff if sign else val
+    elif op in DS_STORE:
+      cnt, sz = DS_STORE[op]
+      for i in range(cnt): lds[addr+i*sz:addr+i*sz+sz] = (V[inst.data0 + i] & ((1 << (sz * 8)) - 1)).to_bytes(sz, 'little')
+    else: raise NotImplementedError(f"DS op {op}")
+    return
-def vop3_mod(val: int, neg: int, abs_: int, idx: int) -> int:
-  if (abs_ >> idx) & 1: val = i32(abs(f32(val)))
-  if (neg >> idx) & 1: val = i32(-f32(val))
-  return val
+  # VOPD: dual-issue, execute two ops using VOP2/VOP3 compiled functions
+  # Both ops execute simultaneously using pre-instruction values, so read all inputs first
+  if inst_type is VOPD:
+    vdsty = (inst.vdsty << 1) | ((inst.vdstx & 1) ^ 1)
+    # Read all source operands BEFORE any writes (dual-issue semantics)
+    sx0, sx1 = st.rsrc(inst.srcx0, lane), V[inst.vsrcx1]
+    sy0, sy1 = st.rsrc(inst.srcy0, lane), V[inst.vsrcy1]
+    dx0, dy0 = V[inst.vdstx], V[vdsty]
+    # Execute X op; a missing mapping or compiled function must fail loudly, not silently skip
+    # (test against None: .get() truthiness would also misfire on an enum with value 0)
+    if (op_x := _VOPD_TO_VOP.get(inst.opx)) is None: raise NotImplementedError(f"VOPD opx {inst.opx}")
+    if (fn_x := compiled.get(type(op_x), {}).get(op_x)) is None: raise NotImplementedError(f"{op_x.name} not in pseudocode")
+    res_x = fn_x(sx0, sx1, 0, dx0, st.scc, st.vcc, lane, st.exec_mask, st.literal, None, {})
+    # Execute Y op
+    if (op_y := _VOPD_TO_VOP.get(inst.opy)) is None: raise NotImplementedError(f"VOPD opy {inst.opy}")
+    if (fn_y := compiled.get(type(op_y), {}).get(op_y)) is None: raise NotImplementedError(f"{op_y.name} not in pseudocode")
+    res_y = fn_y(sy0, sy1, 0, dy0, st.scc, st.vcc, lane, st.exec_mask, st.literal, None, {})
+    # Write results after both ops complete
+    V[inst.vdstx] = res_x['d0']
+    V[vdsty] = res_y['d0']
+    return
-def
exec_vop3(st: WaveState, inst: VOP3, lane: int) -> None: - op, src0, src1, src2, vdst, neg, abs_ = inst.op, inst.src0, inst.src1, inst.src2, inst.vdst, inst.neg, getattr(inst, 'abs', 0) - V = st.vgpr[lane] - # VOPC encoded in VOP3 (0-255) - if 0 <= op <= 255: - base = op & 0x7f - # For 64-bit comparisons (I64: 80-87, U64: 88-95), read raw 64-bit values (no float modifiers) - if 80 <= base <= 95: - s0_64, s1_64 = st.rsrc64(src0, lane), st.rsrc64(src1, lane) - result = vopc(op, s0_64 & 0xffffffff, s1_64 & 0xffffffff, (s0_64 >> 32) & 0xffffffff, (s1_64 >> 32) & 0xffffffff) + # VOP3SD: has extra scalar dest for carry output + if inst_type is VOP3SD: + op = VOP3SDOp(inst.op) + fn = compiled.get(VOP3SDOp, {}).get(op) + if fn is None: raise NotImplementedError(f"{op.name} not in pseudocode") + s0, s1, s2 = st.rsrc(inst.src0, lane), st.rsrc(inst.src1, lane), st.rsrc(inst.src2, lane) + # For 64-bit src2 ops (V_MAD_U64_U32, V_MAD_I64_I32), read from consecutive registers + mad64_ops = (VOP3SDOp.V_MAD_U64_U32, VOP3SDOp.V_MAD_I64_I32) + if op in mad64_ops: + if inst.src2 >= 256: # VGPR + s2 = V[inst.src2 - 256] | (V[inst.src2 - 256 + 1] << 32) + else: # SGPR - read 64-bit from consecutive SGPRs + s2 = st.rsgpr64(inst.src2) + d0 = V[inst.vdst] + # For carry-in operations (V_*_CO_CI_*), src2 register contains the carry bitmask (not VCC). + # The pseudocode uses VCC but in VOP3SD encoding, the actual carry source is inst.src2. + # We pass the src2 register value as 'vcc' to the interpreter so it reads the correct carry. + carry_ops = (VOP3SDOp.V_ADD_CO_CI_U32, VOP3SDOp.V_SUB_CO_CI_U32, VOP3SDOp.V_SUBREV_CO_CI_U32) + vcc_for_exec = st.rsgpr64(inst.src2) if op in carry_ops else st.vcc + result = fn(s0, s1, s2, d0, st.scc, vcc_for_exec, lane, st.exec_mask, st.literal, None, {}) + # Write result - handle 64-bit destinations + if result.get('d0_64'): + V[inst.vdst] = result['d0'] & 0xffffffff + V[inst.vdst + 1] = (result['d0'] >> 32) & 0xffffffff else: - s0, s1 = vop3_mod(st.rsrc(src0, lane), neg, abs_, 0), vop3_mod(st.rsrc(src1, lane), neg, abs_, 1) - result = vopc(op, s0, s1) - is_cmpx = op >= 128 - st.pend_sgpr_lane(vdst, lane, result) - if is_cmpx: st.pend_sgpr_lane(EXEC_LO, lane, result) + V[inst.vdst] = result['d0'] & 0xffffffff + if result.get('vcc_lane') is not None: + st.pend_sgpr_lane(inst.sdst, lane, result['vcc_lane']) return - s0, s1, s2 = vop3_mod(st.rsrc(src0, lane), neg, abs_, 0), vop3_mod(st.rsrc(src1, lane), neg, abs_, 1), vop3_mod(st.rsrc(src2, lane), neg, abs_, 2) - # Special ops - if op == VOP3Op.V_FMAC_F32: V[vdst] = i32(f32(s0)*f32(s1)+f32(V[vdst])); return - if op == VOP3Op.V_READLANE_B32: st.wsgpr(vdst, st.vgpr[s1 & 0x1f][src0 - 256] if src0 >= 256 else s0); return - if op == VOP3Op.V_WRITELANE_B32: st.vgpr[s1 & 0x1f][vdst] = s0; return - if op == VOP3Op.V_CNDMASK_B32: - mask = st.rsgpr(src2) if src2 < 256 else st.vcc - V[vdst] = s1 if (mask >> lane) & 1 else s0; return - if op in (VOP3Op.V_LSHLREV_B64, VOP3Op.V_LSHRREV_B64, VOP3Op.V_ASHRREV_I64): - v64 = st.rsrc64(src1, lane) - r = ((v64 << (s0 & 0x3f)) & 0xffffffffffffffff if op == VOP3Op.V_LSHLREV_B64 else - v64 >> (s0 & 0x3f) if op == VOP3Op.V_LSHRREV_B64 else sext(v64, 64) >> (s0 & 0x3f)) - V[vdst], V[vdst+1] = r & 0xffffffff, (r >> 32) & 0xffffffff; return - if op in (VOP3Op.V_ADD_F64, VOP3Op.V_MUL_F64, VOP3Op.V_FMA_F64, VOP3Op.V_MAX_F64, VOP3Op.V_MIN_F64): - a, b = f64(st.rsrc(src0+1, lane), s0), f64(st.rsrc(src1+1, lane), s1) - c = f64(st.rsrc(src2+1, lane), s2) if op == VOP3Op.V_FMA_F64 else 0.0 - rf = a + b if op == 
VOP3Op.V_ADD_F64 else a * b if op == VOP3Op.V_MUL_F64 else a * b + c if op == VOP3Op.V_FMA_F64 else max(a, b) if op == VOP3Op.V_MAX_F64 else min(a, b) - V[vdst], V[vdst+1] = i64_parts(rf); return - if (fn := VALU.get(op)): V[vdst] = fn(s0, s1, s2); return - raise NotImplementedError(f"VOP3 op {op}") -def exec_vopc(st: WaveState, inst: VOPC, lane: int) -> None: - result, is_cmpx = vopc(inst.op, st.rsrc(inst.src0, lane), st.vgpr[lane][inst.vsrc1]), inst.op >= 128 - st.pend_sgpr_lane(EXEC_LO if is_cmpx else VCC_LO, lane, result) -def exec_vop3sd(st: WaveState, inst: VOP3SD, lane: int) -> None: - op, src0, src1, src2, vdst, sdst, neg = inst.op, inst.src0, inst.src1, inst.src2, inst.vdst, inst.sdst, inst.neg - s0, s1, s2 = st.rsrc(src0, lane), st.rsrc(src1, lane), st.rsrc(src2, lane) - if (neg >> 0) & 1: s0 = i32(-f32(s0)) - if (neg >> 1) & 1: s1 = i32(-f32(s1)) - if (neg >> 2) & 1: s2 = i32(-f32(s2)) - V = st.vgpr[lane] - if op == VOP3SDOp.V_ADD_CO_U32: r = s0 + s1; V[vdst] = r & 0xffffffff; st.pend_sgpr_lane(sdst, lane, r >= 0x100000000) - elif op == VOP3SDOp.V_SUB_CO_U32: V[vdst] = (s0 - s1) & 0xffffffff; st.pend_sgpr_lane(sdst, lane, s1 > s0) - elif op == VOP3SDOp.V_SUBREV_CO_U32: V[vdst] = (s1 - s0) & 0xffffffff; st.pend_sgpr_lane(sdst, lane, s0 > s1) - elif op == VOP3SDOp.V_ADD_CO_CI_U32: - cin = (st.rsgpr(src2) >> lane) & 1 if src2 < 256 else (st.vcc >> lane) & 1 - r = s0 + s1 + cin; V[vdst] = r & 0xffffffff; st.pend_sgpr_lane(sdst, lane, r >= 0x100000000) - elif op == VOP3SDOp.V_SUB_CO_CI_U32: - cin = (st.rsgpr(src2) >> lane) & 1 if src2 < 256 else (st.vcc >> lane) & 1 - V[vdst] = (s0 - s1 - cin) & 0xffffffff; st.pend_sgpr_lane(sdst, lane, s1 + cin > s0) - elif op == VOP3SDOp.V_MAD_U64_U32: - s2_64 = s2 | (st.rsrc(src2+1, lane) << 32); r = s0 * s1 + s2_64 - V[vdst], V[vdst+1] = r & 0xffffffff, (r >> 32) & 0xffffffff - elif op == VOP3SDOp.V_MAD_I64_I32: - s2_64 = sext(s2 | (st.rsrc(src2+1, lane) << 32), 64) - r = (sext(s0, 32) * sext(s1, 32) + s2_64) & 0xffffffffffffffff - V[vdst], V[vdst+1] = r & 0xffffffff, (r >> 32) & 0xffffffff - elif op == VOP3SDOp.V_DIV_SCALE_F32: V[vdst] = 0; st.pend_sgpr_lane(sdst, lane, False) - elif op == VOP3SDOp.V_DIV_SCALE_F64: V[vdst], V[vdst+1] = s0, st.rsrc(src0+1, lane); st.pend_sgpr_lane(VCC_LO, lane, s0 == s2) - else: raise NotImplementedError(f"VOP3SD op {op}") -def exec_flat(st: WaveState, inst: FLAT, lane: int) -> None: - op, addr_reg, data_reg, vdst, offset, saddr, V = inst.op, inst.addr, inst.data, inst.vdst, sext(inst.offset, 13), inst.saddr, st.vgpr[lane] - addr = V[addr_reg] | (V[addr_reg+1] << 32) - addr = (st.rsgpr64(saddr) + V[addr_reg] + offset) & 0xffffffffffffffff if saddr not in (NULL, 0x7f) else (addr + offset) & 0xffffffffffffffff - if op in FLAT_LOAD: - cnt, sz, sign = FLAT_LOAD[op] - for i in range(cnt): val = mem_read(addr + i * sz, sz); V[vdst + i] = sext(val, sz * 8) & 0xffffffff if sign else val - elif op in FLAT_STORE: - cnt, sz = FLAT_STORE[op] - for i in range(cnt): mem_write(addr + i * sz, sz, V[data_reg + i] & ((1 << (sz * 8)) - 1)) - elif op in FLAT_D16_LO: sz, sign = FLAT_D16_LO[op]; val = mem_read(addr, sz); V[vdst] = (V[vdst] & 0xffff0000) | ((sext(val, sz * 8) & 0xffff) if sign else (val & 0xffff)) - elif op in FLAT_D16_HI: sz, sign = FLAT_D16_HI[op]; val = mem_read(addr, sz); V[vdst] = (V[vdst] & 0x0000ffff) | (((sext(val, sz * 8) & 0xffff) if sign else (val & 0xffff)) << 16) - elif op in FLAT_D16_STORE: mem_write(addr, FLAT_D16_STORE[op], (V[data_reg] >> 16) & ((1 << (FLAT_D16_STORE[op] * 8)) - 1)) - else: raise 
NotImplementedError(f"FLAT op {op}")
+  # Get op enum and sources (None means "no source" for that operand)
+  if inst_type is VOP1:
+    if inst.op == VOP1Op.V_NOP: return
+    op_cls, op, src0, src1, src2, vdst = VOP1Op, VOP1Op(inst.op), inst.src0, None, None, inst.vdst
+  elif inst_type is VOP2:
+    op_cls, op, src0, src1, src2, vdst = VOP2Op, VOP2Op(inst.op), inst.src0, inst.vsrc1 + 256, None, inst.vdst
+  elif inst_type is VOP3:
+    # VOP3 ops 0-255 are VOPC comparisons encoded as VOP3 (use VOPCOp pseudocode)
+    if inst.op < 256:
+      op_cls, op, src0, src1, src2, vdst = VOPCOp, VOPCOp(inst.op), inst.src0, inst.src1, None, inst.vdst
+    else:
+      op_cls, op, src0, src1, src2, vdst = VOP3Op, VOP3Op(inst.op), inst.src0, inst.src1, inst.src2, inst.vdst
+    # V_PERM_B32: byte permutation - not in pseudocode PDF, implement directly
+    # D0[byte_i] = selector[byte_i] < 8 ? {src1, src0}[selector[byte_i]] : (selector[byte_i] >= 0xD ? 0xFF : 0x00)
+    if op == VOP3Op.V_PERM_B32:
+      s0, s1, s2 = st.rsrc(inst.src0, lane), st.rsrc(inst.src1, lane), st.rsrc(inst.src2, lane)
+      # Combine src0 and src1 into 8-byte value: src0 is bytes 0-3, src1 is bytes 4-7
+      combined = (s0 & 0xffffffff) | ((s1 & 0xffffffff) << 32)
+      result = 0
+      for i in range(4):  # 4 result bytes
+        sel = (s2 >> (i * 8)) & 0xff  # byte selector for this position
+        if sel <= 7: result |= (((combined >> (sel * 8)) & 0xff) << (i * 8))  # select byte from combined
+        elif sel >= 0xd: result |= (0xff << (i * 8))  # 0xD-0xF: constant 0xFF
+        # else 0x8-0xC: constant 0x00 (already 0)
+      V[vdst] = result & 0xffffffff
+      return
+  elif inst_type is VOPC:
+    op_cls, op, src0, src1, src2, vdst = VOPCOp, VOPCOp(inst.op), inst.src0, inst.vsrc1 + 256, None, VCC_LO
+  elif inst_type is VOP3P:
+    # VOP3P: Packed 16-bit operations using compiled functions
+    op = VOP3POp(inst.op)
+    # WMMA: wave-level matrix multiply-accumulate (special handling - needs cross-lane access)
+    if op in (VOP3POp.V_WMMA_F32_16X16X16_F16, VOP3POp.V_WMMA_F32_16X16X16_BF16, VOP3POp.V_WMMA_F16_16X16X16_F16):
+      if lane == 0:  # Only execute once per wave, write results for all lanes
+        exec_wmma(st, inst, op)
+      return
+    # V_FMA_MIX: mixed-precision FMA - each source is f32 or f16, selected per operand
+    if op in (VOP3POp.V_FMA_MIX_F32, VOP3POp.V_FMA_MIXLO_F16, VOP3POp.V_FMA_MIXHI_F16):
+      opsel = getattr(inst, 'opsel', 0)
+      opsel_hi = getattr(inst, 'opsel_hi', 0)
+      opsel_hi2 = getattr(inst, 'opsel_hi2', 0)
+      neg = getattr(inst, 'neg', 0)
+      neg_hi = getattr(inst, 'neg_hi', 0)
+      vdst = inst.vdst
+      # Read raw 32-bit values - for V_FMA_MIX, sources can be either f32 or f16
+      s0_raw = st.rsrc(inst.src0, lane)
+      s1_raw = st.rsrc(inst.src1, lane)
+      s2_raw = st.rsrc(inst.src2, lane) if inst.src2 is not None else 0
+      # opsel_hi[i]=0: source i is used as f32; opsel_hi[i]=1: source i is f16, with
+      # opsel[i] picking the hi(1)/lo(0) half (opsel_hi2 carries the src2 bit)
+      def mix_src(raw, is_f16, use_hi):
+        return _f16((raw >> 16) & 0xffff if use_hi else raw & 0xffff) if is_f16 else _f32(raw)
+      s0 = mix_src(s0_raw, opsel_hi & 1, opsel & 1)
+      s1 = mix_src(s1_raw, opsel_hi & 2, opsel & 2)
+      s2 = mix_src(s2_raw, opsel_hi2, opsel & 4)
+      # For FMA_MIX, neg_hi acts as a per-source ABS modifier; neg negates
+      if neg_hi & 1: s0 = abs(s0)
+      if neg_hi & 2: s1 = abs(s1)
+      if neg_hi & 4: s2 = abs(s2)
+      if neg & 1: s0 = -s0
+      if neg & 2: s1 = -s1
+      if neg & 4: s2 = -s2
+      # Compute FMA: d = s0 * s1 + s2
+      result = s0 * s1 + s2
+      V = st.vgpr[lane]
+      if op == VOP3POp.V_FMA_MIX_F32:
+        V[vdst] = _i32(result)
+      elif op == VOP3POp.V_FMA_MIXLO_F16:
+        lo = _i16(result) & 0xffff
+        V[vdst] = (V[vdst] & 0xffff0000) | lo
+      else:  # V_FMA_MIXHI_F16
+        hi = _i16(result) & 0xffff
+        V[vdst] = (V[vdst] & 0x0000ffff) | (hi << 16)
+      return
+    # Use rsrc_f16 for VOP3P
to get correct f16 inline constants + s0_raw = st.rsrc_f16(inst.src0, lane) + s1_raw = st.rsrc_f16(inst.src1, lane) + s2_raw = st.rsrc_f16(inst.src2, lane) if inst.src2 is not None else 0 + # Handle opsel (which 16-bit halves to use for each source) + opsel = getattr(inst, 'opsel', 0) + opsel_hi = getattr(inst, 'opsel_hi', 3) # Default: use hi for hi result + opsel_hi2 = getattr(inst, 'opsel_hi2', 1) # Default for src2 + # Handle neg modifiers for VOP3P + # neg applies to lo result inputs, neg_hi applies to hi result inputs + neg = getattr(inst, 'neg', 0) + neg_hi = getattr(inst, 'neg_hi', 0) + # Build "virtual" sources with halves arranged for pseudocode: lo half goes to [15:0], hi half goes to [31:16] + # opsel bit 0/1/2 selects which half of src0/1/2 goes to the LO result + # opsel_hi bit 0/1 selects which half of src0/1 goes to the HI result + s0_lo = (s0_raw >> 16) & 0xffff if (opsel & 1) else s0_raw & 0xffff + s1_lo = (s1_raw >> 16) & 0xffff if (opsel & 2) else s1_raw & 0xffff + s2_lo = (s2_raw >> 16) & 0xffff if (opsel & 4) else s2_raw & 0xffff + s0_hi = (s0_raw >> 16) & 0xffff if (opsel_hi & 1) else s0_raw & 0xffff + s1_hi = (s1_raw >> 16) & 0xffff if (opsel_hi & 2) else s1_raw & 0xffff + s2_hi = (s2_raw >> 16) & 0xffff if opsel_hi2 else s2_raw & 0xffff + # Apply neg to lo result inputs (toggle f16 sign bit) + if neg & 1: s0_lo ^= 0x8000 + if neg & 2: s1_lo ^= 0x8000 + if neg & 4: s2_lo ^= 0x8000 + # Apply neg_hi to hi result inputs + if neg_hi & 1: s0_hi ^= 0x8000 + if neg_hi & 2: s1_hi ^= 0x8000 + if neg_hi & 4: s2_hi ^= 0x8000 + # Pack into format expected by pseudocode: [31:16] = hi input, [15:0] = lo input + s0 = (s0_hi << 16) | s0_lo + s1 = (s1_hi << 16) | s1_lo + s2 = (s2_hi << 16) | s2_lo + op_cls, vdst = VOP3POp, inst.vdst + fn = compiled.get(op_cls, {}).get(op) + if fn is None: raise NotImplementedError(f"{op.name} not in pseudocode") + result = fn(s0, s1, s2, 0, st.scc, st.vcc, lane, st.exec_mask, st.literal, None, {}) + st.vgpr[lane][vdst] = result['d0'] & 0xffffffff + return + else: raise NotImplementedError(f"Unknown vector type {inst_type}") -def exec_ds(st: WaveState, inst: DS, lane: int, lds: bytearray) -> None: - op, addr, vdst, V = inst.op, (st.vgpr[lane][inst.addr] + inst.offset0) & 0xffff, inst.vdst, st.vgpr[lane] - if op in DS_LOAD: - cnt, sz, sign = DS_LOAD[op] - for i in range(cnt): val = int.from_bytes(lds[addr+i*sz:addr+i*sz+sz], 'little'); V[vdst + i] = sext(val, sz * 8) & 0xffffffff if sign else val - elif op in DS_STORE: - cnt, sz = DS_STORE[op] - for i in range(cnt): lds[addr+i*sz:addr+i*sz+sz] = (V[inst.data0 + i] & ((1 << (sz * 8)) - 1)).to_bytes(sz, 'little') - else: raise NotImplementedError(f"DS op {op}") + fn = compiled.get(op_cls, {}).get(op) + if fn is None: raise NotImplementedError(f"{op.name} not in pseudocode") -VOPD_OPS: dict[int, Callable[[int, int, int, int, int], int]] = { - VOPDOp.V_DUAL_MUL_F32: lambda a, b, d, l, lit: i32(f32(a)*f32(b)), VOPDOp.V_DUAL_ADD_F32: lambda a, b, d, l, lit: i32(f32(a)+f32(b)), - VOPDOp.V_DUAL_SUB_F32: lambda a, b, d, l, lit: i32(f32(a)-f32(b)), VOPDOp.V_DUAL_SUBREV_F32: lambda a, b, d, l, lit: i32(f32(b)-f32(a)), - VOPDOp.V_DUAL_MAX_F32: lambda a, b, d, l, lit: i32(max(f32(a), f32(b))), VOPDOp.V_DUAL_MIN_F32: lambda a, b, d, l, lit: i32(min(f32(a), f32(b))), - VOPDOp.V_DUAL_MUL_DX9_ZERO_F32: lambda a, b, d, l, lit: i32(0.0 if f32(a) == 0.0 or f32(b) == 0.0 else f32(a)*f32(b)), - VOPDOp.V_DUAL_MOV_B32: lambda a, b, d, l, lit: a, VOPDOp.V_DUAL_ADD_NC_U32: lambda a, b, d, l, lit: (a + b) & 0xffffffff, - 
VOPDOp.V_DUAL_LSHLREV_B32: lambda a, b, d, l, lit: (b << (a & 0x1f)) & 0xffffffff, VOPDOp.V_DUAL_AND_B32: lambda a, b, d, l, lit: a & b, - VOPDOp.V_DUAL_FMAC_F32: lambda a, b, d, l, lit: i32(f32(a)*f32(b)+f32(d)), VOPDOp.V_DUAL_FMAAK_F32: lambda a, b, d, l, lit: i32(f32(a)*f32(b)+f32(lit)), - VOPDOp.V_DUAL_FMAMK_F32: lambda a, b, d, l, lit: i32(f32(a)*f32(lit)+f32(b)), VOPDOp.V_DUAL_CNDMASK_B32: lambda a, b, d, l, lit: b if l else a, -} -def exec_vopd(st: WaveState, inst: VOPD, lane: int) -> None: - V, vdsty, vcc_lane = st.vgpr[lane], (inst.vdsty << 1) | ((inst.vdstx & 1) ^ 1), (st.vcc >> lane) & 1 - sx0, sx1, sy0, sy1, dstx = st.rsrc(inst.srcx0, lane), V[inst.vsrcx1], st.rsrc(inst.srcy0, lane), V[inst.vsrcy1], inst.vdstx - if (fn := VOPD_OPS.get(inst.opx)): V[dstx] = fn(sx0, sx1, V[dstx], vcc_lane, st.literal) - else: raise NotImplementedError(f"VOPD opx {inst.opx}") - if (fn := VOPD_OPS.get(inst.opy)): V[vdsty] = fn(sy0, sy1, V[vdsty], vcc_lane, st.literal) - else: raise NotImplementedError(f"VOPD opy {inst.opy}") + # Read sources (with VOP3 modifiers if applicable) + neg, abs_ = (getattr(inst, 'neg', 0), getattr(inst, 'abs', 0)) if inst_type is VOP3 else (0, 0) + opsel = getattr(inst, 'opsel', 0) if inst_type is VOP3 else 0 + def mod_src(val: int, idx: int) -> int: + if (abs_ >> idx) & 1: val = _i32(abs(_f32(val))) + if (neg >> idx) & 1: val = _i32(-_f32(val)) + return val + def mod_src64(val: int, idx: int) -> int: + if (abs_ >> idx) & 1: val = _i64(abs(_f64(val))) + if (neg >> idx) & 1: val = _i64(-_f64(val)) + return val -def exec_vop3p(st: WaveState, inst: VOP3P, lane: int) -> None: - op, vdst, V = inst.op, inst.vdst, st.vgpr[lane] - s0, s1, s2 = st.rsrc(inst.src0, lane), st.rsrc(inst.src1, lane), st.rsrc(inst.src2, lane) - opsel, opsel_hi = [(inst.opsel >> i) & 1 for i in range(3)], [(inst.opsel_hi >> i) & 1 for i in range(2)] + [inst.opsel_hi2] - neg, neg_hi = inst.neg, inst.neg_hi - def get_src(src: int, idx: int, for_mix: bool = False) -> float: - if for_mix: - if not opsel_hi[idx]: return abs(f32(src)) if (neg_hi >> idx) & 1 else f32(src) - return float(f16((src >> 16) & 0xffff) if opsel[idx] else f16(src & 0xffff)) - use_hi = opsel[idx] - val = ((src >> 16) & 0xffff) if use_hi else (src & 0xffff) - f = f16(val) - if use_hi and (neg >> idx) & 1: f = -f - elif not use_hi and (neg_hi >> idx) & 1: f = -f - return f - if op == VOP3POp.V_FMA_MIX_F32: V[vdst] = i32(get_src(s0, 0, True) * get_src(s1, 1, True) + get_src(s2, 2, True)) - elif op == VOP3POp.V_FMA_MIXLO_F16: V[vdst] = (V[vdst] & 0xffff0000) | i16(get_src(s0, 0, True) * get_src(s1, 1, True) + get_src(s2, 2, True)) - elif op == VOP3POp.V_FMA_MIXHI_F16: V[vdst] = (V[vdst] & 0x0000ffff) | (i16(get_src(s0, 0, True) * get_src(s1, 1, True) + get_src(s2, 2, True)) << 16) - else: raise NotImplementedError(f"VOP3P op {op}") + # Determine if sources are 64-bit based on instruction type + # For 64-bit shift ops: src0 is 32-bit (shift amount), src1 is 64-bit (value to shift) + # For most other _B64/_I64/_U64/_F64 ops: all sources are 64-bit + is_64bit_op = op.name.endswith(('_B64', '_I64', '_U64', '_F64')) + # V_LDEXP_F64: src0 is 64-bit float, src1 is 32-bit integer exponent + is_ldexp_64 = op in (VOP3Op.V_LDEXP_F64,) + is_shift_64 = op in (VOP3Op.V_LSHLREV_B64, VOP3Op.V_LSHRREV_B64, VOP3Op.V_ASHRREV_I64) + # 16-bit source ops: name contains 16-bit type, but for CVT ops check the SOURCE type (CVT naming is V_CVT_DST_SRC) + # For CVT: source type is at the end of the name, so V_CVT_F16_F32 has 32-bit src, V_CVT_F32_F16 has 16-bit src + 
has_16bit_type = any(s in op.name for s in ('_F16', '_B16', '_I16', '_U16')) + is_cvt_with_32_64_src = op.name.startswith('V_CVT_') and op.name.endswith(('_F32', '_I32', '_U32', '_F64', '_I64', '_U64')) + is_16bit_src = op_cls is VOP3Op and has_16bit_type and not is_cvt_with_32_64_src -def exec_wmma_f32_16x16x16_f16(st: WaveState, inst: VOP3P, n_lanes: int) -> None: - src0_base, src1_base, src2_base = (inst.src0 - 256) if inst.src0 >= 256 else inst.src0, (inst.src1 - 256) if inst.src1 >= 256 else inst.src1, (inst.src2 - 256) if inst.src2 >= 256 else inst.src2 - src0_is_vgpr, src1_is_vgpr, src2_is_vgpr, vdst = inst.src0 >= 256, inst.src1 >= 256, inst.src2 >= 256, inst.vdst - A, B, C = [[0.0] * 16 for _ in range(16)], [[0.0] * 16 for _ in range(16)], [[0.0] * 16 for _ in range(16)] - for lane in range(min(n_lanes, 16)): - V = st.vgpr[lane] + if is_shift_64: + s0 = mod_src(st.rsrc(src0, lane), 0) # shift amount is 32-bit + s1 = st.rsrc64(src1, lane) if src1 is not None else 0 # value to shift is 64-bit + s2 = mod_src(st.rsrc(src2, lane), 2) if src2 is not None else 0 + elif is_ldexp_64: + s0 = mod_src64(st.rsrc64(src0, lane), 0) # mantissa is 64-bit float + s1 = mod_src(st.rsrc(src1, lane), 1) if src1 is not None else 0 # exponent is 32-bit int + s2 = mod_src(st.rsrc(src2, lane), 2) if src2 is not None else 0 + elif is_64bit_op: + # 64-bit ops: apply neg/abs modifiers using f64 interpretation for float ops + s0 = mod_src64(st.rsrc64(src0, lane), 0) + s1 = mod_src64(st.rsrc64(src1, lane), 1) if src1 is not None else 0 + s2 = mod_src64(st.rsrc64(src2, lane), 2) if src2 is not None else 0 + elif is_16bit_src: + # For 16-bit source ops, opsel bits select which half to use + s0_raw = mod_src(st.rsrc(src0, lane), 0) + s1_raw = mod_src(st.rsrc(src1, lane), 1) if src1 is not None else 0 + s2_raw = mod_src(st.rsrc(src2, lane), 2) if src2 is not None else 0 + # opsel[0] selects hi(1) or lo(0) for src0, opsel[1] for src1, opsel[2] for src2 + s0 = ((s0_raw >> 16) & 0xffff) if (opsel & 1) else (s0_raw & 0xffff) + s1 = ((s1_raw >> 16) & 0xffff) if (opsel & 2) else (s1_raw & 0xffff) + s2 = ((s2_raw >> 16) & 0xffff) if (opsel & 4) else (s2_raw & 0xffff) + else: + s0 = mod_src(st.rsrc(src0, lane), 0) + s1 = mod_src(st.rsrc(src1, lane), 1) if src1 is not None else 0 + s2 = mod_src(st.rsrc(src2, lane), 2) if src2 is not None else 0 + d0 = V[vdst] if not is_64bit_op else (V[vdst] | (V[vdst + 1] << 32)) + + # V_CNDMASK_B32: VOP3 encoding uses src2 as mask (not VCC); VOP2 uses VCC implicitly + # Pass the correct mask as vcc to the function so pseudocode VCC.u64[laneId] works correctly + vcc_for_fn = st.rsgpr64(src2) if op in (VOP3Op.V_CNDMASK_B32,) and inst_type is VOP3 and src2 is not None and src2 < 256 else st.vcc + + # Execute compiled function - pass src0_idx and vdst_idx for lane instructions + # For VGPR access: src0 index is the VGPR number (src0 - 256 if VGPR, else src0 for SGPR) + src0_idx = (src0 - 256) if src0 is not None and src0 >= 256 else (src0 if src0 is not None else 0) + result = fn(s0, s1, s2, d0, st.scc, vcc_for_fn, lane, st.exec_mask, st.literal, st.vgpr, {}, src0_idx, vdst) + + # Apply results + if 'vgpr_write' in result: + # Lane instruction wrote to VGPR: (lane, vgpr_idx, value) + wr_lane, wr_idx, wr_val = result['vgpr_write'] + st.vgpr[wr_lane][wr_idx] = wr_val + if 'vcc_lane' in result: + # VOP2 carry instructions (V_ADD_CO_CI_U32, V_SUB_CO_CI_U32, V_SUBREV_CO_CI_U32) write carry to VCC implicitly + # VOPC and VOP3-encoded VOPC write to vdst (which is VCC_LO for VOPC, inst.sdst for VOP3) 
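+    # e.g. a wave32 V_CMP_GT_U32 produces result['vcc_lane'] per lane; if only lanes 0
+    # and 2 compare true, the per-lane pends commit to VCC_LO as the mask 0b101 once
+    # step_wave calls commit_pends() after the lane loop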
+    vcc_dst = VCC_LO if op_cls is VOP2Op and op in (VOP2Op.V_ADD_CO_CI_U32, VOP2Op.V_SUB_CO_CI_U32, VOP2Op.V_SUBREV_CO_CI_U32) else vdst
+    st.pend_sgpr_lane(vcc_dst, lane, result['vcc_lane'])
+  if 'exec_lane' in result:
+    # V_CMPX instructions write to EXEC per-lane
+    st.pend_sgpr_lane(EXEC_LO, lane, result['exec_lane'])
+  if 'd0' in result and op_cls not in (VOPCOp,) and 'vgpr_write' not in result:
+    # V_READFIRSTLANE_B32 and V_READLANE_B32 write to SGPR, not VGPR
+    # V_WRITELANE_B32 uses vgpr_write for cross-lane writes, don't overwrite with d0
+    # (guard on op_cls too: IntEnum compares by value, so an unrelated VOP2 op that
+    # shares V_READFIRSTLANE_B32's opcode number must not match here)
+    writes_to_sgpr = (op_cls is VOP1Op and op == VOP1Op.V_READFIRSTLANE_B32) or \
+                     (op_cls is VOP3Op and op in (VOP3Op.V_READFIRSTLANE_B32, VOP3Op.V_READLANE_B32))
+    # Check for 16-bit destination ops (opsel[3] controls hi/lo write)
+    # 16-bit dst ops (exclude PACK which has 32-bit dst despite F16 in name)
+    is_16bit_dst = any(s in op.name for s in ('_F16', '_B16', '_I16', '_U16')) and 'PACK' not in op.name
+    if writes_to_sgpr:
+      st.wsgpr(vdst, result['d0'] & 0xffffffff)
+    elif result.get('d0_64') or is_64bit_op:
+      V[vdst] = result['d0'] & 0xffffffff
+      V[vdst + 1] = (result['d0'] >> 32) & 0xffffffff
+    elif is_16bit_dst and inst_type is VOP3:
+      # VOP3 16-bit ops: opsel[3] (bit 3 of opsel field) controls hi/lo destination
+      if opsel & 8:  # opsel[3] = 1: write to high 16 bits
+        V[vdst] = (V[vdst] & 0x0000ffff) | ((result['d0'] & 0xffff) << 16)
+      else:  # opsel[3] = 0: write to low 16 bits
+        V[vdst] = (V[vdst] & 0xffff0000) | (result['d0'] & 0xffff)
+    else:
+      V[vdst] = result['d0'] & 0xffffffff
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# WMMA (Wave Matrix Multiply-Accumulate)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def exec_wmma(st: WaveState, inst, op: VOP3POp) -> None:
+  """Execute WMMA instruction - 16x16x16 matrix multiply across the wave."""
+  src0, src1, src2, vdst = inst.src0, inst.src1, inst.src2, inst.vdst
+  # BF16 sources use the bfloat16 encoding (the high 16 bits of an f32), not IEEE half
+  cvt = (lambda v: _f32((v & 0xffff) << 16)) if op == VOP3POp.V_WMMA_F32_16X16X16_BF16 else _f16
+  # Read matrix A (16x16 f16/bf16) from lanes 0-15, VGPRs src0 to src0+7 (2 f16 per VGPR = 16 values per lane)
+  # Layout: A[row][k] where row = lane (0-15), k comes from 8 VGPRs × 2 halves
+  mat_a = []
+  for lane in range(16):
    for reg in range(8):
-      val = V[src0_base + reg] if src0_is_vgpr else st.sgpr[src0_base + reg]
-      A[lane][reg * 2], A[lane][reg * 2 + 1] = f16(val & 0xffff), f16((val >> 16) & 0xffff)
-      val = V[src1_base + reg] if src1_is_vgpr else st.sgpr[src1_base + reg]
-      B[reg * 2][lane], B[reg * 2 + 1][lane] = f16(val & 0xffff), f16((val >> 16) & 0xffff)
+      val = st.vgpr[lane][src0 - 256 + reg] if src0 >= 256 else st.rsgpr(src0 + reg)
+      mat_a.append(cvt(val & 0xffff))
+      mat_a.append(cvt((val >> 16) & 0xffff))
+  # Read matrix B (16x16 f16/bf16) - same layout, B[col][k] where col comes from lane
+  mat_b = []
+  for lane in range(16):
+    for reg in range(8):
+      val = st.vgpr[lane][src1 - 256 + reg] if src1 >= 256 else st.rsgpr(src1 + reg)
+      mat_b.append(cvt(val & 0xffff))
+      mat_b.append(cvt((val >> 16) & 0xffff))
+
+  # Read matrix C (16x16 f32) from lanes 0-31, VGPRs src2 to src2+7
+  # Layout: element i is at lane (i % 32), VGPR (i // 32) + src2
+  mat_c = []
+  for i in range(256):
+    lane, reg = i % 32, i // 32
+    val = st.vgpr[lane][src2 - 256 + reg] if src2 >= 256 else st.rsgpr(src2 + reg)
+    mat_c.append(_f32(val))
+
+  # Compute D = A × B + C (16x16 matrix multiply)
+  mat_d = [0.0] * 256
  for row in range(16):
    for col in range(16):
-      idx, lane_idx, reg = row * 16 + col, (row * 16 + col) % 32, (row * 16 + col) // 32
-      if lane_idx < n_lanes:
-        val =
st.vgpr[lane_idx][src2_base + reg] if src2_is_vgpr else st.sgpr[src2_base + reg] - C[row][col] = f32(val) - for row in range(16): - for col in range(16): - for k in range(16): C[row][col] += A[row][k] * B[k][col] - for row in range(16): - for col in range(16): - idx, lane_idx, reg = row * 16 + col, (row * 16 + col) % 32, (row * 16 + col) // 32 - if lane_idx < n_lanes and (st.exec_mask & (1 << lane_idx)): st.vgpr[lane_idx][vdst + reg] = i32(C[row][col]) + acc = 0.0 + for k in range(16): + a_val = mat_a[row * 16 + k] + b_val = mat_b[col * 16 + k] + acc += a_val * b_val + mat_d[row * 16 + col] = acc + mat_c[row * 16 + col] + + # Write result matrix D back - same layout as C + if op == VOP3POp.V_WMMA_F16_16X16X16_F16: + # Output is f16, pack 2 values per VGPR + for i in range(0, 256, 2): + lane, reg = (i // 2) % 32, (i // 2) // 32 + lo = _i16(mat_d[i]) & 0xffff + hi = _i16(mat_d[i + 1]) & 0xffff + st.vgpr[lane][vdst + reg] = (hi << 16) | lo + else: + # Output is f32 + for i in range(256): + lane, reg = i % 32, i // 32 + st.vgpr[lane][vdst + reg] = _i32(mat_d[i]) # ═══════════════════════════════════════════════════════════════════════════════ # MAIN EXECUTION LOOP # ═══════════════════════════════════════════════════════════════════════════════ -SCALAR: dict[type, Callable[..., int]] = {SOP1: exec_sop1, SOP2: exec_sop2, SOPC: exec_sopc, SOPK: exec_sopk, SOPP: exec_sopp, SMEM: exec_smem} -VECTOR: dict[type, Callable[..., None]] = {VOP1: exec_vop1, VOP2: exec_vop2, VOP3: exec_vop3, VOP3SD: exec_vop3sd, VOPC: exec_vopc, FLAT: exec_flat, DS: exec_ds, VOPD: exec_vopd, VOP3P: exec_vop3p} -_WMMA_OPS = frozenset((VOP3POp.V_WMMA_F32_16X16X16_F16, VOP3POp.V_WMMA_F32_16X16X16_BF16, VOP3POp.V_WMMA_F16_16X16X16_F16, - VOP3POp.V_WMMA_BF16_16X16X16_BF16, VOP3POp.V_WMMA_I32_16X16X16_IU8, VOP3POp.V_WMMA_I32_16X16X16_IU4)) +SCALAR_TYPES = {SOP1, SOP2, SOPC, SOPK, SOPP, SMEM} +VECTOR_TYPES = {VOP1, VOP2, VOP3, VOP3SD, VOPC, FLAT, DS, VOPD, VOP3P} def step_wave(program: Program, st: WaveState, lds: bytearray, n_lanes: int) -> int: inst = program.get(st.pc) if inst is None: return 1 - inst_words, st.literal, inst_type = inst._words, inst._literal or 0, type(inst) - if (handler := SCALAR.get(inst_type)) is not None: - delta = handler(st, inst) - if delta == -1: return -1 - if delta == -2: st.pc += inst_words; return -2 - if delta == -3: # S_GETPC_B64 - sop1 = inst if isinstance(inst, SOP1) else None - assert sop1 is not None - next_pc = (st.pc + inst_words) * 4; st.wsgpr(sop1.sdst, next_pc & 0xffffffff); st.wsgpr(sop1.sdst + 1, (next_pc >> 32) & 0xffffffff); st.pc += inst_words; return 0 - if delta == -4: # S_SETPC_B64 - sop1 = inst if isinstance(inst, SOP1) else None - assert sop1 is not None - st.pc = st.rsrc64(sop1.ssrc0, 0) // 4; return 0 - if delta == -5: # S_SWAPPC_B64 - sop1 = inst if isinstance(inst, SOP1) else None - assert sop1 is not None - next_pc = (st.pc + inst_words) * 4; st.wsgpr(sop1.sdst, next_pc & 0xffffffff); st.wsgpr(sop1.sdst + 1, (next_pc >> 32) & 0xffffffff); st.pc = st.rsrc64(sop1.ssrc0, 0) // 4; return 0 + inst_words, st.literal, inst_type = inst._words, getattr(inst, '_literal', None) or 0, type(inst) + + if inst_type in SCALAR_TYPES: + delta = exec_scalar(st, inst) + if delta == -1: return -1 # endpgm + if delta == -2: st.pc += inst_words; return -2 # barrier st.pc += inst_words + delta else: - vec_handler, exec_mask = VECTOR[inst_type], st.exec_mask - if inst_type is DS: - for lane in range(n_lanes): - if exec_mask & (1 << lane): vec_handler(st, inst, lane, lds) - elif inst_type is 
VOP3P: - vop3p = inst if isinstance(inst, VOP3P) else None - assert vop3p is not None - if vop3p.op in _WMMA_OPS: - exec_wmma_f32_16x16x16_f16(st, vop3p, n_lanes) - else: - for lane in range(n_lanes): - if exec_mask & (1 << lane): vec_handler(st, vop3p, lane) + # V_READFIRSTLANE_B32 and V_READLANE_B32 write to SGPR, so they should only execute once per wave (lane 0) + is_readlane = (inst_type is VOP1 and inst.op == VOP1Op.V_READFIRSTLANE_B32) or \ + (inst_type is VOP3 and inst.op in (VOP3Op.V_READFIRSTLANE_B32, VOP3Op.V_READLANE_B32)) + if is_readlane: + exec_vector(st, inst, 0, lds) # Execute once with lane 0 else: + exec_mask = st.exec_mask for lane in range(n_lanes): - if exec_mask & (1 << lane): vec_handler(st, inst, lane) - st.commit_pends(); st.pc += inst_words + if exec_mask & (1 << lane): exec_vector(st, inst, lane, lds) + st.commit_pends() + st.pc += inst_words return 0 -def exec_wave(program: Program, st: WaveState, lds: bytearray, n_lanes: int, wg_id: tuple[int,int,int]=(0,0,0), local_size: tuple[int,int,int]=(1,1,1), wave_start: int=0) -> int: +def exec_wave(program: Program, st: WaveState, lds: bytearray, n_lanes: int) -> int: while st.pc in program: result = step_wave(program, st, lds, n_lanes) if result == -1: return 0 if result == -2: return -2 return 0 -def exec_workgroup(program: Program, workgroup_id: tuple[int, int, int], local_size: tuple[int, int, int], args_ptr: int, dispatch_dim: int) -> None: +def exec_workgroup(program: Program, workgroup_id: tuple[int, int, int], local_size: tuple[int, int, int], args_ptr: int, + wg_id_sgpr_base: int, wg_id_enables: tuple[bool, bool, bool]) -> None: lx, ly, lz = local_size total_threads, lds = lx * ly * lz, bytearray(65536) waves: list[tuple[WaveState, int, int]] = [] @@ -483,23 +691,30 @@ def exec_workgroup(program: Program, workgroup_id: tuple[int, int, int], local_s st.exec_mask = (1 << n_lanes) - 1 st.wsgpr64(0, args_ptr) gx, gy, gz = workgroup_id - if dispatch_dim >= 3: st.sgpr[13], st.sgpr[14], st.sgpr[15] = gx, gy, gz - elif dispatch_dim == 2: st.sgpr[14], st.sgpr[15] = gx, gy - else: st.sgpr[15] = gx + # Set workgroup IDs in SGPRs based on USER_SGPR_COUNT and enable flags from COMPUTE_PGM_RSRC2 + sgpr_idx = wg_id_sgpr_base + if wg_id_enables[0]: st.sgpr[sgpr_idx] = gx; sgpr_idx += 1 + if wg_id_enables[1]: st.sgpr[sgpr_idx] = gy; sgpr_idx += 1 + if wg_id_enables[2]: st.sgpr[sgpr_idx] = gz for i in range(n_lanes): tid = wave_start + i st.vgpr[i][0] = tid if local_size == (lx, 1, 1) else ((tid // (lx * ly)) << 20) | (((tid // lx) % ly) << 10) | (tid % lx) waves.append((st, n_lanes, wave_start)) has_barrier = any(isinstance(inst, SOPP) and inst.op == SOPPOp.S_BARRIER for inst in program.values()) for _ in range(2 if has_barrier else 1): - for st, n_lanes, wave_start in waves: exec_wave(program, st, lds, n_lanes, workgroup_id, local_size, wave_start) + for st, n_lanes, _ in waves: exec_wave(program, st, lds, n_lanes) -def run_asm(lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int) -> int: +def run_asm(lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int, rsrc2: int = 0x19c) -> int: data = (ctypes.c_char * lib_sz).from_address(lib).raw program = decode_program(data) if not program: return -1 - dispatch_dim = 3 if gz > 1 else (2 if gy > 1 else 1) + # Parse COMPUTE_PGM_RSRC2 for SGPR layout + user_sgpr_count = (rsrc2 >> 1) & 0x1f + enable_wg_id_x = bool((rsrc2 >> 7) & 1) + enable_wg_id_y = bool((rsrc2 >> 8) & 1) + enable_wg_id_z = bool((rsrc2 >> 9) & 1) 
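+  # e.g. the default rsrc2=0x19c decodes to USER_SGPR_COUNT=14 with workgroup ids X and Y
+  # enabled (Z disabled), so exec_workgroup below writes gx to s[14] and gy to s[15]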
+    wg_id_enables = (enable_wg_id_x, enable_wg_id_y, enable_wg_id_z)
   for gidz in range(gz):
     for gidy in range(gy):
-      for gidx in range(gx): exec_workgroup(program, (gidx, gidy, gidz), (lx, ly, lz), args_ptr, dispatch_dim)
+      for gidx in range(gx): exec_workgroup(program, (gidx, gidy, gidz), (lx, ly, lz), args_ptr, user_sgpr_count, wg_id_enables)
   return 0
diff --git a/extra/assembly/rdna3/gen.py b/extra/assembly/rdna3/gen.py
index 2e2122c289..2c13db7722 100644
--- a/extra/assembly/rdna3/gen.py
+++ b/extra/assembly/rdna3/gen.py
@@ -178,7 +178,15 @@ def generate(output_path: pathlib.Path|str|None = None) -> dict:
       suffix = "_e64"
     else:
       suffix = ""
-    lines.append(f"{name.lower()}{suffix} = functools.partial({tgt}.{name}{seg})")
+    # FMAMK/FMAAK have a literal constant K that must be passed via literal= kwarg
+    # FMAMK: D = S0.f * K + S1.f (K is 3rd operand in assembly syntax)
+    # FMAAK: D = S0.f * S1.f + K (K is 4th operand in assembly syntax)
+    if name in ('V_FMAMK_F32', 'V_FMAMK_F16'):
+      lines.append(f"def {name.lower()}{suffix}(vdst, src0, K, vsrc1): return {fmt}({cls_name}.{name}, vdst, src0, vsrc1, literal=K)")
+    elif name in ('V_FMAAK_F32', 'V_FMAAK_F16'):
+      lines.append(f"def {name.lower()}{suffix}(vdst, src0, vsrc1, K): return {fmt}({cls_name}.{name}, vdst, src0, vsrc1, literal=K)")
+    else:
+      lines.append(f"{name.lower()}{suffix} = functools.partial({tgt}.{name}{seg})")
   # export SrcEnum values, but skip DPP8/DPP16 which conflict with class names
   skip_exports = {'DPP8', 'DPP16'}
   lines += [""] + [f"{name} = SrcEnum.{name}" for _, name in sorted(src_enum.items()) if name not in skip_exports] + ["OFF = NULL\n"]
diff --git a/extra/assembly/rdna3/lib.py b/extra/assembly/rdna3/lib.py
index 7f2db1e014..ba7dff204e 100644
--- a/extra/assembly/rdna3/lib.py
+++ b/extra/assembly/rdna3/lib.py
@@ -169,11 +169,13 @@ class Inst:
       cur_neg = self._values.get('neg', 0)
       self._values['neg'] = (cur_neg.val if isinstance(cur_neg, RawImm) else cur_neg) | neg_bit
     # Track literal value if needed (encoded as 255)
+    # For 64-bit ops, store literal in high 32 bits (to match from_bytes decoding and to_bytes encoding)
    if encoded == 255 and self._literal is None and isinstance(val, int) and not isinstance(val, IntEnum):
-      self._literal = val
+      self._literal = (val << 32) if self._is_64bit_op() else val
    elif encoded == 255 and self._literal is None and isinstance(val, float):
      import struct
-      self._literal = struct.unpack('<I', struct.pack('<f', val))[0]
+      # f32 literals keep their 32-bit encoding; for 64-bit float ops the 32-bit literal is the high word of the f64 pattern
+      if self._is_64bit_op(): self._literal = (struct.unpack('<Q', struct.pack('<d', val))[0] >> 32) << 32
+      else: self._literal = struct.unpack('<I', struct.pack('<f', val))[0]
+
+  def _is_64bit_op(self) -> bool:
+    """Check if this instruction uses 64-bit operands (and thus a 64-bit literal).
+ Exception: V_LDEXP_F64 has 32-bit integer src1, so its literal is 32-bit.""" + op = self._values.get('op') + if op is None: return False + # op may be an enum (from __init__) or an int (from from_int) + op_name = op.name if hasattr(op, 'name') else None + if op_name is None and self.__class__.__name__ == 'VOP3': + from extra.assembly.rdna3.autogen import VOP3Op + try: op_name = VOP3Op(op).name + except ValueError: pass + if op_name is None: return False + # V_LDEXP_F64 has 32-bit integer exponent in src1, so literal is 32-bit + if op_name == 'V_LDEXP_F64': return False + return op_name.endswith(('_F64', '_B64', '_I64', '_U64')) + def to_bytes(self) -> bytes: result = self.to_int().to_bytes(self._size(), 'little') - return result + (lit & 0xffffffff).to_bytes(4, 'little') if (lit := self._get_literal() or getattr(self, '_literal', None)) else result + lit = self._get_literal() or getattr(self, '_literal', None) + if lit is None: return result + # For 64-bit ops, literal is stored in high 32 bits internally, but encoded as 4 bytes + lit32 = (lit >> 32) if self._is_64bit_op() else lit + return result + (lit32 & 0xffffffff).to_bytes(4, 'little') @classmethod def _size(cls) -> int: return 4 if issubclass(cls, Inst32) else 8 - def size(self) -> int: return self._size() + (4 if self._literal is not None else 0) + def size(self) -> int: + # Literal is always 4 bytes in the binary (for 64-bit ops, it's in high 32 bits) + return self._size() + (4 if self._literal is not None else 0) @classmethod def from_int(cls, word: int): @@ -229,7 +253,12 @@ class Inst: has_literal = has_literal or (cls.__name__ == 'SOP2' and op_val in (69, 70)) for n in SRC_FIELDS: if n in inst._values and isinstance(inst._values[n], RawImm) and inst._values[n].val == 255: has_literal = True - if has_literal and len(data) >= cls._size() + 4: inst._literal = int.from_bytes(data[cls._size():cls._size()+4], 'little') + if has_literal: + # For 64-bit ops, the literal is 32 bits placed in the HIGH 32 bits of the 64-bit value + # (low 32 bits are zero). This is how AMD hardware interprets 32-bit literals for 64-bit ops. 
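+        # e.g. v_add_f64 with literal 2.0: the instruction stream carries the 32-bit word
+        # 0x40000000 (high half of f64 2.0 = 0x4000000000000000), so _literal decodes to
+        # the full 0x4000000000000000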
+        if len(data) >= cls._size() + 4:
+          lit32 = int.from_bytes(data[cls._size():cls._size()+4], 'little')
+          inst._literal = (lit32 << 32) if inst._is_64bit_op() else lit32
     return inst

   def __repr__(self):
diff --git a/extra/assembly/rdna3/pcode.py b/extra/assembly/rdna3/pcode.py
new file mode 100644
index 0000000000..88bca1301c
--- /dev/null
+++ b/extra/assembly/rdna3/pcode.py
@@ -0,0 +1,910 @@
+# DSL for RDNA3 pseudocode - makes pseudocode expressions work directly as Python
+import struct, math, re
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# HELPER FUNCTIONS (previously in helpers.py)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _f32(i): return struct.unpack("<f", struct.pack("<I", i & 0xffffffff))[0]
+def _i32(f):
+  if math.isinf(f): return 0x7f800000 if f > 0 else 0xff800000
+  try: return struct.unpack("<I", struct.pack("<f", f))[0]
+  except (OverflowError, struct.error): return 0x7f800000 if f > 0 else 0xff800000
+def _div(a, b):
+  try: return a / b
+  except ZeroDivisionError:
+    if a == 0.0 or math.isnan(a): return float("nan")
+    return math.copysign(float("inf"), a * b) if b == 0.0 else float("inf") if a > 0 else float("-inf")
+def _sext(v, b): return v - (1 << b) if v & (1 << (b - 1)) else v
+def _f16(i): return struct.unpack("<e", struct.pack("<H", i & 0xffff))[0]
+def _i16(f):
+  if math.isinf(f): return 0x7c00 if f > 0 else 0xfc00
+  try: return struct.unpack("<H", struct.pack("<e", f))[0]
+  except (OverflowError, struct.error): return 0x7c00 if f > 0 else 0xfc00
+def _to_f16_bits(v): return v if isinstance(v, int) else _i16(v)
+def _f64(i): return struct.unpack("<d", struct.pack("<Q", i & 0xffffffffffffffff))[0]
+def _i64(f):
+  if math.isinf(f): return 0x7ff0000000000000 if f > 0 else 0xfff0000000000000
+  try: return struct.unpack("<Q", struct.pack("<d", f))[0]
+  except (OverflowError, struct.error): return 0x7ff0000000000000 if f > 0 else 0xfff0000000000000
+def _isnan(x):
+  try: return math.isnan(float(x))
+  except (TypeError, ValueError): return False
+def _isquietnan(x):
+  """Check if x is a quiet NaN. For f32: exponent=255, bit22=1, mantissa!=0"""
+  try:
+    if not math.isnan(float(x)): return False
+    # Get raw bits from TypedView or similar object with _reg attribute
+    if hasattr(x, '_reg') and hasattr(x, '_bits'):
+      bits = x._reg._val & ((1 << x._bits) - 1)
+      if x._bits == 32:
+        return ((bits >> 23) & 0xff) == 255 and ((bits >> 22) & 1) == 1 and (bits & 0x7fffff) != 0
+      if x._bits == 64:
+        return ((bits >> 52) & 0x7ff) == 0x7ff and ((bits >> 51) & 1) == 1 and (bits & 0xfffffffffffff) != 0
+    return True  # Default to quiet NaN if we can't determine bit pattern
+  except (TypeError, ValueError): return False
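+# e.g. the f32 pattern 0x7fc00000 is a quiet NaN (bit 22 set) while 0x7f800001 is a
+# signaling NaN (bit 22 clear); both have an all-ones exponent and a nonzero mantissa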
+def _issignalnan(x):
+  """Check if x is a signaling NaN. For f32: exponent=255, bit22=0, mantissa!=0"""
+  try:
+    if not math.isnan(float(x)): return False
+    # Get raw bits from TypedView or similar object with _reg attribute
+    if hasattr(x, '_reg') and hasattr(x, '_bits'):
+      bits = x._reg._val & ((1 << x._bits) - 1)
+      if x._bits == 32:
+        return ((bits >> 23) & 0xff) == 255 and ((bits >> 22) & 1) == 0 and (bits & 0x7fffff) != 0
+      if x._bits == 64:
+        return ((bits >> 52) & 0x7ff) == 0x7ff and ((bits >> 51) & 1) == 0 and (bits & 0xfffffffffffff) != 0
+    return False  # Default to not signaling if we can't determine bit pattern
+  except (TypeError, ValueError): return False
+def _gt_neg_zero(a, b): return (a > b) or (a == 0 and b == 0 and not math.copysign(1, a) < 0 and math.copysign(1, b) < 0)
+def _lt_neg_zero(a, b): return (a < b) or (a == 0 and b == 0 and math.copysign(1, a) < 0 and not math.copysign(1, b) < 0)
+def _fma(a, b, c): return a * b + c
+def _signext(v): return v
+def trunc(x):
+  x = float(x)
+  return x if math.isnan(x) or math.isinf(x) else float(math.trunc(x))
+def floor(x):
+  x = float(x)
+  return x if math.isnan(x) or math.isinf(x) else float(math.floor(x))
+def ceil(x):
+  x = float(x)
+  return x if math.isnan(x) or math.isinf(x) else float(math.ceil(x))
+def sqrt(x): return math.sqrt(x) if x >= 0 else float("nan")
+def log2(x): return math.log2(x) if x > 0 else (float("-inf") if x == 0 else float("nan"))
+i32_to_f32 = u32_to_f32 = i32_to_f64 = u32_to_f64 = f32_to_f64 = f64_to_f32 = float
+def f32_to_i32(f):
+  f = float(f)
+  if math.isnan(f): return 0
+  if f >= 2147483647: return 2147483647
+  if f <= -2147483648: return -2147483648
+  return int(f)
+def f32_to_u32(f):
+  f = float(f)
+  if math.isnan(f): return 0
+  if f >= 4294967295: return 4294967295
+  if f <= 0: return 0
+  return int(f)
+f64_to_i32 = f32_to_i32
+f64_to_u32 = f32_to_u32
+def f32_to_f16(f):
+  f = float(f)
+  if math.isnan(f): return 0x7e00  # f16 NaN
+  if math.isinf(f): return 0x7c00 if f > 0 else 0xfc00  # f16 ±infinity
+  try: return struct.unpack("<H", struct.pack("<e", f))[0]
+  except (OverflowError, struct.error): return 0x7c00 if f > 0 else 0xfc00  # overflow -> ±infinity
+def _f16_to_f32_bits(bits): return struct.unpack("<I", struct.pack("<f", _f16(bits)))[0]
+# f16 conversions and misc math helpers used by the pseudocode
+def f16_to_f32(bits): return float(_f16(bits))
+def i16_to_f16(i): return float(_sext(int(i) & 0xffff, 16))
+def u16_to_f16(u): return float(int(u) & 0xffff)
+def f16_to_i16(f): return max(-32768, min(32767, int(f))) if not math.isnan(f) else 0
+def f16_to_u16(f): return max(0, min(65535, int(f))) if not math.isnan(f) else 0
+def sin(x): return math.sin(float(x))
+def cos(x): return math.cos(float(x))
+def pow(x, y): return math.pow(float(x), float(y))
+def fract(x): return float(x) - math.floor(float(x))
+def isEven(x): return float(x) % 2.0 == 0.0
+def _ldexp(f, e): return math.ldexp(float(f), int(e))
+def _mantissa_f32(f): return math.frexp(f)[0] if f != 0.0 and math.isfinite(f) else f
+def _sign(f): return -1.0 if f < 0 else 1.0 if f > 0 else 0.0
+def _brev32(v): return int(bin(v & 0xffffffff)[2:].zfill(32)[::-1], 2)
+def _brev64(v): return int(bin(v & 0xffffffffffffffff)[2:].zfill(64)[::-1], 2)
+def _ctz32(v):
+  v = int(v) & 0xffffffff
+  if v == 0: return 32
+  n = 0
+  while (v & 1) == 0: v >>= 1; n += 1
+  return n
+def _ctz64(v):
+  v = int(v) & 0xffffffffffffffff
+  if v == 0: return 64
+  n = 0
+  while (v & 1) == 0: v >>= 1; n += 1
+  return n
+def _exponent(f):
+  if math.isinf(f) or math.isnan(f): return 255
+  if f == 0.0: return 0
+  try: bits = struct.unpack("<I", struct.pack("<f", f))[0]; return (bits >> 23) & 0xff
+  except: return 0
+def _is_denorm_f32(f):
+  if not isinstance(f, float): f = _f32(int(f) & 0xffffffff)
+  if math.isinf(f) or math.isnan(f) or f == 0.0: return False
+  bits = struct.unpack("<I", struct.pack("<f", f))[0]
+  return (bits >> 23) & 0xff == 0
+def _is_denorm_f64(f):
+  if not isinstance(f, float): f = _f64(int(f) & 0xffffffffffffffff)
+  if math.isinf(f) or math.isnan(f) or f == 0.0: return False
+  bits = struct.unpack("<Q", struct.pack("<d", f))[0]
+  return (bits >> 52) & 0x7ff == 0
+def v_min_f32(a, b):
+  if math.isnan(b): return a
+  if math.isnan(a): return b
+  return a if _lt_neg_zero(a, b) else b
+def v_max_f32(a, b):
+  if math.isnan(b): return a
+  if math.isnan(a): return b
+  return a if _gt_neg_zero(a, b) else b
+def v_min_i32(a, b): return min(a, b)
+def v_max_i32(a, b): return max(a, b)
+def v_min_u32(a, b): return min(a & 0xffffffff, b & 0xffffffff)
+def v_max_u32(a, b): return max(a & 0xffffffff, b & 0xffffffff)
+v_min_f16 = v_min_f32
+v_max_f16 = v_max_f32
+v_min_i16 =
v_min_i32 +v_max_i16 = v_max_i32 +def v_min_u16(a, b): return min(a & 0xffff, b & 0xffff) +def v_max_u16(a, b): return max(a & 0xffff, b & 0xffff) +def v_min3_f32(a, b, c): return v_min_f32(v_min_f32(a, b), c) +def v_max3_f32(a, b, c): return v_max_f32(v_max_f32(a, b), c) +def v_min3_i32(a, b, c): return min(a, b, c) +def v_max3_i32(a, b, c): return max(a, b, c) +def v_min3_u32(a, b, c): return min(a & 0xffffffff, b & 0xffffffff, c & 0xffffffff) +def v_max3_u32(a, b, c): return max(a & 0xffffffff, b & 0xffffffff, c & 0xffffffff) +v_min3_f16 = v_min3_f32 +v_max3_f16 = v_max3_f32 +v_min3_i16 = v_min3_i32 +v_max3_i16 = v_max3_i32 +def v_min3_u16(a, b, c): return min(a & 0xffff, b & 0xffff, c & 0xffff) +def v_max3_u16(a, b, c): return max(a & 0xffff, b & 0xffff, c & 0xffff) +def ABSDIFF(a, b): return abs(a - b) +def f16_to_snorm(f): return max(-32768, min(32767, int(round(max(-1.0, min(1.0, f)) * 32767)))) +def f16_to_unorm(f): return max(0, min(65535, int(round(max(0.0, min(1.0, f)) * 65535)))) +def f32_to_snorm(f): return max(-32768, min(32767, int(round(max(-1.0, min(1.0, f)) * 32767)))) +def f32_to_unorm(f): return max(0, min(65535, int(round(max(0.0, min(1.0, f)) * 65535)))) +def v_cvt_i16_f32(f): return max(-32768, min(32767, int(f))) if not math.isnan(f) else 0 +def v_cvt_u16_f32(f): return max(0, min(65535, int(f))) if not math.isnan(f) else 0 +def u32_to_u16(u): return int(u) & 0xffff +def i32_to_i16(i): return ((int(i) + 32768) & 0xffff) - 32768 +def SAT8(v): return max(0, min(255, int(v))) +def f32_to_u8(f): return max(0, min(255, int(f))) if not math.isnan(f) else 0 +def mantissa(f): + if f == 0.0 or math.isinf(f) or math.isnan(f): return f + m, _ = math.frexp(f) + return math.copysign(m * 2.0, f) +def signext_from_bit(val, bit): + bit = int(bit) + if bit == 0: return 0 + mask = (1 << bit) - 1 + val = int(val) & mask + if val & (1 << (bit - 1)): return val - (1 << bit) + return val + +# ═══════════════════════════════════════════════════════════════════════════════ +# DSL EXPORTS +# ═══════════════════════════════════════════════════════════════════════════════ + +__all__ = [ + # Classes + 'Reg', 'SliceProxy', 'TypedView', 'ExecContext', 'compile_pseudocode', + # Pack functions + '_pack', '_pack32', 'pack', 'pack32', + # Constants + 'WAVE32', 'WAVE64', 'MASK32', 'MASK64', 'WAVE_MODE', 'DENORM', 'OVERFLOW_F32', 'UNDERFLOW_F32', + 'OVERFLOW_F64', 'UNDERFLOW_F64', 'MAX_FLOAT_F32', 'ROUND_MODE', 'cvtToQuietNAN', 'DST', 'INF', 'PI', + # Aliases for pseudocode + 's_ff1_i32_b32', 's_ff1_i32_b64', 'GT_NEG_ZERO', 'LT_NEG_ZERO', + 'isNAN', 'isQuietNAN', 'isSignalNAN', 'fma', 'ldexp', 'sign', 'exponent', 'F', 'signext', + # Conversion functions + '_f32', '_i32', '_f16', '_i16', '_f64', '_i64', '_sext', '_to_f16_bits', '_f16_to_f32_bits', + 'i32_to_f32', 'u32_to_f32', 'i32_to_f64', 'u32_to_f64', 'f32_to_f64', 'f64_to_f32', + 'f32_to_i32', 'f32_to_u32', 'f64_to_i32', 'f64_to_u32', 'f32_to_f16', 'f16_to_f32', + 'i16_to_f16', 'u16_to_f16', 'f16_to_i16', 'f16_to_u16', 'u32_to_u16', 'i32_to_i16', + 'f16_to_snorm', 'f16_to_unorm', 'f32_to_snorm', 'f32_to_unorm', 'v_cvt_i16_f32', 'v_cvt_u16_f32', + 'SAT8', 'f32_to_u8', + # Math functions + 'trunc', 'floor', 'ceil', 'sqrt', 'log2', 'sin', 'cos', 'pow', 'fract', 'isEven', 'mantissa', + # Min/max functions + 'v_min_f32', 'v_max_f32', 'v_min_i32', 'v_max_i32', 'v_min_u32', 'v_max_u32', + 'v_min_f16', 'v_max_f16', 'v_min_i16', 'v_max_i16', 'v_min_u16', 'v_max_u16', + 'v_min3_f32', 'v_max3_f32', 'v_min3_i32', 'v_max3_i32', 'v_min3_u32', 'v_max3_u32', + 
'v_min3_f16', 'v_max3_f16', 'v_min3_i16', 'v_max3_i16', 'v_min3_u16', 'v_max3_u16', + 'ABSDIFF', + # Bit manipulation + '_brev32', '_brev64', '_ctz32', '_ctz64', '_exponent', '_is_denorm_f32', '_is_denorm_f64', + '_sign', '_mantissa_f32', '_div', '_isnan', '_isquietnan', '_issignalnan', '_gt_neg_zero', '_lt_neg_zero', '_fma', '_ldexp', '_signext', + 'signext_from_bit', +] + +# Aliases used in pseudocode +s_ff1_i32_b32, s_ff1_i32_b64 = _ctz32, _ctz64 +GT_NEG_ZERO, LT_NEG_ZERO = _gt_neg_zero, _lt_neg_zero +isNAN = _isnan +isQuietNAN = _isquietnan +isSignalNAN = _issignalnan +fma, ldexp, sign, exponent = _fma, _ldexp, _sign, _exponent +def F(x): + """32'F(x) or 64'F(x) - interpret x as float. If x is int, treat as bit pattern.""" + if isinstance(x, int): return _f32(x) # int -> interpret as f32 bits + if isinstance(x, TypedView): return x # preserve TypedView for bit-pattern checks + return float(x) # already a float or float-like +signext = lambda x: x +pack = lambda hi, lo: ((int(hi) & 0xffff) << 16) | (int(lo) & 0xffff) +pack32 = lambda hi, lo: ((int(hi) & 0xffffffff) << 32) | (int(lo) & 0xffffffff) +_pack, _pack32 = pack, pack32 # Aliases for internal use +WAVE32, WAVE64 = True, False + +# Float overflow/underflow constants +OVERFLOW_F32 = float('inf') +UNDERFLOW_F32 = 0.0 +OVERFLOW_F64 = float('inf') +UNDERFLOW_F64 = 0.0 +MAX_FLOAT_F32 = 3.4028235e+38 # Largest finite float32 + +# INF object that supports .f16/.f32/.f64 access and comparison with floats +class _Inf: + f16 = f32 = f64 = float('inf') + def __neg__(self): return _NegInf() + def __pos__(self): return self + def __eq__(self, other): return float(other) == float('inf') if not isinstance(other, _NegInf) else False + def __req__(self, other): return self.__eq__(other) +class _NegInf: + f16 = f32 = f64 = float('-inf') + def __neg__(self): return _Inf() + def __pos__(self): return self + def __eq__(self, other): return float(other) == float('-inf') if not isinstance(other, _Inf) else False + def __req__(self, other): return self.__eq__(other) +INF = _Inf() + +# Rounding mode placeholder +class _RoundMode: + NEAREST_EVEN = 0 +ROUND_MODE = _RoundMode() + +# Helper functions for pseudocode +def cvtToQuietNAN(x): return float('nan') +DST = None # Placeholder, will be set in context + +MASK32, MASK64 = 0xffffffff, 0xffffffffffffffff + +class _WaveMode: + IEEE = False +WAVE_MODE = _WaveMode() + +class _DenormChecker: + """Comparator for denormalized floats. 
x == DENORM.f32 checks if x is denormalized.""" + def __init__(self, bits): self._bits = bits + def _check(self, other): + return _is_denorm_f64(float(other)) if self._bits == 64 else _is_denorm_f32(float(other)) + def __eq__(self, other): return self._check(other) + def __req__(self, other): return self._check(other) + def __ne__(self, other): return not self._check(other) + +class _Denorm: + f32 = _DenormChecker(32) + f64 = _DenormChecker(64) +DENORM = _Denorm() + +def _brev(v, bits): + """Bit-reverse a value.""" + result = 0 + for i in range(bits): result |= ((v >> i) & 1) << (bits - 1 - i) + return result + +class SliceProxy: + """Proxy for D0[31:16] that supports .f16/.u16 etc getters and setters.""" + __slots__ = ('_reg', '_high', '_low', '_reversed') + def __init__(self, reg, high, low): + self._reg = reg + # Handle reversed slices like [0:31] which means bit-reverse + if high < low: self._high, self._low, self._reversed = low, high, True + else: self._high, self._low, self._reversed = high, low, False + def _nbits(self): return self._high - self._low + 1 + def _mask(self): return (1 << self._nbits()) - 1 + def _get(self): + v = (self._reg._val >> self._low) & self._mask() + return _brev(v, self._nbits()) if self._reversed else v + def _set(self, v): + v = int(v) + if self._reversed: v = _brev(v, self._nbits()) + self._reg._val = (self._reg._val & ~(self._mask() << self._low)) | ((v & self._mask()) << self._low) + + u8 = property(lambda s: s._get() & 0xff) + u16 = property(lambda s: s._get() & 0xffff, lambda s, v: s._set(v)) + u32 = property(lambda s: s._get() & MASK32, lambda s, v: s._set(v)) + i16 = property(lambda s: _sext(s._get() & 0xffff, 16), lambda s, v: s._set(v)) + i32 = property(lambda s: _sext(s._get() & MASK32, 32), lambda s, v: s._set(v)) + f16 = property(lambda s: _f16(s._get()), lambda s, v: s._set(v if isinstance(v, int) else _i16(float(v)))) + f32 = property(lambda s: _f32(s._get()), lambda s, v: s._set(_i32(float(v)))) + b16, b32 = u16, u32 + + def __int__(self): return self._get() + def __index__(self): return self._get() + +class TypedView: + """View for S0.u32 that supports [4:0] slicing and [bit] access.""" + __slots__ = ('_reg', '_bits', '_signed', '_float') + def __init__(self, reg, bits, signed=False, is_float=False): + self._reg, self._bits, self._signed, self._float = reg, bits, signed, is_float + + @property + def _val(self): + mask = MASK64 if self._bits == 64 else MASK32 if self._bits == 32 else (1 << self._bits) - 1 + return self._reg._val & mask + + def __getitem__(self, key): + if isinstance(key, slice): + high, low = int(key.start), int(key.stop) + return SliceProxy(self._reg, high, low) + return (self._val >> int(key)) & 1 + + def __setitem__(self, key, value): + if isinstance(key, slice): + high, low = int(key.start), int(key.stop) + if high < low: high, low, value = low, high, _brev(int(value), low - high + 1) + mask = (1 << (high - low + 1)) - 1 + self._reg._val = (self._reg._val & ~(mask << low)) | ((int(value) & mask) << low) + elif value: self._reg._val |= (1 << int(key)) + else: self._reg._val &= ~(1 << int(key)) + + def __int__(self): return _sext(self._val, self._bits) if self._signed else self._val + def __index__(self): return int(self) + def __trunc__(self): return int(float(self)) if self._float else int(self) + def __float__(self): + if self._float: + return _f16(self._val) if self._bits == 16 else _f32(self._val) if self._bits == 32 else _f64(self._val) + return float(int(self)) + + # Arithmetic - floats use float(), ints use int() 
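+  # e.g. with r = Reg(_i32(1.5)): r.f32 + 2.0 -> 3.5 (decodes the bits, float math),
+  # while r.u32 + 2 does plain integer math on the raw bit pattern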
+ def __add__(s, o): return float(s) + float(o) if s._float else int(s) + int(o) + def __radd__(s, o): return float(o) + float(s) if s._float else int(o) + int(s) + def __sub__(s, o): return float(s) - float(o) if s._float else int(s) - int(o) + def __rsub__(s, o): return float(o) - float(s) if s._float else int(o) - int(s) + def __mul__(s, o): return float(s) * float(o) if s._float else int(s) * int(o) + def __rmul__(s, o): return float(o) * float(s) if s._float else int(o) * int(s) + def __truediv__(s, o): return _div(float(s), float(o)) if s._float else _div(int(s), int(o)) + def __rtruediv__(s, o): return _div(float(o), float(s)) if s._float else _div(int(o), int(s)) + def __pow__(s, o): return float(s) ** float(o) if s._float else int(s) ** int(o) + def __rpow__(s, o): return float(o) ** float(s) if s._float else int(o) ** int(s) + def __neg__(s): return -float(s) if s._float else -int(s) + def __abs__(s): return abs(float(s)) if s._float else abs(int(s)) + + # Bitwise - GPU shifts mask the shift amount to valid range + def __and__(s, o): return int(s) & int(o) + def __or__(s, o): return int(s) | int(o) + def __xor__(s, o): return int(s) ^ int(o) + def __invert__(s): return ~int(s) + def __lshift__(s, o): n = int(o); return int(s) << n if 0 <= n < 64 else 0 + def __rshift__(s, o): n = int(o); return int(s) >> n if 0 <= n < 64 else 0 + def __rand__(s, o): return int(o) & int(s) + def __ror__(s, o): return int(o) | int(s) + def __rxor__(s, o): return int(o) ^ int(s) + def __rlshift__(s, o): n = int(s); return int(o) << n if 0 <= n < 64 else 0 + def __rrshift__(s, o): n = int(s); return int(o) >> n if 0 <= n < 64 else 0 + + # Comparison - handle _DenormChecker specially + def __eq__(s, o): + if isinstance(o, _DenormChecker): return o._check(s) + return float(s) == float(o) if s._float else int(s) == int(o) + def __ne__(s, o): + if isinstance(o, _DenormChecker): return not o._check(s) + return float(s) != float(o) if s._float else int(s) != int(o) + def __lt__(s, o): return float(s) < float(o) if s._float else int(s) < int(o) + def __le__(s, o): return float(s) <= float(o) if s._float else int(s) <= int(o) + def __gt__(s, o): return float(s) > float(o) if s._float else int(s) > int(o) + def __ge__(s, o): return float(s) >= float(o) if s._float else int(s) >= int(o) + + def __bool__(s): return bool(int(s)) + +class Reg: + """GPU register: D0.f32 = S0.f32 + S1.f32 just works.""" + __slots__ = ('_val',) + def __init__(self, val=0): self._val = int(val) & MASK64 + + # Typed views + u64 = property(lambda s: TypedView(s, 64), lambda s, v: setattr(s, '_val', int(v) & MASK64)) + i64 = property(lambda s: TypedView(s, 64, signed=True), lambda s, v: setattr(s, '_val', int(v) & MASK64)) + b64 = property(lambda s: TypedView(s, 64), lambda s, v: setattr(s, '_val', int(v) & MASK64)) + f64 = property(lambda s: TypedView(s, 64, is_float=True), lambda s, v: setattr(s, '_val', v if isinstance(v, int) else _i64(float(v)))) + u32 = property(lambda s: TypedView(s, 32), lambda s, v: setattr(s, '_val', int(v) & MASK32)) + i32 = property(lambda s: TypedView(s, 32, signed=True), lambda s, v: setattr(s, '_val', int(v) & MASK32)) + b32 = property(lambda s: TypedView(s, 32), lambda s, v: setattr(s, '_val', int(v) & MASK32)) + f32 = property(lambda s: TypedView(s, 32, is_float=True), lambda s, v: setattr(s, '_val', _i32(float(v)))) + u24 = property(lambda s: TypedView(s, 24)) + i24 = property(lambda s: TypedView(s, 24, signed=True)) + u16 = property(lambda s: TypedView(s, 16), lambda s, v: setattr(s, '_val', (s._val & 
0xffff0000) | (int(v) & 0xffff))) + i16 = property(lambda s: TypedView(s, 16, signed=True), lambda s, v: setattr(s, '_val', (s._val & 0xffff0000) | (int(v) & 0xffff))) + b16 = property(lambda s: TypedView(s, 16), lambda s, v: setattr(s, '_val', (s._val & 0xffff0000) | (int(v) & 0xffff))) + f16 = property(lambda s: TypedView(s, 16, is_float=True), lambda s, v: setattr(s, '_val', (s._val & 0xffff0000) | ((v if isinstance(v, int) else _i16(float(v))) & 0xffff))) + u8 = property(lambda s: TypedView(s, 8)) + i8 = property(lambda s: TypedView(s, 8, signed=True)) + + def __getitem__(s, key): + if isinstance(key, slice): return SliceProxy(s, int(key.start), int(key.stop)) + return (s._val >> int(key)) & 1 + + def __setitem__(s, key, value): + if isinstance(key, slice): + high, low = int(key.start), int(key.stop) + mask = (1 << (high - low + 1)) - 1 + s._val = (s._val & ~(mask << low)) | ((int(value) & mask) << low) + elif value: s._val |= (1 << int(key)) + else: s._val &= ~(1 << int(key)) + + def __int__(s): return s._val + def __index__(s): return s._val + def __bool__(s): return bool(s._val) + + # Arithmetic (for tmp = tmp + 1 patterns). Float operands trigger f32 interpretation. + def __add__(s, o): return (_f32(s._val) + float(o)) if isinstance(o, float) else s._val + int(o) + def __radd__(s, o): return (float(o) + _f32(s._val)) if isinstance(o, float) else int(o) + s._val + def __sub__(s, o): return (_f32(s._val) - float(o)) if isinstance(o, float) else s._val - int(o) + def __rsub__(s, o): return (float(o) - _f32(s._val)) if isinstance(o, float) else int(o) - s._val + def __mul__(s, o): return (_f32(s._val) * float(o)) if isinstance(o, float) else s._val * int(o) + def __rmul__(s, o): return (float(o) * _f32(s._val)) if isinstance(o, float) else int(o) * s._val + def __and__(s, o): return s._val & int(o) + def __rand__(s, o): return int(o) & s._val + def __or__(s, o): return s._val | int(o) + def __ror__(s, o): return int(o) | s._val + def __xor__(s, o): return s._val ^ int(o) + def __rxor__(s, o): return int(o) ^ s._val + def __lshift__(s, o): n = int(o); return s._val << n if 0 <= n < 64 else 0 + def __rshift__(s, o): n = int(o); return s._val >> n if 0 <= n < 64 else 0 + def __invert__(s): return ~s._val + + # Comparison (for tmp >= 0x100000000 patterns) + def __lt__(s, o): return s._val < int(o) + def __le__(s, o): return s._val <= int(o) + def __gt__(s, o): return s._val > int(o) + def __ge__(s, o): return s._val >= int(o) + def __eq__(s, o): return s._val == int(o) + def __ne__(s, o): return s._val != int(o) + +# ═══════════════════════════════════════════════════════════════════════════════ +# COMPILER: pseudocode -> Python (minimal transforms) +# ═══════════════════════════════════════════════════════════════════════════════ + +def compile_pseudocode(pseudocode: str) -> str: + """Compile pseudocode to Python. 
Transforms are minimal - most syntax just works."""
+  # Join continuation lines (lines ending with || or && or open paren)
+  raw_lines = pseudocode.strip().split('\n')
+  joined_lines: list[str] = []
+  for line in raw_lines:
+    line = line.strip()
+    if joined_lines and (joined_lines[-1].rstrip().endswith(('||', '&&', '(', ',')) or
+                         (joined_lines[-1].count('(') > joined_lines[-1].count(')'))):
+      joined_lines[-1] = joined_lines[-1].rstrip() + ' ' + line
+    else:
+      joined_lines.append(line)
+
+  lines = []
+  indent, need_pass = 0, False
+  for line in joined_lines:
+    line = line.strip()
+    if not line or line.startswith('//'): continue
+
+    # Control flow - only need pass before outdent (endif/endfor/else/elsif)
+    # removesuffix, not rstrip: rstrip(' then') strips a character set, so it can eat
+    # the tail of a condition ending in 't'/'h'/'e'/'n' (e.g. 'exponent' -> 'expo')
+    if line.startswith('if '):
+      lines.append(' ' * indent + f"if {_expr(line[3:].removesuffix(' then'))}:")
+      indent += 1
+      need_pass = True
+    elif line.startswith('elsif '):
+      if need_pass: lines.append(' ' * indent + "pass")
+      indent -= 1
+      lines.append(' ' * indent + f"elif {_expr(line[6:].removesuffix(' then'))}:")
+      indent += 1
+      need_pass = True
+    elif line == 'else':
+      if need_pass: lines.append(' ' * indent + "pass")
+      indent -= 1
+      lines.append(' ' * indent + "else:")
+      indent += 1
+      need_pass = True
+    elif line.startswith('endif'):
+      if need_pass: lines.append(' ' * indent + "pass")
+      indent -= 1
+      need_pass = False
+    elif line.startswith('endfor'):
+      if need_pass: lines.append(' ' * indent + "pass")
+      indent -= 1
+      need_pass = False
+    elif line.startswith('declare '):
+      pass
+    elif m := re.match(r'for (\w+) in (.+?)\s*:\s*(.+?) do', line):
+      start, end = _expr(m[2].strip()), _expr(m[3].strip())
+      lines.append(' ' * indent + f"for {m[1]} in range({start}, int({end})+1):")
+      indent += 1
+      need_pass = True
+    elif '=' in line and not line.startswith('=='):
+      need_pass = False
+      line = line.rstrip(';')
+      # Handle tuple unpacking: { D1.u1, D0.u64 } = expr
+      if m := re.match(r'\{\s*D1\.[ui]1\s*,\s*D0\.[ui]64\s*\}\s*=\s*(.+)', line):
+        rhs = _expr(m[1])
+        lines.append(' ' * indent + f"_full = {rhs}")
+        lines.append(' ' * indent + f"D0.u64 = int(_full) & 0xffffffffffffffff")
+        lines.append(' ' * indent + f"D1 = Reg((int(_full) >> 64) & 1)")
+      # Compound assignment
+      elif any(op in line for op in ('+=', '-=', '*=', '/=', '|=', '&=', '^=')):
+        for op in ('+=', '-=', '*=', '/=', '|=', '&=', '^='):
+          if op in line:
+            lhs, rhs = line.split(op, 1)
+            lines.append(' ' * indent + f"{lhs.strip()} {op} {_expr(rhs.strip())}")
+            break
+      else:
+        lhs, rhs = line.split('=', 1)
+        lines.append(' ' * indent + _assign(lhs.strip(), _expr(rhs.strip())))
+  # If we ended with a control statement that needs a body, add pass
+  if need_pass: lines.append(' ' * indent + "pass")
+  return '\n'.join(lines)
+
+def _assign(lhs: str, rhs: str) -> str:
+  """Generate assignment. 
Bare tmp/SCC/etc get wrapped in Reg().""" + if lhs in ('tmp', 'SCC', 'VCC', 'EXEC', 'D0', 'D1', 'saveexec'): + return f"{lhs} = Reg({rhs})" + return f"{lhs} = {rhs}" + +def _expr(e: str) -> str: + """Expression transform: minimal - just fix syntax differences.""" + e = e.strip() + e = e.replace('&&', ' and ').replace('||', ' or ').replace('<>', ' != ') + e = re.sub(r'!([^=])', r' not \1', e) + + # Pack: { hi, lo } -> _pack(hi, lo) + e = re.sub(r'\{\s*(\w+\.u32)\s*,\s*(\w+\.u32)\s*\}', r'_pack32(\1, \2)', e) + def pack(m): + hi, lo = _expr(m[1].strip()), _expr(m[2].strip()) + return f'_pack({hi}, {lo})' + e = re.sub(r'\{\s*([^,{}]+)\s*,\s*([^,{}]+)\s*\}', pack, e) + + # Literals: 1'0U -> 0, 32'I(x) -> (x), B(x) -> (x) + e = re.sub(r"\d+'([0-9a-fA-Fx]+)[UuFf]*", r'\1', e) + e = re.sub(r"\d+'[FIBU]\(", "(", e) + e = re.sub(r'\bB\(', '(', e) # Bare B( without digit prefix + e = re.sub(r'([0-9a-fA-Fx])ULL\b', r'\1', e) + e = re.sub(r'([0-9a-fA-Fx])LL\b', r'\1', e) + e = re.sub(r'([0-9a-fA-Fx])U\b', r'\1', e) + e = re.sub(r'(\d\.?\d*)F\b', r'\1', e) + # Remove redundant type suffix after lane access: VCC.u64[laneId].u64 -> VCC.u64[laneId] + e = re.sub(r'(\[laneId\])\.[uib]\d+', r'\1', e) + + # Constants - INF is defined as an object supporting .f32/.f64 access + e = e.replace('+INF', 'INF').replace('-INF', '(-INF)') + e = re.sub(r'NAN\.f\d+', 'float("nan")', e) + + # Recursively process bracket contents to handle nested ternaries like S1.u32[x ? a : b] + def process_brackets(s): + result, i = [], 0 + while i < len(s): + if s[i] == '[': + # Find matching ] + depth, start = 1, i + 1 + j = start + while j < len(s) and depth > 0: + if s[j] == '[': depth += 1 + elif s[j] == ']': depth -= 1 + j += 1 + inner = _expr(s[start:j-1]) # Recursively process bracket content + result.append('[' + inner + ']') + i = j + else: + result.append(s[i]) + i += 1 + return ''.join(result) + e = process_brackets(e) + + # Ternary: a ? b : c -> (b if a else c) + while '?' in e: + depth, bracket, q = 0, 0, -1 + for i, c in enumerate(e): + if c == '(': depth += 1 + elif c == ')': depth -= 1 + elif c == '[': bracket += 1 + elif c == ']': bracket -= 1 + elif c == '?' 
and depth == 0 and bracket == 0: q = i; break + if q < 0: break + depth, bracket, col = 0, 0, -1 + for i in range(q + 1, len(e)): + if e[i] == '(': depth += 1 + elif e[i] == ')': depth -= 1 + elif e[i] == '[': bracket += 1 + elif e[i] == ']': bracket -= 1 + elif e[i] == ':' and depth == 0 and bracket == 0: col = i; break + if col < 0: break + cond, t, f = e[:q].strip(), e[q+1:col].strip(), e[col+1:].strip() + e = f'(({t}) if ({cond}) else ({f}))' + return e + +# ═══════════════════════════════════════════════════════════════════════════════ +# EXECUTION CONTEXT +# ═══════════════════════════════════════════════════════════════════════════════ + +class ExecContext: + """Context for running compiled pseudocode.""" + def __init__(self, s0=0, s1=0, s2=0, d0=0, scc=0, vcc=0, lane=0, exec_mask=MASK32, literal=0, vgprs=None, src0_idx=0, vdst_idx=0): + self.S0, self.S1, self.S2 = Reg(s0), Reg(s1), Reg(s2) + self.D0, self.D1 = Reg(d0), Reg(0) + self.SCC, self.VCC, self.EXEC = Reg(scc), Reg(vcc), Reg(exec_mask) + self.tmp, self.saveexec = Reg(0), Reg(exec_mask) + self.lane, self.laneId, self.literal = lane, lane, literal + self.SIMM16, self.SIMM32 = Reg(literal), Reg(literal) + self.VGPR = vgprs if vgprs is not None else {} + self.SRC0, self.VDST = Reg(src0_idx), Reg(vdst_idx) + + def run(self, code: str): + """Execute compiled code.""" + # Start with module globals (helpers, aliases), then add instance-specific bindings + ns = dict(globals()) + ns.update({ + 'S0': self.S0, 'S1': self.S1, 'S2': self.S2, 'D0': self.D0, 'D1': self.D1, + 'SCC': self.SCC, 'VCC': self.VCC, 'EXEC': self.EXEC, + 'EXEC_LO': SliceProxy(self.EXEC, 31, 0), 'EXEC_HI': SliceProxy(self.EXEC, 63, 32), + 'tmp': self.tmp, 'saveexec': self.saveexec, + 'lane': self.lane, 'laneId': self.laneId, 'literal': self.literal, + 'SIMM16': self.SIMM16, 'SIMM32': self.SIMM32, + 'VGPR': self.VGPR, 'SRC0': self.SRC0, 'VDST': self.VDST, + }) + exec(code, ns) + # Sync rebinds: if register was reassigned to new Reg or value, copy it back + def _sync(ctx_reg, ns_val): + if isinstance(ns_val, Reg): ctx_reg._val = ns_val._val + else: ctx_reg._val = int(ns_val) & MASK64 + if ns.get('SCC') is not self.SCC: _sync(self.SCC, ns['SCC']) + if ns.get('VCC') is not self.VCC: _sync(self.VCC, ns['VCC']) + if ns.get('EXEC') is not self.EXEC: _sync(self.EXEC, ns['EXEC']) + if ns.get('D0') is not self.D0: _sync(self.D0, ns['D0']) + if ns.get('D1') is not self.D1: _sync(self.D1, ns['D1']) + if ns.get('tmp') is not self.tmp: _sync(self.tmp, ns['tmp']) + if ns.get('saveexec') is not self.saveexec: _sync(self.saveexec, ns['saveexec']) + + def result(self) -> dict: + return {"d0": self.D0._val, "scc": self.SCC._val & 1} + +# ═══════════════════════════════════════════════════════════════════════════════ +# PDF EXTRACTION AND CODE GENERATION +# ═══════════════════════════════════════════════════════════════════════════════ + +PDF_URL = "https://docs.amd.com/api/khub/documents/UVVZM22UN7tMUeiW_4ShTQ/content" +INST_PATTERN = re.compile(r'^([SV]_[A-Z0-9_]+)\s+(\d+)\s*$', re.M) + +# Patterns that can't be handled by the DSL (require special handling in emu.py) +UNSUPPORTED = ['SGPR[', 'V_SWAP', 'eval ', 'BYTE_PERMUTE', 'FATAL_HALT', 'HW_REGISTERS', + 'PC =', 'PC=', 'PC+', '= PC', 'v_sad', '+:', 'vscnt', 'vmcnt', 'expcnt', 'lgkmcnt', + 'CVT_OFF_TABLE', '.bf16', 'ThreadMask', 'u8_to_u32', 'u4_to_u32', + 'S1[i', 'C.i32', 'v_msad_u8', 'S[i]', 'in[', '2.0 / PI', + 'if n.', 'DST.u32', 'addrd = DST', 'addr = DST'] # Malformed pseudocode from PDF + +def extract_pseudocode(text: str) 
-> str | None: + """Extract pseudocode from an instruction description snippet.""" + lines, result, depth = text.split('\n'), [], 0 + for line in lines: + s = line.strip() + if not s: continue + if re.match(r'^\d+ of \d+$', s): continue + if re.match(r'^\d+\.\d+\..*Instructions', s): continue + if s.startswith('"RDNA') or s.startswith('AMD '): continue + if s.startswith('Notes') or s.startswith('Functional examples'): break + if s.startswith('if '): depth += 1 + elif s.startswith('endif'): depth = max(0, depth - 1) + if s.endswith('.') and not any(p in s for p in ['D0', 'D1', 'S0', 'S1', 'S2', 'SCC', 'VCC', 'tmp', '=']): continue + if re.match(r'^[a-z].*\.$', s) and '=' not in s: continue + is_code = ( + any(p in s for p in ['D0.', 'D1.', 'S0.', 'S1.', 'S2.', 'SCC =', 'SCC ?', 'VCC', 'EXEC', 'tmp =', 'tmp[', 'lane =']) or + any(p in s for p in ['D0[', 'D1[', 'S0[', 'S1[', 'S2[']) or + s.startswith(('if ', 'else', 'elsif', 'endif', 'declare ', 'for ', 'endfor', '//')) or + re.match(r'^[a-z_]+\s*=', s) or re.match(r'^[a-z_]+\[', s) or (depth > 0 and '=' in s) + ) + if is_code: result.append(s) + return '\n'.join(result) if result else None + +def parse_pseudocode_from_pdf(pdf_path: str | None = None) -> dict: + """Parse pseudocode from PDF for all ops. Returns {enum_cls: {op: pseudocode}}.""" + import pdfplumber + from tinygrad.helpers import fetch + from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp + + OP_ENUMS = [SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp] + defined_ops = {} + for enum_cls in OP_ENUMS: + for op in enum_cls: + if op.name.startswith(('S_', 'V_')): defined_ops[(op.name, op.value)] = (enum_cls, op) + + pdf = pdfplumber.open(fetch(PDF_URL) if pdf_path is None else pdf_path) + all_text = '\n'.join(pdf.pages[i].extract_text() or '' for i in range(195, 560)) + matches = list(INST_PATTERN.finditer(all_text)) + instructions: dict = {cls: {} for cls in OP_ENUMS} + + for i, match in enumerate(matches): + name, opcode = match.group(1), int(match.group(2)) + key = (name, opcode) + if key not in defined_ops: continue + enum_cls, enum_val = defined_ops[key] + start = match.end() + end = matches[i + 1].start() if i + 1 < len(matches) else start + 2000 + snippet = all_text[start:end].strip() + if (pseudocode := extract_pseudocode(snippet)): instructions[enum_cls][enum_val] = pseudocode + + return instructions + +def generate_gen_pcode(output_path: str = "extra/assembly/rdna3/autogen/gen_pcode.py"): + """Generate gen_pcode.py - compiled pseudocode functions for the emulator.""" + from pathlib import Path + from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp + + OP_ENUMS = [SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp] + + print("Parsing pseudocode from PDF...") + by_cls = parse_pseudocode_from_pdf() + + total_found, total_ops = 0, 0 + for enum_cls in OP_ENUMS: + total = sum(1 for op in enum_cls if op.name.startswith(('S_', 'V_'))) + found = len(by_cls.get(enum_cls, {})) + total_found += found + total_ops += total + print(f"{enum_cls.__name__}: {found}/{total} ({100*found//total if total else 0}%)") + print(f"Total: {total_found}/{total_ops} ({100*total_found//total_ops}%)") + + print("\nCompiling to pseudocode functions...") + lines = ['''# autogenerated by pcode.py - do not edit +# to regenerate: python -m extra.assembly.rdna3.pcode +# ruff: 
noqa: E501,F405,F403 +from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp +from extra.assembly.rdna3.pcode import * +'''] + + compiled_count, skipped_count = 0, 0 + + for enum_cls in OP_ENUMS: + cls_name = enum_cls.__name__ + pseudocode_dict = by_cls.get(enum_cls, {}) + if not pseudocode_dict: continue + + fn_entries = [] + for op, pc in pseudocode_dict.items(): + if any(p in pc for p in UNSUPPORTED): + skipped_count += 1 + continue + + try: + code = compile_pseudocode(pc) + # CLZ/CTZ: The PDF pseudocode searches for the first 1 bit but doesn't break. + # Hardware stops at first match, so we need to add break after D0.i32 = i + if 'CLZ' in op.name or 'CTZ' in op.name: + code = code.replace('D0.i32 = i', 'D0.i32 = i; break # Stop at first 1 bit found') + # Detect flags for result handling + is_64 = any(p in pc for p in ['D0.u64', 'D0.b64', 'D0.f64', 'D0.i64', 'D1.u64', 'D1.b64', 'D1.f64', 'D1.i64']) + has_d1 = '{ D1' in pc + if has_d1: is_64 = True + is_cmp = cls_name == 'VOPCOp' and 'D0.u64[laneId]' in pc + is_cmpx = cls_name == 'VOPCOp' and 'EXEC.u64[laneId]' in pc # V_CMPX writes to EXEC per-lane + # V_DIV_SCALE passes through S0 if no branch taken + is_div_scale = 'DIV_SCALE' in op.name + # VOP3SD instructions that write VCC per-lane (either via VCC.u64[laneId] or by setting VCC = 0/1) + has_sdst = cls_name == 'VOP3SDOp' and ('VCC.u64[laneId]' in pc or is_div_scale) + + # Generate function with indented body + fn_name = f"_{cls_name}_{op.name}" + lines.append(f"def {fn_name}(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):") + # Add original pseudocode as comment + for pc_line in pc.split('\n'): + lines.append(f" # {pc_line}") + # V_DIV_SCALE: D0 defaults to S0 if no branch taken + if is_div_scale: + lines.append(" S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(s0), Reg(0)") + else: + lines.append(" S0, S1, S2, D0, D1 = Reg(s0), Reg(s1), Reg(s2), Reg(d0), Reg(0)") + lines.append(" SCC, VCC, EXEC = Reg(scc), Reg(vcc), Reg(exec_mask)") + lines.append(" EXEC_LO, EXEC_HI = SliceProxy(EXEC, 31, 0), SliceProxy(EXEC, 63, 32)") + lines.append(" tmp, saveexec = Reg(0), Reg(exec_mask)") + lines.append(" laneId = lane") + lines.append(" SIMM16, SIMM32 = Reg(literal), Reg(literal)") + lines.append(" SRC0, VDST = Reg(src0_idx), Reg(vdst_idx)") + # Add compiled pseudocode with markers + lines.append(" # --- compiled pseudocode ---") + for line in code.split('\n'): + lines.append(f" {line}") + lines.append(" # --- end pseudocode ---") + # Generate result dict + lines.append(" result = {'d0': D0._val, 'scc': SCC._val & 1}") + if has_sdst: + lines.append(" result['vcc_lane'] = (VCC._val >> lane) & 1") + else: + lines.append(" if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1") + if is_cmpx: + lines.append(" result['exec_lane'] = (EXEC._val >> lane) & 1") + else: + lines.append(" if EXEC._val != exec_mask: result['exec'] = EXEC._val") + if is_cmp: + lines.append(" result['vcc_lane'] = (D0._val >> lane) & 1") + if is_64: + lines.append(" result['d0_64'] = True") + if has_d1: + lines.append(" result['d1'] = D1._val & 1") + lines.append(" return result") + lines.append("") + + fn_entries.append((op, fn_name)) + compiled_count += 1 + except Exception as e: + print(f" Warning: Failed to compile {op.name}: {e}") + skipped_count += 1 + + if fn_entries: + lines.append(f'{cls_name}_FUNCTIONS = {{') + for op, fn_name in fn_entries: + lines.append(f" {cls_name}.{op.name}: 
{fn_name},") + lines.append('}') + lines.append('') + + # Add manually implemented lane instructions + lines.append(''' +# Manually implemented lane instructions (require special vgpr_write handling) +def _VOP3Op_V_WRITELANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # VGPR[lane][VDST] = S0.b32 - writes s0 to specified lane's VGPR + wr_lane = s1 & 0x1f # lane select (5 bits for wave32) + return {'d0': d0, 'scc': scc, 'vgpr_write': (wr_lane, vdst_idx, s0 & 0xffffffff)} + +def _VOP3Op_V_READLANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0 = VGPR[lane][SRC0] - reads from specified lane's VGPR + rd_lane = s1 & 0x1f # lane select (5 bits for wave32) + val = VGPR[rd_lane][src0_idx] if VGPR is not None and rd_lane < len(VGPR) and src0_idx < len(VGPR[rd_lane]) else s0 + return {'d0': val & 0xffffffff, 'scc': scc} + +def _VOP1Op_V_READFIRSTLANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0 = VGPR[first_active_lane][SRC0] - reads from first active lane + first_lane = 0 + for i in range(32): + if exec_mask & (1 << i): + first_lane = i + break + val = VGPR[first_lane][src0_idx] if VGPR is not None and first_lane < len(VGPR) and src0_idx < len(VGPR[first_lane]) else s0 + return {'d0': val & 0xffffffff, 'scc': scc} +''') + + lines.append('COMPILED_FUNCTIONS = {') + for enum_cls in OP_ENUMS: + cls_name = enum_cls.__name__ + if by_cls.get(enum_cls): lines.append(f' {cls_name}: {cls_name}_FUNCTIONS,') + lines.append('}') + lines.append('') + lines.append("# Add lane instructions to their respective dicts") + lines.append("VOP3Op_FUNCTIONS[VOP3Op.V_WRITELANE_B32] = _VOP3Op_V_WRITELANE_B32") + lines.append("VOP3Op_FUNCTIONS[VOP3Op.V_READLANE_B32] = _VOP3Op_V_READLANE_B32") + lines.append("VOP1Op_FUNCTIONS[VOP1Op.V_READFIRSTLANE_B32] = _VOP1Op_V_READFIRSTLANE_B32") + lines.append('') + lines.append('def get_compiled_functions(): return COMPILED_FUNCTIONS') + + Path(output_path).write_text('\n'.join(lines)) + print(f"\nGenerated {output_path}: {compiled_count} compiled, {skipped_count} skipped") + +if __name__ == "__main__": + generate_gen_pcode() diff --git a/extra/assembly/rdna3/test/external_test_usability.py b/extra/assembly/rdna3/test/external_test_usability.py new file mode 100644 index 0000000000..5b3827c6c3 --- /dev/null +++ b/extra/assembly/rdna3/test/external_test_usability.py @@ -0,0 +1,196 @@ +# Usability tests for the RDNA3 ASM DSL +# These tests demonstrate how the DSL *should* work for a good user experience +# Currently many of these tests fail - they document desired behavior + +import unittest +from extra.assembly.rdna3.autogen import * +from extra.assembly.rdna3.lib import Inst, RawImm, SGPR, VGPR + +class TestRegisterSliceSyntax(unittest.TestCase): + """ + Issue: Register slice syntax should use AMD assembly convention (inclusive end). + + In AMD assembly, s[4:7] means registers s4, s5, s6, s7 (4 registers, inclusive). 
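+    Python's builtin slice convention is exclusive, so the current implementation's
+    key.stop - key.start count comes up one register short.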
+ The DSL should match this convention so that: + - s[4:7] gives 4 registers + - Disassembler output can be copied directly back into DSL code + + Fix: Change _RegFactory.__getitem__ to use inclusive end: + key.stop - key.start + 1 (instead of key.stop - key.start) + """ + def test_register_slice_count(self): + # s[4:7] should give 4 registers: s4, s5, s6, s7 (AMD convention, inclusive) + reg = s[4:7] + self.assertEqual(reg.count, 4, "s[4:7] should give 4 registers (s4, s5, s6, s7)") + + def test_register_slice_roundtrip(self): + # Round-trip: DSL -> disasm -> DSL should preserve register count + reg = s[4:7] # 4 registers in AMD convention + inst = s_load_b128(reg, s[0:1], NULL, 0) + disasm = inst.disasm() + # Disasm shows s[4:7] - user should be able to copy this back + self.assertIn("s[4:7]", disasm) + # And s[4:7] in DSL should give the same 4 registers + reg_from_disasm = s[4:7] + self.assertEqual(reg_from_disasm.count, 4, "s[4:7] from disasm should give 4 registers") + + +class TestReprReadability(unittest.TestCase): + """ + Issue: repr() leaks internal RawImm type and omits zero-valued fields. + + When you create v_mov_b32_e32(v[0], v[1]), the repr shows: + VOP1(op=1, src0=RawImm(257)) + + Problems: + 1. vdst=v[0] is omitted because 0 is treated as "default" + 2. src0 shows RawImm(257) instead of v[1] + 3. User sees encoded values (257 = 256 + 1) instead of register names + + Expected repr: VOP1(op=1, vdst=v[0], src0=v[1]) + """ + def test_repr_shows_registers_not_raw_imm(self): + inst = v_mov_b32_e32(v[0], v[1]) + # Should show v[1], not RawImm(257) + self.assertNotIn("RawImm", repr(inst), "repr should not expose RawImm internal type") + self.assertIn("v[1]", repr(inst), "repr should show register name") + + def test_repr_includes_zero_dst(self): + inst = v_mov_b32_e32(v[0], v[1]) + # v[0] is a valid destination register, should be shown + self.assertIn("vdst", repr(inst), "repr should include vdst even when 0") + + def test_repr_roundtrip(self): + # repr should produce something that can be eval'd back + inst = v_mov_b32_e32(v[0], v[1]) + # This would require repr to output valid Python, e.g.: + # "VOP1(op=VOP1Op.V_MOV_B32, vdst=v[0], src0=v[1])" + r = repr(inst) + # At minimum, it should be human-readable + self.assertIn("v[", r, "repr should show register syntax") + + +class TestInstructionEquality(unittest.TestCase): + """ + Issue: No __eq__ method - instruction comparison requires repr() workaround. + + Two identical instructions should compare equal with ==, but currently: + inst1 == inst2 returns False + + The test_handwritten.py works around this with: + self.assertEqual(repr(self.inst), repr(reasm)) + """ + def test_identical_instructions_equal(self): + inst1 = v_mov_b32_e32(v[0], v[1]) + inst2 = v_mov_b32_e32(v[0], v[1]) + self.assertEqual(inst1, inst2, "identical instructions should be equal") + + def test_different_instructions_not_equal(self): + inst1 = v_mov_b32_e32(v[0], v[1]) + inst2 = v_mov_b32_e32(v[0], v[2]) + self.assertNotEqual(inst1, inst2, "different instructions should not be equal") + + +class TestVOPDHelperSignature(unittest.TestCase): + """ + Issue: VOPD helper functions have confusing semantics. + + v_dual_mul_f32 is defined as: + v_dual_mul_f32 = functools.partial(VOPD, VOPDOp.V_DUAL_MUL_F32) + + This binds VOPDOp.V_DUAL_MUL_F32 to the FIRST positional arg of VOPD.__init__, + which is 'opx'. So v_dual_mul_f32 sets the X operation. + + But then test_dual_mul in test_handwritten.py does: + v_dual_mul_f32(VOPDOp.V_DUAL_MUL_F32, vdstx=v[0], ...) 
+ + This passes V_DUAL_MUL_F32 as the SECOND positional arg (opy), making both + X and Y operations the same. This is confusing because: + 1. The function name suggests it handles the X operation + 2. But you still pass an opcode as the first arg (which becomes opy) + + Expected: Either make the helper fully specify both ops, or make the + signature clearer about what the positional arg means. + """ + def test_vopd_helper_opy_should_be_required(self): + # Using only keyword args "works" but opy silently defaults to 0 + inst = v_dual_mul_f32(vdstx=v[0], vdsty=v[1], srcx0=v[2], vsrcx1=v[3], srcy0=v[4], vsrcy1=v[5]) + self.assertEqual(inst.opx, VOPDOp.V_DUAL_MUL_F32) + # Bug: opy defaults to 0 (V_DUAL_FMAC_F32) silently - should require explicit opy + # This test documents the bug - it should fail once fixed + self.assertNotEqual(inst.opy, VOPDOp.V_DUAL_FMAC_F32, "opy should not silently default to FMAC") + + def test_vopd_helper_positional_arg_is_opy(self): + # The first positional arg after the partial becomes opy, not a second opx + inst = v_dual_mul_f32(VOPDOp.V_DUAL_MOV_B32, vdstx=v[0], vdsty=v[1], srcx0=v[2], vsrcx1=v[3], srcy0=v[4], vsrcy1=v[5]) + self.assertEqual(inst.opx, VOPDOp.V_DUAL_MUL_F32) # From partial + self.assertEqual(inst.opy, VOPDOp.V_DUAL_MOV_B32) # From first positional arg + + +class TestFieldAccessPreservesType(unittest.TestCase): + """ + Issue: Field access loses type information. + + After creating an instruction, accessing fields returns encoded int values: + inst = v_mov_b32_e32(v[0], v[1]) + inst.vdst # returns 0, not VGPR(0) + + This makes it impossible to round-trip register types through field access. + """ + def test_vdst_returns_register(self): + inst = v_mov_b32_e32(v[5], v[1]) + vdst = inst.vdst + # Should return a VGPR, not an int + self.assertIsInstance(vdst, (VGPR, int), "vdst should return VGPR or at least be usable") + # Ideally: self.assertIsInstance(vdst, VGPR) + + def test_src_returns_register_for_vgpr_source(self): + inst = v_mov_b32_e32(v[0], v[1]) + # src0 is encoded as 257 (256 + 1 for v1) + # Ideally it should decode back to v[1] + src0_raw = inst._values.get('src0') + # Currently returns RawImm(257), should return VGPR(1) or similar + self.assertNotIsInstance(src0_raw, RawImm, "source should not be RawImm internally") + + +class TestArgumentDiscoverability(unittest.TestCase): + """ + Issue: No clear signature for positional arguments. + + inspect.signature(s_load_b128) shows: (*args, literal=None, **kwargs) + + Users have no way to know the argument order without reading source code. + The order is implicitly defined by the class field definition order. + + Possible fixes: + 1. Add explicit parameter names to functools.partial + 2. Generate type stubs with proper signatures + 3. Add docstrings listing the expected arguments + """ + def test_signature_has_named_params(self): + import inspect + sig = inspect.signature(s_load_b128) + params = list(sig.parameters.keys()) + # Currently: ['args', 'literal', 'kwargs'] (from *args, literal=None, **kwargs) + # Expected: something like ['sdata', 'sbase', 'soffset', 'offset', 'literal'] + self.assertIn('sdata', params, "signature should show field names") + + +class TestSpecialConstants(unittest.TestCase): + """ + Issue: NULL and other constants are IntEnum values that might be confusing. + + NULL = SrcEnum.NULL = 124, but users might expect NULL to be a special object + that clearly represents "no register" rather than a magic number. 
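+    Since SrcEnum is an IntEnum, NULL still compares equal to 124 while keeping its
+    name visible in repr(); the tests below check exactly that distinction.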
+ """ + def test_null_has_clear_repr(self): + # NULL should have a clear string representation + self.assertIn("NULL", str(NULL) or repr(NULL), "NULL should be clearly identifiable") + + def test_null_is_distinguishable_from_int(self): + # NULL should be distinguishable from the raw integer 124 + self.assertNotEqual(type(NULL), int, "NULL should not be plain int") + + +if __name__ == "__main__": + unittest.main() diff --git a/extra/assembly/rdna3/test/test_compare_emulators.py b/extra/assembly/rdna3/test/test_compare_emulators.py index c76ee6d5c8..465e901631 100644 --- a/extra/assembly/rdna3/test/test_compare_emulators.py +++ b/extra/assembly/rdna3/test/test_compare_emulators.py @@ -20,6 +20,15 @@ class KernelInfo: buf_idxs: list[int] # indices into shared buffer pool buf_sizes: list[int] # sizes for each buffer index +def _is_f32_nan(bits: int) -> bool: + """Check if 32-bit value is a NaN (exponent all 1s, mantissa non-zero).""" + return (bits & 0x7f800000) == 0x7f800000 and (bits & 0x007fffff) != 0 + +def _vals_equal(a: int, b: int) -> bool: + """Compare two 32-bit values, treating all NaN bit patterns as equal.""" + if a == b: return True + return _is_f32_nan(a) and _is_f32_nan(b) + @dataclass class StateSnapshot: pc: int @@ -29,20 +38,20 @@ class StateSnapshot: sgpr: list[int] vgpr: list[list[int]] - def diff(self, other: 'StateSnapshot', n_lanes: int) -> list[str]: + def diff(self, other: 'StateSnapshot', n_lanes: int, arrow: str = " vs ") -> list[str]: """Return list of differences between two states.""" diffs = [] - if self.pc != other.pc: diffs.append(f"pc: {self.pc} vs {other.pc}") - if self.scc != other.scc: diffs.append(f"scc: {self.scc} vs {other.scc}") - if self.vcc != other.vcc: diffs.append(f"vcc: 0x{self.vcc:08x} vs 0x{other.vcc:08x}") - if self.exec_mask != other.exec_mask: diffs.append(f"exec: 0x{self.exec_mask:08x} vs 0x{other.exec_mask:08x}") + if self.pc != other.pc: diffs.append(f"pc: {self.pc}{arrow}{other.pc}") + if self.scc != other.scc: diffs.append(f"scc: {self.scc}{arrow}{other.scc}") + if self.vcc != other.vcc: diffs.append(f"vcc: 0x{self.vcc:08x}{arrow}0x{other.vcc:08x}") + if self.exec_mask != other.exec_mask: diffs.append(f"exec: 0x{self.exec_mask:08x}{arrow}0x{other.exec_mask:08x}") for i, (a, b) in enumerate(zip(self.sgpr, other.sgpr)): # Skip VCC_LO/HI (106/107) and EXEC_LO/HI (126/127) as they alias vcc/exec_mask which are compared separately if i in (106, 107, 126, 127): continue - if a != b: diffs.append(f"sgpr[{i}]: 0x{a:08x} vs 0x{b:08x}") + if not _vals_equal(a, b): diffs.append(f"sgpr[{i}]: 0x{a:08x}{arrow}0x{b:08x}") for lane in range(n_lanes): for i, (a, b) in enumerate(zip(self.vgpr[lane], other.vgpr[lane])): - if a != b: diffs.append(f"vgpr[{lane}][{i}]: 0x{a:08x} vs 0x{b:08x}") + if not _vals_equal(a, b): diffs.append(f"vgpr[{lane}][{i}]: 0x{a:08x}{arrow}0x{b:08x}") return diffs class CStateSnapshot(ctypes.Structure): @@ -157,17 +166,32 @@ def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: t if debug: print(f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step}: PC={python_before.pc}, inst={inst_str}") + # Instructions with known Rust emulator bugs - sync Python to Rust after execution + # v_div_scale/v_div_fixup: Rust has different VCC handling + # v_cvt_f16_f32: Rust clears high 16 bits, but hardware (and Python) preserves them + sync_after = any(x in inst_str for x in ('v_div_scale_f32', 'v_div_scale_f64', 'v_div_fixup_f32', 'v_div_fixup_f64', + 'v_cvt_f16_f32')) diffs = rust_before.diff(python_before, n_lanes) if 
diffs: trace_lines = [] - for s, pc, d, rb, pb in trace[:-1]: + for idx, (s, pc, d, rb, pb) in enumerate(trace): trace_lines.append(f" step {s}: PC={pc:3d} {d}") - if trace.index((s, pc, d, rb, pb)) < len(trace) - 2: - next_rb, next_pb = trace[trace.index((s, pc, d, rb, pb)) + 1][3:5] - inst_diffs = rb.diff(next_rb, n_lanes) - if inst_diffs: trace_lines.append(f" rust changes: {', '.join(inst_diffs[:3])}") + if idx < len(trace) - 1: + next_rb, next_pb = trace[idx + 1][3:5] + rust_diffs = rb.diff(next_rb, n_lanes, "->") + python_diffs = pb.diff(next_pb, n_lanes, "->") + if rust_diffs: trace_lines.append(f" rust: {', '.join(rust_diffs[:5])}") + if python_diffs: trace_lines.append(f" python: {', '.join(python_diffs[:5])}") + elif rust_diffs: trace_lines.append(f" python: (no changes)") + else: + # Last traced instruction - compare with current state + rust_diffs = rb.diff(rust_before, n_lanes, "->") + python_diffs = pb.diff(python_before, n_lanes, "->") + if rust_diffs: trace_lines.append(f" rust: {', '.join(rust_diffs[:5])}") + if python_diffs: trace_lines.append(f" python: {', '.join(python_diffs[:5])}") + elif rust_diffs: trace_lines.append(f" python: (no changes)") trace_str = "\n".join(trace_lines) - return False, f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step} before inst '{inst_str}': states differ:\n " + "\n ".join(diffs[:10]) + f"\n Recent instructions:\n{trace_str}", total_steps + return False, f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step} before inst '{inst_str}': states differ (rust vs python):\n " + "\n ".join(diffs[:10]) + f"\n Recent instructions:\n{trace_str}", total_steps rust_result = rust.step() python_result = python.step() @@ -176,6 +200,14 @@ def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: t trace_str = "\n".join(f" step {s}: PC={pc:3d} {d}" for s, pc, d, _, _ in trace) return False, f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step}: different return codes: rust={rust_result}, python={python_result}, inst={inst_str}\n Recent instructions:\n{trace_str}", total_steps + # Sync Python state to Rust after instructions with known Rust emulator differences + if sync_after: + rust_after = rust.get_snapshot() + for i in range(128): python.set_sgpr(i, rust_after.sgpr[i]) + for lane in range(n_lanes): + for i in range(256): python.set_vgpr(lane, i, rust_after.vgpr[lane][i]) + python.state.pc, python.state.scc, python.state.vcc, python.state.exec_mask = rust_after.pc, rust_after.scc, rust_after.vcc, rust_after.exec_mask + if rust_result == -1: total_steps += step + 1 break @@ -330,9 +362,21 @@ class TestTinygradKernels(unittest.TestCase): def test_exp(self): self._test_kernel(lambda T: T([0.0, 1.0, 2.0]).exp()) def test_log(self): self._test_kernel(lambda T: T([1.0, 2.0, 3.0]).log()) def test_sin(self): self._test_kernel(lambda T: T([0.0, 1.0, 2.0]).sin()) + def test_cos(self): self._test_kernel(lambda T: T([0.0, 1.0, 2.0]).cos()) def test_sqrt(self): self._test_kernel(lambda T: T([1.0, 4.0, 9.0]).sqrt()) def test_recip(self): self._test_kernel(lambda T: T([1.0, 2.0, 4.0]).reciprocal()) + # Sin/cos with various ranges - test polynomial expansion + def test_sin_small(self): self._test_kernel(lambda T: T([0.1, 0.2, 0.3, 0.4, 0.5]*7).sin()) # 35 elements, small angles + def test_sin_pi(self): self._test_kernel(lambda T: T([3.14159, 1.5708, 0.7854, -1.5708, -3.14159]*7).sin()) # around pi + def test_sin_medium(self): self._test_kernel(lambda T: T([10.0, 20.0, 30.0, 50.0, 100.0]*7).sin()) # medium values + def test_sin_negative(self): 
self._test_kernel(lambda T: T([-0.5, -1.0, -2.0, -5.0, -10.0]*7).sin()) # negative values + def test_cos_small(self): self._test_kernel(lambda T: T([0.1, 0.2, 0.3, 0.4, 0.5]*7).cos()) + def test_cos_pi(self): self._test_kernel(lambda T: T([3.14159, 1.5708, 0.7854, -1.5708, -3.14159]*7).cos()) + def test_cos_medium(self): self._test_kernel(lambda T: T([10.0, 20.0, 30.0, 50.0, 100.0]*7).cos()) + @unittest.skip("Rust emulator has V_DIV_SCALE_F32 bug - returns 0 instead of src0 for normal cases") + def test_tan(self): self._test_kernel(lambda T: T([0.1, 0.2, 0.5, 1.0, -0.5]*7).tan()) # avoid pi/2 + # Binary ops def test_add(self): self._test_kernel(lambda T: T([1.0, 2.0]) + T([3.0, 4.0])) def test_sub(self): self._test_kernel(lambda T: T([5.0, 6.0]) - T([1.0, 2.0])) @@ -445,6 +489,14 @@ class TestTinygradKernels(unittest.TestCase): # Pooling operations - regression test for VCC wave32 mode (S_CBRANCH_VCCZ should only check VCC_LO) def test_avg_pool2d(self): self._test_kernel(lambda T: T.empty(1, 1, 8, 8).avg_pool2d(kernel_size=(4,4), stride=2)) + + # Trig functions with special values (inf, nan, 0) + def test_sin_special(self): self._test_kernel(lambda T: T([0., 0.25, 0.5, 1.0]*8).sin()) + def test_cos_special(self): self._test_kernel(lambda T: T([0., 0.25, 0.5, 1.0]*8).cos()) + + # Sqrt and rsqrt + def test_sqrt(self): self._test_kernel(lambda T: T([0., 1., 4., 9.]*8).sqrt()) + def test_rsqrt(self): self._test_kernel(lambda T: T([1., 4., 9., 16.]*8).rsqrt()) @unittest.skip("Rust emulator has S_ADD_I32 SCC bug - uses carry instead of signed overflow") def test_avg_pool3d(self): import numpy as np @@ -462,5 +514,33 @@ class TestTinygradKernels(unittest.TestCase): self._test_kernel(lambda T: T(np.random.randn(2, 4, 9, 9, 9).astype(np.float32).tolist()).conv_transpose2d( T(np.random.randn(4, 4, 3, 3, 3).astype(np.float32).tolist())), max_steps=500000) + # Tests from test_ops.py failures + def test_gelu_extreme(self): self._test_kernel(lambda T: T.empty(45, 65).gelu()) + def test_gemm_64x64(self): self._test_kernel(lambda T: T.empty(64, 64) @ T.empty(64, 64), max_steps=500000) + def test_gemm_fp16(self): self._test_kernel(lambda T: T.empty(64, 64).half() @ T.empty(64, 64).half(), max_steps=500000) + def test_global_avg_pool2d(self): self._test_kernel(lambda T: T.empty(32, 2, 111, 28).avg_pool2d(kernel_size=(111, 28)), max_steps=100000) + @unittest.skip("Rust emulator has S_ADD_I32 SCC bug - uses carry instead of signed overflow") + def test_grouped_conv2d(self): self._test_kernel(lambda T: T.empty(4, 15, 5, 5).conv2d(T.empty(35, 3, 3, 3), groups=5), max_steps=200000) + @unittest.skip("Rust emulator has S_ADD_I32 SCC bug - uses carry instead of signed overflow") + def test_grouped_conv_transpose2d(self): self._test_kernel(lambda T: T.empty(2, 4, 9, 9).conv_transpose2d(T.empty(4, 4, 3, 3), groups=2), max_steps=200000) + def test_hardsigmoid(self): self._test_kernel(lambda T: T.empty(45, 65).hardsigmoid()) + def test_hardsigmoid_extreme(self): self._test_kernel(lambda T: T.empty(45, 65).sigmoid()) + def test_matvec(self): self._test_kernel(lambda T: (T.empty(1, 128) @ T.empty(128, 128)).relu(), max_steps=200000) + def test_matvecmat(self): self._test_kernel(lambda T: ((T.empty(1, 128) @ T.empty(128, 128)).relu() @ T.empty(128, 128)), max_steps=300000) + def test_max_reduce_45x3(self): self._test_kernel(lambda T: T.empty(45, 3).max()) + def test_max_dont_collapse(self): self._test_kernel(lambda T: T.empty(4, 8).max(axis=1)) + def test_max_pool2d_simple(self): self._test_kernel(lambda T: T.empty(1, 1, 
2, 3).max_pool2d(kernel_size=(2, 2))) + def test_max_pool2d_32x2(self): self._test_kernel(lambda T: T.empty(32, 2, 11, 28).max_pool2d(kernel_size=(2, 2))) + def test_max_pool2d_asymmetric_padding(self): self._test_kernel(lambda T: T.empty(4, 2, 111, 28).max_pool2d(kernel_size=(5, 5), padding=(0, 1, 0, 1))) + def test_max_pool2d_bigger_stride(self): self._test_kernel(lambda T: T.empty(4, 2, 11, 28).max_pool2d(kernel_size=(2, 2), stride=(2, 3))) + def test_max_pool2d_unit_stride(self): self._test_kernel(lambda T: T.empty(3, 2, 17, 14).max_pool2d(kernel_size=(5, 5), stride=1)) + def test_max_pool2d_smaller_stride(self): self._test_kernel(lambda T: T.empty(3, 2, 17, 14).max_pool2d(kernel_size=(5, 5), stride=(2, 3))) + def test_max_unpool2d(self): self._test_kernel(lambda T: T.max_unpool2d(*T.empty(8, 3, 50, 50).max_pool2d(kernel_size=(5, 5), stride=(6, 5), return_indices=True), kernel_size=(5, 5), stride=(6, 5))) + def test_isinf(self): self._test_kernel(lambda T: T([float('-inf'), 0., float('inf'), 1.1]*8).isinf()) + def test_isfinite(self): self._test_kernel(lambda T: T([float('-inf'), 0., float('inf'), 1.1]*8).isfinite()) + + # WMMA tests - uses wave matrix multiply for larger fp16 matmuls + def test_wmma_gemm_fp16(self): self._test_kernel(lambda T: T.empty(64, 64).half() @ T.empty(64, 64).half(), max_steps=1000000) + if __name__ == "__main__": unittest.main() diff --git a/extra/assembly/rdna3/test/test_emu.py b/extra/assembly/rdna3/test/test_emu.py index df181a8938..d7b3c45e24 100644 --- a/extra/assembly/rdna3/test/test_emu.py +++ b/extra/assembly/rdna3/test/test_emu.py @@ -1,845 +1,1977 @@ -# Unit tests for RDNA3 Python emulator -import unittest -import ctypes -import struct -import math -from extra.assembly.rdna3.emu import ( - WaveState, decode_program, exec_wave, exec_workgroup, run_asm, - i32, f32, sext, WAVE_SIZE, set_valid_mem_ranges -) +#!/usr/bin/env python3 +"""Regression tests for the RDNA3 emulator instruction execution. +Uses run_asm() with memory output, so tests can run on both emulator and real hardware. + +Set USE_HW=1 to run on both emulator and real hardware, comparing results. 
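+
+A typical test drives run_program() with a list of DSL instructions and checks the
+captured wave state, e.g. (illustrative):
+
+  st = run_program([v_mov_b32_e32(v[0], 7)], n_lanes=1)
+  assert st.vgpr[0][0] == 7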
+""" + +import ctypes, unittest, os, struct from extra.assembly.rdna3.autogen import * from extra.assembly.rdna3.lib import RawImm +from extra.assembly.rdna3.emu import WaveState, run_asm, set_valid_mem_ranges +from extra.assembly.rdna3.pcode import _i32, _f32 -def run_kernel(kernel: bytes, n_threads: int = 1, n_outputs: int = 1) -> list[int]: - """Helper to run a kernel and return output values.""" - output = (ctypes.c_uint32 * (n_threads * n_outputs))(*[0xdead] * (n_threads * n_outputs)) - output_ptr = ctypes.addressof(output) - args = (ctypes.c_uint64 * 1)(output_ptr) +VCC = SrcEnum.VCC_LO # For VOP3SD sdst field +USE_HW = os.environ.get("USE_HW", "0") == "1" +# Tolerance for float comparisons (in ULPs or absolute) +FLOAT_TOLERANCE = 1e-5 + +# Output buffer layout: vgpr[16][32], sgpr[16], vcc, scc +# Each VGPR store writes 32 lanes (128 bytes), so vgpr[i] is at offset i*128 +N_VGPRS, N_SGPRS, WAVE_SIZE = 16, 16, 32 +VGPR_BYTES = N_VGPRS * WAVE_SIZE * 4 # 16 regs * 32 lanes * 4 bytes = 2048 +SGPR_BYTES = N_SGPRS * 4 # 16 regs * 4 bytes = 64 +OUT_BYTES = VGPR_BYTES + SGPR_BYTES + 8 # + vcc + scc + +def f2i(f: float) -> int: return _i32(f) +def i2f(i: int) -> float: return _f32(i) +def f2i64(f: float) -> int: return struct.unpack(' float: return struct.unpack(' bytes: + return b''.join(inst.to_bytes() for inst in instructions) + +def get_prologue_epilogue(n_lanes: int) -> tuple[list, list]: + """Generate prologue and epilogue instructions for state capture.""" + # Prologue: save s[0:1] and v[0] before test clobbers them + # Use s[80:81] for args pointer (safe range, avoiding VCC=106-107 and staying under 100) + prologue = [ + s_mov_b32(s[80], s[0]), + s_mov_b32(s[81], s[1]), + v_mov_b32_e32(v[255], v[0]), + ] + # Zero out test registers (v0-v15, s0-s15, vcc) so emu and hw start from same state + for i in range(N_VGPRS): + prologue.append(v_mov_b32_e32(v[i], 0)) + for i in range(N_SGPRS): + prologue.append(s_mov_b32(s[i], 0)) + prologue.append(s_mov_b32(s[SrcEnum.VCC_LO - 128], 0)) # zero VCC + + # Epilogue: store wave state to memory + # Use s[90-99] for epilogue temps to stay in safe SGPR range (<100, avoiding VCC=106-107) + # s[90] = saved VCC, s[91] = saved SCC, s[92:93] = output addr, s[94] = saved EXEC + # Save VCC/SCC first before we clobber them + epilogue = [ + s_mov_b32(s[90], SrcEnum.VCC_LO), # save VCC + s_cselect_b32(s[91], 1, 0), # save SCC + s_load_b64(s[92:93], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + v_lshlrev_b32_e32(v[240], 2, v[255]), # v[240] = lane_id * 4 + ] + # Store VGPRs: vgpr[i] at offset i*128 + lane_id*4 + for i in range(N_VGPRS): + epilogue.append(global_store_b32(addr=v[240], data=v[i], saddr=s[92], offset=i * WAVE_SIZE * 4)) + # Store SGPRs at VGPR_BYTES + i*4 (lane 0 only via exec mask) + epilogue.append(v_mov_b32_e32(v[241], 0)) + epilogue.append(v_cmp_eq_u32_e32(v[255], v[241])) + epilogue.append(s_and_saveexec_b32(s[94], SrcEnum.VCC_LO)) + epilogue.append(v_mov_b32_e32(v[240], 0)) + for i in range(N_SGPRS): + epilogue.append(v_mov_b32_e32(v[243], s[i])) + epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92], offset=VGPR_BYTES + i * 4)) + # Store saved VCC + epilogue.append(v_mov_b32_e32(v[243], s[90])) + epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92], offset=VGPR_BYTES + SGPR_BYTES)) + # Store saved SCC + epilogue.append(v_mov_b32_e32(v[243], s[91])) + epilogue.append(global_store_b32(addr=v[240], data=v[243], saddr=s[92], offset=VGPR_BYTES + SGPR_BYTES + 4)) + 
epilogue.append(s_mov_b32(s[SrcEnum.EXEC_LO - 128], s[94])) # restore exec + epilogue.append(s_endpgm()) + + return prologue, epilogue + +def parse_output(out_buf: bytes, n_lanes: int) -> WaveState: + """Parse output buffer into WaveState.""" + st = WaveState() + for i in range(N_VGPRS): + for lane in range(n_lanes): + off = i * WAVE_SIZE * 4 + lane * 4 + st.vgpr[lane][i] = struct.unpack_from(' WaveState: + """Run instructions via emulator run_asm, dump state to memory, return WaveState.""" + out_buf = (ctypes.c_uint8 * OUT_BYTES)(*([0] * OUT_BYTES)) + out_addr = ctypes.addressof(out_buf) + + prologue, epilogue = get_prologue_epilogue(n_lanes) + code = assemble(prologue + instructions + epilogue) + + args = (ctypes.c_uint64 * 1)(out_addr) args_ptr = ctypes.addressof(args) - kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel) - kernel_ptr = ctypes.addressof(kernel_buf) - # Register valid memory ranges for bounds checking - set_valid_mem_ranges({ - (output_ptr, ctypes.sizeof(output)), - (args_ptr, ctypes.sizeof(args)), - (kernel_ptr, len(kernel)), - }) - result = run_asm(kernel_ptr, len(kernel), 1, 1, 1, n_threads, 1, 1, args_ptr) + kernel_buf = (ctypes.c_char * len(code)).from_buffer_copy(code) + lib_ptr = ctypes.addressof(kernel_buf) + + set_valid_mem_ranges({(out_addr, OUT_BYTES), (args_ptr, 8)}) + result = run_asm(lib_ptr, len(code), 1, 1, 1, n_lanes, 1, 1, args_ptr) assert result == 0, f"run_asm failed with {result}" - return [output[i] for i in range(n_threads * n_outputs)] -def make_store_kernel(setup_instrs: list, store_vreg: int = 1) -> bytes: - """Create a kernel that runs setup instructions then stores v[store_vreg] to output[tid].""" - kernel = b'' - # Load output pointer - kernel += s_load_b64(s[2:3], s[0:1], soffset=NULL, offset=0).to_bytes() - kernel += s_waitcnt(lgkmcnt=0).to_bytes() - # Run setup instructions - for instr in setup_instrs: - kernel += instr.to_bytes() - # Compute offset: v3 = tid * 4 - kernel += v_lshlrev_b32_e32(v[3], 2, v[0]).to_bytes() - # Store result - kernel += global_store_b32(addr=v[3], data=v[store_vreg], saddr=s[2]).to_bytes() - kernel += s_endpgm().to_bytes() - return kernel + return parse_output(bytes(out_buf), n_lanes) -class TestScalarOps(unittest.TestCase): - def test_s_mov_b32(self): - state = WaveState() - kernel = s_mov_b32(s[5], 42).to_bytes() + s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.sgpr[5], 42) +def run_program_hw(instructions: list, n_lanes: int = 1) -> WaveState: + """Run instructions on real AMD hardware via HIPCompiler and AMDProgram.""" + from tinygrad.device import Device + from tinygrad.runtime.ops_amd import AMDProgram + from tinygrad.runtime.support.compiler_amd import HIPCompiler + from tinygrad.helpers import flat_mv - def test_s_add_u32(self): - state = WaveState() - state.sgpr[0], state.sgpr[1] = 100, 50 - kernel = s_add_u32(s[2], s[0], s[1]).to_bytes() + s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.sgpr[2], 150) - self.assertEqual(state.scc, 0) # no carry + dev = Device["AMD"] + compiler = HIPCompiler(dev.arch) - def test_s_add_u32_carry(self): - state = WaveState() - state.sgpr[0], state.sgpr[1] = 0xffffffff, 1 - kernel = s_add_u32(s[2], s[0], s[1]).to_bytes() + s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.sgpr[2], 0) - self.assertEqual(state.scc, 1) # carry + 
prologue, epilogue = get_prologue_epilogue(n_lanes) + code = assemble(prologue + instructions + epilogue) - def test_s_sub_u32(self): - state = WaveState() - state.sgpr[0], state.sgpr[1] = 100, 30 - kernel = s_sub_u32(s[2], s[0], s[1]).to_bytes() + s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.sgpr[2], 70) - self.assertEqual(state.scc, 0) # no borrow + # Create inline assembly source with .byte directives + byte_str = ', '.join(f'0x{b:02x}' for b in code) + asm_src = f""".text +.globl test +.p2align 8 +.type test,@function +test: +.byte {byte_str} - def test_s_and_b32(self): - state = WaveState() - state.sgpr[0], state.sgpr[1] = 0xff00, 0x0ff0 - kernel = s_and_b32(s[2], s[0], s[1]).to_bytes() + s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.sgpr[2], 0x0f00) +.rodata +.p2align 6 +.amdhsa_kernel test + .amdhsa_next_free_vgpr 256 + .amdhsa_next_free_sgpr 96 + .amdhsa_wavefront_size32 1 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_kernarg_size 8 +.end_amdhsa_kernel - def test_s_or_b32(self): - state = WaveState() - state.sgpr[0], state.sgpr[1] = 0xff00, 0x00ff - kernel = s_or_b32(s[2], s[0], s[1]).to_bytes() + s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.sgpr[2], 0xffff) +.amdgpu_metadata +--- +amdhsa.version: + - 1 + - 0 +amdhsa.kernels: + - .name: test + .symbol: test.kd + .kernarg_segment_size: 8 + .group_segment_fixed_size: 0 + .private_segment_fixed_size: 0 + .kernarg_segment_align: 8 + .wavefront_size: 32 + .sgpr_count: 96 + .vgpr_count: 256 + .max_flat_workgroup_size: 1024 +... +.end_amdgpu_metadata +""" - def test_s_lshl_b32(self): - state = WaveState() - state.sgpr[0], state.sgpr[1] = 1, 4 - kernel = s_lshl_b32(s[2], s[0], s[1]).to_bytes() + s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.sgpr[2], 16) + lib = compiler.compile(asm_src) + prg = AMDProgram(dev, "test", lib) - def test_s_lshr_b32(self): - state = WaveState() - state.sgpr[0], state.sgpr[1] = 256, 4 - kernel = s_lshr_b32(s[2], s[0], s[1]).to_bytes() + s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.sgpr[2], 16) + # Allocate output buffer on GPU + out_gpu = dev.allocator.alloc(OUT_BYTES) - def test_s_mul_i32(self): - state = WaveState() - state.sgpr[0], state.sgpr[1] = 7, 6 - kernel = s_mul_i32(s[2], s[0], s[1]).to_bytes() + s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.sgpr[2], 42) + # Run the kernel + prg(out_gpu, global_size=(1, 1, 1), local_size=(n_lanes, 1, 1), wait=True) - def test_s_cmp_eq_u32(self): - state = WaveState() - state.sgpr[0], state.sgpr[1] = 42, 42 - kernel = s_cmp_eq_u32(s[0], s[1]).to_bytes() + s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.scc, 1) + # Copy result back + out_buf = bytearray(OUT_BYTES) + dev.allocator._copyout(flat_mv(memoryview(out_buf)), out_gpu) - def test_s_cmp_lg_u32(self): - state = WaveState() - state.sgpr[0], state.sgpr[1] = 42, 43 - kernel = s_cmp_lg_u32(s[0], s[1]).to_bytes() + s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.scc, 1) + return 
parse_output(bytes(out_buf), n_lanes) -class TestVectorOps(unittest.TestCase): - def test_v_mov_b32(self): - kernel = make_store_kernel([v_mov_b32_e32(v[1], 42)]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out, [42]) +def compare_wave_states(emu_st: WaveState, hw_st: WaveState, n_lanes: int, n_vgprs: int = N_VGPRS) -> list[str]: + """Compare two WaveStates and return list of differences.""" + import math + diffs = [] + # Compare VGPRs - vgpr is list[lane][reg] + for i in range(n_vgprs): + for lane in range(n_lanes): + emu_val = emu_st.vgpr[lane][i] + hw_val = hw_st.vgpr[lane][i] + if emu_val != hw_val: + emu_f, hw_f = _f32(emu_val), _f32(hw_val) + # Handle NaN comparison + if math.isnan(emu_f) and math.isnan(hw_f): + continue + diffs.append(f"v[{i}] lane {lane}: emu=0x{emu_val:08x} ({emu_f:.6g}) hw=0x{hw_val:08x} ({hw_f:.6g})") + # Compare SGPRs - sgpr is list + for i in range(N_SGPRS): + emu_val = emu_st.sgpr[i] + hw_val = hw_st.sgpr[i] + if emu_val != hw_val: + diffs.append(f"s[{i}]: emu=0x{emu_val:08x} hw=0x{hw_val:08x}") + # Compare VCC + if emu_st.vcc != hw_st.vcc: + diffs.append(f"vcc: emu=0x{emu_st.vcc:08x} hw=0x{hw_st.vcc:08x}") + # Compare SCC + if emu_st.scc != hw_st.scc: + diffs.append(f"scc: emu={emu_st.scc} hw={hw_st.scc}") + return diffs - def test_v_add_nc_u32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 10), - v_mov_b32_e32(v[2], 32), - v_add_nc_u32_e32(v[1], v[1], v[2]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out, [42]) +def run_program(instructions: list, n_lanes: int = 1) -> WaveState: + """Run instructions and return WaveState. - def test_v_sub_nc_u32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 50), - v_mov_b32_e32(v[2], 8), - v_sub_nc_u32_e32(v[1], v[1], v[2]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out, [42]) + If USE_HW=1, runs on both emulator and hardware, compares results, and raises if they differ. + Otherwise, runs only on emulator. 
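+ + Usage sketch (illustrative values; mirrors how the tests below call it): + st = run_program([v_mov_b32_e32(v[0], 1.0)], n_lanes=4) + # st.vgpr[lane][0] == f2i(1.0) for each of the 4 active lanes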
+ """ + emu_st = run_program_emu(instructions, n_lanes) + if USE_HW: + hw_st = run_program_hw(instructions, n_lanes) + diffs = compare_wave_states(emu_st, hw_st, n_lanes) + if diffs: + raise AssertionError(f"Emulator vs Hardware mismatch:\n" + "\n".join(diffs)) + return hw_st # Return hardware result when both match + return emu_st - def test_v_mul_lo_u32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 6), - v_mov_b32_e32(v[2], 7), - v_mul_lo_u32(v[1], v[1], v[2]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out, [42]) - def test_v_and_b32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 0xff0f), - v_mov_b32_e32(v[2], 0x0fff), - v_and_b32_e32(v[1], v[1], v[2]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out, [0x0f0f]) +class TestVDivScale(unittest.TestCase): + """Tests for V_DIV_SCALE_F32 VCC handling.""" - def test_v_or_b32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 0xf000), - v_mov_b32_e32(v[2], 0x000f), - v_or_b32_e32(v[1], v[1], v[2]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out, [0xf00f]) + def test_div_scale_f32_vcc_zero_single_lane(self): + """V_DIV_SCALE_F32 sets VCC=0 when no scaling needed.""" + instructions = [ + v_mov_b32_e32(v[0], 1.0), # uses inline constant + v_mov_b32_e32(v[1], 4.0), # uses inline constant + v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc, 0, "VCC should be 0 when no scaling needed") - def test_v_lshlrev_b32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 1), - v_lshlrev_b32_e32(v[1], 5, v[1]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out, [32]) + def test_div_scale_f32_vcc_zero_multiple_lanes(self): + """V_DIV_SCALE_F32 sets VCC=0 for all lanes when no scaling needed.""" + instructions = [ + v_mov_b32_e32(v[0], 1.0), + v_mov_b32_e32(v[1], 4.0), + v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]), + ] + st = run_program(instructions, n_lanes=4) + self.assertEqual(st.vcc & 0xf, 0, "VCC should be 0 for all lanes") - def test_v_lshrrev_b32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 128), - v_lshrrev_b32_e32(v[1], 3, v[1]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out, [16]) + def test_div_scale_f32_preserves_input(self): + """V_DIV_SCALE_F32 outputs S0 when no scaling needed.""" + instructions = [ + v_mov_b32_e32(v[0], 2.0), # numerator - use inline constant + v_mov_b32_e32(v[1], 4.0), # denominator + v_div_scale_f32(v[2], VCC, v[0], v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), 2.0, places=5) + + +class TestVCmpClass(unittest.TestCase): + """Tests for V_CMP_CLASS_F32 float classification.""" + + def test_cmp_class_quiet_nan(self): + """V_CMP_CLASS_F32 detects quiet NaN.""" + quiet_nan = 0x7fc00000 + instructions = [ + s_mov_b32(s[0], quiet_nan), # large int encodes as literal + v_mov_b32_e32(v[0], s[0]), # value to classify + v_mov_b32_e32(v[1], 0b0000000010), # bit 1 = quiet NaN (mask in VGPR for VOPC) + v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "Should detect quiet NaN") + + def test_cmp_class_signaling_nan(self): + """V_CMP_CLASS_F32 detects signaling NaN.""" + signal_nan = 0x7f800001 + instructions = [ + s_mov_b32(s[0], signal_nan), # large int encodes as literal + v_mov_b32_e32(v[0], s[0]), # value to classify + v_mov_b32_e32(v[1], 
0b0000000001), # bit 0 = signaling NaN + v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "Should detect signaling NaN") + + def test_cmp_class_quiet_nan_not_signaling(self): + """Quiet NaN does not match signaling NaN mask.""" + quiet_nan = 0x7fc00000 + instructions = [ + s_mov_b32(s[0], quiet_nan), # large int encodes as literal + v_mov_b32_e32(v[0], s[0]), # value to classify + v_mov_b32_e32(v[1], 0b0000000001), # bit 0 = signaling NaN only + v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "Quiet NaN should not match signaling mask") + + def test_cmp_class_signaling_nan_not_quiet(self): + """Signaling NaN does not match quiet NaN mask.""" + signal_nan = 0x7f800001 + instructions = [ + s_mov_b32(s[0], signal_nan), # large int encodes as literal + v_mov_b32_e32(v[0], s[0]), # value to classify + v_mov_b32_e32(v[1], 0b0000000010), # bit 1 = quiet NaN only + v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 0, "Signaling NaN should not match quiet mask") + + def test_cmp_class_positive_inf(self): + """V_CMP_CLASS_F32 detects +inf.""" + pos_inf = 0x7f800000 + instructions = [ + s_mov_b32(s[0], pos_inf), # large int encodes as literal + s_mov_b32(s[1], 0b1000000000), # bit 9 = +inf (512 is outside inline range) + v_mov_b32_e32(v[0], s[0]), # value to classify + v_mov_b32_e32(v[1], s[1]), # mask in VGPR + v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "Should detect +inf") + + def test_cmp_class_negative_inf(self): + """V_CMP_CLASS_F32 detects -inf.""" + neg_inf = 0xff800000 + instructions = [ + s_mov_b32(s[0], neg_inf), # large int encodes as literal + v_mov_b32_e32(v[0], s[0]), # value to classify + v_mov_b32_e32(v[1], 0b0000000100), # bit 2 = -inf + v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "Should detect -inf") + + def test_cmp_class_normal_positive(self): + """V_CMP_CLASS_F32 detects positive normal.""" + instructions = [ + v_mov_b32_e32(v[0], 1.0), # inline constant - value to classify + s_mov_b32(s[1], 0b0100000000), # bit 8 = positive normal (256 is outside inline range) + v_mov_b32_e32(v[1], s[1]), # mask in VGPR + v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "Should detect positive normal") + + def test_cmp_class_normal_negative(self): + """V_CMP_CLASS_F32 detects negative normal.""" + instructions = [ + v_mov_b32_e32(v[0], -1.0), # inline constant - value to classify + v_mov_b32_e32(v[1], 0b0000001000), # bit 3 = negative normal + v_cmp_class_f32_e32(v[0], v[1]), # VOPC: src0=value, vsrc1=mask, writes VCC + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vcc & 1, 1, "Should detect negative normal") + + +class TestBasicOps(unittest.TestCase): + """Basic instruction tests.""" def test_v_add_f32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(1.5)), - v_mov_b32_e32(v[2], i32(2.5)), - v_add_f32_e32(v[1], v[1], v[2]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(f32(out[0]), 4.0) + 
"""V_ADD_F32 adds two floats.""" + instructions = [ + v_mov_b32_e32(v[0], 1.0), # inline constant + v_mov_b32_e32(v[1], 2.0), # inline constant + v_add_f32_e32(v[2], v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), 3.0, places=5) def test_v_mul_f32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(3.0)), - v_mov_b32_e32(v[2], i32(4.0)), - v_mul_f32_e32(v[1], v[1], v[2]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(f32(out[0]), 12.0) + """V_MUL_F32 multiplies two floats.""" + instructions = [ + v_mov_b32_e32(v[0], 2.0), # inline constant + v_mov_b32_e32(v[1], 4.0), # inline constant + v_mul_f32_e32(v[2], v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), 8.0, places=5) - def test_v_max_f32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(3.0)), - v_mov_b32_e32(v[2], i32(5.0)), - v_max_f32_e32(v[1], v[1], v[2]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(f32(out[0]), 5.0) + def test_v_mov_b32(self): + """V_MOV_B32 moves a value.""" + instructions = [ + s_mov_b32(s[0], 42), + v_mov_b32_e32(v[0], s[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 42) - def test_v_min_f32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(3.0)), - v_mov_b32_e32(v[2], i32(5.0)), - v_min_f32_e32(v[1], v[1], v[2]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(f32(out[0]), 3.0) + def test_s_add_u32(self): + """S_ADD_U32 adds two scalar values.""" + instructions = [ + s_mov_b32(s[0], 100), + s_mov_b32(s[1], 200), + s_add_u32(s[2], s[0], s[1]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[2], 300) -class TestThreading(unittest.TestCase): - def test_thread_id(self): - """Each thread should get its own thread ID in v0.""" - kernel = make_store_kernel([v_mov_b32_e32(v[1], v[0])], store_vreg=1) - out = run_kernel(kernel, n_threads=4) - self.assertEqual(out, [0, 1, 2, 3]) + def test_s_add_u32_carry(self): + """S_ADD_U32 sets SCC on overflow.""" + instructions = [ + s_mov_b32(s[0], 64), # use inline constant for max + s_not_b32(s[0], s[0]), # s0 = ~64 = 0xffffffbf, close to max + s_mov_b32(s[1], 64), + s_add_u32(s[2], s[0], s[1]), # 0xffffffbf + 64 = 0xffffffff + s_mov_b32(s[3], 1), + s_add_u32(s[4], s[2], s[3]), # 0xffffffff + 1 = overflow + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.sgpr[4], 0) + self.assertEqual(st.scc, 1) - def test_thread_local_ops(self): - """Each thread computes tid * 10.""" - kernel = make_store_kernel([ - v_mov_b32_e32(v[2], 10), - v_mul_lo_u32(v[1], v[0], v[2]), - ]) - out = run_kernel(kernel, n_threads=4) - self.assertEqual(out, [0, 10, 20, 30]) + def test_v_alignbit_b32(self): + """V_ALIGNBIT_B32 extracts bits from concatenated sources.""" + instructions = [ + s_mov_b32(s[0], 0x12), # small values as inline constants + s_mov_b32(s[1], 0x34), + s_mov_b32(s[2], 4), # shift amount + v_mov_b32_e32(v[0], s[2]), + v_alignbit_b32(v[1], s[0], s[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + # {0x12, 0x34} >> 4 = 0x0000001200000034 >> 4 = 0x20000003 + expected = ((0x12 << 32) | 0x34) >> 4 + self.assertEqual(st.vgpr[0][1], expected & 0xffffffff) - def test_exec_mask(self): - """Test that exec mask controls which lanes execute.""" - kernel = b'' - kernel += s_load_b64(s[2:3], s[0:1], 0, soffset=NULL).to_bytes() - kernel += s_waitcnt(lgkmcnt=0).to_bytes() - kernel += v_mov_b32_e32(v[1], 
100).to_bytes() # default value - kernel += s_mov_b32(EXEC_LO, 0b0101).to_bytes() # only lanes 0 and 2 - kernel += v_mov_b32_e32(v[1], 42).to_bytes() # only for active lanes - kernel += s_mov_b32(EXEC_LO, 0xf).to_bytes() # restore all lanes - kernel += v_lshlrev_b32_e32(v[3], 2, v[0]).to_bytes() - kernel += global_store_b32(addr=v[3], data=v[1], saddr=s[2]).to_bytes() - kernel += s_endpgm().to_bytes() - out = run_kernel(kernel, n_threads=4) - self.assertEqual(out, [42, 100, 42, 100]) -class TestBranching(unittest.TestCase): - def test_s_branch(self): - """Test unconditional branch.""" - state = WaveState() - kernel = b'' - kernel += s_mov_b32(s[0], 1).to_bytes() - kernel += s_branch(1).to_bytes() # skip next instruction - kernel += s_mov_b32(s[0], 2).to_bytes() # should be skipped - kernel += s_mov_b32(s[1], 3).to_bytes() - kernel += s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.sgpr[0], 1) # not overwritten - self.assertEqual(state.sgpr[1], 3) +class TestMultiLane(unittest.TestCase): + """Tests for multi-lane execution.""" - def test_s_cbranch_scc0(self): - """Test conditional branch on SCC=0.""" - state = WaveState() - state.scc = 0 - kernel = b'' - kernel += s_mov_b32(s[0], 1).to_bytes() - kernel += s_cbranch_scc0(1).to_bytes() # branch if scc=0 - kernel += s_mov_b32(s[0], 2).to_bytes() # should be skipped - kernel += s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.sgpr[0], 1) + def test_v_mov_all_lanes(self): + """V_MOV_B32 sets all lanes to the same value.""" + instructions = [ + s_mov_b32(s[0], 42), + v_mov_b32_e32(v[0], s[0]), + ] + st = run_program(instructions, n_lanes=4) + for lane in range(4): + self.assertEqual(st.vgpr[lane][0], 42) - def test_s_cbranch_scc1(self): - """Test conditional branch on SCC=1.""" - state = WaveState() - state.scc = 1 - kernel = b'' - kernel += s_mov_b32(s[0], 1).to_bytes() - kernel += s_cbranch_scc1(1).to_bytes() # branch if scc=1 - kernel += s_mov_b32(s[0], 2).to_bytes() # should be skipped - kernel += s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.sgpr[0], 1) + def test_v_cmp_sets_vcc_bits(self): + """V_CMP_EQ sets VCC bits based on per-lane comparison.""" + instructions = [ + s_mov_b32(s[0], 5), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[0]), + v_cmp_eq_u32_e32(v[0], v[1]), # VOPC: src0, vsrc1 - writes VCC implicitly + ] + st = run_program(instructions, n_lanes=4) + self.assertEqual(st.vcc & 0xf, 0xf, "All lanes should match") - def test_unknown_sopp_opcode(self): - """Regression test: unknown SOPP opcodes should be ignored, not crash.""" - state = WaveState() - # Create a raw SOPP instruction with opcode 8 (undefined in our enum) - # SOPP format: bits[31:23] = 0b101111111, bits[22:16] = op, bits[15:0] = simm16 - unknown_sopp = (0b101111111 << 23) | (8 << 16) | 0 # op=8, simm16=0 - kernel = unknown_sopp.to_bytes(4, 'little') + s_endpgm().to_bytes() - prog = decode_program(kernel) - # Should not raise an exception - exec_wave(prog, state, bytearray(65536), 1) -class TestMemory(unittest.TestCase): - def test_global_load_store(self): - """Test global load followed by store.""" - # Create input buffer - input_buf = (ctypes.c_uint32 * 4)(10, 20, 30, 40) - input_ptr = ctypes.addressof(input_buf) - output_buf = (ctypes.c_uint32 * 4)(*[0]*4) - output_ptr = ctypes.addressof(output_buf) - args = (ctypes.c_uint64 * 
2)(output_ptr, input_ptr) - args_ptr = ctypes.addressof(args) +class TestLaneInstructions(unittest.TestCase): + """Tests for cross-lane instructions (readlane, writelane, readfirstlane). - # Kernel: load from input[tid], add 1, store to output[tid] - kernel = b'' - kernel += s_load_b64(s[2:3], s[0:1], soffset=NULL, offset=0).to_bytes() # output ptr - kernel += s_load_b64(s[4:5], s[0:1], soffset=NULL, offset=8).to_bytes() # input ptr - kernel += s_waitcnt(lgkmcnt=0).to_bytes() - kernel += v_lshlrev_b32_e32(v[2], 2, v[0]).to_bytes() # offset = tid * 4 - kernel += global_load_b32(vdst=v[1], addr=v[2], saddr=s[4]).to_bytes() - kernel += s_waitcnt(vmcnt=0).to_bytes() - kernel += v_add_nc_u32_e32(v[1], 1, v[1]).to_bytes() # add 1 - kernel += global_store_b32(addr=v[2], data=v[1], saddr=s[2]).to_bytes() - kernel += s_endpgm().to_bytes() + These are critical for wave-level reductions and WMMA matrix operations. - kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel) - kernel_ptr = ctypes.addressof(kernel_buf) - set_valid_mem_ranges({ - (input_ptr, ctypes.sizeof(input_buf)), - (output_ptr, ctypes.sizeof(output_buf)), - (args_ptr, ctypes.sizeof(args)), - (kernel_ptr, len(kernel)), - }) - result = run_asm(kernel_ptr, len(kernel), 1, 1, 1, 4, 1, 1, args_ptr) - self.assertEqual(result, 0) - self.assertEqual([output_buf[i] for i in range(4)], [11, 21, 31, 41]) + Note: V_READLANE_B32 and V_READFIRSTLANE_B32 write to SGPR, but the VOP1/VOP3 + encoding has a 'vdst' field. We use RawImm to encode SGPR indices directly. + """ -class TestFloatOps(unittest.TestCase): - def test_v_rcp_f32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(4.0)), - v_rcp_f32_e32(v[1], v[1]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertAlmostEqual(f32(out[0]), 0.25, places=5) + def _readlane(self, sdst_idx, vsrc, lane_idx): + """Helper to create V_READLANE_B32 with SGPR destination.""" + return VOP3(VOP3Op.V_READLANE_B32, vdst=RawImm(sdst_idx), src0=vsrc, src1=lane_idx) - def test_v_sqrt_f32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(16.0)), - v_sqrt_f32_e32(v[1], v[1]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertAlmostEqual(f32(out[0]), 4.0, places=5) + def _readfirstlane(self, sdst_idx, vsrc): + """Helper to create V_READFIRSTLANE_B32 with SGPR destination.""" + return VOP1(VOP1Op.V_READFIRSTLANE_B32, vdst=RawImm(sdst_idx), src0=vsrc) + + def test_v_readlane_b32_basic(self): + """V_READLANE_B32 reads a value from a specific lane's VGPR.""" + # v[255] = lane_id from prologue; compute v[0] = lane_id * 10 + instructions = [ + v_lshlrev_b32_e32(v[0], 1, v[255]), # v0 = lane_id * 2 + v_lshlrev_b32_e32(v[1], 3, v[255]), # v1 = lane_id * 8 + v_add_nc_u32_e32(v[0], v[0], v[1]), # v0 = lane_id * 10 + # Now read lane 2's value (should be 20) into s0 + self._readlane(0, v[0], 2), # s0 = v0 from lane 2 = 20 + v_mov_b32_e32(v[2], s[0]), # broadcast to all lanes + ] + st = run_program(instructions, n_lanes=4) + # All lanes should have the value 20 (lane 2's value) + for lane in range(4): + self.assertEqual(st.vgpr[lane][2], 20, f"Lane {lane}: expected 20, got {st.vgpr[lane][2]}") + + def test_v_readlane_b32_lane_0(self): + """V_READLANE_B32 reading from lane 0.""" + instructions = [ + v_lshlrev_b32_e32(v[0], 2, v[255]), # v0 = lane_id * 4 + v_add_nc_u32_e32(v[0], 100, v[0]), # v0 = 100 + lane_id * 4 + self._readlane(0, v[0], 0), # s0 = lane 0's v0 = 100 + v_mov_b32_e32(v[1], s[0]), + ] + st = run_program(instructions, n_lanes=4) + for lane in range(4): + 
self.assertEqual(st.vgpr[lane][1], 100) + + def test_v_readlane_b32_last_lane(self): + """V_READLANE_B32 reading from the last active lane (lane 3 in 4-lane test).""" + instructions = [ + v_lshlrev_b32_e32(v[0], 2, v[255]), # v0 = lane_id * 4 + v_add_nc_u32_e32(v[0], 100, v[0]), # v0 = 100 + lane_id * 4 + self._readlane(0, v[0], 3), # s0 = lane 3's v0 = 112 + v_mov_b32_e32(v[1], s[0]), + ] + st = run_program(instructions, n_lanes=4) + for lane in range(4): + self.assertEqual(st.vgpr[lane][1], 112) + + def test_v_readlane_b32_different_vgpr(self): + """V_READLANE_B32 reading from different VGPR indices. + + Regression test for bug where rd_lane was checked against VGPR values + instead of being used as an index (using 'in' operator on list instead + of checking if index is within bounds). + """ + instructions = [ + # Set up v[5] with per-lane values + v_lshlrev_b32_e32(v[5], 3, v[255]), # v5 = lane_id * 8 + v_add_nc_u32_e32(v[5], 50, v[5]), # v5 = 50 + lane_id * 8 + # Read lane 1's v[5] (should be 58) + self._readlane(0, v[5], 1), + v_mov_b32_e32(v[6], s[0]), + ] + st = run_program(instructions, n_lanes=4) + for lane in range(4): + self.assertEqual(st.vgpr[lane][6], 58, f"Lane {lane}: expected 58 from v[5] lane 1") + + def test_v_readfirstlane_b32_basic(self): + """V_READFIRSTLANE_B32 reads from the first active lane.""" + instructions = [ + v_lshlrev_b32_e32(v[0], 2, v[255]), # v0 = lane_id * 4 + v_add_nc_u32_e32(v[0], 1000, v[0]), # v0 = 1000 + lane_id * 4 + self._readfirstlane(0, v[0]), # s0 = first lane's v0 = 1000 + v_mov_b32_e32(v[1], s[0]), + ] + st = run_program(instructions, n_lanes=4) + for lane in range(4): + self.assertEqual(st.vgpr[lane][1], 1000) + + def test_v_readfirstlane_b32_different_vgpr(self): + """V_READFIRSTLANE_B32 reading from different VGPR index. + + Regression test for bug where src0_idx bounds check was incorrect. + """ + instructions = [ + v_lshlrev_b32_e32(v[7], 5, v[255]), # v7 = lane_id * 32 + v_add_nc_u32_e32(v[7], 200, v[7]), # v7 = 200 + lane_id * 32 + self._readfirstlane(0, v[7]), # s0 = first lane's v7 = 200 + v_mov_b32_e32(v[8], s[0]), + ] + st = run_program(instructions, n_lanes=4) + for lane in range(4): + self.assertEqual(st.vgpr[lane][8], 200) + + def test_v_writelane_b32_basic(self): + """V_WRITELANE_B32 writes a scalar to a specific lane's VGPR.""" + instructions = [ + v_mov_b32_e32(v[0], 0), # Initialize v0 = 0 for all lanes + s_mov_b32(s[0], 999), # Value to write + v_writelane_b32(v[0], s[0], 2), # Write 999 to lane 2's v0 + ] + st = run_program(instructions, n_lanes=4) + for lane in range(4): + if lane == 2: + self.assertEqual(st.vgpr[lane][0], 999, f"Lane 2 should have 999") + else: + self.assertEqual(st.vgpr[lane][0], 0, f"Lane {lane} should have 0") + + def test_v_writelane_then_readlane(self): + """V_WRITELANE followed by V_READLANE to verify round-trip.""" + instructions = [ + v_mov_b32_e32(v[0], 0), + s_mov_b32(s[0], 0xdeadbeef), + v_writelane_b32(v[0], s[0], 1), # Write to lane 1 + self._readlane(1, v[0], 1), # Read back from lane 1 into s1 + v_mov_b32_e32(v[1], s[1]), + ] + st = run_program(instructions, n_lanes=4) + for lane in range(4): + self.assertEqual(st.vgpr[lane][1], 0xdeadbeef) + + def test_v_readlane_for_reduction(self): + """Simulate a wave reduction using readlane - common pattern in WMMA/reductions. + + This pattern is used when reducing across lanes, e.g., for computing + the sum of all elements in a wave. 
+ """ + # Each lane computes lane_id + 1, then we sum lanes 0-3 using readlane + instructions = [ + v_add_nc_u32_e32(v[0], 1, v[255]), # v0 = lane_id + 1 (1, 2, 3, 4) + # Read all 4 lanes and sum in scalar registers + self._readlane(0, v[0], 0), # s0 = 1 + self._readlane(1, v[0], 1), # s1 = 2 + s_add_u32(s[0], s[0], s[1]), # s0 = 3 + self._readlane(1, v[0], 2), # s1 = 3 + s_add_u32(s[0], s[0], s[1]), # s0 = 6 + self._readlane(1, v[0], 3), # s1 = 4 + s_add_u32(s[0], s[0], s[1]), # s0 = 10 + v_mov_b32_e32(v[1], s[0]), # Broadcast sum to all lanes + ] + st = run_program(instructions, n_lanes=4) + for lane in range(4): + self.assertEqual(st.vgpr[lane][1], 10, f"Sum 1+2+3+4 should be 10") + + +class TestTrigonometry(unittest.TestCase): + """Tests for trigonometric instructions.""" + + def test_v_sin_f32_small(self): + """V_SIN_F32 computes sin for small values.""" + import math + # sin(1.0) ≈ 0.8414709848 + instructions = [ + v_mov_b32_e32(v[0], 1.0), + v_sin_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][1]) + expected = math.sin(1.0 * 2 * math.pi) # V_SIN_F32 expects input in cycles (0-1 = 0-2π) + self.assertAlmostEqual(result, expected, places=4) + + def test_v_sin_f32_quarter(self): + """V_SIN_F32 at 0.25 cycles = sin(π/2) = 1.0.""" + instructions = [ + s_mov_b32(s[0], f2i(0.25)), # 0.25 is not an inline constant, use f2i + v_mov_b32_e32(v[0], s[0]), + v_sin_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][1]) + self.assertAlmostEqual(result, 1.0, places=4) + + def test_v_sin_f32_large(self): + """V_SIN_F32 for large input value (132000.0).""" + import math + # This is the failing case: sin(132000.0) should be ≈ 0.294 + # V_SIN_F32 input is in cycles, so we need frac(132000.0) * 2π + instructions = [ + s_mov_b32(s[0], f2i(132000.0)), + v_mov_b32_e32(v[0], s[0]), + v_sin_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][1]) + # frac(132000.0) = 0, so sin(0) = 0... 
but actually V_SIN_F32 does its own frac internally + # The expected value is sin(frac(132000.0) * 2π) where frac is done in the instruction + # For 132000.0, the hardware computes frac(132000.0) ≈ 0.046875 (due to precision) + # sin(0.046875 * 2π) ≈ 0.294 + expected = math.sin(132000.0 * 2 * math.pi) + # Allow some tolerance due to precision differences + self.assertAlmostEqual(result, expected, places=2, msg=f"sin(132000) got {result}, expected ~{expected}") + + +class TestFMA(unittest.TestCase): + """Tests for FMA instructions - key for OCML sin argument reduction.""" + + def test_v_fma_f32_basic(self): + """V_FMA_F32: a*b+c basic case using inline constants only.""" + # Inline float constants: 0.5, -0.5, 1.0, -1.0, 2.0, -2.0, 4.0, -4.0 + instructions = [ + v_mov_b32_e32(v[0], 2.0), # inline constant + v_mov_b32_e32(v[1], 4.0), # inline constant + v_mov_b32_e32(v[2], 1.0), # inline constant + v_fma_f32(v[3], v[0], v[1], v[2]), # 2*4+1 = 9 + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][3]), 9.0, places=5) + + def test_v_fma_f32_negative(self): + """V_FMA_F32 with negative multiplier (used in sin reduction).""" + instructions = [ + v_mov_b32_e32(v[0], -2.0), # inline constant + v_mov_b32_e32(v[1], 4.0), # inline constant + v_mov_b32_e32(v[2], 1.0), # inline constant + v_fma_f32(v[3], v[0], v[1], v[2]), # -2*4+1 = -7 + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][3]), -7.0, places=5) + + def test_v_fmac_f32(self): + """V_FMAC_F32: d = d + a*b using inline constants.""" + instructions = [ + v_mov_b32_e32(v[0], 2.0), # inline constant + v_mov_b32_e32(v[1], 4.0), # inline constant + v_mov_b32_e32(v[2], 1.0), # inline constant + v_fmac_f32_e32(v[2], v[0], v[1]), # v2 = v2 + v0*v1 = 1 + 2*4 = 9 + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), 9.0, places=5) + + def test_v_fmaak_f32(self): + """V_FMAAK_F32: d = a * b + K using inline constants.""" + instructions = [ + v_mov_b32_e32(v[0], 2.0), # inline constant + v_mov_b32_e32(v[1], 4.0), # inline constant + v_fmaak_f32_e32(v[2], v[0], v[1], 0x3f800000), # v2 = v0 * v1 + 1.0 = 2*4+1 = 9 + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][2]), 9.0, places=5) + + def test_v_fma_f32_with_sgpr(self): + """V_FMA_F32: using SGPR for non-inline constant.""" + # Use SGPR to load 3.0 which is not an inline constant + instructions = [ + s_mov_b32(s[0], f2i(3.0)), # 3.0 via literal in SGPR + v_mov_b32_e32(v[0], 2.0), # inline constant + v_mov_b32_e32(v[1], s[0]), # 3.0 from SGPR + v_mov_b32_e32(v[2], 4.0), # inline constant + v_fma_f32(v[3], v[0], v[1], v[2]), # 2*3+4 = 10 + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][3]), 10.0, places=5) + + +class TestRounding(unittest.TestCase): + """Tests for rounding instructions - used in sin argument reduction.""" + + def test_v_rndne_f32_half_even(self): + """V_RNDNE_F32 rounds to nearest even.""" + instructions = [ + s_mov_b32(s[0], f2i(2.5)), + v_mov_b32_e32(v[0], s[0]), + v_rndne_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][1]), 2.0, places=5) # rounds to even + + def test_v_rndne_f32_half_odd(self): + """V_RNDNE_F32 rounds 3.5 to 4 (nearest even).""" + instructions = [ + s_mov_b32(s[0], f2i(3.5)), + v_mov_b32_e32(v[0], s[0]), + v_rndne_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][1]), 
4.0, places=5) + + def test_v_rndne_f32_large(self): + """V_RNDNE_F32 with large value (like sin reduction uses).""" + # sin(1e5) reduction: 1e5 * (1/2pi) ≈ 15915.49... + val = 100000.0 * 0.15915494309189535 # 1/(2*pi) + instructions = [ + s_mov_b32(s[0], f2i(val)), + v_mov_b32_e32(v[0], s[0]), + v_rndne_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + expected = round(val) # Python's round does banker's rounding + self.assertAlmostEqual(i2f(st.vgpr[0][1]), expected, places=0) def test_v_floor_f32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(3.7)), - v_floor_f32_e32(v[1], v[1]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(f32(out[0]), 3.0) + """V_FLOOR_F32 floors to integer.""" + instructions = [ + s_mov_b32(s[0], f2i(3.7)), + v_mov_b32_e32(v[0], s[0]), + v_floor_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][1]), 3.0, places=5) - def test_v_ceil_f32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(3.2)), - v_ceil_f32_e32(v[1], v[1]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(f32(out[0]), 4.0) + def test_v_trunc_f32(self): + """V_TRUNC_F32 truncates toward zero.""" + instructions = [ + s_mov_b32(s[0], f2i(-3.7)), + v_mov_b32_e32(v[0], s[0]), + v_trunc_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][1]), -3.0, places=5) + + def test_v_fract_f32(self): + """V_FRACT_F32 returns fractional part.""" + instructions = [ + s_mov_b32(s[0], f2i(3.75)), + v_mov_b32_e32(v[0], s[0]), + v_fract_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][1]), 0.75, places=5) + + def test_v_fract_f32_large(self): + """V_FRACT_F32 with large value - precision matters here.""" + instructions = [ + s_mov_b32(s[0], f2i(132000.25)), + v_mov_b32_e32(v[0], s[0]), + v_fract_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][1]) + # For large floats, fract precision degrades + self.assertGreaterEqual(result, 0.0) + self.assertLess(result, 1.0) + + +class TestConversion(unittest.TestCase): + """Tests for conversion instructions.""" + + def test_v_cvt_i32_f32_positive(self): + """V_CVT_I32_F32 converts float to signed int.""" + instructions = [ + s_mov_b32(s[0], f2i(42.7)), + v_mov_b32_e32(v[0], s[0]), + v_cvt_i32_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 42) + + def test_v_cvt_i32_f32_negative(self): + """V_CVT_I32_F32 converts negative float to signed int.""" + instructions = [ + s_mov_b32(s[0], f2i(-42.7)), + v_mov_b32_e32(v[0], s[0]), + v_cvt_i32_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + # Result is signed, stored as unsigned + self.assertEqual(st.vgpr[0][1] & 0xffffffff, (-42) & 0xffffffff) + + def test_v_cvt_i32_f32_large(self): + """V_CVT_I32_F32 with large float (used in sin for quadrant).""" + # sin reduction converts round(x * 1/2pi) to int for quadrant selection + instructions = [ + s_mov_b32(s[0], f2i(15915.0)), # ~1e5 / (2*pi) + v_mov_b32_e32(v[0], s[0]), + v_cvt_i32_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 15915) def test_v_cvt_f32_i32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 42), - v_cvt_f32_i32_e32(v[1], v[1]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(f32(out[0]), 42.0) + """V_CVT_F32_I32 converts 
signed int to float.""" + instructions = [ + s_mov_b32(s[0], 42), + v_mov_b32_e32(v[0], s[0]), + v_cvt_f32_i32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][1]), 42.0, places=5) - def test_v_cvt_i32_f32(self): - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(42.9)), - v_cvt_i32_f32_e32(v[1], v[1]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out[0], 42) + def test_v_cvt_f32_u32(self): + """V_CVT_F32_U32 converts unsigned int to float.""" + instructions = [ + s_mov_b32(s[0], 0xffffffff), # max u32 + v_mov_b32_e32(v[0], s[0]), + v_cvt_f32_u32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][1]), 4294967296.0, places=-5) -class TestVOP3(unittest.TestCase): - def test_v_fma_f32(self): - """Test fused multiply-add: a*b + c""" - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(2.0)), - v_mov_b32_e32(v[2], i32(3.0)), - v_mov_b32_e32(v[4], i32(4.0)), - v_fma_f32(v[1], v[1], v[2], v[4]), # 2*3+4 = 10 - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(f32(out[0]), 10.0) - def test_v_add3_u32(self): - """Test 3-operand add.""" - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 10), - v_mov_b32_e32(v[2], 20), - v_mov_b32_e32(v[4], 12), - v_add3_u32(v[1], v[1], v[2], v[4]), # 10+20+12 = 42 - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out[0], 42) +class TestBitManipulation(unittest.TestCase): + """Tests for bit manipulation - used in sin for quadrant selection.""" - def test_v_neg_modifier(self): - """Test VOP3 negation modifier.""" - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(5.0)), - v_mov_b32_e32(v[2], i32(3.0)), - # v_add_f32 with neg on src1: 5 + (-3) = 2 - v_add_f32_e64(v[1], v[1], v[2], neg=0b010), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(f32(out[0]), 2.0) + def test_v_and_b32(self): + """V_AND_B32 bitwise and.""" + instructions = [ + s_mov_b32(s[0], 0xff), + s_mov_b32(s[1], 0x0f), + v_mov_b32_e32(v[0], s[0]), + v_and_b32_e32(v[1], s[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 0x0f) - def test_v_ldexp_f32(self): - """Regression test: V_LDEXP_F32 used by exp().""" - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(1.5)), - v_mov_b32_e32(v[2], 3), # exponent - v_ldexp_f32(v[1], v[1], v[2]), # 1.5 * 2^3 = 12.0 - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(f32(out[0]), 12.0) + def test_v_and_b32_quadrant(self): + """V_AND_B32 for quadrant extraction (n & 3).""" + instructions = [ + s_mov_b32(s[0], 15915), # some large number + v_mov_b32_e32(v[0], s[0]), + v_and_b32_e32(v[1], 3, v[0]), # n & 3 for quadrant + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 15915 & 3) - def test_v_xad_u32(self): - """Regression test: V_XAD_U32 (xor-add) used by random number generation.""" - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 3), - v_mov_b32_e32(v[2], 4), - v_mov_b32_e32(v[4], 5), - v_xad_u32(v[1], v[1], v[2], v[4]), # (3^4)+5 = 7+5 = 12 - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out[0], 12) + def test_v_lshrrev_b32(self): + """V_LSHRREV_B32 logical shift right.""" + instructions = [ + s_mov_b32(s[0], 0xff00), + v_mov_b32_e32(v[0], s[0]), + v_lshrrev_b32_e32(v[1], 8, v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 0xff) - def test_v_lshl_or_b32(self): - """Regression test: V_LSHL_OR_B32 operand order is (s0 << s1) | s2, not 
(s0 << s2) | s1.""" - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 5), # s0 = value to shift - v_mov_b32_e32(v[2], 2), # s1 = shift amount - v_mov_b32_e32(v[4], 3), # s2 = value to OR - v_lshl_or_b32(v[1], v[1], v[2], v[4]), # (5 << 2) | 3 = 20 | 3 = 23 - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out[0], 23) + def test_v_lshlrev_b32(self): + """V_LSHLREV_B32 logical shift left.""" + instructions = [ + s_mov_b32(s[0], 0xff), + v_mov_b32_e32(v[0], s[0]), + v_lshlrev_b32_e32(v[1], 8, v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 0xff00) - def test_v_sqrt_f32_negative(self): - """Regression test: V_SQRT_F32 should return NaN for negative inputs, not 0.""" - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(-1.0)), - v_sqrt_f32_e32(v[1], v[1]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertTrue(math.isnan(f32(out[0]))) + def test_v_xor_b32(self): + """V_XOR_B32 bitwise xor (used in sin for sign).""" + instructions = [ + s_mov_b32(s[0], 0x80000000), # sign bit + s_mov_b32(s[1], f2i(1.0)), + v_mov_b32_e32(v[0], s[1]), + v_xor_b32_e32(v[1], s[0], v[0]), # flip sign + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][1]), -1.0, places=5) - def test_v_rsq_f32_negative(self): - """Regression test: V_RSQ_F32 should return NaN for negative inputs, not inf.""" - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i32(-1.0)), - v_rsq_f32_e32(v[1], v[1]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertTrue(math.isnan(f32(out[0]))) -class TestVOPD(unittest.TestCase): - def test_vopd_add_nc_u32(self): - """Test VOPD V_DUAL_ADD_NC_U32.""" - state = WaveState() - state.vgpr[0][1] = 100 - state.vgpr[0][2] = 50 - # vdsty = (vdsty_enc << 1) | ((vdstx & 1) ^ 1), so for vdstx=3 (odd), vdsty=4 requires VGPR(4) - kernel = VOPD(opx=VOPDOp.V_DUAL_MOV_B32, srcx0=v[1], vsrcx1=VGPR(0), vdstx=VGPR(3), - opy=VOPDOp.V_DUAL_ADD_NC_U32, srcy0=v[1], vsrcy1=VGPR(2), vdsty=VGPR(4)).to_bytes() - kernel += s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.vgpr[0][3], 100) # MOV result - self.assertEqual(state.vgpr[0][4], 150) # 100 + 50 +class TestOCMLSinSequence(unittest.TestCase): + """Test the specific instruction sequence used in OCML sin.""" - def test_vopd_lshlrev(self): - """Test VOPD V_DUAL_LSHLREV_B32.""" - state = WaveState() - state.vgpr[0][1] = 0x10 - state.vgpr[0][2] = 0 - # vdsty = (vdsty_enc << 1) | ((vdstx & 1) ^ 1), so for vdstx=3 (odd), vdsty=4 requires VGPR(4) - kernel = VOPD(opx=VOPDOp.V_DUAL_MOV_B32, srcx0=v[1], vsrcx1=VGPR(0), vdstx=VGPR(3), - opy=VOPDOp.V_DUAL_LSHLREV_B32, srcy0=4, vsrcy1=VGPR(1), vdsty=VGPR(4)).to_bytes() # V4 = V1 << 4 - kernel += s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.vgpr[0][3], 0x10) # MOV result - self.assertEqual(state.vgpr[0][4], 0x100) # 0x10 << 4 = 0x100 + def test_sin_reduction_step1_mul(self): + """First step: v12 = |x| * (1/2pi).""" + import math + one_over_2pi = 1.0 / (2.0 * math.pi) # 0x3e22f983 in hex + x = 100000.0 + instructions = [ + s_mov_b32(s[0], f2i(x)), + s_mov_b32(s[1], f2i(one_over_2pi)), + v_mov_b32_e32(v[0], s[0]), + v_mul_f32_e32(v[1], s[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][1]) + expected = x * one_over_2pi + self.assertAlmostEqual(result, expected, places=0) - def test_vopd_and(self): - """Test VOPD V_DUAL_AND_B32.""" - state = 
WaveState() - state.vgpr[0][1] = 0xff - state.vgpr[0][2] = 0x0f - # vdsty = (vdsty_enc << 1) | ((vdstx & 1) ^ 1), so for vdstx=3 (odd), vdsty=4 requires VGPR(4) - kernel = VOPD(opx=VOPDOp.V_DUAL_MOV_B32, srcx0=v[1], vsrcx1=VGPR(0), vdstx=VGPR(3), - opy=VOPDOp.V_DUAL_AND_B32, srcy0=v[1], vsrcy1=VGPR(2), vdsty=VGPR(4)).to_bytes() - kernel += s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.vgpr[0][3], 0xff) - self.assertEqual(state.vgpr[0][4], 0x0f) # 0xff & 0x0f = 0x0f + def test_sin_reduction_step2_round(self): + """Second step: round to nearest integer.""" + import math + one_over_2pi = 1.0 / (2.0 * math.pi) + x = 100000.0 + val = x * one_over_2pi # ~15915.49 + instructions = [ + s_mov_b32(s[0], f2i(val)), + v_mov_b32_e32(v[0], s[0]), + v_rndne_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][1]) + expected = round(val) + self.assertAlmostEqual(result, expected, places=0) - def test_vopd_parallel_read(self): - """Regression: VOPD must read all inputs before writing - Y op reads register that X op writes.""" - state = WaveState() - state.vgpr[0][4] = 0 - state.vgpr[0][7] = 5 # Y op reads v7 as vsrcy1, X op writes to v7 - # X: MOV v7, v0 (v0=0, so v7 becomes 0) - # Y: ADD v6, v4, v7 (should use original v7=5, not the overwritten 0) - # vdsty_enc=3 with vdstx=7 (odd) -> vdsty = (3 << 1) | (7&1)^1 = 6 | 0 = 6 - kernel = VOPD(opx=VOPDOp.V_DUAL_MOV_B32, srcx0=v[0], vsrcx1=VGPR(0), vdstx=VGPR(7), - opy=VOPDOp.V_DUAL_ADD_NC_U32, srcy0=v[4], vsrcy1=VGPR(7), vdsty=VGPR(6)).to_bytes() - kernel += s_endpgm().to_bytes() - prog = decode_program(kernel) - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.vgpr[0][7], 0) # X op: v7 = v0 = 0 - self.assertEqual(state.vgpr[0][6], 5) # Y op: v6 = v4 + v7 = 0 + 5 (original v7) + def test_sin_reduction_step3_fma(self): + """Third step: x - n * (pi/2) via FMA.""" + import math + # This is where precision matters - the FMA does: |x| + (-pi/2) * n + neg_half_pi = -math.pi / 2.0 # 0xbfc90fda + x = 100000.0 + n = 15915.0 + instructions = [ + s_mov_b32(s[0], f2i(neg_half_pi)), + s_mov_b32(s[1], f2i(n)), + s_mov_b32(s[2], f2i(x)), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + v_fma_f32(v[3], v[0], v[1], v[2]), # x + (-pi/2) * n + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][3]) + expected = x + neg_half_pi * n + # Allow some tolerance due to float precision + self.assertAlmostEqual(result, expected, places=2) -class TestDecoder(unittest.TestCase): - def test_vopd_literal_handling(self): - """Regression test: VOPD srcx0/srcy0 with literal (255) wasn't consuming the literal dword.""" - state = WaveState() - # Create VOPD with srcx0=255 (literal), followed by literal value 0x12345678 - vopd_bytes = VOPD(opx=8, srcx0=RawImm(255), vsrcx1=VGPR(0), vdstx=VGPR(1), # MOV: V1 = literal - opy=8, srcy0=RawImm(128), vsrcy1=VGPR(0), vdsty=VGPR(2)).to_bytes() # MOV: V2 = 0 - literal_bytes = (0x12345678).to_bytes(4, 'little') - kernel = vopd_bytes + literal_bytes + s_endpgm().to_bytes() - prog = decode_program(kernel) - # Should decode as 3 instructions: VOPD (with literal), then S_ENDPGM - # The literal should NOT be decoded as a separate instruction - self.assertEqual(len(prog), 2) # VOPD + S_ENDPGM - exec_wave(prog, state, bytearray(65536), 1) - self.assertEqual(state.vgpr[0][1], 0x12345678) + def test_sin_1e5_full_reduction(self): + """Full reduction sequence for 
sin(1e5).""" + import math + x = 100000.0 + one_over_2pi = 1.0 / (2.0 * math.pi) + neg_half_pi = -math.pi / 2.0 - def test_s_endpgm_stops_decode(self): - """Regression test: decoder should stop at S_ENDPGM, not read past into metadata.""" - # Create a kernel followed by garbage that looks like an invalid instruction - kernel = s_mov_b32(s[0], 42).to_bytes() + s_endpgm().to_bytes() - garbage = bytes([0xff] * 16) # garbage after kernel - prog = decode_program(kernel + garbage) - # Should only have 2 instructions (s_mov_b32 and s_endpgm) - self.assertEqual(len(prog), 2) + instructions = [ + # Load constants + s_mov_b32(s[0], f2i(x)), + s_mov_b32(s[1], f2i(one_over_2pi)), + s_mov_b32(s[2], f2i(neg_half_pi)), + # Step 1: v1 = x * (1/2pi) + v_mov_b32_e32(v[0], s[0]), + v_mul_f32_e32(v[1], s[1], v[0]), + # Step 2: v2 = round(v1) + v_rndne_f32_e32(v[2], v[1]), + # Step 3: v3 = x + (-pi/2) * round_val (FMA) + v_fma_f32(v[3], s[2], v[2], v[0]), + # Step 4: convert to int for quadrant + v_cvt_i32_f32_e32(v[4], v[2]), + # Step 5: quadrant = n & 3 + v_and_b32_e32(v[5], 3, v[4]), + ] + st = run_program(instructions, n_lanes=1) -class TestFloatConversion(unittest.TestCase): - """Unit tests for i32/i16/f32/f16 float conversion functions.""" + # Check intermediate values + mul_result = i2f(st.vgpr[0][1]) + round_result = i2f(st.vgpr[0][2]) + reduced = i2f(st.vgpr[0][3]) + quadrant = st.vgpr[0][5] - def test_i32_preserves_nan_sign(self): - """NaN sign bit should be preserved when converting float to int bits.""" - from extra.assembly.rdna3.emu import i32, f32 - # 0 * -inf produces a negative NaN - neg_nan = 0.0 * float('-inf') - bits = i32(neg_nan) - # Should have sign bit set (0xffc00000), not canonical positive NaN (0x7fc00000) - self.assertEqual(bits & 0x80000000, 0x80000000, f"Expected negative NaN, got 0x{bits:08x}") - self.assertTrue(math.isnan(f32(bits))) + # Verify results match expected + expected_mul = x * one_over_2pi + expected_round = round(expected_mul) + expected_reduced = x + neg_half_pi * expected_round + expected_quadrant = int(expected_round) & 3 - def test_i32_preserves_positive_nan(self): - """Positive NaN should remain positive.""" - from extra.assembly.rdna3.emu import i32, f32 - pos_nan = float('nan') - bits = i32(pos_nan) - # Standard Python NaN is positive (0x7fc00000) - self.assertEqual(bits & 0x80000000, 0, f"Expected positive NaN, got 0x{bits:08x}") - self.assertTrue(math.isnan(f32(bits))) + self.assertAlmostEqual(mul_result, expected_mul, places=0, msg=f"mul: got {mul_result}, expected {expected_mul}") + self.assertAlmostEqual(round_result, expected_round, places=0, msg=f"round: got {round_result}, expected {expected_round}") + self.assertEqual(quadrant, expected_quadrant, f"quadrant: got {quadrant}, expected {expected_quadrant}") - def test_i32_overflow_to_inf(self): - """Values too large for f32 should become inf.""" - from extra.assembly.rdna3.emu import i32, f32 - big = 2.0 ** 200 - self.assertEqual(i32(big), 0x7f800000) # +inf - self.assertEqual(i32(-big), 0xff800000) # -inf - def test_i32_inf(self): - """Infinity should be preserved.""" - from extra.assembly.rdna3.emu import i32 - self.assertEqual(i32(float('inf')), 0x7f800000) - self.assertEqual(i32(float('-inf')), 0xff800000) +class TestMad64(unittest.TestCase): + """Tests for V_MAD_U64_U32 - critical for OCML Payne-Hanek sin reduction.""" - def test_i32_normal_values(self): - """Normal float values should round-trip correctly (within f32 precision).""" - from extra.assembly.rdna3.emu import i32, f32 - # Use values 
exactly representable in float32 - for val in [0.0, 1.0, -1.0, 0.5, -0.5, 100.0, -100.0, 1e10]: - bits = i32(val) - self.assertAlmostEqual(f32(bits), val, places=5) + def test_v_mad_u64_u32_simple(self): + """V_MAD_U64_U32: D = S0 * S1 + S2 (64-bit result).""" + # 3 * 4 + 5 = 17 + instructions = [ + s_mov_b32(s[0], 3), + s_mov_b32(s[1], 4), + v_mov_b32_e32(v[2], 5), # S2 lo + v_mov_b32_e32(v[3], 0), # S2 hi + v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]), # result in v[4:5] + ] + st = run_program(instructions, n_lanes=1) + result_lo = st.vgpr[0][4] + result_hi = st.vgpr[0][5] + result = result_lo | (result_hi << 32) + self.assertEqual(result, 17) - def test_i16_overflow_to_inf(self): - """Values too large for f16 should become inf.""" - from extra.assembly.rdna3.emu import i16 - big = 100000.0 # way larger than f16 max (65504) - self.assertEqual(i16(big), 0x7c00) # +inf - self.assertEqual(i16(-big), 0xfc00) # -inf + def test_v_mad_u64_u32_large_mult(self): + """V_MAD_U64_U32 with large values that overflow 32 bits.""" + # 0x80000000 * 2 + 0 = 0x100000000 + instructions = [ + s_mov_b32(s[0], 0x80000000), + s_mov_b32(s[1], 2), + v_mov_b32_e32(v[2], 0), + v_mov_b32_e32(v[3], 0), + v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]), + ] + st = run_program(instructions, n_lanes=1) + result_lo = st.vgpr[0][4] + result_hi = st.vgpr[0][5] + result = result_lo | (result_hi << 32) + self.assertEqual(result, 0x100000000) - def test_i16_inf(self): - """Infinity should be preserved.""" - from extra.assembly.rdna3.emu import i16 - self.assertEqual(i16(float('inf')), 0x7c00) - self.assertEqual(i16(float('-inf')), 0xfc00) + def test_v_mad_u64_u32_with_add(self): + """V_MAD_U64_U32 with 64-bit addend.""" + # 1000 * 1000 + 0x100000000 = 1000000 + 0x100000000 = 0x1000F4240 + instructions = [ + s_mov_b32(s[0], 1000), + s_mov_b32(s[1], 1000), + v_mov_b32_e32(v[2], 0), # S2 lo + v_mov_b32_e32(v[3], 1), # S2 hi = 0x100000000 + v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]), + ] + st = run_program(instructions, n_lanes=1) + result_lo = st.vgpr[0][4] + result_hi = st.vgpr[0][5] + result = result_lo | (result_hi << 32) + expected = 1000 * 1000 + 0x100000000 + self.assertEqual(result, expected) - def test_fma_nan_sign_preserved(self): - """FMA producing NaN should preserve the correct sign bit.""" - from extra.assembly.rdna3.emu import i32, f32 - # 0 * (-inf) + 1.0 = NaN (from 0 * -inf) - a, b, c = 0.0, float('-inf'), 1.0 - result = i32(a * b + c) - # The NaN should be negative since 0 * -inf produces negative NaN - self.assertEqual(result & 0x80000000, 0x80000000, f"Expected negative NaN, got 0x{result:08x}") + def test_v_mad_u64_u32_max_values(self): + """V_MAD_U64_U32 with max u32 values.""" + # 0xFFFFFFFF * 0xFFFFFFFF + 0 = 0xFFFFFFFE00000001 + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), + s_mov_b32(s[1], 0xFFFFFFFF), + v_mov_b32_e32(v[2], 0), + v_mov_b32_e32(v[3], 0), + v_mad_u64_u32(v[4], SrcEnum.NULL, s[0], s[1], v[2]), + ] + st = run_program(instructions, n_lanes=1) + result_lo = st.vgpr[0][4] + result_hi = st.vgpr[0][5] + result = result_lo | (result_hi << 32) + expected = 0xFFFFFFFF * 0xFFFFFFFF + self.assertEqual(result, expected) -class TestMultiWave(unittest.TestCase): - def test_all_waves_execute(self): - """Regression test: all waves in a workgroup must execute, not just the first.""" - n_threads = 64 # 2 waves of 32 threads each - output = (ctypes.c_uint32 * n_threads)(*[0xdead] * n_threads) - output_ptr = ctypes.addressof(output) - args = (ctypes.c_uint64 * 1)(output_ptr) - args_ptr 
= ctypes.addressof(args) - # Simple kernel: store tid to output[tid] - kernel = b'' - kernel += s_load_b64(s[2:3], s[0:1], soffset=NULL, offset=0).to_bytes() - kernel += s_waitcnt(lgkmcnt=0).to_bytes() - kernel += v_lshlrev_b32_e32(v[1], 2, v[0]).to_bytes() # offset = tid * 4 - kernel += global_store_b32(addr=v[1], data=v[0], saddr=s[2]).to_bytes() - kernel += s_endpgm().to_bytes() +class TestClz(unittest.TestCase): + """Tests for V_CLZ_I32_U32 - count leading zeros, used in Payne-Hanek.""" - kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel) - kernel_ptr = ctypes.addressof(kernel_buf) - set_valid_mem_ranges({ - (output_ptr, ctypes.sizeof(output)), - (args_ptr, ctypes.sizeof(args)), - (kernel_ptr, len(kernel)), - }) - result = run_asm(kernel_ptr, len(kernel), 1, 1, 1, n_threads, 1, 1, args_ptr) - self.assertEqual(result, 0) - # All threads should have written their tid - for i in range(n_threads): - self.assertEqual(output[i], i, f"Thread {i} didn't execute") + def test_v_clz_i32_u32_zero(self): + """V_CLZ_I32_U32 of 0 returns -1 (all bits are 0).""" + instructions = [ + v_mov_b32_e32(v[0], 0), + v_clz_i32_u32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + # -1 as unsigned 32-bit + self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF) -class TestRegressions(unittest.TestCase): - """Regression tests for bugs fixed in the emulator.""" + def test_v_clz_i32_u32_one(self): + """V_CLZ_I32_U32 of 1 returns 31 (31 leading zeros).""" + instructions = [ + v_mov_b32_e32(v[0], 1), + v_clz_i32_u32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 31) - def test_v_fmac_f16(self): - """V_FMAC_F16: fused multiply-add for FP16. Regression for VOP2 op 54.""" - from extra.assembly.rdna3.emu import i16, f16 - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], i16(2.0)), # v1.lo = 2.0 (fp16) - v_mov_b32_e32(v[2], i16(3.0)), # v2.lo = 3.0 (fp16) - # v1 = v1 * v2 + v1 = 2.0 * 3.0 + 2.0 = 8.0 - VOP2(VOP2Op.V_FMAC_F16, v[1], v[1], v[2]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertAlmostEqual(f16(out[0] & 0xffff), 8.0, places=2) + def test_v_clz_i32_u32_msb_set(self): + """V_CLZ_I32_U32 of 0x80000000 returns 0 (no leading zeros).""" + instructions = [ + s_mov_b32(s[0], 0x80000000), + v_mov_b32_e32(v[0], s[0]), + v_clz_i32_u32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 0) - def test_v_cvt_f64_f32(self): - """V_CVT_F64_F32: convert float32 to float64. 
Regression for VOP1 op 16.""" - kernel = b'' - kernel += s_load_b64(s[2:3], s[0:1], soffset=NULL, offset=0).to_bytes() - kernel += s_waitcnt(lgkmcnt=0).to_bytes() - kernel += v_mov_b32_e32(v[1], i32(3.14159)).to_bytes() - kernel += VOP1(VOP1Op.V_CVT_F64_F32, v[4], v[1]).to_bytes() # v4:v5 = f64(v1) - kernel += v_lshlrev_b32_e32(v[3], 3, v[0]).to_bytes() # offset = tid * 8 - kernel += global_store_b64(addr=v[3], data=v[4], saddr=s[2]).to_bytes() - kernel += s_endpgm().to_bytes() - output = (ctypes.c_double * 1)(0.0) - output_ptr = ctypes.addressof(output) - args = (ctypes.c_uint64 * 1)(output_ptr) - args_ptr = ctypes.addressof(args) - kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel) - kernel_ptr = ctypes.addressof(kernel_buf) - set_valid_mem_ranges({(output_ptr, 8), (args_ptr, 8), (kernel_ptr, len(kernel))}) - run_asm(kernel_ptr, len(kernel), 1, 1, 1, 1, 1, 1, args_ptr) - self.assertAlmostEqual(output[0], 3.14159, places=4) + def test_v_clz_i32_u32_half(self): + """V_CLZ_I32_U32 of 0x8000 (bit 15) returns 16.""" + instructions = [ + s_mov_b32(s[0], 0x8000), + v_mov_b32_e32(v[0], s[0]), + v_clz_i32_u32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 16) - def test_v_add_f64(self): - """V_ADD_F64: add two float64 values. Regression for VOP3 op 807.""" - from extra.assembly.rdna3.emu import i64_parts - kernel = b'' - kernel += s_load_b64(s[2:3], s[0:1], soffset=NULL, offset=0).to_bytes() - kernel += s_waitcnt(lgkmcnt=0).to_bytes() - # Load 1.5 into v1:v2 - lo, hi = i64_parts(1.5) - kernel += v_mov_b32_e32(v[1], lo).to_bytes() - kernel += v_mov_b32_e32(v[2], hi).to_bytes() - # Load 2.5 into v3:v4 - lo, hi = i64_parts(2.5) - kernel += v_mov_b32_e32(v[3], lo).to_bytes() - kernel += v_mov_b32_e32(v[4], hi).to_bytes() - # v5:v6 = v1:v2 + v3:v4 = 1.5 + 2.5 = 4.0 - kernel += VOP3(VOP3Op.V_ADD_F64, v[5], v[1], v[3]).to_bytes() - kernel += v_lshlrev_b32_e32(v[7], 3, v[0]).to_bytes() - kernel += global_store_b64(addr=v[7], data=v[5], saddr=s[2]).to_bytes() - kernel += s_endpgm().to_bytes() - output = (ctypes.c_double * 1)(0.0) - output_ptr = ctypes.addressof(output) - args = (ctypes.c_uint64 * 1)(output_ptr) - args_ptr = ctypes.addressof(args) - kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel) - kernel_ptr = ctypes.addressof(kernel_buf) - set_valid_mem_ranges({(output_ptr, 8), (args_ptr, 8), (kernel_ptr, len(kernel))}) - run_asm(kernel_ptr, len(kernel), 1, 1, 1, 1, 1, 1, args_ptr) - self.assertAlmostEqual(output[0], 4.0, places=10) + def test_v_clz_i32_u32_all_ones(self): + """V_CLZ_I32_U32 of 0xFFFFFFFF returns 0.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), + v_mov_b32_e32(v[0], s[0]), + v_clz_i32_u32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 0) - def test_flat_load_d16_hi_b16(self): - """FLAT_LOAD_D16_HI_B16: load 16-bit to high half. 
Regression for FLAT op 35.""" - from extra.assembly.rdna3.emu import i16 - # Create a buffer with test data - src_data = (ctypes.c_uint16 * 1)(0x1234) - src_ptr = ctypes.addressof(src_data) - output = (ctypes.c_uint32 * 1)(0xABCD0000) # preset low bits - output_ptr = ctypes.addressof(output) - args = (ctypes.c_uint64 * 2)(output_ptr, src_ptr) - args_ptr = ctypes.addressof(args) - kernel = b'' - kernel += s_load_b128(s[0:3], s[0:1], soffset=NULL, offset=0).to_bytes() - kernel += s_waitcnt(lgkmcnt=0).to_bytes() - kernel += v_mov_b32_e32(v[1], 0xDEAD).to_bytes() # initial value with low bits set - kernel += v_mov_b32_e32(v[2], 0).to_bytes() # offset = 0 - kernel += FLAT(FLATOp.FLAT_LOAD_D16_HI_B16, v[1], v[2], saddr=s[2], offset=0).to_bytes() - kernel += s_waitcnt(vmcnt=0).to_bytes() - kernel += v_lshlrev_b32_e32(v[3], 2, v[0]).to_bytes() - kernel += global_store_b32(addr=v[3], data=v[1], saddr=s[0]).to_bytes() - kernel += s_endpgm().to_bytes() +class TestCtz(unittest.TestCase): + """Tests for V_CTZ_I32_B32 - count trailing zeros.""" - kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel) - kernel_ptr = ctypes.addressof(kernel_buf) - set_valid_mem_ranges({(output_ptr, 4), (src_ptr, 2), (args_ptr, 16), (kernel_ptr, len(kernel))}) - run_asm(kernel_ptr, len(kernel), 1, 1, 1, 1, 1, 1, args_ptr) - # High 16 bits should be 0x1234, low 16 bits preserved as 0xDEAD - self.assertEqual(output[0], 0x1234DEAD) + def test_v_ctz_i32_b32_zero(self): + """V_CTZ_I32_B32 of 0 returns -1 (all bits are 0).""" + instructions = [ + v_mov_b32_e32(v[0], 0), + v_ctz_i32_b32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 0xFFFFFFFF) - def test_v_mad_u16(self): - """V_MAD_U16: multiply-add unsigned 16-bit. Regression for VOP3 op 577.""" - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 10), # a = 10 - v_mov_b32_e32(v[2], 20), # b = 20 - v_mov_b32_e32(v[4], 5), # c = 5 - VOP3(VOP3Op.V_MAD_U16, v[1], v[1], v[2], v[4]), # v1 = 10*20+5 = 205 - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out[0] & 0xffff, 205) + def test_v_ctz_i32_b32_one(self): + """V_CTZ_I32_B32 of 1 returns 0 (no trailing zeros).""" + instructions = [ + v_mov_b32_e32(v[0], 1), + v_ctz_i32_b32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 0) - def test_v_lshrrev_b16(self): - """V_LSHRREV_B16: logical shift right 16-bit. 
Regression for VOP3 op 825.""" - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 0x8000), # value to shift - v_mov_b32_e32(v[2], 4), # shift amount - VOP3(VOP3Op.V_LSHRREV_B16, v[1], v[2], v[1]), # v1 = 0x8000 >> 4 = 0x0800 - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out[0] & 0xffff, 0x0800) + def test_v_ctz_i32_b32_msb_set(self): + """V_CTZ_I32_B32 of 0x80000000 returns 31.""" + instructions = [ + s_mov_b32(s[0], 0x80000000), + v_mov_b32_e32(v[0], s[0]), + v_ctz_i32_b32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 31) + + def test_v_ctz_i32_b32_half(self): + """V_CTZ_I32_B32 of 0x8000 (bit 15) returns 15.""" + instructions = [ + s_mov_b32(s[0], 0x8000), + v_mov_b32_e32(v[0], s[0]), + v_ctz_i32_b32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 15) + + def test_v_ctz_i32_b32_all_ones(self): + """V_CTZ_I32_B32 of 0xFFFFFFFF returns 0.""" + instructions = [ + s_mov_b32(s[0], 0xFFFFFFFF), + v_mov_b32_e32(v[0], s[0]), + v_ctz_i32_b32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 0) + + +class TestDivision(unittest.TestCase): + """Tests for division instructions - V_RCP, V_DIV_SCALE, V_DIV_FMAS, V_DIV_FIXUP.""" + + def test_v_rcp_f32_normal(self): + """V_RCP_F32 of 2.0 returns 0.5.""" + instructions = [ + v_mov_b32_e32(v[0], 2.0), + v_rcp_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][1]), 0.5, places=5) + + def test_v_rcp_f32_inf(self): + """V_RCP_F32 of +inf returns 0.""" + instructions = [ + s_mov_b32(s[0], 0x7f800000), # +inf + v_mov_b32_e32(v[0], s[0]), + v_rcp_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(i2f(st.vgpr[0][1]), 0.0) + + def test_v_rcp_f32_neg_inf(self): + """V_RCP_F32 of -inf returns -0.""" + instructions = [ + s_mov_b32(s[0], 0xff800000), # -inf + v_mov_b32_e32(v[0], s[0]), + v_rcp_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = i2f(st.vgpr[0][1]) + self.assertEqual(result, 0.0) + # Check it's negative zero + self.assertEqual(st.vgpr[0][1], 0x80000000) + + def test_v_rcp_f32_zero(self): + """V_RCP_F32 of 0 returns +inf.""" + instructions = [ + v_mov_b32_e32(v[0], 0), + v_rcp_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + import math + self.assertTrue(math.isinf(i2f(st.vgpr[0][1]))) + + def test_v_div_fixup_f32_normal(self): + """V_DIV_FIXUP_F32 normal division 1.0/2.0.""" + # S0 = approximation (from rcp * scale), S1 = denominator, S2 = numerator + instructions = [ + s_mov_b32(s[0], f2i(0.5)), # approximation + s_mov_b32(s[1], f2i(2.0)), # denominator + s_mov_b32(s[2], f2i(1.0)), # numerator + v_mov_b32_e32(v[0], s[0]), + v_div_fixup_f32(v[1], v[0], s[1], s[2]), + ] + st = run_program(instructions, n_lanes=1) + self.assertAlmostEqual(i2f(st.vgpr[0][1]), 0.5, places=5) + + def test_v_div_fixup_f32_one_div_inf(self): + """V_DIV_FIXUP_F32: 1.0 / +inf = 0.""" + # For x/inf: S0=approx(~0), S1=inf, S2=x + instructions = [ + s_mov_b32(s[0], 0), # approximation (rcp of inf = 0) + s_mov_b32(s[1], 0x7f800000), # denominator = +inf + s_mov_b32(s[2], f2i(1.0)), # numerator = 1.0 + v_mov_b32_e32(v[0], s[0]), + v_div_fixup_f32(v[1], v[0], s[1], s[2]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(i2f(st.vgpr[0][1]), 0.0) + + def test_v_div_fixup_f32_one_div_neg_inf(self): + """V_DIV_FIXUP_F32: 1.0 / -inf = -0.""" + instructions = [ 
+      s_mov_b32(s[0], 0x80000000),  # approximation (rcp of -inf = -0)
+      s_mov_b32(s[1], 0xff800000),  # denominator = -inf
+      s_mov_b32(s[2], f2i(1.0)),    # numerator = 1.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertEqual(st.vgpr[0][1], 0x80000000)  # -0.0
+
+  def test_v_div_fixup_f32_inf_div_inf(self):
+    """V_DIV_FIXUP_F32: inf / inf = NaN."""
+    import math
+    instructions = [
+      s_mov_b32(s[0], 0),           # approximation
+      s_mov_b32(s[1], 0x7f800000),  # denominator = +inf
+      s_mov_b32(s[2], 0x7f800000),  # numerator = +inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][1])))
+
+  def test_v_div_fixup_f32_zero_div_zero(self):
+    """V_DIV_FIXUP_F32: 0 / 0 = NaN."""
+    import math
+    instructions = [
+      s_mov_b32(s[0], 0),  # approximation
+      s_mov_b32(s[1], 0),  # denominator = 0
+      s_mov_b32(s[2], 0),  # numerator = 0
+      v_mov_b32_e32(v[0], s[0]),
+      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][1])))
+
+  def test_v_div_fixup_f32_x_div_zero(self):
+    """V_DIV_FIXUP_F32: 1.0 / 0 = +inf."""
+    import math
+    instructions = [
+      s_mov_b32(s[0], 0x7f800000),  # approximation (rcp of 0 = inf)
+      s_mov_b32(s[1], 0),           # denominator = 0
+      s_mov_b32(s[2], f2i(1.0)),    # numerator = 1.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][1])
+    self.assertTrue(math.isinf(result) and result > 0)
+
+  def test_v_div_fixup_f32_neg_x_div_zero(self):
+    """V_DIV_FIXUP_F32: -1.0 / 0 = -inf."""
+    import math
+    instructions = [
+      s_mov_b32(s[0], 0xff800000),  # approximation (rcp of 0 = inf, with sign)
+      s_mov_b32(s[1], 0),           # denominator = 0
+      s_mov_b32(s[2], f2i(-1.0)),   # numerator = -1.0
+      v_mov_b32_e32(v[0], s[0]),
+      v_div_fixup_f32(v[1], v[0], s[1], s[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][1])
+    self.assertTrue(math.isinf(result) and result < 0)
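Taken together, the V_DIV_FIXUP_F32 tests above pin down a small special-case table. The following is a rough pure-Python model of exactly what these tests assert, reconstructed from the tests themselves rather than from the emulator's implementation (the real instruction also cooperates with V_DIV_SCALE/V_DIV_FMAS for denormal scaling, which this sketch ignores):

```python
import math

def div_fixup_f32_model(approx: float, den: float, num: float) -> float:
  """Sketch of V_DIV_FIXUP_F32 special cases, as exercised by the tests above."""
  sign = math.copysign(1.0, num) * math.copysign(1.0, den)
  if math.isnan(num) or math.isnan(den): return math.nan
  if math.isinf(num) and math.isinf(den): return math.nan  # inf / inf
  if num == 0.0 and den == 0.0: return math.nan            # 0 / 0
  if math.isinf(den): return math.copysign(0.0, sign)      # x / inf -> +-0
  if den == 0.0: return math.copysign(math.inf, sign)      # x / 0 -> +-inf
  return approx  # normal case: keep the scaled approximation

assert div_fixup_f32_model(0.5, 2.0, 1.0) == 0.5
assert math.isnan(div_fixup_f32_model(0.0, math.inf, math.inf))
assert div_fixup_f32_model(math.inf, 0.0, 1.0) == math.inf
```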
+
+
+class TestSpecialValues(unittest.TestCase):
+  """Tests for special float values - inf, nan, zero handling."""
+
+  def test_v_mul_f32_zero_times_inf(self):
+    """V_MUL_F32: 0 * inf = NaN."""
+    import math
+    instructions = [
+      v_mov_b32_e32(v[0], 0),
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      v_mov_b32_e32(v[1], s[0]),
+      v_mul_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])))
+
+  def test_v_add_f32_inf_minus_inf(self):
+    """V_ADD_F32: inf + (-inf) = NaN."""
+    import math
+    instructions = [
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      s_mov_b32(s[1], 0xff800000),  # -inf
+      v_mov_b32_e32(v[0], s[0]),
+      v_mov_b32_e32(v[1], s[1]),
+      v_add_f32_e32(v[2], v[0], v[1]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    self.assertTrue(math.isnan(i2f(st.vgpr[0][2])))
+
+  def test_v_fma_f32_with_inf(self):
+    """V_FMA_F32: 1.0 * inf + 0 = inf."""
+    import math
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),
+      s_mov_b32(s[0], 0x7f800000),  # +inf
+      v_mov_b32_e32(v[1], s[0]),
+      v_mov_b32_e32(v[2], 0),
+      v_fma_f32(v[3], v[0], v[1], v[2]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = i2f(st.vgpr[0][3])
+    self.assertTrue(math.isinf(result) and result > 0)
+
+  def test_v_exp_f32_large_negative(self):
+    """V_EXP_F32 of -100.0 returns 2^-100, a very small but nonzero number."""
+    instructions = [
+      s_mov_b32(s[0], f2i(-100.0)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_exp_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # V_EXP_F32 computes 2^x, so 2^-100 is ~7.9e-31 (very small but not 0)
+    result = i2f(st.vgpr[0][1])
+    self.assertLess(result, 1e-20)  # Just verify it's very small
+
+  def test_v_exp_f32_large_positive(self):
+    """V_EXP_F32 of 100.0 returns 2^100, a very large number."""
+    instructions = [
+      s_mov_b32(s[0], f2i(100.0)),
+      v_mov_b32_e32(v[0], s[0]),
+      v_exp_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    # V_EXP_F32 computes 2^x, so 2^100 is ~1.27e30 (very large)
+    result = i2f(st.vgpr[0][1])
+    self.assertGreater(result, 1e20)  # Just verify it's very large
+
+
+class TestF16Conversions(unittest.TestCase):
+  """Tests for f16 conversion and packing instructions."""
+
+  def test_v_cvt_f16_f32_basic(self):
+    """V_CVT_F16_F32 converts f32 to f16 in low 16 bits."""
+    from extra.assembly.rdna3.pcode import _f16
+    instructions = [
+      v_mov_b32_e32(v[0], 1.0),  # f32 1.0 = 0x3f800000
+      v_cvt_f16_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1]
+    # f16 1.0 = 0x3c00, should be in low 16 bits
+    lo_bits = result & 0xffff
+    self.assertEqual(lo_bits, 0x3c00, f"Expected 0x3c00, got 0x{lo_bits:04x}")
+
+  def test_v_cvt_f16_f32_negative(self):
+    """V_CVT_F16_F32 converts negative f32 to f16."""
+    from extra.assembly.rdna3.pcode import _f16
+    instructions = [
+      v_mov_b32_e32(v[0], -2.0),  # f32 -2.0 = 0xc0000000
+      v_cvt_f16_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1]
+    lo_bits = result & 0xffff
+    # f16 -2.0 = 0xc000
+    self.assertEqual(lo_bits, 0xc000, f"Expected 0xc000, got 0x{lo_bits:04x}")
+
+  def test_v_cvt_f16_f32_small(self):
+    """V_CVT_F16_F32 converts small f32 value."""
+    from extra.assembly.rdna3.pcode import _f16, f32_to_f16
+    instructions = [
+      v_mov_b32_e32(v[0], 0.5),
+      v_cvt_f16_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1]
+    lo_bits = result & 0xffff
+    expected = f32_to_f16(0.5)  # Should be 0x3800
+    self.assertEqual(lo_bits, expected, f"Expected 0x{expected:04x}, got 0x{lo_bits:04x}")
+
+  def test_v_cvt_f16_f32_preserves_high_bits(self):
+    """V_CVT_F16_F32 preserves high 16 bits of destination.
+
+    Hardware verified: V_CVT_F16_F32 only writes to the low 16 bits of the
+    destination register, preserving the high 16 bits. This is important for
+    the common pattern of converting two f32 values and packing them.
+    """
+    instructions = [
+      s_mov_b32(s[0], 0xdead0000),  # Pre-fill with garbage in high bits
+      v_mov_b32_e32(v[1], s[0]),
+      v_mov_b32_e32(v[0], 1.0),
+      v_cvt_f16_f32_e32(v[1], v[0]),
+    ]
+    st = run_program(instructions, n_lanes=1)
+    result = st.vgpr[0][1]
+    hi_bits = (result >> 16) & 0xffff
+    lo_bits = result & 0xffff
+    self.assertEqual(lo_bits, 0x3c00, f"Low bits should be 0x3c00, got 0x{lo_bits:04x}")
+    self.assertEqual(hi_bits, 0xdead, f"High bits should be preserved as 0xdead, got 0x{hi_bits:04x}")
+
+  def test_v_cvt_f16_f32_same_src_dst_preserves_high_bits(self):
+    """V_CVT_F16_F32 with same src/dst preserves high bits of source.
+
+    Regression test: When converting v0 in-place (v_cvt_f16_f32 v0, v0),
+    the high 16 bits of the original f32 value are preserved in the result.
+ For f32 1.0 (0x3f800000), the result should be 0x3f803c00: + - Low 16 bits: 0x3c00 (f16 1.0) + - High 16 bits: 0x3f80 (preserved from original f32) + """ + instructions = [ + v_mov_b32_e32(v[0], 1.0), # v0 = 0x3f800000 + v_cvt_f16_f32_e32(v[0], v[0]), # convert v0 in-place + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][0] + # Hardware preserves high bits: 0x3f800000 -> 0x3f803c00 + self.assertEqual(result, 0x3f803c00, f"Expected 0x3f803c00, got 0x{result:08x}") + + def test_v_cvt_f16_f32_reads_full_32bit_source(self): + """V_CVT_F16_F32 must read full 32-bit f32 source, not just low 16 bits. + + Regression test for a bug where V_CVT_F16_F32 was incorrectly treated as having + a 16-bit source because '_F16' is in the instruction name. The CVT naming convention + is V_CVT_DST_SRC, so V_CVT_F16_F32 has a 32-bit f32 source and 16-bit f16 destination. + + The bug caused the emulator to only read the low 16 bits of the source register, + which would produce wrong results when the significant bits of the f32 value are + in the upper bits (as they are for most f32 values > 1.0 or < -1.0). + """ + from extra.assembly.rdna3.pcode import _f16 + # Use f32 value 1.5 = 0x3fc00000. If only low 16 bits (0x0000) are read, result is wrong. + # Correct f16 result: 0x3e00 (1.5 in half precision) + instructions = [ + s_mov_b32(s[0], 0x3fc00000), # f32 1.5 + v_mov_b32_e32(v[0], s[0]), + v_cvt_f16_f32_e32(v[1], v[0]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] + lo_bits = result & 0xffff + # f16(1.5) = 0x3e00 + self.assertEqual(lo_bits, 0x3e00, f"Expected f16(1.5)=0x3e00, got 0x{lo_bits:04x} ({_f16(lo_bits)})") + + def test_v_cvt_f16_f32_then_pack_for_wmma(self): + """Regression test: f32->f16 conversion followed by pack for WMMA input. + + This sequence is used in fused fp16 GEMM kernels where f32 data is loaded, + converted to f16, packed into pairs, and fed to WMMA instructions. + + The bug was: V_CVT_F16_F32 was treated as having 16-bit source (because '_F16' + is in the name), causing it to read only low 16 bits of the f32 input. + This resulted in WMMA receiving zero inputs and producing zero outputs. 
+ """ + from extra.assembly.rdna3.pcode import _f16 + # Simulate loading two f32 values and converting/packing for WMMA + # f32 1.5 = 0x3fc00000, f32 2.5 = 0x40200000 + # After CVT: f16 1.5 = 0x3e00, f16 2.5 = 0x4100 + # After PACK: 0x41003e00 (hi=2.5, lo=1.5) + instructions = [ + s_mov_b32(s[0], 0x3fc00000), # f32 1.5 + s_mov_b32(s[1], 0x40200000), # f32 2.5 + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_cvt_f16_f32_e32(v[2], v[0]), # v2 = f16(1.5) = 0x3e00 + v_cvt_f16_f32_e32(v[3], v[1]), # v3 = f16(2.5) = 0x4100 + v_pack_b32_f16(v[4], v[2], v[3]), # v4 = pack(v2, v3) = 0x41003e00 + ] + st = run_program(instructions, n_lanes=1) + + # Check intermediate CVT results + v2_lo = st.vgpr[0][2] & 0xffff + v3_lo = st.vgpr[0][3] & 0xffff + self.assertEqual(v2_lo, 0x3e00, f"v2 should be f16(1.5)=0x3e00, got 0x{v2_lo:04x} ({_f16(v2_lo)})") + self.assertEqual(v3_lo, 0x4100, f"v3 should be f16(2.5)=0x4100, got 0x{v3_lo:04x} ({_f16(v3_lo)})") + + # Check packed result + result = st.vgpr[0][4] + self.assertEqual(result, 0x41003e00, f"Expected packed 0x41003e00, got 0x{result:08x}") + + def test_v_pack_b32_f16_basic(self): + """V_PACK_B32_F16 packs two f16 values into one 32-bit register.""" + from extra.assembly.rdna3.pcode import _f16 + instructions = [ + # First convert two f32 values to f16 + v_mov_b32_e32(v[0], 1.0), # Will become f16 0x3c00 + v_mov_b32_e32(v[2], -2.0), # Will become f16 0xc000 + v_cvt_f16_f32_e32(v[1], v[0]), # v1 low = 0x3c00 + v_cvt_f16_f32_e32(v[3], v[2]), # v3 low = 0xc000 + # Now pack them: v4 = (v3.f16 << 16) | v1.f16 + v_pack_b32_f16(v[4], v[1], v[3]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][4] + lo_bits = result & 0xffff + hi_bits = (result >> 16) & 0xffff + # Expected: lo=0x3c00 (1.0), hi=0xc000 (-2.0) + self.assertEqual(lo_bits, 0x3c00, f"Lo should be 0x3c00 (1.0), got 0x{lo_bits:04x} ({_f16(lo_bits)})") + self.assertEqual(hi_bits, 0xc000, f"Hi should be 0xc000 (-2.0), got 0x{hi_bits:04x} ({_f16(hi_bits)})") + + def test_v_pack_b32_f16_both_positive(self): + """V_PACK_B32_F16 packs two positive f16 values.""" + from extra.assembly.rdna3.pcode import _f16 + instructions = [ + v_mov_b32_e32(v[0], 0.5), # f16 0x3800 + v_mov_b32_e32(v[2], 2.0), # f16 0x4000 + v_cvt_f16_f32_e32(v[1], v[0]), + v_cvt_f16_f32_e32(v[3], v[2]), + v_pack_b32_f16(v[4], v[1], v[3]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][4] + lo_bits = result & 0xffff + hi_bits = (result >> 16) & 0xffff + self.assertEqual(lo_bits, 0x3800, f"Lo should be 0x3800 (0.5), got 0x{lo_bits:04x}") + self.assertEqual(hi_bits, 0x4000, f"Hi should be 0x4000 (2.0), got 0x{hi_bits:04x}") + + def test_v_pack_b32_f16_zeros(self): + """V_PACK_B32_F16 packs two zero values.""" + instructions = [ + v_mov_b32_e32(v[0], 0), + v_mov_b32_e32(v[2], 0), + v_cvt_f16_f32_e32(v[1], v[0]), + v_cvt_f16_f32_e32(v[3], v[2]), + v_pack_b32_f16(v[4], v[1], v[3]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][4] + self.assertEqual(result, 0, f"Expected 0x00000000, got 0x{result:08x}") + + +class TestPackInstructions(unittest.TestCase): + """Tests for pack instructions.""" + + def test_v_pack_b32_f16(self): + """V_PACK_B32_F16 packs two f16 values into one 32-bit register.""" + instructions = [] + # f16 1.0 = 0x3c00, f16 2.0 = 0x4000 + instructions.append(s_mov_b32(s[0], 0x3c00)) # f16 1.0 + instructions.append(s_mov_b32(s[1], 0x4000)) # f16 2.0 + instructions.append(v_mov_b32_e32(v[0], s[0])) + instructions.append(v_mov_b32_e32(v[1], s[1])) + # Pack: 
v[2] = (v[1].f16 << 16) | v[0].f16 + instructions.append(v_pack_b32_f16(v[2], v[0], v[1])) + + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][2] + # Expected: hi=0x4000 (2.0), lo=0x3c00 (1.0) -> 0x40003c00 + self.assertEqual(result, 0x40003c00, f"Expected 0x40003c00, got 0x{result:08x}") + + def test_v_pack_b32_f16_with_cvt(self): + """V_PACK_B32_F16 after V_CVT_F16_F32 conversions.""" + instructions = [] + # f32 1.0 = 0x3f800000 + instructions.append(s_mov_b32(s[0], 0x3f800000)) + instructions.append(v_mov_b32_e32(v[0], s[0])) # f32 1.0 + instructions.append(v_mov_b32_e32(v[1], s[0])) # f32 1.0 + # Convert to f16 + instructions.append(v_cvt_f16_f32_e32(v[2], v[0])) # v[2].f16 = 1.0 + instructions.append(v_cvt_f16_f32_e32(v[3], v[1])) # v[3].f16 = 1.0 + # Pack + instructions.append(v_pack_b32_f16(v[4], v[2], v[3])) + + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][4] + # Expected: 0x3c003c00 (two f16 1.0 values) + self.assertEqual(result, 0x3c003c00, f"Expected 0x3c003c00, got 0x{result:08x}") + + def test_v_pack_b32_f16_packed_sources(self): + """V_PACK_B32_F16 with sources that have packed f16 pairs (both hi and lo used). + This mimics what happens in matmul kernels where VGPRs contain packed f16 data. + """ + instructions = [] + # v0 = 0x40003c00 (hi=f16 2.0, lo=f16 1.0) + # v1 = 0x44004200 (hi=f16 4.0, lo=f16 3.0) + # V_PACK_B32_F16 with default opsel=0 reads low halves from each source + # Result should be: hi=v1.lo=0x4200 (3.0), lo=v0.lo=0x3c00 (1.0) -> 0x42003c00 + instructions.append(s_mov_b32(s[0], 0x40003c00)) # packed: hi=2.0, lo=1.0 + instructions.append(s_mov_b32(s[1], 0x44004200)) # packed: hi=4.0, lo=3.0 + instructions.append(v_mov_b32_e32(v[0], s[0])) + instructions.append(v_mov_b32_e32(v[1], s[1])) + instructions.append(v_pack_b32_f16(v[2], v[0], v[1])) + + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][2] + # Expected: hi=0x4200 (3.0), lo=0x3c00 (1.0) -> 0x42003c00 + self.assertEqual(result, 0x42003c00, f"Expected 0x42003c00, got 0x{result:08x}") + + def test_v_pack_b32_f16_opsel_hi_hi(self): + """V_PACK_B32_F16 with opsel=0b0011 to read high halves from both sources. + This is used when extracting the high f16 values from packed registers. 
+ """ + # v0 = 0x40003c00 (hi=f16 2.0, lo=f16 1.0) + # v1 = 0x44004200 (hi=f16 4.0, lo=f16 3.0) + # With opsel=0b0011: read hi from v0 (0x4000=2.0) and hi from v1 (0x4400=4.0) + # Result should be: hi=v1.hi=0x4400 (4.0), lo=v0.hi=0x4000 (2.0) -> 0x44004000 + inst = v_pack_b32_f16(v[2], v[0], v[1]) + inst._values['opsel'] = 0b0011 # opsel[0]=1 for src0 hi, opsel[1]=1 for src1 hi + + instructions = [ + s_mov_b32(s[0], 0x40003c00), # packed: hi=2.0, lo=1.0 + s_mov_b32(s[1], 0x44004200), # packed: hi=4.0, lo=3.0 + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + inst, + ] + + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][2] + # Expected: hi=0x4400 (4.0), lo=0x4000 (2.0) -> 0x44004000 + self.assertEqual(result, 0x44004000, f"Expected 0x44004000, got 0x{result:08x}") + + def test_v_pack_b32_f16_opsel_lo_hi(self): + """V_PACK_B32_F16 with opsel=0b0010 to read lo from src0, hi from src1.""" + # v0 = 0x40003c00 (hi=f16 2.0, lo=f16 1.0) + # v1 = 0x44004200 (hi=f16 4.0, lo=f16 3.0) + # With opsel=0b0010: read lo from v0 (0x3c00=1.0), hi from v1 (0x4400=4.0) + # Result should be: hi=v1.hi=0x4400 (4.0), lo=v0.lo=0x3c00 (1.0) -> 0x44003c00 + inst = v_pack_b32_f16(v[2], v[0], v[1]) + inst._values['opsel'] = 0b0010 # opsel[0]=0 for src0 lo, opsel[1]=1 for src1 hi + + instructions = [ + s_mov_b32(s[0], 0x40003c00), + s_mov_b32(s[1], 0x44004200), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + inst, + ] + + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][2] + # Expected: hi=0x4400 (4.0), lo=0x3c00 (1.0) -> 0x44003c00 + self.assertEqual(result, 0x44003c00, f"Expected 0x44003c00, got 0x{result:08x}") + + def test_v_pack_b32_f16_opsel_hi_lo(self): + """V_PACK_B32_F16 with opsel=0b0001 to read hi from src0, lo from src1.""" + # v0 = 0x40003c00 (hi=f16 2.0, lo=f16 1.0) + # v1 = 0x44004200 (hi=f16 4.0, lo=f16 3.0) + # With opsel=0b0001: read hi from v0 (0x4000=2.0), lo from v1 (0x4200=3.0) + # Result should be: hi=v1.lo=0x4200 (3.0), lo=v0.hi=0x4000 (2.0) -> 0x42004000 + inst = v_pack_b32_f16(v[2], v[0], v[1]) + inst._values['opsel'] = 0b0001 # opsel[0]=1 for src0 hi, opsel[1]=0 for src1 lo + + instructions = [ + s_mov_b32(s[0], 0x40003c00), + s_mov_b32(s[1], 0x44004200), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + inst, + ] + + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][2] + # Expected: hi=0x4200 (3.0), lo=0x4000 (2.0) -> 0x42004000 + self.assertEqual(result, 0x42004000, f"Expected 0x42004000, got 0x{result:08x}") - def test_v_min_u16(self): - """V_MIN_U16: minimum of two unsigned 16-bit values. Regression for VOP3 op 779.""" - kernel = make_store_kernel([ - v_mov_b32_e32(v[1], 100), - v_mov_b32_e32(v[2], 50), - VOP3(VOP3Op.V_MIN_U16, v[1], v[1], v[2]), - ]) - out = run_kernel(kernel, n_threads=1) - self.assertEqual(out[0] & 0xffff, 50) class TestWMMA(unittest.TestCase): - """Tests for WMMA (Wave Matrix Multiply Accumulate) instructions.""" + """Tests for WMMA (Wave Matrix Multiply-Accumulate) instructions.""" - def test_wmma_f32_16x16x16_f16_identity(self): - """V_WMMA_F32_16X16X16_F16 with identity matrix. 
Regression for VOP3P op 64.""" - from extra.assembly.rdna3.emu import i16, f16, exec_wmma_f32_16x16x16_f16, WaveState - # Test using direct emulator call rather than full kernel to simplify - st = WaveState() - st.exec_mask = 0xffffffff # all 32 lanes active + def test_v_wmma_f32_16x16x16_f16_basic(self): + """V_WMMA_F32_16X16X16_F16 basic test - verify emulator matches hardware.""" + # WMMA does D = A @ B + C where A,B are 16x16 f16, C,D are 16x16 f32 + # Use: A=v[16:23], B=v[24:31], C=D=v[0:7] (output in captured range v[0:15]) + instructions = [] - # Set up A as identity matrix: A[i][i] = 1.0, rest = 0.0 - # Lane i holds row i of A in 8 regs (2 fp16 per reg) - for lane in range(16): - for reg in range(8): - col0, col1 = reg * 2, reg * 2 + 1 - val0 = i16(1.0) if col0 == lane else 0 - val1 = i16(1.0) if col1 == lane else 0 - st.vgpr[lane][0 + reg] = val0 | (val1 << 16) # src0 = v0:v7 + # f16 1.0 = 0x3c00, packed pair = 0x3c003c00 + instructions.append(s_mov_b32(s[0], 0x3c003c00)) - # Set up B as identity matrix: lane i holds column i of B - for lane in range(16): - for reg in range(8): - row0, row1 = reg * 2, reg * 2 + 1 - val0 = i16(1.0) if row0 == lane else 0 - val1 = i16(1.0) if row1 == lane else 0 - st.vgpr[lane][8 + reg] = val0 | (val1 << 16) # src1 = v8:v15 + # Set A (v16-v23) and B (v24-v31) to all 1.0s + for i in range(16, 32): + instructions.append(v_mov_b32_e32(v[i], s[0])) - # Set up C as zeros + # Set C (v0-v7) to all 0s (will also be output D) + for i in range(8): + instructions.append(v_mov_b32_e32(v[i], 0)) + + # Execute WMMA: v[0:7] = A @ B + C + instructions.append(v_wmma_f32_16x16x16_f16(v[0], v[16], v[24], v[0])) + + # Just run and compare - USE_HW=1 will verify emulator matches hardware + st = run_program(instructions, n_lanes=32) + + # Verify at least some output is non-zero (actual values depend on WMMA layout) + # Output should be 16.0 (16 x 1.0 x 1.0) for each element + any_nonzero = any(st.vgpr[lane][0] != 0 for lane in range(32)) + self.assertTrue(any_nonzero, "WMMA should produce non-zero output") + + def test_v_wmma_f32_16x16x16_f16_all_ones(self): + """V_WMMA_F32_16X16X16_F16 with all ones should produce 16.0 for each output element. + This verifies the matrix multiply is computing the correct sum. + """ + instructions = [] + + # f16 1.0 = 0x3c00, packed pair = 0x3c003c00 + instructions.append(s_mov_b32(s[0], 0x3c003c00)) + + # Set A (v16-v23) and B (v24-v31) to all 1.0s + for i in range(16, 32): + instructions.append(v_mov_b32_e32(v[i], s[0])) + + # Set C (v0-v7) to all 0s (will also be output D) + for i in range(8): + instructions.append(v_mov_b32_e32(v[i], 0)) + + # Execute WMMA: v[0:7] = A @ B + C + instructions.append(v_wmma_f32_16x16x16_f16(v[0], v[16], v[24], v[0])) + + st = run_program(instructions, n_lanes=32) + + # All output elements should be 16.0 (sum of 16 * 1.0 * 1.0) + expected = f2i(16.0) for lane in range(32): for reg in range(8): - st.vgpr[lane][16 + reg] = 0 # src2 = v16:v23 + result = st.vgpr[lane][reg] + self.assertEqual(result, expected, f"v[{reg}] lane {lane}: expected 0x{expected:08x} (16.0), got 0x{result:08x} ({i2f(result)})") - # Create a fake VOP3P instruction - inst = VOP3P(VOP3POp.V_WMMA_F32_16X16X16_F16, v[24], src0=VGPR(0), src1=VGPR(8), src2=VGPR(16)) + def test_v_wmma_f32_16x16x16_f16_with_accumulator(self): + """V_WMMA_F32_16X16X16_F16 with non-zero accumulator. + Verifies that C matrix is properly added to the product. 
+ """ + instructions = [] - # Execute WMMA - exec_wmma_f32_16x16x16_f16(st, inst, 32) + # f16 1.0 = 0x3c00, packed pair = 0x3c003c00 + instructions.append(s_mov_b32(s[0], 0x3c003c00)) + # f32 5.0 = 0x40a00000 + instructions.append(s_mov_b32(s[1], f2i(5.0))) - # Check result: C should be identity (since A @ B where both are identity) - # Output i = row*16+col goes to lane (i%32), reg (i//32) - for row in range(16): - for col in range(16): - idx = row * 16 + col - lane, reg = idx % 32, idx // 32 - result = st.vgpr[lane][24 + reg] - expected = 1.0 if row == col else 0.0 - self.assertAlmostEqual(f32(result), expected, places=3, - msg=f"C[{row},{col}] = {f32(result)}, expected {expected}") + # Set A (v16-v23) and B (v24-v31) to all 1.0s + for i in range(16, 32): + instructions.append(v_mov_b32_e32(v[i], s[0])) -if __name__ == "__main__": + # Set C (v0-v7) to all 5.0s + for i in range(8): + instructions.append(v_mov_b32_e32(v[i], s[1])) + + # Execute WMMA: v[0:7] = A @ B + C = 16.0 + 5.0 = 21.0 + instructions.append(v_wmma_f32_16x16x16_f16(v[0], v[16], v[24], v[0])) + + st = run_program(instructions, n_lanes=32) + + # All output elements should be 21.0 (16.0 + 5.0) + expected = f2i(21.0) + for lane in range(32): + for reg in range(8): + result = st.vgpr[lane][reg] + self.assertEqual(result, expected, f"v[{reg}] lane {lane}: expected 0x{expected:08x} (21.0), got 0x{result:08x} ({i2f(result)})") + + +class TestVOP3P(unittest.TestCase): + """Tests for VOP3P packed 16-bit operations.""" + + def test_v_pk_add_f16_basic(self): + """V_PK_ADD_F16 adds two packed f16 values.""" + from extra.assembly.rdna3.pcode import _f16 + # v0 = packed (1.0, 2.0), v1 = packed (3.0, 4.0) + # Result should be packed (4.0, 6.0) + instructions = [ + s_mov_b32(s[0], 0x40003c00), # packed f16: hi=2.0, lo=1.0 + s_mov_b32(s[1], 0x44004200), # packed f16: hi=4.0, lo=3.0 + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_pk_add_f16(v[2], v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][2] + # Expected: lo=1.0+3.0=4.0 (0x4400), hi=2.0+4.0=6.0 (0x4600) -> 0x46004400 + lo = _f16(result & 0xffff) + hi = _f16((result >> 16) & 0xffff) + self.assertAlmostEqual(lo, 4.0, places=2, msg=f"lo: expected 4.0, got {lo}") + self.assertAlmostEqual(hi, 6.0, places=2, msg=f"hi: expected 6.0, got {hi}") + + def test_v_pk_add_f16_with_inline_constant(self): + """V_PK_ADD_F16 with inline constant POS_ONE (1.0). + Inline constants for VOP3P are f16 values in the low 16 bits only. + The opsel_hi bits (default=0b11) select lo half for hi result, so both halves use the constant. 
+ """ + from extra.assembly.rdna3.pcode import _f16 + # v0 = packed (1.0, 1.0), add POS_ONE + # With default opsel_hi=0b11: both lo and hi results use lo half of src1 (the constant) + # But opsel_hi=1 means src1 hi comes from lo half - wait, let me check the actual encoding + # Default opsel_hi=3 means: bit0=1 (src0 hi from hi), bit1=1 (src1 hi from hi) + # Since inline constant has 0 in hi half, hi result = v0.hi + 0 = 1.0 + instructions = [ + s_mov_b32(s[0], 0x3c003c00), # packed f16: hi=1.0, lo=1.0 + v_mov_b32_e32(v[0], s[0]), + v_pk_add_f16(v[1], v[0], SrcEnum.POS_ONE), # Add inline constant 1.0 + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] + lo = _f16(result & 0xffff) + hi = _f16((result >> 16) & 0xffff) + # lo = 1.0 + 1.0 = 2.0, hi = 1.0 + 0.0 = 1.0 (inline const hi half is 0) + self.assertAlmostEqual(lo, 2.0, places=2, msg=f"lo: expected 2.0, got {lo} (result=0x{result:08x})") + self.assertAlmostEqual(hi, 1.0, places=2, msg=f"hi: expected 1.0, got {hi} (result=0x{result:08x})") + + def test_v_pk_mul_f16_basic(self): + """V_PK_MUL_F16 multiplies two packed f16 values.""" + from extra.assembly.rdna3.pcode import _f16 + # v0 = packed (2.0, 3.0), v1 = packed (4.0, 5.0) + # Result should be packed (8.0, 15.0) + instructions = [ + s_mov_b32(s[0], 0x42004000), # packed f16: hi=3.0, lo=2.0 + s_mov_b32(s[1], 0x45004400), # packed f16: hi=5.0, lo=4.0 + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_pk_mul_f16(v[2], v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][2] + lo = _f16(result & 0xffff) + hi = _f16((result >> 16) & 0xffff) + self.assertAlmostEqual(lo, 8.0, places=1, msg=f"lo: expected 8.0, got {lo}") + self.assertAlmostEqual(hi, 15.0, places=1, msg=f"hi: expected 15.0, got {hi}") + + def test_v_pk_mul_f16_with_inline_constant(self): + """V_PK_MUL_F16 with inline constant POS_TWO (2.0). + Inline constant has value only in low 16 bits, hi is 0. 
+ """ + from extra.assembly.rdna3.pcode import _f16 + # v0 = packed (3.0, 4.0), multiply by POS_TWO + # lo = 3.0 * 2.0 = 6.0, hi = 4.0 * 0.0 = 0.0 (inline const hi is 0) + instructions = [ + s_mov_b32(s[0], 0x44004200), # packed f16: hi=4.0, lo=3.0 + v_mov_b32_e32(v[0], s[0]), + v_pk_mul_f16(v[1], v[0], SrcEnum.POS_TWO), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][1] + lo = _f16(result & 0xffff) + hi = _f16((result >> 16) & 0xffff) + self.assertAlmostEqual(lo, 6.0, places=1, msg=f"lo: expected 6.0, got {lo}") + self.assertAlmostEqual(hi, 0.0, places=1, msg=f"hi: expected 0.0, got {hi}") + + def test_v_pk_fma_f16_basic(self): + """V_PK_FMA_F16: D = A * B + C for packed f16.""" + from extra.assembly.rdna3.pcode import _f16 + # A = packed (2.0, 3.0), B = packed (4.0, 5.0), C = packed (1.0, 1.0) + # Result should be packed (2*4+1=9.0, 3*5+1=16.0) + instructions = [ + s_mov_b32(s[0], 0x42004000), # A: hi=3.0, lo=2.0 + s_mov_b32(s[1], 0x45004400), # B: hi=5.0, lo=4.0 + s_mov_b32(s[2], 0x3c003c00), # C: hi=1.0, lo=1.0 + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_mov_b32_e32(v[2], s[2]), + v_pk_fma_f16(v[3], v[0], v[1], v[2]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][3] + lo = _f16(result & 0xffff) + hi = _f16((result >> 16) & 0xffff) + self.assertAlmostEqual(lo, 9.0, places=1, msg=f"lo: expected 9.0, got {lo}") + self.assertAlmostEqual(hi, 16.0, places=0, msg=f"hi: expected 16.0, got {hi}") + + +class TestF64Conversions(unittest.TestCase): + """Tests for 64-bit float operations and conversions.""" + + def test_v_add_f64_inline_constant(self): + """V_ADD_F64 with inline constant POS_ONE (1.0) as f64.""" + one_f64 = f2i64(1.0) + instructions = [ + s_mov_b32(s[0], one_f64 & 0xffffffff), + s_mov_b32(s[1], one_f64 >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_add_f64(v[2:4], v[0:2], SrcEnum.POS_ONE), # 1.0 + 1.0 = 2.0 + ] + st = run_program(instructions, n_lanes=1) + result = i642f(st.vgpr[0][2] | (st.vgpr[0][3] << 32)) + self.assertAlmostEqual(result, 2.0, places=5) + + def test_v_ldexp_f64_negative_exponent(self): + """V_LDEXP_F64 with negative exponent (-32).""" + val = -8.0 + val_bits = f2i64(val) + expected = -8.0 * (2.0 ** -32) # -1.862645149230957e-09 + instructions = [ + s_mov_b32(s[0], val_bits & 0xffffffff), + s_mov_b32(s[1], val_bits >> 32), + v_mov_b32_e32(v[0], s[0]), + v_mov_b32_e32(v[1], s[1]), + v_ldexp_f64(v[2:4], v[0:2], 0xffffffe0), # -32 + ] + st = run_program(instructions, n_lanes=1) + result = i642f(st.vgpr[0][2] | (st.vgpr[0][3] << 32)) + self.assertAlmostEqual(result, expected, places=15) + + def test_f64_to_i64_conversion_sequence(self): + """Test the f64->i64 conversion sequence used by the compiler. + + The compiler generates: + v_trunc_f64 -> v_ldexp_f64 (by -32) -> v_floor_f64 -> v_fma_f64 (by -2^32) + -> v_cvt_u32_f64 (low bits) -> v_cvt_i32_f64 (high bits) + + The FMA computes: trunc + (-2^32) * floor = trunc - floor * 2^32 + which gives the low 32 bits as a positive float (for proper u32 conversion). 
+ """ + val = -8.0 + val_bits = f2i64(val) + lit = -4294967296.0 # -2^32 (note: NEGATIVE, so FMA does trunc - floor * 2^32) + lit_bits = f2i64(lit) + + instructions = [ + s_mov_b32(s[0], val_bits & 0xffffffff), + s_mov_b32(s[1], val_bits >> 32), + v_trunc_f64_e32(v[0:2], s[0:2]), + v_ldexp_f64(v[2:4], v[0:2], 0xffffffe0), # -32 + v_floor_f64_e32(v[2:4], v[2:4]), + s_mov_b32(s[2], lit_bits & 0xffffffff), + s_mov_b32(s[3], lit_bits >> 32), + v_fma_f64(v[0:2], s[2:4], v[2:4], v[0:2]), + v_cvt_u32_f64_e32(v[4], v[0:2]), + v_cvt_i32_f64_e32(v[5], v[2:4]), + ] + st = run_program(instructions, n_lanes=1) + # v4 = low 32 bits, v5 = high 32 bits (sign extended) + lo = st.vgpr[0][4] + hi = st.vgpr[0][5] + # For -8: lo should be 0xfffffff8, hi should be 0xffffffff + result = struct.unpack(' r2.u32) + self.assertFalse(r1.u32 < r2.u32) + self.assertTrue(r1.u32 != r2.u32) + +class TestSliceProxy(unittest.TestCase): + def test_slice_read(self): + r = Reg(0x56781234) + self.assertEqual(r[15:0].u16, 0x1234) + self.assertEqual(r[31:16].u16, 0x5678) + + def test_slice_write(self): + r = Reg(0) + r[15:0].u16 = 0x1234 + r[31:16].u16 = 0x5678 + self.assertEqual(r._val, 0x56781234) + + def test_slice_f16(self): + r = Reg(0) + r[15:0].f16 = 3.0 + self.assertAlmostEqual(_f16(r._val & 0xffff), 3.0, places=2) + +class TestCompiler(unittest.TestCase): + def test_ternary(self): + result = _expr("a > b ? 1 : 0") + self.assertIn("if", result) + self.assertIn("else", result) + + def test_type_prefix_strip(self): + self.assertEqual(_expr("1'0U"), "0") + self.assertEqual(_expr("32'1"), "1") + self.assertEqual(_expr("16'0xFFFF"), "0xFFFF") + + def test_suffix_strip(self): + self.assertEqual(_expr("0ULL"), "0") + self.assertEqual(_expr("1LL"), "1") + self.assertEqual(_expr("5U"), "5") + self.assertEqual(_expr("3.14F"), "3.14") + + def test_boolean_ops(self): + self.assertIn("and", _expr("a && b")) + self.assertIn("or", _expr("a || b")) + self.assertIn("!=", _expr("a <> b")) + + def test_pack16(self): + result = _expr("{ a, b }") + self.assertIn("_pack", result) + + def test_type_cast_strip(self): + self.assertEqual(_expr("64'U(x)"), "(x)") + self.assertEqual(_expr("32'I(y)"), "(y)") + +class TestExecContext(unittest.TestCase): + def test_float_add(self): + ctx = ExecContext(s0=0x40400000, s1=0x40800000) # 3.0f, 4.0f + ctx.D0.f32 = ctx.S0.f32 + ctx.S1.f32 + self.assertAlmostEqual(_f32(ctx.D0._val), 7.0) + + def test_float_mul(self): + ctx = ExecContext(s0=0x40400000, s1=0x40800000) # 3.0f, 4.0f + ctx.run("D0.f32 = S0.f32 * S1.f32") + self.assertAlmostEqual(_f32(ctx.D0._val), 12.0) + + def test_scc_comparison(self): + ctx = ExecContext(s0=42, s1=42) + ctx.run("SCC = S0.u32 == S1.u32") + self.assertEqual(ctx.SCC._val, 1) + + def test_scc_comparison_false(self): + ctx = ExecContext(s0=42, s1=43) + ctx.run("SCC = S0.u32 == S1.u32") + self.assertEqual(ctx.SCC._val, 0) + + def test_ternary(self): + code = compile_pseudocode("D0.u32 = S0.u32 > S1.u32 ? 
1'1U : 1'0U") + ctx = ExecContext(s0=5, s1=3) + ctx.run(code) + self.assertEqual(ctx.D0._val, 1) + + def test_pack(self): + code = compile_pseudocode("D0 = { S1[15:0].u16, S0[15:0].u16 }") + ctx = ExecContext(s0=0x1234, s1=0x5678) + ctx.run(code) + self.assertEqual(ctx.D0._val, 0x56781234) + + def test_tmp_with_typed_access(self): + code = compile_pseudocode("""tmp = S0.u32 + S1.u32 +D0.u32 = tmp.u32""") + ctx = ExecContext(s0=100, s1=200) + ctx.run(code) + self.assertEqual(ctx.D0._val, 300) + + def test_s_add_u32_pattern(self): + # Real pseudocode pattern from S_ADD_U32 + code = compile_pseudocode("""tmp = 64'U(S0.u32) + 64'U(S1.u32) +SCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U +D0.u32 = tmp.u32""") + # Test overflow case + ctx = ExecContext(s0=0xFFFFFFFF, s1=0x00000001) + ctx.run(code) + self.assertEqual(ctx.D0._val, 0) # Wraps to 0 + self.assertEqual(ctx.SCC._val, 1) # Carry set + + def test_s_add_u32_no_overflow(self): + code = compile_pseudocode("""tmp = 64'U(S0.u32) + 64'U(S1.u32) +SCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U +D0.u32 = tmp.u32""") + ctx = ExecContext(s0=100, s1=200) + ctx.run(code) + self.assertEqual(ctx.D0._val, 300) + self.assertEqual(ctx.SCC._val, 0) # No carry + + def test_vcc_lane_read(self): + ctx = ExecContext(vcc=0b1010, lane=1) + # Lane 1 is set + self.assertEqual(ctx.VCC.u64[1], 1) + self.assertEqual(ctx.VCC.u64[2], 0) + + def test_vcc_lane_write(self): + ctx = ExecContext(vcc=0, lane=0) + ctx.VCC.u64[3] = 1 + ctx.VCC.u64[1] = 1 + self.assertEqual(ctx.VCC._val, 0b1010) + + def test_for_loop(self): + # CTZ pattern - find first set bit + code = compile_pseudocode("""tmp = -1 +for i in 0 : 31 do + if S0.u32[i] == 1 then + tmp = i +D0.i32 = tmp""") + ctx = ExecContext(s0=0b1000) # Bit 3 is set + ctx.run(code) + self.assertEqual(ctx.D0._val & MASK32, 3) + + def test_result_dict(self): + ctx = ExecContext(s0=5, s1=3) + ctx.D0.u32 = 42 + ctx.SCC._val = 1 + result = ctx.result() + self.assertEqual(result['d0'], 42) + self.assertEqual(result['scc'], 1) + +class TestPseudocodeRegressions(unittest.TestCase): + """Regression tests for pseudocode instruction emulation bugs.""" + + def test_v_div_scale_f32_vcc_always_returned(self): + """V_DIV_SCALE_F32 must always return vcc_lane, even when VCC=0 (no scaling needed). + Bug: when VCC._val == vcc (both 0), vcc_lane wasn't returned, so VCC bits weren't written. + This caused division to produce wrong results for multiple lanes.""" + # Normal case: 1.0 / 3.0, no scaling needed, VCC should be 0 + s0 = 0x3f800000 # 1.0 + s1 = 0x40400000 # 3.0 + s2 = 0x3f800000 # 1.0 (numerator) + result = _VOP3SDOp_V_DIV_SCALE_F32(s0, s1, s2, 0, 0, 0, 0, 0xffffffff, 0, None, {}) + # Must always have vcc_lane in result + self.assertIn('vcc_lane', result, "V_DIV_SCALE_F32 must always return vcc_lane") + self.assertEqual(result['vcc_lane'], 0, "vcc_lane should be 0 when no scaling needed") + + def test_v_cmp_class_f32_detects_quiet_nan(self): + """V_CMP_CLASS_F32 must correctly identify quiet NaN vs signaling NaN. 
+ Bug: isQuietNAN and isSignalNAN both used math.isnan which can't distinguish them.""" + quiet_nan = 0x7fc00000 # quiet NaN: exponent=255, bit22=1 + signal_nan = 0x7f800001 # signaling NaN: exponent=255, bit22=0 + # Test quiet NaN detection (bit 1 in mask) + s1_quiet = 0b0000000010 # bit 1 = quiet NaN + result = _VOPCOp_V_CMP_CLASS_F32(quiet_nan, s1_quiet, 0, 0, 0, 0, 0, 0xffffffff, 0, None, {}) + self.assertEqual(result['vcc_lane'], 1, "Should detect quiet NaN with quiet NaN mask") + # Test signaling NaN detection (bit 0 in mask) + s1_signal = 0b0000000001 # bit 0 = signaling NaN + result = _VOPCOp_V_CMP_CLASS_F32(signal_nan, s1_signal, 0, 0, 0, 0, 0, 0xffffffff, 0, None, {}) + self.assertEqual(result['vcc_lane'], 1, "Should detect signaling NaN with signaling NaN mask") + # Test that quiet NaN doesn't match signaling NaN mask + result = _VOPCOp_V_CMP_CLASS_F32(quiet_nan, s1_signal, 0, 0, 0, 0, 0, 0xffffffff, 0, None, {}) + self.assertEqual(result['vcc_lane'], 0, "Quiet NaN should not match signaling NaN mask") + # Test that signaling NaN doesn't match quiet NaN mask + result = _VOPCOp_V_CMP_CLASS_F32(signal_nan, s1_quiet, 0, 0, 0, 0, 0, 0xffffffff, 0, None, {}) + self.assertEqual(result['vcc_lane'], 0, "Signaling NaN should not match quiet NaN mask") + + def test_isnan_with_typed_view(self): + """_isnan must work with TypedView objects, not just Python floats. + Bug: _isnan checked isinstance(x, float) which returned False for TypedView.""" + nan_reg = Reg(0x7fc00000) # quiet NaN + normal_reg = Reg(0x3f800000) # 1.0 + inf_reg = Reg(0x7f800000) # +inf + self.assertTrue(_isnan(nan_reg.f32), "_isnan should return True for NaN TypedView") + self.assertFalse(_isnan(normal_reg.f32), "_isnan should return False for normal TypedView") + self.assertFalse(_isnan(inf_reg.f32), "_isnan should return False for inf TypedView") + +if __name__ == '__main__': + unittest.main() diff --git a/test/mockgpu/amd/amdgpu.py b/test/mockgpu/amd/amdgpu.py index f39c5f4238..c3fc27d705 100644 --- a/test/mockgpu/amd/amdgpu.py +++ b/test/mockgpu/amd/amdgpu.py @@ -7,6 +7,7 @@ import tinygrad.runtime.autogen.amd_gpu as amd_gpu, tinygrad.runtime.autogen.am. 
SDMA_MAX_COPY_SIZE = 0x400000 regCOMPUTE_PGM_LO = 0x1bac + amd_gpu.GC_BASE__INST0_SEG0 +regCOMPUTE_PGM_RSRC2 = 0x1bb3 + amd_gpu.GC_BASE__INST0_SEG0 regCOMPUTE_USER_DATA_0 = 0x1be0 + amd_gpu.GC_BASE__INST0_SEG0 regCOMPUTE_NUM_THREAD_X = 0x1ba7 + amd_gpu.GC_BASE__INST0_SEG0 regGRBM_GFX_INDEX = 0x2200 + amd_gpu.GC_BASE__INST0_SEG1 @@ -179,14 +180,16 @@ class PM4Executor(AMDQueue): prg_addr = (self.gpu.regs[regCOMPUTE_PGM_LO] + (self.gpu.regs[regCOMPUTE_PGM_LO + 1] << 32)) << 8 args_addr = self.gpu.regs[regCOMPUTE_USER_DATA_0] + (self.gpu.regs[regCOMPUTE_USER_DATA_0 + 1] << 32) lc = [self.gpu.regs[i] for i in range(regCOMPUTE_NUM_THREAD_X, regCOMPUTE_NUM_THREAD_X+3)] + rsrc2 = self.gpu.regs[regCOMPUTE_PGM_RSRC2] prg_sz = 0 for st,sz in self.gpu.mapped_ranges: if st <= prg_addr < st+sz: prg_sz = sz - (prg_addr - st) assert prg_sz > 0, "Invalid prg ptr (not found in mapped ranges)" - # Pass valid memory ranges to Python emulator for bounds checking + # Pass valid memory ranges and rsrc2 to Python emulator for bounds checking and SGPR layout if hasattr(remu, 'valid_mem_ranges'): remu.valid_mem_ranges = self.gpu.mapped_ranges + if hasattr(remu, 'rsrc2'): remu.rsrc2 = rsrc2 err = remu.run_asm(prg_addr, prg_sz, *gl, *lc, args_addr) if err != 0: raise RuntimeError("remu does not support the new instruction introduced in this kernel") diff --git a/test/mockgpu/helpers.py b/test/mockgpu/helpers.py index 01dec2c095..39b8daa336 100644 --- a/test/mockgpu/helpers.py +++ b/test/mockgpu/helpers.py @@ -18,12 +18,13 @@ def _try_dlopen_gpuocelot(): class PythonRemu: """Python RDNA3 emulator wrapper that matches the libremu.so interface.""" valid_mem_ranges: set[tuple[int, int]] = set() + rsrc2: int = 0x19c # Default: USER_SGPR_COUNT=14, enable X and Y workgroup IDs def run_asm(self, lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int) -> int: from extra.assembly.rdna3.emu import run_asm, set_valid_mem_ranges # Pad ranges to handle GPU loads that may read past small buffers (e.g. s_load_b128 on 12-byte buffer) set_valid_mem_ranges({(start, size + 4096) for start, size in self.valid_mem_ranges}) - return run_asm(lib, lib_sz, gx, gy, gz, lx, ly, lz, args_ptr) + return run_asm(lib, lib_sz, gx, gy, gz, lx, ly, lz, args_ptr, self.rsrc2) def _try_dlopen_remu(): # Use Python emulator only if PYTHON_REMU=1 diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index e14f7961d3..f4a98b3194 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -641,14 +641,14 @@ class AMDQueueDesc: def read_ptr(self): return min(p[0] for p in self.read_ptrs) def signal_doorbell(self, dev, doorbell_value:int|None=None): - for write_ptr in self.write_ptrs: write_ptr[0] = self.put_value - - # Ensure all prior writes are visible to the GPU. - System.memory_barrier() - - # Flush hdp if queue is in dev mem. - if dev.is_am() and not dev.is_usb(): dev.iface.dev_impl.gmc.flush_hdp() try: + for write_ptr in self.write_ptrs: write_ptr[0] = self.put_value + + # Ensure all prior writes are visible to the GPU. + System.memory_barrier() + + # Flush hdp if queue is in dev mem. + if dev.is_am() and not dev.is_usb(): dev.iface.dev_impl.gmc.flush_hdp() for doorbell in self.doorbells: doorbell[0] = self.put_value if doorbell_value is None else doorbell_value except Exception as e: dev.error_state = e
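As a sanity check on the `rsrc2` default introduced in `PythonRemu` above, the values the comment claims fall out of the usual RDNA3 COMPUTE_PGM_RSRC2 bit layout. The field positions here are quoted from the ISA documentation, not from this patch, so treat the decode as an assumption:

```python
rsrc2 = 0x19c  # PythonRemu default from the patch above
assert (rsrc2 >> 1) & 0x1f == 14  # USER_SGPR_COUNT = 14 (bits [5:1])
assert (rsrc2 >> 7) & 1 == 1      # TGID_X_EN: X workgroup id written to an SGPR
assert (rsrc2 >> 8) & 1 == 1      # TGID_Y_EN: Y workgroup id written to an SGPR
assert (rsrc2 >> 9) & 1 == 0      # TGID_Z_EN: Z not enabled by this default
```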