tinygrad/extra/assembly/amd/autogen/rdna3/str_pcode.py

# autogenerated by pdf.py - do not edit
# to regenerate: python -m extra.assembly.amd.pdf --arch rdna3
# ruff: noqa: E501
from extra.assembly.amd.autogen.rdna3.enum import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, SMEMOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp, DSOp, FLATOp, GLOBALOp, SCRATCHOp

SOP1Op_PCODE = {
  SOP1Op.S_MOV_B32: 'D0.b32 = S0.b32',
  SOP1Op.S_MOV_B64: 'D0.b64 = S0.b64',
  SOP1Op.S_CMOV_B32: 'if SCC then\nD0.b32 = S0.b32\nendif',
  SOP1Op.S_CMOV_B64: 'if SCC then\nD0.b64 = S0.b64\nendif',
  SOP1Op.S_BREV_B32: 'D0.u32[31 : 0] = S0.u32[0 : 31]',
  SOP1Op.S_BREV_B64: 'D0.u64[63 : 0] = S0.u64[0 : 63]',
  SOP1Op.S_CTZ_I32_B32: "tmp = -1;\n// Set if no ones are found\nfor i in 0 : 31 do\n// Search from LSB\nif S0.u32[i] == 1'1U then\ntmp = i;\nendif\nendfor;\nD0.i32 = tmp",
  SOP1Op.S_CTZ_I32_B64: "tmp = -1;\n// Set if no ones are found\nfor i in 0 : 63 do\n// Search from LSB\nif S0.u64[i] == 1'1U then\ntmp = i;\nendif\nendfor;\nD0.i32 = tmp",
  SOP1Op.S_CLZ_I32_U32: "tmp = -1;\n// Set if no ones are found\nfor i in 0 : 31 do\n// Search from MSB\nif S0.u32[31 - i] == 1'1U then\ntmp = i;\nendif\nendfor;\nD0.i32 = tmp",
  SOP1Op.S_CLZ_I32_U64: "tmp = -1;\n// Set if no ones are found\nfor i in 0 : 63 do\n// Search from MSB\nif S0.u64[63 - i] == 1'1U then\ntmp = i;\nendif\nendfor;\nD0.i32 = tmp",
  SOP1Op.S_CLS_I32: 'tmp = -1;\n// Set if all bits are the same\nfor i in 1 : 31 do\n// Search from MSB\nif S0.u32[31 - i] != S0.u32[31] then\ntmp = i;\nendif\nendfor;\nD0.i32 = tmp',
  SOP1Op.S_CLS_I32_I64: 'tmp = -1;\n// Set if all bits are the same\nfor i in 1 : 63 do\n// Search from MSB\nif S0.u64[63 - i] != S0.u64[63] then\ntmp = i;\nendif\nendfor;\nD0.i32 = tmp',
  SOP1Op.S_SEXT_I32_I8: "D0.i32 = 32'I(signext(S0.i8))",
  SOP1Op.S_SEXT_I32_I16: "D0.i32 = 32'I(signext(S0.i16))",
  SOP1Op.S_BITSET0_B32: "D0.u32[S0.u32[4 : 0]] = 1'0U",
  SOP1Op.S_BITSET0_B64: "D0.u64[S0.u32[5 : 0]] = 1'0U",
  SOP1Op.S_BITSET1_B32: "D0.u32[S0.u32[4 : 0]] = 1'1U",
  SOP1Op.S_BITSET1_B64: "D0.u64[S0.u32[5 : 0]] = 1'1U",
  SOP1Op.S_BITREPLICATE_B64_B32: 'tmp = S0.u32;\nfor i in 0 : 31 do\nD0.u64[i * 2] = tmp[i];\nD0.u64[i * 2 + 1] = tmp[i]\nendfor',
  SOP1Op.S_ABS_I32: 'D0.i32 = S0.i32 < 0 ? -S0.i32 : S0.i32;\nSCC = D0.i32 != 0',
  SOP1Op.S_BCNT0_I32_B32: "tmp = 0;\nfor i in 0 : 31 do\ntmp += S0.u32[i] == 1'0U ? 1 : 0\nendfor;\nD0.i32 = tmp;\nSCC = D0.u32 != 0U",
  SOP1Op.S_BCNT0_I32_B64: "tmp = 0;\nfor i in 0 : 63 do\ntmp += S0.u64[i] == 1'0U ? 1 : 0\nendfor;\nD0.i32 = tmp;\nSCC = D0.u64 != 0ULL",
  SOP1Op.S_BCNT1_I32_B32: "tmp = 0;\nfor i in 0 : 31 do\ntmp += S0.u32[i] == 1'1U ? 1 : 0\nendfor;\nD0.i32 = tmp;\nSCC = D0.u32 != 0U",
  SOP1Op.S_BCNT1_I32_B64: "tmp = 0;\nfor i in 0 : 63 do\ntmp += S0.u64[i] == 1'1U ? 1 : 0\nendfor;\nD0.i32 = tmp;\nSCC = D0.u64 != 0ULL",
  SOP1Op.S_QUADMASK_B32: 'tmp = 0U;\nfor i in 0 : 7 do\ntmp[i] = S0.u32[i * 4 +: 4] != 0U\nendfor;\nD0.u32 = tmp;\nSCC = D0.u32 != 0U',
  SOP1Op.S_QUADMASK_B64: 'tmp = 0ULL;\nfor i in 0 : 15 do\ntmp[i] = S0.u64[i * 4 +: 4] != 0ULL\nendfor;\nD0.u64 = tmp;\nSCC = D0.u64 != 0ULL',
  SOP1Op.S_WQM_B32: "tmp = 0U;\ndeclare i : 6'U;\nfor i in 6'0U : 6'31U do\ntmp[i] = S0.u32[i & 6'60U +: 6'4U] != 0U\nendfor;\nD0.u32 = tmp;\nSCC = D0.u32 != 0U",
  SOP1Op.S_WQM_B64: "tmp = 0ULL;\ndeclare i : 6'U;\nfor i in 6'0U : 6'63U do\ntmp[i] = S0.u64[i & 6'60U +: 6'4U] != 0ULL\nendfor;\nD0.u64 = tmp;\nSCC = D0.u64 != 0ULL",
  SOP1Op.S_NOT_B32: 'D0.u32 = ~S0.u32;\nSCC = D0.u32 != 0U',
  SOP1Op.S_NOT_B64: 'D0.u64 = ~S0.u64;\nSCC = D0.u64 != 0ULL',
  SOP1Op.S_AND_SAVEEXEC_B32: 'Calculate bitwise AND on the scalar input and the EXEC mask, store the calculated result into the EXEC mask,\nset SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar\nsaveexec = EXEC.u32;\nEXEC.u32 = (S0.u32 & EXEC.u32);\nD0.u32 = saveexec.u32;\nSCC = EXEC.u32 != 0U',
  SOP1Op.S_AND_SAVEEXEC_B64: 'Calculate bitwise AND on the scalar input and the EXEC mask, store the calculated result into the EXEC mask,\nset SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar\nsaveexec = EXEC.u64;\nEXEC.u64 = (S0.u64 & EXEC.u64);\nD0.u64 = saveexec.u64;\nSCC = EXEC.u64 != 0ULL',
  SOP1Op.S_OR_SAVEEXEC_B32: 'Calculate bitwise OR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, set\nSCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar destination\nsaveexec = EXEC.u32;\nEXEC.u32 = (S0.u32 | EXEC.u32);\nD0.u32 = saveexec.u32;\nSCC = EXEC.u32 != 0U',
  SOP1Op.S_OR_SAVEEXEC_B64: 'Calculate bitwise OR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask, set\nSCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar destination\nsaveexec = EXEC.u64;\nEXEC.u64 = (S0.u64 | EXEC.u64);\nD0.u64 = saveexec.u64;\nSCC = EXEC.u64 != 0ULL',
  SOP1Op.S_XOR_SAVEEXEC_B32: 'Calculate bitwise XOR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask,\nset SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar\nsaveexec = EXEC.u32;\nEXEC.u32 = (S0.u32 ^ EXEC.u32);\nD0.u32 = saveexec.u32;\nSCC = EXEC.u32 != 0U',
  SOP1Op.S_XOR_SAVEEXEC_B64: 'Calculate bitwise XOR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask,\nset SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar\nsaveexec = EXEC.u64;\nEXEC.u64 = (S0.u64 ^ EXEC.u64);\nD0.u64 = saveexec.u64;\nSCC = EXEC.u64 != 0ULL',
  SOP1Op.S_NAND_SAVEEXEC_B32: 'Calculate bitwise NAND on the scalar input and the EXEC mask, store the calculated result into the EXEC mask,\nset SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar\nsaveexec = EXEC.u32;\nEXEC.u32 = ~(S0.u32 & EXEC.u32);\nD0.u32 = saveexec.u32;\nSCC = EXEC.u32 != 0U',
  SOP1Op.S_NAND_SAVEEXEC_B64: 'Calculate bitwise NAND on the scalar input and the EXEC mask, store the calculated result into the EXEC mask,\nset SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar\nsaveexec = EXEC.u64;\nEXEC.u64 = ~(S0.u64 & EXEC.u64);\nD0.u64 = saveexec.u64;\nSCC = EXEC.u64 != 0ULL',
  SOP1Op.S_NOR_SAVEEXEC_B32: 'Calculate bitwise NOR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask,\nset SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar\nsaveexec = EXEC.u32;\nEXEC.u32 = ~(S0.u32 | EXEC.u32);\nD0.u32 = saveexec.u32;\nSCC = EXEC.u32 != 0U',
  SOP1Op.S_NOR_SAVEEXEC_B64: 'Calculate bitwise NOR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask,\nset SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar\nsaveexec = EXEC.u64;\nEXEC.u64 = ~(S0.u64 | EXEC.u64);\nD0.u64 = saveexec.u64;\nSCC = EXEC.u64 != 0ULL',
  SOP1Op.S_XNOR_SAVEEXEC_B32: 'Calculate bitwise XNOR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask,\nset SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar\nsaveexec = EXEC.u32;\nEXEC.u32 = ~(S0.u32 ^ EXEC.u32);\nD0.u32 = saveexec.u32;\nSCC = EXEC.u32 != 0U',
  SOP1Op.S_XNOR_SAVEEXEC_B64: 'Calculate bitwise XNOR on the scalar input and the EXEC mask, store the calculated result into the EXEC mask,\nset SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the scalar\nsaveexec = EXEC.u64;\nEXEC.u64 = ~(S0.u64 ^ EXEC.u64);\nD0.u64 = saveexec.u64;\nSCC = EXEC.u64 != 0ULL',
  SOP1Op.S_AND_NOT0_SAVEEXEC_B32: 'Calculate bitwise AND on the EXEC mask and the negation of the scalar input, store the calculated result into\nthe EXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into\nsaveexec = EXEC.u32;\nEXEC.u32 = (~S0.u32 & EXEC.u32);\nD0.u32 = saveexec.u32;\nSCC = EXEC.u32 != 0U',
  SOP1Op.S_AND_NOT0_SAVEEXEC_B64: 'Calculate bitwise AND on the EXEC mask and the negation of the scalar input, store the calculated result into\nthe EXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into\nsaveexec = EXEC.u64;\nEXEC.u64 = (~S0.u64 & EXEC.u64);\nD0.u64 = saveexec.u64;\nSCC = EXEC.u64 != 0ULL',
  SOP1Op.S_OR_NOT0_SAVEEXEC_B32: 'Calculate bitwise OR on the EXEC mask and the negation of the scalar input, store the calculated result into the\nEXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the\nsaveexec = EXEC.u32;\nEXEC.u32 = (~S0.u32 | EXEC.u32);\nD0.u32 = saveexec.u32;\nSCC = EXEC.u32 != 0U',
  SOP1Op.S_OR_NOT0_SAVEEXEC_B64: 'Calculate bitwise OR on the EXEC mask and the negation of the scalar input, store the calculated result into the\nEXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the\nsaveexec = EXEC.u64;\nEXEC.u64 = (~S0.u64 | EXEC.u64);\nD0.u64 = saveexec.u64;\nSCC = EXEC.u64 != 0ULL',
  SOP1Op.S_AND_NOT1_SAVEEXEC_B32: 'Calculate bitwise AND on the scalar input and the negation of the EXEC mask, store the calculated result into\nthe EXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into\nsaveexec = EXEC.u32;\nEXEC.u32 = (S0.u32 & ~EXEC.u32);\nD0.u32 = saveexec.u32;\nSCC = EXEC.u32 != 0U',
  SOP1Op.S_AND_NOT1_SAVEEXEC_B64: 'Calculate bitwise AND on the scalar input and the negation of the EXEC mask, store the calculated result into\nthe EXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into\nsaveexec = EXEC.u64;\nEXEC.u64 = (S0.u64 & ~EXEC.u64);\nD0.u64 = saveexec.u64;\nSCC = EXEC.u64 != 0ULL',
  SOP1Op.S_OR_NOT1_SAVEEXEC_B32: 'Calculate bitwise OR on the scalar input and the negation of the EXEC mask, store the calculated result into the\nEXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the\nsaveexec = EXEC.u32;\nEXEC.u32 = (S0.u32 | ~EXEC.u32);\nD0.u32 = saveexec.u32;\nSCC = EXEC.u32 != 0U',
  SOP1Op.S_OR_NOT1_SAVEEXEC_B64: 'Calculate bitwise OR on the scalar input and the negation of the EXEC mask, store the calculated result into the\nEXEC mask, set SCC iff the calculated result is nonzero and store the original value of the EXEC mask into the\nsaveexec = EXEC.u64;\nEXEC.u64 = (S0.u64 | ~EXEC.u64);\nD0.u64 = saveexec.u64;\nSCC = EXEC.u64 != 0ULL',
  SOP1Op.S_AND_NOT0_WREXEC_B32: 'Calculate bitwise AND on the EXEC mask and the negation of the scalar input, store the calculated result into\nUnlike the SAVEEXEC series of opcodes, the value written to destination SGPRs is the result of the bitwise-op\nresult. EXEC and the destination SGPRs have the same value at the end of this instruction. This instruction is\nEXEC.u32 = (~S0.u32 & EXEC.u32);\nD0.u32 = EXEC.u32;\nSCC = EXEC.u32 != 0U',
  SOP1Op.S_AND_NOT0_WREXEC_B64: 'Calculate bitwise AND on the EXEC mask and the negation of the scalar input, store the calculated result into\nUnlike the SAVEEXEC series of opcodes, the value written to destination SGPRs is the result of the bitwise-op\nresult. EXEC and the destination SGPRs have the same value at the end of this instruction. This instruction is\nEXEC.u64 = (~S0.u64 & EXEC.u64);\nD0.u64 = EXEC.u64;\nSCC = EXEC.u64 != 0ULL',
  SOP1Op.S_AND_NOT1_WREXEC_B32: 'Calculate bitwise AND on the scalar input and the negation of the EXEC mask, store the calculated result into\nUnlike the SAVEEXEC series of opcodes, the value written to destination SGPRs is the result of the bitwise-op\nresult. EXEC and the destination SGPRs have the same value at the end of this instruction. This instruction is\nEXEC.u32 = (S0.u32 & ~EXEC.u32);\nD0.u32 = EXEC.u32;\nSCC = EXEC.u32 != 0U',
  SOP1Op.S_AND_NOT1_WREXEC_B64: 'Calculate bitwise AND on the scalar input and the negation of the EXEC mask, store the calculated result into\nUnlike the SAVEEXEC series of opcodes, the value written to destination SGPRs is the result of the bitwise-op\nresult. EXEC and the destination SGPRs have the same value at the end of this instruction. This instruction is\nEXEC.u64 = (S0.u64 & ~EXEC.u64);\nD0.u64 = EXEC.u64;\nSCC = EXEC.u64 != 0ULL',
  SOP1Op.S_MOVRELS_B32: 'addr = SRC0.u32;\n// Raw value from instruction\nD0.b32 = SGPR[addr].b32',
  SOP1Op.S_MOVRELS_B64: 'addr = SRC0.u32;\n// Raw value from instruction\nD0.b64 = SGPR[addr].b64',
  SOP1Op.S_MOVRELD_B32: 'addr = DST.u32;\n// Raw value from instruction\nSGPR[addr].b32 = S0.b32',
  SOP1Op.S_MOVRELD_B64: 'addr = DST.u32;\n// Raw value from instruction\nSGPR[addr].b64 = S0.b64',
  SOP1Op.S_MOVRELSD_2_B32: 'addrs = SRC0.u32;\n// Raw value from instruction\naddrd = DST.u32;\n// Raw value from instruction',
  SOP1Op.S_GETPC_B64: 'D0.i64 = PC + 4LL',
  SOP1Op.S_SETPC_B64: 'PC = S0.i64',
  SOP1Op.S_SWAPPC_B64: 'jump_addr = S0.i64;\nD0.i64 = PC + 4LL;\nPC = jump_addr.i64',
  SOP1Op.S_RFE_B64: 'PC = S0.i64',
  SOP1Op.S_SENDMSG_RTN_B32: 'If SDST is VCC then VCCZ is undefined.',
  SOP1Op.S_SENDMSG_RTN_B64: 'If SDST is VCC then VCCZ is undefined.',
  SOP1Op.S_CEIL_F32: 'D0.f32 = trunc(S0.f32);\nif ((S0.f32 > 0.0F) && (S0.f32 != D0.f32)) then\nD0.f32 += 1.0F\nendif',
  SOP1Op.S_FLOOR_F32: 'D0.f32 = trunc(S0.f32);\nif ((S0.f32 < 0.0F) && (S0.f32 != D0.f32)) then\nD0.f32 += -1.0F\nendif',
  SOP1Op.S_TRUNC_F32: 'D0.f32 = trunc(S0.f32)',
  SOP1Op.S_RNDNE_F32: "D0.f32 = floor(S0.f32 + 0.5F);\nif (isEven(64'F(floor(S0.f32))) && (fract(S0.f32) == 0.5F)) then\nD0.f32 -= 1.0F\nendif",
  SOP1Op.S_CVT_F32_I32: 'D0.f32 = i32_to_f32(S0.i32)',
  SOP1Op.S_CVT_F32_U32: 'D0.f32 = u32_to_f32(S0.u32)',
  SOP1Op.S_CVT_I32_F32: 'D0.i32 = f32_to_i32(S0.f32)',
  SOP1Op.S_CVT_U32_F32: 'D0.u32 = f32_to_u32(S0.f32)',
  SOP1Op.S_CVT_F16_F32: 'D0.f16 = f32_to_f16(S0.f32)',
  SOP1Op.S_CVT_F32_F16: 'D0.f32 = f16_to_f32(S0.f16)',
  SOP1Op.S_CVT_HI_F32_F16: 'D0.f32 = f16_to_f32(S0[31 : 16].f16)',
  SOP1Op.S_CEIL_F16: "D0.f16 = trunc(S0.f16);\nif ((S0.f16 > 16'0.0) && (S0.f16 != D0.f16)) then\nD0.f16 += 16'1.0\nendif",
  SOP1Op.S_FLOOR_F16: "D0.f16 = trunc(S0.f16);\nif ((S0.f16 < 16'0.0) && (S0.f16 != D0.f16)) then\nD0.f16 += -16'1.0\nendif",
  SOP1Op.S_TRUNC_F16: 'D0.f16 = trunc(S0.f16)',
  SOP1Op.S_RNDNE_F16: "D0.f16 = floor(S0.f16 + 16'0.5);\nif (isEven(64'F(floor(S0.f16))) && (fract(S0.f16) == 16'0.5)) then\nD0.f16 -= 16'1.0\nendif",
}

SOP2Op_PCODE = {
  SOP2Op.S_ADD_U32: "tmp = 64'U(S0.u32) + 64'U(S1.u32);\nSCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U;\nD0.u32 = tmp.u32",
  SOP2Op.S_SUB_U32: "tmp = S0.u32 - S1.u32;\nSCC = S1.u32 > S0.u32 ? 1'1U : 1'0U;\nD0.u32 = tmp.u32",
  SOP2Op.S_ADD_I32: 'tmp = S0.i32 + S1.i32;\nSCC = ((S0.u32[31] == S1.u32[31]) && (S0.u32[31] != tmp.u32[31]));\nD0.i32 = tmp.i32',
  SOP2Op.S_SUB_I32: 'tmp = S0.i32 - S1.i32;\nSCC = ((S0.u32[31] != S1.u32[31]) && (S0.u32[31] != tmp.u32[31]));\nD0.i32 = tmp.i32',
  SOP2Op.S_ADDC_U32: "tmp = 64'U(S0.u32) + 64'U(S1.u32) + SCC.u64;\nSCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U;\nD0.u32 = tmp.u32",
  SOP2Op.S_SUBB_U32: "tmp = S0.u32 - S1.u32 - SCC.u32;\nSCC = 64'U(S1.u32) + SCC.u64 > 64'U(S0.u32) ? 1'1U : 1'0U;\nD0.u32 = tmp.u32",
  SOP2Op.S_ABSDIFF_I32: 'D0.i32 = S0.i32 - S1.i32;\nif D0.i32 < 0 then\nD0.i32 = -D0.i32\nendif;\nSCC = D0.i32 != 0',
  SOP2Op.S_LSHL_B32: 'D0.u32 = (S0.u32 << S1[4 : 0].u32);\nSCC = D0.u32 != 0U',
  SOP2Op.S_LSHL_B64: 'D0.u64 = (S0.u64 << S1[5 : 0].u32);\nSCC = D0.u64 != 0ULL',
  SOP2Op.S_LSHR_B32: 'D0.u32 = (S0.u32 >> S1[4 : 0].u32);\nSCC = D0.u32 != 0U',
  SOP2Op.S_LSHR_B64: 'D0.u64 = (S0.u64 >> S1[5 : 0].u32);\nSCC = D0.u64 != 0ULL',
  SOP2Op.S_ASHR_I32: "D0.i32 = 32'I(signext(S0.i32) >> S1[4 : 0].u32);\nSCC = D0.i32 != 0",
  SOP2Op.S_ASHR_I64: 'D0.i64 = (signext(S0.i64) >> S1[5 : 0].u32);\nSCC = D0.i64 != 0LL',
  SOP2Op.S_LSHL1_ADD_U32: "tmp = (64'U(S0.u32) << 1U) + 64'U(S1.u32);\nSCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U;\nD0.u32 = tmp.u32",
  SOP2Op.S_LSHL2_ADD_U32: "tmp = (64'U(S0.u32) << 2U) + 64'U(S1.u32);\nSCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U;\nD0.u32 = tmp.u32",
  SOP2Op.S_LSHL3_ADD_U32: "tmp = (64'U(S0.u32) << 3U) + 64'U(S1.u32);\nSCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U;\nD0.u32 = tmp.u32",
  SOP2Op.S_LSHL4_ADD_U32: "tmp = (64'U(S0.u32) << 4U) + 64'U(S1.u32);\nSCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U;\nD0.u32 = tmp.u32",
  SOP2Op.S_MIN_I32: 'SCC = S0.i32 < S1.i32;\nD0.i32 = SCC ? S0.i32 : S1.i32',
  SOP2Op.S_MIN_U32: 'SCC = S0.u32 < S1.u32;\nD0.u32 = SCC ? S0.u32 : S1.u32',
  SOP2Op.S_MAX_I32: 'SCC = S0.i32 >= S1.i32;\nD0.i32 = SCC ? S0.i32 : S1.i32',
  SOP2Op.S_MAX_U32: 'SCC = S0.u32 >= S1.u32;\nD0.u32 = SCC ? S0.u32 : S1.u32',
  SOP2Op.S_AND_B32: 'D0.u32 = (S0.u32 & S1.u32);\nSCC = D0.u32 != 0U',
  SOP2Op.S_AND_B64: 'D0.u64 = (S0.u64 & S1.u64);\nSCC = D0.u64 != 0ULL',
  SOP2Op.S_OR_B32: 'D0.u32 = (S0.u32 | S1.u32);\nSCC = D0.u32 != 0U',
  SOP2Op.S_OR_B64: 'D0.u64 = (S0.u64 | S1.u64);\nSCC = D0.u64 != 0ULL',
  SOP2Op.S_XOR_B32: 'D0.u32 = (S0.u32 ^ S1.u32);\nSCC = D0.u32 != 0U',
  SOP2Op.S_XOR_B64: 'D0.u64 = (S0.u64 ^ S1.u64);\nSCC = D0.u64 != 0ULL',
  SOP2Op.S_NAND_B32: 'D0.u32 = ~(S0.u32 & S1.u32);\nSCC = D0.u32 != 0U',
  SOP2Op.S_NAND_B64: 'D0.u64 = ~(S0.u64 & S1.u64);\nSCC = D0.u64 != 0ULL',
  SOP2Op.S_NOR_B32: 'D0.u32 = ~(S0.u32 | S1.u32);\nSCC = D0.u32 != 0U',
  SOP2Op.S_NOR_B64: 'D0.u64 = ~(S0.u64 | S1.u64);\nSCC = D0.u64 != 0ULL',
  SOP2Op.S_XNOR_B32: 'D0.u32 = ~(S0.u32 ^ S1.u32);\nSCC = D0.u32 != 0U',
  SOP2Op.S_XNOR_B64: 'D0.u64 = ~(S0.u64 ^ S1.u64);\nSCC = D0.u64 != 0ULL',
  SOP2Op.S_AND_NOT1_B32: 'D0.u32 = (S0.u32 & ~S1.u32);\nSCC = D0.u32 != 0U',
  SOP2Op.S_AND_NOT1_B64: 'D0.u64 = (S0.u64 & ~S1.u64);\nSCC = D0.u64 != 0ULL',
  SOP2Op.S_OR_NOT1_B32: 'D0.u32 = (S0.u32 | ~S1.u32);\nSCC = D0.u32 != 0U',
  SOP2Op.S_OR_NOT1_B64: 'D0.u64 = (S0.u64 | ~S1.u64);\nSCC = D0.u64 != 0ULL',
  SOP2Op.S_BFE_U32: 'D0.u32 = ((S0.u32 >> S1[4 : 0].u32) & ((1U << S1[22 : 16].u32) - 1U));\nSCC = D0.u32 != 0U',
  SOP2Op.S_BFE_I32: 'tmp.i32 = ((S0.i32 >> S1[4 : 0].u32) & ((1 << S1[22 : 16].u32) - 1));\nD0.i32 = signext_from_bit(tmp.i32, S1[22 : 16].u32);\nSCC = D0.i32 != 0',
  SOP2Op.S_BFE_U64: 'D0.u64 = ((S0.u64 >> S1[5 : 0].u32) & ((1ULL << S1[22 : 16].u32) - 1ULL));\nSCC = D0.u64 != 0ULL',
  SOP2Op.S_BFE_I64: 'tmp.i64 = ((S0.i64 >> S1[5 : 0].u32) & ((1LL << S1[22 : 16].u32) - 1LL));\nD0.i64 = signext_from_bit(tmp.i64, S1[22 : 16].u32);\nSCC = D0.i64 != 0LL',
  SOP2Op.S_BFM_B32: 'D0.u32 = (((1U << S0[4 : 0].u32) - 1U) << S1[4 : 0].u32)',
  SOP2Op.S_BFM_B64: 'D0.u64 = (((1ULL << S0[5 : 0].u32) - 1ULL) << S1[5 : 0].u32)',
  SOP2Op.S_MUL_I32: 'D0.i32 = S0.i32 * S1.i32',
  SOP2Op.S_MUL_HI_U32: "D0.u32 = 32'U((64'U(S0.u32) * 64'U(S1.u32)) >> 32U)",
  SOP2Op.S_MUL_HI_I32: "D0.i32 = 32'I((64'I(S0.i32) * 64'I(S1.i32)) >> 32U)",
  SOP2Op.S_CSELECT_B32: 'D0.u32 = SCC ? S0.u32 : S1.u32',
  SOP2Op.S_CSELECT_B64: 'D0.u64 = SCC ? S0.u64 : S1.u64',
  SOP2Op.S_PACK_LL_B32_B16: 'D0 = { S1[15 : 0].u16, S0[15 : 0].u16 }',
  SOP2Op.S_PACK_LH_B32_B16: 'D0 = { S1[31 : 16].u16, S0[15 : 0].u16 }',
  SOP2Op.S_PACK_HH_B32_B16: 'D0 = { S1[31 : 16].u16, S0[31 : 16].u16 }',
  SOP2Op.S_PACK_HL_B32_B16: 'D0 = { S1[15 : 0].u16, S0[31 : 16].u16 }',
  SOP2Op.S_ADD_F32: 'D0.f32 = S0.f32 + S1.f32',
  SOP2Op.S_SUB_F32: 'D0.f32 = S0.f32 - S1.f32',
  SOP2Op.S_MIN_F32: "// Version of comparison where -0.0 < +0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(64'F(S0.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S0.f32)))\nelsif isSignalNAN(64'F(S1.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32)))\nelsif isQuietNAN(64'F(S1.f32)) then\nD0.f32 = S0.f32\nelsif isQuietNAN(64'F(S0.f32)) then\nD0.f32 = S1.f32\nelsif LT_NEG_ZERO(S0.f32, S1.f32) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f32 = S0.f32\nelse\nD0.f32 = S1.f32\nendif\nelse\nif isNAN(64'F(S1.f32)) then\nD0.f32 = S0.f32\nelsif isNAN(64'F(S0.f32)) then\nD0.f32 = S1.f32\nelsif LT_NEG_ZERO(S0.f32, S1.f32) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f32 = S0.f32\nelse\nD0.f32 = S1.f32\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE",
  SOP2Op.S_MAX_F32: "// Version of comparison where +0.0 > -0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(64'F(S0.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S0.f32)))\nelsif isSignalNAN(64'F(S1.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32)))\nelsif isQuietNAN(64'F(S1.f32)) then\nD0.f32 = S0.f32\nelsif isQuietNAN(64'F(S0.f32)) then\nD0.f32 = S1.f32\nelsif GT_NEG_ZERO(S0.f32, S1.f32) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f32 = S0.f32\nelse\nD0.f32 = S1.f32\nendif\nelse\nif isNAN(64'F(S1.f32)) then\nD0.f32 = S0.f32\nelsif isNAN(64'F(S0.f32)) then\nD0.f32 = S1.f32\nelsif GT_NEG_ZERO(S0.f32, S1.f32) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f32 = S0.f32\nelse\nD0.f32 = S1.f32\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE",
  SOP2Op.S_MUL_F32: 'D0.f32 = S0.f32 * S1.f32',
  SOP2Op.S_FMAAK_F32: 'D0.f32 = fma(S0.f32, S1.f32, SIMM32.f32)',
  SOP2Op.S_FMAMK_F32: 'D0.f32 = fma(S0.f32, SIMM32.f32, S1.f32)',
  SOP2Op.S_FMAC_F32: 'D0.f32 = fma(S0.f32, S1.f32, D0.f32)',
  SOP2Op.S_CVT_PK_RTZ_F16_F32: 'prev_mode = ROUND_MODE;\ntmp[15 : 0].f16 = f32_to_f16(S0.f32);\ntmp[31 : 16].f16 = f32_to_f16(S1.f32);',
  SOP2Op.S_ADD_F16: 'D0.f16 = S0.f16 + S1.f16',
  SOP2Op.S_SUB_F16: 'D0.f16 = S0.f16 - S1.f16',
  SOP2Op.S_MIN_F16: "// Version of comparison where -0.0 < +0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(64'F(S0.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S0.f16)))\nelsif isSignalNAN(64'F(S1.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16)))\nelsif isQuietNAN(64'F(S1.f16)) then\nD0.f16 = S0.f16\nelsif isQuietNAN(64'F(S0.f16)) then\nD0.f16 = S1.f16\nelsif LT_NEG_ZERO(S0.f16, S1.f16) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f16 = S0.f16\nelse\nD0.f16 = S1.f16\nendif\nelse\nif isNAN(64'F(S1.f16)) then\nD0.f16 = S0.f16\nelsif isNAN(64'F(S0.f16)) then\nD0.f16 = S1.f16\nelsif LT_NEG_ZERO(S0.f16, S1.f16) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f16 = S0.f16\nelse\nD0.f16 = S1.f16\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE",
  SOP2Op.S_MAX_F16: "// Version of comparison where +0.0 > -0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(64'F(S0.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S0.f16)))\nelsif isSignalNAN(64'F(S1.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16)))\nelsif isQuietNAN(64'F(S1.f16)) then\nD0.f16 = S0.f16\nelsif isQuietNAN(64'F(S0.f16)) then\nD0.f16 = S1.f16\nelsif GT_NEG_ZERO(S0.f16, S1.f16) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f16 = S0.f16\nelse\nD0.f16 = S1.f16\nendif\nelse\nif isNAN(64'F(S1.f16)) then\nD0.f16 = S0.f16\nelsif isNAN(64'F(S0.f16)) then\nD0.f16 = S1.f16\nelsif GT_NEG_ZERO(S0.f16, S1.f16) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f16 = S0.f16\nelse\nD0.f16 = S1.f16\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE",
  SOP2Op.S_MUL_F16: 'D0.f16 = S0.f16 * S1.f16',
  SOP2Op.S_FMAC_F16: 'D0.f16 = fma(S0.f16, S1.f16, D0.f16)',
}

SOPCOp_PCODE = {
  SOPCOp.S_CMP_EQ_I32: 'SCC = S0.i32 == S1.i32',
  SOPCOp.S_CMP_LG_I32: 'SCC = S0.i32 <> S1.i32',
  SOPCOp.S_CMP_GT_I32: 'SCC = S0.i32 > S1.i32',
  SOPCOp.S_CMP_GE_I32: 'SCC = S0.i32 >= S1.i32',
  SOPCOp.S_CMP_LT_I32: 'SCC = S0.i32 < S1.i32',
  SOPCOp.S_CMP_LE_I32: 'SCC = S0.i32 <= S1.i32',
  SOPCOp.S_CMP_EQ_U32: 'SCC = S0.u32 == S1.u32',
  SOPCOp.S_CMP_LG_U32: 'SCC = S0.u32 <> S1.u32',
  SOPCOp.S_CMP_GT_U32: 'SCC = S0.u32 > S1.u32',
  SOPCOp.S_CMP_GE_U32: 'SCC = S0.u32 >= S1.u32',
  SOPCOp.S_CMP_LT_U32: 'SCC = S0.u32 < S1.u32',
  SOPCOp.S_CMP_LE_U32: 'SCC = S0.u32 <= S1.u32',
  SOPCOp.S_BITCMP0_B32: "SCC = S0.u32[S1.u32[4 : 0]] == 1'0U",
  SOPCOp.S_BITCMP1_B32: "SCC = S0.u32[S1.u32[4 : 0]] == 1'1U",
  SOPCOp.S_BITCMP0_B64: "SCC = S0.u64[S1.u32[5 : 0]] == 1'0U",
  SOPCOp.S_BITCMP1_B64: "SCC = S0.u64[S1.u32[5 : 0]] == 1'1U",
  SOPCOp.S_CMP_EQ_U64: 'SCC = S0.u64 == S1.u64',
  SOPCOp.S_CMP_LG_U64: 'SCC = S0.u64 <> S1.u64',
  SOPCOp.S_CMP_LT_F32: 'SCC = S0.f32 < S1.f32',
  SOPCOp.S_CMP_LT_F16: 'SCC = S0.f16 < S1.f16',
  SOPCOp.S_CMP_EQ_F32: 'SCC = S0.f32 == S1.f32',
  SOPCOp.S_CMP_EQ_F16: 'SCC = S0.f16 == S1.f16',
  SOPCOp.S_CMP_LE_F32: 'SCC = S0.f32 <= S1.f32',
  SOPCOp.S_CMP_LE_F16: 'SCC = S0.f16 <= S1.f16',
  SOPCOp.S_CMP_GT_F32: 'SCC = S0.f32 > S1.f32',
  SOPCOp.S_CMP_GT_F16: 'SCC = S0.f16 > S1.f16',
  SOPCOp.S_CMP_LG_F32: 'SCC = S0.f32 <> S1.f32',
  SOPCOp.S_CMP_LG_F16: 'SCC = S0.f16 <> S1.f16',
  SOPCOp.S_CMP_GE_F32: 'SCC = S0.f32 >= S1.f32',
  SOPCOp.S_CMP_GE_F16: 'SCC = S0.f16 >= S1.f16',
  SOPCOp.S_CMP_O_F32: "SCC = (!isNAN(64'F(S0.f32)) && !isNAN(64'F(S1.f32)))",
  SOPCOp.S_CMP_O_F16: "SCC = (!isNAN(64'F(S0.f16)) && !isNAN(64'F(S1.f16)))",
  SOPCOp.S_CMP_U_F32: "SCC = (isNAN(64'F(S0.f32)) || isNAN(64'F(S1.f32)))",
  SOPCOp.S_CMP_U_F16: "SCC = (isNAN(64'F(S0.f16)) || isNAN(64'F(S1.f16)))",
  SOPCOp.S_CMP_NGE_F32: 'SCC = !(S0.f32 >= S1.f32);\n// With NAN inputs this is not the same operation as <',
  SOPCOp.S_CMP_NGE_F16: 'SCC = !(S0.f16 >= S1.f16);\n// With NAN inputs this is not the same operation as <',
  SOPCOp.S_CMP_NLG_F32: 'SCC = !(S0.f32 <> S1.f32);\n// With NAN inputs this is not the same operation as ==',
  SOPCOp.S_CMP_NLG_F16: 'SCC = !(S0.f16 <> S1.f16);\n// With NAN inputs this is not the same operation as ==',
  SOPCOp.S_CMP_NGT_F32: 'SCC = !(S0.f32 > S1.f32);\n// With NAN inputs this is not the same operation as <=',
  SOPCOp.S_CMP_NGT_F16: 'SCC = !(S0.f16 > S1.f16);\n// With NAN inputs this is not the same operation as <=',
  SOPCOp.S_CMP_NLE_F32: 'SCC = !(S0.f32 <= S1.f32);\n// With NAN inputs this is not the same operation as >',
  SOPCOp.S_CMP_NLE_F16: 'SCC = !(S0.f16 <= S1.f16);\n// With NAN inputs this is not the same operation as >',
  SOPCOp.S_CMP_NEQ_F32: 'SCC = !(S0.f32 == S1.f32);\n// With NAN inputs this is not the same operation as !=',
  SOPCOp.S_CMP_NEQ_F16: 'SCC = !(S0.f16 == S1.f16);\n// With NAN inputs this is not the same operation as !=',
  SOPCOp.S_CMP_NLT_F32: 'SCC = !(S0.f32 < S1.f32);\n// With NAN inputs this is not the same operation as >=',
  SOPCOp.S_CMP_NLT_F16: 'SCC = !(S0.f16 < S1.f16);\n// With NAN inputs this is not the same operation as >=',
}

SOPKOp_PCODE = {
  SOPKOp.S_MOVK_I32: "D0.i32 = 32'I(signext(SIMM16.i16))",
  SOPKOp.S_VERSION: '// Do nothing - for use by tools only',
  SOPKOp.S_CMOVK_I32: "if SCC then\nD0.i32 = 32'I(signext(SIMM16.i16))\nendif",
  SOPKOp.S_CMPK_EQ_I32: "SCC = 64'I(S0.i32) == signext(SIMM16.i16)",
  SOPKOp.S_CMPK_LG_I32: "SCC = 64'I(S0.i32) != signext(SIMM16.i16)",
  SOPKOp.S_CMPK_GT_I32: "SCC = 64'I(S0.i32) > signext(SIMM16.i16)",
  SOPKOp.S_CMPK_GE_I32: "SCC = 64'I(S0.i32) >= signext(SIMM16.i16)",
  SOPKOp.S_CMPK_LT_I32: "SCC = 64'I(S0.i32) < signext(SIMM16.i16)",
  SOPKOp.S_CMPK_LE_I32: "SCC = 64'I(S0.i32) <= signext(SIMM16.i16)",
  SOPKOp.S_CMPK_EQ_U32: "SCC = S0.u32 == 32'U(SIMM16.u16)",
  SOPKOp.S_CMPK_LG_U32: "SCC = S0.u32 != 32'U(SIMM16.u16)",
  SOPKOp.S_CMPK_GT_U32: "SCC = S0.u32 > 32'U(SIMM16.u16)",
  SOPKOp.S_CMPK_GE_U32: "SCC = S0.u32 >= 32'U(SIMM16.u16)",
  SOPKOp.S_CMPK_LT_U32: "SCC = S0.u32 < 32'U(SIMM16.u16)",
  SOPKOp.S_CMPK_LE_U32: "SCC = S0.u32 <= 32'U(SIMM16.u16)",
  SOPKOp.S_ADDK_I32: "tmp = D0.i32;\nD0.i32 = 32'I(64'I(D0.i32) + signext(SIMM16.i16));\nSCC = ((tmp[31] == SIMM16.i16[15]) && (tmp[31] != D0.i32[31]));",
  SOPKOp.S_MULK_I32: "D0.i32 = 32'I(64'I(D0.i32) * signext(SIMM16.i16))",
  SOPKOp.S_GETREG_B32: "OFFSET = SIMM16[10:6]\noffset = SIMM16.u16[10 : 6];\nsize = SIMM16.u16[15 : 11].u32 + 1U;\n// logical size is in range 1:32\nvalue = HW_REGISTERS[hwRegId];\nD0.u32 = 32'U(32'I(value >> offset.u32) & ((1 << size) - 1))",
  SOPKOp.S_SETREG_B32: "OFFSET = SIMM16[10:6]\noffset = SIMM16.u16[10 : 6];\nsize = SIMM16.u16[15 : 11].u32 + 1U;\n// logical size is in range 1:32\nmask = (1 << size) - 1;\nmask = (mask & 32'I(writeableBitMask(hwRegId.u32, WAVE_STATUS.PRIV)));\n// Mask of bits we are allowed to modify\nvalue = ((S0.u32 << offset.u32) & mask.u32);\nvalue = (value | 32'U(HW_REGISTERS[hwRegId].i32 & ~mask));\n// Side-effects may trigger here if certain bits are modified",
  SOPKOp.S_SETREG_IMM32_B32: "OFFSET = SIMM16[10:6]\noffset = SIMM16.u16[10 : 6];\nsize = SIMM16.u16[15 : 11].u32 + 1U;\n// logical size is in range 1:32\nmask = (1 << size) - 1;\nmask = (mask & 32'I(writeableBitMask(hwRegId.u32, WAVE_STATUS.PRIV)));\n// Mask of bits we are allowed to modify\nvalue = ((SIMM32.u32 << offset.u32) & mask.u32);\nvalue = (value | 32'U(HW_REGISTERS[hwRegId].i32 & ~mask));\n// Side-effects may trigger here if certain bits are modified",
  SOPKOp.S_CALL_B64: "D0.i64 = PC + 4LL;\nPC = PC + signext(SIMM16.i16 * 16'4) + 4LL",
  SOPKOp.S_WAITCNT_VSCNT: 'vscnt <= S0.u[5:0] + S1.u[5:0].\n// Comparison is 6 bits, no clamping is applied for add overflow',
  SOPKOp.S_WAITCNT_VMCNT: 'vmcnt <= S0.u[5:0] + S1.u[5:0].\n// Comparison is 6 bits, no clamping is applied for add overflow',
  SOPKOp.S_WAITCNT_EXPCNT: 'expcnt <= S0.u[2:0] + S1.u[2:0].\n// Comparison is 3 bits, no clamping is applied for add overflow',
  SOPKOp.S_WAITCNT_LGKMCNT: 'lgkmcnt <= S0.u[5:0] + S1.u[5:0].\n// Comparison is 6 bits, no clamping is applied for add overflow',
}

SOPPOp_PCODE = {
  SOPPOp.S_NOP: 'for i in 0U : SIMM16.u16[3 : 0].u32 do\nendfor',
  SOPPOp.S_SETHALT: 'When halt type control is set to 1 (FATAL HALT bit select): Set FATAL_HALT bit to value of SIMM16[0]; 1 =\nfatal_halt, 0 = clear FATAL_HALT bit. Setting the fatal_halt flag halts the shader in or outside of the trap',
  SOPPOp.S_DELAY_ALU: 'instruction may be omitted. For wave64 the compiler may not know the status of the EXEC mask and hence\n// 1 cycle delay here\n// 2 cycles delay here',
  SOPPOp.S_TRAP: '// PC passed into trap handler points to S_TRAP itself,\nPC = TBA.i64;\n// trap base address',
  SOPPOp.S_BRANCH: "PC = PC + signext(SIMM16.i16 * 16'4) + 4LL;",
  SOPPOp.S_CBRANCH_SCC0: "if SCC == 1'0U then\nPC = PC + signext(SIMM16.i16 * 16'4) + 4LL\nelse\nPC = PC + 4LL\nendif",
  SOPPOp.S_CBRANCH_SCC1: "if SCC == 1'1U then\nPC = PC + signext(SIMM16.i16 * 16'4) + 4LL\nelse\nPC = PC + 4LL\nendif",
  SOPPOp.S_CBRANCH_VCCZ: "If VCCZ is 1 then jump to a constant offset relative to the current PC.\nif VCCZ.u1 == 1'1U then\nPC = PC + signext(SIMM16.i16 * 16'4) + 4LL\nelse\nPC = PC + 4LL\nendif",
  SOPPOp.S_CBRANCH_VCCNZ: "If VCCZ is 0 then jump to a constant offset relative to the current PC.\nif VCCZ.u1 == 1'0U then\nPC = PC + signext(SIMM16.i16 * 16'4) + 4LL\nelse\nPC = PC + 4LL\nendif",
  SOPPOp.S_CBRANCH_EXECZ: "if EXECZ.u1 == 1'1U then\nPC = PC + signext(SIMM16.i16 * 16'4) + 4LL\nelse\nPC = PC + 4LL\nendif",
  SOPPOp.S_CBRANCH_EXECNZ: "if EXECZ.u1 == 1'0U then\nPC = PC + signext(SIMM16.i16 * 16'4) + 4LL\nelse\nPC = PC + 4LL\nendif",
  SOPPOp.S_CBRANCH_CDBGSYS: "if WAVE_STATUS.COND_DBG_SYS.u32 != 0U then\nPC = PC + signext(SIMM16.i16 * 16'4) + 4LL\nelse\nPC = PC + 4LL\nendif",
  SOPPOp.S_CBRANCH_CDBGUSER: "if WAVE_STATUS.COND_DBG_USER.u32 != 0U then\nPC = PC + signext(SIMM16.i16 * 16'4) + 4LL\nelse\nPC = PC + 4LL\nendif",
  SOPPOp.S_CBRANCH_CDBGSYS_OR_USER: "if (WAVE_STATUS.COND_DBG_SYS || WAVE_STATUS.COND_DBG_USER) then\nPC = PC + signext(SIMM16.i16 * 16'4) + 4LL\nelse\nPC = PC + 4LL\nendif",
  SOPPOp.S_CBRANCH_CDBGSYS_AND_USER: "if (WAVE_STATUS.COND_DBG_SYS && WAVE_STATUS.COND_DBG_USER) then\nPC = PC + signext(SIMM16.i16 * 16'4) + 4LL\nelse\nPC = PC + 4LL\nendif",
}

SMEMOp_PCODE = {
  SMEMOp.S_LOAD_B32: 'SDATA[31 : 0] = MEM[ADDR].b32',
  SMEMOp.S_LOAD_B64: 'SDATA[31 : 0] = MEM[ADDR].b32;\nSDATA[63 : 32] = MEM[ADDR + 4U].b32',
  SMEMOp.S_LOAD_B128: 'SDATA[31 : 0] = MEM[ADDR].b32;\nSDATA[63 : 32] = MEM[ADDR + 4U].b32;\nSDATA[95 : 64] = MEM[ADDR + 8U].b32;\nSDATA[127 : 96] = MEM[ADDR + 12U].b32',
  SMEMOp.S_LOAD_B256: 'SDATA[31 : 0] = MEM[ADDR].b32;\nSDATA[63 : 32] = MEM[ADDR + 4U].b32;\nSDATA[95 : 64] = MEM[ADDR + 8U].b32;\nSDATA[127 : 96] = MEM[ADDR + 12U].b32;\nSDATA[159 : 128] = MEM[ADDR + 16U].b32;\nSDATA[191 : 160] = MEM[ADDR + 20U].b32;\nSDATA[223 : 192] = MEM[ADDR + 24U].b32;\nSDATA[255 : 224] = MEM[ADDR + 28U].b32',
  SMEMOp.S_LOAD_B512: 'SDATA[31 : 0] = MEM[ADDR].b32;\nSDATA[63 : 32] = MEM[ADDR + 4U].b32;\nSDATA[95 : 64] = MEM[ADDR + 8U].b32;\nSDATA[127 : 96] = MEM[ADDR + 12U].b32;\nSDATA[159 : 128] = MEM[ADDR + 16U].b32;\nSDATA[191 : 160] = MEM[ADDR + 20U].b32;\nSDATA[223 : 192] = MEM[ADDR + 24U].b32;\nSDATA[255 : 224] = MEM[ADDR + 28U].b32;\nSDATA[287 : 256] = MEM[ADDR + 32U].b32;\nSDATA[319 : 288] = MEM[ADDR + 36U].b32;\nSDATA[351 : 320] = MEM[ADDR + 40U].b32;\nSDATA[383 : 352] = MEM[ADDR + 44U].b32;\nSDATA[415 : 384] = MEM[ADDR + 48U].b32;\nSDATA[447 : 416] = MEM[ADDR + 52U].b32;\nSDATA[479 : 448] = MEM[ADDR + 56U].b32;\nSDATA[511 : 480] = MEM[ADDR + 60U].b32',
  SMEMOp.S_BUFFER_LOAD_B32: 'SDATA[31 : 0] = MEM[ADDR].b32',
  SMEMOp.S_BUFFER_LOAD_B64: 'SDATA[31 : 0] = MEM[ADDR].b32;\nSDATA[63 : 32] = MEM[ADDR + 4U].b32',
  SMEMOp.S_BUFFER_LOAD_B128: 'SDATA[31 : 0] = MEM[ADDR].b32;\nSDATA[63 : 32] = MEM[ADDR + 4U].b32;\nSDATA[95 : 64] = MEM[ADDR + 8U].b32;\nSDATA[127 : 96] = MEM[ADDR + 12U].b32',
  SMEMOp.S_BUFFER_LOAD_B256: 'SDATA[31 : 0] = MEM[ADDR].b32;\nSDATA[63 : 32] = MEM[ADDR + 4U].b32;\nSDATA[95 : 64] = MEM[ADDR + 8U].b32;\nSDATA[127 : 96] = MEM[ADDR + 12U].b32;\nSDATA[159 : 128] = MEM[ADDR + 16U].b32;\nSDATA[191 : 160] = MEM[ADDR + 20U].b32;\nSDATA[223 : 192] = MEM[ADDR + 24U].b32;\nSDATA[255 : 224] = MEM[ADDR + 28U].b32',
  SMEMOp.S_BUFFER_LOAD_B512: 'SDATA[31 : 0] = MEM[ADDR].b32;\nSDATA[63 : 32] = MEM[ADDR + 4U].b32;\nSDATA[95 : 64] = MEM[ADDR + 8U].b32;\nSDATA[127 : 96] = MEM[ADDR + 12U].b32;\nSDATA[159 : 128] = MEM[ADDR + 16U].b32;\nSDATA[191 : 160] = MEM[ADDR + 20U].b32;\nSDATA[223 : 192] = MEM[ADDR + 24U].b32;\nSDATA[255 : 224] = MEM[ADDR + 28U].b32;\nSDATA[287 : 256] = MEM[ADDR + 32U].b32;\nSDATA[319 : 288] = MEM[ADDR + 36U].b32;\nSDATA[351 : 320] = MEM[ADDR + 40U].b32;\nSDATA[383 : 352] = MEM[ADDR + 44U].b32;\nSDATA[415 : 384] = MEM[ADDR + 48U].b32;\nSDATA[447 : 416] = MEM[ADDR + 52U].b32;\nSDATA[479 : 448] = MEM[ADDR + 56U].b32;\nSDATA[511 : 480] = MEM[ADDR + 60U].b32',
}

VOP1Op_PCODE = {
  VOP1Op.V_MOV_B32: 'D0.b32 = S0.b32',
  VOP1Op.V_READFIRSTLANE_B32: "declare lane : 32'U;\nif WAVE64 then\n// 64 lanes\nif EXEC == 0x0LL then\nlane = 0U;\n// Force lane 0 if all lanes are disabled\nelse\nlane = 32'U(s_ff1_i32_b64(EXEC));\n// Lowest active lane\nendif\nelse\n// 32 lanes\nif EXEC_LO.i32 == 0 then\nlane = 0U;\n// Force lane 0 if all lanes are disabled\nelse\nlane = 32'U(s_ff1_i32_b32(EXEC_LO));\n// Lowest active lane\nendif\nendif;\nD0.b32 = VGPR[lane][SRC0.u32]",
  VOP1Op.V_CVT_I32_F64: 'D0.i32 = f64_to_i32(S0.f64)',
  VOP1Op.V_CVT_F64_I32: 'D0.f64 = i32_to_f64(S0.i32)',
  VOP1Op.V_CVT_F32_I32: 'D0.f32 = i32_to_f32(S0.i32)',
  VOP1Op.V_CVT_F32_U32: 'D0.f32 = u32_to_f32(S0.u32)',
  VOP1Op.V_CVT_U32_F32: 'D0.u32 = f32_to_u32(S0.f32)',
  VOP1Op.V_CVT_I32_F32: 'D0.i32 = f32_to_i32(S0.f32)',
  VOP1Op.V_CVT_F16_F32: 'D0.f16 = f32_to_f16(S0.f32)',
  VOP1Op.V_CVT_F32_F16: 'D0.f32 = f16_to_f32(S0.f16)',
  VOP1Op.V_CVT_NEAREST_I32_F32: 'D0.i32 = f32_to_i32(floor(S0.f32 + 0.5F))',
  VOP1Op.V_CVT_FLOOR_I32_F32: 'D0.i32 = f32_to_i32(floor(S0.f32))',
  VOP1Op.V_CVT_OFF_F32_I4: "Used for interpolation in shader. Lookup table on S0[3:0]:\ndeclare CVT_OFF_TABLE : 32'F[16];\nD0.f32 = CVT_OFF_TABLE[S0.u32[3 : 0]]",
  VOP1Op.V_CVT_F32_F64: 'D0.f32 = f64_to_f32(S0.f64)',
  VOP1Op.V_CVT_F64_F32: 'D0.f64 = f32_to_f64(S0.f32)',
  VOP1Op.V_CVT_F32_UBYTE0: 'D0.f32 = u32_to_f32(S0[7 : 0].u32)',
  VOP1Op.V_CVT_F32_UBYTE1: 'D0.f32 = u32_to_f32(S0[15 : 8].u32)',
  VOP1Op.V_CVT_F32_UBYTE2: 'D0.f32 = u32_to_f32(S0[23 : 16].u32)',
  VOP1Op.V_CVT_F32_UBYTE3: 'D0.f32 = u32_to_f32(S0[31 : 24].u32)',
  VOP1Op.V_CVT_U32_F64: 'D0.u32 = f64_to_u32(S0.f64)',
  VOP1Op.V_CVT_F64_U32: 'D0.f64 = u32_to_f64(S0.u32)',
  VOP1Op.V_TRUNC_F64: 'D0.f64 = trunc(S0.f64)',
  VOP1Op.V_CEIL_F64: 'D0.f64 = trunc(S0.f64);\nif ((S0.f64 > 0.0) && (S0.f64 != D0.f64)) then\nD0.f64 += 1.0\nendif',
  VOP1Op.V_RNDNE_F64: 'D0.f64 = floor(S0.f64 + 0.5);\nif (isEven(floor(S0.f64)) && (fract(S0.f64) == 0.5)) then\nD0.f64 -= 1.0\nendif',
  VOP1Op.V_FLOOR_F64: 'D0.f64 = trunc(S0.f64);\nif ((S0.f64 < 0.0) && (S0.f64 != D0.f64)) then\nD0.f64 += -1.0\nendif',
  VOP1Op.V_MOV_B16: 'D0.b16 = S0.b16',
  VOP1Op.V_FRACT_F32: 'D0.f32 = S0.f32 + -floor(S0.f32)',
  VOP1Op.V_TRUNC_F32: 'D0.f32 = trunc(S0.f32)',
  VOP1Op.V_CEIL_F32: 'D0.f32 = trunc(S0.f32);\nif ((S0.f32 > 0.0F) && (S0.f32 != D0.f32)) then\nD0.f32 += 1.0F\nendif',
  VOP1Op.V_RNDNE_F32: "D0.f32 = floor(S0.f32 + 0.5F);\nif (isEven(64'F(floor(S0.f32))) && (fract(S0.f32) == 0.5F)) then\nD0.f32 -= 1.0F\nendif",
  VOP1Op.V_FLOOR_F32: 'D0.f32 = trunc(S0.f32);\nif ((S0.f32 < 0.0F) && (S0.f32 != D0.f32)) then\nD0.f32 += -1.0F\nendif',
  VOP1Op.V_EXP_F32: 'D0.f32 = pow(2.0F, S0.f32)',
  VOP1Op.V_LOG_F32: 'D0.f32 = log2(S0.f32)',
  VOP1Op.V_RCP_F32: 'D0.f32 = 1.0F / S0.f32',
  VOP1Op.V_RCP_IFLAG_F32: 'D0.f32 = 1.0F / S0.f32;\n// Can only raise integer DIV_BY_ZERO exception',
  VOP1Op.V_RSQ_F32: 'D0.f32 = 1.0F / sqrt(S0.f32)',
  VOP1Op.V_RCP_F64: 'D0.f64 = 1.0 / S0.f64',
  VOP1Op.V_RSQ_F64: 'D0.f64 = 1.0 / sqrt(S0.f64)',
  VOP1Op.V_SQRT_F32: 'D0.f32 = sqrt(S0.f32)',
  VOP1Op.V_SQRT_F64: 'D0.f64 = sqrt(S0.f64)',
  VOP1Op.V_SIN_F32: "D0.f32 = sin(S0.f32 * 32'F(PI * 2.0))",
  VOP1Op.V_COS_F32: "D0.f32 = cos(S0.f32 * 32'F(PI * 2.0))",
  VOP1Op.V_NOT_B32: 'D0.u32 = ~S0.u32',
  VOP1Op.V_BFREV_B32: 'D0.u32[31 : 0] = S0.u32[0 : 31]',
  VOP1Op.V_CLZ_I32_U32: "D0.i32 = -1;\n// Set if no ones are found\nfor i in 0 : 31 do\n// Search from MSB\nif S0.u32[31 - i] == 1'1U then\nD0.i32 = i;\nendif\nendfor",
  VOP1Op.V_CTZ_I32_B32: "D0.i32 = -1;\n// Set if no ones are found\nfor i in 0 : 31 do\n// Search from LSB\nif S0.u32[i] == 1'1U then\nD0.i32 = i;\nendif\nendfor",
  VOP1Op.V_CLS_I32: 'D0.i32 = -1;\n// Set if all bits are the same\nfor i in 1 : 31 do\n// Search from MSB\nif S0.i32[31 - i] != S0.i32[31] then\nD0.i32 = i;\nendif\nendfor',
  VOP1Op.V_FREXP_EXP_I32_F64: 'if ((S0.f64 == +INF) || (S0.f64 == -INF) || isNAN(S0.f64)) then\nD0.i32 = 0\nelse\nD0.i32 = exponent(S0.f64) - 1023 + 1\nendif',
  VOP1Op.V_FREXP_MANT_F64: 'if ((S0.f64 == +INF) || (S0.f64 == -INF) || isNAN(S0.f64)) then\nD0.f64 = S0.f64\nelse\nD0.f64 = mantissa(S0.f64)\nendif',
  VOP1Op.V_FRACT_F64: 'D0.f64 = S0.f64 + -floor(S0.f64)',
  VOP1Op.V_FREXP_EXP_I32_F32: "if ((64'F(S0.f32) == +INF) || (64'F(S0.f32) == -INF) || isNAN(64'F(S0.f32))) then\nD0.i32 = 0\nelse\nD0.i32 = exponent(S0.f32) - 127 + 1\nendif",
  VOP1Op.V_FREXP_MANT_F32: "if ((64'F(S0.f32) == +INF) || (64'F(S0.f32) == -INF) || isNAN(64'F(S0.f32))) then\nD0.f32 = S0.f32\nelse\nD0.f32 = mantissa(S0.f32)\nendif",
  VOP1Op.V_MOVRELD_B32: 'addr = DST.u32;\n// Raw value from instruction\nVGPR[laneId][addr].b32 = S0.b32',
  VOP1Op.V_MOVRELS_B32: 'addr = SRC0.u32;\n// Raw value from instruction\nD0.b32 = VGPR[laneId][addr].b32',
  VOP1Op.V_MOVRELSD_B32: 'addrs = SRC0.u32;\n// Raw value from instruction\naddrd = DST.u32;\n// Raw value from instruction',
  VOP1Op.V_MOVRELSD_2_B32: 'addrs = SRC0.u32;\n// Raw value from instruction\naddrd = DST.u32;\n// Raw value from instruction',
  VOP1Op.V_CVT_F16_U16: 'D0.f16 = u16_to_f16(S0.u16)',
  VOP1Op.V_CVT_F16_I16: 'D0.f16 = i16_to_f16(S0.i16)',
  VOP1Op.V_CVT_U16_F16: 'D0.u16 = f16_to_u16(S0.f16)',
  VOP1Op.V_CVT_I16_F16: 'D0.i16 = f16_to_i16(S0.f16)',
  VOP1Op.V_RCP_F16: "D0.f16 = 16'1.0 / S0.f16",
  VOP1Op.V_SQRT_F16: 'D0.f16 = sqrt(S0.f16)',
  VOP1Op.V_RSQ_F16: "D0.f16 = 16'1.0 / sqrt(S0.f16)",
  VOP1Op.V_LOG_F16: 'D0.f16 = log2(S0.f16)',
  VOP1Op.V_EXP_F16: "D0.f16 = pow(16'2.0, S0.f16)",
  VOP1Op.V_FREXP_MANT_F16: "if ((64'F(S0.f16) == +INF) || (64'F(S0.f16) == -INF) || isNAN(64'F(S0.f16))) then\nD0.f16 = S0.f16\nelse\nD0.f16 = mantissa(S0.f16)\nendif",
  VOP1Op.V_FREXP_EXP_I16_F16: "if ((64'F(S0.f16) == +INF) || (64'F(S0.f16) == -INF) || isNAN(64'F(S0.f16))) then\nD0.i16 = 16'0\nelse\nD0.i16 = 16'I(exponent(S0.f16) - 15 + 1)\nendif",
  VOP1Op.V_FLOOR_F16: "D0.f16 = trunc(S0.f16);\nif ((S0.f16 < 16'0.0) && (S0.f16 != D0.f16)) then\nD0.f16 += -16'1.0\nendif",
  VOP1Op.V_CEIL_F16: "D0.f16 = trunc(S0.f16);\nif ((S0.f16 > 16'0.0) && (S0.f16 != D0.f16)) then\nD0.f16 += 16'1.0\nendif",
  VOP1Op.V_TRUNC_F16: 'D0.f16 = trunc(S0.f16)',
  VOP1Op.V_RNDNE_F16: "D0.f16 = floor(S0.f16 + 16'0.5);\nif (isEven(64'F(floor(S0.f16))) && (fract(S0.f16) == 16'0.5)) then\nD0.f16 -= 16'1.0\nendif",
  VOP1Op.V_FRACT_F16: 'D0.f16 = S0.f16 + -floor(S0.f16)',
  VOP1Op.V_SIN_F16: "D0.f16 = sin(S0.f16 * 16'F(PI * 2.0))",
  VOP1Op.V_COS_F16: "D0.f16 = cos(S0.f16 * 16'F(PI * 2.0))",
  VOP1Op.V_SAT_PK_U8_I16: 'D0.b16 = { SAT8(S0[31 : 16].i16), SAT8(S0[15 : 0].i16) }',
  VOP1Op.V_CVT_NORM_I16_F16: 'D0.i16 = f16_to_snorm(S0.f16)',
  VOP1Op.V_CVT_NORM_U16_F16: 'D0.u16 = f16_to_unorm(S0.f16)',
  VOP1Op.V_SWAP_B32: 'tmp = D0.b32;\nD0.b32 = S0.b32;\nS0.b32 = tmp',
  VOP1Op.V_SWAP_B16: 'tmp = D0.b16;\nD0.b16 = S0.b16;\nS0.b16 = tmp',
  VOP1Op.V_PERMLANE64_B32: "declare tmp : 32'B[64];\ndeclare lane : 32'U;\nif WAVE32 then\n// Supported in wave64 ONLY; treated as scalar NOP in wave32\nelse\nfor lane in 0U : 63U do\n// Copy original S0 in case D==S0\ntmp[lane] = VGPR[lane][SRC0.u32]\nendfor;\nfor lane in 0U : 63U do\naltlane = { ~lane[5], lane[4 : 0] };\n// 0<->32, ..., 31<->63\nif EXEC[lane].u1 then\nVGPR[lane][VDST.u32] = tmp[altlane]\nendif\nendfor\nendif",
  VOP1Op.V_SWAPREL_B32: 'addrs = SRC0.u32;\n// Raw value from instruction\naddrd = DST.u32;\n// Raw value from instruction\ntmp = VGPR[laneId][addrd].b32;',
  VOP1Op.V_NOT_B16: 'D0.u16 = ~S0.u16',
  VOP1Op.V_CVT_I32_I16: "D0.i32 = 32'I(signext(S0.i16))",
  VOP1Op.V_CVT_U32_U16: "D0 = { 16'0, S0.u16 }",
}

VOP2Op_PCODE = {
  VOP2Op.V_CNDMASK_B32: 'D0.u32 = VCC.u64[laneId] ? S1.u32 : S0.u32',
  VOP2Op.V_DOT2ACC_F32_F16: 'tmp = D0.f32;\ntmp += f16_to_f32(S0[15 : 0].f16) * f16_to_f32(S1[15 : 0].f16);\ntmp += f16_to_f32(S0[31 : 16].f16) * f16_to_f32(S1[31 : 16].f16);\nD0.f32 = tmp',
  VOP2Op.V_ADD_F32: 'D0.f32 = S0.f32 + S1.f32',
  VOP2Op.V_SUB_F32: 'D0.f32 = S0.f32 - S1.f32',
  VOP2Op.V_SUBREV_F32: 'D0.f32 = S1.f32 - S0.f32',
  VOP2Op.V_FMAC_DX9_ZERO_F32: "if ((64'F(S0.f32) == 0.0) || (64'F(S1.f32) == 0.0)) then\n// DX9 rules, 0.0 * x = 0.0\nD0.f32 = S2.f32\nelse\nD0.f32 = fma(S0.f32, S1.f32, D0.f32)\nendif",
  VOP2Op.V_MUL_DX9_ZERO_F32: "if ((64'F(S0.f32) == 0.0) || (64'F(S1.f32) == 0.0)) then\n// DX9 rules, 0.0 * x = 0.0\nD0.f32 = 0.0F\nelse\nD0.f32 = S0.f32 * S1.f32\nendif",
  VOP2Op.V_MUL_F32: 'D0.f32 = S0.f32 * S1.f32',
  VOP2Op.V_MUL_I32_I24: "D0.i32 = 32'I(S0.i24) * 32'I(S1.i24)",
  VOP2Op.V_MUL_HI_I32_I24: "D0.i32 = 32'I((64'I(S0.i24) * 64'I(S1.i24)) >> 32U)",
  VOP2Op.V_MUL_U32_U24: "D0.u32 = 32'U(S0.u24) * 32'U(S1.u24)",
  VOP2Op.V_MUL_HI_U32_U24: "D0.u32 = 32'U((64'U(S0.u24) * 64'U(S1.u24)) >> 32U)",
  VOP2Op.V_MIN_F32: "// Version of comparison where -0.0 < +0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(64'F(S0.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S0.f32)))\nelsif isSignalNAN(64'F(S1.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32)))\nelsif isQuietNAN(64'F(S1.f32)) then\nD0.f32 = S0.f32\nelsif isQuietNAN(64'F(S0.f32)) then\nD0.f32 = S1.f32\nelsif LT_NEG_ZERO(S0.f32, S1.f32) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f32 = S0.f32\nelse\nD0.f32 = S1.f32\nendif\nelse\nif isNAN(64'F(S1.f32)) then\nD0.f32 = S0.f32\nelsif isNAN(64'F(S0.f32)) then\nD0.f32 = S1.f32\nelsif LT_NEG_ZERO(S0.f32, S1.f32) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f32 = S0.f32\nelse\nD0.f32 = S1.f32\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE",
  VOP2Op.V_MAX_F32: "// Version of comparison where +0.0 > -0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(64'F(S0.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S0.f32)))\nelsif isSignalNAN(64'F(S1.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32)))\nelsif isQuietNAN(64'F(S1.f32)) then\nD0.f32 = S0.f32\nelsif isQuietNAN(64'F(S0.f32)) then\nD0.f32 = S1.f32\nelsif GT_NEG_ZERO(S0.f32, S1.f32) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f32 = S0.f32\nelse\nD0.f32 = S1.f32\nendif\nelse\nif isNAN(64'F(S1.f32)) then\nD0.f32 = S0.f32\nelsif isNAN(64'F(S0.f32)) then\nD0.f32 = S1.f32\nelsif GT_NEG_ZERO(S0.f32, S1.f32) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f32 = S0.f32\nelse\nD0.f32 = S1.f32\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE",
  VOP2Op.V_MIN_I32: 'D0.i32 = S0.i32 < S1.i32 ? S0.i32 : S1.i32',
  VOP2Op.V_MAX_I32: 'D0.i32 = S0.i32 >= S1.i32 ? S0.i32 : S1.i32',
  VOP2Op.V_MIN_U32: 'D0.u32 = S0.u32 < S1.u32 ? S0.u32 : S1.u32',
  VOP2Op.V_MAX_U32: 'D0.u32 = S0.u32 >= S1.u32 ? S0.u32 : S1.u32',
  VOP2Op.V_LSHLREV_B32: 'D0.u32 = (S1.u32 << S0[4 : 0].u32)',
  VOP2Op.V_LSHRREV_B32: 'D0.u32 = (S1.u32 >> S0[4 : 0].u32)',
  VOP2Op.V_ASHRREV_I32: 'D0.i32 = (S1.i32 >> S0[4 : 0].u32)',
  VOP2Op.V_AND_B32: 'D0.u32 = (S0.u32 & S1.u32)',
  VOP2Op.V_OR_B32: 'D0.u32 = (S0.u32 | S1.u32)',
  VOP2Op.V_XOR_B32: 'D0.u32 = (S0.u32 ^ S1.u32)',
  VOP2Op.V_XNOR_B32: 'D0.u32 = ~(S0.u32 ^ S1.u32)',
  VOP2Op.V_ADD_CO_CI_U32: "tmp = 64'U(S0.u32) + 64'U(S1.u32) + VCC.u64[laneId].u64;\nVCC.u64[laneId] = tmp >= 0x100000000ULL ? 1'1U : 1'0U;\n// VCC is an UNSIGNED overflow/carry-out for V_ADD_CO_CI_U32.\nD0.u32 = tmp.u32",
  VOP2Op.V_SUB_CO_CI_U32: "tmp = S0.u32 - S1.u32 - VCC.u64[laneId].u32;\nVCC.u64[laneId] = 64'U(S1.u32) + VCC.u64[laneId].u64 > 64'U(S0.u32) ? 1'1U : 1'0U;\n// VCC is an UNSIGNED overflow/carry-out for V_SUB_CO_CI_U32.\nD0.u32 = tmp.u32",
  VOP2Op.V_SUBREV_CO_CI_U32: "tmp = S1.u32 - S0.u32 - VCC.u64[laneId].u32;\nVCC.u64[laneId] = 64'U(S0.u32) + VCC.u64[laneId].u64 > 64'U(S1.u32) ? 1'1U : 1'0U;\n// VCC is an UNSIGNED overflow/carry-out for V_SUB_CO_CI_U32.\nD0.u32 = tmp.u32",
  VOP2Op.V_ADD_NC_U32: 'D0.u32 = S0.u32 + S1.u32',
  VOP2Op.V_SUB_NC_U32: 'D0.u32 = S0.u32 - S1.u32',
  VOP2Op.V_SUBREV_NC_U32: 'D0.u32 = S1.u32 - S0.u32',
  VOP2Op.V_FMAC_F32: 'D0.f32 = fma(S0.f32, S1.f32, D0.f32)',
  VOP2Op.V_FMAMK_F32: 'D0.f32 = fma(S0.f32, SIMM32.f32, S1.f32)',
  VOP2Op.V_FMAAK_F32: 'D0.f32 = fma(S0.f32, S1.f32, SIMM32.f32)',
  VOP2Op.V_CVT_PK_RTZ_F16_F32: 'prev_mode = ROUND_MODE;\ntmp[15 : 0].f16 = f32_to_f16(S0.f32);\ntmp[31 : 16].f16 = f32_to_f16(S1.f32);',
  VOP2Op.V_ADD_F16: 'D0.f16 = S0.f16 + S1.f16',
  VOP2Op.V_SUB_F16: 'D0.f16 = S0.f16 - S1.f16',
  VOP2Op.V_SUBREV_F16: 'D0.f16 = S1.f16 - S0.f16',
  VOP2Op.V_MUL_F16: 'D0.f16 = S0.f16 * S1.f16',
  VOP2Op.V_FMAC_F16: 'D0.f16 = fma(S0.f16, S1.f16, D0.f16)',
  VOP2Op.V_FMAMK_F16: 'D0.f16 = fma(S0.f16, SIMM32.f16, S1.f16)',
  VOP2Op.V_FMAAK_F16: 'D0.f16 = fma(S0.f16, S1.f16, SIMM32.f16)',
  VOP2Op.V_MAX_F16: "// Version of comparison where +0.0 > -0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(64'F(S0.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S0.f16)))\nelsif isSignalNAN(64'F(S1.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16)))\nelsif isQuietNAN(64'F(S1.f16)) then\nD0.f16 = S0.f16\nelsif isQuietNAN(64'F(S0.f16)) then\nD0.f16 = S1.f16\nelsif GT_NEG_ZERO(S0.f16, S1.f16) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f16 = S0.f16\nelse\nD0.f16 = S1.f16\nendif\nelse\nif isNAN(64'F(S1.f16)) then\nD0.f16 = S0.f16\nelsif isNAN(64'F(S0.f16)) then\nD0.f16 = S1.f16\nelsif GT_NEG_ZERO(S0.f16, S1.f16) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f16 = S0.f16\nelse\nD0.f16 = S1.f16\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE",
  VOP2Op.V_MIN_F16: "// Version of comparison where -0.0 < +0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(64'F(S0.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S0.f16)))\nelsif isSignalNAN(64'F(S1.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16)))\nelsif isQuietNAN(64'F(S1.f16)) then\nD0.f16 = S0.f16\nelsif isQuietNAN(64'F(S0.f16)) then\nD0.f16 = S1.f16\nelsif LT_NEG_ZERO(S0.f16, S1.f16) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f16 = S0.f16\nelse\nD0.f16 = S1.f16\nendif\nelse\nif isNAN(64'F(S1.f16)) then\nD0.f16 = S0.f16\nelsif isNAN(64'F(S0.f16)) then\nD0.f16 = S1.f16\nelsif LT_NEG_ZERO(S0.f16, S1.f16) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f16 = S0.f16\nelse\nD0.f16 = S1.f16\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE",
  VOP2Op.V_LDEXP_F16: "D0.f16 = S0.f16 * 16'F(2.0F ** 32'I(S1.i16))",
  VOP2Op.V_PK_FMAC_F16: 'D0[31 : 16].f16 = fma(S0[31 : 16].f16, S1[31 : 16].f16, D0[31 : 16].f16);\nD0[15 : 0].f16 = fma(S0[15 : 0].f16, S1[15 : 0].f16, D0[15 : 0].f16)',
}

VOP3Op_PCODE = {
  VOP3Op.V_CMP_F_F16: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_LT_F16: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.f16 < S1.f16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_EQ_F16: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.f16 == S1.f16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LE_F16: 'D0.u64[laneId] = S0.f16 <= S1.f16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GT_F16: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.f16 > S1.f16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LG_F16: 'D0.u64[laneId] = S0.f16 <> S1.f16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GE_F16: 'D0.u64[laneId] = S0.f16 >= S1.f16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_O_F16: "Set the per-lane condition code to 1 iff the first input is orderable to the second input. Store the result into VCC\nD0.u64[laneId] = (!isNAN(64'F(S0.f16)) && !isNAN(64'F(S1.f16)));\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_U_F16: "VCC or a scalar register.\nD0.u64[laneId] = (isNAN(64'F(S0.f16)) || isNAN(64'F(S1.f16)));\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_NGE_F16: 'D0.u64[laneId] = !(S0.f16 >= S1.f16);\n// With NAN inputs this is not the same operation as <\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NLG_F16: 'D0.u64[laneId] = !(S0.f16 <> S1.f16);\n// With NAN inputs this is not the same operation as ==\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NGT_F16: 'VCC or a scalar register.\nD0.u64[laneId] = !(S0.f16 > S1.f16);\n// With NAN inputs this is not the same operation as <=\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NLE_F16: 'D0.u64[laneId] = !(S0.f16 <= S1.f16);\n// With NAN inputs this is not the same operation as >\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NEQ_F16: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = !(S0.f16 == S1.f16);\n// With NAN inputs this is not the same operation as !=\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NLT_F16: 'Set the per-lane condition code to 1 iff the first input is not less than the second input. Store the result into VCC\nD0.u64[laneId] = !(S0.f16 < S1.f16);\n// With NAN inputs this is not the same operation as >=\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_T_F16: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_F_F32: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_LT_F32: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.f32 < S1.f32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_EQ_F32: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.f32 == S1.f32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LE_F32: 'D0.u64[laneId] = S0.f32 <= S1.f32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GT_F32: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.f32 > S1.f32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LG_F32: 'D0.u64[laneId] = S0.f32 <> S1.f32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GE_F32: 'D0.u64[laneId] = S0.f32 >= S1.f32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_O_F32: "Set the per-lane condition code to 1 iff the first input is orderable to the second input. Store the result into VCC\nD0.u64[laneId] = (!isNAN(64'F(S0.f32)) && !isNAN(64'F(S1.f32)));\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_U_F32: "VCC or a scalar register.\nD0.u64[laneId] = (isNAN(64'F(S0.f32)) || isNAN(64'F(S1.f32)));\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_NGE_F32: 'D0.u64[laneId] = !(S0.f32 >= S1.f32);\n// With NAN inputs this is not the same operation as <\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NLG_F32: 'D0.u64[laneId] = !(S0.f32 <> S1.f32);\n// With NAN inputs this is not the same operation as ==\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NGT_F32: 'VCC or a scalar register.\nD0.u64[laneId] = !(S0.f32 > S1.f32);\n// With NAN inputs this is not the same operation as <=\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NLE_F32: 'D0.u64[laneId] = !(S0.f32 <= S1.f32);\n// With NAN inputs this is not the same operation as >\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NEQ_F32: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = !(S0.f32 == S1.f32);\n// With NAN inputs this is not the same operation as !=\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NLT_F32: 'Set the per-lane condition code to 1 iff the first input is not less than the second input. Store the result into VCC\nD0.u64[laneId] = !(S0.f32 < S1.f32);\n// With NAN inputs this is not the same operation as >=\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_T_F32: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_F_F64: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_LT_F64: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.f64 < S1.f64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_EQ_F64: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.f64 == S1.f64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LE_F64: 'D0.u64[laneId] = S0.f64 <= S1.f64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GT_F64: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.f64 > S1.f64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LG_F64: 'D0.u64[laneId] = S0.f64 <> S1.f64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GE_F64: 'D0.u64[laneId] = S0.f64 >= S1.f64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_O_F64: 'Set the per-lane condition code to 1 iff the first input is orderable to the second input. Store the result into VCC\nD0.u64[laneId] = (!isNAN(S0.f64) && !isNAN(S1.f64));\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_U_F64: 'VCC or a scalar register.\nD0.u64[laneId] = (isNAN(S0.f64) || isNAN(S1.f64));\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NGE_F64: 'D0.u64[laneId] = !(S0.f64 >= S1.f64);\n// With NAN inputs this is not the same operation as <\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NLG_F64: 'D0.u64[laneId] = !(S0.f64 <> S1.f64);\n// With NAN inputs this is not the same operation as ==\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NGT_F64: 'VCC or a scalar register.\nD0.u64[laneId] = !(S0.f64 > S1.f64);\n// With NAN inputs this is not the same operation as <=\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NLE_F64: 'D0.u64[laneId] = !(S0.f64 <= S1.f64);\n// With NAN inputs this is not the same operation as >\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NEQ_F64: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = !(S0.f64 == S1.f64);\n// With NAN inputs this is not the same operation as !=\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NLT_F64: 'Set the per-lane condition code to 1 iff the first input is not less than the second input. Store the result into VCC\nD0.u64[laneId] = !(S0.f64 < S1.f64);\n// With NAN inputs this is not the same operation as >=\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_T_F64: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_LT_I16: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.i16 < S1.i16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_EQ_I16: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.i16 == S1.i16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LE_I16: 'D0.u64[laneId] = S0.i16 <= S1.i16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GT_I16: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.i16 > S1.i16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NE_I16: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = S0.i16 <> S1.i16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GE_I16: 'D0.u64[laneId] = S0.i16 >= S1.i16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LT_U16: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.u16 < S1.u16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_EQ_U16: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.u16 == S1.u16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LE_U16: 'D0.u64[laneId] = S0.u16 <= S1.u16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GT_U16: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.u16 > S1.u16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NE_U16: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = S0.u16 <> S1.u16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GE_U16: 'D0.u64[laneId] = S0.u16 >= S1.u16;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_F_I32: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_LT_I32: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.i32 < S1.i32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_EQ_I32: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.i32 == S1.i32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LE_I32: 'D0.u64[laneId] = S0.i32 <= S1.i32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GT_I32: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.i32 > S1.i32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NE_I32: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = S0.i32 <> S1.i32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GE_I32: 'D0.u64[laneId] = S0.i32 >= S1.i32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_T_I32: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_F_U32: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_LT_U32: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.u32 < S1.u32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_EQ_U32: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.u32 == S1.u32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LE_U32: 'D0.u64[laneId] = S0.u32 <= S1.u32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GT_U32: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.u32 > S1.u32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NE_U32: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = S0.u32 <> S1.u32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GE_U32: 'D0.u64[laneId] = S0.u32 >= S1.u32;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_T_U32: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_F_I64: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_LT_I64: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.i64 < S1.i64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_EQ_I64: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.i64 == S1.i64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LE_I64: 'D0.u64[laneId] = S0.i64 <= S1.i64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GT_I64: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.i64 > S1.i64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NE_I64: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = S0.i64 <> S1.i64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GE_I64: 'D0.u64[laneId] = S0.i64 >= S1.i64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_T_I64: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_F_U64: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_LT_U64: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.u64 < S1.u64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_EQ_U64: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.u64 == S1.u64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_LE_U64: 'D0.u64[laneId] = S0.u64 <= S1.u64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GT_U64: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.u64 > S1.u64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_NE_U64: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = S0.u64 <> S1.u64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_GE_U64: 'D0.u64[laneId] = S0.u64 >= S1.u64;\n// D0 = VCC in VOPC encoding.',
  VOP3Op.V_CMP_T_U64: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_CLASS_F16: "half-precision float, and set the per-lane condition code to the result. Store the result into VCC or a scalar\nS1.u[0] value is a signaling NAN.\nS1.u[1] value is a quiet NAN.\nS1.u[2] value is negative infinity.\nS1.u[3] value is a negative normal value.\nS1.u[4] value is a negative denormal value.\nS1.u[5] value is negative zero.\nS1.u[6] value is positive zero.\nS1.u[7] value is a positive denormal value.\nS1.u[8] value is a positive normal value.\nS1.u[9] value is positive infinity.\ndeclare result : 1'U;\nif isSignalNAN(64'F(S0.f16)) then\nresult = S1.u32[0]\nelsif isQuietNAN(64'F(S0.f16)) then\nresult = S1.u32[1]\nelsif exponent(S0.f16) == 31 then\n// +-INF\nresult = S1.u32[sign(S0.f16) ? 2 : 9]\nelsif exponent(S0.f16) > 0 then\n// +-normal value\nresult = S1.u32[sign(S0.f16) ? 3 : 8]\nelsif 64'F(abs(S0.f16)) > 0.0 then\n// +-denormal value\nresult = S1.u32[sign(S0.f16) ? 4 : 7]\nelse\n// +-0.0\nresult = S1.u32[sign(S0.f16) ? 5 : 6]\nendif;\nD0.u64[laneId] = result;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_CLASS_F32: "single-precision float, and set the per-lane condition code to the result. Store the result into VCC or a scalar\nS1.u[0] value is a signaling NAN.\nS1.u[1] value is a quiet NAN.\nS1.u[2] value is negative infinity.\nS1.u[3] value is a negative normal value.\nS1.u[4] value is a negative denormal value.\nS1.u[5] value is negative zero.\nS1.u[6] value is positive zero.\nS1.u[7] value is a positive denormal value.\nS1.u[8] value is a positive normal value.\nS1.u[9] value is positive infinity.\ndeclare result : 1'U;\nif isSignalNAN(64'F(S0.f32)) then\nresult = S1.u32[0]\nelsif isQuietNAN(64'F(S0.f32)) then\nresult = S1.u32[1]\nelsif exponent(S0.f32) == 255 then\n// +-INF\nresult = S1.u32[sign(S0.f32) ? 2 : 9]\nelsif exponent(S0.f32) > 0 then\n// +-normal value\nresult = S1.u32[sign(S0.f32) ? 3 : 8]\nelsif 64'F(abs(S0.f32)) > 0.0 then\n// +-denormal value\nresult = S1.u32[sign(S0.f32) ? 4 : 7]\nelse\n// +-0.0\nresult = S1.u32[sign(S0.f32) ? 5 : 6]\nendif;\nD0.u64[laneId] = result;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMP_CLASS_F64: "double-precision float, and set the per-lane condition code to the result. Store the result into VCC or a scalar\nS1.u[0] value is a signaling NAN.\nS1.u[1] value is a quiet NAN.\nS1.u[2] value is negative infinity.\nS1.u[3] value is a negative normal value.\nS1.u[4] value is a negative denormal value.\nS1.u[5] value is negative zero.\nS1.u[6] value is positive zero.\nS1.u[7] value is a positive denormal value.\nS1.u[8] value is a positive normal value.\nS1.u[9] value is positive infinity.\ndeclare result : 1'U;\nif isSignalNAN(S0.f64) then\nresult = S1.u32[0]\nelsif isQuietNAN(S0.f64) then\nresult = S1.u32[1]\nelsif exponent(S0.f64) == 2047 then\n// +-INF\nresult = S1.u32[sign(S0.f64) ? 2 : 9]\nelsif exponent(S0.f64) > 0 then\n// +-normal value\nresult = S1.u32[sign(S0.f64) ? 3 : 8]\nelsif abs(S0.f64) > 0.0 then\n// +-denormal value\nresult = S1.u32[sign(S0.f64) ? 4 : 7]\nelse\n// +-0.0\nresult = S1.u32[sign(S0.f64) ? 5 : 6]\nendif;\nD0.u64[laneId] = result;\n// D0 = VCC in VOPC encoding.",
  VOP3Op.V_CMPX_F_F16: "EXEC.u64[laneId] = 1'0U",
  VOP3Op.V_CMPX_LT_F16: 'EXEC.u64[laneId] = S0.f16 < S1.f16',
  VOP3Op.V_CMPX_EQ_F16: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.f16 == S1.f16',
  VOP3Op.V_CMPX_LE_F16: 'EXEC.u64[laneId] = S0.f16 <= S1.f16',
  VOP3Op.V_CMPX_GT_F16: 'EXEC.u64[laneId] = S0.f16 > S1.f16',
  VOP3Op.V_CMPX_LG_F16: 'EXEC.u64[laneId] = S0.f16 <> S1.f16',
  VOP3Op.V_CMPX_GE_F16: 'EXEC.u64[laneId] = S0.f16 >= S1.f16',
  VOP3Op.V_CMPX_O_F16: "EXEC.u64[laneId] = (!isNAN(64'F(S0.f16)) && !isNAN(64'F(S1.f16)))",
  VOP3Op.V_CMPX_U_F16: "EXEC.u64[laneId] = (isNAN(64'F(S0.f16)) || isNAN(64'F(S1.f16)))",
  VOP3Op.V_CMPX_NGE_F16: 'EXEC.u64[laneId] = !(S0.f16 >= S1.f16);\n// With NAN inputs this is not the same operation as <',
  VOP3Op.V_CMPX_NLG_F16: 'EXEC.u64[laneId] = !(S0.f16 <> S1.f16);\n// With NAN inputs this is not the same operation as ==',
  VOP3Op.V_CMPX_NGT_F16: 'EXEC.u64[laneId] = !(S0.f16 > S1.f16);\n// With NAN inputs this is not the same operation as <=',
  VOP3Op.V_CMPX_NLE_F16: 'EXEC.u64[laneId] = !(S0.f16 <= S1.f16);\n// With NAN inputs this is not the same operation as >',
  VOP3Op.V_CMPX_NEQ_F16: 'EXEC.u64[laneId] = !(S0.f16 == S1.f16);\n// With NAN inputs this is not the same operation as !=',
  VOP3Op.V_CMPX_NLT_F16: 'EXEC.u64[laneId] = !(S0.f16 < S1.f16);\n// With NAN inputs this is not the same operation as >=',
  VOP3Op.V_CMPX_T_F16: "EXEC.u64[laneId] = 1'1U",
  VOP3Op.V_CMPX_F_F32: "EXEC.u64[laneId] = 1'0U",
  VOP3Op.V_CMPX_LT_F32: 'EXEC.u64[laneId] = S0.f32 < S1.f32',
  VOP3Op.V_CMPX_EQ_F32: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.f32 == S1.f32',
  VOP3Op.V_CMPX_LE_F32: 'EXEC.u64[laneId] = S0.f32 <= S1.f32',
  VOP3Op.V_CMPX_GT_F32: 'EXEC.u64[laneId] = S0.f32 > S1.f32',
  VOP3Op.V_CMPX_LG_F32: 'EXEC.u64[laneId] = S0.f32 <> S1.f32',
  VOP3Op.V_CMPX_GE_F32: 'EXEC.u64[laneId] = S0.f32 >= S1.f32',
  VOP3Op.V_CMPX_O_F32: "EXEC.u64[laneId] = (!isNAN(64'F(S0.f32)) && !isNAN(64'F(S1.f32)))",
  VOP3Op.V_CMPX_U_F32: "EXEC.u64[laneId] = (isNAN(64'F(S0.f32)) || isNAN(64'F(S1.f32)))",
  VOP3Op.V_CMPX_NGE_F32: 'EXEC.u64[laneId] = !(S0.f32 >= S1.f32);\n// With NAN inputs this is not the same operation as <',
  VOP3Op.V_CMPX_NLG_F32: 'EXEC.u64[laneId] = !(S0.f32 <> S1.f32);\n// With NAN inputs this is not the same operation as ==',
  VOP3Op.V_CMPX_NGT_F32: 'EXEC.u64[laneId] = !(S0.f32 > S1.f32);\n// With NAN inputs this is not the same operation as <=',
  VOP3Op.V_CMPX_NLE_F32: 'EXEC.u64[laneId] = !(S0.f32 <= S1.f32);\n// With NAN inputs this is not the same operation as >',
  VOP3Op.V_CMPX_NEQ_F32: 'EXEC.u64[laneId] = !(S0.f32 == S1.f32);\n// With NAN inputs this is not the same operation as !=',
  VOP3Op.V_CMPX_NLT_F32: 'EXEC.u64[laneId] = !(S0.f32 < S1.f32);\n// With NAN inputs this is not the same operation as >=',
  VOP3Op.V_CMPX_T_F32: "EXEC.u64[laneId] = 1'1U",
  VOP3Op.V_CMPX_F_F64: "EXEC.u64[laneId] = 1'0U",
  VOP3Op.V_CMPX_LT_F64: 'EXEC.u64[laneId] = S0.f64 < S1.f64',
  VOP3Op.V_CMPX_EQ_F64: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.f64 == S1.f64',
  VOP3Op.V_CMPX_LE_F64: 'EXEC.u64[laneId] = S0.f64 <= S1.f64',
  VOP3Op.V_CMPX_GT_F64: 'EXEC.u64[laneId] = S0.f64 > S1.f64',
  VOP3Op.V_CMPX_LG_F64: 'EXEC.u64[laneId] = S0.f64 <> S1.f64',
  VOP3Op.V_CMPX_GE_F64: 'EXEC.u64[laneId] = S0.f64 >= S1.f64',
  VOP3Op.V_CMPX_O_F64: 'EXEC.u64[laneId] = (!isNAN(S0.f64) && !isNAN(S1.f64))',
  VOP3Op.V_CMPX_U_F64: 'EXEC.u64[laneId] = (isNAN(S0.f64) || isNAN(S1.f64))',
  VOP3Op.V_CMPX_NGE_F64: 'EXEC.u64[laneId] = !(S0.f64 >= S1.f64);\n// With NAN inputs this is not the same operation as <',
  VOP3Op.V_CMPX_NLG_F64: 'EXEC.u64[laneId] = !(S0.f64 <> S1.f64);\n// With NAN inputs this is not the same operation as ==',
  VOP3Op.V_CMPX_NGT_F64: 'EXEC.u64[laneId] = !(S0.f64 > S1.f64);\n// With NAN inputs this is not the same operation as <=',
  VOP3Op.V_CMPX_NLE_F64: 'EXEC.u64[laneId] = !(S0.f64 <= S1.f64);\n// With NAN inputs this is not the same operation as >',
  VOP3Op.V_CMPX_NEQ_F64: 'EXEC.u64[laneId] = !(S0.f64 == S1.f64);\n// With NAN inputs this is not the same operation as !=',
  VOP3Op.V_CMPX_NLT_F64: 'EXEC.u64[laneId] = !(S0.f64 < S1.f64);\n// With NAN inputs this is not the same operation as >=',
  VOP3Op.V_CMPX_T_F64: "EXEC.u64[laneId] = 1'1U",
  VOP3Op.V_CMPX_LT_I16: 'EXEC.u64[laneId] = S0.i16 < S1.i16',
  VOP3Op.V_CMPX_EQ_I16: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.i16 == S1.i16',
  VOP3Op.V_CMPX_LE_I16: 'EXEC.u64[laneId] = S0.i16 <= S1.i16',
  VOP3Op.V_CMPX_GT_I16: 'EXEC.u64[laneId] = S0.i16 > S1.i16',
  VOP3Op.V_CMPX_NE_I16: 'EXEC.u64[laneId] = S0.i16 <> S1.i16',
  VOP3Op.V_CMPX_GE_I16: 'EXEC.u64[laneId] = S0.i16 >= S1.i16',
  VOP3Op.V_CMPX_LT_U16: 'EXEC.u64[laneId] = S0.u16 < S1.u16',
  VOP3Op.V_CMPX_EQ_U16: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.u16 == S1.u16',
  VOP3Op.V_CMPX_LE_U16: 'EXEC.u64[laneId] = S0.u16 <= S1.u16',
  VOP3Op.V_CMPX_GT_U16: 'EXEC.u64[laneId] = S0.u16 > S1.u16',
  VOP3Op.V_CMPX_NE_U16: 'EXEC.u64[laneId] = S0.u16 <> S1.u16',
  VOP3Op.V_CMPX_GE_U16: 'EXEC.u64[laneId] = S0.u16 >= S1.u16',
  VOP3Op.V_CMPX_F_I32: "EXEC.u64[laneId] = 1'0U",
  VOP3Op.V_CMPX_LT_I32: 'EXEC.u64[laneId] = S0.i32 < S1.i32',
  VOP3Op.V_CMPX_EQ_I32: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.i32 == S1.i32',
  VOP3Op.V_CMPX_LE_I32: 'EXEC.u64[laneId] = S0.i32 <= S1.i32',
  VOP3Op.V_CMPX_GT_I32: 'EXEC.u64[laneId] = S0.i32 > S1.i32',
  VOP3Op.V_CMPX_NE_I32: 'EXEC.u64[laneId] = S0.i32 <> S1.i32',
  VOP3Op.V_CMPX_GE_I32: 'EXEC.u64[laneId] = S0.i32 >= S1.i32',
  VOP3Op.V_CMPX_T_I32: "EXEC.u64[laneId] = 1'1U",
  VOP3Op.V_CMPX_F_U32: "EXEC.u64[laneId] = 1'0U",
  VOP3Op.V_CMPX_LT_U32: 'EXEC.u64[laneId] = S0.u32 < S1.u32',
  VOP3Op.V_CMPX_EQ_U32: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.u32 == S1.u32',
  VOP3Op.V_CMPX_LE_U32: 'EXEC.u64[laneId] = S0.u32 <= S1.u32',
  VOP3Op.V_CMPX_GT_U32: 'EXEC.u64[laneId] = S0.u32 > S1.u32',
  VOP3Op.V_CMPX_NE_U32: 'EXEC.u64[laneId] = S0.u32 <> S1.u32',
  VOP3Op.V_CMPX_GE_U32: 'EXEC.u64[laneId] = S0.u32 >= S1.u32',
  VOP3Op.V_CMPX_T_U32: "EXEC.u64[laneId] = 1'1U",
  VOP3Op.V_CMPX_F_I64: "EXEC.u64[laneId] = 1'0U",
  VOP3Op.V_CMPX_LT_I64: 'EXEC.u64[laneId] = S0.i64 < S1.i64',
  VOP3Op.V_CMPX_EQ_I64: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.i64 == S1.i64',
  VOP3Op.V_CMPX_LE_I64: 'EXEC.u64[laneId] = S0.i64 <= S1.i64',
  VOP3Op.V_CMPX_GT_I64: 'EXEC.u64[laneId] = S0.i64 > S1.i64',
  VOP3Op.V_CMPX_NE_I64: 'EXEC.u64[laneId] = S0.i64 <> S1.i64',
  VOP3Op.V_CMPX_GE_I64: 'EXEC.u64[laneId] = S0.i64 >= S1.i64',
  VOP3Op.V_CMPX_T_I64: "EXEC.u64[laneId] = 1'1U",
  VOP3Op.V_CMPX_F_U64: "EXEC.u64[laneId] = 1'0U",
  VOP3Op.V_CMPX_LT_U64: 'EXEC.u64[laneId] = S0.u64 < S1.u64',
  VOP3Op.V_CMPX_EQ_U64: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.u64 == S1.u64',
  VOP3Op.V_CMPX_LE_U64: 'EXEC.u64[laneId] = S0.u64 <= S1.u64',
  VOP3Op.V_CMPX_GT_U64: 'EXEC.u64[laneId] = S0.u64 > S1.u64',
  VOP3Op.V_CMPX_NE_U64: 'EXEC.u64[laneId] = S0.u64 <> S1.u64',
  VOP3Op.V_CMPX_GE_U64: 'EXEC.u64[laneId] = S0.u64 >= S1.u64',
  VOP3Op.V_CMPX_T_U64: "EXEC.u64[laneId] = 1'1U",
  VOP3Op.V_CMPX_CLASS_F16: "S1.u[0] value is a signaling NAN.\nS1.u[1] value is a quiet NAN.\nS1.u[2] value is negative infinity.\nS1.u[3] value is a negative normal value.\nS1.u[4] value is a negative denormal value.\nS1.u[5] value is negative zero.\nS1.u[6] value is positive zero.\nS1.u[7] value is a positive denormal value.\nS1.u[8] value is a positive normal value.\nS1.u[9] value is positive infinity.\ndeclare result : 1'U;\nif isSignalNAN(64'F(S0.f16)) then\nresult = S1.u32[0]\nelsif isQuietNAN(64'F(S0.f16)) then\nresult = S1.u32[1]\nelsif exponent(S0.f16) == 31 then\n// +-INF\nresult = S1.u32[sign(S0.f16) ? 2 : 9]\nelsif exponent(S0.f16) > 0 then\n// +-normal value\nresult = S1.u32[sign(S0.f16) ? 3 : 8]\nelsif 64'F(abs(S0.f16)) > 0.0 then\n// +-denormal value\nresult = S1.u32[sign(S0.f16) ? 4 : 7]\nelse\n// +-0.0\nresult = S1.u32[sign(S0.f16) ? 5 : 6]\nendif;\nEXEC.u64[laneId] = result",
  VOP3Op.V_CMPX_CLASS_F32: "S1.u[0] value is a signaling NAN.\nS1.u[1] value is a quiet NAN.\nS1.u[2] value is negative infinity.\nS1.u[3] value is a negative normal value.\nS1.u[4] value is a negative denormal value.\nS1.u[5] value is negative zero.\nS1.u[6] value is positive zero.\nS1.u[7] value is a positive denormal value.\nS1.u[8] value is a positive normal value.\nS1.u[9] value is positive infinity.\ndeclare result : 1'U;\nif isSignalNAN(64'F(S0.f32)) then\nresult = S1.u32[0]\nelsif isQuietNAN(64'F(S0.f32)) then\nresult = S1.u32[1]\nelsif exponent(S0.f32) == 255 then\n// +-INF\nresult = S1.u32[sign(S0.f32) ? 2 : 9]\nelsif exponent(S0.f32) > 0 then\n// +-normal value\nresult = S1.u32[sign(S0.f32) ? 3 : 8]\nelsif 64'F(abs(S0.f32)) > 0.0 then\n// +-denormal value\nresult = S1.u32[sign(S0.f32) ? 4 : 7]\nelse\n// +-0.0\nresult = S1.u32[sign(S0.f32) ? 5 : 6]\nendif;\nEXEC.u64[laneId] = result",
  VOP3Op.V_CMPX_CLASS_F64: "S1.u[0] value is a signaling NAN.\nS1.u[1] value is a quiet NAN.\nS1.u[2] value is negative infinity.\nS1.u[3] value is a negative normal value.\nS1.u[4] value is a negative denormal value.\nS1.u[5] value is negative zero.\nS1.u[6] value is positive zero.\nS1.u[7] value is a positive denormal value.\nS1.u[8] value is a positive normal value.\nS1.u[9] value is positive infinity.\ndeclare result : 1'U;\nif isSignalNAN(S0.f64) then\nresult = S1.u32[0]\nelsif isQuietNAN(S0.f64) then\nresult = S1.u32[1]\nelsif exponent(S0.f64) == 2047 then\n// +-INF\nresult = S1.u32[sign(S0.f64) ? 2 : 9]\nelsif exponent(S0.f64) > 0 then\n// +-normal value\nresult = S1.u32[sign(S0.f64) ? 3 : 8]\nelsif abs(S0.f64) > 0.0 then\n// +-denormal value\nresult = S1.u32[sign(S0.f64) ? 4 : 7]\nelse\n// +-0.0\nresult = S1.u32[sign(S0.f64) ? 5 : 6]\nendif;\nEXEC.u64[laneId] = result",
  VOP3Op.V_MOV_B32: 'D0.b32 = S0.b32',
  VOP3Op.V_READFIRSTLANE_B32: "declare lane : 32'U;\nif WAVE64 then\n// 64 lanes\nif EXEC == 0x0LL then\nlane = 0U;\n// Force lane 0 if all lanes are disabled\nelse\nlane = 32'U(s_ff1_i32_b64(EXEC));\n// Lowest active lane\nendif\nelse\n// 32 lanes\nif EXEC_LO.i32 == 0 then\nlane = 0U;\n// Force lane 0 if all lanes are disabled\nelse\nlane = 32'U(s_ff1_i32_b32(EXEC_LO));\n// Lowest active lane\nendif\nendif;\nD0.b32 = VGPR[lane][SRC0.u32]",
  VOP3Op.V_CVT_I32_F64: 'D0.i32 = f64_to_i32(S0.f64)',
  VOP3Op.V_CVT_F64_I32: 'D0.f64 = i32_to_f64(S0.i32)',
  VOP3Op.V_CVT_F32_I32: 'D0.f32 = i32_to_f32(S0.i32)',
  VOP3Op.V_CVT_F32_U32: 'D0.f32 = u32_to_f32(S0.u32)',
  VOP3Op.V_CVT_U32_F32: 'D0.u32 = f32_to_u32(S0.f32)',
  VOP3Op.V_CVT_I32_F32: 'D0.i32 = f32_to_i32(S0.f32)',
  VOP3Op.V_CVT_F16_F32: 'D0.f16 = f32_to_f16(S0.f32)',
  VOP3Op.V_CVT_F32_F16: 'D0.f32 = f16_to_f32(S0.f16)',
  VOP3Op.V_CVT_NEAREST_I32_F32: 'D0.i32 = f32_to_i32(floor(S0.f32 + 0.5F))',
  VOP3Op.V_CVT_FLOOR_I32_F32: 'D0.i32 = f32_to_i32(floor(S0.f32))',
  VOP3Op.V_CVT_OFF_F32_I4: "Used for interpolation in shader. Lookup table on S0[3:0]:\ndeclare CVT_OFF_TABLE : 32'F[16];\nD0.f32 = CVT_OFF_TABLE[S0.u32[3 : 0]]",
  VOP3Op.V_CVT_F32_F64: 'D0.f32 = f64_to_f32(S0.f64)',
  VOP3Op.V_CVT_F64_F32: 'D0.f64 = f32_to_f64(S0.f32)',
  VOP3Op.V_CVT_F32_UBYTE0: 'D0.f32 = u32_to_f32(S0[7 : 0].u32)',
  VOP3Op.V_CVT_F32_UBYTE1: 'D0.f32 = u32_to_f32(S0[15 : 8].u32)',
  VOP3Op.V_CVT_F32_UBYTE2: 'D0.f32 = u32_to_f32(S0[23 : 16].u32)',
  VOP3Op.V_CVT_F32_UBYTE3: 'D0.f32 = u32_to_f32(S0[31 : 24].u32)',
  VOP3Op.V_CVT_U32_F64: 'D0.u32 = f64_to_u32(S0.f64)',
  VOP3Op.V_CVT_F64_U32: 'D0.f64 = u32_to_f64(S0.u32)',
  VOP3Op.V_TRUNC_F64: 'D0.f64 = trunc(S0.f64)',
  VOP3Op.V_CEIL_F64: 'D0.f64 = trunc(S0.f64);\nif ((S0.f64 > 0.0) && (S0.f64 != D0.f64)) then\nD0.f64 += 1.0\nendif',
  VOP3Op.V_RNDNE_F64: 'D0.f64 = floor(S0.f64 + 0.5);\nif (isEven(floor(S0.f64)) && (fract(S0.f64) == 0.5)) then\nD0.f64 -= 1.0\nendif',
  VOP3Op.V_FLOOR_F64: 'D0.f64 = trunc(S0.f64);\nif ((S0.f64 < 0.0) && (S0.f64 != D0.f64)) then\nD0.f64 += -1.0\nendif',
  VOP3Op.V_MOV_B16: 'D0.b16 = S0.b16',
  VOP3Op.V_FRACT_F32: 'D0.f32 = S0.f32 + -floor(S0.f32)',
  VOP3Op.V_TRUNC_F32: 'D0.f32 = trunc(S0.f32)',
  VOP3Op.V_CEIL_F32: 'D0.f32 = trunc(S0.f32);\nif ((S0.f32 > 0.0F) && (S0.f32 != D0.f32)) then\nD0.f32 += 1.0F\nendif',
  VOP3Op.V_RNDNE_F32: "D0.f32 = floor(S0.f32 + 0.5F);\nif (isEven(64'F(floor(S0.f32))) && (fract(S0.f32) == 0.5F)) then\nD0.f32 -= 1.0F\nendif",
  VOP3Op.V_FLOOR_F32: 'D0.f32 = trunc(S0.f32);\nif ((S0.f32 < 0.0F) && (S0.f32 != D0.f32)) then\nD0.f32 += -1.0F\nendif',
  VOP3Op.V_EXP_F32: 'D0.f32 = pow(2.0F, S0.f32)',
  VOP3Op.V_LOG_F32: 'D0.f32 = log2(S0.f32)',
  VOP3Op.V_RCP_F32: 'D0.f32 = 1.0F / S0.f32',
  VOP3Op.V_RCP_IFLAG_F32: 'D0.f32 = 1.0F / S0.f32;\n// Can only raise integer DIV_BY_ZERO exception',
  VOP3Op.V_RSQ_F32: 'D0.f32 = 1.0F / sqrt(S0.f32)',
  VOP3Op.V_RCP_F64: 'D0.f64 = 1.0 / S0.f64',
  VOP3Op.V_RSQ_F64: 'D0.f64 = 1.0 / sqrt(S0.f64)',
  VOP3Op.V_SQRT_F32: 'D0.f32 = sqrt(S0.f32)',
  VOP3Op.V_SQRT_F64: 'D0.f64 = sqrt(S0.f64)',
  VOP3Op.V_SIN_F32: "D0.f32 = sin(S0.f32 * 32'F(PI * 2.0))",
  VOP3Op.V_COS_F32: "D0.f32 = cos(S0.f32 * 32'F(PI * 2.0))",
  VOP3Op.V_NOT_B32: 'D0.u32 = ~S0.u32',
  VOP3Op.V_BFREV_B32: 'D0.u32[31 : 0] = S0.u32[0 : 31]',
  VOP3Op.V_CLZ_I32_U32: "D0.i32 = -1;\n// Set if no ones are found\nfor i in 0 : 31 do\n// Search from MSB\nif S0.u32[31 - i] == 1'1U then\nD0.i32 = i;\nendif\nendfor",
  VOP3Op.V_CTZ_I32_B32: "D0.i32 = -1;\n// Set if no ones are found\nfor i in 0 : 31 do\n// Search from LSB\nif S0.u32[i] == 1'1U then\nD0.i32 = i;\nendif\nendfor",
  VOP3Op.V_CLS_I32: 'D0.i32 = -1;\n// Set if all bits are the same\nfor i in 1 : 31 do\n// Search from MSB\nif S0.i32[31 - i] != S0.i32[31] then\nD0.i32 = i;\nendif\nendfor',
  VOP3Op.V_FREXP_EXP_I32_F64: 'if ((S0.f64 == +INF) || (S0.f64 == -INF) || isNAN(S0.f64)) then\nD0.i32 = 0\nelse\nD0.i32 = exponent(S0.f64) - 1023 + 1\nendif',
  VOP3Op.V_FREXP_MANT_F64: 'if ((S0.f64 == +INF) || (S0.f64 == -INF) || isNAN(S0.f64)) then\nD0.f64 = S0.f64\nelse\nD0.f64 = mantissa(S0.f64)\nendif',
  VOP3Op.V_FRACT_F64: 'D0.f64 = S0.f64 + -floor(S0.f64)',
  VOP3Op.V_FREXP_EXP_I32_F32: "if ((64'F(S0.f32) == +INF) || (64'F(S0.f32) == -INF) || isNAN(64'F(S0.f32))) then\nD0.i32 = 0\nelse\nD0.i32 = exponent(S0.f32) - 127 + 1\nendif",
  VOP3Op.V_FREXP_MANT_F32: "if ((64'F(S0.f32) == +INF) || (64'F(S0.f32) == -INF) || isNAN(64'F(S0.f32))) then\nD0.f32 = S0.f32\nelse\nD0.f32 = mantissa(S0.f32)\nendif",
  VOP3Op.V_MOVRELD_B32: 'addr = DST.u32;\n// Raw value from instruction\nVGPR[laneId][addr].b32 = S0.b32',
  VOP3Op.V_MOVRELS_B32: 'addr = SRC0.u32;\n// Raw value from instruction\nD0.b32 = VGPR[laneId][addr].b32',
  VOP3Op.V_MOVRELSD_B32: 'addrs = SRC0.u32;\n// Raw value from instruction\naddrd = DST.u32;\n// Raw value from instruction',
  VOP3Op.V_MOVRELSD_2_B32: 'addrs = SRC0.u32;\n// Raw value from instruction\naddrd = DST.u32;\n// Raw value from instruction',
  VOP3Op.V_CVT_F16_U16: 'D0.f16 = u16_to_f16(S0.u16)',
  VOP3Op.V_CVT_F16_I16: 'D0.f16 = i16_to_f16(S0.i16)',
  VOP3Op.V_CVT_U16_F16: 'D0.u16 = f16_to_u16(S0.f16)',
  VOP3Op.V_CVT_I16_F16: 'D0.i16 = f16_to_i16(S0.f16)',
  VOP3Op.V_RCP_F16: "D0.f16 = 16'1.0 / S0.f16",
  VOP3Op.V_SQRT_F16: 'D0.f16 = sqrt(S0.f16)',
  VOP3Op.V_RSQ_F16: "D0.f16 = 16'1.0 / sqrt(S0.f16)",
  VOP3Op.V_LOG_F16: 'D0.f16 = log2(S0.f16)',
  VOP3Op.V_EXP_F16: "D0.f16 = pow(16'2.0, S0.f16)",
  VOP3Op.V_FREXP_MANT_F16: "if ((64'F(S0.f16) == +INF) || (64'F(S0.f16) == -INF) || isNAN(64'F(S0.f16))) then\nD0.f16 = S0.f16\nelse\nD0.f16 = mantissa(S0.f16)\nendif",
  VOP3Op.V_FREXP_EXP_I16_F16: "if ((64'F(S0.f16) == +INF) || (64'F(S0.f16) == -INF) || isNAN(64'F(S0.f16))) then\nD0.i16 = 16'0\nelse\nD0.i16 = 16'I(exponent(S0.f16) - 15 + 1)\nendif",
  VOP3Op.V_FLOOR_F16: "D0.f16 = trunc(S0.f16);\nif ((S0.f16 < 16'0.0) && (S0.f16 != D0.f16)) then\nD0.f16 += -16'1.0\nendif",
  VOP3Op.V_CEIL_F16: "D0.f16 = trunc(S0.f16);\nif ((S0.f16 > 16'0.0) && (S0.f16 != D0.f16)) then\nD0.f16 += 16'1.0\nendif",
  VOP3Op.V_TRUNC_F16: 'D0.f16 = trunc(S0.f16)',
  VOP3Op.V_RNDNE_F16: "D0.f16 = floor(S0.f16 + 16'0.5);\nif (isEven(64'F(floor(S0.f16))) && (fract(S0.f16) == 16'0.5)) then\nD0.f16 -= 16'1.0\nendif",
  VOP3Op.V_FRACT_F16: 'D0.f16 = S0.f16 + -floor(S0.f16)',
  VOP3Op.V_SIN_F16: "D0.f16 = sin(S0.f16 * 16'F(PI * 2.0))",
  VOP3Op.V_COS_F16: "D0.f16 = cos(S0.f16 * 16'F(PI * 2.0))",
  VOP3Op.V_SAT_PK_U8_I16: 'D0.b16 = { SAT8(S0[31 : 16].i16), SAT8(S0[15 : 0].i16) }',
  VOP3Op.V_CVT_NORM_I16_F16: 'D0.i16 = f16_to_snorm(S0.f16)',
  VOP3Op.V_CVT_NORM_U16_F16: 'D0.u16 = f16_to_unorm(S0.f16)',
  VOP3Op.V_NOT_B16: 'D0.u16 = ~S0.u16',
  VOP3Op.V_CVT_I32_I16: "D0.i32 = 32'I(signext(S0.i16))",
  VOP3Op.V_CVT_U32_U16: "D0 = { 16'0, S0.u16 }",
  VOP3Op.V_CNDMASK_B32: 'D0.u32 = VCC.u64[laneId] ? S1.u32 : S0.u32',
  VOP3Op.V_ADD_F32: 'D0.f32 = S0.f32 + S1.f32',
  VOP3Op.V_SUB_F32: 'D0.f32 = S0.f32 - S1.f32',
  VOP3Op.V_SUBREV_F32: 'D0.f32 = S1.f32 - S0.f32',
  VOP3Op.V_FMAC_DX9_ZERO_F32: "if ((64'F(S0.f32) == 0.0) || (64'F(S1.f32) == 0.0)) then\n// DX9 rules, 0.0 * x = 0.0\nD0.f32 = S2.f32\nelse\nD0.f32 = fma(S0.f32, S1.f32, D0.f32)\nendif",
  VOP3Op.V_MUL_DX9_ZERO_F32: "if ((64'F(S0.f32) == 0.0) || (64'F(S1.f32) == 0.0)) then\n// DX9 rules, 0.0 * x = 0.0\nD0.f32 = 0.0F\nelse\nD0.f32 = S0.f32 * S1.f32\nendif",
  VOP3Op.V_MUL_F32: 'D0.f32 = S0.f32 * S1.f32',
  VOP3Op.V_MUL_I32_I24: "D0.i32 = 32'I(S0.i24) * 32'I(S1.i24)",
  VOP3Op.V_MUL_HI_I32_I24: "D0.i32 = 32'I((64'I(S0.i24) * 64'I(S1.i24)) >> 32U)",
  VOP3Op.V_MUL_U32_U24: "D0.u32 = 32'U(S0.u24) * 32'U(S1.u24)",
  VOP3Op.V_MUL_HI_U32_U24: "D0.u32 = 32'U((64'U(S0.u24) * 64'U(S1.u24)) >> 32U)",
  VOP3Op.V_MIN_F32: "// Version of comparison where -0.0 < +0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(64'F(S0.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S0.f32)))\nelsif isSignalNAN(64'F(S1.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32)))\nelsif isQuietNAN(64'F(S1.f32)) then\nD0.f32 = S0.f32\nelsif isQuietNAN(64'F(S0.f32)) then\nD0.f32 = S1.f32\nelsif LT_NEG_ZERO(S0.f32, S1.f32) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f32 = S0.f32\nelse\nD0.f32 = S1.f32\nendif\nelse\nif isNAN(64'F(S1.f32)) then\nD0.f32 = S0.f32\nelsif isNAN(64'F(S0.f32)) then\nD0.f32 = S1.f32\nelsif LT_NEG_ZERO(S0.f32, S1.f32) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f32 = S0.f32\nelse\nD0.f32 = S1.f32\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE",
  VOP3Op.V_MAX_F32: "// Version of comparison where +0.0 > -0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(64'F(S0.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S0.f32)))\nelsif isSignalNAN(64'F(S1.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32)))\nelsif isQuietNAN(64'F(S1.f32)) then\nD0.f32 = S0.f32\nelsif isQuietNAN(64'F(S0.f32)) then\nD0.f32 = S1.f32\nelsif GT_NEG_ZERO(S0.f32, S1.f32) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f32 = S0.f32\nelse\nD0.f32 = S1.f32\nendif\nelse\nif isNAN(64'F(S1.f32)) then\nD0.f32 = S0.f32\nelsif isNAN(64'F(S0.f32)) then\nD0.f32 = S1.f32\nelsif GT_NEG_ZERO(S0.f32, S1.f32) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f32 = S0.f32\nelse\nD0.f32 = S1.f32\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE",
  VOP3Op.V_MIN_I32: 'D0.i32 = S0.i32 < S1.i32 ? S0.i32 : S1.i32',
  VOP3Op.V_MAX_I32: 'D0.i32 = S0.i32 >= S1.i32 ? S0.i32 : S1.i32',
  VOP3Op.V_MIN_U32: 'D0.u32 = S0.u32 < S1.u32 ? S0.u32 : S1.u32',
  VOP3Op.V_MAX_U32: 'D0.u32 = S0.u32 >= S1.u32 ? S0.u32 : S1.u32',
  VOP3Op.V_LSHLREV_B32: 'D0.u32 = (S1.u32 << S0[4 : 0].u32)',
  VOP3Op.V_LSHRREV_B32: 'D0.u32 = (S1.u32 >> S0[4 : 0].u32)',
  VOP3Op.V_ASHRREV_I32: 'D0.i32 = (S1.i32 >> S0[4 : 0].u32)',
  VOP3Op.V_AND_B32: 'D0.u32 = (S0.u32 & S1.u32)',
  VOP3Op.V_OR_B32: 'D0.u32 = (S0.u32 | S1.u32)',
  VOP3Op.V_XOR_B32: 'D0.u32 = (S0.u32 ^ S1.u32)',
  VOP3Op.V_XNOR_B32: 'D0.u32 = ~(S0.u32 ^ S1.u32)',
  VOP3Op.V_ADD_NC_U32: 'D0.u32 = S0.u32 + S1.u32',
  VOP3Op.V_SUB_NC_U32: 'D0.u32 = S0.u32 - S1.u32',
  VOP3Op.V_SUBREV_NC_U32: 'D0.u32 = S1.u32 - S0.u32',
  VOP3Op.V_FMAC_F32: 'D0.f32 = fma(S0.f32, S1.f32, D0.f32)',
  VOP3Op.V_CVT_PK_RTZ_F16_F32: 'prev_mode = ROUND_MODE;\ntmp[15 : 0].f16 = f32_to_f16(S0.f32);\ntmp[31 : 16].f16 = f32_to_f16(S1.f32);',
  VOP3Op.V_ADD_F16: 'D0.f16 = S0.f16 + S1.f16',
  VOP3Op.V_SUB_F16: 'D0.f16 = S0.f16 - S1.f16',
  VOP3Op.V_SUBREV_F16: 'D0.f16 = S1.f16 - S0.f16',
  VOP3Op.V_MUL_F16: 'D0.f16 = S0.f16 * S1.f16',
  VOP3Op.V_FMAC_F16: 'D0.f16 = fma(S0.f16, S1.f16, D0.f16)',
  VOP3Op.V_MAX_F16: "// Version of comparison where +0.0 > -0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(64'F(S0.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S0.f16)))\nelsif isSignalNAN(64'F(S1.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16)))\nelsif isQuietNAN(64'F(S1.f16)) then\nD0.f16 = S0.f16\nelsif isQuietNAN(64'F(S0.f16)) then\nD0.f16 = S1.f16\nelsif GT_NEG_ZERO(S0.f16, S1.f16) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f16 = S0.f16\nelse\nD0.f16 = S1.f16\nendif\nelse\nif isNAN(64'F(S1.f16)) then\nD0.f16 = S0.f16\nelsif isNAN(64'F(S0.f16)) then\nD0.f16 = S1.f16\nelsif GT_NEG_ZERO(S0.f16, S1.f16) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f16 = S0.f16\nelse\nD0.f16 = S1.f16\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE",
  VOP3Op.V_MIN_F16: "// Version of comparison where -0.0 < +0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(64'F(S0.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S0.f16)))\nelsif isSignalNAN(64'F(S1.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16)))\nelsif isQuietNAN(64'F(S1.f16)) then\nD0.f16 = S0.f16\nelsif isQuietNAN(64'F(S0.f16)) then\nD0.f16 = S1.f16\nelsif LT_NEG_ZERO(S0.f16, S1.f16) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f16 = S0.f16\nelse\nD0.f16 = S1.f16\nendif\nelse\nif isNAN(64'F(S1.f16)) then\nD0.f16 = S0.f16\nelsif isNAN(64'F(S0.f16)) then\nD0.f16 = S1.f16\nelsif LT_NEG_ZERO(S0.f16, S1.f16) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f16 = S0.f16\nelse\nD0.f16 = S1.f16\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE",
  VOP3Op.V_LDEXP_F16: "D0.f16 = S0.f16 * 16'F(2.0F ** 32'I(S1.i16))",
  VOP3Op.V_FMA_DX9_ZERO_F32: "if ((64'F(S0.f32) == 0.0) || (64'F(S1.f32) == 0.0)) then\n// DX9 rules, 0.0 * x = 0.0\nD0.f32 = S2.f32\nelse\nD0.f32 = fma(S0.f32, S1.f32, S2.f32)\nendif",
  VOP3Op.V_MAD_I32_I24: "D0.i32 = 32'I(S0.i24) * 32'I(S1.i24) + S2.i32",
  VOP3Op.V_MAD_U32_U24: "D0.u32 = 32'U(S0.u24) * 32'U(S1.u24) + S2.u32",
  VOP3Op.V_CUBEID_F32: '// Set D0.f = cubemap face ID ({0.0, 1.0, ..., 5.0}).\n// XYZ coordinate is given in (S0.f, S1.f, S2.f).\n// S0.f = x\n// S1.f = y\n// S2.f = z\nif ((abs(S2.f32) >= abs(S0.f32)) && (abs(S2.f32) >= abs(S1.f32))) then\nif S2.f32 < 0.0F then\nD0.f32 = 5.0F\nelse\nD0.f32 = 4.0F\nendif\nelsif abs(S1.f32) >= abs(S0.f32) then\nif S1.f32 < 0.0F then\nD0.f32 = 3.0F\nelse\nD0.f32 = 2.0F\nendif\nelse\nif S0.f32 < 0.0F then\nD0.f32 = 1.0F\nelse\nD0.f32 = 0.0F\nendif\nendif',
  VOP3Op.V_CUBESC_F32: '// D0.f = cubemap S coordinate.\n// XYZ coordinate is given in (S0.f, S1.f, S2.f).\n// S0.f = x\n// S1.f = y\n// S2.f = z\nif ((abs(S2.f32) >= abs(S0.f32)) && (abs(S2.f32) >= abs(S1.f32))) then\nif S2.f32 < 0.0F then\nD0.f32 = -S0.f32\nelse\nD0.f32 = S0.f32\nendif\nelsif abs(S1.f32) >= abs(S0.f32) then\nD0.f32 = S0.f32\nelse\nif S0.f32 < 0.0F then\nD0.f32 = S2.f32\nelse\nD0.f32 = -S2.f32\nendif\nendif',
  VOP3Op.V_CUBETC_F32: '// D0.f = cubemap T coordinate.\n// XYZ coordinate is given in (S0.f, S1.f, S2.f).\n// S0.f = x\n// S1.f = y\n// S2.f = z\nif ((abs(S2.f32) >= abs(S0.f32)) && (abs(S2.f32) >= abs(S1.f32))) then\nD0.f32 = -S1.f32\nelsif abs(S1.f32) >= abs(S0.f32) then\nif S1.f32 < 0.0F then\nD0.f32 = -S2.f32\nelse\nD0.f32 = S2.f32\nendif\nelse\nD0.f32 = -S1.f32\nendif',
  VOP3Op.V_CUBEMA_F32: '// D0.f = 2.0 * cubemap major axis.\n// XYZ coordinate is given in (S0.f, S1.f, S2.f).\n// S0.f = x\n// S1.f = y\n// S2.f = z\nif ((abs(S2.f32) >= abs(S0.f32)) && (abs(S2.f32) >= abs(S1.f32))) then\nD0.f32 = S2.f32 * 2.0F\nelsif abs(S1.f32) >= abs(S0.f32) then\nD0.f32 = S1.f32 * 2.0F\nelse\nD0.f32 = S0.f32 * 2.0F\nendif',
  VOP3Op.V_BFE_U32: 'D0.u32 = ((S0.u32 >> S1[4 : 0].u32) & ((1U << S2[4 : 0].u32) - 1U))',
  VOP3Op.V_BFE_I32: 'tmp.i32 = ((S0.i32 >> S1[4 : 0].u32) & ((1 << S2[4 : 0].u32) - 1));\nD0.i32 = signext_from_bit(tmp.i32, S2[4 : 0].u32)',
  VOP3Op.V_BFI_B32: 'D0.u32 = ((S0.u32 & S1.u32) | (~S0.u32 & S2.u32))',
  VOP3Op.V_FMA_F32: 'D0.f32 = fma(S0.f32, S1.f32, S2.f32)',
  VOP3Op.V_FMA_F64: 'D0.f64 = fma(S0.f64, S1.f64, S2.f64)',
  VOP3Op.V_LERP_U8: 'tmp = ((S0.u32[31 : 24] + S1.u32[31 : 24] + S2.u32[24].u8) >> 1U << 24U);\ntmp += ((S0.u32[23 : 16] + S1.u32[23 : 16] + S2.u32[16].u8) >> 1U << 16U);\ntmp += ((S0.u32[15 : 8] + S1.u32[15 : 8] + S2.u32[8].u8) >> 1U << 8U);\ntmp += ((S0.u32[7 : 0] + S1.u32[7 : 0] + S2.u32[0].u8) >> 1U);\nD0.u32 = tmp.u32',
  VOP3Op.V_ALIGNBIT_B32: "D0.u32 = 32'U(({ S0.u32, S1.u32 } >> S2.u32[4 : 0].u32) & 0xffffffffLL)",
  VOP3Op.V_ALIGNBYTE_B32: "D0.u32 = 32'U(({ S0.u32, S1.u32 } >> (S2.u32[1 : 0].u32 * 8U)) & 0xffffffffLL)",
  VOP3Op.V_MULLIT_F32: "if ((S1.f32 == -MAX_FLOAT_F32) || (64'F(S1.f32) == -INF) || isNAN(64'F(S1.f32)) || (S2.f32 <= 0.0F) ||\nisNAN(64'F(S2.f32))) then\nD0.f32 = -MAX_FLOAT_F32\nelse\nD0.f32 = S0.f32 * S1.f32\nendif",
  VOP3Op.V_MIN3_F32: 'D0.f32 = v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)',
  VOP3Op.V_MIN3_I32: 'D0.i32 = v_min_i32(v_min_i32(S0.i32, S1.i32), S2.i32)',
  VOP3Op.V_MIN3_U32: 'D0.u32 = v_min_u32(v_min_u32(S0.u32, S1.u32), S2.u32)',
  VOP3Op.V_MAX3_F32: 'D0.f32 = v_max_f32(v_max_f32(S0.f32, S1.f32), S2.f32)',
  VOP3Op.V_MAX3_I32: 'D0.i32 = v_max_i32(v_max_i32(S0.i32, S1.i32), S2.i32)',
  VOP3Op.V_MAX3_U32: 'D0.u32 = v_max_u32(v_max_u32(S0.u32, S1.u32), S2.u32)',
  VOP3Op.V_MED3_F32: "if (isNAN(64'F(S0.f32)) || isNAN(64'F(S1.f32)) || isNAN(64'F(S2.f32))) then\nD0.f32 = v_min3_f32(S0.f32, S1.f32, S2.f32)\nelsif v_max3_f32(S0.f32, S1.f32, S2.f32) == S0.f32 then\nD0.f32 = v_max_f32(S1.f32, S2.f32)\nelsif v_max3_f32(S0.f32, S1.f32, S2.f32) == S1.f32 then\nD0.f32 = v_max_f32(S0.f32, S2.f32)\nelse\nD0.f32 = v_max_f32(S0.f32, S1.f32)\nendif",
  VOP3Op.V_MED3_I32: 'if v_max3_i32(S0.i32, S1.i32, S2.i32) == S0.i32 then\nD0.i32 = v_max_i32(S1.i32, S2.i32)\nelsif v_max3_i32(S0.i32, S1.i32, S2.i32) == S1.i32 then\nD0.i32 = v_max_i32(S0.i32, S2.i32)\nelse\nD0.i32 = v_max_i32(S0.i32, S1.i32)\nendif',
  VOP3Op.V_MED3_U32: 'if v_max3_u32(S0.u32, S1.u32, S2.u32) == S0.u32 then\nD0.u32 = v_max_u32(S1.u32, S2.u32)\nelsif v_max3_u32(S0.u32, S1.u32, S2.u32) == S1.u32 then\nD0.u32 = v_max_u32(S0.u32, S2.u32)\nelse\nD0.u32 = v_max_u32(S0.u32, S1.u32)\nendif',
  VOP3Op.V_SAD_U8: "// UNSIGNED comparison\ntmp = S2.u32;\ntmp += 32'U(ABSDIFF(S0.u32[7 : 0], S1.u32[7 : 0]));\ntmp += 32'U(ABSDIFF(S0.u32[15 : 8], S1.u32[15 : 8]));\ntmp += 32'U(ABSDIFF(S0.u32[23 : 16], S1.u32[23 : 16]));\ntmp += 32'U(ABSDIFF(S0.u32[31 : 24], S1.u32[31 : 24]));\nD0.u32 = tmp",
  VOP3Op.V_SAD_HI_U8: "D0.u32 = (32'U(v_sad_u8(S0, S1, 0U)) << 16U) + S2.u32",
  VOP3Op.V_SAD_U16: '// UNSIGNED comparison\ntmp = S2.u32;\ntmp += ABSDIFF(S0[15 : 0].u16, S1[15 : 0].u16);\ntmp += ABSDIFF(S0[31 : 16].u16, S1[31 : 16].u16);\nD0.u32 = tmp',
  VOP3Op.V_SAD_U32: '// UNSIGNED comparison\nD0.u32 = ABSDIFF(S0.u32, S1.u32) + S2.u32',
  VOP3Op.V_CVT_PK_U8_F32: "tmp = (S2.u32 & 32'U(~(0xff << (S1.u32[1 : 0].u32 * 8U))));\ntmp = (tmp | ((32'U(f32_to_u8(S0.f32)) & 255U) << (S1.u32[1 : 0].u32 * 8U)));\nD0.u32 = tmp",
  VOP3Op.V_DIV_FIXUP_F32: "sign_out = (sign(S1.f32) ^ sign(S2.f32));\nif isNAN(64'F(S2.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S2.f32)))\nelsif isNAN(64'F(S1.f32)) then\nD0.f32 = 32'F(cvtToQuietNAN(64'F(S1.f32)))\nelsif ((64'F(S1.f32) == 0.0) && (64'F(S2.f32) == 0.0)) then\n// 0/0\nD0.f32 = 32'F(0xffc00000)\nelsif ((64'F(abs(S1.f32)) == +INF) && (64'F(abs(S2.f32)) == +INF)) then\n// inf/inf\nD0.f32 = 32'F(0xffc00000)\nelsif ((64'F(S1.f32) == 0.0) || (64'F(abs(S2.f32)) == +INF)) then\n// x/0, or inf/y\nD0.f32 = sign_out ? -INF.f32 : +INF.f32\nelsif ((64'F(abs(S1.f32)) == +INF) || (64'F(S2.f32) == 0.0)) then\n// x/inf, 0/y\nD0.f32 = sign_out ? -0.0F : 0.0F\nelsif exponent(S2.f32) - exponent(S1.f32) < -150 then\nD0.f32 = sign_out ? -UNDERFLOW_F32 : UNDERFLOW_F32\nelsif exponent(S1.f32) == 255 then\nD0.f32 = sign_out ? -OVERFLOW_F32 : OVERFLOW_F32\nelse\nD0.f32 = sign_out ? -abs(S0.f32) : abs(S0.f32)\nendif",
  VOP3Op.V_DIV_FIXUP_F64: "sign_out = (sign(S1.f64) ^ sign(S2.f64));\nif isNAN(S2.f64) then\nD0.f64 = cvtToQuietNAN(S2.f64)\nelsif isNAN(S1.f64) then\nD0.f64 = cvtToQuietNAN(S1.f64)\nelsif ((S1.f64 == 0.0) && (S2.f64 == 0.0)) then\n// 0/0\nD0.f64 = 64'F(0xfff8000000000000LL)\nelsif ((abs(S1.f64) == +INF) && (abs(S2.f64) == +INF)) then\n// inf/inf\nD0.f64 = 64'F(0xfff8000000000000LL)\nelsif ((S1.f64 == 0.0) || (abs(S2.f64) == +INF)) then\n// x/0, or inf/y\nD0.f64 = sign_out ? -INF : +INF\nelsif ((abs(S1.f64) == +INF) || (S2.f64 == 0.0)) then\n// x/inf, 0/y\nD0.f64 = sign_out ? -0.0 : 0.0\nelsif exponent(S2.f64) - exponent(S1.f64) < -1075 then\nD0.f64 = sign_out ? -UNDERFLOW_F64 : UNDERFLOW_F64\nelsif exponent(S1.f64) == 2047 then\nD0.f64 = sign_out ? -OVERFLOW_F64 : OVERFLOW_F64\nelse\nD0.f64 = sign_out ? -abs(S0.f64) : abs(S0.f64)\nendif",
  VOP3Op.V_DIV_FMAS_F32: 'if VCC.u64[laneId] then\nD0.f32 = 2.0F ** 32 * fma(S0.f32, S1.f32, S2.f32)\nelse\nD0.f32 = fma(S0.f32, S1.f32, S2.f32)\nendif',
  VOP3Op.V_DIV_FMAS_F64: 'if VCC.u64[laneId] then\nD0.f64 = 2.0 ** 64 * fma(S0.f64, S1.f64, S2.f64)\nelse\nD0.f64 = fma(S0.f64, S1.f64, S2.f64)\nendif',
  VOP3Op.V_MSAD_U8: "// UNSIGNED comparison\ntmp = S2.u32;\ntmp += S1.u32[7 : 0] == 8'0U ? 0U : 32'U(ABSDIFF(S0.u32[7 : 0], S1.u32[7 : 0]));\ntmp += S1.u32[15 : 8] == 8'0U ? 0U : 32'U(ABSDIFF(S0.u32[15 : 8], S1.u32[15 : 8]));\ntmp += S1.u32[23 : 16] == 8'0U ? 0U : 32'U(ABSDIFF(S0.u32[23 : 16], S1.u32[23 : 16]));\ntmp += S1.u32[31 : 24] == 8'0U ? 0U : 32'U(ABSDIFF(S0.u32[31 : 24], S1.u32[31 : 24]));\nD0.u32 = tmp",
  VOP3Op.V_QSAD_PK_U16_U8: "tmp[63 : 48] = 16'B(v_sad_u8(S0[55 : 24], S1[31 : 0], S2[63 : 48].u32));\ntmp[47 : 32] = 16'B(v_sad_u8(S0[47 : 16], S1[31 : 0], S2[47 : 32].u32));\ntmp[31 : 16] = 16'B(v_sad_u8(S0[39 : 8], S1[31 : 0], S2[31 : 16].u32));\ntmp[15 : 0] = 16'B(v_sad_u8(S0[31 : 0], S1[31 : 0], S2[15 : 0].u32));\nD0.b64 = tmp.b64",
  VOP3Op.V_MQSAD_PK_U16_U8: "tmp[63 : 48] = 16'B(v_msad_u8(S0[55 : 24], S1[31 : 0], S2[63 : 48].u32));\ntmp[47 : 32] = 16'B(v_msad_u8(S0[47 : 16], S1[31 : 0], S2[47 : 32].u32));\ntmp[31 : 16] = 16'B(v_msad_u8(S0[39 : 8], S1[31 : 0], S2[31 : 16].u32));\ntmp[15 : 0] = 16'B(v_msad_u8(S0[31 : 0], S1[31 : 0], S2[15 : 0].u32));\nD0.b64 = tmp.b64",
  VOP3Op.V_MQSAD_U32_U8: "tmp[127 : 96] = 32'B(v_msad_u8(S0[55 : 24], S1[31 : 0], S2[127 : 96].u32));\ntmp[95 : 64] = 32'B(v_msad_u8(S0[47 : 16], S1[31 : 0], S2[95 : 64].u32));\ntmp[63 : 32] = 32'B(v_msad_u8(S0[39 : 8], S1[31 : 0], S2[63 : 32].u32));\ntmp[31 : 0] = 32'B(v_msad_u8(S0[31 : 0], S1[31 : 0], S2[31 : 0].u32));\nD0.b128 = tmp.b128",
  VOP3Op.V_XOR3_B32: 'D0.u32 = (S0.u32 ^ S1.u32 ^ S2.u32)',
  VOP3Op.V_MAD_U16: 'D0.u16 = S0.u16 * S1.u16 + S2.u16',
  VOP3Op.V_PERM_B32: 'D0[31 : 24] = BYTE_PERMUTE({ S0.u32, S1.u32 }, S2.u32[31 : 24]);\nD0[23 : 16] = BYTE_PERMUTE({ S0.u32, S1.u32 }, S2.u32[23 : 16]);\nD0[15 : 8] = BYTE_PERMUTE({ S0.u32, S1.u32 }, S2.u32[15 : 8]);\nD0[7 : 0] = BYTE_PERMUTE({ S0.u32, S1.u32 }, S2.u32[7 : 0])',
  VOP3Op.V_XAD_U32: 'D0.u32 = (S0.u32 ^ S1.u32) + S2.u32',
  VOP3Op.V_LSHL_ADD_U32: 'D0.u32 = (S0.u32 << S1.u32[4 : 0].u32) + S2.u32',
  VOP3Op.V_ADD_LSHL_U32: 'D0.u32 = ((S0.u32 + S1.u32) << S2.u32[4 : 0].u32)',
  VOP3Op.V_FMA_F16: 'D0.f16 = fma(S0.f16, S1.f16, S2.f16)',
  VOP3Op.V_MIN3_F16: 'D0.f16 = v_min_f16(v_min_f16(S0.f16, S1.f16), S2.f16)',
  VOP3Op.V_MIN3_I16: 'D0.i16 = v_min_i16(v_min_i16(S0.i16, S1.i16), S2.i16)',
  VOP3Op.V_MIN3_U16: 'D0.u16 = v_min_u16(v_min_u16(S0.u16, S1.u16), S2.u16)',
  VOP3Op.V_MAX3_F16: 'D0.f16 = v_max_f16(v_max_f16(S0.f16, S1.f16), S2.f16)',
  VOP3Op.V_MAX3_I16: 'D0.i16 = v_max_i16(v_max_i16(S0.i16, S1.i16), S2.i16)',
  VOP3Op.V_MAX3_U16: 'D0.u16 = v_max_u16(v_max_u16(S0.u16, S1.u16), S2.u16)',
  VOP3Op.V_MED3_F16: "if (isNAN(64'F(S0.f16)) || isNAN(64'F(S1.f16)) || isNAN(64'F(S2.f16))) then\nD0.f16 = v_min3_f16(S0.f16, S1.f16, S2.f16)\nelsif v_max3_f16(S0.f16, S1.f16, S2.f16) == S0.f16 then\nD0.f16 = v_max_f16(S1.f16, S2.f16)\nelsif v_max3_f16(S0.f16, S1.f16, S2.f16) == S1.f16 then\nD0.f16 = v_max_f16(S0.f16, S2.f16)\nelse\nD0.f16 = v_max_f16(S0.f16, S1.f16)\nendif",
  VOP3Op.V_MED3_I16: 'if v_max3_i16(S0.i16, S1.i16, S2.i16) == S0.i16 then\nD0.i16 = v_max_i16(S1.i16, S2.i16)\nelsif v_max3_i16(S0.i16, S1.i16, S2.i16) == S1.i16 then\nD0.i16 = v_max_i16(S0.i16, S2.i16)\nelse\nD0.i16 = v_max_i16(S0.i16, S1.i16)\nendif',
  VOP3Op.V_MED3_U16: 'if v_max3_u16(S0.u16, S1.u16, S2.u16) == S0.u16 then\nD0.u16 = v_max_u16(S1.u16, S2.u16)\nelsif v_max3_u16(S0.u16, S1.u16, S2.u16) == S1.u16 then\nD0.u16 = v_max_u16(S0.u16, S2.u16)\nelse\nD0.u16 = v_max_u16(S0.u16, S1.u16)\nendif',
  VOP3Op.V_MAD_I16: 'D0.i16 = S0.i16 * S1.i16 + S2.i16',
  VOP3Op.V_DIV_FIXUP_F16: "sign_out = (sign(S1.f16) ^ sign(S2.f16));\nif isNAN(64'F(S2.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S2.f16)))\nelsif isNAN(64'F(S1.f16)) then\nD0.f16 = 16'F(cvtToQuietNAN(64'F(S1.f16)))\nelsif ((64'F(S1.f16) == 0.0) && (64'F(S2.f16) == 0.0)) then\n// 0/0\nD0.f16 = 16'F(0xfe00)\nelsif ((64'F(abs(S1.f16)) == +INF) && (64'F(abs(S2.f16)) == +INF)) then\n// inf/inf\nD0.f16 = 16'F(0xfe00)\nelsif ((64'F(S1.f16) == 0.0) || (64'F(abs(S2.f16)) == +INF)) then\n// x/0, or inf/y\nD0.f16 = sign_out ? -INF.f16 : +INF.f16\nelsif ((64'F(abs(S1.f16)) == +INF) || (64'F(S2.f16) == 0.0)) then\n// x/inf, 0/y\nD0.f16 = sign_out ? -16'0.0 : 16'0.0\nelse\nD0.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16)\nendif",
  VOP3Op.V_ADD3_U32: 'D0.u32 = S0.u32 + S1.u32 + S2.u32',
  VOP3Op.V_LSHL_OR_B32: 'D0.u32 = ((S0.u32 << S1.u32[4 : 0].u32) | S2.u32)',
  VOP3Op.V_AND_OR_B32: 'D0.u32 = ((S0.u32 & S1.u32) | S2.u32)',
  VOP3Op.V_OR3_B32: 'D0.u32 = (S0.u32 | S1.u32 | S2.u32)',
  VOP3Op.V_MAD_U32_U16: "D0.u32 = 32'U(S0.u16) * 32'U(S1.u16) + S2.u32",
  VOP3Op.V_MAD_I32_I16: "D0.i32 = 32'I(S0.i16) * 32'I(S1.i16) + S2.i32",
  VOP3Op.V_PERMLANE16_B32: "declare tmp : 32'B[64];\nlanesel = { S2.u32, S1.u32 };\n// Concatenate lane select bits\nfor i in 0 : WAVE32 ? 31 : 63 do\n// Copy original S0 in case D==S0\ntmp[i] = VGPR[i][SRC0.u32]\nendfor;\nfor row in 0 : WAVE32 ? 1 : 3 do\n// Implement arbitrary swizzle within each row\nfor i in 0 : 15 do\nif EXEC[row * 16 + i].u1 then\nVGPR[row * 16 + i][VDST.u32] = tmp[64'B(row * 16) + lanesel[i * 4 + 3 : i * 4]]\nendif\nendfor\nendfor",
  VOP3Op.V_PERMLANEX16_B32: "declare tmp : 32'B[64];\nlanesel = { S2.u32, S1.u32 };\n// Concatenate lane select bits\nfor i in 0 : WAVE32 ? 31 : 63 do\n// Copy original S0 in case D==S0\ntmp[i] = VGPR[i][SRC0.u32]\nendfor;\nfor row in 0 : WAVE32 ? 1 : 3 do\n// Implement arbitrary swizzle across two rows\naltrow = { row[1], ~row[0] };\n// 1<->0, 3<->2\nfor i in 0 : 15 do\nif EXEC[row * 16 + i].u1 then\nVGPR[row * 16 + i][VDST.u32] = tmp[64'B(altrow.i32 * 16) + lanesel[i * 4 + 3 : i * 4]]\nendif\nendfor\nendfor",
  VOP3Op.V_CNDMASK_B16: 'D0.u16 = VCC.u64[laneId] ? S1.u16 : S0.u16',
  VOP3Op.V_MAXMIN_F32: 'D0.f32 = v_min_f32(v_max_f32(S0.f32, S1.f32), S2.f32)',
  VOP3Op.V_MINMAX_F32: 'D0.f32 = v_max_f32(v_min_f32(S0.f32, S1.f32), S2.f32)',
  VOP3Op.V_MAXMIN_F16: 'D0.f16 = v_min_f16(v_max_f16(S0.f16, S1.f16), S2.f16)',
  VOP3Op.V_MINMAX_F16: 'D0.f16 = v_max_f16(v_min_f16(S0.f16, S1.f16), S2.f16)',
  VOP3Op.V_MAXMIN_U32: 'D0.u32 = v_min_u32(v_max_u32(S0.u32, S1.u32), S2.u32)',
  VOP3Op.V_MINMAX_U32: 'D0.u32 = v_max_u32(v_min_u32(S0.u32, S1.u32), S2.u32)',
  VOP3Op.V_MAXMIN_I32: 'D0.i32 = v_min_i32(v_max_i32(S0.i32, S1.i32), S2.i32)',
  VOP3Op.V_MINMAX_I32: 'D0.i32 = v_max_i32(v_min_i32(S0.i32, S1.i32), S2.i32)',
  VOP3Op.V_DOT2_F16_F16: 'tmp = S2.f16;\ntmp += S0[15 : 0].f16 * S1[15 : 0].f16;\ntmp += S0[31 : 16].f16 * S1[31 : 16].f16;\nD0.f16 = tmp',
  VOP3Op.V_DOT2_BF16_BF16: 'tmp = S2.bf16;\ntmp += S0[15 : 0].bf16 * S1[15 : 0].bf16;\ntmp += S0[31 : 16].bf16 * S1[31 : 16].bf16;\nD0.bf16 = tmp',
  VOP3Op.V_ADD_NC_U16: 'D0.u16 = S0.u16 + S1.u16',
  VOP3Op.V_SUB_NC_U16: 'D0.u16 = S0.u16 - S1.u16',
  VOP3Op.V_MUL_LO_U16: 'D0.u16 = S0.u16 * S1.u16',
  VOP3Op.V_CVT_PK_I16_F32: "declare tmp : 32'B;\ntmp[31 : 16] = 16'B(v_cvt_i16_f32(S1.f32));\ntmp[15 : 0] = 16'B(v_cvt_i16_f32(S0.f32));",
  VOP3Op.V_CVT_PK_U16_F32: "declare tmp : 32'B;\ntmp[31 : 16] = 16'B(v_cvt_u16_f32(S1.f32));\ntmp[15 : 0] = 16'B(v_cvt_u16_f32(S0.f32));",
  VOP3Op.V_MAX_U16: 'D0.u16 = S0.u16 >= S1.u16 ? S0.u16 : S1.u16',
  VOP3Op.V_MAX_I16: 'D0.i16 = S0.i16 >= S1.i16 ? S0.i16 : S1.i16',
  VOP3Op.V_MIN_U16: 'D0.u16 = S0.u16 < S1.u16 ? S0.u16 : S1.u16',
  VOP3Op.V_MIN_I16: 'D0.i16 = S0.i16 < S1.i16 ? S0.i16 : S1.i16',
  VOP3Op.V_ADD_NC_I16: 'D0.i16 = S0.i16 + S1.i16',
  VOP3Op.V_SUB_NC_I16: 'D0.i16 = S0.i16 - S1.i16',
  VOP3Op.V_PACK_B32_F16: 'D0[31 : 16].f16 = S1.f16;\nD0[15 : 0].f16 = S0.f16',
  VOP3Op.V_CVT_PK_NORM_I16_F16: "declare tmp : 32'B;\ntmp[15 : 0].i16 = f16_to_snorm(S0.f16);\ntmp[31 : 16].i16 = f16_to_snorm(S1.f16);",
  VOP3Op.V_CVT_PK_NORM_U16_F16: "declare tmp : 32'B;\ntmp[15 : 0].u16 = f16_to_unorm(S0.f16);\ntmp[31 : 16].u16 = f16_to_unorm(S1.f16);",
  VOP3Op.V_LDEXP_F32: 'D0.f32 = S0.f32 * 2.0F ** S1.i32',
  VOP3Op.V_BFM_B32: 'D0.u32 = (((1U << S0[4 : 0].u32) - 1U) << S1[4 : 0].u32)',
  VOP3Op.V_BCNT_U32_B32: "tmp = S1.u32;\nfor i in 0 : 31 do\ntmp += S0[i].u32;\n// count i'th bit\nendfor;\nD0.u32 = tmp",
  VOP3Op.V_MBCNT_LO_U32_B32: 'MaskedValue = (S0.u32 & ThreadMask[31 : 0].u32);\ntmp = S1.u32;\nfor i in 0 : 31 do\nendfor;\nD0.u32 = tmp',
  VOP3Op.V_MBCNT_HI_U32_B32: 'MaskedValue = (S0.u32 & ThreadMask[63 : 32].u32);\ntmp = S1.u32;\nfor i in 0 : 31 do\nendfor;\nD0.u32 = tmp',
  VOP3Op.V_CVT_PK_NORM_I16_F32: "declare tmp : 32'B;\ntmp[15 : 0].i16 = f32_to_snorm(S0.f32);\ntmp[31 : 16].i16 = f32_to_snorm(S1.f32);",
  VOP3Op.V_CVT_PK_NORM_U16_F32: "declare tmp : 32'B;\ntmp[15 : 0].u16 = f32_to_unorm(S0.f32);\ntmp[31 : 16].u16 = f32_to_unorm(S1.f32);",
  VOP3Op.V_CVT_PK_U16_U32: "declare tmp : 32'B;\ntmp[15 : 0].u16 = u32_to_u16(S0.u32);\ntmp[31 : 16].u16 = u32_to_u16(S1.u32);",
  VOP3Op.V_CVT_PK_I16_I32: "declare tmp : 32'B;\ntmp[15 : 0].i16 = i32_to_i16(S0.i32);\ntmp[31 : 16].i16 = i32_to_i16(S1.i32);",
  VOP3Op.V_SUB_NC_I32: 'D0.i32 = S0.i32 - S1.i32',
  VOP3Op.V_ADD_NC_I32: 'D0.i32 = S0.i32 + S1.i32',
  VOP3Op.V_ADD_F64: 'D0.f64 = S0.f64 + S1.f64',
  VOP3Op.V_MUL_F64: 'D0.f64 = S0.f64 * S1.f64',
  VOP3Op.V_MIN_F64: '// Version of comparison where -0.0 < +0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(S0.f64) then\nD0.f64 = cvtToQuietNAN(S0.f64)\nelsif isSignalNAN(S1.f64) then\nD0.f64 = cvtToQuietNAN(S1.f64)\nelsif isQuietNAN(S1.f64) then\nD0.f64 = S0.f64\nelsif isQuietNAN(S0.f64) then\nD0.f64 = S1.f64\nelsif LT_NEG_ZERO(S0.f64, S1.f64) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f64 = S0.f64\nelse\nD0.f64 = S1.f64\nendif\nelse\nif isNAN(S1.f64) then\nD0.f64 = S0.f64\nelsif isNAN(S0.f64) then\nD0.f64 = S1.f64\nelsif LT_NEG_ZERO(S0.f64, S1.f64) then\n// NOTE: -0<+0 is TRUE in this comparison\nD0.f64 = S0.f64\nelse\nD0.f64 = S1.f64\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE',
  VOP3Op.V_MAX_F64: '// Version of comparison where +0.0 > -0.0, differs from IEEE\nif WAVE_MODE.IEEE then\nif isSignalNAN(S0.f64) then\nD0.f64 = cvtToQuietNAN(S0.f64)\nelsif isSignalNAN(S1.f64) then\nD0.f64 = cvtToQuietNAN(S1.f64)\nelsif isQuietNAN(S1.f64) then\nD0.f64 = S0.f64\nelsif isQuietNAN(S0.f64) then\nD0.f64 = S1.f64\nelsif GT_NEG_ZERO(S0.f64, S1.f64) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f64 = S0.f64\nelse\nD0.f64 = S1.f64\nendif\nelse\nif isNAN(S1.f64) then\nD0.f64 = S0.f64\nelsif isNAN(S0.f64) then\nD0.f64 = S1.f64\nelsif GT_NEG_ZERO(S0.f64, S1.f64) then\n// NOTE: +0>-0 is TRUE in this comparison\nD0.f64 = S0.f64\nelse\nD0.f64 = S1.f64\nendif\nendif;\n// Inequalities in the above pseudocode behave differently from IEEE',
  VOP3Op.V_LDEXP_F64: 'D0.f64 = S0.f64 * 2.0 ** S1.i32',
  VOP3Op.V_MUL_LO_U32: 'D0.u32 = S0.u32 * S1.u32',
  VOP3Op.V_MUL_HI_U32: "D0.u32 = 32'U((64'U(S0.u32) * 64'U(S1.u32)) >> 32U)",
  VOP3Op.V_MUL_HI_I32: "D0.i32 = 32'I((64'I(S0.i32) * 64'I(S1.i32)) >> 32U)",
  VOP3Op.V_TRIG_PREOP_F64: "shift = 32'I(S1[4 : 0].u32) * 53;\nif exponent(S0.f64) > 1077 then\nshift += exponent(S0.f64) - 1077\nendif;\n// (2.0/PI) == 0.{b_1200, b_1199, b_1198, ..., b_1, b_0}\n// b_1200 is the MSB of the fractional part of 2.0/PI\n// Left shift operation indicates which bits are brought\nresult = 64'F((1201'B(2.0 / PI)[1200 : 0] << shift.u32) & 1201'0x1fffffffffffff);\nscale = -53 - shift;\nif exponent(S0.f64) >= 1968 then\nscale += 128\nendif;\nD0.f64 = ldexp(result, scale)",
  VOP3Op.V_LSHLREV_B16: 'D0.u16 = (S1.u16 << S0[3 : 0].u32)',
  VOP3Op.V_LSHRREV_B16: 'D0.u16 = (S1.u16 >> S0[3 : 0].u32)',
  VOP3Op.V_ASHRREV_I16: 'D0.i16 = (S1.i16 >> S0[3 : 0].u32)',
  VOP3Op.V_LSHLREV_B64: 'D0.u64 = (S1.u64 << S0[5 : 0].u32)',
  VOP3Op.V_LSHRREV_B64: 'D0.u64 = (S1.u64 >> S0[5 : 0].u32)',
  VOP3Op.V_ASHRREV_I64: 'D0.i64 = (S1.i64 >> S0[5 : 0].u32)',
  VOP3Op.V_READLANE_B32: "declare lane : 32'U;\nif WAVE32 then\nlane = S1.u32[4 : 0].u32;\n// Lane select for wave32\nelse\nlane = S1.u32[5 : 0].u32;\n// Lane select for wave64\nendif;\nD0.b32 = VGPR[lane][SRC0.u32]",
  VOP3Op.V_WRITELANE_B32: "declare lane : 32'U;\nif WAVE32 then\nlane = S1.u32[4 : 0].u32;\n// Lane select for wave32\nelse\nlane = S1.u32[5 : 0].u32;\n// Lane select for wave64\nendif;\nVGPR[lane][VDST.u32] = S0.b32",
  VOP3Op.V_AND_B16: 'D0.u16 = (S0.u16 & S1.u16)',
  VOP3Op.V_OR_B16: 'D0.u16 = (S0.u16 | S1.u16)',
  VOP3Op.V_XOR_B16: 'D0.u16 = (S0.u16 ^ S1.u16)',
}

VOP3SDOp_PCODE = {
  VOP3SDOp.V_ADD_CO_CI_U32: "tmp = 64'U(S0.u32) + 64'U(S1.u32) + VCC.u64[laneId].u64;\nVCC.u64[laneId] = tmp >= 0x100000000ULL ? 1'1U : 1'0U;\n// VCC is an UNSIGNED overflow/carry-out for V_ADD_CO_CI_U32.\nD0.u32 = tmp.u32",
  VOP3SDOp.V_SUB_CO_CI_U32: "tmp = S0.u32 - S1.u32 - VCC.u64[laneId].u32;\nVCC.u64[laneId] = 64'U(S1.u32) + VCC.u64[laneId].u64 > 64'U(S0.u32) ? 1'1U : 1'0U;\n// VCC is an UNSIGNED overflow/carry-out for V_SUB_CO_CI_U32.\nD0.u32 = tmp.u32",
  VOP3SDOp.V_SUBREV_CO_CI_U32: "tmp = S1.u32 - S0.u32 - VCC.u64[laneId].u32;\nVCC.u64[laneId] = 64'U(S0.u32) + VCC.u64[laneId].u64 > 64'U(S1.u32) ? 1'1U : 1'0U;\n// VCC is an UNSIGNED overflow/carry-out for V_SUB_CO_CI_U32.\nD0.u32 = tmp.u32",
  VOP3SDOp.V_DIV_SCALE_F32: "VCC = 0x0LL;\nif ((64'F(S2.f32) == 0.0) || (64'F(S1.f32) == 0.0)) then\nD0.f32 = NAN.f32\nelsif exponent(S2.f32) - exponent(S1.f32) >= 96 then\n// N/D near MAX_FLOAT_F32\nVCC = 0x1LL;\nif S0.f32 == S1.f32 then\n// Only scale the denominator\nD0.f32 = ldexp(S0.f32, 64)\nendif\nelsif S1.f32 == DENORM.f32 then\nD0.f32 = ldexp(S0.f32, 64)\nelsif ((1.0 / 64'F(S1.f32) == DENORM.f64) && (S2.f32 / S1.f32 == DENORM.f32)) then\nVCC = 0x1LL;\nif S0.f32 == S1.f32 then\n// Only scale the denominator\nD0.f32 = ldexp(S0.f32, 64)\nendif\nelsif 1.0 / 64'F(S1.f32) == DENORM.f64 then\nD0.f32 = ldexp(S0.f32, -64)\nelsif S2.f32 / S1.f32 == DENORM.f32 then\nVCC = 0x1LL;\nif S0.f32 == S2.f32 then\n// Only scale the numerator\nD0.f32 = ldexp(S0.f32, 64)\nendif\nelsif exponent(S2.f32) <= 23 then\n// Numerator is tiny\nD0.f32 = ldexp(S0.f32, 64)\nendif",
  VOP3SDOp.V_DIV_SCALE_F64: 'VCC = 0x0LL;\nif ((S2.f64 == 0.0) || (S1.f64 == 0.0)) then\nD0.f64 = NAN.f64\nelsif exponent(S2.f64) - exponent(S1.f64) >= 768 then\n// N/D near MAX_FLOAT_F64\nVCC = 0x1LL;\nif S0.f64 == S1.f64 then\n// Only scale the denominator\nD0.f64 = ldexp(S0.f64, 128)\nendif\nelsif S1.f64 == DENORM.f64 then\nD0.f64 = ldexp(S0.f64, 128)\nelsif ((1.0 / S1.f64 == DENORM.f64) && (S2.f64 / S1.f64 == DENORM.f64)) then\nVCC = 0x1LL;\nif S0.f64 == S1.f64 then\n// Only scale the denominator\nD0.f64 = ldexp(S0.f64, 128)\nendif\nelsif 1.0 / S1.f64 == DENORM.f64 then\nD0.f64 = ldexp(S0.f64, -128)\nelsif S2.f64 / S1.f64 == DENORM.f64 then\nVCC = 0x1LL;\nif S0.f64 == S2.f64 then\n// Only scale the numerator\nD0.f64 = ldexp(S0.f64, 128)\nendif\nelsif exponent(S2.f64) <= 53 then\n// Numerator is tiny\nD0.f64 = ldexp(S0.f64, 128)\nendif',
  VOP3SDOp.V_MAD_U64_U32: "{ D1.u1, D0.u64 } = 65'B(65'U(S0.u32) * 65'U(S1.u32) + 65'U(S2.u64))",
  VOP3SDOp.V_MAD_I64_I32: "{ D1.i1, D0.i64 } = 65'B(65'I(S0.i32) * 65'I(S1.i32) + 65'I(S2.i64))",
  VOP3SDOp.V_ADD_CO_U32: "tmp = 64'U(S0.u32) + 64'U(S1.u32);\nVCC.u64[laneId] = tmp >= 0x100000000ULL ? 1'1U : 1'0U;\n// VCC is an UNSIGNED overflow/carry-out for V_ADD_CO_CI_U32.\nD0.u32 = tmp.u32",
  VOP3SDOp.V_SUB_CO_U32: "tmp = S0.u32 - S1.u32;\nVCC.u64[laneId] = S1.u32 > S0.u32 ? 1'1U : 1'0U;\n// VCC is an UNSIGNED overflow/carry-out for V_SUB_CO_CI_U32.\nD0.u32 = tmp.u32",
  VOP3SDOp.V_SUBREV_CO_U32: "tmp = S1.u32 - S0.u32;\nVCC.u64[laneId] = S0.u32 > S1.u32 ? 1'1U : 1'0U;\n// VCC is an UNSIGNED overflow/carry-out for V_SUB_CO_CI_U32.\nD0.u32 = tmp.u32",
}

VOP3POp_PCODE = {
  VOP3POp.V_PK_MAD_I16: 'tmp[31 : 16].i16 = S0[31 : 16].i16 * S1[31 : 16].i16 + S2[31 : 16].i16;\ntmp[15 : 0].i16 = S0[15 : 0].i16 * S1[15 : 0].i16 + S2[15 : 0].i16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_MUL_LO_U16: 'tmp[31 : 16].u16 = S0[31 : 16].u16 * S1[31 : 16].u16;\ntmp[15 : 0].u16 = S0[15 : 0].u16 * S1[15 : 0].u16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_ADD_I16: 'tmp[31 : 16].i16 = S0[31 : 16].i16 + S1[31 : 16].i16;\ntmp[15 : 0].i16 = S0[15 : 0].i16 + S1[15 : 0].i16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_SUB_I16: 'tmp[31 : 16].i16 = S0[31 : 16].i16 - S1[31 : 16].i16;\ntmp[15 : 0].i16 = S0[15 : 0].i16 - S1[15 : 0].i16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_LSHLREV_B16: 'tmp[31 : 16].u16 = (S1[31 : 16].u16 << S0.u32[19 : 16].u32);\ntmp[15 : 0].u16 = (S1[15 : 0].u16 << S0.u32[3 : 0].u32);\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_LSHRREV_B16: 'tmp[31 : 16].u16 = (S1[31 : 16].u16 >> S0.u32[19 : 16].u32);\ntmp[15 : 0].u16 = (S1[15 : 0].u16 >> S0.u32[3 : 0].u32);\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_ASHRREV_I16: 'tmp[31 : 16].i16 = (S1[31 : 16].i16 >> S0.u32[19 : 16].u32);\ntmp[15 : 0].i16 = (S1[15 : 0].i16 >> S0.u32[3 : 0].u32);\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_MAX_I16: 'tmp[31 : 16].i16 = S0[31 : 16].i16 >= S1[31 : 16].i16 ? S0[31 : 16].i16 : S1[31 : 16].i16;\ntmp[15 : 0].i16 = S0[15 : 0].i16 >= S1[15 : 0].i16 ? S0[15 : 0].i16 : S1[15 : 0].i16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_MIN_I16: 'tmp[31 : 16].i16 = S0[31 : 16].i16 < S1[31 : 16].i16 ? S0[31 : 16].i16 : S1[31 : 16].i16;\ntmp[15 : 0].i16 = S0[15 : 0].i16 < S1[15 : 0].i16 ? S0[15 : 0].i16 : S1[15 : 0].i16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_MAD_U16: 'tmp[31 : 16].u16 = S0[31 : 16].u16 * S1[31 : 16].u16 + S2[31 : 16].u16;\ntmp[15 : 0].u16 = S0[15 : 0].u16 * S1[15 : 0].u16 + S2[15 : 0].u16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_ADD_U16: 'tmp[31 : 16].u16 = S0[31 : 16].u16 + S1[31 : 16].u16;\ntmp[15 : 0].u16 = S0[15 : 0].u16 + S1[15 : 0].u16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_SUB_U16: 'tmp[31 : 16].u16 = S0[31 : 16].u16 - S1[31 : 16].u16;\ntmp[15 : 0].u16 = S0[15 : 0].u16 - S1[15 : 0].u16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_MAX_U16: 'tmp[31 : 16].u16 = S0[31 : 16].u16 >= S1[31 : 16].u16 ? S0[31 : 16].u16 : S1[31 : 16].u16;\ntmp[15 : 0].u16 = S0[15 : 0].u16 >= S1[15 : 0].u16 ? S0[15 : 0].u16 : S1[15 : 0].u16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_MIN_U16: 'tmp[31 : 16].u16 = S0[31 : 16].u16 < S1[31 : 16].u16 ? S0[31 : 16].u16 : S1[31 : 16].u16;\ntmp[15 : 0].u16 = S0[15 : 0].u16 < S1[15 : 0].u16 ? S0[15 : 0].u16 : S1[15 : 0].u16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_FMA_F16: "declare tmp : 32'B;\ntmp[31 : 16].f16 = fma(S0[31 : 16].f16, S1[31 : 16].f16, S2[31 : 16].f16);\ntmp[15 : 0].f16 = fma(S0[15 : 0].f16, S1[15 : 0].f16, S2[15 : 0].f16);\nD0.b32 = tmp",
  VOP3POp.V_PK_ADD_F16: 'tmp[31 : 16].f16 = S0[31 : 16].f16 + S1[31 : 16].f16;\ntmp[15 : 0].f16 = S0[15 : 0].f16 + S1[15 : 0].f16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_MUL_F16: 'tmp[31 : 16].f16 = S0[31 : 16].f16 * S1[31 : 16].f16;\ntmp[15 : 0].f16 = S0[15 : 0].f16 * S1[15 : 0].f16;\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_MIN_F16: 'tmp[31 : 16].f16 = v_min_f16(S0[31 : 16].f16, S1[31 : 16].f16);\ntmp[15 : 0].f16 = v_min_f16(S0[15 : 0].f16, S1[15 : 0].f16);\nD0.b32 = tmp.b32',
  VOP3POp.V_PK_MAX_F16: 'tmp[31 : 16].f16 = v_max_f16(S0[31 : 16].f16, S1[31 : 16].f16);\ntmp[15 : 0].f16 = v_max_f16(S0[15 : 0].f16, S1[15 : 0].f16);\nD0.b32 = tmp.b32',
  VOP3POp.V_DOT2_F32_F16: 'tmp = S2.f32;\ntmp += f16_to_f32(S0[15 : 0].f16) * f16_to_f32(S1[15 : 0].f16);\ntmp += f16_to_f32(S0[31 : 16].f16) * f16_to_f32(S1[31 : 16].f16);\nD0.f32 = tmp',
  VOP3POp.V_DOT4_I32_IU8: "declare A : 32'I[4];\ndeclare B : 32'I[4];\nfor i in 0 : 3 do\nA8 = S0[i * 8 + 7 : i * 8];\nB8 = S1[i * 8 + 7 : i * 8];\nendfor;\nC = S2.i32;\ntmp = C.i32;\nD0.i32 = tmp",
  VOP3POp.V_DOT4_U32_U8: 'tmp = S2.u32;\ntmp += u8_to_u32(S0[7 : 0].u8) * u8_to_u32(S1[7 : 0].u8);\ntmp += u8_to_u32(S0[15 : 8].u8) * u8_to_u32(S1[15 : 8].u8);\ntmp += u8_to_u32(S0[23 : 16].u8) * u8_to_u32(S1[23 : 16].u8);\ntmp += u8_to_u32(S0[31 : 24].u8) * u8_to_u32(S1[31 : 24].u8);\nD0.u32 = tmp',
  VOP3POp.V_DOT8_I32_IU4: "declare A : 32'I[8];\ndeclare B : 32'I[8];\nfor i in 0 : 7 do\nA4 = S0[i * 4 + 3 : i * 4];\nB4 = S1[i * 4 + 3 : i * 4];\nendfor;\nC = S2.i32;\ntmp = C.i32;\nD0.i32 = tmp",
  VOP3POp.V_DOT8_U32_U4: 'tmp = S2.u32;\ntmp += u4_to_u32(S0[3 : 0].u4) * u4_to_u32(S1[3 : 0].u4);\ntmp += u4_to_u32(S0[7 : 4].u4) * u4_to_u32(S1[7 : 4].u4);\ntmp += u4_to_u32(S0[11 : 8].u4) * u4_to_u32(S1[11 : 8].u4);\ntmp += u4_to_u32(S0[15 : 12].u4) * u4_to_u32(S1[15 : 12].u4);\ntmp += u4_to_u32(S0[19 : 16].u4) * u4_to_u32(S1[19 : 16].u4);\ntmp += u4_to_u32(S0[23 : 20].u4) * u4_to_u32(S1[23 : 20].u4);\ntmp += u4_to_u32(S0[27 : 24].u4) * u4_to_u32(S1[27 : 24].u4);\ntmp += u4_to_u32(S0[31 : 28].u4) * u4_to_u32(S1[31 : 28].u4);\nD0.u32 = tmp',
  VOP3POp.V_DOT2_F32_BF16: 'tmp = S2.f32;\ntmp += bf16_to_f32(S0[15 : 0].bf16) * bf16_to_f32(S1[15 : 0].bf16);\ntmp += bf16_to_f32(S0[31 : 16].bf16) * bf16_to_f32(S1[31 : 16].bf16);\nD0.f32 = tmp',
  VOP3POp.V_FMA_MIX_F32: "declare in : 32'F[3];\ndeclare S : 32'B[3];\nfor i in 0 : 2 do\nif !OPSEL_HI.u3[i] then\nin[i] = S[i].f32\nelsif OPSEL.u3[i] then\nin[i] = f16_to_f32(S[i][31 : 16].f16)\nelse\nin[i] = f16_to_f32(S[i][15 : 0].f16)\nendif\nendfor;\nD0[31 : 0].f32 = fma(in[0], in[1], in[2])",
  VOP3POp.V_FMA_MIXLO_F16: "declare in : 32'F[3];\ndeclare S : 32'B[3];\nfor i in 0 : 2 do\nif !OPSEL_HI.u3[i] then\nin[i] = S[i].f32\nelsif OPSEL.u3[i] then\nin[i] = f16_to_f32(S[i][31 : 16].f16)\nelse\nin[i] = f16_to_f32(S[i][15 : 0].f16)\nendif\nendfor;\nD0[15 : 0].f16 = f32_to_f16(fma(in[0], in[1], in[2]))",
  VOP3POp.V_FMA_MIXHI_F16: "declare in : 32'F[3];\ndeclare S : 32'B[3];\nfor i in 0 : 2 do\nif !OPSEL_HI.u3[i] then\nin[i] = S[i].f32\nelsif OPSEL.u3[i] then\nin[i] = f16_to_f32(S[i][31 : 16].f16)\nelse\nin[i] = f16_to_f32(S[i][15 : 0].f16)\nendif\nendfor;\nD0[31 : 16].f16 = f32_to_f16(fma(in[0], in[1], in[2]))",
  VOP3POp.V_WMMA_F32_16X16X16_F16: 'saved_exec = EXEC;\nEXEC = 64\'B(-1);\neval "D0.f32(16x16) = S0.f16(16x16) * S1.f16(16x16) + S2.f32(16x16)";\nEXEC = saved_exec',
  VOP3POp.V_WMMA_F32_16X16X16_BF16: 'saved_exec = EXEC;\nEXEC = 64\'B(-1);\neval "D0.f32(16x16) = S0.bf16(16x16) * S1.bf16(16x16) + S2.f32(16x16)";\nEXEC = saved_exec',
  VOP3POp.V_WMMA_F16_16X16X16_F16: 'saved_exec = EXEC;\nEXEC = 64\'B(-1);\neval "D0.f16(16x16) = S0.f16(16x16) * S1.f16(16x16) + S2.f16(16x16)";\nEXEC = saved_exec',
  VOP3POp.V_WMMA_BF16_16X16X16_BF16: 'saved_exec = EXEC;\nEXEC = 64\'B(-1);\neval "D0.bf16(16x16) = S0.bf16(16x16) * S1.bf16(16x16) + S2.bf16(16x16)";\nEXEC = saved_exec',
  VOP3POp.V_WMMA_I32_16X16X16_IU8: 'saved_exec = EXEC;\nEXEC = 64\'B(-1);\neval "D0.i32(16x16) = S0.iu8(16x16) * S1.iu8(16x16) + S2.i32(16x16)";\nEXEC = saved_exec',
  VOP3POp.V_WMMA_I32_16X16X16_IU4: 'saved_exec = EXEC;\nEXEC = 64\'B(-1);\neval "D0.i32(16x16) = S0.iu4(16x16) * S1.iu4(16x16) + S2.i32(16x16)";\nEXEC = saved_exec',
}

VOPCOp_PCODE = {
  VOPCOp.V_CMP_F_F16: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_LT_F16: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.f16 < S1.f16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_EQ_F16: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.f16 == S1.f16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LE_F16: 'D0.u64[laneId] = S0.f16 <= S1.f16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GT_F16: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.f16 > S1.f16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LG_F16: 'D0.u64[laneId] = S0.f16 <> S1.f16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GE_F16: 'D0.u64[laneId] = S0.f16 >= S1.f16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_O_F16: "Set the per-lane condition code to 1 iff the first input is orderable to the second input. Store the result into VCC\nD0.u64[laneId] = (!isNAN(64'F(S0.f16)) && !isNAN(64'F(S1.f16)));\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_U_F16: "VCC or a scalar register.\nD0.u64[laneId] = (isNAN(64'F(S0.f16)) || isNAN(64'F(S1.f16)));\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_NGE_F16: 'D0.u64[laneId] = !(S0.f16 >= S1.f16);\n// With NAN inputs this is not the same operation as <\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NLG_F16: 'D0.u64[laneId] = !(S0.f16 <> S1.f16);\n// With NAN inputs this is not the same operation as ==\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NGT_F16: 'VCC or a scalar register.\nD0.u64[laneId] = !(S0.f16 > S1.f16);\n// With NAN inputs this is not the same operation as <=\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NLE_F16: 'D0.u64[laneId] = !(S0.f16 <= S1.f16);\n// With NAN inputs this is not the same operation as >\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NEQ_F16: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = !(S0.f16 == S1.f16);\n// With NAN inputs this is not the same operation as !=\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NLT_F16: 'Set the per-lane condition code to 1 iff the first input is not less than the second input. Store the result into VCC\nD0.u64[laneId] = !(S0.f16 < S1.f16);\n// With NAN inputs this is not the same operation as >=\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_T_F16: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_F_F32: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_LT_F32: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.f32 < S1.f32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_EQ_F32: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.f32 == S1.f32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LE_F32: 'D0.u64[laneId] = S0.f32 <= S1.f32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GT_F32: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.f32 > S1.f32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LG_F32: 'D0.u64[laneId] = S0.f32 <> S1.f32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GE_F32: 'D0.u64[laneId] = S0.f32 >= S1.f32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_O_F32: "Set the per-lane condition code to 1 iff the first input is orderable to the second input. Store the result into VCC\nD0.u64[laneId] = (!isNAN(64'F(S0.f32)) && !isNAN(64'F(S1.f32)));\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_U_F32: "VCC or a scalar register.\nD0.u64[laneId] = (isNAN(64'F(S0.f32)) || isNAN(64'F(S1.f32)));\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_NGE_F32: 'D0.u64[laneId] = !(S0.f32 >= S1.f32);\n// With NAN inputs this is not the same operation as <\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NLG_F32: 'D0.u64[laneId] = !(S0.f32 <> S1.f32);\n// With NAN inputs this is not the same operation as ==\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NGT_F32: 'VCC or a scalar register.\nD0.u64[laneId] = !(S0.f32 > S1.f32);\n// With NAN inputs this is not the same operation as <=\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NLE_F32: 'D0.u64[laneId] = !(S0.f32 <= S1.f32);\n// With NAN inputs this is not the same operation as >\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NEQ_F32: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = !(S0.f32 == S1.f32);\n// With NAN inputs this is not the same operation as !=\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NLT_F32: 'Set the per-lane condition code to 1 iff the first input is not less than the second input. Store the result into VCC\nD0.u64[laneId] = !(S0.f32 < S1.f32);\n// With NAN inputs this is not the same operation as >=\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_T_F32: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_F_F64: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_LT_F64: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.f64 < S1.f64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_EQ_F64: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.f64 == S1.f64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LE_F64: 'D0.u64[laneId] = S0.f64 <= S1.f64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GT_F64: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.f64 > S1.f64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LG_F64: 'D0.u64[laneId] = S0.f64 <> S1.f64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GE_F64: 'D0.u64[laneId] = S0.f64 >= S1.f64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_O_F64: 'Set the per-lane condition code to 1 iff the first input is orderable to the second input. Store the result into VCC\nD0.u64[laneId] = (!isNAN(S0.f64) && !isNAN(S1.f64));\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_U_F64: 'VCC or a scalar register.\nD0.u64[laneId] = (isNAN(S0.f64) || isNAN(S1.f64));\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NGE_F64: 'D0.u64[laneId] = !(S0.f64 >= S1.f64);\n// With NAN inputs this is not the same operation as <\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NLG_F64: 'D0.u64[laneId] = !(S0.f64 <> S1.f64);\n// With NAN inputs this is not the same operation as ==\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NGT_F64: 'VCC or a scalar register.\nD0.u64[laneId] = !(S0.f64 > S1.f64);\n// With NAN inputs this is not the same operation as <=\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NLE_F64: 'D0.u64[laneId] = !(S0.f64 <= S1.f64);\n// With NAN inputs this is not the same operation as >\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NEQ_F64: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = !(S0.f64 == S1.f64);\n// With NAN inputs this is not the same operation as !=\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NLT_F64: 'Set the per-lane condition code to 1 iff the first input is not less than the second input. Store the result into VCC\nD0.u64[laneId] = !(S0.f64 < S1.f64);\n// With NAN inputs this is not the same operation as >=\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_T_F64: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_LT_I16: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.i16 < S1.i16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_EQ_I16: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.i16 == S1.i16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LE_I16: 'D0.u64[laneId] = S0.i16 <= S1.i16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GT_I16: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.i16 > S1.i16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NE_I16: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = S0.i16 <> S1.i16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GE_I16: 'D0.u64[laneId] = S0.i16 >= S1.i16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LT_U16: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.u16 < S1.u16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_EQ_U16: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.u16 == S1.u16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LE_U16: 'D0.u64[laneId] = S0.u16 <= S1.u16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GT_U16: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.u16 > S1.u16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NE_U16: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = S0.u16 <> S1.u16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GE_U16: 'D0.u64[laneId] = S0.u16 >= S1.u16;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_F_I32: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_LT_I32: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.i32 < S1.i32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_EQ_I32: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.i32 == S1.i32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LE_I32: 'D0.u64[laneId] = S0.i32 <= S1.i32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GT_I32: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.i32 > S1.i32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NE_I32: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = S0.i32 <> S1.i32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GE_I32: 'D0.u64[laneId] = S0.i32 >= S1.i32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_T_I32: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_F_U32: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_LT_U32: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.u32 < S1.u32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_EQ_U32: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.u32 == S1.u32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LE_U32: 'D0.u64[laneId] = S0.u32 <= S1.u32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GT_U32: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.u32 > S1.u32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NE_U32: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = S0.u32 <> S1.u32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GE_U32: 'D0.u64[laneId] = S0.u32 >= S1.u32;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_T_U32: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_F_I64: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_LT_I64: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.i64 < S1.i64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_EQ_I64: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.i64 == S1.i64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LE_I64: 'D0.u64[laneId] = S0.i64 <= S1.i64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GT_I64: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.i64 > S1.i64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NE_I64: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = S0.i64 <> S1.i64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GE_I64: 'D0.u64[laneId] = S0.i64 >= S1.i64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_T_I64: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_F_U64: "Set the per-lane condition code to 0. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'0U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_LT_U64: 'Set the per-lane condition code to 1 iff the first input is less than the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.u64 < S1.u64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_EQ_U64: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into VCC or a\nD0.u64[laneId] = S0.u64 == S1.u64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_LE_U64: 'D0.u64[laneId] = S0.u64 <= S1.u64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GT_U64: 'Set the per-lane condition code to 1 iff the first input is greater than the second input. Store the result into VCC\nD0.u64[laneId] = S0.u64 > S1.u64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_NE_U64: 'Set the per-lane condition code to 1 iff the first input is not equal to the second input. Store the result into VCC\nD0.u64[laneId] = S0.u64 <> S1.u64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_GE_U64: 'D0.u64[laneId] = S0.u64 >= S1.u64;\n// D0 = VCC in VOPC encoding.',
  VOPCOp.V_CMP_T_U64: "Set the per-lane condition code to 1. Store the result into VCC or a scalar register.\nD0.u64[laneId] = 1'1U;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_CLASS_F16: "half-precision float, and set the per-lane condition code to the result. Store the result into VCC or a scalar\nS1.u[0] value is a signaling NAN.\nS1.u[1] value is a quiet NAN.\nS1.u[2] value is negative infinity.\nS1.u[3] value is a negative normal value.\nS1.u[4] value is a negative denormal value.\nS1.u[5] value is negative zero.\nS1.u[6] value is positive zero.\nS1.u[7] value is a positive denormal value.\nS1.u[8] value is a positive normal value.\nS1.u[9] value is positive infinity.\ndeclare result : 1'U;\nif isSignalNAN(64'F(S0.f16)) then\nresult = S1.u32[0]\nelsif isQuietNAN(64'F(S0.f16)) then\nresult = S1.u32[1]\nelsif exponent(S0.f16) == 31 then\n// +-INF\nresult = S1.u32[sign(S0.f16) ? 2 : 9]\nelsif exponent(S0.f16) > 0 then\n// +-normal value\nresult = S1.u32[sign(S0.f16) ? 3 : 8]\nelsif 64'F(abs(S0.f16)) > 0.0 then\n// +-denormal value\nresult = S1.u32[sign(S0.f16) ? 4 : 7]\nelse\n// +-0.0\nresult = S1.u32[sign(S0.f16) ? 5 : 6]\nendif;\nD0.u64[laneId] = result;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_CLASS_F32: "single-precision float, and set the per-lane condition code to the result. Store the result into VCC or a scalar\nS1.u[0] value is a signaling NAN.\nS1.u[1] value is a quiet NAN.\nS1.u[2] value is negative infinity.\nS1.u[3] value is a negative normal value.\nS1.u[4] value is a negative denormal value.\nS1.u[5] value is negative zero.\nS1.u[6] value is positive zero.\nS1.u[7] value is a positive denormal value.\nS1.u[8] value is a positive normal value.\nS1.u[9] value is positive infinity.\ndeclare result : 1'U;\nif isSignalNAN(64'F(S0.f32)) then\nresult = S1.u32[0]\nelsif isQuietNAN(64'F(S0.f32)) then\nresult = S1.u32[1]\nelsif exponent(S0.f32) == 255 then\n// +-INF\nresult = S1.u32[sign(S0.f32) ? 2 : 9]\nelsif exponent(S0.f32) > 0 then\n// +-normal value\nresult = S1.u32[sign(S0.f32) ? 3 : 8]\nelsif 64'F(abs(S0.f32)) > 0.0 then\n// +-denormal value\nresult = S1.u32[sign(S0.f32) ? 4 : 7]\nelse\n// +-0.0\nresult = S1.u32[sign(S0.f32) ? 5 : 6]\nendif;\nD0.u64[laneId] = result;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMP_CLASS_F64: "double-precision float, and set the per-lane condition code to the result. Store the result into VCC or a scalar\nS1.u[0] value is a signaling NAN.\nS1.u[1] value is a quiet NAN.\nS1.u[2] value is negative infinity.\nS1.u[3] value is a negative normal value.\nS1.u[4] value is a negative denormal value.\nS1.u[5] value is negative zero.\nS1.u[6] value is positive zero.\nS1.u[7] value is a positive denormal value.\nS1.u[8] value is a positive normal value.\nS1.u[9] value is positive infinity.\ndeclare result : 1'U;\nif isSignalNAN(S0.f64) then\nresult = S1.u32[0]\nelsif isQuietNAN(S0.f64) then\nresult = S1.u32[1]\nelsif exponent(S0.f64) == 2047 then\n// +-INF\nresult = S1.u32[sign(S0.f64) ? 2 : 9]\nelsif exponent(S0.f64) > 0 then\n// +-normal value\nresult = S1.u32[sign(S0.f64) ? 3 : 8]\nelsif abs(S0.f64) > 0.0 then\n// +-denormal value\nresult = S1.u32[sign(S0.f64) ? 4 : 7]\nelse\n// +-0.0\nresult = S1.u32[sign(S0.f64) ? 5 : 6]\nendif;\nD0.u64[laneId] = result;\n// D0 = VCC in VOPC encoding.",
  VOPCOp.V_CMPX_F_F16: "EXEC.u64[laneId] = 1'0U",
  VOPCOp.V_CMPX_LT_F16: 'EXEC.u64[laneId] = S0.f16 < S1.f16',
  VOPCOp.V_CMPX_EQ_F16: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.f16 == S1.f16',
  VOPCOp.V_CMPX_LE_F16: 'EXEC.u64[laneId] = S0.f16 <= S1.f16',
  VOPCOp.V_CMPX_GT_F16: 'EXEC.u64[laneId] = S0.f16 > S1.f16',
  VOPCOp.V_CMPX_LG_F16: 'EXEC.u64[laneId] = S0.f16 <> S1.f16',
  VOPCOp.V_CMPX_GE_F16: 'EXEC.u64[laneId] = S0.f16 >= S1.f16',
  VOPCOp.V_CMPX_O_F16: "EXEC.u64[laneId] = (!isNAN(64'F(S0.f16)) && !isNAN(64'F(S1.f16)))",
  VOPCOp.V_CMPX_U_F16: "EXEC.u64[laneId] = (isNAN(64'F(S0.f16)) || isNAN(64'F(S1.f16)))",
  VOPCOp.V_CMPX_NGE_F16: 'EXEC.u64[laneId] = !(S0.f16 >= S1.f16);\n// With NAN inputs this is not the same operation as <',
  VOPCOp.V_CMPX_NLG_F16: 'EXEC.u64[laneId] = !(S0.f16 <> S1.f16);\n// With NAN inputs this is not the same operation as ==',
  VOPCOp.V_CMPX_NGT_F16: 'EXEC.u64[laneId] = !(S0.f16 > S1.f16);\n// With NAN inputs this is not the same operation as <=',
  VOPCOp.V_CMPX_NLE_F16: 'EXEC.u64[laneId] = !(S0.f16 <= S1.f16);\n// With NAN inputs this is not the same operation as >',
  VOPCOp.V_CMPX_NEQ_F16: 'EXEC.u64[laneId] = !(S0.f16 == S1.f16);\n// With NAN inputs this is not the same operation as !=',
  VOPCOp.V_CMPX_NLT_F16: 'EXEC.u64[laneId] = !(S0.f16 < S1.f16);\n// With NAN inputs this is not the same operation as >=',
  VOPCOp.V_CMPX_T_F16: "EXEC.u64[laneId] = 1'1U",
  VOPCOp.V_CMPX_F_F32: "EXEC.u64[laneId] = 1'0U",
  VOPCOp.V_CMPX_LT_F32: 'EXEC.u64[laneId] = S0.f32 < S1.f32',
  VOPCOp.V_CMPX_EQ_F32: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.f32 == S1.f32',
  VOPCOp.V_CMPX_LE_F32: 'EXEC.u64[laneId] = S0.f32 <= S1.f32',
  VOPCOp.V_CMPX_GT_F32: 'EXEC.u64[laneId] = S0.f32 > S1.f32',
  VOPCOp.V_CMPX_LG_F32: 'EXEC.u64[laneId] = S0.f32 <> S1.f32',
  VOPCOp.V_CMPX_GE_F32: 'EXEC.u64[laneId] = S0.f32 >= S1.f32',
  VOPCOp.V_CMPX_O_F32: "EXEC.u64[laneId] = (!isNAN(64'F(S0.f32)) && !isNAN(64'F(S1.f32)))",
  VOPCOp.V_CMPX_U_F32: "EXEC.u64[laneId] = (isNAN(64'F(S0.f32)) || isNAN(64'F(S1.f32)))",
  VOPCOp.V_CMPX_NGE_F32: 'EXEC.u64[laneId] = !(S0.f32 >= S1.f32);\n// With NAN inputs this is not the same operation as <',
  VOPCOp.V_CMPX_NLG_F32: 'EXEC.u64[laneId] = !(S0.f32 <> S1.f32);\n// With NAN inputs this is not the same operation as ==',
  VOPCOp.V_CMPX_NGT_F32: 'EXEC.u64[laneId] = !(S0.f32 > S1.f32);\n// With NAN inputs this is not the same operation as <=',
  VOPCOp.V_CMPX_NLE_F32: 'EXEC.u64[laneId] = !(S0.f32 <= S1.f32);\n// With NAN inputs this is not the same operation as >',
  VOPCOp.V_CMPX_NEQ_F32: 'EXEC.u64[laneId] = !(S0.f32 == S1.f32);\n// With NAN inputs this is not the same operation as !=',
  VOPCOp.V_CMPX_NLT_F32: 'EXEC.u64[laneId] = !(S0.f32 < S1.f32);\n// With NAN inputs this is not the same operation as >=',
  VOPCOp.V_CMPX_T_F32: "EXEC.u64[laneId] = 1'1U",
  VOPCOp.V_CMPX_F_F64: "EXEC.u64[laneId] = 1'0U",
  VOPCOp.V_CMPX_LT_F64: 'EXEC.u64[laneId] = S0.f64 < S1.f64',
  VOPCOp.V_CMPX_EQ_F64: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.f64 == S1.f64',
  VOPCOp.V_CMPX_LE_F64: 'EXEC.u64[laneId] = S0.f64 <= S1.f64',
  VOPCOp.V_CMPX_GT_F64: 'EXEC.u64[laneId] = S0.f64 > S1.f64',
  VOPCOp.V_CMPX_LG_F64: 'EXEC.u64[laneId] = S0.f64 <> S1.f64',
  VOPCOp.V_CMPX_GE_F64: 'EXEC.u64[laneId] = S0.f64 >= S1.f64',
  VOPCOp.V_CMPX_O_F64: 'EXEC.u64[laneId] = (!isNAN(S0.f64) && !isNAN(S1.f64))',
  VOPCOp.V_CMPX_U_F64: 'EXEC.u64[laneId] = (isNAN(S0.f64) || isNAN(S1.f64))',
  VOPCOp.V_CMPX_NGE_F64: 'EXEC.u64[laneId] = !(S0.f64 >= S1.f64);\n// With NAN inputs this is not the same operation as <',
  VOPCOp.V_CMPX_NLG_F64: 'EXEC.u64[laneId] = !(S0.f64 <> S1.f64);\n// With NAN inputs this is not the same operation as ==',
  VOPCOp.V_CMPX_NGT_F64: 'EXEC.u64[laneId] = !(S0.f64 > S1.f64);\n// With NAN inputs this is not the same operation as <=',
  VOPCOp.V_CMPX_NLE_F64: 'EXEC.u64[laneId] = !(S0.f64 <= S1.f64);\n// With NAN inputs this is not the same operation as >',
  VOPCOp.V_CMPX_NEQ_F64: 'EXEC.u64[laneId] = !(S0.f64 == S1.f64);\n// With NAN inputs this is not the same operation as !=',
  VOPCOp.V_CMPX_NLT_F64: 'EXEC.u64[laneId] = !(S0.f64 < S1.f64);\n// With NAN inputs this is not the same operation as >=',
  VOPCOp.V_CMPX_T_F64: "EXEC.u64[laneId] = 1'1U",
  VOPCOp.V_CMPX_LT_I16: 'EXEC.u64[laneId] = S0.i16 < S1.i16',
  VOPCOp.V_CMPX_EQ_I16: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.i16 == S1.i16',
  VOPCOp.V_CMPX_LE_I16: 'EXEC.u64[laneId] = S0.i16 <= S1.i16',
  VOPCOp.V_CMPX_GT_I16: 'EXEC.u64[laneId] = S0.i16 > S1.i16',
  VOPCOp.V_CMPX_NE_I16: 'EXEC.u64[laneId] = S0.i16 <> S1.i16',
  VOPCOp.V_CMPX_GE_I16: 'EXEC.u64[laneId] = S0.i16 >= S1.i16',
  VOPCOp.V_CMPX_LT_U16: 'EXEC.u64[laneId] = S0.u16 < S1.u16',
  VOPCOp.V_CMPX_EQ_U16: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.u16 == S1.u16',
  VOPCOp.V_CMPX_LE_U16: 'EXEC.u64[laneId] = S0.u16 <= S1.u16',
  VOPCOp.V_CMPX_GT_U16: 'EXEC.u64[laneId] = S0.u16 > S1.u16',
  VOPCOp.V_CMPX_NE_U16: 'EXEC.u64[laneId] = S0.u16 <> S1.u16',
  VOPCOp.V_CMPX_GE_U16: 'EXEC.u64[laneId] = S0.u16 >= S1.u16',
  VOPCOp.V_CMPX_F_I32: "EXEC.u64[laneId] = 1'0U",
  VOPCOp.V_CMPX_LT_I32: 'EXEC.u64[laneId] = S0.i32 < S1.i32',
  VOPCOp.V_CMPX_EQ_I32: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.i32 == S1.i32',
  VOPCOp.V_CMPX_LE_I32: 'EXEC.u64[laneId] = S0.i32 <= S1.i32',
  VOPCOp.V_CMPX_GT_I32: 'EXEC.u64[laneId] = S0.i32 > S1.i32',
  VOPCOp.V_CMPX_NE_I32: 'EXEC.u64[laneId] = S0.i32 <> S1.i32',
  VOPCOp.V_CMPX_GE_I32: 'EXEC.u64[laneId] = S0.i32 >= S1.i32',
  VOPCOp.V_CMPX_T_I32: "EXEC.u64[laneId] = 1'1U",
  VOPCOp.V_CMPX_F_U32: "EXEC.u64[laneId] = 1'0U",
  VOPCOp.V_CMPX_LT_U32: 'EXEC.u64[laneId] = S0.u32 < S1.u32',
  VOPCOp.V_CMPX_EQ_U32: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.u32 == S1.u32',
  VOPCOp.V_CMPX_LE_U32: 'EXEC.u64[laneId] = S0.u32 <= S1.u32',
  VOPCOp.V_CMPX_GT_U32: 'EXEC.u64[laneId] = S0.u32 > S1.u32',
  VOPCOp.V_CMPX_NE_U32: 'EXEC.u64[laneId] = S0.u32 <> S1.u32',
  VOPCOp.V_CMPX_GE_U32: 'EXEC.u64[laneId] = S0.u32 >= S1.u32',
  VOPCOp.V_CMPX_T_U32: "EXEC.u64[laneId] = 1'1U",
  VOPCOp.V_CMPX_F_I64: "EXEC.u64[laneId] = 1'0U",
  VOPCOp.V_CMPX_LT_I64: 'EXEC.u64[laneId] = S0.i64 < S1.i64',
  VOPCOp.V_CMPX_EQ_I64: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.i64 == S1.i64',
  VOPCOp.V_CMPX_LE_I64: 'EXEC.u64[laneId] = S0.i64 <= S1.i64',
  VOPCOp.V_CMPX_GT_I64: 'EXEC.u64[laneId] = S0.i64 > S1.i64',
  VOPCOp.V_CMPX_NE_I64: 'EXEC.u64[laneId] = S0.i64 <> S1.i64',
  VOPCOp.V_CMPX_GE_I64: 'EXEC.u64[laneId] = S0.i64 >= S1.i64',
  VOPCOp.V_CMPX_T_I64: "EXEC.u64[laneId] = 1'1U",
  VOPCOp.V_CMPX_F_U64: "EXEC.u64[laneId] = 1'0U",
  VOPCOp.V_CMPX_LT_U64: 'EXEC.u64[laneId] = S0.u64 < S1.u64',
  VOPCOp.V_CMPX_EQ_U64: 'Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC\nEXEC.u64[laneId] = S0.u64 == S1.u64',
  VOPCOp.V_CMPX_LE_U64: 'EXEC.u64[laneId] = S0.u64 <= S1.u64',
  VOPCOp.V_CMPX_GT_U64: 'EXEC.u64[laneId] = S0.u64 > S1.u64',
  VOPCOp.V_CMPX_NE_U64: 'EXEC.u64[laneId] = S0.u64 <> S1.u64',
  VOPCOp.V_CMPX_GE_U64: 'EXEC.u64[laneId] = S0.u64 >= S1.u64',
  VOPCOp.V_CMPX_T_U64: "EXEC.u64[laneId] = 1'1U",
  VOPCOp.V_CMPX_CLASS_F16: "S1.u[0] value is a signaling NAN.\nS1.u[1] value is a quiet NAN.\nS1.u[2] value is negative infinity.\nS1.u[3] value is a negative normal value.\nS1.u[4] value is a negative denormal value.\nS1.u[5] value is negative zero.\nS1.u[6] value is positive zero.\nS1.u[7] value is a positive denormal value.\nS1.u[8] value is a positive normal value.\nS1.u[9] value is positive infinity.\ndeclare result : 1'U;\nif isSignalNAN(64'F(S0.f16)) then\nresult = S1.u32[0]\nelsif isQuietNAN(64'F(S0.f16)) then\nresult = S1.u32[1]\nelsif exponent(S0.f16) == 31 then\n// +-INF\nresult = S1.u32[sign(S0.f16) ? 2 : 9]\nelsif exponent(S0.f16) > 0 then\n// +-normal value\nresult = S1.u32[sign(S0.f16) ? 3 : 8]\nelsif 64'F(abs(S0.f16)) > 0.0 then\n// +-denormal value\nresult = S1.u32[sign(S0.f16) ? 4 : 7]\nelse\n// +-0.0\nresult = S1.u32[sign(S0.f16) ? 5 : 6]\nendif;\nEXEC.u64[laneId] = result",
  VOPCOp.V_CMPX_CLASS_F32: "S1.u[0] value is a signaling NAN.\nS1.u[1] value is a quiet NAN.\nS1.u[2] value is negative infinity.\nS1.u[3] value is a negative normal value.\nS1.u[4] value is a negative denormal value.\nS1.u[5] value is negative zero.\nS1.u[6] value is positive zero.\nS1.u[7] value is a positive denormal value.\nS1.u[8] value is a positive normal value.\nS1.u[9] value is positive infinity.\ndeclare result : 1'U;\nif isSignalNAN(64'F(S0.f32)) then\nresult = S1.u32[0]\nelsif isQuietNAN(64'F(S0.f32)) then\nresult = S1.u32[1]\nelsif exponent(S0.f32) == 255 then\n// +-INF\nresult = S1.u32[sign(S0.f32) ? 2 : 9]\nelsif exponent(S0.f32) > 0 then\n// +-normal value\nresult = S1.u32[sign(S0.f32) ? 3 : 8]\nelsif 64'F(abs(S0.f32)) > 0.0 then\n// +-denormal value\nresult = S1.u32[sign(S0.f32) ? 4 : 7]\nelse\n// +-0.0\nresult = S1.u32[sign(S0.f32) ? 5 : 6]\nendif;\nEXEC.u64[laneId] = result",
  VOPCOp.V_CMPX_CLASS_F64: "S1.u[0] value is a signaling NAN.\nS1.u[1] value is a quiet NAN.\nS1.u[2] value is negative infinity.\nS1.u[3] value is a negative normal value.\nS1.u[4] value is a negative denormal value.\nS1.u[5] value is negative zero.\nS1.u[6] value is positive zero.\nS1.u[7] value is a positive denormal value.\nS1.u[8] value is a positive normal value.\nS1.u[9] value is positive infinity.\ndeclare result : 1'U;\nif isSignalNAN(S0.f64) then\nresult = S1.u32[0]\nelsif isQuietNAN(S0.f64) then\nresult = S1.u32[1]\nelsif exponent(S0.f64) == 2047 then\n// +-INF\nresult = S1.u32[sign(S0.f64) ? 2 : 9]\nelsif exponent(S0.f64) > 0 then\n// +-normal value\nresult = S1.u32[sign(S0.f64) ? 3 : 8]\nelsif abs(S0.f64) > 0.0 then\n// +-denormal value\nresult = S1.u32[sign(S0.f64) ? 4 : 7]\nelse\n// +-0.0\nresult = S1.u32[sign(S0.f64) ? 5 : 6]\nendif;\nEXEC.u64[laneId] = result",
}

DSOp_PCODE = {
  DSOp.DS_ADD_U32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 += DATA.u32;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_SUB_U32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 -= DATA.u32;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_RSUB_U32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 = DATA.u32 - MEM[ADDR].u32;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_INC_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = tmp >= src ? 0U : tmp + 1U;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_DEC_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = ((tmp == 0U) || (tmp > src)) ? src : tmp - 1U;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_MIN_I32: 'tmp = MEM[ADDR].i32;\nsrc = DATA.i32;\nMEM[ADDR].i32 = src < tmp ? src : tmp;\nRETURN_DATA.i32 = tmp',
  DSOp.DS_MAX_I32: 'tmp = MEM[ADDR].i32;\nsrc = DATA.i32;\nMEM[ADDR].i32 = src >= tmp ? src : tmp;\nRETURN_DATA.i32 = tmp',
  DSOp.DS_MIN_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = src < tmp ? src : tmp;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_MAX_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = src >= tmp ? src : tmp;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_AND_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = (tmp & DATA.b32);\nRETURN_DATA.b32 = tmp',
  DSOp.DS_OR_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = (tmp | DATA.b32);\nRETURN_DATA.b32 = tmp',
  DSOp.DS_XOR_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = (tmp ^ DATA.b32);\nRETURN_DATA.b32 = tmp',
  DSOp.DS_MSKOR_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = ((tmp & ~DATA.b32) | DATA2.b32);\nRETURN_DATA.b32 = tmp',
  DSOp.DS_STORE_B32: 'MEM[ADDR + OFFSET.u32].b32 = DATA[31 : 0]',
  DSOp.DS_STORE_2ADDR_B32: 'MEM[ADDR + OFFSET0.u32 * 4U].b32 = DATA[31 : 0];\nMEM[ADDR + OFFSET1.u32 * 4U].b32 = DATA2[31 : 0]',
  DSOp.DS_STORE_2ADDR_STRIDE64_B32: 'MEM[ADDR + OFFSET0.u32 * 256U].b32 = DATA[31 : 0];\nMEM[ADDR + OFFSET1.u32 * 256U].b32 = DATA2[31 : 0]',
  DSOp.DS_CMPSTORE_B32: 'tmp = MEM[ADDR].b32;\nsrc = DATA.b32;\ncmp = DATA2.b32;\nMEM[ADDR].b32 = tmp == cmp ? src : tmp;\nRETURN_DATA.b32 = tmp',
  DSOp.DS_CMPSTORE_F32: 'tmp = MEM[ADDR].f32;\nsrc = DATA.f32;\ncmp = DATA2.f32;\nMEM[ADDR].f32 = tmp == cmp ? src : tmp;\nRETURN_DATA.f32 = tmp',
  DSOp.DS_MIN_F32: 'tmp = MEM[ADDR].f32;\nsrc = DATA.f32;\nMEM[ADDR].f32 = src < tmp ? src : tmp;\nRETURN_DATA.f32 = tmp',
  DSOp.DS_MAX_F32: 'tmp = MEM[ADDR].f32;\nsrc = DATA.f32;\nMEM[ADDR].f32 = src > tmp ? src : tmp;\nRETURN_DATA.f32 = tmp',
  DSOp.DS_ADD_F32: 'tmp = MEM[ADDR].f32;\nMEM[ADDR].f32 += DATA.f32;\nRETURN_DATA.f32 = tmp',
  DSOp.DS_STORE_B8: 'MEM[ADDR].b8 = DATA[7 : 0]',
  DSOp.DS_STORE_B16: 'MEM[ADDR].b16 = DATA[15 : 0]',
  DSOp.DS_ADD_RTN_U32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 += DATA.u32;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_SUB_RTN_U32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 -= DATA.u32;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_RSUB_RTN_U32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 = DATA.u32 - MEM[ADDR].u32;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_INC_RTN_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = tmp >= src ? 0U : tmp + 1U;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_DEC_RTN_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = ((tmp == 0U) || (tmp > src)) ? src : tmp - 1U;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_MIN_RTN_I32: 'tmp = MEM[ADDR].i32;\nsrc = DATA.i32;\nMEM[ADDR].i32 = src < tmp ? src : tmp;\nRETURN_DATA.i32 = tmp',
  DSOp.DS_MAX_RTN_I32: 'tmp = MEM[ADDR].i32;\nsrc = DATA.i32;\nMEM[ADDR].i32 = src >= tmp ? src : tmp;\nRETURN_DATA.i32 = tmp',
  DSOp.DS_MIN_RTN_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = src < tmp ? src : tmp;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_MAX_RTN_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = src >= tmp ? src : tmp;\nRETURN_DATA.u32 = tmp',
  DSOp.DS_AND_RTN_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = (tmp & DATA.b32);\nRETURN_DATA.b32 = tmp',
  DSOp.DS_OR_RTN_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = (tmp | DATA.b32);\nRETURN_DATA.b32 = tmp',
  DSOp.DS_XOR_RTN_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = (tmp ^ DATA.b32);\nRETURN_DATA.b32 = tmp',
  DSOp.DS_MSKOR_RTN_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = ((tmp & ~DATA.b32) | DATA2.b32);\nRETURN_DATA.b32 = tmp',
  DSOp.DS_STOREXCHG_RTN_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = DATA.b32;\nRETURN_DATA.b32 = tmp',
  DSOp.DS_STOREXCHG_2ADDR_RTN_B32: 'addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 4U;\naddr2 = ADDR_BASE.u32 + OFFSET1.u32 * 4U;\ntmp1 = MEM[addr1].b32;\ntmp2 = MEM[addr2].b32;\nMEM[addr1].b32 = DATA.b32;\nMEM[addr2].b32 = DATA2.b32;\n// Note DATA2 can be any other register\nRETURN_DATA[31 : 0] = tmp1;\nRETURN_DATA[63 : 32] = tmp2',
  DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32: 'addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 256U;\naddr2 = ADDR_BASE.u32 + OFFSET1.u32 * 256U;\ntmp1 = MEM[addr1].b32;\ntmp2 = MEM[addr2].b32;\nMEM[addr1].b32 = DATA.b32;\nMEM[addr2].b32 = DATA2.b32;\n// Note DATA2 can be any other register\nRETURN_DATA[31 : 0] = tmp1;\nRETURN_DATA[63 : 32] = tmp2',
  DSOp.DS_CMPSTORE_RTN_B32: 'tmp = MEM[ADDR].b32;\nsrc = DATA.b32;\ncmp = DATA2.b32;\nMEM[ADDR].b32 = tmp == cmp ? src : tmp;\nRETURN_DATA.b32 = tmp',
  DSOp.DS_CMPSTORE_RTN_F32: 'tmp = MEM[ADDR].f32;\nsrc = DATA.f32;\ncmp = DATA2.f32;\nMEM[ADDR].f32 = tmp == cmp ? src : tmp;\nRETURN_DATA.f32 = tmp',
  DSOp.DS_MIN_RTN_F32: 'tmp = MEM[ADDR].f32;\nsrc = DATA.f32;\nMEM[ADDR].f32 = src < tmp ? src : tmp;\nRETURN_DATA.f32 = tmp',
  DSOp.DS_MAX_RTN_F32: 'tmp = MEM[ADDR].f32;\nsrc = DATA.f32;\nMEM[ADDR].f32 = src > tmp ? src : tmp;\nRETURN_DATA.f32 = tmp',
  DSOp.DS_WRAP_RTN_B32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 = tmp >= DATA.u32 ? tmp - DATA.u32 : tmp + DATA2.u32;\nRETURN_DATA = tmp',
  DSOp.DS_SWIZZLE_B32: 'offset = offset1:offset0;\nif (offset >= 0xe000) {\n// FFT decomposition\nmask = offset[4:0];\nfor (i = 0; i < 64; i++) {\nj = reverse_bits(i & 0x1f);\nj = (j >> count_ones(mask));\nj |= (i & mask);\nj |= i & 0x20;\nthread_out[i] = thread_valid[j] ? thread_in[j] : 0;\n} elsif (offset >= 0xc000) {\n// rotate\nrotate = offset[9:5];\nmask = offset[4:0];\nif (offset[10]) {\nrotate = -rotate;\nfor (i = 0; i < 64; i++) {\nj = (i & mask) | ((i + rotate) & ~mask);\nj |= i & 0x20;\nthread_out[i] = thread_valid[j] ? thread_in[j] : 0;\n// full data sharing within 4 consecutive threads\nfor (i = 0; i < 64; i+=4) {\nthread_out[i+0] = thread_valid[i+offset[1:0]]?thread_in[i+offset[1:0]]:0;\nthread_out[i+1] = thread_valid[i+offset[3:2]]?thread_in[i+offset[3:2]]:0;\nthread_out[i+2] = thread_valid[i+offset[5:4]]?thread_in[i+offset[5:4]]:0;\nthread_out[i+3] = thread_valid[i+offset[7:6]]?thread_in[i+offset[7:6]]:0;\n} else { // offset[15] == 0\n// limited data sharing within 32 consecutive threads\nxor_mask = offset[14:10];\nor_mask = offset[9:5];\nand_mask = offset[4:0];\nfor (i = 0; i < 64; i++) {\nj = (((i & 0x1f) & and_mask) | or_mask) ^ xor_mask;\nj |= (i & 0x20); // which group of 32\nthread_out[i] = thread_valid[j] ? thread_in[j] : 0;',
  DSOp.DS_LOAD_B32: 'RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET.u32].b32',
  DSOp.DS_LOAD_2ADDR_B32: 'RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET0.u32 * 4U].b32;\nRETURN_DATA[63 : 32] = MEM[ADDR + OFFSET1.u32 * 4U].b32',
  DSOp.DS_LOAD_2ADDR_STRIDE64_B32: 'RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET0.u32 * 256U].b32;\nRETURN_DATA[63 : 32] = MEM[ADDR + OFFSET1.u32 * 256U].b32',
  DSOp.DS_LOAD_I8: "RETURN_DATA.i32 = 32'I(signext(MEM[ADDR].i8))",
  DSOp.DS_LOAD_U8: "RETURN_DATA.u32 = 32'U({ 24'0U, MEM[ADDR].u8 })",
  DSOp.DS_LOAD_I16: "RETURN_DATA.i32 = 32'I(signext(MEM[ADDR].i16))",
  DSOp.DS_LOAD_U16: "RETURN_DATA.u32 = 32'U({ 16'0U, MEM[ADDR].u16 })",
  DSOp.DS_CONSUME: 'addr = M0.base + offset; // offset by LDS HWBASE, limit to M.size\nrtnval = LDS(addr);\nGPR[VDST] = rtnval; // return to all valid threads',
  DSOp.DS_APPEND: 'addr = M0.base + offset; // offset by LDS HWBASE, limit to M.size\nrtnval = LDS(addr);\nGPR[VDST] = rtnval; // return to all valid threads',
  DSOp.DS_ADD_U64: 'tmp = MEM[ADDR].u64;\nMEM[ADDR].u64 += DATA.u64;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_SUB_U64: 'tmp = MEM[ADDR].u64;\nMEM[ADDR].u64 -= DATA.u64;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_RSUB_U64: 'tmp = MEM[ADDR].u64;\nMEM[ADDR].u64 = DATA.u64 - MEM[ADDR].u64;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_INC_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = tmp >= src ? 0ULL : tmp + 1ULL;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_DEC_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = ((tmp == 0ULL) || (tmp > src)) ? src : tmp - 1ULL;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_MIN_I64: 'tmp = MEM[ADDR].i64;\nsrc = DATA.i64;\nMEM[ADDR].i64 = src < tmp ? src : tmp;\nRETURN_DATA.i64 = tmp',
  DSOp.DS_MAX_I64: 'tmp = MEM[ADDR].i64;\nsrc = DATA.i64;\nMEM[ADDR].i64 = src >= tmp ? src : tmp;\nRETURN_DATA.i64 = tmp',
  DSOp.DS_MIN_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = src < tmp ? src : tmp;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_MAX_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = src >= tmp ? src : tmp;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_AND_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = (tmp & DATA.b64);\nRETURN_DATA.b64 = tmp',
  DSOp.DS_OR_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = (tmp | DATA.b64);\nRETURN_DATA.b64 = tmp',
  DSOp.DS_XOR_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = (tmp ^ DATA.b64);\nRETURN_DATA.b64 = tmp',
  DSOp.DS_MSKOR_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = ((tmp & ~DATA.b64) | DATA2.b64);\nRETURN_DATA.b64 = tmp',
  DSOp.DS_STORE_B64: 'MEM[ADDR + OFFSET.u32].b32 = DATA[31 : 0];\nMEM[ADDR + OFFSET.u32 + 4U].b32 = DATA[63 : 32]',
  DSOp.DS_STORE_2ADDR_B64: 'MEM[ADDR + OFFSET0.u32 * 8U].b32 = DATA[31 : 0];\nMEM[ADDR + OFFSET0.u32 * 8U + 4U].b32 = DATA[63 : 32];\nMEM[ADDR + OFFSET1.u32 * 8U].b32 = DATA2[31 : 0];\nMEM[ADDR + OFFSET1.u32 * 8U + 4U].b32 = DATA2[63 : 32]',
  DSOp.DS_STORE_2ADDR_STRIDE64_B64: 'MEM[ADDR + OFFSET0.u32 * 512U].b32 = DATA[31 : 0];\nMEM[ADDR + OFFSET0.u32 * 512U + 4U].b32 = DATA[63 : 32];\nMEM[ADDR + OFFSET1.u32 * 512U].b32 = DATA2[31 : 0];\nMEM[ADDR + OFFSET1.u32 * 512U + 4U].b32 = DATA2[63 : 32]',
  DSOp.DS_CMPSTORE_B64: 'tmp = MEM[ADDR].b64;\nsrc = DATA.b64;\ncmp = DATA2.b64;\nMEM[ADDR].b64 = tmp == cmp ? src : tmp;\nRETURN_DATA.b64 = tmp',
  DSOp.DS_CMPSTORE_F64: 'tmp = MEM[ADDR].f64;\nsrc = DATA.f64;\ncmp = DATA2.f64;\nMEM[ADDR].f64 = tmp == cmp ? src : tmp;\nRETURN_DATA.f64 = tmp',
  DSOp.DS_MIN_F64: 'tmp = MEM[ADDR].f64;\nsrc = DATA.f64;\nMEM[ADDR].f64 = src < tmp ? src : tmp;\nRETURN_DATA.f64 = tmp',
  DSOp.DS_MAX_F64: 'tmp = MEM[ADDR].f64;\nsrc = DATA.f64;\nMEM[ADDR].f64 = src > tmp ? src : tmp;\nRETURN_DATA.f64 = tmp',
  DSOp.DS_ADD_RTN_U64: 'tmp = MEM[ADDR].u64;\nMEM[ADDR].u64 += DATA.u64;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_SUB_RTN_U64: 'tmp = MEM[ADDR].u64;\nMEM[ADDR].u64 -= DATA.u64;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_RSUB_RTN_U64: 'tmp = MEM[ADDR].u64;\nMEM[ADDR].u64 = DATA.u64 - MEM[ADDR].u64;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_INC_RTN_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = tmp >= src ? 0ULL : tmp + 1ULL;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_DEC_RTN_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = ((tmp == 0ULL) || (tmp > src)) ? src : tmp - 1ULL;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_MIN_RTN_I64: 'tmp = MEM[ADDR].i64;\nsrc = DATA.i64;\nMEM[ADDR].i64 = src < tmp ? src : tmp;\nRETURN_DATA.i64 = tmp',
  DSOp.DS_MAX_RTN_I64: 'tmp = MEM[ADDR].i64;\nsrc = DATA.i64;\nMEM[ADDR].i64 = src >= tmp ? src : tmp;\nRETURN_DATA.i64 = tmp',
  DSOp.DS_MIN_RTN_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = src < tmp ? src : tmp;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_MAX_RTN_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = src >= tmp ? src : tmp;\nRETURN_DATA.u64 = tmp',
  DSOp.DS_AND_RTN_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = (tmp & DATA.b64);\nRETURN_DATA.b64 = tmp',
  DSOp.DS_OR_RTN_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = (tmp | DATA.b64);\nRETURN_DATA.b64 = tmp',
  DSOp.DS_XOR_RTN_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = (tmp ^ DATA.b64);\nRETURN_DATA.b64 = tmp',
  DSOp.DS_MSKOR_RTN_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = ((tmp & ~DATA.b64) | DATA2.b64);\nRETURN_DATA.b64 = tmp',
  DSOp.DS_STOREXCHG_RTN_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = DATA.b64;\nRETURN_DATA.b64 = tmp',
  DSOp.DS_STOREXCHG_2ADDR_RTN_B64: 'addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 8U;\naddr2 = ADDR_BASE.u32 + OFFSET1.u32 * 8U;\ntmp1 = MEM[addr1].b64;\ntmp2 = MEM[addr2].b64;\nMEM[addr1].b64 = DATA.b64;\nMEM[addr2].b64 = DATA2.b64;\n// Note DATA2 can be any other register\nRETURN_DATA[63 : 0] = tmp1;\nRETURN_DATA[127 : 64] = tmp2',
  DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64: 'addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 512U;\naddr2 = ADDR_BASE.u32 + OFFSET1.u32 * 512U;\ntmp1 = MEM[addr1].b64;\ntmp2 = MEM[addr2].b64;\nMEM[addr1].b64 = DATA.b64;\nMEM[addr2].b64 = DATA2.b64;\n// Note DATA2 can be any other register\nRETURN_DATA[63 : 0] = tmp1;\nRETURN_DATA[127 : 64] = tmp2',
  DSOp.DS_CMPSTORE_RTN_B64: 'tmp = MEM[ADDR].b64;\nsrc = DATA.b64;\ncmp = DATA2.b64;\nMEM[ADDR].b64 = tmp == cmp ? src : tmp;\nRETURN_DATA.b64 = tmp',
  DSOp.DS_CMPSTORE_RTN_F64: 'tmp = MEM[ADDR].f64;\nsrc = DATA.f64;\ncmp = DATA2.f64;\nMEM[ADDR].f64 = tmp == cmp ? src : tmp;\nRETURN_DATA.f64 = tmp',
  DSOp.DS_MIN_RTN_F64: 'tmp = MEM[ADDR].f64;\nsrc = DATA.f64;\nMEM[ADDR].f64 = src < tmp ? src : tmp;\nRETURN_DATA.f64 = tmp',
  DSOp.DS_MAX_RTN_F64: 'tmp = MEM[ADDR].f64;\nsrc = DATA.f64;\nMEM[ADDR].f64 = src > tmp ? src : tmp;\nRETURN_DATA.f64 = tmp',
  DSOp.DS_LOAD_B64: 'RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET.u32].b32;\nRETURN_DATA[63 : 32] = MEM[ADDR + OFFSET.u32 + 4U].b32',
  DSOp.DS_LOAD_2ADDR_B64: 'RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET0.u32 * 8U].b32;\nRETURN_DATA[63 : 32] = MEM[ADDR + OFFSET0.u32 * 8U + 4U].b32;\nRETURN_DATA[95 : 64] = MEM[ADDR + OFFSET1.u32 * 8U].b32;\nRETURN_DATA[127 : 96] = MEM[ADDR + OFFSET1.u32 * 8U + 4U].b32',
  DSOp.DS_LOAD_2ADDR_STRIDE64_B64: 'RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET0.u32 * 512U].b32;\nRETURN_DATA[63 : 32] = MEM[ADDR + OFFSET0.u32 * 512U + 4U].b32;\nRETURN_DATA[95 : 64] = MEM[ADDR + OFFSET1.u32 * 512U].b32;\nRETURN_DATA[127 : 96] = MEM[ADDR + OFFSET1.u32 * 512U + 4U].b32',
  DSOp.DS_ADD_RTN_F32: 'tmp = MEM[ADDR].f32;\nMEM[ADDR].f32 += DATA.f32;\nRETURN_DATA.f32 = tmp',
  DSOp.DS_ADD_GS_REG_RTN: 'if OFFSET0[5:2] > 7\n// 64-bit GS register access\naddr = (OFFSET0[5:2] - 8) * 2 + 8;\nVDST[0] = GS_REGS(addr + 0);\nVDST[1] = GS_REGS(addr + 1);\n{GS_REGS(addr + 1), GS_REGS(addr)} += DATA0[0]; // source is 32 bit\nelse\naddr = OFFSET0[5:2];\nVDST[0] = GS_REGS(addr);\nGS_REGS(addr) += DATA0[0];\noffset[5:2] Register\noffset[5:2] Register',
  DSOp.DS_SUB_GS_REG_RTN: 'if OFFSET0[5:2] > 7\n// 64-bit GS register access\naddr = (OFFSET0[5:2] - 8) * 2 + 8;\nVDST[0] = GS_REGS(addr + 0);\nVDST[1] = GS_REGS(addr + 1);\n{GS_REGS(addr + 1), GS_REGS(addr)} -= DATA0[0]; // source is 32 bit\nelse\naddr = OFFSET0[5:2];\nVDST[0] = GS_REGS(addr);\nGS_REGS(addr) -= DATA0[0];\noffset[5:2] Register\noffset[5:2] Register',
  DSOp.DS_CONDXCHG32_RTN_B64: "declare OFFSET0 : 8'U;\ndeclare OFFSET1 : 8'U;\ndeclare RETURN_DATA : 32'U[2];\nADDR = S0.u32;\nDATA = S1.u64;\noffset = { OFFSET1, OFFSET0 };\nRETURN_DATA[0] = LDS[ADDR0].u32;\nif DATA[31] then\nLDS[ADDR0] = { 1'0, DATA[30 : 0] }\nendif;\nRETURN_DATA[1] = LDS[ADDR1].u32;\nif DATA[63] then\nLDS[ADDR1] = { 1'0, DATA[62 : 32] }\nendif",
  DSOp.DS_STORE_B8_D16_HI: 'MEM[ADDR].b8 = DATA[23 : 16]',
  DSOp.DS_STORE_B16_D16_HI: 'MEM[ADDR].b16 = DATA[31 : 16]',
  DSOp.DS_LOAD_U8_D16: "RETURN_DATA[15 : 0].u16 = 16'U({ 8'0U, MEM[ADDR].u8 });",
  DSOp.DS_LOAD_U8_D16_HI: "RETURN_DATA[31 : 16].u16 = 16'U({ 8'0U, MEM[ADDR].u8 });",
  DSOp.DS_LOAD_I8_D16: "RETURN_DATA[15 : 0].i16 = 16'I(signext(MEM[ADDR].i8));",
  DSOp.DS_LOAD_I8_D16_HI: "RETURN_DATA[31 : 16].i16 = 16'I(signext(MEM[ADDR].i8));",
  DSOp.DS_LOAD_U16_D16: 'RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16;',
  DSOp.DS_LOAD_U16_D16_HI: 'RETURN_DATA[31 : 16].u16 = MEM[ADDR].u16;',
  DSOp.DS_BVH_STACK_RTN_B32: 'The LDS stack address is computed using values packed into ADDR and part of OFFSET1. ADDR carries the\nstack address for the lane. OFFSET1[5:4] contains stack_size[1:0] -- this value is constant for all lanes and is\n(stack_base, stack_index) = DECODE_ADDR(ADDR, OFFSET1);\nlast_node_ptr = DATA0;\n// First 3 passes: push data onto stack\nfor i = 0..2 do\nif DATA_VALID(DATA1[i])\nMEM[stack_base + stack_index] = DATA1[i];\nelsif DATA1[i] == last_node_ptr\nendif\nendfor\n// Fourth pass: return data or pop\nif DATA_VALID(DATA1[3])\nVGPR_RTN = DATA1[3]\nelse\nVGPR_RTN = MEM[stack_base + stack_index];\nMEM[stack_base + stack_index] = INVALID_NODE;\nendif\nif data == INVALID_NODE\nelsif last_node_ptr != INVALID_NODE && data == last_node_ptr\n// Match last_node_ptr\nelse\nendif',
  DSOp.DS_STORE_ADDTID_B32: "declare OFFSET0 : 8'U;\ndeclare OFFSET1 : 8'U;\nMEM[32'I({ OFFSET1, OFFSET0 } + M0[15 : 0]) + laneID.i32 * 4].u32 = DATA0.u32",
  DSOp.DS_LOAD_ADDTID_B32: "declare OFFSET0 : 8'U;\ndeclare OFFSET1 : 8'U;\nRETURN_DATA.u32 = MEM[32'I({ OFFSET1, OFFSET0 } + M0[15 : 0]) + laneID.i32 * 4].u32",
  DSOp.DS_PERMUTE_B32: "// VGPR[laneId][index] is the VGPR RAM\n// VDST, ADDR and DATA0 are from the microcode DS encoding\ndeclare tmp : 32'B[64];\ndeclare OFFSET : 16'U;\ndeclare DATA0 : 32'U;\ndeclare VDST : 32'U;\nfor i in 0 : WAVE64 ? 63 : 31 do\ntmp[i] = 0x0\nendfor;\nfor i in 0 : WAVE64 ? 63 : 31 do\nif EXEC[i].u1 then\ndst_lane = 32'I(VGPR[i][ADDR] + OFFSET.b32) / 4 % 32;\ntmp[dst_lane] = VGPR[i][DATA0]\nendif\nendfor;\n// Copy data into destination VGPRs. If multiple sources\n// select the same destination thread, the highest-numbered\nfor i in 0 : WAVE64 ? 63 : 31 do\nif EXEC[i].u1 then\nVGPR[i][VDST] = tmp[i]\nendif\nendfor",
  DSOp.DS_BPERMUTE_B32: "Note that EXEC mask is applied to both VGPR read and write. If src_lane selects a disabled thread then zero is\n// VGPR[laneId][index] is the VGPR RAM\n// VDST, ADDR and DATA0 are from the microcode DS encoding\ndeclare tmp : 32'B[64];\ndeclare OFFSET : 16'U;\ndeclare DATA0 : 32'U;\ndeclare VDST : 32'U;\nfor i in 0 : WAVE64 ? 63 : 31 do\ntmp[i] = 0x0\nendfor;\nfor i in 0 : WAVE64 ? 63 : 31 do\nsrc_lane = 32'I(VGPR[i][ADDR] + OFFSET.b32) / 4 % 32;\nif EXEC[src_lane].u1 then\ntmp[i] = VGPR[src_lane][DATA0]\nendif\nendfor;\n// Copy data into destination VGPRs. Some source\nfor i in 0 : WAVE64 ? 63 : 31 do\nif EXEC[i].u1 then\nVGPR[i][VDST] = tmp[i]\nendif\nendfor",
  DSOp.DS_STORE_B96: 'MEM[ADDR + OFFSET.u32].b32 = DATA[31 : 0];\nMEM[ADDR + OFFSET.u32 + 4U].b32 = DATA[63 : 32];\nMEM[ADDR + OFFSET.u32 + 8U].b32 = DATA[95 : 64]',
  DSOp.DS_STORE_B128: 'MEM[ADDR + OFFSET.u32].b32 = DATA[31 : 0];\nMEM[ADDR + OFFSET.u32 + 4U].b32 = DATA[63 : 32];\nMEM[ADDR + OFFSET.u32 + 8U].b32 = DATA[95 : 64];\nMEM[ADDR + OFFSET.u32 + 12U].b32 = DATA[127 : 96]',
  DSOp.DS_LOAD_B96: 'RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET.u32].b32;\nRETURN_DATA[63 : 32] = MEM[ADDR + OFFSET.u32 + 4U].b32;\nRETURN_DATA[95 : 64] = MEM[ADDR + OFFSET.u32 + 8U].b32',
  DSOp.DS_LOAD_B128: 'RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET.u32].b32;\nRETURN_DATA[63 : 32] = MEM[ADDR + OFFSET.u32 + 4U].b32;\nRETURN_DATA[95 : 64] = MEM[ADDR + OFFSET.u32 + 8U].b32;\nRETURN_DATA[127 : 96] = MEM[ADDR + OFFSET.u32 + 12U].b32',
}

FLATOp_PCODE = {
  FLATOp.FLAT_LOAD_U8: "VDATA.u32 = 32'U({ 24'0U, MEM[ADDR].u8 })",
  FLATOp.FLAT_LOAD_I8: "VDATA.i32 = 32'I(signext(MEM[ADDR].i8))",
  FLATOp.FLAT_LOAD_U16: "VDATA.u32 = 32'U({ 16'0U, MEM[ADDR].u16 })",
  FLATOp.FLAT_LOAD_I16: "VDATA.i32 = 32'I(signext(MEM[ADDR].i16))",
  FLATOp.FLAT_LOAD_B32: 'VDATA[31 : 0] = MEM[ADDR].b32',
  FLATOp.FLAT_LOAD_B64: 'VDATA[31 : 0] = MEM[ADDR].b32;\nVDATA[63 : 32] = MEM[ADDR + 4U].b32',
  FLATOp.FLAT_LOAD_B96: 'VDATA[31 : 0] = MEM[ADDR].b32;\nVDATA[63 : 32] = MEM[ADDR + 4U].b32;\nVDATA[95 : 64] = MEM[ADDR + 8U].b32',
  FLATOp.FLAT_LOAD_B128: 'VDATA[31 : 0] = MEM[ADDR].b32;\nVDATA[63 : 32] = MEM[ADDR + 4U].b32;\nVDATA[95 : 64] = MEM[ADDR + 8U].b32;\nVDATA[127 : 96] = MEM[ADDR + 12U].b32',
  FLATOp.FLAT_STORE_B8: 'MEM[ADDR].b8 = VDATA[7 : 0]',
  FLATOp.FLAT_STORE_B16: 'MEM[ADDR].b16 = VDATA[15 : 0]',
  FLATOp.FLAT_STORE_B32: 'MEM[ADDR].b32 = VDATA[31 : 0]',
  FLATOp.FLAT_STORE_B64: 'MEM[ADDR].b32 = VDATA[31 : 0];\nMEM[ADDR + 4U].b32 = VDATA[63 : 32]',
  FLATOp.FLAT_STORE_B96: 'MEM[ADDR].b32 = VDATA[31 : 0];\nMEM[ADDR + 4U].b32 = VDATA[63 : 32];\nMEM[ADDR + 8U].b32 = VDATA[95 : 64]',
  FLATOp.FLAT_STORE_B128: 'MEM[ADDR].b32 = VDATA[31 : 0];\nMEM[ADDR + 4U].b32 = VDATA[63 : 32];\nMEM[ADDR + 8U].b32 = VDATA[95 : 64];\nMEM[ADDR + 12U].b32 = VDATA[127 : 96]',
  FLATOp.FLAT_LOAD_D16_U8: "VDATA[15 : 0].u16 = 16'U({ 8'0U, MEM[ADDR].u8 });",
  FLATOp.FLAT_LOAD_D16_I8: "VDATA[15 : 0].i16 = 16'I(signext(MEM[ADDR].i8));",
  FLATOp.FLAT_LOAD_D16_B16: 'VDATA[15 : 0].b16 = MEM[ADDR].b16;',
  FLATOp.FLAT_LOAD_D16_HI_U8: "VDATA[31 : 16].u16 = 16'U({ 8'0U, MEM[ADDR].u8 });",
  FLATOp.FLAT_LOAD_D16_HI_I8: "VDATA[31 : 16].i16 = 16'I(signext(MEM[ADDR].i8));",
  FLATOp.FLAT_LOAD_D16_HI_B16: 'VDATA[31 : 16].b16 = MEM[ADDR].b16;',
  FLATOp.FLAT_STORE_D16_HI_B8: 'MEM[ADDR].b8 = VDATA[23 : 16]',
  FLATOp.FLAT_STORE_D16_HI_B16: 'MEM[ADDR].b16 = VDATA[31 : 16]',
  FLATOp.FLAT_ATOMIC_SWAP_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = DATA.b32;\nRETURN_DATA.b32 = tmp',
  FLATOp.FLAT_ATOMIC_CMPSWAP_B32: 'tmp = MEM[ADDR].u32;\nsrc = DATA[31 : 0].u32;\ncmp = DATA[63 : 32].u32;\nMEM[ADDR].u32 = tmp == cmp ? src : tmp;\nRETURN_DATA.u32 = tmp',
  FLATOp.FLAT_ATOMIC_ADD_U32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 += DATA.u32;\nRETURN_DATA.u32 = tmp',
  FLATOp.FLAT_ATOMIC_SUB_U32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 -= DATA.u32;\nRETURN_DATA.u32 = tmp',
  FLATOp.FLAT_ATOMIC_MIN_I32: 'tmp = MEM[ADDR].i32;\nsrc = DATA.i32;\nMEM[ADDR].i32 = src < tmp ? src : tmp;\nRETURN_DATA.i32 = tmp',
  FLATOp.FLAT_ATOMIC_MIN_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = src < tmp ? src : tmp;\nRETURN_DATA.u32 = tmp',
  FLATOp.FLAT_ATOMIC_MAX_I32: 'tmp = MEM[ADDR].i32;\nsrc = DATA.i32;\nMEM[ADDR].i32 = src >= tmp ? src : tmp;\nRETURN_DATA.i32 = tmp',
  FLATOp.FLAT_ATOMIC_MAX_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = src >= tmp ? src : tmp;\nRETURN_DATA.u32 = tmp',
  FLATOp.FLAT_ATOMIC_AND_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = (tmp & DATA.b32);\nRETURN_DATA.b32 = tmp',
  FLATOp.FLAT_ATOMIC_OR_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = (tmp | DATA.b32);\nRETURN_DATA.b32 = tmp',
  FLATOp.FLAT_ATOMIC_XOR_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = (tmp ^ DATA.b32);\nRETURN_DATA.b32 = tmp',
  FLATOp.FLAT_ATOMIC_INC_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = tmp >= src ? 0U : tmp + 1U;\nRETURN_DATA.u32 = tmp',
  FLATOp.FLAT_ATOMIC_DEC_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = ((tmp == 0U) || (tmp > src)) ? src : tmp - 1U;\nRETURN_DATA.u32 = tmp',
  FLATOp.FLAT_ATOMIC_SWAP_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = DATA.b64;\nRETURN_DATA.b64 = tmp',
  FLATOp.FLAT_ATOMIC_CMPSWAP_B64: 'tmp = MEM[ADDR].u64;\nsrc = DATA[63 : 0].u64;\ncmp = DATA[127 : 64].u64;\nMEM[ADDR].u64 = tmp == cmp ? src : tmp;\nRETURN_DATA.u64 = tmp',
  FLATOp.FLAT_ATOMIC_ADD_U64: 'tmp = MEM[ADDR].u64;\nMEM[ADDR].u64 += DATA.u64;\nRETURN_DATA.u64 = tmp',
  FLATOp.FLAT_ATOMIC_SUB_U64: 'tmp = MEM[ADDR].u64;\nMEM[ADDR].u64 -= DATA.u64;\nRETURN_DATA.u64 = tmp',
  FLATOp.FLAT_ATOMIC_MIN_I64: 'tmp = MEM[ADDR].i64;\nsrc = DATA.i64;\nMEM[ADDR].i64 = src < tmp ? src : tmp;\nRETURN_DATA.i64 = tmp',
  FLATOp.FLAT_ATOMIC_MIN_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = src < tmp ? src : tmp;\nRETURN_DATA.u64 = tmp',
  FLATOp.FLAT_ATOMIC_MAX_I64: 'tmp = MEM[ADDR].i64;\nsrc = DATA.i64;\nMEM[ADDR].i64 = src >= tmp ? src : tmp;\nRETURN_DATA.i64 = tmp',
  FLATOp.FLAT_ATOMIC_MAX_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = src >= tmp ? src : tmp;\nRETURN_DATA.u64 = tmp',
  FLATOp.FLAT_ATOMIC_AND_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = (tmp & DATA.b64);\nRETURN_DATA.b64 = tmp',
  FLATOp.FLAT_ATOMIC_OR_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = (tmp | DATA.b64);\nRETURN_DATA.b64 = tmp',
  FLATOp.FLAT_ATOMIC_XOR_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = (tmp ^ DATA.b64);\nRETURN_DATA.b64 = tmp',
  FLATOp.FLAT_ATOMIC_INC_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = tmp >= src ? 0ULL : tmp + 1ULL;\nRETURN_DATA.u64 = tmp',
  FLATOp.FLAT_ATOMIC_DEC_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = ((tmp == 0ULL) || (tmp > src)) ? src : tmp - 1ULL;\nRETURN_DATA.u64 = tmp',
  FLATOp.FLAT_ATOMIC_CMPSWAP_F32: 'tmp = MEM[ADDR].f32;\nsrc = DATA[31 : 0].f32;\ncmp = DATA[63 : 32].f32;\nMEM[ADDR].f32 = tmp == cmp ? src : tmp;\nRETURN_DATA.f32 = tmp',
  FLATOp.FLAT_ATOMIC_MIN_F32: 'tmp = MEM[ADDR].f32;\nsrc = DATA.f32;\nMEM[ADDR].f32 = src < tmp ? src : tmp;\nRETURN_DATA.f32 = tmp',
  FLATOp.FLAT_ATOMIC_MAX_F32: 'tmp = MEM[ADDR].f32;\nsrc = DATA.f32;\nMEM[ADDR].f32 = src > tmp ? src : tmp;\nRETURN_DATA.f32 = tmp',
  FLATOp.FLAT_ATOMIC_ADD_F32: 'tmp = MEM[ADDR].f32;\nMEM[ADDR].f32 += DATA.f32;\nRETURN_DATA.f32 = tmp',
  FLATOp.GLOBAL_LOAD_ADDTID_B32: "RETURN_DATA.u32 = MEM[SGPR_ADDR[63 : 0] + INST_OFFSET[11 : 0].b64 + 64'B(laneID.i32 * 4)].u32",
  FLATOp.GLOBAL_STORE_ADDTID_B32: "MEM[SGPR_ADDR[63 : 0] + INST_OFFSET[11 : 0].b64 + 64'B(laneID.i32 * 4)].u32 = DATA.u32",
}

GLOBALOp_PCODE = {
  GLOBALOp.GLOBAL_LOAD_U8: "VDATA.u32 = 32'U({ 24'0U, MEM[ADDR].u8 })",
  GLOBALOp.GLOBAL_LOAD_I8: "VDATA.i32 = 32'I(signext(MEM[ADDR].i8))",
  GLOBALOp.GLOBAL_LOAD_U16: "VDATA.u32 = 32'U({ 16'0U, MEM[ADDR].u16 })",
  GLOBALOp.GLOBAL_LOAD_I16: "VDATA.i32 = 32'I(signext(MEM[ADDR].i16))",
  GLOBALOp.GLOBAL_LOAD_B32: 'VDATA[31 : 0] = MEM[ADDR].b32',
  GLOBALOp.GLOBAL_LOAD_B64: 'VDATA[31 : 0] = MEM[ADDR].b32;\nVDATA[63 : 32] = MEM[ADDR + 4U].b32',
  GLOBALOp.GLOBAL_LOAD_B96: 'VDATA[31 : 0] = MEM[ADDR].b32;\nVDATA[63 : 32] = MEM[ADDR + 4U].b32;\nVDATA[95 : 64] = MEM[ADDR + 8U].b32',
  GLOBALOp.GLOBAL_LOAD_B128: 'VDATA[31 : 0] = MEM[ADDR].b32;\nVDATA[63 : 32] = MEM[ADDR + 4U].b32;\nVDATA[95 : 64] = MEM[ADDR + 8U].b32;\nVDATA[127 : 96] = MEM[ADDR + 12U].b32',
  GLOBALOp.GLOBAL_STORE_B8: 'MEM[ADDR].b8 = VDATA[7 : 0]',
  GLOBALOp.GLOBAL_STORE_B16: 'MEM[ADDR].b16 = VDATA[15 : 0]',
  GLOBALOp.GLOBAL_STORE_B32: 'MEM[ADDR].b32 = VDATA[31 : 0]',
  GLOBALOp.GLOBAL_STORE_B64: 'MEM[ADDR].b32 = VDATA[31 : 0];\nMEM[ADDR + 4U].b32 = VDATA[63 : 32]',
  GLOBALOp.GLOBAL_STORE_B96: 'MEM[ADDR].b32 = VDATA[31 : 0];\nMEM[ADDR + 4U].b32 = VDATA[63 : 32];\nMEM[ADDR + 8U].b32 = VDATA[95 : 64]',
  GLOBALOp.GLOBAL_STORE_B128: 'MEM[ADDR].b32 = VDATA[31 : 0];\nMEM[ADDR + 4U].b32 = VDATA[63 : 32];\nMEM[ADDR + 8U].b32 = VDATA[95 : 64];\nMEM[ADDR + 12U].b32 = VDATA[127 : 96]',
  GLOBALOp.GLOBAL_LOAD_D16_U8: "VDATA[15 : 0].u16 = 16'U({ 8'0U, MEM[ADDR].u8 });",
  GLOBALOp.GLOBAL_LOAD_D16_I8: "VDATA[15 : 0].i16 = 16'I(signext(MEM[ADDR].i8));",
  GLOBALOp.GLOBAL_LOAD_D16_B16: 'VDATA[15 : 0].b16 = MEM[ADDR].b16;',
  GLOBALOp.GLOBAL_LOAD_D16_HI_U8: "VDATA[31 : 16].u16 = 16'U({ 8'0U, MEM[ADDR].u8 });",
  GLOBALOp.GLOBAL_LOAD_D16_HI_I8: "VDATA[31 : 16].i16 = 16'I(signext(MEM[ADDR].i8));",
  GLOBALOp.GLOBAL_LOAD_D16_HI_B16: 'VDATA[31 : 16].b16 = MEM[ADDR].b16;',
  GLOBALOp.GLOBAL_STORE_D16_HI_B8: 'MEM[ADDR].b8 = VDATA[23 : 16]',
  GLOBALOp.GLOBAL_STORE_D16_HI_B16: 'MEM[ADDR].b16 = VDATA[31 : 16]',
  GLOBALOp.GLOBAL_LOAD_ADDTID_B32: "RETURN_DATA.u32 = MEM[SGPR_ADDR[63 : 0] + INST_OFFSET[11 : 0].b64 + 64'B(laneID.i32 * 4)].u32",
  GLOBALOp.GLOBAL_STORE_ADDTID_B32: "MEM[SGPR_ADDR[63 : 0] + INST_OFFSET[11 : 0].b64 + 64'B(laneID.i32 * 4)].u32 = DATA.u32",
  GLOBALOp.GLOBAL_ATOMIC_SWAP_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = DATA.b32;\nRETURN_DATA.b32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_CMPSWAP_B32: 'tmp = MEM[ADDR].u32;\nsrc = DATA[31 : 0].u32;\ncmp = DATA[63 : 32].u32;\nMEM[ADDR].u32 = tmp == cmp ? src : tmp;\nRETURN_DATA.u32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_ADD_U32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 += DATA.u32;\nRETURN_DATA.u32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_SUB_U32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 -= DATA.u32;\nRETURN_DATA.u32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_CSUB_U32: "declare new_value : 32'U;\nold_value = MEM[ADDR].u32;\nif old_value < DATA.u32 then\nnew_value = 0U\nelse\nnew_value = old_value - DATA.u32\nendif;\nMEM[ADDR].u32 = new_value;\nRETURN_DATA.u32 = old_value",
  GLOBALOp.GLOBAL_ATOMIC_MIN_I32: 'tmp = MEM[ADDR].i32;\nsrc = DATA.i32;\nMEM[ADDR].i32 = src < tmp ? src : tmp;\nRETURN_DATA.i32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_MIN_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = src < tmp ? src : tmp;\nRETURN_DATA.u32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_MAX_I32: 'tmp = MEM[ADDR].i32;\nsrc = DATA.i32;\nMEM[ADDR].i32 = src >= tmp ? src : tmp;\nRETURN_DATA.i32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_MAX_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = src >= tmp ? src : tmp;\nRETURN_DATA.u32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_AND_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = (tmp & DATA.b32);\nRETURN_DATA.b32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_OR_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = (tmp | DATA.b32);\nRETURN_DATA.b32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_XOR_B32: 'tmp = MEM[ADDR].b32;\nMEM[ADDR].b32 = (tmp ^ DATA.b32);\nRETURN_DATA.b32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_INC_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = tmp >= src ? 0U : tmp + 1U;\nRETURN_DATA.u32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_DEC_U32: 'tmp = MEM[ADDR].u32;\nsrc = DATA.u32;\nMEM[ADDR].u32 = ((tmp == 0U) || (tmp > src)) ? src : tmp - 1U;\nRETURN_DATA.u32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_SWAP_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = DATA.b64;\nRETURN_DATA.b64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_CMPSWAP_B64: 'tmp = MEM[ADDR].u64;\nsrc = DATA[63 : 0].u64;\ncmp = DATA[127 : 64].u64;\nMEM[ADDR].u64 = tmp == cmp ? src : tmp;\nRETURN_DATA.u64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_ADD_U64: 'tmp = MEM[ADDR].u64;\nMEM[ADDR].u64 += DATA.u64;\nRETURN_DATA.u64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_SUB_U64: 'tmp = MEM[ADDR].u64;\nMEM[ADDR].u64 -= DATA.u64;\nRETURN_DATA.u64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_MIN_I64: 'tmp = MEM[ADDR].i64;\nsrc = DATA.i64;\nMEM[ADDR].i64 = src < tmp ? src : tmp;\nRETURN_DATA.i64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_MIN_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = src < tmp ? src : tmp;\nRETURN_DATA.u64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_MAX_I64: 'tmp = MEM[ADDR].i64;\nsrc = DATA.i64;\nMEM[ADDR].i64 = src >= tmp ? src : tmp;\nRETURN_DATA.i64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_MAX_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = src >= tmp ? src : tmp;\nRETURN_DATA.u64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_AND_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = (tmp & DATA.b64);\nRETURN_DATA.b64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_OR_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = (tmp | DATA.b64);\nRETURN_DATA.b64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_XOR_B64: 'tmp = MEM[ADDR].b64;\nMEM[ADDR].b64 = (tmp ^ DATA.b64);\nRETURN_DATA.b64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_INC_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = tmp >= src ? 0ULL : tmp + 1ULL;\nRETURN_DATA.u64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_DEC_U64: 'tmp = MEM[ADDR].u64;\nsrc = DATA.u64;\nMEM[ADDR].u64 = ((tmp == 0ULL) || (tmp > src)) ? src : tmp - 1ULL;\nRETURN_DATA.u64 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_CMPSWAP_F32: 'tmp = MEM[ADDR].f32;\nsrc = DATA[31 : 0].f32;\ncmp = DATA[63 : 32].f32;\nMEM[ADDR].f32 = tmp == cmp ? src : tmp;\nRETURN_DATA.f32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_MIN_F32: 'tmp = MEM[ADDR].f32;\nsrc = DATA.f32;\nMEM[ADDR].f32 = src < tmp ? src : tmp;\nRETURN_DATA.f32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_MAX_F32: 'tmp = MEM[ADDR].f32;\nsrc = DATA.f32;\nMEM[ADDR].f32 = src > tmp ? src : tmp;\nRETURN_DATA.f32 = tmp',
  GLOBALOp.GLOBAL_ATOMIC_ADD_F32: 'tmp = MEM[ADDR].f32;\nMEM[ADDR].f32 += DATA.f32;\nRETURN_DATA.f32 = tmp',
}

SCRATCHOp_PCODE = {
  SCRATCHOp.SCRATCH_LOAD_U8: "VDATA.u32 = 32'U({ 24'0U, MEM[ADDR].u8 })",
  SCRATCHOp.SCRATCH_LOAD_I8: "VDATA.i32 = 32'I(signext(MEM[ADDR].i8))",
  SCRATCHOp.SCRATCH_LOAD_U16: "VDATA.u32 = 32'U({ 16'0U, MEM[ADDR].u16 })",
  SCRATCHOp.SCRATCH_LOAD_I16: "VDATA.i32 = 32'I(signext(MEM[ADDR].i16))",
  SCRATCHOp.SCRATCH_LOAD_B32: 'VDATA[31 : 0] = MEM[ADDR].b32',
  SCRATCHOp.SCRATCH_LOAD_B64: 'VDATA[31 : 0] = MEM[ADDR].b32;\nVDATA[63 : 32] = MEM[ADDR + 4U].b32',
  SCRATCHOp.SCRATCH_LOAD_B96: 'VDATA[31 : 0] = MEM[ADDR].b32;\nVDATA[63 : 32] = MEM[ADDR + 4U].b32;\nVDATA[95 : 64] = MEM[ADDR + 8U].b32',
  SCRATCHOp.SCRATCH_LOAD_B128: 'VDATA[31 : 0] = MEM[ADDR].b32;\nVDATA[63 : 32] = MEM[ADDR + 4U].b32;\nVDATA[95 : 64] = MEM[ADDR + 8U].b32;\nVDATA[127 : 96] = MEM[ADDR + 12U].b32',
  SCRATCHOp.SCRATCH_STORE_B8: 'MEM[ADDR].b8 = VDATA[7 : 0]',
  SCRATCHOp.SCRATCH_STORE_B16: 'MEM[ADDR].b16 = VDATA[15 : 0]',
  SCRATCHOp.SCRATCH_STORE_B32: 'MEM[ADDR].b32 = VDATA[31 : 0]',
  SCRATCHOp.SCRATCH_STORE_B64: 'MEM[ADDR].b32 = VDATA[31 : 0];\nMEM[ADDR + 4U].b32 = VDATA[63 : 32]',
  SCRATCHOp.SCRATCH_STORE_B96: 'MEM[ADDR].b32 = VDATA[31 : 0];\nMEM[ADDR + 4U].b32 = VDATA[63 : 32];\nMEM[ADDR + 8U].b32 = VDATA[95 : 64]',
  SCRATCHOp.SCRATCH_STORE_B128: 'MEM[ADDR].b32 = VDATA[31 : 0];\nMEM[ADDR + 4U].b32 = VDATA[63 : 32];\nMEM[ADDR + 8U].b32 = VDATA[95 : 64];\nMEM[ADDR + 12U].b32 = VDATA[127 : 96]',
  SCRATCHOp.SCRATCH_LOAD_D16_U8: "VDATA[15 : 0].u16 = 16'U({ 8'0U, MEM[ADDR].u8 });",
  SCRATCHOp.SCRATCH_LOAD_D16_I8: "VDATA[15 : 0].i16 = 16'I(signext(MEM[ADDR].i8));",
  SCRATCHOp.SCRATCH_LOAD_D16_B16: 'VDATA[15 : 0].b16 = MEM[ADDR].b16;',
  SCRATCHOp.SCRATCH_LOAD_D16_HI_U8: "VDATA[31 : 16].u16 = 16'U({ 8'0U, MEM[ADDR].u8 });",
  SCRATCHOp.SCRATCH_LOAD_D16_HI_I8: "VDATA[31 : 16].i16 = 16'I(signext(MEM[ADDR].i8));",
  SCRATCHOp.SCRATCH_LOAD_D16_HI_B16: 'VDATA[31 : 16].b16 = MEM[ADDR].b16;',
  SCRATCHOp.SCRATCH_STORE_D16_HI_B8: 'MEM[ADDR].b8 = VDATA[23 : 16]',
  SCRATCHOp.SCRATCH_STORE_D16_HI_B16: 'MEM[ADDR].b16 = VDATA[31 : 16]',
}

PSEUDOCODE_STRINGS = {
  SOP1Op: SOP1Op_PCODE,
  SOP2Op: SOP2Op_PCODE,
  SOPCOp: SOPCOp_PCODE,
  SOPKOp: SOPKOp_PCODE,
  SOPPOp: SOPPOp_PCODE,
  SMEMOp: SMEMOp_PCODE,
  VOP1Op: VOP1Op_PCODE,
  VOP2Op: VOP2Op_PCODE,
  VOP3Op: VOP3Op_PCODE,
  VOP3SDOp: VOP3SDOp_PCODE,
  VOP3POp: VOP3POp_PCODE,
  VOPCOp: VOPCOp_PCODE,
  DSOp: DSOp_PCODE,
  FLATOp: FLATOp_PCODE,
  GLOBALOp: GLOBALOp_PCODE,
  SCRATCHOp: SCRATCHOp_PCODE,
}