diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 4807cc465b..ef528e6e72 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -655,7 +655,7 @@ jobs:
         uses: ./.github/actions/process-replay
 
   testrdna3:
-    name: RDNA3 IDE
+    name: AMD ASM IDE
     runs-on: ubuntu-24.04
     timeout-minutes: 10
     steps:
@@ -674,19 +674,27 @@ jobs:
           sudo apt-get update
           sudo apt-get install llvm-21 llvm-21-tools cloc
     - name: RDNA3 Line Count
-      run: cloc --by-file extra/assembly/rdna3/*.py
+      run: cloc --by-file extra/assembly/amd/*.py
     - name: Run RDNA3 emulator tests
-      run: python -m pytest -n=auto extra/assembly/rdna3/ --durations 20
+      run: python -m pytest -n=auto extra/assembly/amd/ --durations 20
     - name: Install pdfplumber
       run: pip install pdfplumber
     - name: Verify RDNA3 autogen is up to date
       run: |
-        python -m extra.assembly.rdna3.lib
-        git diff --exit-code extra/assembly/rdna3/autogen/__init__.py
+        python -m extra.assembly.amd.lib --arch rdna3
+        git diff --exit-code extra/assembly/amd/autogen/rdna3/__init__.py
+    - name: Verify CDNA4 autogen is up to date
+      run: |
+        python -m extra.assembly.amd.lib --arch cdna4
+        git diff --exit-code extra/assembly/amd/autogen/cdna4/__init__.py
     - name: Verify RDNA3 pcode autogen is up to date
       run: |
-        python -m extra.assembly.rdna3.pcode
-        git diff --exit-code extra/assembly/rdna3/autogen/gen_pcode.py
+        python -m extra.assembly.amd.pcode --arch rdna3
+        git diff --exit-code extra/assembly/amd/autogen/rdna3/gen_pcode.py
+    - name: Verify CDNA4 pcode autogen is up to date
+      run: |
+        python -m extra.assembly.amd.pcode --arch cdna4
+        git diff --exit-code extra/assembly/amd/autogen/cdna4/gen_pcode.py
 
   testnvidia:
     strategy:
diff --git a/CLAUDE.md b/CLAUDE.md
index 7488c777c5..145841e2e7 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -76,10 +76,12 @@ VIZ=1 python -c "from tinygrad import Tensor; Tensor.ones(10).sum().realize()"
 ## Auto-generated Files (DO NOT EDIT)
 
 The following files are auto-generated and should never be edited manually:
-- `extra/assembly/rdna3/autogen/gen_pcode.py` - Generated by `python -m extra.assembly.rdna3.pcode`
-- `extra/assembly/rdna3/autogen/__init__.py` - Generated from AMD ISA definitions
+- `extra/assembly/amd/autogen/rdna3/__init__.py` - Generated by `python -m extra.assembly.amd.lib --arch rdna3`
+- `extra/assembly/amd/autogen/rdna3/gen_pcode.py` - Generated by `python -m extra.assembly.amd.pcode --arch rdna3`
+- `extra/assembly/amd/autogen/cdna4/__init__.py` - Generated by `python -m extra.assembly.amd.lib --arch cdna4`
+- `extra/assembly/amd/autogen/cdna4/gen_pcode.py` - Generated by `python -m extra.assembly.amd.pcode --arch cdna4`
 
-To add missing instruction implementations, add them to `extra/assembly/rdna3/emu.py` instead.
+To add missing instruction implementations, add them to `extra/assembly/amd/emu.py` instead.
 
 ## Style Notes
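The CLAUDE.md entry above lists one regeneration command per arch. As a sketch of how a caller can pick the matching autogen module (the same per-arch import pattern asm.py switches to later in this diff) — the load_autogen helper itself is illustrative, not part of the codebase; only the extra.assembly.amd.autogen.{rdna3,cdna4} layout comes from this PR:

import importlib

def load_autogen(arch: str):
  # hypothetical helper: maps "rdna3"/"cdna4" to the autogen packages added in this PR
  assert arch in ("rdna3", "cdna4"), f"unknown arch: {arch}"
  return importlib.import_module(f"extra.assembly.amd.autogen.{arch}")

autogen = load_autogen("cdna4")  # then e.g. autogen.DSOp.DS_ADD_U32 == 0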
diff --git a/extra/assembly/rdna3/asm.py b/extra/assembly/amd/asm.py
similarity index 99%
rename from extra/assembly/rdna3/asm.py
rename to extra/assembly/amd/asm.py
index 8f39d16ade..50a382d0f8 100644
--- a/extra/assembly/rdna3/asm.py
+++ b/extra/assembly/amd/asm.py
@@ -1,7 +1,7 @@
 # RDNA3 assembler and disassembler
 from __future__ import annotations
 import re
-from extra.assembly.rdna3.lib import Inst, RawImm, Reg, SGPR, VGPR, TTMP, s, v, ttmp, _RegFactory, FLOAT_ENC, SRC_FIELDS, unwrap
+from extra.assembly.amd.lib import Inst, RawImm, Reg, SGPR, VGPR, TTMP, s, v, ttmp, _RegFactory, FLOAT_ENC, SRC_FIELDS, unwrap
 
 # Decoding helpers
 SPECIAL_GPRS = {106: "vcc_lo", 107: "vcc_hi", 124: "null", 125: "m0", 126: "exec_lo", 127: "exec_hi", 253: "scc"}
@@ -91,7 +91,7 @@ def disasm(inst: Inst) -> str:
   # VOP3 and VOP3SD share encoding - check opcode to determine which
   is_vop3sd = cls_name == 'VOP3' and op_val in VOP3SD_OPCODES
   try:
-    from extra.assembly.rdna3 import autogen
+    from extra.assembly.amd.autogen import rdna3 as autogen
     if is_vop3sd:
       op_name = autogen.VOP3SDOp(op_val).name.lower()
     else:
@@ -347,7 +347,7 @@ def disasm(inst: Inst) -> str:
 
   # VOPD: dual-issue instructions
   if cls_name == 'VOPD':
-    from extra.assembly.rdna3 import autogen
+    from extra.assembly.amd.autogen import rdna3 as autogen
     opx, opy, vdstx, vdsty_enc = [unwrap(inst._values.get(f, 0)) for f in ('opx', 'opy', 'vdstx', 'vdsty')]
     srcx0, vsrcx1, srcy0, vsrcy1 = [unwrap(inst._values.get(f, 0)) for f in ('srcx0', 'vsrcx1', 'srcy0', 'vsrcy1')]
     vdsty = (vdsty_enc << 1) | ((vdstx & 1) ^ 1)  # Decode vdsty
@@ -505,7 +505,7 @@ SOPK_IMM_FIRST = {'s_setreg_b32'}
 SOPK_UNSUPPORTED = {'s_setreg_imm32_b32'}
 
 def asm(text: str) -> Inst:
-  from extra.assembly.rdna3 import autogen
+  from extra.assembly.amd.autogen import rdna3 as autogen
   text = text.strip()
   clamp = 'clamp' in text.lower()
   if clamp: text = re.sub(r'\s+clamp\s*$', '', text, flags=re.I)
diff --git a/extra/assembly/amd/autogen/cdna4/__init__.py b/extra/assembly/amd/autogen/cdna4/__init__.py
new file mode 100644
index 0000000000..568878d989
--- /dev/null
+++ b/extra/assembly/amd/autogen/cdna4/__init__.py
@@ -0,0 +1,3441 @@
+# autogenerated from AMD CDNA4 ISA PDF by lib.py - do not edit
+from enum import IntEnum
+from typing import Annotated
+from extra.assembly.amd.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField
+import functools
+
+class SrcEnum(IntEnum):
+  S_ADD_U32 = 0
+  S_SUB_U32 = 1
+  S_ADD_I32 = 2
+  S_SUB_I32 = 3
+  S_ADDC_U32 = 4
+  S_SUBB_U32 = 5
+  S_MIN_I32 = 6
+  FLAT_SCRATCH_LO = 102
+  FLAT_SCRATCH_HI = 103
+  XNACK_MASK_LO = 104
+  XNACK_MASK_HI = 105
+  VCC_LO = 106
+  VCC_HI = 107
+  M0 = 124
+  EXEC_LO = 126
+  EXEC_HI = 127
+  ZERO = 128
+  DPP8 = 233
+  DPP8FI = 234
+  SHARED_BASE = 235
+  SHARED_LIMIT = 236
+  PRIVATE_BASE = 237
+  PRIVATE_LIMIT = 238
+  RESERVED = 239
+  POS_HALF = 240
+  NEG_HALF = 241
+  POS_ONE = 242
+  NEG_ONE = 243
+  POS_TWO = 244
+  NEG_TWO = 245
+  POS_FOUR = 246
+  NEG_FOUR = 247
+  INV_2PI = 248
+  DPP16 = 250
+  VCCZ = 251
+  EXECZ = 252
+  SCC = 253
+  LDS_DIRECT = 254
+
+class DSOp(IntEnum):
+  DS_ADD_U32 = 0
+  DS_SUB_U32 = 1
+  DS_RSUB_U32 = 2
+  DS_INC_U32 = 3
+  DS_DEC_U32 = 4
+  DS_MIN_I32 = 5
+  DS_MAX_I32 = 6
+  DS_MIN_U32 = 7
+  DS_MAX_U32 = 8
+  DS_AND_B32 = 9
+  DS_OR_B32 = 10
+  DS_XOR_B32 = 11
+  DS_MSKOR_B32 = 12
+  DS_WRITE_B32 = 13
+  DS_WRITE2_B32 = 14
+  DS_WRITE2ST64_B32 = 15
+  DS_CMPST_B32 = 16
+  DS_CMPST_F32 = 17
+  DS_MIN_F32 = 18
+  DS_MAX_F32 = 19
+  DS_NOP = 20
+  DS_ADD_F32 = 21
+  DS_PK_ADD_F16 = 23
+  DS_PK_ADD_BF16 = 24
+  DS_WRITE_ADDTID_B32 = 29
+  DS_WRITE_B8 = 30
+  DS_WRITE_B16 = 31
+  DS_ADD_RTN_U32 = 32
+  DS_SUB_RTN_U32 = 33
+  DS_RSUB_RTN_U32 = 34
+  DS_INC_RTN_U32 = 35
+  DS_DEC_RTN_U32 = 36
+  DS_MIN_RTN_I32 = 37
+  DS_MAX_RTN_I32 = 38
+  DS_MIN_RTN_U32 = 39
+  DS_MAX_RTN_U32 = 40
+  DS_AND_RTN_B32 = 41
+  DS_OR_RTN_B32 = 42
+  DS_XOR_RTN_B32 = 43
+  DS_MSKOR_RTN_B32 = 44
+  DS_WRXCHG_RTN_B32 = 45
+  DS_WRXCHG2_RTN_B32 = 46
+  DS_WRXCHG2ST64_RTN_B32 = 47
+  DS_CMPST_RTN_B32 = 48
+  DS_CMPST_RTN_F32 = 49
+  DS_MIN_RTN_F32 = 50
+  DS_MAX_RTN_F32 = 51
+  DS_WRAP_RTN_B32 = 52
+  DS_ADD_RTN_F32 = 53
+  DS_READ_B32 = 54
+  DS_READ2_B32 = 55
+  DS_READ2ST64_B32 = 56
+  DS_READ_I8 = 57
+  DS_READ_U8 = 58
+  DS_READ_I16 = 59
+  DS_READ_U16 = 60
+  DS_SWIZZLE_B32 = 61
+  DS_PERMUTE_B32 = 62
+  DS_BPERMUTE_B32 = 63
+  DS_ADD_U64 = 64
+  DS_SUB_U64 = 65
+  DS_RSUB_U64 = 66
+  DS_INC_U64 = 67
+  DS_DEC_U64 = 68
+  DS_MIN_I64 = 69
+  DS_MAX_I64 = 70
+  DS_MIN_U64 = 71
+  DS_MAX_U64 = 72
+  DS_AND_B64 = 73
+  DS_OR_B64 = 74
+  DS_XOR_B64 = 75
+  DS_MSKOR_B64 = 76
+  DS_WRITE_B64 = 77
+  DS_WRITE2_B64 = 78
+  DS_WRITE2ST64_B64 = 79
+  DS_CMPST_B64 = 80
+  DS_CMPST_F64 = 81
+  DS_MIN_F64 = 82
+  DS_MAX_F64 = 83
+  DS_WRITE_B8_D16_HI = 84
+  DS_WRITE_B16_D16_HI = 85
+  DS_READ_U8_D16 = 86
+  DS_READ_U8_D16_HI = 87
+  DS_READ_I8_D16 = 88
+  DS_READ_I8_D16_HI = 89
+  DS_READ_U16_D16 = 90
+  DS_READ_U16_D16_HI = 91
+  DS_ADD_F64 = 92
+  DS_ADD_RTN_U64 = 96
+  DS_SUB_RTN_U64 = 97
+  DS_RSUB_RTN_U64 = 98
+  DS_INC_RTN_U64 = 99
+  DS_DEC_RTN_U64 = 100
+  DS_MIN_RTN_I64 = 101
+  DS_MAX_RTN_I64 = 102
+  DS_MIN_RTN_U64 = 103
+  DS_MAX_RTN_U64 = 104
+  DS_AND_RTN_B64 = 105
+  DS_OR_RTN_B64 = 106
+  DS_XOR_RTN_B64 = 107
+  DS_MSKOR_RTN_B64 = 108
+  DS_WRXCHG_RTN_B64 = 109
+  DS_WRXCHG2_RTN_B64 = 110
+  DS_WRXCHG2ST64_RTN_B64 = 111
+  DS_CMPST_RTN_B64 = 112
+  DS_CMPST_RTN_F64 = 113
+  DS_MIN_RTN_F64 = 114
+  DS_MAX_RTN_F64 = 115
+  DS_READ_B64 = 118
+  DS_READ2_B64 = 119
+  DS_READ2ST64_B64 = 120
+  DS_ADD_RTN_F64 = 124
+  DS_CONDXCHG32_RTN_B64 = 126
+  DS_READ_ADDTID_B32 = 182
+  DS_PK_ADD_RTN_F16 = 183
+  DS_PK_ADD_RTN_BF16 = 184
+  DS_CONSUME = 189
+  DS_APPEND = 190
+  DS_WRITE_B96 = 222
+  DS_WRITE_B128 = 223
+  DS_READ_B64_TR_B4 = 224
+  DS_READ_B96_TR_B6 = 225
+  DS_READ_B64_TR_B8 = 226
+  DS_READ_B64_TR_B16 = 227
+  DS_READ_B96 = 254
+  DS_READ_B128 = 255
+  CDNA4 = 600
+
+class FLATOp(IntEnum):
+  FLAT_LOAD_UBYTE = 16
+  FLAT_LOAD_SBYTE = 17
+  FLAT_LOAD_USHORT = 18
+  FLAT_LOAD_SSHORT = 19
+  FLAT_LOAD_DWORD = 20
+  FLAT_LOAD_DWORDX2 = 21
+  FLAT_LOAD_DWORDX3 = 22
+  FLAT_LOAD_DWORDX4 = 23
+  FLAT_STORE_BYTE = 24
+  FLAT_STORE_BYTE_D16_HI = 25
+  FLAT_STORE_SHORT = 26
+  FLAT_STORE_SHORT_D16_HI = 27
+  FLAT_STORE_DWORD = 28
+  FLAT_STORE_DWORDX2 = 29
+  FLAT_STORE_DWORDX3 = 30
+  FLAT_STORE_DWORDX4 = 31
+  FLAT_LOAD_UBYTE_D16 = 32
+  FLAT_LOAD_UBYTE_D16_HI = 33
+  FLAT_LOAD_SBYTE_D16 = 34
+  FLAT_LOAD_SBYTE_D16_HI = 35
+  FLAT_LOAD_SHORT_D16 = 36
+  FLAT_LOAD_SHORT_D16_HI = 37
+  FLAT_ATOMIC_SWAP = 64
+  FLAT_ATOMIC_CMPSWAP = 65
+  FLAT_ATOMIC_ADD = 66
+  FLAT_ATOMIC_SUB = 67
+  FLAT_ATOMIC_SMIN = 68
+  FLAT_ATOMIC_UMIN = 69
+  FLAT_ATOMIC_SMAX = 70
+  FLAT_ATOMIC_UMAX = 71
+  FLAT_ATOMIC_AND = 72
+  FLAT_ATOMIC_OR = 73
+  FLAT_ATOMIC_XOR = 74
+  FLAT_ATOMIC_INC = 75
+  FLAT_ATOMIC_DEC = 76
+  FLAT_ATOMIC_ADD_F32 = 77
+  FLAT_ATOMIC_PK_ADD_F16 = 78
+  FLAT_ATOMIC_ADD_F64 = 79
+  FLAT_ATOMIC_MIN_F64 = 80
+  FLAT_ATOMIC_MAX_F64 = 81
+  FLAT_ATOMIC_PK_ADD_BF16 = 82
+  FLAT_ATOMIC_SWAP_X2 = 96
+  FLAT_ATOMIC_CMPSWAP_X2 = 97
+  FLAT_ATOMIC_ADD_X2 = 98
+  FLAT_ATOMIC_SUB_X2 = 99
+  FLAT_ATOMIC_SMIN_X2 = 100
+  FLAT_ATOMIC_UMIN_X2 = 101
+  FLAT_ATOMIC_SMAX_X2 = 102
+  FLAT_ATOMIC_UMAX_X2 = 103
+  FLAT_ATOMIC_AND_X2 = 104
+  FLAT_ATOMIC_OR_X2 = 105
+  FLAT_ATOMIC_XOR_X2 = 106
+  FLAT_ATOMIC_INC_X2 = 107
+  FLAT_ATOMIC_DEC_X2 = 108
+  CDNA4 = 600
+
+class GLOBALOp(IntEnum):
+  GLOBAL_LOAD_UBYTE = 16
+  GLOBAL_LOAD_SBYTE = 17
+  GLOBAL_LOAD_USHORT = 18
+  GLOBAL_LOAD_SSHORT = 19
+  GLOBAL_LOAD_DWORD = 20
+  GLOBAL_LOAD_DWORDX2 = 21
+  GLOBAL_LOAD_DWORDX3 = 22
+  GLOBAL_LOAD_DWORDX4 = 23
+  GLOBAL_STORE_BYTE = 24
+  GLOBAL_STORE_BYTE_D16_HI = 25
+  GLOBAL_STORE_SHORT = 26
+  GLOBAL_STORE_SHORT_D16_HI = 27
+  GLOBAL_STORE_DWORD = 28
+  GLOBAL_STORE_DWORDX2 = 29
+  GLOBAL_STORE_DWORDX3 = 30
+  GLOBAL_STORE_DWORDX4 = 31
+  GLOBAL_LOAD_UBYTE_D16 = 32
+  GLOBAL_LOAD_UBYTE_D16_HI = 33
+  GLOBAL_LOAD_SBYTE_D16 = 34
+  GLOBAL_LOAD_SBYTE_D16_HI = 35
+  GLOBAL_LOAD_SHORT_D16 = 36
+  GLOBAL_LOAD_SHORT_D16_HI = 37
+  GLOBAL_LOAD_LDS_UBYTE = 38
+  GLOBAL_LOAD_LDS_SBYTE = 39
+  GLOBAL_LOAD_LDS_USHORT = 40
+  GLOBAL_LOAD_LDS_SSHORT = 41
+  GLOBAL_LOAD_LDS_DWORD = 42
+  GLOBAL_ATOMIC_SWAP = 64
+  GLOBAL_ATOMIC_CMPSWAP = 65
+  GLOBAL_ATOMIC_ADD = 66
+  GLOBAL_ATOMIC_SUB = 67
+  GLOBAL_ATOMIC_SMIN = 68
+  GLOBAL_ATOMIC_UMIN = 69
+  GLOBAL_ATOMIC_SMAX = 70
+  GLOBAL_ATOMIC_UMAX = 71
+  GLOBAL_ATOMIC_AND = 72
+  GLOBAL_ATOMIC_OR = 73
+  GLOBAL_ATOMIC_XOR = 74
+  GLOBAL_ATOMIC_INC = 75
+  GLOBAL_ATOMIC_DEC = 76
+  GLOBAL_ATOMIC_ADD_F32 = 77
+  GLOBAL_ATOMIC_PK_ADD_F16 = 78
+  GLOBAL_ATOMIC_ADD_F64 = 79
+  GLOBAL_ATOMIC_MIN_F64 = 80
+  GLOBAL_ATOMIC_MAX_F64 = 81
+  GLOBAL_ATOMIC_PK_ADD_BF16 = 82
+  GLOBAL_ATOMIC_SWAP_X2 = 96
+  GLOBAL_ATOMIC_CMPSWAP_X2 = 97
+  GLOBAL_ATOMIC_ADD_X2 = 98
+  GLOBAL_ATOMIC_SUB_X2 = 99
+  GLOBAL_ATOMIC_SMIN_X2 = 100
+  GLOBAL_ATOMIC_UMIN_X2 = 101
+  GLOBAL_ATOMIC_SMAX_X2 = 102
+  GLOBAL_ATOMIC_UMAX_X2 = 103
+  GLOBAL_ATOMIC_AND_X2 = 104
+  GLOBAL_ATOMIC_OR_X2 = 105
+  GLOBAL_ATOMIC_XOR_X2 = 106
+  GLOBAL_ATOMIC_INC_X2 = 107
+  GLOBAL_ATOMIC_DEC_X2 = 108
+  GLOBAL_LOAD_LDS_DWORDX4 = 125
+  GLOBAL_LOAD_LDS_DWORDX3 = 126
+  CDNA4 = 600
+
+class MTBUFOp(IntEnum):
+  TBUFFER_LOAD_FORMAT_X = 0
+  TBUFFER_LOAD_FORMAT_XY = 1
+  TBUFFER_LOAD_FORMAT_XYZ = 2
+  TBUFFER_LOAD_FORMAT_XYZW = 3
+  TBUFFER_STORE_FORMAT_X = 4
+  TBUFFER_STORE_FORMAT_XY = 5
+  TBUFFER_STORE_FORMAT_XYZ = 6
+  TBUFFER_STORE_FORMAT_XYZW = 7
+  TBUFFER_LOAD_FORMAT_D16_X = 8
+  TBUFFER_LOAD_FORMAT_D16_XY = 9
+  TBUFFER_LOAD_FORMAT_D16_XYZ = 10
+  TBUFFER_LOAD_FORMAT_D16_XYZW = 11
+  TBUFFER_STORE_FORMAT_D16_X = 12
+  TBUFFER_STORE_FORMAT_D16_XY = 13
+  TBUFFER_STORE_FORMAT_D16_XYZ = 14
+  TBUFFER_STORE_FORMAT_D16_XYZW = 15
+
+class MUBUFOp(IntEnum):
+  BUFFER_LOAD_FORMAT_X = 0
+  BUFFER_LOAD_FORMAT_XY = 1
+  BUFFER_LOAD_FORMAT_XYZ = 2
+  BUFFER_LOAD_FORMAT_XYZW = 3
+  BUFFER_STORE_FORMAT_X = 4
+  BUFFER_STORE_FORMAT_XY = 5
+  BUFFER_STORE_FORMAT_XYZ = 6
+  BUFFER_STORE_FORMAT_XYZW = 7
+  BUFFER_LOAD_FORMAT_D16_X = 8
+  BUFFER_LOAD_FORMAT_D16_XY = 9
+  BUFFER_LOAD_FORMAT_D16_XYZ = 10
+  BUFFER_LOAD_FORMAT_D16_XYZW = 11
+  BUFFER_STORE_FORMAT_D16_X = 12
+  BUFFER_STORE_FORMAT_D16_XY = 13
+  BUFFER_STORE_FORMAT_D16_XYZ = 14
+  BUFFER_STORE_FORMAT_D16_XYZW = 15
+  BUFFER_LOAD_UBYTE = 16
+  BUFFER_LOAD_SBYTE = 17
+  BUFFER_LOAD_USHORT = 18
+  BUFFER_LOAD_SSHORT = 19
+  BUFFER_LOAD_DWORD = 20
+  BUFFER_LOAD_DWORDX2 = 21
+  BUFFER_LOAD_DWORDX3 = 22
+  BUFFER_LOAD_DWORDX4 = 23
+  BUFFER_STORE_BYTE = 24
+  BUFFER_STORE_BYTE_D16_HI = 25
+  BUFFER_STORE_SHORT = 26
+  BUFFER_STORE_SHORT_D16_HI = 27
+  BUFFER_STORE_DWORD = 28
+  BUFFER_STORE_DWORDX2 = 29
+  BUFFER_STORE_DWORDX3 = 30
+  BUFFER_STORE_DWORDX4 = 31
+  BUFFER_LOAD_UBYTE_D16 = 32
+  BUFFER_LOAD_UBYTE_D16_HI = 33
+  BUFFER_LOAD_SBYTE_D16 = 34
+  BUFFER_LOAD_SBYTE_D16_HI = 35
+  BUFFER_LOAD_SHORT_D16 = 36
+  BUFFER_LOAD_SHORT_D16_HI = 37
+  BUFFER_LOAD_FORMAT_D16_HI_X = 38
+  BUFFER_STORE_FORMAT_D16_HI_X = 39
+  BUFFER_WBL2 = 40
+  BUFFER_INV = 41
+  BUFFER_ATOMIC_SWAP = 64
+  BUFFER_ATOMIC_CMPSWAP = 65
+  BUFFER_ATOMIC_ADD = 66
+  BUFFER_ATOMIC_SUB = 67
+  BUFFER_ATOMIC_SMIN = 68
+  BUFFER_ATOMIC_UMIN = 69
+  BUFFER_ATOMIC_SMAX = 70
+  BUFFER_ATOMIC_UMAX = 71
+  BUFFER_ATOMIC_AND = 72
+  BUFFER_ATOMIC_OR = 73
+  BUFFER_ATOMIC_XOR = 74
+  BUFFER_ATOMIC_INC = 75
+  BUFFER_ATOMIC_DEC = 76
+  BUFFER_ATOMIC_ADD_F32 = 77
+  BUFFER_ATOMIC_PK_ADD_F16 = 78
+  BUFFER_ATOMIC_ADD_F64 = 79
+  BUFFER_ATOMIC_MIN_F64 = 80
+  BUFFER_ATOMIC_MAX_F64 = 81
+  BUFFER_ATOMIC_PK_ADD_BF16 = 82
+  BUFFER_ATOMIC_SWAP_X2 = 96
+  BUFFER_ATOMIC_CMPSWAP_X2 = 97
+  BUFFER_ATOMIC_ADD_X2 = 98
+  BUFFER_ATOMIC_SUB_X2 = 99
+  BUFFER_ATOMIC_SMIN_X2 = 100
+  BUFFER_ATOMIC_UMIN_X2 = 101
+  BUFFER_ATOMIC_SMAX_X2 = 102
+  BUFFER_ATOMIC_UMAX_X2 = 103
+  BUFFER_ATOMIC_AND_X2 = 104
+  BUFFER_ATOMIC_OR_X2 = 105
+  BUFFER_ATOMIC_XOR_X2 = 106
+  BUFFER_ATOMIC_INC_X2 = 107
+  BUFFER_ATOMIC_DEC_X2 = 108
+  CDNA4 = 600
+
+class SCRATCHOp(IntEnum):
+  SCRATCH_LOAD_UBYTE = 16
+  SCRATCH_LOAD_SBYTE = 17
+  SCRATCH_LOAD_USHORT = 18
+  SCRATCH_LOAD_SSHORT = 19
+  SCRATCH_LOAD_DWORD = 20
+  SCRATCH_LOAD_DWORDX2 = 21
+  SCRATCH_LOAD_DWORDX3 = 22
+  SCRATCH_LOAD_DWORDX4 = 23
+  SCRATCH_STORE_BYTE = 24
+  SCRATCH_STORE_BYTE_D16_HI = 25
+  SCRATCH_STORE_SHORT = 26
+  SCRATCH_STORE_SHORT_D16_HI = 27
+  SCRATCH_STORE_DWORD = 28
+  SCRATCH_STORE_DWORDX2 = 29
+  SCRATCH_STORE_DWORDX3 = 30
+  SCRATCH_STORE_DWORDX4 = 31
+  SCRATCH_LOAD_UBYTE_D16 = 32
+  SCRATCH_LOAD_UBYTE_D16_HI = 33
+  SCRATCH_LOAD_SBYTE_D16 = 34
+  SCRATCH_LOAD_SBYTE_D16_HI = 35
+  SCRATCH_LOAD_SHORT_D16 = 36
+  SCRATCH_LOAD_SHORT_D16_HI = 37
+  SCRATCH_LOAD_LDS_UBYTE = 38
+  SCRATCH_LOAD_LDS_SBYTE = 39
+  SCRATCH_LOAD_LDS_USHORT = 40
+  SCRATCH_LOAD_LDS_SSHORT = 41
+  SCRATCH_LOAD_LDS_DWORD = 42
+
+class SMEMOp(IntEnum):
+  S_LOAD_DWORD = 0
+  S_LOAD_DWORDX2 = 1
+  S_LOAD_DWORDX4 = 2
+  S_LOAD_DWORDX8 = 3
+  S_LOAD_DWORDX16 = 4
+  S_SCRATCH_LOAD_DWORD = 5
+  S_SCRATCH_LOAD_DWORDX2 = 6
+  S_SCRATCH_LOAD_DWORDX4 = 7
+  S_BUFFER_LOAD_DWORD = 8
+  S_BUFFER_LOAD_DWORDX2 = 9
+  S_BUFFER_LOAD_DWORDX4 = 10
+  S_BUFFER_LOAD_DWORDX8 = 11
+  S_BUFFER_LOAD_DWORDX16 = 12
+  S_STORE_DWORD = 16
+  S_STORE_DWORDX2 = 17
+  S_STORE_DWORDX4 = 18
+  S_SCRATCH_STORE_DWORD = 21
+  S_SCRATCH_STORE_DWORDX2 = 22
+  S_SCRATCH_STORE_DWORDX4 = 23
+  S_BUFFER_STORE_DWORD = 24
+  S_BUFFER_STORE_DWORDX2 = 25
+  S_BUFFER_STORE_DWORDX4 = 26
+  S_DCACHE_INV = 32
+  S_DCACHE_WB = 33
+  S_DCACHE_INV_VOL = 34
+  S_DCACHE_WB_VOL = 35
+  S_MEMTIME = 36
+  S_MEMREALTIME = 37
+  S_DCACHE_DISCARD = 40
+  S_DCACHE_DISCARD_X2 = 41
+  S_BUFFER_ATOMIC_SWAP = 64
+  S_BUFFER_ATOMIC_CMPSWAP = 65
+  S_BUFFER_ATOMIC_ADD = 66
+  S_BUFFER_ATOMIC_SUB = 67
+  S_BUFFER_ATOMIC_SMIN = 68
+  S_BUFFER_ATOMIC_UMIN = 69
+  S_BUFFER_ATOMIC_SMAX = 70
+  S_BUFFER_ATOMIC_UMAX = 71
+  S_BUFFER_ATOMIC_AND = 72
+  S_BUFFER_ATOMIC_OR = 73
+  S_BUFFER_ATOMIC_XOR = 74
+  S_BUFFER_ATOMIC_INC = 75
+  S_BUFFER_ATOMIC_DEC = 76
+  S_BUFFER_ATOMIC_SWAP_X2 = 96
+  S_BUFFER_ATOMIC_CMPSWAP_X2 = 97
+  S_BUFFER_ATOMIC_ADD_X2 = 98
+  S_BUFFER_ATOMIC_SUB_X2 = 99
+  S_BUFFER_ATOMIC_SMIN_X2 = 100
+  S_BUFFER_ATOMIC_UMIN_X2 = 101
+  S_BUFFER_ATOMIC_SMAX_X2 = 102
+  S_BUFFER_ATOMIC_UMAX_X2 = 103
+  S_BUFFER_ATOMIC_AND_X2 = 104
+  S_BUFFER_ATOMIC_OR_X2 = 105
+  S_BUFFER_ATOMIC_XOR_X2 = 106
+  S_BUFFER_ATOMIC_INC_X2 = 107
+  S_BUFFER_ATOMIC_DEC_X2 = 108
+  S_ATOMIC_SWAP = 128
+  S_ATOMIC_CMPSWAP = 129
+  S_ATOMIC_ADD = 130
+  S_ATOMIC_SUB = 131
+  S_ATOMIC_SMIN = 132
+  S_ATOMIC_UMIN = 133
+  S_ATOMIC_SMAX = 134
+  S_ATOMIC_UMAX = 135
+  S_ATOMIC_AND = 136
+  S_ATOMIC_OR = 137
+  S_ATOMIC_XOR = 138
+  S_ATOMIC_INC = 139
+  S_ATOMIC_DEC = 140
+  S_ATOMIC_SWAP_X2 = 160
+  S_ATOMIC_CMPSWAP_X2 = 161
+  S_ATOMIC_ADD_X2 = 162
+  S_ATOMIC_SUB_X2 = 163
+  S_ATOMIC_SMIN_X2 = 164
+  S_ATOMIC_UMIN_X2 = 165
+  S_ATOMIC_SMAX_X2 = 166
+  S_ATOMIC_UMAX_X2 = 167
+  S_ATOMIC_AND_X2 = 168
+  S_ATOMIC_OR_X2 = 169
+  S_ATOMIC_XOR_X2 = 170
+  S_ATOMIC_INC_X2 = 171
+  S_ATOMIC_DEC_X2 = 172
+  CDNA4 = 600
+
+class SOP1Op(IntEnum):
+  S_MOV_B32 = 0
+  S_MOV_B64 = 1
+  S_CMOV_B32 = 2
+  S_CMOV_B64 = 3
+  S_NOT_B32 = 4
+  S_NOT_B64 = 5
+  S_WQM_B32 = 6
+  S_WQM_B64 = 7
+  S_BREV_B32 = 8
+  S_BREV_B64 = 9
+  S_BCNT0_I32_B32 = 10
+  S_BCNT0_I32_B64 = 11
+  S_BCNT1_I32_B32 = 12
+  S_BCNT1_I32_B64 = 13
+  S_FF0_I32_B32 = 14
+  S_FF0_I32_B64 = 15
+  S_FF1_I32_B32 = 16
+  S_FF1_I32_B64 = 17
+  S_FLBIT_I32_B32 = 18
+  S_FLBIT_I32_B64 = 19
+  S_FLBIT_I32 = 20
+  S_FLBIT_I32_I64 = 21
+  S_SEXT_I32_I8 = 22
+  S_SEXT_I32_I16 = 23
+  S_BITSET0_B32 = 24
+  S_BITSET0_B64 = 25
+  S_BITSET1_B32 = 26
+  S_BITSET1_B64 = 27
+  S_GETPC_B64 = 28
+  S_SETPC_B64 = 29
+  S_SWAPPC_B64 = 30
+  S_RFE_B64 = 31
+  S_AND_SAVEEXEC_B64 = 32
+  S_OR_SAVEEXEC_B64 = 33
+  S_XOR_SAVEEXEC_B64 = 34
+  S_ANDN2_SAVEEXEC_B64 = 35
+  S_ORN2_SAVEEXEC_B64 = 36
+  S_NAND_SAVEEXEC_B64 = 37
+  S_NOR_SAVEEXEC_B64 = 38
+  S_XNOR_SAVEEXEC_B64 = 39
+  S_QUADMASK_B32 = 40
+  S_QUADMASK_B64 = 41
+  S_MOVRELS_B32 = 42
+  S_MOVRELS_B64 = 43
+  S_MOVRELD_B32 = 44
+  S_MOVRELD_B64 = 45
+  S_CBRANCH_JOIN = 46
+  S_ABS_I32 = 48
+  S_SET_GPR_IDX_IDX = 50
+  S_ANDN1_SAVEEXEC_B64 = 51
+  S_ORN1_SAVEEXEC_B64 = 52
+  S_ANDN1_WREXEC_B64 = 53
+  S_ANDN2_WREXEC_B64 = 54
+  S_BITREPLICATE_B64_B32 = 55
+  CDNA4 = 600
+
+class SOP2Op(IntEnum):
+  S_ADD_U32 = 0
+  S_SUB_U32 = 1
+  S_ADD_I32 = 2
+  S_SUB_I32 = 3
+  S_ADDC_U32 = 4
+  S_SUBB_U32 = 5
+  S_MIN_I32 = 6
+  S_MIN_U32 = 7
+  S_MAX_I32 = 8
+  S_MAX_U32 = 9
+  S_CSELECT_B32 = 10
+  S_CSELECT_B64 = 11
+  S_AND_B32 = 12
+  S_AND_B64 = 13
+  S_OR_B32 = 14
+  S_OR_B64 = 15
+  S_XOR_B32 = 16
+  S_XOR_B64 = 17
+  S_ANDN2_B32 = 18
+  S_ANDN2_B64 = 19
+  S_ORN2_B32 = 20
+  S_ORN2_B64 = 21
+  S_NAND_B32 = 22
+  S_NAND_B64 = 23
+  S_NOR_B32 = 24
+  S_NOR_B64 = 25
+  S_XNOR_B32 = 26
+  S_XNOR_B64 = 27
+  S_LSHL_B32 = 28
+  S_LSHL_B64 = 29
+  S_LSHR_B32 = 30
+  S_LSHR_B64 = 31
+  S_ASHR_I32 = 32
+  S_ASHR_I64 = 33
+  S_BFM_B32 = 34
+  S_BFM_B64 = 35
+  S_MUL_I32 = 36
+  S_BFE_U32 = 37
+  S_BFE_I32 = 38
+  S_BFE_U64 = 39
+  S_BFE_I64 = 40
+  S_CBRANCH_G_FORK = 41
+  S_ABSDIFF_I32 = 42
+  S_MUL_HI_U32 = 44
+  S_MUL_HI_I32 = 45
+  S_LSHL1_ADD_U32 = 46
+  S_LSHL2_ADD_U32 = 47
+  S_LSHL3_ADD_U32 = 48
+  S_LSHL4_ADD_U32 = 49
+  S_PACK_LL_B32_B16 = 50
+  S_PACK_LH_B32_B16 = 51
+  S_PACK_HH_B32_B16 = 52
+  CDNA4 = 600
+
+class SOPCOp(IntEnum):
+  S_CMP_EQ_I32 = 0
+  S_CMP_LG_I32 = 1
+  S_CMP_GT_I32 = 2
+  S_CMP_GE_I32 = 3
+  S_CMP_LT_I32 = 4
+  S_CMP_LE_I32 = 5
+  S_CMP_EQ_U32 = 6
+  S_CMP_LG_U32 = 7
+  S_CMP_GT_U32 = 8
+  S_CMP_GE_U32 = 9
+  S_CMP_LT_U32 = 10
+  S_CMP_LE_U32 = 11
+  S_BITCMP0_B32 = 12
+  S_BITCMP1_B32 = 13
+  S_BITCMP0_B64 = 14
+  S_BITCMP1_B64 = 15
+  S_SETVSKIP = 16
+  S_SET_GPR_IDX_ON = 17
+  S_CMP_EQ_U64 = 18
+  S_CMP_LG_U64 = 19
+  CDNA4 = 600
+
+class SOPKOp(IntEnum):
+  S_MOVK_I32 = 0
+  S_CMOVK_I32 = 1
+  S_CMPK_EQ_I32 = 2
+  S_CMPK_LG_I32 = 3
+  S_CMPK_GT_I32 = 4
+  S_CMPK_GE_I32 = 5
+  S_CMPK_LT_I32 = 6
+  S_CMPK_LE_I32 = 7
+  S_CMPK_EQ_U32 = 8
+  S_CMPK_LG_U32 = 9
+  S_CMPK_GT_U32 = 10
+  S_CMPK_GE_U32 = 11
+  S_CMPK_LT_U32 = 12
+  S_CMPK_LE_U32 = 13
+  S_ADDK_I32 = 14
+  S_MULK_I32 = 15
+  S_CBRANCH_I_FORK = 16
+  S_GETREG_B32 = 17
+  S_SETREG_B32 = 18
+  S_SETREG_IMM32_B32 = 20
+  S_CALL_B64 = 21
+
+class SOPPOp(IntEnum):
+  S_NOP = 0
+  S_ENDPGM = 1
+  S_BRANCH = 2
+  S_WAKEUP = 3
+  S_CBRANCH_SCC0 = 4
+  S_CBRANCH_SCC1 = 5
+  S_CBRANCH_VCCZ = 6
+  S_CBRANCH_VCCNZ = 7
+  S_CBRANCH_EXECZ = 8
+  S_CBRANCH_EXECNZ = 9
+  S_BARRIER = 10
+  S_SETKILL = 11
+  S_WAITCNT = 12
+  S_SETHALT = 13
+  S_SLEEP = 14
+  S_SETPRIO = 15
+  S_SENDMSG = 16
+  S_SENDMSGHALT = 17
+  S_TRAP = 18
+  S_ICACHE_INV = 19
+  S_INCPERFLEVEL = 20
+  S_DECPERFLEVEL = 21
+  S_TTRACEDATA = 22
+  S_CBRANCH_CDBGSYS = 23
+  S_CBRANCH_CDBGUSER = 24
+  S_CBRANCH_CDBGSYS_OR_USER = 25
+  S_CBRANCH_CDBGSYS_AND_USER = 26
+  S_ENDPGM_SAVED = 27
+  S_SET_GPR_IDX_OFF = 28
+  S_SET_GPR_IDX_MODE = 29
+  CDNA4 = 600
+
+class VOP1Op(IntEnum):
+  V_NOP = 0
+  V_MOV_B32 = 1
+  V_READFIRSTLANE_B32 = 2
+  V_CVT_I32_F64 = 3
+  V_CVT_F64_I32 = 4
+  V_CVT_F32_I32 = 5
+  V_CVT_F32_U32 = 6
+  V_CVT_U32_F32 = 7
+  V_CVT_I32_F32 = 8
+  V_CVT_F16_F32 = 10
+  V_CVT_F32_F16 = 11
+  V_CVT_RPI_I32_F32 = 12
+  V_CVT_FLR_I32_F32 = 13
+  V_CVT_OFF_F32_I4 = 14
+  V_CVT_F32_F64 = 15
+  V_CVT_F64_F32 = 16
+  V_CVT_F32_UBYTE0 = 17
+  V_CVT_F32_UBYTE1 = 18
+  V_CVT_F32_UBYTE2 = 19
+  V_CVT_F32_UBYTE3 = 20
+  V_CVT_U32_F64 = 21
+  V_CVT_F64_U32 = 22
+  V_TRUNC_F64 = 23
+  V_CEIL_F64 = 24
+  V_RNDNE_F64 = 25
+  V_FLOOR_F64 = 26
+  V_FRACT_F32 = 27
+  V_TRUNC_F32 = 28
+  V_CEIL_F32 = 29
+  V_RNDNE_F32 = 30
+  V_FLOOR_F32 = 31
+  V_EXP_F32 = 32
+  V_LOG_F32 = 33
+  V_RCP_F32 = 34
+  V_RCP_IFLAG_F32 = 35
+  V_RSQ_F32 = 36
+  V_RCP_F64 = 37
+  V_RSQ_F64 = 38
+  V_SQRT_F32 = 39
+  V_SQRT_F64 = 40
+  V_SIN_F32 = 41
+  V_COS_F32 = 42
+  V_NOT_B32 = 43
+  V_BFREV_B32 = 44
+  V_FFBH_U32 = 45
+  V_FFBL_B32 = 46
+  V_FFBH_I32 = 47
+  V_FREXP_EXP_I32_F64 = 48
+  V_FREXP_MANT_F64 = 49
+  V_FRACT_F64 = 50
+  V_FREXP_EXP_I32_F32 = 51
+  V_FREXP_MANT_F32 = 52
+  V_CLREXCP = 53
+  V_MOV_B64 = 56
+  V_CVT_F16_U16 = 57
+  V_CVT_F16_I16 = 58
+  V_CVT_U16_F16 = 59
+  V_CVT_I16_F16 = 60
+  V_RCP_F16 = 61
+  V_SQRT_F16 = 62
+  V_RSQ_F16 = 63
+  V_LOG_F16 = 64
+  V_EXP_F16 = 65
+  V_FREXP_MANT_F16 = 66
+  V_FREXP_EXP_I16_F16 = 67
+  V_FLOOR_F16 = 68
+  V_CEIL_F16 = 69
+  V_TRUNC_F16 = 70
+  V_RNDNE_F16 = 71
+  V_FRACT_F16 = 72
+  V_SIN_F16 = 73
+  V_COS_F16 = 74
+  V_CVT_NORM_I16_F16 = 77
+  V_CVT_NORM_U16_F16 = 78
+  V_SAT_PK_U8_I16 = 79
+  V_SWAP_B32 = 81
+  V_ACCVGPR_MOV_B32 = 82
+  V_CVT_F32_FP8 = 84
+  V_CVT_F32_BF8 = 85
+  V_CVT_PK_F32_FP8 = 86
+  V_CVT_PK_F32_BF8 = 87
+  V_PRNG_B32 = 88
+  V_PERMLANE16_SWAP_B32 = 89
+  V_PERMLANE32_SWAP_B32 = 90
+  V_CVT_F32_BF16 = 91
+  CDNA4 = 600
+
+class VOP2Op(IntEnum):
+  V_CNDMASK_B32 = 0
+  V_ADD_F32 = 1
+  V_SUB_F32 = 2
+  V_SUBREV_F32 = 3
+  V_FMAC_F64 = 4
+  V_MUL_F32 = 5
+  V_MUL_I32_I24 = 6
+  V_MUL_HI_I32_I24 = 7
+  V_MUL_U32_U24 = 8
+  V_MUL_HI_U32_U24 = 9
+  V_MIN_F32 = 10
+  V_MAX_F32 = 11
+  V_MIN_I32 = 12
+  V_MAX_I32 = 13
+  V_MIN_U32 = 14
+  V_MAX_U32 = 15
+  V_LSHRREV_B32 = 16
+  V_ASHRREV_I32 = 17
+  V_LSHLREV_B32 = 18
+  V_AND_B32 = 19
+  V_OR_B32 = 20
+  V_XOR_B32 = 21
+  V_DOT2C_F32_BF16 = 22
+  V_FMAMK_F32 = 23
+  V_FMAAK_F32 = 24
+  V_ADD_CO_U32 = 25
+  V_SUB_CO_U32 = 26
+  V_SUBREV_CO_U32 = 27
+  V_ADDC_CO_U32 = 28
+  V_SUBB_CO_U32 = 29
+  V_SUBBREV_CO_U32 = 30
+  V_ADD_F16 = 31
+  V_SUB_F16 = 32
+  V_SUBREV_F16 = 33
+  V_MUL_F16 = 34
+  V_MAC_F16 = 35
+  V_MADMK_F16 = 36
+  V_MADAK_F16 = 37
+  V_ADD_U16 = 38
+  V_SUB_U16 = 39
+  V_SUBREV_U16 = 40
+  V_MUL_LO_U16 = 41
+  V_LSHLREV_B16 = 42
+  V_LSHRREV_B16 = 43
+  V_ASHRREV_I16 = 44
+  V_MAX_F16 = 45
+  V_MIN_F16 = 46
+  V_MAX_U16 = 47
+  V_MAX_I16 = 48
+  V_MIN_U16 = 49
+  V_MIN_I16 = 50
+  V_LDEXP_F16 = 51
+  V_ADD_U32 = 52
+  V_SUB_U32 = 53
+  V_SUBREV_U32 = 54
+  V_DOT2C_F32_F16 = 55
+  V_DOT2C_I32_I16 = 56
+  V_DOT4C_I32_I8 = 57
+  V_DOT8C_I32_I4 = 58
+  V_FMAC_F32 = 59
+  V_PK_FMAC_F16 = 60
+  V_XNOR_B32 = 61
+  CDNA4 = 600
+
+class VOP3AOp(IntEnum):
+  V_CMP_CLASS_F32 = 16
+  V_CMPX_CLASS_F32 = 17
+  V_CMP_CLASS_F64 = 18
+  V_CMPX_CLASS_F64 = 19
+  V_CMP_CLASS_F16 = 20
+  V_CMPX_CLASS_F16 = 21
+  V_CMP_F_F16 = 32
+  V_CMP_LT_F16 = 33
+  V_CMP_EQ_F16 = 34
+  V_CMP_LE_F16 = 35
+  V_CMP_GT_F16 = 36
+  V_CMP_LG_F16 = 37
+  V_CMP_GE_F16 = 38
+  V_CMP_O_F16 = 39
+  V_CMP_U_F16 = 40
+  V_CMP_NGE_F16 = 41
+  V_CMP_NLG_F16 = 42
+  V_CMP_NGT_F16 = 43
+  V_CMP_NLE_F16 = 44
+  V_CMP_NEQ_F16 = 45
+  V_CMP_NLT_F16 = 46
+  V_CMP_TRU_F16 = 47
+  V_CMPX_F_F16 = 48
+  V_CMPX_LT_F16 = 49
+  V_CMPX_EQ_F16 = 50
+  V_CMPX_LE_F16 = 51
+  V_CMPX_GT_F16 = 52
+  V_CMPX_LG_F16 = 53
+  V_CMPX_GE_F16 = 54
+  V_CMPX_O_F16 = 55
+  V_CMPX_U_F16 = 56
+  V_CMPX_NGE_F16 = 57
+  V_CMPX_NLG_F16 = 58
+  V_CMPX_NGT_F16 = 59
+  V_CMPX_NLE_F16 = 60
+  V_CMPX_NEQ_F16 = 61
+  V_CMPX_NLT_F16 = 62
+  V_CMPX_TRU_F16 = 63
+  V_CMP_F_F32 = 64
+  V_CMP_LT_F32 = 65
+  V_CMP_EQ_F32 = 66
+  V_CMP_LE_F32 = 67
+  V_CMP_GT_F32 = 68
+  V_CMP_LG_F32 = 69
+  V_CMP_GE_F32 = 70
+  V_CMP_O_F32 = 71
+  V_CMP_U_F32 = 72
+  V_CMP_NGE_F32 = 73
+  V_CMP_NLG_F32 = 74
+  V_CMP_NGT_F32 = 75
+  V_CMP_NLE_F32 = 76
+  V_CMP_NEQ_F32 = 77
+  V_CMP_NLT_F32 = 78
+  V_CMP_TRU_F32 = 79
+  V_CMPX_F_F32 = 80
+  V_CMPX_LT_F32 = 81
+  V_CMPX_EQ_F32 = 82
+  V_CMPX_LE_F32 = 83
+  V_CMPX_GT_F32 = 84
+  V_CMPX_LG_F32 = 85
+  V_CMPX_GE_F32 = 86
+  V_CMPX_O_F32 = 87
+  V_CMPX_U_F32 = 88
+  V_CMPX_NGE_F32 = 89
+  V_CMPX_NLG_F32 = 90
+  V_CMPX_NGT_F32 = 91
+  V_CMPX_NLE_F32 = 92
+  V_CMPX_NEQ_F32 = 93
+  V_CMPX_NLT_F32 = 94
+  V_CMPX_TRU_F32 = 95
+  V_CMP_F_F64 = 96
+  V_CMP_LT_F64 = 97
+  V_CMP_EQ_F64 = 98
+  V_CMP_LE_F64 = 99
+  V_CMP_GT_F64 = 100
+  V_CMP_LG_F64 = 101
+  V_CMP_GE_F64 = 102
+  V_CMP_O_F64 = 103
+  V_CMP_U_F64 = 104
+  V_CMP_NGE_F64 = 105
+  V_CMP_NLG_F64 = 106
+  V_CMP_NGT_F64 = 107
+  V_CMP_NLE_F64 = 108
+  V_CMP_NEQ_F64 = 109
+  V_CMP_NLT_F64 = 110
+  V_CMP_TRU_F64 = 111
+  V_CMPX_F_F64 = 112
+  V_CMPX_LT_F64 = 113
+  V_CMPX_EQ_F64 = 114
+  V_CMPX_LE_F64 = 115
+  V_CMPX_GT_F64 = 116
+  V_CMPX_LG_F64 = 117
+  V_CMPX_GE_F64 = 118
+  V_CMPX_O_F64 = 119
+  V_CMPX_U_F64 = 120
+  V_CMPX_NGE_F64 = 121
+  V_CMPX_NLG_F64 = 122
+  V_CMPX_NGT_F64 = 123
+  V_CMPX_NLE_F64 = 124
+  V_CMPX_NEQ_F64 = 125
+  V_CMPX_NLT_F64 = 126
+  V_CMPX_TRU_F64 = 127
+  V_CMP_F_I16 = 160
+  V_CMP_LT_I16 = 161
+  V_CMP_EQ_I16 = 162
+  V_CMP_LE_I16 = 163
+  V_CMP_GT_I16 = 164
+  V_CMP_NE_I16 = 165
+  V_CMP_GE_I16 = 166
+  V_CMP_T_I16 = 167
+  V_CMP_F_U16 = 168
+  V_CMP_LT_U16 = 169
+  V_CMP_EQ_U16 = 170
+  V_CMP_LE_U16 = 171
+  V_CMP_GT_U16 = 172
+  V_CMP_NE_U16 = 173
+  V_CMP_GE_U16 = 174
+  V_CMP_T_U16 = 175
+  V_CMPX_F_I16 = 176
+  V_CMPX_LT_I16 = 177
+  V_CMPX_EQ_I16 = 178
+  V_CMPX_LE_I16 = 179
+  V_CMPX_GT_I16 = 180
+  V_CMPX_NE_I16 = 181
+  V_CMPX_GE_I16 = 182
+  V_CMPX_T_I16 = 183
+  V_CMPX_F_U16 = 184
+  V_CMPX_LT_U16 = 185
+  V_CMPX_EQ_U16 = 186
+  V_CMPX_LE_U16 = 187
+  V_CMPX_GT_U16 = 188
+  V_CMPX_NE_U16 = 189
+  V_CMPX_GE_U16 = 190
+  V_CMPX_T_U16 = 191
+  V_CMP_F_I32 = 192
+  V_CMP_LT_I32 = 193
+  V_CMP_EQ_I32 = 194
+  V_CMP_LE_I32 = 195
+  V_CMP_GT_I32 = 196
+  V_CMP_NE_I32 = 197
+  V_CMP_GE_I32 = 198
+  V_CMP_T_I32 = 199
+  V_CMP_F_U32 = 200
+  V_CMP_LT_U32 = 201
+  V_CMP_EQ_U32 = 202
+  V_CMP_LE_U32 = 203
+  V_CMP_GT_U32 = 204
+  V_CMP_NE_U32 = 205
+  V_CMP_GE_U32 = 206
+  V_CMP_T_U32 = 207
+  V_CMPX_F_I32 = 208
+  V_CMPX_LT_I32 = 209
+  V_CMPX_EQ_I32 = 210
+  V_CMPX_LE_I32 = 211
+  V_CMPX_GT_I32 = 212
+  V_CMPX_NE_I32 = 213
+  V_CMPX_GE_I32 = 214
+  V_CMPX_T_I32 = 215
+  V_CMPX_F_U32 = 216
+  V_CMPX_LT_U32 = 217
+  V_CMPX_EQ_U32 = 218
+  V_CMPX_LE_U32 = 219
+  V_CMPX_GT_U32 = 220
+  V_CMPX_NE_U32 = 221
+  V_CMPX_GE_U32 = 222
+  V_CMPX_T_U32 = 223
+  V_CMP_F_I64 = 224
+  V_CMP_LT_I64 = 225
+  V_CMP_EQ_I64 = 226
+  V_CMP_LE_I64 = 227
+  V_CMP_GT_I64 = 228
+  V_CMP_NE_I64 = 229
+  V_CMP_GE_I64 = 230
+  V_CMP_T_I64 = 231
+  V_CMP_F_U64 = 232
+  V_CMP_LT_U64 = 233
+  V_CMP_EQ_U64 = 234
+  V_CMP_LE_U64 = 235
+  V_CMP_GT_U64 = 236
+  V_CMP_NE_U64 = 237
+  V_CMP_GE_U64 = 238
+  V_CMP_T_U64 = 239
+  V_CMPX_F_I64 = 240
+  V_CMPX_LT_I64 = 241
+  V_CMPX_EQ_I64 = 242
+  V_CMPX_LE_I64 = 243
+  V_CMPX_GT_I64 = 244
+  V_CMPX_NE_I64 = 245
+  V_CMPX_GE_I64 = 246
+  V_CMPX_T_I64 = 247
+  V_CMPX_F_U64 = 248
+  V_CMPX_LT_U64 = 249
+  V_CMPX_EQ_U64 = 250
+  V_CMPX_LE_U64 = 251
+  V_CMPX_GT_U64 = 252
+  V_CMPX_NE_U64 = 253
+  V_CMPX_GE_U64 = 254
+  V_CMPX_T_U64 = 255
+  V_CNDMASK_B32 = 256
+  V_ADD_F32 = 257
+  V_SUB_F32 = 258
+  V_SUBREV_F32 = 259
+  V_FMAC_F64 = 260
+  V_MUL_F32 = 261
+  V_MUL_I32_I24 = 262
+  V_MUL_HI_I32_I24 = 263
+  V_MUL_U32_U24 = 264
+  V_MUL_HI_U32_U24 = 265
+  V_MIN_F32 = 266
+  V_MAX_F32 = 267
+  V_MIN_I32 = 268
+  V_MAX_I32 = 269
+  V_MIN_U32 = 270
+  V_MAX_U32 = 271
+  V_LSHRREV_B32 = 272
+  V_ASHRREV_I32 = 273
+  V_LSHLREV_B32 = 274
+  V_AND_B32 = 275
+  V_OR_B32 = 276
+  V_XOR_B32 = 277
+  V_DOT2C_F32_BF16 = 278
+  V_ADD_F16 = 287
+  V_SUB_F16 = 288
+  V_SUBREV_F16 = 289
+  V_MUL_F16 = 290
+  V_MAC_F16 = 291
+  V_ADD_U16 = 294
+  V_SUB_U16 = 295
+  V_SUBREV_U16 = 296
+  V_MUL_LO_U16 = 297
+  V_LSHLREV_B16 = 298
+  V_LSHRREV_B16 = 299
+  V_ASHRREV_I16 = 300
+  V_MAX_F16 = 301
+  V_MIN_F16 = 302
+  V_MAX_U16 = 303
+  V_MAX_I16 = 304
+  V_MIN_U16 = 305
+  V_MIN_I16 = 306
+  V_LDEXP_F16 = 307
+  V_ADD_U32 = 308
+  V_SUB_U32 = 309
+  V_SUBREV_U32 = 310
+  V_DOT2C_F32_F16 = 311
+  V_DOT2C_I32_I16 = 312
+  V_DOT4C_I32_I8 = 313
+  V_DOT8C_I32_I4 = 314
+  V_FMAC_F32 = 315
+  V_PK_FMAC_F16 = 316
+  V_XNOR_B32 = 317
+  V_NOP = 384
+  V_MOV_B32 = 385
+  V_READFIRSTLANE_B32 = 386
+  V_CVT_I32_F64 = 387
+  V_CVT_F64_I32 = 388
+  V_CVT_F32_I32 = 389
+  V_CVT_F32_U32 = 390
+  V_CVT_U32_F32 = 391
+  V_CVT_I32_F32 = 392
+  V_CVT_F16_F32 = 394
+  V_CVT_F32_F16 = 395
+  V_CVT_RPI_I32_F32 = 396
+  V_CVT_FLR_I32_F32 = 397
+  V_CVT_OFF_F32_I4 = 398
+  V_CVT_F32_F64 = 399
+  V_CVT_F64_F32 = 400
+  V_CVT_F32_UBYTE0 = 401
+  V_CVT_F32_UBYTE1 = 402
+  V_CVT_F32_UBYTE2 = 403
+  V_CVT_F32_UBYTE3 = 404
+  V_CVT_U32_F64 = 405
+  V_CVT_F64_U32 = 406
+  V_TRUNC_F64 = 407
+  V_CEIL_F64 = 408
+  V_RNDNE_F64 = 409
+  V_FLOOR_F64 = 410
+  V_FRACT_F32 = 411
+  V_TRUNC_F32 = 412
+  V_CEIL_F32 = 413
+  V_RNDNE_F32 = 414
+  V_FLOOR_F32 = 415
+  V_EXP_F32 = 416
+  V_LOG_F32 = 417
+  V_RCP_F32 = 418
+  V_RCP_IFLAG_F32 = 419
+  V_RSQ_F32 = 420
+  V_RCP_F64 = 421
+  V_RSQ_F64 = 422
+  V_SQRT_F32 = 423
+  V_SQRT_F64 = 424
+  V_SIN_F32 = 425
+  V_COS_F32 = 426
+  V_NOT_B32 = 427
+  V_BFREV_B32 = 428
+  V_FFBH_U32 = 429
+  V_FFBL_B32 = 430
+  V_FFBH_I32 = 431
+  V_FREXP_EXP_I32_F64 = 432
+  V_FREXP_MANT_F64 = 433
+  V_FRACT_F64 = 434
+  V_FREXP_EXP_I32_F32 = 435
+  V_FREXP_MANT_F32 = 436
+  V_CLREXCP = 437
+  V_MOV_B64 = 440
+  V_CVT_F16_U16 = 441
+  V_CVT_F16_I16 = 442
+  V_CVT_U16_F16 = 443
+  V_CVT_I16_F16 = 444
+  V_RCP_F16 = 445
+  V_SQRT_F16 = 446
+  V_RSQ_F16 = 447
+  V_LOG_F16 = 448
+  V_EXP_F16 = 449
+  V_MAD_I32_I24 = 450
+  V_MAD_U32_U24 = 451
+  V_CUBEID_F32 = 452
+  V_CUBESC_F32 = 453
+  V_CUBETC_F32 = 454
+  V_CUBEMA_F32 = 455
+  V_BFE_U32 = 456
+  V_BFE_I32 = 457
+  V_BFI_B32 = 458
+  V_FMA_F32 = 459
+  V_FMA_F64 = 460
+  V_LERP_U8 = 461
+  V_ALIGNBIT_B32 = 462
+  V_ALIGNBYTE_B32 = 463
+  V_MIN3_F32 = 464
+  V_MIN3_I32 = 465
+  V_MIN3_U32 = 466
+  V_MAX3_F32 = 467
+  V_MAX3_I32 = 468
+  V_MAX3_U32 = 469
+  V_MED3_F32 = 470
+  V_MED3_I32 = 471
+  V_MED3_U32 = 472
+  V_SAD_U8 = 473
+  V_SAD_HI_U8 = 474
+  V_SAD_U16 = 475
+  V_SAD_U32 = 476
+  V_CVT_PK_U8_F32 = 477
+  V_DIV_FIXUP_F32 = 478
+  V_DIV_FIXUP_F64 = 479
+  V_DIV_FMAS_F32 = 482
+  V_DIV_FMAS_F64 = 483
+  V_MSAD_U8 = 484
+  V_QSAD_PK_U16_U8 = 485
+  V_MQSAD_PK_U16_U8 = 486
+  V_MQSAD_U32_U8 = 487
+  V_MAD_LEGACY_F16 = 490
+  V_MAD_LEGACY_U16 = 491
+  V_MAD_LEGACY_I16 = 492
+  V_PERM_B32 = 493
+  V_FMA_LEGACY_F16 = 494
+  V_DIV_FIXUP_LEGACY_F16 = 495
+  V_CVT_PKACCUM_U8_F32 = 496
+  V_MAD_U32_U16 = 497
+  V_MAD_I32_I16 = 498
+  V_XAD_U32 = 499
+  V_MIN3_F16 = 500
+  V_MIN3_I16 = 501
+  V_MIN3_U16 = 502
+  V_MAX3_F16 = 503
+  V_MAX3_I16 = 504
+  V_MAX3_U16 = 505
+  V_MED3_F16 = 506
+  V_MED3_I16 = 507
+  V_MED3_U16 = 508
+  V_LSHL_ADD_U32 = 509
+  V_ADD_LSHL_U32 = 510
+  V_ADD3_U32 = 511
+  V_LSHL_OR_B32 = 512
+  V_AND_OR_B32 = 513
+  V_OR3_B32 = 514
+  V_MAD_F16 = 515
+  V_MAD_U16 = 516
+  V_MAD_I16 = 517
+  V_FMA_F16 = 518
+  V_DIV_FIXUP_F16 = 519
+  V_LSHL_ADD_U64 = 520
+  V_BITOP3_B16 = 563
+  V_BITOP3_B32 = 564
+  V_CVT_SCALEF32_PK_FP8_F32 = 565
+  V_CVT_SCALEF32_PK_BF8_F32 = 566
+  V_CVT_SCALEF32_SR_FP8_F32 = 567
+  V_CVT_SCALEF32_SR_BF8_F32 = 568
+  V_CVT_SCALEF32_PK_F32_FP8 = 569
+  V_CVT_SCALEF32_PK_F32_BF8 = 570
+  V_CVT_SCALEF32_F32_FP8 = 571
+  V_CVT_SCALEF32_F32_BF8 = 572
+  V_CVT_SCALEF32_PK_FP4_F32 = 573
+  V_CVT_SCALEF32_SR_PK_FP4_F32 = 574
+  V_CVT_SCALEF32_PK_F32_FP4 = 575
+  V_CVT_SCALEF32_PK_FP8_F16 = 576
+  V_CVT_SCALEF32_PK_BF8_F16 = 577
+  V_CVT_SCALEF32_SR_FP8_F16 = 578
+  V_CVT_SCALEF32_SR_BF8_F16 = 579
+  V_CVT_SCALEF32_PK_FP8_BF16 = 580
+  V_CVT_SCALEF32_PK_BF8_BF16 = 581
+  V_CVT_SCALEF32_SR_FP8_BF16 = 582
+  V_CVT_SCALEF32_SR_BF8_BF16 = 583
+  V_CVT_SCALEF32_PK_F16_FP8 = 584
+  V_CVT_SCALEF32_PK_F16_BF8 = 585
+  V_CVT_SCALEF32_F16_FP8 = 586
+  V_CVT_SCALEF32_F16_BF8 = 587
+  V_CVT_SCALEF32_PK_FP4_F16 = 588
+  V_CVT_SCALEF32_PK_FP4_BF16 = 589
+  V_CVT_SCALEF32_SR_PK_FP4_F16 = 590
+  V_CVT_SCALEF32_SR_PK_FP4_BF16 = 591
+  V_CVT_SCALEF32_PK_F16_FP4 = 592
+  V_CVT_SCALEF32_PK_BF16_FP4 = 593
+  V_CVT_SCALEF32_2XPK16_FP6_F32 = 594
+  V_CVT_SCALEF32_2XPK16_BF6_F32 = 595
+  V_CVT_SCALEF32_SR_PK32_FP6_F32 = 596
+  V_CVT_SCALEF32_SR_PK32_BF6_F32 = 597
+  V_CVT_SCALEF32_PK32_F32_FP6 = 598
+  V_CVT_SCALEF32_PK32_F32_BF6 = 599
+  CDNA4 = 600
+  V_CVT_SCALEF32_PK32_FP6_BF16 = 601
+  V_CVT_SCALEF32_PK32_BF6_F16 = 602
+  V_CVT_SCALEF32_PK32_BF6_BF16 = 603
+  V_CVT_SCALEF32_SR_PK32_FP6_F16 = 604
+  V_CVT_SCALEF32_SR_PK32_FP6_BF16 = 605
+  V_CVT_SCALEF32_SR_PK32_BF6_F16 = 606
+  V_CVT_SCALEF32_SR_PK32_BF6_BF16 = 607
+  V_CVT_SCALEF32_PK32_F16_FP6 = 608
+  V_CVT_SCALEF32_PK32_BF16_FP6 = 609
+  V_CVT_SCALEF32_PK32_F16_BF6 = 610
+  V_CVT_SCALEF32_PK32_BF16_BF6 = 611
+  V_ASHR_PK_I8_I32 = 613
+  V_ASHR_PK_U8_I32 = 614
+  V_CVT_PK_F16_F32 = 615
+  V_CVT_PK_BF16_F32 = 616
+  V_CVT_SCALEF32_PK_BF16_FP8 = 617
+  V_CVT_SCALEF32_PK_BF16_BF8 = 618
+  V_ADD_F64 = 640
+  V_MUL_F64 = 641
+  V_MIN_F64 = 642
+  V_MAX_F64 = 643
+  V_LDEXP_F64 = 644
+  V_MUL_LO_U32 = 645
+  V_MUL_HI_U32 = 646
+  V_MUL_HI_I32 = 647
+  V_LDEXP_F32 = 648
+  V_READLANE_B32 = 649
+  V_WRITELANE_B32 = 650
+  V_BCNT_U32_B32 = 651
+  V_MBCNT_LO_U32_B32 = 652
+  V_MBCNT_HI_U32_B32 = 653
+  V_LSHLREV_B64 = 655
+  V_LSHRREV_B64 = 656
+  V_ASHRREV_I64 = 657
+  V_TRIG_PREOP_F64 = 658
+  V_BFM_B32 = 659
+  V_CVT_PKNORM_I16_F32 = 660
+  V_CVT_PKNORM_U16_F32 = 661
+  V_CVT_PKRTZ_F16_F32 = 662
+  V_CVT_PK_U16_U32 = 663
+  V_CVT_PK_I16_I32 = 664
+  V_CVT_PKNORM_I16_F16 = 665
+  V_CVT_PKNORM_U16_F16 = 666
+  V_ADD_I32 = 668
+  V_SUB_I32 = 669
+  V_ADD_I16 = 670
+  V_SUB_I16 = 671
+  V_PACK_B32_F16 = 672
+  V_MUL_LEGACY_F32 = 673
+  V_CVT_PK_FP8_F32 = 674
+  V_CVT_PK_BF8_F32 = 675
+  V_CVT_SR_FP8_F32 = 676
+  V_CVT_SR_BF8_F32 = 677
+  V_CVT_SR_F16_F32 = 678
+  V_CVT_SR_BF16_F32 = 679
+  V_MINIMUM3_F32 = 680
+  V_MAXIMUM3_F32 = 681
+
+class VOP3BOp(IntEnum):
+  V_ADD_CO_U32 = 281
+  V_SUB_CO_U32 = 282
+  V_SUBREV_CO_U32 = 283
+  V_ADDC_CO_U32 = 284
+  V_SUBB_CO_U32 = 285
+  V_SUBBREV_CO_U32 = 286
+  V_DIV_SCALE_F32 = 480
+  V_DIV_SCALE_F64 = 481
+  V_MAD_U64_U32 = 488
+  V_MAD_I64_I32 = 489
+  CDNA4 = 600
+
+class VOP3POp(IntEnum):
+  V_PK_MAD_I16 = 0
+  V_PK_MUL_LO_U16 = 1
+  V_PK_ADD_I16 = 2
+  V_PK_SUB_I16 = 3
+  V_PK_LSHLREV_B16 = 4
+  V_PK_LSHRREV_B16 = 5
+  V_PK_ASHRREV_I16 = 6
+  V_PK_MAX_I16 = 7
+  V_PK_MIN_I16 = 8
+  V_PK_MAD_U16 = 9
+  V_PK_ADD_U16 = 10
+  V_PK_SUB_U16 = 11
+  V_PK_MAX_U16 = 12
+  V_PK_MIN_U16 = 13
+  V_PK_FMA_F16 = 14
+  V_PK_ADD_F16 = 15
+  V_PK_MUL_F16 = 16
+  V_PK_MIN_F16 = 17
+  V_PK_MAX_F16 = 18
+  V_DOT2_F32_BF16 = 26
+  V_PK_MINIMUM3_F16 = 27
+  V_PK_MAXIMUM3_F16 = 28
+  V_MAD_MIX_F32 = 32
+  V_MAD_MIXLO_F16 = 33
+  V_MAD_MIXHI_F16 = 34
+  V_DOT2_F32_F16 = 35
+  V_DOT2_I32_I16 = 38
+  V_DOT2_U32_U16 = 39
+  V_DOT4_I32_I8 = 40
+  V_DOT4_U32_U8 = 41
+  V_DOT8_I32_I4 = 42
+  V_DOT8_U32_U4 = 43
+  V_MFMA_F32_16X16X128_F8F6F4 = 45
+  V_MFMA_F32_32X32X64_F8F6F4 = 46
+  V_PK_FMA_F32 = 48
+  V_PK_MUL_F32 = 49
+  V_PK_ADD_F32 = 50
+  V_PK_MOV_B32 = 51
+  V_MFMA_F32_16X16X32_BF16 = 53
+  V_MFMA_I32_16X16X64_I8 = 54
+  V_MFMA_F32_32X32X16_BF16 = 55
+  V_MFMA_I32_32X32X32_I8 = 56
+  V_SMFMAC_F32_16X16X64_BF16 = 57
+  V_SMFMAC_I32_16X16X128_I8 = 58
+  V_SMFMAC_F32_16X16X128_BF8_BF8 = 59
+  V_SMFMAC_F32_16X16X128_BF8_FP8 = 60
+  V_SMFMAC_F32_16X16X128_FP8_BF8 = 61
+  V_MFMA_F32_32X32X1_2B_F32 = 64
+  V_MFMA_F32_16X16X1_4B_F32 = 65
+  V_MFMA_F32_4X4X1_16B_F32 = 66
+  V_SMFMAC_F32_16X16X128_FP8_FP8 = 67
+  V_MFMA_F32_32X32X2_F32 = 68
+  V_MFMA_F32_16X16X4_F32 = 69
+  V_SMFMAC_F32_32X32X32_BF16 = 70
+  V_SMFMAC_I32_32X32X64_I8 = 71
+  V_MFMA_F32_32X32X4_2B_F16 = 72
+  V_MFMA_F32_16X16X4_4B_F16 = 73
+  V_MFMA_F32_4X4X4_16B_F16 = 74
+  V_SMFMAC_F32_32X32X64_BF8_BF8 = 75
+  V_MFMA_F32_32X32X8_F16 = 76
+  V_MFMA_F32_16X16X16_F16 = 77
+  V_SMFMAC_F32_32X32X64_BF8_FP8 = 78
+  V_SMFMAC_F32_32X32X64_FP8_BF8 = 79
+  V_MFMA_I32_32X32X4_2B_I8 = 80
+  V_MFMA_I32_16X16X4_4B_I8 = 81
+  V_MFMA_I32_4X4X4_16B_I8 = 82
+  V_SMFMAC_F32_32X32X64_FP8_FP8 = 83
+  V_MFMA_F32_16X16X32_F16 = 84
+  V_MFMA_F32_32X32X16_F16 = 85
+  V_MFMA_I32_32X32X16_I8 = 86
+  V_MFMA_I32_16X16X32_I8 = 87
+  V_ACCVGPR_READ = 88
+  V_ACCVGPR_WRITE = 89
+  V_SMFMAC_F32_16X16X64_F16 = 90
+  V_SMFMAC_F32_32X32X32_F16 = 91
+  V_MFMA_F32_32X32X4_2B_BF16 = 93
+  V_MFMA_F32_16X16X4_4B_BF16 = 94
+  V_MFMA_F32_4X4X4_16B_BF16 = 95
+  V_MFMA_F32_32X32X8_BF16 = 96
+  V_MFMA_F32_16X16X16_BF16 = 97
+  V_SMFMAC_F32_16X16X32_F16 = 98
+  V_SMFMAC_F32_32X32X16_F16 = 100
+  V_SMFMAC_F32_16X16X32_BF16 = 102
+  V_SMFMAC_F32_32X32X16_BF16 = 104
+  V_SMFMAC_I32_16X16X64_I8 = 106
+  V_SMFMAC_I32_32X32X32_I8 = 108
+  V_MFMA_F64_16X16X4_F64 = 110
+  V_MFMA_F64_4X4X4_4B_F64 = 111
+  V_MFMA_F32_16X16X32_BF8_BF8 = 112
+  V_MFMA_F32_16X16X32_BF8_FP8 = 113
+  V_MFMA_F32_16X16X32_FP8_BF8 = 114
+  V_MFMA_F32_16X16X32_FP8_FP8 = 115
+  V_MFMA_F32_32X32X16_BF8_BF8 = 116
+  V_MFMA_F32_32X32X16_BF8_FP8 = 117
+  V_MFMA_F32_32X32X16_FP8_BF8 = 118
+  V_MFMA_F32_32X32X16_FP8_FP8 = 119
+  V_SMFMAC_F32_16X16X64_BF8_BF8 = 120
+  V_SMFMAC_F32_16X16X64_BF8_FP8 = 121
+  V_SMFMAC_F32_16X16X64_FP8_BF8 = 122
+  V_SMFMAC_F32_16X16X64_FP8_FP8 = 123
+  V_SMFMAC_F32_32X32X32_BF8_BF8 = 124
+  V_SMFMAC_F32_32X32X32_BF8_FP8 = 125
+  V_SMFMAC_F32_32X32X32_FP8_BF8 = 126
+  V_SMFMAC_F32_32X32X32_FP8_FP8 = 127
+  CDNA4 = 600
+
+class VOPCOp(IntEnum):
+  V_CMP_CLASS_F32 = 16
+  V_CMPX_CLASS_F32 = 17
+  V_CMP_CLASS_F64 = 18
+  V_CMPX_CLASS_F64 = 19
+  V_CMP_CLASS_F16 = 20
+  V_CMPX_CLASS_F16 = 21
+  V_CMP_F_F16 = 32
+  V_CMP_LT_F16 = 33
+  V_CMP_EQ_F16 = 34
+  V_CMP_LE_F16 = 35
+  V_CMP_GT_F16 = 36
+  V_CMP_LG_F16 = 37
+  V_CMP_GE_F16 = 38
+  V_CMP_O_F16 = 39
+  V_CMP_U_F16 = 40
+  V_CMP_NGE_F16 = 41
+  V_CMP_NLG_F16 = 42
+  V_CMP_NGT_F16 = 43
+  V_CMP_NLE_F16 = 44
+  V_CMP_NEQ_F16 = 45
+  V_CMP_NLT_F16 = 46
+  V_CMP_TRU_F16 = 47
+  V_CMPX_F_F16 = 48
+  V_CMPX_LT_F16 = 49
+  V_CMPX_EQ_F16 = 50
+  V_CMPX_LE_F16 = 51
+  V_CMPX_GT_F16 = 52
+  V_CMPX_LG_F16 = 53
+  V_CMPX_GE_F16 = 54
+  V_CMPX_O_F16 = 55
+  V_CMPX_U_F16 = 56
+  V_CMPX_NGE_F16 = 57
+  V_CMPX_NLG_F16 = 58
+  V_CMPX_NGT_F16 = 59
+  V_CMPX_NLE_F16 = 60
+  V_CMPX_NEQ_F16 = 61
+  V_CMPX_NLT_F16 = 62
+  V_CMPX_TRU_F16 = 63
+  V_CMP_F_F32 = 64
+  V_CMP_LT_F32 = 65
+  V_CMP_EQ_F32 = 66
+  V_CMP_LE_F32 = 67
+  V_CMP_GT_F32 = 68
+  V_CMP_LG_F32 = 69
+  V_CMP_GE_F32 = 70
+  V_CMP_O_F32 = 71
+  V_CMP_U_F32 = 72
+  V_CMP_NGE_F32 = 73
+  V_CMP_NLG_F32 = 74
+  V_CMP_NGT_F32 = 75
+  V_CMP_NLE_F32 = 76
+  V_CMP_NEQ_F32 = 77
+  V_CMP_NLT_F32 = 78
+  V_CMP_TRU_F32 = 79
+  V_CMPX_F_F32 = 80
+  V_CMPX_LT_F32 = 81
+  V_CMPX_EQ_F32 = 82
+  V_CMPX_LE_F32 = 83
+  V_CMPX_GT_F32 = 84
+  V_CMPX_LG_F32 = 85
+  V_CMPX_GE_F32 = 86
+  V_CMPX_O_F32 = 87
+  V_CMPX_U_F32 = 88
+  V_CMPX_NGE_F32 = 89
+  V_CMPX_NLG_F32 = 90
+  V_CMPX_NGT_F32 = 91
+  V_CMPX_NLE_F32 = 92
+  V_CMPX_NEQ_F32 = 93
+  V_CMPX_NLT_F32 = 94
+  V_CMPX_TRU_F32 = 95
+  V_CMP_F_F64 = 96
+  V_CMP_LT_F64 = 97
+  V_CMP_EQ_F64 = 98
+  V_CMP_LE_F64 = 99
+  V_CMP_GT_F64 = 100
+  V_CMP_LG_F64 = 101
+  V_CMP_GE_F64 = 102
+  V_CMP_O_F64 = 103
+  V_CMP_U_F64 = 104
+  V_CMP_NGE_F64 = 105
+  V_CMP_NLG_F64 = 106
+  V_CMP_NGT_F64 = 107
+  V_CMP_NLE_F64 = 108
+  V_CMP_NEQ_F64 = 109
+  V_CMP_NLT_F64 = 110
+  V_CMP_TRU_F64 = 111
+  V_CMPX_F_F64 = 112
+  V_CMPX_LT_F64 = 113
+  V_CMPX_EQ_F64 = 114
+  V_CMPX_LE_F64 = 115
+  V_CMPX_GT_F64 = 116
+  V_CMPX_LG_F64 = 117
+  V_CMPX_GE_F64 = 118
+  V_CMPX_O_F64 = 119
+  V_CMPX_U_F64 = 120
+  V_CMPX_NGE_F64 = 121
+  V_CMPX_NLG_F64 = 122
+  V_CMPX_NGT_F64 = 123
+  V_CMPX_NLE_F64 = 124
+  V_CMPX_NEQ_F64 = 125
+  V_CMPX_NLT_F64 = 126
+  V_CMPX_TRU_F64 = 127
+  V_CMP_F_I16 = 160
+  V_CMP_LT_I16 = 161
+  V_CMP_EQ_I16 = 162
+  V_CMP_LE_I16 = 163
+  V_CMP_GT_I16 = 164
+  V_CMP_NE_I16 = 165
+  V_CMP_GE_I16 = 166
+  V_CMP_T_I16 = 167
+  V_CMP_F_U16 = 168
+  V_CMP_LT_U16 = 169
+  V_CMP_EQ_U16 = 170
+  V_CMP_LE_U16 = 171
+  V_CMP_GT_U16 = 172
+  V_CMP_NE_U16 = 173
+  V_CMP_GE_U16 = 174
+  V_CMP_T_U16 = 175
+  V_CMPX_F_I16 = 176
+  V_CMPX_LT_I16 = 177
+  V_CMPX_EQ_I16 = 178
+  V_CMPX_LE_I16 = 179
+  V_CMPX_GT_I16 = 180
+  V_CMPX_NE_I16 = 181
+  V_CMPX_GE_I16 = 182
+  V_CMPX_T_I16 = 183
+  V_CMPX_F_U16 = 184
+  V_CMPX_LT_U16 = 185
+  V_CMPX_EQ_U16 = 186
+  V_CMPX_LE_U16 = 187
+  V_CMPX_GT_U16 = 188
+  V_CMPX_NE_U16 = 189
+  V_CMPX_GE_U16 = 190
+  V_CMPX_T_U16 = 191
+  V_CMP_F_I32 = 192
+  V_CMP_LT_I32 = 193
+  V_CMP_EQ_I32 = 194
+  V_CMP_LE_I32 = 195
+  V_CMP_GT_I32 = 196
+  V_CMP_NE_I32 = 197
+  V_CMP_GE_I32 = 198
+  V_CMP_T_I32 = 199
+  V_CMP_F_U32 = 200
+  V_CMP_LT_U32 = 201
+  V_CMP_EQ_U32 = 202
+  V_CMP_LE_U32 = 203
+  V_CMP_GT_U32 = 204
+  V_CMP_NE_U32 = 205
+  V_CMP_GE_U32 = 206
+  V_CMP_T_U32 = 207
+  V_CMPX_F_I32 = 208
+  V_CMPX_LT_I32 = 209
+  V_CMPX_EQ_I32 = 210
+  V_CMPX_LE_I32 = 211
+  V_CMPX_GT_I32 = 212
+  V_CMPX_NE_I32 = 213
+  V_CMPX_GE_I32 = 214
+  V_CMPX_T_I32 = 215
+  V_CMPX_F_U32 = 216
+  V_CMPX_LT_U32 = 217
+  V_CMPX_EQ_U32 = 218
+  V_CMPX_LE_U32 = 219
+  V_CMPX_GT_U32 = 220
+  V_CMPX_NE_U32 = 221
+  V_CMPX_GE_U32 = 222
+  V_CMPX_T_U32 = 223
+  V_CMP_F_I64 = 224
+  V_CMP_LT_I64 = 225
+  V_CMP_EQ_I64 = 226
+  V_CMP_LE_I64 = 227
+  V_CMP_GT_I64 = 228
+  V_CMP_NE_I64 = 229
+  V_CMP_GE_I64 = 230
+  V_CMP_T_I64 = 231
+  V_CMP_F_U64 = 232
+  V_CMP_LT_U64 = 233
+  V_CMP_EQ_U64 = 234
+  V_CMP_LE_U64 = 235
+  V_CMP_GT_U64 = 236
+  V_CMP_NE_U64 = 237
+  V_CMP_GE_U64 = 238
+  V_CMP_T_U64 = 239
+  V_CMPX_F_I64 = 240
+  V_CMPX_LT_I64 = 241
+  V_CMPX_EQ_I64 = 242
+  V_CMPX_LE_I64 = 243
+  V_CMPX_GT_I64 = 244
+  V_CMPX_NE_I64 = 245
+  V_CMPX_GE_I64 = 246
+  V_CMPX_T_I64 = 247
+  V_CMPX_F_U64 = 248
+  V_CMPX_LT_U64 = 249
+  V_CMPX_EQ_U64 = 250
+  V_CMPX_LE_U64 = 251
+  V_CMPX_GT_U64 = 252
+  V_CMPX_NE_U64 = 253
+  V_CMPX_GE_U64 = 254
+  V_CMPX_T_U64 = 255
+  CDNA4 = 600
+
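All of the opcode tables above are plain IntEnums, which is what makes the name/value round-trip in asm.py work (autogen.VOP3SDOp(op_val).name.lower() for disassembly, indexing by name when assembling). A minimal illustration against the CDNA4 values defined above:

from extra.assembly.amd.autogen import cdna4

assert cdna4.SOP2Op(0).name.lower() == "s_add_u32"  # value -> mnemonic (disasm direction)
assert cdna4.VOP1Op["V_MOV_B32"] == 1               # mnemonic -> value (asm direction)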
+# instruction formats
+class DPP(Inst64):
+  encoding = bits[31:26] == 0b110110
+  offset0 = bits[7:0]
+  offset1 = bits[15:8]
+  op = bits[24:17]
+  acc = bits[25]
+  addr:VGPRField = bits[39:32]
+  data0:VGPRField = bits[47:40]
+  data1:VGPRField = bits[55:48]
+  vdst:VGPRField = bits[63:56]
+  row_mask = bits[63:60]
+
+class DS(Inst64):
+  encoding = bits[31:26] == 0b110110
+  op:Annotated[BitField, DSOp] = bits[24:17]
+  vdst:VGPRField = bits[63:56]
+  addr:VGPRField = bits[39:32]
+  data0:VGPRField = bits[47:40]
+  data1:VGPRField = bits[55:48]
+  offset0 = bits[7:0]
+  offset1 = bits[15:8]
+  acc = bits[25]
+
+class FLAT(Inst64):
+  encoding = bits[31:26] == 0b110111
+  op:Annotated[BitField, FLATOp] = bits[24:18]
+  vdst:VGPRField = bits[63:56]
+  addr:VGPRField = bits[39:32]
+  data:VGPRField = bits[47:40]
+  saddr:SSrc = bits[54:48]
+  offset:Imm = bits[12:0]
+  seg = bits[15:14]
+  lds = bits[13]
+  sc0 = bits[16]
+  nt = bits[17]
+  sc1 = bits[25]
+  acc = bits[55]
+
+class MTBUF(Inst64):
+  encoding = bits[31:26] == 0b111010
+  op:Annotated[BitField, MTBUFOp] = bits[18:15]
+  vdata:VGPRField = bits[47:40]
+  vaddr:VGPRField = bits[39:32]
+  srsrc:SGPRField = bits[52:48]
+  soffset:SSrc = bits[63:56]
+  offset:Imm = bits[11:0]
+  offen = bits[12]
+  idxen = bits[13]
+  sc0 = bits[14]
+  sc1 = bits[53]
+  nt = bits[54]
+  acc = bits[55]
+
+class MUBUF(Inst64):
+  encoding = bits[31:26] == 0b111000
+  op:Annotated[BitField, MUBUFOp] = bits[24:18]
+  vdata:VGPRField = bits[47:40]
+  vaddr:VGPRField = bits[39:32]
+  srsrc:SGPRField = bits[52:48]
+  soffset:SSrc = bits[63:56]
+  offset:Imm = bits[11:0]
+  offen = bits[12]
+  idxen = bits[13]
+  sc0 = bits[14]
+  sc1 = bits[15]
+  lds = bits[16]
+  nt = bits[17]
+  acc = bits[55]
+
+class SDWA(Inst64):
+  src0:Src = bits[39:32]
+  dst_sel = bits[42:40]
+  dst_u = bits[44:43]
+  clmp = bits[45]
+  omod = bits[47:46]
+  src0_sel = bits[50:48]
+  src0_sext = bits[51]
+  src0_neg = bits[52]
+  src0_abs = bits[53]
+  s0 = bits[55]
+  src1_sel = bits[58:56]
+  src1_sext = bits[59]
+  src1_neg = bits[60]
+  src1_abs = bits[61]
+  s1 = bits[63]
+  sdst:SGPRField = bits[46:40]
+  sd = bits[47]
+  row_mask = bits[63:60]
+
+class SMEM(Inst64):
+  encoding = bits[31:26] == 0b110000
+  op:Annotated[BitField, SMEMOp] = bits[25:18]
+  sdata:SGPRField = bits[12:6]
+  sbase:SGPRField = bits[5:0]
+  soffset:SSrc = bits[63:57]
+  offset:Imm = bits[52:32]
+  glc = bits[14]
+  soe = bits[14]
+  nv = bits[15]
+  imm = bits[17]
+
+class SOP1(Inst32):
+  encoding = bits[31:23] == 0b101111101
+  op:Annotated[BitField, SOP1Op] = bits[15:8]
+  sdst:SGPRField = bits[22:16]
+  ssrc0:SSrc = bits[7:0]
+
+class SOP2(Inst32):
+  encoding = bits[31:30] == 0b10
+  op:Annotated[BitField, SOP2Op] = bits[29:23]
+  sdst:SGPRField = bits[22:16]
+  ssrc0:SSrc = bits[7:0]
+  ssrc1:SSrc = bits[15:8]
+
+class SOPC(Inst32):
+  encoding = bits[31:23] == 0b101111110
+  op:Annotated[BitField, SOPCOp] = bits[22:16]
+  ssrc0:SSrc = bits[7:0]
+  ssrc1:SSrc = bits[15:8]
+
+class SOPK(Inst32):
+  encoding = bits[31:28] == 0b1011
+  op:Annotated[BitField, SOPKOp] = bits[27:23]
+  sdst:SGPRField = bits[22:16]
+  simm16:SImm = bits[15:0]
+
+class SOPP(Inst32):
+  encoding = bits[31:23] == 0b101111111
+  op:Annotated[BitField, SOPPOp] = bits[22:16]
+  simm16:SImm = bits[15:0]
+
+class VOP1(Inst32):
+  encoding = bits[31:25] == 0b111111
+  op:Annotated[BitField, VOP1Op] = bits[16:9]
+  vdst:VGPRField = bits[24:17]
+  src0:Src = bits[8:0]
+
+class VOP2(Inst32):
+  encoding = bits[31] == 0
+  op:Annotated[BitField, VOP2Op] = bits[30:25]
+  vdst:VGPRField = bits[24:17]
+  src0:Src = bits[8:0]
+  vsrc1:VGPRField = bits[16:9]
+
+class VOP3A(Inst64):
+  encoding = bits[31:26] == 0b110100
+  vdst:VGPRField = bits[7:0]
+  abs = bits[10:8]
+  opsel = bits[14:11]
+  clmp = bits[15]
+  op:Annotated[BitField, VOP3AOp] = bits[25:16]
+  src0:Src = bits[40:32]
+  src1:Src = bits[49:41]
+  src2:Src = bits[58:50]
+  omod = bits[60:59]
+  neg = bits[63:61]
+
+class VOP3B(Inst64):
+  encoding = bits[31:26] == 0b110100
+  vdst:VGPRField = bits[7:0]
+  sdst:SGPRField = bits[14:8]
+  clmp = bits[15]
+  op:Annotated[BitField, VOP3BOp] = bits[25:16]
+  src0:Src = bits[40:32]
+  src1:Src = bits[49:41]
+  src2:Src = bits[58:50]
+  omod = bits[60:59]
+  neg = bits[63:61]
+
+class VOP3P(Inst64):
+  encoding = bits[31:23] == 0b110100111
+  _defaults = {'opsel_hi': 3, 'opsel_hi2': 1}
+  op:Annotated[BitField, VOP3POp] = bits[22:16]
+  vdst:VGPRField = bits[7:0]
+  src0:Src = bits[40:32]
+  src1:Src = bits[49:41]
+  src2:Src = bits[58:50]
+  neg = bits[63:61]
+  neg_hi = bits[10:8]
+  opsel = bits[13:11]
+  opsel_hi = bits[60:59]
+  clmp = bits[15]
+  opsel_hi2 = bits[14]
+
+class VOPC(Inst32):
+  encoding = bits[31:25] == 0b111110
+  op:Annotated[BitField, VOPCOp] = bits[24:17]
+  src0:Src = bits[8:0]
+  vsrc1:VGPRField = bits[16:9]
+
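Each format class above declares one machine-word layout: bits[hi:lo] marks a field at those bit positions, and encoding = bits[...] == const pins the bits that identify the format. A plain-int sketch of what reading an SOP2 word means (illustrative only, not the lib's BitField implementation; the example word is hand-built from the field offsets above):

def field(word: int, hi: int, lo: int) -> int:
  # what a bits[hi:lo] read does, spelled out with shifts and masks
  return (word >> lo) & ((1 << (hi - lo + 1)) - 1)

word = 0x80000201  # hypothetical SOP2 word for s_add_u32 s0, s1, s2
assert field(word, 31, 30) == 0b10  # SOP2.encoding matches
assert field(word, 29, 23) == 0     # op == SOP2Op.S_ADD_U32
assert field(word, 22, 16) == 0     # sdst = s0
assert field(word, 7, 0) == 1 and field(word, 15, 8) == 2  # ssrc0 = s1, ssrc1 = s2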
+# instruction helpers
+ds_add_u32 = functools.partial(DS, DSOp.DS_ADD_U32)
+ds_sub_u32 = functools.partial(DS, DSOp.DS_SUB_U32)
+ds_rsub_u32 = functools.partial(DS, DSOp.DS_RSUB_U32)
+ds_inc_u32 = functools.partial(DS, DSOp.DS_INC_U32)
+ds_dec_u32 = functools.partial(DS, DSOp.DS_DEC_U32)
+ds_min_i32 = functools.partial(DS, DSOp.DS_MIN_I32)
+ds_max_i32 = functools.partial(DS, DSOp.DS_MAX_I32)
+ds_min_u32 = functools.partial(DS, DSOp.DS_MIN_U32)
+ds_max_u32 = functools.partial(DS, DSOp.DS_MAX_U32)
+ds_and_b32 = functools.partial(DS, DSOp.DS_AND_B32)
+ds_or_b32 = functools.partial(DS, DSOp.DS_OR_B32)
+ds_xor_b32 = functools.partial(DS, DSOp.DS_XOR_B32)
+ds_mskor_b32 = functools.partial(DS, DSOp.DS_MSKOR_B32)
+ds_write_b32 = functools.partial(DS, DSOp.DS_WRITE_B32)
+ds_write2_b32 = functools.partial(DS, DSOp.DS_WRITE2_B32)
+ds_write2st64_b32 = functools.partial(DS, DSOp.DS_WRITE2ST64_B32)
+ds_cmpst_b32 = functools.partial(DS, DSOp.DS_CMPST_B32)
+ds_cmpst_f32 = functools.partial(DS, DSOp.DS_CMPST_F32)
+ds_min_f32 = functools.partial(DS, DSOp.DS_MIN_F32)
+ds_max_f32 = functools.partial(DS, DSOp.DS_MAX_F32)
+ds_nop = functools.partial(DS, DSOp.DS_NOP)
+ds_add_f32 = functools.partial(DS, DSOp.DS_ADD_F32)
+ds_pk_add_f16 = functools.partial(DS, DSOp.DS_PK_ADD_F16)
+ds_pk_add_bf16 = functools.partial(DS, DSOp.DS_PK_ADD_BF16)
+ds_write_addtid_b32 = functools.partial(DS, DSOp.DS_WRITE_ADDTID_B32)
+ds_write_b8 = functools.partial(DS, DSOp.DS_WRITE_B8)
+ds_write_b16 = functools.partial(DS, DSOp.DS_WRITE_B16)
+ds_add_rtn_u32 = functools.partial(DS, DSOp.DS_ADD_RTN_U32)
+ds_sub_rtn_u32 = functools.partial(DS, DSOp.DS_SUB_RTN_U32)
+ds_rsub_rtn_u32 = functools.partial(DS, DSOp.DS_RSUB_RTN_U32)
+ds_inc_rtn_u32 = functools.partial(DS, DSOp.DS_INC_RTN_U32)
+ds_dec_rtn_u32 = functools.partial(DS, DSOp.DS_DEC_RTN_U32)
+ds_min_rtn_i32 = functools.partial(DS, DSOp.DS_MIN_RTN_I32)
+ds_max_rtn_i32 = functools.partial(DS, DSOp.DS_MAX_RTN_I32)
+ds_min_rtn_u32 = functools.partial(DS, DSOp.DS_MIN_RTN_U32)
+ds_max_rtn_u32 = functools.partial(DS, DSOp.DS_MAX_RTN_U32)
+ds_and_rtn_b32 = functools.partial(DS, DSOp.DS_AND_RTN_B32)
+ds_or_rtn_b32 = functools.partial(DS, DSOp.DS_OR_RTN_B32)
+ds_xor_rtn_b32 = functools.partial(DS, DSOp.DS_XOR_RTN_B32)
+ds_mskor_rtn_b32 = functools.partial(DS, DSOp.DS_MSKOR_RTN_B32)
+ds_wrxchg_rtn_b32 = functools.partial(DS, DSOp.DS_WRXCHG_RTN_B32)
+ds_wrxchg2_rtn_b32 = functools.partial(DS, DSOp.DS_WRXCHG2_RTN_B32)
+ds_wrxchg2st64_rtn_b32 = functools.partial(DS, DSOp.DS_WRXCHG2ST64_RTN_B32)
+ds_cmpst_rtn_b32 = functools.partial(DS, DSOp.DS_CMPST_RTN_B32)
+ds_cmpst_rtn_f32 = functools.partial(DS, DSOp.DS_CMPST_RTN_F32)
+ds_min_rtn_f32 = functools.partial(DS, DSOp.DS_MIN_RTN_F32)
+ds_max_rtn_f32 = functools.partial(DS, DSOp.DS_MAX_RTN_F32)
+ds_wrap_rtn_b32 = functools.partial(DS, DSOp.DS_WRAP_RTN_B32)
+ds_add_rtn_f32 = functools.partial(DS, DSOp.DS_ADD_RTN_F32)
+ds_read_b32 = functools.partial(DS, DSOp.DS_READ_B32)
+ds_read2_b32 = functools.partial(DS, DSOp.DS_READ2_B32)
+ds_read2st64_b32 = functools.partial(DS, DSOp.DS_READ2ST64_B32)
+ds_read_i8 = functools.partial(DS, DSOp.DS_READ_I8)
+ds_read_u8 = functools.partial(DS, DSOp.DS_READ_U8)
+ds_read_i16 = functools.partial(DS, DSOp.DS_READ_I16)
+ds_read_u16 = functools.partial(DS, DSOp.DS_READ_U16)
+ds_swizzle_b32 = functools.partial(DS, DSOp.DS_SWIZZLE_B32)
+ds_permute_b32 = functools.partial(DS, DSOp.DS_PERMUTE_B32)
+ds_bpermute_b32 = functools.partial(DS, DSOp.DS_BPERMUTE_B32)
+ds_add_u64 = functools.partial(DS, DSOp.DS_ADD_U64)
+ds_sub_u64 = functools.partial(DS, DSOp.DS_SUB_U64)
+ds_rsub_u64 = functools.partial(DS, DSOp.DS_RSUB_U64)
+ds_inc_u64 = functools.partial(DS, DSOp.DS_INC_U64)
+ds_dec_u64 = functools.partial(DS, DSOp.DS_DEC_U64)
+ds_min_i64 = functools.partial(DS, DSOp.DS_MIN_I64)
+ds_max_i64 = functools.partial(DS, DSOp.DS_MAX_I64)
+ds_min_u64 = functools.partial(DS, DSOp.DS_MIN_U64)
+ds_max_u64 = functools.partial(DS, DSOp.DS_MAX_U64)
+ds_and_b64 = functools.partial(DS, DSOp.DS_AND_B64)
+ds_or_b64 = functools.partial(DS, DSOp.DS_OR_B64)
+ds_xor_b64 = functools.partial(DS, DSOp.DS_XOR_B64)
+ds_mskor_b64 = functools.partial(DS, DSOp.DS_MSKOR_B64)
+ds_write_b64 = functools.partial(DS, DSOp.DS_WRITE_B64)
+ds_write2_b64 = functools.partial(DS, DSOp.DS_WRITE2_B64)
+ds_write2st64_b64 = functools.partial(DS, DSOp.DS_WRITE2ST64_B64)
+ds_cmpst_b64 = functools.partial(DS, DSOp.DS_CMPST_B64)
+ds_cmpst_f64 = functools.partial(DS, DSOp.DS_CMPST_F64)
+ds_min_f64 = functools.partial(DS, DSOp.DS_MIN_F64)
+ds_max_f64 = functools.partial(DS, DSOp.DS_MAX_F64)
+ds_write_b8_d16_hi = functools.partial(DS, DSOp.DS_WRITE_B8_D16_HI)
+ds_write_b16_d16_hi = functools.partial(DS, DSOp.DS_WRITE_B16_D16_HI)
+ds_read_u8_d16 = functools.partial(DS, DSOp.DS_READ_U8_D16)
+ds_read_u8_d16_hi = functools.partial(DS, DSOp.DS_READ_U8_D16_HI)
+ds_read_i8_d16 = functools.partial(DS, DSOp.DS_READ_I8_D16)
+ds_read_i8_d16_hi = functools.partial(DS, DSOp.DS_READ_I8_D16_HI)
+ds_read_u16_d16 = functools.partial(DS, DSOp.DS_READ_U16_D16)
+ds_read_u16_d16_hi = functools.partial(DS, DSOp.DS_READ_U16_D16_HI)
+ds_add_f64 = functools.partial(DS, DSOp.DS_ADD_F64)
+ds_add_rtn_u64 = functools.partial(DS, DSOp.DS_ADD_RTN_U64)
+ds_sub_rtn_u64 = functools.partial(DS, DSOp.DS_SUB_RTN_U64)
+ds_rsub_rtn_u64 = functools.partial(DS, DSOp.DS_RSUB_RTN_U64)
+ds_inc_rtn_u64 = functools.partial(DS, DSOp.DS_INC_RTN_U64)
+ds_dec_rtn_u64 = functools.partial(DS, DSOp.DS_DEC_RTN_U64)
+ds_min_rtn_i64 = functools.partial(DS, DSOp.DS_MIN_RTN_I64)
+ds_max_rtn_i64 = functools.partial(DS, DSOp.DS_MAX_RTN_I64)
+ds_min_rtn_u64 = functools.partial(DS, DSOp.DS_MIN_RTN_U64)
+ds_max_rtn_u64 = functools.partial(DS, DSOp.DS_MAX_RTN_U64)
+ds_and_rtn_b64 = functools.partial(DS, DSOp.DS_AND_RTN_B64)
+ds_or_rtn_b64 = functools.partial(DS, DSOp.DS_OR_RTN_B64)
+ds_xor_rtn_b64 = functools.partial(DS, DSOp.DS_XOR_RTN_B64)
+ds_mskor_rtn_b64 = functools.partial(DS, DSOp.DS_MSKOR_RTN_B64)
+ds_wrxchg_rtn_b64 = functools.partial(DS, DSOp.DS_WRXCHG_RTN_B64)
+ds_wrxchg2_rtn_b64 = functools.partial(DS, DSOp.DS_WRXCHG2_RTN_B64)
+ds_wrxchg2st64_rtn_b64 = functools.partial(DS, DSOp.DS_WRXCHG2ST64_RTN_B64)
+ds_cmpst_rtn_b64 = functools.partial(DS, DSOp.DS_CMPST_RTN_B64)
+ds_cmpst_rtn_f64 = functools.partial(DS, DSOp.DS_CMPST_RTN_F64)
+ds_min_rtn_f64 = functools.partial(DS, DSOp.DS_MIN_RTN_F64)
+ds_max_rtn_f64 = functools.partial(DS, DSOp.DS_MAX_RTN_F64)
+ds_read_b64 = functools.partial(DS, DSOp.DS_READ_B64)
+ds_read2_b64 = functools.partial(DS, DSOp.DS_READ2_B64)
+ds_read2st64_b64 = functools.partial(DS, DSOp.DS_READ2ST64_B64)
+ds_add_rtn_f64 = functools.partial(DS, DSOp.DS_ADD_RTN_F64)
+ds_condxchg32_rtn_b64 = functools.partial(DS, DSOp.DS_CONDXCHG32_RTN_B64)
+ds_read_addtid_b32 = functools.partial(DS, DSOp.DS_READ_ADDTID_B32)
+ds_pk_add_rtn_f16 = functools.partial(DS, DSOp.DS_PK_ADD_RTN_F16)
+ds_pk_add_rtn_bf16 = functools.partial(DS, DSOp.DS_PK_ADD_RTN_BF16)
+ds_consume = functools.partial(DS, DSOp.DS_CONSUME)
+ds_append = functools.partial(DS, DSOp.DS_APPEND)
+ds_write_b96 = functools.partial(DS, DSOp.DS_WRITE_B96)
+ds_write_b128 = functools.partial(DS, DSOp.DS_WRITE_B128)
+ds_read_b64_tr_b4 = functools.partial(DS, DSOp.DS_READ_B64_TR_B4)
+ds_read_b96_tr_b6 = functools.partial(DS, DSOp.DS_READ_B96_TR_B6)
+ds_read_b64_tr_b8 = functools.partial(DS, DSOp.DS_READ_B64_TR_B8)
+ds_read_b64_tr_b16 = functools.partial(DS, DSOp.DS_READ_B64_TR_B16)
+ds_read_b96 = functools.partial(DS, DSOp.DS_READ_B96)
+ds_read_b128 = functools.partial(DS, DSOp.DS_READ_B128)
+cdna4 = functools.partial(DS, DSOp.CDNA4)
+flat_load_ubyte = functools.partial(FLAT, FLATOp.FLAT_LOAD_UBYTE)
+flat_load_sbyte = functools.partial(FLAT, FLATOp.FLAT_LOAD_SBYTE)
+flat_load_ushort = functools.partial(FLAT, FLATOp.FLAT_LOAD_USHORT)
+flat_load_sshort = functools.partial(FLAT, FLATOp.FLAT_LOAD_SSHORT)
+flat_load_dword = functools.partial(FLAT, FLATOp.FLAT_LOAD_DWORD)
+flat_load_dwordx2 = functools.partial(FLAT, FLATOp.FLAT_LOAD_DWORDX2)
+flat_load_dwordx3 = functools.partial(FLAT, FLATOp.FLAT_LOAD_DWORDX3)
+flat_load_dwordx4 = functools.partial(FLAT, FLATOp.FLAT_LOAD_DWORDX4)
+flat_store_byte = functools.partial(FLAT, FLATOp.FLAT_STORE_BYTE)
+flat_store_byte_d16_hi = functools.partial(FLAT, FLATOp.FLAT_STORE_BYTE_D16_HI)
+flat_store_short = functools.partial(FLAT, FLATOp.FLAT_STORE_SHORT)
+flat_store_short_d16_hi = functools.partial(FLAT, FLATOp.FLAT_STORE_SHORT_D16_HI)
+flat_store_dword = functools.partial(FLAT, FLATOp.FLAT_STORE_DWORD)
+flat_store_dwordx2 = functools.partial(FLAT, FLATOp.FLAT_STORE_DWORDX2)
+flat_store_dwordx3 = functools.partial(FLAT, FLATOp.FLAT_STORE_DWORDX3)
+flat_store_dwordx4 = functools.partial(FLAT, FLATOp.FLAT_STORE_DWORDX4)
+flat_load_ubyte_d16 = functools.partial(FLAT, FLATOp.FLAT_LOAD_UBYTE_D16)
+flat_load_ubyte_d16_hi = functools.partial(FLAT, FLATOp.FLAT_LOAD_UBYTE_D16_HI)
+flat_load_sbyte_d16 = functools.partial(FLAT, FLATOp.FLAT_LOAD_SBYTE_D16)
+flat_load_sbyte_d16_hi = functools.partial(FLAT, FLATOp.FLAT_LOAD_SBYTE_D16_HI)
+flat_load_short_d16 = functools.partial(FLAT, FLATOp.FLAT_LOAD_SHORT_D16)
+flat_load_short_d16_hi = functools.partial(FLAT, FLATOp.FLAT_LOAD_SHORT_D16_HI)
+flat_atomic_swap = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_SWAP)
+flat_atomic_cmpswap = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_CMPSWAP)
+flat_atomic_add = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_ADD)
+flat_atomic_sub = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_SUB)
+flat_atomic_smin = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_SMIN)
+flat_atomic_umin = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_UMIN)
+flat_atomic_smax = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_SMAX)
+flat_atomic_umax = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_UMAX)
+flat_atomic_and = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_AND)
+flat_atomic_or = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_OR)
+flat_atomic_xor = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_XOR)
+flat_atomic_inc = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_INC)
+flat_atomic_dec = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_DEC)
+flat_atomic_add_f32 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_ADD_F32)
+flat_atomic_pk_add_f16 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_PK_ADD_F16)
+flat_atomic_add_f64 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_ADD_F64)
+flat_atomic_min_f64 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_MIN_F64)
+flat_atomic_max_f64 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_MAX_F64)
+flat_atomic_pk_add_bf16 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_PK_ADD_BF16)
+flat_atomic_swap_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_SWAP_X2)
+flat_atomic_cmpswap_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_CMPSWAP_X2)
+flat_atomic_add_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_ADD_X2)
+flat_atomic_sub_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_SUB_X2)
+flat_atomic_smin_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_SMIN_X2)
+flat_atomic_umin_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_UMIN_X2)
+flat_atomic_smax_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_SMAX_X2)
+flat_atomic_umax_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_UMAX_X2)
+flat_atomic_and_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_AND_X2)
+flat_atomic_or_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_OR_X2)
+flat_atomic_xor_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_XOR_X2)
+flat_atomic_inc_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_INC_X2)
+flat_atomic_dec_x2 = functools.partial(FLAT, FLATOp.FLAT_ATOMIC_DEC_X2)
+cdna4 = functools.partial(FLAT, FLATOp.CDNA4)
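The global_* helpers that follow reuse the FLAT format with seg=2 pre-bound (seg is bits[15:14] in the FLAT class above), so a helper call only has to fill the remaining fields. A usage sketch; the keyword-argument calling convention is an assumption based on the FLAT field names, not confirmed by this diff:

from extra.assembly.amd.autogen.cdna4 import flat_load_dword, v, s

# op is already bound by functools.partial (and seg for global_* helpers);
# the call supplies the remaining FLAT fields by name
inst = flat_load_dword(vdst=v[0], addr=v[2], saddr=s[4], offset=16)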
functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_DWORDX3, seg=2) +global_load_dwordx4 = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_DWORDX4, seg=2) +global_store_byte = functools.partial(FLAT, GLOBALOp.GLOBAL_STORE_BYTE, seg=2) +global_store_byte_d16_hi = functools.partial(FLAT, GLOBALOp.GLOBAL_STORE_BYTE_D16_HI, seg=2) +global_store_short = functools.partial(FLAT, GLOBALOp.GLOBAL_STORE_SHORT, seg=2) +global_store_short_d16_hi = functools.partial(FLAT, GLOBALOp.GLOBAL_STORE_SHORT_D16_HI, seg=2) +global_store_dword = functools.partial(FLAT, GLOBALOp.GLOBAL_STORE_DWORD, seg=2) +global_store_dwordx2 = functools.partial(FLAT, GLOBALOp.GLOBAL_STORE_DWORDX2, seg=2) +global_store_dwordx3 = functools.partial(FLAT, GLOBALOp.GLOBAL_STORE_DWORDX3, seg=2) +global_store_dwordx4 = functools.partial(FLAT, GLOBALOp.GLOBAL_STORE_DWORDX4, seg=2) +global_load_ubyte_d16 = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_UBYTE_D16, seg=2) +global_load_ubyte_d16_hi = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_UBYTE_D16_HI, seg=2) +global_load_sbyte_d16 = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_SBYTE_D16, seg=2) +global_load_sbyte_d16_hi = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_SBYTE_D16_HI, seg=2) +global_load_short_d16 = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_SHORT_D16, seg=2) +global_load_short_d16_hi = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_SHORT_D16_HI, seg=2) +global_load_lds_ubyte = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_LDS_UBYTE, seg=2) +global_load_lds_sbyte = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_LDS_SBYTE, seg=2) +global_load_lds_ushort = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_LDS_USHORT, seg=2) +global_load_lds_sshort = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_LDS_SSHORT, seg=2) +global_load_lds_dword = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_LDS_DWORD, seg=2) +global_atomic_swap = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_SWAP, seg=2) +global_atomic_cmpswap = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_CMPSWAP, seg=2) +global_atomic_add = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_ADD, seg=2) +global_atomic_sub = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_SUB, seg=2) +global_atomic_smin = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_SMIN, seg=2) +global_atomic_umin = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_UMIN, seg=2) +global_atomic_smax = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_SMAX, seg=2) +global_atomic_umax = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_UMAX, seg=2) +global_atomic_and = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_AND, seg=2) +global_atomic_or = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_OR, seg=2) +global_atomic_xor = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_XOR, seg=2) +global_atomic_inc = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_INC, seg=2) +global_atomic_dec = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_DEC, seg=2) +global_atomic_add_f32 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_ADD_F32, seg=2) +global_atomic_pk_add_f16 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_PK_ADD_F16, seg=2) +global_atomic_add_f64 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_ADD_F64, seg=2) +global_atomic_min_f64 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_MIN_F64, seg=2) +global_atomic_max_f64 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_MAX_F64, seg=2) +global_atomic_pk_add_bf16 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_PK_ADD_BF16, seg=2) +global_atomic_swap_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_SWAP_X2, seg=2) +global_atomic_cmpswap_x2 = 
functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_CMPSWAP_X2, seg=2) +global_atomic_add_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_ADD_X2, seg=2) +global_atomic_sub_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_SUB_X2, seg=2) +global_atomic_smin_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_SMIN_X2, seg=2) +global_atomic_umin_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_UMIN_X2, seg=2) +global_atomic_smax_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_SMAX_X2, seg=2) +global_atomic_umax_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_UMAX_X2, seg=2) +global_atomic_and_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_AND_X2, seg=2) +global_atomic_or_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_OR_X2, seg=2) +global_atomic_xor_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_XOR_X2, seg=2) +global_atomic_inc_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_INC_X2, seg=2) +global_atomic_dec_x2 = functools.partial(FLAT, GLOBALOp.GLOBAL_ATOMIC_DEC_X2, seg=2) +global_load_lds_dwordx4 = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_LDS_DWORDX4, seg=2) +global_load_lds_dwordx3 = functools.partial(FLAT, GLOBALOp.GLOBAL_LOAD_LDS_DWORDX3, seg=2) +cdna4 = functools.partial(FLAT, GLOBALOp.CDNA4, seg=2) +tbuffer_load_format_x = functools.partial(MTBUF, MTBUFOp.TBUFFER_LOAD_FORMAT_X) +tbuffer_load_format_xy = functools.partial(MTBUF, MTBUFOp.TBUFFER_LOAD_FORMAT_XY) +tbuffer_load_format_xyz = functools.partial(MTBUF, MTBUFOp.TBUFFER_LOAD_FORMAT_XYZ) +tbuffer_load_format_xyzw = functools.partial(MTBUF, MTBUFOp.TBUFFER_LOAD_FORMAT_XYZW) +tbuffer_store_format_x = functools.partial(MTBUF, MTBUFOp.TBUFFER_STORE_FORMAT_X) +tbuffer_store_format_xy = functools.partial(MTBUF, MTBUFOp.TBUFFER_STORE_FORMAT_XY) +tbuffer_store_format_xyz = functools.partial(MTBUF, MTBUFOp.TBUFFER_STORE_FORMAT_XYZ) +tbuffer_store_format_xyzw = functools.partial(MTBUF, MTBUFOp.TBUFFER_STORE_FORMAT_XYZW) +tbuffer_load_format_d16_x = functools.partial(MTBUF, MTBUFOp.TBUFFER_LOAD_FORMAT_D16_X) +tbuffer_load_format_d16_xy = functools.partial(MTBUF, MTBUFOp.TBUFFER_LOAD_FORMAT_D16_XY) +tbuffer_load_format_d16_xyz = functools.partial(MTBUF, MTBUFOp.TBUFFER_LOAD_FORMAT_D16_XYZ) +tbuffer_load_format_d16_xyzw = functools.partial(MTBUF, MTBUFOp.TBUFFER_LOAD_FORMAT_D16_XYZW) +tbuffer_store_format_d16_x = functools.partial(MTBUF, MTBUFOp.TBUFFER_STORE_FORMAT_D16_X) +tbuffer_store_format_d16_xy = functools.partial(MTBUF, MTBUFOp.TBUFFER_STORE_FORMAT_D16_XY) +tbuffer_store_format_d16_xyz = functools.partial(MTBUF, MTBUFOp.TBUFFER_STORE_FORMAT_D16_XYZ) +tbuffer_store_format_d16_xyzw = functools.partial(MTBUF, MTBUFOp.TBUFFER_STORE_FORMAT_D16_XYZW) +buffer_load_format_x = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_FORMAT_X) +buffer_load_format_xy = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_FORMAT_XY) +buffer_load_format_xyz = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_FORMAT_XYZ) +buffer_load_format_xyzw = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_FORMAT_XYZW) +buffer_store_format_x = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_FORMAT_X) +buffer_store_format_xy = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_FORMAT_XY) +buffer_store_format_xyz = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_FORMAT_XYZ) +buffer_store_format_xyzw = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_FORMAT_XYZW) +buffer_load_format_d16_x = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_FORMAT_D16_X) +buffer_load_format_d16_xy = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_FORMAT_D16_XY) +buffer_load_format_d16_xyz = 
functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_FORMAT_D16_XYZ) +buffer_load_format_d16_xyzw = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_FORMAT_D16_XYZW) +buffer_store_format_d16_x = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_FORMAT_D16_X) +buffer_store_format_d16_xy = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_FORMAT_D16_XY) +buffer_store_format_d16_xyz = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_FORMAT_D16_XYZ) +buffer_store_format_d16_xyzw = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_FORMAT_D16_XYZW) +buffer_load_ubyte = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_UBYTE) +buffer_load_sbyte = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_SBYTE) +buffer_load_ushort = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_USHORT) +buffer_load_sshort = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_SSHORT) +buffer_load_dword = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_DWORD) +buffer_load_dwordx2 = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_DWORDX2) +buffer_load_dwordx3 = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_DWORDX3) +buffer_load_dwordx4 = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_DWORDX4) +buffer_store_byte = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_BYTE) +buffer_store_byte_d16_hi = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_BYTE_D16_HI) +buffer_store_short = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_SHORT) +buffer_store_short_d16_hi = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_SHORT_D16_HI) +buffer_store_dword = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_DWORD) +buffer_store_dwordx2 = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_DWORDX2) +buffer_store_dwordx3 = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_DWORDX3) +buffer_store_dwordx4 = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_DWORDX4) +buffer_load_ubyte_d16 = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_UBYTE_D16) +buffer_load_ubyte_d16_hi = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_UBYTE_D16_HI) +buffer_load_sbyte_d16 = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_SBYTE_D16) +buffer_load_sbyte_d16_hi = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_SBYTE_D16_HI) +buffer_load_short_d16 = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_SHORT_D16) +buffer_load_short_d16_hi = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_SHORT_D16_HI) +buffer_load_format_d16_hi_x = functools.partial(MUBUF, MUBUFOp.BUFFER_LOAD_FORMAT_D16_HI_X) +buffer_store_format_d16_hi_x = functools.partial(MUBUF, MUBUFOp.BUFFER_STORE_FORMAT_D16_HI_X) +buffer_wbl2 = functools.partial(MUBUF, MUBUFOp.BUFFER_WBL2) +buffer_inv = functools.partial(MUBUF, MUBUFOp.BUFFER_INV) +buffer_atomic_swap = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_SWAP) +buffer_atomic_cmpswap = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_CMPSWAP) +buffer_atomic_add = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_ADD) +buffer_atomic_sub = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_SUB) +buffer_atomic_smin = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_SMIN) +buffer_atomic_umin = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_UMIN) +buffer_atomic_smax = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_SMAX) +buffer_atomic_umax = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_UMAX) +buffer_atomic_and = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_AND) +buffer_atomic_or = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_OR) +buffer_atomic_xor = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_XOR) +buffer_atomic_inc = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_INC) +buffer_atomic_dec = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_DEC) 
+buffer_atomic_add_f32 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_ADD_F32) +buffer_atomic_pk_add_f16 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_PK_ADD_F16) +buffer_atomic_add_f64 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_ADD_F64) +buffer_atomic_min_f64 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_MIN_F64) +buffer_atomic_max_f64 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_MAX_F64) +buffer_atomic_pk_add_bf16 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_PK_ADD_BF16) +buffer_atomic_swap_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_SWAP_X2) +buffer_atomic_cmpswap_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_CMPSWAP_X2) +buffer_atomic_add_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_ADD_X2) +buffer_atomic_sub_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_SUB_X2) +buffer_atomic_smin_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_SMIN_X2) +buffer_atomic_umin_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_UMIN_X2) +buffer_atomic_smax_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_SMAX_X2) +buffer_atomic_umax_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_UMAX_X2) +buffer_atomic_and_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_AND_X2) +buffer_atomic_or_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_OR_X2) +buffer_atomic_xor_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_XOR_X2) +buffer_atomic_inc_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_INC_X2) +buffer_atomic_dec_x2 = functools.partial(MUBUF, MUBUFOp.BUFFER_ATOMIC_DEC_X2) +cdna4 = functools.partial(MUBUF, MUBUFOp.CDNA4) +scratch_load_ubyte = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_UBYTE, seg=2) +scratch_load_sbyte = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_SBYTE, seg=2) +scratch_load_ushort = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_USHORT, seg=2) +scratch_load_sshort = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_SSHORT, seg=2) +scratch_load_dword = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_DWORD, seg=2) +scratch_load_dwordx2 = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_DWORDX2, seg=2) +scratch_load_dwordx3 = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_DWORDX3, seg=2) +scratch_load_dwordx4 = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_DWORDX4, seg=2) +scratch_store_byte = functools.partial(FLAT, SCRATCHOp.SCRATCH_STORE_BYTE, seg=2) +scratch_store_byte_d16_hi = functools.partial(FLAT, SCRATCHOp.SCRATCH_STORE_BYTE_D16_HI, seg=2) +scratch_store_short = functools.partial(FLAT, SCRATCHOp.SCRATCH_STORE_SHORT, seg=2) +scratch_store_short_d16_hi = functools.partial(FLAT, SCRATCHOp.SCRATCH_STORE_SHORT_D16_HI, seg=2) +scratch_store_dword = functools.partial(FLAT, SCRATCHOp.SCRATCH_STORE_DWORD, seg=2) +scratch_store_dwordx2 = functools.partial(FLAT, SCRATCHOp.SCRATCH_STORE_DWORDX2, seg=2) +scratch_store_dwordx3 = functools.partial(FLAT, SCRATCHOp.SCRATCH_STORE_DWORDX3, seg=2) +scratch_store_dwordx4 = functools.partial(FLAT, SCRATCHOp.SCRATCH_STORE_DWORDX4, seg=2) +scratch_load_ubyte_d16 = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_UBYTE_D16, seg=2) +scratch_load_ubyte_d16_hi = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_UBYTE_D16_HI, seg=2) +scratch_load_sbyte_d16 = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_SBYTE_D16, seg=2) +scratch_load_sbyte_d16_hi = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_SBYTE_D16_HI, seg=2) +scratch_load_short_d16 = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_SHORT_D16, seg=2) +scratch_load_short_d16_hi = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_SHORT_D16_HI, seg=2) 
+scratch_load_lds_ubyte = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_LDS_UBYTE, seg=2) +scratch_load_lds_sbyte = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_LDS_SBYTE, seg=2) +scratch_load_lds_ushort = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_LDS_USHORT, seg=2) +scratch_load_lds_sshort = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_LDS_SSHORT, seg=2) +scratch_load_lds_dword = functools.partial(FLAT, SCRATCHOp.SCRATCH_LOAD_LDS_DWORD, seg=2) +s_load_dword = functools.partial(SMEM, SMEMOp.S_LOAD_DWORD) +s_load_dwordx2 = functools.partial(SMEM, SMEMOp.S_LOAD_DWORDX2) +s_load_dwordx4 = functools.partial(SMEM, SMEMOp.S_LOAD_DWORDX4) +s_load_dwordx8 = functools.partial(SMEM, SMEMOp.S_LOAD_DWORDX8) +s_load_dwordx16 = functools.partial(SMEM, SMEMOp.S_LOAD_DWORDX16) +s_scratch_load_dword = functools.partial(SMEM, SMEMOp.S_SCRATCH_LOAD_DWORD) +s_scratch_load_dwordx2 = functools.partial(SMEM, SMEMOp.S_SCRATCH_LOAD_DWORDX2) +s_scratch_load_dwordx4 = functools.partial(SMEM, SMEMOp.S_SCRATCH_LOAD_DWORDX4) +s_buffer_load_dword = functools.partial(SMEM, SMEMOp.S_BUFFER_LOAD_DWORD) +s_buffer_load_dwordx2 = functools.partial(SMEM, SMEMOp.S_BUFFER_LOAD_DWORDX2) +s_buffer_load_dwordx4 = functools.partial(SMEM, SMEMOp.S_BUFFER_LOAD_DWORDX4) +s_buffer_load_dwordx8 = functools.partial(SMEM, SMEMOp.S_BUFFER_LOAD_DWORDX8) +s_buffer_load_dwordx16 = functools.partial(SMEM, SMEMOp.S_BUFFER_LOAD_DWORDX16) +s_store_dword = functools.partial(SMEM, SMEMOp.S_STORE_DWORD) +s_store_dwordx2 = functools.partial(SMEM, SMEMOp.S_STORE_DWORDX2) +s_store_dwordx4 = functools.partial(SMEM, SMEMOp.S_STORE_DWORDX4) +s_scratch_store_dword = functools.partial(SMEM, SMEMOp.S_SCRATCH_STORE_DWORD) +s_scratch_store_dwordx2 = functools.partial(SMEM, SMEMOp.S_SCRATCH_STORE_DWORDX2) +s_scratch_store_dwordx4 = functools.partial(SMEM, SMEMOp.S_SCRATCH_STORE_DWORDX4) +s_buffer_store_dword = functools.partial(SMEM, SMEMOp.S_BUFFER_STORE_DWORD) +s_buffer_store_dwordx2 = functools.partial(SMEM, SMEMOp.S_BUFFER_STORE_DWORDX2) +s_buffer_store_dwordx4 = functools.partial(SMEM, SMEMOp.S_BUFFER_STORE_DWORDX4) +s_dcache_inv = functools.partial(SMEM, SMEMOp.S_DCACHE_INV) +s_dcache_wb = functools.partial(SMEM, SMEMOp.S_DCACHE_WB) +s_dcache_inv_vol = functools.partial(SMEM, SMEMOp.S_DCACHE_INV_VOL) +s_dcache_wb_vol = functools.partial(SMEM, SMEMOp.S_DCACHE_WB_VOL) +s_memtime = functools.partial(SMEM, SMEMOp.S_MEMTIME) +s_memrealtime = functools.partial(SMEM, SMEMOp.S_MEMREALTIME) +s_dcache_discard = functools.partial(SMEM, SMEMOp.S_DCACHE_DISCARD) +s_dcache_discard_x2 = functools.partial(SMEM, SMEMOp.S_DCACHE_DISCARD_X2) +s_buffer_atomic_swap = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_SWAP) +s_buffer_atomic_cmpswap = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_CMPSWAP) +s_buffer_atomic_add = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_ADD) +s_buffer_atomic_sub = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_SUB) +s_buffer_atomic_smin = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_SMIN) +s_buffer_atomic_umin = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_UMIN) +s_buffer_atomic_smax = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_SMAX) +s_buffer_atomic_umax = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_UMAX) +s_buffer_atomic_and = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_AND) +s_buffer_atomic_or = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_OR) +s_buffer_atomic_xor = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_XOR) +s_buffer_atomic_inc = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_INC) +s_buffer_atomic_dec = 
functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_DEC) +s_buffer_atomic_swap_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_SWAP_X2) +s_buffer_atomic_cmpswap_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_CMPSWAP_X2) +s_buffer_atomic_add_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_ADD_X2) +s_buffer_atomic_sub_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_SUB_X2) +s_buffer_atomic_smin_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_SMIN_X2) +s_buffer_atomic_umin_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_UMIN_X2) +s_buffer_atomic_smax_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_SMAX_X2) +s_buffer_atomic_umax_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_UMAX_X2) +s_buffer_atomic_and_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_AND_X2) +s_buffer_atomic_or_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_OR_X2) +s_buffer_atomic_xor_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_XOR_X2) +s_buffer_atomic_inc_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_INC_X2) +s_buffer_atomic_dec_x2 = functools.partial(SMEM, SMEMOp.S_BUFFER_ATOMIC_DEC_X2) +s_atomic_swap = functools.partial(SMEM, SMEMOp.S_ATOMIC_SWAP) +s_atomic_cmpswap = functools.partial(SMEM, SMEMOp.S_ATOMIC_CMPSWAP) +s_atomic_add = functools.partial(SMEM, SMEMOp.S_ATOMIC_ADD) +s_atomic_sub = functools.partial(SMEM, SMEMOp.S_ATOMIC_SUB) +s_atomic_smin = functools.partial(SMEM, SMEMOp.S_ATOMIC_SMIN) +s_atomic_umin = functools.partial(SMEM, SMEMOp.S_ATOMIC_UMIN) +s_atomic_smax = functools.partial(SMEM, SMEMOp.S_ATOMIC_SMAX) +s_atomic_umax = functools.partial(SMEM, SMEMOp.S_ATOMIC_UMAX) +s_atomic_and = functools.partial(SMEM, SMEMOp.S_ATOMIC_AND) +s_atomic_or = functools.partial(SMEM, SMEMOp.S_ATOMIC_OR) +s_atomic_xor = functools.partial(SMEM, SMEMOp.S_ATOMIC_XOR) +s_atomic_inc = functools.partial(SMEM, SMEMOp.S_ATOMIC_INC) +s_atomic_dec = functools.partial(SMEM, SMEMOp.S_ATOMIC_DEC) +s_atomic_swap_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_SWAP_X2) +s_atomic_cmpswap_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_CMPSWAP_X2) +s_atomic_add_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_ADD_X2) +s_atomic_sub_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_SUB_X2) +s_atomic_smin_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_SMIN_X2) +s_atomic_umin_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_UMIN_X2) +s_atomic_smax_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_SMAX_X2) +s_atomic_umax_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_UMAX_X2) +s_atomic_and_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_AND_X2) +s_atomic_or_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_OR_X2) +s_atomic_xor_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_XOR_X2) +s_atomic_inc_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_INC_X2) +s_atomic_dec_x2 = functools.partial(SMEM, SMEMOp.S_ATOMIC_DEC_X2) +cdna4 = functools.partial(SMEM, SMEMOp.CDNA4) +s_mov_b32 = functools.partial(SOP1, SOP1Op.S_MOV_B32) +s_mov_b64 = functools.partial(SOP1, SOP1Op.S_MOV_B64) +s_cmov_b32 = functools.partial(SOP1, SOP1Op.S_CMOV_B32) +s_cmov_b64 = functools.partial(SOP1, SOP1Op.S_CMOV_B64) +s_not_b32 = functools.partial(SOP1, SOP1Op.S_NOT_B32) +s_not_b64 = functools.partial(SOP1, SOP1Op.S_NOT_B64) +s_wqm_b32 = functools.partial(SOP1, SOP1Op.S_WQM_B32) +s_wqm_b64 = functools.partial(SOP1, SOP1Op.S_WQM_B64) +s_brev_b32 = functools.partial(SOP1, SOP1Op.S_BREV_B32) +s_brev_b64 = functools.partial(SOP1, SOP1Op.S_BREV_B64) +s_bcnt0_i32_b32 = functools.partial(SOP1, SOP1Op.S_BCNT0_I32_B32) +s_bcnt0_i32_b64 = functools.partial(SOP1, 
SOP1Op.S_BCNT0_I32_B64) +s_bcnt1_i32_b32 = functools.partial(SOP1, SOP1Op.S_BCNT1_I32_B32) +s_bcnt1_i32_b64 = functools.partial(SOP1, SOP1Op.S_BCNT1_I32_B64) +s_ff0_i32_b32 = functools.partial(SOP1, SOP1Op.S_FF0_I32_B32) +s_ff0_i32_b64 = functools.partial(SOP1, SOP1Op.S_FF0_I32_B64) +s_ff1_i32_b32 = functools.partial(SOP1, SOP1Op.S_FF1_I32_B32) +s_ff1_i32_b64 = functools.partial(SOP1, SOP1Op.S_FF1_I32_B64) +s_flbit_i32_b32 = functools.partial(SOP1, SOP1Op.S_FLBIT_I32_B32) +s_flbit_i32_b64 = functools.partial(SOP1, SOP1Op.S_FLBIT_I32_B64) +s_flbit_i32 = functools.partial(SOP1, SOP1Op.S_FLBIT_I32) +s_flbit_i32_i64 = functools.partial(SOP1, SOP1Op.S_FLBIT_I32_I64) +s_sext_i32_i8 = functools.partial(SOP1, SOP1Op.S_SEXT_I32_I8) +s_sext_i32_i16 = functools.partial(SOP1, SOP1Op.S_SEXT_I32_I16) +s_bitset0_b32 = functools.partial(SOP1, SOP1Op.S_BITSET0_B32) +s_bitset0_b64 = functools.partial(SOP1, SOP1Op.S_BITSET0_B64) +s_bitset1_b32 = functools.partial(SOP1, SOP1Op.S_BITSET1_B32) +s_bitset1_b64 = functools.partial(SOP1, SOP1Op.S_BITSET1_B64) +s_getpc_b64 = functools.partial(SOP1, SOP1Op.S_GETPC_B64) +s_setpc_b64 = functools.partial(SOP1, SOP1Op.S_SETPC_B64) +s_swappc_b64 = functools.partial(SOP1, SOP1Op.S_SWAPPC_B64) +s_rfe_b64 = functools.partial(SOP1, SOP1Op.S_RFE_B64) +s_and_saveexec_b64 = functools.partial(SOP1, SOP1Op.S_AND_SAVEEXEC_B64) +s_or_saveexec_b64 = functools.partial(SOP1, SOP1Op.S_OR_SAVEEXEC_B64) +s_xor_saveexec_b64 = functools.partial(SOP1, SOP1Op.S_XOR_SAVEEXEC_B64) +s_andn2_saveexec_b64 = functools.partial(SOP1, SOP1Op.S_ANDN2_SAVEEXEC_B64) +s_orn2_saveexec_b64 = functools.partial(SOP1, SOP1Op.S_ORN2_SAVEEXEC_B64) +s_nand_saveexec_b64 = functools.partial(SOP1, SOP1Op.S_NAND_SAVEEXEC_B64) +s_nor_saveexec_b64 = functools.partial(SOP1, SOP1Op.S_NOR_SAVEEXEC_B64) +s_xnor_saveexec_b64 = functools.partial(SOP1, SOP1Op.S_XNOR_SAVEEXEC_B64) +s_quadmask_b32 = functools.partial(SOP1, SOP1Op.S_QUADMASK_B32) +s_quadmask_b64 = functools.partial(SOP1, SOP1Op.S_QUADMASK_B64) +s_movrels_b32 = functools.partial(SOP1, SOP1Op.S_MOVRELS_B32) +s_movrels_b64 = functools.partial(SOP1, SOP1Op.S_MOVRELS_B64) +s_movreld_b32 = functools.partial(SOP1, SOP1Op.S_MOVRELD_B32) +s_movreld_b64 = functools.partial(SOP1, SOP1Op.S_MOVRELD_B64) +s_cbranch_join = functools.partial(SOP1, SOP1Op.S_CBRANCH_JOIN) +s_abs_i32 = functools.partial(SOP1, SOP1Op.S_ABS_I32) +s_set_gpr_idx_idx = functools.partial(SOP1, SOP1Op.S_SET_GPR_IDX_IDX) +s_andn1_saveexec_b64 = functools.partial(SOP1, SOP1Op.S_ANDN1_SAVEEXEC_B64) +s_orn1_saveexec_b64 = functools.partial(SOP1, SOP1Op.S_ORN1_SAVEEXEC_B64) +s_andn1_wrexec_b64 = functools.partial(SOP1, SOP1Op.S_ANDN1_WREXEC_B64) +s_andn2_wrexec_b64 = functools.partial(SOP1, SOP1Op.S_ANDN2_WREXEC_B64) +s_bitreplicate_b64_b32 = functools.partial(SOP1, SOP1Op.S_BITREPLICATE_B64_B32) +cdna4 = functools.partial(SOP1, SOP1Op.CDNA4) +s_add_u32 = functools.partial(SOP2, SOP2Op.S_ADD_U32) +s_sub_u32 = functools.partial(SOP2, SOP2Op.S_SUB_U32) +s_add_i32 = functools.partial(SOP2, SOP2Op.S_ADD_I32) +s_sub_i32 = functools.partial(SOP2, SOP2Op.S_SUB_I32) +s_addc_u32 = functools.partial(SOP2, SOP2Op.S_ADDC_U32) +s_subb_u32 = functools.partial(SOP2, SOP2Op.S_SUBB_U32) +s_min_i32 = functools.partial(SOP2, SOP2Op.S_MIN_I32) +s_min_u32 = functools.partial(SOP2, SOP2Op.S_MIN_U32) +s_max_i32 = functools.partial(SOP2, SOP2Op.S_MAX_I32) +s_max_u32 = functools.partial(SOP2, SOP2Op.S_MAX_U32) +s_cselect_b32 = functools.partial(SOP2, SOP2Op.S_CSELECT_B32) +s_cselect_b64 = functools.partial(SOP2, SOP2Op.S_CSELECT_B64) 
+s_and_b32 = functools.partial(SOP2, SOP2Op.S_AND_B32) +s_and_b64 = functools.partial(SOP2, SOP2Op.S_AND_B64) +s_or_b32 = functools.partial(SOP2, SOP2Op.S_OR_B32) +s_or_b64 = functools.partial(SOP2, SOP2Op.S_OR_B64) +s_xor_b32 = functools.partial(SOP2, SOP2Op.S_XOR_B32) +s_xor_b64 = functools.partial(SOP2, SOP2Op.S_XOR_B64) +s_andn2_b32 = functools.partial(SOP2, SOP2Op.S_ANDN2_B32) +s_andn2_b64 = functools.partial(SOP2, SOP2Op.S_ANDN2_B64) +s_orn2_b32 = functools.partial(SOP2, SOP2Op.S_ORN2_B32) +s_orn2_b64 = functools.partial(SOP2, SOP2Op.S_ORN2_B64) +s_nand_b32 = functools.partial(SOP2, SOP2Op.S_NAND_B32) +s_nand_b64 = functools.partial(SOP2, SOP2Op.S_NAND_B64) +s_nor_b32 = functools.partial(SOP2, SOP2Op.S_NOR_B32) +s_nor_b64 = functools.partial(SOP2, SOP2Op.S_NOR_B64) +s_xnor_b32 = functools.partial(SOP2, SOP2Op.S_XNOR_B32) +s_xnor_b64 = functools.partial(SOP2, SOP2Op.S_XNOR_B64) +s_lshl_b32 = functools.partial(SOP2, SOP2Op.S_LSHL_B32) +s_lshl_b64 = functools.partial(SOP2, SOP2Op.S_LSHL_B64) +s_lshr_b32 = functools.partial(SOP2, SOP2Op.S_LSHR_B32) +s_lshr_b64 = functools.partial(SOP2, SOP2Op.S_LSHR_B64) +s_ashr_i32 = functools.partial(SOP2, SOP2Op.S_ASHR_I32) +s_ashr_i64 = functools.partial(SOP2, SOP2Op.S_ASHR_I64) +s_bfm_b32 = functools.partial(SOP2, SOP2Op.S_BFM_B32) +s_bfm_b64 = functools.partial(SOP2, SOP2Op.S_BFM_B64) +s_mul_i32 = functools.partial(SOP2, SOP2Op.S_MUL_I32) +s_bfe_u32 = functools.partial(SOP2, SOP2Op.S_BFE_U32) +s_bfe_i32 = functools.partial(SOP2, SOP2Op.S_BFE_I32) +s_bfe_u64 = functools.partial(SOP2, SOP2Op.S_BFE_U64) +s_bfe_i64 = functools.partial(SOP2, SOP2Op.S_BFE_I64) +s_cbranch_g_fork = functools.partial(SOP2, SOP2Op.S_CBRANCH_G_FORK) +s_absdiff_i32 = functools.partial(SOP2, SOP2Op.S_ABSDIFF_I32) +s_mul_hi_u32 = functools.partial(SOP2, SOP2Op.S_MUL_HI_U32) +s_mul_hi_i32 = functools.partial(SOP2, SOP2Op.S_MUL_HI_I32) +s_lshl1_add_u32 = functools.partial(SOP2, SOP2Op.S_LSHL1_ADD_U32) +s_lshl2_add_u32 = functools.partial(SOP2, SOP2Op.S_LSHL2_ADD_U32) +s_lshl3_add_u32 = functools.partial(SOP2, SOP2Op.S_LSHL3_ADD_U32) +s_lshl4_add_u32 = functools.partial(SOP2, SOP2Op.S_LSHL4_ADD_U32) +s_pack_ll_b32_b16 = functools.partial(SOP2, SOP2Op.S_PACK_LL_B32_B16) +s_pack_lh_b32_b16 = functools.partial(SOP2, SOP2Op.S_PACK_LH_B32_B16) +s_pack_hh_b32_b16 = functools.partial(SOP2, SOP2Op.S_PACK_HH_B32_B16) +cdna4 = functools.partial(SOP2, SOP2Op.CDNA4) +s_cmp_eq_i32 = functools.partial(SOPC, SOPCOp.S_CMP_EQ_I32) +s_cmp_lg_i32 = functools.partial(SOPC, SOPCOp.S_CMP_LG_I32) +s_cmp_gt_i32 = functools.partial(SOPC, SOPCOp.S_CMP_GT_I32) +s_cmp_ge_i32 = functools.partial(SOPC, SOPCOp.S_CMP_GE_I32) +s_cmp_lt_i32 = functools.partial(SOPC, SOPCOp.S_CMP_LT_I32) +s_cmp_le_i32 = functools.partial(SOPC, SOPCOp.S_CMP_LE_I32) +s_cmp_eq_u32 = functools.partial(SOPC, SOPCOp.S_CMP_EQ_U32) +s_cmp_lg_u32 = functools.partial(SOPC, SOPCOp.S_CMP_LG_U32) +s_cmp_gt_u32 = functools.partial(SOPC, SOPCOp.S_CMP_GT_U32) +s_cmp_ge_u32 = functools.partial(SOPC, SOPCOp.S_CMP_GE_U32) +s_cmp_lt_u32 = functools.partial(SOPC, SOPCOp.S_CMP_LT_U32) +s_cmp_le_u32 = functools.partial(SOPC, SOPCOp.S_CMP_LE_U32) +s_bitcmp0_b32 = functools.partial(SOPC, SOPCOp.S_BITCMP0_B32) +s_bitcmp1_b32 = functools.partial(SOPC, SOPCOp.S_BITCMP1_B32) +s_bitcmp0_b64 = functools.partial(SOPC, SOPCOp.S_BITCMP0_B64) +s_bitcmp1_b64 = functools.partial(SOPC, SOPCOp.S_BITCMP1_B64) +s_setvskip = functools.partial(SOPC, SOPCOp.S_SETVSKIP) +s_set_gpr_idx_on = functools.partial(SOPC, SOPCOp.S_SET_GPR_IDX_ON) +s_cmp_eq_u64 = functools.partial(SOPC, 
SOPCOp.S_CMP_EQ_U64) +s_cmp_lg_u64 = functools.partial(SOPC, SOPCOp.S_CMP_LG_U64) +cdna4 = functools.partial(SOPC, SOPCOp.CDNA4) +s_movk_i32 = functools.partial(SOPK, SOPKOp.S_MOVK_I32) +s_cmovk_i32 = functools.partial(SOPK, SOPKOp.S_CMOVK_I32) +s_cmpk_eq_i32 = functools.partial(SOPK, SOPKOp.S_CMPK_EQ_I32) +s_cmpk_lg_i32 = functools.partial(SOPK, SOPKOp.S_CMPK_LG_I32) +s_cmpk_gt_i32 = functools.partial(SOPK, SOPKOp.S_CMPK_GT_I32) +s_cmpk_ge_i32 = functools.partial(SOPK, SOPKOp.S_CMPK_GE_I32) +s_cmpk_lt_i32 = functools.partial(SOPK, SOPKOp.S_CMPK_LT_I32) +s_cmpk_le_i32 = functools.partial(SOPK, SOPKOp.S_CMPK_LE_I32) +s_cmpk_eq_u32 = functools.partial(SOPK, SOPKOp.S_CMPK_EQ_U32) +s_cmpk_lg_u32 = functools.partial(SOPK, SOPKOp.S_CMPK_LG_U32) +s_cmpk_gt_u32 = functools.partial(SOPK, SOPKOp.S_CMPK_GT_U32) +s_cmpk_ge_u32 = functools.partial(SOPK, SOPKOp.S_CMPK_GE_U32) +s_cmpk_lt_u32 = functools.partial(SOPK, SOPKOp.S_CMPK_LT_U32) +s_cmpk_le_u32 = functools.partial(SOPK, SOPKOp.S_CMPK_LE_U32) +s_addk_i32 = functools.partial(SOPK, SOPKOp.S_ADDK_I32) +s_mulk_i32 = functools.partial(SOPK, SOPKOp.S_MULK_I32) +s_cbranch_i_fork = functools.partial(SOPK, SOPKOp.S_CBRANCH_I_FORK) +s_getreg_b32 = functools.partial(SOPK, SOPKOp.S_GETREG_B32) +s_setreg_b32 = functools.partial(SOPK, SOPKOp.S_SETREG_B32) +s_setreg_imm32_b32 = functools.partial(SOPK, SOPKOp.S_SETREG_IMM32_B32) +s_call_b64 = functools.partial(SOPK, SOPKOp.S_CALL_B64) +s_nop = functools.partial(SOPP, SOPPOp.S_NOP) +s_endpgm = functools.partial(SOPP, SOPPOp.S_ENDPGM) +s_branch = functools.partial(SOPP, SOPPOp.S_BRANCH) +s_wakeup = functools.partial(SOPP, SOPPOp.S_WAKEUP) +s_cbranch_scc0 = functools.partial(SOPP, SOPPOp.S_CBRANCH_SCC0) +s_cbranch_scc1 = functools.partial(SOPP, SOPPOp.S_CBRANCH_SCC1) +s_cbranch_vccz = functools.partial(SOPP, SOPPOp.S_CBRANCH_VCCZ) +s_cbranch_vccnz = functools.partial(SOPP, SOPPOp.S_CBRANCH_VCCNZ) +s_cbranch_execz = functools.partial(SOPP, SOPPOp.S_CBRANCH_EXECZ) +s_cbranch_execnz = functools.partial(SOPP, SOPPOp.S_CBRANCH_EXECNZ) +s_barrier = functools.partial(SOPP, SOPPOp.S_BARRIER) +s_setkill = functools.partial(SOPP, SOPPOp.S_SETKILL) +s_waitcnt = functools.partial(SOPP, SOPPOp.S_WAITCNT) +s_sethalt = functools.partial(SOPP, SOPPOp.S_SETHALT) +s_sleep = functools.partial(SOPP, SOPPOp.S_SLEEP) +s_setprio = functools.partial(SOPP, SOPPOp.S_SETPRIO) +s_sendmsg = functools.partial(SOPP, SOPPOp.S_SENDMSG) +s_sendmsghalt = functools.partial(SOPP, SOPPOp.S_SENDMSGHALT) +s_trap = functools.partial(SOPP, SOPPOp.S_TRAP) +s_icache_inv = functools.partial(SOPP, SOPPOp.S_ICACHE_INV) +s_incperflevel = functools.partial(SOPP, SOPPOp.S_INCPERFLEVEL) +s_decperflevel = functools.partial(SOPP, SOPPOp.S_DECPERFLEVEL) +s_ttracedata = functools.partial(SOPP, SOPPOp.S_TTRACEDATA) +s_cbranch_cdbgsys = functools.partial(SOPP, SOPPOp.S_CBRANCH_CDBGSYS) +s_cbranch_cdbguser = functools.partial(SOPP, SOPPOp.S_CBRANCH_CDBGUSER) +s_cbranch_cdbgsys_or_user = functools.partial(SOPP, SOPPOp.S_CBRANCH_CDBGSYS_OR_USER) +s_cbranch_cdbgsys_and_user = functools.partial(SOPP, SOPPOp.S_CBRANCH_CDBGSYS_AND_USER) +s_endpgm_saved = functools.partial(SOPP, SOPPOp.S_ENDPGM_SAVED) +s_set_gpr_idx_off = functools.partial(SOPP, SOPPOp.S_SET_GPR_IDX_OFF) +s_set_gpr_idx_mode = functools.partial(SOPP, SOPPOp.S_SET_GPR_IDX_MODE) +cdna4 = functools.partial(SOPP, SOPPOp.CDNA4) +v_nop_e32 = functools.partial(VOP1, VOP1Op.V_NOP) +v_mov_b32_e32 = functools.partial(VOP1, VOP1Op.V_MOV_B32) +v_readfirstlane_b32_e32 = functools.partial(VOP1, VOP1Op.V_READFIRSTLANE_B32) 
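+# --- editorial usage sketch (illustrative only; not emitted by lib.py) ---
+# Each name in this table binds an opcode into its encoding class via
+# functools.partial, so calling it with operands builds an instruction.
+# Assuming the encoding classes take the destination first and sources after,
+# as the explicit v_fmamk_f32_e32/v_fmaak_f32_e32 wrappers further down suggest:
+#   v_mov_b32_e32(v[0], v[1])  # would build VOP1(VOP1Op.V_MOV_B32, v[0], v[1])
+#   s_mov_b32(s[0], s[1])      # would build SOP1(SOP1Op.S_MOV_B32, s[0], s[1])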
+v_cvt_i32_f64_e32 = functools.partial(VOP1, VOP1Op.V_CVT_I32_F64) +v_cvt_f64_i32_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F64_I32) +v_cvt_f32_i32_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F32_I32) +v_cvt_f32_u32_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F32_U32) +v_cvt_u32_f32_e32 = functools.partial(VOP1, VOP1Op.V_CVT_U32_F32) +v_cvt_i32_f32_e32 = functools.partial(VOP1, VOP1Op.V_CVT_I32_F32) +v_cvt_f16_f32_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F16_F32) +v_cvt_f32_f16_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F32_F16) +v_cvt_rpi_i32_f32_e32 = functools.partial(VOP1, VOP1Op.V_CVT_RPI_I32_F32) +v_cvt_flr_i32_f32_e32 = functools.partial(VOP1, VOP1Op.V_CVT_FLR_I32_F32) +v_cvt_off_f32_i4_e32 = functools.partial(VOP1, VOP1Op.V_CVT_OFF_F32_I4) +v_cvt_f32_f64_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F32_F64) +v_cvt_f64_f32_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F64_F32) +v_cvt_f32_ubyte0_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F32_UBYTE0) +v_cvt_f32_ubyte1_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F32_UBYTE1) +v_cvt_f32_ubyte2_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F32_UBYTE2) +v_cvt_f32_ubyte3_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F32_UBYTE3) +v_cvt_u32_f64_e32 = functools.partial(VOP1, VOP1Op.V_CVT_U32_F64) +v_cvt_f64_u32_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F64_U32) +v_trunc_f64_e32 = functools.partial(VOP1, VOP1Op.V_TRUNC_F64) +v_ceil_f64_e32 = functools.partial(VOP1, VOP1Op.V_CEIL_F64) +v_rndne_f64_e32 = functools.partial(VOP1, VOP1Op.V_RNDNE_F64) +v_floor_f64_e32 = functools.partial(VOP1, VOP1Op.V_FLOOR_F64) +v_fract_f32_e32 = functools.partial(VOP1, VOP1Op.V_FRACT_F32) +v_trunc_f32_e32 = functools.partial(VOP1, VOP1Op.V_TRUNC_F32) +v_ceil_f32_e32 = functools.partial(VOP1, VOP1Op.V_CEIL_F32) +v_rndne_f32_e32 = functools.partial(VOP1, VOP1Op.V_RNDNE_F32) +v_floor_f32_e32 = functools.partial(VOP1, VOP1Op.V_FLOOR_F32) +v_exp_f32_e32 = functools.partial(VOP1, VOP1Op.V_EXP_F32) +v_log_f32_e32 = functools.partial(VOP1, VOP1Op.V_LOG_F32) +v_rcp_f32_e32 = functools.partial(VOP1, VOP1Op.V_RCP_F32) +v_rcp_iflag_f32_e32 = functools.partial(VOP1, VOP1Op.V_RCP_IFLAG_F32) +v_rsq_f32_e32 = functools.partial(VOP1, VOP1Op.V_RSQ_F32) +v_rcp_f64_e32 = functools.partial(VOP1, VOP1Op.V_RCP_F64) +v_rsq_f64_e32 = functools.partial(VOP1, VOP1Op.V_RSQ_F64) +v_sqrt_f32_e32 = functools.partial(VOP1, VOP1Op.V_SQRT_F32) +v_sqrt_f64_e32 = functools.partial(VOP1, VOP1Op.V_SQRT_F64) +v_sin_f32_e32 = functools.partial(VOP1, VOP1Op.V_SIN_F32) +v_cos_f32_e32 = functools.partial(VOP1, VOP1Op.V_COS_F32) +v_not_b32_e32 = functools.partial(VOP1, VOP1Op.V_NOT_B32) +v_bfrev_b32_e32 = functools.partial(VOP1, VOP1Op.V_BFREV_B32) +v_ffbh_u32_e32 = functools.partial(VOP1, VOP1Op.V_FFBH_U32) +v_ffbl_b32_e32 = functools.partial(VOP1, VOP1Op.V_FFBL_B32) +v_ffbh_i32_e32 = functools.partial(VOP1, VOP1Op.V_FFBH_I32) +v_frexp_exp_i32_f64_e32 = functools.partial(VOP1, VOP1Op.V_FREXP_EXP_I32_F64) +v_frexp_mant_f64_e32 = functools.partial(VOP1, VOP1Op.V_FREXP_MANT_F64) +v_fract_f64_e32 = functools.partial(VOP1, VOP1Op.V_FRACT_F64) +v_frexp_exp_i32_f32_e32 = functools.partial(VOP1, VOP1Op.V_FREXP_EXP_I32_F32) +v_frexp_mant_f32_e32 = functools.partial(VOP1, VOP1Op.V_FREXP_MANT_F32) +v_clrexcp_e32 = functools.partial(VOP1, VOP1Op.V_CLREXCP) +v_mov_b64_e32 = functools.partial(VOP1, VOP1Op.V_MOV_B64) +v_cvt_f16_u16_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F16_U16) +v_cvt_f16_i16_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F16_I16) +v_cvt_u16_f16_e32 = functools.partial(VOP1, VOP1Op.V_CVT_U16_F16) 
+v_cvt_i16_f16_e32 = functools.partial(VOP1, VOP1Op.V_CVT_I16_F16) +v_rcp_f16_e32 = functools.partial(VOP1, VOP1Op.V_RCP_F16) +v_sqrt_f16_e32 = functools.partial(VOP1, VOP1Op.V_SQRT_F16) +v_rsq_f16_e32 = functools.partial(VOP1, VOP1Op.V_RSQ_F16) +v_log_f16_e32 = functools.partial(VOP1, VOP1Op.V_LOG_F16) +v_exp_f16_e32 = functools.partial(VOP1, VOP1Op.V_EXP_F16) +v_frexp_mant_f16_e32 = functools.partial(VOP1, VOP1Op.V_FREXP_MANT_F16) +v_frexp_exp_i16_f16_e32 = functools.partial(VOP1, VOP1Op.V_FREXP_EXP_I16_F16) +v_floor_f16_e32 = functools.partial(VOP1, VOP1Op.V_FLOOR_F16) +v_ceil_f16_e32 = functools.partial(VOP1, VOP1Op.V_CEIL_F16) +v_trunc_f16_e32 = functools.partial(VOP1, VOP1Op.V_TRUNC_F16) +v_rndne_f16_e32 = functools.partial(VOP1, VOP1Op.V_RNDNE_F16) +v_fract_f16_e32 = functools.partial(VOP1, VOP1Op.V_FRACT_F16) +v_sin_f16_e32 = functools.partial(VOP1, VOP1Op.V_SIN_F16) +v_cos_f16_e32 = functools.partial(VOP1, VOP1Op.V_COS_F16) +v_cvt_norm_i16_f16_e32 = functools.partial(VOP1, VOP1Op.V_CVT_NORM_I16_F16) +v_cvt_norm_u16_f16_e32 = functools.partial(VOP1, VOP1Op.V_CVT_NORM_U16_F16) +v_sat_pk_u8_i16_e32 = functools.partial(VOP1, VOP1Op.V_SAT_PK_U8_I16) +v_swap_b32_e32 = functools.partial(VOP1, VOP1Op.V_SWAP_B32) +v_accvgpr_mov_b32_e32 = functools.partial(VOP1, VOP1Op.V_ACCVGPR_MOV_B32) +v_cvt_f32_fp8_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F32_FP8) +v_cvt_f32_bf8_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F32_BF8) +v_cvt_pk_f32_fp8_e32 = functools.partial(VOP1, VOP1Op.V_CVT_PK_F32_FP8) +v_cvt_pk_f32_bf8_e32 = functools.partial(VOP1, VOP1Op.V_CVT_PK_F32_BF8) +v_prng_b32_e32 = functools.partial(VOP1, VOP1Op.V_PRNG_B32) +v_permlane16_swap_b32_e32 = functools.partial(VOP1, VOP1Op.V_PERMLANE16_SWAP_B32) +v_permlane32_swap_b32_e32 = functools.partial(VOP1, VOP1Op.V_PERMLANE32_SWAP_B32) +v_cvt_f32_bf16_e32 = functools.partial(VOP1, VOP1Op.V_CVT_F32_BF16) +cdna4_e32 = functools.partial(VOP1, VOP1Op.CDNA4) +v_cndmask_b32_e32 = functools.partial(VOP2, VOP2Op.V_CNDMASK_B32) +v_add_f32_e32 = functools.partial(VOP2, VOP2Op.V_ADD_F32) +v_sub_f32_e32 = functools.partial(VOP2, VOP2Op.V_SUB_F32) +v_subrev_f32_e32 = functools.partial(VOP2, VOP2Op.V_SUBREV_F32) +v_fmac_f64_e32 = functools.partial(VOP2, VOP2Op.V_FMAC_F64) +v_mul_f32_e32 = functools.partial(VOP2, VOP2Op.V_MUL_F32) +v_mul_i32_i24_e32 = functools.partial(VOP2, VOP2Op.V_MUL_I32_I24) +v_mul_hi_i32_i24_e32 = functools.partial(VOP2, VOP2Op.V_MUL_HI_I32_I24) +v_mul_u32_u24_e32 = functools.partial(VOP2, VOP2Op.V_MUL_U32_U24) +v_mul_hi_u32_u24_e32 = functools.partial(VOP2, VOP2Op.V_MUL_HI_U32_U24) +v_min_f32_e32 = functools.partial(VOP2, VOP2Op.V_MIN_F32) +v_max_f32_e32 = functools.partial(VOP2, VOP2Op.V_MAX_F32) +v_min_i32_e32 = functools.partial(VOP2, VOP2Op.V_MIN_I32) +v_max_i32_e32 = functools.partial(VOP2, VOP2Op.V_MAX_I32) +v_min_u32_e32 = functools.partial(VOP2, VOP2Op.V_MIN_U32) +v_max_u32_e32 = functools.partial(VOP2, VOP2Op.V_MAX_U32) +v_lshrrev_b32_e32 = functools.partial(VOP2, VOP2Op.V_LSHRREV_B32) +v_ashrrev_i32_e32 = functools.partial(VOP2, VOP2Op.V_ASHRREV_I32) +v_lshlrev_b32_e32 = functools.partial(VOP2, VOP2Op.V_LSHLREV_B32) +v_and_b32_e32 = functools.partial(VOP2, VOP2Op.V_AND_B32) +v_or_b32_e32 = functools.partial(VOP2, VOP2Op.V_OR_B32) +v_xor_b32_e32 = functools.partial(VOP2, VOP2Op.V_XOR_B32) +v_dot2c_f32_bf16_e32 = functools.partial(VOP2, VOP2Op.V_DOT2C_F32_BF16) +def v_fmamk_f32_e32(vdst, src0, K, vsrc1): return VOP2(VOP2Op.V_FMAMK_F32, vdst, src0, vsrc1, literal=K) +def v_fmaak_f32_e32(vdst, src0, vsrc1, K): return 
VOP2(VOP2Op.V_FMAAK_F32, vdst, src0, vsrc1, literal=K) +v_add_co_u32_e32 = functools.partial(VOP2, VOP2Op.V_ADD_CO_U32) +v_sub_co_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUB_CO_U32) +v_subrev_co_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUBREV_CO_U32) +v_addc_co_u32_e32 = functools.partial(VOP2, VOP2Op.V_ADDC_CO_U32) +v_subb_co_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUBB_CO_U32) +v_subbrev_co_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUBBREV_CO_U32) +v_add_f16_e32 = functools.partial(VOP2, VOP2Op.V_ADD_F16) +v_sub_f16_e32 = functools.partial(VOP2, VOP2Op.V_SUB_F16) +v_subrev_f16_e32 = functools.partial(VOP2, VOP2Op.V_SUBREV_F16) +v_mul_f16_e32 = functools.partial(VOP2, VOP2Op.V_MUL_F16) +v_mac_f16_e32 = functools.partial(VOP2, VOP2Op.V_MAC_F16) +v_madmk_f16_e32 = functools.partial(VOP2, VOP2Op.V_MADMK_F16) +v_madak_f16_e32 = functools.partial(VOP2, VOP2Op.V_MADAK_F16) +v_add_u16_e32 = functools.partial(VOP2, VOP2Op.V_ADD_U16) +v_sub_u16_e32 = functools.partial(VOP2, VOP2Op.V_SUB_U16) +v_subrev_u16_e32 = functools.partial(VOP2, VOP2Op.V_SUBREV_U16) +v_mul_lo_u16_e32 = functools.partial(VOP2, VOP2Op.V_MUL_LO_U16) +v_lshlrev_b16_e32 = functools.partial(VOP2, VOP2Op.V_LSHLREV_B16) +v_lshrrev_b16_e32 = functools.partial(VOP2, VOP2Op.V_LSHRREV_B16) +v_ashrrev_i16_e32 = functools.partial(VOP2, VOP2Op.V_ASHRREV_I16) +v_max_f16_e32 = functools.partial(VOP2, VOP2Op.V_MAX_F16) +v_min_f16_e32 = functools.partial(VOP2, VOP2Op.V_MIN_F16) +v_max_u16_e32 = functools.partial(VOP2, VOP2Op.V_MAX_U16) +v_max_i16_e32 = functools.partial(VOP2, VOP2Op.V_MAX_I16) +v_min_u16_e32 = functools.partial(VOP2, VOP2Op.V_MIN_U16) +v_min_i16_e32 = functools.partial(VOP2, VOP2Op.V_MIN_I16) +v_ldexp_f16_e32 = functools.partial(VOP2, VOP2Op.V_LDEXP_F16) +v_add_u32_e32 = functools.partial(VOP2, VOP2Op.V_ADD_U32) +v_sub_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUB_U32) +v_subrev_u32_e32 = functools.partial(VOP2, VOP2Op.V_SUBREV_U32) +v_dot2c_f32_f16_e32 = functools.partial(VOP2, VOP2Op.V_DOT2C_F32_F16) +v_dot2c_i32_i16_e32 = functools.partial(VOP2, VOP2Op.V_DOT2C_I32_I16) +v_dot4c_i32_i8_e32 = functools.partial(VOP2, VOP2Op.V_DOT4C_I32_I8) +v_dot8c_i32_i4_e32 = functools.partial(VOP2, VOP2Op.V_DOT8C_I32_I4) +v_fmac_f32_e32 = functools.partial(VOP2, VOP2Op.V_FMAC_F32) +v_pk_fmac_f16_e32 = functools.partial(VOP2, VOP2Op.V_PK_FMAC_F16) +v_xnor_b32_e32 = functools.partial(VOP2, VOP2Op.V_XNOR_B32) +cdna4_e32 = functools.partial(VOP2, VOP2Op.CDNA4) +v_cmp_class_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_CLASS_F32) +v_cmpx_class_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_CLASS_F32) +v_cmp_class_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_CLASS_F64) +v_cmpx_class_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_CLASS_F64) +v_cmp_class_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_CLASS_F16) +v_cmpx_class_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_CLASS_F16) +v_cmp_f_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_F_F16) +v_cmp_lt_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_LT_F16) +v_cmp_eq_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_EQ_F16) +v_cmp_le_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_LE_F16) +v_cmp_gt_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_GT_F16) +v_cmp_lg_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_LG_F16) +v_cmp_ge_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_GE_F16) +v_cmp_o_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_O_F16) +v_cmp_u_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_U_F16) +v_cmp_nge_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_NGE_F16) +v_cmp_nlg_f16 = functools.partial(VOP3A, 
VOP3AOp.V_CMP_NLG_F16) +v_cmp_ngt_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_NGT_F16) +v_cmp_nle_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_NLE_F16) +v_cmp_neq_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_NEQ_F16) +v_cmp_nlt_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_NLT_F16) +v_cmp_tru_f16 = functools.partial(VOP3A, VOP3AOp.V_CMP_TRU_F16) +v_cmpx_f_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_F_F16) +v_cmpx_lt_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LT_F16) +v_cmpx_eq_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_EQ_F16) +v_cmpx_le_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LE_F16) +v_cmpx_gt_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GT_F16) +v_cmpx_lg_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LG_F16) +v_cmpx_ge_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GE_F16) +v_cmpx_o_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_O_F16) +v_cmpx_u_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_U_F16) +v_cmpx_nge_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NGE_F16) +v_cmpx_nlg_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NLG_F16) +v_cmpx_ngt_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NGT_F16) +v_cmpx_nle_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NLE_F16) +v_cmpx_neq_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NEQ_F16) +v_cmpx_nlt_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NLT_F16) +v_cmpx_tru_f16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_TRU_F16) +v_cmp_f_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_F_F32) +v_cmp_lt_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_LT_F32) +v_cmp_eq_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_EQ_F32) +v_cmp_le_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_LE_F32) +v_cmp_gt_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_GT_F32) +v_cmp_lg_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_LG_F32) +v_cmp_ge_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_GE_F32) +v_cmp_o_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_O_F32) +v_cmp_u_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_U_F32) +v_cmp_nge_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_NGE_F32) +v_cmp_nlg_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_NLG_F32) +v_cmp_ngt_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_NGT_F32) +v_cmp_nle_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_NLE_F32) +v_cmp_neq_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_NEQ_F32) +v_cmp_nlt_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_NLT_F32) +v_cmp_tru_f32 = functools.partial(VOP3A, VOP3AOp.V_CMP_TRU_F32) +v_cmpx_f_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_F_F32) +v_cmpx_lt_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LT_F32) +v_cmpx_eq_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_EQ_F32) +v_cmpx_le_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LE_F32) +v_cmpx_gt_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GT_F32) +v_cmpx_lg_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LG_F32) +v_cmpx_ge_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GE_F32) +v_cmpx_o_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_O_F32) +v_cmpx_u_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_U_F32) +v_cmpx_nge_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NGE_F32) +v_cmpx_nlg_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NLG_F32) +v_cmpx_ngt_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NGT_F32) +v_cmpx_nle_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NLE_F32) +v_cmpx_neq_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NEQ_F32) +v_cmpx_nlt_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NLT_F32) +v_cmpx_tru_f32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_TRU_F32) +v_cmp_f_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_F_F64) +v_cmp_lt_f64 = 
functools.partial(VOP3A, VOP3AOp.V_CMP_LT_F64) +v_cmp_eq_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_EQ_F64) +v_cmp_le_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_LE_F64) +v_cmp_gt_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_GT_F64) +v_cmp_lg_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_LG_F64) +v_cmp_ge_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_GE_F64) +v_cmp_o_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_O_F64) +v_cmp_u_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_U_F64) +v_cmp_nge_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_NGE_F64) +v_cmp_nlg_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_NLG_F64) +v_cmp_ngt_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_NGT_F64) +v_cmp_nle_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_NLE_F64) +v_cmp_neq_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_NEQ_F64) +v_cmp_nlt_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_NLT_F64) +v_cmp_tru_f64 = functools.partial(VOP3A, VOP3AOp.V_CMP_TRU_F64) +v_cmpx_f_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_F_F64) +v_cmpx_lt_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LT_F64) +v_cmpx_eq_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_EQ_F64) +v_cmpx_le_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LE_F64) +v_cmpx_gt_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GT_F64) +v_cmpx_lg_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LG_F64) +v_cmpx_ge_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GE_F64) +v_cmpx_o_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_O_F64) +v_cmpx_u_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_U_F64) +v_cmpx_nge_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NGE_F64) +v_cmpx_nlg_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NLG_F64) +v_cmpx_ngt_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NGT_F64) +v_cmpx_nle_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NLE_F64) +v_cmpx_neq_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NEQ_F64) +v_cmpx_nlt_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NLT_F64) +v_cmpx_tru_f64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_TRU_F64) +v_cmp_f_i16 = functools.partial(VOP3A, VOP3AOp.V_CMP_F_I16) +v_cmp_lt_i16 = functools.partial(VOP3A, VOP3AOp.V_CMP_LT_I16) +v_cmp_eq_i16 = functools.partial(VOP3A, VOP3AOp.V_CMP_EQ_I16) +v_cmp_le_i16 = functools.partial(VOP3A, VOP3AOp.V_CMP_LE_I16) +v_cmp_gt_i16 = functools.partial(VOP3A, VOP3AOp.V_CMP_GT_I16) +v_cmp_ne_i16 = functools.partial(VOP3A, VOP3AOp.V_CMP_NE_I16) +v_cmp_ge_i16 = functools.partial(VOP3A, VOP3AOp.V_CMP_GE_I16) +v_cmp_t_i16 = functools.partial(VOP3A, VOP3AOp.V_CMP_T_I16) +v_cmp_f_u16 = functools.partial(VOP3A, VOP3AOp.V_CMP_F_U16) +v_cmp_lt_u16 = functools.partial(VOP3A, VOP3AOp.V_CMP_LT_U16) +v_cmp_eq_u16 = functools.partial(VOP3A, VOP3AOp.V_CMP_EQ_U16) +v_cmp_le_u16 = functools.partial(VOP3A, VOP3AOp.V_CMP_LE_U16) +v_cmp_gt_u16 = functools.partial(VOP3A, VOP3AOp.V_CMP_GT_U16) +v_cmp_ne_u16 = functools.partial(VOP3A, VOP3AOp.V_CMP_NE_U16) +v_cmp_ge_u16 = functools.partial(VOP3A, VOP3AOp.V_CMP_GE_U16) +v_cmp_t_u16 = functools.partial(VOP3A, VOP3AOp.V_CMP_T_U16) +v_cmpx_f_i16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_F_I16) +v_cmpx_lt_i16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LT_I16) +v_cmpx_eq_i16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_EQ_I16) +v_cmpx_le_i16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LE_I16) +v_cmpx_gt_i16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GT_I16) +v_cmpx_ne_i16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NE_I16) +v_cmpx_ge_i16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GE_I16) +v_cmpx_t_i16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_T_I16) +v_cmpx_f_u16 = functools.partial(VOP3A, 
VOP3AOp.V_CMPX_F_U16) +v_cmpx_lt_u16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LT_U16) +v_cmpx_eq_u16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_EQ_U16) +v_cmpx_le_u16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LE_U16) +v_cmpx_gt_u16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GT_U16) +v_cmpx_ne_u16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NE_U16) +v_cmpx_ge_u16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GE_U16) +v_cmpx_t_u16 = functools.partial(VOP3A, VOP3AOp.V_CMPX_T_U16) +v_cmp_f_i32 = functools.partial(VOP3A, VOP3AOp.V_CMP_F_I32) +v_cmp_lt_i32 = functools.partial(VOP3A, VOP3AOp.V_CMP_LT_I32) +v_cmp_eq_i32 = functools.partial(VOP3A, VOP3AOp.V_CMP_EQ_I32) +v_cmp_le_i32 = functools.partial(VOP3A, VOP3AOp.V_CMP_LE_I32) +v_cmp_gt_i32 = functools.partial(VOP3A, VOP3AOp.V_CMP_GT_I32) +v_cmp_ne_i32 = functools.partial(VOP3A, VOP3AOp.V_CMP_NE_I32) +v_cmp_ge_i32 = functools.partial(VOP3A, VOP3AOp.V_CMP_GE_I32) +v_cmp_t_i32 = functools.partial(VOP3A, VOP3AOp.V_CMP_T_I32) +v_cmp_f_u32 = functools.partial(VOP3A, VOP3AOp.V_CMP_F_U32) +v_cmp_lt_u32 = functools.partial(VOP3A, VOP3AOp.V_CMP_LT_U32) +v_cmp_eq_u32 = functools.partial(VOP3A, VOP3AOp.V_CMP_EQ_U32) +v_cmp_le_u32 = functools.partial(VOP3A, VOP3AOp.V_CMP_LE_U32) +v_cmp_gt_u32 = functools.partial(VOP3A, VOP3AOp.V_CMP_GT_U32) +v_cmp_ne_u32 = functools.partial(VOP3A, VOP3AOp.V_CMP_NE_U32) +v_cmp_ge_u32 = functools.partial(VOP3A, VOP3AOp.V_CMP_GE_U32) +v_cmp_t_u32 = functools.partial(VOP3A, VOP3AOp.V_CMP_T_U32) +v_cmpx_f_i32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_F_I32) +v_cmpx_lt_i32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LT_I32) +v_cmpx_eq_i32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_EQ_I32) +v_cmpx_le_i32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LE_I32) +v_cmpx_gt_i32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GT_I32) +v_cmpx_ne_i32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NE_I32) +v_cmpx_ge_i32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GE_I32) +v_cmpx_t_i32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_T_I32) +v_cmpx_f_u32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_F_U32) +v_cmpx_lt_u32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LT_U32) +v_cmpx_eq_u32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_EQ_U32) +v_cmpx_le_u32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LE_U32) +v_cmpx_gt_u32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GT_U32) +v_cmpx_ne_u32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NE_U32) +v_cmpx_ge_u32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GE_U32) +v_cmpx_t_u32 = functools.partial(VOP3A, VOP3AOp.V_CMPX_T_U32) +v_cmp_f_i64 = functools.partial(VOP3A, VOP3AOp.V_CMP_F_I64) +v_cmp_lt_i64 = functools.partial(VOP3A, VOP3AOp.V_CMP_LT_I64) +v_cmp_eq_i64 = functools.partial(VOP3A, VOP3AOp.V_CMP_EQ_I64) +v_cmp_le_i64 = functools.partial(VOP3A, VOP3AOp.V_CMP_LE_I64) +v_cmp_gt_i64 = functools.partial(VOP3A, VOP3AOp.V_CMP_GT_I64) +v_cmp_ne_i64 = functools.partial(VOP3A, VOP3AOp.V_CMP_NE_I64) +v_cmp_ge_i64 = functools.partial(VOP3A, VOP3AOp.V_CMP_GE_I64) +v_cmp_t_i64 = functools.partial(VOP3A, VOP3AOp.V_CMP_T_I64) +v_cmp_f_u64 = functools.partial(VOP3A, VOP3AOp.V_CMP_F_U64) +v_cmp_lt_u64 = functools.partial(VOP3A, VOP3AOp.V_CMP_LT_U64) +v_cmp_eq_u64 = functools.partial(VOP3A, VOP3AOp.V_CMP_EQ_U64) +v_cmp_le_u64 = functools.partial(VOP3A, VOP3AOp.V_CMP_LE_U64) +v_cmp_gt_u64 = functools.partial(VOP3A, VOP3AOp.V_CMP_GT_U64) +v_cmp_ne_u64 = functools.partial(VOP3A, VOP3AOp.V_CMP_NE_U64) +v_cmp_ge_u64 = functools.partial(VOP3A, VOP3AOp.V_CMP_GE_U64) +v_cmp_t_u64 = functools.partial(VOP3A, VOP3AOp.V_CMP_T_U64) +v_cmpx_f_i64 = functools.partial(VOP3A, 
VOP3AOp.V_CMPX_F_I64)
+v_cmpx_lt_i64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LT_I64)
+v_cmpx_eq_i64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_EQ_I64)
+v_cmpx_le_i64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LE_I64)
+v_cmpx_gt_i64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GT_I64)
+v_cmpx_ne_i64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NE_I64)
+v_cmpx_ge_i64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GE_I64)
+v_cmpx_t_i64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_T_I64)
+v_cmpx_f_u64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_F_U64)
+v_cmpx_lt_u64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LT_U64)
+v_cmpx_eq_u64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_EQ_U64)
+v_cmpx_le_u64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_LE_U64)
+v_cmpx_gt_u64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GT_U64)
+v_cmpx_ne_u64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_NE_U64)
+v_cmpx_ge_u64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_GE_U64)
+v_cmpx_t_u64 = functools.partial(VOP3A, VOP3AOp.V_CMPX_T_U64)
+v_cndmask_b32 = functools.partial(VOP3A, VOP3AOp.V_CNDMASK_B32)
+v_add_f32 = functools.partial(VOP3A, VOP3AOp.V_ADD_F32)
+v_sub_f32 = functools.partial(VOP3A, VOP3AOp.V_SUB_F32)
+v_subrev_f32 = functools.partial(VOP3A, VOP3AOp.V_SUBREV_F32)
+v_fmac_f64 = functools.partial(VOP3A, VOP3AOp.V_FMAC_F64)
+v_mul_f32 = functools.partial(VOP3A, VOP3AOp.V_MUL_F32)
+v_mul_i32_i24 = functools.partial(VOP3A, VOP3AOp.V_MUL_I32_I24)
+v_mul_hi_i32_i24 = functools.partial(VOP3A, VOP3AOp.V_MUL_HI_I32_I24)
+v_mul_u32_u24 = functools.partial(VOP3A, VOP3AOp.V_MUL_U32_U24)
+v_mul_hi_u32_u24 = functools.partial(VOP3A, VOP3AOp.V_MUL_HI_U32_U24)
+v_min_f32 = functools.partial(VOP3A, VOP3AOp.V_MIN_F32)
+v_max_f32 = functools.partial(VOP3A, VOP3AOp.V_MAX_F32)
+v_min_i32 = functools.partial(VOP3A, VOP3AOp.V_MIN_I32)
+v_max_i32 = functools.partial(VOP3A, VOP3AOp.V_MAX_I32)
+v_min_u32 = functools.partial(VOP3A, VOP3AOp.V_MIN_U32)
+v_max_u32 = functools.partial(VOP3A, VOP3AOp.V_MAX_U32)
+v_lshrrev_b32 = functools.partial(VOP3A, VOP3AOp.V_LSHRREV_B32)
+v_ashrrev_i32 = functools.partial(VOP3A, VOP3AOp.V_ASHRREV_I32)
+v_lshlrev_b32 = functools.partial(VOP3A, VOP3AOp.V_LSHLREV_B32)
+v_and_b32 = functools.partial(VOP3A, VOP3AOp.V_AND_B32)
+v_or_b32 = functools.partial(VOP3A, VOP3AOp.V_OR_B32)
+v_xor_b32 = functools.partial(VOP3A, VOP3AOp.V_XOR_B32)
+v_dot2c_f32_bf16 = functools.partial(VOP3A, VOP3AOp.V_DOT2C_F32_BF16)
+v_add_f16 = functools.partial(VOP3A, VOP3AOp.V_ADD_F16)
+v_sub_f16 = functools.partial(VOP3A, VOP3AOp.V_SUB_F16)
+v_subrev_f16 = functools.partial(VOP3A, VOP3AOp.V_SUBREV_F16)
+v_mul_f16 = functools.partial(VOP3A, VOP3AOp.V_MUL_F16)
+v_mac_f16 = functools.partial(VOP3A, VOP3AOp.V_MAC_F16)
+v_add_u16 = functools.partial(VOP3A, VOP3AOp.V_ADD_U16)
+v_sub_u16 = functools.partial(VOP3A, VOP3AOp.V_SUB_U16)
+v_subrev_u16 = functools.partial(VOP3A, VOP3AOp.V_SUBREV_U16)
+v_mul_lo_u16 = functools.partial(VOP3A, VOP3AOp.V_MUL_LO_U16)
+v_lshlrev_b16 = functools.partial(VOP3A, VOP3AOp.V_LSHLREV_B16)
+v_lshrrev_b16 = functools.partial(VOP3A, VOP3AOp.V_LSHRREV_B16)
+v_ashrrev_i16 = functools.partial(VOP3A, VOP3AOp.V_ASHRREV_I16)
+v_max_f16 = functools.partial(VOP3A, VOP3AOp.V_MAX_F16)
+v_min_f16 = functools.partial(VOP3A, VOP3AOp.V_MIN_F16)
+v_max_u16 = functools.partial(VOP3A, VOP3AOp.V_MAX_U16)
+v_max_i16 = functools.partial(VOP3A, VOP3AOp.V_MAX_I16)
+v_min_u16 = functools.partial(VOP3A, VOP3AOp.V_MIN_U16)
+v_min_i16 = functools.partial(VOP3A, VOP3AOp.V_MIN_I16)
+v_ldexp_f16 = functools.partial(VOP3A, VOP3AOp.V_LDEXP_F16)
+v_add_u32 = functools.partial(VOP3A, VOP3AOp.V_ADD_U32)
+v_sub_u32 = functools.partial(VOP3A, VOP3AOp.V_SUB_U32)
+v_subrev_u32 = functools.partial(VOP3A, VOP3AOp.V_SUBREV_U32)
+v_dot2c_f32_f16 = functools.partial(VOP3A, VOP3AOp.V_DOT2C_F32_F16)
+v_dot2c_i32_i16 = functools.partial(VOP3A, VOP3AOp.V_DOT2C_I32_I16)
+v_dot4c_i32_i8 = functools.partial(VOP3A, VOP3AOp.V_DOT4C_I32_I8)
+v_dot8c_i32_i4 = functools.partial(VOP3A, VOP3AOp.V_DOT8C_I32_I4)
+v_fmac_f32 = functools.partial(VOP3A, VOP3AOp.V_FMAC_F32)
+v_pk_fmac_f16 = functools.partial(VOP3A, VOP3AOp.V_PK_FMAC_F16)
+v_xnor_b32 = functools.partial(VOP3A, VOP3AOp.V_XNOR_B32)
+v_nop = functools.partial(VOP3A, VOP3AOp.V_NOP)
+v_mov_b32 = functools.partial(VOP3A, VOP3AOp.V_MOV_B32)
+v_readfirstlane_b32 = functools.partial(VOP3A, VOP3AOp.V_READFIRSTLANE_B32)
+v_cvt_i32_f64 = functools.partial(VOP3A, VOP3AOp.V_CVT_I32_F64)
+v_cvt_f64_i32 = functools.partial(VOP3A, VOP3AOp.V_CVT_F64_I32)
+v_cvt_f32_i32 = functools.partial(VOP3A, VOP3AOp.V_CVT_F32_I32)
+v_cvt_f32_u32 = functools.partial(VOP3A, VOP3AOp.V_CVT_F32_U32)
+v_cvt_u32_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_U32_F32)
+v_cvt_i32_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_I32_F32)
+v_cvt_f16_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_F16_F32)
+v_cvt_f32_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_F32_F16)
+v_cvt_rpi_i32_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_RPI_I32_F32)
+v_cvt_flr_i32_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_FLR_I32_F32)
+v_cvt_off_f32_i4 = functools.partial(VOP3A, VOP3AOp.V_CVT_OFF_F32_I4)
+v_cvt_f32_f64 = functools.partial(VOP3A, VOP3AOp.V_CVT_F32_F64)
+v_cvt_f64_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_F64_F32)
+v_cvt_f32_ubyte0 = functools.partial(VOP3A, VOP3AOp.V_CVT_F32_UBYTE0)
+v_cvt_f32_ubyte1 = functools.partial(VOP3A, VOP3AOp.V_CVT_F32_UBYTE1)
+v_cvt_f32_ubyte2 = functools.partial(VOP3A, VOP3AOp.V_CVT_F32_UBYTE2)
+v_cvt_f32_ubyte3 = functools.partial(VOP3A, VOP3AOp.V_CVT_F32_UBYTE3)
+v_cvt_u32_f64 = functools.partial(VOP3A, VOP3AOp.V_CVT_U32_F64)
+v_cvt_f64_u32 = functools.partial(VOP3A, VOP3AOp.V_CVT_F64_U32)
+v_trunc_f64 = functools.partial(VOP3A, VOP3AOp.V_TRUNC_F64)
+v_ceil_f64 = functools.partial(VOP3A, VOP3AOp.V_CEIL_F64)
+v_rndne_f64 = functools.partial(VOP3A, VOP3AOp.V_RNDNE_F64)
+v_floor_f64 = functools.partial(VOP3A, VOP3AOp.V_FLOOR_F64)
+v_fract_f32 = functools.partial(VOP3A, VOP3AOp.V_FRACT_F32)
+v_trunc_f32 = functools.partial(VOP3A, VOP3AOp.V_TRUNC_F32)
+v_ceil_f32 = functools.partial(VOP3A, VOP3AOp.V_CEIL_F32)
+v_rndne_f32 = functools.partial(VOP3A, VOP3AOp.V_RNDNE_F32)
+v_floor_f32 = functools.partial(VOP3A, VOP3AOp.V_FLOOR_F32)
+v_exp_f32 = functools.partial(VOP3A, VOP3AOp.V_EXP_F32)
+v_log_f32 = functools.partial(VOP3A, VOP3AOp.V_LOG_F32)
+v_rcp_f32 = functools.partial(VOP3A, VOP3AOp.V_RCP_F32)
+v_rcp_iflag_f32 = functools.partial(VOP3A, VOP3AOp.V_RCP_IFLAG_F32)
+v_rsq_f32 = functools.partial(VOP3A, VOP3AOp.V_RSQ_F32)
+v_rcp_f64 = functools.partial(VOP3A, VOP3AOp.V_RCP_F64)
+v_rsq_f64 = functools.partial(VOP3A, VOP3AOp.V_RSQ_F64)
+v_sqrt_f32 = functools.partial(VOP3A, VOP3AOp.V_SQRT_F32)
+v_sqrt_f64 = functools.partial(VOP3A, VOP3AOp.V_SQRT_F64)
+v_sin_f32 = functools.partial(VOP3A, VOP3AOp.V_SIN_F32)
+v_cos_f32 = functools.partial(VOP3A, VOP3AOp.V_COS_F32)
+v_not_b32 = functools.partial(VOP3A, VOP3AOp.V_NOT_B32)
+v_bfrev_b32 = functools.partial(VOP3A, VOP3AOp.V_BFREV_B32)
+v_ffbh_u32 = functools.partial(VOP3A, VOP3AOp.V_FFBH_U32)
+v_ffbl_b32 = functools.partial(VOP3A, VOP3AOp.V_FFBL_B32)
+v_ffbh_i32 = functools.partial(VOP3A, VOP3AOp.V_FFBH_I32)
+v_frexp_exp_i32_f64 = functools.partial(VOP3A, VOP3AOp.V_FREXP_EXP_I32_F64)
+v_frexp_mant_f64 = functools.partial(VOP3A, VOP3AOp.V_FREXP_MANT_F64)
+v_fract_f64 = functools.partial(VOP3A, VOP3AOp.V_FRACT_F64)
+v_frexp_exp_i32_f32 = functools.partial(VOP3A, VOP3AOp.V_FREXP_EXP_I32_F32)
+v_frexp_mant_f32 = functools.partial(VOP3A, VOP3AOp.V_FREXP_MANT_F32)
+v_clrexcp = functools.partial(VOP3A, VOP3AOp.V_CLREXCP)
+v_mov_b64 = functools.partial(VOP3A, VOP3AOp.V_MOV_B64)
+v_cvt_f16_u16 = functools.partial(VOP3A, VOP3AOp.V_CVT_F16_U16)
+v_cvt_f16_i16 = functools.partial(VOP3A, VOP3AOp.V_CVT_F16_I16)
+v_cvt_u16_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_U16_F16)
+v_cvt_i16_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_I16_F16)
+v_rcp_f16 = functools.partial(VOP3A, VOP3AOp.V_RCP_F16)
+v_sqrt_f16 = functools.partial(VOP3A, VOP3AOp.V_SQRT_F16)
+v_rsq_f16 = functools.partial(VOP3A, VOP3AOp.V_RSQ_F16)
+v_log_f16 = functools.partial(VOP3A, VOP3AOp.V_LOG_F16)
+v_exp_f16 = functools.partial(VOP3A, VOP3AOp.V_EXP_F16)
+v_mad_i32_i24 = functools.partial(VOP3A, VOP3AOp.V_MAD_I32_I24)
+v_mad_u32_u24 = functools.partial(VOP3A, VOP3AOp.V_MAD_U32_U24)
+v_cubeid_f32 = functools.partial(VOP3A, VOP3AOp.V_CUBEID_F32)
+v_cubesc_f32 = functools.partial(VOP3A, VOP3AOp.V_CUBESC_F32)
+v_cubetc_f32 = functools.partial(VOP3A, VOP3AOp.V_CUBETC_F32)
+v_cubema_f32 = functools.partial(VOP3A, VOP3AOp.V_CUBEMA_F32)
+v_bfe_u32 = functools.partial(VOP3A, VOP3AOp.V_BFE_U32)
+v_bfe_i32 = functools.partial(VOP3A, VOP3AOp.V_BFE_I32)
+v_bfi_b32 = functools.partial(VOP3A, VOP3AOp.V_BFI_B32)
+v_fma_f32 = functools.partial(VOP3A, VOP3AOp.V_FMA_F32)
+v_fma_f64 = functools.partial(VOP3A, VOP3AOp.V_FMA_F64)
+v_lerp_u8 = functools.partial(VOP3A, VOP3AOp.V_LERP_U8)
+v_alignbit_b32 = functools.partial(VOP3A, VOP3AOp.V_ALIGNBIT_B32)
+v_alignbyte_b32 = functools.partial(VOP3A, VOP3AOp.V_ALIGNBYTE_B32)
+v_min3_f32 = functools.partial(VOP3A, VOP3AOp.V_MIN3_F32)
+v_min3_i32 = functools.partial(VOP3A, VOP3AOp.V_MIN3_I32)
+v_min3_u32 = functools.partial(VOP3A, VOP3AOp.V_MIN3_U32)
+v_max3_f32 = functools.partial(VOP3A, VOP3AOp.V_MAX3_F32)
+v_max3_i32 = functools.partial(VOP3A, VOP3AOp.V_MAX3_I32)
+v_max3_u32 = functools.partial(VOP3A, VOP3AOp.V_MAX3_U32)
+v_med3_f32 = functools.partial(VOP3A, VOP3AOp.V_MED3_F32)
+v_med3_i32 = functools.partial(VOP3A, VOP3AOp.V_MED3_I32)
+v_med3_u32 = functools.partial(VOP3A, VOP3AOp.V_MED3_U32)
+v_sad_u8 = functools.partial(VOP3A, VOP3AOp.V_SAD_U8)
+v_sad_hi_u8 = functools.partial(VOP3A, VOP3AOp.V_SAD_HI_U8)
+v_sad_u16 = functools.partial(VOP3A, VOP3AOp.V_SAD_U16)
+v_sad_u32 = functools.partial(VOP3A, VOP3AOp.V_SAD_U32)
+v_cvt_pk_u8_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_PK_U8_F32)
+v_div_fixup_f32 = functools.partial(VOP3A, VOP3AOp.V_DIV_FIXUP_F32)
+v_div_fixup_f64 = functools.partial(VOP3A, VOP3AOp.V_DIV_FIXUP_F64)
+v_div_fmas_f32 = functools.partial(VOP3A, VOP3AOp.V_DIV_FMAS_F32)
+v_div_fmas_f64 = functools.partial(VOP3A, VOP3AOp.V_DIV_FMAS_F64)
+v_msad_u8 = functools.partial(VOP3A, VOP3AOp.V_MSAD_U8)
+v_qsad_pk_u16_u8 = functools.partial(VOP3A, VOP3AOp.V_QSAD_PK_U16_U8)
+v_mqsad_pk_u16_u8 = functools.partial(VOP3A, VOP3AOp.V_MQSAD_PK_U16_U8)
+v_mqsad_u32_u8 = functools.partial(VOP3A, VOP3AOp.V_MQSAD_U32_U8)
+v_mad_legacy_f16 = functools.partial(VOP3A, VOP3AOp.V_MAD_LEGACY_F16)
+v_mad_legacy_u16 = functools.partial(VOP3A, VOP3AOp.V_MAD_LEGACY_U16)
+v_mad_legacy_i16 = functools.partial(VOP3A, VOP3AOp.V_MAD_LEGACY_I16)
+v_perm_b32 = functools.partial(VOP3A, VOP3AOp.V_PERM_B32)
+v_fma_legacy_f16 = functools.partial(VOP3A, VOP3AOp.V_FMA_LEGACY_F16)
+v_div_fixup_legacy_f16 = functools.partial(VOP3A, VOP3AOp.V_DIV_FIXUP_LEGACY_F16)
+v_cvt_pkaccum_u8_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_PKACCUM_U8_F32)
+v_mad_u32_u16 = functools.partial(VOP3A, VOP3AOp.V_MAD_U32_U16)
+v_mad_i32_i16 = functools.partial(VOP3A, VOP3AOp.V_MAD_I32_I16)
+v_xad_u32 = functools.partial(VOP3A, VOP3AOp.V_XAD_U32)
+v_min3_f16 = functools.partial(VOP3A, VOP3AOp.V_MIN3_F16)
+v_min3_i16 = functools.partial(VOP3A, VOP3AOp.V_MIN3_I16)
+v_min3_u16 = functools.partial(VOP3A, VOP3AOp.V_MIN3_U16)
+v_max3_f16 = functools.partial(VOP3A, VOP3AOp.V_MAX3_F16)
+v_max3_i16 = functools.partial(VOP3A, VOP3AOp.V_MAX3_I16)
+v_max3_u16 = functools.partial(VOP3A, VOP3AOp.V_MAX3_U16)
+v_med3_f16 = functools.partial(VOP3A, VOP3AOp.V_MED3_F16)
+v_med3_i16 = functools.partial(VOP3A, VOP3AOp.V_MED3_I16)
+v_med3_u16 = functools.partial(VOP3A, VOP3AOp.V_MED3_U16)
+v_lshl_add_u32 = functools.partial(VOP3A, VOP3AOp.V_LSHL_ADD_U32)
+v_add_lshl_u32 = functools.partial(VOP3A, VOP3AOp.V_ADD_LSHL_U32)
+v_add3_u32 = functools.partial(VOP3A, VOP3AOp.V_ADD3_U32)
+v_lshl_or_b32 = functools.partial(VOP3A, VOP3AOp.V_LSHL_OR_B32)
+v_and_or_b32 = functools.partial(VOP3A, VOP3AOp.V_AND_OR_B32)
+v_or3_b32 = functools.partial(VOP3A, VOP3AOp.V_OR3_B32)
+v_mad_f16 = functools.partial(VOP3A, VOP3AOp.V_MAD_F16)
+v_mad_u16 = functools.partial(VOP3A, VOP3AOp.V_MAD_U16)
+v_mad_i16 = functools.partial(VOP3A, VOP3AOp.V_MAD_I16)
+v_fma_f16 = functools.partial(VOP3A, VOP3AOp.V_FMA_F16)
+v_div_fixup_f16 = functools.partial(VOP3A, VOP3AOp.V_DIV_FIXUP_F16)
+v_lshl_add_u64 = functools.partial(VOP3A, VOP3AOp.V_LSHL_ADD_U64)
+v_bitop3_b16 = functools.partial(VOP3A, VOP3AOp.V_BITOP3_B16)
+v_bitop3_b32 = functools.partial(VOP3A, VOP3AOp.V_BITOP3_B32)
+v_cvt_scalef32_pk_fp8_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_FP8_F32)
+v_cvt_scalef32_pk_bf8_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_BF8_F32)
+v_cvt_scalef32_sr_fp8_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_FP8_F32)
+v_cvt_scalef32_sr_bf8_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_BF8_F32)
+v_cvt_scalef32_pk_f32_fp8 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_F32_FP8)
+v_cvt_scalef32_pk_f32_bf8 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_F32_BF8)
+v_cvt_scalef32_f32_fp8 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_F32_FP8)
+v_cvt_scalef32_f32_bf8 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_F32_BF8)
+v_cvt_scalef32_pk_fp4_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_FP4_F32)
+v_cvt_scalef32_sr_pk_fp4_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_PK_FP4_F32)
+v_cvt_scalef32_pk_f32_fp4 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_F32_FP4)
+v_cvt_scalef32_pk_fp8_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_FP8_F16)
+v_cvt_scalef32_pk_bf8_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_BF8_F16)
+v_cvt_scalef32_sr_fp8_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_FP8_F16)
+v_cvt_scalef32_sr_bf8_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_BF8_F16)
+v_cvt_scalef32_pk_fp8_bf16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_FP8_BF16)
+v_cvt_scalef32_pk_bf8_bf16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_BF8_BF16)
+v_cvt_scalef32_sr_fp8_bf16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_FP8_BF16)
+v_cvt_scalef32_sr_bf8_bf16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_BF8_BF16)
+v_cvt_scalef32_pk_f16_fp8 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_F16_FP8)
+v_cvt_scalef32_pk_f16_bf8 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_F16_BF8)
+v_cvt_scalef32_f16_fp8 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_F16_FP8)
+v_cvt_scalef32_f16_bf8 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_F16_BF8)
+v_cvt_scalef32_pk_fp4_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_FP4_F16)
+v_cvt_scalef32_pk_fp4_bf16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_FP4_BF16)
+v_cvt_scalef32_sr_pk_fp4_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_PK_FP4_F16)
+v_cvt_scalef32_sr_pk_fp4_bf16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_PK_FP4_BF16)
+v_cvt_scalef32_pk_f16_fp4 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_F16_FP4)
+v_cvt_scalef32_pk_bf16_fp4 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_BF16_FP4)
+v_cvt_scalef32_2xpk16_fp6_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_2XPK16_FP6_F32)
+v_cvt_scalef32_2xpk16_bf6_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_2XPK16_BF6_F32)
+v_cvt_scalef32_sr_pk32_fp6_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_PK32_FP6_F32)
+v_cvt_scalef32_sr_pk32_bf6_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_PK32_BF6_F32)
+v_cvt_scalef32_pk32_f32_fp6 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_F32_FP6)
+v_cvt_scalef32_pk32_f32_bf6 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_F32_BF6)
+cdna4 = functools.partial(VOP3A, VOP3AOp.CDNA4)
+v_cvt_scalef32_pk32_fp6_bf16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_FP6_BF16)
+v_cvt_scalef32_pk32_bf6_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_BF6_F16)
+v_cvt_scalef32_pk32_bf6_bf16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_BF6_BF16)
+v_cvt_scalef32_sr_pk32_fp6_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_PK32_FP6_F16)
+v_cvt_scalef32_sr_pk32_fp6_bf16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_PK32_FP6_BF16)
+v_cvt_scalef32_sr_pk32_bf6_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_PK32_BF6_F16)
+v_cvt_scalef32_sr_pk32_bf6_bf16 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_SR_PK32_BF6_BF16)
+v_cvt_scalef32_pk32_f16_fp6 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_F16_FP6)
+v_cvt_scalef32_pk32_bf16_fp6 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_BF16_FP6)
+v_cvt_scalef32_pk32_f16_bf6 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_F16_BF6)
+v_cvt_scalef32_pk32_bf16_bf6 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK32_BF16_BF6)
+v_ashr_pk_i8_i32 = functools.partial(VOP3A, VOP3AOp.V_ASHR_PK_I8_I32)
+v_ashr_pk_u8_i32 = functools.partial(VOP3A, VOP3AOp.V_ASHR_PK_U8_I32)
+v_cvt_pk_f16_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_PK_F16_F32)
+v_cvt_pk_bf16_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_PK_BF16_F32)
+v_cvt_scalef32_pk_bf16_fp8 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_BF16_FP8)
+v_cvt_scalef32_pk_bf16_bf8 = functools.partial(VOP3A, VOP3AOp.V_CVT_SCALEF32_PK_BF16_BF8)
+v_add_f64 = functools.partial(VOP3A, VOP3AOp.V_ADD_F64)
+v_mul_f64 = functools.partial(VOP3A, VOP3AOp.V_MUL_F64)
+v_min_f64 = functools.partial(VOP3A, VOP3AOp.V_MIN_F64)
+v_max_f64 = functools.partial(VOP3A, VOP3AOp.V_MAX_F64)
+v_ldexp_f64 = functools.partial(VOP3A, VOP3AOp.V_LDEXP_F64)
+v_mul_lo_u32 = functools.partial(VOP3A, VOP3AOp.V_MUL_LO_U32)
+v_mul_hi_u32 = functools.partial(VOP3A, VOP3AOp.V_MUL_HI_U32)
+v_mul_hi_i32 = functools.partial(VOP3A, VOP3AOp.V_MUL_HI_I32)
+v_ldexp_f32 = functools.partial(VOP3A, VOP3AOp.V_LDEXP_F32)
+v_readlane_b32 = functools.partial(VOP3A, VOP3AOp.V_READLANE_B32)
+v_writelane_b32 = functools.partial(VOP3A, VOP3AOp.V_WRITELANE_B32)
+v_bcnt_u32_b32 = functools.partial(VOP3A, VOP3AOp.V_BCNT_U32_B32)
+v_mbcnt_lo_u32_b32 = functools.partial(VOP3A, VOP3AOp.V_MBCNT_LO_U32_B32)
+v_mbcnt_hi_u32_b32 = functools.partial(VOP3A, VOP3AOp.V_MBCNT_HI_U32_B32)
+v_lshlrev_b64 = functools.partial(VOP3A, VOP3AOp.V_LSHLREV_B64)
+v_lshrrev_b64 = functools.partial(VOP3A, VOP3AOp.V_LSHRREV_B64)
+v_ashrrev_i64 = functools.partial(VOP3A, VOP3AOp.V_ASHRREV_I64)
+v_trig_preop_f64 = functools.partial(VOP3A, VOP3AOp.V_TRIG_PREOP_F64)
+v_bfm_b32 = functools.partial(VOP3A, VOP3AOp.V_BFM_B32)
+v_cvt_pknorm_i16_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_PKNORM_I16_F32)
+v_cvt_pknorm_u16_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_PKNORM_U16_F32)
+v_cvt_pkrtz_f16_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_PKRTZ_F16_F32)
+v_cvt_pk_u16_u32 = functools.partial(VOP3A, VOP3AOp.V_CVT_PK_U16_U32)
+v_cvt_pk_i16_i32 = functools.partial(VOP3A, VOP3AOp.V_CVT_PK_I16_I32)
+v_cvt_pknorm_i16_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_PKNORM_I16_F16)
+v_cvt_pknorm_u16_f16 = functools.partial(VOP3A, VOP3AOp.V_CVT_PKNORM_U16_F16)
+v_add_i32 = functools.partial(VOP3A, VOP3AOp.V_ADD_I32)
+v_sub_i32 = functools.partial(VOP3A, VOP3AOp.V_SUB_I32)
+v_add_i16 = functools.partial(VOP3A, VOP3AOp.V_ADD_I16)
+v_sub_i16 = functools.partial(VOP3A, VOP3AOp.V_SUB_I16)
+v_pack_b32_f16 = functools.partial(VOP3A, VOP3AOp.V_PACK_B32_F16)
+v_mul_legacy_f32 = functools.partial(VOP3A, VOP3AOp.V_MUL_LEGACY_F32)
+v_cvt_pk_fp8_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_PK_FP8_F32)
+v_cvt_pk_bf8_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_PK_BF8_F32)
+v_cvt_sr_fp8_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SR_FP8_F32)
+v_cvt_sr_bf8_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SR_BF8_F32)
+v_cvt_sr_f16_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SR_F16_F32)
+v_cvt_sr_bf16_f32 = functools.partial(VOP3A, VOP3AOp.V_CVT_SR_BF16_F32)
+v_minimum3_f32 = functools.partial(VOP3A, VOP3AOp.V_MINIMUM3_F32)
+v_maximum3_f32 = functools.partial(VOP3A, VOP3AOp.V_MAXIMUM3_F32)
+v_add_co_u32 = functools.partial(VOP3B, VOP3BOp.V_ADD_CO_U32)
+v_sub_co_u32 = functools.partial(VOP3B, VOP3BOp.V_SUB_CO_U32)
+v_subrev_co_u32 = functools.partial(VOP3B, VOP3BOp.V_SUBREV_CO_U32)
+v_addc_co_u32 = functools.partial(VOP3B, VOP3BOp.V_ADDC_CO_U32)
+v_subb_co_u32 = functools.partial(VOP3B, VOP3BOp.V_SUBB_CO_U32)
+v_subbrev_co_u32 = functools.partial(VOP3B, VOP3BOp.V_SUBBREV_CO_U32)
+v_div_scale_f32 = functools.partial(VOP3B, VOP3BOp.V_DIV_SCALE_F32)
+v_div_scale_f64 = functools.partial(VOP3B, VOP3BOp.V_DIV_SCALE_F64)
+v_mad_u64_u32 = functools.partial(VOP3B, VOP3BOp.V_MAD_U64_U32)
+v_mad_i64_i32 = functools.partial(VOP3B, VOP3BOp.V_MAD_I64_I32)
+cdna4 = functools.partial(VOP3B, VOP3BOp.CDNA4)
+v_pk_mad_i16 = functools.partial(VOP3P, VOP3POp.V_PK_MAD_I16)
+v_pk_mul_lo_u16 = functools.partial(VOP3P, VOP3POp.V_PK_MUL_LO_U16)
+v_pk_add_i16 = functools.partial(VOP3P, VOP3POp.V_PK_ADD_I16)
+v_pk_sub_i16 = functools.partial(VOP3P, VOP3POp.V_PK_SUB_I16)
+v_pk_lshlrev_b16 = functools.partial(VOP3P, VOP3POp.V_PK_LSHLREV_B16)
+v_pk_lshrrev_b16 = functools.partial(VOP3P, VOP3POp.V_PK_LSHRREV_B16)
+v_pk_ashrrev_i16 = functools.partial(VOP3P, VOP3POp.V_PK_ASHRREV_I16)
+v_pk_max_i16 = functools.partial(VOP3P, VOP3POp.V_PK_MAX_I16)
+v_pk_min_i16 = functools.partial(VOP3P, VOP3POp.V_PK_MIN_I16)
+v_pk_mad_u16 = functools.partial(VOP3P, VOP3POp.V_PK_MAD_U16)
+v_pk_add_u16 = functools.partial(VOP3P, VOP3POp.V_PK_ADD_U16)
+v_pk_sub_u16 = functools.partial(VOP3P, VOP3POp.V_PK_SUB_U16)
+v_pk_max_u16 = functools.partial(VOP3P, VOP3POp.V_PK_MAX_U16)
+v_pk_min_u16 = functools.partial(VOP3P, VOP3POp.V_PK_MIN_U16)
+v_pk_fma_f16 = functools.partial(VOP3P, VOP3POp.V_PK_FMA_F16)
+v_pk_add_f16 = functools.partial(VOP3P, VOP3POp.V_PK_ADD_F16)
+v_pk_mul_f16 = functools.partial(VOP3P, VOP3POp.V_PK_MUL_F16)
+v_pk_min_f16 = functools.partial(VOP3P, VOP3POp.V_PK_MIN_F16)
+v_pk_max_f16 = functools.partial(VOP3P, VOP3POp.V_PK_MAX_F16)
+v_dot2_f32_bf16 = functools.partial(VOP3P, VOP3POp.V_DOT2_F32_BF16)
+v_pk_minimum3_f16 = functools.partial(VOP3P, VOP3POp.V_PK_MINIMUM3_F16)
+v_pk_maximum3_f16 = functools.partial(VOP3P, VOP3POp.V_PK_MAXIMUM3_F16)
+v_mad_mix_f32 = functools.partial(VOP3P, VOP3POp.V_MAD_MIX_F32)
+v_mad_mixlo_f16 = functools.partial(VOP3P, VOP3POp.V_MAD_MIXLO_F16)
+v_mad_mixhi_f16 = functools.partial(VOP3P, VOP3POp.V_MAD_MIXHI_F16)
+v_dot2_f32_f16 = functools.partial(VOP3P, VOP3POp.V_DOT2_F32_F16)
+v_dot2_i32_i16 = functools.partial(VOP3P, VOP3POp.V_DOT2_I32_I16)
+v_dot2_u32_u16 = functools.partial(VOP3P, VOP3POp.V_DOT2_U32_U16)
+v_dot4_i32_i8 = functools.partial(VOP3P, VOP3POp.V_DOT4_I32_I8)
+v_dot4_u32_u8 = functools.partial(VOP3P, VOP3POp.V_DOT4_U32_U8)
+v_dot8_i32_i4 = functools.partial(VOP3P, VOP3POp.V_DOT8_I32_I4)
+v_dot8_u32_u4 = functools.partial(VOP3P, VOP3POp.V_DOT8_U32_U4)
+v_mfma_f32_16x16x128_f8f6f4 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X128_F8F6F4)
+v_mfma_f32_32x32x64_f8f6f4 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X64_F8F6F4)
+v_pk_fma_f32 = functools.partial(VOP3P, VOP3POp.V_PK_FMA_F32)
+v_pk_mul_f32 = functools.partial(VOP3P, VOP3POp.V_PK_MUL_F32)
+v_pk_add_f32 = functools.partial(VOP3P, VOP3POp.V_PK_ADD_F32)
+v_pk_mov_b32 = functools.partial(VOP3P, VOP3POp.V_PK_MOV_B32)
+v_mfma_f32_16x16x32_bf16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X32_BF16)
+v_mfma_i32_16x16x64_i8 = functools.partial(VOP3P, VOP3POp.V_MFMA_I32_16X16X64_I8)
+v_mfma_f32_32x32x16_bf16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X16_BF16)
+v_mfma_i32_32x32x32_i8 = functools.partial(VOP3P, VOP3POp.V_MFMA_I32_32X32X32_I8)
+v_smfmac_f32_16x16x64_bf16 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X64_BF16)
+v_smfmac_i32_16x16x128_i8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_I32_16X16X128_I8)
+v_smfmac_f32_16x16x128_bf8_bf8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X128_BF8_BF8)
+v_smfmac_f32_16x16x128_bf8_fp8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X128_BF8_FP8)
+v_smfmac_f32_16x16x128_fp8_bf8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X128_FP8_BF8)
+v_mfma_f32_32x32x1_2b_f32 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X1_2B_F32)
+v_mfma_f32_16x16x1_4b_f32 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X1_4B_F32)
+v_mfma_f32_4x4x1_16b_f32 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_4X4X1_16B_F32)
+v_smfmac_f32_16x16x128_fp8_fp8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X128_FP8_FP8)
+v_mfma_f32_32x32x2_f32 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X2_F32)
+v_mfma_f32_16x16x4_f32 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X4_F32)
+v_smfmac_f32_32x32x32_bf16 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X32_BF16)
+v_smfmac_i32_32x32x64_i8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_I32_32X32X64_I8)
+v_mfma_f32_32x32x4_2b_f16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X4_2B_F16)
+v_mfma_f32_16x16x4_4b_f16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X4_4B_F16)
+v_mfma_f32_4x4x4_16b_f16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_4X4X4_16B_F16)
+v_smfmac_f32_32x32x64_bf8_bf8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X64_BF8_BF8)
+v_mfma_f32_32x32x8_f16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X8_F16)
+v_mfma_f32_16x16x16_f16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X16_F16)
+v_smfmac_f32_32x32x64_bf8_fp8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X64_BF8_FP8)
+v_smfmac_f32_32x32x64_fp8_bf8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X64_FP8_BF8)
+v_mfma_i32_32x32x4_2b_i8 = functools.partial(VOP3P, VOP3POp.V_MFMA_I32_32X32X4_2B_I8)
+v_mfma_i32_16x16x4_4b_i8 = functools.partial(VOP3P, VOP3POp.V_MFMA_I32_16X16X4_4B_I8)
+v_mfma_i32_4x4x4_16b_i8 = functools.partial(VOP3P, VOP3POp.V_MFMA_I32_4X4X4_16B_I8)
+v_smfmac_f32_32x32x64_fp8_fp8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X64_FP8_FP8)
+v_mfma_f32_16x16x32_f16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X32_F16)
+v_mfma_f32_32x32x16_f16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X16_F16)
+v_mfma_i32_32x32x16_i8 = functools.partial(VOP3P, VOP3POp.V_MFMA_I32_32X32X16_I8)
+v_mfma_i32_16x16x32_i8 = functools.partial(VOP3P, VOP3POp.V_MFMA_I32_16X16X32_I8)
+v_accvgpr_read = functools.partial(VOP3P, VOP3POp.V_ACCVGPR_READ)
+v_accvgpr_write = functools.partial(VOP3P, VOP3POp.V_ACCVGPR_WRITE)
+v_smfmac_f32_16x16x64_f16 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X64_F16)
+v_smfmac_f32_32x32x32_f16 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X32_F16)
+v_mfma_f32_32x32x4_2b_bf16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X4_2B_BF16)
+v_mfma_f32_16x16x4_4b_bf16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X4_4B_BF16)
+v_mfma_f32_4x4x4_16b_bf16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_4X4X4_16B_BF16)
+v_mfma_f32_32x32x8_bf16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X8_BF16)
+v_mfma_f32_16x16x16_bf16 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X16_BF16)
+v_smfmac_f32_16x16x32_f16 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X32_F16)
+v_smfmac_f32_32x32x16_f16 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X16_F16)
+v_smfmac_f32_16x16x32_bf16 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X32_BF16)
+v_smfmac_f32_32x32x16_bf16 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X16_BF16)
+v_smfmac_i32_16x16x64_i8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_I32_16X16X64_I8)
+v_smfmac_i32_32x32x32_i8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_I32_32X32X32_I8)
+v_mfma_f64_16x16x4_f64 = functools.partial(VOP3P, VOP3POp.V_MFMA_F64_16X16X4_F64)
+v_mfma_f64_4x4x4_4b_f64 = functools.partial(VOP3P, VOP3POp.V_MFMA_F64_4X4X4_4B_F64)
+v_mfma_f32_16x16x32_bf8_bf8 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X32_BF8_BF8)
+v_mfma_f32_16x16x32_bf8_fp8 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X32_BF8_FP8)
+v_mfma_f32_16x16x32_fp8_bf8 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X32_FP8_BF8)
+v_mfma_f32_16x16x32_fp8_fp8 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_16X16X32_FP8_FP8)
+v_mfma_f32_32x32x16_bf8_bf8 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X16_BF8_BF8)
+v_mfma_f32_32x32x16_bf8_fp8 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X16_BF8_FP8)
+v_mfma_f32_32x32x16_fp8_bf8 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X16_FP8_BF8)
+v_mfma_f32_32x32x16_fp8_fp8 = functools.partial(VOP3P, VOP3POp.V_MFMA_F32_32X32X16_FP8_FP8)
+v_smfmac_f32_16x16x64_bf8_bf8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X64_BF8_BF8)
+v_smfmac_f32_16x16x64_bf8_fp8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X64_BF8_FP8)
+v_smfmac_f32_16x16x64_fp8_bf8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X64_FP8_BF8)
+v_smfmac_f32_16x16x64_fp8_fp8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_16X16X64_FP8_FP8)
+v_smfmac_f32_32x32x32_bf8_bf8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X32_BF8_BF8)
+v_smfmac_f32_32x32x32_bf8_fp8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X32_BF8_FP8)
+v_smfmac_f32_32x32x32_fp8_bf8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X32_FP8_BF8)
+v_smfmac_f32_32x32x32_fp8_fp8 = functools.partial(VOP3P, VOP3POp.V_SMFMAC_F32_32X32X32_FP8_FP8)
+cdna4 = functools.partial(VOP3P, VOP3POp.CDNA4)
+v_cmp_class_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_CLASS_F32)
+v_cmpx_class_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_CLASS_F32)
+v_cmp_class_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_CLASS_F64)
+v_cmpx_class_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_CLASS_F64)
+v_cmp_class_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_CLASS_F16)
+v_cmpx_class_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_CLASS_F16)
+v_cmp_f_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_F_F16)
+v_cmp_lt_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LT_F16)
+v_cmp_eq_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_EQ_F16)
+v_cmp_le_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LE_F16)
+v_cmp_gt_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GT_F16)
+v_cmp_lg_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LG_F16)
+v_cmp_ge_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GE_F16)
+v_cmp_o_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_O_F16)
+v_cmp_u_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_U_F16)
+v_cmp_nge_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NGE_F16)
+v_cmp_nlg_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NLG_F16)
+v_cmp_ngt_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NGT_F16)
+v_cmp_nle_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NLE_F16)
+v_cmp_neq_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NEQ_F16)
+v_cmp_nlt_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NLT_F16)
+v_cmp_tru_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_TRU_F16)
+v_cmpx_f_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_F_F16)
+v_cmpx_lt_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LT_F16)
+v_cmpx_eq_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_EQ_F16)
+v_cmpx_le_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LE_F16)
+v_cmpx_gt_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GT_F16)
+v_cmpx_lg_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LG_F16)
+v_cmpx_ge_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GE_F16)
+v_cmpx_o_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_O_F16)
+v_cmpx_u_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_U_F16)
+v_cmpx_nge_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NGE_F16)
+v_cmpx_nlg_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NLG_F16)
+v_cmpx_ngt_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NGT_F16)
+v_cmpx_nle_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NLE_F16)
+v_cmpx_neq_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NEQ_F16)
+v_cmpx_nlt_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NLT_F16)
+v_cmpx_tru_f16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_TRU_F16)
+v_cmp_f_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_F_F32)
+v_cmp_lt_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LT_F32)
+v_cmp_eq_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_EQ_F32)
+v_cmp_le_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LE_F32)
+v_cmp_gt_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GT_F32)
+v_cmp_lg_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LG_F32)
+v_cmp_ge_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GE_F32)
+v_cmp_o_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_O_F32)
+v_cmp_u_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_U_F32)
+v_cmp_nge_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NGE_F32)
+v_cmp_nlg_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NLG_F32)
+v_cmp_ngt_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NGT_F32)
+v_cmp_nle_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NLE_F32)
+v_cmp_neq_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NEQ_F32)
+v_cmp_nlt_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NLT_F32)
+v_cmp_tru_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_TRU_F32)
+v_cmpx_f_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_F_F32)
+v_cmpx_lt_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LT_F32)
+v_cmpx_eq_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_EQ_F32)
+v_cmpx_le_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LE_F32)
+v_cmpx_gt_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GT_F32)
+v_cmpx_lg_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LG_F32)
+v_cmpx_ge_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GE_F32)
+v_cmpx_o_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_O_F32)
+v_cmpx_u_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_U_F32)
+v_cmpx_nge_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NGE_F32)
+v_cmpx_nlg_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NLG_F32)
+v_cmpx_ngt_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NGT_F32)
+v_cmpx_nle_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NLE_F32)
+v_cmpx_neq_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NEQ_F32)
+v_cmpx_nlt_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NLT_F32)
+v_cmpx_tru_f32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_TRU_F32)
+v_cmp_f_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_F_F64)
+v_cmp_lt_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LT_F64)
+v_cmp_eq_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_EQ_F64)
+v_cmp_le_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LE_F64)
+v_cmp_gt_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GT_F64)
+v_cmp_lg_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LG_F64)
+v_cmp_ge_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GE_F64)
+v_cmp_o_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_O_F64)
+v_cmp_u_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_U_F64)
+v_cmp_nge_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NGE_F64)
+v_cmp_nlg_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NLG_F64)
+v_cmp_ngt_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NGT_F64)
+v_cmp_nle_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NLE_F64)
+v_cmp_neq_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NEQ_F64)
+v_cmp_nlt_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NLT_F64)
+v_cmp_tru_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_TRU_F64)
+v_cmpx_f_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_F_F64)
+v_cmpx_lt_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LT_F64)
+v_cmpx_eq_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_EQ_F64)
+v_cmpx_le_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LE_F64)
+v_cmpx_gt_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GT_F64)
+v_cmpx_lg_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LG_F64)
+v_cmpx_ge_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GE_F64)
+v_cmpx_o_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_O_F64)
+v_cmpx_u_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_U_F64)
+v_cmpx_nge_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NGE_F64)
+v_cmpx_nlg_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NLG_F64)
+v_cmpx_ngt_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NGT_F64)
+v_cmpx_nle_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NLE_F64)
+v_cmpx_neq_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NEQ_F64)
+v_cmpx_nlt_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NLT_F64)
+v_cmpx_tru_f64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_TRU_F64)
+v_cmp_f_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_F_I16)
+v_cmp_lt_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LT_I16)
+v_cmp_eq_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_EQ_I16)
+v_cmp_le_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LE_I16)
+v_cmp_gt_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GT_I16)
+v_cmp_ne_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NE_I16)
+v_cmp_ge_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GE_I16)
+v_cmp_t_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_T_I16)
+v_cmp_f_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_F_U16)
+v_cmp_lt_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LT_U16)
+v_cmp_eq_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_EQ_U16)
+v_cmp_le_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LE_U16)
+v_cmp_gt_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GT_U16)
+v_cmp_ne_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NE_U16)
+v_cmp_ge_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GE_U16)
+v_cmp_t_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMP_T_U16)
+v_cmpx_f_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_F_I16)
+v_cmpx_lt_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LT_I16)
+v_cmpx_eq_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_EQ_I16)
+v_cmpx_le_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LE_I16)
+v_cmpx_gt_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GT_I16)
+v_cmpx_ne_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NE_I16)
+v_cmpx_ge_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GE_I16)
+v_cmpx_t_i16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_T_I16)
+v_cmpx_f_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_F_U16)
+v_cmpx_lt_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LT_U16)
+v_cmpx_eq_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_EQ_U16)
+v_cmpx_le_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LE_U16)
+v_cmpx_gt_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GT_U16)
+v_cmpx_ne_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NE_U16)
+v_cmpx_ge_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GE_U16)
+v_cmpx_t_u16_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_T_U16)
+v_cmp_f_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_F_I32)
+v_cmp_lt_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LT_I32)
+v_cmp_eq_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_EQ_I32)
+v_cmp_le_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LE_I32)
+v_cmp_gt_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GT_I32)
+v_cmp_ne_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NE_I32)
+v_cmp_ge_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GE_I32)
+v_cmp_t_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_T_I32)
+v_cmp_f_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_F_U32)
+v_cmp_lt_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LT_U32)
+v_cmp_eq_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_EQ_U32)
+v_cmp_le_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LE_U32)
+v_cmp_gt_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GT_U32)
+v_cmp_ne_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NE_U32)
+v_cmp_ge_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GE_U32)
+v_cmp_t_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMP_T_U32)
+v_cmpx_f_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_F_I32)
+v_cmpx_lt_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LT_I32)
+v_cmpx_eq_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_EQ_I32)
+v_cmpx_le_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LE_I32)
+v_cmpx_gt_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GT_I32)
+v_cmpx_ne_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NE_I32)
+v_cmpx_ge_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GE_I32)
+v_cmpx_t_i32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_T_I32)
+v_cmpx_f_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_F_U32)
+v_cmpx_lt_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LT_U32)
+v_cmpx_eq_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_EQ_U32)
+v_cmpx_le_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LE_U32)
+v_cmpx_gt_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GT_U32)
+v_cmpx_ne_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NE_U32)
+v_cmpx_ge_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GE_U32)
+v_cmpx_t_u32_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_T_U32)
+v_cmp_f_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_F_I64)
+v_cmp_lt_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LT_I64)
+v_cmp_eq_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_EQ_I64)
+v_cmp_le_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LE_I64)
+v_cmp_gt_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GT_I64)
+v_cmp_ne_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NE_I64)
+v_cmp_ge_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GE_I64)
+v_cmp_t_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_T_I64)
+v_cmp_f_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_F_U64)
+v_cmp_lt_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LT_U64)
+v_cmp_eq_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_EQ_U64)
+v_cmp_le_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_LE_U64)
+v_cmp_gt_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GT_U64)
+v_cmp_ne_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_NE_U64)
+v_cmp_ge_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_GE_U64)
+v_cmp_t_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMP_T_U64)
+v_cmpx_f_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_F_I64)
+v_cmpx_lt_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LT_I64)
+v_cmpx_eq_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_EQ_I64)
+v_cmpx_le_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LE_I64)
+v_cmpx_gt_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GT_I64)
+v_cmpx_ne_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NE_I64)
+v_cmpx_ge_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GE_I64)
+v_cmpx_t_i64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_T_I64)
+v_cmpx_f_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_F_U64)
+v_cmpx_lt_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LT_U64)
+v_cmpx_eq_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_EQ_U64)
+v_cmpx_le_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_LE_U64)
+v_cmpx_gt_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GT_U64)
+v_cmpx_ne_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_NE_U64)
+v_cmpx_ge_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_GE_U64)
+v_cmpx_t_u64_e32 = functools.partial(VOPC, VOPCOp.V_CMPX_T_U64)
+cdna4_e32 = functools.partial(VOPC, VOPCOp.CDNA4)
+
+S_ADD_U32 = SrcEnum.S_ADD_U32
+S_SUB_U32 = SrcEnum.S_SUB_U32
+S_ADD_I32 = SrcEnum.S_ADD_I32
+S_SUB_I32 = SrcEnum.S_SUB_I32
+S_ADDC_U32 = SrcEnum.S_ADDC_U32
+S_SUBB_U32 = SrcEnum.S_SUBB_U32
+S_MIN_I32 = SrcEnum.S_MIN_I32
+FLAT_SCRATCH_LO = SrcEnum.FLAT_SCRATCH_LO
+FLAT_SCRATCH_HI = SrcEnum.FLAT_SCRATCH_HI
+XNACK_MASK_LO = SrcEnum.XNACK_MASK_LO
+XNACK_MASK_HI = SrcEnum.XNACK_MASK_HI
+VCC_LO = SrcEnum.VCC_LO
+VCC_HI = SrcEnum.VCC_HI
+M0 = SrcEnum.M0
+EXEC_LO = SrcEnum.EXEC_LO
+EXEC_HI = SrcEnum.EXEC_HI
+ZERO = SrcEnum.ZERO
+DPP8FI = SrcEnum.DPP8FI
+SHARED_BASE = SrcEnum.SHARED_BASE
+SHARED_LIMIT = SrcEnum.SHARED_LIMIT
+PRIVATE_BASE = SrcEnum.PRIVATE_BASE
+PRIVATE_LIMIT = SrcEnum.PRIVATE_LIMIT
+RESERVED = SrcEnum.RESERVED
+POS_HALF = SrcEnum.POS_HALF
+NEG_HALF = SrcEnum.NEG_HALF
+POS_ONE = SrcEnum.POS_ONE
+NEG_ONE = SrcEnum.NEG_ONE
+POS_TWO = SrcEnum.POS_TWO
+NEG_TWO = SrcEnum.NEG_TWO
+POS_FOUR = SrcEnum.POS_FOUR
+NEG_FOUR = SrcEnum.NEG_FOUR
+INV_2PI = SrcEnum.INV_2PI
+VCCZ = SrcEnum.VCCZ
+EXECZ = SrcEnum.EXECZ
+SCC = SrcEnum.SCC
+LDS_DIRECT = SrcEnum.LDS_DIRECT
\ No newline at end of file
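(Illustrative note, not part of the patch: gen_pcode.py, added below, exposes one function per opcode plus a per-format dispatch table keyed by the op enum. A minimal sketch of driving one table entry directly, assuming only the Reg semantics from extra.assembly.amd.pcode that the generated code itself relies on; the operand values here are made up:)

# sketch: dispatch S_ADD_U32 through the generated table; VGPR is unused by this op
from extra.assembly.amd.autogen.cdna4 import SOP2Op
from extra.assembly.amd.autogen.cdna4.gen_pcode import SOP2Op_FUNCTIONS

fn = SOP2Op_FUNCTIONS[SOP2Op.S_ADD_U32]
out = fn(s0=0xffffffff, s1=1, s2=0, d0=0, scc=0, vcc=0, lane=0, exec_mask=1, literal=0, VGPR=None, _vars={})
assert out == {'d0': 0, 'scc': 1}  # 32-bit wraparound: result truncates to 0, carry sets SCC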
diff --git a/extra/assembly/amd/autogen/cdna4/gen_pcode.py b/extra/assembly/amd/autogen/cdna4/gen_pcode.py
new file mode 100644
index 0000000000..a039acb815
--- /dev/null
+++ b/extra/assembly/amd/autogen/cdna4/gen_pcode.py
@@ -0,0 +1,1630 @@
+# autogenerated by pcode.py - do not edit
+# to regenerate: python -m extra.assembly.amd.pcode --arch cdna4
+# ruff: noqa: E501,F405,F403
+# mypy: ignore-errors
+from extra.assembly.amd.autogen.cdna4 import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
+from extra.assembly.amd.pcode import *
+
+def _SOP1Op_S_MOV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.b32 = S0.b32
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.b32 = S0.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _SOP1Op_S_MOV_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.b64 = S0.b64
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.b64 = S0.b64
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  result['d0_64'] = True
+  return result
+
+def _SOP1Op_S_CMOV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # if SCC then
+  # D0.b32 = S0.b32
+  # endif
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  if SCC:
+    D0.b32 = S0.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  return result
+
+def _SOP1Op_S_CMOV_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # if SCC then
+  # D0.b64 = S0.b64
+  # endif
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  if SCC:
+    D0.b64 = S0.b64
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  result['d0_64'] = True
+  return result
+
+SOP1Op_FUNCTIONS = {
+  SOP1Op.S_MOV_B32: _SOP1Op_S_MOV_B32,
+  SOP1Op.S_MOV_B64: _SOP1Op_S_MOV_B64,
+  SOP1Op.S_CMOV_B32: _SOP1Op_S_CMOV_B32,
+  SOP1Op.S_CMOV_B64: _SOP1Op_S_CMOV_B64,
+}
+
+def _SOP2Op_S_ADD_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp = 64'U(S0.u32) + 64'U(S1.u32);
+  # SCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U;
+  # D0.u32 = tmp.u32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  D0 = Reg(d0)
+  SCC = Reg(scc)
+  tmp = Reg(0)
+  # --- compiled pseudocode ---
+  tmp = Reg((S0.u32) + (S1.u32))
+  SCC = Reg(((1) if (tmp >= 0x100000000) else (0)))
+  D0.u32 = tmp.u32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  return result
+
+def _SOP2Op_S_SUB_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp = S0.u32 - S1.u32;
+  # SCC = S1.u32 > S0.u32 ? 1'1U : 1'0U;
+  # D0.u32 = tmp.u32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  D0 = Reg(d0)
+  SCC = Reg(scc)
+  tmp = Reg(0)
+  # --- compiled pseudocode ---
+  tmp = Reg(S0.u32 - S1.u32)
+  SCC = Reg(((1) if (S1.u32 > S0.u32) else (0)))
+  D0.u32 = tmp.u32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  return result
+
+def _SOP2Op_S_ADD_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp = S0.i32 + S1.i32;
+  # SCC = ((S0.u32[31] == S1.u32[31]) && (S0.u32[31] != tmp.u32[31]));
+  # D0.i32 = tmp.i32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  D0 = Reg(d0)
+  SCC = Reg(scc)
+  tmp = Reg(0)
+  # --- compiled pseudocode ---
+  tmp = Reg(S0.i32 + S1.i32)
+  SCC = Reg(((S0.u32[31] == S1.u32[31]) and (S0.u32[31] != tmp.u32[31])))
+  D0.i32 = tmp.i32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  return result
+
+def _SOP2Op_S_SUB_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp = S0.i32 - S1.i32;
+  # SCC = ((S0.u32[31] != S1.u32[31]) && (S0.u32[31] != tmp.u32[31]));
+  # D0.i32 = tmp.i32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  D0 = Reg(d0)
+  SCC = Reg(scc)
+  tmp = Reg(0)
+  # --- compiled pseudocode ---
+  tmp = Reg(S0.i32 - S1.i32)
+  SCC = Reg(((S0.u32[31] != S1.u32[31]) and (S0.u32[31] != tmp.u32[31])))
+  D0.i32 = tmp.i32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  return result
+
+def _SOP2Op_S_ADDC_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp = 64'U(S0.u32) + 64'U(S1.u32) + SCC.u64;
+  # SCC = tmp >= 0x100000000ULL ? 1'1U : 1'0U;
+  # D0.u32 = tmp.u32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  D0 = Reg(d0)
+  SCC = Reg(scc)
+  tmp = Reg(0)
+  # --- compiled pseudocode ---
+  tmp = Reg((S0.u32) + (S1.u32) + SCC.u64)
+  SCC = Reg(((1) if (tmp >= 0x100000000) else (0)))
+  D0.u32 = tmp.u32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  return result
+
+def _SOP2Op_S_SUBB_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # tmp = S0.u32 - S1.u32 - SCC.u32;
+  # SCC = 64'U(S1.u32) + SCC.u64 > 64'U(S0.u32) ? 1'1U : 1'0U;
+  # D0.u32 = tmp.u32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  D0 = Reg(d0)
+  SCC = Reg(scc)
+  tmp = Reg(0)
+  # --- compiled pseudocode ---
+  tmp = Reg(S0.u32 - S1.u32 - SCC.u32)
+  SCC = Reg(((1) if ((S1.u32) + SCC.u64 > (S0.u32)) else (0)))
+  D0.u32 = tmp.u32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': SCC._val & 1}
+  return result
+
+def _SOP2Op_S_PACK_LL_B32_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0 = { S1[15 : 0].u16, S0[15 : 0].u16 }
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0 = Reg(_pack(S1[15 : 0].u16, S0[15 : 0].u16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _SOP2Op_S_PACK_LH_B32_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0 = { S1[31 : 16].u16, S0[15 : 0].u16 }
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0 = Reg(_pack(S1[31 : 16].u16, S0[15 : 0].u16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _SOP2Op_S_PACK_HH_B32_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0 = { S1[31 : 16].u16, S0[31 : 16].u16 }
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0 = Reg(_pack(S1[31 : 16].u16, S0[31 : 16].u16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+SOP2Op_FUNCTIONS = {
+  SOP2Op.S_ADD_U32: _SOP2Op_S_ADD_U32,
+  SOP2Op.S_SUB_U32: _SOP2Op_S_SUB_U32,
+  SOP2Op.S_ADD_I32: _SOP2Op_S_ADD_I32,
+  SOP2Op.S_SUB_I32: _SOP2Op_S_SUB_I32,
+  SOP2Op.S_ADDC_U32: _SOP2Op_S_ADDC_U32,
+  SOP2Op.S_SUBB_U32: _SOP2Op_S_SUBB_U32,
+  SOP2Op.S_PACK_LL_B32_B16: _SOP2Op_S_PACK_LL_B32_B16,
+  SOP2Op.S_PACK_LH_B32_B16: _SOP2Op_S_PACK_LH_B32_B16,
+  SOP2Op.S_PACK_HH_B32_B16: _SOP2Op_S_PACK_HH_B32_B16,
+}
+
+def _SOPCOp_S_CMP_EQ_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.i32 == S1.i32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.i32 == S1.i32)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_CMP_LG_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.i32 <> S1.i32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.i32 != S1.i32)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_CMP_GT_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.i32 > S1.i32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.i32 > S1.i32)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_CMP_GE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.i32 >= S1.i32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.i32 >= S1.i32)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_CMP_LT_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.i32 < S1.i32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.i32 < S1.i32)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_CMP_LE_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.i32 <= S1.i32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.i32 <= S1.i32)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_CMP_EQ_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32 == S1.u32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32 == S1.u32)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_CMP_LG_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32 <> S1.u32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32 != S1.u32)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_CMP_GT_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32 > S1.u32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32 > S1.u32)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_CMP_GE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32 >= S1.u32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32 >= S1.u32)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_CMP_LT_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32 < S1.u32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32 < S1.u32)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_CMP_LE_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32 <= S1.u32
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32 <= S1.u32)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_BITCMP0_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32[S1.u32[4 : 0]] == 1'0U
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32[S1.u32[4 : 0]] == 0)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_BITCMP1_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u32[S1.u32[4 : 0]] == 1'1U
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u32[S1.u32[4 : 0]] == 1)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_BITCMP0_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u64[S1.u32[5 : 0]] == 1'0U
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u64[S1.u32[5 : 0]] == 0)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+def _SOPCOp_S_BITCMP1_B64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # SCC = S0.u64[S1.u32[5 : 0]] == 1'1U
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  SCC = Reg(scc)
+  # --- compiled pseudocode ---
+  SCC = Reg(S0.u64[S1.u32[5 : 0]] == 1)
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': SCC._val & 1}
+  return result
+
+SOPCOp_FUNCTIONS = {
+  SOPCOp.S_CMP_EQ_I32: _SOPCOp_S_CMP_EQ_I32,
+  SOPCOp.S_CMP_LG_I32: _SOPCOp_S_CMP_LG_I32,
+  SOPCOp.S_CMP_GT_I32: _SOPCOp_S_CMP_GT_I32,
+  SOPCOp.S_CMP_GE_I32: _SOPCOp_S_CMP_GE_I32,
+  SOPCOp.S_CMP_LT_I32: _SOPCOp_S_CMP_LT_I32,
+  SOPCOp.S_CMP_LE_I32: _SOPCOp_S_CMP_LE_I32,
+  SOPCOp.S_CMP_EQ_U32: _SOPCOp_S_CMP_EQ_U32,
+  SOPCOp.S_CMP_LG_U32: _SOPCOp_S_CMP_LG_U32,
+  SOPCOp.S_CMP_GT_U32: _SOPCOp_S_CMP_GT_U32,
+  SOPCOp.S_CMP_GE_U32: _SOPCOp_S_CMP_GE_U32,
+  SOPCOp.S_CMP_LT_U32: _SOPCOp_S_CMP_LT_U32,
+  SOPCOp.S_CMP_LE_U32: _SOPCOp_S_CMP_LE_U32,
+  SOPCOp.S_BITCMP0_B32: _SOPCOp_S_BITCMP0_B32,
+  SOPCOp.S_BITCMP1_B32: _SOPCOp_S_BITCMP1_B32,
+  SOPCOp.S_BITCMP0_B64: _SOPCOp_S_BITCMP0_B64,
+  SOPCOp.S_BITCMP1_B64: _SOPCOp_S_BITCMP1_B64,
+}
+
+def _SOPKOp_S_MOVK_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.i32 = 32'I(signext(S0.i16))
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.i32 = (signext(S0.i16))
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+SOPKOp_FUNCTIONS = {
+  SOPKOp.S_MOVK_I32: _SOPKOp_S_MOVK_I32,
+}
+
+def _SOPPOp_S_NOP(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # for i in 0U : SIMM16.u16[3 : 0].u32 do
+  # endfor
+  SIMM16 = Reg(literal)
+  # --- compiled pseudocode ---
+  for i in range(0, int(SIMM16.u16[3 : 0].u32)+1):
+    pass
+  # --- end pseudocode ---
+  result = {'d0': d0, 'scc': scc & 1}
+  return result
+
+SOPPOp_FUNCTIONS = {
+  SOPPOp.S_NOP: _SOPPOp_S_NOP,
+}
+
+def _VOP1Op_V_MOV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.b32 = S0.b32
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.b32 = S0.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_READFIRSTLANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # declare lane : 32'I;
+  # if EXEC == 0x0LL then
+  # lane = 0;
+  # // Force lane 0 if all lanes are disabled
+  # else
+  # lane = s_ff1_i32_b64(EXEC);
+  # // Lowest active lane
+  # endif;
+  # D0.b32 = VGPR[lane][SRC0.u32]
+  D0 = Reg(d0)
+  EXEC = Reg(exec_mask)
+  SRC0 = Reg(src0_idx)
+  # --- compiled pseudocode ---
+  if EXEC == 0x0:
+    lane = 0
+  else:
+    lane = s_ff1_i32_b64(EXEC)
+  D0.b32 = VGPR[lane][SRC0.u32]
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP1Op_V_CVT_I32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.i32 = f64_to_i32(S0.f64)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.i32 = f64_to_i32(S0.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_F64_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = i32_to_f64(S0.i32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f64 = i32_to_f64(S0.i32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_CVT_F32_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = i32_to_f32(S0.i32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f32 = i32_to_f32(S0.i32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_F32_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = u32_to_f32(S0.u32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f32 = u32_to_f32(S0.u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_U32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u32 = f32_to_u32(S0.f32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.u32 = f32_to_u32(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_I32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.i32 = f32_to_i32(S0.f32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.i32 = f32_to_i32(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_F16_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f16 = f32_to_f16(S0.f32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f16 = f32_to_f16(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_F32_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = f16_to_f32(S0.f16)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f32 = f16_to_f32(S0.f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_F32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = f64_to_f32(S0.f64)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f32 = f64_to_f32(S0.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_F64_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = f32_to_f64(S0.f32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f64 = f32_to_f64(S0.f32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_CVT_F32_UBYTE0(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = u32_to_f32(S0[7 : 0].u32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f32 = u32_to_f32(S0[7 : 0].u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_F32_UBYTE1(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = u32_to_f32(S0[15 : 8].u32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f32 = u32_to_f32(S0[15 : 8].u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_F32_UBYTE2(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = u32_to_f32(S0[23 : 16].u32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f32 = u32_to_f32(S0[23 : 16].u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_F32_UBYTE3(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = u32_to_f32(S0[31 : 24].u32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f32 = u32_to_f32(S0[31 : 24].u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_U32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.u32 = f64_to_u32(S0.f64)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.u32 = f64_to_u32(S0.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP1Op_V_CVT_F64_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = u32_to_f64(S0.u32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f64 = u32_to_f64(S0.u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_TRUNC_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = trunc(S0.f64)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f64 = trunc(S0.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_CEIL_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = trunc(S0.f64);
+  # if ((S0.f64 > 0.0) && (S0.f64 != D0.f64)) then
+  # D0.f64 += 1.0
+  # endif
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f64 = trunc(S0.f64)
+  if ((S0.f64 > 0.0) and (S0.f64 != D0.f64)):
+    D0.f64 += 1.0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_RNDNE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = floor(S0.f64 + 0.5);
+  # if (isEven(floor(S0.f64)) && (fract(S0.f64) == 0.5)) then
+  # D0.f64 -= 1.0
+  # endif
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f64 = floor(S0.f64 + 0.5)
+  if (isEven(floor(S0.f64)) and (fract(S0.f64) == 0.5)):
+    D0.f64 -= 1.0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  result['d0_64'] = True
+  return result
+
+def _VOP1Op_V_FLOOR_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = trunc(S0.f64);
+  # if ((S0.f64 < 0.0) && (S0.f64 != D0.f64)) then
+  # D0.f64 += -1.0
+  # endif
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f64 = trunc(S0.f64)
+  if ((S0.f64 < 0.0) and (S0.f64 != D0.f64)):
+    D0.f64 += -1.0
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  result['d0_64'] = True
+  return result
+
+VOP1Op_FUNCTIONS = {
+  VOP1Op.V_MOV_B32: _VOP1Op_V_MOV_B32,
+  VOP1Op.V_READFIRSTLANE_B32: _VOP1Op_V_READFIRSTLANE_B32,
+  VOP1Op.V_CVT_I32_F64: _VOP1Op_V_CVT_I32_F64,
+  VOP1Op.V_CVT_F64_I32: _VOP1Op_V_CVT_F64_I32,
+  VOP1Op.V_CVT_F32_I32: _VOP1Op_V_CVT_F32_I32,
+  VOP1Op.V_CVT_F32_U32: _VOP1Op_V_CVT_F32_U32,
+  VOP1Op.V_CVT_U32_F32: _VOP1Op_V_CVT_U32_F32,
+  VOP1Op.V_CVT_I32_F32: _VOP1Op_V_CVT_I32_F32,
+  VOP1Op.V_CVT_F16_F32: _VOP1Op_V_CVT_F16_F32,
+  VOP1Op.V_CVT_F32_F16: _VOP1Op_V_CVT_F32_F16,
+  VOP1Op.V_CVT_F32_F64: _VOP1Op_V_CVT_F32_F64,
+  VOP1Op.V_CVT_F64_F32: _VOP1Op_V_CVT_F64_F32,
+  VOP1Op.V_CVT_F32_UBYTE0: _VOP1Op_V_CVT_F32_UBYTE0,
+  VOP1Op.V_CVT_F32_UBYTE1: _VOP1Op_V_CVT_F32_UBYTE1,
+  VOP1Op.V_CVT_F32_UBYTE2: _VOP1Op_V_CVT_F32_UBYTE2,
+  VOP1Op.V_CVT_F32_UBYTE3: _VOP1Op_V_CVT_F32_UBYTE3,
+  VOP1Op.V_CVT_U32_F64: _VOP1Op_V_CVT_U32_F64,
+  VOP1Op.V_CVT_F64_U32: _VOP1Op_V_CVT_F64_U32,
+  VOP1Op.V_TRUNC_F64: _VOP1Op_V_TRUNC_F64,
+  VOP1Op.V_CEIL_F64: _VOP1Op_V_CEIL_F64,
+  VOP1Op.V_RNDNE_F64: _VOP1Op_V_RNDNE_F64,
+  VOP1Op.V_FLOOR_F64: _VOP1Op_V_FLOOR_F64,
+}
+
+def _VOP2Op_V_PK_FMAC_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0[15 : 0].f16 = fma(S0[15 : 0].f16, S1[15 : 0].f16, D0[15 : 0].f16);
+  # D0[31 : 16].f16 = fma(S0[31 : 16].f16, S1[31 : 16].f16, D0[31 : 16].f16)
+  S0 = Reg(s0)
+  S1 = Reg(s1)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0[15 : 0].f16 = fma(S0[15 : 0].f16, S1[15 : 0].f16, D0[15 : 0].f16)
+  D0[31 : 16].f16 = fma(S0[31 : 16].f16, S1[31 : 16].f16, D0[31 : 16].f16)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+VOP2Op_FUNCTIONS = {
+  VOP2Op.V_PK_FMAC_F16: _VOP2Op_V_PK_FMAC_F16,
+}
+
+def _VOP3Op_V_MOV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.b32 = S0.b32
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.b32 = S0.b32
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP3Op_V_READFIRSTLANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # declare lane : 32'I;
+  # if EXEC == 0x0LL then
+  # lane = 0;
+  # // Force lane 0 if all lanes are disabled
+  # else
+  # lane = s_ff1_i32_b64(EXEC);
+  # // Lowest active lane
+  # endif;
+  # D0.b32 = VGPR[lane][SRC0.u32]
+  D0 = Reg(d0)
+  EXEC = Reg(exec_mask)
+  SRC0 = Reg(src0_idx)
+  # --- compiled pseudocode ---
+  if EXEC == 0x0:
+    lane = 0
+  else:
+    lane = s_ff1_i32_b64(EXEC)
+  D0.b32 = VGPR[lane][SRC0.u32]
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  if EXEC._val != exec_mask: result['exec'] = EXEC._val
+  return result
+
+def _VOP3Op_V_CVT_I32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.i32 = f64_to_i32(S0.f64)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.i32 = f64_to_i32(S0.f64)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP3Op_V_CVT_F64_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f64 = i32_to_f64(S0.i32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f64 = i32_to_f64(S0.i32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  result['d0_64'] = True
+  return result
+
+def _VOP3Op_V_CVT_F32_I32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = i32_to_f32(S0.i32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f32 = i32_to_f32(S0.i32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def _VOP3Op_V_CVT_F32_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
+  # D0.f32 = u32_to_f32(S0.u32)
+  S0 = Reg(s0)
+  D0 = Reg(d0)
+  # --- compiled pseudocode ---
+  D0.f32 = u32_to_f32(S0.u32)
+  # --- end pseudocode ---
+  result = {'d0': D0._val, 'scc': scc & 1}
+  return result
+
+def
_VOP3Op_V_CVT_U32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = f32_to_u32(S0.f32) + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.u32 = f32_to_u32(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3Op_V_CVT_I32_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.i32 = f32_to_i32(S0.f32) + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.i32 = f32_to_i32(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3Op_V_CVT_F16_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f16 = f32_to_f16(S0.f32) + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f16 = f32_to_f16(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3Op_V_CVT_F32_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = f16_to_f32(S0.f16) + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f32 = f16_to_f32(S0.f16) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3Op_V_CVT_F32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = f64_to_f32(S0.f64) + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f32 = f64_to_f32(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3Op_V_CVT_F64_F32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = f32_to_f64(S0.f32) + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f64 = f32_to_f64(S0.f32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + result['d0_64'] = True + return result + +def _VOP3Op_V_CVT_F32_UBYTE0(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = u32_to_f32(S0[7 : 0].u32) + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f32 = u32_to_f32(S0[7 : 0].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3Op_V_CVT_F32_UBYTE1(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = u32_to_f32(S0[15 : 8].u32) + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f32 = u32_to_f32(S0[15 : 8].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3Op_V_CVT_F32_UBYTE2(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = u32_to_f32(S0[23 : 16].u32) + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f32 = u32_to_f32(S0[23 : 16].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3Op_V_CVT_F32_UBYTE3(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f32 = u32_to_f32(S0[31 : 24].u32) + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f32 = u32_to_f32(S0[31 : 24].u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3Op_V_CVT_U32_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.u32 = f64_to_u32(S0.f64) + S0 = Reg(s0) + 
D0 = Reg(d0) + # --- compiled pseudocode --- + D0.u32 = f64_to_u32(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3Op_V_CVT_F64_U32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = u32_to_f64(S0.u32) + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f64 = u32_to_f64(S0.u32) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + result['d0_64'] = True + return result + +def _VOP3Op_V_TRUNC_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = trunc(S0.f64) + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f64 = trunc(S0.f64) + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + result['d0_64'] = True + return result + +def _VOP3Op_V_CEIL_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = trunc(S0.f64); + # if ((S0.f64 > 0.0) && (S0.f64 != D0.f64)) then + # D0.f64 += 1.0 + # endif + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f64 = trunc(S0.f64) + if ((S0.f64 > 0.0) and (S0.f64 != D0.f64)): + D0.f64 += 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + result['d0_64'] = True + return result + +def _VOP3Op_V_RNDNE_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = floor(S0.f64 + 0.5); + # if (isEven(floor(S0.f64)) && (fract(S0.f64) == 0.5)) then + # D0.f64 -= 1.0 + # endif + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f64 = floor(S0.f64 + 0.5) + if (isEven(floor(S0.f64)) and (fract(S0.f64) == 0.5)): + D0.f64 -= 1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + result['d0_64'] = True + return result + +def _VOP3Op_V_FLOOR_F64(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # D0.f64 = trunc(S0.f64); + # if ((S0.f64 < 0.0) && (S0.f64 != D0.f64)) then + # D0.f64 += -1.0 + # endif + S0 = Reg(s0) + D0 = Reg(d0) + # --- compiled pseudocode --- + D0.f64 = trunc(S0.f64) + if ((S0.f64 < 0.0) and (S0.f64 != D0.f64)): + D0.f64 += -1.0 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + result['d0_64'] = True + return result + +VOP3Op_FUNCTIONS = { + VOP3Op.V_MOV_B32: _VOP3Op_V_MOV_B32, + VOP3Op.V_READFIRSTLANE_B32: _VOP3Op_V_READFIRSTLANE_B32, + VOP3Op.V_CVT_I32_F64: _VOP3Op_V_CVT_I32_F64, + VOP3Op.V_CVT_F64_I32: _VOP3Op_V_CVT_F64_I32, + VOP3Op.V_CVT_F32_I32: _VOP3Op_V_CVT_F32_I32, + VOP3Op.V_CVT_F32_U32: _VOP3Op_V_CVT_F32_U32, + VOP3Op.V_CVT_U32_F32: _VOP3Op_V_CVT_U32_F32, + VOP3Op.V_CVT_I32_F32: _VOP3Op_V_CVT_I32_F32, + VOP3Op.V_CVT_F16_F32: _VOP3Op_V_CVT_F16_F32, + VOP3Op.V_CVT_F32_F16: _VOP3Op_V_CVT_F32_F16, + VOP3Op.V_CVT_F32_F64: _VOP3Op_V_CVT_F32_F64, + VOP3Op.V_CVT_F64_F32: _VOP3Op_V_CVT_F64_F32, + VOP3Op.V_CVT_F32_UBYTE0: _VOP3Op_V_CVT_F32_UBYTE0, + VOP3Op.V_CVT_F32_UBYTE1: _VOP3Op_V_CVT_F32_UBYTE1, + VOP3Op.V_CVT_F32_UBYTE2: _VOP3Op_V_CVT_F32_UBYTE2, + VOP3Op.V_CVT_F32_UBYTE3: _VOP3Op_V_CVT_F32_UBYTE3, + VOP3Op.V_CVT_U32_F64: _VOP3Op_V_CVT_U32_F64, + VOP3Op.V_CVT_F64_U32: _VOP3Op_V_CVT_F64_U32, + VOP3Op.V_TRUNC_F64: _VOP3Op_V_TRUNC_F64, + VOP3Op.V_CEIL_F64: _VOP3Op_V_CEIL_F64, + VOP3Op.V_RNDNE_F64: _VOP3Op_V_RNDNE_F64, + VOP3Op.V_FLOOR_F64: _VOP3Op_V_FLOOR_F64, +} + +def _VOP3POp_V_PK_MAD_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 
32'B; + # tmp[15 : 0].i16 = S0[15 : 0].i16 * S1[15 : 0].i16 + S2[15 : 0].i16; + # tmp[31 : 16].i16 = S0[31 : 16].i16 * S1[31 : 16].i16 + S2[31 : 16].i16; + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + S2 = Reg(s2) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].i16 = S0[15 : 0].i16 * S1[15 : 0].i16 + S2[15 : 0].i16 + tmp[31 : 16].i16 = S0[31 : 16].i16 * S1[31 : 16].i16 + S2[31 : 16].i16 + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_MUL_LO_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp[31 : 16].u16 = S0[31 : 16].u16 * S1[31 : 16].u16; + # tmp[15 : 0].u16 = S0[15 : 0].u16 * S1[15 : 0].u16; + # D0.b32 = tmp.b32 + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[31 : 16].u16 = S0[31 : 16].u16 * S1[31 : 16].u16 + tmp[15 : 0].u16 = S0[15 : 0].u16 * S1[15 : 0].u16 + D0.b32 = tmp.b32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_ADD_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].i16 = S0[15 : 0].i16 + S1[15 : 0].i16; + # tmp[31 : 16].i16 = S0[31 : 16].i16 + S1[31 : 16].i16; + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].i16 = S0[15 : 0].i16 + S1[15 : 0].i16 + tmp[31 : 16].i16 = S0[31 : 16].i16 + S1[31 : 16].i16 + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_SUB_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].i16 = S0[15 : 0].i16 - S1[15 : 0].i16; + # tmp[31 : 16].i16 = S0[31 : 16].i16 - S1[31 : 16].i16; + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].i16 = S0[15 : 0].i16 - S1[15 : 0].i16 + tmp[31 : 16].i16 = S0[31 : 16].i16 - S1[31 : 16].i16 + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_LSHLREV_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp[31 : 16].u16 = (S1[31 : 16].u16 << S0.u32[19 : 16].u32); + # tmp[15 : 0].u16 = (S1[15 : 0].u16 << S0.u32[3 : 0].u32); + # D0.b32 = tmp.b32 + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[31 : 16].u16 = (S1[31 : 16].u16 << S0.u32[19 : 16].u32) + tmp[15 : 0].u16 = (S1[15 : 0].u16 << S0.u32[3 : 0].u32) + D0.b32 = tmp.b32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_LSHRREV_B16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp[31 : 16].u16 = (S1[31 : 16].u16 >> S0.u32[19 : 16].u32); + # tmp[15 : 0].u16 = (S1[15 : 0].u16 >> S0.u32[3 : 0].u32); + # D0.b32 = tmp.b32 + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[31 : 16].u16 = (S1[31 : 16].u16 >> S0.u32[19 : 16].u32) + tmp[15 : 0].u16 = (S1[15 : 0].u16 >> S0.u32[3 : 0].u32) + D0.b32 = tmp.b32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_ASHRREV_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # tmp[31 : 16].i16 = (S1[31 
: 16].i16 >> S0.u32[19 : 16].u32); + # tmp[15 : 0].i16 = (S1[15 : 0].i16 >> S0.u32[3 : 0].u32); + # D0.b32 = tmp.b32 + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[31 : 16].i16 = (S1[31 : 16].i16 >> S0.u32[19 : 16].u32) + tmp[15 : 0].i16 = (S1[15 : 0].i16 >> S0.u32[3 : 0].u32) + D0.b32 = tmp.b32 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_MAX_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].i16 = S0[15 : 0].i16 >= S1[15 : 0].i16 ? S0[15 : 0].i16 : S1[15 : 0].i16; + # tmp[31 : 16].i16 = S0[31 : 16].i16 >= S1[31 : 16].i16 ? S0[31 : 16].i16 : S1[31 : 16].i16; + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].i16 = ((S0[15 : 0].i16) if (S0[15 : 0].i16 >= S1[15 : 0].i16) else (S1[15 : 0].i16)) + tmp[31 : 16].i16 = ((S0[31 : 16].i16) if (S0[31 : 16].i16 >= S1[31 : 16].i16) else (S1[31 : 16].i16)) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_MIN_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].i16 = S0[15 : 0].i16 < S1[15 : 0].i16 ? S0[15 : 0].i16 : S1[15 : 0].i16; + # tmp[31 : 16].i16 = S0[31 : 16].i16 < S1[31 : 16].i16 ? S0[31 : 16].i16 : S1[31 : 16].i16; + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].i16 = ((S0[15 : 0].i16) if (S0[15 : 0].i16 < S1[15 : 0].i16) else (S1[15 : 0].i16)) + tmp[31 : 16].i16 = ((S0[31 : 16].i16) if (S0[31 : 16].i16 < S1[31 : 16].i16) else (S1[31 : 16].i16)) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_MAD_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].u16 = S0[15 : 0].u16 * S1[15 : 0].u16 + S2[15 : 0].u16; + # tmp[31 : 16].u16 = S0[31 : 16].u16 * S1[31 : 16].u16 + S2[31 : 16].u16; + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + S2 = Reg(s2) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].u16 = S0[15 : 0].u16 * S1[15 : 0].u16 + S2[15 : 0].u16 + tmp[31 : 16].u16 = S0[31 : 16].u16 * S1[31 : 16].u16 + S2[31 : 16].u16 + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_ADD_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].u16 = S0[15 : 0].u16 + S1[15 : 0].u16; + # tmp[31 : 16].u16 = S0[31 : 16].u16 + S1[31 : 16].u16; + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].u16 = S0[15 : 0].u16 + S1[15 : 0].u16 + tmp[31 : 16].u16 = S0[31 : 16].u16 + S1[31 : 16].u16 + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_SUB_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].u16 = S0[15 : 0].u16 - S1[15 : 0].u16; + # tmp[31 : 16].u16 = S0[31 : 16].u16 - S1[31 : 16].u16; + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].u16 = S0[15 : 0].u16 - S1[15 : 
0].u16 + tmp[31 : 16].u16 = S0[31 : 16].u16 - S1[31 : 16].u16 + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_MAX_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].u16 = S0[15 : 0].u16 >= S1[15 : 0].u16 ? S0[15 : 0].u16 : S1[15 : 0].u16; + # tmp[31 : 16].u16 = S0[31 : 16].u16 >= S1[31 : 16].u16 ? S0[31 : 16].u16 : S1[31 : 16].u16; + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].u16 = ((S0[15 : 0].u16) if (S0[15 : 0].u16 >= S1[15 : 0].u16) else (S1[15 : 0].u16)) + tmp[31 : 16].u16 = ((S0[31 : 16].u16) if (S0[31 : 16].u16 >= S1[31 : 16].u16) else (S1[31 : 16].u16)) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_MIN_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].u16 = S0[15 : 0].u16 < S1[15 : 0].u16 ? S0[15 : 0].u16 : S1[15 : 0].u16; + # tmp[31 : 16].u16 = S0[31 : 16].u16 < S1[31 : 16].u16 ? S0[31 : 16].u16 : S1[31 : 16].u16; + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].u16 = ((S0[15 : 0].u16) if (S0[15 : 0].u16 < S1[15 : 0].u16) else (S1[15 : 0].u16)) + tmp[31 : 16].u16 = ((S0[31 : 16].u16) if (S0[31 : 16].u16 < S1[31 : 16].u16) else (S1[31 : 16].u16)) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_FMA_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].f16 = fma(S0[15 : 0].f16, S1[15 : 0].f16, S2[15 : 0].f16); + # tmp[31 : 16].f16 = fma(S0[31 : 16].f16, S1[31 : 16].f16, S2[31 : 16].f16); + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + S2 = Reg(s2) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].f16 = fma(S0[15 : 0].f16, S1[15 : 0].f16, S2[15 : 0].f16) + tmp[31 : 16].f16 = fma(S0[31 : 16].f16, S1[31 : 16].f16, S2[31 : 16].f16) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_ADD_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].f16 = S0[15 : 0].f16 + S1[15 : 0].f16; + # tmp[31 : 16].f16 = S0[31 : 16].f16 + S1[31 : 16].f16; + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].f16 = S0[15 : 0].f16 + S1[15 : 0].f16 + tmp[31 : 16].f16 = S0[31 : 16].f16 + S1[31 : 16].f16 + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_MUL_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].f16 = S0[15 : 0].f16 * S1[15 : 0].f16; + # tmp[31 : 16].f16 = S0[31 : 16].f16 * S1[31 : 16].f16; + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].f16 = S0[15 : 0].f16 * S1[15 : 0].f16 + tmp[31 : 16].f16 = S0[31 : 16].f16 * S1[31 : 16].f16 + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_MIN_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, 
src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].f16 = v_min_f16(S0[15 : 0].f16, S1[15 : 0].f16); + # tmp[31 : 16].f16 = v_min_f16(S0[31 : 16].f16, S1[31 : 16].f16); + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].f16 = v_min_f16(S0[15 : 0].f16, S1[15 : 0].f16) + tmp[31 : 16].f16 = v_min_f16(S0[31 : 16].f16, S1[31 : 16].f16) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +def _VOP3POp_V_PK_MAX_F16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # declare tmp : 32'B; + # tmp[15 : 0].f16 = v_max_f16(S0[15 : 0].f16, S1[15 : 0].f16); + # tmp[31 : 16].f16 = v_max_f16(S0[31 : 16].f16, S1[31 : 16].f16); + # D0.b32 = tmp + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + tmp = Reg(0) + # --- compiled pseudocode --- + tmp[15 : 0].f16 = v_max_f16(S0[15 : 0].f16, S1[15 : 0].f16) + tmp[31 : 16].f16 = v_max_f16(S0[31 : 16].f16, S1[31 : 16].f16) + D0.b32 = tmp + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + return result + +VOP3POp_FUNCTIONS = { + VOP3POp.V_PK_MAD_I16: _VOP3POp_V_PK_MAD_I16, + VOP3POp.V_PK_MUL_LO_U16: _VOP3POp_V_PK_MUL_LO_U16, + VOP3POp.V_PK_ADD_I16: _VOP3POp_V_PK_ADD_I16, + VOP3POp.V_PK_SUB_I16: _VOP3POp_V_PK_SUB_I16, + VOP3POp.V_PK_LSHLREV_B16: _VOP3POp_V_PK_LSHLREV_B16, + VOP3POp.V_PK_LSHRREV_B16: _VOP3POp_V_PK_LSHRREV_B16, + VOP3POp.V_PK_ASHRREV_I16: _VOP3POp_V_PK_ASHRREV_I16, + VOP3POp.V_PK_MAX_I16: _VOP3POp_V_PK_MAX_I16, + VOP3POp.V_PK_MIN_I16: _VOP3POp_V_PK_MIN_I16, + VOP3POp.V_PK_MAD_U16: _VOP3POp_V_PK_MAD_U16, + VOP3POp.V_PK_ADD_U16: _VOP3POp_V_PK_ADD_U16, + VOP3POp.V_PK_SUB_U16: _VOP3POp_V_PK_SUB_U16, + VOP3POp.V_PK_MAX_U16: _VOP3POp_V_PK_MAX_U16, + VOP3POp.V_PK_MIN_U16: _VOP3POp_V_PK_MIN_U16, + VOP3POp.V_PK_FMA_F16: _VOP3POp_V_PK_FMA_F16, + VOP3POp.V_PK_ADD_F16: _VOP3POp_V_PK_ADD_F16, + VOP3POp.V_PK_MUL_F16: _VOP3POp_V_PK_MUL_F16, + VOP3POp.V_PK_MIN_F16: _VOP3POp_V_PK_MIN_F16, + VOP3POp.V_PK_MAX_F16: _VOP3POp_V_PK_MAX_F16, +} + +def _VOPCOp_V_CMPX_LT_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC mask and to VCC or a scalar register. + # EXEC.u64[laneId] = D0.u64[laneId] = S0.i16 < S1.i16; + # // D0 = VCC in VOPC encoding. + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + VCC = Reg(vcc) + EXEC = Reg(exec_mask) + laneId = lane + # --- compiled pseudocode --- + EXEC.u64[laneId] = D0.u64[laneId] = S0.i16 < S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMPX_EQ_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC + # EXEC.u64[laneId] = D0.u64[laneId] = S0.i16 == S1.i16; + # // D0 = VCC in VOPC encoding. 
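# annotation (not part of the generated file): every V_CMPX_* handler in this block
# has the same shape - the per-lane compare bit is written to both EXEC and the
# destination mask D0 (which is VCC in the plain VOPC encoding), and the emulator
# reads it back out of the result dict via the 'exec_lane' and 'vcc_lane' keys.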
+ S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + VCC = Reg(vcc) + EXEC = Reg(exec_mask) + laneId = lane + # --- compiled pseudocode --- + EXEC.u64[laneId] = D0.u64[laneId] = S0.i16 == S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMPX_LE_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = D0.u64[laneId] = S0.i16 <= S1.i16; + # // D0 = VCC in VOPC encoding. + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + VCC = Reg(vcc) + EXEC = Reg(exec_mask) + laneId = lane + # --- compiled pseudocode --- + EXEC.u64[laneId] = D0.u64[laneId] = S0.i16 <= S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMPX_GT_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC mask and to VCC or a scalar register. + # EXEC.u64[laneId] = D0.u64[laneId] = S0.i16 > S1.i16; + # // D0 = VCC in VOPC encoding. + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + VCC = Reg(vcc) + EXEC = Reg(exec_mask) + laneId = lane + # --- compiled pseudocode --- + EXEC.u64[laneId] = D0.u64[laneId] = S0.i16 > S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMPX_NE_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC mask and to VCC or a scalar register. + # EXEC.u64[laneId] = D0.u64[laneId] = S0.i16 <> S1.i16; + # // D0 = VCC in VOPC encoding. + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + VCC = Reg(vcc) + EXEC = Reg(exec_mask) + laneId = lane + # --- compiled pseudocode --- + EXEC.u64[laneId] = D0.u64[laneId] = S0.i16 != S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMPX_GE_I16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = D0.u64[laneId] = S0.i16 >= S1.i16; + # // D0 = VCC in VOPC encoding. + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + VCC = Reg(vcc) + EXEC = Reg(exec_mask) + laneId = lane + # --- compiled pseudocode --- + EXEC.u64[laneId] = D0.u64[laneId] = S0.i16 >= S1.i16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMPX_LT_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC mask and to VCC or a scalar register. + # EXEC.u64[laneId] = D0.u64[laneId] = S0.u16 < S1.u16; + # // D0 = VCC in VOPC encoding. 
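# annotation (not part of the generated file): the 'if VCC._val != vcc' guard in
# these handlers can never fire - the compiled pseudocode writes D0 and EXEC but
# never VCC - so the unconditional 'vcc_lane' assignment from D0 just below it is
# the value that actually lands.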
+ S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + VCC = Reg(vcc) + EXEC = Reg(exec_mask) + laneId = lane + # --- compiled pseudocode --- + EXEC.u64[laneId] = D0.u64[laneId] = S0.u16 < S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMPX_EQ_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # Set the per-lane condition code to 1 iff the first input is equal to the second input. Store the result into the EXEC + # EXEC.u64[laneId] = D0.u64[laneId] = S0.u16 == S1.u16; + # // D0 = VCC in VOPC encoding. + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + VCC = Reg(vcc) + EXEC = Reg(exec_mask) + laneId = lane + # --- compiled pseudocode --- + EXEC.u64[laneId] = D0.u64[laneId] = S0.u16 == S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMPX_LE_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = D0.u64[laneId] = S0.u16 <= S1.u16; + # // D0 = VCC in VOPC encoding. + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + VCC = Reg(vcc) + EXEC = Reg(exec_mask) + laneId = lane + # --- compiled pseudocode --- + EXEC.u64[laneId] = D0.u64[laneId] = S0.u16 <= S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMPX_GT_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC mask and to VCC or a scalar register. + # EXEC.u64[laneId] = D0.u64[laneId] = S0.u16 > S1.u16; + # // D0 = VCC in VOPC encoding. + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + VCC = Reg(vcc) + EXEC = Reg(exec_mask) + laneId = lane + # --- compiled pseudocode --- + EXEC.u64[laneId] = D0.u64[laneId] = S0.u16 > S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMPX_NE_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC mask and to VCC or a scalar register. + # EXEC.u64[laneId] = D0.u64[laneId] = S0.u16 <> S1.u16; + # // D0 = VCC in VOPC encoding. 
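# annotation (not part of the generated file): the ISA pseudocode spells not-equal
# as '<>' (see the preamble comment above); the pcode compiler lowers it to
# Python's '!=' in the compiled body below.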
+ S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + VCC = Reg(vcc) + EXEC = Reg(exec_mask) + laneId = lane + # --- compiled pseudocode --- + EXEC.u64[laneId] = D0.u64[laneId] = S0.u16 != S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +def _VOPCOp_V_CMPX_GE_U16(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + # EXEC.u64[laneId] = D0.u64[laneId] = S0.u16 >= S1.u16; + # // D0 = VCC in VOPC encoding. + S0 = Reg(s0) + S1 = Reg(s1) + D0 = Reg(d0) + VCC = Reg(vcc) + EXEC = Reg(exec_mask) + laneId = lane + # --- compiled pseudocode --- + EXEC.u64[laneId] = D0.u64[laneId] = S0.u16 >= S1.u16 + # --- end pseudocode --- + result = {'d0': D0._val, 'scc': scc & 1} + if VCC._val != vcc: result['vcc_lane'] = (VCC._val >> lane) & 1 + result['exec_lane'] = (EXEC._val >> lane) & 1 + result['vcc_lane'] = (D0._val >> lane) & 1 + result['d0_64'] = True + return result + +VOPCOp_FUNCTIONS = { + VOPCOp.V_CMPX_LT_I16: _VOPCOp_V_CMPX_LT_I16, + VOPCOp.V_CMPX_EQ_I16: _VOPCOp_V_CMPX_EQ_I16, + VOPCOp.V_CMPX_LE_I16: _VOPCOp_V_CMPX_LE_I16, + VOPCOp.V_CMPX_GT_I16: _VOPCOp_V_CMPX_GT_I16, + VOPCOp.V_CMPX_NE_I16: _VOPCOp_V_CMPX_NE_I16, + VOPCOp.V_CMPX_GE_I16: _VOPCOp_V_CMPX_GE_I16, + VOPCOp.V_CMPX_LT_U16: _VOPCOp_V_CMPX_LT_U16, + VOPCOp.V_CMPX_EQ_U16: _VOPCOp_V_CMPX_EQ_U16, + VOPCOp.V_CMPX_LE_U16: _VOPCOp_V_CMPX_LE_U16, + VOPCOp.V_CMPX_GT_U16: _VOPCOp_V_CMPX_GT_U16, + VOPCOp.V_CMPX_NE_U16: _VOPCOp_V_CMPX_NE_U16, + VOPCOp.V_CMPX_GE_U16: _VOPCOp_V_CMPX_GE_U16, +} + + +# V_WRITELANE_B32: Write scalar to specific lane's VGPR (not in PDF pseudocode) +def _VOP3Op_V_WRITELANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): + wr_lane = s1 & 0x1f # lane select (5 bits for wave32) + return {'d0': d0, 'scc': scc, 'vgpr_write': (wr_lane, vdst_idx, s0 & 0xffffffff)} +VOP3Op_FUNCTIONS[VOP3Op.V_WRITELANE_B32] = _VOP3Op_V_WRITELANE_B32 + +COMPILED_FUNCTIONS = { + SOP1Op: SOP1Op_FUNCTIONS, + SOP2Op: SOP2Op_FUNCTIONS, + SOPCOp: SOPCOp_FUNCTIONS, + SOPKOp: SOPKOp_FUNCTIONS, + SOPPOp: SOPPOp_FUNCTIONS, + VOP1Op: VOP1Op_FUNCTIONS, + VOP2Op: VOP2Op_FUNCTIONS, + VOP3Op: VOP3Op_FUNCTIONS, + VOP3POp: VOP3POp_FUNCTIONS, + VOPCOp: VOPCOp_FUNCTIONS, +} + +def get_compiled_functions(): return COMPILED_FUNCTIONS \ No newline at end of file diff --git a/extra/assembly/rdna3/autogen/__init__.py b/extra/assembly/amd/autogen/rdna3/__init__.py similarity index 99% rename from extra/assembly/rdna3/autogen/__init__.py rename to extra/assembly/amd/autogen/rdna3/__init__.py index 05480ee0be..3c9955a9a5 100644 --- a/extra/assembly/rdna3/autogen/__init__.py +++ b/extra/assembly/amd/autogen/rdna3/__init__.py @@ -1,7 +1,7 @@ # autogenerated from AMD RDNA3.5 ISA PDF by lib.py - do not edit from enum import IntEnum from typing import Annotated -from extra.assembly.rdna3.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField +from extra.assembly.amd.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField import functools class SrcEnum(IntEnum): diff --git a/extra/assembly/rdna3/autogen/gen_pcode.py b/extra/assembly/amd/autogen/rdna3/gen_pcode.py similarity index 99% 
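Annotation on the new cdna4 gen_pcode.py that ends a little further up: V_WRITELANE_B32 is hand-written there (the PDF carries no pseudocode for it) and reports its effect through a 'vgpr_write' (lane, vdst, value) triple instead of 'd0'. Below is a minimal sketch of how the emulator side consumes these tables - my own illustration under the handler signature and dict layout shown above, not part of the patch:

# sketch: dispatch through the compiled-pseudocode tables (assumed usage)
from extra.assembly.amd.autogen.cdna4.gen_pcode import get_compiled_functions
from extra.assembly.amd.autogen.cdna4 import VOP1Op

tables = get_compiled_functions()            # {op-enum class: {op: handler}}
handler = tables[VOP1Op][VOP1Op.V_MOV_B32]   # compiled handler for v_mov_b32
# all handlers share one signature:
#   (s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars,
#    src0_idx=0, vdst_idx=0) -> dict of changed state
res = handler(0x1234, 0, 0, 0, 0, 0, lane=0, exec_mask=1, literal=0,
              VGPR=None, _vars={})           # VGPR unused by V_MOV_B32
assert res['d0'] == 0x1234                   # V_MOV_B32: D0.b32 = S0.b32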
rename from extra/assembly/rdna3/autogen/gen_pcode.py rename to extra/assembly/amd/autogen/rdna3/gen_pcode.py index 9366cffdd0..7eb6362cad 100644 --- a/extra/assembly/rdna3/autogen/gen_pcode.py +++ b/extra/assembly/amd/autogen/rdna3/gen_pcode.py @@ -1,9 +1,9 @@ # autogenerated by pcode.py - do not edit -# to regenerate: python -m extra.assembly.rdna3.pcode +# to regenerate: python -m extra.assembly.amd.pcode --arch rdna3 # ruff: noqa: E501,F405,F403 # mypy: ignore-errors -from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp -from extra.assembly.rdna3.pcode import * +from extra.assembly.amd.autogen.rdna3 import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp +from extra.assembly.amd.pcode import * def _SOP1Op_S_MOV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): # D0.b32 = S0.b32 diff --git a/extra/assembly/rdna3/emu.py b/extra/assembly/amd/emu.py similarity index 99% rename from extra/assembly/rdna3/emu.py rename to extra/assembly/amd/emu.py index 1e1ecebc35..521b602e57 100644 --- a/extra/assembly/rdna3/emu.py +++ b/extra/assembly/amd/emu.py @@ -2,10 +2,10 @@ # mypy: ignore-errors from __future__ import annotations import ctypes, os -from extra.assembly.rdna3.lib import Inst, RawImm -from extra.assembly.rdna3.pcode import _f32, _i32, _sext, _f16, _i16, _f64, _i64 -from extra.assembly.rdna3.autogen.gen_pcode import get_compiled_functions -from extra.assembly.rdna3.autogen import ( +from extra.assembly.amd.lib import Inst, RawImm +from extra.assembly.amd.pcode import _f32, _i32, _sext, _f16, _i16, _f64, _i64 +from extra.assembly.amd.autogen.rdna3.gen_pcode import get_compiled_functions +from extra.assembly.amd.autogen.rdna3 import ( SOP1, SOP2, SOPC, SOPK, SOPP, SMEM, VOP1, VOP2, VOP3, VOP3SD, VOP3P, VOPC, DS, FLAT, VOPD, SrcEnum, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, SMEMOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp, DSOp, FLATOp, GLOBALOp, VOPDOp ) diff --git a/extra/assembly/rdna3/lib.py b/extra/assembly/amd/lib.py similarity index 98% rename from extra/assembly/rdna3/lib.py rename to extra/assembly/amd/lib.py index bfa2400e33..20e066b7de 100644 --- a/extra/assembly/rdna3/lib.py +++ b/extra/assembly/amd/lib.py @@ -217,7 +217,7 @@ class Inst: # op may be an enum (from __init__) or an int (from from_int) op_name = op.name if hasattr(op, 'name') else None if op_name is None and self.__class__.__name__ == 'VOP3': - from extra.assembly.rdna3.autogen import VOP3Op + from extra.assembly.amd.autogen.rdna3 import VOP3Op try: op_name = VOP3Op(op).name except ValueError: pass if op_name is None: return False @@ -277,7 +277,7 @@ class Inst: def __hash__(self): return hash((self.__class__.__name__, tuple(sorted((k, repr(v)) for k, v in self._values.items())), self._literal)) def disasm(self) -> str: - from extra.assembly.rdna3.asm import disasm + from extra.assembly.amd.asm import disasm return disasm(self) class Inst32(Inst): pass @@ -459,7 +459,7 @@ def generate(output_path: str | None = None, arch: str = "rdna3") -> dict: def field_key(f): return order.index(f[0].lower()) if f[0].lower() in order else 1000 lines = [f"# autogenerated from AMD {doc_name} ISA PDF by lib.py - do not edit", "from enum import IntEnum", "from typing import Annotated", - "from extra.assembly.rdna3.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField", + 
"from extra.assembly.amd.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField", "import functools", ""] lines += enum_lines("SrcEnum", src_enum) + sum([enum_lines(n, ops) for n, ops in sorted(enums.items())], []) # Format-specific field defaults (verified against LLVM test vectors) @@ -521,5 +521,5 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate instruction definitions from AMD ISA PDF") parser.add_argument("--arch", choices=list(PDF_URLS.keys()), default="rdna3", help="Target architecture (default: rdna3)") args = parser.parse_args() - result = generate("extra/assembly/rdna3/autogen/__init__.py", arch=args.arch) + result = generate(f"extra/assembly/amd/autogen/{args.arch}/__init__.py", arch=args.arch) print(f"generated SrcEnum ({len(result['src_enum'])}) + {len(result['enums'])} opcode enums + {len(result['formats'])} format classes") diff --git a/extra/assembly/rdna3/pcode.py b/extra/assembly/amd/pcode.py similarity index 98% rename from extra/assembly/rdna3/pcode.py rename to extra/assembly/amd/pcode.py index cb0f1967c1..1d6dc0e60c 100644 --- a/extra/assembly/rdna3/pcode.py +++ b/extra/assembly/amd/pcode.py @@ -702,7 +702,7 @@ class ExecContext: # PDF EXTRACTION AND CODE GENERATION # ═══════════════════════════════════════════════════════════════════════════════ -from extra.assembly.rdna3.lib import PDF_URLS +from extra.assembly.amd.lib import PDF_URLS INST_PATTERN = re.compile(r'^([SV]_[A-Z0-9_]+)\s+(\d+)\s*$', re.M) # Patterns that can't be handled by the DSL (require special handling in emu.py) @@ -740,7 +740,7 @@ def parse_pseudocode_from_pdf(arch: str = "rdna3") -> dict: """Parse pseudocode from PDF for all ops. 
Returns {enum_cls: {op: pseudocode}}.""" import pdfplumber from tinygrad.helpers import fetch - from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp + from extra.assembly.amd.autogen.rdna3 import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp OP_ENUMS = [SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp] defined_ops = {} @@ -783,10 +783,10 @@ def parse_pseudocode_from_pdf(arch: str = "rdna3") -> dict: return instructions -def generate_gen_pcode(output_path: str = "extra/assembly/rdna3/autogen/gen_pcode.py", arch: str = "rdna3"): +def generate_gen_pcode(output_path: str = "extra/assembly/amd/autogen/rdna3/gen_pcode.py", arch: str = "rdna3"): """Generate gen_pcode.py - compiled pseudocode functions for the emulator.""" from pathlib import Path - from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp + from extra.assembly.amd.autogen.rdna3 import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp OP_ENUMS = [SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp] @@ -803,12 +803,12 @@ def generate_gen_pcode(output_path: str = "extra/assembly/rdna3/autogen/gen_pcod print(f"Total: {total_found}/{total_ops} ({100*total_found//total_ops}%)") print("\nCompiling to pseudocode functions...") - lines = ['''# autogenerated by pcode.py - do not edit -# to regenerate: python -m extra.assembly.rdna3.pcode + lines = [f'''# autogenerated by pcode.py - do not edit +# to regenerate: python -m extra.assembly.amd.pcode --arch {arch} # ruff: noqa: E501,F405,F403 # mypy: ignore-errors -from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp -from extra.assembly.rdna3.pcode import * +from extra.assembly.amd.autogen.{arch} import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp +from extra.assembly.amd.pcode import * '''] compiled_count, skipped_count = 0, 0 @@ -989,4 +989,4 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate pseudocode functions from AMD ISA PDF") parser.add_argument("--arch", choices=list(PDF_URLS.keys()), default="rdna3", help="Target architecture (default: rdna3)") args = parser.parse_args() - generate_gen_pcode(arch=args.arch) + generate_gen_pcode(output_path=f"extra/assembly/amd/autogen/{args.arch}/gen_pcode.py", arch=args.arch) diff --git a/extra/assembly/rdna3/test/bench_emu.py b/extra/assembly/amd/test/bench_emu.py similarity index 98% rename from extra/assembly/rdna3/test/bench_emu.py rename to extra/assembly/amd/test/bench_emu.py index 9a77ab1ced..1f889329db 100644 --- a/extra/assembly/rdna3/test/bench_emu.py +++ b/extra/assembly/amd/test/bench_emu.py @@ -7,7 +7,7 @@ from typing import Callable # Set AMD=1 before importing tinygrad os.environ["AMD"] = "1" -from extra.assembly.rdna3.emu import run_asm as python_run_asm, set_valid_mem_ranges, decode_program, step_wave, WaveState, WAVE_SIZE +from extra.assembly.amd.emu import run_asm as python_run_asm, set_valid_mem_ranges, decode_program, step_wave, WaveState, WAVE_SIZE REMU_PATH = Path(__file__).parents[3] / "remu/target/release/libremu.so" if not REMU_PATH.exists(): diff --git a/extra/assembly/rdna3/test/external_test_usability.py b/extra/assembly/amd/test/external_test_usability.py 
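Annotation on the pcode.py hunk above: with the --arch parametrization, each architecture now regenerates into its own autogen/<arch>/ tree, e.g. 'python -m extra.assembly.amd.pcode --arch cdna4' writes extra/assembly/amd/autogen/cdna4/gen_pcode.py, and the default rdna3 run keeps its old output path under the new package name.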
similarity index 98% rename from extra/assembly/rdna3/test/external_test_usability.py rename to extra/assembly/amd/test/external_test_usability.py index 5b3827c6c3..4d3c4813e4 100644 --- a/extra/assembly/rdna3/test/external_test_usability.py +++ b/extra/assembly/amd/test/external_test_usability.py @@ -3,8 +3,8 @@ # Currently many of these tests fail - they document desired behavior import unittest -from extra.assembly.rdna3.autogen import * -from extra.assembly.rdna3.lib import Inst, RawImm, SGPR, VGPR +from extra.assembly.amd.autogen.rdna3 import * +from extra.assembly.amd.lib import Inst, RawImm, SGPR, VGPR class TestRegisterSliceSyntax(unittest.TestCase): """ diff --git a/extra/assembly/rdna3/test/helpers.py b/extra/assembly/amd/test/helpers.py similarity index 100% rename from extra/assembly/rdna3/test/helpers.py rename to extra/assembly/amd/test/helpers.py diff --git a/extra/assembly/rdna3/test/test_compare_emulators.py b/extra/assembly/amd/test/test_compare_emulators.py similarity index 98% rename from extra/assembly/rdna3/test/test_compare_emulators.py rename to extra/assembly/amd/test/test_compare_emulators.py index 9455028e26..8a1f32aa18 100644 --- a/extra/assembly/rdna3/test/test_compare_emulators.py +++ b/extra/assembly/amd/test/test_compare_emulators.py @@ -9,8 +9,8 @@ os.environ["AMD"] = "1" os.environ["MOCKGPU"] = "1" os.environ["PYTHON_REMU"] = "1" -from extra.assembly.rdna3.emu import WaveState, decode_program, step_wave, WAVE_SIZE -from extra.assembly.rdna3.test.helpers import KernelInfo +from extra.assembly.amd.emu import WaveState, decode_program, step_wave, WAVE_SIZE, set_valid_mem_ranges +from extra.assembly.amd.test.helpers import KernelInfo REMU_PATH = Path(__file__).parents[3] / "remu/target/release/libremu.so" @@ -223,7 +223,6 @@ def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: t def compare_emulators_multi_kernel(kernels: list[KernelInfo], buf_pool: dict[int, int], max_steps: int = 1000, debug: bool = False, trace_len: int = 10, buf_data: dict[int, bytes] | None = None) -> tuple[bool, str]: """Run all kernels through both emulators with shared buffer pool.""" - from extra.assembly.rdna3.emu import set_valid_mem_ranges, decode_program if buf_data is None: buf_data = {} # Allocate shared buffer pool with padding for over-reads (GPU loads up to 16 bytes at once) @@ -267,8 +266,6 @@ def compare_emulators_multi_kernel(kernels: list[KernelInfo], buf_pool: dict[int def compare_emulators_with_memory(kernel: bytes, n_lanes: int, buf_sizes: list, max_steps: int = 1000, debug: bool = False, global_size: tuple[int, int, int] = (1, 1, 1), trace_len: int = 10) -> tuple[bool, str]: """Run both emulators with memory set up for tinygrad kernels, executing all workgroups. Legacy wrapper.""" - from extra.assembly.rdna3.emu import set_valid_mem_ranges, decode_program - # Allocate buffers buffers = [] for size in buf_sizes: diff --git a/extra/assembly/rdna3/test/test_emu.py b/extra/assembly/amd/test/test_emu.py similarity index 99% rename from extra/assembly/rdna3/test/test_emu.py rename to extra/assembly/amd/test/test_emu.py index 98095d0fde..605ef88eef 100644 --- a/extra/assembly/rdna3/test/test_emu.py +++ b/extra/assembly/amd/test/test_emu.py @@ -6,10 +6,10 @@ Set USE_HW=1 to run on both emulator and real hardware, comparing results. 
""" import ctypes, unittest, os, struct -from extra.assembly.rdna3.autogen import * -from extra.assembly.rdna3.lib import RawImm -from extra.assembly.rdna3.emu import WaveState, run_asm, set_valid_mem_ranges -from extra.assembly.rdna3.pcode import _i32, _f32 +from extra.assembly.amd.autogen.rdna3 import * +from extra.assembly.amd.lib import RawImm +from extra.assembly.amd.emu import WaveState, run_asm, set_valid_mem_ranges +from extra.assembly.amd.pcode import _i32, _f32 VCC = SrcEnum.VCC_LO # For VOP3SD sdst field USE_HW = os.environ.get("USE_HW", "0") == "1" @@ -1776,7 +1776,7 @@ class TestF16Conversions(unittest.TestCase): def test_v_cvt_f16_f32_basic(self): """V_CVT_F16_F32 converts f32 to f16 in low 16 bits.""" - from extra.assembly.rdna3.pcode import _f16 + from extra.assembly.amd.pcode import _f16 instructions = [ v_mov_b32_e32(v[0], 1.0), # f32 1.0 = 0x3f800000 v_cvt_f16_f32_e32(v[1], v[0]), @@ -1789,7 +1789,7 @@ class TestF16Conversions(unittest.TestCase): def test_v_cvt_f16_f32_negative(self): """V_CVT_F16_F32 converts negative f32 to f16.""" - from extra.assembly.rdna3.pcode import _f16 + from extra.assembly.amd.pcode import _f16 instructions = [ v_mov_b32_e32(v[0], -2.0), # f32 -2.0 = 0xc0000000 v_cvt_f16_f32_e32(v[1], v[0]), @@ -1802,7 +1802,7 @@ class TestF16Conversions(unittest.TestCase): def test_v_cvt_f16_f32_small(self): """V_CVT_F16_F32 converts small f32 value.""" - from extra.assembly.rdna3.pcode import _f16, f32_to_f16 + from extra.assembly.amd.pcode import _f16, f32_to_f16 instructions = [ v_mov_b32_e32(v[0], 0.5), v_cvt_f16_f32_e32(v[1], v[0]), @@ -1862,7 +1862,7 @@ class TestF16Conversions(unittest.TestCase): which would produce wrong results when the significant bits of the f32 value are in the upper bits (as they are for most f32 values > 1.0 or < -1.0). """ - from extra.assembly.rdna3.pcode import _f16 + from extra.assembly.amd.pcode import _f16 # Use f32 value 1.5 = 0x3fc00000. If only low 16 bits (0x0000) are read, result is wrong. # Correct f16 result: 0x3e00 (1.5 in half precision) instructions = [ @@ -1886,7 +1886,7 @@ class TestF16Conversions(unittest.TestCase): is in the name), causing it to read only low 16 bits of the f32 input. This resulted in WMMA receiving zero inputs and producing zero outputs. 
""" - from extra.assembly.rdna3.pcode import _f16 + from extra.assembly.amd.pcode import _f16 # Simulate loading two f32 values and converting/packing for WMMA # f32 1.5 = 0x3fc00000, f32 2.5 = 0x40200000 # After CVT: f16 1.5 = 0x3e00, f16 2.5 = 0x4100 @@ -1914,7 +1914,7 @@ class TestF16Conversions(unittest.TestCase): def test_v_pack_b32_f16_basic(self): """V_PACK_B32_F16 packs two f16 values into one 32-bit register.""" - from extra.assembly.rdna3.pcode import _f16 + from extra.assembly.amd.pcode import _f16 instructions = [ # First convert two f32 values to f16 v_mov_b32_e32(v[0], 1.0), # Will become f16 0x3c00 @@ -1934,7 +1934,7 @@ class TestF16Conversions(unittest.TestCase): def test_v_pack_b32_f16_both_positive(self): """V_PACK_B32_F16 packs two positive f16 values.""" - from extra.assembly.rdna3.pcode import _f16 + from extra.assembly.amd.pcode import _f16 instructions = [ v_mov_b32_e32(v[0], 0.5), # f16 0x3800 v_mov_b32_e32(v[2], 2.0), # f16 0x4000 @@ -2186,7 +2186,7 @@ class TestVOP3P(unittest.TestCase): def test_v_pk_add_f16_basic(self): """V_PK_ADD_F16 adds two packed f16 values.""" - from extra.assembly.rdna3.pcode import _f16 + from extra.assembly.amd.pcode import _f16 # v0 = packed (1.0, 2.0), v1 = packed (3.0, 4.0) # Result should be packed (4.0, 6.0) instructions = [ @@ -2209,7 +2209,7 @@ class TestVOP3P(unittest.TestCase): Inline constants for VOP3P are f16 values in the low 16 bits only. The opsel_hi bits (default=0b11) select lo half for hi result, so both halves use the constant. """ - from extra.assembly.rdna3.pcode import _f16 + from extra.assembly.amd.pcode import _f16 # v0 = packed (1.0, 1.0), add POS_ONE # With default opsel_hi=0b11: both lo and hi results use lo half of src1 (the constant) # But opsel_hi=1 means src1 hi comes from lo half - wait, let me check the actual encoding @@ -2230,7 +2230,7 @@ class TestVOP3P(unittest.TestCase): def test_v_pk_mul_f16_basic(self): """V_PK_MUL_F16 multiplies two packed f16 values.""" - from extra.assembly.rdna3.pcode import _f16 + from extra.assembly.amd.pcode import _f16 # v0 = packed (2.0, 3.0), v1 = packed (4.0, 5.0) # Result should be packed (8.0, 15.0) instructions = [ @@ -2251,7 +2251,7 @@ class TestVOP3P(unittest.TestCase): """V_PK_MUL_F16 with inline constant POS_TWO (2.0). Inline constant has value only in low 16 bits, hi is 0. 
""" - from extra.assembly.rdna3.pcode import _f16 + from extra.assembly.amd.pcode import _f16 # v0 = packed (3.0, 4.0), multiply by POS_TWO # lo = 3.0 * 2.0 = 6.0, hi = 4.0 * 0.0 = 0.0 (inline const hi is 0) instructions = [ @@ -2268,7 +2268,7 @@ class TestVOP3P(unittest.TestCase): def test_v_pk_fma_f16_basic(self): """V_PK_FMA_F16: D = A * B + C for packed f16.""" - from extra.assembly.rdna3.pcode import _f16 + from extra.assembly.amd.pcode import _f16 # A = packed (2.0, 3.0), B = packed (4.0, 5.0), C = packed (1.0, 1.0) # Result should be packed (2*4+1=9.0, 3*5+1=16.0) instructions = [ diff --git a/extra/assembly/rdna3/test/test_formats.py b/extra/assembly/amd/test/test_formats.py similarity index 98% rename from extra/assembly/rdna3/test/test_formats.py rename to extra/assembly/amd/test/test_formats.py index c8ae65674f..bae1addfc3 100644 --- a/extra/assembly/rdna3/test/test_formats.py +++ b/extra/assembly/amd/test/test_formats.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """Test MUBUF, MTBUF, MIMG, EXP, DS formats against LLVM.""" import unittest -from extra.assembly.rdna3.autogen import * -from extra.assembly.rdna3.lib import encode_src +from extra.assembly.amd.autogen.rdna3 import * +from extra.assembly.amd.lib import encode_src class TestMUBUF(unittest.TestCase): """Test MUBUF (buffer) instructions.""" @@ -308,7 +308,7 @@ class TestVOP3Literal(unittest.TestCase): def test_vop3_with_literal(self): # v_add3_u32 v5, vcc_hi, 0xaf123456, v255 # GFX11: encoding: [0x05,0x00,0x55,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] - from extra.assembly.rdna3.lib import RawImm + from extra.assembly.amd.lib import RawImm inst = VOP3(VOP3Op.V_ADD3_U32, vdst=v[5], src0=RawImm(107), src1=0xaf123456, src2=v[255]) expected = bytes([0x05,0x00,0x55,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf]) self.assertEqual(inst.to_bytes(), expected) @@ -316,14 +316,14 @@ class TestVOP3Literal(unittest.TestCase): def test_vop3_literal_null_operand(self): # v_add3_u32 v5, null, exec_lo, 0xaf123456 # GFX11: encoding: [0x05,0x00,0x55,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] - from extra.assembly.rdna3.lib import RawImm + from extra.assembly.amd.lib import RawImm inst = VOP3(VOP3Op.V_ADD3_U32, vdst=v[5], src0=NULL, src1=RawImm(126), src2=0xaf123456) expected = bytes([0x05,0x00,0x55,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf]) self.assertEqual(inst.to_bytes(), expected) def test_vop3p_with_literal(self): # Test VOP3P literal encoding (also uses Inst64) - from extra.assembly.rdna3.lib import RawImm + from extra.assembly.amd.lib import RawImm inst = VOP3P(VOP3POp.V_PK_ADD_F16, vdst=v[5], src0=RawImm(240), src1=0x12345678, src2=v[0]) self.assertEqual(len(inst.to_bytes()), 12) # 8 bytes + 4 byte literal diff --git a/extra/assembly/rdna3/test/test_handwritten.py b/extra/assembly/amd/test/test_handwritten.py similarity index 96% rename from extra/assembly/rdna3/test/test_handwritten.py rename to extra/assembly/amd/test/test_handwritten.py index fc0c16a05f..b075ea006a 100644 --- a/extra/assembly/rdna3/test/test_handwritten.py +++ b/extra/assembly/amd/test/test_handwritten.py @@ -2,10 +2,10 @@ # the Inst constructor should be looking at the types of the fields to correctly set the value import unittest, struct -from extra.assembly.rdna3.autogen import * -from extra.assembly.rdna3.lib import Inst -from extra.assembly.rdna3.asm import asm -from extra.assembly.rdna3.test.test_roundtrip import compile_asm +from extra.assembly.amd.autogen.rdna3 import * +from extra.assembly.amd.lib import Inst +from extra.assembly.amd.asm import asm 
+from extra.assembly.amd.test.test_roundtrip import compile_asm class TestIntegration(unittest.TestCase): inst: Inst diff --git a/extra/assembly/rdna3/test/test_integration.py b/extra/assembly/amd/test/test_integration.py similarity index 98% rename from extra/assembly/rdna3/test/test_integration.py rename to extra/assembly/amd/test/test_integration.py index 443f5c923c..6c98976423 100644 --- a/extra/assembly/rdna3/test/test_integration.py +++ b/extra/assembly/amd/test/test_integration.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 """Integration test: round-trip RDNA3 assembly through AMD toolchain.""" import unittest, re, io, sys, subprocess -from extra.assembly.rdna3.autogen import * -from extra.assembly.rdna3.asm import waitcnt, asm -from extra.assembly.rdna3.test.helpers import get_llvm_mc +from extra.assembly.amd.autogen.rdna3 import * +from extra.assembly.amd.asm import waitcnt, asm +from extra.assembly.amd.test.helpers import get_llvm_mc def disassemble(lib: bytes, arch: str = "gfx1100") -> str: """Disassemble ELF binary using tinygrad's compiler, return raw output.""" diff --git a/extra/assembly/rdna3/test/test_llvm.py b/extra/assembly/amd/test/test_llvm.py similarity index 98% rename from extra/assembly/rdna3/test/test_llvm.py rename to extra/assembly/amd/test/test_llvm.py index fec84f02a0..97b1cab756 100644 --- a/extra/assembly/rdna3/test/test_llvm.py +++ b/extra/assembly/amd/test/test_llvm.py @@ -2,9 +2,9 @@ """Test RDNA3 assembler/disassembler against LLVM test vectors.""" import unittest, re, subprocess from tinygrad.helpers import fetch -from extra.assembly.rdna3.autogen import * -from extra.assembly.rdna3.asm import asm -from extra.assembly.rdna3.test.helpers import get_llvm_mc +from extra.assembly.amd.autogen.rdna3 import * +from extra.assembly.amd.asm import asm +from extra.assembly.amd.test.helpers import get_llvm_mc LLVM_BASE = "https://raw.githubusercontent.com/llvm/llvm-project/main/llvm/test/MC/AMDGPU" diff --git a/extra/assembly/rdna3/test/test_mockgpu_invalid.py b/extra/assembly/amd/test/test_mockgpu_invalid.py similarity index 100% rename from extra/assembly/rdna3/test/test_mockgpu_invalid.py rename to extra/assembly/amd/test/test_mockgpu_invalid.py diff --git a/extra/assembly/rdna3/test/test_pcode.py b/extra/assembly/amd/test/test_pcode.py similarity index 97% rename from extra/assembly/rdna3/test/test_pcode.py rename to extra/assembly/amd/test/test_pcode.py index a1474f3792..b9b9c0395c 100644 --- a/extra/assembly/rdna3/test/test_pcode.py +++ b/extra/assembly/amd/test/test_pcode.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """Tests for the RDNA3 pseudocode DSL.""" import unittest -from extra.assembly.rdna3.pcode import Reg, TypedView, SliceProxy, ExecContext, compile_pseudocode, _expr, MASK32, MASK64, _f32, _i32, _f16, _i16, f32_to_f16, _isnan -from extra.assembly.rdna3.autogen.gen_pcode import _VOP3SDOp_V_DIV_SCALE_F32, _VOPCOp_V_CMP_CLASS_F32 +from extra.assembly.amd.pcode import Reg, TypedView, SliceProxy, ExecContext, compile_pseudocode, _expr, MASK32, MASK64, _f32, _i32, _f16, _i16, f32_to_f16, _isnan +from extra.assembly.amd.autogen.rdna3.gen_pcode import _VOP3SDOp_V_DIV_SCALE_F32, _VOPCOp_V_CMP_CLASS_F32 class TestReg(unittest.TestCase): def test_u32_read(self): diff --git a/extra/assembly/rdna3/test/test_pdf_parser.py b/extra/assembly/amd/test/test_pdf_parser.py similarity index 98% rename from extra/assembly/rdna3/test/test_pdf_parser.py rename to extra/assembly/amd/test/test_pdf_parser.py index 1097b525b7..8158ed9651 100644 --- 
a/extra/assembly/rdna3/test/test_pdf_parser.py +++ b/extra/assembly/amd/test/test_pdf_parser.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """Test that PDF parser correctly extracts format fields.""" import unittest, os -from extra.assembly.rdna3.autogen import ( +from extra.assembly.amd.autogen.rdna3 import ( SOP1, SOP2, SOPK, SOPP, VOP1, VOP2, VOP3SD, VOPC, FLAT, VOPD, SOP1Op, SOP2Op, VOP1Op, VOP3Op ) @@ -41,7 +41,7 @@ class TestPDFParserGenerate(unittest.TestCase): def test_pdf_parser(self): """Single test that validates all PDF parser outputs.""" - from extra.assembly.rdna3.lib import generate + from extra.assembly.amd.lib import generate result = generate() # test_all_formats_present diff --git a/extra/assembly/rdna3/test/test_rdna3_asm.py b/extra/assembly/amd/test/test_rdna3_asm.py similarity index 96% rename from extra/assembly/rdna3/test/test_rdna3_asm.py rename to extra/assembly/amd/test/test_rdna3_asm.py index 45e7948988..74ba86d9f7 100644 --- a/extra/assembly/rdna3/test/test_rdna3_asm.py +++ b/extra/assembly/amd/test/test_rdna3_asm.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import unittest, subprocess -from extra.assembly.rdna3.autogen import * -from extra.assembly.rdna3.test.helpers import get_llvm_mc +from extra.assembly.amd.autogen.rdna3 import * +from extra.assembly.amd.test.helpers import get_llvm_mc def llvm_assemble(asm: str) -> bytes: """Assemble using llvm-mc and return bytes.""" diff --git a/extra/assembly/rdna3/test/test_roundtrip.py b/extra/assembly/amd/test/test_roundtrip.py similarity index 97% rename from extra/assembly/rdna3/test/test_roundtrip.py rename to extra/assembly/amd/test/test_roundtrip.py index c8a21bdb3c..938ceb48aa 100644 --- a/extra/assembly/rdna3/test/test_roundtrip.py +++ b/extra/assembly/amd/test/test_roundtrip.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 """Roundtrip tests: generate tinygrad kernels, decode instructions, re-encode, verify match.""" import unittest, io, sys, re, subprocess, os -from extra.assembly.rdna3.autogen import * -from extra.assembly.rdna3.lib import Inst -from extra.assembly.rdna3.asm import asm -from extra.assembly.rdna3.test.helpers import get_llvm_mc, get_llvm_objdump +from extra.assembly.amd.autogen.rdna3 import * +from extra.assembly.amd.lib import Inst +from extra.assembly.amd.asm import asm +from extra.assembly.amd.test.helpers import get_llvm_mc, get_llvm_objdump # Instruction format detection based on encoding bits def detect_format(data: bytes) -> type[Inst] | None: @@ -140,7 +140,7 @@ class TestTinygradKernelRoundtrip(unittest.TestCase): 2. asm(disasm()) matches LLVM output 3. 
our disasm() matches LLVM's disassembly string exactly """ - from extra.assembly.rdna3.test.test_compare_emulators import get_kernels_from_tinygrad + from extra.assembly.amd.test.test_compare_emulators import get_kernels_from_tinygrad from tinygrad.runtime.support.compiler_amd import HIPCompiler kernels, _, _ = get_kernels_from_tinygrad(op_fn) diff --git a/extra/assembly/rocm/.gitignore b/extra/assembly/rocm/.gitignore deleted file mode 100644 index 5b7b00aaa5..0000000000 --- a/extra/assembly/rocm/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -*.deb -build -src -sniffer/sniff.so diff --git a/extra/assembly/rocm/NOTES b/extra/assembly/rocm/NOTES deleted file mode 100644 index e86a63c7d9..0000000000 --- a/extra/assembly/rocm/NOTES +++ /dev/null @@ -1,20 +0,0 @@ -Built ROCT-Thunk-Interface (hsakmt) - hsakmt-roct-dev_5.4.4.99999-local_amd64.deb - note: installs to /opt/rocm -Built ROCm-Device-Libs - Works with ROCM_PATH=/home/tiny/build/ROCm-Device-Libs/build/dist - rocm-device-libs_1.0.0.99999-local_amd64.deb -Built ROCm-CompilerSupport (amd_comgr) - no deb, sudo make install to /usr/local -Built ROCR-Runtime - hsa-rocr_1.8.0-local_amd64.deb - hsa-rocr-dev_1.8.0-local_amd64.deb -Built ROCm-OpenCL-Runtime - rocm-ocl-icd_2.0.0-local_amd64.deb - ISSUE: these depend on "comgr" - rocm-opencl_2.0.0-local_amd64.deb - rocm-opencl-dev_2.0.0-local_amd64.deb - Did sudo make install - - - diff --git a/extra/assembly/rocm/kernel_crashes/dump b/extra/assembly/rocm/kernel_crashes/dump deleted file mode 100644 index cc09d4decc..0000000000 --- a/extra/assembly/rocm/kernel_crashes/dump +++ /dev/null @@ -1,41 +0,0 @@ -# run two "rocm-bandwidth-test" in a loop -# amdgpu-6.0.5-1581431.20.04 -# fixed in kernel 6.2.14 - -[ 72.153646] RIP: 0010:pm_send_runlist+0x4a/0x630 [amdgpu] -[ 72.153815] Code: 30 65 48 8b 04 25 28 00 00 00 48 89 45 d0 31 c0 80 fb 01 0f 87 aa 9d 49 00 83 e3 01 0f 85 1c 05 00 00 49 8b 3f b8 01 00 00 00 <48> 8b 97 30 01 00 00 44 8b b7 6c 01 00 00 8b 9f 70 01 00 00 8b 8a -[ 72.153900] RSP: 0018:ffffb48445c03c30 EFLAGS: 00010246 -[ 72.153928] RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000 -[ 72.153962] RDX: 000000000000007b RSI: ffff9395e1562558 RDI: 0000000000000000 -[ 72.153996] RBP: ffffb48445c03cb8 R08: 0000000000000000 R09: 0000000000000001 -[ 72.154030] R10: ffff9395c900d840 R11: 0000000000000000 R12: 0000000000000000 -[ 72.154065] R13: ffff9395c9e00400 R14: 0000000000000001 R15: ffff9395e15624e0 -[ 72.154099] FS: 00007f345c6463c0(0000) GS:ffff93a4aee80000(0000) knlGS:0000000000000000 -[ 72.154137] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 -[ 72.154165] CR2: 0000000000000130 CR3: 0000000112840000 CR4: 0000000000750ee0 -[ 72.154201] PKRU: 55555554 -[ 72.154215] Call Trace: -[ 72.154230] -[ 72.154244] map_queues_cpsch+0x75/0xc0 [amdgpu] -[ 72.154365] debug_map_and_unlock+0x51/0x90 [amdgpu] -[ 72.154480] debug_refresh_runlist+0x1f/0x30 [amdgpu] -[ 72.154591] kfd_dbg_runtime_disable+0x13c/0x240 [amdgpu] -[ 72.154705] kfd_ioctl_dbg_set_debug_trap+0x69d/0x8b0 [amdgpu] -[ 72.154820] kfd_ioctl+0x24a/0x5b0 [amdgpu] -[ 72.154925] ? kfd_ioctl_create_queue+0x770/0x770 [amdgpu] -[ 72.155035] ? syscall_exit_to_user_mode+0x27/0x50 -[ 72.155061] ? exit_to_user_mode_prepare+0x3d/0x1c0 -[ 72.155088] __x64_sys_ioctl+0x95/0xd0 -[ 72.155109] do_syscall_64+0x5c/0xc0 -[ 72.155128] ? syscall_exit_to_user_mode+0x27/0x50 -[ 72.155151] ? 
do_syscall_64+0x69/0xc0 -[ 72.155172] entry_SYSCALL_64_after_hwframe+0x61/0xcb -[ 72.155198] RIP: 0033:0x7f345c7f63ab -[ 72.155218] Code: 0f 1e fa 48 8b 05 e5 7a 0d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b5 7a 0d 00 f7 d8 64 89 01 48 -[ 72.155301] RSP: 002b:00007ffc97cc89f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 -[ 72.155339] RAX: ffffffffffffffda RBX: 00007ffc97cc8a30 RCX: 00007f345c7f63ab -[ 72.155375] RDX: 00007ffc97cc8a30 RSI: 00000000c0284b82 RDI: 0000000000000003 -[ 72.155411] RBP: 00000000c0284b82 R08: 0000000000000000 R09: 0000000000000000 -[ 72.155447] R10: 00007f345cd4ddb0 R11: 0000000000000246 R12: 00007ffc97cc8a30 -[ 72.155481] R13: 0000000000000003 R14: 00007ffc97cc8d20 R15: 0000000000000000 -[ 72.155517] diff --git a/extra/assembly/rocm/kernel_crashes/dump2 b/extra/assembly/rocm/kernel_crashes/dump2 deleted file mode 100644 index 8cd24ccfce..0000000000 --- a/extra/assembly/rocm/kernel_crashes/dump2 +++ /dev/null @@ -1,41 +0,0 @@ -# run two tinygrad matrix example in a loop -# amdgpu-6.0.5-1581431.20.04 -# NOT fixed in kernel 6.2.14 - -[ 553.016624] gmc_v11_0_process_interrupt: 30 callbacks suppressed -[ 553.016631] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:9 pasid:32770, for process python3 pid 10001 thread python3 pid 10001) -[ 553.016790] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f0000000000 from client 10 -[ 553.016892] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00901A30 -[ 553.016974] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd) -[ 553.017051] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x0 -[ 553.017111] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0 -[ 553.017173] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3 -[ 553.017238] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0 -[ 553.017300] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0 -[ 553.123921] [drm:mes_v11_0_submit_pkt_and_poll_completion.constprop.0 [amdgpu]] *ERROR* MES failed to response msg=2 -[ 553.124153] amdgpu: failed to add hardware queue to MES, doorbell=0x1a16 -[ 553.124195] amdgpu: MES might be in unrecoverable state, issue a GPU reset -[ 553.124237] amdgpu: Failed to restore queue 2 -[ 553.124266] amdgpu: Failed to restore process queues -[ 553.124270] amdgpu: Failed to evict queue 3 -[ 553.124297] amdgpu: amdgpu_amdkfd_restore_userptr_worker: Failed to resume KFD - -# alternative crash in kernel 6.2.14 - -[ 151.097948] gmc_v11_0_process_interrupt: 30 callbacks suppressed -[ 151.097953] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:8 pasid:32771, for process python3 pid 7525 thread python3 pid 7525) -[ 151.097993] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f0000000000 from client 10 -[ 151.098008] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00801A30 -[ 151.098020] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd) -[ 151.098032] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x0 -[ 151.098042] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0 -[ 151.098052] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3 -[ 151.098062] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0 -[ 151.098071] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0 -[ 151.209517] [drm:mes_v11_0_submit_pkt_and_poll_completion.constprop.0 [amdgpu]] *ERROR* MES failed to response msg=2 -[ 151.209724] amdgpu: failed to add hardware queue to MES, doorbell=0x1002 -[ 151.209734] amdgpu: MES might be 
in unrecoverable state, issue a GPU reset -[ 151.209743] amdgpu: Failed to restore queue 1 -[ 151.209751] amdgpu: Failed to restore process queues -[ 151.209759] amdgpu: amdgpu_amdkfd_restore_userptr_worker: Failed to resume KFD -[ 151.209858] amdgpu 0000:0b:00.0: amdgpu: GPU reset begin! diff --git a/extra/assembly/rocm/kernel_crashes/dump3 b/extra/assembly/rocm/kernel_crashes/dump3 deleted file mode 100644 index ea7ec1d76b..0000000000 --- a/extra/assembly/rocm/kernel_crashes/dump3 +++ /dev/null @@ -1,20 +0,0 @@ -# two tinygrad + two bandwidth test -# RDNA2, driver 6.0.5 -# recovered from this! - -[ 136.971209] gmc_v10_0_process_interrupt: 39 callbacks suppressed -[ 136.971218] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:11 pasid:32773, for process rocm-bandwidth- pid 20281 thread rocm-bandwidth- pid 20281) -[ 136.971228] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f5c2b800000 from client 0x1b (UTCL2) -[ 136.971232] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00B01A31 -[ 136.971233] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd) -[ 136.971235] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x1 -[ 136.971236] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0 -[ 136.971236] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3 -[ 136.971237] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0 -[ 136.971238] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0 -... -[ 136.993979] amdgpu 0000:0b:00.0: amdgpu: IH ring buffer overflow (0x000BE5A0, 0x0003C480, 0x0003E5C0) -[ 138.209072] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00004000 flags=0x0000] -[ 138.209078] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00004d80 flags=0x0000] -[ 138.209081] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00005000 flags=0x0000] -[ 138.209084] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00005d80 flags=0x0000] diff --git a/extra/assembly/rocm/kernel_crashes/dump4 b/extra/assembly/rocm/kernel_crashes/dump4 deleted file mode 100644 index 8c1fa909ac..0000000000 --- a/extra/assembly/rocm/kernel_crashes/dump4 +++ /dev/null @@ -1,33 +0,0 @@ -# ROCK-Kernel-Driver 0b579de9622f5c93021dcb7927d13926313740a2 -# non fatal "crash" - -[ 127.418045] ------------[ cut here ]------------ -[ 127.418046] User pages unexpectedly invalid -[ 127.418056] WARNING: CPU: 16 PID: 260 at drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:3000 amdgpu_amdkfd_restore_userptr_worker+0x4d9/0x500 [amdgpu] -[ 127.418235] Modules linked in: rfcomm cmac algif_hash algif_skcipher af_alg bnep nls_iso8859_1 iwlmvm mac80211 intel_rapl_msr intel_rapl_common edac_mce_amd snd_hda_codec_realtek snd_hda_codec_generic snd_hda_codec_hdmi kvm_amd binfmt_misc snd_hda_intel snd_intel_dspcfg kvm libarc4 snd_intel_sdw_acpi snd_hda_codec btusb iwlwifi btrtl snd_hda_core btbcm btintel irqbypass btmtk snd_hwdep crct10dif_pclmul snd_pcm polyval_clmulni bluetooth snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq polyval_generic cfg80211 ghash_clmulni_intel eeepc_wmi snd_seq_device snd_timer aesni_intel asus_wmi ecdh_generic snd platform_profile crypto_simd ledtrig_audio cryptd ecc ccp soundcore sparse_keymap rapl k10temp wmi_bmof mac_hid sch_fq_codel msr parport_pc ppdev lp parport ramoops pstore_blk efi_pstore reed_solomon pstore_zone ip_tables x_tables autofs4 amdgpu hid_generic usbhid hid i2c_algo_bit drm_ttm_helper ttm video iommu_v2 drm_buddy gpu_sched 
drm_display_helper drm_kms_helper syscopyarea -[ 127.418276] sysfillrect sysimgblt fb_sys_fops drm nvme nvme_core cec r8169 ahci crc32_pclmul rc_core i2c_piix4 xhci_pci libahci nvme_common xhci_pci_renesas realtek wmi -[ 127.418284] CPU: 16 PID: 260 Comm: kworker/16:1 Tainted: G W 6.0.0 #4 -[ 127.418286] Hardware name: System manufacturer System Product Name/TUF GAMING X570-PLUS (WI-FI), BIOS 3603 03/20/2021 -[ 127.418287] Workqueue: events amdgpu_amdkfd_restore_userptr_worker [amdgpu] -[ 127.418455] RIP: 0010:amdgpu_amdkfd_restore_userptr_worker+0x4d9/0x500 [amdgpu] -[ 127.418601] Code: ff e8 2b 8a 96 d1 e9 66 fe ff ff 48 c7 c7 40 4f f5 c0 e8 56 7b 8a d1 0f 0b e9 2e ff ff ff 48 c7 c7 d8 d0 ed c0 e8 43 7b 8a d1 <0f> 0b e9 0a fe ff ff 4c 89 ef e8 f8 89 96 d1 e9 cb fd ff ff e8 ce -[ 127.418603] RSP: 0018:ffffb36740a83dc8 EFLAGS: 00010282 -[ 127.418604] RAX: 0000000000000000 RBX: ffff9d159ee9df30 RCX: 0000000000000027 -[ 127.418605] RDX: 0000000000000027 RSI: ffffb36740a83c88 RDI: ffff9d242a220568 -[ 127.418606] RBP: ffffb36740a83e58 R08: ffff9d242a220560 R09: 0000000000000001 -[ 127.418607] R10: 0000000000000001 R11: 0000000000000020 R12: ffff9d159ee9df98 -[ 127.418607] R13: ffff9d159ee9df70 R14: ffff9d159ee9dee0 R15: ffff9d159ee9dee0 -[ 127.418608] FS: 0000000000000000(0000) GS:ffff9d242a200000(0000) knlGS:0000000000000000 -[ 127.418609] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 -[ 127.418610] CR2: 00007fd5d4715000 CR3: 0000000120ffe000 CR4: 0000000000750ee0 -[ 127.418611] PKRU: 55555554 -[ 127.418611] Call Trace: -[ 127.418612] -[ 127.418613] process_one_work+0x21f/0x3f0 -[ 127.418615] worker_thread+0x4a/0x3c0 -[ 127.418617] ? process_one_work+0x3f0/0x3f0 -[ 127.418618] kthread+0xf0/0x120 -[ 127.418619] ? kthread_complete_and_exit+0x20/0x20 -[ 127.418620] ret_from_fork+0x22/0x30 -[ 127.418622] -[ 127.418623] ---[ end trace 0000000000000000 ]--- \ No newline at end of file diff --git a/extra/assembly/rocm/rdna3/asm.py b/extra/assembly/rocm/rdna3/asm.py deleted file mode 100644 index 9c65fa7360..0000000000 --- a/extra/assembly/rocm/rdna3/asm.py +++ /dev/null @@ -1,80 +0,0 @@ -import numpy as np -import pathlib -from hexdump import hexdump -from tinygrad.helpers import colored -from extra.helpers import enable_early_exec -early_exec = enable_early_exec() - -from tinygrad.runtime.ops_cl import CLProgram, CLBuffer, ROCM_LLVM_PATH - -ENABLE_NON_ASM = False - -WMMA = True -DUAL_ALU = True -F32 = True - -if ENABLE_NON_ASM: - buf = CLBuffer.fromCPU(np.zeros(10, np.float32)) - prg_empty = CLProgram("code", "__kernel void code(__global float *a) { a[0] = 1; }") - asm_real = prg_empty.binary() - with open("/tmp/cc.elf", "wb") as f: - f.write(asm_real) - prg_empty([1], [1], buf, wait=True) - print(buf.toCPU()) - -print(colored("creating CLBuffer", "green")) -buf = CLBuffer.fromCPU(np.zeros(10, np.float32)) -code = open(pathlib.Path(__file__).parent / "prog.s", "r").read() - -gen = [] -FLOPS = 0 -MAX_REG = 251 -for j in range(1): - if WMMA: - KY, KX = 4, 4 - for y in range(KY): - for x in range(KX): - c = (y*KX+x)*8 - a = (KY*KX*8) + y*8 - b = (KY*KX*8) + (KY*8) + x*8 - gen.append(f"v_wmma_f32_16x16x16_f16 v[{c}:{c+7}], v[{a}:{a+7}], v[{b}:{b+7}], v[{c}:{c+7}]") - FLOPS += 16*8*2 - else: - for i in range(0, MAX_REG, 6): - if DUAL_ALU: - if F32: - gen.append(f"v_dual_fmac_f32 v{i+0}, v{i+1}, v{i+2} :: v_dual_fmac_f32 v{i+3}, v{i+4}, v{i+5}") - FLOPS += 4 - else: - gen.append(f"v_dual_dot2acc_f32_f16 v{i+0}, v{i+1}, v{i+2} :: v_dual_dot2acc_f32_f16 v{i+3}, v{i+4}, v{i+5}") - FLOPS += 8 - else: - 
assert F32 - gen.append(f"v_fmac_f32 v{i+0}, v{i+1}, v{i+2}") - gen.append(f"v_fmac_f32 v{i+3}, v{i+4}, v{i+5}") -code = code.replace("// FLOPS", '\n'.join(gen)) -print(code) - - -# fix: COMGR failed to get code object ISA name. set triple to 'amdgcn-amd-amdhsa' - -object = early_exec(([ROCM_LLVM_PATH / "llvm-mc", '--arch=amdgcn', '--mcpu=gfx1100', '--triple=amdgcn-amd-amdhsa', '--filetype=obj', '-'], code.encode("utf-8"))) -asm = early_exec(([ROCM_LLVM_PATH / "ld.lld", "/dev/stdin", "-o", "/dev/stdout", "--pie"], object)) - -with open("/tmp/cc2.o", "wb") as f: - f.write(object) -with open("/tmp/cc2.elf", "wb") as f: - f.write(asm) - -print(colored("creating CLProgram", "green")) -prg = CLProgram("code", asm) - -print(colored("running program", "green")) -G = 512 -FLOPS *= 100000*G*G # loop * global_size -for i in range(3): - tm = prg(buf, global_size=[G//256, G, 1], local_size=[256, 1, 1], wait=True) - print(f"ran in {tm*1e3:.2f} ms, {FLOPS/(tm*1e9):.2f} GFLOPS") - -print(colored("transferring buffer", "green")) -print(buf.toCPU()) diff --git a/extra/assembly/rocm/rdna3/prog.s b/extra/assembly/rocm/rdna3/prog.s deleted file mode 100644 index 38efca5407..0000000000 --- a/extra/assembly/rocm/rdna3/prog.s +++ /dev/null @@ -1,80 +0,0 @@ -.global _start -_start: -.rodata -.align 0x10 -.global code.kd -.type code.kd,STT_OBJECT -# amd_kernel_code_t (must be at 0x440 for kernel_code_entry_byte_offset to be right) -code.kd: -# amd_kernel_..., amd_machine_... -.long 0,0,0,0 -# kernel_code_entry_byte_offset, kernel_code_prefetch_byte_offset -.long 0x00000bc0,0x00000000,0x00000000,0x00000000 -# kernel_code_prefetch_byte_size, max_scratch_backing_memory_byte_size -.long 0,0,0,0 -# compute_pgm_rsrc1, compute_pgm_rsrc2, kernel_code_properties, workitem_private_segment_byte_size -.long 0x60af0000,0x0000009e,0x00000408,0x00000000 -# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32 | AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64 -# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP | AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE -# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT = 0xF -# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X -# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR = 1 -# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_RESERVED1 = 1 -.text -.global code -.type code,STT_FUNC -code: -# https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state -# s[0:1] contains the kernarg_address -# TODO: can we use s[2:3] if this was really a wave since we only alloced 2 SGPRs? -s_load_b64 s[2:3], s[0:1], null - -s_mov_b32 s8, 0 -loop: -s_addk_i32 s8, 1 -s_cmp_eq_u32 s8, 100000 -// FLOPS -s_cbranch_scc0 loop - -# wait for the s_load_b64 -s_waitcnt lgkmcnt(0) - -v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 2.0 -global_store_b32 v0, v1, s[2:3] - -# Deallocate all VGPRs for this wave. Use only when next instruction is S_ENDPGM. 
-s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -s_endpgm -s_code_end - -.amdgpu_metadata -amdhsa.kernels: - - .args: - - .address_space: global - .name: a - .offset: 0 - .size: 8 - .type_name: 'float*' - .value_kind: global_buffer - .group_segment_fixed_size: 0 - .kernarg_segment_align: 8 - .kernarg_segment_size: 8 - .language: OpenCL C - .language_version: - - 1 - - 2 - .max_flat_workgroup_size: 256 - .name: code - .private_segment_fixed_size: 0 - .sgpr_count: 2 - .sgpr_spill_count: 0 - .symbol: code.kd - .uses_dynamic_stack: false - .vgpr_count: 256 - .vgpr_spill_count: 0 - .wavefront_size: 32 -amdhsa.target: amdgcn-amd-amdhsa--gfx1100 -amdhsa.version: - - 1 - - 2 -.end_amdgpu_metadata diff --git a/extra/assembly/rocm/rocm_clone.sh b/extra/assembly/rocm/rocm_clone.sh deleted file mode 100755 index bff5e938b7..0000000000 --- a/extra/assembly/rocm/rocm_clone.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -mkdir -p src -cd src -git clone https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface.git -b rocm-5.5.0 -git clone https://github.com/RadeonOpenCompute/ROCm-Device-Libs.git -b rocm-5.5.0 -git clone https://github.com/RadeonOpenCompute/llvm-project.git -b rocm-5.5.0 --depth 1 -git clone https://github.com/RadeonOpenCompute/ROCR-Runtime.git -b rocm-5.5.0 -git clone https://github.com/ROCm-Developer-Tools/ROCclr.git -b rocm-5.5.0 -git clone https://github.com/RadeonOpenCompute/ROCm-CompilerSupport.git -b rocm-5.5.0 -git clone https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git -b rocm-5.5.0 -cd ../ \ No newline at end of file diff --git a/extra/assembly/rocm/rocm_from_scratch.sh b/extra/assembly/rocm/rocm_from_scratch.sh deleted file mode 100755 index 2be75a351c..0000000000 --- a/extra/assembly/rocm/rocm_from_scratch.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -mkdir -p build/debs -cd build - -# ROCT-Thunk-Interface (hsakmt) -if [ ! -f debs/hsakmt-roct-dev_5.5.0.99999-local_amd64.deb ] -then - mkdir -p ROCT-Thunk-Interface - cd ROCT-Thunk-Interface - cmake ../../src/ROCT-Thunk-Interface - make -j32 package - cp hsakmt-roct-dev_5.5.0.99999-local_amd64.deb ../debs - cd ../ -fi - - -# build custom LLVM -if [ ! -f llvm-project/bin/clang ] -then - mkdir -p llvm-project - cd llvm-project - cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="llvm;clang;lld" -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" ../../src/llvm-project/llvm - make -j32 - cd .. -fi - -# use custom LLVM -export PATH="$PWD/llvm-project/bin:$PATH" - -# ROCm-Device-Libs -if [ ! -f debs/rocm-device-libs_1.0.0.99999-local_amd64.deb ] -then - mkdir -p ROCm-Device-Libs - cd ROCm-Device-Libs - cmake ../../src/ROCm-Device-Libs - make -j32 package - cp rocm-device-libs_1.0.0.99999-local_amd64.deb ../debs - cd ../ -fi - -# ROCR-Runtime -if [ ! -f debs/hsa-rocr_1.8.0-local_amd64.deb ] -then - mkdir -p ROCR-Runtime - cd ROCR-Runtime - cmake ../../src/ROCR-Runtime/src - make -j32 package - cp hsa-rocr_1.8.0-local_amd64.deb ../debs - cp hsa-rocr-dev_1.8.0-local_amd64.deb ../debs - cd ../ -fi - -# ROCm-OpenCL-Runtime (needs ROCclr) -if [ ! 
-f debs/rocm-opencl_2.0.0-local_amd64.deb ] -then - mkdir -p ROCm-OpenCL-Runtime - cd ROCm-OpenCL-Runtime - cmake ../../src/ROCm-OpenCL-Runtime - make -j32 package - cp rocm-opencl_2.0.0-local_amd64.deb ../debs - cp rocm-opencl-dev_2.0.0-local_amd64.deb ../debs - cp rocm-ocl-icd_2.0.0-local_amd64.deb ../debs -fi - -# ROCm-CompilerSupport (broken) -#mkdir -p ROCm-CompilerSupport -#cd ROCm-CompilerSupport -#cmake ../../src/ROCm-CompilerSupport/lib/comgr -#make -j32 \ No newline at end of file diff --git a/extra/assembly/rocm/rocm_setup.sh b/extra/assembly/rocm/rocm_setup.sh deleted file mode 100755 index 880105cb8c..0000000000 --- a/extra/assembly/rocm/rocm_setup.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -rm amdgpu-install_5.5.50500-1_all.deb -wget https://repo.radeon.com/amdgpu-install/5.5/ubuntu/$(lsb_release -cs)/amdgpu-install_5.5.50500-1_all.deb -sudo dpkg -i amdgpu-install_5.5.50500-1_all.deb -sudo apt-get update - -# kernel driver -sudo apt-get install amdgpu-dkms - -# for opencl -sudo apt-get install rocm-opencl-runtime - -# for HIP -sudo apt-get install hip-runtime-amd rocm-device-libs hip-dev diff --git a/extra/assembly/rocm/sniffer/build.sh b/extra/assembly/rocm/sniffer/build.sh deleted file mode 100755 index 86fcc05fa2..0000000000 --- a/extra/assembly/rocm/sniffer/build.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -e -clang sniff.cc -Werror -shared -fPIC -I../src/ -I../src/ROCT-Thunk-Interface/include -I../src/ROCm-Device-Libs/ockl/inc -o sniff.so -lstdc++ -#AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so /home/tiny/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld -#AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so $HOME/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld -#AMD_LOG_LEVEL=5 LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py -DEBUG=5 LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py -#AMD_LOG_LEVEL=5 HSAKMT_DEBUG_LEVEL=7 DEBUG=5 LD_PRELOAD=$PWD/sniff.so strace -F python3 ../rdna3/asm.py -#LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py -#AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so FORWARD_ONLY=1 DEBUG=2 python3 ../../../test/test_ops.py TestOps.test_add -#AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so rocm-bandwidth-test -s 0 -d 1 -m 1 -#AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so rocm-bandwidth-test -s 1 -d 2 -m 1 diff --git a/extra/assembly/rocm/sniffer/sniff.cc b/extra/assembly/rocm/sniffer/sniff.cc deleted file mode 100644 index 9527059190..0000000000 --- a/extra/assembly/rocm/sniffer/sniff.cc +++ /dev/null @@ -1,282 +0,0 @@ -// template copied from https://github.com/geohot/cuda_ioctl_sniffer/blob/master/sniff.cc - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <signal.h> -#include <ucontext.h> -#include <sys/mman.h> - -#include <dlfcn.h> - -// includes from the ROCm sources -#include -#include -#include -#include -using namespace rocr::AMD; - -#include <map> -#include <string> -std::map<int, std::string> files; -std::map<uint64_t, uint64_t> ring_base_addresses; - -#define D(args...)
fprintf(stderr, args) - -uint64_t doorbell_offset = -1; -std::map<uint64_t, int> queue_types; - -void hexdump(void *d, int l) { - for (int i = 0; i < l; i++) { - if (i%0x10 == 0 && i != 0) printf("\n"); - if (i%0x10 == 8) printf(" "); - if (i%0x10 == 0) printf("%8X: ", i); - printf("%2.2X ", ((uint8_t*)d)[i]); - } - printf("\n"); -} - -extern "C" { - -// https://defuse.ca/online-x86-assembler.htm#disassembly2 -static void handler(int sig, siginfo_t *si, void *unused) { - ucontext_t *u = (ucontext_t *)unused; - uint8_t *rip = (uint8_t*)u->uc_mcontext.gregs[REG_RIP]; - - int store_size = 0; - uint64_t value; - if (rip[0] == 0x48 && rip[1] == 0x89 && rip[2] == 0x30) { - // 0: 48 89 30 mov QWORD PTR [rax],rsi - store_size = 8; - value = u->uc_mcontext.gregs[REG_RSI]; - u->uc_mcontext.gregs[REG_RIP] += 3; - } else if (rip[0] == 0x4c && rip[1] == 0x89 && rip[2] == 0x28) { - // 0: 4c 89 28 mov QWORD PTR [rax],r13 - store_size = 8; - value = u->uc_mcontext.gregs[REG_R13]; - u->uc_mcontext.gregs[REG_RIP] += 3; - } else { - D("segfault %02X %02X %02X %02X %02X %02X %02X %02X rip: %p addr: %p\n", rip[0], rip[1], rip[2], rip[3], rip[4], rip[5], rip[6], rip[7], rip, si->si_addr); - D("rax: %llx rcx: %llx rdx: %llx rsi: %llx rbx: %llx\n", u->uc_mcontext.gregs[REG_RAX], u->uc_mcontext.gregs[REG_RCX], u->uc_mcontext.gregs[REG_RDX], u->uc_mcontext.gregs[REG_RSI], u->uc_mcontext.gregs[REG_RBX]); - exit(-1); - } - - uint64_t ring_base_address = ring_base_addresses[((uint64_t)si->si_addr)&0xFFF]; - int queue_type = queue_types[((uint64_t)si->si_addr)&0xFFF]; - D("%16p: \u001b[31mDING DONG\u001b[0m (queue_type %d) store(%d): 0x%8lx -> %p ring_base_address:0x%lx\n", rip, queue_type, store_size, value, si->si_addr, ring_base_address); - - if (queue_type == KFD_IOC_QUEUE_TYPE_SDMA) { - uint8_t *sdma_ptr = (uint8_t*)(ring_base_address); - while (sdma_ptr < ((uint8_t*)(ring_base_address)+value)) { - D("0x%3lx: ", sdma_ptr-(uint8_t*)(ring_base_address)); - if (sdma_ptr[0] == SDMA_OP_TIMESTAMP) { - D("SDMA_PKT_TIMESTAMP\n"); - sdma_ptr += sizeof(SDMA_PKT_TIMESTAMP); - } else if (sdma_ptr[0] == SDMA_OP_GCR) { - D("SDMA_PKT_GCR\n"); - sdma_ptr += sizeof(SDMA_PKT_GCR); - } else if (sdma_ptr[0] == SDMA_OP_ATOMIC) { - D("SDMA_PKT_ATOMIC\n"); - sdma_ptr += sizeof(SDMA_PKT_ATOMIC); - } else if (sdma_ptr[0] == SDMA_OP_FENCE) { - D("SDMA_PKT_FENCE\n"); - sdma_ptr += sizeof(SDMA_PKT_FENCE); - } else if (sdma_ptr[0] == SDMA_OP_TRAP) { - D("SDMA_PKT_TRAP\n"); - sdma_ptr += sizeof(SDMA_PKT_TRAP); - } else if (sdma_ptr[0] == SDMA_OP_COPY && sdma_ptr[1] == SDMA_SUBOP_COPY_LINEAR) { - SDMA_PKT_COPY_LINEAR *pkt = (SDMA_PKT_COPY_LINEAR *)sdma_ptr; - D("SDMA_PKT_COPY_LINEAR: count:0x%x src:0x%lx dst:0x%lx\n", pkt->COUNT_UNION.count+1, - (uint64_t)pkt->SRC_ADDR_LO_UNION.src_addr_31_0 | ((uint64_t)pkt->SRC_ADDR_HI_UNION.src_addr_63_32 << 32), - (uint64_t)pkt->DST_ADDR_LO_UNION.dst_addr_31_0 | ((uint64_t)pkt->DST_ADDR_HI_UNION.dst_addr_63_32 << 32) - ); - sdma_ptr += sizeof(SDMA_PKT_COPY_LINEAR); - } else { - D("unhandled packet type %d %d, exiting\n", sdma_ptr[0], sdma_ptr[1]); - break; - } - } - - //hexdump((void*)(ring_base_address), 0x100); - } else if (queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) { - hsa_kernel_dispatch_packet_t *pkt = (hsa_kernel_dispatch_packet_t *)(ring_base_address+value*0x40); - if ((pkt->header&0xFF) == HSA_PACKET_TYPE_KERNEL_DISPATCH) { - D("HSA_PACKET_TYPE_KERNEL_DISPATCH -- setup:%d workgroup[%d, %d, %d] grid[%d, %d, %d] kernel_object:0x%lx kernarg_address:%p\n", pkt->setup, pkt->workgroup_size_x, pkt->workgroup_size_y, 
pkt->workgroup_size_z, pkt->grid_size_x, pkt->grid_size_y, pkt->grid_size_z, pkt->kernel_object, pkt->kernarg_address); - amd_kernel_code_t *code = (amd_kernel_code_t *)pkt->kernel_object; - D("kernel_code_entry_byte_offset:%lx\n", code->kernel_code_entry_byte_offset); - uint32_t *kernel_code = (uint32_t*)(pkt->kernel_object + code->kernel_code_entry_byte_offset); - int code_len = 0; - while (kernel_code[code_len] != 0xbf9f0000 && kernel_code[code_len] != 0) code_len++; - hexdump(kernel_code, code_len*4); - /*FILE *f = fopen("/tmp/kernel_code", "wb"); - fwrite(kernel_code, 4, code_len, f); - fclose(f); - system("python -c 'print(\" \".join([(\"0x%02X\"%x) for x in open(\"/tmp/kernel_code\", \"rb\").read()]))' | ../build/llvm-project/bin/llvm-mc --disassemble --arch=amdgcn --mcpu=gfx1100 --show-encoding");*/ - D("kernargs (kernarg_segment_byte_size:0x%lx)\n", code->kernarg_segment_byte_size); - // get length - int i; - for (i = 0; i < 0x400; i+=0x10) { - if (memcmp((void*)((uint64_t)pkt->kernarg_address+i), "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", 0x10) == 0) break; - } - hexdump((void*)pkt->kernarg_address, i+0x10); - } else if ((pkt->header&0xFF) == HSA_PACKET_TYPE_BARRIER_AND) { - hsa_barrier_and_packet_t *pkt_and = (hsa_barrier_and_packet_t *)(ring_base_address+value*0x40); - D("HSA_PACKET_TYPE_BARRIER_AND completion_signal:0x%lx\n", pkt_and->completion_signal.handle); - //hexdump((void*)(ring_base_address+value*0x40), 0x40); - } else if ((pkt->header&0xFF) == HSA_PACKET_TYPE_VENDOR_SPECIFIC) { - D("HSA_PACKET_TYPE_VENDOR_SPECIFIC\n"); - hexdump((void*)(ring_base_address+value*0x40), 0x40); - } else { - hexdump((void*)(ring_base_address+value*0x40), 0x40); - } - } - - mprotect((void *)((uint64_t)si->si_addr & ~0xFFF), 0x2000, PROT_READ | PROT_WRITE); - if (store_size == 8) { - *(volatile uint64_t*)(si->si_addr) = value; - } else if (store_size == 4) { - *(volatile uint32_t*)(si->si_addr) = value; - } else if (store_size == 2) { - *(volatile uint16_t*)(si->si_addr) = value; - } else { - D("store size not supported\n"); - exit(-1); - } - mprotect((void *)((uint64_t)si->si_addr & ~0xFFF), 0x2000, PROT_NONE); -} - -void register_sigsegv_handler() { - struct sigaction sa = {0}; - sa.sa_flags = SA_SIGINFO; - sigemptyset(&sa.sa_mask); - sa.sa_sigaction = handler; - if (sigaction(SIGSEGV, &sa, NULL) == -1) { - D("ERROR: failed to register sigsegv handler"); - exit(-1); - } - // NOTE: python (or ocl runtime?) 
blocks the SIGSEGV signal - sigset_t x; - sigemptyset(&x); - sigaddset(&x, SIGSEGV); - sigprocmask(SIG_UNBLOCK, &x, NULL); -} - -int (*my_open)(const char *pathname, int flags, mode_t mode); -#undef open -int open(const char *pathname, int flags, mode_t mode) { - if (my_open == NULL) my_open = reinterpret_cast<decltype(my_open)>(dlsym(RTLD_NEXT, "open")); - int ret = my_open(pathname, flags, mode); - //D("open %s (0o%o) = %d\n", pathname, flags, ret); - files[ret] = pathname; - return ret; -} - - -int (*my_open64)(const char *pathname, int flags, mode_t mode); -#undef open -int open64(const char *pathname, int flags, mode_t mode) { - if (my_open64 == NULL) my_open64 = reinterpret_cast<decltype(my_open64)>(dlsym(RTLD_NEXT, "open64")); - int ret = my_open64(pathname, flags, mode); - //D("open %s (0o%o) = %d\n", pathname, flags, ret); - files[ret] = pathname; - return ret; -} - -void *(*my_mmap)(void *addr, size_t length, int prot, int flags, int fd, off_t offset); -#undef mmap -void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) { - if (my_mmap == NULL) my_mmap = reinterpret_cast<decltype(my_mmap)>(dlsym(RTLD_NEXT, "mmap")); - void *ret = my_mmap(addr, length, prot, flags, fd, offset); - - if (doorbell_offset != -1 && offset == doorbell_offset) { - D("HIDDEN DOORBELL %p, handled by %p\n", addr, handler); - register_sigsegv_handler(); - mprotect(addr, length, PROT_NONE); - } - - if (fd != -1) D("mmapped %p (target %p) with flags 0x%x length 0x%zx fd %d %s offset 0x%lx\n", ret, addr, flags, length, fd, files[fd].c_str(), offset); - return ret; -} - -void *(*my_mmap64)(void *addr, size_t length, int prot, int flags, int fd, off_t offset); -#undef mmap64 -void *mmap64(void *addr, size_t length, int prot, int flags, int fd, off_t offset) { return mmap(addr, length, prot, flags, fd, offset); } - -int ioctl_num = 1; -int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL; -#undef ioctl -int ioctl(int filedes, unsigned long request, void *argp) { - if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl")); - int ret = 0; - ret = my_ioctl(filedes, request, argp); - if (!files.count(filedes)) return ret; - - uint8_t type = (request >> 8) & 0xFF; - uint8_t nr = (request >> 0) & 0xFF; - uint16_t size = (request >> 16) & 0xFFF; - - D("%3d: %d = %3d(%20s) 0x%3x ", ioctl_num, ret, filedes, files[filedes].c_str(), size); - - if (request == AMDKFD_IOC_SET_EVENT) { - kfd_ioctl_set_event_args *args = (kfd_ioctl_set_event_args *)argp; - D("AMDKFD_IOC_SET_EVENT event_id:%d", args->event_id); - } else if (request == AMDKFD_IOC_ALLOC_MEMORY_OF_GPU) { - kfd_ioctl_alloc_memory_of_gpu_args *args = (kfd_ioctl_alloc_memory_of_gpu_args *)argp; - D("AMDKFD_IOC_ALLOC_MEMORY_OF_GPU va_addr:0x%llx size:0x%llx handle:%llX gpu_id:0x%x", args->va_addr, args->size, args->handle, args->gpu_id); - } else if (request == AMDKFD_IOC_MAP_MEMORY_TO_GPU) { - kfd_ioctl_map_memory_to_gpu_args *args = (kfd_ioctl_map_memory_to_gpu_args *)argp; - D("AMDKFD_IOC_MAP_MEMORY_TO_GPU handle:%llX", args->handle); - } else if (request == AMDKFD_IOC_CREATE_EVENT) { - kfd_ioctl_create_event_args *args = (kfd_ioctl_create_event_args *)argp; - D("AMDKFD_IOC_CREATE_EVENT event_page_offset:0x%llx event_type:%d event_id:%d", args->event_page_offset, args->event_type, args->event_id); - } else if (request == AMDKFD_IOC_WAIT_EVENTS) { - D("AMDKFD_IOC_WAIT_EVENTS"); - } else if (request == AMDKFD_IOC_SET_XNACK_MODE) { - D("AMDKFD_IOC_SET_XNACK_MODE"); - } else if (request == AMDKFD_IOC_SVM || (type == 0x4b && nr == 0x20)) { - // NOTE: this one is 
variable length - kfd_ioctl_svm_args *args = (kfd_ioctl_svm_args *)argp; - D("AMDKFD_IOC_SVM start_addr:0x%llx size:0x%llx op:%d", args->start_addr, args->size, args->op); - } else if (request == AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU) { - kfd_ioctl_unmap_memory_from_gpu_args *args = (kfd_ioctl_unmap_memory_from_gpu_args *)argp; - D("AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU handle:%llX", args->handle); - } else if (request == AMDKFD_IOC_FREE_MEMORY_OF_GPU) { - D("AMDKFD_IOC_FREE_MEMORY_OF_GPU"); - } else if (request == AMDKFD_IOC_SET_SCRATCH_BACKING_VA) { - D("AMDKFD_IOC_SET_SCRATCH_BACKING_VA"); - } else if (request == AMDKFD_IOC_GET_TILE_CONFIG) { - D("AMDKFD_IOC_GET_TILE_CONFIG"); - } else if (request == AMDKFD_IOC_SET_TRAP_HANDLER) { - D("AMDKFD_IOC_SET_TRAP_HANDLER"); - } else if (request == AMDKFD_IOC_GET_VERSION) { - kfd_ioctl_get_version_args *args = (kfd_ioctl_get_version_args *)argp; - D("AMDKFD_IOC_GET_VERSION major_version:%d minor_version:%d", args->major_version, args->minor_version); - } else if (request == AMDKFD_IOC_GET_PROCESS_APERTURES_NEW) { - D("AMDKFD_IOC_GET_PROCESS_APERTURES_NEW"); - } else if (request == AMDKFD_IOC_ACQUIRE_VM) { - D("AMDKFD_IOC_ACQUIRE_VM"); - } else if (request == AMDKFD_IOC_SET_MEMORY_POLICY) { - D("AMDKFD_IOC_SET_MEMORY_POLICY"); - } else if (request == AMDKFD_IOC_GET_CLOCK_COUNTERS) { - D("AMDKFD_IOC_GET_CLOCK_COUNTERS"); - } else if (request == AMDKFD_IOC_CREATE_QUEUE) { - kfd_ioctl_create_queue_args *args = (kfd_ioctl_create_queue_args *)argp; - D("AMDKFD_IOC_CREATE_QUEUE\n"); - D("queue_type:%d ring_base_address:0x%llx\n", args->queue_type, args->ring_base_address); - D("eop_buffer_address:0x%llx ctx_save_restore_address:0x%llx\n", args->eop_buffer_address, args->ctx_save_restore_address); - D("ring_size:0x%x queue_priority:%d\n", args->ring_size, args->queue_priority); - D("RETURNS write_pointer_address:0x%llx read_pointer_address:0x%llx doorbell_offset:0x%llx queue_id:%d\n", args->write_pointer_address, args->read_pointer_address, args->doorbell_offset, args->queue_id); - //D("RETURNS *write_pointer_address:0x%llx *read_pointer_address:0x%llx\n", *(uint64_t*)args->write_pointer_address, *(uint64_t*)args->read_pointer_address); - ring_base_addresses[args->doorbell_offset&0xFFF] = args->ring_base_address; - queue_types[args->doorbell_offset&0xFFF] = args->queue_type; - doorbell_offset = args->doorbell_offset&~0xFFF; - } else { - D("type:0x%x nr:0x%x size:0x%x", type, nr, size); - } - - D("\n"); - ioctl_num++; - return ret; -} - -} diff --git a/extra/remu/test/hwtest.py b/extra/remu/test/hwtest.py index 0d769099e9..19d10909fd 100644 --- a/extra/remu/test/hwtest.py +++ b/extra/remu/test/hwtest.py @@ -11,8 +11,8 @@ from tinygrad.runtime.support.compiler_amd import amdgpu_disassemble from tinygrad.renderer import ProgramSpec from tinygrad.engine.realize import CompiledRunner -from extra.assembly.rdna3.autogen import * -from extra.assembly.rdna3.asm import waitcnt +from extra.assembly.amd.autogen.rdna3 import * +from extra.assembly.amd.asm import waitcnt from test.testextra.test_cfg_viz import template def get_output(asm:list, n_threads:int=1, vdst:VGPR=v[1]): diff --git a/test/mockgpu/helpers.py b/test/mockgpu/helpers.py index 39b8daa336..e83098cea1 100644 --- a/test/mockgpu/helpers.py +++ b/test/mockgpu/helpers.py @@ -21,7 +21,7 @@ class PythonRemu: rsrc2: int = 0x19c # Default: USER_SGPR_COUNT=14, enable X and Y workgroup IDs def run_asm(self, lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int) -> int: - from 
extra.assembly.rdna3.emu import run_asm, set_valid_mem_ranges + from extra.assembly.amd.emu import run_asm, set_valid_mem_ranges # Pad ranges to handle GPU loads that may read past small buffers (e.g. s_load_b128 on 12-byte buffer) set_valid_mem_ranges({(start, size + 4096) for start, size in self.valid_mem_ranges}) return run_asm(lib, lib_sz, gx, gy, gz, lx, ly, lz, args_ptr, self.rsrc2) diff --git a/test/testextra/test_cfg_viz.py b/test/testextra/test_cfg_viz.py index eb8a2b68f1..329e11268f 100644 --- a/test/testextra/test_cfg_viz.py +++ b/test/testextra/test_cfg_viz.py @@ -10,7 +10,7 @@ from tinygrad.renderer import ProgramSpec from tinygrad.helpers import TracingKey, getenv from tinygrad.engine.realize import ExecItem, CompiledRunner -from extra.assembly.rdna3.autogen import * +from extra.assembly.amd.autogen.rdna3 import * # TODO: use the RDNA3 renderer when it's in master template = """.text