diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8c5f411aba..0400dee2b5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -684,6 +684,9 @@ jobs: run: AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=auto test/test_dtype_alu.py test/test_dtype.py - name: Run RDNA3 dtype tests (AMD_LLVM=1) run: AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=auto test/test_dtype_alu.py test/test_dtype.py + # TODO: run all once emulator is faster + - name: Run RDNA3 ops tests + run: SKIP_SLOW_TEST=1 AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=auto test/test_ops.py -k "test_sparse_categorical_crossentropy or test_tril" testamdautogen: name: AMD autogen diff --git a/extra/assembly/amd/README b/extra/assembly/amd/README index d9cb00c8d4..e065336c78 100644 --- a/extra/assembly/amd/README +++ b/extra/assembly/amd/README @@ -11,6 +11,8 @@ Test with `PYTHONPATH="." pytest -n12 extra/assembly/amd/` The code should be as readable and deduplicated as possible. asm and emu shouldn't be required for dsl. +The autogen folder is autogenerated from the AMD PDFs with `python3 -m extra.assembly.amd.pdf --arch all` + test_emu.py has a good set of instruction tests for the emulation, with USE_HW=1 it will compare to real hardware. Whenever an instruction is fixed, regression tests should be added here and confirmed with real hardware. @@ -26,6 +28,12 @@ The ops tests also pass, but they are very slow, so you should run them one at a `SKIP_SLOW_TEST=1 PYTHONPATH="." AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/test_ops.py` `SKIP_SLOW_TEST=1 PYTHONPATH="." AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/test_ops.py` -When something is caught by main tinygrad tests, a local regression test should be added to `extra/assembly/amd/test`. While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware, so if a test is failing with `AMD=1 PYTHON_REMU=1 MOCKGPU=1` it's likely because an instruction is emulated incorrectly. You can test without `MOCKGPU=1` to test on real hardware, if it works on real hardware there's a bug in the emulator. +When something is caught by main tinygrad tests, a local regression test should be added to `extra/assembly/amd/test`. +While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware. +If a test is failing with `AMD=1 PYTHON_REMU=1 MOCKGPU=1`, it's because an instruction is emulated incorrectly. +You can run without `MOCKGPU=1` to test on real hardware; if it works there, the bug is in the emulator. +IMPORTANT: if a test is failing in the emulator, it's an instruction bug. Use DEBUG=7, get the instructions, and debug. Currently, only RDNA3 is well supported, but when finished, this will support RDNA3+RDNA4+CDNA in ~2000 lines.
+Get line count with `cloc --by-file extra/assembly/amd/*.py` + diff --git a/extra/assembly/amd/autogen/cdna/gen_pcode.py b/extra/assembly/amd/autogen/cdna/gen_pcode.py index d6d79c1a84..efa2bd407c 100644 --- a/extra/assembly/amd/autogen/cdna/gen_pcode.py +++ b/extra/assembly/amd/autogen/cdna/gen_pcode.py @@ -1,9 +1,9 @@ # autogenerated by pdf.py - do not edit # to regenerate: python -m extra.assembly.amd.pdf --arch cdna -# ruff: noqa: E501,F405,F403 +# ruff: noqa: E501 # mypy: ignore-errors -from extra.assembly.amd.autogen.cdna.enum import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3POp, VOPCOp, VOP3AOp, VOP3BOp -from extra.assembly.amd.pcode import * +from extra.assembly.amd.autogen.cdna.enum import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3POp, VOPCOp, VOP3AOp, VOP3BOp, DSOp, FLATOp, GLOBALOp, SCRATCHOp +from extra.assembly.amd.pcode import ABSDIFF, BYTE_PERMUTE, DENORM, F, INF, OVERFLOW_F32, OVERFLOW_F64, PI, ROUND_MODE, Reg, SAT8, TWO_OVER_PI_1201, UNDERFLOW_F32, UNDERFLOW_F64, WAVE_MODE, _pack, _pack32, bf16_to_f32, cos, cvtToQuietNAN, exponent, f16_to_f32, f16_to_i16, f16_to_snorm, f16_to_u16, f16_to_unorm, f32_to_bf16, f32_to_f16, f32_to_f64, f32_to_i32, f32_to_snorm, f32_to_u32, f32_to_u8, f32_to_unorm, f64_to_f32, f64_to_i32, f64_to_u32, floor, fma, fract, i16_to_f16, i32_to_f32, i32_to_f64, i32_to_i16, isEven, isNAN, isQuietNAN, isSignalNAN, ldexp, log2, mantissa, pow, s_ff1_i32_b64, sign, signext, signext_from_bit, sin, sqrt, trunc, u16_to_f16, u32_to_f32, u32_to_f64, u32_to_u16, u4_to_u32, u8_to_u32, v_max3_f16, v_max3_f32, v_max3_i16, v_max3_i32, v_max3_u16, v_max3_u32, v_max_f16, v_max_f32, v_max_i16, v_max_i32, v_max_u16, v_max_u32, v_min3_f16, v_min3_f32, v_min_f16, v_min_f32, v_min_i16, v_min_i32, v_min_u16, v_min_u32, v_msad_u8, v_sad_u8 def _SOP1Op_S_MOV_B32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): D0.b32 = S0.b32 @@ -976,6 +976,10 @@ def _SOPPOp_S_CBRANCH_CDBGSYS_AND_USER(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, l return {'PC': PC} def _SOPPOp_S_SET_GPR_IDX_MODE(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): + SIMM16 = Reg(literal) + VDST = Reg(vdst_idx) + # --- compiled pseudocode --- + # (descriptive text from the PDF, kept as a comment so the file stays valid Python) SIMM16[1] = VSRC1_REL, SIMM16[2] = VSRC2_REL and SIMM16[3] = VDST_REL.
return {} SOPPOp_FUNCTIONS = { @@ -1387,21 +1391,6 @@ def _VOP1Op_V_CVT_PK_F32_BF8(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VG D0[63 : 32].f32 = bf8_to_f32(tmp[15 : 8].bf8) return {'D0': D0} -def _VOP1Op_V_PERMLANE16_SWAP_B32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - SRC0 = Reg(src0_idx) - # --- compiled pseudocode --- - for pass_ in range(0, int(1)+1): - for lane in range(0, int(15)+1): - tmp = Reg(VGPR[pass_ * 32 + lane][SRC0.u32]) - return {} - -def _VOP1Op_V_PERMLANE32_SWAP_B32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - SRC0 = Reg(src0_idx) - # --- compiled pseudocode --- - for lane in range(0, int(31)+1): - tmp = Reg(VGPR[lane][SRC0.u32]) - return {} - def _VOP1Op_V_CVT_F32_BF16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): D0.f32 = F(_pack(S0.b16, 0)) return {'D0': D0} @@ -1484,8 +1473,6 @@ VOP1Op_FUNCTIONS = { VOP1Op.V_CVT_F32_BF8: _VOP1Op_V_CVT_F32_BF8, VOP1Op.V_CVT_PK_F32_FP8: _VOP1Op_V_CVT_PK_F32_FP8, VOP1Op.V_CVT_PK_F32_BF8: _VOP1Op_V_CVT_PK_F32_BF8, - VOP1Op.V_PERMLANE16_SWAP_B32: _VOP1Op_V_PERMLANE16_SWAP_B32, - VOP1Op.V_PERMLANE32_SWAP_B32: _VOP1Op_V_PERMLANE32_SWAP_B32, VOP1Op.V_CVT_F32_BF16: _VOP1Op_V_CVT_F32_BF16, } @@ -3076,13 +3063,12 @@ def _VOPCOp_V_CMPX_GE_U64(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, return {'D0': D0, 'EXEC': EXEC} def _VOPCOp_V_CMPX_T_U64(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): + VDST = Reg(vdst_idx) + # --- compiled pseudocode --- EXEC.u64[laneId] = D0.u64[laneId] = 1 - addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) - tmp = Reg(MEM[addr].u32) - addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) - tmp = Reg(MEM[addr].u32) - addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) - tmp = Reg(MEM[addr].u32) + # OFFSET0 = Unsigned byte offset added to the address from the ADDR VGPR. + # OFFSET1 = Unsigned byte offset added to the address from the ADDR VGPR. + # VDST = Destination VGPR 0-255. return {'D0': D0, 'EXEC': EXEC} VOPCOp_FUNCTIONS = { @@ -4147,13 +4133,12 @@ def _VOP3AOp_V_CMPX_GE_U64(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR return {'D0': D0} def _VOP3AOp_V_CMPX_T_U64(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): + VDST = Reg(vdst_idx) + # --- compiled pseudocode --- EXEC.u64[laneId] = D0.u64[laneId] = 1 - addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) - tmp = Reg(MEM[addr].u32) - addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) - tmp = Reg(MEM[addr].u32) - addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) - tmp = Reg(MEM[addr].u32) + # OFFSET0 = Unsigned byte offset added to the address from the ADDR VGPR. + # OFFSET1 = Unsigned byte offset added to the address from the ADDR VGPR. + # VDST = Destination VGPR 0-255.
return {'D0': D0} def _VOP3AOp_V_MOV_B32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): @@ -5385,36 +5370,6 @@ def _VOP3AOp_V_DOT2C_F32_BF16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, V D0.f32 = tmp return {'D0': D0} -def _VOP3AOp_V_CVT_SCALEF32_PK_FP8_F32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - tmp = Reg(0) - # --- compiled pseudocode --- - scale = (exponent(S2.f32)) - tmp0 = f32_to_fp8_scale(S0.f32, scale.u8) - tmp1 = f32_to_fp8_scale(S1.f32, scale.u8) - dstword = OPSEL[3].i32 * 16 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_PK_BF8_F32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - tmp = Reg(0) - # --- compiled pseudocode --- - scale = (exponent(S2.f32)) - tmp0 = f32_to_bf8_scale(S0.f32, scale.u8) - tmp1 = f32_to_bf8_scale(S1.f32, scale.u8) - dstword = OPSEL[3].i32 * 16 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_SR_FP8_F32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - scale = (exponent(S2.f32)) - tmp = Reg(f32_to_fp8_sr_scale(S0.f32, S1.u32, scale.u8)) - dstbyte = OPSEL[3 : 2].i32 * 8 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_SR_BF8_F32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - scale = (exponent(S2.f32)) - tmp = Reg(f32_to_bf8_sr_scale(S0.f32, S1.u32, scale.u8)) - dstbyte = OPSEL[3 : 2].i32 * 8 - return {} - def _VOP3AOp_V_CVT_SCALEF32_PK_F32_FP8(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): tmp = Reg(0) SRC0 = Reg(src0_idx) @@ -5455,25 +5410,6 @@ def _VOP3AOp_V_CVT_SCALEF32_F32_BF8(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, lite tmp = Reg(bf8_to_f32_scale(src, scale.u8)) return {} -def _VOP3AOp_V_CVT_SCALEF32_PK_FP4_F32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - tmp = Reg(0) - # --- compiled pseudocode --- - scale = (exponent(S2.f32)) - tmp0 = f32_to_fp4_scale(S0.f32, scale.u8) - tmp1 = f32_to_fp4_scale(S1.f32, scale.u8) - dstbyte = OPSEL[3 : 2].i32 * 8 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_SR_PK_FP4_F32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - tmp = Reg(0) - # --- compiled pseudocode --- - scale = (exponent(S2.f32)) - randomVal = S1.u32 - tmp0 = f32_to_fp4_sr_scale(S0[31 : 0].f32, randomVal, scale.u8) - tmp1 = f32_to_fp4_sr_scale(S0[63 : 32].f32, randomVal, scale.u8) - dstbyte = OPSEL[3 : 2].i32 * 8 - return {} - def _VOP3AOp_V_CVT_SCALEF32_PK_F32_FP4(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): tmp = Reg(0) SRC0 = Reg(src0_idx) @@ -5485,66 +5421,6 @@ def _VOP3AOp_V_CVT_SCALEF32_PK_F32_FP4(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, l D0[63 : 32].f32 = tmp1 return {'D0': D0} -def _VOP3AOp_V_CVT_SCALEF32_PK_FP8_F16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - tmp = Reg(0) - # --- compiled pseudocode --- - scale = (exponent(S1.f32)) - tmp0 = f16_to_fp8_scale(S0[15 : 0].f16, scale.u8) - tmp1 = f16_to_fp8_scale(S0[31 : 16].f16, scale.u8) - dstword = OPSEL[3].i32 * 16 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_PK_BF8_F16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - tmp = Reg(0) - # --- compiled pseudocode --- - scale = (exponent(S1.f32)) - tmp0 = f16_to_bf8_scale(S0[15 : 0].f16, scale.u8) - tmp1 = f16_to_bf8_scale(S0[31 : 16].f16, scale.u8) - dstword = OPSEL[3].i32 * 16 - return {} - -def 
_VOP3AOp_V_CVT_SCALEF32_SR_FP8_F16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - scale = (exponent(S2.f32)) - tmp = Reg(f16_to_fp8_sr_scale(S0.f16, S1.u32, scale.u8)) - dstbyte = OPSEL[3 : 2].i32 * 8 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_SR_BF8_F16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - scale = (exponent(S2.f32)) - tmp = Reg(f16_to_bf8_sr_scale(S0.f16, S1.u32, scale.u8)) - dstbyte = OPSEL[3 : 2].i32 * 8 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_PK_FP8_BF16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - tmp = Reg(0) - # --- compiled pseudocode --- - scale = (exponent(S1.f32)) - tmp0 = bf16_to_fp8_scale(S0[15 : 0].bf16, scale.u8) - tmp1 = bf16_to_fp8_scale(S0[31 : 16].bf16, scale.u8) - dstword = OPSEL[3].i32 * 16 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_PK_BF8_BF16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - tmp = Reg(0) - # --- compiled pseudocode --- - scale = (exponent(S1.f32)) - tmp0 = bf16_to_bf8_scale(S0[15 : 0].bf16, scale.u8) - tmp1 = bf16_to_bf8_scale(S0[31 : 16].bf16, scale.u8) - dstword = OPSEL[3].i32 * 16 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_SR_FP8_BF16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - scale = (exponent(S2.f32)) - tmp = Reg(bf16_to_fp8_sr_scale(S0.bf16, S1.u32, scale.u8)) - dstbyte = OPSEL[3 : 2].i32 * 8 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_SR_BF8_BF16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - scale = (exponent(S2.f32)) - tmp = Reg(bf16_to_bf8_sr_scale(S0.bf16, S1.u32, scale.u8)) - dstbyte = OPSEL[3 : 2].i32 * 8 - return {} - def _VOP3AOp_V_CVT_SCALEF32_PK_F16_FP8(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): tmp = Reg(0) SRC0 = Reg(src0_idx) @@ -5585,44 +5461,6 @@ def _VOP3AOp_V_CVT_SCALEF32_F16_BF8(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, lite tmp = Reg(bf8_to_f16_scale(src, scale.u8)) return {} -def _VOP3AOp_V_CVT_SCALEF32_PK_FP4_F16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - tmp = Reg(0) - # --- compiled pseudocode --- - scale = (exponent(S1.f32)) - tmp0 = f16_to_fp4_scale(S0[15 : 0].f16, scale.u8) - tmp1 = f16_to_fp4_scale(S0[31 : 16].f16, scale.u8) - dstbyte = OPSEL[3 : 2].i32 * 8 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_PK_FP4_BF16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - tmp = Reg(0) - # --- compiled pseudocode --- - scale = (exponent(S1.f32)) - tmp0 = bf16_to_fp4_scale(S0[15 : 0].bf16, scale.u8) - tmp1 = bf16_to_fp4_scale(S0[31 : 16].bf16, scale.u8) - dstbyte = OPSEL[3 : 2].i32 * 8 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_SR_PK_FP4_F16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - tmp = Reg(0) - # --- compiled pseudocode --- - scale = (exponent(S2.f32)) - randomVal = S1.u32 - tmp0 = f16_to_fp4_sr_scale(S0[15 : 0].f16, randomVal, scale.u8) - tmp1 = f16_to_fp4_sr_scale(S0[31 : 16].f16, randomVal, scale.u8) - dstbyte = OPSEL[3 : 2].i32 * 8 - return {} - -def _VOP3AOp_V_CVT_SCALEF32_SR_PK_FP4_BF16(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): - tmp = Reg(0) - # --- compiled pseudocode --- - scale = (exponent(S2.f32)) - randomVal = S1.u32 - tmp0 = bf16_to_fp4_sr_scale(S0[15 : 0].bf16, randomVal, scale.u8) - tmp1 = 
bf16_to_fp4_sr_scale(S0[31 : 16].bf16, randomVal, scale.u8) - dstbyte = OPSEL[3 : 2].i32 * 8 - return {} - def _VOP3AOp_V_CVT_SCALEF32_PK_F16_FP4(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): tmp = Reg(0) SRC0 = Reg(src0_idx) @@ -6114,33 +5952,15 @@ VOP3AOp_FUNCTIONS = { VOP3AOp.V_PACK_B32_F16: _VOP3AOp_V_PACK_B32_F16, VOP3AOp.V_MUL_LEGACY_F32: _VOP3AOp_V_MUL_LEGACY_F32, VOP3AOp.V_DOT2C_F32_BF16: _VOP3AOp_V_DOT2C_F32_BF16, - VOP3AOp.V_CVT_SCALEF32_PK_FP8_F32: _VOP3AOp_V_CVT_SCALEF32_PK_FP8_F32, - VOP3AOp.V_CVT_SCALEF32_PK_BF8_F32: _VOP3AOp_V_CVT_SCALEF32_PK_BF8_F32, - VOP3AOp.V_CVT_SCALEF32_SR_FP8_F32: _VOP3AOp_V_CVT_SCALEF32_SR_FP8_F32, - VOP3AOp.V_CVT_SCALEF32_SR_BF8_F32: _VOP3AOp_V_CVT_SCALEF32_SR_BF8_F32, VOP3AOp.V_CVT_SCALEF32_PK_F32_FP8: _VOP3AOp_V_CVT_SCALEF32_PK_F32_FP8, VOP3AOp.V_CVT_SCALEF32_PK_F32_BF8: _VOP3AOp_V_CVT_SCALEF32_PK_F32_BF8, VOP3AOp.V_CVT_SCALEF32_F32_FP8: _VOP3AOp_V_CVT_SCALEF32_F32_FP8, VOP3AOp.V_CVT_SCALEF32_F32_BF8: _VOP3AOp_V_CVT_SCALEF32_F32_BF8, - VOP3AOp.V_CVT_SCALEF32_PK_FP4_F32: _VOP3AOp_V_CVT_SCALEF32_PK_FP4_F32, - VOP3AOp.V_CVT_SCALEF32_SR_PK_FP4_F32: _VOP3AOp_V_CVT_SCALEF32_SR_PK_FP4_F32, VOP3AOp.V_CVT_SCALEF32_PK_F32_FP4: _VOP3AOp_V_CVT_SCALEF32_PK_F32_FP4, - VOP3AOp.V_CVT_SCALEF32_PK_FP8_F16: _VOP3AOp_V_CVT_SCALEF32_PK_FP8_F16, - VOP3AOp.V_CVT_SCALEF32_PK_BF8_F16: _VOP3AOp_V_CVT_SCALEF32_PK_BF8_F16, - VOP3AOp.V_CVT_SCALEF32_SR_FP8_F16: _VOP3AOp_V_CVT_SCALEF32_SR_FP8_F16, - VOP3AOp.V_CVT_SCALEF32_SR_BF8_F16: _VOP3AOp_V_CVT_SCALEF32_SR_BF8_F16, - VOP3AOp.V_CVT_SCALEF32_PK_FP8_BF16: _VOP3AOp_V_CVT_SCALEF32_PK_FP8_BF16, - VOP3AOp.V_CVT_SCALEF32_PK_BF8_BF16: _VOP3AOp_V_CVT_SCALEF32_PK_BF8_BF16, - VOP3AOp.V_CVT_SCALEF32_SR_FP8_BF16: _VOP3AOp_V_CVT_SCALEF32_SR_FP8_BF16, - VOP3AOp.V_CVT_SCALEF32_SR_BF8_BF16: _VOP3AOp_V_CVT_SCALEF32_SR_BF8_BF16, VOP3AOp.V_CVT_SCALEF32_PK_F16_FP8: _VOP3AOp_V_CVT_SCALEF32_PK_F16_FP8, VOP3AOp.V_CVT_SCALEF32_PK_F16_BF8: _VOP3AOp_V_CVT_SCALEF32_PK_F16_BF8, VOP3AOp.V_CVT_SCALEF32_F16_FP8: _VOP3AOp_V_CVT_SCALEF32_F16_FP8, VOP3AOp.V_CVT_SCALEF32_F16_BF8: _VOP3AOp_V_CVT_SCALEF32_F16_BF8, - VOP3AOp.V_CVT_SCALEF32_PK_FP4_F16: _VOP3AOp_V_CVT_SCALEF32_PK_FP4_F16, - VOP3AOp.V_CVT_SCALEF32_PK_FP4_BF16: _VOP3AOp_V_CVT_SCALEF32_PK_FP4_BF16, - VOP3AOp.V_CVT_SCALEF32_SR_PK_FP4_F16: _VOP3AOp_V_CVT_SCALEF32_SR_PK_FP4_F16, - VOP3AOp.V_CVT_SCALEF32_SR_PK_FP4_BF16: _VOP3AOp_V_CVT_SCALEF32_SR_PK_FP4_BF16, VOP3AOp.V_CVT_SCALEF32_PK_F16_FP4: _VOP3AOp_V_CVT_SCALEF32_PK_F16_FP4, VOP3AOp.V_CVT_SCALEF32_PK_BF16_FP4: _VOP3AOp_V_CVT_SCALEF32_PK_BF16_FP4, VOP3AOp.V_ASHR_PK_I8_I32: _VOP3AOp_V_ASHR_PK_I8_I32, @@ -6270,6 +6090,2773 @@ VOP3BOp_FUNCTIONS = { VOP3BOp.V_MAD_I64_I32: _VOP3BOp_V_MAD_I64_I32, } +def _DSOp_DS_ADD_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 += DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 -= DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_RSUB_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, 
OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 = DATA.u32 - MEM[addr].u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_INC_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_DEC_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_I32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].i32) + src = DATA.i32 + MEM[addr].i32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_I32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].i32) + src = DATA.i32 + MEM[addr].i32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_AND_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp & DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_OR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp | DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_XOR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp ^ DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MSKOR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = 
Reg(MEM[addr].b32) + MEM[addr].b32 = ((tmp & ~DATA.b32) | DATA2.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_WRITE_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + MEM[addr + OFFSET.u32].b32 = DATA[31 : 0] + return {} + +def _DSOp_DS_WRITE2_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + MEM[addr + OFFSET0.u32 * 4].b32 = DATA[31 : 0] + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + MEM[addr + OFFSET1.u32 * 4].b32 = DATA2[31 : 0] + return {} + +def _DSOp_DS_WRITE2ST64_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + MEM[addr + OFFSET0.u32 * 256].b32 = DATA[31 : 0] + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + MEM[addr + OFFSET1.u32 * 256].b32 = DATA2[31 : 0] + return {} + +def _DSOp_DS_CMPST_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b32) + src = DATA2.b32 + cmp = DATA.b32 + MEM[addr].b32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPST_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f32) + src = DATA2.f32 + cmp = DATA.f32 + MEM[addr].f32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f32) + src = DATA.f32 + MEM[addr].f32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f32) + src = DATA.f32 + MEM[addr].f32 = ((src) if (src > tmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_ADD_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f32) + MEM[addr].f32 += DATA.f32 + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_PK_ADD_F16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR]) + src = DATA + dst[31 : 16].f16 = tmp[31 : 16].f16 + src[31 : 16].f16 + dst[15 : 0].f16 = tmp[15 : 0].f16 + src[15 : 0].f16 + MEM[ADDR] = dst.b32 + RETURN_DATA = tmp + return {} + +def _DSOp_DS_PK_ADD_BF16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR]) + src = DATA + dst[31 : 16].bf16 = tmp[31 : 16].bf16 + src[31 : 16].bf16 + dst[15 : 0].bf16 = tmp[15 : 0].bf16 + src[15 : 0].bf16 + 
MEM[ADDR] = dst.b32 + RETURN_DATA = tmp + return {} + +def _DSOp_DS_WRITE_B8(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + MEM[ADDR].b8 = DATA[7 : 0] + return {} + +def _DSOp_DS_WRITE_B16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + MEM[ADDR].b16 = DATA[15 : 0] + return {} + +def _DSOp_DS_ADD_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 += DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 -= DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_RSUB_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 = DATA.u32 - MEM[addr].u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_INC_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_DEC_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_I32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].i32) + src = DATA.i32 + MEM[addr].i32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_I32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].i32) + src = DATA.i32 + MEM[addr].i32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (src >= tmp) else (tmp)) + 
RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_AND_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp & DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_OR_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp | DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_XOR_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp ^ DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MSKOR_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = ((tmp & ~DATA.b32) | DATA2.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_WRXCHG_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = DATA.b32 + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_WRXCHG2_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + ADDR_BASE = ADDR + # --- compiled pseudocode --- + addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 4 + addr2 = ADDR_BASE.u32 + OFFSET1.u32 * 4 + tmp1 = MEM[addr1].b32 + tmp2 = MEM[addr2].b32 + MEM[addr1].b32 = DATA.b32 + MEM[addr2].b32 = DATA2.b32 + RETURN_DATA[31 : 0] = tmp1 + RETURN_DATA[63 : 32] = tmp2 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_WRXCHG2ST64_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + ADDR_BASE = ADDR + # --- compiled pseudocode --- + addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 256 + addr2 = ADDR_BASE.u32 + OFFSET1.u32 * 256 + tmp1 = MEM[addr1].b32 + tmp2 = MEM[addr2].b32 + MEM[addr1].b32 = DATA.b32 + MEM[addr2].b32 = DATA2.b32 + RETURN_DATA[31 : 0] = tmp1 + RETURN_DATA[63 : 32] = tmp2 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPST_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b32) + src = DATA2.b32 + cmp = DATA.b32 + MEM[addr].b32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPST_RTN_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f32) + src = DATA2.f32 + cmp = DATA.f32 + MEM[addr].f32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_F32(MEM, ADDR, DATA0, 
DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f32) + src = DATA.f32 + MEM[addr].f32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f32) + src = DATA.f32 + MEM[addr].f32 = ((src) if (src > tmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_WRAP_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + MEM[ADDR].u32 = ((tmp - DATA.u32) if (tmp >= DATA.u32) else (tmp + DATA2.u32)) + RETURN_DATA = tmp + return {} + +def _DSOp_DS_ADD_RTN_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f32) + MEM[addr].f32 += DATA.f32 + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET.u32].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ2_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET0.u32 * 4].b32 + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + RETURN_DATA[63 : 32] = MEM[addr + OFFSET1.u32 * 4].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ2ST64_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET0.u32 * 256].b32 + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + RETURN_DATA[63 : 32] = MEM[addr + OFFSET1.u32 * 256].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ_I8(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA.i32 = (signext(MEM[ADDR].i8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ_U8(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA.u32 = (_pack(0, MEM[ADDR].u8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ_I16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA.i32 = (signext(MEM[ADDR].i16)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ_U16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA.u32 = (_pack(0, MEM[ADDR].u16)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_PERMUTE_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + for i in range(0, int(63)+1): + tmp[i] = 0x0 + for i in range(0, int(63)+1): + if EXEC[i].u1: + dst_lane = (VGPR[i][ADDR].u32 + OFFSET.u32) / 4 % 64 + tmp[dst_lane] = VGPR[i][DATA0] + for i in range(0, int(63)+1): 
+ if EXEC[i].u1: + VGPR[i][VDST] = tmp[i] + return {} + +def _DSOp_DS_BPERMUTE_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + for i in range(0, int(63)+1): + tmp[i] = 0x0 + for i in range(0, int(63)+1): + src_lane = (VGPR[i][ADDR].u32 + OFFSET.u32) / 4 % 64 + if EXEC[src_lane].u1: + tmp[i] = VGPR[src_lane][DATA0] + for i in range(0, int(63)+1): + if EXEC[i].u1: + VGPR[i][VDST] = tmp[i] + return {} + +def _DSOp_DS_ADD_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 += DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 -= DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_RSUB_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 = DATA.u64 - MEM[addr].u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_INC_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_DEC_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_I64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].i64) + src = DATA.i64 + MEM[addr].i64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_I64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].i64) + src = DATA.i64 + MEM[addr].i64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = 
((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_AND_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp & DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_OR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp | DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_XOR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp ^ DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MSKOR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = ((tmp & ~DATA.b64) | DATA2.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_WRITE_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + MEM[addr + OFFSET.u32].b32 = DATA[31 : 0] + MEM[addr + OFFSET.u32 + 4].b32 = DATA[63 : 32] + return {} + +def _DSOp_DS_WRITE2_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + MEM[addr + OFFSET0.u32 * 8].b32 = DATA[31 : 0] + MEM[addr + OFFSET0.u32 * 8 + 4].b32 = DATA[63 : 32] + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + MEM[addr + OFFSET1.u32 * 8].b32 = DATA2[31 : 0] + MEM[addr + OFFSET1.u32 * 8 + 4].b32 = DATA2[63 : 32] + return {} + +def _DSOp_DS_WRITE2ST64_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + MEM[addr + OFFSET0.u32 * 512].b32 = DATA[31 : 0] + MEM[addr + OFFSET0.u32 * 512 + 4].b32 = DATA[63 : 32] + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + MEM[addr + OFFSET1.u32 * 512].b32 = DATA2[31 : 0] + MEM[addr + OFFSET1.u32 * 512 + 4].b32 = DATA2[63 : 32] + return {} + +def _DSOp_DS_CMPST_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b64) + src = DATA2.b64 + cmp = DATA.b64 + MEM[addr].b64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPST_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f64) + src = DATA2.f64 + cmp = DATA.f64 + MEM[addr].f64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + 
DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f64) + src = DATA.f64 + MEM[addr].f64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f64) + src = DATA.f64 + MEM[addr].f64 = ((src) if (src > tmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_WRITE_B8_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + MEM[ADDR].b8 = DATA[23 : 16] + return {} + +def _DSOp_DS_WRITE_B16_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + MEM[ADDR].b16 = DATA[31 : 16] + return {} + +def _DSOp_DS_READ_U8_D16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[15 : 0].u16 = (_pack(0, MEM[ADDR].u8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ_U8_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 16].u16 = (_pack(0, MEM[ADDR].u8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ_I8_D16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[15 : 0].i16 = (signext(MEM[ADDR].i8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ_I8_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 16].i16 = (signext(MEM[ADDR].i8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ_U16_D16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ_U16_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 16].u16 = MEM[ADDR].u16 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_ADD_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f64) + MEM[ADDR].f64 += DATA.f64 + RETURN_DATA = tmp + return {} + +def _DSOp_DS_ADD_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 += DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 -= DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_RSUB_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 = DATA.u64 - MEM[addr].u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def 
_DSOp_DS_INC_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_DEC_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_I64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].i64) + src = DATA.i64 + MEM[addr].i64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_I64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].i64) + src = DATA.i64 + MEM[addr].i64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_AND_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp & DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_OR_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp | DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_XOR_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp ^ DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MSKOR_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = ((tmp & ~DATA.b64) | DATA2.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': 
RETURN_DATA} + +def _DSOp_DS_WRXCHG_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = DATA.b64 + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_WRXCHG2_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + ADDR_BASE = ADDR + # --- compiled pseudocode --- + addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 8 + addr2 = ADDR_BASE.u32 + OFFSET1.u32 * 8 + tmp1 = MEM[addr1].b64 + tmp2 = MEM[addr2].b64 + MEM[addr1].b64 = DATA.b64 + MEM[addr2].b64 = DATA2.b64 + RETURN_DATA[63 : 0] = tmp1 + RETURN_DATA[127 : 64] = tmp2 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_WRXCHG2ST64_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + ADDR_BASE = ADDR + # --- compiled pseudocode --- + addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 512 + addr2 = ADDR_BASE.u32 + OFFSET1.u32 * 512 + tmp1 = MEM[addr1].b64 + tmp2 = MEM[addr2].b64 + MEM[addr1].b64 = DATA.b64 + MEM[addr2].b64 = DATA2.b64 + RETURN_DATA[63 : 0] = tmp1 + RETURN_DATA[127 : 64] = tmp2 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPST_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].b64) + src = DATA2.b64 + cmp = DATA.b64 + MEM[addr].b64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPST_RTN_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f64) + src = DATA2.f64 + cmp = DATA.f64 + MEM[addr].f64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f64) + src = DATA.f64 + MEM[addr].f64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32) + tmp = Reg(MEM[addr].f64) + src = DATA.f64 + MEM[addr].f64 = ((src) if (src > tmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET.u32].b32 + RETURN_DATA[63 : 32] = MEM[addr + OFFSET.u32 + 4].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_READ2_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET0.u32 * 8].b32 + RETURN_DATA[63 : 32] = MEM[addr + OFFSET0.u32 * 8 + 4].b32 + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + RETURN_DATA[95 : 64] = MEM[addr + 
OFFSET1.u32 * 8].b32 + RETURN_DATA[127 : 96] = MEM[addr + OFFSET1.u32 * 8 + 4].b32 + return {'RETURN_DATA': RETURN_DATA} +
+def _DSOp_DS_READ2ST64_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET0.u32 * 512].b32 + RETURN_DATA[63 : 32] = MEM[addr + OFFSET0.u32 * 512 + 4].b32 + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + RETURN_DATA[95 : 64] = MEM[addr + OFFSET1.u32 * 512].b32 + RETURN_DATA[127 : 96] = MEM[addr + OFFSET1.u32 * 512 + 4].b32 + return {'RETURN_DATA': RETURN_DATA} +
+def _DSOp_DS_ADD_RTN_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f64) + MEM[ADDR].f64 += DATA.f64 + RETURN_DATA = tmp + return {} +
+def _DSOp_DS_CONDXCHG32_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + ADDR = S0.u32 + DATA = S1.u64 + offset = _pack(OFFSET1, OFFSET0) + RETURN_DATA[0] = LDS[ADDR0].u32 + if DATA[31]: + LDS[ADDR0] = _pack(0, DATA[30 : 0]) + RETURN_DATA[1] = LDS[ADDR1].u32 + if DATA[63]: + LDS[ADDR1] = _pack(0, DATA[62 : 32]) + return {'RETURN_DATA': RETURN_DATA} +
+def _DSOp_DS_PK_ADD_RTN_F16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR]) + src = DATA + dst[31 : 16].f16 = tmp[31 : 16].f16 + src[31 : 16].f16 + dst[15 : 0].f16 = tmp[15 : 0].f16 + src[15 : 0].f16 + MEM[ADDR] = dst.b32 + RETURN_DATA = tmp + return {} +
+def _DSOp_DS_PK_ADD_RTN_BF16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR]) + src = DATA + dst[31 : 16].bf16 = tmp[31 : 16].bf16 + src[31 : 16].bf16 + dst[15 : 0].bf16 = tmp[15 : 0].bf16 + src[15 : 0].bf16 + MEM[ADDR] = dst.b32 + RETURN_DATA = tmp + return {} +
+def _DSOp_DS_WRITE_B96(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + MEM[addr + OFFSET.u32].b32 = DATA[31 : 0] + MEM[addr + OFFSET.u32 + 4].b32 = DATA[63 : 32] + MEM[addr + OFFSET.u32 + 8].b32 = DATA[95 : 64] + return {} +
+def _DSOp_DS_WRITE_B128(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + MEM[addr + OFFSET.u32].b32 = DATA[31 : 0] + MEM[addr + OFFSET.u32 + 4].b32 = DATA[63 : 32] + MEM[addr + OFFSET.u32 + 8].b32 = DATA[95 : 64] + MEM[addr + OFFSET.u32 + 12].b32 = DATA[127 : 96] + return {} +
+def _DSOp_DS_READ_B96(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET.u32].b32 + RETURN_DATA[63 : 32] = MEM[addr + OFFSET.u32 + 4].b32 + RETURN_DATA[95 : 64] = MEM[addr + OFFSET.u32 + 8].b32 + return {'RETURN_DATA': RETURN_DATA} +
+def _DSOp_DS_READ_B128(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(ADDR.b32, 0x0, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET.u32].b32 + RETURN_DATA[63 : 32] = MEM[addr + OFFSET.u32 + 4].b32 + RETURN_DATA[95 : 64] = MEM[addr + OFFSET.u32 + 8].b32 + RETURN_DATA[127 : 96] = MEM[addr + OFFSET.u32 + 12].b32 + return {'RETURN_DATA': RETURN_DATA} +
+DSOp_FUNCTIONS = { + DSOp.DS_ADD_U32: _DSOp_DS_ADD_U32, + DSOp.DS_SUB_U32: _DSOp_DS_SUB_U32, + DSOp.DS_RSUB_U32: _DSOp_DS_RSUB_U32, + DSOp.DS_INC_U32: _DSOp_DS_INC_U32, + DSOp.DS_DEC_U32: _DSOp_DS_DEC_U32, + DSOp.DS_MIN_I32: _DSOp_DS_MIN_I32, + DSOp.DS_MAX_I32: _DSOp_DS_MAX_I32, + DSOp.DS_MIN_U32: _DSOp_DS_MIN_U32, + DSOp.DS_MAX_U32: _DSOp_DS_MAX_U32, + DSOp.DS_AND_B32: _DSOp_DS_AND_B32, + DSOp.DS_OR_B32: _DSOp_DS_OR_B32, + DSOp.DS_XOR_B32: _DSOp_DS_XOR_B32, + DSOp.DS_MSKOR_B32: _DSOp_DS_MSKOR_B32, + DSOp.DS_WRITE_B32: _DSOp_DS_WRITE_B32, + DSOp.DS_WRITE2_B32: _DSOp_DS_WRITE2_B32, + DSOp.DS_WRITE2ST64_B32: _DSOp_DS_WRITE2ST64_B32, + DSOp.DS_CMPST_B32: _DSOp_DS_CMPST_B32, + DSOp.DS_CMPST_F32: _DSOp_DS_CMPST_F32, + DSOp.DS_MIN_F32: _DSOp_DS_MIN_F32, + DSOp.DS_MAX_F32: _DSOp_DS_MAX_F32, + DSOp.DS_ADD_F32: _DSOp_DS_ADD_F32, + DSOp.DS_PK_ADD_F16: _DSOp_DS_PK_ADD_F16, + DSOp.DS_PK_ADD_BF16: _DSOp_DS_PK_ADD_BF16, +
DSOp.DS_WRITE_B8: _DSOp_DS_WRITE_B8, + DSOp.DS_WRITE_B16: _DSOp_DS_WRITE_B16, + DSOp.DS_ADD_RTN_U32: _DSOp_DS_ADD_RTN_U32, + DSOp.DS_SUB_RTN_U32: _DSOp_DS_SUB_RTN_U32, + DSOp.DS_RSUB_RTN_U32: _DSOp_DS_RSUB_RTN_U32, + DSOp.DS_INC_RTN_U32: _DSOp_DS_INC_RTN_U32, + DSOp.DS_DEC_RTN_U32: _DSOp_DS_DEC_RTN_U32, + DSOp.DS_MIN_RTN_I32: _DSOp_DS_MIN_RTN_I32, + DSOp.DS_MAX_RTN_I32: _DSOp_DS_MAX_RTN_I32, + DSOp.DS_MIN_RTN_U32: _DSOp_DS_MIN_RTN_U32, + DSOp.DS_MAX_RTN_U32: _DSOp_DS_MAX_RTN_U32, + DSOp.DS_AND_RTN_B32: _DSOp_DS_AND_RTN_B32, + DSOp.DS_OR_RTN_B32: _DSOp_DS_OR_RTN_B32, + DSOp.DS_XOR_RTN_B32: _DSOp_DS_XOR_RTN_B32, + DSOp.DS_MSKOR_RTN_B32: _DSOp_DS_MSKOR_RTN_B32, + DSOp.DS_WRXCHG_RTN_B32: _DSOp_DS_WRXCHG_RTN_B32, + DSOp.DS_WRXCHG2_RTN_B32: _DSOp_DS_WRXCHG2_RTN_B32, + DSOp.DS_WRXCHG2ST64_RTN_B32: _DSOp_DS_WRXCHG2ST64_RTN_B32, + DSOp.DS_CMPST_RTN_B32: _DSOp_DS_CMPST_RTN_B32, + DSOp.DS_CMPST_RTN_F32: _DSOp_DS_CMPST_RTN_F32, + DSOp.DS_MIN_RTN_F32: _DSOp_DS_MIN_RTN_F32, + DSOp.DS_MAX_RTN_F32: _DSOp_DS_MAX_RTN_F32, + DSOp.DS_WRAP_RTN_B32: _DSOp_DS_WRAP_RTN_B32, + DSOp.DS_ADD_RTN_F32: _DSOp_DS_ADD_RTN_F32, + DSOp.DS_READ_B32: _DSOp_DS_READ_B32, + DSOp.DS_READ2_B32: _DSOp_DS_READ2_B32, + DSOp.DS_READ2ST64_B32: _DSOp_DS_READ2ST64_B32, + DSOp.DS_READ_I8: _DSOp_DS_READ_I8, + DSOp.DS_READ_U8: _DSOp_DS_READ_U8, + DSOp.DS_READ_I16: _DSOp_DS_READ_I16, + DSOp.DS_READ_U16: _DSOp_DS_READ_U16, + DSOp.DS_PERMUTE_B32: _DSOp_DS_PERMUTE_B32, + DSOp.DS_BPERMUTE_B32: _DSOp_DS_BPERMUTE_B32, + DSOp.DS_ADD_U64: _DSOp_DS_ADD_U64, + DSOp.DS_SUB_U64: _DSOp_DS_SUB_U64, + DSOp.DS_RSUB_U64: _DSOp_DS_RSUB_U64, + DSOp.DS_INC_U64: _DSOp_DS_INC_U64, + DSOp.DS_DEC_U64: _DSOp_DS_DEC_U64, + DSOp.DS_MIN_I64: _DSOp_DS_MIN_I64, + DSOp.DS_MAX_I64: _DSOp_DS_MAX_I64, + DSOp.DS_MIN_U64: _DSOp_DS_MIN_U64, + DSOp.DS_MAX_U64: _DSOp_DS_MAX_U64, + DSOp.DS_AND_B64: _DSOp_DS_AND_B64, + DSOp.DS_OR_B64: _DSOp_DS_OR_B64, + DSOp.DS_XOR_B64: _DSOp_DS_XOR_B64, + DSOp.DS_MSKOR_B64: _DSOp_DS_MSKOR_B64, + DSOp.DS_WRITE_B64: _DSOp_DS_WRITE_B64, + DSOp.DS_WRITE2_B64: _DSOp_DS_WRITE2_B64, + DSOp.DS_WRITE2ST64_B64: _DSOp_DS_WRITE2ST64_B64, + DSOp.DS_CMPST_B64: _DSOp_DS_CMPST_B64, + DSOp.DS_CMPST_F64: _DSOp_DS_CMPST_F64, + DSOp.DS_MIN_F64: _DSOp_DS_MIN_F64, + DSOp.DS_MAX_F64: _DSOp_DS_MAX_F64, + DSOp.DS_WRITE_B8_D16_HI: _DSOp_DS_WRITE_B8_D16_HI, + DSOp.DS_WRITE_B16_D16_HI: _DSOp_DS_WRITE_B16_D16_HI, + DSOp.DS_READ_U8_D16: _DSOp_DS_READ_U8_D16, + DSOp.DS_READ_U8_D16_HI: _DSOp_DS_READ_U8_D16_HI, + DSOp.DS_READ_I8_D16: _DSOp_DS_READ_I8_D16, + DSOp.DS_READ_I8_D16_HI: _DSOp_DS_READ_I8_D16_HI, + DSOp.DS_READ_U16_D16: _DSOp_DS_READ_U16_D16, + DSOp.DS_READ_U16_D16_HI: _DSOp_DS_READ_U16_D16_HI, + DSOp.DS_ADD_F64: _DSOp_DS_ADD_F64, + DSOp.DS_ADD_RTN_U64: _DSOp_DS_ADD_RTN_U64, + DSOp.DS_SUB_RTN_U64: _DSOp_DS_SUB_RTN_U64, + DSOp.DS_RSUB_RTN_U64: _DSOp_DS_RSUB_RTN_U64, + DSOp.DS_INC_RTN_U64: _DSOp_DS_INC_RTN_U64, + DSOp.DS_DEC_RTN_U64: _DSOp_DS_DEC_RTN_U64, + DSOp.DS_MIN_RTN_I64: _DSOp_DS_MIN_RTN_I64, + DSOp.DS_MAX_RTN_I64: _DSOp_DS_MAX_RTN_I64, + DSOp.DS_MIN_RTN_U64: _DSOp_DS_MIN_RTN_U64, + DSOp.DS_MAX_RTN_U64: _DSOp_DS_MAX_RTN_U64, + DSOp.DS_AND_RTN_B64: _DSOp_DS_AND_RTN_B64, + DSOp.DS_OR_RTN_B64: _DSOp_DS_OR_RTN_B64, + DSOp.DS_XOR_RTN_B64: _DSOp_DS_XOR_RTN_B64, + DSOp.DS_MSKOR_RTN_B64: _DSOp_DS_MSKOR_RTN_B64, + DSOp.DS_WRXCHG_RTN_B64: _DSOp_DS_WRXCHG_RTN_B64, + DSOp.DS_WRXCHG2_RTN_B64: _DSOp_DS_WRXCHG2_RTN_B64, + DSOp.DS_WRXCHG2ST64_RTN_B64: _DSOp_DS_WRXCHG2ST64_RTN_B64, + DSOp.DS_CMPST_RTN_B64: _DSOp_DS_CMPST_RTN_B64, + DSOp.DS_CMPST_RTN_F64: _DSOp_DS_CMPST_RTN_F64, + 
DSOp.DS_MIN_RTN_F64: _DSOp_DS_MIN_RTN_F64, + DSOp.DS_MAX_RTN_F64: _DSOp_DS_MAX_RTN_F64, + DSOp.DS_READ_B64: _DSOp_DS_READ_B64, + DSOp.DS_READ2_B64: _DSOp_DS_READ2_B64, + DSOp.DS_READ2ST64_B64: _DSOp_DS_READ2ST64_B64, + DSOp.DS_ADD_RTN_F64: _DSOp_DS_ADD_RTN_F64, + DSOp.DS_CONDXCHG32_RTN_B64: _DSOp_DS_CONDXCHG32_RTN_B64, + DSOp.DS_PK_ADD_RTN_F16: _DSOp_DS_PK_ADD_RTN_F16, + DSOp.DS_PK_ADD_RTN_BF16: _DSOp_DS_PK_ADD_RTN_BF16, + DSOp.DS_WRITE_B96: _DSOp_DS_WRITE_B96, + DSOp.DS_WRITE_B128: _DSOp_DS_WRITE_B128, + DSOp.DS_READ_B96: _DSOp_DS_READ_B96, + DSOp.DS_READ_B128: _DSOp_DS_READ_B128, +} + +def _FLATOp_FLAT_LOAD_UBYTE(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA.u32 = (_pack(0, MEM[addr].u8)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_SBYTE(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA.i32 = (signext(MEM[addr].i8)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_USHORT(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA.u32 = (_pack(0, MEM[addr].u16)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_SSHORT(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA.i32 = (signext(MEM[addr].i16)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_DWORD(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA[31 : 0] = MEM[addr].b32 + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_DWORDX2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA[31 : 0] = MEM[addr].b32 + VDATA[63 : 32] = MEM[addr + 4].b32 + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_DWORDX3(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA[31 : 0] = MEM[addr].b32 + VDATA[63 : 32] = MEM[addr + 4].b32 + VDATA[95 : 64] = MEM[addr + 8].b32 + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_DWORDX4(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA[31 : 0] = MEM[addr].b32 + VDATA[63 : 32] = MEM[addr + 4].b32 + VDATA[95 : 64] = MEM[addr + 8].b32 + VDATA[127 : 96] = MEM[addr + 12].b32 + return {'VDATA': VDATA} + +def _FLATOp_FLAT_STORE_BYTE(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + MEM[addr].b8 = VDATA[7 : 0] + return {} + +def _FLATOp_FLAT_STORE_BYTE_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + MEM[addr].b8 = VDATA[23 : 16] + return {} + +def _FLATOp_FLAT_STORE_SHORT(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + MEM[addr].b16 = VDATA[15 : 0] + return {} + +def _FLATOp_FLAT_STORE_SHORT_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + MEM[addr].b16 = VDATA[31 : 16] + return {} + +def _FLATOp_FLAT_STORE_DWORD(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = 
CalcFlatAddr(ADDR.b32, OFFSET.b32) + MEM[addr].b32 = VDATA[31 : 0] + return {} + +def _FLATOp_FLAT_STORE_DWORDX2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + MEM[addr].b32 = VDATA[31 : 0] + MEM[addr + 4].b32 = VDATA[63 : 32] + return {} + +def _FLATOp_FLAT_STORE_DWORDX3(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + MEM[addr].b32 = VDATA[31 : 0] + MEM[addr + 4].b32 = VDATA[63 : 32] + MEM[addr + 8].b32 = VDATA[95 : 64] + return {} + +def _FLATOp_FLAT_STORE_DWORDX4(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + MEM[addr].b32 = VDATA[31 : 0] + MEM[addr + 4].b32 = VDATA[63 : 32] + MEM[addr + 8].b32 = VDATA[95 : 64] + MEM[addr + 12].b32 = VDATA[127 : 96] + return {} + +def _FLATOp_FLAT_LOAD_UBYTE_D16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA[15 : 0].u16 = (_pack(0, MEM[addr].u8)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_UBYTE_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA[31 : 16].u16 = (_pack(0, MEM[addr].u8)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_SBYTE_D16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA[15 : 0].i16 = (signext(MEM[addr].i8)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_SBYTE_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA[31 : 16].i16 = (signext(MEM[addr].i8)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_SHORT_D16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA[15 : 0].b16 = MEM[addr].b16 + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_SHORT_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + VDATA[31 : 16].b16 = MEM[addr].b16 + return {'VDATA': VDATA} + +def _FLATOp_FLAT_ATOMIC_SWAP(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = DATA.b32 + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_CMPSWAP(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u32) + src = DATA[31 : 0].u32 + cmp = DATA[63 : 32].u32 + MEM[addr].u32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_ADD(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 += DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_SUB(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 -= DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def 
_FLATOp_FLAT_ATOMIC_SMIN(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].i32) + src = DATA.i32 + MEM[addr].i32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_UMIN(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_SMAX(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].i32) + src = DATA.i32 + MEM[addr].i32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_UMAX(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_AND(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp & DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_OR(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp | DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_XOR(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp ^ DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_INC(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_DEC(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_ADD_F32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + MEM[ADDR].f32 += DATA.f32 + RETURN_DATA = tmp + return {} + +def _FLATOp_FLAT_ATOMIC_PK_ADD_F16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR]) + src = DATA + dst[31 : 16].f16 = tmp[31 : 16].f16 + src[31 : 16].f16 + dst[15 : 0].f16 = tmp[15 : 0].f16 + src[15 : 0].f16 + MEM[ADDR] = dst.b32 + RETURN_DATA = tmp + return {} + +def _FLATOp_FLAT_ATOMIC_ADD_F64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f64) + MEM[ADDR].f64 += DATA.f64 + RETURN_DATA = tmp + return {} + +def 
_FLATOp_FLAT_ATOMIC_MIN_F64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].f64) + src = DATA.f64 + MEM[addr].f64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_MAX_F64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].f64) + src = DATA.f64 + MEM[addr].f64 = ((src) if (src > tmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_PK_ADD_BF16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR]) + src = DATA + dst[31 : 16].bf16 = tmp[31 : 16].bf16 + src[31 : 16].bf16 + dst[15 : 0].bf16 = tmp[15 : 0].bf16 + src[15 : 0].bf16 + MEM[ADDR] = dst.b32 + RETURN_DATA = tmp + return {} + +def _FLATOp_FLAT_ATOMIC_SWAP_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = DATA.b64 + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_CMPSWAP_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + src = DATA[63 : 0].u64 + cmp = DATA[127 : 64].u64 + MEM[addr].u64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_ADD_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 += DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_SUB_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 -= DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_SMIN_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].i64) + src = DATA.i64 + MEM[addr].i64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_UMIN_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_SMAX_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].i64) + src = DATA.i64 + MEM[addr].i64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_UMAX_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_AND_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- 
compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp & DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_OR_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp | DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_XOR_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp ^ DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_INC_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_DEC_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcFlatAddr(ADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +FLATOp_FUNCTIONS = { + FLATOp.FLAT_LOAD_UBYTE: _FLATOp_FLAT_LOAD_UBYTE, + FLATOp.FLAT_LOAD_SBYTE: _FLATOp_FLAT_LOAD_SBYTE, + FLATOp.FLAT_LOAD_USHORT: _FLATOp_FLAT_LOAD_USHORT, + FLATOp.FLAT_LOAD_SSHORT: _FLATOp_FLAT_LOAD_SSHORT, + FLATOp.FLAT_LOAD_DWORD: _FLATOp_FLAT_LOAD_DWORD, + FLATOp.FLAT_LOAD_DWORDX2: _FLATOp_FLAT_LOAD_DWORDX2, + FLATOp.FLAT_LOAD_DWORDX3: _FLATOp_FLAT_LOAD_DWORDX3, + FLATOp.FLAT_LOAD_DWORDX4: _FLATOp_FLAT_LOAD_DWORDX4, + FLATOp.FLAT_STORE_BYTE: _FLATOp_FLAT_STORE_BYTE, + FLATOp.FLAT_STORE_BYTE_D16_HI: _FLATOp_FLAT_STORE_BYTE_D16_HI, + FLATOp.FLAT_STORE_SHORT: _FLATOp_FLAT_STORE_SHORT, + FLATOp.FLAT_STORE_SHORT_D16_HI: _FLATOp_FLAT_STORE_SHORT_D16_HI, + FLATOp.FLAT_STORE_DWORD: _FLATOp_FLAT_STORE_DWORD, + FLATOp.FLAT_STORE_DWORDX2: _FLATOp_FLAT_STORE_DWORDX2, + FLATOp.FLAT_STORE_DWORDX3: _FLATOp_FLAT_STORE_DWORDX3, + FLATOp.FLAT_STORE_DWORDX4: _FLATOp_FLAT_STORE_DWORDX4, + FLATOp.FLAT_LOAD_UBYTE_D16: _FLATOp_FLAT_LOAD_UBYTE_D16, + FLATOp.FLAT_LOAD_UBYTE_D16_HI: _FLATOp_FLAT_LOAD_UBYTE_D16_HI, + FLATOp.FLAT_LOAD_SBYTE_D16: _FLATOp_FLAT_LOAD_SBYTE_D16, + FLATOp.FLAT_LOAD_SBYTE_D16_HI: _FLATOp_FLAT_LOAD_SBYTE_D16_HI, + FLATOp.FLAT_LOAD_SHORT_D16: _FLATOp_FLAT_LOAD_SHORT_D16, + FLATOp.FLAT_LOAD_SHORT_D16_HI: _FLATOp_FLAT_LOAD_SHORT_D16_HI, + FLATOp.FLAT_ATOMIC_SWAP: _FLATOp_FLAT_ATOMIC_SWAP, + FLATOp.FLAT_ATOMIC_CMPSWAP: _FLATOp_FLAT_ATOMIC_CMPSWAP, + FLATOp.FLAT_ATOMIC_ADD: _FLATOp_FLAT_ATOMIC_ADD, + FLATOp.FLAT_ATOMIC_SUB: _FLATOp_FLAT_ATOMIC_SUB, + FLATOp.FLAT_ATOMIC_SMIN: _FLATOp_FLAT_ATOMIC_SMIN, + FLATOp.FLAT_ATOMIC_UMIN: _FLATOp_FLAT_ATOMIC_UMIN, + FLATOp.FLAT_ATOMIC_SMAX: _FLATOp_FLAT_ATOMIC_SMAX, + FLATOp.FLAT_ATOMIC_UMAX: _FLATOp_FLAT_ATOMIC_UMAX, + FLATOp.FLAT_ATOMIC_AND: _FLATOp_FLAT_ATOMIC_AND, + FLATOp.FLAT_ATOMIC_OR: _FLATOp_FLAT_ATOMIC_OR, + FLATOp.FLAT_ATOMIC_XOR: _FLATOp_FLAT_ATOMIC_XOR, + FLATOp.FLAT_ATOMIC_INC: _FLATOp_FLAT_ATOMIC_INC, + FLATOp.FLAT_ATOMIC_DEC: _FLATOp_FLAT_ATOMIC_DEC, + FLATOp.FLAT_ATOMIC_ADD_F32: _FLATOp_FLAT_ATOMIC_ADD_F32, + FLATOp.FLAT_ATOMIC_PK_ADD_F16: _FLATOp_FLAT_ATOMIC_PK_ADD_F16, + FLATOp.FLAT_ATOMIC_ADD_F64: 
_FLATOp_FLAT_ATOMIC_ADD_F64, + FLATOp.FLAT_ATOMIC_MIN_F64: _FLATOp_FLAT_ATOMIC_MIN_F64, + FLATOp.FLAT_ATOMIC_MAX_F64: _FLATOp_FLAT_ATOMIC_MAX_F64, + FLATOp.FLAT_ATOMIC_PK_ADD_BF16: _FLATOp_FLAT_ATOMIC_PK_ADD_BF16, + FLATOp.FLAT_ATOMIC_SWAP_X2: _FLATOp_FLAT_ATOMIC_SWAP_X2, + FLATOp.FLAT_ATOMIC_CMPSWAP_X2: _FLATOp_FLAT_ATOMIC_CMPSWAP_X2, + FLATOp.FLAT_ATOMIC_ADD_X2: _FLATOp_FLAT_ATOMIC_ADD_X2, + FLATOp.FLAT_ATOMIC_SUB_X2: _FLATOp_FLAT_ATOMIC_SUB_X2, + FLATOp.FLAT_ATOMIC_SMIN_X2: _FLATOp_FLAT_ATOMIC_SMIN_X2, + FLATOp.FLAT_ATOMIC_UMIN_X2: _FLATOp_FLAT_ATOMIC_UMIN_X2, + FLATOp.FLAT_ATOMIC_SMAX_X2: _FLATOp_FLAT_ATOMIC_SMAX_X2, + FLATOp.FLAT_ATOMIC_UMAX_X2: _FLATOp_FLAT_ATOMIC_UMAX_X2, + FLATOp.FLAT_ATOMIC_AND_X2: _FLATOp_FLAT_ATOMIC_AND_X2, + FLATOp.FLAT_ATOMIC_OR_X2: _FLATOp_FLAT_ATOMIC_OR_X2, + FLATOp.FLAT_ATOMIC_XOR_X2: _FLATOp_FLAT_ATOMIC_XOR_X2, + FLATOp.FLAT_ATOMIC_INC_X2: _FLATOp_FLAT_ATOMIC_INC_X2, + FLATOp.FLAT_ATOMIC_DEC_X2: _FLATOp_FLAT_ATOMIC_DEC_X2, +} + +def _GLOBALOp_GLOBAL_LOAD_UBYTE(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA.u32 = (_pack(0, MEM[addr].u8)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_SBYTE(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA.i32 = (signext(MEM[addr].i8)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_USHORT(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA.u32 = (_pack(0, MEM[addr].u16)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_SSHORT(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA.i32 = (signext(MEM[addr].i16)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_DWORD(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 0] = MEM[addr].b32 + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_DWORDX2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 0] = MEM[addr].b32 + VDATA[63 : 32] = MEM[addr + 4].b32 + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_DWORDX3(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 0] = MEM[addr].b32 + VDATA[63 : 32] = MEM[addr + 4].b32 + VDATA[95 : 64] = MEM[addr + 8].b32 + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_DWORDX4(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 0] = MEM[addr].b32 + VDATA[63 : 32] = MEM[addr + 4].b32 + VDATA[95 : 64] = MEM[addr + 8].b32 + VDATA[127 : 96] = MEM[addr + 12].b32 + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_STORE_BYTE(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b8 = VDATA[7 : 0] + return {} + +def _GLOBALOp_GLOBAL_STORE_BYTE_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b8 = VDATA[23 : 16] + 
return {} + +def _GLOBALOp_GLOBAL_STORE_SHORT(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b16 = VDATA[15 : 0] + return {} + +def _GLOBALOp_GLOBAL_STORE_SHORT_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b16 = VDATA[31 : 16] + return {} + +def _GLOBALOp_GLOBAL_STORE_DWORD(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b32 = VDATA[31 : 0] + return {} + +def _GLOBALOp_GLOBAL_STORE_DWORDX2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b32 = VDATA[31 : 0] + MEM[addr + 4].b32 = VDATA[63 : 32] + return {} + +def _GLOBALOp_GLOBAL_STORE_DWORDX3(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b32 = VDATA[31 : 0] + MEM[addr + 4].b32 = VDATA[63 : 32] + MEM[addr + 8].b32 = VDATA[95 : 64] + return {} + +def _GLOBALOp_GLOBAL_STORE_DWORDX4(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b32 = VDATA[31 : 0] + MEM[addr + 4].b32 = VDATA[63 : 32] + MEM[addr + 8].b32 = VDATA[95 : 64] + MEM[addr + 12].b32 = VDATA[127 : 96] + return {} + +def _GLOBALOp_GLOBAL_LOAD_UBYTE_D16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[15 : 0].u16 = (_pack(0, MEM[addr].u8)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_UBYTE_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 16].u16 = (_pack(0, MEM[addr].u8)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_SBYTE_D16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[15 : 0].i16 = (signext(MEM[addr].i8)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_SBYTE_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 16].i16 = (signext(MEM[addr].i8)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_SHORT_D16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[15 : 0].b16 = MEM[addr].b16 + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_SHORT_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 16].b16 = MEM[addr].b16 + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_ATOMIC_SWAP(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = DATA.b32 + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_CMPSWAP(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, 
OFFSET.b32) + tmp = Reg(MEM[addr].u32) + src = DATA[31 : 0].u32 + cmp = DATA[63 : 32].u32 + MEM[addr].u32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_ADD(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 += DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_SUB(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 -= DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_SMIN(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].i32) + src = DATA.i32 + MEM[addr].i32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_UMIN(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_SMAX(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].i32) + src = DATA.i32 + MEM[addr].i32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_UMAX(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_AND(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp & DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_OR(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp | DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_XOR(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp ^ DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_INC(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_DEC(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, 
OFFSET.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_ADD_F32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + MEM[ADDR].f32 += DATA.f32 + RETURN_DATA = tmp + return {} + +def _GLOBALOp_GLOBAL_ATOMIC_PK_ADD_F16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR]) + src = DATA + dst[31 : 16].f16 = tmp[31 : 16].f16 + src[31 : 16].f16 + dst[15 : 0].f16 = tmp[15 : 0].f16 + src[15 : 0].f16 + MEM[ADDR] = dst.b32 + RETURN_DATA = tmp + return {} + +def _GLOBALOp_GLOBAL_ATOMIC_ADD_F64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f64) + MEM[ADDR].f64 += DATA.f64 + RETURN_DATA = tmp + return {} + +def _GLOBALOp_GLOBAL_ATOMIC_MIN_F64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].f64) + src = DATA.f64 + MEM[addr].f64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_MAX_F64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].f64) + src = DATA.f64 + MEM[addr].f64 = ((src) if (src > tmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_PK_ADD_BF16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR]) + src = DATA + dst[31 : 16].bf16 = tmp[31 : 16].bf16 + src[31 : 16].bf16 + dst[15 : 0].bf16 = tmp[15 : 0].bf16 + src[15 : 0].bf16 + MEM[ADDR] = dst.b32 + RETURN_DATA = tmp + return {} + +def _GLOBALOp_GLOBAL_ATOMIC_SWAP_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = DATA.b64 + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_CMPSWAP_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + src = DATA[63 : 0].u64 + cmp = DATA[127 : 64].u64 + MEM[addr].u64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_ADD_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 += DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_SUB_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 -= DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_SMIN_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].i64) + src = DATA.i64 + MEM[addr].i64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': 
RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_UMIN_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_SMAX_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].i64) + src = DATA.i64 + MEM[addr].i64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_UMAX_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_AND_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp & DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_OR_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp | DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_XOR_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp ^ DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_INC_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_DEC_X2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcGlobalAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +GLOBALOp_FUNCTIONS = { + GLOBALOp.GLOBAL_LOAD_UBYTE: _GLOBALOp_GLOBAL_LOAD_UBYTE, + GLOBALOp.GLOBAL_LOAD_SBYTE: _GLOBALOp_GLOBAL_LOAD_SBYTE, + GLOBALOp.GLOBAL_LOAD_USHORT: _GLOBALOp_GLOBAL_LOAD_USHORT, + GLOBALOp.GLOBAL_LOAD_SSHORT: _GLOBALOp_GLOBAL_LOAD_SSHORT, + GLOBALOp.GLOBAL_LOAD_DWORD: _GLOBALOp_GLOBAL_LOAD_DWORD, + GLOBALOp.GLOBAL_LOAD_DWORDX2: _GLOBALOp_GLOBAL_LOAD_DWORDX2, + GLOBALOp.GLOBAL_LOAD_DWORDX3: _GLOBALOp_GLOBAL_LOAD_DWORDX3, + GLOBALOp.GLOBAL_LOAD_DWORDX4: _GLOBALOp_GLOBAL_LOAD_DWORDX4, + GLOBALOp.GLOBAL_STORE_BYTE: _GLOBALOp_GLOBAL_STORE_BYTE, + GLOBALOp.GLOBAL_STORE_BYTE_D16_HI: _GLOBALOp_GLOBAL_STORE_BYTE_D16_HI, + GLOBALOp.GLOBAL_STORE_SHORT: _GLOBALOp_GLOBAL_STORE_SHORT, + GLOBALOp.GLOBAL_STORE_SHORT_D16_HI: _GLOBALOp_GLOBAL_STORE_SHORT_D16_HI, + GLOBALOp.GLOBAL_STORE_DWORD: _GLOBALOp_GLOBAL_STORE_DWORD, + GLOBALOp.GLOBAL_STORE_DWORDX2: _GLOBALOp_GLOBAL_STORE_DWORDX2, + 
GLOBALOp.GLOBAL_STORE_DWORDX3: _GLOBALOp_GLOBAL_STORE_DWORDX3, + GLOBALOp.GLOBAL_STORE_DWORDX4: _GLOBALOp_GLOBAL_STORE_DWORDX4, + GLOBALOp.GLOBAL_LOAD_UBYTE_D16: _GLOBALOp_GLOBAL_LOAD_UBYTE_D16, + GLOBALOp.GLOBAL_LOAD_UBYTE_D16_HI: _GLOBALOp_GLOBAL_LOAD_UBYTE_D16_HI, + GLOBALOp.GLOBAL_LOAD_SBYTE_D16: _GLOBALOp_GLOBAL_LOAD_SBYTE_D16, + GLOBALOp.GLOBAL_LOAD_SBYTE_D16_HI: _GLOBALOp_GLOBAL_LOAD_SBYTE_D16_HI, + GLOBALOp.GLOBAL_LOAD_SHORT_D16: _GLOBALOp_GLOBAL_LOAD_SHORT_D16, + GLOBALOp.GLOBAL_LOAD_SHORT_D16_HI: _GLOBALOp_GLOBAL_LOAD_SHORT_D16_HI, + GLOBALOp.GLOBAL_ATOMIC_SWAP: _GLOBALOp_GLOBAL_ATOMIC_SWAP, + GLOBALOp.GLOBAL_ATOMIC_CMPSWAP: _GLOBALOp_GLOBAL_ATOMIC_CMPSWAP, + GLOBALOp.GLOBAL_ATOMIC_ADD: _GLOBALOp_GLOBAL_ATOMIC_ADD, + GLOBALOp.GLOBAL_ATOMIC_SUB: _GLOBALOp_GLOBAL_ATOMIC_SUB, + GLOBALOp.GLOBAL_ATOMIC_SMIN: _GLOBALOp_GLOBAL_ATOMIC_SMIN, + GLOBALOp.GLOBAL_ATOMIC_UMIN: _GLOBALOp_GLOBAL_ATOMIC_UMIN, + GLOBALOp.GLOBAL_ATOMIC_SMAX: _GLOBALOp_GLOBAL_ATOMIC_SMAX, + GLOBALOp.GLOBAL_ATOMIC_UMAX: _GLOBALOp_GLOBAL_ATOMIC_UMAX, + GLOBALOp.GLOBAL_ATOMIC_AND: _GLOBALOp_GLOBAL_ATOMIC_AND, + GLOBALOp.GLOBAL_ATOMIC_OR: _GLOBALOp_GLOBAL_ATOMIC_OR, + GLOBALOp.GLOBAL_ATOMIC_XOR: _GLOBALOp_GLOBAL_ATOMIC_XOR, + GLOBALOp.GLOBAL_ATOMIC_INC: _GLOBALOp_GLOBAL_ATOMIC_INC, + GLOBALOp.GLOBAL_ATOMIC_DEC: _GLOBALOp_GLOBAL_ATOMIC_DEC, + GLOBALOp.GLOBAL_ATOMIC_ADD_F32: _GLOBALOp_GLOBAL_ATOMIC_ADD_F32, + GLOBALOp.GLOBAL_ATOMIC_PK_ADD_F16: _GLOBALOp_GLOBAL_ATOMIC_PK_ADD_F16, + GLOBALOp.GLOBAL_ATOMIC_ADD_F64: _GLOBALOp_GLOBAL_ATOMIC_ADD_F64, + GLOBALOp.GLOBAL_ATOMIC_MIN_F64: _GLOBALOp_GLOBAL_ATOMIC_MIN_F64, + GLOBALOp.GLOBAL_ATOMIC_MAX_F64: _GLOBALOp_GLOBAL_ATOMIC_MAX_F64, + GLOBALOp.GLOBAL_ATOMIC_PK_ADD_BF16: _GLOBALOp_GLOBAL_ATOMIC_PK_ADD_BF16, + GLOBALOp.GLOBAL_ATOMIC_SWAP_X2: _GLOBALOp_GLOBAL_ATOMIC_SWAP_X2, + GLOBALOp.GLOBAL_ATOMIC_CMPSWAP_X2: _GLOBALOp_GLOBAL_ATOMIC_CMPSWAP_X2, + GLOBALOp.GLOBAL_ATOMIC_ADD_X2: _GLOBALOp_GLOBAL_ATOMIC_ADD_X2, + GLOBALOp.GLOBAL_ATOMIC_SUB_X2: _GLOBALOp_GLOBAL_ATOMIC_SUB_X2, + GLOBALOp.GLOBAL_ATOMIC_SMIN_X2: _GLOBALOp_GLOBAL_ATOMIC_SMIN_X2, + GLOBALOp.GLOBAL_ATOMIC_UMIN_X2: _GLOBALOp_GLOBAL_ATOMIC_UMIN_X2, + GLOBALOp.GLOBAL_ATOMIC_SMAX_X2: _GLOBALOp_GLOBAL_ATOMIC_SMAX_X2, + GLOBALOp.GLOBAL_ATOMIC_UMAX_X2: _GLOBALOp_GLOBAL_ATOMIC_UMAX_X2, + GLOBALOp.GLOBAL_ATOMIC_AND_X2: _GLOBALOp_GLOBAL_ATOMIC_AND_X2, + GLOBALOp.GLOBAL_ATOMIC_OR_X2: _GLOBALOp_GLOBAL_ATOMIC_OR_X2, + GLOBALOp.GLOBAL_ATOMIC_XOR_X2: _GLOBALOp_GLOBAL_ATOMIC_XOR_X2, + GLOBALOp.GLOBAL_ATOMIC_INC_X2: _GLOBALOp_GLOBAL_ATOMIC_INC_X2, + GLOBALOp.GLOBAL_ATOMIC_DEC_X2: _GLOBALOp_GLOBAL_ATOMIC_DEC_X2, +} + +def _SCRATCHOp_SCRATCH_LOAD_UBYTE(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA.u32 = (_pack(0, MEM[addr].u8)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_SBYTE(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA.i32 = (signext(MEM[addr].i8)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_USHORT(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA.u32 = (_pack(0, MEM[addr].u16)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_SSHORT(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA.i32 = 
(signext(MEM[addr].i16)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_DWORD(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 0] = MEM[addr].b32 + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_DWORDX2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 0] = MEM[addr].b32 + VDATA[63 : 32] = MEM[addr + 4].b32 + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_DWORDX3(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 0] = MEM[addr].b32 + VDATA[63 : 32] = MEM[addr + 4].b32 + VDATA[95 : 64] = MEM[addr + 8].b32 + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_DWORDX4(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 0] = MEM[addr].b32 + VDATA[63 : 32] = MEM[addr + 4].b32 + VDATA[95 : 64] = MEM[addr + 8].b32 + VDATA[127 : 96] = MEM[addr + 12].b32 + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_STORE_BYTE(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b8 = VDATA[7 : 0] + return {} + +def _SCRATCHOp_SCRATCH_STORE_BYTE_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b8 = VDATA[23 : 16] + return {} + +def _SCRATCHOp_SCRATCH_STORE_SHORT(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b16 = VDATA[15 : 0] + return {} + +def _SCRATCHOp_SCRATCH_STORE_SHORT_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b16 = VDATA[31 : 16] + return {} + +def _SCRATCHOp_SCRATCH_STORE_DWORD(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b32 = VDATA[31 : 0] + return {} + +def _SCRATCHOp_SCRATCH_STORE_DWORDX2(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b32 = VDATA[31 : 0] + MEM[addr + 4].b32 = VDATA[63 : 32] + return {} + +def _SCRATCHOp_SCRATCH_STORE_DWORDX3(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b32 = VDATA[31 : 0] + MEM[addr + 4].b32 = VDATA[63 : 32] + MEM[addr + 8].b32 = VDATA[95 : 64] + return {} + +def _SCRATCHOp_SCRATCH_STORE_DWORDX4(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + MEM[addr].b32 = VDATA[31 : 0] + MEM[addr + 4].b32 = VDATA[63 : 32] + MEM[addr + 8].b32 = VDATA[95 : 64] + MEM[addr + 12].b32 = VDATA[127 : 96] + return {} + +def _SCRATCHOp_SCRATCH_LOAD_UBYTE_D16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[15 : 0].u16 = (_pack(0, MEM[addr].u8)) + return {'VDATA': VDATA} + +def 
_SCRATCHOp_SCRATCH_LOAD_UBYTE_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 16].u16 = (_pack(0, MEM[addr].u8)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_SBYTE_D16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[15 : 0].i16 = (signext(MEM[addr].i8)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_SBYTE_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 16].i16 = (signext(MEM[addr].i8)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_SHORT_D16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[15 : 0].b16 = MEM[addr].b16 + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_SHORT_D16_HI(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + addr = CalcScratchAddr(ADDR.b32, SADDR.b32, OFFSET.b32) + VDATA[31 : 16].b16 = MEM[addr].b16 + return {'VDATA': VDATA} + +SCRATCHOp_FUNCTIONS = { + SCRATCHOp.SCRATCH_LOAD_UBYTE: _SCRATCHOp_SCRATCH_LOAD_UBYTE, + SCRATCHOp.SCRATCH_LOAD_SBYTE: _SCRATCHOp_SCRATCH_LOAD_SBYTE, + SCRATCHOp.SCRATCH_LOAD_USHORT: _SCRATCHOp_SCRATCH_LOAD_USHORT, + SCRATCHOp.SCRATCH_LOAD_SSHORT: _SCRATCHOp_SCRATCH_LOAD_SSHORT, + SCRATCHOp.SCRATCH_LOAD_DWORD: _SCRATCHOp_SCRATCH_LOAD_DWORD, + SCRATCHOp.SCRATCH_LOAD_DWORDX2: _SCRATCHOp_SCRATCH_LOAD_DWORDX2, + SCRATCHOp.SCRATCH_LOAD_DWORDX3: _SCRATCHOp_SCRATCH_LOAD_DWORDX3, + SCRATCHOp.SCRATCH_LOAD_DWORDX4: _SCRATCHOp_SCRATCH_LOAD_DWORDX4, + SCRATCHOp.SCRATCH_STORE_BYTE: _SCRATCHOp_SCRATCH_STORE_BYTE, + SCRATCHOp.SCRATCH_STORE_BYTE_D16_HI: _SCRATCHOp_SCRATCH_STORE_BYTE_D16_HI, + SCRATCHOp.SCRATCH_STORE_SHORT: _SCRATCHOp_SCRATCH_STORE_SHORT, + SCRATCHOp.SCRATCH_STORE_SHORT_D16_HI: _SCRATCHOp_SCRATCH_STORE_SHORT_D16_HI, + SCRATCHOp.SCRATCH_STORE_DWORD: _SCRATCHOp_SCRATCH_STORE_DWORD, + SCRATCHOp.SCRATCH_STORE_DWORDX2: _SCRATCHOp_SCRATCH_STORE_DWORDX2, + SCRATCHOp.SCRATCH_STORE_DWORDX3: _SCRATCHOp_SCRATCH_STORE_DWORDX3, + SCRATCHOp.SCRATCH_STORE_DWORDX4: _SCRATCHOp_SCRATCH_STORE_DWORDX4, + SCRATCHOp.SCRATCH_LOAD_UBYTE_D16: _SCRATCHOp_SCRATCH_LOAD_UBYTE_D16, + SCRATCHOp.SCRATCH_LOAD_UBYTE_D16_HI: _SCRATCHOp_SCRATCH_LOAD_UBYTE_D16_HI, + SCRATCHOp.SCRATCH_LOAD_SBYTE_D16: _SCRATCHOp_SCRATCH_LOAD_SBYTE_D16, + SCRATCHOp.SCRATCH_LOAD_SBYTE_D16_HI: _SCRATCHOp_SCRATCH_LOAD_SBYTE_D16_HI, + SCRATCHOp.SCRATCH_LOAD_SHORT_D16: _SCRATCHOp_SCRATCH_LOAD_SHORT_D16, + SCRATCHOp.SCRATCH_LOAD_SHORT_D16_HI: _SCRATCHOp_SCRATCH_LOAD_SHORT_D16_HI, +} + COMPILED_FUNCTIONS = { SOP1Op: SOP1Op_FUNCTIONS, SOP2Op: SOP2Op_FUNCTIONS, @@ -6282,6 +8869,10 @@ COMPILED_FUNCTIONS = { VOPCOp: VOPCOp_FUNCTIONS, VOP3AOp: VOP3AOp_FUNCTIONS, VOP3BOp: VOP3BOp_FUNCTIONS, + DSOp: DSOp_FUNCTIONS, + FLATOp: FLATOp_FUNCTIONS, + GLOBALOp: GLOBALOp_FUNCTIONS, + SCRATCHOp: SCRATCHOp_FUNCTIONS, } def get_compiled_functions(): return COMPILED_FUNCTIONS \ No newline at end of file diff --git a/extra/assembly/amd/autogen/rdna3/gen_pcode.py b/extra/assembly/amd/autogen/rdna3/gen_pcode.py index fa9392de7a..eaf9285d16 100644 --- a/extra/assembly/amd/autogen/rdna3/gen_pcode.py +++ b/extra/assembly/amd/autogen/rdna3/gen_pcode.py @@ -1,9 +1,9 @@ # autogenerated by pdf.py - do not edit # to regenerate: python -m 
extra.assembly.amd.pdf --arch rdna3 -# ruff: noqa: E501,F405,F403 +# ruff: noqa: E501 # mypy: ignore-errors -from extra.assembly.amd.autogen.rdna3.enum import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp -from extra.assembly.amd.pcode import * +from extra.assembly.amd.autogen.rdna3.enum import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp, DSOp, FLATOp, GLOBALOp, SCRATCHOp +from extra.assembly.amd.pcode import ABSDIFF, BYTE_PERMUTE, DENORM, F, GT_NEG_ZERO, INF, LT_NEG_ZERO, MAX_FLOAT_F32, OVERFLOW_F32, OVERFLOW_F64, PI, ROUND_MODE, Reg, SAT8, SliceProxy, TWO_OVER_PI_1201, UNDERFLOW_F32, UNDERFLOW_F64, WAVE32, WAVE64, WAVE_MODE, _pack, _pack32, bf16_to_f32, cos, cvtToQuietNAN, exponent, f16_to_f32, f16_to_i16, f16_to_snorm, f16_to_u16, f16_to_unorm, f32_to_f16, f32_to_f64, f32_to_i32, f32_to_snorm, f32_to_u32, f32_to_u8, f32_to_unorm, f64_to_f32, f64_to_i32, f64_to_u32, floor, fma, fract, i16_to_f16, i32_to_f32, i32_to_f64, i32_to_i16, isEven, isNAN, isQuietNAN, isSignalNAN, ldexp, log2, mantissa, pow, s_ff1_i32_b32, s_ff1_i32_b64, sign, signext, signext_from_bit, sin, sqrt, trunc, u16_to_f16, u32_to_f32, u32_to_f64, u32_to_u16, u4_to_u32, u8_to_u32, v_cvt_i16_f32, v_cvt_u16_f32, v_max3_f16, v_max3_f32, v_max3_i16, v_max3_i32, v_max3_u16, v_max3_u32, v_max_f16, v_max_f32, v_max_i16, v_max_i32, v_max_u16, v_max_u32, v_min3_f16, v_min3_f32, v_min_f16, v_min_f32, v_min_i16, v_min_i32, v_min_u16, v_min_u32, v_msad_u8, v_sad_u8 def _SOP1Op_S_MOV_B32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): D0.b32 = S0.b32 @@ -6254,9 +6254,2204 @@ VOPCOp_FUNCTIONS = { VOPCOp.V_CMPX_CLASS_F64: _VOPCOp_V_CMPX_CLASS_F64, } +def _DSOp_DS_ADD_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + MEM[ADDR].u32 += DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + MEM[ADDR].u32 -= DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_RSUB_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + MEM[ADDR].u32 = DATA.u32 - MEM[ADDR].u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_INC_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_DEC_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_I32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i32) + src = DATA.i32 + MEM[ADDR].i32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_I32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i32) + src 
= DATA.i32 + MEM[ADDR].i32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_AND_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = (tmp & DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_OR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = (tmp | DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_XOR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = (tmp ^ DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MSKOR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = ((tmp & ~DATA.b32) | DATA2.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STORE_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + MEM[ADDR + OFFSET.u32].b32 = DATA[31 : 0] + return {} + +def _DSOp_DS_STORE_2ADDR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + MEM[ADDR + OFFSET0.u32 * 4].b32 = DATA[31 : 0] + MEM[ADDR + OFFSET1.u32 * 4].b32 = DATA2[31 : 0] + return {} + +def _DSOp_DS_STORE_2ADDR_STRIDE64_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + MEM[ADDR + OFFSET0.u32 * 256].b32 = DATA[31 : 0] + MEM[ADDR + OFFSET1.u32 * 256].b32 = DATA2[31 : 0] + return {} + +def _DSOp_DS_CMPSTORE_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + src = DATA.b32 + cmp = DATA2.b32 + MEM[ADDR].b32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPSTORE_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + src = DATA.f32 + cmp = DATA2.f32 + MEM[ADDR].f32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + src = DATA.f32 + MEM[ADDR].f32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 
+ # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + src = DATA.f32 + MEM[ADDR].f32 = ((src) if (src > tmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_ADD_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + MEM[ADDR].f32 += DATA.f32 + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STORE_B8(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + MEM[ADDR].b8 = DATA[7 : 0] + return {} + +def _DSOp_DS_STORE_B16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + MEM[ADDR].b16 = DATA[15 : 0] + return {} + +def _DSOp_DS_ADD_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + MEM[ADDR].u32 += DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + MEM[ADDR].u32 -= DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_RSUB_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + MEM[ADDR].u32 = DATA.u32 - MEM[ADDR].u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_INC_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_DEC_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_I32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i32) + src = DATA.i32 + MEM[ADDR].i32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_I32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i32) + src = DATA.i32 + MEM[ADDR].i32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_AND_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = (tmp & DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': 
RETURN_DATA} + +def _DSOp_DS_OR_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = (tmp | DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_XOR_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = (tmp ^ DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MSKOR_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = ((tmp & ~DATA.b32) | DATA2.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STOREXCHG_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = DATA.b32 + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STOREXCHG_2ADDR_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + ADDR_BASE = ADDR + # --- compiled pseudocode --- + addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 4 + addr2 = ADDR_BASE.u32 + OFFSET1.u32 * 4 + tmp1 = MEM[addr1].b32 + tmp2 = MEM[addr2].b32 + MEM[addr1].b32 = DATA.b32 + MEM[addr2].b32 = DATA2.b32 + RETURN_DATA[31 : 0] = tmp1 + RETURN_DATA[63 : 32] = tmp2 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + ADDR_BASE = ADDR + # --- compiled pseudocode --- + addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 256 + addr2 = ADDR_BASE.u32 + OFFSET1.u32 * 256 + tmp1 = MEM[addr1].b32 + tmp2 = MEM[addr2].b32 + MEM[addr1].b32 = DATA.b32 + MEM[addr2].b32 = DATA2.b32 + RETURN_DATA[31 : 0] = tmp1 + RETURN_DATA[63 : 32] = tmp2 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPSTORE_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + src = DATA.b32 + cmp = DATA2.b32 + MEM[ADDR].b32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPSTORE_RTN_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + src = DATA.f32 + cmp = DATA2.f32 + MEM[ADDR].f32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + src = DATA.f32 + MEM[ADDR].f32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + src = DATA.f32 + MEM[ADDR].f32 = ((src) if (src > tmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_WRAP_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + MEM[ADDR].u32 = ((tmp - DATA.u32) if (tmp >= DATA.u32) else (tmp + DATA2.u32)) + 
RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET.u32].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_2ADDR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET0.u32 * 4].b32 + RETURN_DATA[63 : 32] = MEM[ADDR + OFFSET1.u32 * 4].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_2ADDR_STRIDE64_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET0.u32 * 256].b32 + RETURN_DATA[63 : 32] = MEM[ADDR + OFFSET1.u32 * 256].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_I8(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA.i32 = (signext(MEM[ADDR].i8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_U8(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA.u32 = (_pack(0, MEM[ADDR].u8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_I16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA.i32 = (signext(MEM[ADDR].i16)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_U16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA.u32 = (_pack(0, MEM[ADDR].u16)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_ADD_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + MEM[ADDR].u64 += DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + MEM[ADDR].u64 -= DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_RSUB_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + MEM[ADDR].u64 = DATA.u64 - MEM[ADDR].u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_INC_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_DEC_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_I64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i64) + src = DATA.i64 + MEM[ADDR].i64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_I64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i64) + src = DATA.i64 + 
MEM[ADDR].i64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_AND_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = (tmp & DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_OR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = (tmp | DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_XOR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = (tmp ^ DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MSKOR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = ((tmp & ~DATA.b64) | DATA2.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STORE_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + MEM[ADDR + OFFSET.u32].b32 = DATA[31 : 0] + MEM[ADDR + OFFSET.u32 + 4].b32 = DATA[63 : 32] + return {} + +def _DSOp_DS_STORE_2ADDR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + MEM[ADDR + OFFSET0.u32 * 8].b32 = DATA[31 : 0] + MEM[ADDR + OFFSET0.u32 * 8 + 4].b32 = DATA[63 : 32] + MEM[ADDR + OFFSET1.u32 * 8].b32 = DATA2[31 : 0] + MEM[ADDR + OFFSET1.u32 * 8 + 4].b32 = DATA2[63 : 32] + return {} + +def _DSOp_DS_STORE_2ADDR_STRIDE64_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + MEM[ADDR + OFFSET0.u32 * 512].b32 = DATA[31 : 0] + MEM[ADDR + OFFSET0.u32 * 512 + 4].b32 = DATA[63 : 32] + MEM[ADDR + OFFSET1.u32 * 512].b32 = DATA2[31 : 0] + MEM[ADDR + OFFSET1.u32 * 512 + 4].b32 = DATA2[63 : 32] + return {} + +def _DSOp_DS_CMPSTORE_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + src = DATA.b64 + cmp = DATA2.b64 + MEM[ADDR].b64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPSTORE_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f64) + src = DATA.f64 + cmp = DATA2.f64 + MEM[ADDR].f64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode 
--- + tmp = Reg(MEM[ADDR].f64) + src = DATA.f64 + MEM[ADDR].f64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f64) + src = DATA.f64 + MEM[ADDR].f64 = ((src) if (src > tmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_ADD_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + MEM[ADDR].u64 += DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + MEM[ADDR].u64 -= DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_RSUB_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + MEM[ADDR].u64 = DATA.u64 - MEM[ADDR].u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_INC_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_DEC_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_I64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i64) + src = DATA.i64 + MEM[ADDR].i64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_I64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i64) + src = DATA.i64 + MEM[ADDR].i64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_AND_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = (tmp & DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_OR_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = (tmp | DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_XOR_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, 
OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = (tmp ^ DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MSKOR_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = ((tmp & ~DATA.b64) | DATA2.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STOREXCHG_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = DATA.b64 + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STOREXCHG_2ADDR_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + ADDR_BASE = ADDR + # --- compiled pseudocode --- + addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 8 + addr2 = ADDR_BASE.u32 + OFFSET1.u32 * 8 + tmp1 = MEM[addr1].b64 + tmp2 = MEM[addr2].b64 + MEM[addr1].b64 = DATA.b64 + MEM[addr2].b64 = DATA2.b64 + RETURN_DATA[63 : 0] = tmp1 + RETURN_DATA[127 : 64] = tmp2 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + ADDR_BASE = ADDR + # --- compiled pseudocode --- + addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 512 + addr2 = ADDR_BASE.u32 + OFFSET1.u32 * 512 + tmp1 = MEM[addr1].b64 + tmp2 = MEM[addr2].b64 + MEM[addr1].b64 = DATA.b64 + MEM[addr2].b64 = DATA2.b64 + RETURN_DATA[63 : 0] = tmp1 + RETURN_DATA[127 : 64] = tmp2 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPSTORE_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + src = DATA.b64 + cmp = DATA2.b64 + MEM[ADDR].b64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPSTORE_RTN_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f64) + src = DATA.f64 + cmp = DATA2.f64 + MEM[ADDR].f64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f64) + src = DATA.f64 + MEM[ADDR].f64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_F64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f64) + src = DATA.f64 + MEM[ADDR].f64 = ((src) if (src > tmp) else (tmp)) + RETURN_DATA.f64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET.u32].b32 + RETURN_DATA[63 : 32] = MEM[ADDR + OFFSET.u32 + 4].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_2ADDR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET0.u32 * 8].b32 + RETURN_DATA[63 : 32] = MEM[ADDR + OFFSET0.u32 * 8 + 4].b32 + RETURN_DATA[95 : 64] = 
MEM[ADDR + OFFSET1.u32 * 8].b32 + RETURN_DATA[127 : 96] = MEM[ADDR + OFFSET1.u32 * 8 + 4].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_2ADDR_STRIDE64_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET0.u32 * 512].b32 + RETURN_DATA[63 : 32] = MEM[ADDR + OFFSET0.u32 * 512 + 4].b32 + RETURN_DATA[95 : 64] = MEM[ADDR + OFFSET1.u32 * 512].b32 + RETURN_DATA[127 : 96] = MEM[ADDR + OFFSET1.u32 * 512 + 4].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_ADD_RTN_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + MEM[ADDR].f32 += DATA.f32 + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CONDXCHG32_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + ADDR = S0.u32 + DATA = S1.u64 + offset = _pack(OFFSET1, OFFSET0) + RETURN_DATA[0] = LDS[ADDR0].u32 + if DATA[31]: + LDS[ADDR0] = _pack(0, DATA[30 : 0]) + RETURN_DATA[1] = LDS[ADDR1].u32 + if DATA[63]: + LDS[ADDR1] = _pack(0, DATA[62 : 32]) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STORE_B8_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + MEM[ADDR].b8 = DATA[23 : 16] + return {} + +def _DSOp_DS_STORE_B16_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + MEM[ADDR].b16 = DATA[31 : 16] + return {} + +def _DSOp_DS_LOAD_U8_D16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[15 : 0].u16 = (_pack(0, MEM[ADDR].u8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_U8_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 16].u16 = (_pack(0, MEM[ADDR].u8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_I8_D16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[15 : 0].i16 = (signext(MEM[ADDR].i8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_I8_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 16].i16 = (signext(MEM[ADDR].i8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_U16_D16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_U16_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 16].u16 = MEM[ADDR].u16 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_PERMUTE_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + for i in range(0, int(((63) if (WAVE64) else (31)))+1): + tmp[i] = 0x0 + for i in range(0, int(((63) if (WAVE64) else (31)))+1): + if EXEC[i].u1: + dst_lane = (VGPR[i][ADDR] + OFFSET.b32) / 4 % 32 + tmp[dst_lane] = VGPR[i][DATA0] + for i in range(0, int(((63) if (WAVE64) else (31)))+1): + if EXEC[i].u1: + VGPR[i][VDST] = tmp[i] + return {} + +def _DSOp_DS_BPERMUTE_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + 
OFFSET = OFFSET0 + # --- compiled pseudocode --- + for i in range(0, int(((63) if (WAVE64) else (31)))+1): + tmp[i] = 0x0 + for i in range(0, int(((63) if (WAVE64) else (31)))+1): + src_lane = (VGPR[i][ADDR] + OFFSET.b32) / 4 % 32 + if EXEC[src_lane].u1: + tmp[i] = VGPR[src_lane][DATA0] + for i in range(0, int(((63) if (WAVE64) else (31)))+1): + if EXEC[i].u1: + VGPR[i][VDST] = tmp[i] + return {} + +def _DSOp_DS_STORE_B96(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + MEM[ADDR + OFFSET.u32].b32 = DATA[31 : 0] + MEM[ADDR + OFFSET.u32 + 4].b32 = DATA[63 : 32] + MEM[ADDR + OFFSET.u32 + 8].b32 = DATA[95 : 64] + return {} + +def _DSOp_DS_STORE_B128(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + MEM[ADDR + OFFSET.u32].b32 = DATA[31 : 0] + MEM[ADDR + OFFSET.u32 + 4].b32 = DATA[63 : 32] + MEM[ADDR + OFFSET.u32 + 8].b32 = DATA[95 : 64] + MEM[ADDR + OFFSET.u32 + 12].b32 = DATA[127 : 96] + return {} + +def _DSOp_DS_LOAD_B96(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET.u32].b32 + RETURN_DATA[63 : 32] = MEM[ADDR + OFFSET.u32 + 4].b32 + RETURN_DATA[95 : 64] = MEM[ADDR + OFFSET.u32 + 8].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_B128(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 0] = MEM[ADDR + OFFSET.u32].b32 + RETURN_DATA[63 : 32] = MEM[ADDR + OFFSET.u32 + 4].b32 + RETURN_DATA[95 : 64] = MEM[ADDR + OFFSET.u32 + 8].b32 + RETURN_DATA[127 : 96] = MEM[ADDR + OFFSET.u32 + 12].b32 + return {'RETURN_DATA': RETURN_DATA} + +DSOp_FUNCTIONS = { + DSOp.DS_ADD_U32: _DSOp_DS_ADD_U32, + DSOp.DS_SUB_U32: _DSOp_DS_SUB_U32, + DSOp.DS_RSUB_U32: _DSOp_DS_RSUB_U32, + DSOp.DS_INC_U32: _DSOp_DS_INC_U32, + DSOp.DS_DEC_U32: _DSOp_DS_DEC_U32, + DSOp.DS_MIN_I32: _DSOp_DS_MIN_I32, + DSOp.DS_MAX_I32: _DSOp_DS_MAX_I32, + DSOp.DS_MIN_U32: _DSOp_DS_MIN_U32, + DSOp.DS_MAX_U32: _DSOp_DS_MAX_U32, + DSOp.DS_AND_B32: _DSOp_DS_AND_B32, + DSOp.DS_OR_B32: _DSOp_DS_OR_B32, + DSOp.DS_XOR_B32: _DSOp_DS_XOR_B32, + DSOp.DS_MSKOR_B32: _DSOp_DS_MSKOR_B32, + DSOp.DS_STORE_B32: _DSOp_DS_STORE_B32, + DSOp.DS_STORE_2ADDR_B32: _DSOp_DS_STORE_2ADDR_B32, + DSOp.DS_STORE_2ADDR_STRIDE64_B32: _DSOp_DS_STORE_2ADDR_STRIDE64_B32, + DSOp.DS_CMPSTORE_B32: _DSOp_DS_CMPSTORE_B32, + DSOp.DS_CMPSTORE_F32: _DSOp_DS_CMPSTORE_F32, + DSOp.DS_MIN_F32: _DSOp_DS_MIN_F32, + DSOp.DS_MAX_F32: _DSOp_DS_MAX_F32, + DSOp.DS_ADD_F32: _DSOp_DS_ADD_F32, + DSOp.DS_STORE_B8: _DSOp_DS_STORE_B8, + DSOp.DS_STORE_B16: _DSOp_DS_STORE_B16, + DSOp.DS_ADD_RTN_U32: _DSOp_DS_ADD_RTN_U32, + DSOp.DS_SUB_RTN_U32: _DSOp_DS_SUB_RTN_U32, + DSOp.DS_RSUB_RTN_U32: _DSOp_DS_RSUB_RTN_U32, + DSOp.DS_INC_RTN_U32: _DSOp_DS_INC_RTN_U32, + DSOp.DS_DEC_RTN_U32: _DSOp_DS_DEC_RTN_U32, + DSOp.DS_MIN_RTN_I32: _DSOp_DS_MIN_RTN_I32, + DSOp.DS_MAX_RTN_I32: _DSOp_DS_MAX_RTN_I32, + DSOp.DS_MIN_RTN_U32: _DSOp_DS_MIN_RTN_U32, + DSOp.DS_MAX_RTN_U32: _DSOp_DS_MAX_RTN_U32, + DSOp.DS_AND_RTN_B32: _DSOp_DS_AND_RTN_B32, + DSOp.DS_OR_RTN_B32: _DSOp_DS_OR_RTN_B32, + DSOp.DS_XOR_RTN_B32: _DSOp_DS_XOR_RTN_B32, + DSOp.DS_MSKOR_RTN_B32: _DSOp_DS_MSKOR_RTN_B32, + DSOp.DS_STOREXCHG_RTN_B32: _DSOp_DS_STOREXCHG_RTN_B32, + DSOp.DS_STOREXCHG_2ADDR_RTN_B32: _DSOp_DS_STOREXCHG_2ADDR_RTN_B32, + DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32: 
_DSOp_DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32, + DSOp.DS_CMPSTORE_RTN_B32: _DSOp_DS_CMPSTORE_RTN_B32, + DSOp.DS_CMPSTORE_RTN_F32: _DSOp_DS_CMPSTORE_RTN_F32, + DSOp.DS_MIN_RTN_F32: _DSOp_DS_MIN_RTN_F32, + DSOp.DS_MAX_RTN_F32: _DSOp_DS_MAX_RTN_F32, + DSOp.DS_WRAP_RTN_B32: _DSOp_DS_WRAP_RTN_B32, + DSOp.DS_LOAD_B32: _DSOp_DS_LOAD_B32, + DSOp.DS_LOAD_2ADDR_B32: _DSOp_DS_LOAD_2ADDR_B32, + DSOp.DS_LOAD_2ADDR_STRIDE64_B32: _DSOp_DS_LOAD_2ADDR_STRIDE64_B32, + DSOp.DS_LOAD_I8: _DSOp_DS_LOAD_I8, + DSOp.DS_LOAD_U8: _DSOp_DS_LOAD_U8, + DSOp.DS_LOAD_I16: _DSOp_DS_LOAD_I16, + DSOp.DS_LOAD_U16: _DSOp_DS_LOAD_U16, + DSOp.DS_ADD_U64: _DSOp_DS_ADD_U64, + DSOp.DS_SUB_U64: _DSOp_DS_SUB_U64, + DSOp.DS_RSUB_U64: _DSOp_DS_RSUB_U64, + DSOp.DS_INC_U64: _DSOp_DS_INC_U64, + DSOp.DS_DEC_U64: _DSOp_DS_DEC_U64, + DSOp.DS_MIN_I64: _DSOp_DS_MIN_I64, + DSOp.DS_MAX_I64: _DSOp_DS_MAX_I64, + DSOp.DS_MIN_U64: _DSOp_DS_MIN_U64, + DSOp.DS_MAX_U64: _DSOp_DS_MAX_U64, + DSOp.DS_AND_B64: _DSOp_DS_AND_B64, + DSOp.DS_OR_B64: _DSOp_DS_OR_B64, + DSOp.DS_XOR_B64: _DSOp_DS_XOR_B64, + DSOp.DS_MSKOR_B64: _DSOp_DS_MSKOR_B64, + DSOp.DS_STORE_B64: _DSOp_DS_STORE_B64, + DSOp.DS_STORE_2ADDR_B64: _DSOp_DS_STORE_2ADDR_B64, + DSOp.DS_STORE_2ADDR_STRIDE64_B64: _DSOp_DS_STORE_2ADDR_STRIDE64_B64, + DSOp.DS_CMPSTORE_B64: _DSOp_DS_CMPSTORE_B64, + DSOp.DS_CMPSTORE_F64: _DSOp_DS_CMPSTORE_F64, + DSOp.DS_MIN_F64: _DSOp_DS_MIN_F64, + DSOp.DS_MAX_F64: _DSOp_DS_MAX_F64, + DSOp.DS_ADD_RTN_U64: _DSOp_DS_ADD_RTN_U64, + DSOp.DS_SUB_RTN_U64: _DSOp_DS_SUB_RTN_U64, + DSOp.DS_RSUB_RTN_U64: _DSOp_DS_RSUB_RTN_U64, + DSOp.DS_INC_RTN_U64: _DSOp_DS_INC_RTN_U64, + DSOp.DS_DEC_RTN_U64: _DSOp_DS_DEC_RTN_U64, + DSOp.DS_MIN_RTN_I64: _DSOp_DS_MIN_RTN_I64, + DSOp.DS_MAX_RTN_I64: _DSOp_DS_MAX_RTN_I64, + DSOp.DS_MIN_RTN_U64: _DSOp_DS_MIN_RTN_U64, + DSOp.DS_MAX_RTN_U64: _DSOp_DS_MAX_RTN_U64, + DSOp.DS_AND_RTN_B64: _DSOp_DS_AND_RTN_B64, + DSOp.DS_OR_RTN_B64: _DSOp_DS_OR_RTN_B64, + DSOp.DS_XOR_RTN_B64: _DSOp_DS_XOR_RTN_B64, + DSOp.DS_MSKOR_RTN_B64: _DSOp_DS_MSKOR_RTN_B64, + DSOp.DS_STOREXCHG_RTN_B64: _DSOp_DS_STOREXCHG_RTN_B64, + DSOp.DS_STOREXCHG_2ADDR_RTN_B64: _DSOp_DS_STOREXCHG_2ADDR_RTN_B64, + DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64: _DSOp_DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64, + DSOp.DS_CMPSTORE_RTN_B64: _DSOp_DS_CMPSTORE_RTN_B64, + DSOp.DS_CMPSTORE_RTN_F64: _DSOp_DS_CMPSTORE_RTN_F64, + DSOp.DS_MIN_RTN_F64: _DSOp_DS_MIN_RTN_F64, + DSOp.DS_MAX_RTN_F64: _DSOp_DS_MAX_RTN_F64, + DSOp.DS_LOAD_B64: _DSOp_DS_LOAD_B64, + DSOp.DS_LOAD_2ADDR_B64: _DSOp_DS_LOAD_2ADDR_B64, + DSOp.DS_LOAD_2ADDR_STRIDE64_B64: _DSOp_DS_LOAD_2ADDR_STRIDE64_B64, + DSOp.DS_ADD_RTN_F32: _DSOp_DS_ADD_RTN_F32, + DSOp.DS_CONDXCHG32_RTN_B64: _DSOp_DS_CONDXCHG32_RTN_B64, + DSOp.DS_STORE_B8_D16_HI: _DSOp_DS_STORE_B8_D16_HI, + DSOp.DS_STORE_B16_D16_HI: _DSOp_DS_STORE_B16_D16_HI, + DSOp.DS_LOAD_U8_D16: _DSOp_DS_LOAD_U8_D16, + DSOp.DS_LOAD_U8_D16_HI: _DSOp_DS_LOAD_U8_D16_HI, + DSOp.DS_LOAD_I8_D16: _DSOp_DS_LOAD_I8_D16, + DSOp.DS_LOAD_I8_D16_HI: _DSOp_DS_LOAD_I8_D16_HI, + DSOp.DS_LOAD_U16_D16: _DSOp_DS_LOAD_U16_D16, + DSOp.DS_LOAD_U16_D16_HI: _DSOp_DS_LOAD_U16_D16_HI, + DSOp.DS_PERMUTE_B32: _DSOp_DS_PERMUTE_B32, + DSOp.DS_BPERMUTE_B32: _DSOp_DS_BPERMUTE_B32, + DSOp.DS_STORE_B96: _DSOp_DS_STORE_B96, + DSOp.DS_STORE_B128: _DSOp_DS_STORE_B128, + DSOp.DS_LOAD_B96: _DSOp_DS_LOAD_B96, + DSOp.DS_LOAD_B128: _DSOp_DS_LOAD_B128, +} + +def _FLATOp_FLAT_LOAD_U8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA.u32 = (_pack(0, MEM[ADDR].u8)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_I8(MEM, 
ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA.i32 = (signext(MEM[ADDR].i8)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_U16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA.u32 = (_pack(0, MEM[ADDR].u16)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_I16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA.i32 = (signext(MEM[ADDR].i16)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 0] = MEM[ADDR].b32 + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 0] = MEM[ADDR].b32 + VDATA[63 : 32] = MEM[ADDR + 4].b32 + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_B96(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 0] = MEM[ADDR].b32 + VDATA[63 : 32] = MEM[ADDR + 4].b32 + VDATA[95 : 64] = MEM[ADDR + 8].b32 + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_B128(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 0] = MEM[ADDR].b32 + VDATA[63 : 32] = MEM[ADDR + 4].b32 + VDATA[95 : 64] = MEM[ADDR + 8].b32 + VDATA[127 : 96] = MEM[ADDR + 12].b32 + return {'VDATA': VDATA} + +def _FLATOp_FLAT_STORE_B8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b8 = VDATA[7 : 0] + return {} + +def _FLATOp_FLAT_STORE_B16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b16 = VDATA[15 : 0] + return {} + +def _FLATOp_FLAT_STORE_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b32 = VDATA[31 : 0] + return {} + +def _FLATOp_FLAT_STORE_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b32 = VDATA[31 : 0] + MEM[ADDR + 4].b32 = VDATA[63 : 32] + return {} + +def _FLATOp_FLAT_STORE_B96(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b32 = VDATA[31 : 0] + MEM[ADDR + 4].b32 = VDATA[63 : 32] + MEM[ADDR + 8].b32 = VDATA[95 : 64] + return {} + +def _FLATOp_FLAT_STORE_B128(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b32 = VDATA[31 : 0] + MEM[ADDR + 4].b32 = VDATA[63 : 32] + MEM[ADDR + 8].b32 = VDATA[95 : 64] + MEM[ADDR + 12].b32 = VDATA[127 : 96] + return {} + +def _FLATOp_FLAT_LOAD_D16_U8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[15 : 0].u16 = (_pack(0, MEM[ADDR].u8)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_D16_I8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[15 : 0].i16 = (signext(MEM[ADDR].i8)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_D16_B16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[15 : 0].b16 = MEM[ADDR].b16 + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_D16_HI_U8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 16].u16 = (_pack(0, MEM[ADDR].u8)) + return {'VDATA': VDATA} + +def _FLATOp_FLAT_LOAD_D16_HI_I8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 16].i16 = (signext(MEM[ADDR].i8)) + return {'VDATA': VDATA} + +def 
_FLATOp_FLAT_LOAD_D16_HI_B16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 16].b16 = MEM[ADDR].b16 + return {'VDATA': VDATA} + +def _FLATOp_FLAT_STORE_D16_HI_B8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b8 = VDATA[23 : 16] + return {} + +def _FLATOp_FLAT_STORE_D16_HI_B16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b16 = VDATA[31 : 16] + return {} + +def _FLATOp_FLAT_ATOMIC_SWAP_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = DATA.b32 + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_CMPSWAP_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA[31 : 0].u32 + cmp = DATA[63 : 32].u32 + MEM[ADDR].u32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_ADD_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + MEM[ADDR].u32 += DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_SUB_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + MEM[ADDR].u32 -= DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_MIN_I32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i32) + src = DATA.i32 + MEM[ADDR].i32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_MIN_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_MAX_I32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i32) + src = DATA.i32 + MEM[ADDR].i32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_MAX_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_AND_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = (tmp & DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_OR_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = (tmp | DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_XOR_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = (tmp ^ DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_INC_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + 
MEM[ADDR].u32 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_DEC_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_SWAP_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = DATA.b64 + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_CMPSWAP_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA[63 : 0].u64 + cmp = DATA[127 : 64].u64 + MEM[ADDR].u64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_ADD_U64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + MEM[ADDR].u64 += DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_SUB_U64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + MEM[ADDR].u64 -= DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_MIN_I64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i64) + src = DATA.i64 + MEM[ADDR].i64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_MIN_U64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_MAX_I64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i64) + src = DATA.i64 + MEM[ADDR].i64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_MAX_U64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_AND_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = (tmp & DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_OR_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = (tmp | DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_XOR_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = (tmp ^ DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_INC_U64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u64 = tmp + return 
{'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_DEC_U64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_CMPSWAP_F32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + src = DATA[31 : 0].f32 + cmp = DATA[63 : 32].f32 + MEM[ADDR].f32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_MIN_F32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + src = DATA.f32 + MEM[ADDR].f32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_MAX_F32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + src = DATA.f32 + MEM[ADDR].f32 = ((src) if (src > tmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _FLATOp_FLAT_ATOMIC_ADD_F32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + MEM[ADDR].f32 += DATA.f32 + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +FLATOp_FUNCTIONS = { + FLATOp.FLAT_LOAD_U8: _FLATOp_FLAT_LOAD_U8, + FLATOp.FLAT_LOAD_I8: _FLATOp_FLAT_LOAD_I8, + FLATOp.FLAT_LOAD_U16: _FLATOp_FLAT_LOAD_U16, + FLATOp.FLAT_LOAD_I16: _FLATOp_FLAT_LOAD_I16, + FLATOp.FLAT_LOAD_B32: _FLATOp_FLAT_LOAD_B32, + FLATOp.FLAT_LOAD_B64: _FLATOp_FLAT_LOAD_B64, + FLATOp.FLAT_LOAD_B96: _FLATOp_FLAT_LOAD_B96, + FLATOp.FLAT_LOAD_B128: _FLATOp_FLAT_LOAD_B128, + FLATOp.FLAT_STORE_B8: _FLATOp_FLAT_STORE_B8, + FLATOp.FLAT_STORE_B16: _FLATOp_FLAT_STORE_B16, + FLATOp.FLAT_STORE_B32: _FLATOp_FLAT_STORE_B32, + FLATOp.FLAT_STORE_B64: _FLATOp_FLAT_STORE_B64, + FLATOp.FLAT_STORE_B96: _FLATOp_FLAT_STORE_B96, + FLATOp.FLAT_STORE_B128: _FLATOp_FLAT_STORE_B128, + FLATOp.FLAT_LOAD_D16_U8: _FLATOp_FLAT_LOAD_D16_U8, + FLATOp.FLAT_LOAD_D16_I8: _FLATOp_FLAT_LOAD_D16_I8, + FLATOp.FLAT_LOAD_D16_B16: _FLATOp_FLAT_LOAD_D16_B16, + FLATOp.FLAT_LOAD_D16_HI_U8: _FLATOp_FLAT_LOAD_D16_HI_U8, + FLATOp.FLAT_LOAD_D16_HI_I8: _FLATOp_FLAT_LOAD_D16_HI_I8, + FLATOp.FLAT_LOAD_D16_HI_B16: _FLATOp_FLAT_LOAD_D16_HI_B16, + FLATOp.FLAT_STORE_D16_HI_B8: _FLATOp_FLAT_STORE_D16_HI_B8, + FLATOp.FLAT_STORE_D16_HI_B16: _FLATOp_FLAT_STORE_D16_HI_B16, + FLATOp.FLAT_ATOMIC_SWAP_B32: _FLATOp_FLAT_ATOMIC_SWAP_B32, + FLATOp.FLAT_ATOMIC_CMPSWAP_B32: _FLATOp_FLAT_ATOMIC_CMPSWAP_B32, + FLATOp.FLAT_ATOMIC_ADD_U32: _FLATOp_FLAT_ATOMIC_ADD_U32, + FLATOp.FLAT_ATOMIC_SUB_U32: _FLATOp_FLAT_ATOMIC_SUB_U32, + FLATOp.FLAT_ATOMIC_MIN_I32: _FLATOp_FLAT_ATOMIC_MIN_I32, + FLATOp.FLAT_ATOMIC_MIN_U32: _FLATOp_FLAT_ATOMIC_MIN_U32, + FLATOp.FLAT_ATOMIC_MAX_I32: _FLATOp_FLAT_ATOMIC_MAX_I32, + FLATOp.FLAT_ATOMIC_MAX_U32: _FLATOp_FLAT_ATOMIC_MAX_U32, + FLATOp.FLAT_ATOMIC_AND_B32: _FLATOp_FLAT_ATOMIC_AND_B32, + FLATOp.FLAT_ATOMIC_OR_B32: _FLATOp_FLAT_ATOMIC_OR_B32, + FLATOp.FLAT_ATOMIC_XOR_B32: _FLATOp_FLAT_ATOMIC_XOR_B32, + FLATOp.FLAT_ATOMIC_INC_U32: _FLATOp_FLAT_ATOMIC_INC_U32, + FLATOp.FLAT_ATOMIC_DEC_U32: _FLATOp_FLAT_ATOMIC_DEC_U32, + FLATOp.FLAT_ATOMIC_SWAP_B64: _FLATOp_FLAT_ATOMIC_SWAP_B64, + FLATOp.FLAT_ATOMIC_CMPSWAP_B64: _FLATOp_FLAT_ATOMIC_CMPSWAP_B64, + FLATOp.FLAT_ATOMIC_ADD_U64: 
_FLATOp_FLAT_ATOMIC_ADD_U64, + FLATOp.FLAT_ATOMIC_SUB_U64: _FLATOp_FLAT_ATOMIC_SUB_U64, + FLATOp.FLAT_ATOMIC_MIN_I64: _FLATOp_FLAT_ATOMIC_MIN_I64, + FLATOp.FLAT_ATOMIC_MIN_U64: _FLATOp_FLAT_ATOMIC_MIN_U64, + FLATOp.FLAT_ATOMIC_MAX_I64: _FLATOp_FLAT_ATOMIC_MAX_I64, + FLATOp.FLAT_ATOMIC_MAX_U64: _FLATOp_FLAT_ATOMIC_MAX_U64, + FLATOp.FLAT_ATOMIC_AND_B64: _FLATOp_FLAT_ATOMIC_AND_B64, + FLATOp.FLAT_ATOMIC_OR_B64: _FLATOp_FLAT_ATOMIC_OR_B64, + FLATOp.FLAT_ATOMIC_XOR_B64: _FLATOp_FLAT_ATOMIC_XOR_B64, + FLATOp.FLAT_ATOMIC_INC_U64: _FLATOp_FLAT_ATOMIC_INC_U64, + FLATOp.FLAT_ATOMIC_DEC_U64: _FLATOp_FLAT_ATOMIC_DEC_U64, + FLATOp.FLAT_ATOMIC_CMPSWAP_F32: _FLATOp_FLAT_ATOMIC_CMPSWAP_F32, + FLATOp.FLAT_ATOMIC_MIN_F32: _FLATOp_FLAT_ATOMIC_MIN_F32, + FLATOp.FLAT_ATOMIC_MAX_F32: _FLATOp_FLAT_ATOMIC_MAX_F32, + FLATOp.FLAT_ATOMIC_ADD_F32: _FLATOp_FLAT_ATOMIC_ADD_F32, +} + +def _GLOBALOp_GLOBAL_LOAD_U8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA.u32 = (_pack(0, MEM[ADDR].u8)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_I8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA.i32 = (signext(MEM[ADDR].i8)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_U16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA.u32 = (_pack(0, MEM[ADDR].u16)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_I16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA.i32 = (signext(MEM[ADDR].i16)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 0] = MEM[ADDR].b32 + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 0] = MEM[ADDR].b32 + VDATA[63 : 32] = MEM[ADDR + 4].b32 + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_B96(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 0] = MEM[ADDR].b32 + VDATA[63 : 32] = MEM[ADDR + 4].b32 + VDATA[95 : 64] = MEM[ADDR + 8].b32 + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_B128(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 0] = MEM[ADDR].b32 + VDATA[63 : 32] = MEM[ADDR + 4].b32 + VDATA[95 : 64] = MEM[ADDR + 8].b32 + VDATA[127 : 96] = MEM[ADDR + 12].b32 + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_STORE_B8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b8 = VDATA[7 : 0] + return {} + +def _GLOBALOp_GLOBAL_STORE_B16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b16 = VDATA[15 : 0] + return {} + +def _GLOBALOp_GLOBAL_STORE_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b32 = VDATA[31 : 0] + return {} + +def _GLOBALOp_GLOBAL_STORE_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b32 = VDATA[31 : 0] + MEM[ADDR + 4].b32 = VDATA[63 : 32] + return {} + +def _GLOBALOp_GLOBAL_STORE_B96(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b32 = VDATA[31 : 0] + MEM[ADDR + 4].b32 = VDATA[63 : 32] + MEM[ADDR + 8].b32 = VDATA[95 : 64] + return {} + +def _GLOBALOp_GLOBAL_STORE_B128(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled 
pseudocode --- + MEM[ADDR].b32 = VDATA[31 : 0] + MEM[ADDR + 4].b32 = VDATA[63 : 32] + MEM[ADDR + 8].b32 = VDATA[95 : 64] + MEM[ADDR + 12].b32 = VDATA[127 : 96] + return {} + +def _GLOBALOp_GLOBAL_LOAD_D16_U8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[15 : 0].u16 = (_pack(0, MEM[ADDR].u8)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_D16_I8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[15 : 0].i16 = (signext(MEM[ADDR].i8)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_D16_B16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[15 : 0].b16 = MEM[ADDR].b16 + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_D16_HI_U8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 16].u16 = (_pack(0, MEM[ADDR].u8)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_D16_HI_I8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 16].i16 = (signext(MEM[ADDR].i8)) + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_LOAD_D16_HI_B16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 16].b16 = MEM[ADDR].b16 + return {'VDATA': VDATA} + +def _GLOBALOp_GLOBAL_STORE_D16_HI_B8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b8 = VDATA[23 : 16] + return {} + +def _GLOBALOp_GLOBAL_STORE_D16_HI_B16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b16 = VDATA[31 : 16] + return {} + +def _GLOBALOp_GLOBAL_ATOMIC_SWAP_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = DATA.b32 + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_CMPSWAP_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA[31 : 0].u32 + cmp = DATA[63 : 32].u32 + MEM[ADDR].u32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_ADD_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + MEM[ADDR].u32 += DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_SUB_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + MEM[ADDR].u32 -= DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_CSUB_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + old_value = MEM[ADDR].u32 + if old_value < DATA.u32: + new_value = 0 + else: + new_value = old_value - DATA.u32 + MEM[ADDR].u32 = new_value + RETURN_DATA.u32 = old_value + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_MIN_I32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i32) + src = DATA.i32 + MEM[ADDR].i32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_MIN_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((src) if (src < tmp) else (tmp)) + 
RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_MAX_I32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i32) + src = DATA.i32 + MEM[ADDR].i32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_MAX_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_AND_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = (tmp & DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_OR_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = (tmp | DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_XOR_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + MEM[ADDR].b32 = (tmp ^ DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_INC_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_DEC_U32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_SWAP_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = DATA.b64 + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_CMPSWAP_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA[63 : 0].u64 + cmp = DATA[127 : 64].u64 + MEM[ADDR].u64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_ADD_U64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + MEM[ADDR].u64 += DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_SUB_U64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + MEM[ADDR].u64 -= DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_MIN_I64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i64) + src = DATA.i64 + MEM[ADDR].i64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_MIN_U64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': 
RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_MAX_I64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].i64) + src = DATA.i64 + MEM[ADDR].i64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_MAX_U64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_AND_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = (tmp & DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_OR_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = (tmp | DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_XOR_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b64) + MEM[ADDR].b64 = (tmp ^ DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_INC_U64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_DEC_U64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].u64) + src = DATA.u64 + MEM[ADDR].u64 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_CMPSWAP_F32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + src = DATA[31 : 0].f32 + cmp = DATA[63 : 32].f32 + MEM[ADDR].f32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_MIN_F32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + src = DATA.f32 + MEM[ADDR].f32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_MAX_F32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + src = DATA.f32 + MEM[ADDR].f32 = ((src) if (src > tmp) else (tmp)) + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _GLOBALOp_GLOBAL_ATOMIC_ADD_F32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].f32) + MEM[ADDR].f32 += DATA.f32 + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +GLOBALOp_FUNCTIONS = { + GLOBALOp.GLOBAL_LOAD_U8: _GLOBALOp_GLOBAL_LOAD_U8, + GLOBALOp.GLOBAL_LOAD_I8: _GLOBALOp_GLOBAL_LOAD_I8, + GLOBALOp.GLOBAL_LOAD_U16: _GLOBALOp_GLOBAL_LOAD_U16, + GLOBALOp.GLOBAL_LOAD_I16: _GLOBALOp_GLOBAL_LOAD_I16, + GLOBALOp.GLOBAL_LOAD_B32: _GLOBALOp_GLOBAL_LOAD_B32, + GLOBALOp.GLOBAL_LOAD_B64: _GLOBALOp_GLOBAL_LOAD_B64, + GLOBALOp.GLOBAL_LOAD_B96: _GLOBALOp_GLOBAL_LOAD_B96, + GLOBALOp.GLOBAL_LOAD_B128: _GLOBALOp_GLOBAL_LOAD_B128, + GLOBALOp.GLOBAL_STORE_B8: 
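All of the GLOBAL atomics above share one contract: read the old value, apply the operation to memory, and hand the old value back through RETURN_DATA (only consumed when the instruction asks for the return value). A compact model of that contract, with a dict standing in for memory and illustrative names rather than the emulator's:

mem: dict[int, int] = {}   # address -> 32-bit value, stand-in for MEM[ADDR].u32

def atomic_rmw32(addr: int, data: int, op) -> int:
    old = mem.get(addr, 0)                    # tmp = MEM[ADDR].u32
    mem[addr] = op(old, data) & 0xffffffff    # apply the op, keep 32 bits
    return old                                # RETURN_DATA.u32 = tmp

prev = atomic_rmw32(0x100, 5, lambda old, d: old + d)          # like GLOBAL_ATOMIC_ADD_U32
prev = atomic_rmw32(0x100, 3, lambda old, d: max(old - d, 0))  # like GLOBAL_ATOMIC_CSUB_U32 (clamped subtract)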
_GLOBALOp_GLOBAL_STORE_B8, + GLOBALOp.GLOBAL_STORE_B16: _GLOBALOp_GLOBAL_STORE_B16, + GLOBALOp.GLOBAL_STORE_B32: _GLOBALOp_GLOBAL_STORE_B32, + GLOBALOp.GLOBAL_STORE_B64: _GLOBALOp_GLOBAL_STORE_B64, + GLOBALOp.GLOBAL_STORE_B96: _GLOBALOp_GLOBAL_STORE_B96, + GLOBALOp.GLOBAL_STORE_B128: _GLOBALOp_GLOBAL_STORE_B128, + GLOBALOp.GLOBAL_LOAD_D16_U8: _GLOBALOp_GLOBAL_LOAD_D16_U8, + GLOBALOp.GLOBAL_LOAD_D16_I8: _GLOBALOp_GLOBAL_LOAD_D16_I8, + GLOBALOp.GLOBAL_LOAD_D16_B16: _GLOBALOp_GLOBAL_LOAD_D16_B16, + GLOBALOp.GLOBAL_LOAD_D16_HI_U8: _GLOBALOp_GLOBAL_LOAD_D16_HI_U8, + GLOBALOp.GLOBAL_LOAD_D16_HI_I8: _GLOBALOp_GLOBAL_LOAD_D16_HI_I8, + GLOBALOp.GLOBAL_LOAD_D16_HI_B16: _GLOBALOp_GLOBAL_LOAD_D16_HI_B16, + GLOBALOp.GLOBAL_STORE_D16_HI_B8: _GLOBALOp_GLOBAL_STORE_D16_HI_B8, + GLOBALOp.GLOBAL_STORE_D16_HI_B16: _GLOBALOp_GLOBAL_STORE_D16_HI_B16, + GLOBALOp.GLOBAL_ATOMIC_SWAP_B32: _GLOBALOp_GLOBAL_ATOMIC_SWAP_B32, + GLOBALOp.GLOBAL_ATOMIC_CMPSWAP_B32: _GLOBALOp_GLOBAL_ATOMIC_CMPSWAP_B32, + GLOBALOp.GLOBAL_ATOMIC_ADD_U32: _GLOBALOp_GLOBAL_ATOMIC_ADD_U32, + GLOBALOp.GLOBAL_ATOMIC_SUB_U32: _GLOBALOp_GLOBAL_ATOMIC_SUB_U32, + GLOBALOp.GLOBAL_ATOMIC_CSUB_U32: _GLOBALOp_GLOBAL_ATOMIC_CSUB_U32, + GLOBALOp.GLOBAL_ATOMIC_MIN_I32: _GLOBALOp_GLOBAL_ATOMIC_MIN_I32, + GLOBALOp.GLOBAL_ATOMIC_MIN_U32: _GLOBALOp_GLOBAL_ATOMIC_MIN_U32, + GLOBALOp.GLOBAL_ATOMIC_MAX_I32: _GLOBALOp_GLOBAL_ATOMIC_MAX_I32, + GLOBALOp.GLOBAL_ATOMIC_MAX_U32: _GLOBALOp_GLOBAL_ATOMIC_MAX_U32, + GLOBALOp.GLOBAL_ATOMIC_AND_B32: _GLOBALOp_GLOBAL_ATOMIC_AND_B32, + GLOBALOp.GLOBAL_ATOMIC_OR_B32: _GLOBALOp_GLOBAL_ATOMIC_OR_B32, + GLOBALOp.GLOBAL_ATOMIC_XOR_B32: _GLOBALOp_GLOBAL_ATOMIC_XOR_B32, + GLOBALOp.GLOBAL_ATOMIC_INC_U32: _GLOBALOp_GLOBAL_ATOMIC_INC_U32, + GLOBALOp.GLOBAL_ATOMIC_DEC_U32: _GLOBALOp_GLOBAL_ATOMIC_DEC_U32, + GLOBALOp.GLOBAL_ATOMIC_SWAP_B64: _GLOBALOp_GLOBAL_ATOMIC_SWAP_B64, + GLOBALOp.GLOBAL_ATOMIC_CMPSWAP_B64: _GLOBALOp_GLOBAL_ATOMIC_CMPSWAP_B64, + GLOBALOp.GLOBAL_ATOMIC_ADD_U64: _GLOBALOp_GLOBAL_ATOMIC_ADD_U64, + GLOBALOp.GLOBAL_ATOMIC_SUB_U64: _GLOBALOp_GLOBAL_ATOMIC_SUB_U64, + GLOBALOp.GLOBAL_ATOMIC_MIN_I64: _GLOBALOp_GLOBAL_ATOMIC_MIN_I64, + GLOBALOp.GLOBAL_ATOMIC_MIN_U64: _GLOBALOp_GLOBAL_ATOMIC_MIN_U64, + GLOBALOp.GLOBAL_ATOMIC_MAX_I64: _GLOBALOp_GLOBAL_ATOMIC_MAX_I64, + GLOBALOp.GLOBAL_ATOMIC_MAX_U64: _GLOBALOp_GLOBAL_ATOMIC_MAX_U64, + GLOBALOp.GLOBAL_ATOMIC_AND_B64: _GLOBALOp_GLOBAL_ATOMIC_AND_B64, + GLOBALOp.GLOBAL_ATOMIC_OR_B64: _GLOBALOp_GLOBAL_ATOMIC_OR_B64, + GLOBALOp.GLOBAL_ATOMIC_XOR_B64: _GLOBALOp_GLOBAL_ATOMIC_XOR_B64, + GLOBALOp.GLOBAL_ATOMIC_INC_U64: _GLOBALOp_GLOBAL_ATOMIC_INC_U64, + GLOBALOp.GLOBAL_ATOMIC_DEC_U64: _GLOBALOp_GLOBAL_ATOMIC_DEC_U64, + GLOBALOp.GLOBAL_ATOMIC_CMPSWAP_F32: _GLOBALOp_GLOBAL_ATOMIC_CMPSWAP_F32, + GLOBALOp.GLOBAL_ATOMIC_MIN_F32: _GLOBALOp_GLOBAL_ATOMIC_MIN_F32, + GLOBALOp.GLOBAL_ATOMIC_MAX_F32: _GLOBALOp_GLOBAL_ATOMIC_MAX_F32, + GLOBALOp.GLOBAL_ATOMIC_ADD_F32: _GLOBALOp_GLOBAL_ATOMIC_ADD_F32, +} + +def _SCRATCHOp_SCRATCH_LOAD_U8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA.u32 = (_pack(0, MEM[ADDR].u8)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_I8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA.i32 = (signext(MEM[ADDR].i8)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_U16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA.u32 = (_pack(0, MEM[ADDR].u16)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_I16(MEM, ADDR, VDATA, 
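GLOBALOp_FUNCTIONS, like the FLATOp/SCRATCHOp/DSOp tables elsewhere in this diff, maps each op enum to its compiled handler, and COMPILED_FUNCTIONS then keys those tables by enum class. The call site is not part of this diff, so the following is only a guess at the lookup shape (DS handlers take a different, longer argument list):

# Hedged sketch of how an emulator step might resolve a FLAT/GLOBAL/SCRATCH handler.
def run_mem_op(op, MEM, addr, vdata, vdst, return_data):
    tables = get_compiled_functions()        # {GLOBALOp: GLOBALOp_FUNCTIONS, SCRATCHOp: ..., ...}
    handler = tables[type(op)][op]           # e.g. GLOBALOp.GLOBAL_LOAD_B32 -> _GLOBALOp_GLOBAL_LOAD_B32
    return handler(MEM, addr, vdata, vdst, return_data)   # dict of updated outputs, e.g. {'VDATA': ...}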
VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA.i32 = (signext(MEM[ADDR].i16)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 0] = MEM[ADDR].b32 + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 0] = MEM[ADDR].b32 + VDATA[63 : 32] = MEM[ADDR + 4].b32 + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_B96(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 0] = MEM[ADDR].b32 + VDATA[63 : 32] = MEM[ADDR + 4].b32 + VDATA[95 : 64] = MEM[ADDR + 8].b32 + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_B128(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 0] = MEM[ADDR].b32 + VDATA[63 : 32] = MEM[ADDR + 4].b32 + VDATA[95 : 64] = MEM[ADDR + 8].b32 + VDATA[127 : 96] = MEM[ADDR + 12].b32 + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_STORE_B8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b8 = VDATA[7 : 0] + return {} + +def _SCRATCHOp_SCRATCH_STORE_B16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b16 = VDATA[15 : 0] + return {} + +def _SCRATCHOp_SCRATCH_STORE_B32(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b32 = VDATA[31 : 0] + return {} + +def _SCRATCHOp_SCRATCH_STORE_B64(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b32 = VDATA[31 : 0] + MEM[ADDR + 4].b32 = VDATA[63 : 32] + return {} + +def _SCRATCHOp_SCRATCH_STORE_B96(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b32 = VDATA[31 : 0] + MEM[ADDR + 4].b32 = VDATA[63 : 32] + MEM[ADDR + 8].b32 = VDATA[95 : 64] + return {} + +def _SCRATCHOp_SCRATCH_STORE_B128(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b32 = VDATA[31 : 0] + MEM[ADDR + 4].b32 = VDATA[63 : 32] + MEM[ADDR + 8].b32 = VDATA[95 : 64] + MEM[ADDR + 12].b32 = VDATA[127 : 96] + return {} + +def _SCRATCHOp_SCRATCH_LOAD_D16_U8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[15 : 0].u16 = (_pack(0, MEM[ADDR].u8)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_D16_I8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[15 : 0].i16 = (signext(MEM[ADDR].i8)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_D16_B16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[15 : 0].b16 = MEM[ADDR].b16 + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_D16_HI_U8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 16].u16 = (_pack(0, MEM[ADDR].u8)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_D16_HI_I8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 16].i16 = (signext(MEM[ADDR].i8)) + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_LOAD_D16_HI_B16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + VDATA[31 : 16].b16 = MEM[ADDR].b16 + return {'VDATA': VDATA} + +def _SCRATCHOp_SCRATCH_STORE_D16_HI_B8(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- 
compiled pseudocode --- + MEM[ADDR].b8 = VDATA[23 : 16] + return {} + +def _SCRATCHOp_SCRATCH_STORE_D16_HI_B16(MEM, ADDR, VDATA, VDST, RETURN_DATA): + DATA = VDATA + # --- compiled pseudocode --- + MEM[ADDR].b16 = VDATA[31 : 16] + return {} + +SCRATCHOp_FUNCTIONS = { + SCRATCHOp.SCRATCH_LOAD_U8: _SCRATCHOp_SCRATCH_LOAD_U8, + SCRATCHOp.SCRATCH_LOAD_I8: _SCRATCHOp_SCRATCH_LOAD_I8, + SCRATCHOp.SCRATCH_LOAD_U16: _SCRATCHOp_SCRATCH_LOAD_U16, + SCRATCHOp.SCRATCH_LOAD_I16: _SCRATCHOp_SCRATCH_LOAD_I16, + SCRATCHOp.SCRATCH_LOAD_B32: _SCRATCHOp_SCRATCH_LOAD_B32, + SCRATCHOp.SCRATCH_LOAD_B64: _SCRATCHOp_SCRATCH_LOAD_B64, + SCRATCHOp.SCRATCH_LOAD_B96: _SCRATCHOp_SCRATCH_LOAD_B96, + SCRATCHOp.SCRATCH_LOAD_B128: _SCRATCHOp_SCRATCH_LOAD_B128, + SCRATCHOp.SCRATCH_STORE_B8: _SCRATCHOp_SCRATCH_STORE_B8, + SCRATCHOp.SCRATCH_STORE_B16: _SCRATCHOp_SCRATCH_STORE_B16, + SCRATCHOp.SCRATCH_STORE_B32: _SCRATCHOp_SCRATCH_STORE_B32, + SCRATCHOp.SCRATCH_STORE_B64: _SCRATCHOp_SCRATCH_STORE_B64, + SCRATCHOp.SCRATCH_STORE_B96: _SCRATCHOp_SCRATCH_STORE_B96, + SCRATCHOp.SCRATCH_STORE_B128: _SCRATCHOp_SCRATCH_STORE_B128, + SCRATCHOp.SCRATCH_LOAD_D16_U8: _SCRATCHOp_SCRATCH_LOAD_D16_U8, + SCRATCHOp.SCRATCH_LOAD_D16_I8: _SCRATCHOp_SCRATCH_LOAD_D16_I8, + SCRATCHOp.SCRATCH_LOAD_D16_B16: _SCRATCHOp_SCRATCH_LOAD_D16_B16, + SCRATCHOp.SCRATCH_LOAD_D16_HI_U8: _SCRATCHOp_SCRATCH_LOAD_D16_HI_U8, + SCRATCHOp.SCRATCH_LOAD_D16_HI_I8: _SCRATCHOp_SCRATCH_LOAD_D16_HI_I8, + SCRATCHOp.SCRATCH_LOAD_D16_HI_B16: _SCRATCHOp_SCRATCH_LOAD_D16_HI_B16, + SCRATCHOp.SCRATCH_STORE_D16_HI_B8: _SCRATCHOp_SCRATCH_STORE_D16_HI_B8, + SCRATCHOp.SCRATCH_STORE_D16_HI_B16: _SCRATCHOp_SCRATCH_STORE_D16_HI_B16, +} + # V_WRITELANE_B32: Write scalar to specific lane's VGPR (not in PDF pseudocode) -def _VOP3Op_V_WRITELANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): +def _VOP3Op_V_WRITELANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): wr_lane = s1 & 0x1f return {'d0': d0, 'scc': scc, 'vgpr_write': (wr_lane, vdst_idx, s0 & 0xffffffff)} VOP3Op_FUNCTIONS[VOP3Op.V_WRITELANE_B32] = _VOP3Op_V_WRITELANE_B32 @@ -6273,6 +8468,10 @@ COMPILED_FUNCTIONS = { VOP3SDOp: VOP3SDOp_FUNCTIONS, VOP3POp: VOP3POp_FUNCTIONS, VOPCOp: VOPCOp_FUNCTIONS, + DSOp: DSOp_FUNCTIONS, + FLATOp: FLATOp_FUNCTIONS, + GLOBALOp: GLOBALOp_FUNCTIONS, + SCRATCHOp: SCRATCHOp_FUNCTIONS, } def get_compiled_functions(): return COMPILED_FUNCTIONS \ No newline at end of file diff --git a/extra/assembly/amd/autogen/rdna4/gen_pcode.py b/extra/assembly/amd/autogen/rdna4/gen_pcode.py index c7331ddd59..1e10b3451f 100644 --- a/extra/assembly/amd/autogen/rdna4/gen_pcode.py +++ b/extra/assembly/amd/autogen/rdna4/gen_pcode.py @@ -1,9 +1,9 @@ # autogenerated by pdf.py - do not edit # to regenerate: python -m extra.assembly.amd.pdf --arch rdna4 -# ruff: noqa: E501,F405,F403 +# ruff: noqa: E501 # mypy: ignore-errors -from extra.assembly.amd.autogen.rdna4.enum import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp -from extra.assembly.amd.pcode import * +from extra.assembly.amd.autogen.rdna4.enum import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp, DSOp +from extra.assembly.amd.pcode import ABSDIFF, BYTE_PERMUTE, DENORM, F, INF, MAX_FLOAT_F32, OVERFLOW_F32, OVERFLOW_F64, PI, ROUND_MODE, Reg, SAT8, SliceProxy, TWO_OVER_PI_1201, UNDERFLOW_F32, UNDERFLOW_F64, WAVE32, WAVE64, _pack, _pack32, bf16_to_f32, cos, cvtToQuietNAN, exponent, f16_to_f32, 
f16_to_i16, f16_to_snorm, f16_to_u16, f16_to_unorm, f32_to_f16, f32_to_f64, f32_to_i32, f32_to_snorm, f32_to_u32, f32_to_u8, f32_to_unorm, f64_to_f32, f64_to_i32, f64_to_u32, floor, fma, fract, i16_to_f16, i32_to_f32, i32_to_f64, i32_to_i16, isEven, isNAN, isQuietNAN, isSignalNAN, ldexp, log2, mantissa, pow, s_ff1_i32_b32, s_ff1_i32_b64, sign, signext, signext_from_bit, sin, sqrt, trunc, u16_to_f16, u32_to_f32, u32_to_f64, u32_to_u16, u4_to_u32, u8_to_u32, v_cvt_i16_f32, v_cvt_u16_f32, v_max3_i16, v_max3_i32, v_max3_u16, v_max3_u32, v_max_i16, v_max_i32, v_max_u16, v_max_u32, v_min_i16, v_min_i32, v_min_u16, v_min_u32, v_msad_u8, v_sad_u8 def _SOP1Op_S_MOV_B32(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): D0.b32 = S0.b32 @@ -6179,9 +6179,1217 @@ VOPCOp_FUNCTIONS = { VOPCOp.V_CMPX_CLASS_F64: _VOPCOp_V_CMPX_CLASS_F64, } +def _DSOp_DS_ADD_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 += DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 -= DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_RSUB_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 = DATA.u32 - MEM[addr].u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_INC_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_DEC_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_I32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].i32) + src = DATA.i32 + MEM[addr].i32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_I32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].i32) + src = DATA.i32 + MEM[addr].i32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode 
--- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_AND_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp & DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_OR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp | DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_XOR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp ^ DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MSKOR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = ((tmp & ~DATA.b32) | DATA2.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STORE_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + MEM[addr + OFFSET.u32].b32 = DATA[31 : 0] + return {} + +def _DSOp_DS_STORE_2ADDR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + MEM[addr + OFFSET0.u32 * 4].b32 = DATA[31 : 0] + addr = CalcDsAddr(vgpr_a.b32, 0x0) + MEM[addr + OFFSET1.u32 * 4].b32 = DATA2[31 : 0] + return {} + +def _DSOp_DS_STORE_2ADDR_STRIDE64_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + MEM[addr + OFFSET0.u32 * 256].b32 = DATA[31 : 0] + addr = CalcDsAddr(vgpr_a.b32, 0x0) + MEM[addr + OFFSET1.u32 * 256].b32 = DATA2[31 : 0] + return {} + +def _DSOp_DS_CMPSTORE_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b32) + src = DATA.b32 + cmp = DATA2.b32 + MEM[addr].b32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_ADD_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].f32) + MEM[addr].f32 += DATA.f32 + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STORE_B8(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + MEM[ADDR].b8 = DATA[7 : 0] + return {} + +def _DSOp_DS_STORE_B16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + MEM[ADDR].b16 = DATA[15 : 0] + return {} + +def _DSOp_DS_ADD_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = 
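The DS handlers above lean on CalcDsAddr(vgpr_a.b32, offset.b32), which is not shown in this diff. Presumably it adds the per-lane address VGPR to the instruction offset and wraps the result into LDS space (the LDSMem wrapper later in this diff masks addresses with 0xffff); a guess at its shape, an assumption rather than the real helper:

# Assumed behaviour only: per-lane LDS address = address VGPR + immediate offset, wrapped to LDS size.
def calc_ds_addr(vgpr_addr: int, offset: int, lds_size: int = 0x10000) -> int:
    return (vgpr_addr + offset) % lds_size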
CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 += DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 -= DATA.u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_RSUB_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + MEM[addr].u32 = DATA.u32 - MEM[addr].u32 + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_INC_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_DEC_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_I32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].i32) + src = DATA.i32 + MEM[addr].i32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_I32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].i32) + src = DATA.i32 + MEM[addr].i32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[addr].u32 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_AND_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp & DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_OR_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp | DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_XOR_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 
offset.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = (tmp ^ DATA.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MSKOR_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = ((tmp & ~DATA.b32) | DATA2.b32) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STOREXCHG_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b32) + MEM[addr].b32 = DATA.b32 + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STOREXCHG_2ADDR_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + ADDR_BASE = ADDR + # --- compiled pseudocode --- + addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 4 + addr2 = ADDR_BASE.u32 + OFFSET1.u32 * 4 + tmp1 = MEM[addr1].b32 + tmp2 = MEM[addr2].b32 + MEM[addr1].b32 = DATA.b32 + MEM[addr2].b32 = DATA2.b32 + RETURN_DATA[31 : 0] = tmp1 + RETURN_DATA[63 : 32] = tmp2 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + ADDR_BASE = ADDR + # --- compiled pseudocode --- + addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 256 + addr2 = ADDR_BASE.u32 + OFFSET1.u32 * 256 + tmp1 = MEM[addr1].b32 + tmp2 = MEM[addr2].b32 + MEM[addr1].b32 = DATA.b32 + MEM[addr2].b32 = DATA2.b32 + RETURN_DATA[31 : 0] = tmp1 + RETURN_DATA[63 : 32] = tmp2 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPSTORE_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b32) + src = DATA.b32 + cmp = DATA2.b32 + MEM[addr].b32 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.b32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET.u32].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_2ADDR_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET0.u32 * 4].b32 + addr = CalcDsAddr(vgpr_a.b32, 0x0) + RETURN_DATA[63 : 32] = MEM[addr + OFFSET1.u32 * 4].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_2ADDR_STRIDE64_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET0.u32 * 256].b32 + addr = CalcDsAddr(vgpr_a.b32, 0x0) + RETURN_DATA[63 : 32] = MEM[addr + OFFSET1.u32 * 256].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_I8(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA.i32 = (signext(MEM[ADDR].i8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_U8(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA.u32 = (_pack(0, MEM[ADDR].u8)) + 
return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_I16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA.i32 = (signext(MEM[ADDR].i16)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_U16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA.u32 = (_pack(0, MEM[ADDR].u16)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CONSUME(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + addr = offset + rtnval = LDS(addr) + GPR[VDST] = rtnval + return {} + +def _DSOp_DS_APPEND(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + addr = offset + rtnval = LDS(addr) + GPR[VDST] = rtnval + return {} + +def _DSOp_DS_ADD_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 += DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 -= DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_RSUB_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 = DATA.u64 - MEM[addr].u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_INC_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_DEC_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_I64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].i64) + src = DATA.i64 + MEM[addr].i64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_I64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].i64) + src = DATA.i64 + MEM[addr].i64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = 
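DS_INC_U32/U64 and DS_DEC_U32/U64 above are bounded ring counters rather than plain add/sub: INC wraps back to 0 once the stored value reaches the operand, and DEC reloads the operand when the value is 0 or already above it. The same rules on plain ints:

def ds_inc(cur: int, limit: int) -> int:
    return 0 if cur >= limit else cur + 1                   # wraps to 0 at the limit

def ds_dec(cur: int, limit: int) -> int:
    return limit if cur == 0 or cur > limit else cur - 1    # reloads the limit at 0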
((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_AND_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp & DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_OR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp | DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_XOR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp ^ DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MSKOR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = ((tmp & ~DATA.b64) | DATA2.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STORE_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + MEM[addr + OFFSET.u32].b32 = DATA[31 : 0] + MEM[addr + OFFSET.u32 + 4].b32 = DATA[63 : 32] + return {} + +def _DSOp_DS_STORE_2ADDR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + MEM[addr + OFFSET0.u32 * 8].b32 = DATA[31 : 0] + MEM[addr + OFFSET0.u32 * 8 + 4].b32 = DATA[63 : 32] + addr = CalcDsAddr(vgpr_a.b32, 0x0) + MEM[addr + OFFSET1.u32 * 8].b32 = DATA2[31 : 0] + MEM[addr + OFFSET1.u32 * 8 + 4].b32 = DATA2[63 : 32] + return {} + +def _DSOp_DS_STORE_2ADDR_STRIDE64_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + MEM[addr + OFFSET0.u32 * 512].b32 = DATA[31 : 0] + MEM[addr + OFFSET0.u32 * 512 + 4].b32 = DATA[63 : 32] + addr = CalcDsAddr(vgpr_a.b32, 0x0) + MEM[addr + OFFSET1.u32 * 512].b32 = DATA2[31 : 0] + MEM[addr + OFFSET1.u32 * 512 + 4].b32 = DATA2[63 : 32] + return {} + +def _DSOp_DS_CMPSTORE_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b64) + src = DATA.b64 + cmp = DATA2.b64 + MEM[addr].b64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_ADD_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 += DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 -= DATA.u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_RSUB_RTN_U64(MEM, ADDR, DATA0, 
DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + MEM[addr].u64 = DATA.u64 - MEM[addr].u64 + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_INC_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((0) if (tmp >= src) else (tmp + 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_DEC_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (((tmp == 0) or (tmp > src))) else (tmp - 1)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_I64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].i64) + src = DATA.i64 + MEM[addr].i64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_I64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].i64) + src = DATA.i64 + MEM[addr].i64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.i64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MIN_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (src < tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MAX_RTN_U64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u64) + src = DATA.u64 + MEM[addr].u64 = ((src) if (src >= tmp) else (tmp)) + RETURN_DATA.u64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_AND_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp & DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_OR_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp | DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_XOR_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = (tmp ^ DATA.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_MSKOR_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = ((tmp & ~DATA.b64) | DATA2.b64) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def 
_DSOp_DS_STOREXCHG_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b64) + MEM[addr].b64 = DATA.b64 + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STOREXCHG_2ADDR_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + ADDR_BASE = ADDR + # --- compiled pseudocode --- + addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 8 + addr2 = ADDR_BASE.u32 + OFFSET1.u32 * 8 + tmp1 = MEM[addr1].b64 + tmp2 = MEM[addr2].b64 + MEM[addr1].b64 = DATA.b64 + MEM[addr2].b64 = DATA2.b64 + RETURN_DATA[63 : 0] = tmp1 + RETURN_DATA[127 : 64] = tmp2 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + OFFSET = OFFSET0 + ADDR_BASE = ADDR + # --- compiled pseudocode --- + addr1 = ADDR_BASE.u32 + OFFSET0.u32 * 512 + addr2 = ADDR_BASE.u32 + OFFSET1.u32 * 512 + tmp1 = MEM[addr1].b64 + tmp2 = MEM[addr2].b64 + MEM[addr1].b64 = DATA.b64 + MEM[addr2].b64 = DATA2.b64 + RETURN_DATA[63 : 0] = tmp1 + RETURN_DATA[127 : 64] = tmp2 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CMPSTORE_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + DATA2 = DATA1 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].b64) + src = DATA.b64 + cmp = DATA2.b64 + MEM[addr].b64 = ((src) if (tmp == cmp) else (tmp)) + RETURN_DATA.b64 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET.u32].b32 + RETURN_DATA[63 : 32] = MEM[addr + OFFSET.u32 + 4].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_2ADDR_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET0.u32 * 8].b32 + RETURN_DATA[63 : 32] = MEM[addr + OFFSET0.u32 * 8 + 4].b32 + addr = CalcDsAddr(vgpr_a.b32, 0x0) + RETURN_DATA[95 : 64] = MEM[addr + OFFSET1.u32 * 8].b32 + RETURN_DATA[127 : 96] = MEM[addr + OFFSET1.u32 * 8 + 4].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_2ADDR_STRIDE64_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET0.u32 * 512].b32 + RETURN_DATA[63 : 32] = MEM[addr + OFFSET0.u32 * 512 + 4].b32 + addr = CalcDsAddr(vgpr_a.b32, 0x0) + RETURN_DATA[95 : 64] = MEM[addr + OFFSET1.u32 * 512].b32 + RETURN_DATA[127 : 96] = MEM[addr + OFFSET1.u32 * 512 + 4].b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_ADD_RTN_F32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].f32) + MEM[addr].f32 += DATA.f32 + RETURN_DATA.f32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_CONDXCHG32_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + ADDR = S0.u32 + DATA = S1.u64 + offset = _pack(OFFSET1, OFFSET0) + RETURN_DATA[0] = LDS[ADDR0].u32 + 
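The 2ADDR forms above perform two independent LDS accesses from one instruction: OFFSET0 and OFFSET1 each index an element of the access size (4 bytes for B32, 8 for B64), and the STRIDE64 variants scale those indices by a further 64 elements, matching the *256 and *512 byte factors in the pseudocode. A small address calculator for the pattern (names illustrative):

def ds_2addr_addresses(base: int, offset0: int, offset1: int, elem_size: int, stride64: bool = False):
    step = elem_size * (64 if stride64 else 1)
    return base + offset0 * step, base + offset1 * step

# e.g. DS_LOAD_2ADDR_STRIDE64_B64: elem_size=8, stride64=True -> offsets scaled by 512 bytes
lo_addr, hi_addr = ds_2addr_addresses(0x0, 1, 2, 8, stride64=True)   # (512, 1024)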
if DATA[31]: + LDS[ADDR0] = _pack(0, DATA[30 : 0]) + RETURN_DATA[1] = LDS[ADDR1].u32 + if DATA[63]: + LDS[ADDR1] = _pack(0, DATA[62 : 32]) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_COND_SUB_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[ADDR].u32 = ((tmp - src) if (tmp >= src) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_CLAMP_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + old_value = MEM[ADDR].u32 + if old_value < DATA.u32: + new_value = 0 + else: + new_value = old_value - DATA.u32 + MEM[ADDR].u32 = new_value + RETURN_DATA.u32 = old_value + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_PK_ADD_F16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + src = DATA.b32 + dst[15 : 0].f16 = src[15 : 0].f16 + tmp[15 : 0].f16 + dst[31 : 16].f16 = src[31 : 16].f16 + tmp[31 : 16].f16 + MEM[ADDR].b32 = dst.b32 + RETURN_DATA.b32 = tmp.b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_PK_ADD_BF16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + src = DATA.b32 + dst[15 : 0].bf16 = src[15 : 0].bf16 + tmp[15 : 0].bf16 + dst[31 : 16].bf16 = src[31 : 16].bf16 + tmp[31 : 16].bf16 + MEM[ADDR].b32 = dst.b32 + RETURN_DATA.b32 = tmp.b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_STORE_B8_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + MEM[ADDR].b8 = DATA[23 : 16] + return {} + +def _DSOp_DS_STORE_B16_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + MEM[ADDR].b16 = DATA[31 : 16] + return {} + +def _DSOp_DS_LOAD_U8_D16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[15 : 0].u16 = (_pack(0, MEM[ADDR].u8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_U8_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 16].u16 = (_pack(0, MEM[ADDR].u8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_I8_D16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[15 : 0].i16 = (signext(MEM[ADDR].i8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_I8_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 16].i16 = (signext(MEM[ADDR].i8)) + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_U16_D16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_U16_D16_HI(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + RETURN_DATA[31 : 16].u16 = MEM[ADDR].u16 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_COND_SUB_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, offset.b32) + tmp = Reg(MEM[addr].u32) + src = DATA.u32 + MEM[ADDR].u32 
= ((tmp - src) if (tmp >= src) else (tmp)) + RETURN_DATA.u32 = tmp + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_SUB_CLAMP_RTN_U32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + old_value = MEM[ADDR].u32 + if old_value < DATA.u32: + new_value = 0 + else: + new_value = old_value - DATA.u32 + MEM[ADDR].u32 = new_value + RETURN_DATA.u32 = old_value + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_PK_ADD_RTN_F16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + src = DATA.b32 + dst[15 : 0].f16 = src[15 : 0].f16 + tmp[15 : 0].f16 + dst[31 : 16].f16 = src[31 : 16].f16 + tmp[31 : 16].f16 + MEM[ADDR].b32 = dst.b32 + RETURN_DATA.b32 = tmp.b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_PK_ADD_RTN_BF16(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + # --- compiled pseudocode --- + tmp = Reg(MEM[ADDR].b32) + src = DATA.b32 + dst[15 : 0].bf16 = src[15 : 0].bf16 + tmp[15 : 0].bf16 + dst[31 : 16].bf16 = src[31 : 16].bf16 + tmp[31 : 16].bf16 + MEM[ADDR].b32 = dst.b32 + RETURN_DATA.b32 = tmp.b32 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_PERMUTE_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + num_lanes = ((64) if (WAVE64) else (32)) + for i in range(0, int(num_lanes - 1)+1): + tmp[i] = 0x0 + for i in range(0, int(num_lanes - 1)+1): + if EXEC[i].u1: + dst_lane = (VGPR[i][ADDR] + OFFSET.b32) / 4 % num_lanes + tmp[dst_lane] = VGPR[i][DATA0] + for i in range(0, int(num_lanes - 1)+1): + if EXEC[i].u1: + VGPR[i][VDST] = tmp[i] + return {} + +def _DSOp_DS_BPERMUTE_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + num_lanes = ((64) if (WAVE64) else (32)) + for i in range(0, int(num_lanes - 1)+1): + tmp[i] = 0x0 + for i in range(0, int(num_lanes - 1)+1): + src_lane = (VGPR[i][ADDR] + OFFSET.b32) / 4 % num_lanes + if EXEC[src_lane].u1: + tmp[i] = VGPR[src_lane][DATA0] + for i in range(0, int(num_lanes - 1)+1): + if EXEC[i].u1: + VGPR[i][VDST] = tmp[i] + return {} + +def _DSOp_DS_STORE_B96(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + MEM[addr + OFFSET.u32].b32 = DATA[31 : 0] + MEM[addr + OFFSET.u32 + 4].b32 = DATA[63 : 32] + MEM[addr + OFFSET.u32 + 8].b32 = DATA[95 : 64] + return {} + +def _DSOp_DS_STORE_B128(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + MEM[addr + OFFSET.u32].b32 = DATA[31 : 0] + MEM[addr + OFFSET.u32 + 4].b32 = DATA[63 : 32] + MEM[addr + OFFSET.u32 + 8].b32 = DATA[95 : 64] + MEM[addr + OFFSET.u32 + 12].b32 = DATA[127 : 96] + return {} + +def _DSOp_DS_BVH_STACK_PUSH4_POP1_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + _pack(stack_base, stack_index) = (DECODE_ADDR(ADDR, OFFSET0)) + last_node_ptr = DATA0.b32 + for i in range(0, int(2)+1): + if DATA_VALID(DATA1[i * 32 + 31 : i * 32]): + MEM[stack_base.u32 + stack_index] = DATA1[i * 32 + 31 : i * 32] + stack_index += 1 + elif DATA1[i].b32 == last_node_ptr: + pass + if DATA_VALID(DATA1[127 : 96]): + RETURN_DATA[31 : 0] = DATA1[127 : 96] + else: + RETURN_DATA[31 : 0] = 
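DS_PERMUTE_B32 and DS_BPERMUTE_B32 above never touch LDS; they shuffle VGPR values across lanes. PERMUTE is a forward push (each lane computes the destination lane from its byte address) and BPERMUTE is a backward pull (each lane computes which lane to read from). A wave-as-list sketch that ignores the EXEC mask for brevity:

def ds_permute(addr: list[int], data: list[int], offset: int = 0) -> list[int]:
    n, out = len(data), [0] * len(data)
    for i in range(n):
        out[(addr[i] + offset) // 4 % n] = data[i]           # push data[i] to the computed lane
    return out

def ds_bpermute(addr: list[int], data: list[int], offset: int = 0) -> list[int]:
    n = len(data)
    return [data[(addr[i] + offset) // 4 % n] for i in range(n)]   # pull from the computed lane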
MEM[stack_base.u32 + stack_index] + MEM[stack_base.u32 + stack_index] = INVALID_NODE + stack_index -= 1 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_BVH_STACK_PUSH8_POP1_RTN_B32(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + _pack(stack_base, stack_index) = (DECODE_ADDR(ADDR, OFFSET0)) + last_node_ptr = DATA0.b32 + for i in range(0, int(6)+1): + if DATA_VALID(DATA1[i * 32 + 31 : i * 32]): + MEM[stack_base.u32 + stack_index] = DATA1[i * 32 + 31 : i * 32] + stack_index += 1 + elif DATA1[i].b32 == last_node_ptr: + pass + if DATA_VALID(DATA1[255 : 224]): + RETURN_DATA[31 : 0] = DATA1[255 : 224] + else: + RETURN_DATA[31 : 0] = MEM[stack_base.u32 + stack_index] + MEM[stack_base.u32 + stack_index] = INVALID_NODE + stack_index -= 1 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_BVH_STACK_PUSH8_POP2_RTN_B64(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + _pack(stack_base, stack_index) = (DECODE_ADDR(ADDR, OFFSET0)) + last_node_ptr = DATA0.b32 + for i in range(0, int(6)+1): + if DATA_VALID(DATA1[i * 32 + 31 : i * 32]): + MEM[stack_base.u32 + stack_index] = DATA1[i * 32 + 31 : i * 32] + stack_index += 1 + elif DATA1[i].b32 == last_node_ptr: + pass + if DATA_VALID(DATA1[255 : 224]): + RETURN_DATA[31 : 0] = DATA1[255 : 224] + else: + RETURN_DATA[31 : 0] = MEM[stack_base.u32 + stack_index] + MEM[stack_base.u32 + stack_index] = INVALID_NODE + stack_index -= 1 + if DATA_VALID(MEM[stack_base.u32 + stack_index]): + RETURN_DATA[63 : 32] = MEM[stack_base.u32 + stack_index] + MEM[stack_base.u32 + stack_index] = INVALID_NODE + stack_index -= 1 + return {'RETURN_DATA': RETURN_DATA} + +def _DSOp_DS_LOAD_B96(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA): + DATA = DATA0 + OFFSET = OFFSET0 + # --- compiled pseudocode --- + addr = CalcDsAddr(vgpr_a.b32, 0x0) + RETURN_DATA[31 : 0] = MEM[addr + OFFSET.u32].b32 + RETURN_DATA[63 : 32] = MEM[addr + OFFSET.u32 + 4].b32 + RETURN_DATA[95 : 64] = MEM[addr + OFFSET.u32 + 8].b32 + return {'RETURN_DATA': RETURN_DATA} + +DSOp_FUNCTIONS = { + DSOp.DS_ADD_U32: _DSOp_DS_ADD_U32, + DSOp.DS_SUB_U32: _DSOp_DS_SUB_U32, + DSOp.DS_RSUB_U32: _DSOp_DS_RSUB_U32, + DSOp.DS_INC_U32: _DSOp_DS_INC_U32, + DSOp.DS_DEC_U32: _DSOp_DS_DEC_U32, + DSOp.DS_MIN_I32: _DSOp_DS_MIN_I32, + DSOp.DS_MAX_I32: _DSOp_DS_MAX_I32, + DSOp.DS_MIN_U32: _DSOp_DS_MIN_U32, + DSOp.DS_MAX_U32: _DSOp_DS_MAX_U32, + DSOp.DS_AND_B32: _DSOp_DS_AND_B32, + DSOp.DS_OR_B32: _DSOp_DS_OR_B32, + DSOp.DS_XOR_B32: _DSOp_DS_XOR_B32, + DSOp.DS_MSKOR_B32: _DSOp_DS_MSKOR_B32, + DSOp.DS_STORE_B32: _DSOp_DS_STORE_B32, + DSOp.DS_STORE_2ADDR_B32: _DSOp_DS_STORE_2ADDR_B32, + DSOp.DS_STORE_2ADDR_STRIDE64_B32: _DSOp_DS_STORE_2ADDR_STRIDE64_B32, + DSOp.DS_CMPSTORE_B32: _DSOp_DS_CMPSTORE_B32, + DSOp.DS_ADD_F32: _DSOp_DS_ADD_F32, + DSOp.DS_STORE_B8: _DSOp_DS_STORE_B8, + DSOp.DS_STORE_B16: _DSOp_DS_STORE_B16, + DSOp.DS_ADD_RTN_U32: _DSOp_DS_ADD_RTN_U32, + DSOp.DS_SUB_RTN_U32: _DSOp_DS_SUB_RTN_U32, + DSOp.DS_RSUB_RTN_U32: _DSOp_DS_RSUB_RTN_U32, + DSOp.DS_INC_RTN_U32: _DSOp_DS_INC_RTN_U32, + DSOp.DS_DEC_RTN_U32: _DSOp_DS_DEC_RTN_U32, + DSOp.DS_MIN_RTN_I32: _DSOp_DS_MIN_RTN_I32, + DSOp.DS_MAX_RTN_I32: _DSOp_DS_MAX_RTN_I32, + DSOp.DS_MIN_RTN_U32: _DSOp_DS_MIN_RTN_U32, + DSOp.DS_MAX_RTN_U32: _DSOp_DS_MAX_RTN_U32, + DSOp.DS_AND_RTN_B32: _DSOp_DS_AND_RTN_B32, + DSOp.DS_OR_RTN_B32: _DSOp_DS_OR_RTN_B32, + DSOp.DS_XOR_RTN_B32: _DSOp_DS_XOR_RTN_B32, + DSOp.DS_MSKOR_RTN_B32: 
_DSOp_DS_MSKOR_RTN_B32, + DSOp.DS_STOREXCHG_RTN_B32: _DSOp_DS_STOREXCHG_RTN_B32, + DSOp.DS_STOREXCHG_2ADDR_RTN_B32: _DSOp_DS_STOREXCHG_2ADDR_RTN_B32, + DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32: _DSOp_DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32, + DSOp.DS_CMPSTORE_RTN_B32: _DSOp_DS_CMPSTORE_RTN_B32, + DSOp.DS_LOAD_B32: _DSOp_DS_LOAD_B32, + DSOp.DS_LOAD_2ADDR_B32: _DSOp_DS_LOAD_2ADDR_B32, + DSOp.DS_LOAD_2ADDR_STRIDE64_B32: _DSOp_DS_LOAD_2ADDR_STRIDE64_B32, + DSOp.DS_LOAD_I8: _DSOp_DS_LOAD_I8, + DSOp.DS_LOAD_U8: _DSOp_DS_LOAD_U8, + DSOp.DS_LOAD_I16: _DSOp_DS_LOAD_I16, + DSOp.DS_LOAD_U16: _DSOp_DS_LOAD_U16, + DSOp.DS_CONSUME: _DSOp_DS_CONSUME, + DSOp.DS_APPEND: _DSOp_DS_APPEND, + DSOp.DS_ADD_U64: _DSOp_DS_ADD_U64, + DSOp.DS_SUB_U64: _DSOp_DS_SUB_U64, + DSOp.DS_RSUB_U64: _DSOp_DS_RSUB_U64, + DSOp.DS_INC_U64: _DSOp_DS_INC_U64, + DSOp.DS_DEC_U64: _DSOp_DS_DEC_U64, + DSOp.DS_MIN_I64: _DSOp_DS_MIN_I64, + DSOp.DS_MAX_I64: _DSOp_DS_MAX_I64, + DSOp.DS_MIN_U64: _DSOp_DS_MIN_U64, + DSOp.DS_MAX_U64: _DSOp_DS_MAX_U64, + DSOp.DS_AND_B64: _DSOp_DS_AND_B64, + DSOp.DS_OR_B64: _DSOp_DS_OR_B64, + DSOp.DS_XOR_B64: _DSOp_DS_XOR_B64, + DSOp.DS_MSKOR_B64: _DSOp_DS_MSKOR_B64, + DSOp.DS_STORE_B64: _DSOp_DS_STORE_B64, + DSOp.DS_STORE_2ADDR_B64: _DSOp_DS_STORE_2ADDR_B64, + DSOp.DS_STORE_2ADDR_STRIDE64_B64: _DSOp_DS_STORE_2ADDR_STRIDE64_B64, + DSOp.DS_CMPSTORE_B64: _DSOp_DS_CMPSTORE_B64, + DSOp.DS_ADD_RTN_U64: _DSOp_DS_ADD_RTN_U64, + DSOp.DS_SUB_RTN_U64: _DSOp_DS_SUB_RTN_U64, + DSOp.DS_RSUB_RTN_U64: _DSOp_DS_RSUB_RTN_U64, + DSOp.DS_INC_RTN_U64: _DSOp_DS_INC_RTN_U64, + DSOp.DS_DEC_RTN_U64: _DSOp_DS_DEC_RTN_U64, + DSOp.DS_MIN_RTN_I64: _DSOp_DS_MIN_RTN_I64, + DSOp.DS_MAX_RTN_I64: _DSOp_DS_MAX_RTN_I64, + DSOp.DS_MIN_RTN_U64: _DSOp_DS_MIN_RTN_U64, + DSOp.DS_MAX_RTN_U64: _DSOp_DS_MAX_RTN_U64, + DSOp.DS_AND_RTN_B64: _DSOp_DS_AND_RTN_B64, + DSOp.DS_OR_RTN_B64: _DSOp_DS_OR_RTN_B64, + DSOp.DS_XOR_RTN_B64: _DSOp_DS_XOR_RTN_B64, + DSOp.DS_MSKOR_RTN_B64: _DSOp_DS_MSKOR_RTN_B64, + DSOp.DS_STOREXCHG_RTN_B64: _DSOp_DS_STOREXCHG_RTN_B64, + DSOp.DS_STOREXCHG_2ADDR_RTN_B64: _DSOp_DS_STOREXCHG_2ADDR_RTN_B64, + DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64: _DSOp_DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64, + DSOp.DS_CMPSTORE_RTN_B64: _DSOp_DS_CMPSTORE_RTN_B64, + DSOp.DS_LOAD_B64: _DSOp_DS_LOAD_B64, + DSOp.DS_LOAD_2ADDR_B64: _DSOp_DS_LOAD_2ADDR_B64, + DSOp.DS_LOAD_2ADDR_STRIDE64_B64: _DSOp_DS_LOAD_2ADDR_STRIDE64_B64, + DSOp.DS_ADD_RTN_F32: _DSOp_DS_ADD_RTN_F32, + DSOp.DS_CONDXCHG32_RTN_B64: _DSOp_DS_CONDXCHG32_RTN_B64, + DSOp.DS_COND_SUB_U32: _DSOp_DS_COND_SUB_U32, + DSOp.DS_SUB_CLAMP_U32: _DSOp_DS_SUB_CLAMP_U32, + DSOp.DS_PK_ADD_F16: _DSOp_DS_PK_ADD_F16, + DSOp.DS_PK_ADD_BF16: _DSOp_DS_PK_ADD_BF16, + DSOp.DS_STORE_B8_D16_HI: _DSOp_DS_STORE_B8_D16_HI, + DSOp.DS_STORE_B16_D16_HI: _DSOp_DS_STORE_B16_D16_HI, + DSOp.DS_LOAD_U8_D16: _DSOp_DS_LOAD_U8_D16, + DSOp.DS_LOAD_U8_D16_HI: _DSOp_DS_LOAD_U8_D16_HI, + DSOp.DS_LOAD_I8_D16: _DSOp_DS_LOAD_I8_D16, + DSOp.DS_LOAD_I8_D16_HI: _DSOp_DS_LOAD_I8_D16_HI, + DSOp.DS_LOAD_U16_D16: _DSOp_DS_LOAD_U16_D16, + DSOp.DS_LOAD_U16_D16_HI: _DSOp_DS_LOAD_U16_D16_HI, + DSOp.DS_COND_SUB_RTN_U32: _DSOp_DS_COND_SUB_RTN_U32, + DSOp.DS_SUB_CLAMP_RTN_U32: _DSOp_DS_SUB_CLAMP_RTN_U32, + DSOp.DS_PK_ADD_RTN_F16: _DSOp_DS_PK_ADD_RTN_F16, + DSOp.DS_PK_ADD_RTN_BF16: _DSOp_DS_PK_ADD_RTN_BF16, + DSOp.DS_PERMUTE_B32: _DSOp_DS_PERMUTE_B32, + DSOp.DS_BPERMUTE_B32: _DSOp_DS_BPERMUTE_B32, + DSOp.DS_STORE_B96: _DSOp_DS_STORE_B96, + DSOp.DS_STORE_B128: _DSOp_DS_STORE_B128, + DSOp.DS_BVH_STACK_PUSH4_POP1_RTN_B32: _DSOp_DS_BVH_STACK_PUSH4_POP1_RTN_B32, + 
DSOp.DS_BVH_STACK_PUSH8_POP1_RTN_B32: _DSOp_DS_BVH_STACK_PUSH8_POP1_RTN_B32, + DSOp.DS_BVH_STACK_PUSH8_POP2_RTN_B64: _DSOp_DS_BVH_STACK_PUSH8_POP2_RTN_B64, + DSOp.DS_LOAD_B96: _DSOp_DS_LOAD_B96, +} + # V_WRITELANE_B32: Write scalar to specific lane's VGPR (not in PDF pseudocode) -def _VOP3Op_V_WRITELANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): +def _VOP3Op_V_WRITELANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): wr_lane = s1 & 0x1f return {'d0': d0, 'scc': scc, 'vgpr_write': (wr_lane, vdst_idx, s0 & 0xffffffff)} VOP3Op_FUNCTIONS[VOP3Op.V_WRITELANE_B32] = _VOP3Op_V_WRITELANE_B32 @@ -6198,6 +7406,7 @@ COMPILED_FUNCTIONS = { VOP3SDOp: VOP3SDOp_FUNCTIONS, VOP3POp: VOP3POp_FUNCTIONS, VOPCOp: VOPCOp_FUNCTIONS, + DSOp: DSOp_FUNCTIONS, } def get_compiled_functions(): return COMPILED_FUNCTIONS \ No newline at end of file diff --git a/extra/assembly/amd/dsl.py b/extra/assembly/amd/dsl.py index cde5ef6984..eb1aba9347 100644 --- a/extra/assembly/amd/dsl.py +++ b/extra/assembly/amd/dsl.py @@ -9,7 +9,7 @@ from extra.assembly.amd.autogen.rdna3.enum import (VOP1Op, VOP2Op, VOP3Op, VOP3S SOPCOp, SOPKOp, SOPPOp, SMEMOp, DSOp, FLATOp, MUBUFOp, MTBUFOp, MIMGOp, VINTERPOp) # Common masks and bit conversion functions -MASK32, MASK64 = 0xffffffff, 0xffffffffffffffff +MASK32, MASK64, MASK128 = 0xffffffff, 0xffffffffffffffff, (1 << 128) - 1 _struct_f, _struct_I = struct.Struct(" int: return (cur & 0x0000ffff) | def _vgpr_hi(src: int) -> bool: return src >= 256 and ((src - 256) & 0x80) != 0 def _vgpr_masked(src: int) -> int: return ((src - 256) & 0x7f) + 256 if src >= 256 else src +# Helper: get number of dwords from memory op name +def _op_ndwords(name: str) -> int: + if '_B128' in name: return 4 + if '_B96' in name: return 3 + if any(s in name for s in ('_B64', '_U64', '_I64', '_F64')): return 2 + return 1 + +# Helper: build multi-dword Reg from consecutive VGPRs +def _vgpr_read(V: list, base: int, ndwords: int) -> Reg: return Reg(sum(V[base + i] << (32 * i) for i in range(ndwords))) + +# Helper: write multi-dword value to consecutive VGPRs +def _vgpr_write(V: list, base: int, val: int, ndwords: int): + for i in range(ndwords): V[base + i] = (val >> (32 * i)) & MASK32 + # Memory access _valid_mem_ranges: list[tuple[int, int]] = [] def set_valid_mem_ranges(ranges: set[tuple[int, int]]) -> None: _valid_mem_ranges.clear(); _valid_mem_ranges.extend(ranges) def _mem_valid(addr: int, size: int) -> bool: return not _valid_mem_ranges or any(s <= addr and addr + size <= s + z for s, z in _valid_mem_ranges) -def _ctypes_at(addr: int, size: int): return (ctypes.c_uint8 if size == 1 else ctypes.c_uint16 if size == 2 else ctypes.c_uint32).from_address(addr) +def _ctypes_at(addr: int, size: int): return (ctypes.c_uint8 if size == 1 else ctypes.c_uint16 if size == 2 else ctypes.c_uint64 if size == 8 else ctypes.c_uint32).from_address(addr) def mem_read(addr: int, size: int) -> int: return _ctypes_at(addr, size).value if _mem_valid(addr, size) else 0 def mem_write(addr: int, size: int, val: int) -> None: if _mem_valid(addr, size): _ctypes_at(addr, size).value = val -# Memory op tables (not pseudocode - these are format descriptions) -def _mem_ops(ops, suffix_map): - return {getattr(e, f"{p}_{s}"): v for e in ops for s, v in suffix_map.items() for p in [e.__name__.replace("Op", "")]} -_LOAD_MAP = {'LOAD_B32': (1,4,0), 'LOAD_B64': (2,4,0), 'LOAD_B96': (3,4,0), 'LOAD_B128': (4,4,0), 'LOAD_U8': (1,1,0), 'LOAD_I8': (1,1,1), 
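The width helpers above decide how many consecutive VGPRs a memory op touches and how a multi-dword value is split across them; a minimal standalone sketch of the same handling, using plain ints in place of Reg (op_ndwords, vgpr_read and vgpr_write are illustrative names):

# Minimal sketch: op name picks the dword count, values are packed into / unpacked from
# consecutive 32-bit VGPRs, and untouched registers stay untouched.
MASK32 = 0xffffffff

def op_ndwords(name: str) -> int:
    if '_B128' in name: return 4
    if '_B96' in name: return 3
    if any(s in name for s in ('_B64', '_U64', '_I64', '_F64')): return 2
    return 1

def vgpr_read(V: list, base: int, ndwords: int) -> int:
    return sum(V[base + i] << (32 * i) for i in range(ndwords))

def vgpr_write(V: list, base: int, val: int, ndwords: int) -> None:
    for i in range(ndwords): V[base + i] = (val >> (32 * i)) & MASK32

V = [0] * 8
vgpr_write(V, 2, 0x9ABCDEF0_12345678, op_ndwords('DS_LOAD_B64'))   # touches v2 and v3 only
assert (V[2], V[3], V[4]) == (0x12345678, 0x9ABCDEF0, 0)
assert vgpr_read(V, 2, 2) == 0x9ABCDEF0_12345678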
'LOAD_U16': (1,2,0), 'LOAD_I16': (1,2,1)} -_STORE_MAP = {'STORE_B32': (1,4), 'STORE_B64': (2,4), 'STORE_B96': (3,4), 'STORE_B128': (4,4), 'STORE_B8': (1,1), 'STORE_B16': (1,2)} -FLAT_LOAD, FLAT_STORE = _mem_ops([GLOBALOp, FLATOp], _LOAD_MAP), _mem_ops([GLOBALOp, FLATOp], _STORE_MAP) -# D16 ops: load/store 16-bit to lower or upper half of VGPR. Format: (size, sign, hi) where hi=1 means upper 16 bits -_D16_LOAD_MAP = {'LOAD_D16_U8': (1,0,0), 'LOAD_D16_I8': (1,1,0), 'LOAD_D16_B16': (2,0,0), - 'LOAD_D16_HI_U8': (1,0,1), 'LOAD_D16_HI_I8': (1,1,1), 'LOAD_D16_HI_B16': (2,0,1)} -_D16_STORE_MAP = {'STORE_D16_HI_B8': (1,1), 'STORE_D16_HI_B16': (2,1)} # (size, hi) -FLAT_D16_LOAD = _mem_ops([GLOBALOp, FLATOp], _D16_LOAD_MAP) -FLAT_D16_STORE = _mem_ops([GLOBALOp, FLATOp], _D16_STORE_MAP) -DS_LOAD = {DSOp.DS_LOAD_B32: (1,4,0), DSOp.DS_LOAD_B64: (2,4,0), DSOp.DS_LOAD_B128: (4,4,0), DSOp.DS_LOAD_U8: (1,1,0), DSOp.DS_LOAD_I8: (1,1,1), DSOp.DS_LOAD_U16: (1,2,0), DSOp.DS_LOAD_I16: (1,2,1)} -DS_STORE = {DSOp.DS_STORE_B32: (1,4), DSOp.DS_STORE_B64: (2,4), DSOp.DS_STORE_B128: (4,4), DSOp.DS_STORE_B8: (1,1), DSOp.DS_STORE_B16: (1,2)} -# 2ADDR ops: load/store two values using offset0 and offset1 -DS_LOAD_2ADDR = {DSOp.DS_LOAD_2ADDR_B32: 4, DSOp.DS_LOAD_2ADDR_B64: 8} -DS_STORE_2ADDR = {DSOp.DS_STORE_2ADDR_B32: 4, DSOp.DS_STORE_2ADDR_B64: 8} +def _make_mem_accessor(read_fn, write_fn): + """Create a memory accessor class with the given read/write functions.""" + class _MemAccessor: + __slots__ = ('_addr',) + def __init__(self, addr: int): self._addr = int(addr) + u8 = property(lambda s: read_fn(s._addr, 1), lambda s, v: write_fn(s._addr, 1, int(v))) + u16 = property(lambda s: read_fn(s._addr, 2), lambda s, v: write_fn(s._addr, 2, int(v))) + u32 = property(lambda s: read_fn(s._addr, 4), lambda s, v: write_fn(s._addr, 4, int(v))) + u64 = property(lambda s: read_fn(s._addr, 8), lambda s, v: write_fn(s._addr, 8, int(v))) + i8 = property(lambda s: _sext(read_fn(s._addr, 1), 8), lambda s, v: write_fn(s._addr, 1, int(v))) + i16 = property(lambda s: _sext(read_fn(s._addr, 2), 16), lambda s, v: write_fn(s._addr, 2, int(v))) + i32 = property(lambda s: _sext(read_fn(s._addr, 4), 32), lambda s, v: write_fn(s._addr, 4, int(v))) + i64 = property(lambda s: _sext(read_fn(s._addr, 8), 64), lambda s, v: write_fn(s._addr, 8, int(v))) + b8, b16, b32, b64 = u8, u16, u32, u64 + return _MemAccessor + +_GlobalMemAccessor = _make_mem_accessor(mem_read, mem_write) + +class _GlobalMem: + """Global memory wrapper that supports MEM[addr].u32 style access.""" + def __getitem__(self, addr) -> _GlobalMemAccessor: return _GlobalMemAccessor(addr) +GlobalMem = _GlobalMem() + +class LDSMem: + """LDS memory wrapper that supports MEM[addr].u32 style access.""" + __slots__ = ('_lds',) + def __init__(self, lds: bytearray): self._lds = lds + def _read(self, addr: int, size: int) -> int: + addr = addr & 0xffff + return int.from_bytes(self._lds[addr:addr+size], 'little') if addr + size <= len(self._lds) else 0 + def _write(self, addr: int, size: int, val: int): + addr = addr & 0xffff + if addr + size <= len(self._lds): self._lds[addr:addr+size] = (int(val) & ((1 << (size*8)) - 1)).to_bytes(size, 'little') + def __getitem__(self, addr): return _make_mem_accessor(self._read, self._write)(addr) + SMEM_LOAD = {SMEMOp.S_LOAD_B32: 1, SMEMOp.S_LOAD_B64: 2, SMEMOp.S_LOAD_B128: 4, SMEMOp.S_LOAD_B256: 8, SMEMOp.S_LOAD_B512: 16} # VOPD op -> VOP3 op mapping (VOPD is dual-issue of VOP1/VOP2 ops, use VOP3 enums for pseudocode lookup) @@ -197,60 +229,28 @@ def 
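The accessor classes above are what give the generated pcode its MEM[addr].u32 / .i16 style; a rough standalone sketch of the same pattern over a plain 64KB bytearray (ToyLDS and make_accessor are illustrative names, _sext is re-derived locally):

# Rough sketch of the MEM[addr].<type> accessor DSL over a plain bytearray.
def _sext(val: int, bits: int) -> int:
    return val - (1 << bits) if val & (1 << (bits - 1)) else val

def make_accessor(read_fn, write_fn):
    class Accessor:
        __slots__ = ('_addr',)
        def __init__(self, addr: int): self._addr = int(addr)
        u32 = property(lambda s: read_fn(s._addr, 4), lambda s, v: write_fn(s._addr, 4, int(v)))
        u16 = property(lambda s: read_fn(s._addr, 2), lambda s, v: write_fn(s._addr, 2, int(v)))
        i16 = property(lambda s: _sext(read_fn(s._addr, 2), 16), lambda s, v: write_fn(s._addr, 2, int(v)))
    return Accessor

class ToyLDS:
    """64KB local memory; addresses wrap at 16 bits like the LDSMem wrapper."""
    def __init__(self): self._mem = bytearray(65536)
    def _read(self, addr, size): addr &= 0xffff; return int.from_bytes(self._mem[addr:addr+size], 'little')
    def _write(self, addr, size, val): addr &= 0xffff; self._mem[addr:addr+size] = (int(val) & ((1 << (size*8)) - 1)).to_bytes(size, 'little')
    def __getitem__(self, addr): return make_accessor(self._read, self._write)(addr)

MEM = ToyLDS()
MEM[0x10].u32 = 0xFFFF8000                  # dword store
assert MEM[0x10].u32 == 0xFFFF8000          # unsigned read back
assert MEM[0x10].i16 == -0x8000             # signed 16-bit view of the low half
assert MEM[0x10 + 0x10000].u16 == 0x8000    # addresses wrap into the 64KB window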
exec_scalar(st: WaveState, inst: Inst) -> int: return new_pc_words - st.pc - 1 # -1 because emulator adds inst_words (1 for scalar) return 0 -def exec_vector(st: WaveState, inst: Inst, lane: int, lds: bytearray | None = None) -> None: +def exec_vector(st: WaveState, inst: Inst, lane: int, lds: LDSMem | None = None) -> None: """Execute vector instruction for one lane.""" compiled = _get_compiled() V = st.vgpr[lane] - # Memory ops (not ALU pseudocode) - if isinstance(inst, FLAT): - op, addr_reg, data_reg, vdst, offset, saddr = inst.op, inst.addr, inst.data, inst.vdst, _sext(inst.offset, 13), inst.saddr - addr = V[addr_reg] | (V[addr_reg+1] << 32) - addr = (st.rsgpr64(saddr) + V[addr_reg] + offset) & MASK64 if saddr not in (NULL, 0x7f) else (addr + offset) & MASK64 - if op in FLAT_LOAD: - cnt, sz, sign = FLAT_LOAD[op] - for i in range(cnt): val = mem_read(addr + i * sz, sz); V[vdst + i] = _sext(val, sz * 8) & MASK32 if sign else val - elif op in FLAT_STORE: - cnt, sz = FLAT_STORE[op] - for i in range(cnt): mem_write(addr + i * sz, sz, V[data_reg + i] & ((1 << (sz * 8)) - 1)) - elif op in FLAT_D16_LOAD: - sz, sign, hi = FLAT_D16_LOAD[op] - val = mem_read(addr, sz) - if sign: val = _sext(val, sz * 8) & 0xffff - V[vdst] = _dst16(V[vdst], val, hi) - elif op in FLAT_D16_STORE: - sz, hi = FLAT_D16_STORE[op] - mem_write(addr, sz, _src16(V[data_reg], hi) & ((1 << (sz * 8)) - 1)) - else: raise NotImplementedError(f"FLAT op {op}") - return - - if isinstance(inst, DS): - op, addr0, vdst = inst.op, (V[inst.addr] + inst.offset0) & 0xffff, inst.vdst - if op in DS_LOAD: - cnt, sz, sign = DS_LOAD[op] - for i in range(cnt): val = int.from_bytes(lds[addr0+i*sz:addr0+i*sz+sz], 'little'); V[vdst + i] = _sext(val, sz * 8) & MASK32 if sign else val - elif op in DS_STORE: - cnt, sz = DS_STORE[op] - for i in range(cnt): lds[addr0+i*sz:addr0+i*sz+sz] = (V[inst.data0 + i] & ((1 << (sz * 8)) - 1)).to_bytes(sz, 'little') - elif op in DS_LOAD_2ADDR: - # Load two values from addr+offset0*sz and addr+offset1*sz into vdst (B32: 1 dword each, B64: 2 dwords each) - # Note: offsets are scaled by data size (4 for B32, 8 for B64) per AMD ISA - sz = DS_LOAD_2ADDR[op] - addr0 = (V[inst.addr] + inst.offset0 * sz) & 0xffff - addr1 = (V[inst.addr] + inst.offset1 * sz) & 0xffff - cnt = sz // 4 # 1 for B32, 2 for B64 - for i in range(cnt): V[vdst + i] = int.from_bytes(lds[addr0+i*4:addr0+i*4+4], 'little') - for i in range(cnt): V[vdst + cnt + i] = int.from_bytes(lds[addr1+i*4:addr1+i*4+4], 'little') - elif op in DS_STORE_2ADDR: - # Store two values from data0 and data1 to addr+offset0*sz and addr+offset1*sz - # Note: offsets are scaled by data size (4 for B32, 8 for B64) per AMD ISA - sz = DS_STORE_2ADDR[op] - addr0 = (V[inst.addr] + inst.offset0 * sz) & 0xffff - addr1 = (V[inst.addr] + inst.offset1 * sz) & 0xffff - cnt = sz // 4 - for i in range(cnt): lds[addr0+i*4:addr0+i*4+4] = (V[inst.data0 + i] & MASK32).to_bytes(4, 'little') - for i in range(cnt): lds[addr1+i*4:addr1+i*4+4] = (V[inst.data1 + i] & MASK32).to_bytes(4, 'little') - else: raise NotImplementedError(f"DS op {op}") + # Memory ops (FLAT/GLOBAL/SCRATCH and DS) - use generated pcode + if isinstance(inst, (FLAT, DS)): + op, vdst, op_name = inst.op, inst.vdst, inst.op.name + fn, ndwords = compiled[type(op)][op], _op_ndwords(op_name) + if isinstance(inst, FLAT): + addr = V[inst.addr] | (V[inst.addr + 1] << 32) + ADDR = (st.rsgpr64(inst.saddr) + V[inst.addr] + _sext(inst.offset, 13)) & MASK64 if inst.saddr not in (NULL, 0x7f) else (addr + _sext(inst.offset, 13)) & MASK64 + # 
For loads, VDATA comes from vdst (preserves unwritten bits); for stores, from inst.data + vdata_src = vdst if 'LOAD' in op_name else inst.data + result = fn(GlobalMem, ADDR, _vgpr_read(V, vdata_src, ndwords), Reg(V[vdst]), Reg(0)) + if 'VDATA' in result: _vgpr_write(V, vdst, result['VDATA']._val, ndwords) + if 'RETURN_DATA' in result: _vgpr_write(V, vdst, result['RETURN_DATA']._val, ndwords) + else: # DS + DATA0, DATA1 = _vgpr_read(V, inst.data0, ndwords), _vgpr_read(V, inst.data1, ndwords) if inst.data1 is not None else Reg(0) + result = fn(lds, Reg(V[inst.addr]), DATA0, DATA1, Reg(inst.offset0), Reg(inst.offset1), Reg(0)) + if 'RETURN_DATA' in result and ('_RTN' in op_name or '_LOAD' in op_name): + _vgpr_write(V, vdst, result['RETURN_DATA']._val, ndwords * 2 if '_2ADDR_' in op_name else ndwords) return # VOPD: dual-issue, execute two ops simultaneously (read all inputs before writes) @@ -423,7 +423,7 @@ def exec_wmma(st: WaveState, inst, op: VOP3POp) -> None: # MAIN EXECUTION LOOP # ═══════════════════════════════════════════════════════════════════════════════ -def step_wave(program: Program, st: WaveState, lds: bytearray, n_lanes: int) -> int: +def step_wave(program: Program, st: WaveState, lds: LDSMem, n_lanes: int) -> int: inst = program.get(st.pc) if inst is None: return 1 inst_words, st.literal = inst._words, getattr(inst, '_literal', None) or 0 @@ -443,7 +443,7 @@ def step_wave(program: Program, st: WaveState, lds: bytearray, n_lanes: int) -> st.pc += inst_words return 0 -def exec_wave(program: Program, st: WaveState, lds: bytearray, n_lanes: int) -> int: +def exec_wave(program: Program, st: WaveState, lds: LDSMem, n_lanes: int) -> int: while st.pc in program: result = step_wave(program, st, lds, n_lanes) if result == -1: return 0 @@ -453,7 +453,7 @@ def exec_wave(program: Program, st: WaveState, lds: bytearray, n_lanes: int) -> def exec_workgroup(program: Program, workgroup_id: tuple[int, int, int], local_size: tuple[int, int, int], args_ptr: int, wg_id_sgpr_base: int, wg_id_enables: tuple[bool, bool, bool]) -> None: lx, ly, lz = local_size - total_threads, lds = lx * ly * lz, bytearray(65536) + total_threads, lds = lx * ly * lz, LDSMem(bytearray(65536)) waves: list[tuple[WaveState, int, int]] = [] for wave_start in range(0, total_threads, WAVE_SIZE): n_lanes, st = min(WAVE_SIZE, total_threads - wave_start), WaveState() diff --git a/extra/assembly/amd/pcode.py b/extra/assembly/amd/pcode.py index 65e1193f68..3e6b83c0e9 100644 --- a/extra/assembly/amd/pcode.py +++ b/extra/assembly/amd/pcode.py @@ -1,6 +1,6 @@ # DSL for RDNA3 pseudocode - makes pseudocode expressions work directly as Python import struct, math -from extra.assembly.amd.dsl import MASK32, MASK64, _f32, _i32, _sext, _f16, _i16, _f64, _i64 +from extra.assembly.amd.dsl import MASK32, MASK64, MASK128, _f32, _i32, _sext, _f16, _i16, _f64, _i64 # ═══════════════════════════════════════════════════════════════════════════════ # HELPER FUNCTIONS @@ -206,47 +206,6 @@ def signext_from_bit(val, bit): if val & (1 << (bit - 1)): return val - (1 << bit) return val -# ═══════════════════════════════════════════════════════════════════════════════ -# DSL EXPORTS -# ═══════════════════════════════════════════════════════════════════════════════ - -__all__ = [ - # Classes - 'Reg', 'SliceProxy', 'TypedView', - # Pack functions - '_pack', '_pack32', 'pack', 'pack32', - # Constants - 'WAVE32', 'WAVE64', 'MASK32', 'MASK64', 'WAVE_MODE', 'DENORM', 'OVERFLOW_F32', 'UNDERFLOW_F32', - 'OVERFLOW_F64', 'UNDERFLOW_F64', 'MAX_FLOAT_F32', 
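The dispatch above calls the generated memory function and writes back whichever result keys it returns; a toy version of that result-dict convention, with a hypothetical ds_add_rtn_u32 over a dict standing in for the generated pcode:

# Toy result-dict convention: the op function returns a dict, and the dispatcher writes
# RETURN_DATA (if present) back into consecutive VGPRs starting at vdst.
MASK32 = 0xffffffff

def ds_add_rtn_u32(mem: dict, addr: int, data0: int):
    old = mem.get(addr, 0)
    mem[addr] = (old + data0) & MASK32
    return {'RETURN_DATA': old}             # RTN ops report the pre-op value

def dispatch(fn, mem, V, addr_vgpr, data_vgpr, vdst, ndwords=1):
    result = fn(mem, V[addr_vgpr], V[data_vgpr])
    if 'RETURN_DATA' in result:
        for i in range(ndwords): V[vdst + i] = (result['RETURN_DATA'] >> (32 * i)) & MASK32

mem, V = {}, [0] * 8
V[1], V[2] = 0x100, 5                       # addr in v1, operand in v2
mem[0x100] = 7
dispatch(ds_add_rtn_u32, mem, V, addr_vgpr=1, data_vgpr=2, vdst=3)
assert V[3] == 7 and mem[0x100] == 12       # old value returned, memory updated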
'ROUND_MODE', 'cvtToQuietNAN', 'DST', 'INF', 'PI', - 'TWO_OVER_PI_1201', - # Aliases for pseudocode - 's_ff1_i32_b32', 's_ff1_i32_b64', 'GT_NEG_ZERO', 'LT_NEG_ZERO', - 'isNAN', 'isQuietNAN', 'isSignalNAN', 'fma', 'ldexp', 'sign', 'exponent', 'F', 'signext', - # Conversion functions - '_f32', '_i32', '_f16', '_i16', '_f64', '_i64', '_sext', '_to_f16_bits', '_f16_to_f32_bits', - 'i32_to_f32', 'u32_to_f32', 'i32_to_f64', 'u32_to_f64', 'f32_to_f64', 'f64_to_f32', - 'f32_to_i32', 'f32_to_u32', 'f64_to_i32', 'f64_to_u32', 'f32_to_f16', 'f16_to_f32', - 'i16_to_f16', 'u16_to_f16', 'f16_to_i16', 'f16_to_u16', 'u32_to_u16', 'i32_to_i16', - 'f16_to_snorm', 'f16_to_unorm', 'f32_to_snorm', 'f32_to_unorm', 'v_cvt_i16_f32', 'v_cvt_u16_f32', - 'SAT8', 'f32_to_u8', 'u8_to_u32', 'u4_to_u32', - # BF16 conversion functions - '_bf16', '_ibf16', 'bf16_to_f32', 'f32_to_bf16', - # Math functions - 'trunc', 'floor', 'ceil', 'sqrt', 'log2', 'sin', 'cos', 'pow', 'fract', 'isEven', 'mantissa', - # Min/max functions - 'v_min_f32', 'v_max_f32', 'v_min_i32', 'v_max_i32', 'v_min_u32', 'v_max_u32', - 'v_min_f16', 'v_max_f16', 'v_min_i16', 'v_max_i16', 'v_min_u16', 'v_max_u16', - 'v_min3_f32', 'v_max3_f32', 'v_min3_i32', 'v_max3_i32', 'v_min3_u32', 'v_max3_u32', - 'v_min3_f16', 'v_max3_f16', 'v_min3_i16', 'v_max3_i16', 'v_min3_u16', 'v_max3_u16', - 'ABSDIFF', - # Byte/SAD helper functions - 'BYTE_PERMUTE', 'v_sad_u8', 'v_msad_u8', - # Bit manipulation - '_brev32', '_brev64', '_ctz32', '_ctz64', '_exponent', '_is_denorm_f32', '_is_denorm_f64', - '_sign', '_mantissa_f32', '_div', '_isnan', '_isquietnan', '_issignalnan', '_gt_neg_zero', '_lt_neg_zero', '_fma', '_ldexp', '_signext', - 'signext_from_bit', -] - # Aliases used in pseudocode s_ff1_i32_b32, s_ff1_i32_b64 = _ctz32, _ctz64 GT_NEG_ZERO, LT_NEG_ZERO = _gt_neg_zero, _lt_neg_zero @@ -341,12 +300,6 @@ class _Denorm: f64 = _DenormChecker(64) DENORM = _Denorm() -def _brev(v, bits): - """Bit-reverse a value.""" - result = 0 - for i in range(bits): result |= ((v >> i) & 1) << (bits - 1 - i) - return result - class SliceProxy: """Proxy for D0[31:16] that supports .f16/.u16 etc getters and setters.""" __slots__ = ('_reg', '_high', '_low', '_reversed') @@ -474,9 +427,9 @@ class TypedView: def u32(s): return s if s._bits == 32 and not s._signed else int(s) & MASK32 class Reg: - """GPU register: D0.f32 = S0.f32 + S1.f32 just works.""" + """GPU register: D0.f32 = S0.f32 + S1.f32 just works. 
Supports up to 128 bits for DS_LOAD_B128.""" __slots__ = ('_val',) - def __init__(self, val=0): self._val = int(val) & MASK64 + def __init__(self, val=0): self._val = int(val) & MASK128 # Typed views u64 = property(lambda s: TypedView(s, 64), lambda s, v: setattr(s, '_val', int(v) & MASK64)) diff --git a/extra/assembly/amd/pdf.py b/extra/assembly/amd/pdf.py index abd35022cd..88c646f12e 100644 --- a/extra/assembly/amd/pdf.py +++ b/extra/assembly/amd/pdf.py @@ -36,7 +36,7 @@ FIELD_ORDER = { SRC_EXTRAS = {233: 'DPP8', 234: 'DPP8FI', 250: 'DPP16', 251: 'VCCZ', 252: 'EXECZ', 254: 'LDS_DIRECT'} FLOAT_MAP = {'0.5': 'POS_HALF', '-0.5': 'NEG_HALF', '1.0': 'POS_ONE', '-1.0': 'NEG_ONE', '2.0': 'POS_TWO', '-2.0': 'NEG_TWO', '4.0': 'POS_FOUR', '-4.0': 'NEG_FOUR', '1/(2*PI)': 'INV_2PI', '0': 'ZERO'} -INST_PATTERN = re.compile(r'^([SV]_[A-Z0-9_]+)\s+(\d+)\s*$', re.M) +INST_PATTERN = re.compile(r'^([SVD]S?_[A-Z0-9_]+|(?:FLAT|GLOBAL|SCRATCH)_[A-Z0-9_]+)\s+(\d+)\s*$', re.M) # Patterns that can't be handled by the DSL (require special handling in emu.py) UNSUPPORTED = ['SGPR[', 'V_SWAP', 'eval ', 'FATAL_HALT', 'HW_REGISTERS', @@ -46,7 +46,8 @@ UNSUPPORTED = ['SGPR[', 'V_SWAP', 'eval ', 'FATAL_HALT', 'HW_REGISTERS', 'if n.', 'DST.u32', 'addrd = DST', 'addr = DST', 'BARRIER_STATE', 'ReallocVgprs', 'GPR_IDX', 'VSKIP', 'specified in', 'TTBL', - 'fp6', 'bf6'] # Malformed pseudocode from PDF + 'fp6', 'bf6', 'GS_REGS', 'M0.base', 'DS_DATA', '= 0..', 'sign(src', 'if no LDS', 'gds_base', 'vector mask', + 'SGPR_ADDR', 'INST_OFFSET', 'laneID'] # FLAT ops with non-standard vars # ═══════════════════════════════════════════════════════════════════════════════ # COMPILER: pseudocode -> Python (minimal transforms) @@ -68,8 +69,8 @@ def compile_pseudocode(pseudocode: str) -> str: lines = [] indent, need_pass, in_first_match_loop = 0, False, False for line in joined_lines: - line = line.strip() - if not line or line.startswith('//'): continue + line = line.split('//')[0].strip() # Strip C-style comments + if not line: continue if line.startswith('if '): lines.append(' ' * indent + f"if {_expr(line[3:].rstrip(' then'))}:") indent += 1 @@ -351,8 +352,9 @@ def _extract_pseudocode(text: str) -> str | None: for line in lines: s = line.strip() if not s or re.match(r'^\d+ of \d+$', s) or re.match(r'^\d+\.\d+\..*Instructions', s): continue - if s.startswith(('Notes', 'Functional examples')): break + if s.startswith(('Notes', 'Functional examples', '•', '-')): break # Stop at notes/bullets if s.startswith(('"RDNA', 'AMD ', 'CDNA')): continue + if '•' in s or '–' in s: continue # Skip lines with bullets/dashes if '= lambda(' in s: in_lambda += 1; continue if in_lambda > 0: if s.endswith(');'): in_lambda -= 1 @@ -362,7 +364,8 @@ def _extract_pseudocode(text: str) -> str | None: if s.endswith('.') and not any(p in s for p in ['D0', 'D1', 'S0', 'S1', 'S2', 'SCC', 'VCC', 'tmp', '=']): continue if re.match(r'^[a-z].*\.$', s) and '=' not in s: continue is_code = (any(p in s for p in ['D0.', 'D1.', 'S0.', 'S1.', 'S2.', 'SCC =', 'SCC ?', 'VCC', 'EXEC', 'tmp =', 'tmp[', 'lane =', 'PC =', - 'D0[', 'D1[', 'S0[', 'S1[', 'S2[']) or + 'D0[', 'D1[', 'S0[', 'S1[', 'S2[', 'MEM[', 'RETURN_DATA', + 'VADDR', 'VDATA', 'VDST', 'SADDR', 'OFFSET']) or s.startswith(('if ', 'else', 'elsif', 'endif', 'declare ', 'for ', 'endfor', '//')) or re.match(r'^[a-z_]+\s*=', s) or re.match(r'^[a-z_]+\[', s) or (depth > 0 and '=' in s)) if is_code: result.append(s) @@ -448,28 +451,23 @@ def _generate_gen_pcode_py(enums, pseudocode, arch) -> str: # Get op enums for this 
arch (import from .ins which re-exports from .enum) import importlib autogen = importlib.import_module(f"extra.assembly.amd.autogen.{arch}.ins") - OP_ENUMS = [getattr(autogen, name) for name in ['SOP1Op', 'SOP2Op', 'SOPCOp', 'SOPKOp', 'SOPPOp', 'VOP1Op', 'VOP2Op', 'VOP3Op', 'VOP3SDOp', 'VOP3POp', 'VOPCOp', 'VOP3AOp', 'VOP3BOp'] if hasattr(autogen, name)] + OP_ENUMS = [getattr(autogen, name) for name in ['SOP1Op', 'SOP2Op', 'SOPCOp', 'SOPKOp', 'SOPPOp', 'VOP1Op', 'VOP2Op', 'VOP3Op', 'VOP3SDOp', 'VOP3POp', 'VOPCOp', 'VOP3AOp', 'VOP3BOp', 'DSOp', 'FLATOp', 'GLOBALOp', 'SCRATCHOp'] if hasattr(autogen, name)] # Build defined ops mapping defined_ops: dict[tuple, list] = {} for enum_cls in OP_ENUMS: for op in enum_cls: - if op.name.startswith(('S_', 'V_')): defined_ops.setdefault((op.name, op.value), []).append((enum_cls, op)) + if op.name.startswith(('S_', 'V_', 'DS_', 'FLAT_', 'GLOBAL_', 'SCRATCH_')): defined_ops.setdefault((op.name, op.value), []).append((enum_cls, op)) enum_names = [e.__name__ for e in OP_ENUMS] - lines = [f'''# autogenerated by pdf.py - do not edit -# to regenerate: python -m extra.assembly.amd.pdf --arch {arch} -# ruff: noqa: E501,F405,F403 -# mypy: ignore-errors -from extra.assembly.amd.autogen.{arch}.enum import {", ".join(enum_names)} -from extra.assembly.amd.pcode import * -'''] - instructions: dict = {cls: {} for cls in OP_ENUMS} for key, pc in pseudocode.items(): if key in defined_ops: for enum_cls, enum_val in defined_ops[key]: instructions[enum_cls][enum_val] = pc + # First pass: generate all function code + fn_lines: list[str] = [] + all_fn_entries: dict = {} for enum_cls in OP_ENUMS: cls_name = enum_cls.__name__ if not instructions.get(enum_cls): continue @@ -480,28 +478,44 @@ from extra.assembly.amd.pcode import * code = compile_pseudocode(pc) code = _apply_pseudocode_fixes(op, code) fn_name, fn_code = _generate_function(cls_name, op, pc, code) - lines.append(fn_code) + fn_lines.append(fn_code) fn_entries.append((op, fn_name)) except Exception as e: print(f" Warning: Failed to compile {op.name}: {e}") if fn_entries: - lines.append(f'{cls_name}_FUNCTIONS = {{') - for op, fn_name in fn_entries: lines.append(f" {cls_name}.{op.name}: {fn_name},") - lines.append('}\n') + all_fn_entries[enum_cls] = fn_entries + fn_lines.append(f'{cls_name}_FUNCTIONS = {{') + for op, fn_name in fn_entries: fn_lines.append(f" {cls_name}.{op.name}: {fn_name},") + fn_lines.append('}\n') # Add V_WRITELANE_B32 if VOP3Op exists if 'VOP3Op' in enum_names: - lines.append(''' + fn_lines.append(''' # V_WRITELANE_B32: Write scalar to specific lane's VGPR (not in PDF pseudocode) -def _VOP3Op_V_WRITELANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0): +def _VOP3Op_V_WRITELANE_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None): wr_lane = s1 & 0x1f return {'d0': d0, 'scc': scc, 'vgpr_write': (wr_lane, vdst_idx, s0 & 0xffffffff)} VOP3Op_FUNCTIONS[VOP3Op.V_WRITELANE_B32] = _VOP3Op_V_WRITELANE_B32 ''') - lines.append('COMPILED_FUNCTIONS = {') + fn_lines.append('COMPILED_FUNCTIONS = {') for enum_cls in OP_ENUMS: - if instructions.get(enum_cls): lines.append(f' {enum_cls.__name__}: {enum_cls.__name__}_FUNCTIONS,') - lines.append('}\n\ndef get_compiled_functions(): return COMPILED_FUNCTIONS') + if all_fn_entries.get(enum_cls): fn_lines.append(f' {enum_cls.__name__}: {enum_cls.__name__}_FUNCTIONS,') + fn_lines.append('}\n\ndef get_compiled_functions(): return COMPILED_FUNCTIONS') + + # Second pass: scan generated code 
for pcode imports + fn_code_str = '\n'.join(fn_lines) + import extra.assembly.amd.pcode as pcode_module + pcode_exports = [name for name in dir(pcode_module) if not name.startswith('_') or name.startswith('_') and not name.startswith('__')] + used_imports = sorted(name for name in pcode_exports if re.search(rf'\b{re.escape(name)}\b', fn_code_str)) + + # Build final output with explicit imports + lines = [f'''# autogenerated by pdf.py - do not edit +# to regenerate: python -m extra.assembly.amd.pdf --arch {arch} +# ruff: noqa: E501 +# mypy: ignore-errors +from extra.assembly.amd.autogen.{arch}.enum import {", ".join(enum_names)} +from extra.assembly.amd.pcode import {", ".join(used_imports)} +'''] + fn_lines return '\n'.join(lines) def _apply_pseudocode_fixes(op, code: str) -> str: @@ -541,19 +555,32 @@ def _generate_function(cls_name: str, op, pc: str, code: str) -> tuple[str, str] is_cmpx = (cls_name in ('VOPCOp', 'VOP3Op')) and 'EXEC.u64[laneId]' in pc is_div_scale = 'DIV_SCALE' in op.name has_sdst = cls_name == 'VOP3SDOp' and ('VCC.u64[laneId]' in pc or is_div_scale) + is_ds = cls_name == 'DSOp' + is_flat = cls_name in ('FLATOp', 'GLOBALOp', 'SCRATCHOp') combined = code + pc fn_name = f"_{cls_name}_{op.name}" # Function accepts Reg objects directly (uppercase names), laneId is passed directly as int - lines = [f"def {fn_name}(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None):"] + # DSOp functions get additional MEM and offset parameters + # FLAT/GLOBAL ops get MEM, vaddr, vdata, saddr, offset parameters + if is_ds: + lines = [f"def {fn_name}(MEM, ADDR, DATA0, DATA1, OFFSET0, OFFSET1, RETURN_DATA):"] + elif is_flat: + lines = [f"def {fn_name}(MEM, ADDR, VDATA, VDST, RETURN_DATA):"] + else: + lines = [f"def {fn_name}(S0, S1, S2, D0, SCC, VCC, laneId, EXEC, literal, VGPR, src0_idx=0, vdst_idx=0, PC=None):"] - # Registers that need special handling (not passed directly) - # Only init if used but not first assigned as `name = Reg(...)` in the compiled code + # Registers that need special handling (aliases or init) def needs_init(name): return name in combined and not re.search(rf'^\s*{name}\s*=\s*Reg\(', code, re.MULTILINE) - special_regs = [('D1', 'Reg(0)'), ('SIMM16', 'Reg(literal)'), ('SIMM32', 'Reg(literal)'), - ('SRC0', 'Reg(src0_idx)'), ('VDST', 'Reg(vdst_idx)')] - if needs_init('tmp'): special_regs.insert(0, ('tmp', 'Reg(0)')) - if needs_init('saveexec'): special_regs.insert(0, ('saveexec', 'Reg(EXEC._val)')) + special_regs = [] + if is_ds: special_regs = [('DATA', 'DATA0'), ('DATA2', 'DATA1'), ('OFFSET', 'OFFSET0'), ('ADDR_BASE', 'ADDR')] + elif is_flat: special_regs = [('DATA', 'VDATA')] + else: + special_regs = [('D1', 'Reg(0)'), ('SIMM16', 'Reg(literal)'), ('SIMM32', 'Reg(literal)'), + ('SRC0', 'Reg(src0_idx)'), ('VDST', 'Reg(vdst_idx)')] + if needs_init('tmp'): special_regs.insert(0, ('tmp', 'Reg(0)')) + if needs_init('saveexec'): special_regs.insert(0, ('saveexec', 'Reg(EXEC._val)')) + used = {name for name, _ in special_regs if name in combined} # Detect which registers are modified (not just read) - look for assignments @@ -562,6 +589,10 @@ def _generate_function(cls_name: str, op, pc: str, code: str) -> tuple[str, str] modifies_vcc = has_sdst or bool(re.search(r'VCC\.(u32|u64|b32|b64)\s*=|VCC\.u64\[laneId\]\s*=', combined)) modifies_scc = bool(re.search(r'\bSCC\s*=', combined)) modifies_pc = bool(re.search(r'\bPC\s*=', combined)) + # DS/FLAT ops: detect memory writes (MEM[...] = ...) 
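The second pass above derives the explicit import list by scanning the generated source for whole-word matches against the pcode module's names; the same idea in miniature, using the math module as a stand-in for pcode:

# Miniature used-import scan: keep only the module names that occur as whole words in the
# generated source, then emit a single explicit import line.
import re, math

def used_imports(module, generated_src: str) -> list[str]:
    exports = [n for n in dir(module) if not n.startswith('__')]
    return sorted(n for n in exports if re.search(rf'\b{re.escape(n)}\b', generated_src))

src = "def f(x):\n    return floor(x) + sqrt(x)\n"
names = used_imports(math, src)
assert 'floor' in names and 'sqrt' in names and 'sin' not in names
print(f"from math import {', '.join(names)}")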
+ modifies_mem = (is_ds or is_flat) and bool(re.search(r'MEM\[.*\]\.[a-z0-9]+\s*=', combined)) + # FLAT ops: detect VDST writes + modifies_vdst = is_flat and bool(re.search(r'VDST[\.\[].*=', combined)) # Build init code for special registers init_lines = [] @@ -587,6 +618,15 @@ def _generate_function(cls_name: str, op, pc: str, code: str) -> tuple[str, str] if modifies_exec: result_items.append("'EXEC': EXEC") if has_d1: result_items.append("'D1': D1") if modifies_pc: result_items.append("'PC': PC") + # DS ops: return RETURN_DATA if it was written (left side of assignment) + if is_ds and 'RETURN_DATA' in combined and re.search(r'^\s*RETURN_DATA[\.\[].*=', code, re.MULTILINE): + result_items.append("'RETURN_DATA': RETURN_DATA") + # FLAT ops: return RETURN_DATA for atomics, VDATA for loads (only if written to) + if is_flat: + if 'RETURN_DATA' in combined and re.search(r'^\s*RETURN_DATA[\.\[].*=', code, re.MULTILINE): + result_items.append("'RETURN_DATA': RETURN_DATA") + if re.search(r'^\s*VDATA[\.\[].*=', code, re.MULTILINE): + result_items.append("'VDATA': VDATA") lines.append(f" return {{{', '.join(result_items)}}}\n") return fn_name, '\n'.join(lines) diff --git a/extra/assembly/amd/test/test_compare_emulators.py b/extra/assembly/amd/test/test_compare_emulators.py index 6f10c75575..12b05805ee 100644 --- a/extra/assembly/amd/test/test_compare_emulators.py +++ b/extra/assembly/amd/test/test_compare_emulators.py @@ -9,7 +9,7 @@ os.environ["AMD"] = "1" os.environ["MOCKGPU"] = "1" os.environ["PYTHON_REMU"] = "1" -from extra.assembly.amd.emu import WaveState, decode_program, step_wave, WAVE_SIZE, set_valid_mem_ranges +from extra.assembly.amd.emu import WaveState, decode_program, step_wave, WAVE_SIZE, set_valid_mem_ranges, LDSMem from extra.assembly.amd.test.helpers import KernelInfo REMU_PATH = Path(__file__).parents[3] / "remu/target/release/libremu.so" @@ -99,7 +99,7 @@ class PythonEmulator: self.program = decode_program(kernel) self.state = WaveState() self.state.exec_mask = (1 << n_lanes) - 1 - self.lds = bytearray(65536) + self.lds = LDSMem(bytearray(65536)) self.n_lanes = n_lanes def step(self) -> int: diff --git a/extra/assembly/amd/test/test_emu.py b/extra/assembly/amd/test/test_emu.py index e099aca931..66b5bb4d30 100644 --- a/extra/assembly/amd/test/test_emu.py +++ b/extra/assembly/amd/test/test_emu.py @@ -142,6 +142,7 @@ test: .amdhsa_wavefront_size32 1 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_kernarg_size 8 + .amdhsa_group_segment_fixed_size 65536 .end_amdhsa_kernel .amdgpu_metadata @@ -153,7 +154,7 @@ amdhsa.kernels: - .name: test .symbol: test.kd .kernarg_segment_size: 8 - .group_segment_fixed_size: 0 + .group_segment_fixed_size: 65536 .private_segment_fixed_size: 0 .kernarg_segment_align: 8 .wavefront_size: 32 @@ -989,6 +990,152 @@ class TestLaneInstructions(unittest.TestCase): for lane in range(4): self.assertEqual(st.vgpr[lane][1], 10, f"Sum 1+2+3+4 should be 10") + def test_v_writelane_b32_different_vgpr(self): + """V_WRITELANE_B32 writes to a non-zero VGPR index. + + Regression test for bug where vdst_idx was always 0 due to function signature + mismatch (_vars parameter shifted all arguments). This caused all WRITELANE + operations to write to v[0] regardless of the actual destination register. + """ + instructions = [ + v_mov_b32_e32(v[0], 0), # Initialize v0 = 0 + v_mov_b32_e32(v[5], 0), # Initialize v5 = 0 + s_mov_b32(s[0], 0x12345678), # Value to write + v_writelane_b32(v[5], s[0], 1), # Write to lane 1's v5 (NOT v0!) 
+ ] + st = run_program(instructions, n_lanes=4) + # v[0] should remain 0 for all lanes (bug would have written here) + for lane in range(4): + self.assertEqual(st.vgpr[lane][0], 0, f"v[0] lane {lane} should be 0 (untouched)") + # v[5] should have the value only in lane 1 + for lane in range(4): + if lane == 1: + self.assertEqual(st.vgpr[lane][5], 0x12345678, f"v[5] lane 1 should have 0x12345678") + else: + self.assertEqual(st.vgpr[lane][5], 0, f"v[5] lane {lane} should be 0") + + def test_v_writelane_b32_high_vgpr_index(self): + """V_WRITELANE_B32 writes to a high VGPR index (v[15]). + + Tests that the vdst_idx is correctly passed through for larger register indices. + """ + instructions = [ + v_mov_b32_e32(v[0], 0), # Initialize v0 = 0 + v_mov_b32_e32(v[15], 0), # Initialize v15 = 0 + s_mov_b32(s[0], 0xCAFEBABE), # Value to write + v_writelane_b32(v[15], s[0], 0), # Write to lane 0's v15 + ] + st = run_program(instructions, n_lanes=4) + # v[0] should remain 0 for all lanes + for lane in range(4): + self.assertEqual(st.vgpr[lane][0], 0, f"v[0] lane {lane} should be 0") + # v[15] should have the value only in lane 0 + self.assertEqual(st.vgpr[0][15], 0xCAFEBABE, "v[15] lane 0 should have 0xCAFEBABE") + for lane in range(1, 4): + self.assertEqual(st.vgpr[lane][15], 0, f"v[15] lane {lane} should be 0") + + def test_v_writelane_b32_multiple_writes_different_vgprs(self): + """V_WRITELANE_B32 writes to multiple different VGPRs. + + This is the pattern used in sparse_categorical_crossentropy where values + are written to different VGPR indices via writelane, then read back. + """ + instructions = [ + # Initialize all target VGPRs to 0 + v_mov_b32_e32(v[0], 0), + v_mov_b32_e32(v[3], 0), + v_mov_b32_e32(v[7], 0), + v_mov_b32_e32(v[10], 0), + # Write different values to different VGPRs at different lanes + s_mov_b32(s[0], 100), + v_writelane_b32(v[3], s[0], 0), # v[3] lane 0 = 100 + s_mov_b32(s[0], 200), + v_writelane_b32(v[7], s[0], 1), # v[7] lane 1 = 200 + s_mov_b32(s[0], 300), + v_writelane_b32(v[10], s[0], 2), # v[10] lane 2 = 300 + ] + st = run_program(instructions, n_lanes=4) + + # v[0] should remain 0 everywhere + for lane in range(4): + self.assertEqual(st.vgpr[lane][0], 0, f"v[0] lane {lane} should be 0") + + # Check each target VGPR + self.assertEqual(st.vgpr[0][3], 100, "v[3] lane 0 should be 100") + for lane in range(1, 4): + self.assertEqual(st.vgpr[lane][3], 0, f"v[3] lane {lane} should be 0") + + self.assertEqual(st.vgpr[1][7], 200, "v[7] lane 1 should be 200") + for lane in [0, 2, 3]: + self.assertEqual(st.vgpr[lane][7], 0, f"v[7] lane {lane} should be 0") + + self.assertEqual(st.vgpr[2][10], 300, "v[10] lane 2 should be 300") + for lane in [0, 1, 3]: + self.assertEqual(st.vgpr[lane][10], 0, f"v[10] lane {lane} should be 0") + + def test_v_writelane_then_readlane_different_vgpr(self): + """V_WRITELANE followed by V_READLANE on a non-zero VGPR. + + Regression test: the original bug caused writelane to always write to v[0], + so reading back from the intended VGPR would return 0 instead of the written value. + This is the exact pattern that failed in sparse_categorical_crossentropy. 
+ """ + instructions = [ + v_mov_b32_e32(v[0], 0), # Initialize v0 = 0 + v_mov_b32_e32(v[8], 0), # Initialize v8 = 0 + s_mov_b32(s[0], 0xABCD1234), + v_writelane_b32(v[8], s[0], 2), # Write to lane 2's v8 + self._readlane(1, v[8], 2), # Read back from lane 2's v8 into s1 + v_mov_b32_e32(v[1], s[1]), # Broadcast to all lanes + ] + st = run_program(instructions, n_lanes=4) + # The read value should be what we wrote + for lane in range(4): + self.assertEqual(st.vgpr[lane][1], 0xABCD1234, + f"Lane {lane}: readlane should return 0xABCD1234, got 0x{st.vgpr[lane][1]:08x}") + # v[0] should still be 0 (bug would have written here instead of v[8]) + for lane in range(4): + self.assertEqual(st.vgpr[lane][0], 0, f"v[0] lane {lane} should be 0 (untouched)") + + def test_v_writelane_b32_accumulate_pattern(self): + """V_WRITELANE_B32 used to accumulate values across lanes into a single VGPR. + + This pattern is used in reductions where each lane writes its result to + a different lane of the same VGPR, then the results are read back. + """ + instructions = [ + v_mov_b32_e32(v[6], 0), # Initialize accumulator v6 = 0 + # Each "iteration" writes to a different lane + s_mov_b32(s[0], 10), + v_writelane_b32(v[6], s[0], 0), # lane 0 gets 10 + s_mov_b32(s[0], 20), + v_writelane_b32(v[6], s[0], 1), # lane 1 gets 20 + s_mov_b32(s[0], 30), + v_writelane_b32(v[6], s[0], 2), # lane 2 gets 30 + s_mov_b32(s[0], 40), + v_writelane_b32(v[6], s[0], 3), # lane 3 gets 40 + # Now read them all back and sum + self._readlane(0, v[6], 0), # s0 = 10 + self._readlane(1, v[6], 1), # s1 = 20 + s_add_u32(s[0], s[0], s[1]), # s0 = 30 + self._readlane(1, v[6], 2), # s1 = 30 + s_add_u32(s[0], s[0], s[1]), # s0 = 60 + self._readlane(1, v[6], 3), # s1 = 40 + s_add_u32(s[0], s[0], s[1]), # s0 = 100 + v_mov_b32_e32(v[7], s[0]), # Broadcast sum to all lanes + ] + st = run_program(instructions, n_lanes=4) + + # Check that each lane of v[6] has the correct value + self.assertEqual(st.vgpr[0][6], 10, "v[6] lane 0 should be 10") + self.assertEqual(st.vgpr[1][6], 20, "v[6] lane 1 should be 20") + self.assertEqual(st.vgpr[2][6], 30, "v[6] lane 2 should be 30") + self.assertEqual(st.vgpr[3][6], 40, "v[6] lane 3 should be 40") + + # Check the sum + for lane in range(4): + self.assertEqual(st.vgpr[lane][7], 100, f"Sum should be 100, got {st.vgpr[lane][7]}") + class TestTrigonometry(unittest.TestCase): """Tests for trigonometric instructions.""" @@ -3690,10 +3837,6 @@ class TestVOP3F16Modifiers(unittest.TestCase): self.assertAlmostEqual(result, -6.0, delta=0.01, msg=f"Expected -6.0, got {result}") -if __name__ == '__main__': - unittest.main() - - class TestVFmaMixSinCase(unittest.TestCase): """Tests for the specific V_FMA_MIXLO_F16 case that fails in AMD_LLVM sin(0) kernel.""" @@ -4256,3 +4399,1370 @@ class TestDS2Addr(unittest.TestCase): # v6,v7 from addr 8-15: 0x33333333, 0x44444444 self.assertEqual(st.vgpr[0][6], 0x33333333, "v6 should be 0x33333333") self.assertEqual(st.vgpr[0][7], 0x44444444, "v7 should be 0x44444444") + + +class TestDSAtomic(unittest.TestCase): + """Tests for DS atomic instructions (add, max, min, and, or, xor, cmpstore, etc.).""" + + def test_ds_max_rtn_u32(self): + """DS_MAX_RTN_U32: atomically store max(mem, data) and return old value.""" + instructions = [ + v_mov_b32_e32(v[10], 0), # addr = 0 + s_mov_b32(s[2], 100), + v_mov_b32_e32(v[0], s[2]), # initial value = 100 + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[2], 200), + v_mov_b32_e32(v[1], s[2]), # data = 200 (greater than 
100) + ds_max_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[10], vdst=v[3], offset0=0), # read result + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 100, "v2 should have old value (100)") + self.assertEqual(st.vgpr[0][3], 200, "v3 should have max(100, 200) = 200") + + def test_ds_max_u32_no_rtn(self): + """DS_MAX_U32 (no RTN): atomically store max, no return value.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[2], 100), + v_mov_b32_e32(v[0], s[2]), # initial = 100 + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[2], 200), + v_mov_b32_e32(v[1], s[2]), # data = 200 + ds_max_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[10], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][3], 200, "v3 should have max(100, 200) = 200") + + def test_ds_add_u32_no_rtn_preserves_vdst(self): + """DS_ADD_U32 (no RTN) should NOT write to vdst - vdst should preserve sentinel value.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + # Set sentinel value in vdst + s_mov_b32(s[2], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[2]), # sentinel in v2 + # Store initial value + s_mov_b32(s[2], 100), + v_mov_b32_e32(v[0], s[2]), + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + # Do non-RTN add (should NOT write to v2) + s_mov_b32(s[2], 50), + v_mov_b32_e32(v[1], s[2]), + ds_add_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0), + s_waitcnt(lgkmcnt=0), + # Load result to verify add worked + ds_load_b32(addr=v[10], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0xDEADBEEF, "v2 should preserve sentinel (no RTN)") + self.assertEqual(st.vgpr[0][3], 150, "v3 should have 100 + 50 = 150") + + def test_ds_add_rtn_u32_writes_vdst(self): + """DS_ADD_RTN_U32 should write old value to vdst.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + # Set sentinel value in vdst + s_mov_b32(s[2], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[2]), # sentinel in v2 + # Store initial value + s_mov_b32(s[2], 100), + v_mov_b32_e32(v[0], s[2]), + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + # Do RTN add (SHOULD write old value to v2) + s_mov_b32(s[2], 50), + v_mov_b32_e32(v[1], s[2]), + ds_add_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0), + s_waitcnt(lgkmcnt=0), + # Load result to verify add worked + ds_load_b32(addr=v[10], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 100, "v2 should have old value (100)") + self.assertEqual(st.vgpr[0][3], 150, "v3 should have 100 + 50 = 150") + + def test_ds_min_rtn_u32(self): + """DS_MIN_RTN_U32: atomically store min(mem, data) and return old value.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[2], 200), + v_mov_b32_e32(v[0], s[2]), # initial = 200 + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[2], 100), + v_mov_b32_e32(v[1], s[2]), # data = 100 + ds_min_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[10], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 200, "v2 should have old value (200)") + self.assertEqual(st.vgpr[0][3], 
100, "v3 should have min(200, 100) = 100") + + def test_ds_and_rtn_b32(self): + """DS_AND_RTN_B32: atomically AND mem with data and return old value.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[2], 0xFF00FF00), + v_mov_b32_e32(v[0], s[2]), # initial = 0xFF00FF00 + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[2], 0xFFFF0000), + v_mov_b32_e32(v[1], s[2]), # data = 0xFFFF0000 + ds_and_rtn_b32(addr=v[10], data0=v[1], vdst=v[2], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[10], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0xFF00FF00, "v2 should have old value") + self.assertEqual(st.vgpr[0][3], 0xFF000000, "v3 should have 0xFF00FF00 & 0xFFFF0000 = 0xFF000000") + + def test_ds_or_rtn_b32(self): + """DS_OR_RTN_B32: atomically OR mem with data and return old value.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[2], 0x00FF0000), + v_mov_b32_e32(v[0], s[2]), # initial = 0x00FF0000 + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[2], 0x000000FF), + v_mov_b32_e32(v[1], s[2]), # data = 0x000000FF + ds_or_rtn_b32(addr=v[10], data0=v[1], vdst=v[2], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[10], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0x00FF0000, "v2 should have old value") + self.assertEqual(st.vgpr[0][3], 0x00FF00FF, "v3 should have 0x00FF0000 | 0x000000FF = 0x00FF00FF") + + def test_ds_xor_rtn_b32(self): + """DS_XOR_RTN_B32: atomically XOR mem with data and return old value.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[2], 0xAAAAAAAA), + v_mov_b32_e32(v[0], s[2]), # initial = 0xAAAAAAAA + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[2], 0xFFFFFFFF), + v_mov_b32_e32(v[1], s[2]), # data = 0xFFFFFFFF + ds_xor_rtn_b32(addr=v[10], data0=v[1], vdst=v[2], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[10], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA, "v2 should have old value") + self.assertEqual(st.vgpr[0][3], 0x55555555, "v3 should have 0xAAAAAAAA ^ 0xFFFFFFFF = 0x55555555") + + def test_ds_cmpstore_b32_match(self): + """DS_CMPSTORE_B32: conditional store when compare matches.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[2], 100), + v_mov_b32_e32(v[0], s[2]), # initial = 100 + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[2], 200), + v_mov_b32_e32(v[1], s[2]), # new value = 200 + s_mov_b32(s[2], 100), + v_mov_b32_e32(v[2], s[2]), # compare = 100 (matches current) + ds_cmpstore_b32(addr=v[10], data0=v[1], data1=v[2], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[10], vdst=v[4], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][4], 200, "mem should be updated to 200 (compare matched)") + + def test_ds_cmpstore_b32_no_match(self): + """DS_CMPSTORE_B32: no store when compare doesn't match.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[2], 100), + v_mov_b32_e32(v[0], s[2]), # initial = 100 + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[2], 200), + v_mov_b32_e32(v[1], s[2]), # new value = 200 + s_mov_b32(s[2], 50), + v_mov_b32_e32(v[2], 
s[2]), # compare = 50 (doesn't match 100) + ds_cmpstore_b32(addr=v[10], data0=v[1], data1=v[2], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[10], vdst=v[4], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][4], 100, "mem should still be 100 (compare didn't match)") + + def test_ds_inc_rtn_u32(self): + """DS_INC_RTN_U32: increment with wrap, return old value.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[2], 5), + v_mov_b32_e32(v[0], s[2]), # initial = 5 + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[2], 10), + v_mov_b32_e32(v[1], s[2]), # limit = 10 + ds_inc_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[10], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 5, "v2 should have old value (5)") + self.assertEqual(st.vgpr[0][3], 6, "v3 should have incremented value (6)") + + def test_ds_dec_rtn_u32(self): + """DS_DEC_RTN_U32: decrement with wrap, return old value.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[2], 5), + v_mov_b32_e32(v[0], s[2]), # initial = 5 + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[2], 10), + v_mov_b32_e32(v[1], s[2]), # limit = 10 + ds_dec_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[10], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 5, "v2 should have old value (5)") + self.assertEqual(st.vgpr[0][3], 4, "v3 should have decremented value (4)") + + def test_ds_dec_rtn_u32_wrap(self): + """DS_DEC_RTN_U32: wraps to limit when value is 0 or > limit.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[2], 0), + v_mov_b32_e32(v[0], s[2]), # initial = 0 + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[2], 10), + v_mov_b32_e32(v[1], s[2]), # limit = 10 + ds_dec_rtn_u32(addr=v[10], data0=v[1], vdst=v[2], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[10], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0, "v2 should have old value (0)") + self.assertEqual(st.vgpr[0][3], 10, "v3 should wrap to limit (10)") + + +class TestDSRegisterWidth(unittest.TestCase): + """Regression tests: DS loads should only write the correct number of VGPRs.""" + + def test_ds_load_b32_no_overwrite(self): + """DS_LOAD_B32 should only write 1 VGPR, not overwrite subsequent registers.""" + instructions = [ + v_mov_b32_e32(v[0], 0), # addr = 0 + s_mov_b32(s[0], 0xDEADBEEF), + v_mov_b32_e32(v[1], s[0]), # store value + s_mov_b32(s[0], 0x11111111), + v_mov_b32_e32(v[2], s[0]), # sentinel + s_mov_b32(s[0], 0x22222222), + v_mov_b32_e32(v[3], s[0]), # sentinel + s_mov_b32(s[0], 0x33333333), + v_mov_b32_e32(v[4], s[0]), # sentinel + ds_store_b32(addr=v[0], data0=v[1], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[0], vdst=v[1], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][1], 0xDEADBEEF, "v1 should have loaded value") + self.assertEqual(st.vgpr[0][2], 0x11111111, "v2 should be untouched") + self.assertEqual(st.vgpr[0][3], 0x22222222, "v3 should be untouched") + self.assertEqual(st.vgpr[0][4], 0x33333333, "v4 should be 
untouched") + + def test_ds_load_b64_no_overwrite(self): + """DS_LOAD_B64 should only write 2 VGPRs, not overwrite subsequent registers.""" + instructions = [ + v_mov_b32_e32(v[0], 0), # addr = 0 + s_mov_b32(s[0], 0xDEADBEEF), + v_mov_b32_e32(v[1], s[0]), # low dword + s_mov_b32(s[0], 0xCAFEBABE), + v_mov_b32_e32(v[2], s[0]), # high dword + s_mov_b32(s[0], 0x11111111), + v_mov_b32_e32(v[5], s[0]), # sentinel + s_mov_b32(s[0], 0x22222222), + v_mov_b32_e32(v[6], s[0]), # sentinel + DS(DSOp.DS_STORE_B64, addr=v[0], data0=v[1], vdst=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + DS(DSOp.DS_LOAD_B64, addr=v[0], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][3], 0xDEADBEEF, "v3 should have low dword") + self.assertEqual(st.vgpr[0][4], 0xCAFEBABE, "v4 should have high dword") + self.assertEqual(st.vgpr[0][5], 0x11111111, "v5 should be untouched") + self.assertEqual(st.vgpr[0][6], 0x22222222, "v6 should be untouched") + + def test_ds_load_2addr_b32_no_overwrite(self): + """DS_LOAD_2ADDR_B32 should only write 2 VGPRs, not overwrite subsequent registers.""" + instructions = [ + v_mov_b32_e32(v[0], 0), # addr = 0 + s_mov_b32(s[0], 0xAAAAAAAA), + v_mov_b32_e32(v[1], s[0]), # first value + s_mov_b32(s[0], 0xBBBBBBBB), + v_mov_b32_e32(v[2], s[0]), # second value + s_mov_b32(s[0], 0x11111111), + v_mov_b32_e32(v[5], s[0]), # sentinel + s_mov_b32(s[0], 0x22222222), + v_mov_b32_e32(v[6], s[0]), # sentinel + DS(DSOp.DS_STORE_2ADDR_B32, addr=v[0], data0=v[1], data1=v[2], vdst=v[0], offset0=0, offset1=1), + s_waitcnt(lgkmcnt=0), + DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[0], vdst=v[3], offset0=0, offset1=1), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][3], 0xAAAAAAAA, "v3 should have first value") + self.assertEqual(st.vgpr[0][4], 0xBBBBBBBB, "v4 should have second value") + self.assertEqual(st.vgpr[0][5], 0x11111111, "v5 should be untouched") + self.assertEqual(st.vgpr[0][6], 0x22222222, "v6 should be untouched") + + +class TestDS2AddrStride64(unittest.TestCase): + """Tests for DS_*_2ADDR_STRIDE64 instructions (offset * 256 for B32, offset * 512 for B64).""" + + def test_ds_store_load_2addr_stride64_b32(self): + """DS_STORE_2ADDR_STRIDE64_B32: stores at ADDR + offset*256.""" + instructions = [ + v_mov_b32_e32(v[10], 0), # base addr = 0 + s_mov_b32(s[0], 0xAAAAAAAA), + v_mov_b32_e32(v[0], s[0]), # first value + s_mov_b32(s[0], 0xBBBBBBBB), + v_mov_b32_e32(v[1], s[0]), # second value + # Store with STRIDE64: offset0=1 -> addr 256, offset1=2 -> addr 512 + DS(DSOp.DS_STORE_2ADDR_STRIDE64_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=1, offset1=2), + s_waitcnt(lgkmcnt=0), + # Load back using STRIDE64 + DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B32, addr=v[10], vdst=v[2], offset0=1, offset1=2), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA, "v2 should have value from addr 256") + self.assertEqual(st.vgpr[0][3], 0xBBBBBBBB, "v3 should have value from addr 512") + + def test_ds_store_load_2addr_stride64_b64(self): + """DS_STORE_2ADDR_STRIDE64_B64: stores at ADDR + offset*512.""" + instructions = [ + v_mov_b32_e32(v[10], 0), # base addr = 0 + s_mov_b32(s[0], 0xDEADBEEF), + v_mov_b32_e32(v[0], s[0]), # first value low + s_mov_b32(s[0], 0xCAFEBABE), + v_mov_b32_e32(v[1], s[0]), # first value high + s_mov_b32(s[0], 0x12345678), + v_mov_b32_e32(v[2], s[0]), # second value low + s_mov_b32(s[0], 0x9ABCDEF0), + 
v_mov_b32_e32(v[3], s[0]), # second value high + # Store with STRIDE64: offset0=1 -> addr 512, offset1=2 -> addr 1024 + DS(DSOp.DS_STORE_2ADDR_STRIDE64_B64, addr=v[10], data0=v[0], data1=v[2], vdst=v[0], offset0=1, offset1=2), + s_waitcnt(lgkmcnt=0), + # Load back using STRIDE64 + DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B64, addr=v[10], vdst=v[4], offset0=1, offset1=2), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][4], 0xDEADBEEF, "v4 should have first low dword") + self.assertEqual(st.vgpr[0][5], 0xCAFEBABE, "v5 should have first high dword") + self.assertEqual(st.vgpr[0][6], 0x12345678, "v6 should have second low dword") + self.assertEqual(st.vgpr[0][7], 0x9ABCDEF0, "v7 should have second high dword") + + +class TestDSStorexchg(unittest.TestCase): + """Tests for DS_STOREXCHG (exchange) instructions.""" + + def test_ds_storexchg_rtn_b32(self): + """DS_STOREXCHG_RTN_B32: exchange value and return old.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[0], 0xAAAAAAAA), + v_mov_b32_e32(v[0], s[0]), # initial value + ds_store_b32(addr=v[10], data0=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[0], 0xBBBBBBBB), + v_mov_b32_e32(v[1], s[0]), # new value + DS(DSOp.DS_STOREXCHG_RTN_B32, addr=v[10], data0=v[1], vdst=v[2], offset0=0), + s_waitcnt(lgkmcnt=0), + ds_load_b32(addr=v[10], vdst=v[3], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0xAAAAAAAA, "v2 should have old value") + self.assertEqual(st.vgpr[0][3], 0xBBBBBBBB, "memory should have new value") + + def test_ds_storexchg_2addr_rtn_b32(self): + """DS_STOREXCHG_2ADDR_RTN_B32: exchange at two addresses (offset*4).""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[0], 0x11111111), + v_mov_b32_e32(v[0], s[0]), # initial at offset0 + s_mov_b32(s[0], 0x22222222), + v_mov_b32_e32(v[1], s[0]), # initial at offset1 + # Store initial values at offset 0 and 4 (offset0=0, offset1=1, each *4) + DS(DSOp.DS_STORE_2ADDR_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=0, offset1=1), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[0], 0xAAAAAAAA), + v_mov_b32_e32(v[2], s[0]), # new value for offset0 + s_mov_b32(s[0], 0xBBBBBBBB), + v_mov_b32_e32(v[3], s[0]), # new value for offset1 + # Exchange: write new values, return old + DS(DSOp.DS_STOREXCHG_2ADDR_RTN_B32, addr=v[10], data0=v[2], data1=v[3], vdst=v[4], offset0=0, offset1=1), + s_waitcnt(lgkmcnt=0), + # Load back to verify new values + DS(DSOp.DS_LOAD_2ADDR_B32, addr=v[10], vdst=v[6], offset0=0, offset1=1), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + # Return value: v4=old[0], v5=old[1] + self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have old value from offset0") + self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should have old value from offset1") + # Memory should have new values + self.assertEqual(st.vgpr[0][6], 0xAAAAAAAA, "v6 should have new value at offset0") + self.assertEqual(st.vgpr[0][7], 0xBBBBBBBB, "v7 should have new value at offset1") + + def test_ds_storexchg_2addr_stride64_rtn_b32(self): + """DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32: exchange at two addresses (offset*256).""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[0], 0x11111111), + v_mov_b32_e32(v[0], s[0]), + s_mov_b32(s[0], 0x22222222), + v_mov_b32_e32(v[1], s[0]), + # Store initial values at offset*256 + DS(DSOp.DS_STORE_2ADDR_STRIDE64_B32, addr=v[10], data0=v[0], data1=v[1], vdst=v[0], offset0=1, offset1=2), + 
s_waitcnt(lgkmcnt=0), + s_mov_b32(s[0], 0xAAAAAAAA), + v_mov_b32_e32(v[2], s[0]), + s_mov_b32(s[0], 0xBBBBBBBB), + v_mov_b32_e32(v[3], s[0]), + # Exchange + DS(DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32, addr=v[10], data0=v[2], data1=v[3], vdst=v[4], offset0=1, offset1=2), + s_waitcnt(lgkmcnt=0), + # Load back + DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B32, addr=v[10], vdst=v[6], offset0=1, offset1=2), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have old value") + self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should have old value") + self.assertEqual(st.vgpr[0][6], 0xAAAAAAAA, "v6 should have new value") + self.assertEqual(st.vgpr[0][7], 0xBBBBBBBB, "v7 should have new value") + + def test_ds_storexchg_rtn_b64(self): + """DS_STOREXCHG_RTN_B64: exchange 64-bit value and return old.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[0], 0xDEADBEEF), + v_mov_b32_e32(v[0], s[0]), # initial low + s_mov_b32(s[0], 0xCAFEBABE), + v_mov_b32_e32(v[1], s[0]), # initial high + DS(DSOp.DS_STORE_B64, addr=v[10], data0=v[0], vdst=v[0], offset0=0), + s_waitcnt(lgkmcnt=0), + s_mov_b32(s[0], 0x12345678), + v_mov_b32_e32(v[2], s[0]), # new low + s_mov_b32(s[0], 0x9ABCDEF0), + v_mov_b32_e32(v[3], s[0]), # new high + DS(DSOp.DS_STOREXCHG_RTN_B64, addr=v[10], data0=v[2], vdst=v[4], offset0=0), + s_waitcnt(lgkmcnt=0), + DS(DSOp.DS_LOAD_B64, addr=v[10], vdst=v[6], offset0=0), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][4], 0xDEADBEEF, "v4 should have old low dword") + self.assertEqual(st.vgpr[0][5], 0xCAFEBABE, "v5 should have old high dword") + self.assertEqual(st.vgpr[0][6], 0x12345678, "v6 should have new low dword") + self.assertEqual(st.vgpr[0][7], 0x9ABCDEF0, "v7 should have new high dword") + + def test_ds_store_load_2addr_stride64_b64_roundtrip(self): + """DS_STORE_2ADDR_STRIDE64_B64 followed by DS_LOAD_2ADDR_STRIDE64_B64 works correctly.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + s_mov_b32(s[0], 0x11111111), + v_mov_b32_e32(v[0], s[0]), + s_mov_b32(s[0], 0x22222222), + v_mov_b32_e32(v[1], s[0]), + DS(DSOp.DS_STORE_2ADDR_STRIDE64_B64, addr=v[10], data0=v[0], data1=v[0], vdst=v[0], offset0=1, offset1=2), + s_waitcnt(lgkmcnt=0), + DS(DSOp.DS_LOAD_2ADDR_STRIDE64_B64, addr=v[10], vdst=v[2], offset0=1, offset1=2), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][2], 0x11111111, "v2 should have val1 low") + self.assertEqual(st.vgpr[0][3], 0x22222222, "v3 should have val1 high") + self.assertEqual(st.vgpr[0][4], 0x11111111, "v4 should have val2 low") + self.assertEqual(st.vgpr[0][5], 0x22222222, "v5 should have val2 high") + + def test_ds_storexchg_2addr_stride64_rtn_b64_returns_old(self): + """DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64: returns old values correctly.""" + instructions = [ + v_mov_b32_e32(v[10], 0), + # Store initial values + s_mov_b32(s[0], 0x11111111), + v_mov_b32_e32(v[0], s[0]), + s_mov_b32(s[0], 0x22222222), + v_mov_b32_e32(v[1], s[0]), + DS(DSOp.DS_STORE_2ADDR_STRIDE64_B64, addr=v[10], data0=v[0], data1=v[0], vdst=v[0], offset0=1, offset1=2), + s_waitcnt(lgkmcnt=0), + # Exchange with new values + s_mov_b32(s[0], 0xAAAAAAAA), + v_mov_b32_e32(v[6], s[0]), + s_mov_b32(s[0], 0xBBBBBBBB), + v_mov_b32_e32(v[7], s[0]), + DS(DSOp.DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64, addr=v[10], data0=v[6], data1=v[6], vdst=v[8], offset0=1, offset1=2), + s_waitcnt(lgkmcnt=0), + ] + st = run_program(instructions, 
n_lanes=1) + # Return: v8-v11 = old values (4 dwords for 2x64-bit) + self.assertEqual(st.vgpr[0][8], 0x11111111, "v8 should have old val1 low") + self.assertEqual(st.vgpr[0][9], 0x22222222, "v9 should have old val1 high") + self.assertEqual(st.vgpr[0][10], 0x11111111, "v10 should have old val2 low") + self.assertEqual(st.vgpr[0][11], 0x22222222, "v11 should have old val2 high") + +class TestFLATAtomic(unittest.TestCase): + """Tests for FLAT and GLOBAL atomic instructions.""" + + # Helper to set up address in v[0:1] and clear after test + def _make_test(self, setup_instrs, atomic_instr, check_fn, test_offset=2000): + """Helper to create atomic test instructions.""" + instructions = [ + # Load output buffer address from args (saved in s[80:81] by prologue) + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + v_mov_b32_e32(v[0], s[2]), # addr low + v_mov_b32_e32(v[1], s[3]), # addr high + ] + setup_instrs + [atomic_instr, s_waitcnt(vmcnt=0), + # Clear address registers that differ between emu/hw + v_mov_b32_e32(v[0], 0), + v_mov_b32_e32(v[1], 0), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + check_fn(st) + + def test_flat_atomic_inc_u64_returns_old_value(self): + """FLAT_ATOMIC_INC_U64 should return full 64-bit old value.""" + TEST_OFFSET = 2000 + setup = [ + # Store initial 64-bit value: 0xCAFEBABE_DEADBEEF + s_mov_b32(s[0], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[0]), + s_mov_b32(s[0], 0xCAFEBABE), + v_mov_b32_e32(v[3], s[0]), + global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Threshold: 0xFFFFFFFF_FFFFFFFF + s_mov_b32(s[0], 0xFFFFFFFF), + v_mov_b32_e32(v[4], s[0]), + v_mov_b32_e32(v[5], s[0]), + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_INC_U64, addr=v[0], data=v[4], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][6], 0xDEADBEEF, "v6 should have old value low dword") + self.assertEqual(st.vgpr[0][7], 0xCAFEBABE, "v7 should have old value high dword") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_atomic_add_u32(self): + """FLAT_ATOMIC_ADD_U32 adds to memory and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 100), + v_mov_b32_e32(v[2], s[0]), + global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 50), + v_mov_b32_e32(v[3], s[0]), # add 50 + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_ADD_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][4], 100, "v4 should have old value (100)") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_atomic_sub_u32(self): + """FLAT_ATOMIC_SUB_U32 subtracts from memory and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 100), + v_mov_b32_e32(v[2], s[0]), + global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 30), + v_mov_b32_e32(v[3], s[0]), # sub 30 + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_SUB_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][4], 100, "v4 should have old value (100)") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_atomic_swap_b32(self): + """FLAT_ATOMIC_SWAP_B32 swaps memory value and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 
0xAAAAAAAA), + v_mov_b32_e32(v[2], s[0]), + global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 0xBBBBBBBB), + v_mov_b32_e32(v[3], s[0]), # new value + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_SWAP_B32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA, "v4 should have old value") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_atomic_and_b32(self): + """FLAT_ATOMIC_AND_B32 ANDs with memory and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 0xFF00FF00), + v_mov_b32_e32(v[2], s[0]), + global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 0xFFFF0000), + v_mov_b32_e32(v[3], s[0]), # AND mask + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_AND_B32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][4], 0xFF00FF00, "v4 should have old value") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_atomic_or_b32(self): + """FLAT_ATOMIC_OR_B32 ORs with memory and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 0x00FF0000), + v_mov_b32_e32(v[2], s[0]), + global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 0x0000FF00), + v_mov_b32_e32(v[3], s[0]), # OR mask + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_OR_B32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][4], 0x00FF0000, "v4 should have old value") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_atomic_xor_b32(self): + """FLAT_ATOMIC_XOR_B32 XORs with memory and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 0xAAAAAAAA), + v_mov_b32_e32(v[2], s[0]), + global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 0xFFFFFFFF), + v_mov_b32_e32(v[3], s[0]), # XOR mask + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_XOR_B32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][4], 0xAAAAAAAA, "v4 should have old value") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_atomic_min_u32(self): + """FLAT_ATOMIC_MIN_U32 stores min and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 100), + v_mov_b32_e32(v[2], s[0]), + global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 50), + v_mov_b32_e32(v[3], s[0]), # compare value (smaller) + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_MIN_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][4], 100, "v4 should have old value (100)") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_atomic_max_u32(self): + """FLAT_ATOMIC_MAX_U32 stores max and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 50), + v_mov_b32_e32(v[2], s[0]), + global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 100), + v_mov_b32_e32(v[3], s[0]), # compare value (larger) + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_MAX_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, 
offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][4], 50, "v4 should have old value (50)") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_atomic_inc_u32(self): + """FLAT_ATOMIC_INC_U32 increments and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 10), + v_mov_b32_e32(v[2], s[0]), + global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 100), # threshold + v_mov_b32_e32(v[3], s[0]), + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_INC_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][4], 10, "v4 should have old value (10)") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_atomic_dec_u32(self): + """FLAT_ATOMIC_DEC_U32 decrements and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 10), + v_mov_b32_e32(v[2], s[0]), + global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 100), # threshold + v_mov_b32_e32(v[3], s[0]), + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_DEC_U32, addr=v[0], data=v[3], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][4], 10, "v4 should have old value (10)") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_atomic_add_u64(self): + """FLAT_ATOMIC_ADD_U64 adds 64-bit value and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 0x11111111), + v_mov_b32_e32(v[2], s[0]), + s_mov_b32(s[0], 0x22222222), + v_mov_b32_e32(v[3], s[0]), + global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 0x00000001), # add 1 + v_mov_b32_e32(v[4], s[0]), + s_mov_b32(s[0], 0x00000000), + v_mov_b32_e32(v[5], s[0]), + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_ADD_U64, addr=v[0], data=v[4], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][6], 0x11111111, "v6 should have old value low") + self.assertEqual(st.vgpr[0][7], 0x22222222, "v7 should have old value high") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_atomic_swap_b64(self): + """FLAT_ATOMIC_SWAP_B64 swaps 64-bit value and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 0xAAAAAAAA), + v_mov_b32_e32(v[2], s[0]), + s_mov_b32(s[0], 0xBBBBBBBB), + v_mov_b32_e32(v[3], s[0]), + global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 0xCCCCCCCC), + v_mov_b32_e32(v[4], s[0]), + s_mov_b32(s[0], 0xDDDDDDDD), + v_mov_b32_e32(v[5], s[0]), + ] + atomic = FLAT(FLATOp.FLAT_ATOMIC_SWAP_B64, addr=v[0], data=v[4], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1) + def check(st): + self.assertEqual(st.vgpr[0][6], 0xAAAAAAAA, "v6 should have old value low") + self.assertEqual(st.vgpr[0][7], 0xBBBBBBBB, "v7 should have old value high") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_global_atomic_add_u32(self): + """GLOBAL_ATOMIC_ADD_U32 adds to memory and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 100), + v_mov_b32_e32(v[2], s[0]), + global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + s_mov_b32(s[0], 50), + v_mov_b32_e32(v[3], s[0]), + ] + atomic = FLAT(GLOBALOp.GLOBAL_ATOMIC_ADD_U32, addr=v[0], data=v[3], vdst=v[4], 
saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1, seg=2) + def check(st): + self.assertEqual(st.vgpr[0][4], 100, "v4 should have old value (100)") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_global_atomic_add_u64(self): + """GLOBAL_ATOMIC_ADD_U64 adds 64-bit value and returns old value.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 0xFFFFFFFF), + v_mov_b32_e32(v[2], s[0]), + s_mov_b32(s[0], 0x00000000), + v_mov_b32_e32(v[3], s[0]), + global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Add 1 to cause carry + s_mov_b32(s[0], 0x00000001), + v_mov_b32_e32(v[4], s[0]), + s_mov_b32(s[0], 0x00000000), + v_mov_b32_e32(v[5], s[0]), + ] + atomic = FLAT(GLOBALOp.GLOBAL_ATOMIC_ADD_U64, addr=v[0], data=v[4], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, glc=1, seg=2) + def check(st): + self.assertEqual(st.vgpr[0][6], 0xFFFFFFFF, "v6 should have old value low") + self.assertEqual(st.vgpr[0][7], 0x00000000, "v7 should have old value high") + self._make_test(setup, atomic, check, TEST_OFFSET) + + def test_flat_load_b32(self): + """FLAT_LOAD_B32 loads 32-bit value correctly.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[0]), + global_store_b32(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + ] + load = FLAT(FLATOp.FLAT_LOAD_B32, addr=v[0], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET) + def check(st): + self.assertEqual(st.vgpr[0][4], 0xDEADBEEF, "v4 should have loaded value") + instructions = [ + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + v_mov_b32_e32(v[0], s[2]), + v_mov_b32_e32(v[1], s[3]), + ] + setup + [load, s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], 0), + v_mov_b32_e32(v[1], 0), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + check(st) + + def test_flat_load_b64(self): + """FLAT_LOAD_B64 loads 64-bit value correctly.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[0]), + s_mov_b32(s[0], 0xCAFEBABE), + v_mov_b32_e32(v[3], s[0]), + global_store_b64(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + ] + load = FLAT(FLATOp.FLAT_LOAD_B64, addr=v[0], vdst=v[4], saddr=SrcEnum.NULL, offset=TEST_OFFSET) + def check(st): + self.assertEqual(st.vgpr[0][4], 0xDEADBEEF, "v4 should have loaded low dword") + self.assertEqual(st.vgpr[0][5], 0xCAFEBABE, "v5 should have loaded high dword") + instructions = [ + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + v_mov_b32_e32(v[0], s[2]), + v_mov_b32_e32(v[1], s[3]), + ] + setup + [load, s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], 0), + v_mov_b32_e32(v[1], 0), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + check(st) + + def test_flat_load_b96(self): + """FLAT_LOAD_B96 loads 96-bit (3 dword) value correctly.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 0x11111111), + v_mov_b32_e32(v[2], s[0]), + s_mov_b32(s[0], 0x22222222), + v_mov_b32_e32(v[3], s[0]), + s_mov_b32(s[0], 0x33333333), + v_mov_b32_e32(v[4], s[0]), + global_store_b96(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + ] + load = FLAT(FLATOp.FLAT_LOAD_B96, addr=v[0], vdst=v[5], saddr=SrcEnum.NULL, offset=TEST_OFFSET) + def check(st): + self.assertEqual(st.vgpr[0][5], 0x11111111, "v5 should have dword 0") + self.assertEqual(st.vgpr[0][6], 0x22222222, "v6 should have 
dword 1") + self.assertEqual(st.vgpr[0][7], 0x33333333, "v7 should have dword 2") + instructions = [ + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + v_mov_b32_e32(v[0], s[2]), + v_mov_b32_e32(v[1], s[3]), + ] + setup + [load, s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], 0), + v_mov_b32_e32(v[1], 0), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + check(st) + + def test_flat_load_b128(self): + """FLAT_LOAD_B128 loads 128-bit (4 dword) value correctly.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 0x11111111), + v_mov_b32_e32(v[2], s[0]), + s_mov_b32(s[0], 0x22222222), + v_mov_b32_e32(v[3], s[0]), + s_mov_b32(s[0], 0x33333333), + v_mov_b32_e32(v[4], s[0]), + s_mov_b32(s[0], 0x44444444), + v_mov_b32_e32(v[5], s[0]), + global_store_b128(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + ] + load = FLAT(FLATOp.FLAT_LOAD_B128, addr=v[0], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET) + def check(st): + self.assertEqual(st.vgpr[0][6], 0x11111111, "v6 should have dword 0") + self.assertEqual(st.vgpr[0][7], 0x22222222, "v7 should have dword 1") + self.assertEqual(st.vgpr[0][8], 0x33333333, "v8 should have dword 2") + self.assertEqual(st.vgpr[0][9], 0x44444444, "v9 should have dword 3") + instructions = [ + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + v_mov_b32_e32(v[0], s[2]), + v_mov_b32_e32(v[1], s[3]), + ] + setup + [load, s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], 0), + v_mov_b32_e32(v[1], 0), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + check(st) + + def test_global_load_b96(self): + """GLOBAL_LOAD_B96 loads 96-bit value correctly.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 0xAAAAAAAA), + v_mov_b32_e32(v[2], s[0]), + s_mov_b32(s[0], 0xBBBBBBBB), + v_mov_b32_e32(v[3], s[0]), + s_mov_b32(s[0], 0xCCCCCCCC), + v_mov_b32_e32(v[4], s[0]), + global_store_b96(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + ] + load = FLAT(GLOBALOp.GLOBAL_LOAD_B96, addr=v[0], vdst=v[5], saddr=SrcEnum.NULL, offset=TEST_OFFSET, seg=2) + def check(st): + self.assertEqual(st.vgpr[0][5], 0xAAAAAAAA, "v5 should have dword 0") + self.assertEqual(st.vgpr[0][6], 0xBBBBBBBB, "v6 should have dword 1") + self.assertEqual(st.vgpr[0][7], 0xCCCCCCCC, "v7 should have dword 2") + instructions = [ + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + v_mov_b32_e32(v[0], s[2]), + v_mov_b32_e32(v[1], s[3]), + ] + setup + [load, s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], 0), + v_mov_b32_e32(v[1], 0), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + check(st) + + def test_global_load_b128(self): + """GLOBAL_LOAD_B128 loads 128-bit value correctly.""" + TEST_OFFSET = 2000 + setup = [ + s_mov_b32(s[0], 0xDEADBEEF), + v_mov_b32_e32(v[2], s[0]), + s_mov_b32(s[0], 0xCAFEBABE), + v_mov_b32_e32(v[3], s[0]), + s_mov_b32(s[0], 0x12345678), + v_mov_b32_e32(v[4], s[0]), + s_mov_b32(s[0], 0x9ABCDEF0), + v_mov_b32_e32(v[5], s[0]), + global_store_b128(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + ] + load = FLAT(GLOBALOp.GLOBAL_LOAD_B128, addr=v[0], vdst=v[6], saddr=SrcEnum.NULL, offset=TEST_OFFSET, seg=2) + def check(st): + self.assertEqual(st.vgpr[0][6], 0xDEADBEEF, "v6 should have dword 0") + self.assertEqual(st.vgpr[0][7], 0xCAFEBABE, "v7 should have dword 1") + 
self.assertEqual(st.vgpr[0][8], 0x12345678, "v8 should have dword 2") + self.assertEqual(st.vgpr[0][9], 0x9ABCDEF0, "v9 should have dword 3") + instructions = [ + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + v_mov_b32_e32(v[0], s[2]), + v_mov_b32_e32(v[1], s[3]), + ] + setup + [load, s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], 0), + v_mov_b32_e32(v[1], 0), + s_mov_b32(s[2], 0), + s_mov_b32(s[3], 0), + ] + st = run_program(instructions, n_lanes=1) + check(st) + + +class TestGlobalStoreB64(unittest.TestCase): + """Tests for global_store_b64 instruction.""" + + def test_global_store_b64_basic(self): + """GLOBAL_STORE_B64 stores 8 bytes from v[n:n+1] to memory.""" + TEST_OFFSET = 256 + + instructions = [ + # Get output buffer address into s[2:3] + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + # Set up v[2:3] with known values + s_mov_b32(s[4], 0xDEADBEEF), + s_mov_b32(s[5], 0xCAFEBABE), + v_mov_b32_e32(v[2], s[4]), # v2 = 0xDEADBEEF (low dword) + v_mov_b32_e32(v[3], s[5]), # v3 = 0xCAFEBABE (high dword) + # Set up address + v_mov_b32_e32(v[0], 0), + # Store 64 bits + global_store_b64(addr=v[0], data=v[2], saddr=s[2], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Load it back as two 32-bit values + FLAT(GLOBALOp.GLOBAL_LOAD_B64, addr=v[0], vdst=v[4], data=v[4], saddr=s[2], offset=TEST_OFFSET, seg=2), + s_waitcnt(vmcnt=0), + # Copy to v[0:1] for capture + v_mov_b32_e32(v[0], v[4]), + v_mov_b32_e32(v[1], v[5]), + ] + st = run_program(instructions, n_lanes=1) + self.assertEqual(st.vgpr[0][0], 0xDEADBEEF, f"Low dword: expected 0xDEADBEEF, got 0x{st.vgpr[0][0]:08x}") + self.assertEqual(st.vgpr[0][1], 0xCAFEBABE, f"High dword: expected 0xCAFEBABE, got 0x{st.vgpr[0][1]:08x}") + + def test_global_store_b64_tril_pattern(self): + """Test the exact pattern from tril() kernel that was failing. + + The kernel does: + - global_load_u16 v0, v2, s[2:3] offset:3 (loads bytes 3,4) + - global_load_d16_hi_b16 v1, v1, s[2:3] offset:6 (loads bytes 6,7 into v1 hi16) + - global_load_u8 v3, v2, s[2:3] (loads byte 0) + - global_load_u8 v4, v2, s[2:3] offset:8 (loads byte 8) + - v_and_b32 v5, 0xffff, v0 + - v_lshlrev_b32 v0, 24, v0 + - v_lshrrev_b32 v5, 8, v5 + - v_or_b32 v0, v3, v0 + - v_or_b32 v1, v5, v1 + - global_store_b64 v2, v[0:1], s[0:1] (stores 8 bytes) + + For input all 0x01, the output at byte 5 should be 0x00. 
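+
+    Why byte 5: global_store_b64 writes v0 to output bytes 0-3 and v1 to bytes 4-7,
+    so output byte 5 is byte 1 of v1. v1 is zeroed before the loads, the d16_hi load
+    only fills its high 16 bits (output bytes 6,7), and the final v_or_b32 only sets
+    byte 0 (output byte 4), so byte 5 stays 0x00 as long as the load preserves
+    v1's low 16 bits.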
+ """ + TEST_OFFSET = 256 + + instructions = [ + # Get output buffer address into s[2:3] + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + # Store input data: 9 bytes of 0x01 + s_mov_b32(s[4], 0x01010101), + v_mov_b32_e32(v[10], s[4]), + v_mov_b32_e32(v[11], s[4]), + s_mov_b32(s[4], 0x01), + v_mov_b32_e32(v[12], s[4]), + v_mov_b32_e32(v[0], 0), + global_store_b64(addr=v[0], data=v[10], saddr=s[2], offset=TEST_OFFSET), + global_store_b8(addr=v[0], data=v[12], saddr=s[2], offset=TEST_OFFSET+8), + s_waitcnt(vmcnt=0), + + # Now execute the tril pattern + v_mov_b32_e32(v[2], 0), + v_mov_b32_e32(v[1], 0), + # Load bytes 3,4 as u16 + FLAT(GLOBALOp.GLOBAL_LOAD_U16, addr=v[2], vdst=v[0], data=v[0], saddr=s[2], offset=TEST_OFFSET+3, seg=2), + # Load bytes 6,7 into v1 hi16 + FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[1], saddr=s[2], offset=TEST_OFFSET+6, seg=2), + # Load byte 0 + FLAT(GLOBALOp.GLOBAL_LOAD_U8, addr=v[2], vdst=v[3], data=v[3], saddr=s[2], offset=TEST_OFFSET, seg=2), + # Load byte 8 + FLAT(GLOBALOp.GLOBAL_LOAD_U8, addr=v[2], vdst=v[4], data=v[4], saddr=s[2], offset=TEST_OFFSET+8, seg=2), + s_waitcnt(vmcnt=0), + + # Bit manipulation + v_and_b32_e32(v[5], 0xffff, v[0]), # v5 = v0 & 0xffff = 0x0101 + v_lshlrev_b32_e32(v[0], 24, v[0]), # v0 = v0 << 24 = 0x01000000 + v_lshrrev_b32_e32(v[5], 8, v[5]), # v5 = v5 >> 8 = 0x01 + v_or_b32_e32(v[0], v[3], v[0]), # v0 = v3 | v0 = 0x01000001 + v_or_b32_e32(v[1], v[5], v[1]), # v1 = v5 | v1 + + # Store to different location so we can read it back + global_store_b64(addr=v[2], data=v[0], saddr=s[2], offset=TEST_OFFSET+16), + s_waitcnt(vmcnt=0), + + # Load back to check + FLAT(GLOBALOp.GLOBAL_LOAD_B64, addr=v[2], vdst=v[6], data=v[6], saddr=s[2], offset=TEST_OFFSET+16, seg=2), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[6]), + v_mov_b32_e32(v[1], v[7]), + ] + st = run_program(instructions, n_lanes=1) + + # v0 should be 0x01000001 (bytes 0,1,2,3 = 01,00,00,01) + # v1 should be 0x01010001 (bytes 4,5,6,7 = 01,00,01,01) + v0 = st.vgpr[0][0] + v1 = st.vgpr[0][1] + self.assertEqual(v0, 0x01000001, f"v0: expected 0x01000001, got 0x{v0:08x}") + self.assertEqual(v1, 0x01010001, f"v1: expected 0x01010001, got 0x{v1:08x}") + + # Check individual bytes + byte5 = (v1 >> 8) & 0xff # This is the bug - should be 0x00 + self.assertEqual(byte5, 0x00, f"byte5 (position 1,2): expected 0x00, got 0x{byte5:02x}") + + +class TestD16HiLoads(unittest.TestCase): + """Tests for D16_HI load instructions that load into high 16 bits, preserving low 16 bits.""" + + def test_global_load_d16_hi_b16_preserves_low_bits(self): + """GLOBAL_LOAD_D16_HI_B16 must preserve low 16 bits of destination. + + Regression test for tril() bug where position (1,2) was incorrectly True. + The bug was that D16_HI loads were not preserving the low 16 bits of the + destination register. + """ + # Set up: store 0xCAFE at some memory location, then load it into high 16 bits + # of a register that has 0xBEEF in low 16 bits. Result should be 0xCAFEBEEF. 
+ TEST_OFFSET = 256 + + instructions = [ + # Get output buffer address into s[2:3] + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + # Set up address in v[0:1] + v_mov_b32_e32(v[0], s[2]), + v_mov_b32_e32(v[1], s[3]), + # Store 0xCAFE0000 at TEST_OFFSET (we'll load the low 16 bits as b16) + s_mov_b32(s[4], 0xCAFE), + v_mov_b32_e32(v[2], s[4]), + global_store_b16(addr=v[0], data=v[2], saddr=SrcEnum.NULL, offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Set destination register v[3] to have 0xBEEF in low 16 bits + s_mov_b32(s[4], 0x0000BEEF), + v_mov_b32_e32(v[3], s[4]), + # Load 16 bits from memory into HIGH 16 bits of v[3], preserving low 16 bits + FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[0], vdst=v[3], data=v[3], saddr=SrcEnum.NULL, offset=TEST_OFFSET, seg=2), + s_waitcnt(vmcnt=0), + # Copy result to v[0] for capture + v_mov_b32_e32(v[0], v[3]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][0] + # Expected: hi=0xCAFE (from memory), lo=0xBEEF (preserved) -> 0xCAFEBEEF + self.assertEqual(result, 0xCAFEBEEF, f"Expected 0xCAFEBEEF, got 0x{result:08x}") + + def test_global_load_d16_hi_b16_same_addr_and_dst_zero_addr(self): + """GLOBAL_LOAD_D16_HI_B16 with same register for addr and vdst, addr value=0. + + This is the exact pattern from tril() that was failing: + global_load_d16_hi_b16 v1, v1, s[2:3] offset:6 + + Where v1=0 is used as both the address offset and destination. + After the load, low 16 bits should remain 0, high 16 bits should have loaded data. + """ + TEST_OFFSET = 256 + + instructions = [ + # Get output buffer address into s[2:3] + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + # Store 0xCAFE at TEST_OFFSET + s_mov_b32(s[4], 0xCAFE), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[3], 0), # addr offset = 0 + global_store_b16(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Set v[1] to 0 (addr offset = 0, and this is what low 16 bits should stay as) + v_mov_b32_e32(v[1], 0), + # Load using v[1] as both addr and destination + FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[1], saddr=s[2], offset=TEST_OFFSET, seg=2), + s_waitcnt(vmcnt=0), + # Copy result to v[0] for capture + v_mov_b32_e32(v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][0] + # Expected: hi=0xCAFE (from memory), lo=0x0000 (preserved) -> 0xCAFE0000 + self.assertEqual(result, 0xCAFE0000, f"Expected 0xCAFE0000, got 0x{result:08x}") + + def test_global_load_d16_hi_b16_data_differs_from_vdst(self): + """GLOBAL_LOAD_D16_HI_B16 where data field differs from vdst. + + This is the ACTUAL pattern from tril() assembly: + global_load_d16_hi_b16 v1, v1, s[2:3] offset:6 + + The instruction encoding has: + vdst = v1 (destination register) + addr = v1 (address offset register) + data = v0 (data field - typically unused for loads but still encoded) + + The bug: emulator was reading VDATA from inst.data (v0) instead of inst.vdst (v1), + so low 16 bits of v0 were preserved instead of low 16 bits of v1. 
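+
+    Put as a sketch (using the inst.vdst / inst.data names from above):
+        correct: result = (mem16 << 16) | (VGPR[inst.vdst] & 0xFFFF)
+        buggy:   result = (mem16 << 16) | (VGPR[inst.data] & 0xFFFF)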
+ """ + TEST_OFFSET = 256 + + instructions = [ + # Get output buffer address into s[2:3] + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + # Store 0xCAFE at TEST_OFFSET + s_mov_b32(s[4], 0xCAFE), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[3], 0), + global_store_b16(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Set v[0] to a DIFFERENT value (0xDEAD) - this is the data field + # The bug would incorrectly preserve v[0]'s low bits instead of v[1]'s + s_mov_b32(s[4], 0x0000DEAD), + v_mov_b32_e32(v[0], s[4]), + # Set v[1] to 0 (this is vdst, whose low bits should be preserved) + v_mov_b32_e32(v[1], 0), + # Load using v[1] as addr AND vdst, but v[0] as data field + # Correct behavior: hi=0xCAFE (loaded), lo=0x0000 (from v1) -> 0xCAFE0000 + # Bug behavior: hi=0xCAFE (loaded), lo=0xDEAD (from v0) -> 0xCAFEDEAD + FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[0], saddr=s[2], offset=TEST_OFFSET, seg=2), + s_waitcnt(vmcnt=0), + # Copy result to v[0] for capture + v_mov_b32_e32(v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][0] + # Expected: hi=0xCAFE (from memory), lo=0x0000 (preserved from vdst v1) -> 0xCAFE0000 + # Bug would give: 0xCAFEDEAD (low bits from data field v0) + self.assertEqual(result, 0xCAFE0000, f"Expected 0xCAFE0000, got 0x{result:08x}") + + def test_global_load_d16_hi_b16_tril_exact_pattern(self): + """Exact pattern from tril() failure: data=v0 differs from vdst=v1, with v1 having non-zero low bits initially. + + Assembly from tril(): + v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0 + global_load_u16 v0, v2, s[2:3] offset:3 ; v0 = 0x0101 (loads 16 bits) + global_load_d16_hi_b16 v1, v1, s[2:3] offset:6 ; vdst=v1, addr=v1, data=v0 + ... + v_or_b32_e32 v1, v5, v1 + + The bug: since data=v0=0x0101 and vdst=v1=0, the emulator incorrectly + preserved v0's low bits (0x0101) instead of v1's low bits (0x0000). 
+ Result: v1 = 0x01010101 instead of 0x01010000 + """ + TEST_OFFSET = 256 + + instructions = [ + # Get output buffer address into s[2:3] + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + # Store test data: 0x0101 at offset, 0x0101 at offset+3 + s_mov_b32(s[4], 0x0101), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[3], 0), + global_store_b16(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET), + global_store_b16(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET + 3), + s_waitcnt(vmcnt=0), + # Replicate tril() pattern: + # v2 = 0, v1 = 0 + v_mov_b32_e32(v[2], 0), + v_mov_b32_e32(v[1], 0), + # global_load_u16 v0, v2, s[2:3] offset:3 -> v0 gets 0x0101 + FLAT(GLOBALOp.GLOBAL_LOAD_U16, addr=v[2], vdst=v[0], data=v[0], saddr=s[2], offset=TEST_OFFSET, seg=2), + s_waitcnt(vmcnt=0), + # global_load_d16_hi_b16 v1, v1, s[2:3] offset:6 -> vdst=v1, addr=v1, data=v0 + # This should load 0x0101 into high 16 bits of v1, preserving low 16 bits (0x0000) + # Result should be 0x01010000, NOT 0x01010101 + FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_B16, addr=v[1], vdst=v[1], data=v[0], saddr=s[2], offset=TEST_OFFSET + 3, seg=2), + s_waitcnt(vmcnt=0), + # Copy v1 to v[0] for capture + v_mov_b32_e32(v[0], v[1]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][0] + # Expected: hi=0x0101 (from memory), lo=0x0000 (preserved from vdst v1) -> 0x01010000 + # Bug would give: 0x01010101 (low bits from data field v0) + self.assertEqual(result, 0x01010000, f"Expected 0x01010000, got 0x{result:08x}") + + def test_global_load_d16_hi_u8_data_differs_from_vdst(self): + """GLOBAL_LOAD_D16_HI_U8 where data field differs from vdst. + + Similar to B16 test but loads unsigned 8 bits into high 16 bits. + The bug: emulator reads VDATA from inst.data instead of inst.vdst. + """ + TEST_OFFSET = 256 + + instructions = [ + # Get output buffer address into s[2:3] + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + # Store 0xAB at TEST_OFFSET (single byte) + s_mov_b32(s[4], 0xAB), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[3], 0), + global_store_b8(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Set v[4] to 0xDEAD (data field - should NOT affect result) + s_mov_b32(s[4], 0x0000DEAD), + v_mov_b32_e32(v[4], s[4]), + # Set v[5] to 0xBEEF (vdst - low bits should be preserved) + s_mov_b32(s[4], 0x0000BEEF), + v_mov_b32_e32(v[5], s[4]), + # v[3] = 0 for address offset + v_mov_b32_e32(v[3], 0), + # Load 8 bits into high 16 bits of v[5], preserving low 16 bits + # Correct: hi=0x00AB (zero-extended), lo=0xBEEF -> 0x00ABBEEF + # Bug: hi=0x00AB, lo=0xDEAD (from v4) -> 0x00ABDEAD + FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_U8, addr=v[3], vdst=v[5], data=v[4], saddr=s[2], offset=TEST_OFFSET, seg=2), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[5]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][0] + self.assertEqual(result, 0x00ABBEEF, f"Expected 0x00ABBEEF, got 0x{result:08x}") + + def test_global_load_d16_hi_i8_data_differs_from_vdst(self): + """GLOBAL_LOAD_D16_HI_I8 where data field differs from vdst. + + Loads signed 8 bits (sign-extended to 16 bits) into high 16 bits. + The bug: emulator reads VDATA from inst.data instead of inst.vdst. 
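+
+    Unlike the U8 variant (which zero-extends, 0xAB -> 0x00AB), the I8 variant
+    sign-extends the loaded byte, so 0x80 becomes 0xFF80 in the high half.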
+ """ + TEST_OFFSET = 256 + + instructions = [ + # Get output buffer address into s[2:3] + s_load_b64(s[2:3], s[80], 0, soffset=SrcEnum.NULL), + s_waitcnt(lgkmcnt=0), + # Store 0x80 at TEST_OFFSET (negative signed byte = -128) + s_mov_b32(s[4], 0x80), + v_mov_b32_e32(v[2], s[4]), + v_mov_b32_e32(v[3], 0), + global_store_b8(addr=v[3], data=v[2], saddr=s[2], offset=TEST_OFFSET), + s_waitcnt(vmcnt=0), + # Set v[4] to 0xDEAD (data field - should NOT affect result) + s_mov_b32(s[4], 0x0000DEAD), + v_mov_b32_e32(v[4], s[4]), + # Set v[5] to 0xBEEF (vdst - low bits should be preserved) + s_mov_b32(s[4], 0x0000BEEF), + v_mov_b32_e32(v[5], s[4]), + # v[3] = 0 for address offset + v_mov_b32_e32(v[3], 0), + # Load signed 8 bits into high 16 bits of v[5], preserving low 16 bits + # 0x80 sign-extended to 16 bits = 0xFF80 + # Correct: hi=0xFF80, lo=0xBEEF -> 0xFF80BEEF + # Bug: hi=0xFF80, lo=0xDEAD (from v4) -> 0xFF80DEAD + FLAT(GLOBALOp.GLOBAL_LOAD_D16_HI_I8, addr=v[3], vdst=v[5], data=v[4], saddr=s[2], offset=TEST_OFFSET, seg=2), + s_waitcnt(vmcnt=0), + v_mov_b32_e32(v[0], v[5]), + ] + st = run_program(instructions, n_lanes=1) + result = st.vgpr[0][0] + self.assertEqual(result, 0xFF80BEEF, f"Expected 0xFF80BEEF, got 0x{result:08x}") + + +if __name__ == '__main__': + unittest.main()