From 616e9c1483333162bdecdffcde03e31b8dddea0c Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Sat, 31 Jan 2026 08:34:14 -0500 Subject: [PATCH] CDNA assembly gemm in tensor.py with flag (#14310) * work * work * the assembly * remove the old one * remove ws bufs, assert splitk * notes cleanup * work * gemm args * gemm in mixins would be nice * add gemm gradient * print counters * the realize is for DEBUG=2 aesthetics * dedup * rewrite to python dsl, no list copies * leave that * add B, M, N, K to gemm name * it's M0 not NULL * fp16 support * test cleanup + more gemms * work from viz * more work * gemm batch_size * xccg path work * tiny comments on the label naming * s_waitcnt --- .../tinybox_8xMI350X/dev_run.sh | 1 + extra/gemm/asm/cdna/asm.py | 11517 ++++++++++++++++ extra/gemm/asm/cdna/gemm.py | 95 + extra/gemm/asm/cdna/gemm.s | 1621 --- extra/gemm/asm/cdna/template.s | 78 - extra/gemm/asm/cdna/test.py | 73 - extra/gemm/asm/cdna/test_asm_gemm.py | 46 + tinygrad/helpers.py | 2 + tinygrad/tensor.py | 5 +- tinygrad/uop/ops.py | 2 +- tinygrad/viz/serve.py | 2 +- 11 files changed, 11667 insertions(+), 1775 deletions(-) create mode 100644 extra/gemm/asm/cdna/asm.py create mode 100644 extra/gemm/asm/cdna/gemm.py delete mode 100644 extra/gemm/asm/cdna/gemm.s delete mode 100644 extra/gemm/asm/cdna/template.s delete mode 100644 extra/gemm/asm/cdna/test.py create mode 100644 extra/gemm/asm/cdna/test_asm_gemm.py diff --git a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh index 87e303851e..756774b7b0 100755 --- a/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh +++ b/examples/mlperf/training_submission_v6.0/tinycorp/benchmarks/llama8b/implementations/tinybox_8xMI350X/dev_run.sh @@ -10,6 +10,7 @@ 
class Kernel:
  """Accumulates encoded instructions and labels for one kernel and renders them.

  Attributes:
    name: symbol name used for the kernel in the generated assembly.
    instructions: emitted instruction objects, in program order.
    labels: label name -> byte offset of the instruction that follows the label.
    label_at_pos: byte offset -> most recently defined label name at that offset
      (kept for compatibility; `to_text` uses `labels` so no label is lost).
    pos: running byte offset, i.e. the sum of `size()` of everything emitted so far.
  """
  def __init__(self, name="gemm"):
    self.name, self.instructions, self.labels, self.label_at_pos, self.pos = name, [], {}, {}, 0

  def label(self, name):
    """Bind `name` to the current byte offset (the next emitted instruction)."""
    self.labels[name] = self.pos
    self.label_at_pos[self.pos] = name

  def emit(self, inst, target=None):
    """Append `inst`, recording its byte offset and optional branch target label.

    `target` is a label name; the branch offset is resolved later in `to_asm`,
    so forward references are allowed. Returns `inst` for convenience.
    """
    self.instructions.append(inst)
    inst._target, inst._pos = target, self.pos
    self.pos += inst.size()
    return inst

  def waitcnt(self, lgkm=None, vm=None):
    """Emit an s_waitcnt; a counter left as None is set to its all-ones (63 / 15) value."""
    vmcnt = 63 if vm is None else vm
    lgkmcnt = 15 if lgkm is None else lgkm
    expcnt = 7
    # vmcnt is split: low 4 bits at [3:0], high 2 bits at [15:14]; expcnt at [6:4]; lgkmcnt at [11:8]
    waitcnt = (vmcnt & 0xF) | ((expcnt & 0x7) << 4) | ((lgkmcnt & 0xF) << 8) | (((vmcnt >> 4) & 0x3) << 14)
    self.emit(s_waitcnt(waitcnt))

  def to_asm(self) -> str:
    """Resolve branch targets and return the kernel as assembler source text.

    Instructions are written as raw `.byte` rows, followed by the amdhsa kernel
    descriptor directives and the amdgpu metadata block.

    Raises:
      KeyError: if an instruction was emitted with a `target` that was never defined via `label`.
    """
    # patch branches: simm16 is relative to the end of the branch instruction, in units of 4 bytes
    for inst in self.instructions:
      if inst._target is None: continue
      if inst._target not in self.labels:
        raise KeyError(f"branch at byte offset {inst._pos} targets undefined label {inst._target!r}")
      inst.simm16 = (self.labels[inst._target] - inst._pos - inst.size()) // 4
    # convert instructions to bytes, 16 bytes per .byte row
    # NOTE(review): leading whitespace inside the string literals below is reproduced as found; confirm the
    # indentation matches what the assembler / metadata parser expects.
    inst_bytes = b"".join(inst.to_bytes() for inst in self.instructions)
    body = "\n".join(" .byte " + ",".join(f"0x{b:02x}" for b in inst_bytes[i:i+16]) for i in range(0, len(inst_bytes), 16))
    # kernel descriptor fields, emitted as `.amdhsa_<key> <value>` in this order
    hsa = [('group_segment_fixed_size', 133120), ('private_segment_fixed_size', 0), ('kernarg_size', 24),
           ('next_free_vgpr', 512), ('next_free_sgpr', 96), ('system_sgpr_workgroup_id_x', 1),
           ('system_sgpr_workgroup_id_y', 1), ('system_sgpr_workgroup_id_z', 1), ('user_sgpr_kernarg_segment_ptr', 1),
           ('user_sgpr_count', 2), ('user_sgpr_kernarg_preload_length', 0), ('user_sgpr_kernarg_preload_offset', 0),
           ('accum_offset', 256), ('uses_dynamic_stack', 0), ('tg_split', 0), ('float_round_mode_32', 0),
           ('float_round_mode_16_64', 0), ('float_denorm_mode_32', 3), ('float_denorm_mode_16_64', 3),
           ('ieee_mode', 1), ('fp16_overflow', 0), ('dx10_clamp', 1)]
    # three 8-byte global buffer pointers at offsets 0, 8, 16 (matches kernarg_size 24)
    args = '\n'.join(f' - .address_space: generic\n .name: {n}\n .offset: {i*8}\n'
                     f' .size: 8\n .value_kind: global_buffer' for i,n in enumerate(['C', 'A', 'B']))
    n = self.name
    return '\n'.join(['.text', '.section\t.text.', f'.global\t{n}', '.p2align\t8', f'.type\t{n},@function', '', f'{n}:',
                      body, '', '.section .rodata,"a",@progbits', '.p2align 6, 0x0', f'.amdhsa_kernel {n}',
                      *[f' .amdhsa_{k} {v}' for k, v in hsa], '.end_amdhsa_kernel', '', '.amdgpu_metadata', '---', 'amdhsa.kernels:',
                      ' - .args:', args, ' .group_segment_fixed_size: 133120', ' .kernarg_segment_align: 8',
                      ' .kernarg_segment_size: 24', ' .max_flat_workgroup_size: 256', f' .name: {n}',
                      ' .private_segment_fixed_size: 0', ' .sgpr_count: 95', ' .sgpr_spill_count: 0', f' .symbol: {n}.kd',
                      ' .vgpr_count: 249', ' .vgpr_spill_count: 0', ' .wavefront_size: 64', 'amdhsa.version:', ' - 1',
                      ' - 1', '...', '.end_amdgpu_metadata', ''])

  # outputs readable source code for this kernel
  def to_text(self) -> str:
    """Return a listing with `label:` lines interleaved with one line per instruction.

    Branches are printed as `<op name lowercased> <label>` instead of their disassembly.
    Every label in `labels` is printed at the offset it currently resolves to, including
    several labels sharing one offset and labels placed after the last instruction.
    """
    # group by offset from `labels` (insertion ordered) so labels sharing an offset are all kept
    names_at: dict[int, list[str]] = {}
    for label_name, offset in self.labels.items(): names_at.setdefault(offset, []).append(label_name)
    lines, pos = [], 0
    for inst in self.instructions:
      lines.extend(f"{label_name}:" for label_name in names_at.pop(pos, ()))
      lines.append(f" {inst.disasm()}" if inst._target is None else f" {inst.op_name.lower()} {inst._target}")
      pos += inst.size()
    # labels bound to the end of the instruction stream
    lines.extend(f"{label_name}:" for label_name in names_at.pop(pos, ()))
    return "\n".join(lines)
by K-iterations, not tiles) + # WGM = WorkGroup Mapping (tile assignment scheme for cache locality) + # GLVW = Global Load Vector Width (edge tile width handling) + # BM0 = Block M offset 0 (register block position) + # OrdNLL = Ordered No-Load-Loop (final iteration without prefetch loads) + k.emit(s_mov_b32(s[75], 256)) + k.emit(v_cvt_f32_u32_e32(v[18], s[75])) + k.emit(v_rcp_iflag_f32_e32(v[18], v[18])) + k.emit(v_cvt_f32_u32_e32(v[19], s[2])) + k.emit(v_mul_f32_e32(v[18], v[18], v[19])) + k.emit(v_cvt_u32_f32_e32(v[18], v[18])) + k.emit(v_mul_u32_u24_e64(v[19], v[18], s[75])) + k.emit(v_sub_u32_e32(v[19], s[2], v[19])) + k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[75])) + k.emit(v_add_u32_e32(v[18], 1, v[18])) + k.emit(v_mov_b32_e32(v[19], 0)) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[75])) + k.emit(v_sub_u32_e64(v[18], v[18], 1)) + k.emit(v_mul_u32_u24_e64(v[19], v[18], s[75])) + k.emit(v_sub_u32_e32(v[19], s[2], v[19])) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_readfirstlane_b32_e32(v[71], v[18])) + k.emit(v_readfirstlane_b32_e32(v[72], v[19])) + k.emit(s_mul_i32(s[71], s[71], s[75])) + k.emit(s_lshr_b32(s[72], s[72], 1)) + k.emit(s_add_u32(s[71], s[71], s[72])) + k.emit(v_cvt_f32_u32_e32(v[18], s[75])) + k.emit(v_rcp_iflag_f32_e32(v[18], v[18])) + k.emit(v_cvt_f32_u32_e32(v[19], s[69])) + k.emit(v_mul_f32_e32(v[18], v[18], v[19])) + k.emit(v_cvt_u32_f32_e32(v[18], v[18])) + k.emit(v_mul_u32_u24_e64(v[19], v[18], s[75])) + k.emit(v_sub_u32_e32(v[19], s[69], v[19])) + k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[75])) + k.emit(v_add_u32_e32(v[18], 1, v[18])) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[75])) + k.emit(v_sub_u32_e64(v[18], v[18], 1)) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_readfirstlane_b32_e32(v[72], v[18])) + k.emit(s_mul_i32(s[72], s[72], s[75])) + k.emit(s_sub_u32(s[73], s[69], s[72])) + k.emit(s_cmp_gt_u32(s[2], s[72])) + k.emit(s_cselect_b32(s[72], s[73], s[75])) + k.emit(s_lshr_b32(s[72], 
s[72], 1)) + k.emit(s_bfm_b32(s[73], 1, 0)) + k.emit(s_and_b32(s[73], s[2], s[73])) + k.emit(s_mul_i32(s[72], s[72], s[73])) + k.emit(s_add_u32(s[2], s[71], s[72])) + k.label('skip_WGMXCC') + k.emit(v_mov_b32_e32(v[20], 256)) + k.emit(v_mov_b32_e32(v[19], s[20])) + k.emit(v_cvt_f32_u32_e32(v[18], v[20])) + k.emit(v_rcp_iflag_f32_e32(v[18], v[18])) + k.emit(v_cvt_f32_u32_e32(v[21], v[19])) + k.emit(v_mul_f32_e32(v[18], v[18], v[21])) + k.emit(v_cvt_u32_f32_e32(v[18], v[18])) + k.emit(v_mul_u32_u24_e32(v[21], v[18], v[20])) + k.emit(v_sub_u32_e32(v[21], v[19], v[21])) + k.emit(v_cmp_ne_u32_e64(VCC, v[21], 0)) + k.emit(v_addc_co_u32(v[18], VCC, v[18], 0, VCC)) + k.emit(v_mov_b32_e32(v[20], 256)) + k.emit(v_mov_b32_e32(v[19], s[21])) + k.emit(v_readfirstlane_b32_e32(v[10], v[18])) + k.emit(v_cvt_f32_u32_e32(v[18], v[20])) + k.emit(v_rcp_iflag_f32_e32(v[18], v[18])) + k.emit(v_cvt_f32_u32_e32(v[21], v[19])) + k.emit(v_mul_f32_e32(v[18], v[18], v[21])) + k.emit(v_cvt_u32_f32_e32(v[18], v[18])) + k.emit(v_mul_u32_u24_e32(v[21], v[18], v[20])) + k.emit(v_sub_u32_e32(v[21], v[19], v[21])) + k.emit(v_cmp_ne_u32_e64(VCC, v[21], 0)) + k.emit(v_addc_co_u32(v[18], VCC, v[18], 0, VCC)) + k.emit(s_nop()) + k.emit(v_readfirstlane_b32_e32(v[11], v[18])) + k.waitcnt(lgkm=0) + k.emit(s_mov_b32(s[85], 84148480)) + k.emit(s_mov_b32(s[86], 117834498)) + k.emit(s_sub_u32(s[28], s[28], 16)) + k.emit(s_subb_u32(s[29], s[29], 0)) + k.emit(s_sub_u32(s[30], s[30], 16)) + k.emit(s_subb_u32(s[31], s[31], 0)) + k.label('AlphaNonZero') + k.emit(s_mov_b32(s[57], s[2])) + k.emit(s_mul_i32(s[58], s[57], s[46])) + k.emit(s_mov_b32(s[59], s[49])) + k.emit(s_mul_i32(s[87], s[52], s[46])) + k.emit(s_cmp_lt_u32(s[87], s[49])) + k.emit(s_cbranch_scc1(), target='SK_InitDone') + k.emit(s_mul_i32(s[87], s[52], s[46])) + k.emit(s_mul_i32(s[88], s[46], s[51])) + k.emit(s_sub_u32(s[87], s[87], s[88])) + k.emit(s_mul_i32(s[58], s[57], s[46])) + k.emit(s_add_u32(s[58], s[58], s[87])) + k.emit(s_add_u32(s[59], 
s[58], s[46])) + k.emit(s_add_u32(s[89], s[46], 1)) + k.emit(s_mul_i32(s[88], s[57], s[89])) + k.emit(s_add_u32(s[89], s[88], s[89])) + k.emit(s_cmp_lt_u32(s[57], s[87])) + k.emit(s_cselect_b32(s[58], s[88], s[58])) + k.emit(s_cselect_b32(s[59], s[89], s[59])) + k.emit(s_mul_i32(s[87], s[52], s[46])) + k.emit(s_min_u32(s[59], s[59], s[87])) + k.label('SK_InitDone') + k.emit(s_cmp_ge_u32(s[58], s[49])) + k.emit(s_cbranch_scc1(), target='KernelEnd') + k.label('PersistentLoopStart') + k.emit(v_xor_b32_e32(v[18], v[178], v[16])) + k.emit(v_min_i32_e32(v[16], v[16], v[18])) + k.emit(v_xor_b32_e32(v[18], v[179], v[17])) + k.emit(v_min_i32_e32(v[17], v[17], v[18])) + k.emit(s_mul_hi_u32(s[89], s[58], s[47])) + k.emit(s_lshr_b32(s[90], s[48], 31)) + k.emit(s_mul_i32(s[88], s[58], s[90])) + k.emit(s_add_u32(s[88], s[88], s[89])) + k.emit(s_and_b32(s[90], s[48], 2147483647)) + k.emit(s_lshr_b32(s[88], s[88], s[90])) + k.emit(s_mul_i32(s[89], s[88], s[46])) + k.emit(s_add_u32(s[90], s[89], s[46])) + k.emit(s_sub_u32(s[60], s[58], s[89])) + k.emit(s_min_u32(s[61], s[59], s[90])) + k.emit(s_sub_u32(s[61], s[61], s[89])) + k.emit(s_mul_i32(s[91], s[52], s[46])) + k.emit(s_sub_u32(s[91], s[49], s[91])) + k.emit(s_mul_i32(s[89], s[51], s[46])) + k.emit(s_add_u32(s[89], s[89], s[58])) + k.emit(s_cmp_lt_u32(s[89], s[91])) + k.emit(s_cbranch_scc1(), target='NoBranch_8G3ZEUE1ZDJOP9IU') + k.emit(s_mov_b32(s[89], s[90])) + k.emit(s_cmp_le_u32(s[91], s[58])) + k.emit(s_cbranch_scc1(), target='NoBranch_8G3ZEUE1ZDJOP9IU') + k.emit(s_mul_i32(s[87], s[52], s[46])) + k.emit(s_mul_i32(s[92], s[46], s[51])) + k.emit(s_sub_u32(s[87], s[87], s[92])) + k.emit(s_mul_i32(s[58], s[57], s[46])) + k.emit(s_add_u32(s[58], s[58], s[87])) + k.emit(s_add_u32(s[59], s[58], s[46])) + k.emit(s_add_u32(s[93], s[46], 1)) + k.emit(s_mul_i32(s[92], s[57], s[93])) + k.emit(s_add_u32(s[93], s[92], s[93])) + k.emit(s_cmp_lt_u32(s[57], s[87])) + k.emit(s_cselect_b32(s[58], s[92], s[58])) + k.emit(s_cselect_b32(s[59], 
s[93], s[59])) + k.emit(s_add_u32(s[89], s[58], s[91])) + k.emit(s_add_u32(s[59], s[59], s[91])) + k.emit(s_min_u32(s[59], s[59], s[49])) + k.emit(s_cmp_ge_u32(s[58], s[49])) + k.emit(s_cbranch_scc1(), target='KernelEnd') + k.label('NoBranch_8G3ZEUE1ZDJOP9IU') + k.emit(s_mov_b32(s[58], s[89])) + k.emit(s_mul_i32(s[89], s[10], s[11])) + k.emit(v_cvt_f32_u32_e32(v[18], s[89])) + k.emit(v_rcp_iflag_f32_e32(v[18], v[18])) + k.emit(v_cvt_f32_u32_e32(v[19], s[88])) + k.emit(v_mul_f32_e32(v[18], v[18], v[19])) + k.emit(v_cvt_u32_f32_e32(v[18], v[18])) + k.emit(v_mul_u32_u24_e64(v[19], v[18], s[89])) + k.emit(v_sub_u32_e32(v[19], s[88], v[19])) + k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[89])) + k.emit(v_add_u32_e32(v[18], 1, v[18])) + k.emit(v_mov_b32_e32(v[19], 0)) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[89])) + k.emit(v_sub_u32_e64(v[18], v[18], 1)) + k.emit(v_mul_u32_u24_e64(v[19], v[18], s[89])) + k.emit(v_sub_u32_e32(v[19], s[88], v[19])) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_readfirstlane_b32_e32(v[4], v[18])) + k.emit(v_readfirstlane_b32_e32(v[90], v[19])) + k.emit(v_cvt_f32_u32_e32(v[18], s[10])) + k.emit(v_rcp_iflag_f32_e32(v[18], v[18])) + k.emit(v_cvt_f32_u32_e32(v[19], s[90])) + k.emit(v_mul_f32_e32(v[18], v[18], v[19])) + k.emit(v_cvt_u32_f32_e32(v[18], v[18])) + k.emit(v_mul_u32_u24_e64(v[19], v[18], s[10])) + k.emit(v_sub_u32_e32(v[19], s[90], v[19])) + k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[10])) + k.emit(v_add_u32_e32(v[18], 1, v[18])) + k.emit(v_mov_b32_e32(v[19], 0)) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[10])) + k.emit(v_sub_u32_e64(v[18], v[18], 1)) + k.emit(v_mul_u32_u24_e64(v[19], v[18], s[10])) + k.emit(v_sub_u32_e32(v[19], s[90], v[19])) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_readfirstlane_b32_e32(v[3], v[18])) + k.emit(v_readfirstlane_b32_e32(v[2], v[19])) + k.label('SKAlphaCheck') + k.emit(s_mov_b32(s[91], 16)) + k.emit(v_cvt_f32_u32_e32(v[18], s[91])) + 
k.emit(v_rcp_iflag_f32_e32(v[18], v[18])) + k.emit(v_cvt_f32_u32_e32(v[19], s[3])) + k.emit(v_mul_f32_e32(v[18], v[18], v[19])) + k.emit(v_cvt_u32_f32_e32(v[18], v[18])) + k.emit(v_mul_u32_u24_e64(v[19], v[18], s[91])) + k.emit(v_sub_u32_e32(v[19], s[3], v[19])) + k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[91])) + k.emit(v_add_u32_e32(v[18], 1, v[18])) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[91])) + k.emit(v_sub_u32_e64(v[18], v[18], 1)) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_readfirstlane_b32_e32(v[87], v[18])) + k.emit(s_mul_i32(s[90], s[87], s[91])) + k.emit(s_sub_u32(s[90], s[3], s[90])) + k.emit(s_mul_i32(s[90], s[90], s[10])) + k.emit(s_add_u32(s[90], s[90], s[2])) + k.emit(v_cvt_f32_u32_e32(v[18], s[91])) + k.emit(v_rcp_iflag_f32_e32(v[18], v[18])) + k.emit(v_cvt_f32_u32_e32(v[19], s[11])) + k.emit(v_mul_f32_e32(v[18], v[18], v[19])) + k.emit(v_cvt_u32_f32_e32(v[18], v[18])) + k.emit(v_mul_u32_u24_e64(v[19], v[18], s[91])) + k.emit(v_sub_u32_e32(v[19], s[11], v[19])) + k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[91])) + k.emit(v_add_u32_e32(v[18], 1, v[18])) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[91])) + k.emit(v_sub_u32_e64(v[18], v[18], 1)) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_readfirstlane_b32_e32(v[88], v[18])) + k.emit(s_mul_i32(s[89], s[91], s[88])) + k.emit(s_sub_u32(s[89], s[11], s[89])) + k.emit(s_cmp_eq_u32(s[89], 0)) + k.emit(s_cmov_b32(s[89], s[91])) + k.emit(s_cmp_ge_u32(s[87], s[88])) + k.emit(s_cselect_b32(s[88], s[89], s[91])) + k.emit(v_cvt_f32_u32_e32(v[18], s[88])) + k.emit(v_rcp_iflag_f32_e32(v[18], v[18])) + k.emit(v_cvt_f32_u32_e32(v[19], s[90])) + k.emit(v_mul_f32_e32(v[18], v[18], v[19])) + k.emit(v_cvt_u32_f32_e32(v[18], v[18])) + k.emit(v_mul_u32_u24_e64(v[19], v[18], s[88])) + k.emit(v_sub_u32_e32(v[19], s[90], v[19])) + k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[88])) + k.emit(v_add_u32_e32(v[18], 1, v[18])) + k.emit(v_mov_b32_e32(v[19], 0)) + k.emit(s_mov_b64(EXEC, -1)) 
+ k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[88])) + k.emit(v_sub_u32_e64(v[18], v[18], 1)) + k.emit(v_mul_u32_u24_e64(v[19], v[18], s[88])) + k.emit(v_sub_u32_e32(v[19], s[90], v[19])) + k.emit(s_mov_b64(EXEC, -1)) + k.emit(v_readfirstlane_b32_e32(v[2], v[18])) + k.emit(v_readfirstlane_b32_e32(v[3], v[19])) + k.emit(s_mul_i32(s[3], s[2], s[88])) + k.emit(s_sub_u32(s[3], s[90], s[3])) + k.emit(s_mul_i32(s[87], s[87], s[91])) + k.emit(s_add_u32(s[3], s[3], s[87])) + k.label('WGM') + k.emit(v_and_b32_e32(v[19], 63, v[180])) + k.emit(v_and_b32_e32(v[18], 15, v[19])) + k.emit(v_lshlrev_b32_e32(v[18], 3, v[18])) + k.emit(v_lshrrev_b32_e32(v[19], 4, v[19])) + k.emit(v_lshl_add_u32_e64(v[18], v[19], 11, v[18])) + k.emit(v_lshrrev_b32_e32(v[22], 6, v[180])) + k.emit(v_and_b32_e32(v[22], 1, v[22])) + k.emit(v_lshl_add_u32_e64(v[18], v[22], 7, v[18])) + k.emit(v_and_b32_e32(v[20], 63, v[180])) + k.emit(v_and_b32_e32(v[19], 15, v[20])) + k.emit(v_lshlrev_b32_e32(v[19], 6, v[19])) + k.emit(v_lshlrev_b32_e32(v[19], 3, v[19])) + k.emit(v_lshrrev_b32_e32(v[20], 4, v[20])) + k.emit(v_lshl_add_u32_e64(v[19], v[20], 3, v[19])) + k.emit(v_lshrrev_b32_e32(v[21], 7, v[180])) + k.emit(v_and_b32_e32(v[21], 1, v[21])) + k.emit(v_lshl_add_u32_e64(v[19], v[21], 13, v[19])) + k.emit(v_lshrrev_b32_e32(v[20], 6, v[180])) + k.emit(v_lshrrev_b32_e32(v[20], 2, v[20])) + k.emit(s_mov_b32(s[87], 16384)) + k.emit(v_mul_lo_u32(v[20], s[87], v[20])) + k.emit(v_add_lshl_u32_e64(v[16], v[20], v[18], 1)) + k.emit(v_lshrrev_b32_e32(v[18], 6, v[180])) + k.emit(v_lshrrev_b32_e32(v[18], 2, v[18])) + k.emit(s_mov_b32(s[87], 64)) + k.emit(v_mul_lo_u32(v[18], s[87], v[18])) + k.emit(v_add_lshl_u32_e64(v[17], v[18], v[19], 1)) + k.emit(v_lshrrev_b32_e32(v[20], 10, v[17])) + k.emit(v_lshl_add_u32_e64(v[17], v[20], 5, v[17])) + k.emit(v_add_co_u32_e32(v[17], 32768, v[17])) + k.emit(v_add_u32_e32(v[178], 66560, v[16])) + k.emit(v_xor_b32_e32(v[178], v[178], v[16])) + k.emit(v_add_u32_e32(v[179], 66560, v[17])) + 
k.emit(v_xor_b32_e32(v[179], v[179], v[17])) + k.emit(v_lshrrev_b32_e32(v[19], 5, v[180])) + k.emit(v_and_b32_e32(v[18], 31, v[180])) + k.emit(v_lshlrev_b32_e32(v[18], 3, v[18])) + k.emit(v_mov_b32_e32(v[22], v[19])) + k.emit(v_lshrrev_b32_e32(v[20], 3, v[180])) + k.emit(v_and_b32_e32(v[21], 7, v[180])) + k.emit(v_lshlrev_b32_e32(v[21], 3, v[21])) + k.emit(v_mov_b32_e32(v[23], v[21])) + k.emit(v_mul_u32_u24_e32(v[24], 256, v[22])) + k.emit(v_add_lshl_u32_e64(v[24], v[18], v[24], 1)) + k.emit(s_nop()) + k.emit(v_readfirstlane_b32_e32(v[53], v[24])) + k.emit(s_nop()) + k.emit(s_add_u32(s[55], s[53], 66560)) + k.emit(s_xor_b32(s[55], s[55], s[53])) + k.emit(v_mul_u32_u24_e32(v[24], 64, v[20])) + k.emit(v_add_lshl_u32_e64(v[24], v[23], v[24], 1)) + k.emit(v_lshrrev_b32_e32(v[26], 10, v[24])) + k.emit(v_lshl_add_u32_e64(v[24], v[26], 5, v[24])) + k.emit(v_add_co_u32_e32(v[24], 32768, v[24])) + k.emit(s_nop()) + k.emit(v_readfirstlane_b32_e32(v[54], v[24])) + k.emit(s_nop()) + k.emit(s_add_u32(s[56], s[54], 66560)) + k.emit(s_xor_b32(s[56], s[56], s[54])) + k.emit(v_mov_b32_e32(v[24], v[18])) + k.emit(v_mov_b32_e32(v[25], v[20])) + k.emit(v_add_co_u32_e32(v[26], 32, v[25])) + k.emit(v_add_co_u32_e32(v[27], 32, v[26])) + k.emit(v_add_co_u32_e32(v[28], 32, v[27])) + k.emit(v_add_co_u32_e32(v[29], 32, v[28])) + k.emit(v_add_co_u32_e32(v[30], 32, v[29])) + k.emit(v_add_co_u32_e32(v[31], 32, v[30])) + k.emit(v_add_co_u32_e32(v[32], 32, v[31])) + k.emit(v_mov_b32_e32(v[33], v[19])) + k.emit(v_add_co_u32_e32(v[34], 8, v[33])) + k.emit(v_add_co_u32_e32(v[35], 8, v[34])) + k.emit(v_add_co_u32_e32(v[36], 8, v[35])) + k.emit(v_add_co_u32_e32(v[37], 8, v[36])) + k.emit(v_add_co_u32_e32(v[38], 8, v[37])) + k.emit(v_add_co_u32_e32(v[39], 8, v[38])) + k.emit(v_add_co_u32_e32(v[40], 8, v[39])) + k.emit(v_mov_b32_e32(v[41], v[21])) + k.emit(s_mul_i32(s[87], s[2], 256)) + k.emit(s_sub_u32(s[87], s[20], s[87])) + k.emit(s_sub_u32(s[87], s[87], 8)) + k.emit(v_mov_b32_e32(v[42], s[87])) + 
k.emit(v_min_i32_e32(v[24], v[42], v[24])) + k.emit(v_mul_lo_u32(v[42], s[40], v[33])) + k.emit(v_add_co_u32_e32(v[0], v[24], v[42])) + k.emit(v_add_u32_e32(v[0], 8)) + k.emit(v_lshlrev_b32_e32(v[0], 1)) + k.emit(v_mul_lo_u32(v[42], s[40], v[34])) + k.emit(v_add_co_u32_e32(v[1], v[24], v[42])) + k.emit(v_add_u32_e32(v[1], 8, v[1])) + k.emit(v_lshlrev_b32_e32(v[1], 1, v[1])) + k.emit(v_mul_lo_u32(v[42], s[40], v[35])) + k.emit(v_add_co_u32_e32(v[2], v[24], v[42])) + k.emit(v_add_u32_e32(v[2], 8, v[2])) + k.emit(v_lshlrev_b32_e32(v[2], 1, v[2])) + k.emit(v_mul_lo_u32(v[42], s[40], v[36])) + k.emit(v_add_co_u32_e32(v[3], v[24], v[42])) + k.emit(v_add_u32_e32(v[3], 8, v[3])) + k.emit(v_lshlrev_b32_e32(v[3], 1, v[3])) + k.emit(v_mul_lo_u32(v[42], s[40], v[37])) + k.emit(v_add_co_u32_e32(v[4], v[24], v[42])) + k.emit(v_add_u32_e32(v[4], 8, v[4])) + k.emit(v_lshlrev_b32_e32(v[4], 1, v[4])) + k.emit(v_mul_lo_u32(v[42], s[40], v[38])) + k.emit(v_add_co_u32_e32(v[5], v[24], v[42])) + k.emit(v_add_u32_e32(v[5], 8, v[5])) + k.emit(v_lshlrev_b32_e32(v[5], 1, v[5])) + k.emit(v_mul_lo_u32(v[42], s[40], v[39])) + k.emit(v_add_co_u32_e32(v[6], v[24], v[42])) + k.emit(v_add_u32_e32(v[6], 8, v[6])) + k.emit(v_lshlrev_b32_e32(v[6], 1, v[6])) + k.emit(v_mul_lo_u32(v[42], s[40], v[40])) + k.emit(v_add_co_u32_e32(v[7], v[24], v[42])) + k.emit(v_add_u32_e32(v[7], 8, v[7])) + k.emit(v_lshlrev_b32_e32(v[7], 1, v[7])) + k.emit(v_mul_lo_u32(v[33], s[42], v[25])) + k.emit(v_add_co_u32_e32(v[8], v[41], v[33])) + k.emit(v_add_u32_e32(v[8], 8, v[8])) + k.emit(v_lshlrev_b32_e32(v[8], 1, v[8])) + k.emit(v_mul_lo_u32(v[33], s[42], v[26])) + k.emit(v_add_co_u32_e32(v[9], v[41], v[33])) + k.emit(v_add_u32_e32(v[9], 8, v[9])) + k.emit(v_lshlrev_b32_e32(v[9], 1, v[9])) + k.emit(v_mul_lo_u32(v[33], s[42], v[27])) + k.emit(v_add_co_u32_e32(v[10], v[41], v[33])) + k.emit(v_add_u32_e32(v[10], 8, v[10])) + k.emit(v_lshlrev_b32_e32(v[10], 1, v[10])) + k.emit(v_mul_lo_u32(v[33], s[42], v[28])) + 
k.emit(v_add_co_u32_e32(v[11], v[41], v[33])) + k.emit(v_add_u32_e32(v[11], 8, v[11])) + k.emit(v_lshlrev_b32_e32(v[11], 1, v[11])) + k.emit(v_mul_lo_u32(v[33], s[42], v[29])) + k.emit(v_add_co_u32_e32(v[12], v[41], v[33])) + k.emit(v_add_u32_e32(v[12], 8, v[12])) + k.emit(v_lshlrev_b32_e32(v[12], 1, v[12])) + k.emit(v_mul_lo_u32(v[33], s[42], v[30])) + k.emit(v_add_co_u32_e32(v[13], v[41], v[33])) + k.emit(v_add_u32_e32(v[13], 8, v[13])) + k.emit(v_lshlrev_b32_e32(v[13], 1, v[13])) + k.emit(v_mul_lo_u32(v[33], s[42], v[31])) + k.emit(v_add_co_u32_e32(v[14], v[41], v[33])) + k.emit(v_add_u32_e32(v[14], 8, v[14])) + k.emit(v_lshlrev_b32_e32(v[14], 1, v[14])) + k.emit(v_mul_lo_u32(v[33], s[42], v[32])) + k.emit(v_add_co_u32_e32(v[15], v[41], v[33])) + k.emit(v_add_u32_e32(v[15], 8, v[15])) + k.emit(v_lshlrev_b32_e32(v[15], 1, v[15])) + k.emit(s_mul_hi_u32(s[91], s[2], 256)) + k.emit(s_mul_i32(s[90], s[2], 256)) + k.emit(s_mul_i32(s[88], s[60], 64)) + k.emit(s_mul_hi_u32(s[89], s[88], s[40])) + k.emit(s_mul_i32(s[88], s[88], s[40])) + k.emit(s_add_u32(s[90], s[90], s[88])) + k.emit(s_addc_u32(s[91], s[91], s[89])) + k.emit(s_mov_b64(s[62:63], 1)) + k.emit(s_sub_u32(s[88], s[20], 1)) + k.emit(s_mul_hi_u32(s[89], 1, s[88])) + k.emit(s_mul_i32(s[88], 1, s[88])) + k.emit(s_add_u32(s[62], s[62], s[88])) + k.emit(s_addc_u32(s[63], s[63], s[89])) + k.emit(s_sub_u32(s[88], s[23], 1)) + k.emit(s_mul_hi_u32(s[89], s[40], s[88])) + k.emit(s_mul_i32(s[88], s[40], s[88])) + k.emit(s_add_u32(s[62], s[62], s[88])) + k.emit(s_addc_u32(s[63], s[63], s[89])) + k.emit(s_sub_u32(s[62], s[62], s[90])) + k.emit(s_subb_u32(s[63], s[63], s[91])) + k.emit(s_lshl_b64(s[62:63], s[62:63], 1)) + k.emit(s_add_u32(s[62], s[62], 16)) + k.emit(s_addc_u32(s[63], s[63], 0)) + k.emit(s_cmp_eq_u32(s[63], 0)) + k.emit(s_cselect_b32(s[70], s[62], -1)) + k.emit(s_mul_hi_u32(s[89], s[41], s[4])) + k.emit(s_mul_i32(s[88], s[41], s[4])) + k.emit(s_add_u32(s[90], s[90], s[88])) + k.emit(s_addc_u32(s[91], s[91], 
s[89])) + k.emit(s_lshl_b64(s[90:91], s[90:91], 1)) + k.emit(s_add_u32(s[68], s[28], s[90])) + k.emit(s_addc_u32(s[69], s[29], s[91])) + k.emit(s_mov_b32(s[71], 131072)) + k.emit(s_mul_hi_u32(s[91], s[3], 256)) + k.emit(s_mul_i32(s[90], s[3], 256)) + k.emit(s_mul_hi_u32(s[91], s[90], s[42])) + k.emit(s_mul_i32(s[90], s[90], s[42])) + k.emit(s_mul_i32(s[88], s[60], 64)) + k.emit(s_mul_hi_u32(s[89], s[88], 1)) + k.emit(s_mul_i32(s[88], s[88], 1)) + k.emit(s_add_u32(s[90], s[90], s[88])) + k.emit(s_addc_u32(s[91], s[91], s[89])) + k.emit(s_mov_b64(s[76:77], 1)) + k.emit(s_sub_u32(s[88], s[23], 1)) + k.emit(s_mul_hi_u32(s[89], 1, s[88])) + k.emit(s_mul_i32(s[88], 1, s[88])) + k.emit(s_add_u32(s[76], s[76], s[88])) + k.emit(s_addc_u32(s[77], s[77], s[89])) + k.emit(s_sub_u32(s[88], s[21], 1)) + k.emit(s_mul_hi_u32(s[89], s[42], s[88])) + k.emit(s_mul_i32(s[88], s[42], s[88])) + k.emit(s_add_u32(s[76], s[76], s[88])) + k.emit(s_addc_u32(s[77], s[77], s[89])) + k.emit(s_sub_u32(s[76], s[76], s[90])) + k.emit(s_subb_u32(s[77], s[77], s[91])) + k.emit(s_lshl_b64(s[76:77], s[76:77], 1)) + k.emit(s_add_u32(s[76], s[76], 16)) + k.emit(s_addc_u32(s[77], s[77], 0)) + k.emit(s_cmp_eq_u32(s[77], 0)) + k.emit(s_cselect_b32(s[74], s[76], -1)) + k.emit(s_mul_hi_u32(s[89], s[43], s[4])) + k.emit(s_mul_i32(s[88], s[43], s[4])) + k.emit(s_add_u32(s[90], s[90], s[88])) + k.emit(s_addc_u32(s[91], s[91], s[89])) + k.emit(s_lshl_b64(s[90:91], s[90:91], 1)) + k.emit(s_add_u32(s[72], s[30], s[90])) + k.emit(s_addc_u32(s[73], s[31], s[91])) + k.emit(s_mov_b32(s[75], 131072)) + k.emit(s_mul_i32(s[83], 128, s[40])) + k.emit(s_mov_b32(s[84], 128)) + k.emit(s_sub_u32(s[8], s[61], s[60])) + k.label('SKAlphaCheck2') + k.emit(s_and_b32(s[89], 63, s[23])) + k.emit(s_cmp_eq_u32(s[89], 0)) + k.emit(s_cselect_b32(s[88], 0, 1)) + k.emit(s_cmp_eq_u32(s[61], s[46])) + k.emit(s_cselect_b32(s[88], s[88], 0)) + k.emit(s_sub_u32(s[8], s[8], s[88])) + k.emit(s_mov_b32(s[9], s[8])) + k.emit(s_and_b32(s[90], s[6], 
7936)) + k.emit(s_lshr_b32(s[90], s[90], 8)) + k.emit(s_and_b32(s[91], s[6], 57344)) + k.emit(s_and_b32(s[6], s[6], 255)) + k.emit(s_mov_b32(s[88], s[6])) + k.label('beginStaggerUIter') + k.emit(s_lshl_b32(s[89], s[88], s[90])) + k.emit(s_cmp_ge_u32(s[9], s[89])) + k.emit(s_cbranch_scc1(), target='endStaggerUIter') + k.emit(s_lshr_b32(s[88], s[88], 1)) + k.emit(s_branch(), target='beginStaggerUIter') + k.label('endStaggerUIter') + k.emit(s_sub_u32(s[89], s[88], 1)) + k.emit(s_cmp_ge_u32(s[88], 1)) + k.emit(s_cselect_b32(s[78], s[89], 0)) + k.emit(s_cmp_eq_u32(s[91], 0)) + k.emit(s_cbranch_scc1(), target='StaggerUMapping_1') + k.emit(s_mov_b32(s[88], s[2])) + k.emit(s_branch(), target='staggerInputEnd') + k.label('StaggerUMapping_1') + k.emit(s_cmp_eq_u32(s[91], 8192)) + k.emit(s_cbranch_scc1(), target='StaggerUMapping_2') + k.emit(s_mov_b32(s[88], s[3])) + k.emit(s_branch(), target='staggerInputEnd') + k.label('StaggerUMapping_2') + k.emit(s_cmp_eq_u32(s[91], 16384)) + k.emit(s_cbranch_scc1(), target='StaggerUMapping_3') + k.emit(s_mov_b32(s[88], -1)) + k.emit(s_branch(), target='staggerInputEnd') + k.label('StaggerUMapping_3') + k.emit(s_cmp_eq_u32(s[91], 24576)) + k.emit(s_cbranch_scc1(), target='StaggerUMapping_4') + k.emit(s_mul_i32(s[89], s[10], s[3])) + k.emit(s_add_u32(s[88], s[88], s[89])) + k.emit(s_add_u32(s[88], s[88], s[2])) + k.emit(s_branch(), target='staggerInputEnd') + k.label('StaggerUMapping_4') + k.emit(s_cmp_eq_u32(s[91], 32768)) + k.emit(s_cbranch_scc1(), target='staggerInputEnd') + k.emit(s_mov_b32(s[88], -1)) + k.emit(s_branch(), target='staggerInputEnd') + k.label('staggerInputEnd') + k.emit(s_and_b32(s[78], s[78], s[88])) + k.emit(s_lshl_b32(s[78], s[78], s[90])) + k.emit(s_cmp_gt_u32(s[60], 0)) + k.emit(s_cmov_b32(s[78], 0)) + k.emit(s_cmp_lt_u32(s[61], s[46])) + k.emit(s_cmov_b32(s[78], 0)) + k.emit(s_mul_hi_i32(s[89], s[78], s[83])) + k.emit(s_mul_i32(s[88], s[78], s[83])) + k.emit(s_mul_hi_i32(s[80], s[8], s[83])) + 
k.emit(s_mul_i32(s[79], s[8], s[83])) + k.emit(s_sub_u32(s[79], s[83], s[79])) + k.emit(s_subb_u32(s[80], 0, s[80])) + k.emit(s_add_u32(s[68], s[68], s[88])) + k.emit(s_addc_u32(s[69], s[69], s[89])) + k.emit(s_sub_u32(s[62], s[62], s[88])) + k.emit(s_subb_u32(s[63], s[63], s[89])) + k.emit(s_cmp_eq_u32(s[63], 0)) + k.emit(s_cselect_b32(s[70], s[62], -1)) + k.emit(s_mul_hi_i32(s[89], s[78], s[84])) + k.emit(s_mul_i32(s[88], s[78], s[84])) + k.emit(s_mul_hi_i32(s[82], s[8], s[84])) + k.emit(s_mul_i32(s[81], s[8], s[84])) + k.emit(s_sub_u32(s[81], s[84], s[81])) + k.emit(s_subb_u32(s[82], 0, s[82])) + k.emit(s_add_u32(s[72], s[72], s[88])) + k.emit(s_addc_u32(s[73], s[73], s[89])) + k.emit(s_sub_u32(s[76], s[76], s[88])) + k.emit(s_subb_u32(s[77], s[77], s[89])) + k.emit(s_cmp_eq_u32(s[77], 0)) + k.emit(s_cselect_b32(s[74], s[76], -1)) + k.emit(s_add_u32(s[78], s[78], 2)) + k.emit(s_cmp_eq_u32(s[8], 0)) + k.emit(s_setprio()) + k.emit(s_cbranch_scc1(), target='ShadowInitStart') + k.emit(s_mov_b32(M0, s[53])) + k.waitcnt(lgkm=0) + k.emit(s_barrier()) + k.emit(buffer_load_dwordx4(v[0:3], v[0], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[1], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[2], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[3], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[4], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[5], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[6], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[7], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_mov_b32(M0, 133120)) + k.emit(s_mov_b32(M0, s[54])) + 
k.emit(buffer_load_dwordx4(v[0:3], v[8], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[9], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[10], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[11], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[12], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[13], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[14], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[15], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_mov_b32(M0, 133120)) + k.emit(s_add_u32(s[90], s[8], 1)) + k.emit(s_cmp_eq_u32(s[78], s[90])) + k.emit(s_cselect_b32(s[88], s[79], s[83])) + k.emit(s_cselect_b32(s[89], s[80], 0)) + k.emit(s_add_u32(s[68], s[68], s[88])) + k.emit(s_addc_u32(s[69], s[69], s[89])) + k.emit(s_sub_u32(s[62], s[62], s[88])) + k.emit(s_subb_u32(s[63], s[63], s[89])) + k.emit(s_cmp_eq_u32(s[63], 0)) + k.emit(s_cselect_b32(s[70], s[62], -1)) + k.emit(s_add_u32(s[90], s[8], 1)) + k.emit(s_cmp_eq_u32(s[78], s[90])) + k.emit(s_cselect_b32(s[88], s[81], s[84])) + k.emit(s_cselect_b32(s[89], s[82], 0)) + k.emit(s_add_u32(s[72], s[72], s[88])) + k.emit(s_addc_u32(s[73], s[73], s[89])) + k.emit(s_sub_u32(s[76], s[76], s[88])) + k.emit(s_subb_u32(s[77], s[77], s[89])) + k.emit(s_cmp_eq_u32(s[77], 0)) + k.emit(s_cselect_b32(s[74], s[76], -1)) + k.label('ShadowInitStart') + k.emit(s_mov_b64(s[12:13], s[24:25])) + k.emit(s_mov_b32(s[14], 2147483648)) + k.emit(s_mov_b32(s[15], 131072)) + k.emit(s_mov_b64(s[16:17], s[24:25])) + k.emit(s_mov_b32(s[18], 2147483648)) + k.emit(s_mov_b32(s[19], 131072)) + 
k.emit(s_mov_b32(s[87], 1)) + k.emit(s_mov_b32(s[88], 1)) + k.emit(s_mul_i32(s[92], 256, s[3])) + k.emit(s_mul_hi_u32(s[91], s[92], s[38])) + k.emit(s_mul_i32(s[90], s[92], s[38])) + k.emit(s_lshl_b64(s[90:91], s[90:91], s[87])) + k.emit(s_add_u32(s[16], s[26], s[90])) + k.emit(s_addc_u32(s[17], s[27], s[91])) + k.emit(s_mul_hi_u32(s[91], s[92], s[36])) + k.emit(s_mul_i32(s[90], s[92], s[36])) + k.emit(s_lshl_b64(s[90:91], s[90:91], s[88])) + k.emit(s_add_u32(s[12], s[24], s[90])) + k.emit(s_addc_u32(s[13], s[25], s[91])) + k.emit(s_mul_hi_u32(s[91], s[4], s[39])) + k.emit(s_mul_i32(s[90], s[4], s[39])) + k.emit(s_lshl_b64(s[90:91], s[90:91], s[87])) + k.emit(s_add_u32(s[16], s[16], s[90])) + k.emit(s_addc_u32(s[17], s[17], s[91])) + k.emit(s_mul_hi_u32(s[91], s[4], s[37])) + k.emit(s_mul_i32(s[90], s[4], s[37])) + k.emit(s_lshl_b64(s[90:91], s[90:91], s[88])) + k.emit(s_add_u32(s[12], s[12], s[90])) + k.emit(s_addc_u32(s[13], s[13], s[91])) + k.emit(v_mov_b64_e32(v[182:183], 0)) + k.emit(v_accvgpr_write(v[0], 0)) + k.emit(v_accvgpr_write(v[1], 0)) + k.emit(v_accvgpr_write(v[2], 0)) + k.emit(v_accvgpr_write(v[3], 0)) + k.emit(v_accvgpr_write(v[4], 0)) + k.emit(v_accvgpr_write(v[5], 0)) + k.emit(v_accvgpr_write(v[6], 0)) + k.emit(v_accvgpr_write(v[7], 0)) + k.emit(v_accvgpr_write(v[8], 0)) + k.emit(v_accvgpr_write(v[9], 0)) + k.emit(v_accvgpr_write(v[10], 0)) + k.emit(v_accvgpr_write(v[11], 0)) + k.emit(v_accvgpr_write(v[12], 0)) + k.emit(v_accvgpr_write(v[13], 0)) + k.emit(v_accvgpr_write(v[14], 0)) + k.emit(v_accvgpr_write(v[15], 0)) + k.emit(v_mfma_i32_32x32x16_i8(v[16:31], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[32:47], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[48:63], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[64:79], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + 
k.emit(v_mfma_i32_32x32x16_i8(v[80:95], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[96:111], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[112:127], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[128:143], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[144:159], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[160:175], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[176:191], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[192:207], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[208:223], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[224:239], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_i32_32x32x16_i8(v[240:255], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[8], 0)) + k.emit(s_cbranch_scc1(), target='toPGR1end_OrdNLL') + k.waitcnt(vm=0) + k.emit(s_barrier()) + k.emit(s_xor_b32(s[53], s[55], s[53])) + k.emit(s_xor_b32(s[54], s[56], s[54])) + k.emit(s_cmp_eq_u32(s[8], 1)) + k.emit(s_cbranch_scc1(), target='skipPGR2') + k.emit(s_mov_b32(M0, s[53])) + k.waitcnt(lgkm=0) + k.emit(s_barrier()) + k.emit(buffer_load_dwordx4(v[0:3], v[0], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[1], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[2], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[3], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[4], 
s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[5], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[6], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[7], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(s_mov_b32(M0, 133120)) + k.emit(s_mov_b32(M0, s[54])) + k.waitcnt(lgkm=0) + k.emit(s_barrier()) + k.emit(buffer_load_dwordx4(v[0:3], v[8], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[9], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[10], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[11], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[12], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[13], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[14], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[15], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(s_mov_b32(M0, 133120)) + k.emit(s_xor_b32(s[53], s[55], s[53])) + k.emit(s_xor_b32(s[54], s[56], s[54])) + k.label('skipPGR2') + k.emit(s_barrier()) + k.emit(ds_read_b128(v[82:85], v[16])) + k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 2)) + k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 6)) + k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 8)) + k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 10)) + k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 12)) + k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 
14)) + k.emit(ds_read_b128(v[114:117], v[17])) + k.emit(ds_read_b128(v[118:121], v[17], v[0], v[0], 0, 128)) + k.emit(ds_read_b128(v[122:125], v[17], v[0], v[0], 0, 0, 1)) + k.emit(ds_read_b128(v[126:129], v[17], v[0], v[0], 0, 128, 1)) + k.emit(ds_read_b128(v[130:133], v[17], v[0], v[0], 0, 0, 2)) + k.emit(ds_read_b128(v[134:137], v[17], v[0], v[0], 0, 128, 2)) + k.emit(ds_read_b128(v[138:141], v[17], v[0], v[0], 0, 0, 3)) + k.emit(ds_read_b128(v[142:145], v[17], v[0], v[0], 0, 128, 3)) + k.waitcnt(lgkm=0) + k.emit(v_perm_b32_e64(v[18], v[86], v[82], s[85])) + k.emit(v_perm_b32_e64(v[19], v[94], v[90], s[85])) + k.emit(v_perm_b32_e64(v[20], v[102], v[98], s[85])) + k.emit(v_perm_b32_e64(v[21], v[110], v[106], s[85])) + k.emit(v_perm_b32_e64(v[22], v[86], v[82], s[86])) + k.emit(v_perm_b32_e64(v[23], v[94], v[90], s[86])) + k.emit(v_perm_b32_e64(v[24], v[102], v[98], s[86])) + k.emit(v_perm_b32_e64(v[25], v[110], v[106], s[86])) + k.emit(v_perm_b32_e64(v[26], v[87], v[83], s[85])) + k.emit(v_perm_b32_e64(v[27], v[95], v[91], s[85])) + k.emit(v_perm_b32_e64(v[28], v[103], v[99], s[85])) + k.emit(v_perm_b32_e64(v[29], v[111], v[107], s[85])) + k.emit(v_perm_b32_e64(v[30], v[87], v[83], s[86])) + k.emit(v_perm_b32_e64(v[31], v[95], v[91], s[86])) + k.emit(v_perm_b32_e64(v[32], v[103], v[99], s[86])) + k.emit(v_perm_b32_e64(v[33], v[111], v[107], s[86])) + k.emit(v_perm_b32_e64(v[34], v[88], v[84], s[85])) + k.emit(v_perm_b32_e64(v[35], v[96], v[92], s[85])) + k.emit(v_perm_b32_e64(v[36], v[104], v[100], s[85])) + k.emit(v_perm_b32_e64(v[37], v[112], v[108], s[85])) + k.emit(v_perm_b32_e64(v[38], v[88], v[84], s[86])) + k.emit(v_perm_b32_e64(v[39], v[96], v[92], s[86])) + k.emit(v_perm_b32_e64(v[40], v[104], v[100], s[86])) + k.emit(v_perm_b32_e64(v[41], v[112], v[108], s[86])) + k.emit(v_perm_b32_e64(v[42], v[89], v[85], s[85])) + k.emit(v_perm_b32_e64(v[43], v[97], v[93], s[85])) + k.emit(v_perm_b32_e64(v[44], v[105], v[101], s[85])) + k.emit(v_perm_b32_e64(v[45], 
v[113], v[109], s[85])) + k.emit(v_perm_b32_e64(v[46], v[89], v[85], s[86])) + k.emit(v_perm_b32_e64(v[47], v[97], v[93], s[86])) + k.emit(v_perm_b32_e64(v[48], v[105], v[101], s[86])) + k.emit(v_perm_b32_e64(v[49], v[113], v[109], s[86])) + k.label('openLoopL') + k.emit(s_cmp_eq_u32(s[8], 1)) + k.emit(s_cbranch_scc1(), target='toPGR1') + k.emit(s_cmp_le_u32(s[8], 2)) + k.emit(s_cbranch_scc1(), target='LoopEndL') + k.label('LoopBeginL') + k.emit(s_getreg_b32(s[87], 260)) + k.waitcnt(lgkm=0) + k.emit(s_cmp_eq_u32(s[87], 0)) + k.emit(s_cbranch_scc1(), target='LoopBeginL_0') + k.emit(s_cmp_eq_u32(s[87], 1)) + k.emit(s_cbranch_scc1(), target='LoopBeginL_1') + k.label('LoopBeginL_0') + k.emit(v_mfma_16x16x32(v[0:3], v[114:117], v[18:21], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[8], s[78])) + k.emit(ds_read_b128(v[82:85], v[16], v[0], v[0], 0, 0, 64)) + k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 66)) + k.emit(v_mfma_16x16x32(v[4:7], v[114:117], v[22:25], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cselect_b32(s[88], s[79], s[83])) + k.emit(v_mfma_16x16x32(v[8:11], v[114:117], v[26:29], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cselect_b32(s[89], s[80], 0)) + k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 68)) + k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 70)) + k.emit(v_mfma_16x16x32(v[12:15], v[114:117], v[30:33], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(s[68], s[68], s[88])) + k.emit(v_mfma_16x16x32(v[16:19], v[114:117], v[34:37], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_addc_u32(s[69], s[69], s[89])) + k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 72)) + k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 74)) + k.emit(v_mfma_16x16x32(v[20:23], v[114:117], v[38:41], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_sub_u32(s[62], s[62], s[88])) + k.emit(v_mfma_16x16x32(v[24:27], v[114:117], v[42:45], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_subb_u32(s[63], s[63], 
s[89])) + k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 76)) + k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 78)) + k.emit(v_mfma_16x16x32(v[28:31], v[114:117], v[46:49], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[63], 0)) + k.emit(v_mfma_16x16x32(v[32:35], v[118:121], v[18:21], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=4) + k.emit(s_cselect_b32(s[70], s[62], -1)) + k.emit(v_perm_b32_e64(v[50], v[86], v[82], s[85])) + k.emit(v_perm_b32_e64(v[51], v[94], v[90], s[85])) + k.emit(v_mfma_16x16x32(v[36:39], v[118:121], v[22:25], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[146:149], v[17], v[0], v[0], 0, 64)) + k.emit(v_mfma_16x16x32(v[40:43], v[118:121], v[26:29], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[44:47], v[118:121], v[30:33], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[150:153], v[17], v[0], v[0], 0, 192)) + k.emit(v_mfma_16x16x32(v[48:51], v[118:121], v[34:37], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=1) + k.emit(v_mfma_16x16x32(v[52:55], v[118:121], v[38:41], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_mfma_16x16x32(v[56:59], v[118:121], v[42:45], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_mov_b32(M0, s[53])) + k.emit(buffer_load_dwordx4(v[0:3], v[0], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[60:63], v[118:121], v[46:49], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[154:157], v[17], v[0], v[0], 0, 64, 1)) + k.emit(v_mfma_16x16x32(v[64:67], v[122:125], v[18:21], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[52], v[102], v[98], s[85])) + k.emit(v_perm_b32_e64(v[53], v[110], v[106], s[85])) + k.emit(v_mfma_16x16x32(v[68:71], v[122:125], v[22:25], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[1], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[72:75], v[122:125], v[26:29], 
v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[158:161], v[17], v[0], v[0], 0, 192, 1)) + k.emit(v_mfma_16x16x32(v[76:79], v[122:125], v[30:33], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[54], v[86], v[82], s[86])) + k.emit(v_perm_b32_e64(v[55], v[94], v[90], s[86])) + k.emit(v_mfma_16x16x32(v[80:83], v[122:125], v[34:37], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[2], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[84:87], v[122:125], v[38:41], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[162:165], v[17], v[0], v[0], 0, 64, 2)) + k.emit(v_mfma_16x16x32(v[88:91], v[122:125], v[42:45], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[56], v[102], v[98], s[86])) + k.emit(v_perm_b32_e64(v[57], v[110], v[106], s[86])) + k.emit(v_mfma_16x16x32(v[92:95], v[122:125], v[46:49], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[3], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[96:99], v[126:129], v[18:21], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[166:169], v[17], v[0], v[0], 0, 192, 2)) + k.emit(v_mfma_16x16x32(v[100:103], v[126:129], v[22:25], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[58], v[87], v[83], s[85])) + k.emit(v_perm_b32_e64(v[59], v[95], v[91], s[85])) + k.emit(v_mfma_16x16x32(v[104:107], v[126:129], v[26:29], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[4], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[108:111], v[126:129], v[30:33], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[170:173], v[17], v[0], v[0], 0, 64, 3)) + k.emit(v_mfma_16x16x32(v[112:115], v[126:129], v[34:37], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[8], s[78])) + k.emit(v_perm_b32_e64(v[60], v[103], 
v[99], s[85])) + k.emit(v_perm_b32_e64(v[61], v[111], v[107], s[85])) + k.emit(v_mfma_16x16x32(v[116:119], v[126:129], v[38:41], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cselect_b32(s[88], s[81], s[84])) + k.emit(v_perm_b32_e64(v[62], v[87], v[83], s[86])) + k.emit(v_perm_b32_e64(v[63], v[95], v[91], s[86])) + k.emit(v_mfma_16x16x32(v[120:123], v[126:129], v[42:45], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cselect_b32(s[89], s[82], 0)) + k.emit(ds_read_b128(v[174:177], v[17], v[0], v[0], 0, 192, 3)) + k.emit(v_mfma_16x16x32(v[124:127], v[126:129], v[46:49], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(s[72], s[72], s[88])) + k.emit(v_perm_b32_e64(v[64], v[103], v[99], s[86])) + k.emit(v_perm_b32_e64(v[65], v[111], v[107], s[86])) + k.emit(v_mfma_16x16x32(v[128:131], v[130:133], v[18:21], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_addc_u32(s[73], s[73], s[89])) + k.emit(v_perm_b32_e64(v[66], v[88], v[84], s[85])) + k.emit(v_perm_b32_e64(v[67], v[96], v[92], s[85])) + k.emit(v_mfma_16x16x32(v[132:135], v[130:133], v[22:25], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_sub_u32(s[76], s[76], s[88])) + k.emit(v_perm_b32_e64(v[68], v[104], v[100], s[85])) + k.emit(v_perm_b32_e64(v[69], v[112], v[108], s[85])) + k.emit(v_mfma_16x16x32(v[136:139], v[130:133], v[26:29], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_subb_u32(s[77], s[77], s[89])) + k.emit(v_perm_b32_e64(v[70], v[88], v[84], s[86])) + k.emit(v_perm_b32_e64(v[71], v[96], v[92], s[86])) + k.emit(v_mfma_16x16x32(v[140:143], v[130:133], v[30:33], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[77], 0)) + k.emit(v_perm_b32_e64(v[72], v[104], v[100], s[86])) + k.emit(v_perm_b32_e64(v[73], v[112], v[108], s[86])) + k.emit(v_mfma_16x16x32(v[144:147], v[130:133], v[34:37], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=0) + k.emit(s_cselect_b32(s[74], s[76], -1)) + k.emit(v_perm_b32_e64(v[74], v[89], v[85], s[85])) + 
k.emit(v_perm_b32_e64(v[75], v[97], v[93], s[85])) + k.emit(v_mfma_16x16x32(v[148:151], v[130:133], v[38:41], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[76], v[105], v[101], s[85])) + k.emit(v_perm_b32_e64(v[77], v[113], v[109], s[85])) + k.emit(v_mfma_16x16x32(v[152:155], v[130:133], v[42:45], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[78], v[89], v[85], s[86])) + k.emit(v_perm_b32_e64(v[79], v[97], v[93], s[86])) + k.emit(v_mfma_16x16x32(v[156:159], v[130:133], v[46:49], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[80], v[105], v[101], s[86])) + k.emit(v_perm_b32_e64(v[81], v[113], v[109], s[86])) + k.emit(v_mfma_16x16x32(v[160:163], v[134:137], v[18:21], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[164:167], v[134:137], v[22:25], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[168:171], v[134:137], v[26:29], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[172:175], v[134:137], v[30:33], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[176:179], v[134:137], v[34:37], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_mfma_16x16x32(v[180:183], v[134:137], v[38:41], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[5], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[184:187], v[134:137], v[42:45], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[188:191], v[134:137], v[46:49], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[192:195], v[138:141], v[18:21], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[6], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[196:199], v[138:141], v[22:25], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[200:203], v[138:141], v[26:29], v[200:203], 0, 0, 1, 0, 0, 0, 
0, 0, 1)) + k.emit(v_mfma_16x16x32(v[204:207], v[138:141], v[30:33], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[7], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[208:211], v[138:141], v[34:37], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[212:215], v[138:141], v[38:41], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[216:219], v[138:141], v[42:45], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_mov_b32(M0, s[54])) + k.emit(buffer_load_dwordx4(v[0:3], v[8], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[220:223], v[138:141], v[46:49], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[224:227], v[142:145], v[18:21], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(vm=17) + k.emit(v_mfma_16x16x32(v[228:231], v[142:145], v[22:25], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[9], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[232:235], v[142:145], v[26:29], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[236:239], v[142:145], v[30:33], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_xor_b32_e32(v[16], v[178], v[16])) + k.emit(v_xor_b32_e32(v[17], v[179], v[17])) + k.emit(v_mfma_16x16x32(v[240:243], v[142:145], v[34:37], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[82:85], v[16])) + k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 2)) + k.emit(v_mfma_16x16x32(v[244:247], v[142:145], v[38:41], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[248:251], v[142:145], v[42:45], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 6)) + k.emit(v_mfma_16x16x32(v[252:255], v[142:145], v[46:49], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + 
k.emit(v_mfma_16x16x32(v[0:3], v[146:149], v[50:53], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 8)) + k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 10)) + k.emit(v_mfma_16x16x32(v[4:7], v[146:149], v[54:57], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[8:11], v[146:149], v[58:61], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(vm=9) + k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 12)) + k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 14)) + k.emit(v_mfma_16x16x32(v[12:15], v[146:149], v[62:65], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[16:19], v[146:149], v[66:69], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(ds_read_b128(v[114:117], v[17])) + k.emit(v_mfma_16x16x32(v[20:23], v[146:149], v[70:73], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[24:27], v[146:149], v[74:77], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[118:121], v[17], v[0], v[0], 0, 128)) + k.emit(v_mfma_16x16x32(v[28:31], v[146:149], v[78:81], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[32:35], v[150:153], v[50:53], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[122:125], v[17], v[0], v[0], 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[36:39], v[150:153], v[54:57], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=4) + k.emit(v_mfma_16x16x32(v[40:43], v[150:153], v[58:61], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[126:129], v[17], v[0], v[0], 0, 128, 1)) + k.emit(v_mfma_16x16x32(v[44:47], v[150:153], v[62:65], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[18], v[86], v[82], s[85])) + k.emit(v_perm_b32_e64(v[19], v[94], v[90], s[85])) + k.emit(v_mfma_16x16x32(v[48:51], v[150:153], v[66:69], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[130:133], v[17], v[0], v[0], 0, 0, 2)) + k.emit(v_mfma_16x16x32(v[52:55], v[150:153], v[70:73], 
v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[20], v[102], v[98], s[85])) + k.emit(v_perm_b32_e64(v[21], v[110], v[106], s[85])) + k.emit(v_mfma_16x16x32(v[56:59], v[150:153], v[74:77], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[134:137], v[17], v[0], v[0], 0, 128, 2)) + k.emit(v_mfma_16x16x32(v[60:63], v[150:153], v[78:81], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[22], v[86], v[82], s[86])) + k.emit(v_perm_b32_e64(v[23], v[94], v[90], s[86])) + k.emit(v_mfma_16x16x32(v[64:67], v[154:157], v[50:53], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[138:141], v[17], v[0], v[0], 0, 0, 3)) + k.emit(v_mfma_16x16x32(v[68:71], v[154:157], v[54:57], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[24], v[102], v[98], s[86])) + k.emit(v_perm_b32_e64(v[25], v[110], v[106], s[86])) + k.emit(v_mfma_16x16x32(v[72:75], v[154:157], v[58:61], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[142:145], v[17], v[0], v[0], 0, 128, 3)) + k.emit(v_mfma_16x16x32(v[76:79], v[154:157], v[62:65], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[26], v[87], v[83], s[85])) + k.emit(v_perm_b32_e64(v[27], v[95], v[91], s[85])) + k.emit(v_mfma_16x16x32(v[80:83], v[154:157], v[66:69], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[28], v[103], v[99], s[85])) + k.emit(v_perm_b32_e64(v[29], v[111], v[107], s[85])) + k.emit(v_mfma_16x16x32(v[84:87], v[154:157], v[70:73], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[30], v[87], v[83], s[86])) + k.emit(v_perm_b32_e64(v[31], v[95], v[91], s[86])) + k.emit(v_mfma_16x16x32(v[88:91], v[154:157], v[74:77], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[32], v[103], v[99], s[86])) + k.emit(v_perm_b32_e64(v[33], v[111], v[107], s[86])) + k.emit(v_mfma_16x16x32(v[92:95], v[154:157], v[78:81], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + 
k.emit(buffer_load_dwordx4(v[0:3], v[10], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[96:99], v[158:161], v[50:53], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[34], v[88], v[84], s[85])) + k.emit(v_perm_b32_e64(v[35], v[96], v[92], s[85])) + k.emit(v_mfma_16x16x32(v[100:103], v[158:161], v[54:57], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[36], v[104], v[100], s[85])) + k.emit(v_perm_b32_e64(v[37], v[112], v[108], s[85])) + k.emit(v_mfma_16x16x32(v[104:107], v[158:161], v[58:61], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[11], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[108:111], v[158:161], v[62:65], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[38], v[88], v[84], s[86])) + k.emit(v_perm_b32_e64(v[39], v[96], v[92], s[86])) + k.emit(v_mfma_16x16x32(v[112:115], v[158:161], v[66:69], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[40], v[104], v[100], s[86])) + k.emit(v_perm_b32_e64(v[41], v[112], v[108], s[86])) + k.emit(v_mfma_16x16x32(v[116:119], v[158:161], v[70:73], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[12], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[120:123], v[158:161], v[74:77], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[42], v[89], v[85], s[85])) + k.emit(v_perm_b32_e64(v[43], v[97], v[93], s[85])) + k.emit(v_mfma_16x16x32(v[124:127], v[158:161], v[78:81], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[44], v[105], v[101], s[85])) + k.emit(v_perm_b32_e64(v[45], v[113], v[109], s[85])) + k.emit(v_mfma_16x16x32(v[128:131], v[162:165], v[50:53], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[13], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[132:135], 
v[162:165], v[54:57], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[46], v[89], v[85], s[86])) + k.emit(v_perm_b32_e64(v[47], v[97], v[93], s[86])) + k.emit(v_mfma_16x16x32(v[136:139], v[162:165], v[58:61], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[48], v[105], v[101], s[86])) + k.emit(v_perm_b32_e64(v[49], v[113], v[109], s[86])) + k.emit(v_mfma_16x16x32(v[140:143], v[162:165], v[62:65], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[14], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[144:147], v[162:165], v[66:69], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[148:151], v[162:165], v[70:73], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[152:155], v[162:165], v[74:77], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[156:159], v[162:165], v[78:81], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[160:163], v[166:169], v[50:53], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[164:167], v[166:169], v[54:57], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[168:171], v[166:169], v[58:61], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[172:175], v[166:169], v[62:65], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[176:179], v[166:169], v[66:69], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[180:183], v[166:169], v[70:73], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[184:187], v[166:169], v[74:77], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[188:191], v[166:169], v[78:81], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[192:195], v[170:173], v[50:53], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[196:199], v[170:173], v[54:57], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + 
k.emit(v_mfma_16x16x32(v[200:203], v[170:173], v[58:61], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[204:207], v[170:173], v[62:65], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[208:211], v[170:173], v[66:69], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[212:215], v[170:173], v[70:73], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[216:219], v[170:173], v[74:77], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[220:223], v[170:173], v[78:81], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[224:227], v[174:177], v[50:53], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[228:231], v[174:177], v[54:57], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[232:235], v[174:177], v[58:61], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[236:239], v[174:177], v[62:65], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[15], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[240:243], v[174:177], v[66:69], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[244:247], v[174:177], v[70:73], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_xor_b32(s[53], s[55], s[53])) + k.emit(s_xor_b32(s[54], s[56], s[54])) + k.emit(v_mfma_16x16x32(v[248:251], v[174:177], v[74:77], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_sub_u32(s[8], s[8], 1)) + k.emit(s_cmp_eq_i32(s[8], 2)) + k.emit(v_mfma_16x16x32(v[252:255], v[174:177], v[78:81], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cbranch_scc0(), target='LoopBeginL_0') + k.emit(s_branch(), target='LoopEndL') + k.label('LoopBeginL_1') + k.emit(v_mfma_16x16x32(v[0:3], v[114:117], v[18:21], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[8], s[78])) + k.emit(v_mfma_16x16x32(v[4:7], v[114:117], v[22:25], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + 
k.emit(s_cselect_b32(s[88], s[79], s[83])) + k.emit(ds_read_b128(v[82:85], v[16], v[0], v[0], 0, 0, 64)) + k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 66)) + k.emit(v_mfma_16x16x32(v[8:11], v[114:117], v[26:29], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cselect_b32(s[89], s[80], 0)) + k.emit(v_mfma_16x16x32(v[12:15], v[114:117], v[30:33], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(s[68], s[68], s[88])) + k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 68)) + k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 70)) + k.emit(v_mfma_16x16x32(v[16:19], v[114:117], v[34:37], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_addc_u32(s[69], s[69], s[89])) + k.emit(v_mfma_16x16x32(v[20:23], v[114:117], v[38:41], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_sub_u32(s[62], s[62], s[88])) + k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 72)) + k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 74)) + k.emit(v_mfma_16x16x32(v[24:27], v[114:117], v[42:45], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_subb_u32(s[63], s[63], s[89])) + k.emit(v_mfma_16x16x32(v[28:31], v[114:117], v[46:49], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[63], 0)) + k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 76)) + k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 78)) + k.emit(v_mfma_16x16x32(v[32:35], v[118:121], v[18:21], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=4) + k.emit(s_cselect_b32(s[70], s[62], -1)) + k.emit(v_perm_b32_e64(v[50], v[86], v[82], s[85])) + k.emit(v_perm_b32_e64(v[51], v[94], v[90], s[85])) + k.emit(v_mfma_16x16x32(v[36:39], v[118:121], v[22:25], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[40:43], v[118:121], v[26:29], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[146:149], v[17], v[0], v[0], 0, 64)) + k.emit(v_mfma_16x16x32(v[44:47], v[118:121], v[30:33], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[48:51], 
v[118:121], v[34:37], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=1) + k.emit(ds_read_b128(v[150:153], v[17], v[0], v[0], 0, 192)) + k.emit(v_mfma_16x16x32(v[52:55], v[118:121], v[38:41], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_mfma_16x16x32(v[56:59], v[118:121], v[42:45], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[154:157], v[17], v[0], v[0], 0, 64, 1)) + k.emit(v_mfma_16x16x32(v[60:63], v[118:121], v[46:49], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_mov_b32(M0, s[53])) + k.emit(buffer_load_dwordx4(v[0:3], v[0], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[64:67], v[122:125], v[18:21], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[52], v[102], v[98], s[85])) + k.emit(v_perm_b32_e64(v[53], v[110], v[106], s[85])) + k.emit(v_mfma_16x16x32(v[68:71], v[122:125], v[22:25], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[158:161], v[17], v[0], v[0], 0, 192, 1)) + k.emit(v_mfma_16x16x32(v[72:75], v[122:125], v[26:29], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[1], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[76:79], v[122:125], v[30:33], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[54], v[86], v[82], s[86])) + k.emit(v_perm_b32_e64(v[55], v[94], v[90], s[86])) + k.emit(v_mfma_16x16x32(v[80:83], v[122:125], v[34:37], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[162:165], v[17], v[0], v[0], 0, 64, 2)) + k.emit(v_mfma_16x16x32(v[84:87], v[122:125], v[38:41], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[2], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[88:91], v[122:125], v[42:45], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[56], v[102], v[98], s[86])) + k.emit(v_perm_b32_e64(v[57], v[110], v[106], s[86])) + k.emit(v_mfma_16x16x32(v[92:95], 
v[122:125], v[46:49], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[166:169], v[17], v[0], v[0], 0, 192, 2)) + k.emit(v_mfma_16x16x32(v[96:99], v[126:129], v[18:21], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[3], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[100:103], v[126:129], v[22:25], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[58], v[87], v[83], s[85])) + k.emit(v_perm_b32_e64(v[59], v[95], v[91], s[85])) + k.emit(v_mfma_16x16x32(v[104:107], v[126:129], v[26:29], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[170:173], v[17], v[0], v[0], 0, 64, 3)) + k.emit(v_mfma_16x16x32(v[108:111], v[126:129], v[30:33], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[4], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[112:115], v[126:129], v[34:37], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[8], s[78])) + k.emit(v_perm_b32_e64(v[60], v[103], v[99], s[85])) + k.emit(v_perm_b32_e64(v[61], v[111], v[107], s[85])) + k.emit(v_mfma_16x16x32(v[116:119], v[126:129], v[38:41], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cselect_b32(s[88], s[81], s[84])) + k.emit(v_perm_b32_e64(v[62], v[87], v[83], s[86])) + k.emit(v_perm_b32_e64(v[63], v[95], v[91], s[86])) + k.emit(ds_read_b128(v[174:177], v[17], v[0], v[0], 0, 192, 3)) + k.emit(v_mfma_16x16x32(v[120:123], v[126:129], v[42:45], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cselect_b32(s[89], s[82], 0)) + k.emit(v_mfma_16x16x32(v[124:127], v[126:129], v[46:49], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(s[72], s[72], s[88])) + k.emit(v_perm_b32_e64(v[64], v[103], v[99], s[86])) + k.emit(v_perm_b32_e64(v[65], v[111], v[107], s[86])) + k.emit(v_mfma_16x16x32(v[128:131], v[130:133], v[18:21], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_addc_u32(s[73], s[73], s[89])) 
+ k.emit(v_perm_b32_e64(v[66], v[88], v[84], s[85])) + k.emit(v_perm_b32_e64(v[67], v[96], v[92], s[85])) + k.emit(v_mfma_16x16x32(v[132:135], v[130:133], v[22:25], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_sub_u32(s[76], s[76], s[88])) + k.emit(v_perm_b32_e64(v[68], v[104], v[100], s[85])) + k.emit(v_perm_b32_e64(v[69], v[112], v[108], s[85])) + k.emit(v_mfma_16x16x32(v[136:139], v[130:133], v[26:29], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_subb_u32(s[77], s[77], s[89])) + k.emit(v_perm_b32_e64(v[70], v[88], v[84], s[86])) + k.emit(v_perm_b32_e64(v[71], v[96], v[92], s[86])) + k.emit(v_mfma_16x16x32(v[140:143], v[130:133], v[30:33], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[77], 0)) + k.emit(v_perm_b32_e64(v[72], v[104], v[100], s[86])) + k.emit(v_perm_b32_e64(v[73], v[112], v[108], s[86])) + k.emit(v_mfma_16x16x32(v[144:147], v[130:133], v[34:37], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=0) + k.emit(s_cselect_b32(s[74], s[76], -1)) + k.emit(v_perm_b32_e64(v[74], v[89], v[85], s[85])) + k.emit(v_perm_b32_e64(v[75], v[97], v[93], s[85])) + k.emit(v_mfma_16x16x32(v[148:151], v[130:133], v[38:41], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[76], v[105], v[101], s[85])) + k.emit(v_perm_b32_e64(v[77], v[113], v[109], s[85])) + k.emit(v_mfma_16x16x32(v[152:155], v[130:133], v[42:45], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[78], v[89], v[85], s[86])) + k.emit(v_perm_b32_e64(v[79], v[97], v[93], s[86])) + k.emit(v_mfma_16x16x32(v[156:159], v[130:133], v[46:49], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[80], v[105], v[101], s[86])) + k.emit(v_perm_b32_e64(v[81], v[113], v[109], s[86])) + k.emit(v_mfma_16x16x32(v[160:163], v[134:137], v[18:21], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[164:167], v[134:137], v[22:25], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[168:171], v[134:137], v[26:29], 
v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[172:175], v[134:137], v[30:33], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[176:179], v[134:137], v[34:37], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_mfma_16x16x32(v[180:183], v[134:137], v[38:41], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[184:187], v[134:137], v[42:45], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[5], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[188:191], v[134:137], v[46:49], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[192:195], v[138:141], v[18:21], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[196:199], v[138:141], v[22:25], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[6], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[200:203], v[138:141], v[26:29], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[204:207], v[138:141], v[30:33], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[208:211], v[138:141], v[34:37], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4096)) + k.emit(buffer_load_dwordx4(v[0:3], v[7], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1)) + k.emit(v_mfma_16x16x32(v[212:215], v[138:141], v[38:41], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[216:219], v[138:141], v[42:45], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[220:223], v[138:141], v[46:49], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_mov_b32(M0, s[54])) + k.emit(buffer_load_dwordx4(v[0:3], v[8], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[224:227], v[142:145], v[18:21], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(vm=17) + k.emit(v_mfma_16x16x32(v[228:231], v[142:145], v[22:25], v[228:231], 0, 0, 1, 0, 
0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[232:235], v[142:145], v[26:29], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[9], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[236:239], v[142:145], v[30:33], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_xor_b32_e32(v[16], v[178], v[16])) + k.emit(v_xor_b32_e32(v[17], v[179], v[17])) + k.emit(v_mfma_16x16x32(v[240:243], v[142:145], v[34:37], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[244:247], v[142:145], v[38:41], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[82:85], v[16])) + k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 2)) + k.emit(v_mfma_16x16x32(v[248:251], v[142:145], v[42:45], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[252:255], v[142:145], v[46:49], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 6)) + k.emit(v_mfma_16x16x32(v[0:3], v[146:149], v[50:53], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[4:7], v[146:149], v[54:57], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 8)) + k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 10)) + k.emit(v_mfma_16x16x32(v[8:11], v[146:149], v[58:61], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(vm=9) + k.emit(v_mfma_16x16x32(v[12:15], v[146:149], v[62:65], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 12)) + k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 14)) + k.emit(v_mfma_16x16x32(v[16:19], v[146:149], v[66:69], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_mfma_16x16x32(v[20:23], v[146:149], v[70:73], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[114:117], v[17])) + k.emit(v_mfma_16x16x32(v[24:27], 
v[146:149], v[74:77], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[28:31], v[146:149], v[78:81], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[118:121], v[17], v[0], v[0], 0, 128)) + k.emit(v_mfma_16x16x32(v[32:35], v[150:153], v[50:53], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[36:39], v[150:153], v[54:57], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=4) + k.emit(ds_read_b128(v[122:125], v[17], v[0], v[0], 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[40:43], v[150:153], v[58:61], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[18], v[86], v[82], s[85])) + k.emit(v_perm_b32_e64(v[19], v[94], v[90], s[85])) + k.emit(v_mfma_16x16x32(v[44:47], v[150:153], v[62:65], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[126:129], v[17], v[0], v[0], 0, 128, 1)) + k.emit(v_mfma_16x16x32(v[48:51], v[150:153], v[66:69], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[20], v[102], v[98], s[85])) + k.emit(v_perm_b32_e64(v[21], v[110], v[106], s[85])) + k.emit(v_mfma_16x16x32(v[52:55], v[150:153], v[70:73], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[130:133], v[17], v[0], v[0], 0, 0, 2)) + k.emit(v_mfma_16x16x32(v[56:59], v[150:153], v[74:77], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[22], v[86], v[82], s[86])) + k.emit(v_perm_b32_e64(v[23], v[94], v[90], s[86])) + k.emit(v_mfma_16x16x32(v[60:63], v[150:153], v[78:81], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[134:137], v[17], v[0], v[0], 0, 128, 2)) + k.emit(v_mfma_16x16x32(v[64:67], v[154:157], v[50:53], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[24], v[102], v[98], s[86])) + k.emit(v_perm_b32_e64(v[25], v[110], v[106], s[86])) + k.emit(v_mfma_16x16x32(v[68:71], v[154:157], v[54:57], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[138:141], v[17], v[0], v[0], 0, 0, 3)) + k.emit(v_mfma_16x16x32(v[72:75], v[154:157], v[58:61], 
v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[26], v[87], v[83], s[85])) + k.emit(v_perm_b32_e64(v[27], v[95], v[91], s[85])) + k.emit(v_mfma_16x16x32(v[76:79], v[154:157], v[62:65], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[142:145], v[17], v[0], v[0], 0, 128, 3)) + k.emit(v_mfma_16x16x32(v[80:83], v[154:157], v[66:69], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[28], v[103], v[99], s[85])) + k.emit(v_perm_b32_e64(v[29], v[111], v[107], s[85])) + k.emit(v_mfma_16x16x32(v[84:87], v[154:157], v[70:73], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[30], v[87], v[83], s[86])) + k.emit(v_perm_b32_e64(v[31], v[95], v[91], s[86])) + k.emit(v_mfma_16x16x32(v[88:91], v[154:157], v[74:77], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[32], v[103], v[99], s[86])) + k.emit(v_perm_b32_e64(v[33], v[111], v[107], s[86])) + k.emit(v_mfma_16x16x32(v[92:95], v[154:157], v[78:81], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[34], v[88], v[84], s[85])) + k.emit(v_perm_b32_e64(v[35], v[96], v[92], s[85])) + k.emit(v_mfma_16x16x32(v[96:99], v[158:161], v[50:53], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[10], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[100:103], v[158:161], v[54:57], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[36], v[104], v[100], s[85])) + k.emit(v_perm_b32_e64(v[37], v[112], v[108], s[85])) + k.emit(v_mfma_16x16x32(v[104:107], v[158:161], v[58:61], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[38], v[88], v[84], s[86])) + k.emit(v_perm_b32_e64(v[39], v[96], v[92], s[86])) + k.emit(v_mfma_16x16x32(v[108:111], v[158:161], v[62:65], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[11], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[112:115], 
v[158:161], v[66:69], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[40], v[104], v[100], s[86])) + k.emit(v_perm_b32_e64(v[41], v[112], v[108], s[86])) + k.emit(v_mfma_16x16x32(v[116:119], v[158:161], v[70:73], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[42], v[89], v[85], s[85])) + k.emit(v_perm_b32_e64(v[43], v[97], v[93], s[85])) + k.emit(v_mfma_16x16x32(v[120:123], v[158:161], v[74:77], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[12], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[124:127], v[158:161], v[78:81], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[44], v[105], v[101], s[85])) + k.emit(v_perm_b32_e64(v[45], v[113], v[109], s[85])) + k.emit(v_mfma_16x16x32(v[128:131], v[162:165], v[50:53], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[46], v[89], v[85], s[86])) + k.emit(v_perm_b32_e64(v[47], v[97], v[93], s[86])) + k.emit(v_mfma_16x16x32(v[132:135], v[162:165], v[54:57], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[13], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[136:139], v[162:165], v[58:61], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[48], v[105], v[101], s[86])) + k.emit(v_perm_b32_e64(v[49], v[113], v[109], s[86])) + k.emit(v_mfma_16x16x32(v[140:143], v[162:165], v[62:65], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[144:147], v[162:165], v[66:69], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[14], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[148:151], v[162:165], v[70:73], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[152:155], v[162:165], v[74:77], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[156:159], v[162:165], 
v[78:81], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[160:163], v[166:169], v[50:53], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[164:167], v[166:169], v[54:57], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[168:171], v[166:169], v[58:61], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[172:175], v[166:169], v[62:65], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[176:179], v[166:169], v[66:69], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[180:183], v[166:169], v[70:73], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[184:187], v[166:169], v[74:77], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[188:191], v[166:169], v[78:81], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[192:195], v[170:173], v[50:53], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[196:199], v[170:173], v[54:57], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[200:203], v[170:173], v[58:61], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[204:207], v[170:173], v[62:65], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[208:211], v[170:173], v[66:69], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[212:215], v[170:173], v[70:73], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[216:219], v[170:173], v[74:77], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[220:223], v[170:173], v[78:81], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[224:227], v[174:177], v[50:53], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[228:231], v[174:177], v[54:57], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[232:235], v[174:177], v[58:61], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[236:239], v[174:177], v[62:65], 
v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[240:243], v[174:177], v[66:69], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(M0, M0, 4224)) + k.emit(buffer_load_dwordx4(v[0:3], v[15], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1)) + k.emit(v_mfma_16x16x32(v[244:247], v[174:177], v[70:73], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_xor_b32(s[53], s[55], s[53])) + k.emit(s_xor_b32(s[54], s[56], s[54])) + k.emit(v_mfma_16x16x32(v[248:251], v[174:177], v[74:77], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_sub_u32(s[8], s[8], 1)) + k.emit(s_cmp_eq_i32(s[8], 2)) + k.emit(v_mfma_16x16x32(v[252:255], v[174:177], v[78:81], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cbranch_scc0(), target='LoopBeginL_1') + k.emit(s_branch(), target='LoopEndL') + k.label('LoopEndL') + k.emit(s_waitcnt()) + k.emit(s_barrier()) + k.emit(v_mfma_16x16x32(v[0:3], v[114:117], v[18:21], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[8], s[78])) + k.emit(ds_read_b128(v[82:85], v[16], v[0], v[0], 0, 0, 64)) + k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 66)) + k.emit(v_mfma_16x16x32(v[4:7], v[114:117], v[22:25], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cselect_b32(s[88], s[79], s[83])) + k.emit(v_mfma_16x16x32(v[8:11], v[114:117], v[26:29], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cselect_b32(s[89], s[80], 0)) + k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 68)) + k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 70)) + k.emit(v_mfma_16x16x32(v[12:15], v[114:117], v[30:33], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(s[68], s[68], s[88])) + k.emit(v_mfma_16x16x32(v[16:19], v[114:117], v[34:37], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_addc_u32(s[69], s[69], s[89])) + k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 72)) + k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 74)) + k.emit(v_mfma_16x16x32(v[20:23], v[114:117], v[38:41], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + 
k.emit(s_sub_u32(s[62], s[62], s[88])) + k.emit(v_mfma_16x16x32(v[24:27], v[114:117], v[42:45], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_subb_u32(s[63], s[63], s[89])) + k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 76)) + k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 78)) + k.emit(v_mfma_16x16x32(v[28:31], v[114:117], v[46:49], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[63], 0)) + k.emit(v_mfma_16x16x32(v[32:35], v[118:121], v[18:21], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=4) + k.emit(s_cselect_b32(s[70], s[62], -1)) + k.emit(v_perm_b32_e64(v[50], v[86], v[82], s[85])) + k.emit(v_perm_b32_e64(v[51], v[94], v[90], s[85])) + k.emit(v_mfma_16x16x32(v[36:39], v[118:121], v[22:25], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[146:149], v[17], v[0], v[0], 0, 64)) + k.emit(v_mfma_16x16x32(v[40:43], v[118:121], v[26:29], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[44:47], v[118:121], v[30:33], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[150:153], v[17], v[0], v[0], 0, 192)) + k.emit(v_mfma_16x16x32(v[48:51], v[118:121], v[34:37], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=1) + k.emit(v_mfma_16x16x32(v[52:55], v[118:121], v[38:41], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_mfma_16x16x32(v[56:59], v[118:121], v[42:45], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[60:63], v[118:121], v[46:49], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[154:157], v[17], v[0], v[0], 0, 64, 1)) + k.emit(v_mfma_16x16x32(v[64:67], v[122:125], v[18:21], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[52], v[102], v[98], s[85])) + k.emit(v_perm_b32_e64(v[53], v[110], v[106], s[85])) + k.emit(v_mfma_16x16x32(v[68:71], v[122:125], v[22:25], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[72:75], v[122:125], v[26:29], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + 
k.emit(ds_read_b128(v[158:161], v[17], v[0], v[0], 0, 192, 1)) + k.emit(v_mfma_16x16x32(v[76:79], v[122:125], v[30:33], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[54], v[86], v[82], s[86])) + k.emit(v_perm_b32_e64(v[55], v[94], v[90], s[86])) + k.emit(v_mfma_16x16x32(v[80:83], v[122:125], v[34:37], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[84:87], v[122:125], v[38:41], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[162:165], v[17], v[0], v[0], 0, 64, 2)) + k.emit(v_mfma_16x16x32(v[88:91], v[122:125], v[42:45], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[56], v[102], v[98], s[86])) + k.emit(v_perm_b32_e64(v[57], v[110], v[106], s[86])) + k.emit(v_mfma_16x16x32(v[92:95], v[122:125], v[46:49], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[96:99], v[126:129], v[18:21], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[166:169], v[17], v[0], v[0], 0, 192, 2)) + k.emit(v_mfma_16x16x32(v[100:103], v[126:129], v[22:25], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[58], v[87], v[83], s[85])) + k.emit(v_perm_b32_e64(v[59], v[95], v[91], s[85])) + k.emit(v_mfma_16x16x32(v[104:107], v[126:129], v[26:29], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[108:111], v[126:129], v[30:33], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[170:173], v[17], v[0], v[0], 0, 64, 3)) + k.emit(v_mfma_16x16x32(v[112:115], v[126:129], v[34:37], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[8], s[78])) + k.emit(v_perm_b32_e64(v[60], v[103], v[99], s[85])) + k.emit(v_perm_b32_e64(v[61], v[111], v[107], s[85])) + k.emit(v_mfma_16x16x32(v[116:119], v[126:129], v[38:41], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cselect_b32(s[88], s[81], s[84])) + k.emit(v_perm_b32_e64(v[62], v[87], v[83], s[86])) + k.emit(v_perm_b32_e64(v[63], v[95], v[91], s[86])) + k.emit(v_mfma_16x16x32(v[120:123], v[126:129], v[42:45], 
v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cselect_b32(s[89], s[82], 0)) + k.emit(ds_read_b128(v[174:177], v[17], v[0], v[0], 0, 192, 3)) + k.emit(v_mfma_16x16x32(v[124:127], v[126:129], v[46:49], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_add_u32(s[72], s[72], s[88])) + k.emit(v_perm_b32_e64(v[64], v[103], v[99], s[86])) + k.emit(v_perm_b32_e64(v[65], v[111], v[107], s[86])) + k.emit(v_mfma_16x16x32(v[128:131], v[130:133], v[18:21], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_addc_u32(s[73], s[73], s[89])) + k.emit(v_perm_b32_e64(v[66], v[88], v[84], s[85])) + k.emit(v_perm_b32_e64(v[67], v[96], v[92], s[85])) + k.emit(v_mfma_16x16x32(v[132:135], v[130:133], v[22:25], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_sub_u32(s[76], s[76], s[88])) + k.emit(v_perm_b32_e64(v[68], v[104], v[100], s[85])) + k.emit(v_perm_b32_e64(v[69], v[112], v[108], s[85])) + k.emit(v_mfma_16x16x32(v[136:139], v[130:133], v[26:29], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_subb_u32(s[77], s[77], s[89])) + k.emit(v_perm_b32_e64(v[70], v[88], v[84], s[86])) + k.emit(v_perm_b32_e64(v[71], v[96], v[92], s[86])) + k.emit(v_mfma_16x16x32(v[140:143], v[130:133], v[30:33], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_cmp_eq_u32(s[77], 0)) + k.emit(v_perm_b32_e64(v[72], v[104], v[100], s[86])) + k.emit(v_perm_b32_e64(v[73], v[112], v[108], s[86])) + k.emit(v_mfma_16x16x32(v[144:147], v[130:133], v[34:37], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=0) + k.emit(s_cselect_b32(s[74], s[76], -1)) + k.emit(v_perm_b32_e64(v[74], v[89], v[85], s[85])) + k.emit(v_perm_b32_e64(v[75], v[97], v[93], s[85])) + k.emit(v_mfma_16x16x32(v[148:151], v[130:133], v[38:41], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[76], v[105], v[101], s[85])) + k.emit(v_perm_b32_e64(v[77], v[113], v[109], s[85])) + k.emit(v_mfma_16x16x32(v[152:155], v[130:133], v[42:45], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[78], v[89], 
v[85], s[86])) + k.emit(v_perm_b32_e64(v[79], v[97], v[93], s[86])) + k.emit(v_mfma_16x16x32(v[156:159], v[130:133], v[46:49], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[80], v[105], v[101], s[86])) + k.emit(v_perm_b32_e64(v[81], v[113], v[109], s[86])) + k.emit(v_mfma_16x16x32(v[160:163], v[134:137], v[18:21], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[164:167], v[134:137], v[22:25], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[168:171], v[134:137], v[26:29], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[172:175], v[134:137], v[30:33], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[176:179], v[134:137], v[34:37], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_mfma_16x16x32(v[180:183], v[134:137], v[38:41], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[184:187], v[134:137], v[42:45], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[188:191], v[134:137], v[46:49], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[192:195], v[138:141], v[18:21], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[196:199], v[138:141], v[22:25], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[200:203], v[138:141], v[26:29], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[204:207], v[138:141], v[30:33], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[208:211], v[138:141], v[34:37], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[212:215], v[138:141], v[38:41], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[216:219], v[138:141], v[42:45], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[220:223], v[138:141], v[46:49], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[224:227], v[142:145], v[18:21], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + 
k.waitcnt(vm=17) + k.emit(v_mfma_16x16x32(v[228:231], v[142:145], v[22:25], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[232:235], v[142:145], v[26:29], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[236:239], v[142:145], v[30:33], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_xor_b32_e32(v[16], v[178], v[16])) + k.emit(v_xor_b32_e32(v[17], v[179], v[17])) + k.emit(v_mfma_16x16x32(v[240:243], v[142:145], v[34:37], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[82:85], v[16])) + k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 2)) + k.emit(v_mfma_16x16x32(v[244:247], v[142:145], v[38:41], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[248:251], v[142:145], v[42:45], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 6)) + k.emit(v_mfma_16x16x32(v[252:255], v[142:145], v[46:49], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[0:3], v[146:149], v[50:53], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 8)) + k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 10)) + k.emit(v_mfma_16x16x32(v[4:7], v[146:149], v[54:57], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[8:11], v[146:149], v[58:61], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(vm=9) + k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 12)) + k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 14)) + k.emit(v_mfma_16x16x32(v[12:15], v[146:149], v[62:65], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[16:19], v[146:149], v[66:69], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(ds_read_b128(v[114:117], v[17])) + k.emit(v_mfma_16x16x32(v[20:23], v[146:149], v[70:73], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[24:27], v[146:149], v[74:77], 
v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[118:121], v[17], v[0], v[0], 0, 128)) + k.emit(v_mfma_16x16x32(v[28:31], v[146:149], v[78:81], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[32:35], v[150:153], v[50:53], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[122:125], v[17], v[0], v[0], 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[36:39], v[150:153], v[54:57], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=4) + k.emit(v_mfma_16x16x32(v[40:43], v[150:153], v[58:61], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[126:129], v[17], v[0], v[0], 0, 128, 1)) + k.emit(v_mfma_16x16x32(v[44:47], v[150:153], v[62:65], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[18], v[86], v[82], s[85])) + k.emit(v_perm_b32_e64(v[19], v[94], v[90], s[85])) + k.emit(v_mfma_16x16x32(v[48:51], v[150:153], v[66:69], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[130:133], v[17], v[0], v[0], 0, 0, 2)) + k.emit(v_mfma_16x16x32(v[52:55], v[150:153], v[70:73], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[20], v[102], v[98], s[85])) + k.emit(v_perm_b32_e64(v[21], v[110], v[106], s[85])) + k.emit(v_mfma_16x16x32(v[56:59], v[150:153], v[74:77], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[134:137], v[17], v[0], v[0], 0, 128, 2)) + k.emit(v_mfma_16x16x32(v[60:63], v[150:153], v[78:81], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[22], v[86], v[82], s[86])) + k.emit(v_perm_b32_e64(v[23], v[94], v[90], s[86])) + k.emit(v_mfma_16x16x32(v[64:67], v[154:157], v[50:53], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[138:141], v[17], v[0], v[0], 0, 0, 3)) + k.emit(v_mfma_16x16x32(v[68:71], v[154:157], v[54:57], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[24], v[102], v[98], s[86])) + k.emit(v_perm_b32_e64(v[25], v[110], v[106], s[86])) + k.emit(v_mfma_16x16x32(v[72:75], v[154:157], v[58:61], v[72:75], 0, 0, 1, 0, 0, 0, 
0, 0, 1)) + k.emit(ds_read_b128(v[142:145], v[17], v[0], v[0], 0, 128, 3)) + k.emit(v_mfma_16x16x32(v[76:79], v[154:157], v[62:65], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[26], v[87], v[83], s[85])) + k.emit(v_perm_b32_e64(v[27], v[95], v[91], s[85])) + k.emit(v_mfma_16x16x32(v[80:83], v[154:157], v[66:69], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[28], v[103], v[99], s[85])) + k.emit(v_perm_b32_e64(v[29], v[111], v[107], s[85])) + k.emit(v_mfma_16x16x32(v[84:87], v[154:157], v[70:73], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[30], v[87], v[83], s[86])) + k.emit(v_perm_b32_e64(v[31], v[95], v[91], s[86])) + k.emit(v_mfma_16x16x32(v[88:91], v[154:157], v[74:77], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[32], v[103], v[99], s[86])) + k.emit(v_perm_b32_e64(v[33], v[111], v[107], s[86])) + k.emit(v_mfma_16x16x32(v[92:95], v[154:157], v[78:81], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[96:99], v[158:161], v[50:53], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[34], v[88], v[84], s[85])) + k.emit(v_perm_b32_e64(v[35], v[96], v[92], s[85])) + k.emit(v_mfma_16x16x32(v[100:103], v[158:161], v[54:57], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[36], v[104], v[100], s[85])) + k.emit(v_perm_b32_e64(v[37], v[112], v[108], s[85])) + k.emit(v_mfma_16x16x32(v[104:107], v[158:161], v[58:61], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[108:111], v[158:161], v[62:65], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[38], v[88], v[84], s[86])) + k.emit(v_perm_b32_e64(v[39], v[96], v[92], s[86])) + k.emit(v_mfma_16x16x32(v[112:115], v[158:161], v[66:69], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[40], v[104], v[100], s[86])) + k.emit(v_perm_b32_e64(v[41], v[112], v[108], s[86])) + k.emit(v_mfma_16x16x32(v[116:119], v[158:161], v[70:73], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 
1)) + k.emit(v_mfma_16x16x32(v[120:123], v[158:161], v[74:77], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[42], v[89], v[85], s[85])) + k.emit(v_perm_b32_e64(v[43], v[97], v[93], s[85])) + k.emit(v_mfma_16x16x32(v[124:127], v[158:161], v[78:81], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[44], v[105], v[101], s[85])) + k.emit(v_perm_b32_e64(v[45], v[113], v[109], s[85])) + k.emit(v_mfma_16x16x32(v[128:131], v[162:165], v[50:53], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[132:135], v[162:165], v[54:57], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[46], v[89], v[85], s[86])) + k.emit(v_perm_b32_e64(v[47], v[97], v[93], s[86])) + k.emit(v_mfma_16x16x32(v[136:139], v[162:165], v[58:61], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[48], v[105], v[101], s[86])) + k.emit(v_perm_b32_e64(v[49], v[113], v[109], s[86])) + k.emit(v_mfma_16x16x32(v[140:143], v[162:165], v[62:65], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[144:147], v[162:165], v[66:69], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[148:151], v[162:165], v[70:73], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[152:155], v[162:165], v[74:77], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[156:159], v[162:165], v[78:81], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[160:163], v[166:169], v[50:53], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[164:167], v[166:169], v[54:57], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[168:171], v[166:169], v[58:61], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[172:175], v[166:169], v[62:65], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[176:179], v[166:169], v[66:69], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[180:183], v[166:169], v[70:73], v[180:183], 0, 0, 
1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[184:187], v[166:169], v[74:77], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[188:191], v[166:169], v[78:81], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[192:195], v[170:173], v[50:53], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[196:199], v[170:173], v[54:57], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[200:203], v[170:173], v[58:61], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[204:207], v[170:173], v[62:65], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[208:211], v[170:173], v[66:69], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[212:215], v[170:173], v[70:73], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[216:219], v[170:173], v[74:77], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[220:223], v[170:173], v[78:81], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[224:227], v[174:177], v[50:53], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[228:231], v[174:177], v[54:57], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[232:235], v[174:177], v[58:61], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[236:239], v[174:177], v[62:65], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[240:243], v[174:177], v[66:69], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[244:247], v[174:177], v[70:73], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[248:251], v[174:177], v[74:77], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[252:255], v[174:177], v[78:81], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.label('toPGR1') + k.emit(s_waitcnt()) + k.emit(s_barrier()) + k.emit(v_mfma_16x16x32(v[0:3], v[114:117], v[18:21], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[82:85], v[16], 
v[0], v[0], 0, 0, 64)) + k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 66)) + k.emit(v_mfma_16x16x32(v[4:7], v[114:117], v[22:25], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[8:11], v[114:117], v[26:29], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 68)) + k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 70)) + k.emit(v_mfma_16x16x32(v[12:15], v[114:117], v[30:33], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[16:19], v[114:117], v[34:37], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 72)) + k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 74)) + k.emit(v_mfma_16x16x32(v[20:23], v[114:117], v[38:41], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[24:27], v[114:117], v[42:45], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 76)) + k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 78)) + k.emit(v_mfma_16x16x32(v[28:31], v[114:117], v[46:49], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[32:35], v[118:121], v[18:21], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=4) + k.emit(v_perm_b32_e64(v[50], v[86], v[82], s[85])) + k.emit(v_perm_b32_e64(v[51], v[94], v[90], s[85])) + k.emit(v_mfma_16x16x32(v[36:39], v[118:121], v[22:25], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[146:149], v[17], v[0], v[0], 0, 64)) + k.emit(v_mfma_16x16x32(v[40:43], v[118:121], v[26:29], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[44:47], v[118:121], v[30:33], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[150:153], v[17], v[0], v[0], 0, 192)) + k.emit(v_mfma_16x16x32(v[48:51], v[118:121], v[34:37], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=1) + k.emit(v_mfma_16x16x32(v[52:55], v[118:121], v[38:41], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + 
k.emit(v_mfma_16x16x32(v[56:59], v[118:121], v[42:45], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[60:63], v[118:121], v[46:49], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[154:157], v[17], v[0], v[0], 0, 64, 1)) + k.emit(v_mfma_16x16x32(v[64:67], v[122:125], v[18:21], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[52], v[102], v[98], s[85])) + k.emit(v_perm_b32_e64(v[53], v[110], v[106], s[85])) + k.emit(v_mfma_16x16x32(v[68:71], v[122:125], v[22:25], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[72:75], v[122:125], v[26:29], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[158:161], v[17], v[0], v[0], 0, 192, 1)) + k.emit(v_mfma_16x16x32(v[76:79], v[122:125], v[30:33], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[54], v[86], v[82], s[86])) + k.emit(v_perm_b32_e64(v[55], v[94], v[90], s[86])) + k.emit(v_mfma_16x16x32(v[80:83], v[122:125], v[34:37], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[84:87], v[122:125], v[38:41], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[162:165], v[17], v[0], v[0], 0, 64, 2)) + k.emit(v_mfma_16x16x32(v[88:91], v[122:125], v[42:45], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[56], v[102], v[98], s[86])) + k.emit(v_perm_b32_e64(v[57], v[110], v[106], s[86])) + k.emit(v_mfma_16x16x32(v[92:95], v[122:125], v[46:49], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[96:99], v[126:129], v[18:21], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[166:169], v[17], v[0], v[0], 0, 192, 2)) + k.emit(v_mfma_16x16x32(v[100:103], v[126:129], v[22:25], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[58], v[87], v[83], s[85])) + k.emit(v_perm_b32_e64(v[59], v[95], v[91], s[85])) + k.emit(v_mfma_16x16x32(v[104:107], v[126:129], v[26:29], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[108:111], v[126:129], v[30:33], v[108:111], 
0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[170:173], v[17], v[0], v[0], 0, 64, 3)) + k.emit(v_mfma_16x16x32(v[112:115], v[126:129], v[34:37], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[60], v[103], v[99], s[85])) + k.emit(v_perm_b32_e64(v[61], v[111], v[107], s[85])) + k.emit(v_mfma_16x16x32(v[116:119], v[126:129], v[38:41], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[62], v[87], v[83], s[86])) + k.emit(v_perm_b32_e64(v[63], v[95], v[91], s[86])) + k.emit(v_mfma_16x16x32(v[120:123], v[126:129], v[42:45], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(ds_read_b128(v[174:177], v[17], v[0], v[0], 0, 192, 3)) + k.emit(v_mfma_16x16x32(v[124:127], v[126:129], v[46:49], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[64], v[103], v[99], s[86])) + k.emit(v_perm_b32_e64(v[65], v[111], v[107], s[86])) + k.emit(v_mfma_16x16x32(v[128:131], v[130:133], v[18:21], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[66], v[88], v[84], s[85])) + k.emit(v_perm_b32_e64(v[67], v[96], v[92], s[85])) + k.emit(v_mfma_16x16x32(v[132:135], v[130:133], v[22:25], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[68], v[104], v[100], s[85])) + k.emit(v_perm_b32_e64(v[69], v[112], v[108], s[85])) + k.emit(v_mfma_16x16x32(v[136:139], v[130:133], v[26:29], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[70], v[88], v[84], s[86])) + k.emit(v_perm_b32_e64(v[71], v[96], v[92], s[86])) + k.emit(v_mfma_16x16x32(v[140:143], v[130:133], v[30:33], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[72], v[104], v[100], s[86])) + k.emit(v_perm_b32_e64(v[73], v[112], v[108], s[86])) + k.emit(v_mfma_16x16x32(v[144:147], v[130:133], v[34:37], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(lgkm=0) + k.emit(v_perm_b32_e64(v[74], v[89], v[85], s[85])) + k.emit(v_perm_b32_e64(v[75], v[97], v[93], s[85])) + k.emit(v_mfma_16x16x32(v[148:151], v[130:133], v[38:41], 
v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[76], v[105], v[101], s[85])) + k.emit(v_perm_b32_e64(v[77], v[113], v[109], s[85])) + k.emit(v_mfma_16x16x32(v[152:155], v[130:133], v[42:45], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[78], v[89], v[85], s[86])) + k.emit(v_perm_b32_e64(v[79], v[97], v[93], s[86])) + k.emit(v_mfma_16x16x32(v[156:159], v[130:133], v[46:49], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[80], v[105], v[101], s[86])) + k.emit(v_perm_b32_e64(v[81], v[113], v[109], s[86])) + k.emit(v_mfma_16x16x32(v[160:163], v[134:137], v[18:21], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[164:167], v[134:137], v[22:25], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[168:171], v[134:137], v[26:29], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[172:175], v[134:137], v[30:33], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[176:179], v[134:137], v[34:37], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_mfma_16x16x32(v[180:183], v[134:137], v[38:41], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[184:187], v[134:137], v[42:45], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[188:191], v[134:137], v[46:49], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[192:195], v[138:141], v[18:21], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[196:199], v[138:141], v[22:25], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[200:203], v[138:141], v[26:29], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[204:207], v[138:141], v[30:33], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[208:211], v[138:141], v[34:37], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[212:215], v[138:141], v[38:41], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + 
k.emit(v_mfma_16x16x32(v[216:219], v[138:141], v[42:45], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[220:223], v[138:141], v[46:49], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[224:227], v[142:145], v[18:21], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(vm=17) + k.emit(v_mfma_16x16x32(v[228:231], v[142:145], v[22:25], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[232:235], v[142:145], v[26:29], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[236:239], v[142:145], v[30:33], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_mfma_16x16x32(v[240:243], v[142:145], v[34:37], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[244:247], v[142:145], v[38:41], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[248:251], v[142:145], v[42:45], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[252:255], v[142:145], v[46:49], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[0:3], v[146:149], v[50:53], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[4:7], v[146:149], v[54:57], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[8:11], v[146:149], v[58:61], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.waitcnt(vm=9) + k.emit(v_mfma_16x16x32(v[12:15], v[146:149], v[62:65], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[16:19], v[146:149], v[66:69], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_barrier()) + k.emit(v_mfma_16x16x32(v[20:23], v[146:149], v[70:73], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[24:27], v[146:149], v[74:77], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[28:31], v[146:149], v[78:81], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[32:35], v[150:153], v[50:53], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[36:39], v[150:153], v[54:57], v[36:39], 0, 0, 1, 0, 0, 0, 
0, 0, 1)) + k.waitcnt(lgkm=4) + k.emit(v_mfma_16x16x32(v[40:43], v[150:153], v[58:61], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[44:47], v[150:153], v[62:65], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[18], v[86], v[82], s[85])) + k.emit(v_perm_b32_e64(v[19], v[94], v[90], s[85])) + k.emit(v_mfma_16x16x32(v[48:51], v[150:153], v[66:69], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[52:55], v[150:153], v[70:73], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[20], v[102], v[98], s[85])) + k.emit(v_perm_b32_e64(v[21], v[110], v[106], s[85])) + k.emit(v_mfma_16x16x32(v[56:59], v[150:153], v[74:77], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[60:63], v[150:153], v[78:81], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[22], v[86], v[82], s[86])) + k.emit(v_perm_b32_e64(v[23], v[94], v[90], s[86])) + k.emit(v_mfma_16x16x32(v[64:67], v[154:157], v[50:53], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[68:71], v[154:157], v[54:57], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[24], v[102], v[98], s[86])) + k.emit(v_perm_b32_e64(v[25], v[110], v[106], s[86])) + k.emit(v_mfma_16x16x32(v[72:75], v[154:157], v[58:61], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[76:79], v[154:157], v[62:65], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[26], v[87], v[83], s[85])) + k.emit(v_perm_b32_e64(v[27], v[95], v[91], s[85])) + k.emit(v_mfma_16x16x32(v[80:83], v[154:157], v[66:69], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[28], v[103], v[99], s[85])) + k.emit(v_perm_b32_e64(v[29], v[111], v[107], s[85])) + k.emit(v_mfma_16x16x32(v[84:87], v[154:157], v[70:73], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[30], v[87], v[83], s[86])) + k.emit(v_perm_b32_e64(v[31], v[95], v[91], s[86])) + k.emit(v_mfma_16x16x32(v[88:91], v[154:157], v[74:77], v[88:91], 0, 0, 1, 0, 
0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[32], v[103], v[99], s[86])) + k.emit(v_perm_b32_e64(v[33], v[111], v[107], s[86])) + k.emit(v_mfma_16x16x32(v[92:95], v[154:157], v[78:81], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[96:99], v[158:161], v[50:53], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[34], v[88], v[84], s[85])) + k.emit(v_perm_b32_e64(v[35], v[96], v[92], s[85])) + k.emit(v_mfma_16x16x32(v[100:103], v[158:161], v[54:57], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[36], v[104], v[100], s[85])) + k.emit(v_perm_b32_e64(v[37], v[112], v[108], s[85])) + k.emit(v_mfma_16x16x32(v[104:107], v[158:161], v[58:61], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[108:111], v[158:161], v[62:65], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[38], v[88], v[84], s[86])) + k.emit(v_perm_b32_e64(v[39], v[96], v[92], s[86])) + k.emit(v_mfma_16x16x32(v[112:115], v[158:161], v[66:69], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[40], v[104], v[100], s[86])) + k.emit(v_perm_b32_e64(v[41], v[112], v[108], s[86])) + k.emit(v_mfma_16x16x32(v[116:119], v[158:161], v[70:73], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[120:123], v[158:161], v[74:77], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[42], v[89], v[85], s[85])) + k.emit(v_perm_b32_e64(v[43], v[97], v[93], s[85])) + k.emit(v_mfma_16x16x32(v[124:127], v[158:161], v[78:81], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[44], v[105], v[101], s[85])) + k.emit(v_perm_b32_e64(v[45], v[113], v[109], s[85])) + k.emit(v_mfma_16x16x32(v[128:131], v[162:165], v[50:53], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[132:135], v[162:165], v[54:57], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[46], v[89], v[85], s[86])) + k.emit(v_perm_b32_e64(v[47], v[97], v[93], s[86])) + k.emit(v_mfma_16x16x32(v[136:139], 
v[162:165], v[58:61], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_perm_b32_e64(v[48], v[105], v[101], s[86])) + k.emit(v_perm_b32_e64(v[49], v[113], v[109], s[86])) + k.emit(v_mfma_16x16x32(v[140:143], v[162:165], v[62:65], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[144:147], v[162:165], v[66:69], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[148:151], v[162:165], v[70:73], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[152:155], v[162:165], v[74:77], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[156:159], v[162:165], v[78:81], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[160:163], v[166:169], v[50:53], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[164:167], v[166:169], v[54:57], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[168:171], v[166:169], v[58:61], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[172:175], v[166:169], v[62:65], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[176:179], v[166:169], v[66:69], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[180:183], v[166:169], v[70:73], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[184:187], v[166:169], v[74:77], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[188:191], v[166:169], v[78:81], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[192:195], v[170:173], v[50:53], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[196:199], v[170:173], v[54:57], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[200:203], v[170:173], v[58:61], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[204:207], v[170:173], v[62:65], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[208:211], v[170:173], v[66:69], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[212:215], 
v[170:173], v[70:73], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[216:219], v[170:173], v[74:77], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[220:223], v[170:173], v[78:81], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[224:227], v[174:177], v[50:53], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[228:231], v[174:177], v[54:57], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[232:235], v[174:177], v[58:61], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[236:239], v[174:177], v[62:65], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[240:243], v[174:177], v[66:69], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[244:247], v[174:177], v[70:73], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[248:251], v[174:177], v[74:77], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[252:255], v[174:177], v[78:81], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.label('toPGR1end_OrdNLL') + k.emit(s_xor_b32(s[87], s[55], s[53])) + k.emit(s_min_u32(s[53], s[53], s[87])) + k.emit(s_xor_b32(s[87], s[56], s[54])) + k.emit(s_min_u32(s[54], s[54], s[87])) + k.emit(s_and_b32(s[8], 63, s[23])) + k.emit(s_cmp_lt_u32(s[61], s[46])) + k.emit(s_cmov_b32(s[8], 0)) + k.emit(s_cmp_eq_u32(s[8], 0)) + k.emit(s_mov_b32(s[9], 0)) + k.emit(s_cbranch_scc1(), target='SkipTailLoopL') + k.emit(s_sub_i32(s[88], 3, s[78])) + k.emit(s_cmp_ge_i32(s[88], 0)) + k.emit(s_cbranch_scc0(), target='Negative_LHNOKZ26V2FLOONQ') + k.emit(s_mul_hi_u32(s[89], s[88], s[83])) + k.emit(s_mul_i32(s[88], s[88], s[83])) + k.emit(s_branch(), target='MultiplyDone_L9DK3KJL31S8WWGN') + k.label('Negative_LHNOKZ26V2FLOONQ') + k.emit(s_abs_i32(s[88], s[88])) + k.emit(s_mul_hi_u32(s[89], s[88], s[83])) + k.emit(s_mul_i32(s[88], s[88], s[83])) + k.emit(s_xor_b32(s[88], s[88], -1)) + k.emit(s_xor_b32(s[89], s[89], -1)) + 
k.emit(s_add_u32(s[88], s[88], 1)) + k.emit(s_addc_u32(s[89], s[89], 0)) + k.label('MultiplyDone_L9DK3KJL31S8WWGN') + k.emit(s_sub_u32(s[88], s[88], s[79])) + k.emit(s_subb_u32(s[89], s[89], s[80])) + k.emit(s_add_u32(s[68], s[68], s[88])) + k.emit(s_addc_u32(s[69], s[69], s[89])) + k.emit(s_sub_u32(s[62], s[62], s[88])) + k.emit(s_subb_u32(s[63], s[63], s[89])) + k.emit(s_cmp_eq_u32(s[63], 0)) + k.emit(s_cselect_b32(s[70], s[62], -1)) + k.emit(s_sub_i32(s[88], 3, s[78])) + k.emit(s_cmp_ge_i32(s[88], 0)) + k.emit(s_cbranch_scc0(), target='Negative_3U2TZUPK3AVX5ODG') + k.emit(s_mul_hi_u32(s[89], s[88], s[84])) + k.emit(s_mul_i32(s[88], s[88], s[84])) + k.emit(s_branch(), target='MultiplyDone_NW6XNGOG77EAT0NM') + k.label('Negative_3U2TZUPK3AVX5ODG') + k.emit(s_abs_i32(s[88], s[88])) + k.emit(s_mul_hi_u32(s[89], s[88], s[84])) + k.emit(s_mul_i32(s[88], s[88], s[84])) + k.emit(s_xor_b32(s[88], s[88], -1)) + k.emit(s_xor_b32(s[89], s[89], -1)) + k.emit(s_add_u32(s[88], s[88], 1)) + k.emit(s_addc_u32(s[89], s[89], 0)) + k.label('MultiplyDone_NW6XNGOG77EAT0NM') + k.emit(s_sub_u32(s[88], s[88], s[81])) + k.emit(s_subb_u32(s[89], s[89], s[82])) + k.emit(s_add_u32(s[72], s[72], s[88])) + k.emit(s_addc_u32(s[73], s[73], s[89])) + k.emit(s_sub_u32(s[76], s[76], s[88])) + k.emit(s_subb_u32(s[77], s[77], s[89])) + k.emit(s_cmp_eq_u32(s[77], 0)) + k.emit(s_cselect_b32(s[74], s[76], -1)) + k.emit(s_mov_b32(M0, s[53])) + k.waitcnt(lgkm=0) + k.emit(s_barrier()) + k.emit(buffer_load_short_d16(v[18], v[0], s[68:71], 0, 0, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[84], v[0], s[68:71], 0, 2, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[19], v[0], s[68:71], 0, 4, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[85], v[0], s[68:71], 0, 6, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[20], v[0], s[68:71], 0, 8, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[86], v[0], s[68:71], 0, 10, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[21], v[0], s[68:71], 0, 12, 1, 0, 0, 1)) + 
k.emit(buffer_load_short_d16_hi(v[87], v[0], s[68:71], 0, 14, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[22], v[1], s[68:71], 0, 0, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[88], v[1], s[68:71], 0, 2, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[23], v[1], s[68:71], 0, 4, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[89], v[1], s[68:71], 0, 6, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[24], v[1], s[68:71], 0, 8, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[90], v[1], s[68:71], 0, 10, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[25], v[1], s[68:71], 0, 12, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[91], v[1], s[68:71], 0, 14, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[26], v[2], s[68:71], 0, 0, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[92], v[2], s[68:71], 0, 2, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[27], v[2], s[68:71], 0, 4, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[93], v[2], s[68:71], 0, 6, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[28], v[2], s[68:71], 0, 8, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[94], v[2], s[68:71], 0, 10, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[29], v[2], s[68:71], 0, 12, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[95], v[2], s[68:71], 0, 14, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[30], v[3], s[68:71], 0, 0, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[96], v[3], s[68:71], 0, 2, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[31], v[3], s[68:71], 0, 4, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[97], v[3], s[68:71], 0, 6, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[32], v[3], s[68:71], 0, 8, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[98], v[3], s[68:71], 0, 10, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[33], v[3], s[68:71], 0, 12, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[99], v[3], s[68:71], 0, 14, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[34], v[4], s[68:71], 0, 0, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[100], v[4], 
s[68:71], 0, 2, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[35], v[4], s[68:71], 0, 4, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[101], v[4], s[68:71], 0, 6, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[36], v[4], s[68:71], 0, 8, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[102], v[4], s[68:71], 0, 10, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[37], v[4], s[68:71], 0, 12, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[103], v[4], s[68:71], 0, 14, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[38], v[5], s[68:71], 0, 0, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[104], v[5], s[68:71], 0, 2, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[39], v[5], s[68:71], 0, 4, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[105], v[5], s[68:71], 0, 6, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[40], v[5], s[68:71], 0, 8, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[106], v[5], s[68:71], 0, 10, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[41], v[5], s[68:71], 0, 12, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[107], v[5], s[68:71], 0, 14, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[42], v[6], s[68:71], 0, 0, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[108], v[6], s[68:71], 0, 2, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[43], v[6], s[68:71], 0, 4, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[109], v[6], s[68:71], 0, 6, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[44], v[6], s[68:71], 0, 8, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[110], v[6], s[68:71], 0, 10, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[45], v[6], s[68:71], 0, 12, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[111], v[6], s[68:71], 0, 14, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[46], v[7], s[68:71], 0, 0, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[112], v[7], s[68:71], 0, 2, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[47], v[7], s[68:71], 0, 4, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[113], v[7], s[68:71], 0, 6, 1, 0, 0, 1)) + 
k.emit(buffer_load_short_d16(v[48], v[7], s[68:71], 0, 8, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[114], v[7], s[68:71], 0, 10, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16(v[49], v[7], s[68:71], 0, 12, 1, 0, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[115], v[7], s[68:71], 0, 14, 1, 0, 0, 1)) + k.waitcnt(vm=0) + k.emit(v_or_b32_e32(v[18], v[18], v[84])) + k.emit(v_or_b32_e32(v[19], v[19], v[85])) + k.emit(v_or_b32_e32(v[20], v[20], v[86])) + k.emit(v_or_b32_e32(v[21], v[21], v[87])) + k.emit(v_or_b32_e32(v[22], v[22], v[88])) + k.emit(v_or_b32_e32(v[23], v[23], v[89])) + k.emit(v_or_b32_e32(v[24], v[24], v[90])) + k.emit(v_or_b32_e32(v[25], v[25], v[91])) + k.emit(v_or_b32_e32(v[26], v[26], v[92])) + k.emit(v_or_b32_e32(v[27], v[27], v[93])) + k.emit(v_or_b32_e32(v[28], v[28], v[94])) + k.emit(v_or_b32_e32(v[29], v[29], v[95])) + k.emit(v_or_b32_e32(v[30], v[30], v[96])) + k.emit(v_or_b32_e32(v[31], v[31], v[97])) + k.emit(v_or_b32_e32(v[32], v[32], v[98])) + k.emit(v_or_b32_e32(v[33], v[33], v[99])) + k.emit(v_or_b32_e32(v[34], v[34], v[100])) + k.emit(v_or_b32_e32(v[35], v[35], v[101])) + k.emit(v_or_b32_e32(v[36], v[36], v[102])) + k.emit(v_or_b32_e32(v[37], v[37], v[103])) + k.emit(v_or_b32_e32(v[38], v[38], v[104])) + k.emit(v_or_b32_e32(v[39], v[39], v[105])) + k.emit(v_or_b32_e32(v[40], v[40], v[106])) + k.emit(v_or_b32_e32(v[41], v[41], v[107])) + k.emit(v_or_b32_e32(v[42], v[42], v[108])) + k.emit(v_or_b32_e32(v[43], v[43], v[109])) + k.emit(v_or_b32_e32(v[44], v[44], v[110])) + k.emit(v_or_b32_e32(v[45], v[45], v[111])) + k.emit(v_or_b32_e32(v[46], v[46], v[112])) + k.emit(v_or_b32_e32(v[47], v[47], v[113])) + k.emit(v_or_b32_e32(v[48], v[48], v[114])) + k.emit(v_or_b32_e32(v[49], v[49], v[115])) + k.emit(s_mov_b32(M0, 133120)) + k.emit(s_mov_b32(M0, s[54])) + k.emit(buffer_load_short_d16(v[50], v[8], s[72:75], 0, 0, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[84], v[8], s[72:75], 0, 2, 1, 0, 0, 0, 1, 0, 1)) + 
k.emit(buffer_load_short_d16(v[51], v[8], s[72:75], 0, 4, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[85], v[8], s[72:75], 0, 6, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[52], v[8], s[72:75], 0, 8, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[86], v[8], s[72:75], 0, 10, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[53], v[8], s[72:75], 0, 12, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[87], v[8], s[72:75], 0, 14, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[54], v[9], s[72:75], 0, 0, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[88], v[9], s[72:75], 0, 2, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[55], v[9], s[72:75], 0, 4, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[89], v[9], s[72:75], 0, 6, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[56], v[9], s[72:75], 0, 8, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[90], v[9], s[72:75], 0, 10, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[57], v[9], s[72:75], 0, 12, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[91], v[9], s[72:75], 0, 14, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[58], v[10], s[72:75], 0, 0, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[92], v[10], s[72:75], 0, 2, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[59], v[10], s[72:75], 0, 4, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[93], v[10], s[72:75], 0, 6, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[60], v[10], s[72:75], 0, 8, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[94], v[10], s[72:75], 0, 10, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[61], v[10], s[72:75], 0, 12, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[95], v[10], s[72:75], 0, 14, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[62], v[11], s[72:75], 0, 0, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[96], v[11], s[72:75], 0, 2, 
1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[63], v[11], s[72:75], 0, 4, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[97], v[11], s[72:75], 0, 6, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[64], v[11], s[72:75], 0, 8, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[98], v[11], s[72:75], 0, 10, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[65], v[11], s[72:75], 0, 12, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[99], v[11], s[72:75], 0, 14, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[66], v[12], s[72:75], 0, 0, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[100], v[12], s[72:75], 0, 2, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[67], v[12], s[72:75], 0, 4, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[101], v[12], s[72:75], 0, 6, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[68], v[12], s[72:75], 0, 8, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[102], v[12], s[72:75], 0, 10, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[69], v[12], s[72:75], 0, 12, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[103], v[12], s[72:75], 0, 14, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[70], v[13], s[72:75], 0, 0, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[104], v[13], s[72:75], 0, 2, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[71], v[13], s[72:75], 0, 4, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[105], v[13], s[72:75], 0, 6, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[72], v[13], s[72:75], 0, 8, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[106], v[13], s[72:75], 0, 10, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[73], v[13], s[72:75], 0, 12, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[107], v[13], s[72:75], 0, 14, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[74], v[14], s[72:75], 0, 0, 1, 0, 0, 0, 1, 0, 1)) + 
k.emit(buffer_load_short_d16_hi(v[108], v[14], s[72:75], 0, 2, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[75], v[14], s[72:75], 0, 4, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[109], v[14], s[72:75], 0, 6, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[76], v[14], s[72:75], 0, 8, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[110], v[14], s[72:75], 0, 10, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[77], v[14], s[72:75], 0, 12, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[111], v[14], s[72:75], 0, 14, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[78], v[15], s[72:75], 0, 0, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[112], v[15], s[72:75], 0, 2, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[79], v[15], s[72:75], 0, 4, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[113], v[15], s[72:75], 0, 6, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[80], v[15], s[72:75], 0, 8, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[114], v[15], s[72:75], 0, 10, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16(v[81], v[15], s[72:75], 0, 12, 1, 0, 0, 0, 1, 0, 1)) + k.emit(buffer_load_short_d16_hi(v[115], v[15], s[72:75], 0, 14, 1, 0, 0, 0, 1, 0, 1)) + k.waitcnt(vm=0) + k.emit(v_or_b32_e32(v[50], v[50], v[84])) + k.emit(v_or_b32_e32(v[51], v[51], v[85])) + k.emit(v_or_b32_e32(v[52], v[52], v[86])) + k.emit(v_or_b32_e32(v[53], v[53], v[87])) + k.emit(v_or_b32_e32(v[54], v[54], v[88])) + k.emit(v_or_b32_e32(v[55], v[55], v[89])) + k.emit(v_or_b32_e32(v[56], v[56], v[90])) + k.emit(v_or_b32_e32(v[57], v[57], v[91])) + k.emit(v_or_b32_e32(v[58], v[58], v[92])) + k.emit(v_or_b32_e32(v[59], v[59], v[93])) + k.emit(v_or_b32_e32(v[60], v[60], v[94])) + k.emit(v_or_b32_e32(v[61], v[61], v[95])) + k.emit(v_or_b32_e32(v[62], v[62], v[96])) + k.emit(v_or_b32_e32(v[63], v[63], v[97])) + k.emit(v_or_b32_e32(v[64], v[64], v[98])) + k.emit(v_or_b32_e32(v[65], v[65], 
v[99])) + k.emit(v_or_b32_e32(v[66], v[66], v[100])) + k.emit(v_or_b32_e32(v[67], v[67], v[101])) + k.emit(v_or_b32_e32(v[68], v[68], v[102])) + k.emit(v_or_b32_e32(v[69], v[69], v[103])) + k.emit(v_or_b32_e32(v[70], v[70], v[104])) + k.emit(v_or_b32_e32(v[71], v[71], v[105])) + k.emit(v_or_b32_e32(v[72], v[72], v[106])) + k.emit(v_or_b32_e32(v[73], v[73], v[107])) + k.emit(v_or_b32_e32(v[74], v[74], v[108])) + k.emit(v_or_b32_e32(v[75], v[75], v[109])) + k.emit(v_or_b32_e32(v[76], v[76], v[110])) + k.emit(v_or_b32_e32(v[77], v[77], v[111])) + k.emit(v_or_b32_e32(v[78], v[78], v[112])) + k.emit(v_or_b32_e32(v[79], v[79], v[113])) + k.emit(v_or_b32_e32(v[80], v[80], v[114])) + k.emit(v_or_b32_e32(v[81], v[81], v[115])) + k.emit(s_mov_b32(M0, 133120)) + k.waitcnt(vm=0) + k.emit(s_barrier()) + k.emit(v_and_b32_e32(v[82], 63, v[180])) + k.emit(v_lshlrev_b32_e32(v[82], 4, v[82])) + k.emit(v_add_u32_e32(v[82], s[53], v[82])) + k.emit(v_and_b32_e32(v[83], 63, v[180])) + k.emit(v_lshlrev_b32_e32(v[83], 4, v[83])) + k.emit(v_add_u32_e32(v[83], s[54], v[83])) + k.emit(ds_write_b128(v[0], v[82], v[18:21])) + k.emit(ds_write_b128(v[0], v[82], v[22:25], v[0], 0, 0, 16)) + k.emit(ds_write_b128(v[0], v[82], v[26:29], v[0], 0, 0, 32)) + k.emit(ds_write_b128(v[0], v[82], v[30:33], v[0], 0, 0, 48)) + k.emit(ds_write_b128(v[0], v[82], v[34:37], v[0], 0, 0, 64)) + k.emit(ds_write_b128(v[0], v[82], v[38:41], v[0], 0, 0, 80)) + k.emit(ds_write_b128(v[0], v[82], v[42:45], v[0], 0, 0, 96)) + k.emit(ds_write_b128(v[0], v[82], v[46:49], v[0], 0, 0, 112)) + k.emit(ds_write_b128(v[0], v[83], v[50:53])) + k.emit(ds_write_b128(v[0], v[83], v[54:57], v[0], 0, 128, 16)) + k.emit(ds_write_b128(v[0], v[83], v[58:61], v[0], 0, 0, 33)) + k.emit(ds_write_b128(v[0], v[83], v[62:65], v[0], 0, 128, 49)) + k.emit(ds_write_b128(v[0], v[83], v[66:69], v[0], 0, 0, 66)) + k.emit(ds_write_b128(v[0], v[83], v[70:73], v[0], 0, 128, 82)) + k.emit(ds_write_b128(v[0], v[83], v[74:77], v[0], 0, 0, 99)) + 
k.emit(ds_write_b128(v[0], v[83], v[78:81], v[0], 0, 128, 115)) + k.waitcnt(lgkm=0) + k.emit(s_barrier()) + k.emit(v_xor_b32_e32(v[181], v[178], v[16])) + k.emit(v_min_i32_e32(v[16], v[16], v[181])) + k.emit(v_xor_b32_e32(v[181], v[179], v[17])) + k.emit(v_min_i32_e32(v[17], v[17], v[181])) + k.label('TailLoopBeginL') + k.emit(ds_read_b128(v[82:85], v[16])) + k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 2)) + k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 6)) + k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 8)) + k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 10)) + k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 12)) + k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 14)) + k.emit(ds_read_b128(v[114:117], v[17])) + k.emit(ds_read_b128(v[118:121], v[17], v[0], v[0], 0, 128)) + k.emit(ds_read_b128(v[122:125], v[17], v[0], v[0], 0, 0, 1)) + k.emit(ds_read_b128(v[126:129], v[17], v[0], v[0], 0, 128, 1)) + k.emit(ds_read_b128(v[130:133], v[17], v[0], v[0], 0, 0, 2)) + k.emit(ds_read_b128(v[134:137], v[17], v[0], v[0], 0, 128, 2)) + k.emit(ds_read_b128(v[138:141], v[17], v[0], v[0], 0, 0, 3)) + k.emit(ds_read_b128(v[142:145], v[17], v[0], v[0], 0, 128, 3)) + k.emit(s_mov_b32(s[87], 16384)) + k.emit(v_add_co_u32_e32(v[16], s[87], v[16])) + k.emit(s_mov_b32(s[87], 64)) + k.emit(v_add_co_u32_e32(v[17], s[87], v[17])) + k.waitcnt(lgkm=0) + k.emit(v_perm_b32_e64(v[18], v[86], v[82], s[85])) + k.emit(v_perm_b32_e64(v[19], v[94], v[90], s[85])) + k.emit(v_perm_b32_e64(v[20], v[102], v[98], s[85])) + k.emit(v_perm_b32_e64(v[21], v[110], v[106], s[85])) + k.emit(v_perm_b32_e64(v[22], v[86], v[82], s[86])) + k.emit(v_perm_b32_e64(v[23], v[94], v[90], s[86])) + k.emit(v_perm_b32_e64(v[24], v[102], v[98], s[86])) + k.emit(v_perm_b32_e64(v[25], v[110], v[106], s[86])) + k.emit(v_perm_b32_e64(v[26], v[87], v[83], s[85])) + k.emit(v_perm_b32_e64(v[27], v[95], v[91], 
s[85])) + k.emit(v_perm_b32_e64(v[28], v[103], v[99], s[85])) + k.emit(v_perm_b32_e64(v[29], v[111], v[107], s[85])) + k.emit(v_perm_b32_e64(v[30], v[87], v[83], s[86])) + k.emit(v_perm_b32_e64(v[31], v[95], v[91], s[86])) + k.emit(v_perm_b32_e64(v[32], v[103], v[99], s[86])) + k.emit(v_perm_b32_e64(v[33], v[111], v[107], s[86])) + k.emit(v_perm_b32_e64(v[34], v[88], v[84], s[85])) + k.emit(v_perm_b32_e64(v[35], v[96], v[92], s[85])) + k.emit(v_perm_b32_e64(v[36], v[104], v[100], s[85])) + k.emit(v_perm_b32_e64(v[37], v[112], v[108], s[85])) + k.emit(v_perm_b32_e64(v[38], v[88], v[84], s[86])) + k.emit(v_perm_b32_e64(v[39], v[96], v[92], s[86])) + k.emit(v_perm_b32_e64(v[40], v[104], v[100], s[86])) + k.emit(v_perm_b32_e64(v[41], v[112], v[108], s[86])) + k.emit(v_perm_b32_e64(v[42], v[89], v[85], s[85])) + k.emit(v_perm_b32_e64(v[43], v[97], v[93], s[85])) + k.emit(v_perm_b32_e64(v[44], v[105], v[101], s[85])) + k.emit(v_perm_b32_e64(v[45], v[113], v[109], s[85])) + k.emit(v_perm_b32_e64(v[46], v[89], v[85], s[86])) + k.emit(v_perm_b32_e64(v[47], v[97], v[93], s[86])) + k.emit(v_perm_b32_e64(v[48], v[105], v[101], s[86])) + k.emit(v_perm_b32_e64(v[49], v[113], v[109], s[86])) + k.emit(v_and_b32_e32(v[181], 63, v[180])) + k.emit(v_lshrrev_b32_e32(v[181], 4, v[181])) + k.emit(v_lshlrev_b32_e32(v[181], 3, v[181])) + k.emit(v_add_u32_e64(v[182], v[181], 0)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[18], v[18], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[22], v[22], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[26], v[26], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[30], v[30], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[34], v[34], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[38], v[38], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[42], v[42], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[46], v[46], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[19], v[19], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[23], v[23], 0, s[88:89])) + 
k.emit(v_cndmask_b32_e64(v[27], v[27], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[31], v[31], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[35], v[35], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[39], v[39], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[43], v[43], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[47], v[47], 0, s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[20], v[20], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[24], v[24], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[28], v[28], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[32], v[32], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[36], v[36], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[40], v[40], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[44], v[44], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[48], v[48], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[21], v[21], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[25], v[25], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[29], v[29], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[33], v[33], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[37], v[37], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[41], v[41], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[45], v[45], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[49], v[49], 0, s[88:89])) + k.emit(v_and_b32_e32(v[181], 63, v[180])) + k.emit(v_lshrrev_b32_e32(v[181], 4, v[181])) + k.emit(v_lshlrev_b32_e32(v[181], 3, v[181])) + k.emit(v_add_u32_e64(v[182], v[181], 0)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[114], v[114], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[118], v[118], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[122], v[122], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[126], v[126], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[130], v[130], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[134], v[134], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[138], v[138], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[142], v[142], 0, s[88:89])) + 
k.emit(v_cndmask_b32_e64(v[115], v[115], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[119], v[119], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[123], v[123], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[127], v[127], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[131], v[131], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[135], v[135], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[139], v[139], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[143], v[143], 0, s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[116], v[116], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[120], v[120], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[124], v[124], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[128], v[128], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[132], v[132], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[136], v[136], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[140], v[140], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[144], v[144], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[117], v[117], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[121], v[121], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[125], v[125], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[129], v[129], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[133], v[133], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[137], v[137], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[141], v[141], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[145], v[145], 0, s[88:89])) + k.emit(s_and_b32(s[87], s[23], 7)) + k.emit(s_cmp_eq_u32(s[87], 0)) + k.emit(s_cbranch_scc1(), target='TailLoop_SkipZeroOutMask_0FMPG10PI1CDGWZ9') + k.emit(s_and_b32(s[87], s[8], 7)) + k.emit(s_sub_u32(s[87], 8, s[87])) + k.emit(s_lshl_b32(s[87], s[87], 4)) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[18:19])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[20:21])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[18], v[18], v[184], s[88:89])) + 
k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[19], v[19], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[20], v[20], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[21], v[21], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[22:23])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[24:25])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[22], v[22], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[23], v[23], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[24], v[24], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[25], v[25], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[26:27])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[28:29])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[26], v[26], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[27], v[27], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[28], v[28], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[29], v[29], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[30:31])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[32:33])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[30], v[30], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[31], v[31], 
v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[32], v[32], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[33], v[33], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[34:35])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[36:37])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[34], v[34], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[35], v[35], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[36], v[36], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[37], v[37], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[38:39])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[40:41])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[38], v[38], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[39], v[39], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[40], v[40], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[41], v[41], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[42:43])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[44:45])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[42], v[42], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[43], v[43], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + 
k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[44], v[44], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[45], v[45], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[46:47])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[48:49])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[46], v[46], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[47], v[47], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[48], v[48], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[49], v[49], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[114:115])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[116:117])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[114], v[114], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[115], v[115], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[116], v[116], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[117], v[117], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[118:119])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[120:121])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[118], v[118], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[119], v[119], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + 
k.emit(v_cndmask_b32_e64(v[120], v[120], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[121], v[121], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[122:123])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[124:125])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[122], v[122], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[123], v[123], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[124], v[124], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[125], v[125], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[126:127])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[128:129])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[126], v[126], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[127], v[127], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[128], v[128], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[129], v[129], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[130:131])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[132:133])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[130], v[130], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[131], v[131], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[132], v[132], v[186], 
s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[133], v[133], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[134:135])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[136:137])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[134], v[134], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[135], v[135], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[136], v[136], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[137], v[137], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[138:139])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[140:141])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[138], v[138], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[139], v[139], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[140], v[140], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[141], v[141], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[142:143])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[144:145])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[142], v[142], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[143], v[143], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[144], v[144], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], 
s[8])) + k.emit(v_cndmask_b32_e64(v[145], v[145], v[187], s[88:89])) + k.label('TailLoop_SkipZeroOutMask_0FMPG10PI1CDGWZ9') + k.emit(s_nop(1)) + k.emit(v_mfma_16x16x32(v[0:3], v[114:117], v[18:21], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[4:7], v[114:117], v[22:25], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[8:11], v[114:117], v[26:29], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[12:15], v[114:117], v[30:33], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[16:19], v[114:117], v[34:37], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[20:23], v[114:117], v[38:41], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[24:27], v[114:117], v[42:45], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[28:31], v[114:117], v[46:49], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[32:35], v[118:121], v[18:21], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[36:39], v[118:121], v[22:25], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[40:43], v[118:121], v[26:29], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[44:47], v[118:121], v[30:33], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[48:51], v[118:121], v[34:37], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[52:55], v[118:121], v[38:41], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[56:59], v[118:121], v[42:45], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[60:63], v[118:121], v[46:49], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[64:67], v[122:125], v[18:21], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[68:71], v[122:125], v[22:25], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[72:75], v[122:125], v[26:29], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[76:79], v[122:125], v[30:33], 
v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[80:83], v[122:125], v[34:37], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[84:87], v[122:125], v[38:41], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[88:91], v[122:125], v[42:45], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[92:95], v[122:125], v[46:49], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[96:99], v[126:129], v[18:21], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[100:103], v[126:129], v[22:25], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[104:107], v[126:129], v[26:29], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[108:111], v[126:129], v[30:33], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[112:115], v[126:129], v[34:37], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[116:119], v[126:129], v[38:41], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[120:123], v[126:129], v[42:45], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[124:127], v[126:129], v[46:49], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[128:131], v[130:133], v[18:21], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[132:135], v[130:133], v[22:25], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[136:139], v[130:133], v[26:29], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[140:143], v[130:133], v[30:33], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[144:147], v[130:133], v[34:37], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[148:151], v[130:133], v[38:41], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[152:155], v[130:133], v[42:45], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[156:159], v[130:133], v[46:49], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + 
k.emit(v_mfma_16x16x32(v[160:163], v[134:137], v[18:21], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[164:167], v[134:137], v[22:25], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[168:171], v[134:137], v[26:29], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[172:175], v[134:137], v[30:33], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[176:179], v[134:137], v[34:37], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[180:183], v[134:137], v[38:41], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[184:187], v[134:137], v[42:45], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[188:191], v[134:137], v[46:49], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[192:195], v[138:141], v[18:21], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[196:199], v[138:141], v[22:25], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[200:203], v[138:141], v[26:29], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[204:207], v[138:141], v[30:33], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[208:211], v[138:141], v[34:37], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[212:215], v[138:141], v[38:41], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[216:219], v[138:141], v[42:45], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[220:223], v[138:141], v[46:49], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[224:227], v[142:145], v[18:21], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[228:231], v[142:145], v[22:25], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[232:235], v[142:145], v[26:29], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[236:239], v[142:145], v[30:33], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + 
k.emit(v_mfma_16x16x32(v[240:243], v[142:145], v[34:37], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[244:247], v[142:145], v[38:41], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[248:251], v[142:145], v[42:45], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[252:255], v[142:145], v[46:49], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_sub_i32(s[8], s[8], 32)) + k.emit(s_add_u32(s[9], s[9], 32)) + k.emit(s_cmp_le_i32(s[8], 0)) + k.emit(s_cbranch_scc1(), target='TailLoopEndL') + k.emit(ds_read_b128(v[82:85], v[16])) + k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 2)) + k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 6)) + k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 8)) + k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 10)) + k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 12)) + k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 14)) + k.emit(ds_read_b128(v[146:149], v[17])) + k.emit(ds_read_b128(v[150:153], v[17], v[0], v[0], 0, 128)) + k.emit(ds_read_b128(v[154:157], v[17], v[0], v[0], 0, 0, 1)) + k.emit(ds_read_b128(v[158:161], v[17], v[0], v[0], 0, 128, 1)) + k.emit(ds_read_b128(v[162:165], v[17], v[0], v[0], 0, 0, 2)) + k.emit(ds_read_b128(v[166:169], v[17], v[0], v[0], 0, 128, 2)) + k.emit(ds_read_b128(v[170:173], v[17], v[0], v[0], 0, 0, 3)) + k.emit(ds_read_b128(v[174:177], v[17], v[0], v[0], 0, 128, 3)) + k.emit(s_mov_b32(s[87], 16384)) + k.emit(v_add_co_u32_e32(v[16], s[87], v[16])) + k.emit(s_mov_b32(s[87], 64)) + k.emit(v_add_co_u32_e32(v[17], s[87], v[17])) + k.waitcnt(lgkm=0) + k.emit(v_perm_b32_e64(v[50], v[86], v[82], s[85])) + k.emit(v_perm_b32_e64(v[51], v[94], v[90], s[85])) + k.emit(v_perm_b32_e64(v[52], v[102], v[98], s[85])) + k.emit(v_perm_b32_e64(v[53], v[110], v[106], s[85])) + k.emit(v_perm_b32_e64(v[54], v[86], v[82], s[86])) + k.emit(v_perm_b32_e64(v[55], v[94], 
v[90], s[86])) + k.emit(v_perm_b32_e64(v[56], v[102], v[98], s[86])) + k.emit(v_perm_b32_e64(v[57], v[110], v[106], s[86])) + k.emit(v_perm_b32_e64(v[58], v[87], v[83], s[85])) + k.emit(v_perm_b32_e64(v[59], v[95], v[91], s[85])) + k.emit(v_perm_b32_e64(v[60], v[103], v[99], s[85])) + k.emit(v_perm_b32_e64(v[61], v[111], v[107], s[85])) + k.emit(v_perm_b32_e64(v[62], v[87], v[83], s[86])) + k.emit(v_perm_b32_e64(v[63], v[95], v[91], s[86])) + k.emit(v_perm_b32_e64(v[64], v[103], v[99], s[86])) + k.emit(v_perm_b32_e64(v[65], v[111], v[107], s[86])) + k.emit(v_perm_b32_e64(v[66], v[88], v[84], s[85])) + k.emit(v_perm_b32_e64(v[67], v[96], v[92], s[85])) + k.emit(v_perm_b32_e64(v[68], v[104], v[100], s[85])) + k.emit(v_perm_b32_e64(v[69], v[112], v[108], s[85])) + k.emit(v_perm_b32_e64(v[70], v[88], v[84], s[86])) + k.emit(v_perm_b32_e64(v[71], v[96], v[92], s[86])) + k.emit(v_perm_b32_e64(v[72], v[104], v[100], s[86])) + k.emit(v_perm_b32_e64(v[73], v[112], v[108], s[86])) + k.emit(v_perm_b32_e64(v[74], v[89], v[85], s[85])) + k.emit(v_perm_b32_e64(v[75], v[97], v[93], s[85])) + k.emit(v_perm_b32_e64(v[76], v[105], v[101], s[85])) + k.emit(v_perm_b32_e64(v[77], v[113], v[109], s[85])) + k.emit(v_perm_b32_e64(v[78], v[89], v[85], s[86])) + k.emit(v_perm_b32_e64(v[79], v[97], v[93], s[86])) + k.emit(v_perm_b32_e64(v[80], v[105], v[101], s[86])) + k.emit(v_perm_b32_e64(v[81], v[113], v[109], s[86])) + k.emit(v_and_b32_e32(v[181], 63, v[180])) + k.emit(v_lshrrev_b32_e32(v[181], 4, v[181])) + k.emit(v_lshlrev_b32_e32(v[181], 3, v[181])) + k.emit(v_add_u32_e64(v[182], v[181], 0)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[50], v[50], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[54], v[54], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[58], v[58], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[62], v[62], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[66], v[66], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[70], v[70], 0, s[88:89])) + 
k.emit(v_cndmask_b32_e64(v[74], v[74], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[78], v[78], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[51], v[51], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[55], v[55], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[59], v[59], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[63], v[63], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[67], v[67], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[71], v[71], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[75], v[75], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[79], v[79], 0, s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[52], v[52], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[56], v[56], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[60], v[60], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[64], v[64], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[68], v[68], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[72], v[72], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[76], v[76], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[80], v[80], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[53], v[53], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[57], v[57], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[61], v[61], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[65], v[65], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[69], v[69], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[73], v[73], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[77], v[77], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[81], v[81], 0, s[88:89])) + k.emit(v_and_b32_e32(v[181], 63, v[180])) + k.emit(v_lshrrev_b32_e32(v[181], 4, v[181])) + k.emit(v_lshlrev_b32_e32(v[181], 3, v[181])) + k.emit(v_add_u32_e64(v[182], v[181], 0)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[146], v[146], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[150], v[150], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[154], v[154], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[158], v[158], 0, s[88:89])) + 
k.emit(v_cndmask_b32_e64(v[162], v[162], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[166], v[166], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[170], v[170], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[174], v[174], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[147], v[147], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[151], v[151], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[155], v[155], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[159], v[159], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[163], v[163], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[167], v[167], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[171], v[171], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[175], v[175], 0, s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[148], v[148], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[152], v[152], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[156], v[156], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[160], v[160], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[164], v[164], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[168], v[168], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[172], v[172], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[176], v[176], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[149], v[149], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[153], v[153], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[157], v[157], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[161], v[161], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[165], v[165], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[169], v[169], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[173], v[173], 0, s[88:89])) + k.emit(v_cndmask_b32_e64(v[177], v[177], 0, s[88:89])) + k.emit(s_and_b32(s[87], s[23], 7)) + k.emit(s_cmp_eq_u32(s[87], 0)) + k.emit(s_cbranch_scc1(), target='TailLoop_SkipZeroOutMask_YVWB1RHZO1Z7SCZY') + k.emit(s_and_b32(s[87], s[8], 7)) + k.emit(s_sub_u32(s[87], 8, s[87])) + k.emit(s_lshl_b32(s[87], s[87], 4)) + k.emit(v_lshlrev_b64(v[184:185], s[87], 
v[50:51])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[52:53])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[50], v[50], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[51], v[51], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[52], v[52], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[53], v[53], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[54:55])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[56:57])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[54], v[54], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[55], v[55], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[56], v[56], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[57], v[57], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[58:59])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[60:61])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[58], v[58], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[59], v[59], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[60], v[60], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[61], v[61], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[62:63])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[64:65])) + 
k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[62], v[62], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[63], v[63], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[64], v[64], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[65], v[65], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[66:67])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[68:69])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[66], v[66], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[67], v[67], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[68], v[68], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[69], v[69], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[70:71])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[72:73])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[70], v[70], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[71], v[71], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[72], v[72], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[73], v[73], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[74:75])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[76:77])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) 
+ k.emit(v_cndmask_b32_e64(v[74], v[74], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[75], v[75], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[76], v[76], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[77], v[77], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[78:79])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[80:81])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[78], v[78], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[79], v[79], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[80], v[80], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[81], v[81], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[146:147])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[148:149])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[146], v[146], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[147], v[147], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[148], v[148], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[149], v[149], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[150:151])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[152:153])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[150], v[150], v[184], s[88:89])) + 
k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[151], v[151], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[152], v[152], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[153], v[153], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[154:155])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[156:157])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[154], v[154], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[155], v[155], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[156], v[156], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[157], v[157], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[158:159])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[160:161])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[158], v[158], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[159], v[159], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[160], v[160], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[161], v[161], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[162:163])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[164:165])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[162], v[162], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + 
k.emit(v_cndmask_b32_e64(v[163], v[163], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[164], v[164], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[165], v[165], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[166:167])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[168:169])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[166], v[166], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[167], v[167], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[168], v[168], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[169], v[169], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[170:171])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[172:173])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[170], v[170], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[171], v[171], v[185], s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[172], v[172], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[173], v[173], v[187], s[88:89])) + k.emit(v_lshlrev_b64(v[184:185], s[87], v[174:175])) + k.emit(v_lshlrev_b64(v[186:187], s[87], v[176:177])) + k.emit(v_add_u32_e64(v[182], v[181], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[174], v[174], v[184], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[175], v[175], v[185], 
s[88:89])) + k.emit(v_add_u32_e64(v[182], v[182], 4)) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[176], v[176], v[186], s[88:89])) + k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8])) + k.emit(v_cndmask_b32_e64(v[177], v[177], v[187], s[88:89])) + k.label('TailLoop_SkipZeroOutMask_YVWB1RHZO1Z7SCZY') + k.emit(s_nop(1)) + k.emit(v_mfma_16x16x32(v[0:3], v[146:149], v[50:53], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[4:7], v[146:149], v[54:57], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[8:11], v[146:149], v[58:61], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[12:15], v[146:149], v[62:65], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[16:19], v[146:149], v[66:69], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[20:23], v[146:149], v[70:73], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[24:27], v[146:149], v[74:77], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[28:31], v[146:149], v[78:81], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[32:35], v[150:153], v[50:53], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[36:39], v[150:153], v[54:57], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[40:43], v[150:153], v[58:61], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[44:47], v[150:153], v[62:65], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[48:51], v[150:153], v[66:69], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[52:55], v[150:153], v[70:73], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[56:59], v[150:153], v[74:77], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[60:63], v[150:153], v[78:81], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[64:67], v[154:157], v[50:53], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[68:71], 
v[154:157], v[54:57], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[72:75], v[154:157], v[58:61], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[76:79], v[154:157], v[62:65], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[80:83], v[154:157], v[66:69], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[84:87], v[154:157], v[70:73], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[88:91], v[154:157], v[74:77], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[92:95], v[154:157], v[78:81], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[96:99], v[158:161], v[50:53], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[100:103], v[158:161], v[54:57], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[104:107], v[158:161], v[58:61], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[108:111], v[158:161], v[62:65], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[112:115], v[158:161], v[66:69], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[116:119], v[158:161], v[70:73], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[120:123], v[158:161], v[74:77], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[124:127], v[158:161], v[78:81], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[128:131], v[162:165], v[50:53], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[132:135], v[162:165], v[54:57], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[136:139], v[162:165], v[58:61], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[140:143], v[162:165], v[62:65], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[144:147], v[162:165], v[66:69], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[148:151], v[162:165], v[70:73], v[148:151], 0, 0, 1, 0, 0, 
0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[152:155], v[162:165], v[74:77], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[156:159], v[162:165], v[78:81], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[160:163], v[166:169], v[50:53], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[164:167], v[166:169], v[54:57], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[168:171], v[166:169], v[58:61], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[172:175], v[166:169], v[62:65], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[176:179], v[166:169], v[66:69], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[180:183], v[166:169], v[70:73], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[184:187], v[166:169], v[74:77], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[188:191], v[166:169], v[78:81], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[192:195], v[170:173], v[50:53], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[196:199], v[170:173], v[54:57], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[200:203], v[170:173], v[58:61], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[204:207], v[170:173], v[62:65], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[208:211], v[170:173], v[66:69], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[212:215], v[170:173], v[70:73], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[216:219], v[170:173], v[74:77], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[220:223], v[170:173], v[78:81], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[224:227], v[174:177], v[50:53], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[228:231], v[174:177], v[54:57], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + 
k.emit(v_mfma_16x16x32(v[232:235], v[174:177], v[58:61], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[236:239], v[174:177], v[62:65], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[240:243], v[174:177], v[66:69], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[244:247], v[174:177], v[70:73], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[248:251], v[174:177], v[74:77], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mfma_16x16x32(v[252:255], v[174:177], v[78:81], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_sub_i32(s[8], s[8], 32)) + k.emit(s_add_u32(s[9], s[9], 32)) + k.emit(s_cmp_le_i32(s[8], 0)) + k.emit(s_cbranch_scc0(), target='TailLoopBeginL') + k.label('TailLoopEndL') + k.emit(s_mov_b32(s[87], 512)) + k.emit(s_mul_i32(s[87], s[9], s[87])) + k.emit(v_sub_u32_e64(v[16], v[16], s[87])) + k.emit(s_mov_b32(s[87], 2)) + k.emit(s_mul_i32(s[87], s[9], s[87])) + k.emit(v_sub_u32_e64(v[17], v[17], s[87])) + k.label('SkipTailLoopL') + k.emit(s_setprio()) + k.emit(s_mov_b64(s[68:69], 0)) + k.emit(s_mov_b32(s[72], 0)) + k.emit(v_mov_b32_e32(v[21], s[2])) + k.emit(v_mul_i32_i24_e32(v[21], 4294967040, v[21])) + k.emit(v_add_co_u32_e32(v[21], s[20], v[21])) + k.emit(v_mov_b32_e32(v[22], 256)) + k.emit(v_cmp_lt_u32_e64(s[8:9], v[21], v[22])) + k.emit(v_cndmask_b32_e64(v[21], v[22], v[21], s[8:9])) + k.emit(v_lshrrev_b32_e32(v[23], 6, v[180])) + k.emit(v_and_b32_e32(v[23], 1, v[23])) + k.emit(v_lshrrev_b32_e32(v[24], 7, v[21])) + k.emit(v_and_b32_e32(v[24], 1, v[24])) + k.emit(v_cmp_eq_u32_e64(s[8:9], v[24], v[23])) + k.emit(v_cndmask_b32_e64(v[21], v[22], v[21], s[8:9])) + k.emit(v_lshrrev_b32_e32(v[22], 7, v[21])) + k.emit(v_lshlrev_b32_e32(v[24], 0, v[23])) + k.emit(v_sub_u32_e32(v[22], v[22], v[24])) + k.emit(v_lshrrev_b32_e32(v[24], 3, v[21])) + k.emit(v_lshrrev_b32_e32(v[25], 0, v[180])) + k.emit(v_and_b32_e32(v[25], 15, v[25])) + k.emit(v_lshlrev_b32_e32(v[25], 3, v[25])) + 
k.emit(v_lshrrev_b32_e32(v[25], 3, v[25])) + k.emit(v_lshlrev_b32_e32(v[23], 4, v[23])) + k.emit(v_add_co_u32_e32(v[25], v[23], v[25])) + k.emit(v_sub_u32_e32(v[24], v[24], v[25])) + k.emit(v_and_b32_e32(v[23], 7, v[21])) + k.emit(v_lshrrev_b32_e32(v[23], 3, v[23])) + k.emit(v_and_b32_e32(v[25], 7, v[21])) + k.emit(v_cmp_eq_u32_e64(VCC, v[25], 1)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW1') + k.emit(v_cmp_eq_u32_e64(VCC, v[25], 2)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW2') + k.emit(v_cmp_eq_u32_e64(VCC, v[25], 3)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW3') + k.emit(v_cmp_eq_u32_e64(VCC, v[25], 4)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW4') + k.emit(v_cmp_eq_u32_e64(VCC, v[25], 5)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW5') + k.emit(v_cmp_eq_u32_e64(VCC, v[25], 6)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW6') + k.emit(v_cmp_eq_u32_e64(VCC, v[25], 7)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW7') + k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0') + k.label('ShiftVectorComponents0_GLVW1') + k.emit(v_cmp_eq_u32_e64(VCC, v[22], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW1_BM0') + k.label('ShiftVectorComponents0_GLVW2') + k.emit(v_cmp_eq_u32_e64(VCC, v[22], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW2_BM0') + k.label('ShiftVectorComponents0_GLVW3') + k.emit(v_cmp_eq_u32_e64(VCC, v[22], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW3_BM0') + k.label('ShiftVectorComponents0_GLVW4') + k.emit(v_cmp_eq_u32_e64(VCC, v[22], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW4_BM0') + k.label('ShiftVectorComponents0_GLVW5') + k.emit(v_cmp_eq_u32_e64(VCC, v[22], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW5_BM0') + k.label('ShiftVectorComponents0_GLVW6') + k.emit(v_cmp_eq_u32_e64(VCC, v[22], 0)) + 
k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW6_BM0') + k.label('ShiftVectorComponents0_GLVW7') + k.emit(v_cmp_eq_u32_e64(VCC, v[22], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW7_BM0') + k.label('ShiftVectorComponents0_GLVW1_BM0') + k.emit(v_cmp_eq_u32_e64(VCC, v[23], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW1_BM0_VW0') + k.label('ShiftVectorComponents0_GLVW2_BM0') + k.emit(v_cmp_eq_u32_e64(VCC, v[23], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW2_BM0_VW0') + k.label('ShiftVectorComponents0_GLVW3_BM0') + k.emit(v_cmp_eq_u32_e64(VCC, v[23], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW3_BM0_VW0') + k.label('ShiftVectorComponents0_GLVW4_BM0') + k.emit(v_cmp_eq_u32_e64(VCC, v[23], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW4_BM0_VW0') + k.label('ShiftVectorComponents0_GLVW5_BM0') + k.emit(v_cmp_eq_u32_e64(VCC, v[23], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW5_BM0_VW0') + k.label('ShiftVectorComponents0_GLVW6_BM0') + k.emit(v_cmp_eq_u32_e64(VCC, v[23], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW6_BM0_VW0') + k.label('ShiftVectorComponents0_GLVW7_BM0') + k.emit(v_cmp_eq_u32_e64(VCC, v[23], 0)) + k.emit(s_cbranch_vccnz(), target='ShiftVectorComponents0_GLVW7_BM0_VW0') + k.label('ShiftVectorComponents0_GLVW1_BM0_VW0') + k.emit(s_mov_b32(s[8], 0)) + k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8])) + k.emit(v_and_b32_e32(v[18], 63, v[180])) + k.emit(v_lshlrev_b32_e32(v[18], 2, v[18])) + k.emit(v_accvgpr_read(v[25], v[28])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[0], v[25])) + k.emit(v_accvgpr_read(v[25], v[60])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[32], v[25])) + k.emit(v_accvgpr_read(v[25], v[92])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[64], v[25])) + k.emit(v_accvgpr_read(v[25], v[124])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[96], v[25])) + 
k.emit(v_accvgpr_read(v[25], v[156])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[128], v[25])) + k.emit(v_accvgpr_read(v[25], v[188])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[160], v[25])) + k.emit(v_accvgpr_read(v[25], v[220])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[192], v[25])) + k.emit(v_accvgpr_read(v[25], v[252])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[224], v[25])) + k.emit(v_accvgpr_read(v[25], v[29])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[1], v[25])) + k.emit(v_accvgpr_read(v[25], v[61])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[33], v[25])) + k.emit(v_accvgpr_read(v[25], v[93])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[65], v[25])) + k.emit(v_accvgpr_read(v[25], v[125])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[97], v[25])) + k.emit(v_accvgpr_read(v[25], v[157])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[129], v[25])) + k.emit(v_accvgpr_read(v[25], v[189])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[161], v[25])) + k.emit(v_accvgpr_read(v[25], v[221])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[193], v[25])) + k.emit(v_accvgpr_read(v[25], v[253])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[225], v[25])) + k.emit(v_accvgpr_read(v[25], v[30])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[2], v[25])) + k.emit(v_accvgpr_read(v[25], v[62])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[34], v[25])) + k.emit(v_accvgpr_read(v[25], v[94])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[66], v[25])) + k.emit(v_accvgpr_read(v[25], v[126])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[98], v[25])) + k.emit(v_accvgpr_read(v[25], v[158])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[130], v[25])) + k.emit(v_accvgpr_read(v[25], v[190])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[162], v[25])) + k.emit(v_accvgpr_read(v[25], v[222])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[194], v[25])) + k.emit(v_accvgpr_read(v[25], v[254])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[226], v[25])) + 
k.emit(v_accvgpr_read(v[25], v[31])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[3], v[25])) + k.emit(v_accvgpr_read(v[25], v[63])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[35], v[25])) + k.emit(v_accvgpr_read(v[25], v[95])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[67], v[25])) + k.emit(v_accvgpr_read(v[25], v[127])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[99], v[25])) + k.emit(v_accvgpr_read(v[25], v[159])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[131], v[25])) + k.emit(v_accvgpr_read(v[25], v[191])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[163], v[25])) + k.emit(v_accvgpr_read(v[25], v[223])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[195], v[25])) + k.emit(v_accvgpr_read(v[25], v[255])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[227], v[25])) + k.emit(s_mov_b64(s[8:9], -1)) + k.emit(s_or_saveexec_b64(VCC, s[8:9])) + k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0') + k.label('ShiftVectorComponents0_GLVW2_BM0_VW0') + k.emit(s_mov_b32(s[8], 0)) + k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8])) + k.emit(v_and_b32_e32(v[18], 63, v[180])) + k.emit(v_lshlrev_b32_e32(v[18], 2, v[18])) + k.emit(v_accvgpr_read(v[25], v[24])) + k.emit(v_accvgpr_read(v[26], v[28])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[0], v[25])) + k.emit(v_accvgpr_write(v[4], v[26])) + k.emit(v_accvgpr_read(v[25], v[56])) + k.emit(v_accvgpr_read(v[26], v[60])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[32], v[25])) + k.emit(v_accvgpr_write(v[36], v[26])) + k.emit(v_accvgpr_read(v[25], v[88])) + k.emit(v_accvgpr_read(v[26], v[92])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[64], v[25])) + k.emit(v_accvgpr_write(v[68], v[26])) + k.emit(v_accvgpr_read(v[25], v[120])) + k.emit(v_accvgpr_read(v[26], v[124])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[96], v[25])) + k.emit(v_accvgpr_write(v[100], v[26])) + k.emit(v_accvgpr_read(v[25], v[152])) + k.emit(v_accvgpr_read(v[26], v[156])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[128], v[25])) + 
k.emit(v_accvgpr_write(v[132], v[26])) + k.emit(v_accvgpr_read(v[25], v[184])) + k.emit(v_accvgpr_read(v[26], v[188])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[160], v[25])) + k.emit(v_accvgpr_write(v[164], v[26])) + k.emit(v_accvgpr_read(v[25], v[216])) + k.emit(v_accvgpr_read(v[26], v[220])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[192], v[25])) + k.emit(v_accvgpr_write(v[196], v[26])) + k.emit(v_accvgpr_read(v[25], v[248])) + k.emit(v_accvgpr_read(v[26], v[252])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[224], v[25])) + k.emit(v_accvgpr_write(v[228], v[26])) + k.emit(v_accvgpr_read(v[25], v[25])) + k.emit(v_accvgpr_read(v[26], v[29])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[1], v[25])) + k.emit(v_accvgpr_write(v[5], v[26])) + k.emit(v_accvgpr_read(v[25], v[57])) + k.emit(v_accvgpr_read(v[26], v[61])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[33], v[25])) + k.emit(v_accvgpr_write(v[37], v[26])) + k.emit(v_accvgpr_read(v[25], v[89])) + k.emit(v_accvgpr_read(v[26], v[93])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[65], v[25])) + k.emit(v_accvgpr_write(v[69], v[26])) + k.emit(v_accvgpr_read(v[25], v[121])) + k.emit(v_accvgpr_read(v[26], v[125])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[97], v[25])) + k.emit(v_accvgpr_write(v[101], v[26])) + k.emit(v_accvgpr_read(v[25], v[153])) + k.emit(v_accvgpr_read(v[26], v[157])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[129], v[25])) + k.emit(v_accvgpr_write(v[133], v[26])) + k.emit(v_accvgpr_read(v[25], v[185])) + k.emit(v_accvgpr_read(v[26], v[189])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[161], v[25])) + k.emit(v_accvgpr_write(v[165], v[26])) + k.emit(v_accvgpr_read(v[25], v[217])) + k.emit(v_accvgpr_read(v[26], v[221])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[193], v[25])) + k.emit(v_accvgpr_write(v[197], v[26])) + k.emit(v_accvgpr_read(v[25], v[249])) + k.emit(v_accvgpr_read(v[26], v[253])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[225], v[25])) + 
k.emit(v_accvgpr_write(v[229], v[26])) + k.emit(v_accvgpr_read(v[25], v[26])) + k.emit(v_accvgpr_read(v[26], v[30])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[2], v[25])) + k.emit(v_accvgpr_write(v[6], v[26])) + k.emit(v_accvgpr_read(v[25], v[58])) + k.emit(v_accvgpr_read(v[26], v[62])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[34], v[25])) + k.emit(v_accvgpr_write(v[38], v[26])) + k.emit(v_accvgpr_read(v[25], v[90])) + k.emit(v_accvgpr_read(v[26], v[94])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[66], v[25])) + k.emit(v_accvgpr_write(v[70], v[26])) + k.emit(v_accvgpr_read(v[25], v[122])) + k.emit(v_accvgpr_read(v[26], v[126])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[98], v[25])) + k.emit(v_accvgpr_write(v[102], v[26])) + k.emit(v_accvgpr_read(v[25], v[154])) + k.emit(v_accvgpr_read(v[26], v[158])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[130], v[25])) + k.emit(v_accvgpr_write(v[134], v[26])) + k.emit(v_accvgpr_read(v[25], v[186])) + k.emit(v_accvgpr_read(v[26], v[190])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[162], v[25])) + k.emit(v_accvgpr_write(v[166], v[26])) + k.emit(v_accvgpr_read(v[25], v[218])) + k.emit(v_accvgpr_read(v[26], v[222])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[194], v[25])) + k.emit(v_accvgpr_write(v[198], v[26])) + k.emit(v_accvgpr_read(v[25], v[250])) + k.emit(v_accvgpr_read(v[26], v[254])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[226], v[25])) + k.emit(v_accvgpr_write(v[230], v[26])) + k.emit(v_accvgpr_read(v[25], v[27])) + k.emit(v_accvgpr_read(v[26], v[31])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[3], v[25])) + k.emit(v_accvgpr_write(v[7], v[26])) + k.emit(v_accvgpr_read(v[25], v[59])) + k.emit(v_accvgpr_read(v[26], v[63])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[35], v[25])) + k.emit(v_accvgpr_write(v[39], v[26])) + k.emit(v_accvgpr_read(v[25], v[91])) + k.emit(v_accvgpr_read(v[26], v[95])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[67], v[25])) + k.emit(v_accvgpr_write(v[71], 
v[26])) + k.emit(v_accvgpr_read(v[25], v[123])) + k.emit(v_accvgpr_read(v[26], v[127])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[99], v[25])) + k.emit(v_accvgpr_write(v[103], v[26])) + k.emit(v_accvgpr_read(v[25], v[155])) + k.emit(v_accvgpr_read(v[26], v[159])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[131], v[25])) + k.emit(v_accvgpr_write(v[135], v[26])) + k.emit(v_accvgpr_read(v[25], v[187])) + k.emit(v_accvgpr_read(v[26], v[191])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[163], v[25])) + k.emit(v_accvgpr_write(v[167], v[26])) + k.emit(v_accvgpr_read(v[25], v[219])) + k.emit(v_accvgpr_read(v[26], v[223])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[195], v[25])) + k.emit(v_accvgpr_write(v[199], v[26])) + k.emit(v_accvgpr_read(v[25], v[251])) + k.emit(v_accvgpr_read(v[26], v[255])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[227], v[25])) + k.emit(v_accvgpr_write(v[231], v[26])) + k.emit(s_mov_b64(s[8:9], -1)) + k.emit(s_or_saveexec_b64(VCC, s[8:9])) + k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0') + k.label('ShiftVectorComponents0_GLVW3_BM0_VW0') + k.emit(s_mov_b32(s[8], 0)) + k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8])) + k.emit(v_and_b32_e32(v[18], 63, v[180])) + k.emit(v_lshlrev_b32_e32(v[18], 2, v[18])) + k.emit(v_accvgpr_read(v[25], v[20])) + k.emit(v_accvgpr_read(v[26], v[24])) + k.emit(v_accvgpr_read(v[27], v[28])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[0], v[25])) + k.emit(v_accvgpr_write(v[4], v[26])) + k.emit(v_accvgpr_write(v[8], v[27])) + k.emit(v_accvgpr_read(v[25], v[52])) + k.emit(v_accvgpr_read(v[26], v[56])) + k.emit(v_accvgpr_read(v[27], v[60])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[32], v[25])) + k.emit(v_accvgpr_write(v[36], v[26])) + k.emit(v_accvgpr_write(v[40], v[27])) + k.emit(v_accvgpr_read(v[25], v[84])) + k.emit(v_accvgpr_read(v[26], v[88])) + k.emit(v_accvgpr_read(v[27], v[92])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[64], v[25])) + k.emit(v_accvgpr_write(v[68], v[26])) + 
k.emit(v_accvgpr_write(v[72], v[27])) + k.emit(v_accvgpr_read(v[25], v[116])) + k.emit(v_accvgpr_read(v[26], v[120])) + k.emit(v_accvgpr_read(v[27], v[124])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[96], v[25])) + k.emit(v_accvgpr_write(v[100], v[26])) + k.emit(v_accvgpr_write(v[104], v[27])) + k.emit(v_accvgpr_read(v[25], v[148])) + k.emit(v_accvgpr_read(v[26], v[152])) + k.emit(v_accvgpr_read(v[27], v[156])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[128], v[25])) + k.emit(v_accvgpr_write(v[132], v[26])) + k.emit(v_accvgpr_write(v[136], v[27])) + k.emit(v_accvgpr_read(v[25], v[180])) + k.emit(v_accvgpr_read(v[26], v[184])) + k.emit(v_accvgpr_read(v[27], v[188])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[160], v[25])) + k.emit(v_accvgpr_write(v[164], v[26])) + k.emit(v_accvgpr_write(v[168], v[27])) + k.emit(v_accvgpr_read(v[25], v[212])) + k.emit(v_accvgpr_read(v[26], v[216])) + k.emit(v_accvgpr_read(v[27], v[220])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[192], v[25])) + k.emit(v_accvgpr_write(v[196], v[26])) + k.emit(v_accvgpr_write(v[200], v[27])) + k.emit(v_accvgpr_read(v[25], v[244])) + k.emit(v_accvgpr_read(v[26], v[248])) + k.emit(v_accvgpr_read(v[27], v[252])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[224], v[25])) + k.emit(v_accvgpr_write(v[228], v[26])) + k.emit(v_accvgpr_write(v[232], v[27])) + k.emit(v_accvgpr_read(v[25], v[21])) + k.emit(v_accvgpr_read(v[26], v[25])) + k.emit(v_accvgpr_read(v[27], v[29])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[1], v[25])) + k.emit(v_accvgpr_write(v[5], v[26])) + k.emit(v_accvgpr_write(v[9], v[27])) + k.emit(v_accvgpr_read(v[25], v[53])) + k.emit(v_accvgpr_read(v[26], v[57])) + k.emit(v_accvgpr_read(v[27], v[61])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[33], v[25])) + k.emit(v_accvgpr_write(v[37], v[26])) + k.emit(v_accvgpr_write(v[41], v[27])) + k.emit(v_accvgpr_read(v[25], v[85])) + k.emit(v_accvgpr_read(v[26], v[89])) + k.emit(v_accvgpr_read(v[27], v[93])) + k.emit(s_nop(1)) + 
k.emit(v_accvgpr_write(v[65], v[25])) + k.emit(v_accvgpr_write(v[69], v[26])) + k.emit(v_accvgpr_write(v[73], v[27])) + k.emit(v_accvgpr_read(v[25], v[117])) + k.emit(v_accvgpr_read(v[26], v[121])) + k.emit(v_accvgpr_read(v[27], v[125])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[97], v[25])) + k.emit(v_accvgpr_write(v[101], v[26])) + k.emit(v_accvgpr_write(v[105], v[27])) + k.emit(v_accvgpr_read(v[25], v[149])) + k.emit(v_accvgpr_read(v[26], v[153])) + k.emit(v_accvgpr_read(v[27], v[157])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[129], v[25])) + k.emit(v_accvgpr_write(v[133], v[26])) + k.emit(v_accvgpr_write(v[137], v[27])) + k.emit(v_accvgpr_read(v[25], v[181])) + k.emit(v_accvgpr_read(v[26], v[185])) + k.emit(v_accvgpr_read(v[27], v[189])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[161], v[25])) + k.emit(v_accvgpr_write(v[165], v[26])) + k.emit(v_accvgpr_write(v[169], v[27])) + k.emit(v_accvgpr_read(v[25], v[213])) + k.emit(v_accvgpr_read(v[26], v[217])) + k.emit(v_accvgpr_read(v[27], v[221])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[193], v[25])) + k.emit(v_accvgpr_write(v[197], v[26])) + k.emit(v_accvgpr_write(v[201], v[27])) + k.emit(v_accvgpr_read(v[25], v[245])) + k.emit(v_accvgpr_read(v[26], v[249])) + k.emit(v_accvgpr_read(v[27], v[253])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[225], v[25])) + k.emit(v_accvgpr_write(v[229], v[26])) + k.emit(v_accvgpr_write(v[233], v[27])) + k.emit(v_accvgpr_read(v[25], v[22])) + k.emit(v_accvgpr_read(v[26], v[26])) + k.emit(v_accvgpr_read(v[27], v[30])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[2], v[25])) + k.emit(v_accvgpr_write(v[6], v[26])) + k.emit(v_accvgpr_write(v[10], v[27])) + k.emit(v_accvgpr_read(v[25], v[54])) + k.emit(v_accvgpr_read(v[26], v[58])) + k.emit(v_accvgpr_read(v[27], v[62])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[34], v[25])) + k.emit(v_accvgpr_write(v[38], v[26])) + k.emit(v_accvgpr_write(v[42], v[27])) + k.emit(v_accvgpr_read(v[25], v[86])) + 
k.emit(v_accvgpr_read(v[26], v[90])) + k.emit(v_accvgpr_read(v[27], v[94])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[66], v[25])) + k.emit(v_accvgpr_write(v[70], v[26])) + k.emit(v_accvgpr_write(v[74], v[27])) + k.emit(v_accvgpr_read(v[25], v[118])) + k.emit(v_accvgpr_read(v[26], v[122])) + k.emit(v_accvgpr_read(v[27], v[126])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[98], v[25])) + k.emit(v_accvgpr_write(v[102], v[26])) + k.emit(v_accvgpr_write(v[106], v[27])) + k.emit(v_accvgpr_read(v[25], v[150])) + k.emit(v_accvgpr_read(v[26], v[154])) + k.emit(v_accvgpr_read(v[27], v[158])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[130], v[25])) + k.emit(v_accvgpr_write(v[134], v[26])) + k.emit(v_accvgpr_write(v[138], v[27])) + k.emit(v_accvgpr_read(v[25], v[182])) + k.emit(v_accvgpr_read(v[26], v[186])) + k.emit(v_accvgpr_read(v[27], v[190])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[162], v[25])) + k.emit(v_accvgpr_write(v[166], v[26])) + k.emit(v_accvgpr_write(v[170], v[27])) + k.emit(v_accvgpr_read(v[25], v[214])) + k.emit(v_accvgpr_read(v[26], v[218])) + k.emit(v_accvgpr_read(v[27], v[222])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[194], v[25])) + k.emit(v_accvgpr_write(v[198], v[26])) + k.emit(v_accvgpr_write(v[202], v[27])) + k.emit(v_accvgpr_read(v[25], v[246])) + k.emit(v_accvgpr_read(v[26], v[250])) + k.emit(v_accvgpr_read(v[27], v[254])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[226], v[25])) + k.emit(v_accvgpr_write(v[230], v[26])) + k.emit(v_accvgpr_write(v[234], v[27])) + k.emit(v_accvgpr_read(v[25], v[23])) + k.emit(v_accvgpr_read(v[26], v[27])) + k.emit(v_accvgpr_read(v[27], v[31])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[3], v[25])) + k.emit(v_accvgpr_write(v[7], v[26])) + k.emit(v_accvgpr_write(v[11], v[27])) + k.emit(v_accvgpr_read(v[25], v[55])) + k.emit(v_accvgpr_read(v[26], v[59])) + k.emit(v_accvgpr_read(v[27], v[63])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[35], v[25])) + k.emit(v_accvgpr_write(v[39], v[26])) + 
k.emit(v_accvgpr_write(v[43], v[27])) + k.emit(v_accvgpr_read(v[25], v[87])) + k.emit(v_accvgpr_read(v[26], v[91])) + k.emit(v_accvgpr_read(v[27], v[95])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[67], v[25])) + k.emit(v_accvgpr_write(v[71], v[26])) + k.emit(v_accvgpr_write(v[75], v[27])) + k.emit(v_accvgpr_read(v[25], v[119])) + k.emit(v_accvgpr_read(v[26], v[123])) + k.emit(v_accvgpr_read(v[27], v[127])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[99], v[25])) + k.emit(v_accvgpr_write(v[103], v[26])) + k.emit(v_accvgpr_write(v[107], v[27])) + k.emit(v_accvgpr_read(v[25], v[151])) + k.emit(v_accvgpr_read(v[26], v[155])) + k.emit(v_accvgpr_read(v[27], v[159])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[131], v[25])) + k.emit(v_accvgpr_write(v[135], v[26])) + k.emit(v_accvgpr_write(v[139], v[27])) + k.emit(v_accvgpr_read(v[25], v[183])) + k.emit(v_accvgpr_read(v[26], v[187])) + k.emit(v_accvgpr_read(v[27], v[191])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[163], v[25])) + k.emit(v_accvgpr_write(v[167], v[26])) + k.emit(v_accvgpr_write(v[171], v[27])) + k.emit(v_accvgpr_read(v[25], v[215])) + k.emit(v_accvgpr_read(v[26], v[219])) + k.emit(v_accvgpr_read(v[27], v[223])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[195], v[25])) + k.emit(v_accvgpr_write(v[199], v[26])) + k.emit(v_accvgpr_write(v[203], v[27])) + k.emit(v_accvgpr_read(v[25], v[247])) + k.emit(v_accvgpr_read(v[26], v[251])) + k.emit(v_accvgpr_read(v[27], v[255])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[227], v[25])) + k.emit(v_accvgpr_write(v[231], v[26])) + k.emit(v_accvgpr_write(v[235], v[27])) + k.emit(s_mov_b64(s[8:9], -1)) + k.emit(s_or_saveexec_b64(VCC, s[8:9])) + k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0') + k.label('ShiftVectorComponents0_GLVW4_BM0_VW0') + k.emit(s_mov_b32(s[8], 0)) + k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8])) + k.emit(v_and_b32_e32(v[18], 63, v[180])) + k.emit(v_lshlrev_b32_e32(v[18], 2, v[18])) + k.emit(v_accvgpr_read(v[25], v[16])) + 
k.emit(v_accvgpr_read(v[26], v[20])) + k.emit(v_accvgpr_read(v[27], v[24])) + k.emit(v_accvgpr_read(v[28], v[28])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[0], v[25])) + k.emit(v_accvgpr_write(v[4], v[26])) + k.emit(v_accvgpr_write(v[8], v[27])) + k.emit(v_accvgpr_write(v[12], v[28])) + k.emit(v_accvgpr_read(v[25], v[48])) + k.emit(v_accvgpr_read(v[26], v[52])) + k.emit(v_accvgpr_read(v[27], v[56])) + k.emit(v_accvgpr_read(v[28], v[60])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[32], v[25])) + k.emit(v_accvgpr_write(v[36], v[26])) + k.emit(v_accvgpr_write(v[40], v[27])) + k.emit(v_accvgpr_write(v[44], v[28])) + k.emit(v_accvgpr_read(v[25], v[80])) + k.emit(v_accvgpr_read(v[26], v[84])) + k.emit(v_accvgpr_read(v[27], v[88])) + k.emit(v_accvgpr_read(v[28], v[92])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[64], v[25])) + k.emit(v_accvgpr_write(v[68], v[26])) + k.emit(v_accvgpr_write(v[72], v[27])) + k.emit(v_accvgpr_write(v[76], v[28])) + k.emit(v_accvgpr_read(v[25], v[112])) + k.emit(v_accvgpr_read(v[26], v[116])) + k.emit(v_accvgpr_read(v[27], v[120])) + k.emit(v_accvgpr_read(v[28], v[124])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[96], v[25])) + k.emit(v_accvgpr_write(v[100], v[26])) + k.emit(v_accvgpr_write(v[104], v[27])) + k.emit(v_accvgpr_write(v[108], v[28])) + k.emit(v_accvgpr_read(v[25], v[144])) + k.emit(v_accvgpr_read(v[26], v[148])) + k.emit(v_accvgpr_read(v[27], v[152])) + k.emit(v_accvgpr_read(v[28], v[156])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[128], v[25])) + k.emit(v_accvgpr_write(v[132], v[26])) + k.emit(v_accvgpr_write(v[136], v[27])) + k.emit(v_accvgpr_write(v[140], v[28])) + k.emit(v_accvgpr_read(v[25], v[176])) + k.emit(v_accvgpr_read(v[26], v[180])) + k.emit(v_accvgpr_read(v[27], v[184])) + k.emit(v_accvgpr_read(v[28], v[188])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[160], v[25])) + k.emit(v_accvgpr_write(v[164], v[26])) + k.emit(v_accvgpr_write(v[168], v[27])) + k.emit(v_accvgpr_write(v[172], v[28])) + 
k.emit(v_accvgpr_read(v[25], v[208])) + k.emit(v_accvgpr_read(v[26], v[212])) + k.emit(v_accvgpr_read(v[27], v[216])) + k.emit(v_accvgpr_read(v[28], v[220])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[192], v[25])) + k.emit(v_accvgpr_write(v[196], v[26])) + k.emit(v_accvgpr_write(v[200], v[27])) + k.emit(v_accvgpr_write(v[204], v[28])) + k.emit(v_accvgpr_read(v[25], v[240])) + k.emit(v_accvgpr_read(v[26], v[244])) + k.emit(v_accvgpr_read(v[27], v[248])) + k.emit(v_accvgpr_read(v[28], v[252])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[224], v[25])) + k.emit(v_accvgpr_write(v[228], v[26])) + k.emit(v_accvgpr_write(v[232], v[27])) + k.emit(v_accvgpr_write(v[236], v[28])) + k.emit(v_accvgpr_read(v[25], v[17])) + k.emit(v_accvgpr_read(v[26], v[21])) + k.emit(v_accvgpr_read(v[27], v[25])) + k.emit(v_accvgpr_read(v[28], v[29])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[1], v[25])) + k.emit(v_accvgpr_write(v[5], v[26])) + k.emit(v_accvgpr_write(v[9], v[27])) + k.emit(v_accvgpr_write(v[13], v[28])) + k.emit(v_accvgpr_read(v[25], v[49])) + k.emit(v_accvgpr_read(v[26], v[53])) + k.emit(v_accvgpr_read(v[27], v[57])) + k.emit(v_accvgpr_read(v[28], v[61])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[33], v[25])) + k.emit(v_accvgpr_write(v[37], v[26])) + k.emit(v_accvgpr_write(v[41], v[27])) + k.emit(v_accvgpr_write(v[45], v[28])) + k.emit(v_accvgpr_read(v[25], v[81])) + k.emit(v_accvgpr_read(v[26], v[85])) + k.emit(v_accvgpr_read(v[27], v[89])) + k.emit(v_accvgpr_read(v[28], v[93])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[65], v[25])) + k.emit(v_accvgpr_write(v[69], v[26])) + k.emit(v_accvgpr_write(v[73], v[27])) + k.emit(v_accvgpr_write(v[77], v[28])) + k.emit(v_accvgpr_read(v[25], v[113])) + k.emit(v_accvgpr_read(v[26], v[117])) + k.emit(v_accvgpr_read(v[27], v[121])) + k.emit(v_accvgpr_read(v[28], v[125])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[97], v[25])) + k.emit(v_accvgpr_write(v[101], v[26])) + k.emit(v_accvgpr_write(v[105], v[27])) + 
k.emit(v_accvgpr_write(v[109], v[28])) + k.emit(v_accvgpr_read(v[25], v[145])) + k.emit(v_accvgpr_read(v[26], v[149])) + k.emit(v_accvgpr_read(v[27], v[153])) + k.emit(v_accvgpr_read(v[28], v[157])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[129], v[25])) + k.emit(v_accvgpr_write(v[133], v[26])) + k.emit(v_accvgpr_write(v[137], v[27])) + k.emit(v_accvgpr_write(v[141], v[28])) + k.emit(v_accvgpr_read(v[25], v[177])) + k.emit(v_accvgpr_read(v[26], v[181])) + k.emit(v_accvgpr_read(v[27], v[185])) + k.emit(v_accvgpr_read(v[28], v[189])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[161], v[25])) + k.emit(v_accvgpr_write(v[165], v[26])) + k.emit(v_accvgpr_write(v[169], v[27])) + k.emit(v_accvgpr_write(v[173], v[28])) + k.emit(v_accvgpr_read(v[25], v[209])) + k.emit(v_accvgpr_read(v[26], v[213])) + k.emit(v_accvgpr_read(v[27], v[217])) + k.emit(v_accvgpr_read(v[28], v[221])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[193], v[25])) + k.emit(v_accvgpr_write(v[197], v[26])) + k.emit(v_accvgpr_write(v[201], v[27])) + k.emit(v_accvgpr_write(v[205], v[28])) + k.emit(v_accvgpr_read(v[25], v[241])) + k.emit(v_accvgpr_read(v[26], v[245])) + k.emit(v_accvgpr_read(v[27], v[249])) + k.emit(v_accvgpr_read(v[28], v[253])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[225], v[25])) + k.emit(v_accvgpr_write(v[229], v[26])) + k.emit(v_accvgpr_write(v[233], v[27])) + k.emit(v_accvgpr_write(v[237], v[28])) + k.emit(v_accvgpr_read(v[25], v[18])) + k.emit(v_accvgpr_read(v[26], v[22])) + k.emit(v_accvgpr_read(v[27], v[26])) + k.emit(v_accvgpr_read(v[28], v[30])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[2], v[25])) + k.emit(v_accvgpr_write(v[6], v[26])) + k.emit(v_accvgpr_write(v[10], v[27])) + k.emit(v_accvgpr_write(v[14], v[28])) + k.emit(v_accvgpr_read(v[25], v[50])) + k.emit(v_accvgpr_read(v[26], v[54])) + k.emit(v_accvgpr_read(v[27], v[58])) + k.emit(v_accvgpr_read(v[28], v[62])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[34], v[25])) + k.emit(v_accvgpr_write(v[38], v[26])) 
+ k.emit(v_accvgpr_write(v[42], v[27])) + k.emit(v_accvgpr_write(v[46], v[28])) + k.emit(v_accvgpr_read(v[25], v[82])) + k.emit(v_accvgpr_read(v[26], v[86])) + k.emit(v_accvgpr_read(v[27], v[90])) + k.emit(v_accvgpr_read(v[28], v[94])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[66], v[25])) + k.emit(v_accvgpr_write(v[70], v[26])) + k.emit(v_accvgpr_write(v[74], v[27])) + k.emit(v_accvgpr_write(v[78], v[28])) + k.emit(v_accvgpr_read(v[25], v[114])) + k.emit(v_accvgpr_read(v[26], v[118])) + k.emit(v_accvgpr_read(v[27], v[122])) + k.emit(v_accvgpr_read(v[28], v[126])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[98], v[25])) + k.emit(v_accvgpr_write(v[102], v[26])) + k.emit(v_accvgpr_write(v[106], v[27])) + k.emit(v_accvgpr_write(v[110], v[28])) + k.emit(v_accvgpr_read(v[25], v[146])) + k.emit(v_accvgpr_read(v[26], v[150])) + k.emit(v_accvgpr_read(v[27], v[154])) + k.emit(v_accvgpr_read(v[28], v[158])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[130], v[25])) + k.emit(v_accvgpr_write(v[134], v[26])) + k.emit(v_accvgpr_write(v[138], v[27])) + k.emit(v_accvgpr_write(v[142], v[28])) + k.emit(v_accvgpr_read(v[25], v[178])) + k.emit(v_accvgpr_read(v[26], v[182])) + k.emit(v_accvgpr_read(v[27], v[186])) + k.emit(v_accvgpr_read(v[28], v[190])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[162], v[25])) + k.emit(v_accvgpr_write(v[166], v[26])) + k.emit(v_accvgpr_write(v[170], v[27])) + k.emit(v_accvgpr_write(v[174], v[28])) + k.emit(v_accvgpr_read(v[25], v[210])) + k.emit(v_accvgpr_read(v[26], v[214])) + k.emit(v_accvgpr_read(v[27], v[218])) + k.emit(v_accvgpr_read(v[28], v[222])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[194], v[25])) + k.emit(v_accvgpr_write(v[198], v[26])) + k.emit(v_accvgpr_write(v[202], v[27])) + k.emit(v_accvgpr_write(v[206], v[28])) + k.emit(v_accvgpr_read(v[25], v[242])) + k.emit(v_accvgpr_read(v[26], v[246])) + k.emit(v_accvgpr_read(v[27], v[250])) + k.emit(v_accvgpr_read(v[28], v[254])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[226], 
v[25])) + k.emit(v_accvgpr_write(v[230], v[26])) + k.emit(v_accvgpr_write(v[234], v[27])) + k.emit(v_accvgpr_write(v[238], v[28])) + k.emit(v_accvgpr_read(v[25], v[19])) + k.emit(v_accvgpr_read(v[26], v[23])) + k.emit(v_accvgpr_read(v[27], v[27])) + k.emit(v_accvgpr_read(v[28], v[31])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[3], v[25])) + k.emit(v_accvgpr_write(v[7], v[26])) + k.emit(v_accvgpr_write(v[11], v[27])) + k.emit(v_accvgpr_write(v[15], v[28])) + k.emit(v_accvgpr_read(v[25], v[51])) + k.emit(v_accvgpr_read(v[26], v[55])) + k.emit(v_accvgpr_read(v[27], v[59])) + k.emit(v_accvgpr_read(v[28], v[63])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[35], v[25])) + k.emit(v_accvgpr_write(v[39], v[26])) + k.emit(v_accvgpr_write(v[43], v[27])) + k.emit(v_accvgpr_write(v[47], v[28])) + k.emit(v_accvgpr_read(v[25], v[83])) + k.emit(v_accvgpr_read(v[26], v[87])) + k.emit(v_accvgpr_read(v[27], v[91])) + k.emit(v_accvgpr_read(v[28], v[95])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[67], v[25])) + k.emit(v_accvgpr_write(v[71], v[26])) + k.emit(v_accvgpr_write(v[75], v[27])) + k.emit(v_accvgpr_write(v[79], v[28])) + k.emit(v_accvgpr_read(v[25], v[115])) + k.emit(v_accvgpr_read(v[26], v[119])) + k.emit(v_accvgpr_read(v[27], v[123])) + k.emit(v_accvgpr_read(v[28], v[127])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[99], v[25])) + k.emit(v_accvgpr_write(v[103], v[26])) + k.emit(v_accvgpr_write(v[107], v[27])) + k.emit(v_accvgpr_write(v[111], v[28])) + k.emit(v_accvgpr_read(v[25], v[147])) + k.emit(v_accvgpr_read(v[26], v[151])) + k.emit(v_accvgpr_read(v[27], v[155])) + k.emit(v_accvgpr_read(v[28], v[159])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[131], v[25])) + k.emit(v_accvgpr_write(v[135], v[26])) + k.emit(v_accvgpr_write(v[139], v[27])) + k.emit(v_accvgpr_write(v[143], v[28])) + k.emit(v_accvgpr_read(v[25], v[179])) + k.emit(v_accvgpr_read(v[26], v[183])) + k.emit(v_accvgpr_read(v[27], v[187])) + k.emit(v_accvgpr_read(v[28], v[191])) + k.emit(s_nop(1)) + 
k.emit(v_accvgpr_write(v[163], v[25])) + k.emit(v_accvgpr_write(v[167], v[26])) + k.emit(v_accvgpr_write(v[171], v[27])) + k.emit(v_accvgpr_write(v[175], v[28])) + k.emit(v_accvgpr_read(v[25], v[211])) + k.emit(v_accvgpr_read(v[26], v[215])) + k.emit(v_accvgpr_read(v[27], v[219])) + k.emit(v_accvgpr_read(v[28], v[223])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[195], v[25])) + k.emit(v_accvgpr_write(v[199], v[26])) + k.emit(v_accvgpr_write(v[203], v[27])) + k.emit(v_accvgpr_write(v[207], v[28])) + k.emit(v_accvgpr_read(v[25], v[243])) + k.emit(v_accvgpr_read(v[26], v[247])) + k.emit(v_accvgpr_read(v[27], v[251])) + k.emit(v_accvgpr_read(v[28], v[255])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[227], v[25])) + k.emit(v_accvgpr_write(v[231], v[26])) + k.emit(v_accvgpr_write(v[235], v[27])) + k.emit(v_accvgpr_write(v[239], v[28])) + k.emit(s_mov_b64(s[8:9], -1)) + k.emit(s_or_saveexec_b64(VCC, s[8:9])) + k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0') + k.label('ShiftVectorComponents0_GLVW5_BM0_VW0') + k.emit(s_mov_b32(s[8], 0)) + k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8])) + k.emit(v_and_b32_e32(v[18], 63, v[180])) + k.emit(v_lshlrev_b32_e32(v[18], 2, v[18])) + k.emit(v_accvgpr_read(v[25], v[12])) + k.emit(v_accvgpr_read(v[26], v[16])) + k.emit(v_accvgpr_read(v[27], v[20])) + k.emit(v_accvgpr_read(v[28], v[24])) + k.emit(v_accvgpr_read(v[29], v[28])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[0], v[25])) + k.emit(v_accvgpr_write(v[4], v[26])) + k.emit(v_accvgpr_write(v[8], v[27])) + k.emit(v_accvgpr_write(v[12], v[28])) + k.emit(v_accvgpr_write(v[16], v[29])) + k.emit(v_accvgpr_read(v[25], v[44])) + k.emit(v_accvgpr_read(v[26], v[48])) + k.emit(v_accvgpr_read(v[27], v[52])) + k.emit(v_accvgpr_read(v[28], v[56])) + k.emit(v_accvgpr_read(v[29], v[60])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[32], v[25])) + k.emit(v_accvgpr_write(v[36], v[26])) + k.emit(v_accvgpr_write(v[40], v[27])) + k.emit(v_accvgpr_write(v[44], v[28])) + 
k.emit(v_accvgpr_write(v[48], v[29])) + k.emit(v_accvgpr_read(v[25], v[76])) + k.emit(v_accvgpr_read(v[26], v[80])) + k.emit(v_accvgpr_read(v[27], v[84])) + k.emit(v_accvgpr_read(v[28], v[88])) + k.emit(v_accvgpr_read(v[29], v[92])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[64], v[25])) + k.emit(v_accvgpr_write(v[68], v[26])) + k.emit(v_accvgpr_write(v[72], v[27])) + k.emit(v_accvgpr_write(v[76], v[28])) + k.emit(v_accvgpr_write(v[80], v[29])) + k.emit(v_accvgpr_read(v[25], v[108])) + k.emit(v_accvgpr_read(v[26], v[112])) + k.emit(v_accvgpr_read(v[27], v[116])) + k.emit(v_accvgpr_read(v[28], v[120])) + k.emit(v_accvgpr_read(v[29], v[124])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[96], v[25])) + k.emit(v_accvgpr_write(v[100], v[26])) + k.emit(v_accvgpr_write(v[104], v[27])) + k.emit(v_accvgpr_write(v[108], v[28])) + k.emit(v_accvgpr_write(v[112], v[29])) + k.emit(v_accvgpr_read(v[25], v[140])) + k.emit(v_accvgpr_read(v[26], v[144])) + k.emit(v_accvgpr_read(v[27], v[148])) + k.emit(v_accvgpr_read(v[28], v[152])) + k.emit(v_accvgpr_read(v[29], v[156])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[128], v[25])) + k.emit(v_accvgpr_write(v[132], v[26])) + k.emit(v_accvgpr_write(v[136], v[27])) + k.emit(v_accvgpr_write(v[140], v[28])) + k.emit(v_accvgpr_write(v[144], v[29])) + k.emit(v_accvgpr_read(v[25], v[172])) + k.emit(v_accvgpr_read(v[26], v[176])) + k.emit(v_accvgpr_read(v[27], v[180])) + k.emit(v_accvgpr_read(v[28], v[184])) + k.emit(v_accvgpr_read(v[29], v[188])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[160], v[25])) + k.emit(v_accvgpr_write(v[164], v[26])) + k.emit(v_accvgpr_write(v[168], v[27])) + k.emit(v_accvgpr_write(v[172], v[28])) + k.emit(v_accvgpr_write(v[176], v[29])) + k.emit(v_accvgpr_read(v[25], v[204])) + k.emit(v_accvgpr_read(v[26], v[208])) + k.emit(v_accvgpr_read(v[27], v[212])) + k.emit(v_accvgpr_read(v[28], v[216])) + k.emit(v_accvgpr_read(v[29], v[220])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[192], v[25])) + 
k.emit(v_accvgpr_write(v[196], v[26])) + k.emit(v_accvgpr_write(v[200], v[27])) + k.emit(v_accvgpr_write(v[204], v[28])) + k.emit(v_accvgpr_write(v[208], v[29])) + k.emit(v_accvgpr_read(v[25], v[236])) + k.emit(v_accvgpr_read(v[26], v[240])) + k.emit(v_accvgpr_read(v[27], v[244])) + k.emit(v_accvgpr_read(v[28], v[248])) + k.emit(v_accvgpr_read(v[29], v[252])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[224], v[25])) + k.emit(v_accvgpr_write(v[228], v[26])) + k.emit(v_accvgpr_write(v[232], v[27])) + k.emit(v_accvgpr_write(v[236], v[28])) + k.emit(v_accvgpr_write(v[240], v[29])) + k.emit(v_accvgpr_read(v[25], v[13])) + k.emit(v_accvgpr_read(v[26], v[17])) + k.emit(v_accvgpr_read(v[27], v[21])) + k.emit(v_accvgpr_read(v[28], v[25])) + k.emit(v_accvgpr_read(v[29], v[29])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[1], v[25])) + k.emit(v_accvgpr_write(v[5], v[26])) + k.emit(v_accvgpr_write(v[9], v[27])) + k.emit(v_accvgpr_write(v[13], v[28])) + k.emit(v_accvgpr_write(v[17], v[29])) + k.emit(v_accvgpr_read(v[25], v[45])) + k.emit(v_accvgpr_read(v[26], v[49])) + k.emit(v_accvgpr_read(v[27], v[53])) + k.emit(v_accvgpr_read(v[28], v[57])) + k.emit(v_accvgpr_read(v[29], v[61])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[33], v[25])) + k.emit(v_accvgpr_write(v[37], v[26])) + k.emit(v_accvgpr_write(v[41], v[27])) + k.emit(v_accvgpr_write(v[45], v[28])) + k.emit(v_accvgpr_write(v[49], v[29])) + k.emit(v_accvgpr_read(v[25], v[77])) + k.emit(v_accvgpr_read(v[26], v[81])) + k.emit(v_accvgpr_read(v[27], v[85])) + k.emit(v_accvgpr_read(v[28], v[89])) + k.emit(v_accvgpr_read(v[29], v[93])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[65], v[25])) + k.emit(v_accvgpr_write(v[69], v[26])) + k.emit(v_accvgpr_write(v[73], v[27])) + k.emit(v_accvgpr_write(v[77], v[28])) + k.emit(v_accvgpr_write(v[81], v[29])) + k.emit(v_accvgpr_read(v[25], v[109])) + k.emit(v_accvgpr_read(v[26], v[113])) + k.emit(v_accvgpr_read(v[27], v[117])) + k.emit(v_accvgpr_read(v[28], v[121])) + 
k.emit(v_accvgpr_read(v[29], v[125])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[97], v[25])) + k.emit(v_accvgpr_write(v[101], v[26])) + k.emit(v_accvgpr_write(v[105], v[27])) + k.emit(v_accvgpr_write(v[109], v[28])) + k.emit(v_accvgpr_write(v[113], v[29])) + k.emit(v_accvgpr_read(v[25], v[141])) + k.emit(v_accvgpr_read(v[26], v[145])) + k.emit(v_accvgpr_read(v[27], v[149])) + k.emit(v_accvgpr_read(v[28], v[153])) + k.emit(v_accvgpr_read(v[29], v[157])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[129], v[25])) + k.emit(v_accvgpr_write(v[133], v[26])) + k.emit(v_accvgpr_write(v[137], v[27])) + k.emit(v_accvgpr_write(v[141], v[28])) + k.emit(v_accvgpr_write(v[145], v[29])) + k.emit(v_accvgpr_read(v[25], v[173])) + k.emit(v_accvgpr_read(v[26], v[177])) + k.emit(v_accvgpr_read(v[27], v[181])) + k.emit(v_accvgpr_read(v[28], v[185])) + k.emit(v_accvgpr_read(v[29], v[189])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[161], v[25])) + k.emit(v_accvgpr_write(v[165], v[26])) + k.emit(v_accvgpr_write(v[169], v[27])) + k.emit(v_accvgpr_write(v[173], v[28])) + k.emit(v_accvgpr_write(v[177], v[29])) + k.emit(v_accvgpr_read(v[25], v[205])) + k.emit(v_accvgpr_read(v[26], v[209])) + k.emit(v_accvgpr_read(v[27], v[213])) + k.emit(v_accvgpr_read(v[28], v[217])) + k.emit(v_accvgpr_read(v[29], v[221])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[193], v[25])) + k.emit(v_accvgpr_write(v[197], v[26])) + k.emit(v_accvgpr_write(v[201], v[27])) + k.emit(v_accvgpr_write(v[205], v[28])) + k.emit(v_accvgpr_write(v[209], v[29])) + k.emit(v_accvgpr_read(v[25], v[237])) + k.emit(v_accvgpr_read(v[26], v[241])) + k.emit(v_accvgpr_read(v[27], v[245])) + k.emit(v_accvgpr_read(v[28], v[249])) + k.emit(v_accvgpr_read(v[29], v[253])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[225], v[25])) + k.emit(v_accvgpr_write(v[229], v[26])) + k.emit(v_accvgpr_write(v[233], v[27])) + k.emit(v_accvgpr_write(v[237], v[28])) + k.emit(v_accvgpr_write(v[241], v[29])) + k.emit(v_accvgpr_read(v[25], v[14])) + 
k.emit(v_accvgpr_read(v[26], v[18])) + k.emit(v_accvgpr_read(v[27], v[22])) + k.emit(v_accvgpr_read(v[28], v[26])) + k.emit(v_accvgpr_read(v[29], v[30])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[2], v[25])) + k.emit(v_accvgpr_write(v[6], v[26])) + k.emit(v_accvgpr_write(v[10], v[27])) + k.emit(v_accvgpr_write(v[14], v[28])) + k.emit(v_accvgpr_write(v[18], v[29])) + k.emit(v_accvgpr_read(v[25], v[46])) + k.emit(v_accvgpr_read(v[26], v[50])) + k.emit(v_accvgpr_read(v[27], v[54])) + k.emit(v_accvgpr_read(v[28], v[58])) + k.emit(v_accvgpr_read(v[29], v[62])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[34], v[25])) + k.emit(v_accvgpr_write(v[38], v[26])) + k.emit(v_accvgpr_write(v[42], v[27])) + k.emit(v_accvgpr_write(v[46], v[28])) + k.emit(v_accvgpr_write(v[50], v[29])) + k.emit(v_accvgpr_read(v[25], v[78])) + k.emit(v_accvgpr_read(v[26], v[82])) + k.emit(v_accvgpr_read(v[27], v[86])) + k.emit(v_accvgpr_read(v[28], v[90])) + k.emit(v_accvgpr_read(v[29], v[94])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[66], v[25])) + k.emit(v_accvgpr_write(v[70], v[26])) + k.emit(v_accvgpr_write(v[74], v[27])) + k.emit(v_accvgpr_write(v[78], v[28])) + k.emit(v_accvgpr_write(v[82], v[29])) + k.emit(v_accvgpr_read(v[25], v[110])) + k.emit(v_accvgpr_read(v[26], v[114])) + k.emit(v_accvgpr_read(v[27], v[118])) + k.emit(v_accvgpr_read(v[28], v[122])) + k.emit(v_accvgpr_read(v[29], v[126])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[98], v[25])) + k.emit(v_accvgpr_write(v[102], v[26])) + k.emit(v_accvgpr_write(v[106], v[27])) + k.emit(v_accvgpr_write(v[110], v[28])) + k.emit(v_accvgpr_write(v[114], v[29])) + k.emit(v_accvgpr_read(v[25], v[142])) + k.emit(v_accvgpr_read(v[26], v[146])) + k.emit(v_accvgpr_read(v[27], v[150])) + k.emit(v_accvgpr_read(v[28], v[154])) + k.emit(v_accvgpr_read(v[29], v[158])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[130], v[25])) + k.emit(v_accvgpr_write(v[134], v[26])) + k.emit(v_accvgpr_write(v[138], v[27])) + k.emit(v_accvgpr_write(v[142], 
v[28])) + k.emit(v_accvgpr_write(v[146], v[29])) + k.emit(v_accvgpr_read(v[25], v[174])) + k.emit(v_accvgpr_read(v[26], v[178])) + k.emit(v_accvgpr_read(v[27], v[182])) + k.emit(v_accvgpr_read(v[28], v[186])) + k.emit(v_accvgpr_read(v[29], v[190])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[162], v[25])) + k.emit(v_accvgpr_write(v[166], v[26])) + k.emit(v_accvgpr_write(v[170], v[27])) + k.emit(v_accvgpr_write(v[174], v[28])) + k.emit(v_accvgpr_write(v[178], v[29])) + k.emit(v_accvgpr_read(v[25], v[206])) + k.emit(v_accvgpr_read(v[26], v[210])) + k.emit(v_accvgpr_read(v[27], v[214])) + k.emit(v_accvgpr_read(v[28], v[218])) + k.emit(v_accvgpr_read(v[29], v[222])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[194], v[25])) + k.emit(v_accvgpr_write(v[198], v[26])) + k.emit(v_accvgpr_write(v[202], v[27])) + k.emit(v_accvgpr_write(v[206], v[28])) + k.emit(v_accvgpr_write(v[210], v[29])) + k.emit(v_accvgpr_read(v[25], v[238])) + k.emit(v_accvgpr_read(v[26], v[242])) + k.emit(v_accvgpr_read(v[27], v[246])) + k.emit(v_accvgpr_read(v[28], v[250])) + k.emit(v_accvgpr_read(v[29], v[254])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[226], v[25])) + k.emit(v_accvgpr_write(v[230], v[26])) + k.emit(v_accvgpr_write(v[234], v[27])) + k.emit(v_accvgpr_write(v[238], v[28])) + k.emit(v_accvgpr_write(v[242], v[29])) + k.emit(v_accvgpr_read(v[25], v[15])) + k.emit(v_accvgpr_read(v[26], v[19])) + k.emit(v_accvgpr_read(v[27], v[23])) + k.emit(v_accvgpr_read(v[28], v[27])) + k.emit(v_accvgpr_read(v[29], v[31])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[3], v[25])) + k.emit(v_accvgpr_write(v[7], v[26])) + k.emit(v_accvgpr_write(v[11], v[27])) + k.emit(v_accvgpr_write(v[15], v[28])) + k.emit(v_accvgpr_write(v[19], v[29])) + k.emit(v_accvgpr_read(v[25], v[47])) + k.emit(v_accvgpr_read(v[26], v[51])) + k.emit(v_accvgpr_read(v[27], v[55])) + k.emit(v_accvgpr_read(v[28], v[59])) + k.emit(v_accvgpr_read(v[29], v[63])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[35], v[25])) + 
k.emit(v_accvgpr_write(v[39], v[26])) + k.emit(v_accvgpr_write(v[43], v[27])) + k.emit(v_accvgpr_write(v[47], v[28])) + k.emit(v_accvgpr_write(v[51], v[29])) + k.emit(v_accvgpr_read(v[25], v[79])) + k.emit(v_accvgpr_read(v[26], v[83])) + k.emit(v_accvgpr_read(v[27], v[87])) + k.emit(v_accvgpr_read(v[28], v[91])) + k.emit(v_accvgpr_read(v[29], v[95])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[67], v[25])) + k.emit(v_accvgpr_write(v[71], v[26])) + k.emit(v_accvgpr_write(v[75], v[27])) + k.emit(v_accvgpr_write(v[79], v[28])) + k.emit(v_accvgpr_write(v[83], v[29])) + k.emit(v_accvgpr_read(v[25], v[111])) + k.emit(v_accvgpr_read(v[26], v[115])) + k.emit(v_accvgpr_read(v[27], v[119])) + k.emit(v_accvgpr_read(v[28], v[123])) + k.emit(v_accvgpr_read(v[29], v[127])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[99], v[25])) + k.emit(v_accvgpr_write(v[103], v[26])) + k.emit(v_accvgpr_write(v[107], v[27])) + k.emit(v_accvgpr_write(v[111], v[28])) + k.emit(v_accvgpr_write(v[115], v[29])) + k.emit(v_accvgpr_read(v[25], v[143])) + k.emit(v_accvgpr_read(v[26], v[147])) + k.emit(v_accvgpr_read(v[27], v[151])) + k.emit(v_accvgpr_read(v[28], v[155])) + k.emit(v_accvgpr_read(v[29], v[159])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[131], v[25])) + k.emit(v_accvgpr_write(v[135], v[26])) + k.emit(v_accvgpr_write(v[139], v[27])) + k.emit(v_accvgpr_write(v[143], v[28])) + k.emit(v_accvgpr_write(v[147], v[29])) + k.emit(v_accvgpr_read(v[25], v[175])) + k.emit(v_accvgpr_read(v[26], v[179])) + k.emit(v_accvgpr_read(v[27], v[183])) + k.emit(v_accvgpr_read(v[28], v[187])) + k.emit(v_accvgpr_read(v[29], v[191])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[163], v[25])) + k.emit(v_accvgpr_write(v[167], v[26])) + k.emit(v_accvgpr_write(v[171], v[27])) + k.emit(v_accvgpr_write(v[175], v[28])) + k.emit(v_accvgpr_write(v[179], v[29])) + k.emit(v_accvgpr_read(v[25], v[207])) + k.emit(v_accvgpr_read(v[26], v[211])) + k.emit(v_accvgpr_read(v[27], v[215])) + k.emit(v_accvgpr_read(v[28], 
v[219])) + k.emit(v_accvgpr_read(v[29], v[223])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[195], v[25])) + k.emit(v_accvgpr_write(v[199], v[26])) + k.emit(v_accvgpr_write(v[203], v[27])) + k.emit(v_accvgpr_write(v[207], v[28])) + k.emit(v_accvgpr_write(v[211], v[29])) + k.emit(v_accvgpr_read(v[25], v[239])) + k.emit(v_accvgpr_read(v[26], v[243])) + k.emit(v_accvgpr_read(v[27], v[247])) + k.emit(v_accvgpr_read(v[28], v[251])) + k.emit(v_accvgpr_read(v[29], v[255])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[227], v[25])) + k.emit(v_accvgpr_write(v[231], v[26])) + k.emit(v_accvgpr_write(v[235], v[27])) + k.emit(v_accvgpr_write(v[239], v[28])) + k.emit(v_accvgpr_write(v[243], v[29])) + k.emit(s_mov_b64(s[8:9], -1)) + k.emit(s_or_saveexec_b64(VCC, s[8:9])) + k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0') + k.label('ShiftVectorComponents0_GLVW6_BM0_VW0') + k.emit(s_mov_b32(s[8], 0)) + k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8])) + k.emit(v_and_b32_e32(v[18], 63, v[180])) + k.emit(v_lshlrev_b32_e32(v[18], 2, v[18])) + k.emit(v_accvgpr_read(v[25], v[8])) + k.emit(v_accvgpr_read(v[26], v[12])) + k.emit(v_accvgpr_read(v[27], v[16])) + k.emit(v_accvgpr_read(v[28], v[20])) + k.emit(v_accvgpr_read(v[29], v[24])) + k.emit(v_accvgpr_read(v[30], v[28])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[0], v[25])) + k.emit(v_accvgpr_write(v[4], v[26])) + k.emit(v_accvgpr_write(v[8], v[27])) + k.emit(v_accvgpr_write(v[12], v[28])) + k.emit(v_accvgpr_write(v[16], v[29])) + k.emit(v_accvgpr_write(v[20], v[30])) + k.emit(v_accvgpr_read(v[25], v[40])) + k.emit(v_accvgpr_read(v[26], v[44])) + k.emit(v_accvgpr_read(v[27], v[48])) + k.emit(v_accvgpr_read(v[28], v[52])) + k.emit(v_accvgpr_read(v[29], v[56])) + k.emit(v_accvgpr_read(v[30], v[60])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[32], v[25])) + k.emit(v_accvgpr_write(v[36], v[26])) + k.emit(v_accvgpr_write(v[40], v[27])) + k.emit(v_accvgpr_write(v[44], v[28])) + k.emit(v_accvgpr_write(v[48], v[29])) + 
k.emit(v_accvgpr_write(v[52], v[30])) + k.emit(v_accvgpr_read(v[25], v[72])) + k.emit(v_accvgpr_read(v[26], v[76])) + k.emit(v_accvgpr_read(v[27], v[80])) + k.emit(v_accvgpr_read(v[28], v[84])) + k.emit(v_accvgpr_read(v[29], v[88])) + k.emit(v_accvgpr_read(v[30], v[92])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[64], v[25])) + k.emit(v_accvgpr_write(v[68], v[26])) + k.emit(v_accvgpr_write(v[72], v[27])) + k.emit(v_accvgpr_write(v[76], v[28])) + k.emit(v_accvgpr_write(v[80], v[29])) + k.emit(v_accvgpr_write(v[84], v[30])) + k.emit(v_accvgpr_read(v[25], v[104])) + k.emit(v_accvgpr_read(v[26], v[108])) + k.emit(v_accvgpr_read(v[27], v[112])) + k.emit(v_accvgpr_read(v[28], v[116])) + k.emit(v_accvgpr_read(v[29], v[120])) + k.emit(v_accvgpr_read(v[30], v[124])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[96], v[25])) + k.emit(v_accvgpr_write(v[100], v[26])) + k.emit(v_accvgpr_write(v[104], v[27])) + k.emit(v_accvgpr_write(v[108], v[28])) + k.emit(v_accvgpr_write(v[112], v[29])) + k.emit(v_accvgpr_write(v[116], v[30])) + k.emit(v_accvgpr_read(v[25], v[136])) + k.emit(v_accvgpr_read(v[26], v[140])) + k.emit(v_accvgpr_read(v[27], v[144])) + k.emit(v_accvgpr_read(v[28], v[148])) + k.emit(v_accvgpr_read(v[29], v[152])) + k.emit(v_accvgpr_read(v[30], v[156])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[128], v[25])) + k.emit(v_accvgpr_write(v[132], v[26])) + k.emit(v_accvgpr_write(v[136], v[27])) + k.emit(v_accvgpr_write(v[140], v[28])) + k.emit(v_accvgpr_write(v[144], v[29])) + k.emit(v_accvgpr_write(v[148], v[30])) + k.emit(v_accvgpr_read(v[25], v[168])) + k.emit(v_accvgpr_read(v[26], v[172])) + k.emit(v_accvgpr_read(v[27], v[176])) + k.emit(v_accvgpr_read(v[28], v[180])) + k.emit(v_accvgpr_read(v[29], v[184])) + k.emit(v_accvgpr_read(v[30], v[188])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[160], v[25])) + k.emit(v_accvgpr_write(v[164], v[26])) + k.emit(v_accvgpr_write(v[168], v[27])) + k.emit(v_accvgpr_write(v[172], v[28])) + k.emit(v_accvgpr_write(v[176], 
v[29])) + k.emit(v_accvgpr_write(v[180], v[30])) + k.emit(v_accvgpr_read(v[25], v[200])) + k.emit(v_accvgpr_read(v[26], v[204])) + k.emit(v_accvgpr_read(v[27], v[208])) + k.emit(v_accvgpr_read(v[28], v[212])) + k.emit(v_accvgpr_read(v[29], v[216])) + k.emit(v_accvgpr_read(v[30], v[220])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[192], v[25])) + k.emit(v_accvgpr_write(v[196], v[26])) + k.emit(v_accvgpr_write(v[200], v[27])) + k.emit(v_accvgpr_write(v[204], v[28])) + k.emit(v_accvgpr_write(v[208], v[29])) + k.emit(v_accvgpr_write(v[212], v[30])) + k.emit(v_accvgpr_read(v[25], v[232])) + k.emit(v_accvgpr_read(v[26], v[236])) + k.emit(v_accvgpr_read(v[27], v[240])) + k.emit(v_accvgpr_read(v[28], v[244])) + k.emit(v_accvgpr_read(v[29], v[248])) + k.emit(v_accvgpr_read(v[30], v[252])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[224], v[25])) + k.emit(v_accvgpr_write(v[228], v[26])) + k.emit(v_accvgpr_write(v[232], v[27])) + k.emit(v_accvgpr_write(v[236], v[28])) + k.emit(v_accvgpr_write(v[240], v[29])) + k.emit(v_accvgpr_write(v[244], v[30])) + k.emit(v_accvgpr_read(v[25], v[9])) + k.emit(v_accvgpr_read(v[26], v[13])) + k.emit(v_accvgpr_read(v[27], v[17])) + k.emit(v_accvgpr_read(v[28], v[21])) + k.emit(v_accvgpr_read(v[29], v[25])) + k.emit(v_accvgpr_read(v[30], v[29])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[1], v[25])) + k.emit(v_accvgpr_write(v[5], v[26])) + k.emit(v_accvgpr_write(v[9], v[27])) + k.emit(v_accvgpr_write(v[13], v[28])) + k.emit(v_accvgpr_write(v[17], v[29])) + k.emit(v_accvgpr_write(v[21], v[30])) + k.emit(v_accvgpr_read(v[25], v[41])) + k.emit(v_accvgpr_read(v[26], v[45])) + k.emit(v_accvgpr_read(v[27], v[49])) + k.emit(v_accvgpr_read(v[28], v[53])) + k.emit(v_accvgpr_read(v[29], v[57])) + k.emit(v_accvgpr_read(v[30], v[61])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[33], v[25])) + k.emit(v_accvgpr_write(v[37], v[26])) + k.emit(v_accvgpr_write(v[41], v[27])) + k.emit(v_accvgpr_write(v[45], v[28])) + k.emit(v_accvgpr_write(v[49], 
v[29])) + k.emit(v_accvgpr_write(v[53], v[30])) + k.emit(v_accvgpr_read(v[25], v[73])) + k.emit(v_accvgpr_read(v[26], v[77])) + k.emit(v_accvgpr_read(v[27], v[81])) + k.emit(v_accvgpr_read(v[28], v[85])) + k.emit(v_accvgpr_read(v[29], v[89])) + k.emit(v_accvgpr_read(v[30], v[93])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[65], v[25])) + k.emit(v_accvgpr_write(v[69], v[26])) + k.emit(v_accvgpr_write(v[73], v[27])) + k.emit(v_accvgpr_write(v[77], v[28])) + k.emit(v_accvgpr_write(v[81], v[29])) + k.emit(v_accvgpr_write(v[85], v[30])) + k.emit(v_accvgpr_read(v[25], v[105])) + k.emit(v_accvgpr_read(v[26], v[109])) + k.emit(v_accvgpr_read(v[27], v[113])) + k.emit(v_accvgpr_read(v[28], v[117])) + k.emit(v_accvgpr_read(v[29], v[121])) + k.emit(v_accvgpr_read(v[30], v[125])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[97], v[25])) + k.emit(v_accvgpr_write(v[101], v[26])) + k.emit(v_accvgpr_write(v[105], v[27])) + k.emit(v_accvgpr_write(v[109], v[28])) + k.emit(v_accvgpr_write(v[113], v[29])) + k.emit(v_accvgpr_write(v[117], v[30])) + k.emit(v_accvgpr_read(v[25], v[137])) + k.emit(v_accvgpr_read(v[26], v[141])) + k.emit(v_accvgpr_read(v[27], v[145])) + k.emit(v_accvgpr_read(v[28], v[149])) + k.emit(v_accvgpr_read(v[29], v[153])) + k.emit(v_accvgpr_read(v[30], v[157])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[129], v[25])) + k.emit(v_accvgpr_write(v[133], v[26])) + k.emit(v_accvgpr_write(v[137], v[27])) + k.emit(v_accvgpr_write(v[141], v[28])) + k.emit(v_accvgpr_write(v[145], v[29])) + k.emit(v_accvgpr_write(v[149], v[30])) + k.emit(v_accvgpr_read(v[25], v[169])) + k.emit(v_accvgpr_read(v[26], v[173])) + k.emit(v_accvgpr_read(v[27], v[177])) + k.emit(v_accvgpr_read(v[28], v[181])) + k.emit(v_accvgpr_read(v[29], v[185])) + k.emit(v_accvgpr_read(v[30], v[189])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[161], v[25])) + k.emit(v_accvgpr_write(v[165], v[26])) + k.emit(v_accvgpr_write(v[169], v[27])) + k.emit(v_accvgpr_write(v[173], v[28])) + 
k.emit(v_accvgpr_write(v[177], v[29])) + k.emit(v_accvgpr_write(v[181], v[30])) + k.emit(v_accvgpr_read(v[25], v[201])) + k.emit(v_accvgpr_read(v[26], v[205])) + k.emit(v_accvgpr_read(v[27], v[209])) + k.emit(v_accvgpr_read(v[28], v[213])) + k.emit(v_accvgpr_read(v[29], v[217])) + k.emit(v_accvgpr_read(v[30], v[221])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[193], v[25])) + k.emit(v_accvgpr_write(v[197], v[26])) + k.emit(v_accvgpr_write(v[201], v[27])) + k.emit(v_accvgpr_write(v[205], v[28])) + k.emit(v_accvgpr_write(v[209], v[29])) + k.emit(v_accvgpr_write(v[213], v[30])) + k.emit(v_accvgpr_read(v[25], v[233])) + k.emit(v_accvgpr_read(v[26], v[237])) + k.emit(v_accvgpr_read(v[27], v[241])) + k.emit(v_accvgpr_read(v[28], v[245])) + k.emit(v_accvgpr_read(v[29], v[249])) + k.emit(v_accvgpr_read(v[30], v[253])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[225], v[25])) + k.emit(v_accvgpr_write(v[229], v[26])) + k.emit(v_accvgpr_write(v[233], v[27])) + k.emit(v_accvgpr_write(v[237], v[28])) + k.emit(v_accvgpr_write(v[241], v[29])) + k.emit(v_accvgpr_write(v[245], v[30])) + k.emit(v_accvgpr_read(v[25], v[10])) + k.emit(v_accvgpr_read(v[26], v[14])) + k.emit(v_accvgpr_read(v[27], v[18])) + k.emit(v_accvgpr_read(v[28], v[22])) + k.emit(v_accvgpr_read(v[29], v[26])) + k.emit(v_accvgpr_read(v[30], v[30])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[2], v[25])) + k.emit(v_accvgpr_write(v[6], v[26])) + k.emit(v_accvgpr_write(v[10], v[27])) + k.emit(v_accvgpr_write(v[14], v[28])) + k.emit(v_accvgpr_write(v[18], v[29])) + k.emit(v_accvgpr_write(v[22], v[30])) + k.emit(v_accvgpr_read(v[25], v[42])) + k.emit(v_accvgpr_read(v[26], v[46])) + k.emit(v_accvgpr_read(v[27], v[50])) + k.emit(v_accvgpr_read(v[28], v[54])) + k.emit(v_accvgpr_read(v[29], v[58])) + k.emit(v_accvgpr_read(v[30], v[62])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[34], v[25])) + k.emit(v_accvgpr_write(v[38], v[26])) + k.emit(v_accvgpr_write(v[42], v[27])) + k.emit(v_accvgpr_write(v[46], v[28])) + 
k.emit(v_accvgpr_write(v[50], v[29])) + k.emit(v_accvgpr_write(v[54], v[30])) + k.emit(v_accvgpr_read(v[25], v[74])) + k.emit(v_accvgpr_read(v[26], v[78])) + k.emit(v_accvgpr_read(v[27], v[82])) + k.emit(v_accvgpr_read(v[28], v[86])) + k.emit(v_accvgpr_read(v[29], v[90])) + k.emit(v_accvgpr_read(v[30], v[94])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[66], v[25])) + k.emit(v_accvgpr_write(v[70], v[26])) + k.emit(v_accvgpr_write(v[74], v[27])) + k.emit(v_accvgpr_write(v[78], v[28])) + k.emit(v_accvgpr_write(v[82], v[29])) + k.emit(v_accvgpr_write(v[86], v[30])) + k.emit(v_accvgpr_read(v[25], v[106])) + k.emit(v_accvgpr_read(v[26], v[110])) + k.emit(v_accvgpr_read(v[27], v[114])) + k.emit(v_accvgpr_read(v[28], v[118])) + k.emit(v_accvgpr_read(v[29], v[122])) + k.emit(v_accvgpr_read(v[30], v[126])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[98], v[25])) + k.emit(v_accvgpr_write(v[102], v[26])) + k.emit(v_accvgpr_write(v[106], v[27])) + k.emit(v_accvgpr_write(v[110], v[28])) + k.emit(v_accvgpr_write(v[114], v[29])) + k.emit(v_accvgpr_write(v[118], v[30])) + k.emit(v_accvgpr_read(v[25], v[138])) + k.emit(v_accvgpr_read(v[26], v[142])) + k.emit(v_accvgpr_read(v[27], v[146])) + k.emit(v_accvgpr_read(v[28], v[150])) + k.emit(v_accvgpr_read(v[29], v[154])) + k.emit(v_accvgpr_read(v[30], v[158])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[130], v[25])) + k.emit(v_accvgpr_write(v[134], v[26])) + k.emit(v_accvgpr_write(v[138], v[27])) + k.emit(v_accvgpr_write(v[142], v[28])) + k.emit(v_accvgpr_write(v[146], v[29])) + k.emit(v_accvgpr_write(v[150], v[30])) + k.emit(v_accvgpr_read(v[25], v[170])) + k.emit(v_accvgpr_read(v[26], v[174])) + k.emit(v_accvgpr_read(v[27], v[178])) + k.emit(v_accvgpr_read(v[28], v[182])) + k.emit(v_accvgpr_read(v[29], v[186])) + k.emit(v_accvgpr_read(v[30], v[190])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[162], v[25])) + k.emit(v_accvgpr_write(v[166], v[26])) + k.emit(v_accvgpr_write(v[170], v[27])) + k.emit(v_accvgpr_write(v[174], 
v[28])) + k.emit(v_accvgpr_write(v[178], v[29])) + k.emit(v_accvgpr_write(v[182], v[30])) + k.emit(v_accvgpr_read(v[25], v[202])) + k.emit(v_accvgpr_read(v[26], v[206])) + k.emit(v_accvgpr_read(v[27], v[210])) + k.emit(v_accvgpr_read(v[28], v[214])) + k.emit(v_accvgpr_read(v[29], v[218])) + k.emit(v_accvgpr_read(v[30], v[222])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[194], v[25])) + k.emit(v_accvgpr_write(v[198], v[26])) + k.emit(v_accvgpr_write(v[202], v[27])) + k.emit(v_accvgpr_write(v[206], v[28])) + k.emit(v_accvgpr_write(v[210], v[29])) + k.emit(v_accvgpr_write(v[214], v[30])) + k.emit(v_accvgpr_read(v[25], v[234])) + k.emit(v_accvgpr_read(v[26], v[238])) + k.emit(v_accvgpr_read(v[27], v[242])) + k.emit(v_accvgpr_read(v[28], v[246])) + k.emit(v_accvgpr_read(v[29], v[250])) + k.emit(v_accvgpr_read(v[30], v[254])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[226], v[25])) + k.emit(v_accvgpr_write(v[230], v[26])) + k.emit(v_accvgpr_write(v[234], v[27])) + k.emit(v_accvgpr_write(v[238], v[28])) + k.emit(v_accvgpr_write(v[242], v[29])) + k.emit(v_accvgpr_write(v[246], v[30])) + k.emit(v_accvgpr_read(v[25], v[11])) + k.emit(v_accvgpr_read(v[26], v[15])) + k.emit(v_accvgpr_read(v[27], v[19])) + k.emit(v_accvgpr_read(v[28], v[23])) + k.emit(v_accvgpr_read(v[29], v[27])) + k.emit(v_accvgpr_read(v[30], v[31])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[3], v[25])) + k.emit(v_accvgpr_write(v[7], v[26])) + k.emit(v_accvgpr_write(v[11], v[27])) + k.emit(v_accvgpr_write(v[15], v[28])) + k.emit(v_accvgpr_write(v[19], v[29])) + k.emit(v_accvgpr_write(v[23], v[30])) + k.emit(v_accvgpr_read(v[25], v[43])) + k.emit(v_accvgpr_read(v[26], v[47])) + k.emit(v_accvgpr_read(v[27], v[51])) + k.emit(v_accvgpr_read(v[28], v[55])) + k.emit(v_accvgpr_read(v[29], v[59])) + k.emit(v_accvgpr_read(v[30], v[63])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[35], v[25])) + k.emit(v_accvgpr_write(v[39], v[26])) + k.emit(v_accvgpr_write(v[43], v[27])) + k.emit(v_accvgpr_write(v[47], 
v[28])) + k.emit(v_accvgpr_write(v[51], v[29])) + k.emit(v_accvgpr_write(v[55], v[30])) + k.emit(v_accvgpr_read(v[25], v[75])) + k.emit(v_accvgpr_read(v[26], v[79])) + k.emit(v_accvgpr_read(v[27], v[83])) + k.emit(v_accvgpr_read(v[28], v[87])) + k.emit(v_accvgpr_read(v[29], v[91])) + k.emit(v_accvgpr_read(v[30], v[95])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[67], v[25])) + k.emit(v_accvgpr_write(v[71], v[26])) + k.emit(v_accvgpr_write(v[75], v[27])) + k.emit(v_accvgpr_write(v[79], v[28])) + k.emit(v_accvgpr_write(v[83], v[29])) + k.emit(v_accvgpr_write(v[87], v[30])) + k.emit(v_accvgpr_read(v[25], v[107])) + k.emit(v_accvgpr_read(v[26], v[111])) + k.emit(v_accvgpr_read(v[27], v[115])) + k.emit(v_accvgpr_read(v[28], v[119])) + k.emit(v_accvgpr_read(v[29], v[123])) + k.emit(v_accvgpr_read(v[30], v[127])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[99], v[25])) + k.emit(v_accvgpr_write(v[103], v[26])) + k.emit(v_accvgpr_write(v[107], v[27])) + k.emit(v_accvgpr_write(v[111], v[28])) + k.emit(v_accvgpr_write(v[115], v[29])) + k.emit(v_accvgpr_write(v[119], v[30])) + k.emit(v_accvgpr_read(v[25], v[139])) + k.emit(v_accvgpr_read(v[26], v[143])) + k.emit(v_accvgpr_read(v[27], v[147])) + k.emit(v_accvgpr_read(v[28], v[151])) + k.emit(v_accvgpr_read(v[29], v[155])) + k.emit(v_accvgpr_read(v[30], v[159])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[131], v[25])) + k.emit(v_accvgpr_write(v[135], v[26])) + k.emit(v_accvgpr_write(v[139], v[27])) + k.emit(v_accvgpr_write(v[143], v[28])) + k.emit(v_accvgpr_write(v[147], v[29])) + k.emit(v_accvgpr_write(v[151], v[30])) + k.emit(v_accvgpr_read(v[25], v[171])) + k.emit(v_accvgpr_read(v[26], v[175])) + k.emit(v_accvgpr_read(v[27], v[179])) + k.emit(v_accvgpr_read(v[28], v[183])) + k.emit(v_accvgpr_read(v[29], v[187])) + k.emit(v_accvgpr_read(v[30], v[191])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[163], v[25])) + k.emit(v_accvgpr_write(v[167], v[26])) + k.emit(v_accvgpr_write(v[171], v[27])) + 
k.emit(v_accvgpr_write(v[175], v[28])) + k.emit(v_accvgpr_write(v[179], v[29])) + k.emit(v_accvgpr_write(v[183], v[30])) + k.emit(v_accvgpr_read(v[25], v[203])) + k.emit(v_accvgpr_read(v[26], v[207])) + k.emit(v_accvgpr_read(v[27], v[211])) + k.emit(v_accvgpr_read(v[28], v[215])) + k.emit(v_accvgpr_read(v[29], v[219])) + k.emit(v_accvgpr_read(v[30], v[223])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[195], v[25])) + k.emit(v_accvgpr_write(v[199], v[26])) + k.emit(v_accvgpr_write(v[203], v[27])) + k.emit(v_accvgpr_write(v[207], v[28])) + k.emit(v_accvgpr_write(v[211], v[29])) + k.emit(v_accvgpr_write(v[215], v[30])) + k.emit(v_accvgpr_read(v[25], v[235])) + k.emit(v_accvgpr_read(v[26], v[239])) + k.emit(v_accvgpr_read(v[27], v[243])) + k.emit(v_accvgpr_read(v[28], v[247])) + k.emit(v_accvgpr_read(v[29], v[251])) + k.emit(v_accvgpr_read(v[30], v[255])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[227], v[25])) + k.emit(v_accvgpr_write(v[231], v[26])) + k.emit(v_accvgpr_write(v[235], v[27])) + k.emit(v_accvgpr_write(v[239], v[28])) + k.emit(v_accvgpr_write(v[243], v[29])) + k.emit(v_accvgpr_write(v[247], v[30])) + k.emit(s_mov_b64(s[8:9], -1)) + k.emit(s_or_saveexec_b64(VCC, s[8:9])) + k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0') + k.label('ShiftVectorComponents0_GLVW7_BM0_VW0') + k.emit(s_mov_b32(s[8], 0)) + k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8])) + k.emit(v_and_b32_e32(v[18], 63, v[180])) + k.emit(v_lshlrev_b32_e32(v[18], 2, v[18])) + k.emit(v_accvgpr_read(v[25], v[4])) + k.emit(v_accvgpr_read(v[26], v[8])) + k.emit(v_accvgpr_read(v[27], v[12])) + k.emit(v_accvgpr_read(v[28], v[16])) + k.emit(v_accvgpr_read(v[29], v[20])) + k.emit(v_accvgpr_read(v[30], v[24])) + k.emit(v_accvgpr_read(v[31], v[28])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[0], v[25])) + k.emit(v_accvgpr_write(v[4], v[26])) + k.emit(v_accvgpr_write(v[8], v[27])) + k.emit(v_accvgpr_write(v[12], v[28])) + k.emit(v_accvgpr_write(v[16], v[29])) + k.emit(v_accvgpr_write(v[20], 
v[30])) + k.emit(v_accvgpr_write(v[24], v[31])) + k.emit(v_accvgpr_read(v[25], v[36])) + k.emit(v_accvgpr_read(v[26], v[40])) + k.emit(v_accvgpr_read(v[27], v[44])) + k.emit(v_accvgpr_read(v[28], v[48])) + k.emit(v_accvgpr_read(v[29], v[52])) + k.emit(v_accvgpr_read(v[30], v[56])) + k.emit(v_accvgpr_read(v[31], v[60])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[32], v[25])) + k.emit(v_accvgpr_write(v[36], v[26])) + k.emit(v_accvgpr_write(v[40], v[27])) + k.emit(v_accvgpr_write(v[44], v[28])) + k.emit(v_accvgpr_write(v[48], v[29])) + k.emit(v_accvgpr_write(v[52], v[30])) + k.emit(v_accvgpr_write(v[56], v[31])) + k.emit(v_accvgpr_read(v[25], v[68])) + k.emit(v_accvgpr_read(v[26], v[72])) + k.emit(v_accvgpr_read(v[27], v[76])) + k.emit(v_accvgpr_read(v[28], v[80])) + k.emit(v_accvgpr_read(v[29], v[84])) + k.emit(v_accvgpr_read(v[30], v[88])) + k.emit(v_accvgpr_read(v[31], v[92])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[64], v[25])) + k.emit(v_accvgpr_write(v[68], v[26])) + k.emit(v_accvgpr_write(v[72], v[27])) + k.emit(v_accvgpr_write(v[76], v[28])) + k.emit(v_accvgpr_write(v[80], v[29])) + k.emit(v_accvgpr_write(v[84], v[30])) + k.emit(v_accvgpr_write(v[88], v[31])) + k.emit(v_accvgpr_read(v[25], v[100])) + k.emit(v_accvgpr_read(v[26], v[104])) + k.emit(v_accvgpr_read(v[27], v[108])) + k.emit(v_accvgpr_read(v[28], v[112])) + k.emit(v_accvgpr_read(v[29], v[116])) + k.emit(v_accvgpr_read(v[30], v[120])) + k.emit(v_accvgpr_read(v[31], v[124])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[96], v[25])) + k.emit(v_accvgpr_write(v[100], v[26])) + k.emit(v_accvgpr_write(v[104], v[27])) + k.emit(v_accvgpr_write(v[108], v[28])) + k.emit(v_accvgpr_write(v[112], v[29])) + k.emit(v_accvgpr_write(v[116], v[30])) + k.emit(v_accvgpr_write(v[120], v[31])) + k.emit(v_accvgpr_read(v[25], v[132])) + k.emit(v_accvgpr_read(v[26], v[136])) + k.emit(v_accvgpr_read(v[27], v[140])) + k.emit(v_accvgpr_read(v[28], v[144])) + k.emit(v_accvgpr_read(v[29], v[148])) + 
k.emit(v_accvgpr_read(v[30], v[152])) + k.emit(v_accvgpr_read(v[31], v[156])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[128], v[25])) + k.emit(v_accvgpr_write(v[132], v[26])) + k.emit(v_accvgpr_write(v[136], v[27])) + k.emit(v_accvgpr_write(v[140], v[28])) + k.emit(v_accvgpr_write(v[144], v[29])) + k.emit(v_accvgpr_write(v[148], v[30])) + k.emit(v_accvgpr_write(v[152], v[31])) + k.emit(v_accvgpr_read(v[25], v[164])) + k.emit(v_accvgpr_read(v[26], v[168])) + k.emit(v_accvgpr_read(v[27], v[172])) + k.emit(v_accvgpr_read(v[28], v[176])) + k.emit(v_accvgpr_read(v[29], v[180])) + k.emit(v_accvgpr_read(v[30], v[184])) + k.emit(v_accvgpr_read(v[31], v[188])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[160], v[25])) + k.emit(v_accvgpr_write(v[164], v[26])) + k.emit(v_accvgpr_write(v[168], v[27])) + k.emit(v_accvgpr_write(v[172], v[28])) + k.emit(v_accvgpr_write(v[176], v[29])) + k.emit(v_accvgpr_write(v[180], v[30])) + k.emit(v_accvgpr_write(v[184], v[31])) + k.emit(v_accvgpr_read(v[25], v[196])) + k.emit(v_accvgpr_read(v[26], v[200])) + k.emit(v_accvgpr_read(v[27], v[204])) + k.emit(v_accvgpr_read(v[28], v[208])) + k.emit(v_accvgpr_read(v[29], v[212])) + k.emit(v_accvgpr_read(v[30], v[216])) + k.emit(v_accvgpr_read(v[31], v[220])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[192], v[25])) + k.emit(v_accvgpr_write(v[196], v[26])) + k.emit(v_accvgpr_write(v[200], v[27])) + k.emit(v_accvgpr_write(v[204], v[28])) + k.emit(v_accvgpr_write(v[208], v[29])) + k.emit(v_accvgpr_write(v[212], v[30])) + k.emit(v_accvgpr_write(v[216], v[31])) + k.emit(v_accvgpr_read(v[25], v[228])) + k.emit(v_accvgpr_read(v[26], v[232])) + k.emit(v_accvgpr_read(v[27], v[236])) + k.emit(v_accvgpr_read(v[28], v[240])) + k.emit(v_accvgpr_read(v[29], v[244])) + k.emit(v_accvgpr_read(v[30], v[248])) + k.emit(v_accvgpr_read(v[31], v[252])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[224], v[25])) + k.emit(v_accvgpr_write(v[228], v[26])) + k.emit(v_accvgpr_write(v[232], v[27])) + 
k.emit(v_accvgpr_write(v[236], v[28])) + k.emit(v_accvgpr_write(v[240], v[29])) + k.emit(v_accvgpr_write(v[244], v[30])) + k.emit(v_accvgpr_write(v[248], v[31])) + k.emit(v_accvgpr_read(v[25], v[5])) + k.emit(v_accvgpr_read(v[26], v[9])) + k.emit(v_accvgpr_read(v[27], v[13])) + k.emit(v_accvgpr_read(v[28], v[17])) + k.emit(v_accvgpr_read(v[29], v[21])) + k.emit(v_accvgpr_read(v[30], v[25])) + k.emit(v_accvgpr_read(v[31], v[29])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[1], v[25])) + k.emit(v_accvgpr_write(v[5], v[26])) + k.emit(v_accvgpr_write(v[9], v[27])) + k.emit(v_accvgpr_write(v[13], v[28])) + k.emit(v_accvgpr_write(v[17], v[29])) + k.emit(v_accvgpr_write(v[21], v[30])) + k.emit(v_accvgpr_write(v[25], v[31])) + k.emit(v_accvgpr_read(v[25], v[37])) + k.emit(v_accvgpr_read(v[26], v[41])) + k.emit(v_accvgpr_read(v[27], v[45])) + k.emit(v_accvgpr_read(v[28], v[49])) + k.emit(v_accvgpr_read(v[29], v[53])) + k.emit(v_accvgpr_read(v[30], v[57])) + k.emit(v_accvgpr_read(v[31], v[61])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[33], v[25])) + k.emit(v_accvgpr_write(v[37], v[26])) + k.emit(v_accvgpr_write(v[41], v[27])) + k.emit(v_accvgpr_write(v[45], v[28])) + k.emit(v_accvgpr_write(v[49], v[29])) + k.emit(v_accvgpr_write(v[53], v[30])) + k.emit(v_accvgpr_write(v[57], v[31])) + k.emit(v_accvgpr_read(v[25], v[69])) + k.emit(v_accvgpr_read(v[26], v[73])) + k.emit(v_accvgpr_read(v[27], v[77])) + k.emit(v_accvgpr_read(v[28], v[81])) + k.emit(v_accvgpr_read(v[29], v[85])) + k.emit(v_accvgpr_read(v[30], v[89])) + k.emit(v_accvgpr_read(v[31], v[93])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[65], v[25])) + k.emit(v_accvgpr_write(v[69], v[26])) + k.emit(v_accvgpr_write(v[73], v[27])) + k.emit(v_accvgpr_write(v[77], v[28])) + k.emit(v_accvgpr_write(v[81], v[29])) + k.emit(v_accvgpr_write(v[85], v[30])) + k.emit(v_accvgpr_write(v[89], v[31])) + k.emit(v_accvgpr_read(v[25], v[101])) + k.emit(v_accvgpr_read(v[26], v[105])) + k.emit(v_accvgpr_read(v[27], v[109])) + 
k.emit(v_accvgpr_read(v[28], v[113])) + k.emit(v_accvgpr_read(v[29], v[117])) + k.emit(v_accvgpr_read(v[30], v[121])) + k.emit(v_accvgpr_read(v[31], v[125])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[97], v[25])) + k.emit(v_accvgpr_write(v[101], v[26])) + k.emit(v_accvgpr_write(v[105], v[27])) + k.emit(v_accvgpr_write(v[109], v[28])) + k.emit(v_accvgpr_write(v[113], v[29])) + k.emit(v_accvgpr_write(v[117], v[30])) + k.emit(v_accvgpr_write(v[121], v[31])) + k.emit(v_accvgpr_read(v[25], v[133])) + k.emit(v_accvgpr_read(v[26], v[137])) + k.emit(v_accvgpr_read(v[27], v[141])) + k.emit(v_accvgpr_read(v[28], v[145])) + k.emit(v_accvgpr_read(v[29], v[149])) + k.emit(v_accvgpr_read(v[30], v[153])) + k.emit(v_accvgpr_read(v[31], v[157])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[129], v[25])) + k.emit(v_accvgpr_write(v[133], v[26])) + k.emit(v_accvgpr_write(v[137], v[27])) + k.emit(v_accvgpr_write(v[141], v[28])) + k.emit(v_accvgpr_write(v[145], v[29])) + k.emit(v_accvgpr_write(v[149], v[30])) + k.emit(v_accvgpr_write(v[153], v[31])) + k.emit(v_accvgpr_read(v[25], v[165])) + k.emit(v_accvgpr_read(v[26], v[169])) + k.emit(v_accvgpr_read(v[27], v[173])) + k.emit(v_accvgpr_read(v[28], v[177])) + k.emit(v_accvgpr_read(v[29], v[181])) + k.emit(v_accvgpr_read(v[30], v[185])) + k.emit(v_accvgpr_read(v[31], v[189])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[161], v[25])) + k.emit(v_accvgpr_write(v[165], v[26])) + k.emit(v_accvgpr_write(v[169], v[27])) + k.emit(v_accvgpr_write(v[173], v[28])) + k.emit(v_accvgpr_write(v[177], v[29])) + k.emit(v_accvgpr_write(v[181], v[30])) + k.emit(v_accvgpr_write(v[185], v[31])) + k.emit(v_accvgpr_read(v[25], v[197])) + k.emit(v_accvgpr_read(v[26], v[201])) + k.emit(v_accvgpr_read(v[27], v[205])) + k.emit(v_accvgpr_read(v[28], v[209])) + k.emit(v_accvgpr_read(v[29], v[213])) + k.emit(v_accvgpr_read(v[30], v[217])) + k.emit(v_accvgpr_read(v[31], v[221])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[193], v[25])) + 
k.emit(v_accvgpr_write(v[197], v[26])) + k.emit(v_accvgpr_write(v[201], v[27])) + k.emit(v_accvgpr_write(v[205], v[28])) + k.emit(v_accvgpr_write(v[209], v[29])) + k.emit(v_accvgpr_write(v[213], v[30])) + k.emit(v_accvgpr_write(v[217], v[31])) + k.emit(v_accvgpr_read(v[25], v[229])) + k.emit(v_accvgpr_read(v[26], v[233])) + k.emit(v_accvgpr_read(v[27], v[237])) + k.emit(v_accvgpr_read(v[28], v[241])) + k.emit(v_accvgpr_read(v[29], v[245])) + k.emit(v_accvgpr_read(v[30], v[249])) + k.emit(v_accvgpr_read(v[31], v[253])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[225], v[25])) + k.emit(v_accvgpr_write(v[229], v[26])) + k.emit(v_accvgpr_write(v[233], v[27])) + k.emit(v_accvgpr_write(v[237], v[28])) + k.emit(v_accvgpr_write(v[241], v[29])) + k.emit(v_accvgpr_write(v[245], v[30])) + k.emit(v_accvgpr_write(v[249], v[31])) + k.emit(v_accvgpr_read(v[25], v[6])) + k.emit(v_accvgpr_read(v[26], v[10])) + k.emit(v_accvgpr_read(v[27], v[14])) + k.emit(v_accvgpr_read(v[28], v[18])) + k.emit(v_accvgpr_read(v[29], v[22])) + k.emit(v_accvgpr_read(v[30], v[26])) + k.emit(v_accvgpr_read(v[31], v[30])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[2], v[25])) + k.emit(v_accvgpr_write(v[6], v[26])) + k.emit(v_accvgpr_write(v[10], v[27])) + k.emit(v_accvgpr_write(v[14], v[28])) + k.emit(v_accvgpr_write(v[18], v[29])) + k.emit(v_accvgpr_write(v[22], v[30])) + k.emit(v_accvgpr_write(v[26], v[31])) + k.emit(v_accvgpr_read(v[25], v[38])) + k.emit(v_accvgpr_read(v[26], v[42])) + k.emit(v_accvgpr_read(v[27], v[46])) + k.emit(v_accvgpr_read(v[28], v[50])) + k.emit(v_accvgpr_read(v[29], v[54])) + k.emit(v_accvgpr_read(v[30], v[58])) + k.emit(v_accvgpr_read(v[31], v[62])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[34], v[25])) + k.emit(v_accvgpr_write(v[38], v[26])) + k.emit(v_accvgpr_write(v[42], v[27])) + k.emit(v_accvgpr_write(v[46], v[28])) + k.emit(v_accvgpr_write(v[50], v[29])) + k.emit(v_accvgpr_write(v[54], v[30])) + k.emit(v_accvgpr_write(v[58], v[31])) + 
k.emit(v_accvgpr_read(v[25], v[70])) + k.emit(v_accvgpr_read(v[26], v[74])) + k.emit(v_accvgpr_read(v[27], v[78])) + k.emit(v_accvgpr_read(v[28], v[82])) + k.emit(v_accvgpr_read(v[29], v[86])) + k.emit(v_accvgpr_read(v[30], v[90])) + k.emit(v_accvgpr_read(v[31], v[94])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[66], v[25])) + k.emit(v_accvgpr_write(v[70], v[26])) + k.emit(v_accvgpr_write(v[74], v[27])) + k.emit(v_accvgpr_write(v[78], v[28])) + k.emit(v_accvgpr_write(v[82], v[29])) + k.emit(v_accvgpr_write(v[86], v[30])) + k.emit(v_accvgpr_write(v[90], v[31])) + k.emit(v_accvgpr_read(v[25], v[102])) + k.emit(v_accvgpr_read(v[26], v[106])) + k.emit(v_accvgpr_read(v[27], v[110])) + k.emit(v_accvgpr_read(v[28], v[114])) + k.emit(v_accvgpr_read(v[29], v[118])) + k.emit(v_accvgpr_read(v[30], v[122])) + k.emit(v_accvgpr_read(v[31], v[126])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[98], v[25])) + k.emit(v_accvgpr_write(v[102], v[26])) + k.emit(v_accvgpr_write(v[106], v[27])) + k.emit(v_accvgpr_write(v[110], v[28])) + k.emit(v_accvgpr_write(v[114], v[29])) + k.emit(v_accvgpr_write(v[118], v[30])) + k.emit(v_accvgpr_write(v[122], v[31])) + k.emit(v_accvgpr_read(v[25], v[134])) + k.emit(v_accvgpr_read(v[26], v[138])) + k.emit(v_accvgpr_read(v[27], v[142])) + k.emit(v_accvgpr_read(v[28], v[146])) + k.emit(v_accvgpr_read(v[29], v[150])) + k.emit(v_accvgpr_read(v[30], v[154])) + k.emit(v_accvgpr_read(v[31], v[158])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[130], v[25])) + k.emit(v_accvgpr_write(v[134], v[26])) + k.emit(v_accvgpr_write(v[138], v[27])) + k.emit(v_accvgpr_write(v[142], v[28])) + k.emit(v_accvgpr_write(v[146], v[29])) + k.emit(v_accvgpr_write(v[150], v[30])) + k.emit(v_accvgpr_write(v[154], v[31])) + k.emit(v_accvgpr_read(v[25], v[166])) + k.emit(v_accvgpr_read(v[26], v[170])) + k.emit(v_accvgpr_read(v[27], v[174])) + k.emit(v_accvgpr_read(v[28], v[178])) + k.emit(v_accvgpr_read(v[29], v[182])) + k.emit(v_accvgpr_read(v[30], v[186])) + 
k.emit(v_accvgpr_read(v[31], v[190])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[162], v[25])) + k.emit(v_accvgpr_write(v[166], v[26])) + k.emit(v_accvgpr_write(v[170], v[27])) + k.emit(v_accvgpr_write(v[174], v[28])) + k.emit(v_accvgpr_write(v[178], v[29])) + k.emit(v_accvgpr_write(v[182], v[30])) + k.emit(v_accvgpr_write(v[186], v[31])) + k.emit(v_accvgpr_read(v[25], v[198])) + k.emit(v_accvgpr_read(v[26], v[202])) + k.emit(v_accvgpr_read(v[27], v[206])) + k.emit(v_accvgpr_read(v[28], v[210])) + k.emit(v_accvgpr_read(v[29], v[214])) + k.emit(v_accvgpr_read(v[30], v[218])) + k.emit(v_accvgpr_read(v[31], v[222])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[194], v[25])) + k.emit(v_accvgpr_write(v[198], v[26])) + k.emit(v_accvgpr_write(v[202], v[27])) + k.emit(v_accvgpr_write(v[206], v[28])) + k.emit(v_accvgpr_write(v[210], v[29])) + k.emit(v_accvgpr_write(v[214], v[30])) + k.emit(v_accvgpr_write(v[218], v[31])) + k.emit(v_accvgpr_read(v[25], v[230])) + k.emit(v_accvgpr_read(v[26], v[234])) + k.emit(v_accvgpr_read(v[27], v[238])) + k.emit(v_accvgpr_read(v[28], v[242])) + k.emit(v_accvgpr_read(v[29], v[246])) + k.emit(v_accvgpr_read(v[30], v[250])) + k.emit(v_accvgpr_read(v[31], v[254])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[226], v[25])) + k.emit(v_accvgpr_write(v[230], v[26])) + k.emit(v_accvgpr_write(v[234], v[27])) + k.emit(v_accvgpr_write(v[238], v[28])) + k.emit(v_accvgpr_write(v[242], v[29])) + k.emit(v_accvgpr_write(v[246], v[30])) + k.emit(v_accvgpr_write(v[250], v[31])) + k.emit(v_accvgpr_read(v[25], v[7])) + k.emit(v_accvgpr_read(v[26], v[11])) + k.emit(v_accvgpr_read(v[27], v[15])) + k.emit(v_accvgpr_read(v[28], v[19])) + k.emit(v_accvgpr_read(v[29], v[23])) + k.emit(v_accvgpr_read(v[30], v[27])) + k.emit(v_accvgpr_read(v[31], v[31])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[3], v[25])) + k.emit(v_accvgpr_write(v[7], v[26])) + k.emit(v_accvgpr_write(v[11], v[27])) + k.emit(v_accvgpr_write(v[15], v[28])) + k.emit(v_accvgpr_write(v[19], 
v[29])) + k.emit(v_accvgpr_write(v[23], v[30])) + k.emit(v_accvgpr_write(v[27], v[31])) + k.emit(v_accvgpr_read(v[25], v[39])) + k.emit(v_accvgpr_read(v[26], v[43])) + k.emit(v_accvgpr_read(v[27], v[47])) + k.emit(v_accvgpr_read(v[28], v[51])) + k.emit(v_accvgpr_read(v[29], v[55])) + k.emit(v_accvgpr_read(v[30], v[59])) + k.emit(v_accvgpr_read(v[31], v[63])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[35], v[25])) + k.emit(v_accvgpr_write(v[39], v[26])) + k.emit(v_accvgpr_write(v[43], v[27])) + k.emit(v_accvgpr_write(v[47], v[28])) + k.emit(v_accvgpr_write(v[51], v[29])) + k.emit(v_accvgpr_write(v[55], v[30])) + k.emit(v_accvgpr_write(v[59], v[31])) + k.emit(v_accvgpr_read(v[25], v[71])) + k.emit(v_accvgpr_read(v[26], v[75])) + k.emit(v_accvgpr_read(v[27], v[79])) + k.emit(v_accvgpr_read(v[28], v[83])) + k.emit(v_accvgpr_read(v[29], v[87])) + k.emit(v_accvgpr_read(v[30], v[91])) + k.emit(v_accvgpr_read(v[31], v[95])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[67], v[25])) + k.emit(v_accvgpr_write(v[71], v[26])) + k.emit(v_accvgpr_write(v[75], v[27])) + k.emit(v_accvgpr_write(v[79], v[28])) + k.emit(v_accvgpr_write(v[83], v[29])) + k.emit(v_accvgpr_write(v[87], v[30])) + k.emit(v_accvgpr_write(v[91], v[31])) + k.emit(v_accvgpr_read(v[25], v[103])) + k.emit(v_accvgpr_read(v[26], v[107])) + k.emit(v_accvgpr_read(v[27], v[111])) + k.emit(v_accvgpr_read(v[28], v[115])) + k.emit(v_accvgpr_read(v[29], v[119])) + k.emit(v_accvgpr_read(v[30], v[123])) + k.emit(v_accvgpr_read(v[31], v[127])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[99], v[25])) + k.emit(v_accvgpr_write(v[103], v[26])) + k.emit(v_accvgpr_write(v[107], v[27])) + k.emit(v_accvgpr_write(v[111], v[28])) + k.emit(v_accvgpr_write(v[115], v[29])) + k.emit(v_accvgpr_write(v[119], v[30])) + k.emit(v_accvgpr_write(v[123], v[31])) + k.emit(v_accvgpr_read(v[25], v[135])) + k.emit(v_accvgpr_read(v[26], v[139])) + k.emit(v_accvgpr_read(v[27], v[143])) + k.emit(v_accvgpr_read(v[28], v[147])) + 
k.emit(v_accvgpr_read(v[29], v[151])) + k.emit(v_accvgpr_read(v[30], v[155])) + k.emit(v_accvgpr_read(v[31], v[159])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[131], v[25])) + k.emit(v_accvgpr_write(v[135], v[26])) + k.emit(v_accvgpr_write(v[139], v[27])) + k.emit(v_accvgpr_write(v[143], v[28])) + k.emit(v_accvgpr_write(v[147], v[29])) + k.emit(v_accvgpr_write(v[151], v[30])) + k.emit(v_accvgpr_write(v[155], v[31])) + k.emit(v_accvgpr_read(v[25], v[167])) + k.emit(v_accvgpr_read(v[26], v[171])) + k.emit(v_accvgpr_read(v[27], v[175])) + k.emit(v_accvgpr_read(v[28], v[179])) + k.emit(v_accvgpr_read(v[29], v[183])) + k.emit(v_accvgpr_read(v[30], v[187])) + k.emit(v_accvgpr_read(v[31], v[191])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[163], v[25])) + k.emit(v_accvgpr_write(v[167], v[26])) + k.emit(v_accvgpr_write(v[171], v[27])) + k.emit(v_accvgpr_write(v[175], v[28])) + k.emit(v_accvgpr_write(v[179], v[29])) + k.emit(v_accvgpr_write(v[183], v[30])) + k.emit(v_accvgpr_write(v[187], v[31])) + k.emit(v_accvgpr_read(v[25], v[199])) + k.emit(v_accvgpr_read(v[26], v[203])) + k.emit(v_accvgpr_read(v[27], v[207])) + k.emit(v_accvgpr_read(v[28], v[211])) + k.emit(v_accvgpr_read(v[29], v[215])) + k.emit(v_accvgpr_read(v[30], v[219])) + k.emit(v_accvgpr_read(v[31], v[223])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[195], v[25])) + k.emit(v_accvgpr_write(v[199], v[26])) + k.emit(v_accvgpr_write(v[203], v[27])) + k.emit(v_accvgpr_write(v[207], v[28])) + k.emit(v_accvgpr_write(v[211], v[29])) + k.emit(v_accvgpr_write(v[215], v[30])) + k.emit(v_accvgpr_write(v[219], v[31])) + k.emit(v_accvgpr_read(v[25], v[231])) + k.emit(v_accvgpr_read(v[26], v[235])) + k.emit(v_accvgpr_read(v[27], v[239])) + k.emit(v_accvgpr_read(v[28], v[243])) + k.emit(v_accvgpr_read(v[29], v[247])) + k.emit(v_accvgpr_read(v[30], v[251])) + k.emit(v_accvgpr_read(v[31], v[255])) + k.emit(s_nop(1)) + k.emit(v_accvgpr_write(v[227], v[25])) + k.emit(v_accvgpr_write(v[231], v[26])) + 
k.emit(v_accvgpr_write(v[235], v[27])) + k.emit(v_accvgpr_write(v[239], v[28])) + k.emit(v_accvgpr_write(v[243], v[29])) + k.emit(v_accvgpr_write(v[247], v[30])) + k.emit(v_accvgpr_write(v[251], v[31])) + k.emit(s_mov_b64(s[8:9], -1)) + k.emit(s_or_saveexec_b64(VCC, s[8:9])) + k.label('ShiftVectorComponents0_GLVW0') + k.emit(v_lshrrev_b32_e32(v[22], 6, v[180])) + k.emit(v_lshrrev_b32_e32(v[23], 1, v[22])) + k.emit(v_mul_lo_u32(v[23], 16, v[23])) + k.emit(v_and_b32_e32(v[19], 63, v[180])) + k.emit(v_lshrrev_b32_e32(v[19], 4, v[19])) + k.emit(v_lshlrev_b32_e32(v[19], 2, v[19])) + k.emit(v_add_lshl_u32_e64(v[19], v[23], v[19], 3)) + k.emit(v_mul_lo_u32(v[20], v[19], s[38])) + k.emit(v_mul_lo_u32(v[21], v[19], s[36])) + k.emit(v_and_b32_e32(v[18], 1, v[22])) + k.emit(v_mul_lo_u32(v[18], 16, v[18])) + k.emit(v_and_b32_e32(v[23], 15, v[180])) + k.emit(v_add_lshl_u32_e64(v[18], v[23], v[18], 3)) + k.emit(s_mul_i32(s[8], 256, s[2])) + k.emit(v_add_u32_e32(v[18], s[8], v[18])) + k.emit(s_mul_i32(s[8], 256, s[3])) + k.emit(v_add_u32_e32(v[19], s[8], v[19])) + k.waitcnt(lgkm=0) + k.emit(s_add_u32(s[8], s[4], 1)) + k.emit(s_mul_i32(s[8], s[73], s[8])) + k.emit(s_cmp_eq_u32(s[8], 0)) + k.emit(s_cselect_b32(s[8], s[20], s[8])) + k.emit(s_mov_b32(s[91], 131072)) + k.emit(s_mov_b32(s[90], 0)) + k.emit(s_mul_i32(s[8], 256, s[2])) + k.emit(v_add_u32_e32(v[26], s[8], v[180])) + k.emit(s_mul_i32(s[90], 4, s[90])) + k.emit(s_mul_i32(s[8], s[73], s[4])) + k.emit(v_add_u32_e32(v[24], s[8], v[26])) + k.emit(v_lshlrev_b32_e32(v[24], 2, v[24])) + k.emit(s_mul_i32(s[8], 256, s[3])) + k.emit(v_add_u32_e32(v[26], s[8], v[180])) + k.emit(buffer_load_dword(v[22], v[24], s[88:91], 0, 0, 1)) + k.emit(v_lshlrev_b32_e32(v[26], 2, v[180])) + k.emit(s_barrier()) + k.waitcnt(vm=0) + k.emit(ds_write_b32(v[0], v[26], v[22])) + k.emit(v_mov_b32_e32(v[23], 1.0)) + k.emit(ds_write_b32(v[0], v[26], v[23], v[0], 0, 0, 4)) + k.emit(s_mul_i32(s[8], 256, s[2])) + k.emit(v_add_u32_e32(v[26], s[8], v[180])) + 
k.emit(s_mul_i32(s[90], 2, s[90])) + k.emit(s_mul_i32(s[8], s[73], s[4])) + k.emit(v_add_u32_e32(v[24], s[8], v[26])) + k.emit(v_lshlrev_b32_e32(v[24], 1, v[24])) + k.emit(s_mul_i32(s[8], 256, s[3])) + k.emit(v_add_u32_e32(v[26], s[8], v[180])) + k.emit(buffer_load_short_d16(v[22], v[24], s[88:91], 0, 0, 1)) + k.emit(v_lshlrev_b32_e32(v[26], 2, v[180])) + k.emit(s_barrier()) + k.waitcnt(vm=0) + k.emit(v_cvt(v[22], SDWA, v[22], 0, 0, 0, 0, 0, 0, 6, 2, 4)) + k.emit(ds_write_b32(v[0], v[26], v[22])) + k.emit(v_mov_b32_e32(v[23], 1.0)) + k.emit(ds_write_b32(v[0], v[26], v[23], v[0], 0, 0, 4)) + k.emit(s_and_b32(s[78], 255, s[20])) + k.emit(s_add_u32(s[79], -1, s[10])) + k.emit(s_cmp_ge_u32(s[2], s[79])) + k.emit(s_cselect_b32(s[78], s[78], 0)) + k.emit(s_cmpk_gt_u32(s[78])) + k.emit(s_cbranch_scc1(), target='GW_B0_E1_M_1') + k.emit(s_and_b32(s[78], 255, s[21])) + k.emit(s_add_u32(s[79], -1, s[11])) + k.emit(s_cmp_ge_u32(s[3], s[79])) + k.emit(s_cselect_b32(s[78], s[78], 0)) + k.emit(s_cmpk_gt_u32(s[78])) + k.emit(s_cbranch_scc0(), target='GW_B0_E0_1') + k.emit(s_cbranch_scc1(), target='GW_B0_E1_N_1') + k.label('GW_B0_E0_1') + k.emit(s_mul_i32(s[68], 256, s[2])) + k.emit(v_sub_u32_e64(v[37], v[18], s[68])) + k.emit(v_lshlrev_b32_e32(v[37], 2, v[37])) + k.waitcnt(lgkm=0) + k.emit(s_barrier()) + k.emit(ds_read_b128(v[88:91], v[37])) + k.emit(ds_read_b128(v[92:95], v[37], v[0], v[0], 0, 16)) + k.emit(ds_read_b128(v[96:99], v[37], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[100:103], v[37], v[0], v[0], 0, 16, 4)) + k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1)) + k.emit(v_accvgpr_read(v[40], v[0])) + k.emit(v_accvgpr_read(v[41], v[4])) + k.emit(v_accvgpr_read(v[42], v[8])) + k.emit(v_accvgpr_read(v[43], v[12])) + k.emit(v_accvgpr_read(v[44], v[16])) + k.emit(v_accvgpr_read(v[45], v[20])) + k.emit(v_accvgpr_read(v[46], v[24])) + k.emit(v_accvgpr_read(v[47], v[28])) + k.emit(v_accvgpr_read(v[48], v[32])) + k.emit(v_accvgpr_read(v[49], v[36])) + 
k.emit(v_accvgpr_read(v[50], v[40])) + k.emit(v_accvgpr_read(v[51], v[44])) + k.emit(v_accvgpr_read(v[52], v[48])) + k.emit(v_accvgpr_read(v[53], v[52])) + k.emit(v_accvgpr_read(v[54], v[56])) + k.emit(v_accvgpr_read(v[55], v[60])) + k.emit(v_accvgpr_read(v[56], v[64])) + k.emit(v_accvgpr_read(v[57], v[68])) + k.emit(v_accvgpr_read(v[58], v[72])) + k.emit(v_accvgpr_read(v[59], v[76])) + k.emit(v_accvgpr_read(v[60], v[80])) + k.emit(v_accvgpr_read(v[61], v[84])) + k.emit(v_accvgpr_read(v[62], v[88])) + k.emit(v_accvgpr_read(v[63], v[92])) + k.emit(v_accvgpr_read(v[64], v[96])) + k.emit(v_accvgpr_read(v[65], v[100])) + k.emit(v_accvgpr_read(v[66], v[104])) + k.emit(v_accvgpr_read(v[67], v[108])) + k.emit(v_accvgpr_read(v[68], v[112])) + k.emit(v_accvgpr_read(v[69], v[116])) + k.emit(v_accvgpr_read(v[70], v[120])) + k.emit(v_accvgpr_read(v[71], v[124])) + k.emit(v_accvgpr_read(v[72], v[128])) + k.emit(v_accvgpr_read(v[73], v[132])) + k.emit(v_accvgpr_read(v[74], v[136])) + k.emit(v_accvgpr_read(v[75], v[140])) + k.emit(v_accvgpr_read(v[76], v[144])) + k.emit(v_accvgpr_read(v[77], v[148])) + k.emit(v_accvgpr_read(v[78], v[152])) + k.emit(v_accvgpr_read(v[79], v[156])) + k.emit(v_accvgpr_read(v[80], v[160])) + k.emit(v_accvgpr_read(v[81], v[164])) + k.emit(v_accvgpr_read(v[82], v[168])) + k.emit(v_accvgpr_read(v[83], v[172])) + k.emit(v_accvgpr_read(v[84], v[176])) + k.emit(v_accvgpr_read(v[85], v[180])) + k.emit(v_accvgpr_read(v[86], v[184])) + k.emit(v_accvgpr_read(v[87], v[188])) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.waitcnt(lgkm=0) + k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41])) + k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43])) + k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45])) + k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43])) + 
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47])) + k.emit(v_mov_b64_e32(v[40:41], v[22:23])) + k.emit(v_mov_b64_e32(v[42:43], v[24:25])) + k.emit(v_mov_b64_e32(v[44:45], v[26:27])) + k.emit(v_mov_b64_e32(v[46:47], v[28:29])) + k.emit(v_cvt_pk(v[40], v[40], v[41])) + k.emit(v_cvt_pk(v[41], v[42], v[43])) + k.emit(v_cvt_pk(v[42], v[44], v[45])) + k.emit(v_cvt_pk(v[43], v[46], v[47])) + k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49])) + k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51])) + k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53])) + k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55])) + k.emit(v_mov_b64_e32(v[48:49], v[22:23])) + k.emit(v_mov_b64_e32(v[50:51], v[24:25])) + k.emit(v_mov_b64_e32(v[52:53], v[26:27])) + k.emit(v_mov_b64_e32(v[54:55], v[28:29])) + k.emit(v_cvt_pk(v[48], v[48], v[49])) + k.emit(v_cvt_pk(v[49], v[50], v[51])) + k.emit(v_cvt_pk(v[50], v[52], v[53])) + k.emit(v_cvt_pk(v[51], v[54], v[55])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[48:51], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57])) + k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59])) + k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61])) + k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63])) + k.emit(v_mov_b64_e32(v[56:57], v[22:23])) + k.emit(v_mov_b64_e32(v[58:59], 
v[24:25])) + k.emit(v_mov_b64_e32(v[60:61], v[26:27])) + k.emit(v_mov_b64_e32(v[62:63], v[28:29])) + k.emit(v_cvt_pk(v[56], v[56], v[57])) + k.emit(v_cvt_pk(v[57], v[58], v[59])) + k.emit(v_cvt_pk(v[58], v[60], v[61])) + k.emit(v_cvt_pk(v[59], v[62], v[63])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[56:59], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65])) + k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67])) + k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69])) + k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71])) + k.emit(v_mov_b64_e32(v[64:65], v[22:23])) + k.emit(v_mov_b64_e32(v[66:67], v[24:25])) + k.emit(v_mov_b64_e32(v[68:69], v[26:27])) + k.emit(v_mov_b64_e32(v[70:71], v[28:29])) + k.emit(v_cvt_pk(v[64], v[64], v[65])) + k.emit(v_cvt_pk(v[65], v[66], v[67])) + k.emit(v_cvt_pk(v[66], v[68], v[69])) + k.emit(v_cvt_pk(v[67], v[70], v[71])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[64:67], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73])) + k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75])) + k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77])) + k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79])) + k.emit(v_mov_b64_e32(v[72:73], v[22:23])) + k.emit(v_mov_b64_e32(v[74:75], v[24:25])) + k.emit(v_mov_b64_e32(v[76:77], v[26:27])) + 
k.emit(v_mov_b64_e32(v[78:79], v[28:29])) + k.emit(v_cvt_pk(v[72], v[72], v[73])) + k.emit(v_cvt_pk(v[73], v[74], v[75])) + k.emit(v_cvt_pk(v[74], v[76], v[77])) + k.emit(v_cvt_pk(v[75], v[78], v[79])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[72:75], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81])) + k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83])) + k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85])) + k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87])) + k.emit(v_mov_b64_e32(v[80:81], v[22:23])) + k.emit(v_mov_b64_e32(v[82:83], v[24:25])) + k.emit(v_mov_b64_e32(v[84:85], v[26:27])) + k.emit(v_mov_b64_e32(v[86:87], v[28:29])) + k.emit(v_cvt_pk(v[80], v[80], v[81])) + k.emit(v_cvt_pk(v[81], v[82], v[83])) + k.emit(v_cvt_pk(v[82], v[84], v[85])) + k.emit(v_cvt_pk(v[83], v[86], v[87])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[80:83], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(ds_read_b128(v[88:91], v[37])) + k.emit(ds_read_b128(v[92:95], v[37], v[0], v[0], 0, 16)) + k.emit(ds_read_b128(v[96:99], v[37], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[100:103], v[37], v[0], v[0], 0, 16, 4)) + k.emit(v_accvgpr_read(v[40], v[192])) + k.emit(v_accvgpr_read(v[41], v[196])) + k.emit(v_accvgpr_read(v[42], v[200])) + k.emit(v_accvgpr_read(v[43], v[204])) + k.emit(v_accvgpr_read(v[44], v[208])) + k.emit(v_accvgpr_read(v[45], v[212])) + k.emit(v_accvgpr_read(v[46], v[216])) + k.emit(v_accvgpr_read(v[47], v[220])) + k.emit(v_accvgpr_read(v[48], v[224])) + 
k.emit(v_accvgpr_read(v[49], v[228])) + k.emit(v_accvgpr_read(v[50], v[232])) + k.emit(v_accvgpr_read(v[51], v[236])) + k.emit(v_accvgpr_read(v[52], v[240])) + k.emit(v_accvgpr_read(v[53], v[244])) + k.emit(v_accvgpr_read(v[54], v[248])) + k.emit(v_accvgpr_read(v[55], v[252])) + k.emit(v_accvgpr_read(v[56], v[1])) + k.emit(v_accvgpr_read(v[57], v[5])) + k.emit(v_accvgpr_read(v[58], v[9])) + k.emit(v_accvgpr_read(v[59], v[13])) + k.emit(v_accvgpr_read(v[60], v[17])) + k.emit(v_accvgpr_read(v[61], v[21])) + k.emit(v_accvgpr_read(v[62], v[25])) + k.emit(v_accvgpr_read(v[63], v[29])) + k.emit(v_accvgpr_read(v[64], v[33])) + k.emit(v_accvgpr_read(v[65], v[37])) + k.emit(v_accvgpr_read(v[66], v[41])) + k.emit(v_accvgpr_read(v[67], v[45])) + k.emit(v_accvgpr_read(v[68], v[49])) + k.emit(v_accvgpr_read(v[69], v[53])) + k.emit(v_accvgpr_read(v[70], v[57])) + k.emit(v_accvgpr_read(v[71], v[61])) + k.emit(v_accvgpr_read(v[72], v[65])) + k.emit(v_accvgpr_read(v[73], v[69])) + k.emit(v_accvgpr_read(v[74], v[73])) + k.emit(v_accvgpr_read(v[75], v[77])) + k.emit(v_accvgpr_read(v[76], v[81])) + k.emit(v_accvgpr_read(v[77], v[85])) + k.emit(v_accvgpr_read(v[78], v[89])) + k.emit(v_accvgpr_read(v[79], v[93])) + k.emit(v_accvgpr_read(v[80], v[97])) + k.emit(v_accvgpr_read(v[81], v[101])) + k.emit(v_accvgpr_read(v[82], v[105])) + k.emit(v_accvgpr_read(v[83], v[109])) + k.emit(v_accvgpr_read(v[84], v[113])) + k.emit(v_accvgpr_read(v[85], v[117])) + k.emit(v_accvgpr_read(v[86], v[121])) + k.emit(v_accvgpr_read(v[87], v[125])) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.waitcnt(lgkm=0) + k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41])) + k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43])) + k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45])) + k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43])) + 
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47])) + k.emit(v_mov_b64_e32(v[40:41], v[22:23])) + k.emit(v_mov_b64_e32(v[42:43], v[24:25])) + k.emit(v_mov_b64_e32(v[44:45], v[26:27])) + k.emit(v_mov_b64_e32(v[46:47], v[28:29])) + k.emit(v_cvt_pk(v[40], v[40], v[41])) + k.emit(v_cvt_pk(v[41], v[42], v[43])) + k.emit(v_cvt_pk(v[42], v[44], v[45])) + k.emit(v_cvt_pk(v[43], v[46], v[47])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49])) + k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51])) + k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53])) + k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55])) + k.emit(v_mov_b64_e32(v[48:49], v[22:23])) + k.emit(v_mov_b64_e32(v[50:51], v[24:25])) + k.emit(v_mov_b64_e32(v[52:53], v[26:27])) + k.emit(v_mov_b64_e32(v[54:55], v[28:29])) + k.emit(v_cvt_pk(v[48], v[48], v[49])) + k.emit(v_cvt_pk(v[49], v[50], v[51])) + k.emit(v_cvt_pk(v[50], v[52], v[53])) + k.emit(v_cvt_pk(v[51], v[54], v[55])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[48:51], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57])) + k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59])) + k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61])) + k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61])) + 
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63])) + k.emit(v_mov_b64_e32(v[56:57], v[22:23])) + k.emit(v_mov_b64_e32(v[58:59], v[24:25])) + k.emit(v_mov_b64_e32(v[60:61], v[26:27])) + k.emit(v_mov_b64_e32(v[62:63], v[28:29])) + k.emit(v_cvt_pk(v[56], v[56], v[57])) + k.emit(v_cvt_pk(v[57], v[58], v[59])) + k.emit(v_cvt_pk(v[58], v[60], v[61])) + k.emit(v_cvt_pk(v[59], v[62], v[63])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[56:59], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65])) + k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67])) + k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69])) + k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71])) + k.emit(v_mov_b64_e32(v[64:65], v[22:23])) + k.emit(v_mov_b64_e32(v[66:67], v[24:25])) + k.emit(v_mov_b64_e32(v[68:69], v[26:27])) + k.emit(v_mov_b64_e32(v[70:71], v[28:29])) + k.emit(v_cvt_pk(v[64], v[64], v[65])) + k.emit(v_cvt_pk(v[65], v[66], v[67])) + k.emit(v_cvt_pk(v[66], v[68], v[69])) + k.emit(v_cvt_pk(v[67], v[70], v[71])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[64:67], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73])) + k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75])) + k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77])) + k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79])) + 
k.emit(v_mov_b64_e32(v[72:73], v[22:23])) + k.emit(v_mov_b64_e32(v[74:75], v[24:25])) + k.emit(v_mov_b64_e32(v[76:77], v[26:27])) + k.emit(v_mov_b64_e32(v[78:79], v[28:29])) + k.emit(v_cvt_pk(v[72], v[72], v[73])) + k.emit(v_cvt_pk(v[73], v[74], v[75])) + k.emit(v_cvt_pk(v[74], v[76], v[77])) + k.emit(v_cvt_pk(v[75], v[78], v[79])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[72:75], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81])) + k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83])) + k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85])) + k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87])) + k.emit(v_mov_b64_e32(v[80:81], v[22:23])) + k.emit(v_mov_b64_e32(v[82:83], v[24:25])) + k.emit(v_mov_b64_e32(v[84:85], v[26:27])) + k.emit(v_mov_b64_e32(v[86:87], v[28:29])) + k.emit(v_cvt_pk(v[80], v[80], v[81])) + k.emit(v_cvt_pk(v[81], v[82], v[83])) + k.emit(v_cvt_pk(v[82], v[84], v[85])) + k.emit(v_cvt_pk(v[83], v[86], v[87])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[80:83], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(ds_read_b128(v[88:91], v[37])) + k.emit(ds_read_b128(v[92:95], v[37], v[0], v[0], 0, 16)) + k.emit(ds_read_b128(v[96:99], v[37], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[100:103], v[37], v[0], v[0], 0, 16, 4)) + k.emit(v_accvgpr_read(v[40], v[129])) + k.emit(v_accvgpr_read(v[41], v[133])) + k.emit(v_accvgpr_read(v[42], v[137])) + k.emit(v_accvgpr_read(v[43], v[141])) + k.emit(v_accvgpr_read(v[44], v[145])) + k.emit(v_accvgpr_read(v[45], v[149])) + 
k.emit(v_accvgpr_read(v[46], v[153])) + k.emit(v_accvgpr_read(v[47], v[157])) + k.emit(v_accvgpr_read(v[48], v[161])) + k.emit(v_accvgpr_read(v[49], v[165])) + k.emit(v_accvgpr_read(v[50], v[169])) + k.emit(v_accvgpr_read(v[51], v[173])) + k.emit(v_accvgpr_read(v[52], v[177])) + k.emit(v_accvgpr_read(v[53], v[181])) + k.emit(v_accvgpr_read(v[54], v[185])) + k.emit(v_accvgpr_read(v[55], v[189])) + k.emit(v_accvgpr_read(v[56], v[193])) + k.emit(v_accvgpr_read(v[57], v[197])) + k.emit(v_accvgpr_read(v[58], v[201])) + k.emit(v_accvgpr_read(v[59], v[205])) + k.emit(v_accvgpr_read(v[60], v[209])) + k.emit(v_accvgpr_read(v[61], v[213])) + k.emit(v_accvgpr_read(v[62], v[217])) + k.emit(v_accvgpr_read(v[63], v[221])) + k.emit(v_accvgpr_read(v[64], v[225])) + k.emit(v_accvgpr_read(v[65], v[229])) + k.emit(v_accvgpr_read(v[66], v[233])) + k.emit(v_accvgpr_read(v[67], v[237])) + k.emit(v_accvgpr_read(v[68], v[241])) + k.emit(v_accvgpr_read(v[69], v[245])) + k.emit(v_accvgpr_read(v[70], v[249])) + k.emit(v_accvgpr_read(v[71], v[253])) + k.emit(v_accvgpr_read(v[72], v[2])) + k.emit(v_accvgpr_read(v[73], v[6])) + k.emit(v_accvgpr_read(v[74], v[10])) + k.emit(v_accvgpr_read(v[75], v[14])) + k.emit(v_accvgpr_read(v[76], v[18])) + k.emit(v_accvgpr_read(v[77], v[22])) + k.emit(v_accvgpr_read(v[78], v[26])) + k.emit(v_accvgpr_read(v[79], v[30])) + k.emit(v_accvgpr_read(v[80], v[34])) + k.emit(v_accvgpr_read(v[81], v[38])) + k.emit(v_accvgpr_read(v[82], v[42])) + k.emit(v_accvgpr_read(v[83], v[46])) + k.emit(v_accvgpr_read(v[84], v[50])) + k.emit(v_accvgpr_read(v[85], v[54])) + k.emit(v_accvgpr_read(v[86], v[58])) + k.emit(v_accvgpr_read(v[87], v[62])) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.waitcnt(lgkm=0) + k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41])) + k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43])) + k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45])) + k.emit(v_pk_mul_f32(v[46:47], 
v[102:103], v[46:47])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47])) + k.emit(v_mov_b64_e32(v[40:41], v[22:23])) + k.emit(v_mov_b64_e32(v[42:43], v[24:25])) + k.emit(v_mov_b64_e32(v[44:45], v[26:27])) + k.emit(v_mov_b64_e32(v[46:47], v[28:29])) + k.emit(v_cvt_pk(v[40], v[40], v[41])) + k.emit(v_cvt_pk(v[41], v[42], v[43])) + k.emit(v_cvt_pk(v[42], v[44], v[45])) + k.emit(v_cvt_pk(v[43], v[46], v[47])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49])) + k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51])) + k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53])) + k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55])) + k.emit(v_mov_b64_e32(v[48:49], v[22:23])) + k.emit(v_mov_b64_e32(v[50:51], v[24:25])) + k.emit(v_mov_b64_e32(v[52:53], v[26:27])) + k.emit(v_mov_b64_e32(v[54:55], v[28:29])) + k.emit(v_cvt_pk(v[48], v[48], v[49])) + k.emit(v_cvt_pk(v[49], v[50], v[51])) + k.emit(v_cvt_pk(v[50], v[52], v[53])) + k.emit(v_cvt_pk(v[51], v[54], v[55])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[48:51], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57])) + k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59])) + k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61])) + k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], 
v[56:57])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63])) + k.emit(v_mov_b64_e32(v[56:57], v[22:23])) + k.emit(v_mov_b64_e32(v[58:59], v[24:25])) + k.emit(v_mov_b64_e32(v[60:61], v[26:27])) + k.emit(v_mov_b64_e32(v[62:63], v[28:29])) + k.emit(v_cvt_pk(v[56], v[56], v[57])) + k.emit(v_cvt_pk(v[57], v[58], v[59])) + k.emit(v_cvt_pk(v[58], v[60], v[61])) + k.emit(v_cvt_pk(v[59], v[62], v[63])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[56:59], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65])) + k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67])) + k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69])) + k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71])) + k.emit(v_mov_b64_e32(v[64:65], v[22:23])) + k.emit(v_mov_b64_e32(v[66:67], v[24:25])) + k.emit(v_mov_b64_e32(v[68:69], v[26:27])) + k.emit(v_mov_b64_e32(v[70:71], v[28:29])) + k.emit(v_cvt_pk(v[64], v[64], v[65])) + k.emit(v_cvt_pk(v[65], v[66], v[67])) + k.emit(v_cvt_pk(v[66], v[68], v[69])) + k.emit(v_cvt_pk(v[67], v[70], v[71])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[64:67], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73])) + k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75])) + k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77])) + k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75])) + 
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79])) + k.emit(v_mov_b64_e32(v[72:73], v[22:23])) + k.emit(v_mov_b64_e32(v[74:75], v[24:25])) + k.emit(v_mov_b64_e32(v[76:77], v[26:27])) + k.emit(v_mov_b64_e32(v[78:79], v[28:29])) + k.emit(v_cvt_pk(v[72], v[72], v[73])) + k.emit(v_cvt_pk(v[73], v[74], v[75])) + k.emit(v_cvt_pk(v[74], v[76], v[77])) + k.emit(v_cvt_pk(v[75], v[78], v[79])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[72:75], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81])) + k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83])) + k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85])) + k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87])) + k.emit(v_mov_b64_e32(v[80:81], v[22:23])) + k.emit(v_mov_b64_e32(v[82:83], v[24:25])) + k.emit(v_mov_b64_e32(v[84:85], v[26:27])) + k.emit(v_mov_b64_e32(v[86:87], v[28:29])) + k.emit(v_cvt_pk(v[80], v[80], v[81])) + k.emit(v_cvt_pk(v[81], v[82], v[83])) + k.emit(v_cvt_pk(v[82], v[84], v[85])) + k.emit(v_cvt_pk(v[83], v[86], v[87])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[80:83], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(ds_read_b128(v[88:91], v[37])) + k.emit(ds_read_b128(v[92:95], v[37], v[0], v[0], 0, 16)) + k.emit(ds_read_b128(v[96:99], v[37], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[100:103], v[37], v[0], v[0], 0, 16, 4)) + k.emit(v_accvgpr_read(v[40], v[66])) + k.emit(v_accvgpr_read(v[41], v[70])) + k.emit(v_accvgpr_read(v[42], v[74])) + k.emit(v_accvgpr_read(v[43], 
v[78])) + k.emit(v_accvgpr_read(v[44], v[82])) + k.emit(v_accvgpr_read(v[45], v[86])) + k.emit(v_accvgpr_read(v[46], v[90])) + k.emit(v_accvgpr_read(v[47], v[94])) + k.emit(v_accvgpr_read(v[48], v[98])) + k.emit(v_accvgpr_read(v[49], v[102])) + k.emit(v_accvgpr_read(v[50], v[106])) + k.emit(v_accvgpr_read(v[51], v[110])) + k.emit(v_accvgpr_read(v[52], v[114])) + k.emit(v_accvgpr_read(v[53], v[118])) + k.emit(v_accvgpr_read(v[54], v[122])) + k.emit(v_accvgpr_read(v[55], v[126])) + k.emit(v_accvgpr_read(v[56], v[130])) + k.emit(v_accvgpr_read(v[57], v[134])) + k.emit(v_accvgpr_read(v[58], v[138])) + k.emit(v_accvgpr_read(v[59], v[142])) + k.emit(v_accvgpr_read(v[60], v[146])) + k.emit(v_accvgpr_read(v[61], v[150])) + k.emit(v_accvgpr_read(v[62], v[154])) + k.emit(v_accvgpr_read(v[63], v[158])) + k.emit(v_accvgpr_read(v[64], v[162])) + k.emit(v_accvgpr_read(v[65], v[166])) + k.emit(v_accvgpr_read(v[66], v[170])) + k.emit(v_accvgpr_read(v[67], v[174])) + k.emit(v_accvgpr_read(v[68], v[178])) + k.emit(v_accvgpr_read(v[69], v[182])) + k.emit(v_accvgpr_read(v[70], v[186])) + k.emit(v_accvgpr_read(v[71], v[190])) + k.emit(v_accvgpr_read(v[72], v[194])) + k.emit(v_accvgpr_read(v[73], v[198])) + k.emit(v_accvgpr_read(v[74], v[202])) + k.emit(v_accvgpr_read(v[75], v[206])) + k.emit(v_accvgpr_read(v[76], v[210])) + k.emit(v_accvgpr_read(v[77], v[214])) + k.emit(v_accvgpr_read(v[78], v[218])) + k.emit(v_accvgpr_read(v[79], v[222])) + k.emit(v_accvgpr_read(v[80], v[226])) + k.emit(v_accvgpr_read(v[81], v[230])) + k.emit(v_accvgpr_read(v[82], v[234])) + k.emit(v_accvgpr_read(v[83], v[238])) + k.emit(v_accvgpr_read(v[84], v[242])) + k.emit(v_accvgpr_read(v[85], v[246])) + k.emit(v_accvgpr_read(v[86], v[250])) + k.emit(v_accvgpr_read(v[87], v[254])) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.waitcnt(lgkm=0) + k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41])) + k.emit(v_pk_mul_f32(v[42:43], 
v[98:99], v[42:43])) + k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45])) + k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47])) + k.emit(v_mov_b64_e32(v[40:41], v[22:23])) + k.emit(v_mov_b64_e32(v[42:43], v[24:25])) + k.emit(v_mov_b64_e32(v[44:45], v[26:27])) + k.emit(v_mov_b64_e32(v[46:47], v[28:29])) + k.emit(v_cvt_pk(v[40], v[40], v[41])) + k.emit(v_cvt_pk(v[41], v[42], v[43])) + k.emit(v_cvt_pk(v[42], v[44], v[45])) + k.emit(v_cvt_pk(v[43], v[46], v[47])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49])) + k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51])) + k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53])) + k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55])) + k.emit(v_mov_b64_e32(v[48:49], v[22:23])) + k.emit(v_mov_b64_e32(v[50:51], v[24:25])) + k.emit(v_mov_b64_e32(v[52:53], v[26:27])) + k.emit(v_mov_b64_e32(v[54:55], v[28:29])) + k.emit(v_cvt_pk(v[48], v[48], v[49])) + k.emit(v_cvt_pk(v[49], v[50], v[51])) + k.emit(v_cvt_pk(v[50], v[52], v[53])) + k.emit(v_cvt_pk(v[51], v[54], v[55])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[48:51], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57])) + k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59])) + k.emit(v_pk_mul_f32(v[60:61], v[100:101], 
v[60:61])) + k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63])) + k.emit(v_mov_b64_e32(v[56:57], v[22:23])) + k.emit(v_mov_b64_e32(v[58:59], v[24:25])) + k.emit(v_mov_b64_e32(v[60:61], v[26:27])) + k.emit(v_mov_b64_e32(v[62:63], v[28:29])) + k.emit(v_cvt_pk(v[56], v[56], v[57])) + k.emit(v_cvt_pk(v[57], v[58], v[59])) + k.emit(v_cvt_pk(v[58], v[60], v[61])) + k.emit(v_cvt_pk(v[59], v[62], v[63])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[56:59], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65])) + k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67])) + k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69])) + k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71])) + k.emit(v_mov_b64_e32(v[64:65], v[22:23])) + k.emit(v_mov_b64_e32(v[66:67], v[24:25])) + k.emit(v_mov_b64_e32(v[68:69], v[26:27])) + k.emit(v_mov_b64_e32(v[70:71], v[28:29])) + k.emit(v_cvt_pk(v[64], v[64], v[65])) + k.emit(v_cvt_pk(v[65], v[66], v[67])) + k.emit(v_cvt_pk(v[66], v[68], v[69])) + k.emit(v_cvt_pk(v[67], v[70], v[71])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[64:67], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73])) + k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75])) + k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77])) + k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79])) + 
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79])) + k.emit(v_mov_b64_e32(v[72:73], v[22:23])) + k.emit(v_mov_b64_e32(v[74:75], v[24:25])) + k.emit(v_mov_b64_e32(v[76:77], v[26:27])) + k.emit(v_mov_b64_e32(v[78:79], v[28:29])) + k.emit(v_cvt_pk(v[72], v[72], v[73])) + k.emit(v_cvt_pk(v[73], v[74], v[75])) + k.emit(v_cvt_pk(v[74], v[76], v[77])) + k.emit(v_cvt_pk(v[75], v[78], v[79])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[72:75], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81])) + k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83])) + k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85])) + k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87])) + k.emit(v_mov_b64_e32(v[80:81], v[22:23])) + k.emit(v_mov_b64_e32(v[82:83], v[24:25])) + k.emit(v_mov_b64_e32(v[84:85], v[26:27])) + k.emit(v_mov_b64_e32(v[86:87], v[28:29])) + k.emit(v_cvt_pk(v[80], v[80], v[81])) + k.emit(v_cvt_pk(v[81], v[82], v[83])) + k.emit(v_cvt_pk(v[82], v[84], v[85])) + k.emit(v_cvt_pk(v[83], v[86], v[87])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[80:83], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(ds_read_b128(v[88:91], v[37])) + k.emit(ds_read_b128(v[92:95], v[37], v[0], v[0], 0, 16)) + k.emit(ds_read_b128(v[96:99], v[37], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[100:103], v[37], v[0], v[0], 0, 16, 4)) + k.emit(v_accvgpr_read(v[40], v[3])) + 
k.emit(v_accvgpr_read(v[41], v[7])) + k.emit(v_accvgpr_read(v[42], v[11])) + k.emit(v_accvgpr_read(v[43], v[15])) + k.emit(v_accvgpr_read(v[44], v[19])) + k.emit(v_accvgpr_read(v[45], v[23])) + k.emit(v_accvgpr_read(v[46], v[27])) + k.emit(v_accvgpr_read(v[47], v[31])) + k.emit(v_accvgpr_read(v[48], v[35])) + k.emit(v_accvgpr_read(v[49], v[39])) + k.emit(v_accvgpr_read(v[50], v[43])) + k.emit(v_accvgpr_read(v[51], v[47])) + k.emit(v_accvgpr_read(v[52], v[51])) + k.emit(v_accvgpr_read(v[53], v[55])) + k.emit(v_accvgpr_read(v[54], v[59])) + k.emit(v_accvgpr_read(v[55], v[63])) + k.emit(v_accvgpr_read(v[56], v[67])) + k.emit(v_accvgpr_read(v[57], v[71])) + k.emit(v_accvgpr_read(v[58], v[75])) + k.emit(v_accvgpr_read(v[59], v[79])) + k.emit(v_accvgpr_read(v[60], v[83])) + k.emit(v_accvgpr_read(v[61], v[87])) + k.emit(v_accvgpr_read(v[62], v[91])) + k.emit(v_accvgpr_read(v[63], v[95])) + k.emit(v_accvgpr_read(v[64], v[99])) + k.emit(v_accvgpr_read(v[65], v[103])) + k.emit(v_accvgpr_read(v[66], v[107])) + k.emit(v_accvgpr_read(v[67], v[111])) + k.emit(v_accvgpr_read(v[68], v[115])) + k.emit(v_accvgpr_read(v[69], v[119])) + k.emit(v_accvgpr_read(v[70], v[123])) + k.emit(v_accvgpr_read(v[71], v[127])) + k.emit(v_accvgpr_read(v[72], v[131])) + k.emit(v_accvgpr_read(v[73], v[135])) + k.emit(v_accvgpr_read(v[74], v[139])) + k.emit(v_accvgpr_read(v[75], v[143])) + k.emit(v_accvgpr_read(v[76], v[147])) + k.emit(v_accvgpr_read(v[77], v[151])) + k.emit(v_accvgpr_read(v[78], v[155])) + k.emit(v_accvgpr_read(v[79], v[159])) + k.emit(v_accvgpr_read(v[80], v[163])) + k.emit(v_accvgpr_read(v[81], v[167])) + k.emit(v_accvgpr_read(v[82], v[171])) + k.emit(v_accvgpr_read(v[83], v[175])) + k.emit(v_accvgpr_read(v[84], v[179])) + k.emit(v_accvgpr_read(v[85], v[183])) + k.emit(v_accvgpr_read(v[86], v[187])) + k.emit(v_accvgpr_read(v[87], v[191])) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.waitcnt(lgkm=0) + 
k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41])) + k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43])) + k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45])) + k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47])) + k.emit(v_mov_b64_e32(v[40:41], v[22:23])) + k.emit(v_mov_b64_e32(v[42:43], v[24:25])) + k.emit(v_mov_b64_e32(v[44:45], v[26:27])) + k.emit(v_mov_b64_e32(v[46:47], v[28:29])) + k.emit(v_cvt_pk(v[40], v[40], v[41])) + k.emit(v_cvt_pk(v[41], v[42], v[43])) + k.emit(v_cvt_pk(v[42], v[44], v[45])) + k.emit(v_cvt_pk(v[43], v[46], v[47])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49])) + k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51])) + k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53])) + k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55])) + k.emit(v_mov_b64_e32(v[48:49], v[22:23])) + k.emit(v_mov_b64_e32(v[50:51], v[24:25])) + k.emit(v_mov_b64_e32(v[52:53], v[26:27])) + k.emit(v_mov_b64_e32(v[54:55], v[28:29])) + k.emit(v_cvt_pk(v[48], v[48], v[49])) + k.emit(v_cvt_pk(v[49], v[50], v[51])) + k.emit(v_cvt_pk(v[50], v[52], v[53])) + k.emit(v_cvt_pk(v[51], v[54], v[55])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[48:51], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57])) + 
k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59])) + k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61])) + k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63])) + k.emit(v_mov_b64_e32(v[56:57], v[22:23])) + k.emit(v_mov_b64_e32(v[58:59], v[24:25])) + k.emit(v_mov_b64_e32(v[60:61], v[26:27])) + k.emit(v_mov_b64_e32(v[62:63], v[28:29])) + k.emit(v_cvt_pk(v[56], v[56], v[57])) + k.emit(v_cvt_pk(v[57], v[58], v[59])) + k.emit(v_cvt_pk(v[58], v[60], v[61])) + k.emit(v_cvt_pk(v[59], v[62], v[63])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[56:59], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65])) + k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67])) + k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69])) + k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71])) + k.emit(v_mov_b64_e32(v[64:65], v[22:23])) + k.emit(v_mov_b64_e32(v[66:67], v[24:25])) + k.emit(v_mov_b64_e32(v[68:69], v[26:27])) + k.emit(v_mov_b64_e32(v[70:71], v[28:29])) + k.emit(v_cvt_pk(v[64], v[64], v[65])) + k.emit(v_cvt_pk(v[65], v[66], v[67])) + k.emit(v_cvt_pk(v[66], v[68], v[69])) + k.emit(v_cvt_pk(v[67], v[70], v[71])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[64:67], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73])) + k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75])) + 
k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77])) + k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79])) + k.emit(v_mov_b64_e32(v[72:73], v[22:23])) + k.emit(v_mov_b64_e32(v[74:75], v[24:25])) + k.emit(v_mov_b64_e32(v[76:77], v[26:27])) + k.emit(v_mov_b64_e32(v[78:79], v[28:29])) + k.emit(v_cvt_pk(v[72], v[72], v[73])) + k.emit(v_cvt_pk(v[73], v[74], v[75])) + k.emit(v_cvt_pk(v[74], v[76], v[77])) + k.emit(v_cvt_pk(v[75], v[78], v[79])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[72:75], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81])) + k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83])) + k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85])) + k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87])) + k.emit(v_mov_b64_e32(v[80:81], v[22:23])) + k.emit(v_mov_b64_e32(v[82:83], v[24:25])) + k.emit(v_mov_b64_e32(v[84:85], v[26:27])) + k.emit(v_mov_b64_e32(v[86:87], v[28:29])) + k.emit(v_cvt_pk(v[80], v[80], v[81])) + k.emit(v_cvt_pk(v[81], v[82], v[83])) + k.emit(v_cvt_pk(v[82], v[84], v[85])) + k.emit(v_cvt_pk(v[83], v[86], v[87])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[80:83], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(ds_read_b128(v[56:59], v[37])) + k.emit(ds_read_b128(v[60:63], v[37], v[0], v[0], 0, 16)) + k.emit(ds_read_b128(v[64:67], v[37], v[0], v[0], 0, 0, 
4)) + k.emit(ds_read_b128(v[68:71], v[37], v[0], v[0], 0, 16, 4)) + k.emit(v_accvgpr_read(v[40], v[195])) + k.emit(v_accvgpr_read(v[41], v[199])) + k.emit(v_accvgpr_read(v[42], v[203])) + k.emit(v_accvgpr_read(v[43], v[207])) + k.emit(v_accvgpr_read(v[44], v[211])) + k.emit(v_accvgpr_read(v[45], v[215])) + k.emit(v_accvgpr_read(v[46], v[219])) + k.emit(v_accvgpr_read(v[47], v[223])) + k.emit(v_accvgpr_read(v[48], v[227])) + k.emit(v_accvgpr_read(v[49], v[231])) + k.emit(v_accvgpr_read(v[50], v[235])) + k.emit(v_accvgpr_read(v[51], v[239])) + k.emit(v_accvgpr_read(v[52], v[243])) + k.emit(v_accvgpr_read(v[53], v[247])) + k.emit(v_accvgpr_read(v[54], v[251])) + k.emit(v_accvgpr_read(v[55], v[255])) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.waitcnt(lgkm=0) + k.emit(v_pk_mul_f32(v[40:41], v[64:65], v[40:41])) + k.emit(v_pk_mul_f32(v[42:43], v[66:67], v[42:43])) + k.emit(v_pk_mul_f32(v[44:45], v[68:69], v[44:45])) + k.emit(v_pk_mul_f32(v[46:47], v[70:71], v[46:47])) + k.emit(v_pk_add_f32(v[22:23], v[56:57], v[40:41])) + k.emit(v_pk_add_f32(v[24:25], v[58:59], v[42:43])) + k.emit(v_pk_add_f32(v[26:27], v[60:61], v[44:45])) + k.emit(v_pk_add_f32(v[28:29], v[62:63], v[46:47])) + k.emit(v_mov_b64_e32(v[40:41], v[22:23])) + k.emit(v_mov_b64_e32(v[42:43], v[24:25])) + k.emit(v_mov_b64_e32(v[44:45], v[26:27])) + k.emit(v_mov_b64_e32(v[46:47], v[28:29])) + k.emit(v_cvt_pk(v[40], v[40], v[41])) + k.emit(v_cvt_pk(v[41], v[42], v[43])) + k.emit(v_cvt_pk(v[42], v[44], v[45])) + k.emit(v_cvt_pk(v[43], v[46], v[47])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[48:49], v[64:65], v[48:49])) + k.emit(v_pk_mul_f32(v[50:51], v[66:67], v[50:51])) + k.emit(v_pk_mul_f32(v[52:53], v[68:69], v[52:53])) + 
k.emit(v_pk_mul_f32(v[54:55], v[70:71], v[54:55])) + k.emit(v_pk_add_f32(v[22:23], v[56:57], v[48:49])) + k.emit(v_pk_add_f32(v[24:25], v[58:59], v[50:51])) + k.emit(v_pk_add_f32(v[26:27], v[60:61], v[52:53])) + k.emit(v_pk_add_f32(v[28:29], v[62:63], v[54:55])) + k.emit(v_mov_b64_e32(v[48:49], v[22:23])) + k.emit(v_mov_b64_e32(v[50:51], v[24:25])) + k.emit(v_mov_b64_e32(v[52:53], v[26:27])) + k.emit(v_mov_b64_e32(v[54:55], v[28:29])) + k.emit(v_cvt_pk(v[48], v[48], v[49])) + k.emit(v_cvt_pk(v[49], v[50], v[51])) + k.emit(v_cvt_pk(v[50], v[52], v[53])) + k.emit(v_cvt_pk(v[51], v[54], v[55])) + k.emit(s_lshl_b32(s[68], s[36], 1)) + k.emit(s_add_u32(s[12], s[12], s[68])) + k.emit(s_addc_u32(s[13], s[13], 0)) + k.emit(buffer_store_dwordx4(v[48:51], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(s_branch(), target='GW_End_1') + k.label('GW_B0_E1_N_1') + k.emit(v_mov_b32_e32(v[30], 2147483648)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[36], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[36], 2, v[36])) + k.waitcnt(lgkm=0) + k.emit(s_barrier()) + k.emit(ds_read_b128(v[88:91], v[36])) + k.emit(ds_read_b128(v[92:95], v[36], v[0], v[0], 0, 16)) + k.emit(ds_read_b128(v[96:99], v[36], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[100:103], v[36], v[0], v[0], 0, 16, 4)) + k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[35], v[30], v[35], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[38], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[38], 2, 
v[38])) + k.emit(v_add_lshl_u32_e64(v[37], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[37], v[30], v[37], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[104], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[104], 2, v[104])) + k.emit(v_add_lshl_u32_e64(v[39], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[39], v[30], v[39], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[106], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[106], 2, v[106])) + k.emit(v_add_lshl_u32_e64(v[105], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[108], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[108], 2, v[108])) + k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], 
s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[110], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[110], 2, v[110])) + k.emit(v_add_lshl_u32_e64(v[109], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83])) + k.emit(v_accvgpr_read(v[40], v[0])) + k.emit(v_accvgpr_read(v[41], v[4])) + k.emit(v_accvgpr_read(v[42], v[8])) + k.emit(v_accvgpr_read(v[43], v[12])) + k.emit(v_accvgpr_read(v[44], v[16])) + k.emit(v_accvgpr_read(v[45], v[20])) + k.emit(v_accvgpr_read(v[46], v[24])) + k.emit(v_accvgpr_read(v[47], v[28])) + k.emit(v_accvgpr_read(v[48], v[32])) + k.emit(v_accvgpr_read(v[49], v[36])) + k.emit(v_accvgpr_read(v[50], v[40])) + k.emit(v_accvgpr_read(v[51], v[44])) + k.emit(v_accvgpr_read(v[52], v[48])) + k.emit(v_accvgpr_read(v[53], v[52])) + k.emit(v_accvgpr_read(v[54], v[56])) + k.emit(v_accvgpr_read(v[55], v[60])) + k.emit(v_accvgpr_read(v[56], v[64])) + k.emit(v_accvgpr_read(v[57], v[68])) + k.emit(v_accvgpr_read(v[58], v[72])) + k.emit(v_accvgpr_read(v[59], v[76])) + k.emit(v_accvgpr_read(v[60], v[80])) + k.emit(v_accvgpr_read(v[61], v[84])) + k.emit(v_accvgpr_read(v[62], v[88])) + k.emit(v_accvgpr_read(v[63], v[92])) + k.emit(v_accvgpr_read(v[64], v[96])) + k.emit(v_accvgpr_read(v[65], v[100])) + k.emit(v_accvgpr_read(v[66], v[104])) + k.emit(v_accvgpr_read(v[67], v[108])) + k.emit(v_accvgpr_read(v[68], v[112])) + k.emit(v_accvgpr_read(v[69], v[116])) + k.emit(v_accvgpr_read(v[70], v[120])) + k.emit(v_accvgpr_read(v[71], v[124])) + k.emit(v_accvgpr_read(v[72], v[128])) + k.emit(v_accvgpr_read(v[73], v[132])) + k.emit(v_accvgpr_read(v[74], v[136])) + k.emit(v_accvgpr_read(v[75], v[140])) + k.emit(v_accvgpr_read(v[76], v[144])) + k.emit(v_accvgpr_read(v[77], v[148])) + k.emit(v_accvgpr_read(v[78], v[152])) + k.emit(v_accvgpr_read(v[79], v[156])) + k.emit(v_accvgpr_read(v[80], v[160])) + k.emit(v_accvgpr_read(v[81], v[164])) + k.emit(v_accvgpr_read(v[82], v[168])) + k.emit(v_accvgpr_read(v[83], v[172])) + 
k.emit(v_accvgpr_read(v[84], v[176])) + k.emit(v_accvgpr_read(v[85], v[180])) + k.emit(v_accvgpr_read(v[86], v[184])) + k.emit(v_accvgpr_read(v[87], v[188])) + k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41])) + k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43])) + k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45])) + k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47])) + k.emit(v_mov_b64_e32(v[40:41], v[22:23])) + k.emit(v_mov_b64_e32(v[42:43], v[24:25])) + k.emit(v_mov_b64_e32(v[44:45], v[26:27])) + k.emit(v_mov_b64_e32(v[46:47], v[28:29])) + k.emit(v_cvt_pk(v[40], v[40], v[41])) + k.emit(v_cvt_pk(v[41], v[42], v[43])) + k.emit(v_cvt_pk(v[42], v[44], v[45])) + k.emit(v_cvt_pk(v[43], v[46], v[47])) + k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49])) + k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51])) + k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53])) + k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55])) + k.emit(v_mov_b64_e32(v[48:49], v[22:23])) + k.emit(v_mov_b64_e32(v[50:51], v[24:25])) + k.emit(v_mov_b64_e32(v[52:53], v[26:27])) + k.emit(v_mov_b64_e32(v[54:55], v[28:29])) + k.emit(v_cvt_pk(v[48], v[48], v[49])) + k.emit(v_cvt_pk(v[49], v[50], v[51])) + k.emit(v_cvt_pk(v[50], v[52], v[53])) + k.emit(v_cvt_pk(v[51], v[54], v[55])) + k.emit(buffer_store_dwordx4(v[48:51], v[37], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) 
+ k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57])) + k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59])) + k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61])) + k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63])) + k.emit(v_mov_b64_e32(v[56:57], v[22:23])) + k.emit(v_mov_b64_e32(v[58:59], v[24:25])) + k.emit(v_mov_b64_e32(v[60:61], v[26:27])) + k.emit(v_mov_b64_e32(v[62:63], v[28:29])) + k.emit(v_cvt_pk(v[56], v[56], v[57])) + k.emit(v_cvt_pk(v[57], v[58], v[59])) + k.emit(v_cvt_pk(v[58], v[60], v[61])) + k.emit(v_cvt_pk(v[59], v[62], v[63])) + k.emit(buffer_store_dwordx4(v[56:59], v[39], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65])) + k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67])) + k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69])) + k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71])) + k.emit(v_mov_b64_e32(v[64:65], v[22:23])) + k.emit(v_mov_b64_e32(v[66:67], v[24:25])) + k.emit(v_mov_b64_e32(v[68:69], v[26:27])) + k.emit(v_mov_b64_e32(v[70:71], v[28:29])) + k.emit(v_cvt_pk(v[64], v[64], v[65])) + k.emit(v_cvt_pk(v[65], v[66], v[67])) + k.emit(v_cvt_pk(v[66], v[68], v[69])) + k.emit(v_cvt_pk(v[67], v[70], v[71])) + k.emit(buffer_store_dwordx4(v[64:67], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73])) + k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75])) + k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77])) + k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73])) + k.emit(v_pk_add_f32(v[24:25], 
v[90:91], v[74:75])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79])) + k.emit(v_mov_b64_e32(v[72:73], v[22:23])) + k.emit(v_mov_b64_e32(v[74:75], v[24:25])) + k.emit(v_mov_b64_e32(v[76:77], v[26:27])) + k.emit(v_mov_b64_e32(v[78:79], v[28:29])) + k.emit(v_cvt_pk(v[72], v[72], v[73])) + k.emit(v_cvt_pk(v[73], v[74], v[75])) + k.emit(v_cvt_pk(v[74], v[76], v[77])) + k.emit(v_cvt_pk(v[75], v[78], v[79])) + k.emit(buffer_store_dwordx4(v[72:75], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81])) + k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83])) + k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85])) + k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87])) + k.emit(v_mov_b64_e32(v[80:81], v[22:23])) + k.emit(v_mov_b64_e32(v[82:83], v[24:25])) + k.emit(v_mov_b64_e32(v[84:85], v[26:27])) + k.emit(v_mov_b64_e32(v[86:87], v[28:29])) + k.emit(v_cvt_pk(v[80], v[80], v[81])) + k.emit(v_cvt_pk(v[81], v[82], v[83])) + k.emit(v_cvt_pk(v[82], v[84], v[85])) + k.emit(v_cvt_pk(v[83], v[86], v[87])) + k.emit(buffer_store_dwordx4(v[80:83], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(v_mov_b32_e32(v[30], 2147483648)) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[36], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[36], 2, v[36])) + k.emit(ds_read_b128(v[88:91], v[36])) + k.emit(ds_read_b128(v[92:95], v[36], v[0], v[0], 0, 16)) + 
k.emit(ds_read_b128(v[96:99], v[36], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[100:103], v[36], v[0], v[0], 0, 16, 4)) + k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[35], v[30], v[35], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[38], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[38], 2, v[38])) + k.emit(v_add_lshl_u32_e64(v[37], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[37], v[30], v[37], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[104], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[104], 2, v[104])) + k.emit(v_add_lshl_u32_e64(v[39], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[39], v[30], v[39], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[106], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[106], 2, v[106])) + k.emit(v_add_lshl_u32_e64(v[105], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], 
v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[108], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[108], 2, v[108])) + k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[110], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[110], 2, v[110])) + k.emit(v_add_lshl_u32_e64(v[109], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83])) + k.emit(v_accvgpr_read(v[40], v[192])) + k.emit(v_accvgpr_read(v[41], v[196])) + k.emit(v_accvgpr_read(v[42], v[200])) + k.emit(v_accvgpr_read(v[43], v[204])) + k.emit(v_accvgpr_read(v[44], v[208])) + k.emit(v_accvgpr_read(v[45], v[212])) + k.emit(v_accvgpr_read(v[46], v[216])) + k.emit(v_accvgpr_read(v[47], v[220])) + k.emit(v_accvgpr_read(v[48], v[224])) + k.emit(v_accvgpr_read(v[49], v[228])) + k.emit(v_accvgpr_read(v[50], v[232])) + k.emit(v_accvgpr_read(v[51], v[236])) + k.emit(v_accvgpr_read(v[52], v[240])) + k.emit(v_accvgpr_read(v[53], v[244])) + k.emit(v_accvgpr_read(v[54], v[248])) + k.emit(v_accvgpr_read(v[55], v[252])) + k.emit(v_accvgpr_read(v[56], v[1])) + k.emit(v_accvgpr_read(v[57], v[5])) + k.emit(v_accvgpr_read(v[58], v[9])) + k.emit(v_accvgpr_read(v[59], v[13])) + k.emit(v_accvgpr_read(v[60], v[17])) + k.emit(v_accvgpr_read(v[61], v[21])) + k.emit(v_accvgpr_read(v[62], v[25])) + k.emit(v_accvgpr_read(v[63], v[29])) + k.emit(v_accvgpr_read(v[64], v[33])) + k.emit(v_accvgpr_read(v[65], v[37])) + k.emit(v_accvgpr_read(v[66], v[41])) + k.emit(v_accvgpr_read(v[67], v[45])) 
+ k.emit(v_accvgpr_read(v[68], v[49])) + k.emit(v_accvgpr_read(v[69], v[53])) + k.emit(v_accvgpr_read(v[70], v[57])) + k.emit(v_accvgpr_read(v[71], v[61])) + k.emit(v_accvgpr_read(v[72], v[65])) + k.emit(v_accvgpr_read(v[73], v[69])) + k.emit(v_accvgpr_read(v[74], v[73])) + k.emit(v_accvgpr_read(v[75], v[77])) + k.emit(v_accvgpr_read(v[76], v[81])) + k.emit(v_accvgpr_read(v[77], v[85])) + k.emit(v_accvgpr_read(v[78], v[89])) + k.emit(v_accvgpr_read(v[79], v[93])) + k.emit(v_accvgpr_read(v[80], v[97])) + k.emit(v_accvgpr_read(v[81], v[101])) + k.emit(v_accvgpr_read(v[82], v[105])) + k.emit(v_accvgpr_read(v[83], v[109])) + k.emit(v_accvgpr_read(v[84], v[113])) + k.emit(v_accvgpr_read(v[85], v[117])) + k.emit(v_accvgpr_read(v[86], v[121])) + k.emit(v_accvgpr_read(v[87], v[125])) + k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41])) + k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43])) + k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45])) + k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47])) + k.emit(v_mov_b64_e32(v[40:41], v[22:23])) + k.emit(v_mov_b64_e32(v[42:43], v[24:25])) + k.emit(v_mov_b64_e32(v[44:45], v[26:27])) + k.emit(v_mov_b64_e32(v[46:47], v[28:29])) + k.emit(v_cvt_pk(v[40], v[40], v[41])) + k.emit(v_cvt_pk(v[41], v[42], v[43])) + k.emit(v_cvt_pk(v[42], v[44], v[45])) + k.emit(v_cvt_pk(v[43], v[46], v[47])) + k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49])) + k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51])) + k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53])) + k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55])) + 
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55])) + k.emit(v_mov_b64_e32(v[48:49], v[22:23])) + k.emit(v_mov_b64_e32(v[50:51], v[24:25])) + k.emit(v_mov_b64_e32(v[52:53], v[26:27])) + k.emit(v_mov_b64_e32(v[54:55], v[28:29])) + k.emit(v_cvt_pk(v[48], v[48], v[49])) + k.emit(v_cvt_pk(v[49], v[50], v[51])) + k.emit(v_cvt_pk(v[50], v[52], v[53])) + k.emit(v_cvt_pk(v[51], v[54], v[55])) + k.emit(buffer_store_dwordx4(v[48:51], v[37], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57])) + k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59])) + k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61])) + k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63])) + k.emit(v_mov_b64_e32(v[56:57], v[22:23])) + k.emit(v_mov_b64_e32(v[58:59], v[24:25])) + k.emit(v_mov_b64_e32(v[60:61], v[26:27])) + k.emit(v_mov_b64_e32(v[62:63], v[28:29])) + k.emit(v_cvt_pk(v[56], v[56], v[57])) + k.emit(v_cvt_pk(v[57], v[58], v[59])) + k.emit(v_cvt_pk(v[58], v[60], v[61])) + k.emit(v_cvt_pk(v[59], v[62], v[63])) + k.emit(buffer_store_dwordx4(v[56:59], v[39], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65])) + k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67])) + k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69])) + k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71])) + k.emit(v_mov_b64_e32(v[64:65], v[22:23])) + k.emit(v_mov_b64_e32(v[66:67], v[24:25])) + 
k.emit(v_mov_b64_e32(v[68:69], v[26:27])) + k.emit(v_mov_b64_e32(v[70:71], v[28:29])) + k.emit(v_cvt_pk(v[64], v[64], v[65])) + k.emit(v_cvt_pk(v[65], v[66], v[67])) + k.emit(v_cvt_pk(v[66], v[68], v[69])) + k.emit(v_cvt_pk(v[67], v[70], v[71])) + k.emit(buffer_store_dwordx4(v[64:67], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73])) + k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75])) + k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77])) + k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79])) + k.emit(v_mov_b64_e32(v[72:73], v[22:23])) + k.emit(v_mov_b64_e32(v[74:75], v[24:25])) + k.emit(v_mov_b64_e32(v[76:77], v[26:27])) + k.emit(v_mov_b64_e32(v[78:79], v[28:29])) + k.emit(v_cvt_pk(v[72], v[72], v[73])) + k.emit(v_cvt_pk(v[73], v[74], v[75])) + k.emit(v_cvt_pk(v[74], v[76], v[77])) + k.emit(v_cvt_pk(v[75], v[78], v[79])) + k.emit(buffer_store_dwordx4(v[72:75], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81])) + k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83])) + k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85])) + k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87])) + k.emit(v_mov_b64_e32(v[80:81], v[22:23])) + k.emit(v_mov_b64_e32(v[82:83], v[24:25])) + k.emit(v_mov_b64_e32(v[84:85], v[26:27])) + k.emit(v_mov_b64_e32(v[86:87], v[28:29])) + k.emit(v_cvt_pk(v[80], v[80], v[81])) + k.emit(v_cvt_pk(v[81], v[82], v[83])) + k.emit(v_cvt_pk(v[82], v[84], v[85])) + k.emit(v_cvt_pk(v[83], v[86], v[87])) + k.emit(buffer_store_dwordx4(v[80:83], v[109], 
s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(v_mov_b32_e32(v[30], 2147483648)) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[36], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[36], 2, v[36])) + k.emit(ds_read_b128(v[88:91], v[36])) + k.emit(ds_read_b128(v[92:95], v[36], v[0], v[0], 0, 16)) + k.emit(ds_read_b128(v[96:99], v[36], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[100:103], v[36], v[0], v[0], 0, 16, 4)) + k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[35], v[30], v[35], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[38], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[38], 2, v[38])) + k.emit(v_add_lshl_u32_e64(v[37], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[37], v[30], v[37], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[104], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[104], 2, v[104])) + k.emit(v_add_lshl_u32_e64(v[39], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[39], v[30], v[39], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + 
k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[106], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[106], 2, v[106])) + k.emit(v_add_lshl_u32_e64(v[105], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[108], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[108], 2, v[108])) + k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[110], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[110], 2, v[110])) + k.emit(v_add_lshl_u32_e64(v[109], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83])) + k.emit(v_accvgpr_read(v[40], v[129])) + k.emit(v_accvgpr_read(v[41], v[133])) + k.emit(v_accvgpr_read(v[42], v[137])) + k.emit(v_accvgpr_read(v[43], v[141])) + k.emit(v_accvgpr_read(v[44], v[145])) + k.emit(v_accvgpr_read(v[45], v[149])) + k.emit(v_accvgpr_read(v[46], v[153])) + k.emit(v_accvgpr_read(v[47], v[157])) + k.emit(v_accvgpr_read(v[48], v[161])) + k.emit(v_accvgpr_read(v[49], v[165])) + k.emit(v_accvgpr_read(v[50], v[169])) + k.emit(v_accvgpr_read(v[51], v[173])) + 
k.emit(v_accvgpr_read(v[52], v[177])) + k.emit(v_accvgpr_read(v[53], v[181])) + k.emit(v_accvgpr_read(v[54], v[185])) + k.emit(v_accvgpr_read(v[55], v[189])) + k.emit(v_accvgpr_read(v[56], v[193])) + k.emit(v_accvgpr_read(v[57], v[197])) + k.emit(v_accvgpr_read(v[58], v[201])) + k.emit(v_accvgpr_read(v[59], v[205])) + k.emit(v_accvgpr_read(v[60], v[209])) + k.emit(v_accvgpr_read(v[61], v[213])) + k.emit(v_accvgpr_read(v[62], v[217])) + k.emit(v_accvgpr_read(v[63], v[221])) + k.emit(v_accvgpr_read(v[64], v[225])) + k.emit(v_accvgpr_read(v[65], v[229])) + k.emit(v_accvgpr_read(v[66], v[233])) + k.emit(v_accvgpr_read(v[67], v[237])) + k.emit(v_accvgpr_read(v[68], v[241])) + k.emit(v_accvgpr_read(v[69], v[245])) + k.emit(v_accvgpr_read(v[70], v[249])) + k.emit(v_accvgpr_read(v[71], v[253])) + k.emit(v_accvgpr_read(v[72], v[2])) + k.emit(v_accvgpr_read(v[73], v[6])) + k.emit(v_accvgpr_read(v[74], v[10])) + k.emit(v_accvgpr_read(v[75], v[14])) + k.emit(v_accvgpr_read(v[76], v[18])) + k.emit(v_accvgpr_read(v[77], v[22])) + k.emit(v_accvgpr_read(v[78], v[26])) + k.emit(v_accvgpr_read(v[79], v[30])) + k.emit(v_accvgpr_read(v[80], v[34])) + k.emit(v_accvgpr_read(v[81], v[38])) + k.emit(v_accvgpr_read(v[82], v[42])) + k.emit(v_accvgpr_read(v[83], v[46])) + k.emit(v_accvgpr_read(v[84], v[50])) + k.emit(v_accvgpr_read(v[85], v[54])) + k.emit(v_accvgpr_read(v[86], v[58])) + k.emit(v_accvgpr_read(v[87], v[62])) + k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41])) + k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43])) + k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45])) + k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47])) + 
k.emit(v_mov_b64_e32(v[40:41], v[22:23])) + k.emit(v_mov_b64_e32(v[42:43], v[24:25])) + k.emit(v_mov_b64_e32(v[44:45], v[26:27])) + k.emit(v_mov_b64_e32(v[46:47], v[28:29])) + k.emit(v_cvt_pk(v[40], v[40], v[41])) + k.emit(v_cvt_pk(v[41], v[42], v[43])) + k.emit(v_cvt_pk(v[42], v[44], v[45])) + k.emit(v_cvt_pk(v[43], v[46], v[47])) + k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49])) + k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51])) + k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53])) + k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55])) + k.emit(v_mov_b64_e32(v[48:49], v[22:23])) + k.emit(v_mov_b64_e32(v[50:51], v[24:25])) + k.emit(v_mov_b64_e32(v[52:53], v[26:27])) + k.emit(v_mov_b64_e32(v[54:55], v[28:29])) + k.emit(v_cvt_pk(v[48], v[48], v[49])) + k.emit(v_cvt_pk(v[49], v[50], v[51])) + k.emit(v_cvt_pk(v[50], v[52], v[53])) + k.emit(v_cvt_pk(v[51], v[54], v[55])) + k.emit(buffer_store_dwordx4(v[48:51], v[37], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57])) + k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59])) + k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61])) + k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63])) + k.emit(v_mov_b64_e32(v[56:57], v[22:23])) + k.emit(v_mov_b64_e32(v[58:59], v[24:25])) + k.emit(v_mov_b64_e32(v[60:61], v[26:27])) + k.emit(v_mov_b64_e32(v[62:63], v[28:29])) + k.emit(v_cvt_pk(v[56], v[56], v[57])) + k.emit(v_cvt_pk(v[57], v[58], v[59])) + k.emit(v_cvt_pk(v[58], v[60], v[61])) + 
k.emit(v_cvt_pk(v[59], v[62], v[63])) + k.emit(buffer_store_dwordx4(v[56:59], v[39], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65])) + k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67])) + k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69])) + k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71])) + k.emit(v_mov_b64_e32(v[64:65], v[22:23])) + k.emit(v_mov_b64_e32(v[66:67], v[24:25])) + k.emit(v_mov_b64_e32(v[68:69], v[26:27])) + k.emit(v_mov_b64_e32(v[70:71], v[28:29])) + k.emit(v_cvt_pk(v[64], v[64], v[65])) + k.emit(v_cvt_pk(v[65], v[66], v[67])) + k.emit(v_cvt_pk(v[66], v[68], v[69])) + k.emit(v_cvt_pk(v[67], v[70], v[71])) + k.emit(buffer_store_dwordx4(v[64:67], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73])) + k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75])) + k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77])) + k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79])) + k.emit(v_mov_b64_e32(v[72:73], v[22:23])) + k.emit(v_mov_b64_e32(v[74:75], v[24:25])) + k.emit(v_mov_b64_e32(v[76:77], v[26:27])) + k.emit(v_mov_b64_e32(v[78:79], v[28:29])) + k.emit(v_cvt_pk(v[72], v[72], v[73])) + k.emit(v_cvt_pk(v[73], v[74], v[75])) + k.emit(v_cvt_pk(v[74], v[76], v[77])) + k.emit(v_cvt_pk(v[75], v[78], v[79])) + k.emit(buffer_store_dwordx4(v[72:75], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81])) + k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83])) + k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85])) + 
k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87])) + k.emit(v_mov_b64_e32(v[80:81], v[22:23])) + k.emit(v_mov_b64_e32(v[82:83], v[24:25])) + k.emit(v_mov_b64_e32(v[84:85], v[26:27])) + k.emit(v_mov_b64_e32(v[86:87], v[28:29])) + k.emit(v_cvt_pk(v[80], v[80], v[81])) + k.emit(v_cvt_pk(v[81], v[82], v[83])) + k.emit(v_cvt_pk(v[82], v[84], v[85])) + k.emit(v_cvt_pk(v[83], v[86], v[87])) + k.emit(buffer_store_dwordx4(v[80:83], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(v_mov_b32_e32(v[30], 2147483648)) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[36], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[36], 2, v[36])) + k.emit(ds_read_b128(v[88:91], v[36])) + k.emit(ds_read_b128(v[92:95], v[36], v[0], v[0], 0, 16)) + k.emit(ds_read_b128(v[96:99], v[36], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[100:103], v[36], v[0], v[0], 0, 16, 4)) + k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[35], v[30], v[35], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[38], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[38], 2, v[38])) + k.emit(v_add_lshl_u32_e64(v[37], v[21], v[18], 1)) + 
k.emit(v_cndmask_b32_e64(v[37], v[30], v[37], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[104], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[104], 2, v[104])) + k.emit(v_add_lshl_u32_e64(v[39], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[39], v[30], v[39], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[106], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[106], 2, v[106])) + k.emit(v_add_lshl_u32_e64(v[105], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[108], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[108], 2, v[108])) + k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + 
k.emit(v_sub_u32_e64(v[110], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[110], 2, v[110])) + k.emit(v_add_lshl_u32_e64(v[109], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83])) + k.emit(v_accvgpr_read(v[40], v[66])) + k.emit(v_accvgpr_read(v[41], v[70])) + k.emit(v_accvgpr_read(v[42], v[74])) + k.emit(v_accvgpr_read(v[43], v[78])) + k.emit(v_accvgpr_read(v[44], v[82])) + k.emit(v_accvgpr_read(v[45], v[86])) + k.emit(v_accvgpr_read(v[46], v[90])) + k.emit(v_accvgpr_read(v[47], v[94])) + k.emit(v_accvgpr_read(v[48], v[98])) + k.emit(v_accvgpr_read(v[49], v[102])) + k.emit(v_accvgpr_read(v[50], v[106])) + k.emit(v_accvgpr_read(v[51], v[110])) + k.emit(v_accvgpr_read(v[52], v[114])) + k.emit(v_accvgpr_read(v[53], v[118])) + k.emit(v_accvgpr_read(v[54], v[122])) + k.emit(v_accvgpr_read(v[55], v[126])) + k.emit(v_accvgpr_read(v[56], v[130])) + k.emit(v_accvgpr_read(v[57], v[134])) + k.emit(v_accvgpr_read(v[58], v[138])) + k.emit(v_accvgpr_read(v[59], v[142])) + k.emit(v_accvgpr_read(v[60], v[146])) + k.emit(v_accvgpr_read(v[61], v[150])) + k.emit(v_accvgpr_read(v[62], v[154])) + k.emit(v_accvgpr_read(v[63], v[158])) + k.emit(v_accvgpr_read(v[64], v[162])) + k.emit(v_accvgpr_read(v[65], v[166])) + k.emit(v_accvgpr_read(v[66], v[170])) + k.emit(v_accvgpr_read(v[67], v[174])) + k.emit(v_accvgpr_read(v[68], v[178])) + k.emit(v_accvgpr_read(v[69], v[182])) + k.emit(v_accvgpr_read(v[70], v[186])) + k.emit(v_accvgpr_read(v[71], v[190])) + k.emit(v_accvgpr_read(v[72], v[194])) + k.emit(v_accvgpr_read(v[73], v[198])) + k.emit(v_accvgpr_read(v[74], v[202])) + k.emit(v_accvgpr_read(v[75], v[206])) + k.emit(v_accvgpr_read(v[76], v[210])) + k.emit(v_accvgpr_read(v[77], v[214])) + k.emit(v_accvgpr_read(v[78], v[218])) + k.emit(v_accvgpr_read(v[79], v[222])) + k.emit(v_accvgpr_read(v[80], v[226])) + k.emit(v_accvgpr_read(v[81], v[230])) + k.emit(v_accvgpr_read(v[82], v[234])) + k.emit(v_accvgpr_read(v[83], v[238])) + k.emit(v_accvgpr_read(v[84], v[242])) + 
k.emit(v_accvgpr_read(v[85], v[246])) + k.emit(v_accvgpr_read(v[86], v[250])) + k.emit(v_accvgpr_read(v[87], v[254])) + k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41])) + k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43])) + k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45])) + k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47])) + k.emit(v_mov_b64_e32(v[40:41], v[22:23])) + k.emit(v_mov_b64_e32(v[42:43], v[24:25])) + k.emit(v_mov_b64_e32(v[44:45], v[26:27])) + k.emit(v_mov_b64_e32(v[46:47], v[28:29])) + k.emit(v_cvt_pk(v[40], v[40], v[41])) + k.emit(v_cvt_pk(v[41], v[42], v[43])) + k.emit(v_cvt_pk(v[42], v[44], v[45])) + k.emit(v_cvt_pk(v[43], v[46], v[47])) + k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49])) + k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51])) + k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53])) + k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55])) + k.emit(v_mov_b64_e32(v[48:49], v[22:23])) + k.emit(v_mov_b64_e32(v[50:51], v[24:25])) + k.emit(v_mov_b64_e32(v[52:53], v[26:27])) + k.emit(v_mov_b64_e32(v[54:55], v[28:29])) + k.emit(v_cvt_pk(v[48], v[48], v[49])) + k.emit(v_cvt_pk(v[49], v[50], v[51])) + k.emit(v_cvt_pk(v[50], v[52], v[53])) + k.emit(v_cvt_pk(v[51], v[54], v[55])) + k.emit(buffer_store_dwordx4(v[48:51], v[37], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[56:57], 
v[96:97], v[56:57])) + k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59])) + k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61])) + k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63])) + k.emit(v_mov_b64_e32(v[56:57], v[22:23])) + k.emit(v_mov_b64_e32(v[58:59], v[24:25])) + k.emit(v_mov_b64_e32(v[60:61], v[26:27])) + k.emit(v_mov_b64_e32(v[62:63], v[28:29])) + k.emit(v_cvt_pk(v[56], v[56], v[57])) + k.emit(v_cvt_pk(v[57], v[58], v[59])) + k.emit(v_cvt_pk(v[58], v[60], v[61])) + k.emit(v_cvt_pk(v[59], v[62], v[63])) + k.emit(buffer_store_dwordx4(v[56:59], v[39], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65])) + k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67])) + k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69])) + k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71])) + k.emit(v_mov_b64_e32(v[64:65], v[22:23])) + k.emit(v_mov_b64_e32(v[66:67], v[24:25])) + k.emit(v_mov_b64_e32(v[68:69], v[26:27])) + k.emit(v_mov_b64_e32(v[70:71], v[28:29])) + k.emit(v_cvt_pk(v[64], v[64], v[65])) + k.emit(v_cvt_pk(v[65], v[66], v[67])) + k.emit(v_cvt_pk(v[66], v[68], v[69])) + k.emit(v_cvt_pk(v[67], v[70], v[71])) + k.emit(buffer_store_dwordx4(v[64:67], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73])) + k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75])) + k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77])) + k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75])) + 
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79])) + k.emit(v_mov_b64_e32(v[72:73], v[22:23])) + k.emit(v_mov_b64_e32(v[74:75], v[24:25])) + k.emit(v_mov_b64_e32(v[76:77], v[26:27])) + k.emit(v_mov_b64_e32(v[78:79], v[28:29])) + k.emit(v_cvt_pk(v[72], v[72], v[73])) + k.emit(v_cvt_pk(v[73], v[74], v[75])) + k.emit(v_cvt_pk(v[74], v[76], v[77])) + k.emit(v_cvt_pk(v[75], v[78], v[79])) + k.emit(buffer_store_dwordx4(v[72:75], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81])) + k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83])) + k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85])) + k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87])) + k.emit(v_mov_b64_e32(v[80:81], v[22:23])) + k.emit(v_mov_b64_e32(v[82:83], v[24:25])) + k.emit(v_mov_b64_e32(v[84:85], v[26:27])) + k.emit(v_mov_b64_e32(v[86:87], v[28:29])) + k.emit(v_cvt_pk(v[80], v[80], v[81])) + k.emit(v_cvt_pk(v[81], v[82], v[83])) + k.emit(v_cvt_pk(v[82], v[84], v[85])) + k.emit(v_cvt_pk(v[83], v[86], v[87])) + k.emit(buffer_store_dwordx4(v[80:83], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(v_mov_b32_e32(v[30], 2147483648)) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[36], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[36], 2, v[36])) + k.emit(ds_read_b128(v[88:91], v[36])) + k.emit(ds_read_b128(v[92:95], v[36], v[0], v[0], 0, 16)) + k.emit(ds_read_b128(v[96:99], v[36], v[0], 
v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[100:103], v[36], v[0], v[0], 0, 16, 4)) + k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[35], v[30], v[35], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[38], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[38], 2, v[38])) + k.emit(v_add_lshl_u32_e64(v[37], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[37], v[30], v[37], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[104], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[104], 2, v[104])) + k.emit(v_add_lshl_u32_e64(v[39], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[39], v[30], v[39], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[106], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[106], 2, v[106])) + k.emit(v_add_lshl_u32_e64(v[105], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + 
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[108], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[108], 2, v[108])) + k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[110], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[110], 2, v[110])) + k.emit(v_add_lshl_u32_e64(v[109], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83])) + k.emit(v_accvgpr_read(v[40], v[3])) + k.emit(v_accvgpr_read(v[41], v[7])) + k.emit(v_accvgpr_read(v[42], v[11])) + k.emit(v_accvgpr_read(v[43], v[15])) + k.emit(v_accvgpr_read(v[44], v[19])) + k.emit(v_accvgpr_read(v[45], v[23])) + k.emit(v_accvgpr_read(v[46], v[27])) + k.emit(v_accvgpr_read(v[47], v[31])) + k.emit(v_accvgpr_read(v[48], v[35])) + k.emit(v_accvgpr_read(v[49], v[39])) + k.emit(v_accvgpr_read(v[50], v[43])) + k.emit(v_accvgpr_read(v[51], v[47])) + k.emit(v_accvgpr_read(v[52], v[51])) + k.emit(v_accvgpr_read(v[53], v[55])) + k.emit(v_accvgpr_read(v[54], v[59])) + k.emit(v_accvgpr_read(v[55], v[63])) + k.emit(v_accvgpr_read(v[56], v[67])) + k.emit(v_accvgpr_read(v[57], v[71])) + k.emit(v_accvgpr_read(v[58], v[75])) + k.emit(v_accvgpr_read(v[59], v[79])) + k.emit(v_accvgpr_read(v[60], v[83])) + k.emit(v_accvgpr_read(v[61], v[87])) + k.emit(v_accvgpr_read(v[62], v[91])) + k.emit(v_accvgpr_read(v[63], v[95])) + k.emit(v_accvgpr_read(v[64], v[99])) + k.emit(v_accvgpr_read(v[65], v[103])) + k.emit(v_accvgpr_read(v[66], v[107])) + k.emit(v_accvgpr_read(v[67], v[111])) + 
k.emit(v_accvgpr_read(v[68], v[115])) + k.emit(v_accvgpr_read(v[69], v[119])) + k.emit(v_accvgpr_read(v[70], v[123])) + k.emit(v_accvgpr_read(v[71], v[127])) + k.emit(v_accvgpr_read(v[72], v[131])) + k.emit(v_accvgpr_read(v[73], v[135])) + k.emit(v_accvgpr_read(v[74], v[139])) + k.emit(v_accvgpr_read(v[75], v[143])) + k.emit(v_accvgpr_read(v[76], v[147])) + k.emit(v_accvgpr_read(v[77], v[151])) + k.emit(v_accvgpr_read(v[78], v[155])) + k.emit(v_accvgpr_read(v[79], v[159])) + k.emit(v_accvgpr_read(v[80], v[163])) + k.emit(v_accvgpr_read(v[81], v[167])) + k.emit(v_accvgpr_read(v[82], v[171])) + k.emit(v_accvgpr_read(v[83], v[175])) + k.emit(v_accvgpr_read(v[84], v[179])) + k.emit(v_accvgpr_read(v[85], v[183])) + k.emit(v_accvgpr_read(v[86], v[187])) + k.emit(v_accvgpr_read(v[87], v[191])) + k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41])) + k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43])) + k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45])) + k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47])) + k.emit(v_mov_b64_e32(v[40:41], v[22:23])) + k.emit(v_mov_b64_e32(v[42:43], v[24:25])) + k.emit(v_mov_b64_e32(v[44:45], v[26:27])) + k.emit(v_mov_b64_e32(v[46:47], v[28:29])) + k.emit(v_cvt_pk(v[40], v[40], v[41])) + k.emit(v_cvt_pk(v[41], v[42], v[43])) + k.emit(v_cvt_pk(v[42], v[44], v[45])) + k.emit(v_cvt_pk(v[43], v[46], v[47])) + k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49])) + k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51])) + k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53])) + k.emit(v_pk_mul_f32(v[54:55], v[102:103], 
v[54:55])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55])) + k.emit(v_mov_b64_e32(v[48:49], v[22:23])) + k.emit(v_mov_b64_e32(v[50:51], v[24:25])) + k.emit(v_mov_b64_e32(v[52:53], v[26:27])) + k.emit(v_mov_b64_e32(v[54:55], v[28:29])) + k.emit(v_cvt_pk(v[48], v[48], v[49])) + k.emit(v_cvt_pk(v[49], v[50], v[51])) + k.emit(v_cvt_pk(v[50], v[52], v[53])) + k.emit(v_cvt_pk(v[51], v[54], v[55])) + k.emit(buffer_store_dwordx4(v[48:51], v[37], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57])) + k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59])) + k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61])) + k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63])) + k.emit(v_mov_b64_e32(v[56:57], v[22:23])) + k.emit(v_mov_b64_e32(v[58:59], v[24:25])) + k.emit(v_mov_b64_e32(v[60:61], v[26:27])) + k.emit(v_mov_b64_e32(v[62:63], v[28:29])) + k.emit(v_cvt_pk(v[56], v[56], v[57])) + k.emit(v_cvt_pk(v[57], v[58], v[59])) + k.emit(v_cvt_pk(v[58], v[60], v[61])) + k.emit(v_cvt_pk(v[59], v[62], v[63])) + k.emit(buffer_store_dwordx4(v[56:59], v[39], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65])) + k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67])) + k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69])) + k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71])) + k.emit(v_mov_b64_e32(v[64:65], v[22:23])) + k.emit(v_mov_b64_e32(v[66:67], 
v[24:25])) + k.emit(v_mov_b64_e32(v[68:69], v[26:27])) + k.emit(v_mov_b64_e32(v[70:71], v[28:29])) + k.emit(v_cvt_pk(v[64], v[64], v[65])) + k.emit(v_cvt_pk(v[65], v[66], v[67])) + k.emit(v_cvt_pk(v[66], v[68], v[69])) + k.emit(v_cvt_pk(v[67], v[70], v[71])) + k.emit(buffer_store_dwordx4(v[64:67], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73])) + k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75])) + k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77])) + k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79])) + k.emit(v_mov_b64_e32(v[72:73], v[22:23])) + k.emit(v_mov_b64_e32(v[74:75], v[24:25])) + k.emit(v_mov_b64_e32(v[76:77], v[26:27])) + k.emit(v_mov_b64_e32(v[78:79], v[28:29])) + k.emit(v_cvt_pk(v[72], v[72], v[73])) + k.emit(v_cvt_pk(v[73], v[74], v[75])) + k.emit(v_cvt_pk(v[74], v[76], v[77])) + k.emit(v_cvt_pk(v[75], v[78], v[79])) + k.emit(buffer_store_dwordx4(v[72:75], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81])) + k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83])) + k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85])) + k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87])) + k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81])) + k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83])) + k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85])) + k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87])) + k.emit(v_mov_b64_e32(v[80:81], v[22:23])) + k.emit(v_mov_b64_e32(v[82:83], v[24:25])) + k.emit(v_mov_b64_e32(v[84:85], v[26:27])) + k.emit(v_mov_b64_e32(v[86:87], v[28:29])) + k.emit(v_cvt_pk(v[80], v[80], v[81])) + k.emit(v_cvt_pk(v[81], v[82], v[83])) + k.emit(v_cvt_pk(v[82], v[84], v[85])) + k.emit(v_cvt_pk(v[83], v[86], v[87])) + k.emit(buffer_store_dwordx4(v[80:83], 
v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(v_mov_b32_e32(v[30], 2147483648)) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[36], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[36], 2, v[36])) + k.emit(ds_read_b128(v[56:59], v[36])) + k.emit(ds_read_b128(v[60:63], v[36], v[0], v[0], 0, 16)) + k.emit(ds_read_b128(v[64:67], v[36], v[0], v[0], 0, 0, 4)) + k.emit(ds_read_b128(v[68:71], v[36], v[0], v[0], 0, 16, 4)) + k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[35], v[30], v[35], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[38], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[38], 2, v[38])) + k.emit(v_add_lshl_u32_e64(v[37], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[37], v[30], v[37], s[82:83])) + k.emit(v_accvgpr_read(v[40], v[195])) + k.emit(v_accvgpr_read(v[41], v[199])) + k.emit(v_accvgpr_read(v[42], v[203])) + k.emit(v_accvgpr_read(v[43], v[207])) + k.emit(v_accvgpr_read(v[44], v[211])) + k.emit(v_accvgpr_read(v[45], v[215])) + k.emit(v_accvgpr_read(v[46], v[219])) + k.emit(v_accvgpr_read(v[47], v[223])) + k.emit(v_accvgpr_read(v[48], v[227])) + k.emit(v_accvgpr_read(v[49], v[231])) + k.emit(v_accvgpr_read(v[50], v[235])) + k.emit(v_accvgpr_read(v[51], v[239])) + k.emit(v_accvgpr_read(v[52], v[243])) + k.emit(v_accvgpr_read(v[53], v[247])) + k.emit(v_accvgpr_read(v[54], v[251])) + 
k.emit(v_accvgpr_read(v[55], v[255])) + k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_pk_mul_f32(v[40:41], v[64:65], v[40:41])) + k.emit(v_pk_mul_f32(v[42:43], v[66:67], v[42:43])) + k.emit(v_pk_mul_f32(v[44:45], v[68:69], v[44:45])) + k.emit(v_pk_mul_f32(v[46:47], v[70:71], v[46:47])) + k.emit(v_pk_add_f32(v[22:23], v[56:57], v[40:41])) + k.emit(v_pk_add_f32(v[24:25], v[58:59], v[42:43])) + k.emit(v_pk_add_f32(v[26:27], v[60:61], v[44:45])) + k.emit(v_pk_add_f32(v[28:29], v[62:63], v[46:47])) + k.emit(v_mov_b64_e32(v[40:41], v[22:23])) + k.emit(v_mov_b64_e32(v[42:43], v[24:25])) + k.emit(v_mov_b64_e32(v[44:45], v[26:27])) + k.emit(v_mov_b64_e32(v[46:47], v[28:29])) + k.emit(v_cvt_pk(v[40], v[40], v[41])) + k.emit(v_cvt_pk(v[41], v[42], v[43])) + k.emit(v_cvt_pk(v[42], v[44], v[45])) + k.emit(v_cvt_pk(v[43], v[46], v[47])) + k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_pk_mul_f32(v[48:49], v[64:65], v[48:49])) + k.emit(v_pk_mul_f32(v[50:51], v[66:67], v[50:51])) + k.emit(v_pk_mul_f32(v[52:53], v[68:69], v[52:53])) + k.emit(v_pk_mul_f32(v[54:55], v[70:71], v[54:55])) + k.emit(v_pk_add_f32(v[22:23], v[56:57], v[48:49])) + k.emit(v_pk_add_f32(v[24:25], v[58:59], v[50:51])) + k.emit(v_pk_add_f32(v[26:27], v[60:61], v[52:53])) + k.emit(v_pk_add_f32(v[28:29], v[62:63], v[54:55])) + k.emit(v_mov_b64_e32(v[48:49], v[22:23])) + k.emit(v_mov_b64_e32(v[50:51], v[24:25])) + k.emit(v_mov_b64_e32(v[52:53], v[26:27])) + k.emit(v_mov_b64_e32(v[54:55], v[28:29])) + k.emit(v_cvt_pk(v[48], v[48], v[49])) + k.emit(v_cvt_pk(v[49], v[50], v[51])) + k.emit(v_cvt_pk(v[50], v[52], v[53])) + k.emit(v_cvt_pk(v[51], v[54], v[55])) + k.emit(buffer_store_dwordx4(v[48:51], v[37], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(s_branch(), target='GW_End_1') + k.label('GW_B0_E1_M_1') + k.emit(v_mov_b32_e32(v[30], 2147483648)) 
+ k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[78], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[78], 2, v[78])) + k.waitcnt(lgkm=0) + k.emit(s_barrier()) + k.emit(ds_read_b32(v[75], v[78])) + k.emit(ds_read_b32(v[76], v[78], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[77], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[77], v[30], v[77], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[82], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[82], 2, v[82])) + k.emit(ds_read_b32(v[79], v[82])) + k.emit(ds_read_b32(v[80], v[82], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[81], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[81], v[30], v[81], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[86], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[86], 2, v[86])) + k.emit(ds_read_b32(v[83], v[86])) + k.emit(ds_read_b32(v[84], v[86], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[85], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[85], v[30], v[85], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[90], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[90], 2, v[90])) + k.emit(ds_read_b32(v[87], v[90])) + k.emit(ds_read_b32(v[88], v[90], v[0], v[0], 0, 
0, 4)) + k.emit(v_add_lshl_u32_e64(v[89], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[89], v[30], v[89], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[94], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[94], 2, v[94])) + k.emit(ds_read_b32(v[91], v[94])) + k.emit(ds_read_b32(v[92], v[94], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[93], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[93], v[30], v[93], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[98], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[98], 2, v[98])) + k.emit(ds_read_b32(v[95], v[98])) + k.emit(ds_read_b32(v[96], v[98], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[97], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[97], v[30], v[97], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[102], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[102], 2, v[102])) + k.emit(ds_read_b32(v[99], v[102])) + k.emit(ds_read_b32(v[100], v[102], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[101], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[101], v[30], v[101], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[106], v[22], s[78])) 
+ k.emit(v_lshlrev_b32_e32(v[106], 2, v[106])) + k.emit(ds_read_b32(v[103], v[106])) + k.emit(ds_read_b32(v[104], v[106], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[105], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[108], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[108], 2, v[108])) + k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[110], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[110], 2, v[110])) + k.emit(v_add_lshl_u32_e64(v[109], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[112], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[112], 2, v[112])) + k.emit(v_add_lshl_u32_e64(v[111], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[111], v[30], v[111], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[114], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[114], 2, 
v[114])) + k.emit(v_add_lshl_u32_e64(v[113], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[113], v[30], v[113], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[116], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[116], 2, v[116])) + k.emit(v_add_lshl_u32_e64(v[115], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[115], v[30], v[115], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[118], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[118], 2, v[118])) + k.emit(v_add_lshl_u32_e64(v[117], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[117], v[30], v[117], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[120], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[120], 2, v[120])) + k.emit(v_add_lshl_u32_e64(v[119], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[119], v[30], v[119], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[122], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[122], 2, v[122])) + k.emit(v_add_lshl_u32_e64(v[121], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[121], v[30], v[121], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + 
k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[124], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[124], 2, v[124])) + k.emit(v_add_lshl_u32_e64(v[123], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[123], v[30], v[123], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[126], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[126], 2, v[126])) + k.emit(v_add_lshl_u32_e64(v[125], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[125], v[30], v[125], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[128], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[128], 2, v[128])) + k.emit(v_add_lshl_u32_e64(v[127], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[127], v[30], v[127], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[130], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[130], 2, v[130])) + k.emit(v_add_lshl_u32_e64(v[129], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[129], v[30], v[129], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, 
s[2])) + k.emit(v_sub_u32_e64(v[132], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[132], 2, v[132])) + k.emit(v_add_lshl_u32_e64(v[131], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[131], v[30], v[131], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[134], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[134], 2, v[134])) + k.emit(v_add_lshl_u32_e64(v[133], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[133], v[30], v[133], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[136], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[136], 2, v[136])) + k.emit(v_add_lshl_u32_e64(v[135], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[135], v[30], v[135], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[138], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[138], 2, v[138])) + k.emit(v_add_lshl_u32_e64(v[137], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[137], v[30], v[137], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[140], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[140], 2, v[140])) + k.emit(v_add_lshl_u32_e64(v[139], 
v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[139], v[30], v[139], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[142], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[142], 2, v[142])) + k.emit(v_add_lshl_u32_e64(v[141], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[141], v[30], v[141], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[144], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[144], 2, v[144])) + k.emit(v_add_lshl_u32_e64(v[143], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[143], v[30], v[143], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[146], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[146], 2, v[146])) + k.emit(v_add_lshl_u32_e64(v[145], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[145], v[30], v[145], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[148], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[148], 2, v[148])) + k.emit(v_add_lshl_u32_e64(v[147], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[147], v[30], v[147], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + 
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[150], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[150], 2, v[150])) + k.emit(v_add_lshl_u32_e64(v[149], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[149], v[30], v[149], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[152], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[152], 2, v[152])) + k.emit(v_add_lshl_u32_e64(v[151], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[151], v[30], v[151], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[154], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[154], 2, v[154])) + k.emit(v_add_lshl_u32_e64(v[153], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[153], v[30], v[153], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[156], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[156], 2, v[156])) + k.emit(v_add_lshl_u32_e64(v[155], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[155], v[30], v[155], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[158], v[22], s[78])) + 
k.emit(v_lshlrev_b32_e32(v[158], 2, v[158])) + k.emit(v_add_lshl_u32_e64(v[157], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[157], v[30], v[157], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[160], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[160], 2, v[160])) + k.emit(v_add_lshl_u32_e64(v[159], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[159], v[30], v[159], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[162], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[162], 2, v[162])) + k.emit(v_add_lshl_u32_e64(v[161], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[161], v[30], v[161], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[164], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[164], 2, v[164])) + k.emit(v_add_lshl_u32_e64(v[163], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[163], v[30], v[163], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[166], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[166], 2, v[166])) + k.emit(v_add_lshl_u32_e64(v[165], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[165], v[30], v[165], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + 
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[168], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[168], 2, v[168])) + k.emit(v_add_lshl_u32_e64(v[167], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[167], v[30], v[167], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[170], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[170], 2, v[170])) + k.emit(v_add_lshl_u32_e64(v[169], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[169], v[30], v[169], s[82:83])) + k.emit(v_accvgpr_read(v[35], v[0])) + k.emit(v_accvgpr_read(v[36], v[4])) + k.emit(v_accvgpr_read(v[37], v[8])) + k.emit(v_accvgpr_read(v[38], v[12])) + k.emit(v_accvgpr_read(v[39], v[16])) + k.emit(v_accvgpr_read(v[40], v[20])) + k.emit(v_accvgpr_read(v[41], v[24])) + k.emit(v_accvgpr_read(v[42], v[28])) + k.emit(v_accvgpr_read(v[43], v[32])) + k.emit(v_accvgpr_read(v[44], v[36])) + k.emit(v_accvgpr_read(v[45], v[40])) + k.emit(v_accvgpr_read(v[46], v[44])) + k.emit(v_accvgpr_read(v[47], v[48])) + k.emit(v_accvgpr_read(v[48], v[52])) + k.emit(v_accvgpr_read(v[49], v[56])) + k.emit(v_accvgpr_read(v[50], v[60])) + k.emit(v_accvgpr_read(v[51], v[64])) + k.emit(v_accvgpr_read(v[52], v[68])) + k.emit(v_accvgpr_read(v[53], v[72])) + k.emit(v_accvgpr_read(v[54], v[76])) + k.emit(v_accvgpr_read(v[55], v[80])) + k.emit(v_accvgpr_read(v[56], v[84])) + k.emit(v_accvgpr_read(v[57], v[88])) + k.emit(v_accvgpr_read(v[58], v[92])) + k.emit(v_accvgpr_read(v[59], v[96])) + k.emit(v_accvgpr_read(v[60], v[100])) + k.emit(v_accvgpr_read(v[61], v[104])) + k.emit(v_accvgpr_read(v[62], v[108])) + k.emit(v_accvgpr_read(v[63], v[112])) + 
k.emit(v_accvgpr_read(v[64], v[116])) + k.emit(v_accvgpr_read(v[65], v[120])) + k.emit(v_accvgpr_read(v[66], v[124])) + k.emit(v_accvgpr_read(v[67], v[128])) + k.emit(v_accvgpr_read(v[68], v[132])) + k.emit(v_accvgpr_read(v[69], v[136])) + k.emit(v_accvgpr_read(v[70], v[140])) + k.emit(v_accvgpr_read(v[71], v[144])) + k.emit(v_accvgpr_read(v[72], v[148])) + k.emit(v_accvgpr_read(v[73], v[152])) + k.emit(v_accvgpr_read(v[74], v[156])) + k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_mul_f32_e32(v[35], v[76], v[35])) + k.emit(v_add_f32_e32(v[22], v[75], v[35])) + k.emit(v_mov_b32_e32(v[35], v[22])) + k.emit(v_cvt_pk(v[35], v[35], v[35])) + k.emit(buffer_store_short(v[35], v[77], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[36], v[80], v[36])) + k.emit(v_add_f32_e32(v[22], v[79], v[36])) + k.emit(v_mov_b32_e32(v[36], v[22])) + k.emit(v_cvt_pk(v[36], v[36], v[36])) + k.emit(buffer_store_short(v[36], v[81], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[37], v[84], v[37])) + k.emit(v_add_f32_e32(v[22], v[83], v[37])) + k.emit(v_mov_b32_e32(v[37], v[22])) + k.emit(v_cvt_pk(v[37], v[37], v[37])) + k.emit(buffer_store_short(v[37], v[85], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[38], v[88], v[38])) + k.emit(v_add_f32_e32(v[22], v[87], v[38])) + k.emit(v_mov_b32_e32(v[38], v[22])) + k.emit(v_cvt_pk(v[38], v[38], v[38])) + k.emit(buffer_store_short(v[38], v[89], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[39], v[92], v[39])) + k.emit(v_add_f32_e32(v[22], v[91], v[39])) + k.emit(v_mov_b32_e32(v[39], v[22])) + k.emit(v_cvt_pk(v[39], v[39], v[39])) + k.emit(buffer_store_short(v[39], v[93], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[40], v[96], v[40])) + k.emit(v_add_f32_e32(v[22], v[95], v[40])) + k.emit(v_mov_b32_e32(v[40], v[22])) + k.emit(v_cvt_pk(v[40], v[40], v[40])) + 
k.emit(buffer_store_short(v[40], v[97], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[41], v[100], v[41])) + k.emit(v_add_f32_e32(v[22], v[99], v[41])) + k.emit(v_mov_b32_e32(v[41], v[22])) + k.emit(v_cvt_pk(v[41], v[41], v[41])) + k.emit(buffer_store_short(v[41], v[101], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[42], v[104], v[42])) + k.emit(v_add_f32_e32(v[22], v[103], v[42])) + k.emit(v_mov_b32_e32(v[42], v[22])) + k.emit(v_cvt_pk(v[42], v[42], v[42])) + k.emit(buffer_store_short(v[42], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[43], v[76], v[43])) + k.emit(v_add_f32_e32(v[22], v[75], v[43])) + k.emit(v_mov_b32_e32(v[43], v[22])) + k.emit(v_cvt_pk(v[43], v[43], v[43])) + k.emit(buffer_store_short(v[43], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[44], v[80], v[44])) + k.emit(v_add_f32_e32(v[22], v[79], v[44])) + k.emit(v_mov_b32_e32(v[44], v[22])) + k.emit(v_cvt_pk(v[44], v[44], v[44])) + k.emit(buffer_store_short(v[44], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[45], v[84], v[45])) + k.emit(v_add_f32_e32(v[22], v[83], v[45])) + k.emit(v_mov_b32_e32(v[45], v[22])) + k.emit(v_cvt_pk(v[45], v[45], v[45])) + k.emit(buffer_store_short(v[45], v[111], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[46], v[88], v[46])) + k.emit(v_add_f32_e32(v[22], v[87], v[46])) + k.emit(v_mov_b32_e32(v[46], v[22])) + k.emit(v_cvt_pk(v[46], v[46], v[46])) + k.emit(buffer_store_short(v[46], v[113], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[47], v[92], v[47])) + k.emit(v_add_f32_e32(v[22], v[91], v[47])) + k.emit(v_mov_b32_e32(v[47], v[22])) + k.emit(v_cvt_pk(v[47], v[47], v[47])) + k.emit(buffer_store_short(v[47], v[115], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[48], v[96], v[48])) + k.emit(v_add_f32_e32(v[22], v[95], v[48])) + k.emit(v_mov_b32_e32(v[48], v[22])) + k.emit(v_cvt_pk(v[48], v[48], v[48])) + 
k.emit(buffer_store_short(v[48], v[117], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[49], v[100], v[49])) + k.emit(v_add_f32_e32(v[22], v[99], v[49])) + k.emit(v_mov_b32_e32(v[49], v[22])) + k.emit(v_cvt_pk(v[49], v[49], v[49])) + k.emit(buffer_store_short(v[49], v[119], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[50], v[104], v[50])) + k.emit(v_add_f32_e32(v[22], v[103], v[50])) + k.emit(v_mov_b32_e32(v[50], v[22])) + k.emit(v_cvt_pk(v[50], v[50], v[50])) + k.emit(buffer_store_short(v[50], v[121], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[51], v[76], v[51])) + k.emit(v_add_f32_e32(v[22], v[75], v[51])) + k.emit(v_mov_b32_e32(v[51], v[22])) + k.emit(v_cvt_pk(v[51], v[51], v[51])) + k.emit(buffer_store_short(v[51], v[123], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[52], v[80], v[52])) + k.emit(v_add_f32_e32(v[22], v[79], v[52])) + k.emit(v_mov_b32_e32(v[52], v[22])) + k.emit(v_cvt_pk(v[52], v[52], v[52])) + k.emit(buffer_store_short(v[52], v[125], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[53], v[84], v[53])) + k.emit(v_add_f32_e32(v[22], v[83], v[53])) + k.emit(v_mov_b32_e32(v[53], v[22])) + k.emit(v_cvt_pk(v[53], v[53], v[53])) + k.emit(buffer_store_short(v[53], v[127], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[54], v[88], v[54])) + k.emit(v_add_f32_e32(v[22], v[87], v[54])) + k.emit(v_mov_b32_e32(v[54], v[22])) + k.emit(v_cvt_pk(v[54], v[54], v[54])) + k.emit(buffer_store_short(v[54], v[129], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[55], v[92], v[55])) + k.emit(v_add_f32_e32(v[22], v[91], v[55])) + k.emit(v_mov_b32_e32(v[55], v[22])) + k.emit(v_cvt_pk(v[55], v[55], v[55])) + k.emit(buffer_store_short(v[55], v[131], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[56], v[96], v[56])) + k.emit(v_add_f32_e32(v[22], v[95], v[56])) + k.emit(v_mov_b32_e32(v[56], v[22])) + k.emit(v_cvt_pk(v[56], v[56], v[56])) + 
k.emit(buffer_store_short(v[56], v[133], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[57], v[100], v[57])) + k.emit(v_add_f32_e32(v[22], v[99], v[57])) + k.emit(v_mov_b32_e32(v[57], v[22])) + k.emit(v_cvt_pk(v[57], v[57], v[57])) + k.emit(buffer_store_short(v[57], v[135], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[58], v[104], v[58])) + k.emit(v_add_f32_e32(v[22], v[103], v[58])) + k.emit(v_mov_b32_e32(v[58], v[22])) + k.emit(v_cvt_pk(v[58], v[58], v[58])) + k.emit(buffer_store_short(v[58], v[137], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[59], v[76], v[59])) + k.emit(v_add_f32_e32(v[22], v[75], v[59])) + k.emit(v_mov_b32_e32(v[59], v[22])) + k.emit(v_cvt_pk(v[59], v[59], v[59])) + k.emit(buffer_store_short(v[59], v[139], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[60], v[80], v[60])) + k.emit(v_add_f32_e32(v[22], v[79], v[60])) + k.emit(v_mov_b32_e32(v[60], v[22])) + k.emit(v_cvt_pk(v[60], v[60], v[60])) + k.emit(buffer_store_short(v[60], v[141], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[61], v[84], v[61])) + k.emit(v_add_f32_e32(v[22], v[83], v[61])) + k.emit(v_mov_b32_e32(v[61], v[22])) + k.emit(v_cvt_pk(v[61], v[61], v[61])) + k.emit(buffer_store_short(v[61], v[143], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[62], v[88], v[62])) + k.emit(v_add_f32_e32(v[22], v[87], v[62])) + k.emit(v_mov_b32_e32(v[62], v[22])) + k.emit(v_cvt_pk(v[62], v[62], v[62])) + k.emit(buffer_store_short(v[62], v[145], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[63], v[92], v[63])) + k.emit(v_add_f32_e32(v[22], v[91], v[63])) + k.emit(v_mov_b32_e32(v[63], v[22])) + k.emit(v_cvt_pk(v[63], v[63], v[63])) + k.emit(buffer_store_short(v[63], v[147], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[64], v[96], v[64])) + k.emit(v_add_f32_e32(v[22], v[95], v[64])) + k.emit(v_mov_b32_e32(v[64], v[22])) + k.emit(v_cvt_pk(v[64], v[64], v[64])) + 
k.emit(buffer_store_short(v[64], v[149], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[65], v[100], v[65])) + k.emit(v_add_f32_e32(v[22], v[99], v[65])) + k.emit(v_mov_b32_e32(v[65], v[22])) + k.emit(v_cvt_pk(v[65], v[65], v[65])) + k.emit(buffer_store_short(v[65], v[151], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[66], v[104], v[66])) + k.emit(v_add_f32_e32(v[22], v[103], v[66])) + k.emit(v_mov_b32_e32(v[66], v[22])) + k.emit(v_cvt_pk(v[66], v[66], v[66])) + k.emit(buffer_store_short(v[66], v[153], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[67], v[76], v[67])) + k.emit(v_add_f32_e32(v[22], v[75], v[67])) + k.emit(v_mov_b32_e32(v[67], v[22])) + k.emit(v_cvt_pk(v[67], v[67], v[67])) + k.emit(buffer_store_short(v[67], v[155], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[68], v[80], v[68])) + k.emit(v_add_f32_e32(v[22], v[79], v[68])) + k.emit(v_mov_b32_e32(v[68], v[22])) + k.emit(v_cvt_pk(v[68], v[68], v[68])) + k.emit(buffer_store_short(v[68], v[157], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[69], v[84], v[69])) + k.emit(v_add_f32_e32(v[22], v[83], v[69])) + k.emit(v_mov_b32_e32(v[69], v[22])) + k.emit(v_cvt_pk(v[69], v[69], v[69])) + k.emit(buffer_store_short(v[69], v[159], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[70], v[88], v[70])) + k.emit(v_add_f32_e32(v[22], v[87], v[70])) + k.emit(v_mov_b32_e32(v[70], v[22])) + k.emit(v_cvt_pk(v[70], v[70], v[70])) + k.emit(buffer_store_short(v[70], v[161], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[71], v[92], v[71])) + k.emit(v_add_f32_e32(v[22], v[91], v[71])) + k.emit(v_mov_b32_e32(v[71], v[22])) + k.emit(v_cvt_pk(v[71], v[71], v[71])) + k.emit(buffer_store_short(v[71], v[163], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[72], v[96], v[72])) + k.emit(v_add_f32_e32(v[22], v[95], v[72])) + k.emit(v_mov_b32_e32(v[72], v[22])) + k.emit(v_cvt_pk(v[72], v[72], v[72])) + 
k.emit(buffer_store_short(v[72], v[165], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[73], v[100], v[73])) + k.emit(v_add_f32_e32(v[22], v[99], v[73])) + k.emit(v_mov_b32_e32(v[73], v[22])) + k.emit(v_cvt_pk(v[73], v[73], v[73])) + k.emit(buffer_store_short(v[73], v[167], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[74], v[104], v[74])) + k.emit(v_add_f32_e32(v[22], v[103], v[74])) + k.emit(v_mov_b32_e32(v[74], v[22])) + k.emit(v_cvt_pk(v[74], v[74], v[74])) + k.emit(buffer_store_short(v[74], v[169], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(v_mov_b32_e32(v[30], 2147483648)) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[78], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[78], 2, v[78])) + k.emit(ds_read_b32(v[75], v[78])) + k.emit(ds_read_b32(v[76], v[78], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[77], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[77], v[30], v[77], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[82], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[82], 2, v[82])) + k.emit(ds_read_b32(v[79], v[82])) + k.emit(ds_read_b32(v[80], v[82], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[81], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[81], v[30], v[81], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + 
k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[86], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[86], 2, v[86])) + k.emit(ds_read_b32(v[83], v[86])) + k.emit(ds_read_b32(v[84], v[86], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[85], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[85], v[30], v[85], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[90], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[90], 2, v[90])) + k.emit(ds_read_b32(v[87], v[90])) + k.emit(ds_read_b32(v[88], v[90], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[89], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[89], v[30], v[89], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[94], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[94], 2, v[94])) + k.emit(ds_read_b32(v[91], v[94])) + k.emit(ds_read_b32(v[92], v[94], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[93], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[93], v[30], v[93], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[98], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[98], 2, v[98])) + k.emit(ds_read_b32(v[95], v[98])) + k.emit(ds_read_b32(v[96], v[98], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[97], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[97], v[30], v[97], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + 
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[102], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[102], 2, v[102])) + k.emit(ds_read_b32(v[99], v[102])) + k.emit(ds_read_b32(v[100], v[102], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[101], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[101], v[30], v[101], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[106], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[106], 2, v[106])) + k.emit(ds_read_b32(v[103], v[106])) + k.emit(ds_read_b32(v[104], v[106], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[105], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[108], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[108], 2, v[108])) + k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[110], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[110], 2, v[110])) + k.emit(v_add_lshl_u32_e64(v[109], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], 
s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[112], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[112], 2, v[112])) + k.emit(v_add_lshl_u32_e64(v[111], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[111], v[30], v[111], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[114], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[114], 2, v[114])) + k.emit(v_add_lshl_u32_e64(v[113], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[113], v[30], v[113], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[116], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[116], 2, v[116])) + k.emit(v_add_lshl_u32_e64(v[115], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[115], v[30], v[115], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[118], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[118], 2, v[118])) + k.emit(v_add_lshl_u32_e64(v[117], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[117], v[30], v[117], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + 
k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[120], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[120], 2, v[120])) + k.emit(v_add_lshl_u32_e64(v[119], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[119], v[30], v[119], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[122], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[122], 2, v[122])) + k.emit(v_add_lshl_u32_e64(v[121], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[121], v[30], v[121], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[124], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[124], 2, v[124])) + k.emit(v_add_lshl_u32_e64(v[123], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[123], v[30], v[123], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[126], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[126], 2, v[126])) + k.emit(v_add_lshl_u32_e64(v[125], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[125], v[30], v[125], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[128], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[128], 2, v[128])) + 
k.emit(v_add_lshl_u32_e64(v[127], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[127], v[30], v[127], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[130], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[130], 2, v[130])) + k.emit(v_add_lshl_u32_e64(v[129], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[129], v[30], v[129], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[132], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[132], 2, v[132])) + k.emit(v_add_lshl_u32_e64(v[131], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[131], v[30], v[131], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[134], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[134], 2, v[134])) + k.emit(v_add_lshl_u32_e64(v[133], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[133], v[30], v[133], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[136], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[136], 2, v[136])) + k.emit(v_add_lshl_u32_e64(v[135], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[135], v[30], v[135], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + 
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[138], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[138], 2, v[138])) + k.emit(v_add_lshl_u32_e64(v[137], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[137], v[30], v[137], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[140], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[140], 2, v[140])) + k.emit(v_add_lshl_u32_e64(v[139], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[139], v[30], v[139], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[142], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[142], 2, v[142])) + k.emit(v_add_lshl_u32_e64(v[141], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[141], v[30], v[141], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[144], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[144], 2, v[144])) + k.emit(v_add_lshl_u32_e64(v[143], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[143], v[30], v[143], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) 
+ k.emit(v_sub_u32_e64(v[146], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[146], 2, v[146])) + k.emit(v_add_lshl_u32_e64(v[145], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[145], v[30], v[145], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[148], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[148], 2, v[148])) + k.emit(v_add_lshl_u32_e64(v[147], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[147], v[30], v[147], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[150], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[150], 2, v[150])) + k.emit(v_add_lshl_u32_e64(v[149], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[149], v[30], v[149], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[152], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[152], 2, v[152])) + k.emit(v_add_lshl_u32_e64(v[151], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[151], v[30], v[151], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[154], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[154], 2, v[154])) + k.emit(v_add_lshl_u32_e64(v[153], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[153], v[30], v[153], s[82:83])) + 
k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[156], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[156], 2, v[156])) + k.emit(v_add_lshl_u32_e64(v[155], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[155], v[30], v[155], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[158], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[158], 2, v[158])) + k.emit(v_add_lshl_u32_e64(v[157], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[157], v[30], v[157], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[160], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[160], 2, v[160])) + k.emit(v_add_lshl_u32_e64(v[159], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[159], v[30], v[159], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[162], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[162], 2, v[162])) + k.emit(v_add_lshl_u32_e64(v[161], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[161], v[30], v[161], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], 
s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[164], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[164], 2, v[164])) + k.emit(v_add_lshl_u32_e64(v[163], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[163], v[30], v[163], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[166], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[166], 2, v[166])) + k.emit(v_add_lshl_u32_e64(v[165], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[165], v[30], v[165], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[168], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[168], 2, v[168])) + k.emit(v_add_lshl_u32_e64(v[167], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[167], v[30], v[167], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[170], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[170], 2, v[170])) + k.emit(v_add_lshl_u32_e64(v[169], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[169], v[30], v[169], s[82:83])) + k.emit(v_accvgpr_read(v[35], v[160])) + k.emit(v_accvgpr_read(v[36], v[164])) + k.emit(v_accvgpr_read(v[37], v[168])) + k.emit(v_accvgpr_read(v[38], v[172])) + k.emit(v_accvgpr_read(v[39], v[176])) + k.emit(v_accvgpr_read(v[40], v[180])) + k.emit(v_accvgpr_read(v[41], v[184])) + k.emit(v_accvgpr_read(v[42], v[188])) + k.emit(v_accvgpr_read(v[43], v[192])) + 
k.emit(v_accvgpr_read(v[44], v[196])) + k.emit(v_accvgpr_read(v[45], v[200])) + k.emit(v_accvgpr_read(v[46], v[204])) + k.emit(v_accvgpr_read(v[47], v[208])) + k.emit(v_accvgpr_read(v[48], v[212])) + k.emit(v_accvgpr_read(v[49], v[216])) + k.emit(v_accvgpr_read(v[50], v[220])) + k.emit(v_accvgpr_read(v[51], v[224])) + k.emit(v_accvgpr_read(v[52], v[228])) + k.emit(v_accvgpr_read(v[53], v[232])) + k.emit(v_accvgpr_read(v[54], v[236])) + k.emit(v_accvgpr_read(v[55], v[240])) + k.emit(v_accvgpr_read(v[56], v[244])) + k.emit(v_accvgpr_read(v[57], v[248])) + k.emit(v_accvgpr_read(v[58], v[252])) + k.emit(v_accvgpr_read(v[59], v[1])) + k.emit(v_accvgpr_read(v[60], v[5])) + k.emit(v_accvgpr_read(v[61], v[9])) + k.emit(v_accvgpr_read(v[62], v[13])) + k.emit(v_accvgpr_read(v[63], v[17])) + k.emit(v_accvgpr_read(v[64], v[21])) + k.emit(v_accvgpr_read(v[65], v[25])) + k.emit(v_accvgpr_read(v[66], v[29])) + k.emit(v_accvgpr_read(v[67], v[33])) + k.emit(v_accvgpr_read(v[68], v[37])) + k.emit(v_accvgpr_read(v[69], v[41])) + k.emit(v_accvgpr_read(v[70], v[45])) + k.emit(v_accvgpr_read(v[71], v[49])) + k.emit(v_accvgpr_read(v[72], v[53])) + k.emit(v_accvgpr_read(v[73], v[57])) + k.emit(v_accvgpr_read(v[74], v[61])) + k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_mul_f32_e32(v[35], v[76], v[35])) + k.emit(v_add_f32_e32(v[22], v[75], v[35])) + k.emit(v_mov_b32_e32(v[35], v[22])) + k.emit(v_cvt_pk(v[35], v[35], v[35])) + k.emit(buffer_store_short(v[35], v[77], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[36], v[80], v[36])) + k.emit(v_add_f32_e32(v[22], v[79], v[36])) + k.emit(v_mov_b32_e32(v[36], v[22])) + k.emit(v_cvt_pk(v[36], v[36], v[36])) + k.emit(buffer_store_short(v[36], v[81], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[37], v[84], v[37])) + k.emit(v_add_f32_e32(v[22], v[83], v[37])) + k.emit(v_mov_b32_e32(v[37], v[22])) + 
k.emit(v_cvt_pk(v[37], v[37], v[37])) + k.emit(buffer_store_short(v[37], v[85], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[38], v[88], v[38])) + k.emit(v_add_f32_e32(v[22], v[87], v[38])) + k.emit(v_mov_b32_e32(v[38], v[22])) + k.emit(v_cvt_pk(v[38], v[38], v[38])) + k.emit(buffer_store_short(v[38], v[89], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[39], v[92], v[39])) + k.emit(v_add_f32_e32(v[22], v[91], v[39])) + k.emit(v_mov_b32_e32(v[39], v[22])) + k.emit(v_cvt_pk(v[39], v[39], v[39])) + k.emit(buffer_store_short(v[39], v[93], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[40], v[96], v[40])) + k.emit(v_add_f32_e32(v[22], v[95], v[40])) + k.emit(v_mov_b32_e32(v[40], v[22])) + k.emit(v_cvt_pk(v[40], v[40], v[40])) + k.emit(buffer_store_short(v[40], v[97], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[41], v[100], v[41])) + k.emit(v_add_f32_e32(v[22], v[99], v[41])) + k.emit(v_mov_b32_e32(v[41], v[22])) + k.emit(v_cvt_pk(v[41], v[41], v[41])) + k.emit(buffer_store_short(v[41], v[101], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[42], v[104], v[42])) + k.emit(v_add_f32_e32(v[22], v[103], v[42])) + k.emit(v_mov_b32_e32(v[42], v[22])) + k.emit(v_cvt_pk(v[42], v[42], v[42])) + k.emit(buffer_store_short(v[42], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[43], v[76], v[43])) + k.emit(v_add_f32_e32(v[22], v[75], v[43])) + k.emit(v_mov_b32_e32(v[43], v[22])) + k.emit(v_cvt_pk(v[43], v[43], v[43])) + k.emit(buffer_store_short(v[43], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[44], v[80], v[44])) + k.emit(v_add_f32_e32(v[22], v[79], v[44])) + k.emit(v_mov_b32_e32(v[44], v[22])) + k.emit(v_cvt_pk(v[44], v[44], v[44])) + k.emit(buffer_store_short(v[44], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[45], v[84], v[45])) + k.emit(v_add_f32_e32(v[22], v[83], v[45])) + k.emit(v_mov_b32_e32(v[45], v[22])) + 
k.emit(v_cvt_pk(v[45], v[45], v[45])) + k.emit(buffer_store_short(v[45], v[111], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[46], v[88], v[46])) + k.emit(v_add_f32_e32(v[22], v[87], v[46])) + k.emit(v_mov_b32_e32(v[46], v[22])) + k.emit(v_cvt_pk(v[46], v[46], v[46])) + k.emit(buffer_store_short(v[46], v[113], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[47], v[92], v[47])) + k.emit(v_add_f32_e32(v[22], v[91], v[47])) + k.emit(v_mov_b32_e32(v[47], v[22])) + k.emit(v_cvt_pk(v[47], v[47], v[47])) + k.emit(buffer_store_short(v[47], v[115], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[48], v[96], v[48])) + k.emit(v_add_f32_e32(v[22], v[95], v[48])) + k.emit(v_mov_b32_e32(v[48], v[22])) + k.emit(v_cvt_pk(v[48], v[48], v[48])) + k.emit(buffer_store_short(v[48], v[117], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[49], v[100], v[49])) + k.emit(v_add_f32_e32(v[22], v[99], v[49])) + k.emit(v_mov_b32_e32(v[49], v[22])) + k.emit(v_cvt_pk(v[49], v[49], v[49])) + k.emit(buffer_store_short(v[49], v[119], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[50], v[104], v[50])) + k.emit(v_add_f32_e32(v[22], v[103], v[50])) + k.emit(v_mov_b32_e32(v[50], v[22])) + k.emit(v_cvt_pk(v[50], v[50], v[50])) + k.emit(buffer_store_short(v[50], v[121], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[51], v[76], v[51])) + k.emit(v_add_f32_e32(v[22], v[75], v[51])) + k.emit(v_mov_b32_e32(v[51], v[22])) + k.emit(v_cvt_pk(v[51], v[51], v[51])) + k.emit(buffer_store_short(v[51], v[123], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[52], v[80], v[52])) + k.emit(v_add_f32_e32(v[22], v[79], v[52])) + k.emit(v_mov_b32_e32(v[52], v[22])) + k.emit(v_cvt_pk(v[52], v[52], v[52])) + k.emit(buffer_store_short(v[52], v[125], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[53], v[84], v[53])) + k.emit(v_add_f32_e32(v[22], v[83], v[53])) + k.emit(v_mov_b32_e32(v[53], v[22])) + 
k.emit(v_cvt_pk(v[53], v[53], v[53])) + k.emit(buffer_store_short(v[53], v[127], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[54], v[88], v[54])) + k.emit(v_add_f32_e32(v[22], v[87], v[54])) + k.emit(v_mov_b32_e32(v[54], v[22])) + k.emit(v_cvt_pk(v[54], v[54], v[54])) + k.emit(buffer_store_short(v[54], v[129], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[55], v[92], v[55])) + k.emit(v_add_f32_e32(v[22], v[91], v[55])) + k.emit(v_mov_b32_e32(v[55], v[22])) + k.emit(v_cvt_pk(v[55], v[55], v[55])) + k.emit(buffer_store_short(v[55], v[131], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[56], v[96], v[56])) + k.emit(v_add_f32_e32(v[22], v[95], v[56])) + k.emit(v_mov_b32_e32(v[56], v[22])) + k.emit(v_cvt_pk(v[56], v[56], v[56])) + k.emit(buffer_store_short(v[56], v[133], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[57], v[100], v[57])) + k.emit(v_add_f32_e32(v[22], v[99], v[57])) + k.emit(v_mov_b32_e32(v[57], v[22])) + k.emit(v_cvt_pk(v[57], v[57], v[57])) + k.emit(buffer_store_short(v[57], v[135], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[58], v[104], v[58])) + k.emit(v_add_f32_e32(v[22], v[103], v[58])) + k.emit(v_mov_b32_e32(v[58], v[22])) + k.emit(v_cvt_pk(v[58], v[58], v[58])) + k.emit(buffer_store_short(v[58], v[137], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[59], v[76], v[59])) + k.emit(v_add_f32_e32(v[22], v[75], v[59])) + k.emit(v_mov_b32_e32(v[59], v[22])) + k.emit(v_cvt_pk(v[59], v[59], v[59])) + k.emit(buffer_store_short(v[59], v[139], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[60], v[80], v[60])) + k.emit(v_add_f32_e32(v[22], v[79], v[60])) + k.emit(v_mov_b32_e32(v[60], v[22])) + k.emit(v_cvt_pk(v[60], v[60], v[60])) + k.emit(buffer_store_short(v[60], v[141], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[61], v[84], v[61])) + k.emit(v_add_f32_e32(v[22], v[83], v[61])) + k.emit(v_mov_b32_e32(v[61], v[22])) + 
k.emit(v_cvt_pk(v[61], v[61], v[61])) + k.emit(buffer_store_short(v[61], v[143], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[62], v[88], v[62])) + k.emit(v_add_f32_e32(v[22], v[87], v[62])) + k.emit(v_mov_b32_e32(v[62], v[22])) + k.emit(v_cvt_pk(v[62], v[62], v[62])) + k.emit(buffer_store_short(v[62], v[145], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[63], v[92], v[63])) + k.emit(v_add_f32_e32(v[22], v[91], v[63])) + k.emit(v_mov_b32_e32(v[63], v[22])) + k.emit(v_cvt_pk(v[63], v[63], v[63])) + k.emit(buffer_store_short(v[63], v[147], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[64], v[96], v[64])) + k.emit(v_add_f32_e32(v[22], v[95], v[64])) + k.emit(v_mov_b32_e32(v[64], v[22])) + k.emit(v_cvt_pk(v[64], v[64], v[64])) + k.emit(buffer_store_short(v[64], v[149], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[65], v[100], v[65])) + k.emit(v_add_f32_e32(v[22], v[99], v[65])) + k.emit(v_mov_b32_e32(v[65], v[22])) + k.emit(v_cvt_pk(v[65], v[65], v[65])) + k.emit(buffer_store_short(v[65], v[151], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[66], v[104], v[66])) + k.emit(v_add_f32_e32(v[22], v[103], v[66])) + k.emit(v_mov_b32_e32(v[66], v[22])) + k.emit(v_cvt_pk(v[66], v[66], v[66])) + k.emit(buffer_store_short(v[66], v[153], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[67], v[76], v[67])) + k.emit(v_add_f32_e32(v[22], v[75], v[67])) + k.emit(v_mov_b32_e32(v[67], v[22])) + k.emit(v_cvt_pk(v[67], v[67], v[67])) + k.emit(buffer_store_short(v[67], v[155], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[68], v[80], v[68])) + k.emit(v_add_f32_e32(v[22], v[79], v[68])) + k.emit(v_mov_b32_e32(v[68], v[22])) + k.emit(v_cvt_pk(v[68], v[68], v[68])) + k.emit(buffer_store_short(v[68], v[157], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[69], v[84], v[69])) + k.emit(v_add_f32_e32(v[22], v[83], v[69])) + k.emit(v_mov_b32_e32(v[69], v[22])) + 
k.emit(v_cvt_pk(v[69], v[69], v[69])) + k.emit(buffer_store_short(v[69], v[159], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[70], v[88], v[70])) + k.emit(v_add_f32_e32(v[22], v[87], v[70])) + k.emit(v_mov_b32_e32(v[70], v[22])) + k.emit(v_cvt_pk(v[70], v[70], v[70])) + k.emit(buffer_store_short(v[70], v[161], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[71], v[92], v[71])) + k.emit(v_add_f32_e32(v[22], v[91], v[71])) + k.emit(v_mov_b32_e32(v[71], v[22])) + k.emit(v_cvt_pk(v[71], v[71], v[71])) + k.emit(buffer_store_short(v[71], v[163], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[72], v[96], v[72])) + k.emit(v_add_f32_e32(v[22], v[95], v[72])) + k.emit(v_mov_b32_e32(v[72], v[22])) + k.emit(v_cvt_pk(v[72], v[72], v[72])) + k.emit(buffer_store_short(v[72], v[165], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[73], v[100], v[73])) + k.emit(v_add_f32_e32(v[22], v[99], v[73])) + k.emit(v_mov_b32_e32(v[73], v[22])) + k.emit(v_cvt_pk(v[73], v[73], v[73])) + k.emit(buffer_store_short(v[73], v[167], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[74], v[104], v[74])) + k.emit(v_add_f32_e32(v[22], v[103], v[74])) + k.emit(v_mov_b32_e32(v[74], v[22])) + k.emit(v_cvt_pk(v[74], v[74], v[74])) + k.emit(buffer_store_short(v[74], v[169], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(v_mov_b32_e32(v[30], 2147483648)) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[78], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[78], 2, v[78])) + k.emit(ds_read_b32(v[75], v[78])) + k.emit(ds_read_b32(v[76], v[78], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[77], v[21], v[18], 1)) + 
k.emit(v_cndmask_b32_e64(v[77], v[30], v[77], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[82], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[82], 2, v[82])) + k.emit(ds_read_b32(v[79], v[82])) + k.emit(ds_read_b32(v[80], v[82], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[81], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[81], v[30], v[81], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[86], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[86], 2, v[86])) + k.emit(ds_read_b32(v[83], v[86])) + k.emit(ds_read_b32(v[84], v[86], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[85], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[85], v[30], v[85], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[90], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[90], 2, v[90])) + k.emit(ds_read_b32(v[87], v[90])) + k.emit(ds_read_b32(v[88], v[90], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[89], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[89], v[30], v[89], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[94], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[94], 2, v[94])) + k.emit(ds_read_b32(v[91], 
v[94])) + k.emit(ds_read_b32(v[92], v[94], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[93], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[93], v[30], v[93], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[98], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[98], 2, v[98])) + k.emit(ds_read_b32(v[95], v[98])) + k.emit(ds_read_b32(v[96], v[98], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[97], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[97], v[30], v[97], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[102], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[102], 2, v[102])) + k.emit(ds_read_b32(v[99], v[102])) + k.emit(ds_read_b32(v[100], v[102], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[101], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[101], v[30], v[101], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[106], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[106], 2, v[106])) + k.emit(ds_read_b32(v[103], v[106])) + k.emit(ds_read_b32(v[104], v[106], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[105], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + 
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[108], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[108], 2, v[108])) + k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[110], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[110], 2, v[110])) + k.emit(v_add_lshl_u32_e64(v[109], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[112], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[112], 2, v[112])) + k.emit(v_add_lshl_u32_e64(v[111], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[111], v[30], v[111], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[114], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[114], 2, v[114])) + k.emit(v_add_lshl_u32_e64(v[113], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[113], v[30], v[113], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[116], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[116], 2, 
v[116])) + k.emit(v_add_lshl_u32_e64(v[115], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[115], v[30], v[115], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[118], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[118], 2, v[118])) + k.emit(v_add_lshl_u32_e64(v[117], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[117], v[30], v[117], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[120], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[120], 2, v[120])) + k.emit(v_add_lshl_u32_e64(v[119], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[119], v[30], v[119], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[122], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[122], 2, v[122])) + k.emit(v_add_lshl_u32_e64(v[121], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[121], v[30], v[121], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[124], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[124], 2, v[124])) + k.emit(v_add_lshl_u32_e64(v[123], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[123], v[30], v[123], s[82:83])) + 
k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[126], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[126], 2, v[126])) + k.emit(v_add_lshl_u32_e64(v[125], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[125], v[30], v[125], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[128], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[128], 2, v[128])) + k.emit(v_add_lshl_u32_e64(v[127], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[127], v[30], v[127], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[130], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[130], 2, v[130])) + k.emit(v_add_lshl_u32_e64(v[129], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[129], v[30], v[129], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[132], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[132], 2, v[132])) + k.emit(v_add_lshl_u32_e64(v[131], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[131], v[30], v[131], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, 
s[2])) + k.emit(v_sub_u32_e64(v[134], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[134], 2, v[134])) + k.emit(v_add_lshl_u32_e64(v[133], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[133], v[30], v[133], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[136], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[136], 2, v[136])) + k.emit(v_add_lshl_u32_e64(v[135], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[135], v[30], v[135], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[138], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[138], 2, v[138])) + k.emit(v_add_lshl_u32_e64(v[137], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[137], v[30], v[137], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[140], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[140], 2, v[140])) + k.emit(v_add_lshl_u32_e64(v[139], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[139], v[30], v[139], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[142], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[142], 2, v[142])) + k.emit(v_add_lshl_u32_e64(v[141], 
v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[141], v[30], v[141], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[144], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[144], 2, v[144])) + k.emit(v_add_lshl_u32_e64(v[143], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[143], v[30], v[143], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[146], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[146], 2, v[146])) + k.emit(v_add_lshl_u32_e64(v[145], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[145], v[30], v[145], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[148], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[148], 2, v[148])) + k.emit(v_add_lshl_u32_e64(v[147], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[147], v[30], v[147], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[150], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[150], 2, v[150])) + k.emit(v_add_lshl_u32_e64(v[149], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[149], v[30], v[149], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + 
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[152], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[152], 2, v[152])) + k.emit(v_add_lshl_u32_e64(v[151], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[151], v[30], v[151], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[154], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[154], 2, v[154])) + k.emit(v_add_lshl_u32_e64(v[153], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[153], v[30], v[153], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[156], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[156], 2, v[156])) + k.emit(v_add_lshl_u32_e64(v[155], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[155], v[30], v[155], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[158], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[158], 2, v[158])) + k.emit(v_add_lshl_u32_e64(v[157], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[157], v[30], v[157], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[160], v[22], s[78])) + 
k.emit(v_lshlrev_b32_e32(v[160], 2, v[160])) + k.emit(v_add_lshl_u32_e64(v[159], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[159], v[30], v[159], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[162], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[162], 2, v[162])) + k.emit(v_add_lshl_u32_e64(v[161], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[161], v[30], v[161], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[164], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[164], 2, v[164])) + k.emit(v_add_lshl_u32_e64(v[163], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[163], v[30], v[163], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[166], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[166], 2, v[166])) + k.emit(v_add_lshl_u32_e64(v[165], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[165], v[30], v[165], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[168], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[168], 2, v[168])) + k.emit(v_add_lshl_u32_e64(v[167], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[167], v[30], v[167], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + 
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[170], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[170], 2, v[170])) + k.emit(v_add_lshl_u32_e64(v[169], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[169], v[30], v[169], s[82:83])) + k.emit(v_accvgpr_read(v[35], v[65])) + k.emit(v_accvgpr_read(v[36], v[69])) + k.emit(v_accvgpr_read(v[37], v[73])) + k.emit(v_accvgpr_read(v[38], v[77])) + k.emit(v_accvgpr_read(v[39], v[81])) + k.emit(v_accvgpr_read(v[40], v[85])) + k.emit(v_accvgpr_read(v[41], v[89])) + k.emit(v_accvgpr_read(v[42], v[93])) + k.emit(v_accvgpr_read(v[43], v[97])) + k.emit(v_accvgpr_read(v[44], v[101])) + k.emit(v_accvgpr_read(v[45], v[105])) + k.emit(v_accvgpr_read(v[46], v[109])) + k.emit(v_accvgpr_read(v[47], v[113])) + k.emit(v_accvgpr_read(v[48], v[117])) + k.emit(v_accvgpr_read(v[49], v[121])) + k.emit(v_accvgpr_read(v[50], v[125])) + k.emit(v_accvgpr_read(v[51], v[129])) + k.emit(v_accvgpr_read(v[52], v[133])) + k.emit(v_accvgpr_read(v[53], v[137])) + k.emit(v_accvgpr_read(v[54], v[141])) + k.emit(v_accvgpr_read(v[55], v[145])) + k.emit(v_accvgpr_read(v[56], v[149])) + k.emit(v_accvgpr_read(v[57], v[153])) + k.emit(v_accvgpr_read(v[58], v[157])) + k.emit(v_accvgpr_read(v[59], v[161])) + k.emit(v_accvgpr_read(v[60], v[165])) + k.emit(v_accvgpr_read(v[61], v[169])) + k.emit(v_accvgpr_read(v[62], v[173])) + k.emit(v_accvgpr_read(v[63], v[177])) + k.emit(v_accvgpr_read(v[64], v[181])) + k.emit(v_accvgpr_read(v[65], v[185])) + k.emit(v_accvgpr_read(v[66], v[189])) + k.emit(v_accvgpr_read(v[67], v[193])) + k.emit(v_accvgpr_read(v[68], v[197])) + k.emit(v_accvgpr_read(v[69], v[201])) + k.emit(v_accvgpr_read(v[70], v[205])) + k.emit(v_accvgpr_read(v[71], v[209])) + k.emit(v_accvgpr_read(v[72], v[213])) + k.emit(v_accvgpr_read(v[73], v[217])) + k.emit(v_accvgpr_read(v[74], v[221])) + 
k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_mul_f32_e32(v[35], v[76], v[35])) + k.emit(v_add_f32_e32(v[22], v[75], v[35])) + k.emit(v_mov_b32_e32(v[35], v[22])) + k.emit(v_cvt_pk(v[35], v[35], v[35])) + k.emit(buffer_store_short(v[35], v[77], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[36], v[80], v[36])) + k.emit(v_add_f32_e32(v[22], v[79], v[36])) + k.emit(v_mov_b32_e32(v[36], v[22])) + k.emit(v_cvt_pk(v[36], v[36], v[36])) + k.emit(buffer_store_short(v[36], v[81], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[37], v[84], v[37])) + k.emit(v_add_f32_e32(v[22], v[83], v[37])) + k.emit(v_mov_b32_e32(v[37], v[22])) + k.emit(v_cvt_pk(v[37], v[37], v[37])) + k.emit(buffer_store_short(v[37], v[85], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[38], v[88], v[38])) + k.emit(v_add_f32_e32(v[22], v[87], v[38])) + k.emit(v_mov_b32_e32(v[38], v[22])) + k.emit(v_cvt_pk(v[38], v[38], v[38])) + k.emit(buffer_store_short(v[38], v[89], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[39], v[92], v[39])) + k.emit(v_add_f32_e32(v[22], v[91], v[39])) + k.emit(v_mov_b32_e32(v[39], v[22])) + k.emit(v_cvt_pk(v[39], v[39], v[39])) + k.emit(buffer_store_short(v[39], v[93], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[40], v[96], v[40])) + k.emit(v_add_f32_e32(v[22], v[95], v[40])) + k.emit(v_mov_b32_e32(v[40], v[22])) + k.emit(v_cvt_pk(v[40], v[40], v[40])) + k.emit(buffer_store_short(v[40], v[97], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[41], v[100], v[41])) + k.emit(v_add_f32_e32(v[22], v[99], v[41])) + k.emit(v_mov_b32_e32(v[41], v[22])) + k.emit(v_cvt_pk(v[41], v[41], v[41])) + k.emit(buffer_store_short(v[41], v[101], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[42], v[104], v[42])) + k.emit(v_add_f32_e32(v[22], v[103], v[42])) + 
k.emit(v_mov_b32_e32(v[42], v[22])) + k.emit(v_cvt_pk(v[42], v[42], v[42])) + k.emit(buffer_store_short(v[42], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[43], v[76], v[43])) + k.emit(v_add_f32_e32(v[22], v[75], v[43])) + k.emit(v_mov_b32_e32(v[43], v[22])) + k.emit(v_cvt_pk(v[43], v[43], v[43])) + k.emit(buffer_store_short(v[43], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[44], v[80], v[44])) + k.emit(v_add_f32_e32(v[22], v[79], v[44])) + k.emit(v_mov_b32_e32(v[44], v[22])) + k.emit(v_cvt_pk(v[44], v[44], v[44])) + k.emit(buffer_store_short(v[44], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[45], v[84], v[45])) + k.emit(v_add_f32_e32(v[22], v[83], v[45])) + k.emit(v_mov_b32_e32(v[45], v[22])) + k.emit(v_cvt_pk(v[45], v[45], v[45])) + k.emit(buffer_store_short(v[45], v[111], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[46], v[88], v[46])) + k.emit(v_add_f32_e32(v[22], v[87], v[46])) + k.emit(v_mov_b32_e32(v[46], v[22])) + k.emit(v_cvt_pk(v[46], v[46], v[46])) + k.emit(buffer_store_short(v[46], v[113], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[47], v[92], v[47])) + k.emit(v_add_f32_e32(v[22], v[91], v[47])) + k.emit(v_mov_b32_e32(v[47], v[22])) + k.emit(v_cvt_pk(v[47], v[47], v[47])) + k.emit(buffer_store_short(v[47], v[115], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[48], v[96], v[48])) + k.emit(v_add_f32_e32(v[22], v[95], v[48])) + k.emit(v_mov_b32_e32(v[48], v[22])) + k.emit(v_cvt_pk(v[48], v[48], v[48])) + k.emit(buffer_store_short(v[48], v[117], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[49], v[100], v[49])) + k.emit(v_add_f32_e32(v[22], v[99], v[49])) + k.emit(v_mov_b32_e32(v[49], v[22])) + k.emit(v_cvt_pk(v[49], v[49], v[49])) + k.emit(buffer_store_short(v[49], v[119], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[50], v[104], v[50])) + k.emit(v_add_f32_e32(v[22], v[103], v[50])) + 
k.emit(v_mov_b32_e32(v[50], v[22])) + k.emit(v_cvt_pk(v[50], v[50], v[50])) + k.emit(buffer_store_short(v[50], v[121], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[51], v[76], v[51])) + k.emit(v_add_f32_e32(v[22], v[75], v[51])) + k.emit(v_mov_b32_e32(v[51], v[22])) + k.emit(v_cvt_pk(v[51], v[51], v[51])) + k.emit(buffer_store_short(v[51], v[123], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[52], v[80], v[52])) + k.emit(v_add_f32_e32(v[22], v[79], v[52])) + k.emit(v_mov_b32_e32(v[52], v[22])) + k.emit(v_cvt_pk(v[52], v[52], v[52])) + k.emit(buffer_store_short(v[52], v[125], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[53], v[84], v[53])) + k.emit(v_add_f32_e32(v[22], v[83], v[53])) + k.emit(v_mov_b32_e32(v[53], v[22])) + k.emit(v_cvt_pk(v[53], v[53], v[53])) + k.emit(buffer_store_short(v[53], v[127], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[54], v[88], v[54])) + k.emit(v_add_f32_e32(v[22], v[87], v[54])) + k.emit(v_mov_b32_e32(v[54], v[22])) + k.emit(v_cvt_pk(v[54], v[54], v[54])) + k.emit(buffer_store_short(v[54], v[129], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[55], v[92], v[55])) + k.emit(v_add_f32_e32(v[22], v[91], v[55])) + k.emit(v_mov_b32_e32(v[55], v[22])) + k.emit(v_cvt_pk(v[55], v[55], v[55])) + k.emit(buffer_store_short(v[55], v[131], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[56], v[96], v[56])) + k.emit(v_add_f32_e32(v[22], v[95], v[56])) + k.emit(v_mov_b32_e32(v[56], v[22])) + k.emit(v_cvt_pk(v[56], v[56], v[56])) + k.emit(buffer_store_short(v[56], v[133], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[57], v[100], v[57])) + k.emit(v_add_f32_e32(v[22], v[99], v[57])) + k.emit(v_mov_b32_e32(v[57], v[22])) + k.emit(v_cvt_pk(v[57], v[57], v[57])) + k.emit(buffer_store_short(v[57], v[135], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[58], v[104], v[58])) + k.emit(v_add_f32_e32(v[22], v[103], v[58])) + 
k.emit(v_mov_b32_e32(v[58], v[22])) + k.emit(v_cvt_pk(v[58], v[58], v[58])) + k.emit(buffer_store_short(v[58], v[137], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[59], v[76], v[59])) + k.emit(v_add_f32_e32(v[22], v[75], v[59])) + k.emit(v_mov_b32_e32(v[59], v[22])) + k.emit(v_cvt_pk(v[59], v[59], v[59])) + k.emit(buffer_store_short(v[59], v[139], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[60], v[80], v[60])) + k.emit(v_add_f32_e32(v[22], v[79], v[60])) + k.emit(v_mov_b32_e32(v[60], v[22])) + k.emit(v_cvt_pk(v[60], v[60], v[60])) + k.emit(buffer_store_short(v[60], v[141], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[61], v[84], v[61])) + k.emit(v_add_f32_e32(v[22], v[83], v[61])) + k.emit(v_mov_b32_e32(v[61], v[22])) + k.emit(v_cvt_pk(v[61], v[61], v[61])) + k.emit(buffer_store_short(v[61], v[143], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[62], v[88], v[62])) + k.emit(v_add_f32_e32(v[22], v[87], v[62])) + k.emit(v_mov_b32_e32(v[62], v[22])) + k.emit(v_cvt_pk(v[62], v[62], v[62])) + k.emit(buffer_store_short(v[62], v[145], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[63], v[92], v[63])) + k.emit(v_add_f32_e32(v[22], v[91], v[63])) + k.emit(v_mov_b32_e32(v[63], v[22])) + k.emit(v_cvt_pk(v[63], v[63], v[63])) + k.emit(buffer_store_short(v[63], v[147], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[64], v[96], v[64])) + k.emit(v_add_f32_e32(v[22], v[95], v[64])) + k.emit(v_mov_b32_e32(v[64], v[22])) + k.emit(v_cvt_pk(v[64], v[64], v[64])) + k.emit(buffer_store_short(v[64], v[149], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[65], v[100], v[65])) + k.emit(v_add_f32_e32(v[22], v[99], v[65])) + k.emit(v_mov_b32_e32(v[65], v[22])) + k.emit(v_cvt_pk(v[65], v[65], v[65])) + k.emit(buffer_store_short(v[65], v[151], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[66], v[104], v[66])) + k.emit(v_add_f32_e32(v[22], v[103], v[66])) + 
k.emit(v_mov_b32_e32(v[66], v[22])) + k.emit(v_cvt_pk(v[66], v[66], v[66])) + k.emit(buffer_store_short(v[66], v[153], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[67], v[76], v[67])) + k.emit(v_add_f32_e32(v[22], v[75], v[67])) + k.emit(v_mov_b32_e32(v[67], v[22])) + k.emit(v_cvt_pk(v[67], v[67], v[67])) + k.emit(buffer_store_short(v[67], v[155], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[68], v[80], v[68])) + k.emit(v_add_f32_e32(v[22], v[79], v[68])) + k.emit(v_mov_b32_e32(v[68], v[22])) + k.emit(v_cvt_pk(v[68], v[68], v[68])) + k.emit(buffer_store_short(v[68], v[157], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[69], v[84], v[69])) + k.emit(v_add_f32_e32(v[22], v[83], v[69])) + k.emit(v_mov_b32_e32(v[69], v[22])) + k.emit(v_cvt_pk(v[69], v[69], v[69])) + k.emit(buffer_store_short(v[69], v[159], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[70], v[88], v[70])) + k.emit(v_add_f32_e32(v[22], v[87], v[70])) + k.emit(v_mov_b32_e32(v[70], v[22])) + k.emit(v_cvt_pk(v[70], v[70], v[70])) + k.emit(buffer_store_short(v[70], v[161], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[71], v[92], v[71])) + k.emit(v_add_f32_e32(v[22], v[91], v[71])) + k.emit(v_mov_b32_e32(v[71], v[22])) + k.emit(v_cvt_pk(v[71], v[71], v[71])) + k.emit(buffer_store_short(v[71], v[163], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[72], v[96], v[72])) + k.emit(v_add_f32_e32(v[22], v[95], v[72])) + k.emit(v_mov_b32_e32(v[72], v[22])) + k.emit(v_cvt_pk(v[72], v[72], v[72])) + k.emit(buffer_store_short(v[72], v[165], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[73], v[100], v[73])) + k.emit(v_add_f32_e32(v[22], v[99], v[73])) + k.emit(v_mov_b32_e32(v[73], v[22])) + k.emit(v_cvt_pk(v[73], v[73], v[73])) + k.emit(buffer_store_short(v[73], v[167], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[74], v[104], v[74])) + k.emit(v_add_f32_e32(v[22], v[103], v[74])) + 
k.emit(v_mov_b32_e32(v[74], v[22])) + k.emit(v_cvt_pk(v[74], v[74], v[74])) + k.emit(buffer_store_short(v[74], v[169], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(v_mov_b32_e32(v[30], 2147483648)) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[78], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[78], 2, v[78])) + k.emit(ds_read_b32(v[75], v[78])) + k.emit(ds_read_b32(v[76], v[78], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[77], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[77], v[30], v[77], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[82], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[82], 2, v[82])) + k.emit(ds_read_b32(v[79], v[82])) + k.emit(ds_read_b32(v[80], v[82], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[81], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[81], v[30], v[81], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[86], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[86], 2, v[86])) + k.emit(ds_read_b32(v[83], v[86])) + k.emit(ds_read_b32(v[84], v[86], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[85], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[85], v[30], v[85], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + 
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[90], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[90], 2, v[90])) + k.emit(ds_read_b32(v[87], v[90])) + k.emit(ds_read_b32(v[88], v[90], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[89], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[89], v[30], v[89], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[94], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[94], 2, v[94])) + k.emit(ds_read_b32(v[91], v[94])) + k.emit(ds_read_b32(v[92], v[94], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[93], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[93], v[30], v[93], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[98], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[98], 2, v[98])) + k.emit(ds_read_b32(v[95], v[98])) + k.emit(ds_read_b32(v[96], v[98], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[97], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[97], v[30], v[97], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[102], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[102], 2, v[102])) + k.emit(ds_read_b32(v[99], v[102])) + k.emit(ds_read_b32(v[100], v[102], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[101], v[21], v[22], 1)) + 
k.emit(v_cndmask_b32_e64(v[101], v[30], v[101], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[106], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[106], 2, v[106])) + k.emit(ds_read_b32(v[103], v[106])) + k.emit(ds_read_b32(v[104], v[106], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[105], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[108], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[108], 2, v[108])) + k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[110], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[110], 2, v[110])) + k.emit(v_add_lshl_u32_e64(v[109], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[112], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[112], 2, v[112])) + k.emit(v_add_lshl_u32_e64(v[111], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[111], v[30], 
v[111], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[114], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[114], 2, v[114])) + k.emit(v_add_lshl_u32_e64(v[113], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[113], v[30], v[113], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[116], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[116], 2, v[116])) + k.emit(v_add_lshl_u32_e64(v[115], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[115], v[30], v[115], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[118], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[118], 2, v[118])) + k.emit(v_add_lshl_u32_e64(v[117], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[117], v[30], v[117], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[120], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[120], 2, v[120])) + k.emit(v_add_lshl_u32_e64(v[119], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[119], v[30], v[119], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + 
k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[122], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[122], 2, v[122])) + k.emit(v_add_lshl_u32_e64(v[121], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[121], v[30], v[121], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[124], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[124], 2, v[124])) + k.emit(v_add_lshl_u32_e64(v[123], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[123], v[30], v[123], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[126], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[126], 2, v[126])) + k.emit(v_add_lshl_u32_e64(v[125], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[125], v[30], v[125], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[128], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[128], 2, v[128])) + k.emit(v_add_lshl_u32_e64(v[127], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[127], v[30], v[127], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[130], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[130], 2, v[130])) + 
k.emit(v_add_lshl_u32_e64(v[129], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[129], v[30], v[129], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[132], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[132], 2, v[132])) + k.emit(v_add_lshl_u32_e64(v[131], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[131], v[30], v[131], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[134], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[134], 2, v[134])) + k.emit(v_add_lshl_u32_e64(v[133], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[133], v[30], v[133], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[136], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[136], 2, v[136])) + k.emit(v_add_lshl_u32_e64(v[135], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[135], v[30], v[135], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[138], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[138], 2, v[138])) + k.emit(v_add_lshl_u32_e64(v[137], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[137], v[30], v[137], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + 
k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[140], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[140], 2, v[140])) + k.emit(v_add_lshl_u32_e64(v[139], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[139], v[30], v[139], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[142], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[142], 2, v[142])) + k.emit(v_add_lshl_u32_e64(v[141], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[141], v[30], v[141], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[144], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[144], 2, v[144])) + k.emit(v_add_lshl_u32_e64(v[143], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[143], v[30], v[143], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[146], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[146], 2, v[146])) + k.emit(v_add_lshl_u32_e64(v[145], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[145], v[30], v[145], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, 
s[2])) + k.emit(v_sub_u32_e64(v[148], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[148], 2, v[148])) + k.emit(v_add_lshl_u32_e64(v[147], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[147], v[30], v[147], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[150], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[150], 2, v[150])) + k.emit(v_add_lshl_u32_e64(v[149], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[149], v[30], v[149], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[152], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[152], 2, v[152])) + k.emit(v_add_lshl_u32_e64(v[151], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[151], v[30], v[151], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[154], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[154], 2, v[154])) + k.emit(v_add_lshl_u32_e64(v[153], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[153], v[30], v[153], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[156], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[156], 2, v[156])) + k.emit(v_add_lshl_u32_e64(v[155], 
v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[155], v[30], v[155], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[158], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[158], 2, v[158])) + k.emit(v_add_lshl_u32_e64(v[157], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[157], v[30], v[157], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[160], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[160], 2, v[160])) + k.emit(v_add_lshl_u32_e64(v[159], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[159], v[30], v[159], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[162], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[162], 2, v[162])) + k.emit(v_add_lshl_u32_e64(v[161], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[161], v[30], v[161], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[164], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[164], 2, v[164])) + k.emit(v_add_lshl_u32_e64(v[163], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[163], v[30], v[163], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + 
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[166], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[166], 2, v[166])) + k.emit(v_add_lshl_u32_e64(v[165], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[165], v[30], v[165], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[168], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[168], 2, v[168])) + k.emit(v_add_lshl_u32_e64(v[167], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[167], v[30], v[167], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[170], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[170], 2, v[170])) + k.emit(v_add_lshl_u32_e64(v[169], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[169], v[30], v[169], s[82:83])) + k.emit(v_accvgpr_read(v[35], v[225])) + k.emit(v_accvgpr_read(v[36], v[229])) + k.emit(v_accvgpr_read(v[37], v[233])) + k.emit(v_accvgpr_read(v[38], v[237])) + k.emit(v_accvgpr_read(v[39], v[241])) + k.emit(v_accvgpr_read(v[40], v[245])) + k.emit(v_accvgpr_read(v[41], v[249])) + k.emit(v_accvgpr_read(v[42], v[253])) + k.emit(v_accvgpr_read(v[43], v[2])) + k.emit(v_accvgpr_read(v[44], v[6])) + k.emit(v_accvgpr_read(v[45], v[10])) + k.emit(v_accvgpr_read(v[46], v[14])) + k.emit(v_accvgpr_read(v[47], v[18])) + k.emit(v_accvgpr_read(v[48], v[22])) + k.emit(v_accvgpr_read(v[49], v[26])) + k.emit(v_accvgpr_read(v[50], v[30])) + k.emit(v_accvgpr_read(v[51], v[34])) + k.emit(v_accvgpr_read(v[52], v[38])) + k.emit(v_accvgpr_read(v[53], v[42])) + k.emit(v_accvgpr_read(v[54], v[46])) + k.emit(v_accvgpr_read(v[55], 
v[50])) + k.emit(v_accvgpr_read(v[56], v[54])) + k.emit(v_accvgpr_read(v[57], v[58])) + k.emit(v_accvgpr_read(v[58], v[62])) + k.emit(v_accvgpr_read(v[59], v[66])) + k.emit(v_accvgpr_read(v[60], v[70])) + k.emit(v_accvgpr_read(v[61], v[74])) + k.emit(v_accvgpr_read(v[62], v[78])) + k.emit(v_accvgpr_read(v[63], v[82])) + k.emit(v_accvgpr_read(v[64], v[86])) + k.emit(v_accvgpr_read(v[65], v[90])) + k.emit(v_accvgpr_read(v[66], v[94])) + k.emit(v_accvgpr_read(v[67], v[98])) + k.emit(v_accvgpr_read(v[68], v[102])) + k.emit(v_accvgpr_read(v[69], v[106])) + k.emit(v_accvgpr_read(v[70], v[110])) + k.emit(v_accvgpr_read(v[71], v[114])) + k.emit(v_accvgpr_read(v[72], v[118])) + k.emit(v_accvgpr_read(v[73], v[122])) + k.emit(v_accvgpr_read(v[74], v[126])) + k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_mul_f32_e32(v[35], v[76], v[35])) + k.emit(v_add_f32_e32(v[22], v[75], v[35])) + k.emit(v_mov_b32_e32(v[35], v[22])) + k.emit(v_cvt_pk(v[35], v[35], v[35])) + k.emit(buffer_store_short(v[35], v[77], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[36], v[80], v[36])) + k.emit(v_add_f32_e32(v[22], v[79], v[36])) + k.emit(v_mov_b32_e32(v[36], v[22])) + k.emit(v_cvt_pk(v[36], v[36], v[36])) + k.emit(buffer_store_short(v[36], v[81], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[37], v[84], v[37])) + k.emit(v_add_f32_e32(v[22], v[83], v[37])) + k.emit(v_mov_b32_e32(v[37], v[22])) + k.emit(v_cvt_pk(v[37], v[37], v[37])) + k.emit(buffer_store_short(v[37], v[85], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[38], v[88], v[38])) + k.emit(v_add_f32_e32(v[22], v[87], v[38])) + k.emit(v_mov_b32_e32(v[38], v[22])) + k.emit(v_cvt_pk(v[38], v[38], v[38])) + k.emit(buffer_store_short(v[38], v[89], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[39], v[92], v[39])) + k.emit(v_add_f32_e32(v[22], v[91], v[39])) + 
k.emit(v_mov_b32_e32(v[39], v[22])) + k.emit(v_cvt_pk(v[39], v[39], v[39])) + k.emit(buffer_store_short(v[39], v[93], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[40], v[96], v[40])) + k.emit(v_add_f32_e32(v[22], v[95], v[40])) + k.emit(v_mov_b32_e32(v[40], v[22])) + k.emit(v_cvt_pk(v[40], v[40], v[40])) + k.emit(buffer_store_short(v[40], v[97], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[41], v[100], v[41])) + k.emit(v_add_f32_e32(v[22], v[99], v[41])) + k.emit(v_mov_b32_e32(v[41], v[22])) + k.emit(v_cvt_pk(v[41], v[41], v[41])) + k.emit(buffer_store_short(v[41], v[101], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[42], v[104], v[42])) + k.emit(v_add_f32_e32(v[22], v[103], v[42])) + k.emit(v_mov_b32_e32(v[42], v[22])) + k.emit(v_cvt_pk(v[42], v[42], v[42])) + k.emit(buffer_store_short(v[42], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[43], v[76], v[43])) + k.emit(v_add_f32_e32(v[22], v[75], v[43])) + k.emit(v_mov_b32_e32(v[43], v[22])) + k.emit(v_cvt_pk(v[43], v[43], v[43])) + k.emit(buffer_store_short(v[43], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[44], v[80], v[44])) + k.emit(v_add_f32_e32(v[22], v[79], v[44])) + k.emit(v_mov_b32_e32(v[44], v[22])) + k.emit(v_cvt_pk(v[44], v[44], v[44])) + k.emit(buffer_store_short(v[44], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[45], v[84], v[45])) + k.emit(v_add_f32_e32(v[22], v[83], v[45])) + k.emit(v_mov_b32_e32(v[45], v[22])) + k.emit(v_cvt_pk(v[45], v[45], v[45])) + k.emit(buffer_store_short(v[45], v[111], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[46], v[88], v[46])) + k.emit(v_add_f32_e32(v[22], v[87], v[46])) + k.emit(v_mov_b32_e32(v[46], v[22])) + k.emit(v_cvt_pk(v[46], v[46], v[46])) + k.emit(buffer_store_short(v[46], v[113], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[47], v[92], v[47])) + k.emit(v_add_f32_e32(v[22], v[91], v[47])) + 
k.emit(v_mov_b32_e32(v[47], v[22])) + k.emit(v_cvt_pk(v[47], v[47], v[47])) + k.emit(buffer_store_short(v[47], v[115], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[48], v[96], v[48])) + k.emit(v_add_f32_e32(v[22], v[95], v[48])) + k.emit(v_mov_b32_e32(v[48], v[22])) + k.emit(v_cvt_pk(v[48], v[48], v[48])) + k.emit(buffer_store_short(v[48], v[117], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[49], v[100], v[49])) + k.emit(v_add_f32_e32(v[22], v[99], v[49])) + k.emit(v_mov_b32_e32(v[49], v[22])) + k.emit(v_cvt_pk(v[49], v[49], v[49])) + k.emit(buffer_store_short(v[49], v[119], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[50], v[104], v[50])) + k.emit(v_add_f32_e32(v[22], v[103], v[50])) + k.emit(v_mov_b32_e32(v[50], v[22])) + k.emit(v_cvt_pk(v[50], v[50], v[50])) + k.emit(buffer_store_short(v[50], v[121], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[51], v[76], v[51])) + k.emit(v_add_f32_e32(v[22], v[75], v[51])) + k.emit(v_mov_b32_e32(v[51], v[22])) + k.emit(v_cvt_pk(v[51], v[51], v[51])) + k.emit(buffer_store_short(v[51], v[123], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[52], v[80], v[52])) + k.emit(v_add_f32_e32(v[22], v[79], v[52])) + k.emit(v_mov_b32_e32(v[52], v[22])) + k.emit(v_cvt_pk(v[52], v[52], v[52])) + k.emit(buffer_store_short(v[52], v[125], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[53], v[84], v[53])) + k.emit(v_add_f32_e32(v[22], v[83], v[53])) + k.emit(v_mov_b32_e32(v[53], v[22])) + k.emit(v_cvt_pk(v[53], v[53], v[53])) + k.emit(buffer_store_short(v[53], v[127], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[54], v[88], v[54])) + k.emit(v_add_f32_e32(v[22], v[87], v[54])) + k.emit(v_mov_b32_e32(v[54], v[22])) + k.emit(v_cvt_pk(v[54], v[54], v[54])) + k.emit(buffer_store_short(v[54], v[129], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[55], v[92], v[55])) + k.emit(v_add_f32_e32(v[22], v[91], v[55])) + 
k.emit(v_mov_b32_e32(v[55], v[22])) + k.emit(v_cvt_pk(v[55], v[55], v[55])) + k.emit(buffer_store_short(v[55], v[131], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[56], v[96], v[56])) + k.emit(v_add_f32_e32(v[22], v[95], v[56])) + k.emit(v_mov_b32_e32(v[56], v[22])) + k.emit(v_cvt_pk(v[56], v[56], v[56])) + k.emit(buffer_store_short(v[56], v[133], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[57], v[100], v[57])) + k.emit(v_add_f32_e32(v[22], v[99], v[57])) + k.emit(v_mov_b32_e32(v[57], v[22])) + k.emit(v_cvt_pk(v[57], v[57], v[57])) + k.emit(buffer_store_short(v[57], v[135], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[58], v[104], v[58])) + k.emit(v_add_f32_e32(v[22], v[103], v[58])) + k.emit(v_mov_b32_e32(v[58], v[22])) + k.emit(v_cvt_pk(v[58], v[58], v[58])) + k.emit(buffer_store_short(v[58], v[137], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[59], v[76], v[59])) + k.emit(v_add_f32_e32(v[22], v[75], v[59])) + k.emit(v_mov_b32_e32(v[59], v[22])) + k.emit(v_cvt_pk(v[59], v[59], v[59])) + k.emit(buffer_store_short(v[59], v[139], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[60], v[80], v[60])) + k.emit(v_add_f32_e32(v[22], v[79], v[60])) + k.emit(v_mov_b32_e32(v[60], v[22])) + k.emit(v_cvt_pk(v[60], v[60], v[60])) + k.emit(buffer_store_short(v[60], v[141], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[61], v[84], v[61])) + k.emit(v_add_f32_e32(v[22], v[83], v[61])) + k.emit(v_mov_b32_e32(v[61], v[22])) + k.emit(v_cvt_pk(v[61], v[61], v[61])) + k.emit(buffer_store_short(v[61], v[143], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[62], v[88], v[62])) + k.emit(v_add_f32_e32(v[22], v[87], v[62])) + k.emit(v_mov_b32_e32(v[62], v[22])) + k.emit(v_cvt_pk(v[62], v[62], v[62])) + k.emit(buffer_store_short(v[62], v[145], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[63], v[92], v[63])) + k.emit(v_add_f32_e32(v[22], v[91], v[63])) + 
k.emit(v_mov_b32_e32(v[63], v[22])) + k.emit(v_cvt_pk(v[63], v[63], v[63])) + k.emit(buffer_store_short(v[63], v[147], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[64], v[96], v[64])) + k.emit(v_add_f32_e32(v[22], v[95], v[64])) + k.emit(v_mov_b32_e32(v[64], v[22])) + k.emit(v_cvt_pk(v[64], v[64], v[64])) + k.emit(buffer_store_short(v[64], v[149], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[65], v[100], v[65])) + k.emit(v_add_f32_e32(v[22], v[99], v[65])) + k.emit(v_mov_b32_e32(v[65], v[22])) + k.emit(v_cvt_pk(v[65], v[65], v[65])) + k.emit(buffer_store_short(v[65], v[151], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[66], v[104], v[66])) + k.emit(v_add_f32_e32(v[22], v[103], v[66])) + k.emit(v_mov_b32_e32(v[66], v[22])) + k.emit(v_cvt_pk(v[66], v[66], v[66])) + k.emit(buffer_store_short(v[66], v[153], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[67], v[76], v[67])) + k.emit(v_add_f32_e32(v[22], v[75], v[67])) + k.emit(v_mov_b32_e32(v[67], v[22])) + k.emit(v_cvt_pk(v[67], v[67], v[67])) + k.emit(buffer_store_short(v[67], v[155], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[68], v[80], v[68])) + k.emit(v_add_f32_e32(v[22], v[79], v[68])) + k.emit(v_mov_b32_e32(v[68], v[22])) + k.emit(v_cvt_pk(v[68], v[68], v[68])) + k.emit(buffer_store_short(v[68], v[157], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[69], v[84], v[69])) + k.emit(v_add_f32_e32(v[22], v[83], v[69])) + k.emit(v_mov_b32_e32(v[69], v[22])) + k.emit(v_cvt_pk(v[69], v[69], v[69])) + k.emit(buffer_store_short(v[69], v[159], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[70], v[88], v[70])) + k.emit(v_add_f32_e32(v[22], v[87], v[70])) + k.emit(v_mov_b32_e32(v[70], v[22])) + k.emit(v_cvt_pk(v[70], v[70], v[70])) + k.emit(buffer_store_short(v[70], v[161], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[71], v[92], v[71])) + k.emit(v_add_f32_e32(v[22], v[91], v[71])) + 
k.emit(v_mov_b32_e32(v[71], v[22])) + k.emit(v_cvt_pk(v[71], v[71], v[71])) + k.emit(buffer_store_short(v[71], v[163], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[72], v[96], v[72])) + k.emit(v_add_f32_e32(v[22], v[95], v[72])) + k.emit(v_mov_b32_e32(v[72], v[22])) + k.emit(v_cvt_pk(v[72], v[72], v[72])) + k.emit(buffer_store_short(v[72], v[165], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[73], v[100], v[73])) + k.emit(v_add_f32_e32(v[22], v[99], v[73])) + k.emit(v_mov_b32_e32(v[73], v[22])) + k.emit(v_cvt_pk(v[73], v[73], v[73])) + k.emit(buffer_store_short(v[73], v[167], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[74], v[104], v[74])) + k.emit(v_add_f32_e32(v[22], v[103], v[74])) + k.emit(v_mov_b32_e32(v[74], v[22])) + k.emit(v_cvt_pk(v[74], v[74], v[74])) + k.emit(buffer_store_short(v[74], v[169], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(v_mov_b32_e32(v[30], 2147483648)) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[78], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[78], 2, v[78])) + k.emit(ds_read_b32(v[75], v[78])) + k.emit(ds_read_b32(v[76], v[78], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[77], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[77], v[30], v[77], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[82], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[82], 2, v[82])) + k.emit(ds_read_b32(v[79], v[82])) + k.emit(ds_read_b32(v[80], v[82], v[0], v[0], 0, 
0, 4)) + k.emit(v_add_lshl_u32_e64(v[81], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[81], v[30], v[81], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[86], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[86], 2, v[86])) + k.emit(ds_read_b32(v[83], v[86])) + k.emit(ds_read_b32(v[84], v[86], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[85], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[85], v[30], v[85], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[90], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[90], 2, v[90])) + k.emit(ds_read_b32(v[87], v[90])) + k.emit(ds_read_b32(v[88], v[90], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[89], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[89], v[30], v[89], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[94], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[94], 2, v[94])) + k.emit(ds_read_b32(v[91], v[94])) + k.emit(ds_read_b32(v[92], v[94], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[93], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[93], v[30], v[93], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[98], v[22], s[78])) + 
k.emit(v_lshlrev_b32_e32(v[98], 2, v[98])) + k.emit(ds_read_b32(v[95], v[98])) + k.emit(ds_read_b32(v[96], v[98], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[97], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[97], v[30], v[97], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[102], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[102], 2, v[102])) + k.emit(ds_read_b32(v[99], v[102])) + k.emit(ds_read_b32(v[100], v[102], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[101], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[101], v[30], v[101], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[106], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[106], 2, v[106])) + k.emit(ds_read_b32(v[103], v[106])) + k.emit(ds_read_b32(v[104], v[106], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[105], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[108], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[108], 2, v[108])) + k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], 
s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[110], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[110], 2, v[110])) + k.emit(v_add_lshl_u32_e64(v[109], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[112], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[112], 2, v[112])) + k.emit(v_add_lshl_u32_e64(v[111], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[111], v[30], v[111], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[114], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[114], 2, v[114])) + k.emit(v_add_lshl_u32_e64(v[113], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[113], v[30], v[113], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[116], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[116], 2, v[116])) + k.emit(v_add_lshl_u32_e64(v[115], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[115], v[30], v[115], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[118], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[118], 2, v[118])) + k.emit(v_add_lshl_u32_e64(v[117], 
v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[117], v[30], v[117], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[120], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[120], 2, v[120])) + k.emit(v_add_lshl_u32_e64(v[119], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[119], v[30], v[119], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[122], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[122], 2, v[122])) + k.emit(v_add_lshl_u32_e64(v[121], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[121], v[30], v[121], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[124], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[124], 2, v[124])) + k.emit(v_add_lshl_u32_e64(v[123], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[123], v[30], v[123], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[126], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[126], 2, v[126])) + k.emit(v_add_lshl_u32_e64(v[125], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[125], v[30], v[125], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + 
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[128], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[128], 2, v[128])) + k.emit(v_add_lshl_u32_e64(v[127], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[127], v[30], v[127], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[130], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[130], 2, v[130])) + k.emit(v_add_lshl_u32_e64(v[129], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[129], v[30], v[129], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[132], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[132], 2, v[132])) + k.emit(v_add_lshl_u32_e64(v[131], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[131], v[30], v[131], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[134], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[134], 2, v[134])) + k.emit(v_add_lshl_u32_e64(v[133], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[133], v[30], v[133], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[136], v[22], 
s[78])) + k.emit(v_lshlrev_b32_e32(v[136], 2, v[136])) + k.emit(v_add_lshl_u32_e64(v[135], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[135], v[30], v[135], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[138], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[138], 2, v[138])) + k.emit(v_add_lshl_u32_e64(v[137], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[137], v[30], v[137], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[140], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[140], 2, v[140])) + k.emit(v_add_lshl_u32_e64(v[139], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[139], v[30], v[139], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[142], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[142], 2, v[142])) + k.emit(v_add_lshl_u32_e64(v[141], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[141], v[30], v[141], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[144], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[144], 2, v[144])) + k.emit(v_add_lshl_u32_e64(v[143], v[21], v[22], 1)) + 
k.emit(v_cndmask_b32_e64(v[143], v[30], v[143], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[146], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[146], 2, v[146])) + k.emit(v_add_lshl_u32_e64(v[145], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[145], v[30], v[145], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[148], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[148], 2, v[148])) + k.emit(v_add_lshl_u32_e64(v[147], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[147], v[30], v[147], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[150], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[150], 2, v[150])) + k.emit(v_add_lshl_u32_e64(v[149], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[149], v[30], v[149], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[152], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[152], 2, v[152])) + k.emit(v_add_lshl_u32_e64(v[151], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[151], v[30], v[151], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + 
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[154], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[154], 2, v[154])) + k.emit(v_add_lshl_u32_e64(v[153], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[153], v[30], v[153], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[156], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[156], 2, v[156])) + k.emit(v_add_lshl_u32_e64(v[155], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[155], v[30], v[155], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[158], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[158], 2, v[158])) + k.emit(v_add_lshl_u32_e64(v[157], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[157], v[30], v[157], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[160], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[160], 2, v[160])) + k.emit(v_add_lshl_u32_e64(v[159], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[159], v[30], v[159], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[162], v[22], s[78])) + 
k.emit(v_lshlrev_b32_e32(v[162], 2, v[162])) + k.emit(v_add_lshl_u32_e64(v[161], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[161], v[30], v[161], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[164], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[164], 2, v[164])) + k.emit(v_add_lshl_u32_e64(v[163], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[163], v[30], v[163], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[166], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[166], 2, v[166])) + k.emit(v_add_lshl_u32_e64(v[165], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[165], v[30], v[165], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[168], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[168], 2, v[168])) + k.emit(v_add_lshl_u32_e64(v[167], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[167], v[30], v[167], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[170], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[170], 2, v[170])) + k.emit(v_add_lshl_u32_e64(v[169], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[169], v[30], v[169], s[82:83])) + k.emit(v_accvgpr_read(v[35], v[130])) + 
k.emit(v_accvgpr_read(v[36], v[134])) + k.emit(v_accvgpr_read(v[37], v[138])) + k.emit(v_accvgpr_read(v[38], v[142])) + k.emit(v_accvgpr_read(v[39], v[146])) + k.emit(v_accvgpr_read(v[40], v[150])) + k.emit(v_accvgpr_read(v[41], v[154])) + k.emit(v_accvgpr_read(v[42], v[158])) + k.emit(v_accvgpr_read(v[43], v[162])) + k.emit(v_accvgpr_read(v[44], v[166])) + k.emit(v_accvgpr_read(v[45], v[170])) + k.emit(v_accvgpr_read(v[46], v[174])) + k.emit(v_accvgpr_read(v[47], v[178])) + k.emit(v_accvgpr_read(v[48], v[182])) + k.emit(v_accvgpr_read(v[49], v[186])) + k.emit(v_accvgpr_read(v[50], v[190])) + k.emit(v_accvgpr_read(v[51], v[194])) + k.emit(v_accvgpr_read(v[52], v[198])) + k.emit(v_accvgpr_read(v[53], v[202])) + k.emit(v_accvgpr_read(v[54], v[206])) + k.emit(v_accvgpr_read(v[55], v[210])) + k.emit(v_accvgpr_read(v[56], v[214])) + k.emit(v_accvgpr_read(v[57], v[218])) + k.emit(v_accvgpr_read(v[58], v[222])) + k.emit(v_accvgpr_read(v[59], v[226])) + k.emit(v_accvgpr_read(v[60], v[230])) + k.emit(v_accvgpr_read(v[61], v[234])) + k.emit(v_accvgpr_read(v[62], v[238])) + k.emit(v_accvgpr_read(v[63], v[242])) + k.emit(v_accvgpr_read(v[64], v[246])) + k.emit(v_accvgpr_read(v[65], v[250])) + k.emit(v_accvgpr_read(v[66], v[254])) + k.emit(v_accvgpr_read(v[67], v[3])) + k.emit(v_accvgpr_read(v[68], v[7])) + k.emit(v_accvgpr_read(v[69], v[11])) + k.emit(v_accvgpr_read(v[70], v[15])) + k.emit(v_accvgpr_read(v[71], v[19])) + k.emit(v_accvgpr_read(v[72], v[23])) + k.emit(v_accvgpr_read(v[73], v[27])) + k.emit(v_accvgpr_read(v[74], v[31])) + k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_mul_f32_e32(v[35], v[76], v[35])) + k.emit(v_add_f32_e32(v[22], v[75], v[35])) + k.emit(v_mov_b32_e32(v[35], v[22])) + k.emit(v_cvt_pk(v[35], v[35], v[35])) + k.emit(buffer_store_short(v[35], v[77], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[36], v[80], v[36])) + 
k.emit(v_add_f32_e32(v[22], v[79], v[36])) + k.emit(v_mov_b32_e32(v[36], v[22])) + k.emit(v_cvt_pk(v[36], v[36], v[36])) + k.emit(buffer_store_short(v[36], v[81], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[37], v[84], v[37])) + k.emit(v_add_f32_e32(v[22], v[83], v[37])) + k.emit(v_mov_b32_e32(v[37], v[22])) + k.emit(v_cvt_pk(v[37], v[37], v[37])) + k.emit(buffer_store_short(v[37], v[85], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[38], v[88], v[38])) + k.emit(v_add_f32_e32(v[22], v[87], v[38])) + k.emit(v_mov_b32_e32(v[38], v[22])) + k.emit(v_cvt_pk(v[38], v[38], v[38])) + k.emit(buffer_store_short(v[38], v[89], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[39], v[92], v[39])) + k.emit(v_add_f32_e32(v[22], v[91], v[39])) + k.emit(v_mov_b32_e32(v[39], v[22])) + k.emit(v_cvt_pk(v[39], v[39], v[39])) + k.emit(buffer_store_short(v[39], v[93], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[40], v[96], v[40])) + k.emit(v_add_f32_e32(v[22], v[95], v[40])) + k.emit(v_mov_b32_e32(v[40], v[22])) + k.emit(v_cvt_pk(v[40], v[40], v[40])) + k.emit(buffer_store_short(v[40], v[97], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[41], v[100], v[41])) + k.emit(v_add_f32_e32(v[22], v[99], v[41])) + k.emit(v_mov_b32_e32(v[41], v[22])) + k.emit(v_cvt_pk(v[41], v[41], v[41])) + k.emit(buffer_store_short(v[41], v[101], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[42], v[104], v[42])) + k.emit(v_add_f32_e32(v[22], v[103], v[42])) + k.emit(v_mov_b32_e32(v[42], v[22])) + k.emit(v_cvt_pk(v[42], v[42], v[42])) + k.emit(buffer_store_short(v[42], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[43], v[76], v[43])) + k.emit(v_add_f32_e32(v[22], v[75], v[43])) + k.emit(v_mov_b32_e32(v[43], v[22])) + k.emit(v_cvt_pk(v[43], v[43], v[43])) + k.emit(buffer_store_short(v[43], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[44], v[80], v[44])) + 
k.emit(v_add_f32_e32(v[22], v[79], v[44])) + k.emit(v_mov_b32_e32(v[44], v[22])) + k.emit(v_cvt_pk(v[44], v[44], v[44])) + k.emit(buffer_store_short(v[44], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[45], v[84], v[45])) + k.emit(v_add_f32_e32(v[22], v[83], v[45])) + k.emit(v_mov_b32_e32(v[45], v[22])) + k.emit(v_cvt_pk(v[45], v[45], v[45])) + k.emit(buffer_store_short(v[45], v[111], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[46], v[88], v[46])) + k.emit(v_add_f32_e32(v[22], v[87], v[46])) + k.emit(v_mov_b32_e32(v[46], v[22])) + k.emit(v_cvt_pk(v[46], v[46], v[46])) + k.emit(buffer_store_short(v[46], v[113], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[47], v[92], v[47])) + k.emit(v_add_f32_e32(v[22], v[91], v[47])) + k.emit(v_mov_b32_e32(v[47], v[22])) + k.emit(v_cvt_pk(v[47], v[47], v[47])) + k.emit(buffer_store_short(v[47], v[115], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[48], v[96], v[48])) + k.emit(v_add_f32_e32(v[22], v[95], v[48])) + k.emit(v_mov_b32_e32(v[48], v[22])) + k.emit(v_cvt_pk(v[48], v[48], v[48])) + k.emit(buffer_store_short(v[48], v[117], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[49], v[100], v[49])) + k.emit(v_add_f32_e32(v[22], v[99], v[49])) + k.emit(v_mov_b32_e32(v[49], v[22])) + k.emit(v_cvt_pk(v[49], v[49], v[49])) + k.emit(buffer_store_short(v[49], v[119], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[50], v[104], v[50])) + k.emit(v_add_f32_e32(v[22], v[103], v[50])) + k.emit(v_mov_b32_e32(v[50], v[22])) + k.emit(v_cvt_pk(v[50], v[50], v[50])) + k.emit(buffer_store_short(v[50], v[121], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[51], v[76], v[51])) + k.emit(v_add_f32_e32(v[22], v[75], v[51])) + k.emit(v_mov_b32_e32(v[51], v[22])) + k.emit(v_cvt_pk(v[51], v[51], v[51])) + k.emit(buffer_store_short(v[51], v[123], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[52], v[80], v[52])) + 
k.emit(v_add_f32_e32(v[22], v[79], v[52])) + k.emit(v_mov_b32_e32(v[52], v[22])) + k.emit(v_cvt_pk(v[52], v[52], v[52])) + k.emit(buffer_store_short(v[52], v[125], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[53], v[84], v[53])) + k.emit(v_add_f32_e32(v[22], v[83], v[53])) + k.emit(v_mov_b32_e32(v[53], v[22])) + k.emit(v_cvt_pk(v[53], v[53], v[53])) + k.emit(buffer_store_short(v[53], v[127], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[54], v[88], v[54])) + k.emit(v_add_f32_e32(v[22], v[87], v[54])) + k.emit(v_mov_b32_e32(v[54], v[22])) + k.emit(v_cvt_pk(v[54], v[54], v[54])) + k.emit(buffer_store_short(v[54], v[129], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[55], v[92], v[55])) + k.emit(v_add_f32_e32(v[22], v[91], v[55])) + k.emit(v_mov_b32_e32(v[55], v[22])) + k.emit(v_cvt_pk(v[55], v[55], v[55])) + k.emit(buffer_store_short(v[55], v[131], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[56], v[96], v[56])) + k.emit(v_add_f32_e32(v[22], v[95], v[56])) + k.emit(v_mov_b32_e32(v[56], v[22])) + k.emit(v_cvt_pk(v[56], v[56], v[56])) + k.emit(buffer_store_short(v[56], v[133], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[57], v[100], v[57])) + k.emit(v_add_f32_e32(v[22], v[99], v[57])) + k.emit(v_mov_b32_e32(v[57], v[22])) + k.emit(v_cvt_pk(v[57], v[57], v[57])) + k.emit(buffer_store_short(v[57], v[135], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[58], v[104], v[58])) + k.emit(v_add_f32_e32(v[22], v[103], v[58])) + k.emit(v_mov_b32_e32(v[58], v[22])) + k.emit(v_cvt_pk(v[58], v[58], v[58])) + k.emit(buffer_store_short(v[58], v[137], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[59], v[76], v[59])) + k.emit(v_add_f32_e32(v[22], v[75], v[59])) + k.emit(v_mov_b32_e32(v[59], v[22])) + k.emit(v_cvt_pk(v[59], v[59], v[59])) + k.emit(buffer_store_short(v[59], v[139], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[60], v[80], v[60])) + 
k.emit(v_add_f32_e32(v[22], v[79], v[60])) + k.emit(v_mov_b32_e32(v[60], v[22])) + k.emit(v_cvt_pk(v[60], v[60], v[60])) + k.emit(buffer_store_short(v[60], v[141], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[61], v[84], v[61])) + k.emit(v_add_f32_e32(v[22], v[83], v[61])) + k.emit(v_mov_b32_e32(v[61], v[22])) + k.emit(v_cvt_pk(v[61], v[61], v[61])) + k.emit(buffer_store_short(v[61], v[143], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[62], v[88], v[62])) + k.emit(v_add_f32_e32(v[22], v[87], v[62])) + k.emit(v_mov_b32_e32(v[62], v[22])) + k.emit(v_cvt_pk(v[62], v[62], v[62])) + k.emit(buffer_store_short(v[62], v[145], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[63], v[92], v[63])) + k.emit(v_add_f32_e32(v[22], v[91], v[63])) + k.emit(v_mov_b32_e32(v[63], v[22])) + k.emit(v_cvt_pk(v[63], v[63], v[63])) + k.emit(buffer_store_short(v[63], v[147], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[64], v[96], v[64])) + k.emit(v_add_f32_e32(v[22], v[95], v[64])) + k.emit(v_mov_b32_e32(v[64], v[22])) + k.emit(v_cvt_pk(v[64], v[64], v[64])) + k.emit(buffer_store_short(v[64], v[149], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[65], v[100], v[65])) + k.emit(v_add_f32_e32(v[22], v[99], v[65])) + k.emit(v_mov_b32_e32(v[65], v[22])) + k.emit(v_cvt_pk(v[65], v[65], v[65])) + k.emit(buffer_store_short(v[65], v[151], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[66], v[104], v[66])) + k.emit(v_add_f32_e32(v[22], v[103], v[66])) + k.emit(v_mov_b32_e32(v[66], v[22])) + k.emit(v_cvt_pk(v[66], v[66], v[66])) + k.emit(buffer_store_short(v[66], v[153], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[67], v[76], v[67])) + k.emit(v_add_f32_e32(v[22], v[75], v[67])) + k.emit(v_mov_b32_e32(v[67], v[22])) + k.emit(v_cvt_pk(v[67], v[67], v[67])) + k.emit(buffer_store_short(v[67], v[155], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[68], v[80], v[68])) + 
k.emit(v_add_f32_e32(v[22], v[79], v[68])) + k.emit(v_mov_b32_e32(v[68], v[22])) + k.emit(v_cvt_pk(v[68], v[68], v[68])) + k.emit(buffer_store_short(v[68], v[157], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[69], v[84], v[69])) + k.emit(v_add_f32_e32(v[22], v[83], v[69])) + k.emit(v_mov_b32_e32(v[69], v[22])) + k.emit(v_cvt_pk(v[69], v[69], v[69])) + k.emit(buffer_store_short(v[69], v[159], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[70], v[88], v[70])) + k.emit(v_add_f32_e32(v[22], v[87], v[70])) + k.emit(v_mov_b32_e32(v[70], v[22])) + k.emit(v_cvt_pk(v[70], v[70], v[70])) + k.emit(buffer_store_short(v[70], v[161], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[71], v[92], v[71])) + k.emit(v_add_f32_e32(v[22], v[91], v[71])) + k.emit(v_mov_b32_e32(v[71], v[22])) + k.emit(v_cvt_pk(v[71], v[71], v[71])) + k.emit(buffer_store_short(v[71], v[163], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[72], v[96], v[72])) + k.emit(v_add_f32_e32(v[22], v[95], v[72])) + k.emit(v_mov_b32_e32(v[72], v[22])) + k.emit(v_cvt_pk(v[72], v[72], v[72])) + k.emit(buffer_store_short(v[72], v[165], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[73], v[100], v[73])) + k.emit(v_add_f32_e32(v[22], v[99], v[73])) + k.emit(v_mov_b32_e32(v[73], v[22])) + k.emit(v_cvt_pk(v[73], v[73], v[73])) + k.emit(buffer_store_short(v[73], v[167], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[74], v[104], v[74])) + k.emit(v_add_f32_e32(v[22], v[103], v[74])) + k.emit(v_mov_b32_e32(v[74], v[22])) + k.emit(v_cvt_pk(v[74], v[74], v[74])) + k.emit(buffer_store_short(v[74], v[169], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(v_mov_b32_e32(v[30], 2147483648)) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + 
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[78], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[78], 2, v[78])) + k.emit(ds_read_b32(v[75], v[78])) + k.emit(ds_read_b32(v[76], v[78], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[77], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[77], v[30], v[77], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[82], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[82], 2, v[82])) + k.emit(ds_read_b32(v[79], v[82])) + k.emit(ds_read_b32(v[80], v[82], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[81], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[81], v[30], v[81], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[86], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[86], 2, v[86])) + k.emit(ds_read_b32(v[83], v[86])) + k.emit(ds_read_b32(v[84], v[86], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[85], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[85], v[30], v[85], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[90], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[90], 2, v[90])) + k.emit(ds_read_b32(v[87], v[90])) + k.emit(ds_read_b32(v[88], v[90], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[89], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[89], v[30], v[89], s[82:83])) + k.emit(v_add_co_u32(v[22], 
VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[94], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[94], 2, v[94])) + k.emit(ds_read_b32(v[91], v[94])) + k.emit(ds_read_b32(v[92], v[94], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[93], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[93], v[30], v[93], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[98], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[98], 2, v[98])) + k.emit(ds_read_b32(v[95], v[98])) + k.emit(ds_read_b32(v[96], v[98], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[97], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[97], v[30], v[97], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[102], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[102], 2, v[102])) + k.emit(ds_read_b32(v[99], v[102])) + k.emit(ds_read_b32(v[100], v[102], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[101], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[101], v[30], v[101], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[106], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[106], 2, v[106])) + k.emit(ds_read_b32(v[103], v[106])) + k.emit(ds_read_b32(v[104], v[106], v[0], v[0], 0, 0, 4)) + 
k.emit(v_add_lshl_u32_e64(v[105], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[108], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[108], 2, v[108])) + k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[110], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[110], 2, v[110])) + k.emit(v_add_lshl_u32_e64(v[109], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[112], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[112], 2, v[112])) + k.emit(v_add_lshl_u32_e64(v[111], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[111], v[30], v[111], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[114], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[114], 2, v[114])) + k.emit(v_add_lshl_u32_e64(v[113], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[113], v[30], v[113], s[82:83])) + 
k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[116], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[116], 2, v[116])) + k.emit(v_add_lshl_u32_e64(v[115], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[115], v[30], v[115], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[118], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[118], 2, v[118])) + k.emit(v_add_lshl_u32_e64(v[117], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[117], v[30], v[117], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[120], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[120], 2, v[120])) + k.emit(v_add_lshl_u32_e64(v[119], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[119], v[30], v[119], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[122], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[122], 2, v[122])) + k.emit(v_add_lshl_u32_e64(v[121], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[121], v[30], v[121], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], 
s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[124], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[124], 2, v[124])) + k.emit(v_add_lshl_u32_e64(v[123], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[123], v[30], v[123], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[126], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[126], 2, v[126])) + k.emit(v_add_lshl_u32_e64(v[125], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[125], v[30], v[125], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[128], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[128], 2, v[128])) + k.emit(v_add_lshl_u32_e64(v[127], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[127], v[30], v[127], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[130], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[130], 2, v[130])) + k.emit(v_add_lshl_u32_e64(v[129], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[129], v[30], v[129], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[132], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[132], 2, v[132])) + k.emit(v_add_lshl_u32_e64(v[131], 
v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[131], v[30], v[131], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[134], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[134], 2, v[134])) + k.emit(v_add_lshl_u32_e64(v[133], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[133], v[30], v[133], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[136], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[136], 2, v[136])) + k.emit(v_add_lshl_u32_e64(v[135], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[135], v[30], v[135], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[138], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[138], 2, v[138])) + k.emit(v_add_lshl_u32_e64(v[137], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[137], v[30], v[137], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[140], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[140], 2, v[140])) + k.emit(v_add_lshl_u32_e64(v[139], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[139], v[30], v[139], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + 
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[142], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[142], 2, v[142])) + k.emit(v_add_lshl_u32_e64(v[141], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[141], v[30], v[141], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[144], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[144], 2, v[144])) + k.emit(v_add_lshl_u32_e64(v[143], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[143], v[30], v[143], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[146], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[146], 2, v[146])) + k.emit(v_add_lshl_u32_e64(v[145], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[145], v[30], v[145], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[148], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[148], 2, v[148])) + k.emit(v_add_lshl_u32_e64(v[147], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[147], v[30], v[147], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[150], v[22], 
s[78])) + k.emit(v_lshlrev_b32_e32(v[150], 2, v[150])) + k.emit(v_add_lshl_u32_e64(v[149], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[149], v[30], v[149], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[152], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[152], 2, v[152])) + k.emit(v_add_lshl_u32_e64(v[151], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[151], v[30], v[151], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[154], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[154], 2, v[154])) + k.emit(v_add_lshl_u32_e64(v[153], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[153], v[30], v[153], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[156], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[156], 2, v[156])) + k.emit(v_add_lshl_u32_e64(v[155], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[155], v[30], v[155], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[158], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[158], 2, v[158])) + k.emit(v_add_lshl_u32_e64(v[157], v[21], v[22], 1)) + 
k.emit(v_cndmask_b32_e64(v[157], v[30], v[157], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[160], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[160], 2, v[160])) + k.emit(v_add_lshl_u32_e64(v[159], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[159], v[30], v[159], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[162], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[162], 2, v[162])) + k.emit(v_add_lshl_u32_e64(v[161], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[161], v[30], v[161], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[164], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[164], 2, v[164])) + k.emit(v_add_lshl_u32_e64(v[163], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[163], v[30], v[163], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[166], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[166], 2, v[166])) + k.emit(v_add_lshl_u32_e64(v[165], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[165], v[30], v[165], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + 
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[168], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[168], 2, v[168])) + k.emit(v_add_lshl_u32_e64(v[167], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[167], v[30], v[167], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[170], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[170], 2, v[170])) + k.emit(v_add_lshl_u32_e64(v[169], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[169], v[30], v[169], s[82:83])) + k.emit(v_accvgpr_read(v[35], v[35])) + k.emit(v_accvgpr_read(v[36], v[39])) + k.emit(v_accvgpr_read(v[37], v[43])) + k.emit(v_accvgpr_read(v[38], v[47])) + k.emit(v_accvgpr_read(v[39], v[51])) + k.emit(v_accvgpr_read(v[40], v[55])) + k.emit(v_accvgpr_read(v[41], v[59])) + k.emit(v_accvgpr_read(v[42], v[63])) + k.emit(v_accvgpr_read(v[43], v[67])) + k.emit(v_accvgpr_read(v[44], v[71])) + k.emit(v_accvgpr_read(v[45], v[75])) + k.emit(v_accvgpr_read(v[46], v[79])) + k.emit(v_accvgpr_read(v[47], v[83])) + k.emit(v_accvgpr_read(v[48], v[87])) + k.emit(v_accvgpr_read(v[49], v[91])) + k.emit(v_accvgpr_read(v[50], v[95])) + k.emit(v_accvgpr_read(v[51], v[99])) + k.emit(v_accvgpr_read(v[52], v[103])) + k.emit(v_accvgpr_read(v[53], v[107])) + k.emit(v_accvgpr_read(v[54], v[111])) + k.emit(v_accvgpr_read(v[55], v[115])) + k.emit(v_accvgpr_read(v[56], v[119])) + k.emit(v_accvgpr_read(v[57], v[123])) + k.emit(v_accvgpr_read(v[58], v[127])) + k.emit(v_accvgpr_read(v[59], v[131])) + k.emit(v_accvgpr_read(v[60], v[135])) + k.emit(v_accvgpr_read(v[61], v[139])) + k.emit(v_accvgpr_read(v[62], v[143])) + k.emit(v_accvgpr_read(v[63], v[147])) + k.emit(v_accvgpr_read(v[64], v[151])) + k.emit(v_accvgpr_read(v[65], v[155])) + k.emit(v_accvgpr_read(v[66], v[159])) 
+ k.emit(v_accvgpr_read(v[67], v[163])) + k.emit(v_accvgpr_read(v[68], v[167])) + k.emit(v_accvgpr_read(v[69], v[171])) + k.emit(v_accvgpr_read(v[70], v[175])) + k.emit(v_accvgpr_read(v[71], v[179])) + k.emit(v_accvgpr_read(v[72], v[183])) + k.emit(v_accvgpr_read(v[73], v[187])) + k.emit(v_accvgpr_read(v[74], v[191])) + k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_mul_f32_e32(v[35], v[76], v[35])) + k.emit(v_add_f32_e32(v[22], v[75], v[35])) + k.emit(v_mov_b32_e32(v[35], v[22])) + k.emit(v_cvt_pk(v[35], v[35], v[35])) + k.emit(buffer_store_short(v[35], v[77], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[36], v[80], v[36])) + k.emit(v_add_f32_e32(v[22], v[79], v[36])) + k.emit(v_mov_b32_e32(v[36], v[22])) + k.emit(v_cvt_pk(v[36], v[36], v[36])) + k.emit(buffer_store_short(v[36], v[81], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[37], v[84], v[37])) + k.emit(v_add_f32_e32(v[22], v[83], v[37])) + k.emit(v_mov_b32_e32(v[37], v[22])) + k.emit(v_cvt_pk(v[37], v[37], v[37])) + k.emit(buffer_store_short(v[37], v[85], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[38], v[88], v[38])) + k.emit(v_add_f32_e32(v[22], v[87], v[38])) + k.emit(v_mov_b32_e32(v[38], v[22])) + k.emit(v_cvt_pk(v[38], v[38], v[38])) + k.emit(buffer_store_short(v[38], v[89], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[39], v[92], v[39])) + k.emit(v_add_f32_e32(v[22], v[91], v[39])) + k.emit(v_mov_b32_e32(v[39], v[22])) + k.emit(v_cvt_pk(v[39], v[39], v[39])) + k.emit(buffer_store_short(v[39], v[93], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[40], v[96], v[40])) + k.emit(v_add_f32_e32(v[22], v[95], v[40])) + k.emit(v_mov_b32_e32(v[40], v[22])) + k.emit(v_cvt_pk(v[40], v[40], v[40])) + k.emit(buffer_store_short(v[40], v[97], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[41], v[100], v[41])) + 
k.emit(v_add_f32_e32(v[22], v[99], v[41])) + k.emit(v_mov_b32_e32(v[41], v[22])) + k.emit(v_cvt_pk(v[41], v[41], v[41])) + k.emit(buffer_store_short(v[41], v[101], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[42], v[104], v[42])) + k.emit(v_add_f32_e32(v[22], v[103], v[42])) + k.emit(v_mov_b32_e32(v[42], v[22])) + k.emit(v_cvt_pk(v[42], v[42], v[42])) + k.emit(buffer_store_short(v[42], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[43], v[76], v[43])) + k.emit(v_add_f32_e32(v[22], v[75], v[43])) + k.emit(v_mov_b32_e32(v[43], v[22])) + k.emit(v_cvt_pk(v[43], v[43], v[43])) + k.emit(buffer_store_short(v[43], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[44], v[80], v[44])) + k.emit(v_add_f32_e32(v[22], v[79], v[44])) + k.emit(v_mov_b32_e32(v[44], v[22])) + k.emit(v_cvt_pk(v[44], v[44], v[44])) + k.emit(buffer_store_short(v[44], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[45], v[84], v[45])) + k.emit(v_add_f32_e32(v[22], v[83], v[45])) + k.emit(v_mov_b32_e32(v[45], v[22])) + k.emit(v_cvt_pk(v[45], v[45], v[45])) + k.emit(buffer_store_short(v[45], v[111], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[46], v[88], v[46])) + k.emit(v_add_f32_e32(v[22], v[87], v[46])) + k.emit(v_mov_b32_e32(v[46], v[22])) + k.emit(v_cvt_pk(v[46], v[46], v[46])) + k.emit(buffer_store_short(v[46], v[113], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[47], v[92], v[47])) + k.emit(v_add_f32_e32(v[22], v[91], v[47])) + k.emit(v_mov_b32_e32(v[47], v[22])) + k.emit(v_cvt_pk(v[47], v[47], v[47])) + k.emit(buffer_store_short(v[47], v[115], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[48], v[96], v[48])) + k.emit(v_add_f32_e32(v[22], v[95], v[48])) + k.emit(v_mov_b32_e32(v[48], v[22])) + k.emit(v_cvt_pk(v[48], v[48], v[48])) + k.emit(buffer_store_short(v[48], v[117], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[49], v[100], v[49])) + 
k.emit(v_add_f32_e32(v[22], v[99], v[49])) + k.emit(v_mov_b32_e32(v[49], v[22])) + k.emit(v_cvt_pk(v[49], v[49], v[49])) + k.emit(buffer_store_short(v[49], v[119], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[50], v[104], v[50])) + k.emit(v_add_f32_e32(v[22], v[103], v[50])) + k.emit(v_mov_b32_e32(v[50], v[22])) + k.emit(v_cvt_pk(v[50], v[50], v[50])) + k.emit(buffer_store_short(v[50], v[121], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[51], v[76], v[51])) + k.emit(v_add_f32_e32(v[22], v[75], v[51])) + k.emit(v_mov_b32_e32(v[51], v[22])) + k.emit(v_cvt_pk(v[51], v[51], v[51])) + k.emit(buffer_store_short(v[51], v[123], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[52], v[80], v[52])) + k.emit(v_add_f32_e32(v[22], v[79], v[52])) + k.emit(v_mov_b32_e32(v[52], v[22])) + k.emit(v_cvt_pk(v[52], v[52], v[52])) + k.emit(buffer_store_short(v[52], v[125], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[53], v[84], v[53])) + k.emit(v_add_f32_e32(v[22], v[83], v[53])) + k.emit(v_mov_b32_e32(v[53], v[22])) + k.emit(v_cvt_pk(v[53], v[53], v[53])) + k.emit(buffer_store_short(v[53], v[127], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[54], v[88], v[54])) + k.emit(v_add_f32_e32(v[22], v[87], v[54])) + k.emit(v_mov_b32_e32(v[54], v[22])) + k.emit(v_cvt_pk(v[54], v[54], v[54])) + k.emit(buffer_store_short(v[54], v[129], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[55], v[92], v[55])) + k.emit(v_add_f32_e32(v[22], v[91], v[55])) + k.emit(v_mov_b32_e32(v[55], v[22])) + k.emit(v_cvt_pk(v[55], v[55], v[55])) + k.emit(buffer_store_short(v[55], v[131], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[56], v[96], v[56])) + k.emit(v_add_f32_e32(v[22], v[95], v[56])) + k.emit(v_mov_b32_e32(v[56], v[22])) + k.emit(v_cvt_pk(v[56], v[56], v[56])) + k.emit(buffer_store_short(v[56], v[133], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[57], v[100], v[57])) + 
k.emit(v_add_f32_e32(v[22], v[99], v[57])) + k.emit(v_mov_b32_e32(v[57], v[22])) + k.emit(v_cvt_pk(v[57], v[57], v[57])) + k.emit(buffer_store_short(v[57], v[135], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[58], v[104], v[58])) + k.emit(v_add_f32_e32(v[22], v[103], v[58])) + k.emit(v_mov_b32_e32(v[58], v[22])) + k.emit(v_cvt_pk(v[58], v[58], v[58])) + k.emit(buffer_store_short(v[58], v[137], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[59], v[76], v[59])) + k.emit(v_add_f32_e32(v[22], v[75], v[59])) + k.emit(v_mov_b32_e32(v[59], v[22])) + k.emit(v_cvt_pk(v[59], v[59], v[59])) + k.emit(buffer_store_short(v[59], v[139], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[60], v[80], v[60])) + k.emit(v_add_f32_e32(v[22], v[79], v[60])) + k.emit(v_mov_b32_e32(v[60], v[22])) + k.emit(v_cvt_pk(v[60], v[60], v[60])) + k.emit(buffer_store_short(v[60], v[141], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[61], v[84], v[61])) + k.emit(v_add_f32_e32(v[22], v[83], v[61])) + k.emit(v_mov_b32_e32(v[61], v[22])) + k.emit(v_cvt_pk(v[61], v[61], v[61])) + k.emit(buffer_store_short(v[61], v[143], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[62], v[88], v[62])) + k.emit(v_add_f32_e32(v[22], v[87], v[62])) + k.emit(v_mov_b32_e32(v[62], v[22])) + k.emit(v_cvt_pk(v[62], v[62], v[62])) + k.emit(buffer_store_short(v[62], v[145], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[63], v[92], v[63])) + k.emit(v_add_f32_e32(v[22], v[91], v[63])) + k.emit(v_mov_b32_e32(v[63], v[22])) + k.emit(v_cvt_pk(v[63], v[63], v[63])) + k.emit(buffer_store_short(v[63], v[147], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[64], v[96], v[64])) + k.emit(v_add_f32_e32(v[22], v[95], v[64])) + k.emit(v_mov_b32_e32(v[64], v[22])) + k.emit(v_cvt_pk(v[64], v[64], v[64])) + k.emit(buffer_store_short(v[64], v[149], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[65], v[100], v[65])) + 
k.emit(v_add_f32_e32(v[22], v[99], v[65])) + k.emit(v_mov_b32_e32(v[65], v[22])) + k.emit(v_cvt_pk(v[65], v[65], v[65])) + k.emit(buffer_store_short(v[65], v[151], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[66], v[104], v[66])) + k.emit(v_add_f32_e32(v[22], v[103], v[66])) + k.emit(v_mov_b32_e32(v[66], v[22])) + k.emit(v_cvt_pk(v[66], v[66], v[66])) + k.emit(buffer_store_short(v[66], v[153], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[67], v[76], v[67])) + k.emit(v_add_f32_e32(v[22], v[75], v[67])) + k.emit(v_mov_b32_e32(v[67], v[22])) + k.emit(v_cvt_pk(v[67], v[67], v[67])) + k.emit(buffer_store_short(v[67], v[155], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[68], v[80], v[68])) + k.emit(v_add_f32_e32(v[22], v[79], v[68])) + k.emit(v_mov_b32_e32(v[68], v[22])) + k.emit(v_cvt_pk(v[68], v[68], v[68])) + k.emit(buffer_store_short(v[68], v[157], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[69], v[84], v[69])) + k.emit(v_add_f32_e32(v[22], v[83], v[69])) + k.emit(v_mov_b32_e32(v[69], v[22])) + k.emit(v_cvt_pk(v[69], v[69], v[69])) + k.emit(buffer_store_short(v[69], v[159], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[70], v[88], v[70])) + k.emit(v_add_f32_e32(v[22], v[87], v[70])) + k.emit(v_mov_b32_e32(v[70], v[22])) + k.emit(v_cvt_pk(v[70], v[70], v[70])) + k.emit(buffer_store_short(v[70], v[161], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[71], v[92], v[71])) + k.emit(v_add_f32_e32(v[22], v[91], v[71])) + k.emit(v_mov_b32_e32(v[71], v[22])) + k.emit(v_cvt_pk(v[71], v[71], v[71])) + k.emit(buffer_store_short(v[71], v[163], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[72], v[96], v[72])) + k.emit(v_add_f32_e32(v[22], v[95], v[72])) + k.emit(v_mov_b32_e32(v[72], v[22])) + k.emit(v_cvt_pk(v[72], v[72], v[72])) + k.emit(buffer_store_short(v[72], v[165], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[73], v[100], v[73])) + 
k.emit(v_add_f32_e32(v[22], v[99], v[73])) + k.emit(v_mov_b32_e32(v[73], v[22])) + k.emit(v_cvt_pk(v[73], v[73], v[73])) + k.emit(buffer_store_short(v[73], v[167], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[74], v[104], v[74])) + k.emit(v_add_f32_e32(v[22], v[103], v[74])) + k.emit(v_mov_b32_e32(v[74], v[22])) + k.emit(v_cvt_pk(v[74], v[74], v[74])) + k.emit(buffer_store_short(v[74], v[169], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(v_mov_b32_e32(v[30], 2147483648)) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[54], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[54], 2, v[54])) + k.emit(ds_read_b32(v[51], v[54])) + k.emit(ds_read_b32(v[52], v[54], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[53], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[53], v[30], v[53], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[58], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[58], 2, v[58])) + k.emit(ds_read_b32(v[55], v[58])) + k.emit(ds_read_b32(v[56], v[58], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[57], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[57], v[30], v[57], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[62], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[62], 2, v[62])) + 
k.emit(ds_read_b32(v[59], v[62])) + k.emit(ds_read_b32(v[60], v[62], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[61], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[61], v[30], v[61], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[66], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[66], 2, v[66])) + k.emit(ds_read_b32(v[63], v[66])) + k.emit(ds_read_b32(v[64], v[66], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[65], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[65], v[30], v[65], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[70], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[70], 2, v[70])) + k.emit(ds_read_b32(v[67], v[70])) + k.emit(ds_read_b32(v[68], v[70], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[69], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[69], v[30], v[69], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[74], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[74], 2, v[74])) + k.emit(ds_read_b32(v[71], v[74])) + k.emit(ds_read_b32(v[72], v[74], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[73], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[73], v[30], v[73], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + 
k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[78], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[78], 2, v[78])) + k.emit(ds_read_b32(v[75], v[78])) + k.emit(ds_read_b32(v[76], v[78], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[77], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[77], v[30], v[77], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[82], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[82], 2, v[82])) + k.emit(ds_read_b32(v[79], v[82])) + k.emit(ds_read_b32(v[80], v[82], v[0], v[0], 0, 0, 4)) + k.emit(v_add_lshl_u32_e64(v[81], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[81], v[30], v[81], s[82:83])) + k.emit(v_add_co_u32(v[19], VCC, v[19], 1)) + k.emit(v_add_u32_e64(v[20], v[20], s[38])) + k.emit(v_add_u32_e64(v[21], v[21], s[36])) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[84], v[18], s[78])) + k.emit(v_lshlrev_b32_e32(v[84], 2, v[84])) + k.emit(v_add_lshl_u32_e64(v[83], v[21], v[18], 1)) + k.emit(v_cndmask_b32_e64(v[83], v[30], v[83], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 1)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[86], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[86], 2, v[86])) + k.emit(v_add_lshl_u32_e64(v[85], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[85], v[30], v[85], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 2)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + 
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[88], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[88], 2, v[88])) + k.emit(v_add_lshl_u32_e64(v[87], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[87], v[30], v[87], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 3)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[90], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[90], 2, v[90])) + k.emit(v_add_lshl_u32_e64(v[89], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[89], v[30], v[89], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 4)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[92], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[92], 2, v[92])) + k.emit(v_add_lshl_u32_e64(v[91], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[91], v[30], v[91], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 5)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[94], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[94], 2, v[94])) + k.emit(v_add_lshl_u32_e64(v[93], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[93], v[30], v[93], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 6)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[96], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[96], 2, v[96])) + k.emit(v_add_lshl_u32_e64(v[95], v[21], v[22], 1)) + 
k.emit(v_cndmask_b32_e64(v[95], v[30], v[95], s[82:83])) + k.emit(v_add_co_u32(v[22], VCC, v[18], 7)) + k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20])) + k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21])) + k.emit(s_and_b64(s[82:83], s[78:79], s[82:83])) + k.emit(s_mul_i32(s[78], 256, s[2])) + k.emit(v_sub_u32_e64(v[98], v[22], s[78])) + k.emit(v_lshlrev_b32_e32(v[98], 2, v[98])) + k.emit(v_add_lshl_u32_e64(v[97], v[21], v[22], 1)) + k.emit(v_cndmask_b32_e64(v[97], v[30], v[97], s[82:83])) + k.emit(v_accvgpr_read(v[35], v[195])) + k.emit(v_accvgpr_read(v[36], v[199])) + k.emit(v_accvgpr_read(v[37], v[203])) + k.emit(v_accvgpr_read(v[38], v[207])) + k.emit(v_accvgpr_read(v[39], v[211])) + k.emit(v_accvgpr_read(v[40], v[215])) + k.emit(v_accvgpr_read(v[41], v[219])) + k.emit(v_accvgpr_read(v[42], v[223])) + k.emit(v_accvgpr_read(v[43], v[227])) + k.emit(v_accvgpr_read(v[44], v[231])) + k.emit(v_accvgpr_read(v[45], v[235])) + k.emit(v_accvgpr_read(v[46], v[239])) + k.emit(v_accvgpr_read(v[47], v[243])) + k.emit(v_accvgpr_read(v[48], v[247])) + k.emit(v_accvgpr_read(v[49], v[251])) + k.emit(v_accvgpr_read(v[50], v[255])) + k.waitcnt(lgkm=0) + k.emit(v_mov_b32_e32(v[32], 4294901760)) + k.emit(v_mov_b32_e32(v[33], 2147418112)) + k.emit(v_mov_b32_e32(v[34], 32767)) + k.emit(v_mul_f32_e32(v[35], v[52], v[35])) + k.emit(v_add_f32_e32(v[22], v[51], v[35])) + k.emit(v_mov_b32_e32(v[35], v[22])) + k.emit(v_cvt_pk(v[35], v[35], v[35])) + k.emit(buffer_store_short(v[35], v[53], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[36], v[56], v[36])) + k.emit(v_add_f32_e32(v[22], v[55], v[36])) + k.emit(v_mov_b32_e32(v[36], v[22])) + k.emit(v_cvt_pk(v[36], v[36], v[36])) + k.emit(buffer_store_short(v[36], v[57], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[37], v[60], v[37])) + k.emit(v_add_f32_e32(v[22], v[59], v[37])) + k.emit(v_mov_b32_e32(v[37], v[22])) + k.emit(v_cvt_pk(v[37], v[37], v[37])) + k.emit(buffer_store_short(v[37], v[61], s[12:15], 0, 0, 
1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[38], v[64], v[38])) + k.emit(v_add_f32_e32(v[22], v[63], v[38])) + k.emit(v_mov_b32_e32(v[38], v[22])) + k.emit(v_cvt_pk(v[38], v[38], v[38])) + k.emit(buffer_store_short(v[38], v[65], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[39], v[68], v[39])) + k.emit(v_add_f32_e32(v[22], v[67], v[39])) + k.emit(v_mov_b32_e32(v[39], v[22])) + k.emit(v_cvt_pk(v[39], v[39], v[39])) + k.emit(buffer_store_short(v[39], v[69], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[40], v[72], v[40])) + k.emit(v_add_f32_e32(v[22], v[71], v[40])) + k.emit(v_mov_b32_e32(v[40], v[22])) + k.emit(v_cvt_pk(v[40], v[40], v[40])) + k.emit(buffer_store_short(v[40], v[73], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[41], v[76], v[41])) + k.emit(v_add_f32_e32(v[22], v[75], v[41])) + k.emit(v_mov_b32_e32(v[41], v[22])) + k.emit(v_cvt_pk(v[41], v[41], v[41])) + k.emit(buffer_store_short(v[41], v[77], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[42], v[80], v[42])) + k.emit(v_add_f32_e32(v[22], v[79], v[42])) + k.emit(v_mov_b32_e32(v[42], v[22])) + k.emit(v_cvt_pk(v[42], v[42], v[42])) + k.emit(buffer_store_short(v[42], v[81], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[43], v[52], v[43])) + k.emit(v_add_f32_e32(v[22], v[51], v[43])) + k.emit(v_mov_b32_e32(v[43], v[22])) + k.emit(v_cvt_pk(v[43], v[43], v[43])) + k.emit(buffer_store_short(v[43], v[83], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[44], v[56], v[44])) + k.emit(v_add_f32_e32(v[22], v[55], v[44])) + k.emit(v_mov_b32_e32(v[44], v[22])) + k.emit(v_cvt_pk(v[44], v[44], v[44])) + k.emit(buffer_store_short(v[44], v[85], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[45], v[60], v[45])) + k.emit(v_add_f32_e32(v[22], v[59], v[45])) + k.emit(v_mov_b32_e32(v[45], v[22])) + k.emit(v_cvt_pk(v[45], v[45], v[45])) + k.emit(buffer_store_short(v[45], v[87], s[12:15], 0, 0, 1, 0, 0, 0, 0, 
0, 1)) + k.emit(v_mul_f32_e32(v[46], v[64], v[46])) + k.emit(v_add_f32_e32(v[22], v[63], v[46])) + k.emit(v_mov_b32_e32(v[46], v[22])) + k.emit(v_cvt_pk(v[46], v[46], v[46])) + k.emit(buffer_store_short(v[46], v[89], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[47], v[68], v[47])) + k.emit(v_add_f32_e32(v[22], v[67], v[47])) + k.emit(v_mov_b32_e32(v[47], v[22])) + k.emit(v_cvt_pk(v[47], v[47], v[47])) + k.emit(buffer_store_short(v[47], v[91], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[48], v[72], v[48])) + k.emit(v_add_f32_e32(v[22], v[71], v[48])) + k.emit(v_mov_b32_e32(v[48], v[22])) + k.emit(v_cvt_pk(v[48], v[48], v[48])) + k.emit(buffer_store_short(v[48], v[93], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[49], v[76], v[49])) + k.emit(v_add_f32_e32(v[22], v[75], v[49])) + k.emit(v_mov_b32_e32(v[49], v[22])) + k.emit(v_cvt_pk(v[49], v[49], v[49])) + k.emit(buffer_store_short(v[49], v[95], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(v_mul_f32_e32(v[50], v[80], v[50])) + k.emit(v_add_f32_e32(v[22], v[79], v[50])) + k.emit(v_mov_b32_e32(v[50], v[22])) + k.emit(v_cvt_pk(v[50], v[50], v[50])) + k.emit(buffer_store_short(v[50], v[97], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1)) + k.emit(s_nop()) + k.emit(s_branch(), target='GW_End_1') + k.label('GW_End_1') + k.emit(s_cmp_ge_u32(s[58], s[59])) + k.emit(s_cbranch_scc1(), target='KernelEnd') + k.emit(s_branch(), target='PersistentLoopStart') + k.label('KernelEnd') + k.emit(s_endpgm()) + return k diff --git a/extra/gemm/asm/cdna/gemm.py b/extra/gemm/asm/cdna/gemm.py new file mode 100644 index 0000000000..1201966715 --- /dev/null +++ b/extra/gemm/asm/cdna/gemm.py @@ -0,0 +1,95 @@ +import atexit, functools +from tinygrad.runtime.support.compiler_amd import HIPCompiler +from tinygrad import Tensor, Device, dtypes +from tinygrad.uop.ops import UOp, Ops, KernelInfo, AxisType +from tinygrad.renderer import Estimates +from tinygrad.helpers import getenv, all_same, dedup +from 
from extra.gemm.asm.cdna.asm import build_kernel, GEMM_ARGS

# ** CDNA4 assembly gemm

WORKGROUP_SIZE = 256

def custom_asm_gemm(C:UOp, A:UOp, B:UOp, dname:str, arch:str, wg:int) -> UOp:
  """Wrap the assembly GEMM produced by `build_kernel` in an Ops.PROGRAM UOp.

  A is unpacked as (batch, M, K). B is unpacked as (K, N), skipping its leading dim when it is 3-D.
  `dname` becomes the Ops.DEVICE arg, `arch` is handed to HIPCompiler and `wg` sizes the "gidx0" special.
  Raises AssertionError when the inner dimensions of A and B differ.
  """
  batch, M, K = A.shape
  K2, N = B.shape[(1 if B.ndim == 3 else 0):]
  assert K == K2
  lidx = UOp.special(WORKGROUP_SIZE, "lidx0")
  gidx = UOp.special(wg, "gidx0")
  k = build_kernel(batch, M, N, K, A.dtype.base)
  # NOTE(review): the trailing *2 in the mem estimate looks like bytes per 16-bit element -- confirm
  sink = UOp.sink(C.base, A.base, B.base, lidx, gidx,
                  arg=KernelInfo(name=k.name, estimates=Estimates(ops=2*batch*M*N*K, mem=(batch*M*K + K*N + batch*M*N)*2)))
  binary = HIPCompiler(arch).compile(k.to_asm())
  return UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg=dname), UOp(Ops.LINEAR, src=(*sink.src, sink)),
                               UOp(Ops.SOURCE, arg=k.to_text()), UOp(Ops.BINARY, arg=binary)))

# "used" counts asm_gemm calls that passed the check, "todos" collects every rejection message
counters = {"used":0, "todos":[]}
def todo(msg:str) -> bool:
  """Record `msg` as a reason the asm gemm was rejected and return False, so callers can `return todo(...)`."""
  counters["todos"].append(msg)
  return False
# print a usage summary when the interpreter exits
atexit.register(lambda: print(f'asm_gemm: {counters["used"]} used, {len(counters["todos"])} not used'))

def can_use_asm_gemm(a:Tensor, b:Tensor) -> bool:
  """Return True when `a @ b` is a case asm_gemm accepts; otherwise log the reason via `todo` and return False.

  Accepted: matching bfloat16/float16 dtypes, a 2-D or 3-D `a`, a per-device batch of 1 or 2 and an
  (M, N, K) triple present in GEMM_ARGS. Multi-device inputs must be a 3-D `a` sharded on axis 0 with an
  unsharded `b`.
  """
  if a.dtype != b.dtype: return todo(f"dtypes must match {a.dtype} != {b.dtype}")
  if a.dtype not in {dtypes.bfloat16, dtypes.float16}: return todo(f"only bfloat16/float16, got {a.dtype}")
  # only sharding on the batch is tested, others might work too
  if isinstance(a.device, tuple) and not (a.ndim == 3 and a.uop.axis == 0 and b.uop.axis is None):
    return todo(f"sharding mismatch a.ndim={a.ndim} a.uop.axis={a.uop.axis} b.uop.axis={b.uop.axis}")
  # the unpack and b.shape[1] below would raise on other ranks; a predicate should answer False instead
  if a.ndim not in {2, 3} or b.ndim < 2: return todo(f"unsupported ndim a.ndim={a.ndim} b.ndim={b.ndim}")
  batch, M, K = (1, *a.shape) if a.ndim == 2 else a.shape
  N = b.shape[1]
  # for multi-device tensors the batch is divided by the device count before the size check
  if isinstance(a.device, tuple): batch //= len(a.device)
  if batch not in {1, 2}: return todo(f"GEMM batch size {batch}")
  if (key:=(M, N, K)) not in GEMM_ARGS: return todo(f"GEMM shape not supported {key}")
  return True

# ** UOp gemm to test Tensor.custom_kernel multi and backward correctness on non cdna4
# note: this can be removed after we have GEMM on mixins

def custom_uop_gemm(C:UOp, A:UOp, B:UOp) -> UOp:
  """Build a plain UOp GEMM kernel storing A @ B into C.

  The first two dims of A are folded into a single M, so A is indexed as a flat (M, K) buffer and C as (M, N).
  Products are cast to float32 before the reduction and the sum is cast to C's base dtype on store.
  Raises AssertionError when the inner dimensions of A and B differ.
  """
  M, K = A.shape[0]*A.shape[1], A.shape[2]
  K2, N = B.shape[(1 if B.ndim == 3 else 0):]
  assert K == K2
  m = UOp.range(M, 1, AxisType.LOOP)
  n = UOp.range(N, 2, AxisType.LOOP)
  k = UOp.range(K, 0, AxisType.REDUCE)
  mul = (A.index((m*UOp.const(dtypes.index, K)+k))*B.index((k*UOp.const(dtypes.index, N)+n))).cast(dtypes.float32)
  red = mul.reduce(k, arg=Ops.ADD, dtype=dtypes.float32).cast(C.dtype.base)
  store = C.index((m*UOp.const(dtypes.index, N)+n), ptr=True).store(red).end(m, n)
  return store.sink(arg=KernelInfo(name=f'uop_gemm_{M}_{N}_{K}'))

# ** backward gemm, might use the asm gemm

def custom_gemm_bw(gradient:UOp, kernel:UOp):
  """Gradient function for the custom gemm kernels: returns (None, grad_a, grad_b) for kernel.src == (out, a, b).

  Raises AssertionError when gradient, a, b and out are not all on the same device.
  """
  out, a, b = kernel.src
  assert all_same([gradient.device, a.device, b.device, out.device])
  a_t, b_t, g_t = Tensor(a, device=a.device), Tensor(b, device=a.device), Tensor(gradient, device=a.device)
  grad_a = (g_t @ b_t.T).uop
  # grad_b is a broadcast multiply summed over the last axis and axis 0, not a matmul
  a_T = a_t.transpose(-2, -1)
  a_T = a_T.reshape(*a_T.shape[:-1], 1, a_T.shape[-1])
  g_r = g_t.reshape(*g_t.shape[:-2], 1, *g_t.shape[-2:]).transpose(-1, -2)
  grad_b = (a_T * g_r).sum((-1, 0)).uop
  return (None, grad_a, grad_b)

# ** main gemm function

def asm_gemm(a:Tensor, b:Tensor) -> Tensor:
  """Compute `a @ b` through Tensor.custom_kernel, with custom_gemm_bw as the gradient function.

  The assembly kernel is used when the renderer's arch starts with "gfx950" and USE_ASM is not disabled;
  every other device takes the UOp gemm. A 2-D `a` is unsqueezed to a batch of 1 and the result is
  squeezed back. Raises AssertionError carrying the latest `todo` message when can_use_asm_gemm rejects
  the inputs.
  """
  assert can_use_asm_gemm(a, b), f"{counters['todos'][-1]}"
  counters["used"] += 1
  squeeze = a.ndim == 2
  if squeeze: a = a.unsqueeze(0)

  batch, M, K = a.shape
  N = b.shape[1]
  is_multi = isinstance(a.device, tuple)

  if is_multi:
    out = Tensor(Tensor.empty(batch//len(a.device), M, N, dtype=a.dtype, device=a.device).uop.multi(0), device=a.device)
  else:
    out = Tensor.empty(batch, M, N, dtype=a.dtype, device=a.device)

  dname = a.device[0] if is_multi else a.device
  arch = getattr(Device[dname].renderer, "arch", None)
  # a renderer without `arch` yields None here; it must reach the UOp fallback rather than fail on .startswith
  if isinstance(arch, str) and arch.startswith("gfx950") and getenv("USE_ASM", 1):
    numWG = GEMM_ARGS[(M, N, K)][0]
    out = Tensor.custom_kernel(out, a, b, fxn=functools.partial(custom_asm_gemm, dname=dname, wg=numWG, arch=arch), grad_fxn=custom_gemm_bw)[0]
  else:
    out = Tensor.custom_kernel(out, a, b, fxn=custom_uop_gemm, grad_fxn=custom_gemm_bw)[0]
  return out.squeeze(0) if squeeze else out
000000002BB8: 24080886 - v_lshlrev_b32_e32 v4, 3, v4 // 000000002BBC: 24080883 - v_lshrrev_b32_e32 v5, 4, v5 // 000000002BC0: 200A0A84 - v_lshl_add_u32 v4, v5, 3, v4 // 000000002BC4: D1FD0004 04110705 - v_lshrrev_b32_e32 v8, 6, v134 // 000000002BCC: 20110C86 - v_and_b32_e32 v8, 1, v8 // 000000002BD0: 26101081 - v_lshl_add_u32 v4, v8, 13, v4 // 000000002BD4: D1FD0004 04111B08 - v_and_b32_e32 v6, 63, v134 // 000000002BDC: 260D0CBF - v_and_b32_e32 v5, 15, v6 // 000000002BE0: 260A0C8F - v_lshlrev_b32_e32 v5, 6, v5 // 000000002BE4: 240A0A86 - v_lshlrev_b32_e32 v5, 3, v5 // 000000002BE8: 240A0A83 - v_lshrrev_b32_e32 v6, 4, v6 // 000000002BEC: 200C0C84 - v_lshl_add_u32 v5, v6, 3, v5 // 000000002BF0: D1FD0005 04150706 - v_lshrrev_b32_e32 v7, 7, v134 // 000000002BF8: 200F0C87 - v_and_b32_e32 v7, 1, v7 // 000000002BFC: 260E0E81 - v_lshl_add_u32 v5, v7, 13, v5 // 000000002C00: D1FD0005 04151B07 - v_lshrrev_b32_e32 v6, 6, v134 // 000000002C08: 200D0C86 - v_lshrrev_b32_e32 v6, 2, v6 // 000000002C0C: 200C0C82 - s_mov_b32 s53, 64 // 000000002C10: BEB500C0 - v_mul_lo_u32 v6, s53, v6 // 000000002C14: D2850006 00020C35 - v_add_lshl_u32 v2, v6, v4, 1 // 000000002C1C: D1FE0002 02060906 - v_lshrrev_b32_e32 v7, 10, v2 // 000000002C24: 200E048A - v_lshl_add_u32 v2, v7, 4, v2 // 000000002C28: D1FD0002 04090907 - v_lshrrev_b32_e32 v4, 6, v134 // 000000002C30: 20090C86 - v_lshrrev_b32_e32 v4, 2, v4 // 000000002C34: 20080882 - v_mul_lo_u32 v4, s53, v4 // 000000002C38: D2850004 00020835 - v_add_lshl_u32 v3, v4, v5, 1 // 000000002C40: D1FE0003 02060B04 - v_lshrrev_b32_e32 v6, 10, v3 // 000000002C48: 200C068A - v_lshl_add_u32 v3, v6, 4, v3 // 000000002C4C: D1FD0003 040D0906 - v_add_co_u32_e32 v3, vcc, 0x8200, v3 // 000000002C54: 320606FF 00008200 - v_add_u32_e32 v132, 0x10400, v2 // 000000002C5C: 690804FF 00010400 - v_xor_b32_e32 v132, v132, v2 // 000000002C64: 2B080584 - v_add_u32_e32 v133, 0x10400, v3 // 000000002C68: 690A06FF 00010400 - v_xor_b32_e32 v133, v133, v3 // 000000002C70: 2B0A0785 
- v_lshrrev_b32_e32 v4, 3, v134 // 000000002C74: 20090C83 - v_and_b32_e32 v5, 7, v134 // 000000002C78: 260B0C87 - v_lshlrev_b32_e32 v5, 3, v5 // 000000002C7C: 240A0A83 - v_mov_b32_e32 v8, v5 // 000000002C80: 7E100305 - v_lshrrev_b32_e32 v6, 3, v134 // 000000002C84: 200D0C83 - v_and_b32_e32 v7, 7, v134 // 000000002C88: 260F0C87 - v_lshlrev_b32_e32 v7, 3, v7 // 000000002C8C: 240E0E83 - v_mov_b32_e32 v9, v7 // 000000002C90: 7E120307 - v_mul_u32_u24_e32 v10, 64, v4 // 000000002C94: 101408C0 - v_add_lshl_u32 v10, v8, v10, 1 // 000000002C98: D1FE000A 02061508 - v_lshrrev_b32_e32 v12, 10, v10 // 000000002CA0: 2018148A - v_lshl_add_u32 v10, v12, 4, v10 // 000000002CA4: D1FD000A 0429090C - s_nop 0 // 000000002CAC: BF800000 - v_readfirstlane_b32 s46, v10 // 000000002CB0: 7E5C050A - s_nop 0 // 000000002CB4: BF800000 - s_add_u32 s48, s46, 0x10400 // 000000002CB8: 8030FF2E 00010400 - s_xor_b32 s48, s48, s46 // 000000002CC0: 88302E30 - v_mul_u32_u24_e32 v10, 64, v6 // 000000002CC4: 10140CC0 - v_add_lshl_u32 v10, v9, v10, 1 // 000000002CC8: D1FE000A 02061509 - v_lshrrev_b32_e32 v12, 10, v10 // 000000002CD0: 2018148A - v_lshl_add_u32 v10, v12, 4, v10 // 000000002CD4: D1FD000A 0429090C - v_add_co_u32_e32 v10, vcc, 0x8200, v10 // 000000002CDC: 321414FF 00008200 - s_nop 0 // 000000002CE4: BF800000 - v_readfirstlane_b32 s47, v10 // 000000002CE8: 7E5E050A - s_nop 0 // 000000002CEC: BF800000 - s_add_u32 s49, s47, 0x10400 // 000000002CF0: 8031FF2F 00010400 - s_xor_b32 s49, s49, s47 // 000000002CF8: 88312F31 - v_mov_b32_e32 v12, 0x100 // 000000002CFC: 7E1802FF 00000100 - v_mov_b32_e32 v11, s24 // 000000002D04: 7E160218 - v_cvt_f32_u32_e32 v10, v12 // 000000002D08: 7E140D0C - v_rcp_iflag_f32_e32 v10, v10 // 000000002D0C: 7E14470A - v_cvt_f32_u32_e32 v13, v11 // 000000002D10: 7E1A0D0B - v_mul_f32_e32 v10, v10, v13 // 000000002D14: 0A141B0A - v_cvt_u32_f32_e32 v10, v10 // 000000002D18: 7E140F0A - v_mul_u32_u24_e32 v13, v10, v12 // 000000002D1C: 101A190A - v_sub_u32_e32 v13, v11, v13 // 
000000002D20: 6A1A1B0B - v_cmp_ne_u32_e64 vcc, v13, 0 // 000000002D24: D0CD006A 0001010D - v_addc_co_u32_e64 v10, vcc, v10, 0, vcc // 000000002D2C: D11C6A0A 01A9010A - v_mov_b32_e32 v12, 0x100 // 000000002D34: 7E1802FF 00000100 - v_mov_b32_e32 v11, s25 // 000000002D3C: 7E160219 - v_readfirstlane_b32 s14, v10 // 000000002D40: 7E1C050A - v_cvt_f32_u32_e32 v10, v12 // 000000002D44: 7E140D0C - v_rcp_iflag_f32_e32 v10, v10 // 000000002D48: 7E14470A - v_cvt_f32_u32_e32 v13, v11 // 000000002D4C: 7E1A0D0B - v_mul_f32_e32 v10, v10, v13 // 000000002D50: 0A141B0A - v_cvt_u32_f32_e32 v10, v10 // 000000002D54: 7E140F0A - v_mul_u32_u24_e32 v13, v10, v12 // 000000002D58: 101A190A - v_sub_u32_e32 v13, v11, v13 // 000000002D5C: 6A1A1B0B - v_cmp_ne_u32_e64 vcc, v13, 0 // 000000002D60: D0CD006A 0001010D - v_addc_co_u32_e64 v10, vcc, v10, 0, vcc // 000000002D68: D11C6A0A 01A9010A - s_nop 0 // 000000002D70: BF800000 - v_readfirstlane_b32 s15, v10 // 000000002D74: 7E1E050A - s_waitcnt lgkmcnt(0) // 000000002D78: BF8CC07F - s_mul_i32 s52, s14, s15 // 000000002D7C: 92340F0E - s_and_b32 s53, s50, 0x3fff // 000000002D80: 8635FF32 00003FFF - s_mul_i32 s52, s52, s53 // 000000002D88: 92343534 - v_cvt_f32_u32_e32 v10, s52 // 000000002D8C: 7E140C34 - v_rcp_iflag_f32_e32 v10, v10 // 000000002D90: 7E14470A - v_cvt_f32_u32_e32 v11, s2 // 000000002D94: 7E160C02 - v_mul_f32_e32 v10, v10, v11 // 000000002D98: 0A14170A - v_cvt_u32_f32_e32 v10, v10 // 000000002D9C: 7E140F0A - v_mul_u32_u24_e64 v11, v10, s52 // 000000002DA0: D108000B 0000690A - v_sub_u32_e32 v11, s2, v11 // 000000002DA8: 6A161602 - v_cmpx_eq_u32_e64 exec, v11, s52 // 000000002DAC: D0DA007E 0000690B - v_add_u32_e32 v10, 1, v10 // 000000002DB4: 68141481 - s_mov_b64 exec, -1 // 000000002DB8: BEFE01C1 - v_cmpx_gt_u32_e64 exec, v11, s52 // 000000002DBC: D0DC007E 0000690B - v_sub_u32_e64 v10, v10, 1 // 000000002DC4: D135000A 0001030A - s_mov_b64 exec, -1 // 000000002DCC: BEFE01C1 - v_readfirstlane_b32 s52, v10 // 000000002DD0: 7E68050A - 
s_mov_b32 s4, s52 // 000000002DD4: BE840034 - s_mul_i32 s52, s15, s14 // 000000002DD8: 92340E0F - s_mul_i32 s52, s52, s4 // 000000002DDC: 92340434 - s_mul_i32 s52, s52, s53 // 000000002DE0: 92343534 - s_sub_u32 s2, s2, s52 // 000000002DE4: 80823402 - v_cvt_f32_u32_e32 v10, s14 // 000000002DE8: 7E140C0E - v_rcp_iflag_f32_e32 v10, v10 // 000000002DEC: 7E14470A - v_cvt_f32_u32_e32 v11, s2 // 000000002DF0: 7E160C02 - v_mul_f32_e32 v10, v10, v11 // 000000002DF4: 0A14170A - v_cvt_u32_f32_e32 v10, v10 // 000000002DF8: 7E140F0A - v_mul_u32_u24_e64 v11, v10, s14 // 000000002DFC: D108000B 00001D0A - v_sub_u32_e32 v11, s2, v11 // 000000002E04: 6A161602 - v_cmpx_eq_u32_e64 exec, v11, s14 // 000000002E08: D0DA007E 00001D0B - v_add_u32_e32 v10, 1, v10 // 000000002E10: 68141481 - s_mov_b64 exec, -1 // 000000002E14: BEFE01C1 - v_cmpx_gt_u32_e64 exec, v11, s14 // 000000002E18: D0DC007E 00001D0B - v_sub_u32_e64 v10, v10, 1 // 000000002E20: D135000A 0001030A - s_mov_b64 exec, -1 // 000000002E28: BEFE01C1 - v_readfirstlane_b32 s52, v10 // 000000002E2C: 7E68050A - s_mov_b32 s3, s52 // 000000002E30: BE830034 - s_mul_i32 s52, s3, s14 // 000000002E34: 92340E03 - s_sub_u32 s2, s2, s52 // 000000002E38: 80823402 - s_sub_u32 s32, s32, 16 // 000000002E3C: 80A09020 - s_subb_u32 s33, s33, 0 // 000000002E40: 82A18021 - s_sub_u32 s34, s34, 16 // 000000002E44: 80A29022 - s_subb_u32 s35, s35, 0 // 000000002E48: 82A38023 - s_and_b32 s84, s50, 0x3fff // 000000002E5C: 8654FF32 00003FFF - s_mov_b64 s[6:7], 0 // 000000002F48: BE860180 - s_mov_b32 s8, 1 // 000000002F4C: BE880081 - s_mov_b32 s9, 1 // 000000002F50: BE890081 - - s_sext_i32_i16 s11, s11 // 000000002F54: BE8B170B - v_mul_lo_u32 v10, s40, v4 // 0000000031B8: D285000A 00020828 - v_add_co_u32_e32 v0, vcc, v5, v10 // 0000000031C0: 32001505 - v_add_u32_e32 v0, 8, v0 // 0000000031C4: 68000088 - v_lshlrev_b32_e32 v0, 1, v0 // 0000000031C8: 24000081 - s_mul_i32 s70, s40, 32 // 0000000031CC: 9246A028 - s_lshl_b32 s70, s70, 1 // 0000000031D0: 8E468146 - 
s_mul_i32 s71, s40, 64 // 0000000031D4: 9247C028 - s_lshl_b32 s71, s71, 1 // 0000000031D8: 8E478147 - s_mul_i32 s72, s40, 0x60 // 0000000031DC: 9248FF28 00000060 - s_lshl_b32 s72, s72, 1 // 0000000031E4: 8E488148 - s_mul_i32 s73, s40, 0x80 // 0000000031E8: 9249FF28 00000080 - s_lshl_b32 s73, s73, 1 // 0000000031F0: 8E498149 - s_mul_i32 s74, s40, 0xa0 // 0000000031F4: 924AFF28 000000A0 - s_lshl_b32 s74, s74, 1 // 0000000031FC: 8E4A814A - s_mul_i32 s75, s40, 0xc0 // 000000003200: 924BFF28 000000C0 - s_lshl_b32 s75, s75, 1 // 000000003208: 8E4B814B - s_mul_i32 s76, s40, 0xe0 // 00000000320C: 924CFF28 000000E0 - s_lshl_b32 s76, s76, 1 // 000000003214: 8E4C814C - v_mul_lo_u32 v10, s42, v6 // 000000003218: D285000A 00020C2A - v_add_co_u32_e32 v1, vcc, v7, v10 // 000000003220: 32021507 - v_add_u32_e32 v1, 8, v1 // 000000003224: 68020288 - v_lshlrev_b32_e32 v1, 1, v1 // 000000003228: 24020281 - s_mul_i32 s77, s42, 32 // 00000000322C: 924DA02A - s_lshl_b32 s77, s77, 1 // 000000003230: 8E4D814D - s_mul_i32 s78, s42, 64 // 000000003234: 924EC02A - s_lshl_b32 s78, s78, 1 // 000000003238: 8E4E814E - s_mul_i32 s79, s42, 0x60 // 00000000323C: 924FFF2A 00000060 - s_lshl_b32 s79, s79, 1 // 000000003244: 8E4F814F - s_mul_i32 s80, s42, 0x80 // 000000003248: 9250FF2A 00000080 - s_lshl_b32 s80, s80, 1 // 000000003250: 8E508150 - s_mul_i32 s81, s42, 0xa0 // 000000003254: 9251FF2A 000000A0 - s_lshl_b32 s81, s81, 1 // 00000000325C: 8E518151 - s_mul_i32 s82, s42, 0xc0 // 000000003260: 9252FF2A 000000C0 - s_lshl_b32 s82, s82, 1 // 000000003268: 8E528152 - s_mul_i32 s83, s42, 0xe0 // 00000000326C: 9253FF2A 000000E0 - s_lshl_b32 s83, s83, 1 // 000000003274: 8E538153 - s_mul_hi_u32 s87, s2, 0x100 // 000000003278: 9657FF02 00000100 - s_mul_i32 s86, s2, 0x100 // 000000003280: 9256FF02 00000100 - s_mul_hi_u32 s87, s86, s40 // 000000003288: 96572856 - s_mul_i32 s86, s86, s40 // 00000000328C: 92562856 - s_and_b32 s84, s50, 0x8000 // 000000003290: 8654FF32 00008000 - s_cbranch_scc1 skip_offset_A // 
000000003298: BF850003 - s_mul_hi_u32 s85, 64, s6 // 00000000329C: 965506C0 - s_mul_i32 s84, 64, s6 // 0000000032A0: 925406C0 - -skip_offset_A: - s_add_u32 s86, s86, s84 // 000000003330: 80565456 - s_addc_u32 s87, s87, s85 // 000000003334: 82575557 - s_mov_b64 s[60:61], 1 // 000000003338: BEBC0181 - s_sub_u32 s84, s27, 1 // 00000000333C: 80D4811B - s_mul_hi_u32 s85, 1, s84 // 000000003340: 96555481 - s_mul_i32 s84, 1, s84 // 000000003344: 92545481 - s_add_u32 s60, s60, s84 // 000000003348: 803C543C - s_addc_u32 s61, s61, s85 // 00000000334C: 823D553D - s_sub_u32 s84, s24, 1 // 000000003350: 80D48118 - s_mul_hi_u32 s85, s40, s84 // 000000003354: 96555428 - s_mul_i32 s84, s40, s84 // 000000003358: 92545428 - s_add_u32 s60, s60, s84 // 00000000335C: 803C543C - s_addc_u32 s61, s61, s85 // 000000003360: 823D553D - s_sub_u32 s60, s60, s86 // 000000003364: 80BC563C - s_subb_u32 s61, s61, s87 // 000000003368: 82BD573D - s_lshl_b64 s[60:61], s[60:61], 1 // 00000000336C: 8EBC813C - s_add_u32 s60, s60, 16 // 000000003370: 803C903C - s_addc_u32 s61, s61, 0 // 000000003374: 823D803D - s_cmp_eq_u32 s61, 0 // 000000003378: BF06803D - s_cselect_b32 s54, s60, -1 // 00000000337C: 8536C13C - s_mul_hi_u32 s85, s41, s4 // 000000003380: 96550429 - s_mul_i32 s84, s41, s4 // 000000003384: 92540429 - s_add_u32 s86, s86, s84 // 000000003388: 80565456 - s_addc_u32 s87, s87, s85 // 00000000338C: 82575557 - s_lshl_b64 s[86:87], s[86:87], 1 // 000000003390: 8ED68156 - s_add_u32 s52, s32, s86 // 000000003394: 80345620 - s_addc_u32 s53, s33, s87 // 000000003398: 82355721 - s_mov_b32 s55, 0x20000 // 00000000339C: BEB700FF 00020000 - s_mul_hi_u32 s87, s3, 0x100 // 0000000033A4: 9657FF03 00000100 - s_mul_i32 s86, s3, 0x100 // 0000000033AC: 9256FF03 00000100 - s_mul_hi_u32 s87, s86, s42 // 0000000033B4: 96572A56 - s_mul_i32 s86, s86, s42 // 0000000033B8: 92562A56 - s_and_b32 s84, s50, 0x8000 // 0000000033BC: 8654FF32 00008000 - s_cbranch_scc1 skip_offset_B // 0000000033C4: BF850003 - s_mul_hi_u32 
s85, 64, s6 // 0000000033C8: 965506C0 - s_mul_i32 s84, 64, s6 // 0000000033CC: 925406C0 - -skip_offset_B: - s_add_u32 s86, s86, s84 // 00000000345C: 80565456 - s_addc_u32 s87, s87, s85 // 000000003460: 82575557 - s_mov_b64 s[62:63], 1 // 000000003464: BEBE0181 - s_sub_u32 s84, s27, 1 // 000000003468: 80D4811B - s_mul_hi_u32 s85, 1, s84 // 00000000346C: 96555481 - s_mul_i32 s84, 1, s84 // 000000003470: 92545481 - s_add_u32 s62, s62, s84 // 000000003474: 803E543E - s_addc_u32 s63, s63, s85 // 000000003478: 823F553F - s_sub_u32 s84, s25, 1 // 00000000347C: 80D48119 - s_mul_hi_u32 s85, s42, s84 // 000000003480: 9655542A - s_mul_i32 s84, s42, s84 // 000000003484: 9254542A - s_add_u32 s62, s62, s84 // 000000003488: 803E543E - s_addc_u32 s63, s63, s85 // 00000000348C: 823F553F - s_sub_u32 s62, s62, s86 // 000000003490: 80BE563E - s_subb_u32 s63, s63, s87 // 000000003494: 82BF573F - s_lshl_b64 s[62:63], s[62:63], 1 // 000000003498: 8EBE813E - s_add_u32 s62, s62, 16 // 00000000349C: 803E903E - s_addc_u32 s63, s63, 0 // 0000000034A0: 823F803F - s_cmp_eq_u32 s63, 0 // 0000000034A4: BF06803F - s_cselect_b32 s58, s62, -1 // 0000000034A8: 853AC13E - s_mul_hi_u32 s85, s43, s4 // 0000000034AC: 9655042B - s_mul_i32 s84, s43, s4 // 0000000034B0: 9254042B - s_add_u32 s86, s86, s84 // 0000000034B4: 80565456 - s_addc_u32 s87, s87, s85 // 0000000034B8: 82575557 - s_lshl_b64 s[86:87], s[86:87], 1 // 0000000034BC: 8ED68156 - s_add_u32 s56, s34, s86 // 0000000034C0: 80385622 - s_addc_u32 s57, s35, s87 // 0000000034C4: 82395723 - s_mov_b32 s59, 0x20000 // 0000000034C8: BEBB00FF 00020000 - s_and_b32 s85, s50, 0x3fff // 0000000034D0: 8655FF32 00003FFF - s_mul_i32 s85, s85, 0x80 // 0000000034D8: 9255FF55 00000080 - s_and_b32 s84, s50, 0x8000 // 0000000034E0: 8654FF32 00008000 - s_cselect_b32 s68, 0x80, s85 // 0000000034E8: 854455FF 00000080 - s_and_b32 s85, s50, 0x3fff // 0000000034F0: 8655FF32 00003FFF - s_mul_i32 s85, s85, 0x80 // 0000000034F8: 9255FF55 00000080 - s_and_b32 s84, s50, 0x8000 
// 000000003500: 8654FF32 00008000 - s_cselect_b32 s69, 0x80, s85 // 000000003508: 854555FF 00000080 - s_lshr_b32 s12, s27, 6 // 000000003510: 8F0C861B - s_and_b32 s84, s50, 0x3fff // 000000003514: 8654FF32 00003FFF - s_mov_b32 s13, s12 // 000000003594: BE8D000C - s_and_b32 s86, s10, 0x1f00 // 000000003598: 8656FF0A 00001F00 - s_lshr_b32 s86, s86, 8 // 0000000035A0: 8F568856 - s_and_b32 s87, s10, 0xe000 // 0000000035A4: 8657FF0A 0000E000 - s_and_b32 s10, s10, 0xff // 0000000035AC: 860AFF0A 000000FF - s_mov_b32 s84, s10 // 0000000035B4: BED4000A - s_lshl_b32 s85, s84, s86 // 0000000035B8: 8E555654 - s_cmp_ge_u32 s13, s85 // 0000000035BC: BF09550D - s_sub_u32 s85, s84, 1 // 0000000035CC: 80D58154 - s_cmp_ge_u32 s84, 1 // 0000000035D0: BF098154 - s_cselect_b32 s51, s85, 0 // 0000000035D4: 85338055 - s_cmp_eq_u32 s87, 0x2000 // 0000000035E8: BF06FF57 00002000 - s_and_b32 s51, s51, s84 // 000000003640: 86335433 - s_lshl_b32 s51, s51, s86 // 000000003644: 8E335633 - s_mul_hi_i32 s85, s51, s68 // 000000003648: 96D54433 - s_mul_i32 s84, s51, s68 // 00000000364C: 92544433 - s_mul_hi_i32 s65, s12, s68 // 000000003650: 96C1440C - s_mul_i32 s64, s12, s68 // 000000003654: 9240440C - s_sub_u32 s64, s68, s64 // 000000003658: 80C04044 - s_subb_u32 s65, 0, s65 // 00000000365C: 82C14180 - s_add_u32 s52, s52, s84 // 000000003660: 80345434 - s_addc_u32 s53, s53, s85 // 000000003664: 82355535 - s_sub_u32 s60, s60, s84 // 000000003668: 80BC543C - s_subb_u32 s61, s61, s85 // 00000000366C: 82BD553D - s_cmp_eq_u32 s61, 0 // 000000003670: BF06803D - s_cselect_b32 s54, s60, -1 // 000000003674: 8536C13C - s_mul_hi_i32 s85, s51, s69 // 000000003678: 96D54533 - s_mul_i32 s84, s51, s69 // 00000000367C: 92544533 - s_mul_hi_i32 s67, s12, s69 // 000000003680: 96C3450C - s_mul_i32 s66, s12, s69 // 000000003684: 9242450C - s_sub_u32 s66, s69, s66 // 000000003688: 80C24245 - s_subb_u32 s67, 0, s67 // 00000000368C: 82C34380 - s_add_u32 s56, s56, s84 // 000000003690: 80385438 - s_addc_u32 s57, s57, s85 
// 000000003694: 82395539 - s_sub_u32 s62, s62, s84 // 000000003698: 80BE543E - s_subb_u32 s63, s63, s85 // 00000000369C: 82BF553F - s_cmp_eq_u32 s63, 0 // 0000000036A0: BF06803F - s_cselect_b32 s58, s62, -1 // 0000000036A4: 853AC13E - s_add_u32 s51, s51, 2 // 0000000036A8: 80338233 - s_cmp_eq_u32 s12, 0 // 0000000036AC: BF06800C - s_cbranch_scc1 init_output_buffers // 0000000036B0: BF850092 - s_mov_b32 m0, s46 // 0000000036B4: BEFC002E - buffer_load_dwordx4 v0, s[52:55], 0 offen lds // 0000000036B8: E05D1000 800D0000 - s_add_u32 m0, m0, 0x1040 // 0000000036C0: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s70 offen lds // 0000000036C8: E05D1000 460D0000 - s_add_u32 m0, m0, 0x1040 // 0000000036D0: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s71 offen lds // 0000000036D8: E05D1000 470D0000 - s_add_u32 m0, m0, 0x1040 // 0000000036E0: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s72 offen lds // 0000000036E8: E05D1000 480D0000 - s_add_u32 m0, m0, 0x1040 // 0000000036F0: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s73 offen lds // 0000000036F8: E05D1000 490D0000 - s_add_u32 m0, m0, 0x1040 // 000000003700: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s74 offen lds // 000000003708: E05D1000 4A0D0000 - s_add_u32 m0, m0, 0x1040 // 000000003710: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s75 offen lds // 000000003718: E05D1000 4B0D0000 - s_add_u32 m0, m0, 0x1040 // 000000003720: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s76 offen lds // 000000003728: E05D1000 4C0D0000 - s_mov_b32 m0, s47 // 000000003730: BEFC002F - buffer_load_dwordx4 v1, s[56:59], 0 offen lds // 000000003734: E05D1000 800E0001 - s_add_u32 m0, m0, 0x1040 // 00000000373C: 807CFF7C 00001040 - buffer_load_dwordx4 v1, s[56:59], s77 offen lds // 000000003744: E05D1000 4D0E0001 - s_add_u32 m0, m0, 0x1040 // 00000000374C: 807CFF7C 00001040 - buffer_load_dwordx4 v1, s[56:59], s78 offen lds // 000000003754: E05D1000 4E0E0001 - s_add_u32 m0, m0, 
0x1040 // 00000000375C: 807CFF7C 00001040 - v_accvgpr_write_b32 a0, 0 // 000000003764: D3D94000 18000080 - v_accvgpr_write_b32 a1, 0 // 00000000376C: D3D94001 18000080 - v_accvgpr_write_b32 a2, 0 // 000000003774: D3D94002 18000080 - v_accvgpr_write_b32 a3, 0 // 00000000377C: D3D94003 18000080 - v_accvgpr_write_b32 a4, 0 // 000000003784: D3D94004 18000080 - v_accvgpr_write_b32 a5, 0 // 00000000378C: D3D94005 18000080 - v_accvgpr_write_b32 a6, 0 // 000000003794: D3D94006 18000080 - v_accvgpr_write_b32 a7, 0 // 00000000379C: D3D94007 18000080 - v_accvgpr_write_b32 a8, 0 // 0000000037A4: D3D94008 18000080 - v_accvgpr_write_b32 a9, 0 // 0000000037AC: D3D94009 18000080 - v_accvgpr_write_b32 a10, 0 // 0000000037B4: D3D9400A 18000080 - v_accvgpr_write_b32 a11, 0 // 0000000037BC: D3D9400B 18000080 - v_accvgpr_write_b32 a12, 0 // 0000000037C4: D3D9400C 18000080 - v_accvgpr_write_b32 a13, 0 // 0000000037CC: D3D9400D 18000080 - v_accvgpr_write_b32 a14, 0 // 0000000037D4: D3D9400E 18000080 - v_accvgpr_write_b32 a15, 0 // 0000000037DC: D3D9400F 18000080 - v_mov_b64_e32 v[6:7], 0 // 0000000037E4: 7E0C7080 - v_mov_b64_e32 v[8:9], 0 // 0000000037E8: 7E107080 - v_mfma_f32_32x32x16_bf16 a[16:31], v[6:9], v[6:9], a[0:15] // 0000000037EC: D3B78010 04020D06 - v_mfma_f32_32x32x16_bf16 a[32:47], v[6:9], v[6:9], a[0:15] // 0000000037F4: D3B78020 04020D06 - v_mfma_f32_32x32x16_bf16 a[48:63], v[6:9], v[6:9], a[0:15] // 0000000037FC: D3B78030 04020D06 - v_mfma_f32_32x32x16_bf16 a[64:79], v[6:9], v[6:9], a[0:15] // 000000003804: D3B78040 04020D06 - v_mfma_f32_32x32x16_bf16 a[80:95], v[6:9], v[6:9], a[0:15] // 00000000380C: D3B78050 04020D06 - v_mfma_f32_32x32x16_bf16 a[96:111], v[6:9], v[6:9], a[0:15]// 000000003814: D3B78060 04020D06 - v_mfma_f32_32x32x16_bf16 a[112:127], v[6:9], v[6:9], a[0:15]// 00000000381C: D3B78070 04020D06 - v_mfma_f32_32x32x16_bf16 a[128:143], v[6:9], v[6:9], a[0:15]// 000000003824: D3B78080 04020D06 - buffer_load_dwordx4 v1, s[56:59], s79 offen lds // 00000000382C: 
E05D1000 4F0E0001 - s_add_u32 m0, m0, 0x1040 // 000000003834: 807CFF7C 00001040 - v_mfma_f32_32x32x16_bf16 a[144:159], v[6:9], v[6:9], a[0:15]// 00000000383C: D3B78090 04020D06 - v_mfma_f32_32x32x16_bf16 a[160:175], v[6:9], v[6:9], a[0:15]// 000000003844: D3B780A0 04020D06 - v_mfma_f32_32x32x16_bf16 a[176:191], v[6:9], v[6:9], a[0:15]// 00000000384C: D3B780B0 04020D06 - v_mfma_f32_32x32x16_bf16 a[192:207], v[6:9], v[6:9], a[0:15]// 000000003854: D3B780C0 04020D06 - v_mfma_f32_32x32x16_bf16 a[208:223], v[6:9], v[6:9], a[0:15]// 00000000385C: D3B780D0 04020D06 - v_mfma_f32_32x32x16_bf16 a[224:239], v[6:9], v[6:9], a[0:15]// 000000003864: D3B780E0 04020D06 - v_mfma_f32_32x32x16_bf16 a[240:255], v[6:9], v[6:9], a[0:15]// 00000000386C: D3B780F0 04020D06 - buffer_load_dwordx4 v1, s[56:59], s80 offen lds // 000000003874: E05D1000 500E0001 - s_add_u32 m0, m0, 0x1040 // 00000000387C: 807CFF7C 00001040 - buffer_load_dwordx4 v1, s[56:59], s81 offen lds // 000000003884: E05D1000 510E0001 - s_add_u32 m0, m0, 0x1040 // 00000000388C: 807CFF7C 00001040 - buffer_load_dwordx4 v1, s[56:59], s82 offen lds // 000000003894: E05D1000 520E0001 - s_add_u32 m0, m0, 0x1040 // 00000000389C: 807CFF7C 00001040 - buffer_load_dwordx4 v1, s[56:59], s83 offen lds // 0000000038A4: E05D1000 530E0001 - s_add_u32 s86, s12, 1 // 0000000038AC: 8056810C - s_cmp_eq_u32 s51, s86 // 0000000038B0: BF065633 - s_cselect_b32 s84, s64, s68 // 0000000038B4: 85544440 - s_cselect_b32 s85, s65, 0 // 0000000038B8: 85558041 - s_add_u32 s52, s52, s84 // 0000000038BC: 80345434 - s_addc_u32 s53, s53, s85 // 0000000038C0: 82355535 - s_sub_u32 s60, s60, s84 // 0000000038C4: 80BC543C - s_subb_u32 s61, s61, s85 // 0000000038C8: 82BD553D - s_cmp_eq_u32 s61, 0 // 0000000038CC: BF06803D - s_cselect_b32 s54, s60, -1 // 0000000038D0: 8536C13C - s_add_u32 s86, s12, 1 // 0000000038D4: 8056810C - s_cmp_eq_u32 s51, s86 // 0000000038D8: BF065633 - s_cselect_b32 s84, s66, s69 // 0000000038DC: 85544542 - s_cselect_b32 s85, s67, 0 // 
0000000038E0: 85558043 - s_add_u32 s56, s56, s84 // 0000000038E4: 80385438 - s_addc_u32 s57, s57, s85 // 0000000038E8: 82395539 - s_sub_u32 s62, s62, s84 // 0000000038EC: 80BE543E - s_subb_u32 s63, s63, s85 // 0000000038F0: 82BF553F - s_cmp_eq_u32 s63, 0 // 0000000038F4: BF06803F - s_cselect_b32 s58, s62, -1 // 0000000038F8: 853AC13E - -init_output_buffers: - s_mov_b64 s[16:17], s[28:29] // 0000000038FC: BE90011C - s_mov_b32 s18, 0x80000000 // 000000003900: BE9200FF 80000000 - s_mov_b32 s19, 0x20000 // 000000003908: BE9300FF 00020000 - s_mov_b64 s[20:21], s[30:31] // 000000003910: BE94011E - s_mov_b32 s22, 0x80000000 // 000000003914: BE9600FF 80000000 - s_mov_b32 s23, 0x20000 // 00000000391C: BE9700FF 00020000 - s_mul_i32 s86, 0x100, s3 // 000000003924: 925603FF 00000100 - s_mul_hi_u32 s85, s86, s38 // 00000000392C: 96552656 - s_mul_i32 s84, s86, s38 // 000000003930: 92542656 - s_lshl_b64 s[84:85], s[84:85], s8 // 000000003934: 8ED40854 - s_add_u32 s20, s30, s84 // 000000003938: 8014541E - s_addc_u32 s21, s31, s85 // 00000000393C: 8215551F - s_mul_hi_u32 s85, s86, s36 // 000000003940: 96552456 - s_mul_i32 s84, s86, s36 // 000000003944: 92542456 - s_lshl_b64 s[84:85], s[84:85], s9 // 000000003948: 8ED40954 - s_add_u32 s16, s28, s84 // 00000000394C: 8010541C - s_addc_u32 s17, s29, s85 // 000000003950: 8211551D - s_mul_hi_u32 s85, s4, s39 // 000000003954: 96552704 - s_mul_i32 s84, s4, s39 // 000000003958: 92542704 - s_lshl_b64 s[84:85], s[84:85], s8 // 00000000395C: 8ED40854 - s_add_u32 s20, s20, s84 // 000000003960: 80145414 - s_addc_u32 s21, s21, s85 // 000000003964: 82155515 - s_mul_hi_u32 s85, s4, s37 // 000000003968: 96552504 - s_mul_i32 s84, s4, s37 // 00000000396C: 92542504 - s_lshl_b64 s[84:85], s[84:85], s9 // 000000003970: 8ED40954 - s_add_u32 s16, s16, s84 // 000000003974: 80105410 - s_addc_u32 s17, s17, s85 // 000000003978: 82115511 - s_mul_hi_u32 s85, s24, s6 // 00000000398C: 96550618 - s_mul_i32 s84, s24, s6 // 000000003990: 92540618 - s_sub_u32 s86, 
s25, 1 // 000000003994: 80D68119 - s_mul_i32 s86, s86, s6 // 000000003998: 92560656 - s_mul_hi_u32 s87, s86, s38 // 00000000399C: 96572656 - s_mul_i32 s86, s86, s38 // 0000000039A0: 92562656 - s_add_u32 s84, s84, s86 // 0000000039A4: 80545654 - s_addc_u32 s85, s85, s87 // 0000000039A8: 82555755 - s_sub_u32 s86, s26, 1 // 0000000039AC: 80D6811A - s_mul_i32 s86, s86, s6 // 0000000039B0: 92560656 - s_mul_hi_u32 s87, s86, s39 // 0000000039B4: 96572756 - s_mul_i32 s86, s86, s39 // 0000000039B8: 92562756 - s_add_u32 s84, s84, s86 // 0000000039BC: 80545654 - s_addc_u32 s85, s85, s87 // 0000000039C0: 82555755 - s_lshl_b64 s[84:85], s[84:85], 2 // 0000000039C4: 8ED48254 - s_add_u32 s16, s16, s84 // 0000000039C8: 80105410 - s_addc_u32 s17, s17, s85 // 0000000039CC: 82115511 - s_xor_b32 s46, s48, s46 // 0000000039F0: 882E2E30 - s_xor_b32 s47, s49, s47 // 0000000039F4: 882F2F31 - s_cmp_eq_u32 s12, 1 // 0000000039F8: BF06810C - s_cbranch_scc1 after_prefetch // 0000000039FC: BF850040 - s_mov_b32 m0, s46 // 000000003A00: BEFC002E - buffer_load_dwordx4 v0, s[52:55], 0 offen lds // 000000003A04: E05D1000 800D0000 - s_add_u32 m0, m0, 0x1040 // 000000003A0C: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s70 offen lds // 000000003A14: E05D1000 460D0000 - s_add_u32 m0, m0, 0x1040 // 000000003A1C: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s71 offen lds // 000000003A24: E05D1000 470D0000 - s_add_u32 m0, m0, 0x1040 // 000000003A2C: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s72 offen lds // 000000003A34: E05D1000 480D0000 - s_add_u32 m0, m0, 0x1040 // 000000003A3C: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s73 offen lds // 000000003A44: E05D1000 490D0000 - s_add_u32 m0, m0, 0x1040 // 000000003A4C: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s74 offen lds // 000000003A54: E05D1000 4A0D0000 - s_add_u32 m0, m0, 0x1040 // 000000003A5C: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s75 offen lds // 000000003A64: E05D1000 4B0D0000 - 
s_add_u32 m0, m0, 0x1040 // 000000003A6C: 807CFF7C 00001040 - buffer_load_dwordx4 v0, s[52:55], s76 offen lds // 000000003A74: E05D1000 4C0D0000 - s_mov_b32 m0, s47 // 000000003A7C: BEFC002F - buffer_load_dwordx4 v1, s[56:59], 0 offen lds // 000000003A80: E05D1000 800E0001 - s_add_u32 m0, m0, 0x1040 // 000000003A88: 807CFF7C 00001040 - buffer_load_dwordx4 v1, s[56:59], s77 offen lds // 000000003A90: E05D1000 4D0E0001 - s_add_u32 m0, m0, 0x1040 // 000000003A98: 807CFF7C 00001040 - buffer_load_dwordx4 v1, s[56:59], s78 offen lds // 000000003AA0: E05D1000 4E0E0001 - s_add_u32 m0, m0, 0x1040 // 000000003AA8: 807CFF7C 00001040 - buffer_load_dwordx4 v1, s[56:59], s79 offen lds // 000000003AB0: E05D1000 4F0E0001 - s_add_u32 m0, m0, 0x1040 // 000000003AB8: 807CFF7C 00001040 - buffer_load_dwordx4 v1, s[56:59], s80 offen lds // 000000003AC0: E05D1000 500E0001 - s_add_u32 m0, m0, 0x1040 // 000000003AC8: 807CFF7C 00001040 - buffer_load_dwordx4 v1, s[56:59], s81 offen lds // 000000003AD0: E05D1000 510E0001 - s_add_u32 m0, m0, 0x1040 // 000000003AD8: 807CFF7C 00001040 - buffer_load_dwordx4 v1, s[56:59], s82 offen lds // 000000003AE0: E05D1000 520E0001 - s_add_u32 m0, m0, 0x1040 // 000000003AE8: 807CFF7C 00001040 - buffer_load_dwordx4 v1, s[56:59], s83 offen lds // 000000003AF0: E05D1000 530E0001 - s_xor_b32 s46, s48, s46 // 000000003AF8: 882E2E30 - s_xor_b32 s47, s49, s47 // 000000003AFC: 882F2F31 - -after_prefetch: - s_waitcnt vmcnt(24) // 000000003B00: BF8C4F78 - s_barrier // 000000003B04: BF8A0000 - ds_read_b128 v[4:7], v2 // 000000003B08: D9FE0000 04000002 - ds_read_b128 v[8:11], v2 offset:128 // 000000003B10: D9FE0080 08000002 - ds_read_b128 v[12:15], v2 offset:256 // 000000003B18: D9FE0100 0C000002 - ds_read_b128 v[16:19], v2 offset:384 // 000000003B20: D9FE0180 10000002 - ds_read_b128 v[20:23], v2 offset:512 // 000000003B28: D9FE0200 14000002 - ds_read_b128 v[24:27], v2 offset:640 // 000000003B30: D9FE0280 18000002 - ds_read_b128 v[28:31], v2 offset:768 // 000000003B38: 
D9FE0300 1C000002 - ds_read_b128 v[32:35], v2 offset:896 // 000000003B40: D9FE0380 20000002 - s_waitcnt vmcnt(16) // 000000003B48: BF8C4F70 - s_barrier // 000000003B4C: BF8A0000 - ds_read_b128 v[68:71], v3 // 000000003B50: D9FE0000 44000003 - ds_read_b128 v[72:75], v3 offset:128 // 000000003B58: D9FE0080 48000003 - ds_read_b128 v[76:79], v3 offset:256 // 000000003B60: D9FE0100 4C000003 - ds_read_b128 v[80:83], v3 offset:384 // 000000003B68: D9FE0180 50000003 - ds_read_b128 v[84:87], v3 offset:512 // 000000003B70: D9FE0200 54000003 - ds_read_b128 v[88:91], v3 offset:640 // 000000003B78: D9FE0280 58000003 - ds_read_b128 v[92:95], v3 offset:768 // 000000003B80: D9FE0300 5C000003 - ds_read_b128 v[96:99], v3 offset:896 // 000000003B88: D9FE0380 60000003 - s_waitcnt lgkmcnt(0) // 000000003B90: BF8CC07F - s_cmp_eq_u32 s12, 1 // 000000003B94: BF06810C - s_cbranch_scc1 final_compute // 000000003B98: BF8502E5 - s_cmp_le_u32 s12, 2 // 000000003B9C: BF0B820C - s_cbranch_scc1 loop_epilogue // 000000003BA0: BF85019E - -main_loop: - v_mfma_f32_16x16x32_bf16 a[0:3], v[68:71], v[4:7], a[0:3] // 000000003BA4: D3B58000 04020944 - ds_read_b128 v[36:39], v2 offset:64 // 000000003BAC: D9FE0040 24000002 - v_mfma_f32_16x16x32_bf16 a[4:7], v[68:71], v[8:11], a[4:7] // 000000003BB4: D3B58004 04121144 - s_cmp_eq_u32 s12, s51 // 000000003BBC: BF06330C - s_cselect_b32 s84, s64, s68 // 000000003BC0: 85544440 - v_mfma_f32_16x16x32_bf16 a[8:11], v[68:71], v[12:15], a[8:11]// 000000003BC4: D3B58008 04221944 - ds_read_b128 v[40:43], v2 offset:192 // 000000003BCC: D9FE00C0 28000002 - v_mfma_f32_16x16x32_bf16 a[12:15], v[68:71], v[16:19], a[12:15]// 000000003BD4: D3B5800C 04322144 - s_cselect_b32 s85, s65, 0 // 000000003BDC: 85558041 - s_add_u32 s52, s52, s84 // 000000003BE0: 80345434 - v_mfma_f32_16x16x32_bf16 a[16:19], v[68:71], v[20:23], a[16:19]// 000000003BE4: D3B58010 04422944 - ds_read_b128 v[44:47], v2 offset:320 // 000000003BEC: D9FE0140 2C000002 - v_mfma_f32_16x16x32_bf16 a[20:23], 
v[68:71], v[24:27], a[20:23]// 000000003BF4: D3B58014 04523144 - s_addc_u32 s53, s53, s85 // 000000003BFC: 82355535 - s_sub_u32 s60, s60, s84 // 000000003C00: 80BC543C - v_mfma_f32_16x16x32_bf16 a[24:27], v[68:71], v[28:31], a[24:27]// 000000003C04: D3B58018 04623944 - ds_read_b128 v[48:51], v2 offset:448 // 000000003C0C: D9FE01C0 30000002 - v_mfma_f32_16x16x32_bf16 a[28:31], v[68:71], v[32:35], a[28:31]// 000000003C14: D3B5801C 04724144 - s_subb_u32 s61, s61, s85 // 000000003C1C: 82BD553D - s_cmp_eq_u32 s61, 0 // 000000003C20: BF06803D - v_mfma_f32_16x16x32_bf16 a[32:35], v[72:75], v[4:7], a[32:35]// 000000003C24: D3B58020 04820948 - ds_read_b128 v[52:55], v2 offset:576 // 000000003C2C: D9FE0240 34000002 - v_mfma_f32_16x16x32_bf16 a[36:39], v[72:75], v[8:11], a[36:39]// 000000003C34: D3B58024 04921148 - s_cselect_b32 s54, s60, -1 // 000000003C3C: 8536C13C - s_cmp_eq_u32 s12, s51 // 000000003C40: BF06330C - v_mfma_f32_16x16x32_bf16 a[40:43], v[72:75], v[12:15], a[40:43]// 000000003C44: D3B58028 04A21948 - ds_read_b128 v[56:59], v2 offset:704 // 000000003C4C: D9FE02C0 38000002 - v_mfma_f32_16x16x32_bf16 a[44:47], v[72:75], v[16:19], a[44:47]// 000000003C54: D3B5802C 04B22148 - s_cselect_b32 s84, s66, s69 // 000000003C5C: 85544542 - s_cselect_b32 s85, s67, 0 // 000000003C60: 85558043 - v_mfma_f32_16x16x32_bf16 a[48:51], v[72:75], v[20:23], a[48:51]// 000000003C64: D3B58030 04C22948 - ds_read_b128 v[60:63], v2 offset:832 // 000000003C6C: D9FE0340 3C000002 - v_mfma_f32_16x16x32_bf16 a[52:55], v[72:75], v[24:27], a[52:55]// 000000003C74: D3B58034 04D23148 - s_add_u32 s56, s56, s84 // 000000003C7C: 80385438 - s_addc_u32 s57, s57, s85 // 000000003C80: 82395539 - v_mfma_f32_16x16x32_bf16 a[56:59], v[72:75], v[28:31], a[56:59]// 000000003C84: D3B58038 04E23948 - ds_read_b128 v[64:67], v2 offset:960 // 000000003C8C: D9FE03C0 40000002 - v_mfma_f32_16x16x32_bf16 a[60:63], v[72:75], v[32:35], a[60:63]// 000000003C94: D3B5803C 04F24148 - s_mov_b32 m0, s46 // 000000003C9C: 
BEFC002E - s_sub_u32 s62, s62, s84 // 000000003CA0: 80BE543E - v_mfma_f32_16x16x32_bf16 a[64:67], v[76:79], v[4:7], a[64:67]// 000000003CA4: D3B58040 0502094C - s_subb_u32 s63, s63, s85 // 000000003CAC: 82BF553F - s_cmp_eq_u32 s63, 0 // 000000003CB0: BF06803F - v_mfma_f32_16x16x32_bf16 a[68:71], v[76:79], v[8:11], a[68:71]// 000000003CB4: D3B58044 0512114C - s_cselect_b32 s58, s62, -1 // 000000003CBC: 853AC13E - v_mfma_f32_16x16x32_bf16 a[72:75], v[76:79], v[12:15], a[72:75]// 000000003CC0: D3B58048 0522194C - v_mfma_f32_16x16x32_bf16 a[76:79], v[76:79], v[16:19], a[76:79]// 000000003CC8: D3B5804C 0532214C - v_mfma_f32_16x16x32_bf16 a[80:83], v[76:79], v[20:23], a[80:83]// 000000003CD0: D3B58050 0542294C - s_waitcnt lgkmcnt(0) // 000000003CD8: BF8CC07F - v_mfma_f32_16x16x32_bf16 a[84:87], v[76:79], v[24:27], a[84:87]// 000000003CDC: D3B58054 0552314C - s_barrier // 000000003CE4: BF8A0000 - v_mfma_f32_16x16x32_bf16 a[88:91], v[76:79], v[28:31], a[88:91]// 000000003CE8: D3B58058 0562394C - buffer_load_dwordx4 v0, s[52:55], 0 offen lds // 000000003CF0: E05D1000 800D0000 - v_mfma_f32_16x16x32_bf16 a[92:95], v[76:79], v[32:35], a[92:95]// 000000003CF8: D3B5805C 0572414C - s_add_u32 m0, m0, 0x1040 // 000000003D00: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[96:99], v[80:83], v[4:7], a[96:99]// 000000003D08: D3B58060 05820950 - ds_read_b128 v[100:103], v3 offset:64 // 000000003D10: D9FE0040 64000003 - v_mfma_f32_16x16x32_bf16 a[100:103], v[80:83], v[8:11], a[100:103]// 000000003D18: D3B58064 05921150 - buffer_load_dwordx4 v0, s[52:55], s70 offen lds // 000000003D20: E05D1000 460D0000 - v_mfma_f32_16x16x32_bf16 a[104:107], v[80:83], v[12:15], a[104:107]// 000000003D28: D3B58068 05A21950 - s_add_u32 m0, m0, 0x1040 // 000000003D30: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[108:111], v[80:83], v[16:19], a[108:111]// 000000003D38: D3B5806C 05B22150 - ds_read_b128 v[104:107], v3 offset:192 // 000000003D40: D9FE00C0 68000003 - v_mfma_f32_16x16x32_bf16 a[112:115], 
v[80:83], v[20:23], a[112:115]// 000000003D48: D3B58070 05C22950 - buffer_load_dwordx4 v0, s[52:55], s71 offen lds // 000000003D50: E05D1000 470D0000 - v_mfma_f32_16x16x32_bf16 a[116:119], v[80:83], v[24:27], a[116:119]// 000000003D58: D3B58074 05D23150 - s_add_u32 m0, m0, 0x1040 // 000000003D60: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[120:123], v[80:83], v[28:31], a[120:123]// 000000003D68: D3B58078 05E23950 - ds_read_b128 v[108:111], v3 offset:320 // 000000003D70: D9FE0140 6C000003 - v_mfma_f32_16x16x32_bf16 a[124:127], v[80:83], v[32:35], a[124:127]// 000000003D78: D3B5807C 05F24150 - buffer_load_dwordx4 v0, s[52:55], s72 offen lds // 000000003D80: E05D1000 480D0000 - v_mfma_f32_16x16x32_bf16 a[128:131], v[84:87], v[4:7], a[128:131]// 000000003D88: D3B58080 06020954 - s_add_u32 m0, m0, 0x1040 // 000000003D90: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[132:135], v[84:87], v[8:11], a[132:135]// 000000003D98: D3B58084 06121154 - ds_read_b128 v[112:115], v3 offset:448 // 000000003DA0: D9FE01C0 70000003 - v_mfma_f32_16x16x32_bf16 a[136:139], v[84:87], v[12:15], a[136:139]// 000000003DA8: D3B58088 06221954 - buffer_load_dwordx4 v0, s[52:55], s73 offen lds // 000000003DB0: E05D1000 490D0000 - v_mfma_f32_16x16x32_bf16 a[140:143], v[84:87], v[16:19], a[140:143]// 000000003DB8: D3B5808C 06322154 - s_add_u32 m0, m0, 0x1040 // 000000003DC0: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[144:147], v[84:87], v[20:23], a[144:147]// 000000003DC8: D3B58090 06422954 - ds_read_b128 v[116:119], v3 offset:576 // 000000003DD0: D9FE0240 74000003 - v_mfma_f32_16x16x32_bf16 a[148:151], v[84:87], v[24:27], a[148:151]// 000000003DD8: D3B58094 06523154 - v_mfma_f32_16x16x32_bf16 a[152:155], v[84:87], v[28:31], a[152:155]// 000000003DE0: D3B58098 06623954 - ds_read_b128 v[120:123], v3 offset:704 // 000000003DE8: D9FE02C0 78000003 - v_mfma_f32_16x16x32_bf16 a[156:159], v[84:87], v[32:35], a[156:159]// 000000003DF0: D3B5809C 06724154 - v_mfma_f32_16x16x32_bf16 a[160:163], v[88:91], 
v[4:7], a[160:163]// 000000003DF8: D3B580A0 06820958 - ds_read_b128 v[124:127], v3 offset:832 // 000000003E00: D9FE0340 7C000003 - v_mfma_f32_16x16x32_bf16 a[164:167], v[88:91], v[8:11], a[164:167]// 000000003E08: D3B580A4 06921158 - v_mfma_f32_16x16x32_bf16 a[168:171], v[88:91], v[12:15], a[168:171]// 000000003E10: D3B580A8 06A21958 - ds_read_b128 v[128:131], v3 offset:960 // 000000003E18: D9FE03C0 80000003 - v_mfma_f32_16x16x32_bf16 a[172:175], v[88:91], v[16:19], a[172:175]// 000000003E20: D3B580AC 06B22158 - v_mfma_f32_16x16x32_bf16 a[176:179], v[88:91], v[20:23], a[176:179]// 000000003E28: D3B580B0 06C22958 - v_mfma_f32_16x16x32_bf16 a[180:183], v[88:91], v[24:27], a[180:183]// 000000003E30: D3B580B4 06D23158 - v_mfma_f32_16x16x32_bf16 a[184:187], v[88:91], v[28:31], a[184:187]// 000000003E38: D3B580B8 06E23958 - v_mfma_f32_16x16x32_bf16 a[188:191], v[88:91], v[32:35], a[188:191]// 000000003E40: D3B580BC 06F24158 - v_mfma_f32_16x16x32_bf16 a[192:195], v[92:95], v[4:7], a[192:195]// 000000003E48: D3B580C0 0702095C - v_mfma_f32_16x16x32_bf16 a[196:199], v[92:95], v[8:11], a[196:199]// 000000003E50: D3B580C4 0712115C - v_mfma_f32_16x16x32_bf16 a[200:203], v[92:95], v[12:15], a[200:203]// 000000003E58: D3B580C8 0722195C - s_waitcnt lgkmcnt(0) // 000000003E60: BF8CC07F - v_mfma_f32_16x16x32_bf16 a[204:207], v[92:95], v[16:19], a[204:207]// 000000003E64: D3B580CC 0732215C - s_barrier // 000000003E6C: BF8A0000 - v_mfma_f32_16x16x32_bf16 a[208:211], v[92:95], v[20:23], a[208:211]// 000000003E70: D3B580D0 0742295C - buffer_load_dwordx4 v0, s[52:55], s74 offen lds // 000000003E78: E05D1000 4A0D0000 - v_mfma_f32_16x16x32_bf16 a[212:215], v[92:95], v[24:27], a[212:215]// 000000003E80: D3B580D4 0752315C - s_add_u32 m0, m0, 0x1040 // 000000003E88: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[216:219], v[92:95], v[28:31], a[216:219]// 000000003E90: D3B580D8 0762395C - v_mfma_f32_16x16x32_bf16 a[220:223], v[92:95], v[32:35], a[220:223]// 000000003E98: D3B580DC 0772415C - 
buffer_load_dwordx4 v0, s[52:55], s75 offen lds // 000000003EA0: E05D1000 4B0D0000 - v_mfma_f32_16x16x32_bf16 a[224:227], v[96:99], v[4:7], a[224:227]// 000000003EA8: D3B580E0 07820960 - s_add_u32 m0, m0, 0x1040 // 000000003EB0: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[228:231], v[96:99], v[8:11], a[228:231]// 000000003EB8: D3B580E4 07921160 - v_mfma_f32_16x16x32_bf16 a[232:235], v[96:99], v[12:15], a[232:235]// 000000003EC0: D3B580E8 07A21960 - buffer_load_dwordx4 v0, s[52:55], s76 offen lds // 000000003EC8: E05D1000 4C0D0000 - v_mfma_f32_16x16x32_bf16 a[236:239], v[96:99], v[16:19], a[236:239]// 000000003ED0: D3B580EC 07B22160 - s_mov_b32 m0, s47 // 000000003ED8: BEFC002F - v_mfma_f32_16x16x32_bf16 a[240:243], v[96:99], v[20:23], a[240:243]// 000000003EDC: D3B580F0 07C22960 - v_mfma_f32_16x16x32_bf16 a[244:247], v[96:99], v[24:27], a[244:247]// 000000003EE4: D3B580F4 07D23160 - buffer_load_dwordx4 v1, s[56:59], 0 offen lds // 000000003EEC: E05D1000 800E0001 - v_mfma_f32_16x16x32_bf16 a[248:251], v[96:99], v[28:31], a[248:251]// 000000003EF4: D3B580F8 07E23960 - s_add_u32 m0, m0, 0x1040 // 000000003EFC: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[252:255], v[96:99], v[32:35], a[252:255]// 000000003F04: D3B580FC 07F24160 - v_mfma_f32_16x16x32_bf16 a[0:3], v[100:103], v[36:39], a[0:3]// 000000003F0C: D3B58000 04024964 - buffer_load_dwordx4 v1, s[56:59], s77 offen lds // 000000003F14: E05D1000 4D0E0001 - v_mfma_f32_16x16x32_bf16 a[4:7], v[100:103], v[40:43], a[4:7]// 000000003F1C: D3B58004 04125164 - s_add_u32 m0, m0, 0x1040 // 000000003F24: 807CFF7C 00001040 - s_xor_b32 s46, s48, s46 // 000000003F2C: 882E2E30 - v_mfma_f32_16x16x32_bf16 a[8:11], v[100:103], v[44:47], a[8:11]// 000000003F30: D3B58008 04225964 - v_mfma_f32_16x16x32_bf16 a[12:15], v[100:103], v[48:51], a[12:15]// 000000003F38: D3B5800C 04326164 - v_mfma_f32_16x16x32_bf16 a[16:19], v[100:103], v[52:55], a[16:19]// 000000003F40: D3B58010 04426964 - v_mfma_f32_16x16x32_bf16 a[20:23], v[100:103], 
v[56:59], a[20:23]// 000000003F48: D3B58014 04527164 - v_mfma_f32_16x16x32_bf16 a[24:27], v[100:103], v[60:63], a[24:27]// 000000003F50: D3B58018 04627964 - v_mfma_f32_16x16x32_bf16 a[28:31], v[100:103], v[64:67], a[28:31]// 000000003F58: D3B5801C 04728164 - v_mfma_f32_16x16x32_bf16 a[32:35], v[104:107], v[36:39], a[32:35]// 000000003F60: D3B58020 04824968 - v_mfma_f32_16x16x32_bf16 a[36:39], v[104:107], v[40:43], a[36:39]// 000000003F68: D3B58024 04925168 - v_mfma_f32_16x16x32_bf16 a[40:43], v[104:107], v[44:47], a[40:43]// 000000003F70: D3B58028 04A25968 - v_mfma_f32_16x16x32_bf16 a[44:47], v[104:107], v[48:51], a[44:47]// 000000003F78: D3B5802C 04B26168 - v_mfma_f32_16x16x32_bf16 a[48:51], v[104:107], v[52:55], a[48:51]// 000000003F80: D3B58030 04C26968 - v_mfma_f32_16x16x32_bf16 a[52:55], v[104:107], v[56:59], a[52:55]// 000000003F88: D3B58034 04D27168 - v_mfma_f32_16x16x32_bf16 a[56:59], v[104:107], v[60:63], a[56:59]// 000000003F90: D3B58038 04E27968 - v_mfma_f32_16x16x32_bf16 a[60:63], v[104:107], v[64:67], a[60:63]// 000000003F98: D3B5803C 04F28168 - v_mfma_f32_16x16x32_bf16 a[64:67], v[108:111], v[36:39], a[64:67]// 000000003FA0: D3B58040 0502496C - v_mfma_f32_16x16x32_bf16 a[68:71], v[108:111], v[40:43], a[68:71]// 000000003FA8: D3B58044 0512516C - v_mfma_f32_16x16x32_bf16 a[72:75], v[108:111], v[44:47], a[72:75]// 000000003FB0: D3B58048 0522596C - v_mfma_f32_16x16x32_bf16 a[76:79], v[108:111], v[48:51], a[76:79]// 000000003FB8: D3B5804C 0532616C - v_mfma_f32_16x16x32_bf16 a[80:83], v[108:111], v[52:55], a[80:83]// 000000003FC0: D3B58050 0542696C - v_xor_b32_e32 v2, v132, v2 // 000000003FC8: 2A040584 - v_xor_b32_e32 v3, v133, v3 // 000000003FCC: 2A060785 - v_mfma_f32_16x16x32_bf16 a[84:87], v[108:111], v[56:59], a[84:87]// 000000003FD0: D3B58054 0552716C - buffer_load_dwordx4 v1, s[56:59], s78 offen lds // 000000003FD8: E05D1000 4E0E0001 - v_mfma_f32_16x16x32_bf16 a[88:91], v[108:111], v[60:63], a[88:91]// 000000003FE0: D3B58058 0562796C - s_add_u32 m0, 
m0, 0x1040 // 000000003FE8: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[92:95], v[108:111], v[64:67], a[92:95]// 000000003FF0: D3B5805C 0572816C - buffer_load_dwordx4 v1, s[56:59], s79 offen lds // 000000003FF8: E05D1000 4F0E0001 - v_mfma_f32_16x16x32_bf16 a[96:99], v[112:115], v[36:39], a[96:99]// 000000004000: D3B58060 05824970 - s_add_u32 m0, m0, 0x1040 // 000000004008: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[100:103], v[112:115], v[40:43], a[100:103]// 000000004010: D3B58064 05925170 - buffer_load_dwordx4 v1, s[56:59], s80 offen lds // 000000004018: E05D1000 500E0001 - v_mfma_f32_16x16x32_bf16 a[104:107], v[112:115], v[44:47], a[104:107]// 000000004020: D3B58068 05A25970 - v_mfma_f32_16x16x32_bf16 a[108:111], v[112:115], v[48:51], a[108:111]// 000000004028: D3B5806C 05B26170 - s_waitcnt vmcnt(13) // 000000004030: BF8C0F7D - v_mfma_f32_16x16x32_bf16 a[112:115], v[112:115], v[52:55], a[112:115]// 000000004034: D3B58070 05C26970 - s_barrier // 00000000403C: BF8A0000 - v_mfma_f32_16x16x32_bf16 a[116:119], v[112:115], v[56:59], a[116:119]// 000000004040: D3B58074 05D27170 - ds_read_b128 v[4:7], v2 // 000000004048: D9FE0000 04000002 - v_mfma_f32_16x16x32_bf16 a[120:123], v[112:115], v[60:63], a[120:123]// 000000004050: D3B58078 05E27970 - ds_read_b128 v[8:11], v2 offset:128 // 000000004058: D9FE0080 08000002 - s_add_u32 m0, m0, 0x1040 // 000000004060: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[124:127], v[112:115], v[64:67], a[124:127]// 000000004068: D3B5807C 05F28170 - ds_read_b128 v[12:15], v2 offset:256 // 000000004070: D9FE0100 0C000002 - v_mfma_f32_16x16x32_bf16 a[128:131], v[116:119], v[36:39], a[128:131]// 000000004078: D3B58080 06024974 - buffer_load_dwordx4 v1, s[56:59], s81 offen lds // 000000004080: E05D1000 510E0001 - v_mfma_f32_16x16x32_bf16 a[132:135], v[116:119], v[40:43], a[132:135]// 000000004088: D3B58084 06125174 - ds_read_b128 v[16:19], v2 offset:384 // 000000004090: D9FE0180 10000002 - v_mfma_f32_16x16x32_bf16 a[136:139], 
v[116:119], v[44:47], a[136:139]// 000000004098: D3B58088 06225974 - ds_read_b128 v[20:23], v2 offset:512 // 0000000040A0: D9FE0200 14000002 - s_add_u32 m0, m0, 0x1040 // 0000000040A8: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[140:143], v[116:119], v[48:51], a[140:143]// 0000000040B0: D3B5808C 06326174 - v_mfma_f32_16x16x32_bf16 a[144:147], v[116:119], v[52:55], a[144:147]// 0000000040B8: D3B58090 06426974 - buffer_load_dwordx4 v1, s[56:59], s82 offen lds // 0000000040C0: E05D1000 520E0001 - v_mfma_f32_16x16x32_bf16 a[148:151], v[116:119], v[56:59], a[148:151]// 0000000040C8: D3B58094 06527174 - v_mfma_f32_16x16x32_bf16 a[152:155], v[116:119], v[60:63], a[152:155]// 0000000040D0: D3B58098 06627974 - ds_read_b128 v[24:27], v2 offset:640 // 0000000040D8: D9FE0280 18000002 - s_add_u32 m0, m0, 0x1040 // 0000000040E0: 807CFF7C 00001040 - v_mfma_f32_16x16x32_bf16 a[156:159], v[116:119], v[64:67], a[156:159]// 0000000040E8: D3B5809C 06728174 - ds_read_b128 v[28:31], v2 offset:768 // 0000000040F0: D9FE0300 1C000002 - v_mfma_f32_16x16x32_bf16 a[160:163], v[120:123], v[36:39], a[160:163]// 0000000040F8: D3B580A0 06824978 - ds_read_b128 v[32:35], v2 offset:896 // 000000004100: D9FE0380 20000002 - v_mfma_f32_16x16x32_bf16 a[164:167], v[120:123], v[40:43], a[164:167]// 000000004108: D3B580A4 06925178 - ds_read_b128 v[68:71], v3 // 000000004110: D9FE0000 44000003 - v_mfma_f32_16x16x32_bf16 a[168:171], v[120:123], v[44:47], a[168:171]// 000000004118: D3B580A8 06A25978 - ds_read_b128 v[72:75], v3 offset:128 // 000000004120: D9FE0080 48000003 - v_mfma_f32_16x16x32_bf16 a[172:175], v[120:123], v[48:51], a[172:175]// 000000004128: D3B580AC 06B26178 - v_mfma_f32_16x16x32_bf16 a[176:179], v[120:123], v[52:55], a[176:179]// 000000004130: D3B580B0 06C26978 - v_mfma_f32_16x16x32_bf16 a[180:183], v[120:123], v[56:59], a[180:183]// 000000004138: D3B580B4 06D27178 - ds_read_b128 v[76:79], v3 offset:256 // 000000004140: D9FE0100 4C000003 - v_mfma_f32_16x16x32_bf16 a[184:187], v[120:123], 
v[60:63], a[184:187]// 000000004148: D3B580B8 06E27978 - v_mfma_f32_16x16x32_bf16 a[188:191], v[120:123], v[64:67], a[188:191]// 000000004150: D3B580BC 06F28178 - v_mfma_f32_16x16x32_bf16 a[192:195], v[124:127], v[36:39], a[192:195]// 000000004158: D3B580C0 0702497C - ds_read_b128 v[80:83], v3 offset:384 // 000000004160: D9FE0180 50000003 - v_mfma_f32_16x16x32_bf16 a[196:199], v[124:127], v[40:43], a[196:199]// 000000004168: D3B580C4 0712517C - v_mfma_f32_16x16x32_bf16 a[200:203], v[124:127], v[44:47], a[200:203]// 000000004170: D3B580C8 0722597C - ds_read_b128 v[84:87], v3 offset:512 // 000000004178: D9FE0200 54000003 - v_mfma_f32_16x16x32_bf16 a[204:207], v[124:127], v[48:51], a[204:207]// 000000004180: D3B580CC 0732617C - v_mfma_f32_16x16x32_bf16 a[208:211], v[124:127], v[52:55], a[208:211]// 000000004188: D3B580D0 0742697C - v_mfma_f32_16x16x32_bf16 a[212:215], v[124:127], v[56:59], a[212:215]// 000000004190: D3B580D4 0752717C - ds_read_b128 v[88:91], v3 offset:640 // 000000004198: D9FE0280 58000003 - v_mfma_f32_16x16x32_bf16 a[216:219], v[124:127], v[60:63], a[216:219]// 0000000041A0: D3B580D8 0762797C - v_mfma_f32_16x16x32_bf16 a[220:223], v[124:127], v[64:67], a[220:223]// 0000000041A8: D3B580DC 0772817C - v_mfma_f32_16x16x32_bf16 a[224:227], v[128:131], v[36:39], a[224:227]// 0000000041B0: D3B580E0 07824980 - ds_read_b128 v[92:95], v3 offset:768 // 0000000041B8: D9FE0300 5C000003 - v_mfma_f32_16x16x32_bf16 a[228:231], v[128:131], v[40:43], a[228:231]// 0000000041C0: D3B580E4 07925180 - v_mfma_f32_16x16x32_bf16 a[232:235], v[128:131], v[44:47], a[232:235]// 0000000041C8: D3B580E8 07A25980 - v_mfma_f32_16x16x32_bf16 a[236:239], v[128:131], v[48:51], a[236:239]// 0000000041D0: D3B580EC 07B26180 - ds_read_b128 v[96:99], v3 offset:896 // 0000000041D8: D9FE0380 60000003 - v_mfma_f32_16x16x32_bf16 a[240:243], v[128:131], v[52:55], a[240:243]// 0000000041E0: D3B580F0 07C26980 - buffer_load_dwordx4 v1, s[56:59], s83 offen lds // 0000000041E8: E05D1000 530E0001 - 
v_mfma_f32_16x16x32_bf16 a[244:247], v[128:131], v[56:59], a[244:247]// 0000000041F0: D3B580F4 07D27180 - s_xor_b32 s47, s49, s47 // 0000000041F8: 882F2F31 - s_sub_u32 s12, s12, 1 // 0000000041FC: 808C810C - v_mfma_f32_16x16x32_bf16 a[248:251], v[128:131], v[60:63], a[248:251]// 000000004200: D3B580F8 07E27980 - s_cmp_eq_i32 s12, 2 // 000000004208: BF00820C - s_waitcnt lgkmcnt(0) // 00000000420C: BF8CC07F - v_mfma_f32_16x16x32_bf16 a[252:255], v[128:131], v[64:67], a[252:255]// 000000004210: D3B580FC 07F28180 - s_cbranch_scc0 main_loop // 000000004218: BF84FE62 - -loop_epilogue: - v_mfma_f32_16x16x32_bf16 a[0:3], v[68:71], v[4:7], a[0:3] // 00000000421C: D3B58000 04020944 - ds_read_b128 v[36:39], v2 offset:64 // 000000004224: D9FE0040 24000002 - v_mfma_f32_16x16x32_bf16 a[4:7], v[68:71], v[8:11], a[4:7] // 00000000422C: D3B58004 04121144 - v_mfma_f32_16x16x32_bf16 a[8:11], v[68:71], v[12:15], a[8:11]// 000000004234: D3B58008 04221944 - ds_read_b128 v[100:103], v3 offset:64 // 00000000423C: D9FE0040 64000003 - v_mfma_f32_16x16x32_bf16 a[12:15], v[68:71], v[16:19], a[12:15]// 000000004244: D3B5800C 04322144 - v_mfma_f32_16x16x32_bf16 a[16:19], v[68:71], v[20:23], a[16:19]// 00000000424C: D3B58010 04422944 - ds_read_b128 v[40:43], v2 offset:192 // 000000004254: D9FE00C0 28000002 - v_mfma_f32_16x16x32_bf16 a[20:23], v[68:71], v[24:27], a[20:23]// 00000000425C: D3B58014 04523144 - v_mfma_f32_16x16x32_bf16 a[24:27], v[68:71], v[28:31], a[24:27]// 000000004264: D3B58018 04623944 - ds_read_b128 v[44:47], v2 offset:320 // 00000000426C: D9FE0140 2C000002 - v_mfma_f32_16x16x32_bf16 a[28:31], v[68:71], v[32:35], a[28:31]// 000000004274: D3B5801C 04724144 - v_mfma_f32_16x16x32_bf16 a[32:35], v[72:75], v[4:7], a[32:35]// 00000000427C: D3B58020 04820948 - ds_read_b128 v[48:51], v2 offset:448 // 000000004284: D9FE01C0 30000002 - v_mfma_f32_16x16x32_bf16 a[36:39], v[72:75], v[8:11], a[36:39]// 00000000428C: D3B58024 04921148 - v_mfma_f32_16x16x32_bf16 a[40:43], v[72:75], v[12:15], 
a[40:43]// 000000004294: D3B58028 04A21948 - ds_read_b128 v[52:55], v2 offset:576 // 00000000429C: D9FE0240 34000002 - v_mfma_f32_16x16x32_bf16 a[44:47], v[72:75], v[16:19], a[44:47]// 0000000042A4: D3B5802C 04B22148 - v_mfma_f32_16x16x32_bf16 a[48:51], v[72:75], v[20:23], a[48:51]// 0000000042AC: D3B58030 04C22948 - ds_read_b128 v[56:59], v2 offset:704 // 0000000042B4: D9FE02C0 38000002 - v_mfma_f32_16x16x32_bf16 a[52:55], v[72:75], v[24:27], a[52:55]// 0000000042BC: D3B58034 04D23148 - v_mfma_f32_16x16x32_bf16 a[56:59], v[72:75], v[28:31], a[56:59]// 0000000042C4: D3B58038 04E23948 - ds_read_b128 v[60:63], v2 offset:832 // 0000000042CC: D9FE0340 3C000002 - v_mfma_f32_16x16x32_bf16 a[60:63], v[72:75], v[32:35], a[60:63]// 0000000042D4: D3B5803C 04F24148 - v_mfma_f32_16x16x32_bf16 a[64:67], v[76:79], v[4:7], a[64:67]// 0000000042DC: D3B58040 0502094C - ds_read_b128 v[64:67], v2 offset:960 // 0000000042E4: D9FE03C0 40000002 - v_mfma_f32_16x16x32_bf16 a[68:71], v[76:79], v[8:11], a[68:71]// 0000000042EC: D3B58044 0512114C - v_mfma_f32_16x16x32_bf16 a[72:75], v[76:79], v[12:15], a[72:75]// 0000000042F4: D3B58048 0522194C - ds_read_b128 v[104:107], v3 offset:192 // 0000000042FC: D9FE00C0 68000003 - v_mfma_f32_16x16x32_bf16 a[76:79], v[76:79], v[16:19], a[76:79]// 000000004304: D3B5804C 0532214C - v_mfma_f32_16x16x32_bf16 a[80:83], v[76:79], v[20:23], a[80:83]// 00000000430C: D3B58050 0542294C - ds_read_b128 v[108:111], v3 offset:320 // 000000004314: D9FE0140 6C000003 - v_mfma_f32_16x16x32_bf16 a[84:87], v[76:79], v[24:27], a[84:87]// 00000000431C: D3B58054 0552314C - v_mfma_f32_16x16x32_bf16 a[88:91], v[76:79], v[28:31], a[88:91]// 000000004324: D3B58058 0562394C - ds_read_b128 v[112:115], v3 offset:448 // 00000000432C: D9FE01C0 70000003 - v_mfma_f32_16x16x32_bf16 a[92:95], v[76:79], v[32:35], a[92:95]// 000000004334: D3B5805C 0572414C - v_mfma_f32_16x16x32_bf16 a[96:99], v[80:83], v[4:7], a[96:99]// 00000000433C: D3B58060 05820950 - ds_read_b128 v[116:119], v3 
offset:576 // 000000004344: D9FE0240 74000003 - v_mfma_f32_16x16x32_bf16 a[100:103], v[80:83], v[8:11], a[100:103]// 00000000434C: D3B58064 05921150 - v_mfma_f32_16x16x32_bf16 a[104:107], v[80:83], v[12:15], a[104:107]// 000000004354: D3B58068 05A21950 - ds_read_b128 v[120:123], v3 offset:704 // 00000000435C: D9FE02C0 78000003 - v_mfma_f32_16x16x32_bf16 a[108:111], v[80:83], v[16:19], a[108:111]// 000000004364: D3B5806C 05B22150 - v_mfma_f32_16x16x32_bf16 a[112:115], v[80:83], v[20:23], a[112:115]// 00000000436C: D3B58070 05C22950 - ds_read_b128 v[124:127], v3 offset:832 // 000000004374: D9FE0340 7C000003 - v_mfma_f32_16x16x32_bf16 a[116:119], v[80:83], v[24:27], a[116:119]// 00000000437C: D3B58074 05D23150 - v_mfma_f32_16x16x32_bf16 a[120:123], v[80:83], v[28:31], a[120:123]// 000000004384: D3B58078 05E23950 - ds_read_b128 v[128:131], v3 offset:960 // 00000000438C: D9FE03C0 80000003 - v_mfma_f32_16x16x32_bf16 a[124:127], v[80:83], v[32:35], a[124:127]// 000000004394: D3B5807C 05F24150 - v_mfma_f32_16x16x32_bf16 a[128:131], v[84:87], v[4:7], a[128:131]// 00000000439C: D3B58080 06020954 - v_mfma_f32_16x16x32_bf16 a[132:135], v[84:87], v[8:11], a[132:135]// 0000000043A4: D3B58084 06121154 - v_mfma_f32_16x16x32_bf16 a[136:139], v[84:87], v[12:15], a[136:139]// 0000000043AC: D3B58088 06221954 - v_mfma_f32_16x16x32_bf16 a[140:143], v[84:87], v[16:19], a[140:143]// 0000000043B4: D3B5808C 06322154 - v_mfma_f32_16x16x32_bf16 a[144:147], v[84:87], v[20:23], a[144:147]// 0000000043BC: D3B58090 06422954 - v_mfma_f32_16x16x32_bf16 a[148:151], v[84:87], v[24:27], a[148:151]// 0000000043C4: D3B58094 06523154 - v_mfma_f32_16x16x32_bf16 a[152:155], v[84:87], v[28:31], a[152:155]// 0000000043CC: D3B58098 06623954 - v_mfma_f32_16x16x32_bf16 a[156:159], v[84:87], v[32:35], a[156:159]// 0000000043D4: D3B5809C 06724154 - v_mfma_f32_16x16x32_bf16 a[160:163], v[88:91], v[4:7], a[160:163]// 0000000043DC: D3B580A0 06820958 - v_mfma_f32_16x16x32_bf16 a[164:167], v[88:91], v[8:11], 
a[164:167]// 0000000043E4: D3B580A4 06921158 - v_mfma_f32_16x16x32_bf16 a[168:171], v[88:91], v[12:15], a[168:171]// 0000000043EC: D3B580A8 06A21958 - v_mfma_f32_16x16x32_bf16 a[172:175], v[88:91], v[16:19], a[172:175]// 0000000043F4: D3B580AC 06B22158 - v_mfma_f32_16x16x32_bf16 a[176:179], v[88:91], v[20:23], a[176:179]// 0000000043FC: D3B580B0 06C22958 - v_mfma_f32_16x16x32_bf16 a[180:183], v[88:91], v[24:27], a[180:183]// 000000004404: D3B580B4 06D23158 - v_mfma_f32_16x16x32_bf16 a[184:187], v[88:91], v[28:31], a[184:187]// 00000000440C: D3B580B8 06E23958 - v_mfma_f32_16x16x32_bf16 a[188:191], v[88:91], v[32:35], a[188:191]// 000000004414: D3B580BC 06F24158 - v_mfma_f32_16x16x32_bf16 a[192:195], v[92:95], v[4:7], a[192:195]// 00000000441C: D3B580C0 0702095C - v_mfma_f32_16x16x32_bf16 a[196:199], v[92:95], v[8:11], a[196:199]// 000000004424: D3B580C4 0712115C - v_mfma_f32_16x16x32_bf16 a[200:203], v[92:95], v[12:15], a[200:203]// 00000000442C: D3B580C8 0722195C - v_mfma_f32_16x16x32_bf16 a[204:207], v[92:95], v[16:19], a[204:207]// 000000004434: D3B580CC 0732215C - v_mfma_f32_16x16x32_bf16 a[208:211], v[92:95], v[20:23], a[208:211]// 00000000443C: D3B580D0 0742295C - v_mfma_f32_16x16x32_bf16 a[212:215], v[92:95], v[24:27], a[212:215]// 000000004444: D3B580D4 0752315C - v_mfma_f32_16x16x32_bf16 a[216:219], v[92:95], v[28:31], a[216:219]// 00000000444C: D3B580D8 0762395C - v_mfma_f32_16x16x32_bf16 a[220:223], v[92:95], v[32:35], a[220:223]// 000000004454: D3B580DC 0772415C - v_mfma_f32_16x16x32_bf16 a[224:227], v[96:99], v[4:7], a[224:227]// 00000000445C: D3B580E0 07820960 - v_mfma_f32_16x16x32_bf16 a[228:231], v[96:99], v[8:11], a[228:231]// 000000004464: D3B580E4 07921160 - v_mfma_f32_16x16x32_bf16 a[232:235], v[96:99], v[12:15], a[232:235]// 00000000446C: D3B580E8 07A21960 - v_mfma_f32_16x16x32_bf16 a[236:239], v[96:99], v[16:19], a[236:239]// 000000004474: D3B580EC 07B22160 - v_mfma_f32_16x16x32_bf16 a[240:243], v[96:99], v[20:23], a[240:243]// 00000000447C: 
D3B580F0 07C22960 - v_mfma_f32_16x16x32_bf16 a[244:247], v[96:99], v[24:27], a[244:247]// 000000004484: D3B580F4 07D23160 - v_mfma_f32_16x16x32_bf16 a[248:251], v[96:99], v[28:31], a[248:251]// 00000000448C: D3B580F8 07E23960 - v_xor_b32_e32 v2, v132, v2 // 000000004494: 2A040584 - v_xor_b32_e32 v3, v133, v3 // 000000004498: 2A060785 - v_mfma_f32_16x16x32_bf16 a[252:255], v[96:99], v[32:35], a[252:255]// 00000000449C: D3B580FC 07F24160 - s_waitcnt lgkmcnt(0) // 0000000044A4: BF8CC07F - v_mfma_f32_16x16x32_bf16 a[0:3], v[100:103], v[36:39], a[0:3]// 0000000044A8: D3B58000 04024964 - v_mfma_f32_16x16x32_bf16 a[4:7], v[100:103], v[40:43], a[4:7]// 0000000044B0: D3B58004 04125164 - v_mfma_f32_16x16x32_bf16 a[8:11], v[100:103], v[44:47], a[8:11]// 0000000044B8: D3B58008 04225964 - v_mfma_f32_16x16x32_bf16 a[12:15], v[100:103], v[48:51], a[12:15]// 0000000044C0: D3B5800C 04326164 - v_mfma_f32_16x16x32_bf16 a[16:19], v[100:103], v[52:55], a[16:19]// 0000000044C8: D3B58010 04426964 - v_mfma_f32_16x16x32_bf16 a[20:23], v[100:103], v[56:59], a[20:23]// 0000000044D0: D3B58014 04527164 - v_mfma_f32_16x16x32_bf16 a[24:27], v[100:103], v[60:63], a[24:27]// 0000000044D8: D3B58018 04627964 - v_mfma_f32_16x16x32_bf16 a[28:31], v[100:103], v[64:67], a[28:31]// 0000000044E0: D3B5801C 04728164 - v_mfma_f32_16x16x32_bf16 a[32:35], v[104:107], v[36:39], a[32:35]// 0000000044E8: D3B58020 04824968 - v_mfma_f32_16x16x32_bf16 a[36:39], v[104:107], v[40:43], a[36:39]// 0000000044F0: D3B58024 04925168 - v_mfma_f32_16x16x32_bf16 a[40:43], v[104:107], v[44:47], a[40:43]// 0000000044F8: D3B58028 04A25968 - v_mfma_f32_16x16x32_bf16 a[44:47], v[104:107], v[48:51], a[44:47]// 000000004500: D3B5802C 04B26168 - v_mfma_f32_16x16x32_bf16 a[48:51], v[104:107], v[52:55], a[48:51]// 000000004508: D3B58030 04C26968 - v_mfma_f32_16x16x32_bf16 a[52:55], v[104:107], v[56:59], a[52:55]// 000000004510: D3B58034 04D27168 - v_mfma_f32_16x16x32_bf16 a[56:59], v[104:107], v[60:63], a[56:59]// 000000004518: D3B58038 
04E27968 - v_mfma_f32_16x16x32_bf16 a[60:63], v[104:107], v[64:67], a[60:63]// 000000004520: D3B5803C 04F28168 - v_mfma_f32_16x16x32_bf16 a[64:67], v[108:111], v[36:39], a[64:67]// 000000004528: D3B58040 0502496C - v_mfma_f32_16x16x32_bf16 a[68:71], v[108:111], v[40:43], a[68:71]// 000000004530: D3B58044 0512516C - v_mfma_f32_16x16x32_bf16 a[72:75], v[108:111], v[44:47], a[72:75]// 000000004538: D3B58048 0522596C - v_mfma_f32_16x16x32_bf16 a[76:79], v[108:111], v[48:51], a[76:79]// 000000004540: D3B5804C 0532616C - v_mfma_f32_16x16x32_bf16 a[80:83], v[108:111], v[52:55], a[80:83]// 000000004548: D3B58050 0542696C - v_mfma_f32_16x16x32_bf16 a[84:87], v[108:111], v[56:59], a[84:87]// 000000004550: D3B58054 0552716C - v_mfma_f32_16x16x32_bf16 a[88:91], v[108:111], v[60:63], a[88:91]// 000000004558: D3B58058 0562796C - v_mfma_f32_16x16x32_bf16 a[92:95], v[108:111], v[64:67], a[92:95]// 000000004560: D3B5805C 0572816C - v_mfma_f32_16x16x32_bf16 a[96:99], v[112:115], v[36:39], a[96:99]// 000000004568: D3B58060 05824970 - v_mfma_f32_16x16x32_bf16 a[100:103], v[112:115], v[40:43], a[100:103]// 000000004570: D3B58064 05925170 - v_mfma_f32_16x16x32_bf16 a[104:107], v[112:115], v[44:47], a[104:107]// 000000004578: D3B58068 05A25970 - v_mfma_f32_16x16x32_bf16 a[108:111], v[112:115], v[48:51], a[108:111]// 000000004580: D3B5806C 05B26170 - v_mfma_f32_16x16x32_bf16 a[112:115], v[112:115], v[52:55], a[112:115]// 000000004588: D3B58070 05C26970 - v_mfma_f32_16x16x32_bf16 a[116:119], v[112:115], v[56:59], a[116:119]// 000000004590: D3B58074 05D27170 - v_mfma_f32_16x16x32_bf16 a[120:123], v[112:115], v[60:63], a[120:123]// 000000004598: D3B58078 05E27970 - v_mfma_f32_16x16x32_bf16 a[124:127], v[112:115], v[64:67], a[124:127]// 0000000045A0: D3B5807C 05F28170 - v_mfma_f32_16x16x32_bf16 a[128:131], v[116:119], v[36:39], a[128:131]// 0000000045A8: D3B58080 06024974 - v_mfma_f32_16x16x32_bf16 a[132:135], v[116:119], v[40:43], a[132:135]// 0000000045B0: D3B58084 06125174 - 
v_mfma_f32_16x16x32_bf16 a[136:139], v[116:119], v[44:47], a[136:139]// 0000000045B8: D3B58088 06225974 - v_mfma_f32_16x16x32_bf16 a[140:143], v[116:119], v[48:51], a[140:143]// 0000000045C0: D3B5808C 06326174 - v_mfma_f32_16x16x32_bf16 a[144:147], v[116:119], v[52:55], a[144:147]// 0000000045C8: D3B58090 06426974 - v_mfma_f32_16x16x32_bf16 a[148:151], v[116:119], v[56:59], a[148:151]// 0000000045D0: D3B58094 06527174 - v_mfma_f32_16x16x32_bf16 a[152:155], v[116:119], v[60:63], a[152:155]// 0000000045D8: D3B58098 06627974 - v_mfma_f32_16x16x32_bf16 a[156:159], v[116:119], v[64:67], a[156:159]// 0000000045E0: D3B5809C 06728174 - v_mfma_f32_16x16x32_bf16 a[160:163], v[120:123], v[36:39], a[160:163]// 0000000045E8: D3B580A0 06824978 - v_mfma_f32_16x16x32_bf16 a[164:167], v[120:123], v[40:43], a[164:167]// 0000000045F0: D3B580A4 06925178 - s_waitcnt vmcnt(0) // 0000000045F8: BF8C0F70 - v_mfma_f32_16x16x32_bf16 a[168:171], v[120:123], v[44:47], a[168:171]// 0000000045FC: D3B580A8 06A25978 - s_barrier // 000000004604: BF8A0000 - v_mfma_f32_16x16x32_bf16 a[172:175], v[120:123], v[48:51], a[172:175]// 000000004608: D3B580AC 06B26178 - ds_read_b128 v[4:7], v2 // 000000004610: D9FE0000 04000002 - v_mfma_f32_16x16x32_bf16 a[176:179], v[120:123], v[52:55], a[176:179]// 000000004618: D3B580B0 06C26978 - ds_read_b128 v[68:71], v3 // 000000004620: D9FE0000 44000003 - v_mfma_f32_16x16x32_bf16 a[180:183], v[120:123], v[56:59], a[180:183]// 000000004628: D3B580B4 06D27178 - ds_read_b128 v[8:11], v2 offset:128 // 000000004630: D9FE0080 08000002 - v_mfma_f32_16x16x32_bf16 a[184:187], v[120:123], v[60:63], a[184:187]// 000000004638: D3B580B8 06E27978 - ds_read_b128 v[12:15], v2 offset:256 // 000000004640: D9FE0100 0C000002 - v_mfma_f32_16x16x32_bf16 a[188:191], v[120:123], v[64:67], a[188:191]// 000000004648: D3B580BC 06F28178 - ds_read_b128 v[16:19], v2 offset:384 // 000000004650: D9FE0180 10000002 - v_mfma_f32_16x16x32_bf16 a[192:195], v[124:127], v[36:39], a[192:195]// 000000004658: 
D3B580C0 0702497C - ds_read_b128 v[20:23], v2 offset:512 // 000000004660: D9FE0200 14000002 - v_mfma_f32_16x16x32_bf16 a[196:199], v[124:127], v[40:43], a[196:199]// 000000004668: D3B580C4 0712517C - ds_read_b128 v[24:27], v2 offset:640 // 000000004670: D9FE0280 18000002 - v_mfma_f32_16x16x32_bf16 a[200:203], v[124:127], v[44:47], a[200:203]// 000000004678: D3B580C8 0722597C - ds_read_b128 v[28:31], v2 offset:768 // 000000004680: D9FE0300 1C000002 - v_mfma_f32_16x16x32_bf16 a[204:207], v[124:127], v[48:51], a[204:207]// 000000004688: D3B580CC 0732617C - ds_read_b128 v[32:35], v2 offset:896 // 000000004690: D9FE0380 20000002 - v_mfma_f32_16x16x32_bf16 a[208:211], v[124:127], v[52:55], a[208:211]// 000000004698: D3B580D0 0742697C - ds_read_b128 v[72:75], v3 offset:128 // 0000000046A0: D9FE0080 48000003 - v_mfma_f32_16x16x32_bf16 a[212:215], v[124:127], v[56:59], a[212:215]// 0000000046A8: D3B580D4 0752717C - ds_read_b128 v[76:79], v3 offset:256 // 0000000046B0: D9FE0100 4C000003 - v_mfma_f32_16x16x32_bf16 a[216:219], v[124:127], v[60:63], a[216:219]// 0000000046B8: D3B580D8 0762797C - ds_read_b128 v[80:83], v3 offset:384 // 0000000046C0: D9FE0180 50000003 - v_mfma_f32_16x16x32_bf16 a[220:223], v[124:127], v[64:67], a[220:223]// 0000000046C8: D3B580DC 0772817C - ds_read_b128 v[84:87], v3 offset:512 // 0000000046D0: D9FE0200 54000003 - v_mfma_f32_16x16x32_bf16 a[224:227], v[128:131], v[36:39], a[224:227]// 0000000046D8: D3B580E0 07824980 - ds_read_b128 v[88:91], v3 offset:640 // 0000000046E0: D9FE0280 58000003 - v_mfma_f32_16x16x32_bf16 a[228:231], v[128:131], v[40:43], a[228:231]// 0000000046E8: D3B580E4 07925180 - ds_read_b128 v[92:95], v3 offset:768 // 0000000046F0: D9FE0300 5C000003 - v_mfma_f32_16x16x32_bf16 a[232:235], v[128:131], v[44:47], a[232:235]// 0000000046F8: D3B580E8 07A25980 - ds_read_b128 v[96:99], v3 offset:896 // 000000004700: D9FE0380 60000003 - v_mfma_f32_16x16x32_bf16 a[236:239], v[128:131], v[48:51], a[236:239]// 000000004708: D3B580EC 07B26180 - 
v_mfma_f32_16x16x32_bf16 a[240:243], v[128:131], v[52:55], a[240:243]// 000000004710: D3B580F0 07C26980 - v_mfma_f32_16x16x32_bf16 a[244:247], v[128:131], v[56:59], a[244:247]// 000000004718: D3B580F4 07D27180 - v_mfma_f32_16x16x32_bf16 a[248:251], v[128:131], v[60:63], a[248:251]// 000000004720: D3B580F8 07E27980 - v_mfma_f32_16x16x32_bf16 a[252:255], v[128:131], v[64:67], a[252:255]// 000000004728: D3B580FC 07F28180 - -final_compute: - s_and_b32 s8, s50, 0x3fff // 000000004730: 8608FF32 00003FFF - s_and_b32 s84, 0xff, s24 // 000000004750: 865418FF 000000FF - s_add_u32 s85, -1, s14 // 000000004758: 80550EC1 - s_cmp_ge_u32 s2, s85 // 00000000475C: BF095502 - s_cselect_b32 s84, s84, 0 // 000000004760: 85548054 - s_and_b32 s84, 0xff, s25 // 00000000476C: 865419FF 000000FF - s_add_u32 s85, -1, s15 // 000000004774: 80550FC1 - s_cmp_ge_u32 s3, s85 // 000000004778: BF095503 - s_cselect_b32 s84, s84, 0 // 00000000477C: 85548054 - v_mfma_f32_16x16x32_bf16 a[0:3], v[68:71], v[4:7], a[0:3] // 000000004788: D3B58000 04020944 - ds_read_b128 v[36:39], v2 offset:64 // 000000004790: D9FE0040 24000002 - v_mfma_f32_16x16x32_bf16 a[4:7], v[68:71], v[8:11], a[4:7] // 000000004798: D3B58004 04121144 - v_mfma_f32_16x16x32_bf16 a[8:11], v[68:71], v[12:15], a[8:11]// 0000000047A0: D3B58008 04221944 - ds_read_b128 v[100:103], v3 offset:64 // 0000000047A8: D9FE0040 64000003 - v_mfma_f32_16x16x32_bf16 a[12:15], v[68:71], v[16:19], a[12:15]// 0000000047B0: D3B5800C 04322144 - v_mfma_f32_16x16x32_bf16 a[16:19], v[68:71], v[20:23], a[16:19]// 0000000047B8: D3B58010 04422944 - ds_read_b128 v[40:43], v2 offset:192 // 0000000047C0: D9FE00C0 28000002 - v_mfma_f32_16x16x32_bf16 a[20:23], v[68:71], v[24:27], a[20:23]// 0000000047C8: D3B58014 04523144 - v_mfma_f32_16x16x32_bf16 a[24:27], v[68:71], v[28:31], a[24:27]// 0000000047D0: D3B58018 04623944 - ds_read_b128 v[44:47], v2 offset:320 // 0000000047D8: D9FE0140 2C000002 - v_mfma_f32_16x16x32_bf16 a[28:31], v[68:71], v[32:35], a[28:31]// 
0000000047E0: D3B5801C 04724144 - v_mfma_f32_16x16x32_bf16 a[32:35], v[72:75], v[4:7], a[32:35]// 0000000047E8: D3B58020 04820948 - ds_read_b128 v[48:51], v2 offset:448 // 0000000047F0: D9FE01C0 30000002 - v_mfma_f32_16x16x32_bf16 a[36:39], v[72:75], v[8:11], a[36:39]// 0000000047F8: D3B58024 04921148 - v_mfma_f32_16x16x32_bf16 a[40:43], v[72:75], v[12:15], a[40:43]// 000000004800: D3B58028 04A21948 - ds_read_b128 v[52:55], v2 offset:576 // 000000004808: D9FE0240 34000002 - v_mfma_f32_16x16x32_bf16 a[44:47], v[72:75], v[16:19], a[44:47]// 000000004810: D3B5802C 04B22148 - v_mfma_f32_16x16x32_bf16 a[48:51], v[72:75], v[20:23], a[48:51]// 000000004818: D3B58030 04C22948 - ds_read_b128 v[56:59], v2 offset:704 // 000000004820: D9FE02C0 38000002 - v_mfma_f32_16x16x32_bf16 a[52:55], v[72:75], v[24:27], a[52:55]// 000000004828: D3B58034 04D23148 - v_mfma_f32_16x16x32_bf16 a[56:59], v[72:75], v[28:31], a[56:59]// 000000004830: D3B58038 04E23948 - ds_read_b128 v[60:63], v2 offset:832 // 000000004838: D9FE0340 3C000002 - v_mfma_f32_16x16x32_bf16 a[60:63], v[72:75], v[32:35], a[60:63]// 000000004840: D3B5803C 04F24148 - v_mfma_f32_16x16x32_bf16 a[64:67], v[76:79], v[4:7], a[64:67]// 000000004848: D3B58040 0502094C - ds_read_b128 v[64:67], v2 offset:960 // 000000004850: D9FE03C0 40000002 - v_mfma_f32_16x16x32_bf16 a[68:71], v[76:79], v[8:11], a[68:71]// 000000004858: D3B58044 0512114C - v_mfma_f32_16x16x32_bf16 a[72:75], v[76:79], v[12:15], a[72:75]// 000000004860: D3B58048 0522194C - ds_read_b128 v[104:107], v3 offset:192 // 000000004868: D9FE00C0 68000003 - v_mfma_f32_16x16x32_bf16 a[76:79], v[76:79], v[16:19], a[76:79]// 000000004870: D3B5804C 0532214C - v_mfma_f32_16x16x32_bf16 a[80:83], v[76:79], v[20:23], a[80:83]// 000000004878: D3B58050 0542294C - ds_read_b128 v[108:111], v3 offset:320 // 000000004880: D9FE0140 6C000003 - v_mfma_f32_16x16x32_bf16 a[84:87], v[76:79], v[24:27], a[84:87]// 000000004888: D3B58054 0552314C - v_mfma_f32_16x16x32_bf16 a[88:91], v[76:79], 
v[28:31], a[88:91]// 000000004890: D3B58058 0562394C - ds_read_b128 v[112:115], v3 offset:448 // 000000004898: D9FE01C0 70000003 - v_mfma_f32_16x16x32_bf16 a[92:95], v[76:79], v[32:35], a[92:95]// 0000000048A0: D3B5805C 0572414C - v_mfma_f32_16x16x32_bf16 a[96:99], v[80:83], v[4:7], a[96:99]// 0000000048A8: D3B58060 05820950 - ds_read_b128 v[116:119], v3 offset:576 // 0000000048B0: D9FE0240 74000003 - v_mfma_f32_16x16x32_bf16 a[100:103], v[80:83], v[8:11], a[100:103]// 0000000048B8: D3B58064 05921150 - v_mfma_f32_16x16x32_bf16 a[104:107], v[80:83], v[12:15], a[104:107]// 0000000048C0: D3B58068 05A21950 - ds_read_b128 v[120:123], v3 offset:704 // 0000000048C8: D9FE02C0 78000003 - v_mfma_f32_16x16x32_bf16 a[108:111], v[80:83], v[16:19], a[108:111]// 0000000048D0: D3B5806C 05B22150 - v_mfma_f32_16x16x32_bf16 a[112:115], v[80:83], v[20:23], a[112:115]// 0000000048D8: D3B58070 05C22950 - ds_read_b128 v[124:127], v3 offset:832 // 0000000048E0: D9FE0340 7C000003 - v_mfma_f32_16x16x32_bf16 a[116:119], v[80:83], v[24:27], a[116:119]// 0000000048E8: D3B58074 05D23150 - v_mfma_f32_16x16x32_bf16 a[120:123], v[80:83], v[28:31], a[120:123]// 0000000048F0: D3B58078 05E23950 - ds_read_b128 v[128:131], v3 offset:960 // 0000000048F8: D9FE03C0 80000003 - v_mfma_f32_16x16x32_bf16 a[124:127], v[80:83], v[32:35], a[124:127]// 000000004900: D3B5807C 05F24150 - v_mfma_f32_16x16x32_bf16 a[128:131], v[84:87], v[4:7], a[128:131]// 000000004908: D3B58080 06020954 - v_mfma_f32_16x16x32_bf16 a[132:135], v[84:87], v[8:11], a[132:135]// 000000004910: D3B58084 06121154 - v_mfma_f32_16x16x32_bf16 a[136:139], v[84:87], v[12:15], a[136:139]// 000000004918: D3B58088 06221954 - v_mfma_f32_16x16x32_bf16 a[140:143], v[84:87], v[16:19], a[140:143]// 000000004920: D3B5808C 06322154 - v_mfma_f32_16x16x32_bf16 a[144:147], v[84:87], v[20:23], a[144:147]// 000000004928: D3B58090 06422954 - v_mfma_f32_16x16x32_bf16 a[148:151], v[84:87], v[24:27], a[148:151]// 000000004930: D3B58094 06523154 - 
v_mfma_f32_16x16x32_bf16 a[152:155], v[84:87], v[28:31], a[152:155]// 000000004938: D3B58098 06623954 - v_mfma_f32_16x16x32_bf16 a[156:159], v[84:87], v[32:35], a[156:159]// 000000004940: D3B5809C 06724154 - v_mfma_f32_16x16x32_bf16 a[160:163], v[88:91], v[4:7], a[160:163]// 000000004948: D3B580A0 06820958 - v_mfma_f32_16x16x32_bf16 a[164:167], v[88:91], v[8:11], a[164:167]// 000000004950: D3B580A4 06921158 - v_mfma_f32_16x16x32_bf16 a[168:171], v[88:91], v[12:15], a[168:171]// 000000004958: D3B580A8 06A21958 - v_mfma_f32_16x16x32_bf16 a[172:175], v[88:91], v[16:19], a[172:175]// 000000004960: D3B580AC 06B22158 - v_mfma_f32_16x16x32_bf16 a[176:179], v[88:91], v[20:23], a[176:179]// 000000004968: D3B580B0 06C22958 - v_mfma_f32_16x16x32_bf16 a[180:183], v[88:91], v[24:27], a[180:183]// 000000004970: D3B580B4 06D23158 - v_mfma_f32_16x16x32_bf16 a[184:187], v[88:91], v[28:31], a[184:187]// 000000004978: D3B580B8 06E23958 - v_mfma_f32_16x16x32_bf16 a[188:191], v[88:91], v[32:35], a[188:191]// 000000004980: D3B580BC 06F24158 - v_mfma_f32_16x16x32_bf16 a[192:195], v[92:95], v[4:7], a[192:195]// 000000004988: D3B580C0 0702095C - v_mfma_f32_16x16x32_bf16 a[196:199], v[92:95], v[8:11], a[196:199]// 000000004990: D3B580C4 0712115C - v_mfma_f32_16x16x32_bf16 a[200:203], v[92:95], v[12:15], a[200:203]// 000000004998: D3B580C8 0722195C - v_mfma_f32_16x16x32_bf16 a[204:207], v[92:95], v[16:19], a[204:207]// 0000000049A0: D3B580CC 0732215C - v_mfma_f32_16x16x32_bf16 a[208:211], v[92:95], v[20:23], a[208:211]// 0000000049A8: D3B580D0 0742295C - v_mfma_f32_16x16x32_bf16 a[212:215], v[92:95], v[24:27], a[212:215]// 0000000049B0: D3B580D4 0752315C - v_mfma_f32_16x16x32_bf16 a[216:219], v[92:95], v[28:31], a[216:219]// 0000000049B8: D3B580D8 0762395C - v_mfma_f32_16x16x32_bf16 a[220:223], v[92:95], v[32:35], a[220:223]// 0000000049C0: D3B580DC 0772415C - v_mfma_f32_16x16x32_bf16 a[224:227], v[96:99], v[4:7], a[224:227]// 0000000049C8: D3B580E0 07820960 - v_mfma_f32_16x16x32_bf16 
a[228:231], v[96:99], v[8:11], a[228:231]// 0000000049D0: D3B580E4 07921160 - v_mfma_f32_16x16x32_bf16 a[232:235], v[96:99], v[12:15], a[232:235]// 0000000049D8: D3B580E8 07A21960 - v_mfma_f32_16x16x32_bf16 a[236:239], v[96:99], v[16:19], a[236:239]// 0000000049E0: D3B580EC 07B22160 - v_mfma_f32_16x16x32_bf16 a[240:243], v[96:99], v[20:23], a[240:243]// 0000000049E8: D3B580F0 07C22960 - v_mfma_f32_16x16x32_bf16 a[244:247], v[96:99], v[24:27], a[244:247]// 0000000049F0: D3B580F4 07D23160 - v_mfma_f32_16x16x32_bf16 a[248:251], v[96:99], v[28:31], a[248:251]// 0000000049F8: D3B580F8 07E23960 - v_mfma_f32_16x16x32_bf16 a[252:255], v[96:99], v[32:35], a[252:255]// 000000004A00: D3B580FC 07F24160 - s_waitcnt lgkmcnt(0) // 000000004A08: BF8CC07F - v_mfma_f32_16x16x32_bf16 a[0:3], v[100:103], v[36:39], a[0:3]// 000000004A0C: D3B58000 04024964 - v_mfma_f32_16x16x32_bf16 a[4:7], v[100:103], v[40:43], a[4:7]// 000000004A14: D3B58004 04125164 - v_mfma_f32_16x16x32_bf16 a[8:11], v[100:103], v[44:47], a[8:11]// 000000004A1C: D3B58008 04225964 - v_mfma_f32_16x16x32_bf16 a[12:15], v[100:103], v[48:51], a[12:15]// 000000004A24: D3B5800C 04326164 - v_mfma_f32_16x16x32_bf16 a[16:19], v[100:103], v[52:55], a[16:19]// 000000004A2C: D3B58010 04426964 - v_mfma_f32_16x16x32_bf16 a[20:23], v[100:103], v[56:59], a[20:23]// 000000004A34: D3B58014 04527164 - v_mfma_f32_16x16x32_bf16 a[24:27], v[100:103], v[60:63], a[24:27]// 000000004A3C: D3B58018 04627964 - v_mfma_f32_16x16x32_bf16 a[28:31], v[100:103], v[64:67], a[28:31]// 000000004A44: D3B5801C 04728164 - v_mfma_f32_16x16x32_bf16 a[32:35], v[104:107], v[36:39], a[32:35]// 000000004A4C: D3B58020 04824968 - v_mfma_f32_16x16x32_bf16 a[36:39], v[104:107], v[40:43], a[36:39]// 000000004A54: D3B58024 04925168 - v_mfma_f32_16x16x32_bf16 a[40:43], v[104:107], v[44:47], a[40:43]// 000000004A5C: D3B58028 04A25968 - v_mfma_f32_16x16x32_bf16 a[44:47], v[104:107], v[48:51], a[44:47]// 000000004A64: D3B5802C 04B26168 - v_mfma_f32_16x16x32_bf16 a[48:51], 
v[104:107], v[52:55], a[48:51]// 000000004A6C: D3B58030 04C26968 - v_mfma_f32_16x16x32_bf16 a[52:55], v[104:107], v[56:59], a[52:55]// 000000004A74: D3B58034 04D27168 - v_mfma_f32_16x16x32_bf16 a[56:59], v[104:107], v[60:63], a[56:59]// 000000004A7C: D3B58038 04E27968 - v_mfma_f32_16x16x32_bf16 a[60:63], v[104:107], v[64:67], a[60:63]// 000000004A84: D3B5803C 04F28168 - v_mfma_f32_16x16x32_bf16 a[64:67], v[108:111], v[36:39], a[64:67]// 000000004A8C: D3B58040 0502496C - v_mfma_f32_16x16x32_bf16 a[68:71], v[108:111], v[40:43], a[68:71]// 000000004A94: D3B58044 0512516C - v_mfma_f32_16x16x32_bf16 a[72:75], v[108:111], v[44:47], a[72:75]// 000000004A9C: D3B58048 0522596C - v_mfma_f32_16x16x32_bf16 a[76:79], v[108:111], v[48:51], a[76:79]// 000000004AA4: D3B5804C 0532616C - v_mfma_f32_16x16x32_bf16 a[80:83], v[108:111], v[52:55], a[80:83]// 000000004AAC: D3B58050 0542696C - v_mfma_f32_16x16x32_bf16 a[84:87], v[108:111], v[56:59], a[84:87]// 000000004AB4: D3B58054 0552716C - v_mfma_f32_16x16x32_bf16 a[88:91], v[108:111], v[60:63], a[88:91]// 000000004ABC: D3B58058 0562796C - v_mfma_f32_16x16x32_bf16 a[92:95], v[108:111], v[64:67], a[92:95]// 000000004AC4: D3B5805C 0572816C - v_mfma_f32_16x16x32_bf16 a[96:99], v[112:115], v[36:39], a[96:99]// 000000004ACC: D3B58060 05824970 - v_mfma_f32_16x16x32_bf16 a[100:103], v[112:115], v[40:43], a[100:103]// 000000004AD4: D3B58064 05925170 - v_mfma_f32_16x16x32_bf16 a[104:107], v[112:115], v[44:47], a[104:107]// 000000004ADC: D3B58068 05A25970 - v_mfma_f32_16x16x32_bf16 a[108:111], v[112:115], v[48:51], a[108:111]// 000000004AE4: D3B5806C 05B26170 - v_mfma_f32_16x16x32_bf16 a[112:115], v[112:115], v[52:55], a[112:115]// 000000004AEC: D3B58070 05C26970 - v_mfma_f32_16x16x32_bf16 a[116:119], v[112:115], v[56:59], a[116:119]// 000000004AF4: D3B58074 05D27170 - v_mfma_f32_16x16x32_bf16 a[120:123], v[112:115], v[60:63], a[120:123]// 000000004AFC: D3B58078 05E27970 - v_mfma_f32_16x16x32_bf16 a[124:127], v[112:115], v[64:67], a[124:127]// 
000000004B04: D3B5807C 05F28170 - v_mfma_f32_16x16x32_bf16 a[128:131], v[116:119], v[36:39], a[128:131]// 000000004B0C: D3B58080 06024974 - v_mfma_f32_16x16x32_bf16 a[132:135], v[116:119], v[40:43], a[132:135]// 000000004B14: D3B58084 06125174 - v_mfma_f32_16x16x32_bf16 a[136:139], v[116:119], v[44:47], a[136:139]// 000000004B1C: D3B58088 06225974 - v_mfma_f32_16x16x32_bf16 a[140:143], v[116:119], v[48:51], a[140:143]// 000000004B24: D3B5808C 06326174 - v_mfma_f32_16x16x32_bf16 a[144:147], v[116:119], v[52:55], a[144:147]// 000000004B2C: D3B58090 06426974 - v_mfma_f32_16x16x32_bf16 a[148:151], v[116:119], v[56:59], a[148:151]// 000000004B34: D3B58094 06527174 - v_mfma_f32_16x16x32_bf16 a[152:155], v[116:119], v[60:63], a[152:155]// 000000004B3C: D3B58098 06627974 - v_mfma_f32_16x16x32_bf16 a[156:159], v[116:119], v[64:67], a[156:159]// 000000004B44: D3B5809C 06728174 - v_mfma_f32_16x16x32_bf16 a[160:163], v[120:123], v[36:39], a[160:163]// 000000004B4C: D3B580A0 06824978 - v_mfma_f32_16x16x32_bf16 a[164:167], v[120:123], v[40:43], a[164:167]// 000000004B54: D3B580A4 06925178 - v_mfma_f32_16x16x32_bf16 a[168:171], v[120:123], v[44:47], a[168:171]// 000000004B5C: D3B580A8 06A25978 - v_mfma_f32_16x16x32_bf16 a[172:175], v[120:123], v[48:51], a[172:175]// 000000004B64: D3B580AC 06B26178 - v_mfma_f32_16x16x32_bf16 a[176:179], v[120:123], v[52:55], a[176:179]// 000000004B6C: D3B580B0 06C26978 - v_mfma_f32_16x16x32_bf16 a[180:183], v[120:123], v[56:59], a[180:183]// 000000004B74: D3B580B4 06D27178 - v_mfma_f32_16x16x32_bf16 a[184:187], v[120:123], v[60:63], a[184:187]// 000000004B7C: D3B580B8 06E27978 - v_mfma_f32_16x16x32_bf16 a[188:191], v[120:123], v[64:67], a[188:191]// 000000004B84: D3B580BC 06F28178 - v_mfma_f32_16x16x32_bf16 a[192:195], v[124:127], v[36:39], a[192:195]// 000000004B8C: D3B580C0 0702497C - v_mfma_f32_16x16x32_bf16 a[196:199], v[124:127], v[40:43], a[196:199]// 000000004B94: D3B580C4 0712517C - v_mfma_f32_16x16x32_bf16 a[200:203], v[124:127], 
v[44:47], a[200:203]// 000000004B9C: D3B580C8 0722597C - v_mfma_f32_16x16x32_bf16 a[204:207], v[124:127], v[48:51], a[204:207]// 000000004BA4: D3B580CC 0732617C - v_mfma_f32_16x16x32_bf16 a[208:211], v[124:127], v[52:55], a[208:211]// 000000004BAC: D3B580D0 0742697C - v_mfma_f32_16x16x32_bf16 a[212:215], v[124:127], v[56:59], a[212:215]// 000000004BB4: D3B580D4 0752717C - v_mfma_f32_16x16x32_bf16 a[216:219], v[124:127], v[60:63], a[216:219]// 000000004BBC: D3B580D8 0762797C - v_mfma_f32_16x16x32_bf16 a[220:223], v[124:127], v[64:67], a[220:223]// 000000004BC4: D3B580DC 0772817C - v_mfma_f32_16x16x32_bf16 a[224:227], v[128:131], v[36:39], a[224:227]// 000000004BCC: D3B580E0 07824980 - v_mfma_f32_16x16x32_bf16 a[228:231], v[128:131], v[40:43], a[228:231]// 000000004BD4: D3B580E4 07925180 - v_mfma_f32_16x16x32_bf16 a[232:235], v[128:131], v[44:47], a[232:235]// 000000004BDC: D3B580E8 07A25980 - v_mfma_f32_16x16x32_bf16 a[236:239], v[128:131], v[48:51], a[236:239]// 000000004BE4: D3B580EC 07B26180 - v_mfma_f32_16x16x32_bf16 a[240:243], v[128:131], v[52:55], a[240:243]// 000000004BEC: D3B580F0 07C26980 - v_mfma_f32_16x16x32_bf16 a[244:247], v[128:131], v[56:59], a[244:247]// 000000004BF4: D3B580F4 07D27180 - v_mfma_f32_16x16x32_bf16 a[248:251], v[128:131], v[60:63], a[248:251]// 000000004BFC: D3B580F8 07E27980 - v_mfma_f32_16x16x32_bf16 a[252:255], v[128:131], v[64:67], a[252:255]// 000000004C04: D3B580FC 07F28180 - - v_lshrrev_b32_e32 v4, 6, v134 // 000000004C0C: 20090C86 - v_lshrrev_b32_e32 v5, 1, v4 // 000000004C10: 200A0881 - v_mul_lo_u32 v5, 16, v5 // 000000004C14: D2850005 00020A90 - v_and_b32_e32 v1, 63, v134 // 000000004C1C: 26030CBF - v_lshrrev_b32_e32 v1, 4, v1 // 000000004C20: 20020284 - v_lshlrev_b32_e32 v1, 2, v1 // 000000004C24: 24020282 - v_add_lshl_u32 v1, v5, v1, 3 // 000000004C28: D1FE0001 020E0305 - v_mul_lo_u32 v2, v1, s38 // 000000004C30: D2850002 00004D01 - v_mul_lo_u32 v3, v1, s36 // 000000004C38: D2850003 00004901 - v_and_b32_e32 v0, 1, v4 // 
000000004C40: 26000881 - v_mul_lo_u32 v0, 16, v0 // 000000004C44: D2850000 00020090 - v_and_b32_e32 v5, 15, v134 // 000000004C4C: 260B0C8F - v_add_lshl_u32 v0, v5, v0, 3 // 000000004C50: D1FE0000 020E0105 - s_mul_i32 s8, 0x100, s2 // 000000004C58: 920802FF 00000100 - v_add_u32_e32 v0, s8, v0 // 000000004C60: 68000008 - s_mul_i32 s8, 0x100, s3 // 000000004C64: 920803FF 00000100 - v_add_u32_e32 v1, s8, v1 // 000000004C6C: 68020208 - - v_add_lshl_u32 v11, v3, v0, 1 // 000000004C70: D1FE000B 02060103 - v_accvgpr_read_b32 v16, a0 // 000000004C78: D3D84010 18000100 - v_accvgpr_read_b32 v17, a4 // 000000004C80: D3D84011 18000104 - v_accvgpr_read_b32 v18, a8 // 000000004C88: D3D84012 18000108 - v_accvgpr_read_b32 v19, a12 // 000000004C90: D3D84013 1800010C - v_accvgpr_read_b32 v20, a16 // 000000004C98: D3D84014 18000110 - v_accvgpr_read_b32 v21, a20 // 000000004CA0: D3D84015 18000114 - v_accvgpr_read_b32 v22, a24 // 000000004CA8: D3D84016 18000118 - v_accvgpr_read_b32 v23, a28 // 000000004CB0: D3D84017 1800011C - v_accvgpr_read_b32 v24, a32 // 000000004CB8: D3D84018 18000120 - v_accvgpr_read_b32 v25, a36 // 000000004CC0: D3D84019 18000124 - v_accvgpr_read_b32 v26, a40 // 000000004CC8: D3D8401A 18000128 - v_accvgpr_read_b32 v27, a44 // 000000004CD0: D3D8401B 1800012C - v_accvgpr_read_b32 v28, a48 // 000000004CD8: D3D8401C 18000130 - v_accvgpr_read_b32 v29, a52 // 000000004CE0: D3D8401D 18000134 - v_accvgpr_read_b32 v30, a56 // 000000004CE8: D3D8401E 18000138 - v_accvgpr_read_b32 v31, a60 // 000000004CF0: D3D8401F 1800013C - v_accvgpr_read_b32 v32, a64 // 000000004CF8: D3D84020 18000140 - v_accvgpr_read_b32 v33, a68 // 000000004D00: D3D84021 18000144 - v_accvgpr_read_b32 v34, a72 // 000000004D08: D3D84022 18000148 - v_accvgpr_read_b32 v35, a76 // 000000004D10: D3D84023 1800014C - v_accvgpr_read_b32 v36, a80 // 000000004D18: D3D84024 18000150 - v_accvgpr_read_b32 v37, a84 // 000000004D20: D3D84025 18000154 - v_accvgpr_read_b32 v38, a88 // 000000004D28: D3D84026 18000158 - 
v_accvgpr_read_b32 v39, a92 // 000000004D30: D3D84027 1800015C - v_accvgpr_read_b32 v40, a96 // 000000004D38: D3D84028 18000160 - v_accvgpr_read_b32 v41, a100 // 000000004D40: D3D84029 18000164 - v_accvgpr_read_b32 v42, a104 // 000000004D48: D3D8402A 18000168 - v_accvgpr_read_b32 v43, a108 // 000000004D50: D3D8402B 1800016C - v_accvgpr_read_b32 v44, a112 // 000000004D58: D3D8402C 18000170 - v_accvgpr_read_b32 v45, a116 // 000000004D60: D3D8402D 18000174 - v_accvgpr_read_b32 v46, a120 // 000000004D68: D3D8402E 18000178 - v_accvgpr_read_b32 v47, a124 // 000000004D70: D3D8402F 1800017C - v_accvgpr_read_b32 v48, a128 // 000000004D78: D3D84030 18000180 - v_accvgpr_read_b32 v49, a132 // 000000004D80: D3D84031 18000184 - v_accvgpr_read_b32 v50, a136 // 000000004D88: D3D84032 18000188 - v_accvgpr_read_b32 v51, a140 // 000000004D90: D3D84033 1800018C - v_accvgpr_read_b32 v52, a144 // 000000004D98: D3D84034 18000190 - v_accvgpr_read_b32 v53, a148 // 000000004DA0: D3D84035 18000194 - v_accvgpr_read_b32 v54, a152 // 000000004DA8: D3D84036 18000198 - v_accvgpr_read_b32 v55, a156 // 000000004DB0: D3D84037 1800019C - v_accvgpr_read_b32 v56, a160 // 000000004DB8: D3D84038 180001A0 - v_accvgpr_read_b32 v57, a164 // 000000004DC0: D3D84039 180001A4 - v_accvgpr_read_b32 v58, a168 // 000000004DC8: D3D8403A 180001A8 - v_accvgpr_read_b32 v59, a172 // 000000004DD0: D3D8403B 180001AC - v_accvgpr_read_b32 v60, a176 // 000000004DD8: D3D8403C 180001B0 - v_accvgpr_read_b32 v61, a180 // 000000004DE0: D3D8403D 180001B4 - v_accvgpr_read_b32 v62, a184 // 000000004DE8: D3D8403E 180001B8 - v_accvgpr_read_b32 v63, a188 // 000000004DF0: D3D8403F 180001BC - v_accvgpr_read_b32 v64, a192 // 000000004DF8: D3D84040 180001C0 - v_accvgpr_read_b32 v65, a196 // 000000004E00: D3D84041 180001C4 - v_accvgpr_read_b32 v66, a200 // 000000004E08: D3D84042 180001C8 - v_accvgpr_read_b32 v67, a204 // 000000004E10: D3D84043 180001CC - v_accvgpr_read_b32 v68, a208 // 000000004E18: D3D84044 180001D0 - v_accvgpr_read_b32 
v69, a212 // 000000004E20: D3D84045 180001D4 - v_accvgpr_read_b32 v70, a216 // 000000004E28: D3D84046 180001D8 - v_accvgpr_read_b32 v71, a220 // 000000004E30: D3D84047 180001DC - v_accvgpr_read_b32 v72, a224 // 000000004E38: D3D84048 180001E0 - v_accvgpr_read_b32 v73, a228 // 000000004E40: D3D84049 180001E4 - v_accvgpr_read_b32 v74, a232 // 000000004E48: D3D8404A 180001E8 - v_accvgpr_read_b32 v75, a236 // 000000004E50: D3D8404B 180001EC - v_accvgpr_read_b32 v76, a240 // 000000004E58: D3D8404C 180001F0 - v_accvgpr_read_b32 v77, a244 // 000000004E60: D3D8404D 180001F4 - v_accvgpr_read_b32 v78, a248 // 000000004E68: D3D8404E 180001F8 - v_accvgpr_read_b32 v79, a252 // 000000004E70: D3D8404F 180001FC - v_accvgpr_read_b32 v80, a1 // 000000004E78: D3D84050 18000101 - v_accvgpr_read_b32 v81, a5 // 000000004E80: D3D84051 18000105 - v_accvgpr_read_b32 v82, a9 // 000000004E88: D3D84052 18000109 - v_accvgpr_read_b32 v83, a13 // 000000004E90: D3D84053 1800010D - v_accvgpr_read_b32 v84, a17 // 000000004E98: D3D84054 18000111 - v_accvgpr_read_b32 v85, a21 // 000000004EA0: D3D84055 18000115 - v_accvgpr_read_b32 v86, a25 // 000000004EA8: D3D84056 18000119 - v_accvgpr_read_b32 v87, a29 // 000000004EB0: D3D84057 1800011D - v_accvgpr_read_b32 v88, a33 // 000000004EB8: D3D84058 18000121 - v_accvgpr_read_b32 v89, a37 // 000000004EC0: D3D84059 18000125 - v_accvgpr_read_b32 v90, a41 // 000000004EC8: D3D8405A 18000129 - v_accvgpr_read_b32 v91, a45 // 000000004ED0: D3D8405B 1800012D - v_accvgpr_read_b32 v92, a49 // 000000004ED8: D3D8405C 18000131 - v_accvgpr_read_b32 v93, a53 // 000000004EE0: D3D8405D 18000135 - v_accvgpr_read_b32 v94, a57 // 000000004EE8: D3D8405E 18000139 - v_accvgpr_read_b32 v95, a61 // 000000004EF0: D3D8405F 1800013D - v_accvgpr_read_b32 v96, a65 // 000000004EF8: D3D84060 18000141 - v_accvgpr_read_b32 v97, a69 // 000000004F00: D3D84061 18000145 - v_accvgpr_read_b32 v98, a73 // 000000004F08: D3D84062 18000149 - v_accvgpr_read_b32 v99, a77 // 000000004F10: D3D84063 
1800014D - v_accvgpr_read_b32 v100, a81 // 000000004F18: D3D84064 18000151 - v_accvgpr_read_b32 v101, a85 // 000000004F20: D3D84065 18000155 - v_accvgpr_read_b32 v102, a89 // 000000004F28: D3D84066 18000159 - v_accvgpr_read_b32 v103, a93 // 000000004F30: D3D84067 1800015D - v_accvgpr_read_b32 v104, a97 // 000000004F38: D3D84068 18000161 - v_accvgpr_read_b32 v105, a101 // 000000004F40: D3D84069 18000165 - v_accvgpr_read_b32 v106, a105 // 000000004F48: D3D8406A 18000169 - v_accvgpr_read_b32 v107, a109 // 000000004F50: D3D8406B 1800016D - v_accvgpr_read_b32 v108, a113 // 000000004F58: D3D8406C 18000171 - v_accvgpr_read_b32 v109, a117 // 000000004F60: D3D8406D 18000175 - v_accvgpr_read_b32 v110, a121 // 000000004F68: D3D8406E 18000179 - v_accvgpr_read_b32 v111, a125 // 000000004F70: D3D8406F 1800017D - v_accvgpr_read_b32 v112, a129 // 000000004F78: D3D84070 18000181 - v_accvgpr_read_b32 v113, a133 // 000000004F80: D3D84071 18000185 - v_accvgpr_read_b32 v114, a137 // 000000004F88: D3D84072 18000189 - v_accvgpr_read_b32 v115, a141 // 000000004F90: D3D84073 1800018D - v_accvgpr_read_b32 v116, a145 // 000000004F98: D3D84074 18000191 - v_accvgpr_read_b32 v117, a149 // 000000004FA0: D3D84075 18000195 - v_accvgpr_read_b32 v118, a153 // 000000004FA8: D3D84076 18000199 - v_accvgpr_read_b32 v119, a157 // 000000004FB0: D3D84077 1800019D - v_accvgpr_read_b32 v120, a161 // 000000004FB8: D3D84078 180001A1 - v_accvgpr_read_b32 v121, a165 // 000000004FC0: D3D84079 180001A5 - v_accvgpr_read_b32 v122, a169 // 000000004FC8: D3D8407A 180001A9 - v_accvgpr_read_b32 v123, a173 // 000000004FD0: D3D8407B 180001AD - v_accvgpr_read_b32 v124, a177 // 000000004FD8: D3D8407C 180001B1 - v_accvgpr_read_b32 v125, a181 // 000000004FE0: D3D8407D 180001B5 - v_accvgpr_read_b32 v126, a185 // 000000004FE8: D3D8407E 180001B9 - v_accvgpr_read_b32 v127, a189 // 000000004FF0: D3D8407F 180001BD - v_accvgpr_read_b32 v136, a193 // 000000004FF8: D3D84088 180001C1 - v_accvgpr_read_b32 v137, a197 // 000000005000: 
D3D84089 180001C5 - v_accvgpr_read_b32 v138, a201 // 000000005008: D3D8408A 180001C9 - v_accvgpr_read_b32 v139, a205 // 000000005010: D3D8408B 180001CD - v_accvgpr_read_b32 v140, a209 // 000000005018: D3D8408C 180001D1 - v_accvgpr_read_b32 v141, a213 // 000000005020: D3D8408D 180001D5 - v_accvgpr_read_b32 v142, a217 // 000000005028: D3D8408E 180001D9 - v_accvgpr_read_b32 v143, a221 // 000000005030: D3D8408F 180001DD - v_accvgpr_read_b32 v144, a225 // 000000005038: D3D84090 180001E1 - v_accvgpr_read_b32 v145, a229 // 000000005040: D3D84091 180001E5 - v_accvgpr_read_b32 v146, a233 // 000000005048: D3D84092 180001E9 - v_accvgpr_read_b32 v147, a237 // 000000005050: D3D84093 180001ED - v_accvgpr_read_b32 v148, a241 // 000000005058: D3D84094 180001F1 - v_accvgpr_read_b32 v149, a245 // 000000005060: D3D84095 180001F5 - v_accvgpr_read_b32 v150, a249 // 000000005068: D3D84096 180001F9 - v_accvgpr_read_b32 v151, a253 // 000000005070: D3D84097 180001FD - v_accvgpr_read_b32 v152, a2 // 000000005078: D3D84098 18000102 - v_accvgpr_read_b32 v153, a6 // 000000005080: D3D84099 18000106 - v_accvgpr_read_b32 v154, a10 // 000000005088: D3D8409A 1800010A - v_accvgpr_read_b32 v155, a14 // 000000005090: D3D8409B 1800010E - v_accvgpr_read_b32 v156, a18 // 000000005098: D3D8409C 18000112 - v_accvgpr_read_b32 v157, a22 // 0000000050A0: D3D8409D 18000116 - v_accvgpr_read_b32 v158, a26 // 0000000050A8: D3D8409E 1800011A - v_accvgpr_read_b32 v159, a30 // 0000000050B0: D3D8409F 1800011E - v_accvgpr_read_b32 v160, a34 // 0000000050B8: D3D840A0 18000122 - v_accvgpr_read_b32 v161, a38 // 0000000050C0: D3D840A1 18000126 - v_accvgpr_read_b32 v162, a42 // 0000000050C8: D3D840A2 1800012A - v_accvgpr_read_b32 v163, a46 // 0000000050D0: D3D840A3 1800012E - v_accvgpr_read_b32 v164, a50 // 0000000050D8: D3D840A4 18000132 - v_accvgpr_read_b32 v165, a54 // 0000000050E0: D3D840A5 18000136 - v_accvgpr_read_b32 v166, a58 // 0000000050E8: D3D840A6 1800013A - v_accvgpr_read_b32 v167, a62 // 0000000050F0: 
D3D840A7 1800013E - v_accvgpr_read_b32 v168, a66 // 0000000050F8: D3D840A8 18000142 - v_accvgpr_read_b32 v169, a70 // 000000005100: D3D840A9 18000146 - v_accvgpr_read_b32 v170, a74 // 000000005108: D3D840AA 1800014A - v_accvgpr_read_b32 v171, a78 // 000000005110: D3D840AB 1800014E - v_accvgpr_read_b32 v172, a82 // 000000005118: D3D840AC 18000152 - v_accvgpr_read_b32 v173, a86 // 000000005120: D3D840AD 18000156 - v_accvgpr_read_b32 v174, a90 // 000000005128: D3D840AE 1800015A - v_accvgpr_read_b32 v175, a94 // 000000005130: D3D840AF 1800015E - v_accvgpr_read_b32 v176, a98 // 000000005138: D3D840B0 18000162 - v_accvgpr_read_b32 v177, a102 // 000000005140: D3D840B1 18000166 - v_accvgpr_read_b32 v178, a106 // 000000005148: D3D840B2 1800016A - v_accvgpr_read_b32 v179, a110 // 000000005150: D3D840B3 1800016E - v_accvgpr_read_b32 v180, a114 // 000000005158: D3D840B4 18000172 - v_accvgpr_read_b32 v181, a118 // 000000005160: D3D840B5 18000176 - v_accvgpr_read_b32 v182, a122 // 000000005168: D3D840B6 1800017A - v_accvgpr_read_b32 v183, a126 // 000000005170: D3D840B7 1800017E - v_accvgpr_read_b32 v184, a130 // 000000005178: D3D840B8 18000182 - v_accvgpr_read_b32 v185, a134 // 000000005180: D3D840B9 18000186 - v_accvgpr_read_b32 v186, a138 // 000000005188: D3D840BA 1800018A - v_accvgpr_read_b32 v187, a142 // 000000005190: D3D840BB 1800018E - v_accvgpr_read_b32 v188, a146 // 000000005198: D3D840BC 18000192 - v_accvgpr_read_b32 v189, a150 // 0000000051A0: D3D840BD 18000196 - v_accvgpr_read_b32 v190, a154 // 0000000051A8: D3D840BE 1800019A - v_accvgpr_read_b32 v191, a158 // 0000000051B0: D3D840BF 1800019E - v_accvgpr_read_b32 v192, a162 // 0000000051B8: D3D840C0 180001A2 - v_accvgpr_read_b32 v193, a166 // 0000000051C0: D3D840C1 180001A6 - v_accvgpr_read_b32 v194, a170 // 0000000051C8: D3D840C2 180001AA - v_accvgpr_read_b32 v195, a174 // 0000000051D0: D3D840C3 180001AE - v_accvgpr_read_b32 v196, a178 // 0000000051D8: D3D840C4 180001B2 - v_accvgpr_read_b32 v197, a182 // 
0000000051E0: D3D840C5 180001B6 - v_accvgpr_read_b32 v198, a186 // 0000000051E8: D3D840C6 180001BA - v_accvgpr_read_b32 v199, a190 // 0000000051F0: D3D840C7 180001BE - v_accvgpr_read_b32 v200, a194 // 0000000051F8: D3D840C8 180001C2 - v_accvgpr_read_b32 v201, a198 // 000000005200: D3D840C9 180001C6 - v_accvgpr_read_b32 v202, a202 // 000000005208: D3D840CA 180001CA - v_accvgpr_read_b32 v203, a206 // 000000005210: D3D840CB 180001CE - v_accvgpr_read_b32 v204, a210 // 000000005218: D3D840CC 180001D2 - v_accvgpr_read_b32 v205, a214 // 000000005220: D3D840CD 180001D6 - v_accvgpr_read_b32 v206, a218 // 000000005228: D3D840CE 180001DA - v_accvgpr_read_b32 v207, a222 // 000000005230: D3D840CF 180001DE - v_accvgpr_read_b32 v208, a226 // 000000005238: D3D840D0 180001E2 - v_accvgpr_read_b32 v209, a230 // 000000005240: D3D840D1 180001E6 - v_accvgpr_read_b32 v210, a234 // 000000005248: D3D840D2 180001EA - v_accvgpr_read_b32 v211, a238 // 000000005250: D3D840D3 180001EE - v_accvgpr_read_b32 v212, a242 // 000000005258: D3D840D4 180001F2 - v_accvgpr_read_b32 v213, a246 // 000000005260: D3D840D5 180001F6 - v_accvgpr_read_b32 v214, a250 // 000000005268: D3D840D6 180001FA - v_accvgpr_read_b32 v215, a254 // 000000005270: D3D840D7 180001FE - v_accvgpr_read_b32 v216, a3 // 000000005278: D3D840D8 18000103 - v_accvgpr_read_b32 v217, a7 // 000000005280: D3D840D9 18000107 - v_accvgpr_read_b32 v218, a11 // 000000005288: D3D840DA 1800010B - v_accvgpr_read_b32 v219, a15 // 000000005290: D3D840DB 1800010F - v_accvgpr_read_b32 v220, a19 // 000000005298: D3D840DC 18000113 - v_accvgpr_read_b32 v221, a23 // 0000000052A0: D3D840DD 18000117 - v_accvgpr_read_b32 v222, a27 // 0000000052A8: D3D840DE 1800011B - v_accvgpr_read_b32 v223, a31 // 0000000052B0: D3D840DF 1800011F - v_accvgpr_read_b32 v224, a35 // 0000000052B8: D3D840E0 18000123 - v_accvgpr_read_b32 v225, a39 // 0000000052C0: D3D840E1 18000127 - v_accvgpr_read_b32 v226, a43 // 0000000052C8: D3D840E2 1800012B - v_accvgpr_read_b32 v227, a47 // 
0000000052D0: D3D840E3 1800012F - v_accvgpr_read_b32 v228, a51 // 0000000052D8: D3D840E4 18000133 - v_accvgpr_read_b32 v229, a55 // 0000000052E0: D3D840E5 18000137 - v_accvgpr_read_b32 v230, a59 // 0000000052E8: D3D840E6 1800013B - v_accvgpr_read_b32 v231, a63 // 0000000052F0: D3D840E7 1800013F - v_accvgpr_read_b32 v232, a67 // 0000000052F8: D3D840E8 18000143 - v_accvgpr_read_b32 v233, a71 // 000000005300: D3D840E9 18000147 - v_accvgpr_read_b32 v234, a75 // 000000005308: D3D840EA 1800014B - v_accvgpr_read_b32 v235, a79 // 000000005310: D3D840EB 1800014F - v_accvgpr_read_b32 v236, a83 // 000000005318: D3D840EC 18000153 - v_accvgpr_read_b32 v237, a87 // 000000005320: D3D840ED 18000157 - v_accvgpr_read_b32 v238, a91 // 000000005328: D3D840EE 1800015B - v_accvgpr_read_b32 v239, a95 // 000000005330: D3D840EF 1800015F - v_accvgpr_read_b32 v240, a99 // 000000005338: D3D840F0 18000163 - v_accvgpr_read_b32 v241, a103 // 000000005340: D3D840F1 18000167 - v_accvgpr_read_b32 v242, a107 // 000000005348: D3D840F2 1800016B - v_accvgpr_read_b32 v243, a111 // 000000005350: D3D840F3 1800016F - v_accvgpr_read_b32 v244, a115 // 000000005358: D3D840F4 18000173 - v_accvgpr_read_b32 v245, a119 // 000000005360: D3D840F5 18000177 - v_accvgpr_read_b32 v246, a123 // 000000005368: D3D840F6 1800017B - v_accvgpr_read_b32 v247, a127 // 000000005370: D3D840F7 1800017F - v_mov_b32_e32 v8, 0xffff0000 // 000000005378: 7E1002FF FFFF0000 - v_mov_b32_e32 v9, 0x7fff0000 // 000000005380: 7E1202FF 7FFF0000 - v_mov_b32_e32 v10, 0x7fff // 000000005388: 7E1402FF 00007FFF - v_cvt_pk_bf16_f32 v16, v16, v17 // 000000005390: D2680010 00022310 - v_cvt_pk_bf16_f32 v17, v18, v19 // 000000005398: D2680011 00022712 - v_cvt_pk_bf16_f32 v18, v20, v21 // 0000000053A0: D2680012 00022B14 - v_cvt_pk_bf16_f32 v19, v22, v23 // 0000000053A8: D2680013 00022F16 - buffer_store_dwordx4 v[16:19], v11, s[16:19], 0 offen nt // 0000000053B0: E07E1000 8004100B - v_cvt_pk_bf16_f32 v24, v24, v25 // 0000000053B8: D2680018 00023318 - 
v_cvt_pk_bf16_f32 v25, v26, v27 // 0000000053C0: D2680019 0002371A - v_cvt_pk_bf16_f32 v26, v28, v29 // 0000000053C8: D268001A 00023B1C - v_cvt_pk_bf16_f32 v27, v30, v31 // 0000000053D0: D268001B 00023F1E - s_lshl_b32 s12, s36, 1 // 0000000053D8: 8E0C8124 - s_add_u32 s16, s16, s12 // 0000000053DC: 80100C10 - s_addc_u32 s17, s17, 0 // 0000000053E0: 82118011 - buffer_store_dwordx4 v[24:27], v11, s[16:19], 0 offen nt // 0000000053E4: E07E1000 8004180B - v_cvt_pk_bf16_f32 v32, v32, v33 // 0000000053EC: D2680020 00024320 - v_cvt_pk_bf16_f32 v33, v34, v35 // 0000000053F4: D2680021 00024722 - v_cvt_pk_bf16_f32 v34, v36, v37 // 0000000053FC: D2680022 00024B24 - v_cvt_pk_bf16_f32 v35, v38, v39 // 000000005404: D2680023 00024F26 - s_lshl_b32 s12, s36, 1 // 00000000540C: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005410: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005414: 82118011 - buffer_store_dwordx4 v[32:35], v11, s[16:19], 0 offen nt // 000000005418: E07E1000 8004200B - v_cvt_pk_bf16_f32 v40, v40, v41 // 000000005420: D2680028 00025328 - v_cvt_pk_bf16_f32 v41, v42, v43 // 000000005428: D2680029 0002572A - v_cvt_pk_bf16_f32 v42, v44, v45 // 000000005430: D268002A 00025B2C - v_cvt_pk_bf16_f32 v43, v46, v47 // 000000005438: D268002B 00025F2E - s_lshl_b32 s12, s36, 1 // 000000005440: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005444: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005448: 82118011 - buffer_store_dwordx4 v[40:43], v11, s[16:19], 0 offen nt // 00000000544C: E07E1000 8004280B - v_cvt_pk_bf16_f32 v48, v48, v49 // 000000005454: D2680030 00026330 - v_cvt_pk_bf16_f32 v49, v50, v51 // 00000000545C: D2680031 00026732 - v_cvt_pk_bf16_f32 v50, v52, v53 // 000000005464: D2680032 00026B34 - v_cvt_pk_bf16_f32 v51, v54, v55 // 00000000546C: D2680033 00026F36 - s_lshl_b32 s12, s36, 1 // 000000005474: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005478: 80100C10 - s_addc_u32 s17, s17, 0 // 00000000547C: 82118011 - buffer_store_dwordx4 v[48:51], v11, s[16:19], 0 offen nt // 
000000005480: E07E1000 8004300B - v_cvt_pk_bf16_f32 v56, v56, v57 // 000000005488: D2680038 00027338 - v_cvt_pk_bf16_f32 v57, v58, v59 // 000000005490: D2680039 0002773A - v_cvt_pk_bf16_f32 v58, v60, v61 // 000000005498: D268003A 00027B3C - v_cvt_pk_bf16_f32 v59, v62, v63 // 0000000054A0: D268003B 00027F3E - s_lshl_b32 s12, s36, 1 // 0000000054A8: 8E0C8124 - s_add_u32 s16, s16, s12 // 0000000054AC: 80100C10 - s_addc_u32 s17, s17, 0 // 0000000054B0: 82118011 - buffer_store_dwordx4 v[56:59], v11, s[16:19], 0 offen nt // 0000000054B4: E07E1000 8004380B - v_cvt_pk_bf16_f32 v64, v64, v65 // 0000000054BC: D2680040 00028340 - v_cvt_pk_bf16_f32 v65, v66, v67 // 0000000054C4: D2680041 00028742 - v_cvt_pk_bf16_f32 v66, v68, v69 // 0000000054CC: D2680042 00028B44 - v_cvt_pk_bf16_f32 v67, v70, v71 // 0000000054D4: D2680043 00028F46 - s_lshl_b32 s12, s36, 1 // 0000000054DC: 8E0C8124 - s_add_u32 s16, s16, s12 // 0000000054E0: 80100C10 - s_addc_u32 s17, s17, 0 // 0000000054E4: 82118011 - buffer_store_dwordx4 v[64:67], v11, s[16:19], 0 offen nt // 0000000054E8: E07E1000 8004400B - v_cvt_pk_bf16_f32 v72, v72, v73 // 0000000054F0: D2680048 00029348 - v_cvt_pk_bf16_f32 v73, v74, v75 // 0000000054F8: D2680049 0002974A - v_cvt_pk_bf16_f32 v74, v76, v77 // 000000005500: D268004A 00029B4C - v_cvt_pk_bf16_f32 v75, v78, v79 // 000000005508: D268004B 00029F4E - s_lshl_b32 s12, s36, 1 // 000000005510: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005514: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005518: 82118011 - buffer_store_dwordx4 v[72:75], v11, s[16:19], 0 offen nt // 00000000551C: E07E1000 8004480B - v_cvt_pk_bf16_f32 v80, v80, v81 // 000000005524: D2680050 0002A350 - v_cvt_pk_bf16_f32 v81, v82, v83 // 00000000552C: D2680051 0002A752 - v_cvt_pk_bf16_f32 v82, v84, v85 // 000000005534: D2680052 0002AB54 - v_cvt_pk_bf16_f32 v83, v86, v87 // 00000000553C: D2680053 0002AF56 - s_lshl_b32 s12, s36, 1 // 000000005544: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005548: 80100C10 - s_addc_u32 
s17, s17, 0 // 00000000554C: 82118011 - buffer_store_dwordx4 v[80:83], v11, s[16:19], 0 offen nt // 000000005550: E07E1000 8004500B - v_cvt_pk_bf16_f32 v88, v88, v89 // 000000005558: D2680058 0002B358 - v_cvt_pk_bf16_f32 v89, v90, v91 // 000000005560: D2680059 0002B75A - v_cvt_pk_bf16_f32 v90, v92, v93 // 000000005568: D268005A 0002BB5C - v_cvt_pk_bf16_f32 v91, v94, v95 // 000000005570: D268005B 0002BF5E - s_lshl_b32 s12, s36, 1 // 000000005578: 8E0C8124 - s_add_u32 s16, s16, s12 // 00000000557C: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005580: 82118011 - buffer_store_dwordx4 v[88:91], v11, s[16:19], 0 offen nt // 000000005584: E07E1000 8004580B - v_cvt_pk_bf16_f32 v96, v96, v97 // 00000000558C: D2680060 0002C360 - v_cvt_pk_bf16_f32 v97, v98, v99 // 000000005594: D2680061 0002C762 - v_cvt_pk_bf16_f32 v98, v100, v101 // 00000000559C: D2680062 0002CB64 - v_cvt_pk_bf16_f32 v99, v102, v103 // 0000000055A4: D2680063 0002CF66 - s_lshl_b32 s12, s36, 1 // 0000000055AC: 8E0C8124 - s_add_u32 s16, s16, s12 // 0000000055B0: 80100C10 - s_addc_u32 s17, s17, 0 // 0000000055B4: 82118011 - buffer_store_dwordx4 v[96:99], v11, s[16:19], 0 offen nt // 0000000055B8: E07E1000 8004600B - v_cvt_pk_bf16_f32 v104, v104, v105 // 0000000055C0: D2680068 0002D368 - v_cvt_pk_bf16_f32 v105, v106, v107 // 0000000055C8: D2680069 0002D76A - v_cvt_pk_bf16_f32 v106, v108, v109 // 0000000055D0: D268006A 0002DB6C - v_cvt_pk_bf16_f32 v107, v110, v111 // 0000000055D8: D268006B 0002DF6E - s_lshl_b32 s12, s36, 1 // 0000000055E0: 8E0C8124 - s_add_u32 s16, s16, s12 // 0000000055E4: 80100C10 - s_addc_u32 s17, s17, 0 // 0000000055E8: 82118011 - buffer_store_dwordx4 v[104:107], v11, s[16:19], 0 offen nt // 0000000055EC: E07E1000 8004680B - v_cvt_pk_bf16_f32 v112, v112, v113 // 0000000055F4: D2680070 0002E370 - v_cvt_pk_bf16_f32 v113, v114, v115 // 0000000055FC: D2680071 0002E772 - v_cvt_pk_bf16_f32 v114, v116, v117 // 000000005604: D2680072 0002EB74 - v_cvt_pk_bf16_f32 v115, v118, v119 // 00000000560C: 
D2680073 0002EF76 - s_lshl_b32 s12, s36, 1 // 000000005614: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005618: 80100C10 - s_addc_u32 s17, s17, 0 // 00000000561C: 82118011 - buffer_store_dwordx4 v[112:115], v11, s[16:19], 0 offen nt // 000000005620: E07E1000 8004700B - v_cvt_pk_bf16_f32 v120, v120, v121 // 000000005628: D2680078 0002F378 - v_cvt_pk_bf16_f32 v121, v122, v123 // 000000005630: D2680079 0002F77A - v_cvt_pk_bf16_f32 v122, v124, v125 // 000000005638: D268007A 0002FB7C - v_cvt_pk_bf16_f32 v123, v126, v127 // 000000005640: D268007B 0002FF7E - s_lshl_b32 s12, s36, 1 // 000000005648: 8E0C8124 - s_add_u32 s16, s16, s12 // 00000000564C: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005650: 82118011 - buffer_store_dwordx4 v[120:123], v11, s[16:19], 0 offen nt // 000000005654: E07E1000 8004780B - v_cvt_pk_bf16_f32 v136, v136, v137 // 00000000565C: D2680088 00031388 - v_cvt_pk_bf16_f32 v137, v138, v139 // 000000005664: D2680089 0003178A - v_cvt_pk_bf16_f32 v138, v140, v141 // 00000000566C: D268008A 00031B8C - v_cvt_pk_bf16_f32 v139, v142, v143 // 000000005674: D268008B 00031F8E - s_lshl_b32 s12, s36, 1 // 00000000567C: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005680: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005684: 82118011 - buffer_store_dwordx4 v[136:139], v11, s[16:19], 0 offen nt // 000000005688: E07E1000 8004880B - v_cvt_pk_bf16_f32 v144, v144, v145 // 000000005690: D2680090 00032390 - v_cvt_pk_bf16_f32 v145, v146, v147 // 000000005698: D2680091 00032792 - v_cvt_pk_bf16_f32 v146, v148, v149 // 0000000056A0: D2680092 00032B94 - v_cvt_pk_bf16_f32 v147, v150, v151 // 0000000056A8: D2680093 00032F96 - s_lshl_b32 s12, s36, 1 // 0000000056B0: 8E0C8124 - s_add_u32 s16, s16, s12 // 0000000056B4: 80100C10 - s_addc_u32 s17, s17, 0 // 0000000056B8: 82118011 - buffer_store_dwordx4 v[144:147], v11, s[16:19], 0 offen nt // 0000000056BC: E07E1000 8004900B - v_cvt_pk_bf16_f32 v152, v152, v153 // 0000000056C4: D2680098 00033398 - v_cvt_pk_bf16_f32 v153, v154, v155 // 
0000000056CC: D2680099 0003379A - v_cvt_pk_bf16_f32 v154, v156, v157 // 0000000056D4: D268009A 00033B9C - v_cvt_pk_bf16_f32 v155, v158, v159 // 0000000056DC: D268009B 00033F9E - s_lshl_b32 s12, s36, 1 // 0000000056E4: 8E0C8124 - s_add_u32 s16, s16, s12 // 0000000056E8: 80100C10 - s_addc_u32 s17, s17, 0 // 0000000056EC: 82118011 - buffer_store_dwordx4 v[152:155], v11, s[16:19], 0 offen nt // 0000000056F0: E07E1000 8004980B - v_cvt_pk_bf16_f32 v160, v160, v161 // 0000000056F8: D26800A0 000343A0 - v_cvt_pk_bf16_f32 v161, v162, v163 // 000000005700: D26800A1 000347A2 - v_cvt_pk_bf16_f32 v162, v164, v165 // 000000005708: D26800A2 00034BA4 - v_cvt_pk_bf16_f32 v163, v166, v167 // 000000005710: D26800A3 00034FA6 - s_lshl_b32 s12, s36, 1 // 000000005718: 8E0C8124 - s_add_u32 s16, s16, s12 // 00000000571C: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005720: 82118011 - buffer_store_dwordx4 v[160:163], v11, s[16:19], 0 offen nt // 000000005724: E07E1000 8004A00B - v_cvt_pk_bf16_f32 v168, v168, v169 // 00000000572C: D26800A8 000353A8 - v_cvt_pk_bf16_f32 v169, v170, v171 // 000000005734: D26800A9 000357AA - v_cvt_pk_bf16_f32 v170, v172, v173 // 00000000573C: D26800AA 00035BAC - v_cvt_pk_bf16_f32 v171, v174, v175 // 000000005744: D26800AB 00035FAE - s_lshl_b32 s12, s36, 1 // 00000000574C: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005750: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005754: 82118011 - buffer_store_dwordx4 v[168:171], v11, s[16:19], 0 offen nt // 000000005758: E07E1000 8004A80B - v_cvt_pk_bf16_f32 v176, v176, v177 // 000000005760: D26800B0 000363B0 - v_cvt_pk_bf16_f32 v177, v178, v179 // 000000005768: D26800B1 000367B2 - v_cvt_pk_bf16_f32 v178, v180, v181 // 000000005770: D26800B2 00036BB4 - v_cvt_pk_bf16_f32 v179, v182, v183 // 000000005778: D26800B3 00036FB6 - s_lshl_b32 s12, s36, 1 // 000000005780: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005784: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005788: 82118011 - buffer_store_dwordx4 v[176:179], v11, s[16:19], 0 
offen nt // 00000000578C: E07E1000 8004B00B - v_cvt_pk_bf16_f32 v184, v184, v185 // 000000005794: D26800B8 000373B8 - v_cvt_pk_bf16_f32 v185, v186, v187 // 00000000579C: D26800B9 000377BA - v_cvt_pk_bf16_f32 v186, v188, v189 // 0000000057A4: D26800BA 00037BBC - v_cvt_pk_bf16_f32 v187, v190, v191 // 0000000057AC: D26800BB 00037FBE - s_lshl_b32 s12, s36, 1 // 0000000057B4: 8E0C8124 - s_add_u32 s16, s16, s12 // 0000000057B8: 80100C10 - s_addc_u32 s17, s17, 0 // 0000000057BC: 82118011 - buffer_store_dwordx4 v[184:187], v11, s[16:19], 0 offen nt // 0000000057C0: E07E1000 8004B80B - v_cvt_pk_bf16_f32 v192, v192, v193 // 0000000057C8: D26800C0 000383C0 - v_cvt_pk_bf16_f32 v193, v194, v195 // 0000000057D0: D26800C1 000387C2 - v_cvt_pk_bf16_f32 v194, v196, v197 // 0000000057D8: D26800C2 00038BC4 - v_cvt_pk_bf16_f32 v195, v198, v199 // 0000000057E0: D26800C3 00038FC6 - s_lshl_b32 s12, s36, 1 // 0000000057E8: 8E0C8124 - s_add_u32 s16, s16, s12 // 0000000057EC: 80100C10 - s_addc_u32 s17, s17, 0 // 0000000057F0: 82118011 - buffer_store_dwordx4 v[192:195], v11, s[16:19], 0 offen nt // 0000000057F4: E07E1000 8004C00B - v_cvt_pk_bf16_f32 v200, v200, v201 // 0000000057FC: D26800C8 000393C8 - v_cvt_pk_bf16_f32 v201, v202, v203 // 000000005804: D26800C9 000397CA - v_cvt_pk_bf16_f32 v202, v204, v205 // 00000000580C: D26800CA 00039BCC - v_cvt_pk_bf16_f32 v203, v206, v207 // 000000005814: D26800CB 00039FCE - s_lshl_b32 s12, s36, 1 // 00000000581C: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005820: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005824: 82118011 - buffer_store_dwordx4 v[200:203], v11, s[16:19], 0 offen nt // 000000005828: E07E1000 8004C80B - v_cvt_pk_bf16_f32 v208, v208, v209 // 000000005830: D26800D0 0003A3D0 - v_cvt_pk_bf16_f32 v209, v210, v211 // 000000005838: D26800D1 0003A7D2 - v_cvt_pk_bf16_f32 v210, v212, v213 // 000000005840: D26800D2 0003ABD4 - v_cvt_pk_bf16_f32 v211, v214, v215 // 000000005848: D26800D3 0003AFD6 - s_lshl_b32 s12, s36, 1 // 000000005850: 8E0C8124 - 
s_add_u32 s16, s16, s12 // 000000005854: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005858: 82118011 - buffer_store_dwordx4 v[208:211], v11, s[16:19], 0 offen nt // 00000000585C: E07E1000 8004D00B - v_cvt_pk_bf16_f32 v216, v216, v217 // 000000005864: D26800D8 0003B3D8 - v_cvt_pk_bf16_f32 v217, v218, v219 // 00000000586C: D26800D9 0003B7DA - v_cvt_pk_bf16_f32 v218, v220, v221 // 000000005874: D26800DA 0003BBDC - v_cvt_pk_bf16_f32 v219, v222, v223 // 00000000587C: D26800DB 0003BFDE - s_lshl_b32 s12, s36, 1 // 000000005884: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005888: 80100C10 - s_addc_u32 s17, s17, 0 // 00000000588C: 82118011 - buffer_store_dwordx4 v[216:219], v11, s[16:19], 0 offen nt // 000000005890: E07E1000 8004D80B - v_cvt_pk_bf16_f32 v224, v224, v225 // 000000005898: D26800E0 0003C3E0 - v_cvt_pk_bf16_f32 v225, v226, v227 // 0000000058A0: D26800E1 0003C7E2 - v_cvt_pk_bf16_f32 v226, v228, v229 // 0000000058A8: D26800E2 0003CBE4 - v_cvt_pk_bf16_f32 v227, v230, v231 // 0000000058B0: D26800E3 0003CFE6 - s_lshl_b32 s12, s36, 1 // 0000000058B8: 8E0C8124 - s_add_u32 s16, s16, s12 // 0000000058BC: 80100C10 - s_addc_u32 s17, s17, 0 // 0000000058C0: 82118011 - buffer_store_dwordx4 v[224:227], v11, s[16:19], 0 offen nt // 0000000058C4: E07E1000 8004E00B - v_cvt_pk_bf16_f32 v232, v232, v233 // 0000000058CC: D26800E8 0003D3E8 - v_cvt_pk_bf16_f32 v233, v234, v235 // 0000000058D4: D26800E9 0003D7EA - v_cvt_pk_bf16_f32 v234, v236, v237 // 0000000058DC: D26800EA 0003DBEC - v_cvt_pk_bf16_f32 v235, v238, v239 // 0000000058E4: D26800EB 0003DFEE - s_lshl_b32 s12, s36, 1 // 0000000058EC: 8E0C8124 - s_add_u32 s16, s16, s12 // 0000000058F0: 80100C10 - s_addc_u32 s17, s17, 0 // 0000000058F4: 82118011 - buffer_store_dwordx4 v[232:235], v11, s[16:19], 0 offen nt // 0000000058F8: E07E1000 8004E80B - v_cvt_pk_bf16_f32 v240, v240, v241 // 000000005900: D26800F0 0003E3F0 - v_cvt_pk_bf16_f32 v241, v242, v243 // 000000005908: D26800F1 0003E7F2 - v_cvt_pk_bf16_f32 v242, v244, v245 // 
000000005910: D26800F2 0003EBF4 - v_cvt_pk_bf16_f32 v243, v246, v247 // 000000005918: D26800F3 0003EFF6 - s_lshl_b32 s12, s36, 1 // 000000005920: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005924: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005928: 82118011 - buffer_store_dwordx4 v[240:243], v11, s[16:19], 0 offen nt // 00000000592C: E07E1000 8004F00B - s_nop 0 // 000000005934: BF800000 - v_accvgpr_read_b32 v16, a131 // 000000005938: D3D84010 18000183 - v_accvgpr_read_b32 v17, a135 // 000000005940: D3D84011 18000187 - v_accvgpr_read_b32 v18, a139 // 000000005948: D3D84012 1800018B - v_accvgpr_read_b32 v19, a143 // 000000005950: D3D84013 1800018F - v_accvgpr_read_b32 v20, a147 // 000000005958: D3D84014 18000193 - v_accvgpr_read_b32 v21, a151 // 000000005960: D3D84015 18000197 - v_accvgpr_read_b32 v22, a155 // 000000005968: D3D84016 1800019B - v_accvgpr_read_b32 v23, a159 // 000000005970: D3D84017 1800019F - v_accvgpr_read_b32 v24, a163 // 000000005978: D3D84018 180001A3 - v_accvgpr_read_b32 v25, a167 // 000000005980: D3D84019 180001A7 - v_accvgpr_read_b32 v26, a171 // 000000005988: D3D8401A 180001AB - v_accvgpr_read_b32 v27, a175 // 000000005990: D3D8401B 180001AF - v_accvgpr_read_b32 v28, a179 // 000000005998: D3D8401C 180001B3 - v_accvgpr_read_b32 v29, a183 // 0000000059A0: D3D8401D 180001B7 - v_accvgpr_read_b32 v30, a187 // 0000000059A8: D3D8401E 180001BB - v_accvgpr_read_b32 v31, a191 // 0000000059B0: D3D8401F 180001BF - v_accvgpr_read_b32 v32, a195 // 0000000059B8: D3D84020 180001C3 - v_accvgpr_read_b32 v33, a199 // 0000000059C0: D3D84021 180001C7 - v_accvgpr_read_b32 v34, a203 // 0000000059C8: D3D84022 180001CB - v_accvgpr_read_b32 v35, a207 // 0000000059D0: D3D84023 180001CF - v_accvgpr_read_b32 v36, a211 // 0000000059D8: D3D84024 180001D3 - v_accvgpr_read_b32 v37, a215 // 0000000059E0: D3D84025 180001D7 - v_accvgpr_read_b32 v38, a219 // 0000000059E8: D3D84026 180001DB - v_accvgpr_read_b32 v39, a223 // 0000000059F0: D3D84027 180001DF - v_accvgpr_read_b32 v40, 
a227 // 0000000059F8: D3D84028 180001E3 - v_accvgpr_read_b32 v41, a231 // 000000005A00: D3D84029 180001E7 - v_accvgpr_read_b32 v42, a235 // 000000005A08: D3D8402A 180001EB - v_accvgpr_read_b32 v43, a239 // 000000005A10: D3D8402B 180001EF - v_accvgpr_read_b32 v44, a243 // 000000005A18: D3D8402C 180001F3 - v_accvgpr_read_b32 v45, a247 // 000000005A20: D3D8402D 180001F7 - v_accvgpr_read_b32 v46, a251 // 000000005A28: D3D8402E 180001FB - v_accvgpr_read_b32 v47, a255 // 000000005A30: D3D8402F 180001FF - v_mov_b32_e32 v8, 0xffff0000 // 000000005A38: 7E1002FF FFFF0000 - v_mov_b32_e32 v9, 0x7fff0000 // 000000005A40: 7E1202FF 7FFF0000 - v_mov_b32_e32 v10, 0x7fff // 000000005A48: 7E1402FF 00007FFF - v_cvt_pk_bf16_f32 v16, v16, v17 // 000000005A50: D2680010 00022310 - v_cvt_pk_bf16_f32 v17, v18, v19 // 000000005A58: D2680011 00022712 - v_cvt_pk_bf16_f32 v18, v20, v21 // 000000005A60: D2680012 00022B14 - v_cvt_pk_bf16_f32 v19, v22, v23 // 000000005A68: D2680013 00022F16 - s_lshl_b32 s12, s36, 1 // 000000005A70: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005A74: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005A78: 82118011 - buffer_store_dwordx4 v[16:19], v11, s[16:19], 0 offen nt // 000000005A7C: E07E1000 8004100B - v_cvt_pk_bf16_f32 v24, v24, v25 // 000000005A84: D2680018 00023318 - v_cvt_pk_bf16_f32 v25, v26, v27 // 000000005A8C: D2680019 0002371A - v_cvt_pk_bf16_f32 v26, v28, v29 // 000000005A94: D268001A 00023B1C - v_cvt_pk_bf16_f32 v27, v30, v31 // 000000005A9C: D268001B 00023F1E - s_lshl_b32 s12, s36, 1 // 000000005AA4: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005AA8: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005AAC: 82118011 - buffer_store_dwordx4 v[24:27], v11, s[16:19], 0 offen nt // 000000005AB0: E07E1000 8004180B - v_cvt_pk_bf16_f32 v32, v32, v33 // 000000005AB8: D2680020 00024320 - v_cvt_pk_bf16_f32 v33, v34, v35 // 000000005AC0: D2680021 00024722 - v_cvt_pk_bf16_f32 v34, v36, v37 // 000000005AC8: D2680022 00024B24 - v_cvt_pk_bf16_f32 v35, v38, v39 // 
000000005AD0: D2680023 00024F26 - s_lshl_b32 s12, s36, 1 // 000000005AD8: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005ADC: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005AE0: 82118011 - buffer_store_dwordx4 v[32:35], v11, s[16:19], 0 offen nt // 000000005AE4: E07E1000 8004200B - v_cvt_pk_bf16_f32 v40, v40, v41 // 000000005AEC: D2680028 00025328 - v_cvt_pk_bf16_f32 v41, v42, v43 // 000000005AF4: D2680029 0002572A - v_cvt_pk_bf16_f32 v42, v44, v45 // 000000005AFC: D268002A 00025B2C - v_cvt_pk_bf16_f32 v43, v46, v47 // 000000005B04: D268002B 00025F2E - s_lshl_b32 s12, s36, 1 // 000000005B0C: 8E0C8124 - s_add_u32 s16, s16, s12 // 000000005B10: 80100C10 - s_addc_u32 s17, s17, 0 // 000000005B14: 82118011 - buffer_store_dwordx4 v[40:43], v11, s[16:19], 0 offen nt // 000000005B18: E07E1000 8004280B - s_nop 0 // 000000005B20: BF800000 - s_endpgm // 00000001F5D0: BF810000 diff --git a/extra/gemm/asm/cdna/template.s b/extra/gemm/asm/cdna/template.s deleted file mode 100644 index 9ba66ed9a9..0000000000 --- a/extra/gemm/asm/cdna/template.s +++ /dev/null @@ -1,78 +0,0 @@ -.text -.section .text. 
-.global gemm -.p2align 8 -.type gemm,@function - -gemm: -INSTRUCTIONS - -.section .rodata,"a",@progbits -.p2align 6, 0x0 -.amdhsa_kernel gemm - # basic memory requirements - .amdhsa_group_segment_fixed_size 133120 - .amdhsa_private_segment_fixed_size 0 - .amdhsa_kernarg_size 28 - # register usage (RSRC1) - .amdhsa_next_free_vgpr 504 - .amdhsa_next_free_sgpr 96 - # workgroup / workitem IDs (RSRC2) - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_sgpr_workgroup_id_z 1 - # user SGPRs, we only specify the kernel args ptr in s[0:1] - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_user_sgpr_count 2 - .amdhsa_user_sgpr_kernarg_preload_length 0 - .amdhsa_user_sgpr_kernarg_preload_offset 0 - # gfx90a / gfx940 specifics (RSRC3) - .amdhsa_accum_offset 248 - .amdhsa_uses_dynamic_stack 0 - .amdhsa_tg_split 0 -.end_amdhsa_kernel - -.amdgpu_metadata ---- -amdhsa.kernels: - - .name: gemm - .symbol: gemm.kd - .args: - - .name: C - .address_space: global - .offset: 0 - .size: 8 - .value_kind: global_buffer - .value_type: bf16 - - .name: B - .address_space: global - .offset: 8 - .size: 8 - .value_kind: global_buffer - .value_type: bf16 - - .name: A - .address_space: global - .offset: 16 - .size: 8 - .value_kind: global_buffer - .value_type: bf16 - - .name: sz - .offset: 24 - .size: 4 - .value_kind: by_value - .value_type: u32 - .group_segment_fixed_size: 133120 - .private_segment_fixed_size: 0 - .kernarg_segment_align: 8 - .kernarg_segment_size: 28 - .max_flat_workgroup_size: 256 - .sgpr_count: 88 - .sgpr_spill_count: 0 - .vgpr_count: 248 - .vgpr_spill_count: 0 - .wavefront_size: 64 -amdhsa.version: - - 1 - - 0 -... 
-.end_amdgpu_metadata diff --git a/extra/gemm/asm/cdna/test.py b/extra/gemm/asm/cdna/test.py deleted file mode 100644 index 1370f5879b..0000000000 --- a/extra/gemm/asm/cdna/test.py +++ /dev/null @@ -1,73 +0,0 @@ -# Run assembly on the AMD runtime and check correctness -# VIZ=2 to profile -import pathlib -from tinygrad import Tensor, Device, dtypes, Context -from tinygrad.uop.ops import UOp, Ops, KernelInfo -from tinygrad.engine.realize import Estimates -from tinygrad.helpers import getenv - -fp = pathlib.Path(__file__).parent/"gemm.s" - -N = getenv("N", 8192) -THREADS_PER_WG = 256 -NUM_WG = N//THREADS_PER_WG * N//THREADS_PER_WG - -assert N % THREADS_PER_WG == 0, "N must be divisible by THREADS_PER_WG" - -# ** generate inputs on CPU - -scale = 10.0 - -import torch -torch.manual_seed(0) -A = (torch.randn(N, N, dtype=torch.float32, device="cpu") / scale).to(torch.bfloat16).contiguous() -B = (torch.randn(N, N, dtype=torch.float32, device="cpu") / scale).to(torch.bfloat16).contiguous() -Bt = B.t().contiguous() # transpose B for the asm gemm -C_torch = A@B - -# ** copy buffers to AMD - -# input creation and validation run on the copy engine for simpler tracing - -def from_torch(t:torch.Tensor) -> Tensor: - return Tensor.from_blob(t.data_ptr(), t.shape, dtype=dtypes.bfloat16, device="cpu").to(Device.DEFAULT).realize() - -C_tiny = from_torch(A) @ from_torch(B) -C_asm = Tensor.empty_like(C_tiny) - -# ** assembly custom kernel - -def custom_asm_gemm(C:UOp, A:UOp, B:UOp) -> UOp: - lidx = UOp.special(THREADS_PER_WG, "lidx0") - gidx = UOp.special(NUM_WG, "gidx0") - - src = (pathlib.Path(__file__).parent/"template.s").read_text().replace("INSTRUCTIONS", fp.read_text()) - - sz = UOp.variable("SZ", 256, 8192) - - sink = UOp.sink(C.base, A.base, B.base, sz, lidx, gidx, arg=KernelInfo(name="gemm", estimates=Estimates(ops=N*N*N*2, mem=N*N*4*3))) - return UOp(Ops.PROGRAM, src=(sink, UOp(Ops.DEVICE, arg=Device.DEFAULT), UOp(Ops.LINEAR, src=(*sink.src, sink)), UOp(Ops.SOURCE, arg=src))) 
- -C_asm = Tensor.custom_kernel(C_asm, from_torch(A), from_torch(Bt), fxn=custom_asm_gemm)[0] - -# ** run gemms - -sched = Tensor.schedule(C_tiny, C_asm) -eis = [si.lower() for si in sched] - -with Context(DEBUG=2): - for ei in eis: - et = ei.run({"SZ":N}, wait=True) - print(f"{(N*N*N*2 / et)*1e-12:.2f} REAL TFLOPS") - -# ** correctness - -import ctypes - -def torch_bf16(t:Tensor) -> torch.tensor: - asm_out = t.to("cpu").realize().uop.buffer._buf - buf = (ctypes.c_uint16*C_asm.uop.size).from_address(asm_out.va_addr) - return torch.frombuffer(buf, dtype=torch.bfloat16, count=C_asm.uop.size).reshape(C_asm.shape) - -assert torch.allclose(torch_bf16(C_asm), C_torch, rtol=1e-2, atol=1e-3) -assert torch.allclose(torch_bf16(C_tiny), C_torch, rtol=1e-2, atol=1e-3) diff --git a/extra/gemm/asm/cdna/test_asm_gemm.py b/extra/gemm/asm/cdna/test_asm_gemm.py new file mode 100644 index 0000000000..905492cc60 --- /dev/null +++ b/extra/gemm/asm/cdna/test_asm_gemm.py @@ -0,0 +1,46 @@ +import unittest +from tinygrad import Tensor, Device, dtypes, Context +from tinygrad.helpers import getenv +from extra.gemm.asm.cdna.gemm import asm_gemm + +def verify_asm_gemm(batch:int, M:int, N:int, K:int, dtype=dtypes.bfloat16, multi=False) -> None: + Tensor.manual_seed(0) + a_rand = Tensor.randn((batch, M, K), dtype=dtypes.float).sub(0.5).cast(dtype) + b_rand = Tensor.randn((K, N), dtype=dtypes.float).sub(0.5).cast(dtype) + with Context(DEBUG=0): + Tensor.realize(a_rand, b_rand) + + devs = tuple(f"{Device.DEFAULT}:{i}" for i in range(8)) if multi else None + + a, b = Tensor(a_rand.numpy(), requires_grad=True).cast(dtype), Tensor(b_rand.numpy(), requires_grad=True).cast(dtype) + if multi: a, b = a.shard(devs, axis=0), b.shard(devs, axis=None) + tst = asm_gemm(a, b) + tst.sum().backward() + Tensor.realize(tst, a.grad, b.grad) + + a_ref, b_ref = Tensor(a_rand.numpy(), requires_grad=True).cast(dtype), Tensor(b_rand.numpy(), requires_grad=True).cast(dtype) + if multi: a_ref, b_ref = a_ref.shard(devs, 
axis=0), b_ref.shard(devs, axis=None) + with Context(ASM_GEMM=0): ref = a_ref @ b_ref + ref.sum().backward() + Tensor.realize(ref, a_ref.grad, b_ref.grad) + + with Context(DEBUG=0): + assert (tst - ref).square().max().float().item() < 1e-6, "forward mismatch" + assert (a.grad - a_ref.grad).square().max().float().item() < 1e-3, "grad_a mismatch" + assert (b.grad - b_ref.grad).square().max().float().item() < 1e-3, "grad_b mismatch" + +class TestGemm(unittest.TestCase): + def test_simple(self): verify_asm_gemm(1, N:=getenv("N", 4096), N, N, dtype=dtypes.half) + + def test_gemm1(self): verify_asm_gemm(8, 8192, 4096, 14336, multi=True) + def test_gemm2(self): verify_asm_gemm(8, 8192, 128256, 4096, multi=True) + def test_gemm3(self): verify_asm_gemm(8, 8192, 14336, 4096, multi=True) + def test_gemm4(self): verify_asm_gemm(8, 4096, 14336, 4096, multi=True) + def test_gemm5(self): verify_asm_gemm(8, 4096, 4096, 14336, multi=True) + def test_gemm6(self): verify_asm_gemm(16, 4096, 4096, 14336, multi=True) + def test_gemm_unsupported(self): + with self.assertRaisesRegex(AssertionError, "shape not supported"): + verify_asm_gemm(8, 8192, 1024, 4096, multi=True) + +if __name__ == "__main__": + unittest.main() diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 319d44ec8d..85c471afb3 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -206,6 +206,8 @@ ALLOW_TF32 = ContextVar("ALLOW_TF32", 0) SCACHE = ContextVar("SCACHE", 1) # allow use of atomics for embedding backward USE_ATOMICS = ContextVar("USE_ATOMICS", 0) +# allow use of assembly for gemm +ASM_GEMM = ContextVar("ASM_GEMM", 0) @dataclass(frozen=True) class Metadata: diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 59a7b72c74..86ff73ef9c 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -7,7 +7,7 @@ if TYPE_CHECKING: import numpy from tinygrad.dtype import DType, DTypeLike, dtypes, ImageDType, ConstType, least_upper_float, least_upper_dtype, sum_acc_dtype, to_dtype, truncate from 
tinygrad.dtype import _from_np_dtype, _to_np_dtype, PyConst from tinygrad.helpers import argfix, make_tuple, flatten, prod, all_int, round_up, merge_dicts, argsort, getenv, all_same, fully_flatten -from tinygrad.helpers import IMAGE, WINO, Metadata, TRACEMETA, ceildiv, fetch, polyN, is_numpy_ndarray, TracingKey, cpu_profile +from tinygrad.helpers import IMAGE, WINO, Metadata, TRACEMETA, ASM_GEMM, ceildiv, fetch, polyN, is_numpy_ndarray, TracingKey, cpu_profile from tinygrad.helpers import suppress_finalizing, disable_gc from tinygrad.gradient import compute_gradient from tinygrad.mixin import OpMixin @@ -2431,6 +2431,9 @@ class Tensor(OpMixin): ``` """ if IMAGE: return self.image_dot(w, dtype) + if ASM_GEMM: + from extra.gemm.asm.cdna.gemm import can_use_asm_gemm, asm_gemm + if can_use_asm_gemm(self, w): return asm_gemm(self, w) x, dx, dw = self, self.ndim, w.ndim if not (dx > 0 and dw > 0): raise RuntimeError(f"both tensors need to be at least 1D, got {dx}D and {dw}D") if x.shape[-1] != w.shape[axis_w:=-min(w.ndim,2)]: raise RuntimeError(f"cannot dot {x.shape} and {w.shape}") diff --git a/tinygrad/uop/ops.py b/tinygrad/uop/ops.py index 12611f8667..6842e44b37 100644 --- a/tinygrad/uop/ops.py +++ b/tinygrad/uop/ops.py @@ -206,7 +206,7 @@ class UOp(OpMixin, metaclass=UOpMetaClass): match self.op: # late ops don't have shape case Ops.UNIQUE | Ops.LUNIQUE | Ops.DEVICE | Ops.RANGE | Ops.LOAD | Ops.IF | Ops.BARRIER | Ops.CUSTOM | Ops.CUSTOMI | \ - Ops.VECTORIZE | Ops.VCONST | Ops.GEP | Ops.SPECIAL | Ops.UNROLL | Ops.CONTRACT | Ops.CUSTOM_KERNEL | \ + Ops.VECTORIZE | Ops.VCONST | Ops.GEP | Ops.SPECIAL | Ops.UNROLL | Ops.CONTRACT | Ops.CUSTOM_KERNEL | Ops.SINK | \ Ops.LINEAR | Ops.PROGRAM | Ops.SOURCE | Ops.BINARY: return None diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py index be211c981f..04b71dbe87 100755 --- a/tinygrad/viz/serve.py +++ b/tinygrad/viz/serve.py @@ -558,7 +558,7 @@ def get_render(query:str) -> dict: rows:dict[int, dict] = {} for pc, (inst,_) 
in pc_to_inst.items(): if start_pc is None: start_pc = pc - rows[pc] = {"pc":pc-start_pc, "inst":inst, "hit_count":0, "dur":0, "stall":0, "hits":{"cols":inst_columns, "rows":[]}, "type":""} + rows[pc] = {"pc":pc-start_pc, "inst":inst, "hit_count":0, "dur":0, "stall":0, "type":"", "hits":{"cols":inst_columns, "rows":[]}} for e in w.unpack_insts(): if not (inst:=rows[e.pc]).get("type"): inst["type"] = str(e.typ).split("_")[-1] inst["hit_count"] += 1