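// Source file: tinygrad/extra/gemm/asm/gemm.s (1719 lines, 148 KiB).
// The file viewer labelled this "ArmAsm", but the mnemonics below (s_load_*, v_mfma_*, buffer_load_*) are AMD GPU instructions.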

.text
.section .text.
.global gemm
.p2align 8
.type gemm,@function
gemm:
// Kernel entry. Fetches the C, A and B pointers and N through s[0:1], then fills
// the remaining size / stride / "info" SGPRs with constants instead of loading them.
// NOTE(review): s[0:1] is presumably the kernarg segment pointer -- confirm against the launch ABI.
// ** global buffers
s_load_dwordx2 s[28:29], s[0:1], 0x0 // C
s_load_dwordx4 s[32:35], s[0:1], 0x8 // A, B
// ** others kernel args
s_load_dword s24, s[0:1], 0x18 // N
// In the visible code s54 is rewritten (as word 2 of the s[52:55] buffer resource)
// before it is ever read, so the value loaded here appears to be unused.
s_load_dword s54, s[0:1], 0x1C // num work groups
// Block until the scalar loads above have returned.
s_waitcnt lgkmcnt(0)
// "info"
// These three words are unpacked into bit fields by the code that follows.
s_mov_b32 s51, 1 // gemm_info = 1
s_mov_b32 s53, 1 // kernel_info0 = 1
s_mov_b32 s11, 0x40010020 // kernel_info1 = 0x40010020
// sizes / strides
// All extents are taken from the single argument N, so only square problems
// with batch size 1 are described here.
s_mov_b32 s25, s24 // sizesFree1 = N
s_mov_b32 s26, 1 // sizesFree2 = BATCH
s_mov_b32 s27, s24 // sizesSum0 = K (== N)
// Strides: major=N, minor=0 (addr = base + idx0*N + idx1*0)
s_mov_b32 s36, s24 // strideD0
s_mov_b32 s37, 0 // strideD1
s_mov_b32 s38, s24 // strideC0
s_mov_b32 s39, 0 // strideC1
s_mov_b32 s40, s24 // strideA0
s_mov_b32 s41, 0 // strideA1
s_mov_b32 s42, s24 // strideB0
s_mov_b32 s43, 0 // strideB1
// ** workgroup mapping
// Unpacks the constant "info" words set above into separate registers:
//   s52 = s5 = top 2 bits of s51, s51 = low 30 bits of s51
//   s10 = upper 16 bits of s53,   s50 = lower 16 bits of s53
// With s51 = 1 and s53 = 1 this gives s52 = s5 = 0, s10 = 0, s50 = 1.
s_lshr_b32 s52, s51, 30 // 000000002924: 8F349E33
s_and_b32 s51, 0x3fffffff, s51 // 000000002928: 863333FF 3FFFFFFF
// The SCC result of this compare is overwritten by the next s_and_b32 without
// being consumed by any visible instruction.
s_cmp_eq_u32 s52, 0 // 000000002930: BF068034
s_and_b32 s10, s53, 0xffff0000 // 000000002A70: 860AFF35 FFFF0000
s_lshr_b32 s10, s10, 16 // 000000002A78: 8F0A900A
s_and_b32 s50, s53, 0xffff // 000000002A7C: 8632FF35 0000FFFF
s_mov_b32 s5, s52 // 000000002A84: BE850034
// m0 is reloaded from s46 / s47 before each group of LDS-targeted buffer loads below.
s_mov_b32 m0, 0x20800 // 000000002A88: BEFC00FF 00020800
// Keep a copy of v0 in v134; v0 itself is reused as a load offset further down.
// NOTE(review): v0 is presumably the work-item id on entry -- confirm.
v_mov_b32_e32 v134, v0 // 000000002A90: 7F0C0300
// s60 / s61 are derived from kernel_info1 here, but s[60:61] is overwritten by
// the A-limit computation before any visible instruction reads these values.
s_lshr_b32 s60, s11, 16 // 000000002A94: 8F3C900B
s_ff1_i32_b32 s60, s60 // 000000002A98: BEBC103C
s_lshr_b32 s61, s11, 22 // 000000002A9C: 8F3D960B
// LDS read addresses for this thread (tid = v134, lane = tid & 63).
//   v4 = (lane & 15) * 512 + (lane / 16) * 8 + bit6(tid) * 8192
//   v5 = (lane & 15) * 512 + (lane / 16) * 8 + bit7(tid) * 8192
//   v2 = 2 * (v4 + 64 * (tid / 256)), then v2 += (v2 / 1024) * 16
//   v3 = the same expression built from v5, plus 0x8200
// v2 and v3 are the address operands of the ds_read_b128 instructions further down.
// v132 / v133 hold (addr + 0x10400) XOR addr, i.e. the mask that flips an address
// between addr and addr + 0x10400.
// NOTE(review): v132 / v133 are not consumed in the visible code; presumably they
// switch v2 / v3 between two LDS buffers later -- confirm.
v_and_b32_e32 v5, 63, v134 // 000000002BB0: 260B0CBF
v_and_b32_e32 v4, 15, v5 // 000000002BB4: 26080A8F
v_lshlrev_b32_e32 v4, 6, v4 // 000000002BB8: 24080886
v_lshlrev_b32_e32 v4, 3, v4 // 000000002BBC: 24080883
v_lshrrev_b32_e32 v5, 4, v5 // 000000002BC0: 200A0A84
v_lshl_add_u32 v4, v5, 3, v4 // 000000002BC4: D1FD0004 04110705
v_lshrrev_b32_e32 v8, 6, v134 // 000000002BCC: 20110C86
v_and_b32_e32 v8, 1, v8 // 000000002BD0: 26101081
v_lshl_add_u32 v4, v8, 13, v4 // 000000002BD4: D1FD0004 04111B08
// Second address: identical lane part, but selected by bit 7 of tid instead of bit 6.
v_and_b32_e32 v6, 63, v134 // 000000002BDC: 260D0CBF
v_and_b32_e32 v5, 15, v6 // 000000002BE0: 260A0C8F
v_lshlrev_b32_e32 v5, 6, v5 // 000000002BE4: 240A0A86
v_lshlrev_b32_e32 v5, 3, v5 // 000000002BE8: 240A0A83
v_lshrrev_b32_e32 v6, 4, v6 // 000000002BEC: 200C0C84
v_lshl_add_u32 v5, v6, 3, v5 // 000000002BF0: D1FD0005 04150706
v_lshrrev_b32_e32 v7, 7, v134 // 000000002BF8: 200F0C87
v_and_b32_e32 v7, 1, v7 // 000000002BFC: 260E0E81
v_lshl_add_u32 v5, v7, 13, v5 // 000000002C00: D1FD0005 04151B07
// v6 = (tid / 256) * 64. s53 is reused as a scratch constant here; the
// kernel_info0 value it held has already been unpacked into s10 / s50.
v_lshrrev_b32_e32 v6, 6, v134 // 000000002C08: 200D0C86
v_lshrrev_b32_e32 v6, 2, v6 // 000000002C0C: 200C0C82
s_mov_b32 s53, 64 // 000000002C10: BEB500C0
v_mul_lo_u32 v6, s53, v6 // 000000002C14: D2850006 00020C35
// Convert to a byte address (times 2) and insert 16 bytes of padding per 1024 bytes.
v_add_lshl_u32 v2, v6, v4, 1 // 000000002C1C: D1FE0002 02060906
v_lshrrev_b32_e32 v7, 10, v2 // 000000002C24: 200E048A
v_lshl_add_u32 v2, v7, 4, v2 // 000000002C28: D1FD0002 04090907
v_lshrrev_b32_e32 v4, 6, v134 // 000000002C30: 20090C86
v_lshrrev_b32_e32 v4, 2, v4 // 000000002C34: 20080882
v_mul_lo_u32 v4, s53, v4 // 000000002C38: D2850004 00020835
v_add_lshl_u32 v3, v4, v5, 1 // 000000002C40: D1FE0003 02060B04
v_lshrrev_b32_e32 v6, 10, v3 // 000000002C48: 200C068A
v_lshl_add_u32 v3, v6, 4, v3 // 000000002C4C: D1FD0003 040D0906
// The second operand lives 0x8200 bytes above the first one (this add also writes vcc).
v_add_co_u32_e32 v3, vcc, 0x8200, v3 // 000000002C54: 320606FF 00008200
v_add_u32_e32 v132, 0x10400, v2 // 000000002C5C: 690804FF 00010400
v_xor_b32_e32 v132, v132, v2 // 000000002C64: 2B080584
v_add_u32_e32 v133, 0x10400, v3 // 000000002C68: 690A06FF 00010400
v_xor_b32_e32 v133, v133, v3 // 000000002C70: 2B0A0785
// LDS destination offsets for the global loads (copied into m0 before each load group).
//   v4 = v6 = tid / 8          (reused below as the row index of the global offset)
//   v5 = v7 = v8 = v9 = (tid & 7) * 8   (reused below as the column index)
//   offset = 2 * ((tid & 7) * 8 + 64 * (tid / 8)), then offset += (offset / 1024) * 16
//   s46 = offset taken from the first active lane, s47 = same computation + 0x8200
//   s48 / s49 = (x + 0x10400) XOR x, the masks that flip s46 / s47 between x and x + 0x10400
v_lshrrev_b32_e32 v4, 3, v134 // 000000002C74: 20090C83
v_and_b32_e32 v5, 7, v134 // 000000002C78: 260B0C87
v_lshlrev_b32_e32 v5, 3, v5 // 000000002C7C: 240A0A83
v_mov_b32_e32 v8, v5 // 000000002C80: 7E100305
v_lshrrev_b32_e32 v6, 3, v134 // 000000002C84: 200D0C83
v_and_b32_e32 v7, 7, v134 // 000000002C88: 260F0C87
v_lshlrev_b32_e32 v7, 3, v7 // 000000002C8C: 240E0E83
v_mov_b32_e32 v9, v7 // 000000002C90: 7E120307
v_mul_u32_u24_e32 v10, 64, v4 // 000000002C94: 101408C0
v_add_lshl_u32 v10, v8, v10, 1 // 000000002C98: D1FE000A 02061508
v_lshrrev_b32_e32 v12, 10, v10 // 000000002CA0: 2018148A
v_lshl_add_u32 v10, v12, 4, v10 // 000000002CA4: D1FD000A 0429090C
s_nop 0 // 000000002CAC: BF800000
v_readfirstlane_b32 s46, v10 // 000000002CB0: 7E5C050A
s_nop 0 // 000000002CB4: BF800000
s_add_u32 s48, s46, 0x10400 // 000000002CB8: 8030FF2E 00010400
s_xor_b32 s48, s48, s46 // 000000002CC0: 88302E30
// Same computation for the second operand, placed 0x8200 bytes higher.
v_mul_u32_u24_e32 v10, 64, v6 // 000000002CC4: 10140CC0
v_add_lshl_u32 v10, v9, v10, 1 // 000000002CC8: D1FE000A 02061509
v_lshrrev_b32_e32 v12, 10, v10 // 000000002CD0: 2018148A
v_lshl_add_u32 v10, v12, 4, v10 // 000000002CD4: D1FD000A 0429090C
v_add_co_u32_e32 v10, vcc, 0x8200, v10 // 000000002CDC: 321414FF 00008200
s_nop 0 // 000000002CE4: BF800000
v_readfirstlane_b32 s47, v10 // 000000002CE8: 7E5E050A
s_nop 0 // 000000002CEC: BF800000
s_add_u32 s49, s47, 0x10400 // 000000002CF0: 8031FF2F 00010400
s_xor_b32 s49, s49, s47 // 000000002CF8: 88312F31
// s14 = ceil(s24 / 256) and s15 = ceil(s25 / 256), computed on the vector ALU:
//   q = u32(f32(size) * rcp(f32(256)));  r = size - q * 256;  q += (r != 0)
// NOTE(review): the quotient comes from a single-precision reciprocal multiply with
// no correction step other than the round-up -- presumably exact for the sizes this
// kernel is launched with; confirm for very large N.
v_mov_b32_e32 v12, 0x100 // 000000002CFC: 7E1802FF 00000100
v_mov_b32_e32 v11, s24 // 000000002D04: 7E160218
v_cvt_f32_u32_e32 v10, v12 // 000000002D08: 7E140D0C
v_rcp_iflag_f32_e32 v10, v10 // 000000002D0C: 7E14470A
v_cvt_f32_u32_e32 v13, v11 // 000000002D10: 7E1A0D0B
v_mul_f32_e32 v10, v10, v13 // 000000002D14: 0A141B0A
v_cvt_u32_f32_e32 v10, v10 // 000000002D18: 7E140F0A
v_mul_u32_u24_e32 v13, v10, v12 // 000000002D1C: 101A190A
v_sub_u32_e32 v13, v11, v13 // 000000002D20: 6A1A1B0B
// Round up: add the carry-in (vcc) that is set when the remainder is non-zero.
v_cmp_ne_u32_e64 vcc, v13, 0 // 000000002D24: D0CD006A 0001010D
v_addc_co_u32_e64 v10, vcc, v10, 0, vcc // 000000002D2C: D11C6A0A 01A9010A
// The operands of the second division are staged before s14 is read out of v10.
v_mov_b32_e32 v12, 0x100 // 000000002D34: 7E1802FF 00000100
v_mov_b32_e32 v11, s25 // 000000002D3C: 7E160219
v_readfirstlane_b32 s14, v10 // 000000002D40: 7E1C050A
v_cvt_f32_u32_e32 v10, v12 // 000000002D44: 7E140D0C
v_rcp_iflag_f32_e32 v10, v10 // 000000002D48: 7E14470A
v_cvt_f32_u32_e32 v13, v11 // 000000002D4C: 7E1A0D0B
v_mul_f32_e32 v10, v10, v13 // 000000002D50: 0A141B0A
v_cvt_u32_f32_e32 v10, v10 // 000000002D54: 7E140F0A
v_mul_u32_u24_e32 v13, v10, v12 // 000000002D58: 101A190A
v_sub_u32_e32 v13, v11, v13 // 000000002D5C: 6A1A1B0B
v_cmp_ne_u32_e64 vcc, v13, 0 // 000000002D60: D0CD006A 0001010D
v_addc_co_u32_e64 v10, vcc, v10, 0, vcc // 000000002D68: D11C6A0A 01A9010A
s_nop 0 // 000000002D70: BF800000
v_readfirstlane_b32 s15, v10 // 000000002D74: 7E1E050A
// Splits the incoming index s2 into three coordinates using the tile counts:
//   s4 = s2 / (s14 * s15 * (s50 & 0x3fff));  s2 -= s4 * that product
//   s3 = s2 / s14;                           s2 -= s3 * s14
// Each division is a float reciprocal estimate followed by a fix-up: the quotient
// is incremented when the remainder equals the divisor, and decremented when the
// unsigned remainder is greater than the divisor (which is what an over-estimated
// quotient produces after wrap-around). The fix-ups are applied by temporarily
// narrowing EXEC with v_cmpx and then restoring it to all ones.
// NOTE(review): s2 is presumably the workgroup id supplied at launch -- confirm.
s_waitcnt lgkmcnt(0) // 000000002D78: BF8CC07F
s_mul_i32 s52, s14, s15 // 000000002D7C: 92340F0E
s_and_b32 s53, s50, 0x3fff // 000000002D80: 8635FF32 00003FFF
s_mul_i32 s52, s52, s53 // 000000002D88: 92343534
v_cvt_f32_u32_e32 v10, s52 // 000000002D8C: 7E140C34
v_rcp_iflag_f32_e32 v10, v10 // 000000002D90: 7E14470A
v_cvt_f32_u32_e32 v11, s2 // 000000002D94: 7E160C02
v_mul_f32_e32 v10, v10, v11 // 000000002D98: 0A14170A
v_cvt_u32_f32_e32 v10, v10 // 000000002D9C: 7E140F0A
v_mul_u32_u24_e64 v11, v10, s52 // 000000002DA0: D108000B 0000690A
v_sub_u32_e32 v11, s2, v11 // 000000002DA8: 6A161602
// remainder == divisor: quotient was one too small.
v_cmpx_eq_u32_e64 exec, v11, s52 // 000000002DAC: D0DA007E 0000690B
v_add_u32_e32 v10, 1, v10 // 000000002DB4: 68141481
s_mov_b64 exec, -1 // 000000002DB8: BEFE01C1
// remainder greater than divisor: quotient was one too large.
v_cmpx_gt_u32_e64 exec, v11, s52 // 000000002DBC: D0DC007E 0000690B
v_sub_u32_e64 v10, v10, 1 // 000000002DC4: D135000A 0001030A
s_mov_b64 exec, -1 // 000000002DCC: BEFE01C1
v_readfirstlane_b32 s52, v10 // 000000002DD0: 7E68050A
s_mov_b32 s4, s52 // 000000002DD4: BE840034
// Remove the part of s2 that is accounted for by s4.
s_mul_i32 s52, s15, s14 // 000000002DD8: 92340E0F
s_mul_i32 s52, s52, s4 // 000000002DDC: 92340434
s_mul_i32 s52, s52, s53 // 000000002DE0: 92343534
s_sub_u32 s2, s2, s52 // 000000002DE4: 80823402
// Second division: s3 = s2 / s14, leaving s2 = s2 mod s14.
v_cvt_f32_u32_e32 v10, s14 // 000000002DE8: 7E140C0E
v_rcp_iflag_f32_e32 v10, v10 // 000000002DEC: 7E14470A
v_cvt_f32_u32_e32 v11, s2 // 000000002DF0: 7E160C02
v_mul_f32_e32 v10, v10, v11 // 000000002DF4: 0A14170A
v_cvt_u32_f32_e32 v10, v10 // 000000002DF8: 7E140F0A
v_mul_u32_u24_e64 v11, v10, s14 // 000000002DFC: D108000B 00001D0A
v_sub_u32_e32 v11, s2, v11 // 000000002E04: 6A161602
v_cmpx_eq_u32_e64 exec, v11, s14 // 000000002E08: D0DA007E 00001D0B
v_add_u32_e32 v10, 1, v10 // 000000002E10: 68141481
s_mov_b64 exec, -1 // 000000002E14: BEFE01C1
v_cmpx_gt_u32_e64 exec, v11, s14 // 000000002E18: D0DC007E 00001D0B
v_sub_u32_e64 v10, v10, 1 // 000000002E20: D135000A 0001030A
s_mov_b64 exec, -1 // 000000002E28: BEFE01C1
v_readfirstlane_b32 s52, v10 // 000000002E2C: 7E68050A
s_mov_b32 s3, s52 // 000000002E30: BE830034
s_mul_i32 s52, s3, s14 // 000000002E34: 92340E03
s_sub_u32 s2, s2, s52 // 000000002E38: 80823402
// Move the A (s[32:33]) and B (s[34:35]) base pointers back by 16 bytes. The
// per-thread offsets computed below add 8 elements (16 bytes after the times-2
// scaling) and the buffer limits add 16 bytes, which offsets this bias.
s_sub_u32 s32, s32, 16 // 000000002E3C: 80A09020
s_subb_u32 s33, s33, 0 // 000000002E40: 82A18021
s_sub_u32 s34, s34, 16 // 000000002E44: 80A29022
s_subb_u32 s35, s35, 0 // 000000002E48: 82A38023
// s84 is overwritten before any visible instruction reads this value.
s_and_b32 s84, s50, 0x3fff // 000000002E5C: 8654FF32 00003FFF
// s[6:7] = 0; s8 and s9 are later used as shift amounts (1 means times 2).
s_mov_b64 s[6:7], 0 // 000000002F48: BE860180
s_mov_b32 s8, 1 // 000000002F4C: BE880081
s_mov_b32 s9, 1 // 000000002F50: BE890081
// Keep only the sign-extended low 16 bits of kernel_info1.
s_sext_i32_i16 s11, s11 // 000000002F54: BE8B170B
// Per-thread byte offsets into the A and B buffers, and the row offsets that the
// eight loads per operand add through their scalar-offset operand.
//   v0 = 2 * (v5 + s40 * v4 + 8)      (A; v4 = tid / 8, v5 = (tid & 7) * 8)
//   v1 = 2 * (v7 + s42 * v6 + 8)      (B; v6 = tid / 8, v7 = (tid & 7) * 8)
//   s70..s76 = 2 * s40 * {32, 64, 96, 128, 160, 192, 224}
//   s77..s83 = 2 * s42 * {32, 64, 96, 128, 160, 192, 224}
// NOTE(review): the factor 2 presumably reflects 2-byte elements (the MFMA
// instructions used later are bf16 variants) -- confirm.
v_mul_lo_u32 v10, s40, v4 // 0000000031B8: D285000A 00020828
v_add_co_u32_e32 v0, vcc, v5, v10 // 0000000031C0: 32001505
v_add_u32_e32 v0, 8, v0 // 0000000031C4: 68000088
v_lshlrev_b32_e32 v0, 1, v0 // 0000000031C8: 24000081
s_mul_i32 s70, s40, 32 // 0000000031CC: 9246A028
s_lshl_b32 s70, s70, 1 // 0000000031D0: 8E468146
s_mul_i32 s71, s40, 64 // 0000000031D4: 9247C028
s_lshl_b32 s71, s71, 1 // 0000000031D8: 8E478147
s_mul_i32 s72, s40, 0x60 // 0000000031DC: 9248FF28 00000060
s_lshl_b32 s72, s72, 1 // 0000000031E4: 8E488148
s_mul_i32 s73, s40, 0x80 // 0000000031E8: 9249FF28 00000080
s_lshl_b32 s73, s73, 1 // 0000000031F0: 8E498149
s_mul_i32 s74, s40, 0xa0 // 0000000031F4: 924AFF28 000000A0
s_lshl_b32 s74, s74, 1 // 0000000031FC: 8E4A814A
s_mul_i32 s75, s40, 0xc0 // 000000003200: 924BFF28 000000C0
s_lshl_b32 s75, s75, 1 // 000000003208: 8E4B814B
s_mul_i32 s76, s40, 0xe0 // 00000000320C: 924CFF28 000000E0
s_lshl_b32 s76, s76, 1 // 000000003214: 8E4C814C
// Same layout for B, using strideB0 (s42).
v_mul_lo_u32 v10, s42, v6 // 000000003218: D285000A 00020C2A
v_add_co_u32_e32 v1, vcc, v7, v10 // 000000003220: 32021507
v_add_u32_e32 v1, 8, v1 // 000000003224: 68020288
v_lshlrev_b32_e32 v1, 1, v1 // 000000003228: 24020281
s_mul_i32 s77, s42, 32 // 00000000322C: 924DA02A
s_lshl_b32 s77, s77, 1 // 000000003230: 8E4D814D
s_mul_i32 s78, s42, 64 // 000000003234: 924EC02A
s_lshl_b32 s78, s78, 1 // 000000003238: 8E4E814E
s_mul_i32 s79, s42, 0x60 // 00000000323C: 924FFF2A 00000060
s_lshl_b32 s79, s79, 1 // 000000003244: 8E4F814F
s_mul_i32 s80, s42, 0x80 // 000000003248: 9250FF2A 00000080
s_lshl_b32 s80, s80, 1 // 000000003250: 8E508150
s_mul_i32 s81, s42, 0xa0 // 000000003254: 9251FF2A 000000A0
s_lshl_b32 s81, s81, 1 // 00000000325C: 8E518151
s_mul_i32 s82, s42, 0xc0 // 000000003260: 9252FF2A 000000C0
s_lshl_b32 s82, s82, 1 // 000000003268: 8E528152
s_mul_i32 s83, s42, 0xe0 // 00000000326C: 9253FF2A 000000E0
s_lshl_b32 s83, s83, 1 // 000000003274: 8E538153
// Builds the buffer resource s[52:55] used by the A loads and the 64-bit
// remaining-size counter s[60:61].
//   start (elements) = (s2 * 256) * s40 + 64 * s6          -> s[86:87]
//   s[60:61] = 2 * (1 + (s27 - 1) + (s24 - 1) * s40 - start) + 16
//   s54      = s[60:61] if it fits in 32 bits, otherwise 0xFFFFFFFF
//   s[52:53] = s[32:33] + 2 * (start + s41 * s4)
//   s55      = 0x20000
// The high half written by the first s_mul_hi_u32 is immediately replaced by the
// second one, so only the low 32 bits of s2 * 256 take part in the product.
s_mul_hi_u32 s87, s2, 0x100 // 000000003278: 9657FF02 00000100
s_mul_i32 s86, s2, 0x100 // 000000003280: 9256FF02 00000100
s_mul_hi_u32 s87, s86, s40 // 000000003288: 96572856
s_mul_i32 s86, s86, s40 // 00000000328C: 92562856
// SCC = ((s50 & 0x8000) != 0). With s50 = 1 (from the constant kernel_info0)
// the branch is not taken and s[84:85] becomes 64 * s6.
// NOTE(review): if the branch were taken, s85 would keep whatever value it held
// before -- confirm that configuration is never used with this code.
s_and_b32 s84, s50, 0x8000 // 000000003290: 8654FF32 00008000
s_cbranch_scc1 label_GSUC_A // 000000003298: BF850003
s_mul_hi_u32 s85, 64, s6 // 00000000329C: 965506C0
s_mul_i32 s84, 64, s6 // 0000000032A0: 925406C0
label_GSUC_A:
s_add_u32 s86, s86, s84 // 000000003330: 80565456
s_addc_u32 s87, s87, s85 // 000000003334: 82575557
// Total extent in elements: 1 + (s27 - 1) * 1 + (s24 - 1) * s40.
s_mov_b64 s[60:61], 1 // 000000003338: BEBC0181
s_sub_u32 s84, s27, 1 // 00000000333C: 80D4811B
s_mul_hi_u32 s85, 1, s84 // 000000003340: 96555481
s_mul_i32 s84, 1, s84 // 000000003344: 92545481
s_add_u32 s60, s60, s84 // 000000003348: 803C543C
s_addc_u32 s61, s61, s85 // 00000000334C: 823D553D
s_sub_u32 s84, s24, 1 // 000000003350: 80D48118
s_mul_hi_u32 s85, s40, s84 // 000000003354: 96555428
s_mul_i32 s84, s40, s84 // 000000003358: 92545428
s_add_u32 s60, s60, s84 // 00000000335C: 803C543C
s_addc_u32 s61, s61, s85 // 000000003360: 823D553D
// Subtract this workgroup's start, convert to bytes, add the 16-byte bias.
s_sub_u32 s60, s60, s86 // 000000003364: 80BC563C
s_subb_u32 s61, s61, s87 // 000000003368: 82BD573D
s_lshl_b64 s[60:61], s[60:61], 1 // 00000000336C: 8EBC813C
s_add_u32 s60, s60, 16 // 000000003370: 803C903C
s_addc_u32 s61, s61, 0 // 000000003374: 823D803D
// Saturate the 64-bit byte count into the 32-bit resource word.
s_cmp_eq_u32 s61, 0 // 000000003378: BF06803D
s_cselect_b32 s54, s60, -1 // 00000000337C: 8536C13C
s_mul_hi_u32 s85, s41, s4 // 000000003380: 96550429
s_mul_i32 s84, s41, s4 // 000000003384: 92540429
s_add_u32 s86, s86, s84 // 000000003388: 80565456
s_addc_u32 s87, s87, s85 // 00000000338C: 82575557
s_lshl_b64 s[86:87], s[86:87], 1 // 000000003390: 8ED68156
s_add_u32 s52, s32, s86 // 000000003394: 80345620
s_addc_u32 s53, s33, s87 // 000000003398: 82355721
s_mov_b32 s55, 0x20000 // 00000000339C: BEB700FF 00020000
// Builds the buffer resource s[56:59] used by the B loads and the 64-bit
// remaining-size counter s[62:63]. Same arithmetic as for A, with s3 as the tile
// index and strideB0 / strideB1 (s42 / s43) as strides:
//   start (elements) = (s3 * 256) * s42 + 64 * s6          -> s[86:87]
//   s[62:63] = 2 * (1 + (s27 - 1) + (s25 - 1) * s42 - start) + 16
//   s58      = s[62:63] if it fits in 32 bits, otherwise 0xFFFFFFFF
//   s[56:57] = s[34:35] + 2 * (start + s43 * s4)
//   s59      = 0x20000
s_mul_hi_u32 s87, s3, 0x100 // 0000000033A4: 9657FF03 00000100
s_mul_i32 s86, s3, 0x100 // 0000000033AC: 9256FF03 00000100
s_mul_hi_u32 s87, s86, s42 // 0000000033B4: 96572A56
s_mul_i32 s86, s86, s42 // 0000000033B8: 92562A56
// SCC = ((s50 & 0x8000) != 0); not taken for the constant s50 = 1.
// NOTE(review): as in the A path, s85 is stale if the branch is taken -- confirm.
s_and_b32 s84, s50, 0x8000 // 0000000033BC: 8654FF32 00008000
s_cbranch_scc1 label_GSUC_B // 0000000033C4: BF850003
s_mul_hi_u32 s85, 64, s6 // 0000000033C8: 965506C0
s_mul_i32 s84, 64, s6 // 0000000033CC: 925406C0
label_GSUC_B:
s_add_u32 s86, s86, s84 // 00000000345C: 80565456
s_addc_u32 s87, s87, s85 // 000000003460: 82575557
// Total extent in elements: 1 + (s27 - 1) * 1 + (s25 - 1) * s42.
s_mov_b64 s[62:63], 1 // 000000003464: BEBE0181
s_sub_u32 s84, s27, 1 // 000000003468: 80D4811B
s_mul_hi_u32 s85, 1, s84 // 00000000346C: 96555481
s_mul_i32 s84, 1, s84 // 000000003470: 92545481
s_add_u32 s62, s62, s84 // 000000003474: 803E543E
s_addc_u32 s63, s63, s85 // 000000003478: 823F553F
s_sub_u32 s84, s25, 1 // 00000000347C: 80D48119
s_mul_hi_u32 s85, s42, s84 // 000000003480: 9655542A
s_mul_i32 s84, s42, s84 // 000000003484: 9254542A
s_add_u32 s62, s62, s84 // 000000003488: 803E543E
s_addc_u32 s63, s63, s85 // 00000000348C: 823F553F
// Subtract this workgroup's start, convert to bytes, add the 16-byte bias.
s_sub_u32 s62, s62, s86 // 000000003490: 80BE563E
s_subb_u32 s63, s63, s87 // 000000003494: 82BF573F
s_lshl_b64 s[62:63], s[62:63], 1 // 000000003498: 8EBE813E
s_add_u32 s62, s62, 16 // 00000000349C: 803E903E
s_addc_u32 s63, s63, 0 // 0000000034A0: 823F803F
// Saturate the 64-bit byte count into the 32-bit resource word.
s_cmp_eq_u32 s63, 0 // 0000000034A4: BF06803F
s_cselect_b32 s58, s62, -1 // 0000000034A8: 853AC13E
s_mul_hi_u32 s85, s43, s4 // 0000000034AC: 9655042B
s_mul_i32 s84, s43, s4 // 0000000034B0: 9254042B
s_add_u32 s86, s86, s84 // 0000000034B4: 80565456
s_addc_u32 s87, s87, s85 // 0000000034B8: 82575557
s_lshl_b64 s[86:87], s[86:87], 1 // 0000000034BC: 8ED68156
s_add_u32 s56, s34, s86 // 0000000034C0: 80385622
s_addc_u32 s57, s35, s87 // 0000000034C4: 82395723
s_mov_b32 s59, 0x20000 // 0000000034C8: BEBB00FF 00020000
// Per-iteration pointer increments and loop counters.
//   s68 (A) and s69 (B) = 0x80 if bit 15 of s50 is set, else (s50 & 0x3fff) * 0x80.
//     With s50 = 1 both are 0x80 bytes, i.e. 64 two-byte elements per iteration.
//   s12 = s13 = s27 / 64   (number of main-loop iterations)
//   s86 = bits 8..12 of s10, s87 = bits 13..15 of s10, s10 = s84 = low 8 bits of s10
// s10 is 0 for the constant kernel_info0 = 1, so all three fields are 0 here.
s_and_b32 s85, s50, 0x3fff // 0000000034D0: 8655FF32 00003FFF
s_mul_i32 s85, s85, 0x80 // 0000000034D8: 9255FF55 00000080
// The s_and_b32 is executed for its SCC result, which drives the s_cselect_b32.
s_and_b32 s84, s50, 0x8000 // 0000000034E0: 8654FF32 00008000
s_cselect_b32 s68, 0x80, s85 // 0000000034E8: 854455FF 00000080
s_and_b32 s85, s50, 0x3fff // 0000000034F0: 8655FF32 00003FFF
s_mul_i32 s85, s85, 0x80 // 0000000034F8: 9255FF55 00000080
s_and_b32 s84, s50, 0x8000 // 000000003500: 8654FF32 00008000
s_cselect_b32 s69, 0x80, s85 // 000000003508: 854555FF 00000080
s_lshr_b32 s12, s27, 6 // 000000003510: 8F0C861B
// This s84 value is replaced below before it is read.
s_and_b32 s84, s50, 0x3fff // 000000003514: 8654FF32 00003FFF
s_mov_b32 s13, s12 // 000000003594: BE8D000C
s_and_b32 s86, s10, 0x1f00 // 000000003598: 8656FF0A 00001F00
s_lshr_b32 s86, s86, 8 // 0000000035A0: 8F568856
s_and_b32 s87, s10, 0xe000 // 0000000035A4: 8657FF0A 0000E000
s_and_b32 s10, s10, 0xff // 0000000035AC: 860AFF0A 000000FF
s_mov_b32 s84, s10 // 0000000035B4: BED4000A
label_beginStaggerUIter:
// Applies the initial ("stagger") offset s51 to both operand pointers and
// precomputes the wrap-around increments.
//   s51      = ((s84 - 1, or 0 when s84 is 0) AND s84) shifted left by s86
//   s[64:65] = s68 - s12 * s68   (A increment used on the iteration that wraps)
//   s[66:67] = s69 - s12 * s69   (B increment used on the iteration that wraps)
//   A: s[52:53] += s51 * s68, s[60:61] -= s51 * s68, s54 re-saturated
//   B: s[56:57] += s51 * s69, s[62:63] -= s51 * s69, s58 re-saturated
//   s51 += 2
// With s84 = 0 (constant info words) s51 is 0 before the final add, so no offset is applied.
// The SCC results of the s_cmp_ge_u32 s13 and s_cmp_eq_u32 s87 compares are
// overwritten before any visible instruction consumes them.
s_lshl_b32 s85, s84, s86 // 0000000035B8: 8E555654
s_cmp_ge_u32 s13, s85 // 0000000035BC: BF09550D
s_sub_u32 s85, s84, 1 // 0000000035CC: 80D58154
s_cmp_ge_u32 s84, 1 // 0000000035D0: BF098154
s_cselect_b32 s51, s85, 0 // 0000000035D4: 85338055
s_cmp_eq_u32 s87, 0x2000 // 0000000035E8: BF06FF57 00002000
s_and_b32 s51, s51, s84 // 000000003640: 86335433
s_lshl_b32 s51, s51, s86 // 000000003644: 8E335633
// A: 64-bit signed product s51 * s68 and the wrap increment.
s_mul_hi_i32 s85, s51, s68 // 000000003648: 96D54433
s_mul_i32 s84, s51, s68 // 00000000364C: 92544433
s_mul_hi_i32 s65, s12, s68 // 000000003650: 96C1440C
s_mul_i32 s64, s12, s68 // 000000003654: 9240440C
s_sub_u32 s64, s68, s64 // 000000003658: 80C04044
s_subb_u32 s65, 0, s65 // 00000000365C: 82C14180
s_add_u32 s52, s52, s84 // 000000003660: 80345434
s_addc_u32 s53, s53, s85 // 000000003664: 82355535
s_sub_u32 s60, s60, s84 // 000000003668: 80BC543C
s_subb_u32 s61, s61, s85 // 00000000366C: 82BD553D
s_cmp_eq_u32 s61, 0 // 000000003670: BF06803D
s_cselect_b32 s54, s60, -1 // 000000003674: 8536C13C
// B: same steps with s69.
s_mul_hi_i32 s85, s51, s69 // 000000003678: 96D54533
s_mul_i32 s84, s51, s69 // 00000000367C: 92544533
s_mul_hi_i32 s67, s12, s69 // 000000003680: 96C3450C
s_mul_i32 s66, s12, s69 // 000000003684: 9242450C
s_sub_u32 s66, s69, s66 // 000000003688: 80C24245
s_subb_u32 s67, 0, s67 // 00000000368C: 82C34380
s_add_u32 s56, s56, s84 // 000000003690: 80385438
s_addc_u32 s57, s57, s85 // 000000003694: 82395539
s_sub_u32 s62, s62, s84 // 000000003698: 80BE543E
s_subb_u32 s63, s63, s85 // 00000000369C: 82BF553F
s_cmp_eq_u32 s63, 0 // 0000000036A0: BF06803F
s_cselect_b32 s58, s62, -1 // 0000000036A4: 853AC13E
s_add_u32 s51, s51, 2 // 0000000036A8: 80338233
// Zero loop iterations: skip the first prefetch entirely.
s_cmp_eq_u32 s12, 0 // 0000000036AC: BF06800C
s_cbranch_scc1 label_ShadowInitStart // 0000000036B0: BF850092
// First prefetch: 8 buffer loads for A and 8 for B, each writing straight to LDS
// ("lds" modifier). m0 supplies the LDS destination and is advanced by 0x1040
// between loads; the scalar offsets s70..s76 / s77..s83 select rows 32 apart.
// The accumulator registers are zeroed in between, while the loads are in flight.
s_mov_b32 m0, s46 // 0000000036B4: BEFC002E
buffer_load_dwordx4 v0, s[52:55], 0 offen lds // 0000000036B8: E05D1000 800D0000
s_add_u32 m0, m0, 0x1040 // 0000000036C0: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s70 offen lds // 0000000036C8: E05D1000 460D0000
s_add_u32 m0, m0, 0x1040 // 0000000036D0: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s71 offen lds // 0000000036D8: E05D1000 470D0000
s_add_u32 m0, m0, 0x1040 // 0000000036E0: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s72 offen lds // 0000000036E8: E05D1000 480D0000
s_add_u32 m0, m0, 0x1040 // 0000000036F0: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s73 offen lds // 0000000036F8: E05D1000 490D0000
s_add_u32 m0, m0, 0x1040 // 000000003700: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s74 offen lds // 000000003708: E05D1000 4A0D0000
s_add_u32 m0, m0, 0x1040 // 000000003710: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s75 offen lds // 000000003718: E05D1000 4B0D0000
s_add_u32 m0, m0, 0x1040 // 000000003720: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s76 offen lds // 000000003728: E05D1000 4C0D0000
// B loads start at the LDS offset s47 (0x8200 above the A region).
s_mov_b32 m0, s47 // 000000003730: BEFC002F
buffer_load_dwordx4 v1, s[56:59], 0 offen lds // 000000003734: E05D1000 800E0001
s_add_u32 m0, m0, 0x1040 // 00000000373C: 807CFF7C 00001040
buffer_load_dwordx4 v1, s[56:59], s77 offen lds // 000000003744: E05D1000 4D0E0001
s_add_u32 m0, m0, 0x1040 // 00000000374C: 807CFF7C 00001040
buffer_load_dwordx4 v1, s[56:59], s78 offen lds // 000000003754: E05D1000 4E0E0001
s_add_u32 m0, m0, 0x1040 // 00000000375C: 807CFF7C 00001040
// Zero a0..a15 explicitly, then zero a[16:255] sixteen registers at a time with
// MFMAs whose A/B inputs (v[6:9]) are zero and whose addend is the zeroed a[0:15].
v_accvgpr_write_b32 a0, 0 // 000000003764: D3D94000 18000080
v_accvgpr_write_b32 a1, 0 // 00000000376C: D3D94001 18000080
v_accvgpr_write_b32 a2, 0 // 000000003774: D3D94002 18000080
v_accvgpr_write_b32 a3, 0 // 00000000377C: D3D94003 18000080
v_accvgpr_write_b32 a4, 0 // 000000003784: D3D94004 18000080
v_accvgpr_write_b32 a5, 0 // 00000000378C: D3D94005 18000080
v_accvgpr_write_b32 a6, 0 // 000000003794: D3D94006 18000080
v_accvgpr_write_b32 a7, 0 // 00000000379C: D3D94007 18000080
v_accvgpr_write_b32 a8, 0 // 0000000037A4: D3D94008 18000080
v_accvgpr_write_b32 a9, 0 // 0000000037AC: D3D94009 18000080
v_accvgpr_write_b32 a10, 0 // 0000000037B4: D3D9400A 18000080
v_accvgpr_write_b32 a11, 0 // 0000000037BC: D3D9400B 18000080
v_accvgpr_write_b32 a12, 0 // 0000000037C4: D3D9400C 18000080
v_accvgpr_write_b32 a13, 0 // 0000000037CC: D3D9400D 18000080
v_accvgpr_write_b32 a14, 0 // 0000000037D4: D3D9400E 18000080
v_accvgpr_write_b32 a15, 0 // 0000000037DC: D3D9400F 18000080
v_mov_b64_e32 v[6:7], 0 // 0000000037E4: 7E0C7080
v_mov_b64_e32 v[8:9], 0 // 0000000037E8: 7E107080
v_mfma_f32_32x32x16_bf16 a[16:31], v[6:9], v[6:9], a[0:15] // 0000000037EC: D3B78010 04020D06
v_mfma_f32_32x32x16_bf16 a[32:47], v[6:9], v[6:9], a[0:15] // 0000000037F4: D3B78020 04020D06
v_mfma_f32_32x32x16_bf16 a[48:63], v[6:9], v[6:9], a[0:15] // 0000000037FC: D3B78030 04020D06
v_mfma_f32_32x32x16_bf16 a[64:79], v[6:9], v[6:9], a[0:15] // 000000003804: D3B78040 04020D06
v_mfma_f32_32x32x16_bf16 a[80:95], v[6:9], v[6:9], a[0:15] // 00000000380C: D3B78050 04020D06
v_mfma_f32_32x32x16_bf16 a[96:111], v[6:9], v[6:9], a[0:15]// 000000003814: D3B78060 04020D06
v_mfma_f32_32x32x16_bf16 a[112:127], v[6:9], v[6:9], a[0:15]// 00000000381C: D3B78070 04020D06
v_mfma_f32_32x32x16_bf16 a[128:143], v[6:9], v[6:9], a[0:15]// 000000003824: D3B78080 04020D06
buffer_load_dwordx4 v1, s[56:59], s79 offen lds // 00000000382C: E05D1000 4F0E0001
s_add_u32 m0, m0, 0x1040 // 000000003834: 807CFF7C 00001040
v_mfma_f32_32x32x16_bf16 a[144:159], v[6:9], v[6:9], a[0:15]// 00000000383C: D3B78090 04020D06
v_mfma_f32_32x32x16_bf16 a[160:175], v[6:9], v[6:9], a[0:15]// 000000003844: D3B780A0 04020D06
v_mfma_f32_32x32x16_bf16 a[176:191], v[6:9], v[6:9], a[0:15]// 00000000384C: D3B780B0 04020D06
v_mfma_f32_32x32x16_bf16 a[192:207], v[6:9], v[6:9], a[0:15]// 000000003854: D3B780C0 04020D06
v_mfma_f32_32x32x16_bf16 a[208:223], v[6:9], v[6:9], a[0:15]// 00000000385C: D3B780D0 04020D06
v_mfma_f32_32x32x16_bf16 a[224:239], v[6:9], v[6:9], a[0:15]// 000000003864: D3B780E0 04020D06
v_mfma_f32_32x32x16_bf16 a[240:255], v[6:9], v[6:9], a[0:15]// 00000000386C: D3B780F0 04020D06
buffer_load_dwordx4 v1, s[56:59], s80 offen lds // 000000003874: E05D1000 500E0001
s_add_u32 m0, m0, 0x1040 // 00000000387C: 807CFF7C 00001040
buffer_load_dwordx4 v1, s[56:59], s81 offen lds // 000000003884: E05D1000 510E0001
s_add_u32 m0, m0, 0x1040 // 00000000388C: 807CFF7C 00001040
buffer_load_dwordx4 v1, s[56:59], s82 offen lds // 000000003894: E05D1000 520E0001
s_add_u32 m0, m0, 0x1040 // 00000000389C: 807CFF7C 00001040
buffer_load_dwordx4 v1, s[56:59], s83 offen lds // 0000000038A4: E05D1000 530E0001
// Advance A for the next tile: add the wrap increment s[64:65] when s51 equals
// s12 + 1, otherwise add s68; shrink the remaining size and re-saturate s54.
s_add_u32 s86, s12, 1 // 0000000038AC: 8056810C
s_cmp_eq_u32 s51, s86 // 0000000038B0: BF065633
s_cselect_b32 s84, s64, s68 // 0000000038B4: 85544440
s_cselect_b32 s85, s65, 0 // 0000000038B8: 85558041
s_add_u32 s52, s52, s84 // 0000000038BC: 80345434
s_addc_u32 s53, s53, s85 // 0000000038C0: 82355535
s_sub_u32 s60, s60, s84 // 0000000038C4: 80BC543C
s_subb_u32 s61, s61, s85 // 0000000038C8: 82BD553D
s_cmp_eq_u32 s61, 0 // 0000000038CC: BF06803D
s_cselect_b32 s54, s60, -1 // 0000000038D0: 8536C13C
// Advance B the same way with s[66:67] / s69 and s58.
s_add_u32 s86, s12, 1 // 0000000038D4: 8056810C
s_cmp_eq_u32 s51, s86 // 0000000038D8: BF065633
s_cselect_b32 s84, s66, s69 // 0000000038DC: 85544542
s_cselect_b32 s85, s67, 0 // 0000000038E0: 85558043
s_add_u32 s56, s56, s84 // 0000000038E4: 80385438
s_addc_u32 s57, s57, s85 // 0000000038E8: 82395539
s_sub_u32 s62, s62, s84 // 0000000038EC: 80BE543E
s_subb_u32 s63, s63, s85 // 0000000038F0: 82BF553F
s_cmp_eq_u32 s63, 0 // 0000000038F4: BF06803F
s_cselect_b32 s58, s62, -1 // 0000000038F8: 853AC13E
label_ShadowInitStart:
// Builds two more 4-dword descriptors for the output side:
//   s[16:19]: base s[28:29] (the C argument loaded at entry), words 2/3 = 0x80000000 / 0x20000
//   s[20:23]: base s[30:31], words 2/3 = 0x80000000 / 0x20000
// and offsets their bases by the workgroup's tile position:
//   s[20:21] = s[30:31] + ((256 * s3 * s38) shifted left by s8) + ((s4 * s39) shifted left by s8)
//   s[16:17] = s[28:29] + ((256 * s3 * s36) shifted left by s9) + ((s4 * s37) shifted left by s9)
//   s[16:17] += 4 * (s24 * s6 + (s25 - 1) * s6 * s38 + (s26 - 1) * s6 * s39)
// s8 = s9 = 1 and s6 = 0 were set earlier, so the last term is zero here.
// NOTE(review): s[30:31] is never written in the visible code, so the s[20:23] base
// is built from an undefined value -- confirm that s[20:23] is unused or that
// s[30:31] is initialised elsewhere.
s_mov_b64 s[16:17], s[28:29] // 0000000038FC: BE90011C
s_mov_b32 s18, 0x80000000 // 000000003900: BE9200FF 80000000
s_mov_b32 s19, 0x20000 // 000000003908: BE9300FF 00020000
s_mov_b64 s[20:21], s[30:31] // 000000003910: BE94011E
s_mov_b32 s22, 0x80000000 // 000000003914: BE9600FF 80000000
s_mov_b32 s23, 0x20000 // 00000000391C: BE9700FF 00020000
// Tile offset along the s3 coordinate (256 rows per tile).
s_mul_i32 s86, 0x100, s3 // 000000003924: 925603FF 00000100
s_mul_hi_u32 s85, s86, s38 // 00000000392C: 96552656
s_mul_i32 s84, s86, s38 // 000000003930: 92542656
s_lshl_b64 s[84:85], s[84:85], s8 // 000000003934: 8ED40854
s_add_u32 s20, s30, s84 // 000000003938: 8014541E
s_addc_u32 s21, s31, s85 // 00000000393C: 8215551F
s_mul_hi_u32 s85, s86, s36 // 000000003940: 96552456
s_mul_i32 s84, s86, s36 // 000000003944: 92542456
s_lshl_b64 s[84:85], s[84:85], s9 // 000000003948: 8ED40954
s_add_u32 s16, s28, s84 // 00000000394C: 8010541C
s_addc_u32 s17, s29, s85 // 000000003950: 8211551D
// Offset along the s4 coordinate (strides s39 / s37 are 0 in this build).
s_mul_hi_u32 s85, s4, s39 // 000000003954: 96552704
s_mul_i32 s84, s4, s39 // 000000003958: 92542704
s_lshl_b64 s[84:85], s[84:85], s8 // 00000000395C: 8ED40854
s_add_u32 s20, s20, s84 // 000000003960: 80145414
s_addc_u32 s21, s21, s85 // 000000003964: 82155515
s_mul_hi_u32 s85, s4, s37 // 000000003968: 96552504
s_mul_i32 s84, s4, s37 // 00000000396C: 92542504
s_lshl_b64 s[84:85], s[84:85], s9 // 000000003970: 8ED40954
s_add_u32 s16, s16, s84 // 000000003974: 80105410
s_addc_u32 s17, s17, s85 // 000000003978: 82115511
// Additional s6-scaled offset, in units of 4 bytes.
s_mul_hi_u32 s85, s24, s6 // 00000000398C: 96550618
s_mul_i32 s84, s24, s6 // 000000003990: 92540618
s_sub_u32 s86, s25, 1 // 000000003994: 80D68119
s_mul_i32 s86, s86, s6 // 000000003998: 92560656
s_mul_hi_u32 s87, s86, s38 // 00000000399C: 96572656
s_mul_i32 s86, s86, s38 // 0000000039A0: 92562656
s_add_u32 s84, s84, s86 // 0000000039A4: 80545654
s_addc_u32 s85, s85, s87 // 0000000039A8: 82555755
s_sub_u32 s86, s26, 1 // 0000000039AC: 80D6811A
s_mul_i32 s86, s86, s6 // 0000000039B0: 92560656
s_mul_hi_u32 s87, s86, s39 // 0000000039B4: 96572756
s_mul_i32 s86, s86, s39 // 0000000039B8: 92562756
s_add_u32 s84, s84, s86 // 0000000039BC: 80545654
s_addc_u32 s85, s85, s87 // 0000000039C0: 82555755
s_lshl_b64 s[84:85], s[84:85], 2 // 0000000039C4: 8ED48254
s_add_u32 s16, s16, s84 // 0000000039C8: 80105410
s_addc_u32 s17, s17, s85 // 0000000039CC: 82115511
label_NoBranch_T8JHFHKM7BO5OHXW:
// Second prefetch. XOR-ing with s48 / s49 moves the LDS destinations s46 / s47 to
// the other buffer (0x10400 bytes away). Unless s12 == 1, the same 8 + 8 LDS-direct
// loads as in the first prefetch are issued from the already-advanced A / B
// descriptors, and s46 / s47 are then flipped back.
// When s12 == 1 the loads and the flip back are skipped, so s46 / s47 stay flipped.
s_xor_b32 s46, s48, s46 // 0000000039F0: 882E2E30
s_xor_b32 s47, s49, s47 // 0000000039F4: 882F2F31
s_cmp_eq_u32 s12, 1 // 0000000039F8: BF06810C
s_cbranch_scc1 label_skipPGR2 // 0000000039FC: BF850040
s_mov_b32 m0, s46 // 000000003A00: BEFC002E
buffer_load_dwordx4 v0, s[52:55], 0 offen lds // 000000003A04: E05D1000 800D0000
s_add_u32 m0, m0, 0x1040 // 000000003A0C: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s70 offen lds // 000000003A14: E05D1000 460D0000
s_add_u32 m0, m0, 0x1040 // 000000003A1C: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s71 offen lds // 000000003A24: E05D1000 470D0000
s_add_u32 m0, m0, 0x1040 // 000000003A2C: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s72 offen lds // 000000003A34: E05D1000 480D0000
s_add_u32 m0, m0, 0x1040 // 000000003A3C: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s73 offen lds // 000000003A44: E05D1000 490D0000
s_add_u32 m0, m0, 0x1040 // 000000003A4C: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s74 offen lds // 000000003A54: E05D1000 4A0D0000
s_add_u32 m0, m0, 0x1040 // 000000003A5C: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s75 offen lds // 000000003A64: E05D1000 4B0D0000
s_add_u32 m0, m0, 0x1040 // 000000003A6C: 807CFF7C 00001040
buffer_load_dwordx4 v0, s[52:55], s76 offen lds // 000000003A74: E05D1000 4C0D0000
// B loads into the second buffer.
s_mov_b32 m0, s47 // 000000003A7C: BEFC002F
buffer_load_dwordx4 v1, s[56:59], 0 offen lds // 000000003A80: E05D1000 800E0001
s_add_u32 m0, m0, 0x1040 // 000000003A88: 807CFF7C 00001040
buffer_load_dwordx4 v1, s[56:59], s77 offen lds // 000000003A90: E05D1000 4D0E0001
s_add_u32 m0, m0, 0x1040 // 000000003A98: 807CFF7C 00001040
buffer_load_dwordx4 v1, s[56:59], s78 offen lds // 000000003AA0: E05D1000 4E0E0001
s_add_u32 m0, m0, 0x1040 // 000000003AA8: 807CFF7C 00001040
buffer_load_dwordx4 v1, s[56:59], s79 offen lds // 000000003AB0: E05D1000 4F0E0001
s_add_u32 m0, m0, 0x1040 // 000000003AB8: 807CFF7C 00001040
buffer_load_dwordx4 v1, s[56:59], s80 offen lds // 000000003AC0: E05D1000 500E0001
s_add_u32 m0, m0, 0x1040 // 000000003AC8: 807CFF7C 00001040
buffer_load_dwordx4 v1, s[56:59], s81 offen lds // 000000003AD0: E05D1000 510E0001
s_add_u32 m0, m0, 0x1040 // 000000003AD8: 807CFF7C 00001040
buffer_load_dwordx4 v1, s[56:59], s82 offen lds // 000000003AE0: E05D1000 520E0001
s_add_u32 m0, m0, 0x1040 // 000000003AE8: 807CFF7C 00001040
buffer_load_dwordx4 v1, s[56:59], s83 offen lds // 000000003AF0: E05D1000 530E0001
// Flip the LDS destinations back to the first buffer.
s_xor_b32 s46, s48, s46 // 000000003AF8: 882E2E30
s_xor_b32 s47, s49, s47 // 000000003AFC: 882F2F31
label_skipPGR2:
// First LDS reads for the main loop.
// After both prefetches 32 buffer loads have been issued in the order
// A(8), B(8), A(8), B(8). vmcnt(24) therefore waits for the first 8 (tile-0 A)
// and vmcnt(16) for the first 16 (tile-0 A and B). Each wait is followed by a
// workgroup barrier before the data is read back from LDS:
//   v[4:35]  = 8 x 16 bytes from v2 (the region written through s46), 128 bytes apart
//   v[68:99] = 8 x 16 bytes from v3 (the region written through s47), 128 bytes apart
// NOTE(review): when this label is reached through the s12 == 1 branch the second
// prefetch was skipped, so at most 16 loads are outstanding and neither vmcnt(24)
// nor vmcnt(16) waits for any of them -- confirm that this path cannot be taken
// with s12 == 1, or that it is acceptable for the sizes this kernel supports.
s_waitcnt vmcnt(24) // 000000003B00: BF8C4F78
s_barrier // 000000003B04: BF8A0000
ds_read_b128 v[4:7], v2 // 000000003B08: D9FE0000 04000002
ds_read_b128 v[8:11], v2 offset:128 // 000000003B10: D9FE0080 08000002
ds_read_b128 v[12:15], v2 offset:256 // 000000003B18: D9FE0100 0C000002
ds_read_b128 v[16:19], v2 offset:384 // 000000003B20: D9FE0180 10000002
ds_read_b128 v[20:23], v2 offset:512 // 000000003B28: D9FE0200 14000002
ds_read_b128 v[24:27], v2 offset:640 // 000000003B30: D9FE0280 18000002
ds_read_b128 v[28:31], v2 offset:768 // 000000003B38: D9FE0300 1C000002
ds_read_b128 v[32:35], v2 offset:896 // 000000003B40: D9FE0380 20000002
s_waitcnt vmcnt(16) // 000000003B48: BF8C4F70
s_barrier // 000000003B4C: BF8A0000
ds_read_b128 v[68:71], v3 // 000000003B50: D9FE0000 44000003
ds_read_b128 v[72:75], v3 offset:128 // 000000003B58: D9FE0080 48000003
ds_read_b128 v[76:79], v3 offset:256 // 000000003B60: D9FE0100 4C000003
ds_read_b128 v[80:83], v3 offset:384 // 000000003B68: D9FE0180 50000003
ds_read_b128 v[84:87], v3 offset:512 // 000000003B70: D9FE0200 54000003
ds_read_b128 v[88:91], v3 offset:640 // 000000003B78: D9FE0280 58000003
ds_read_b128 v[92:95], v3 offset:768 // 000000003B80: D9FE0300 5C000003
ds_read_b128 v[96:99], v3 offset:896 // 000000003B88: D9FE0380 60000003
// All sixteen LDS reads must have landed before the first MFMA consumes them.
s_waitcnt lgkmcnt(0) // 000000003B90: BF8CC07F
label_openLoopL:
// Main-loop entry dispatch on the iteration count s12:
//   s12 == 1 -> label_toPGR1
//   s12 == 2 (or 0) -> label_LoopEndL
//   otherwise fall through into label_LoopBeginL
// Both branch targets lie outside the visible part of the file.
s_cmp_eq_u32 s12, 1 // 000000003B94: BF06810C
s_cbranch_scc1 label_toPGR1 // 000000003B98: BF8502E5
s_cmp_le_u32 s12, 2 // 000000003B9C: BF0B820C
s_cbranch_scc1 label_LoopEndL // 000000003BA0: BF85019E
label_LoopBeginL:
v_mfma_f32_16x16x32_bf16 a[0:3], v[68:71], v[4:7], a[0:3] // 000000003BA4: D3B58000 04020944
ds_read_b128 v[36:39], v2 offset:64 // 000000003BAC: D9FE0040 24000002
v_mfma_f32_16x16x32_bf16 a[4:7], v[68:71], v[8:11], a[4:7] // 000000003BB4: D3B58004 04121144
s_cmp_eq_u32 s12, s51 // 000000003BBC: BF06330C
s_cselect_b32 s84, s64, s68 // 000000003BC0: 85544440
v_mfma_f32_16x16x32_bf16 a[8:11], v[68:71], v[12:15], a[8:11]// 000000003BC4: D3B58008 04221944
ds_read_b128 v[40:43], v2 offset:192 // 000000003BCC: D9FE00C0 28000002
v_mfma_f32_16x16x32_bf16 a[12:15], v[68:71], v[16:19], a[12:15]// 000000003BD4: D3B5800C 04322144
s_cselect_b32 s85, s65, 0 // 000000003BDC: 85558041
s_add_u32 s52, s52, s84 // 000000003BE0: 80345434
v_mfma_f32_16x16x32_bf16 a[16:19], v[68:71], v[20:23], a[16:19]// 000000003BE4: D3B58010 04422944
ds_read_b128 v[44:47], v2 offset:320 // 000000003BEC: D9FE0140 2C000002
v_mfma_f32_16x16x32_bf16 a[20:23], v[68:71], v[24:27], a[20:23]// 000000003BF4: D3B58014 04523144
s_addc_u32 s53, s53, s85 // 000000003BFC: 82355535
s_sub_u32 s60, s60, s84 // 000000003C00: 80BC543C
v_mfma_f32_16x16x32_bf16 a[24:27], v[68:71], v[28:31], a[24:27]// 000000003C04: D3B58018 04623944
ds_read_b128 v[48:51], v2 offset:448 // 000000003C0C: D9FE01C0 30000002
v_mfma_f32_16x16x32_bf16 a[28:31], v[68:71], v[32:35], a[28:31]// 000000003C14: D3B5801C 04724144
s_subb_u32 s61, s61, s85 // 000000003C1C: 82BD553D
s_cmp_eq_u32 s61, 0 // 000000003C20: BF06803D
v_mfma_f32_16x16x32_bf16 a[32:35], v[72:75], v[4:7], a[32:35]// 000000003C24: D3B58020 04820948
ds_read_b128 v[52:55], v2 offset:576 // 000000003C2C: D9FE0240 34000002
v_mfma_f32_16x16x32_bf16 a[36:39], v[72:75], v[8:11], a[36:39]// 000000003C34: D3B58024 04921148
s_cselect_b32 s54, s60, -1 // 000000003C3C: 8536C13C
s_cmp_eq_u32 s12, s51 // 000000003C40: BF06330C
v_mfma_f32_16x16x32_bf16 a[40:43], v[72:75], v[12:15], a[40:43]// 000000003C44: D3B58028 04A21948
ds_read_b128 v[56:59], v2 offset:704 // 000000003C4C: D9FE02C0 38000002
v_mfma_f32_16x16x32_bf16 a[44:47], v[72:75], v[16:19], a[44:47]// 000000003C54: D3B5802C 04B22148
s_cselect_b32 s84, s66, s69 // 000000003C5C: 85544542
s_cselect_b32 s85, s67, 0 // 000000003C60: 85558043
v_mfma_f32_16x16x32_bf16 a[48:51], v[72:75], v[20:23], a[48:51]// 000000003C64: D3B58030 04C22948
ds_read_b128 v[60:63], v2 offset:832 // 000000003C6C: D9FE0340 3C000002
v_mfma_f32_16x16x32_bf16 a[52:55], v[72:75], v[24:27], a[52:55]// 000000003C74: D3B58034 04D23148
s_add_u32 s56, s56, s84 // 000000003C7C: 80385438
s_addc_u32 s57, s57, s85 // 000000003C80: 82395539
v_mfma_f32_16x16x32_bf16 a[56:59], v[72:75], v[28:31], a[56:59]// 000000003C84: D3B58038 04E23948
ds_read_b128 v[64:67], v2 offset:960 // 000000003C8C: D9FE03C0 40000002
v_mfma_f32_16x16x32_bf16 a[60:63], v[72:75], v[32:35], a[60:63]// 000000003C94: D3B5803C 04F24148
s_mov_b32 m0, s46 // 000000003C9C: BEFC002E
s_sub_u32 s62, s62, s84 // 000000003CA0: 80BE543E
v_mfma_f32_16x16x32_bf16 a[64:67], v[76:79], v[4:7], a[64:67]// 000000003CA4: D3B58040 0502094C
s_subb_u32 s63, s63, s85 // 000000003CAC: 82BF553F
s_cmp_eq_u32 s63, 0 // 000000003CB0: BF06803F
v_mfma_f32_16x16x32_bf16 a[68:71], v[76:79], v[8:11], a[68:71]// 000000003CB4: D3B58044 0512114C
s_cselect_b32 s58, s62, -1 // 000000003CBC: 853AC13E
v_mfma_f32_16x16x32_bf16 a[72:75], v[76:79], v[12:15], a[72:75]// 000000003CC0: D3B58048 0522194C
v_mfma_f32_16x16x32_bf16 a[76:79], v[76:79], v[16:19], a[76:79]// 000000003CC8: D3B5804C 0532214C
v_mfma_f32_16x16x32_bf16 a[80:83], v[76:79], v[20:23], a[80:83]// 000000003CD0: D3B58050 0542294C
s_waitcnt lgkmcnt(0) // 000000003CD8: BF8CC07F
v_mfma_f32_16x16x32_bf16 a[84:87], v[76:79], v[24:27], a[84:87]// 000000003CDC: D3B58054 0552314C
s_barrier // 000000003CE4: BF8A0000
v_mfma_f32_16x16x32_bf16 a[88:91], v[76:79], v[28:31], a[88:91]// 000000003CE8: D3B58058 0562394C
buffer_load_dwordx4 v0, s[52:55], 0 offen lds // 000000003CF0: E05D1000 800D0000
v_mfma_f32_16x16x32_bf16 a[92:95], v[76:79], v[32:35], a[92:95]// 000000003CF8: D3B5805C 0572414C
s_add_u32 m0, m0, 0x1040 // 000000003D00: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[96:99], v[80:83], v[4:7], a[96:99]// 000000003D08: D3B58060 05820950
ds_read_b128 v[100:103], v3 offset:64 // 000000003D10: D9FE0040 64000003
v_mfma_f32_16x16x32_bf16 a[100:103], v[80:83], v[8:11], a[100:103]// 000000003D18: D3B58064 05921150
buffer_load_dwordx4 v0, s[52:55], s70 offen lds // 000000003D20: E05D1000 460D0000
v_mfma_f32_16x16x32_bf16 a[104:107], v[80:83], v[12:15], a[104:107]// 000000003D28: D3B58068 05A21950
s_add_u32 m0, m0, 0x1040 // 000000003D30: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[108:111], v[80:83], v[16:19], a[108:111]// 000000003D38: D3B5806C 05B22150
ds_read_b128 v[104:107], v3 offset:192 // 000000003D40: D9FE00C0 68000003
v_mfma_f32_16x16x32_bf16 a[112:115], v[80:83], v[20:23], a[112:115]// 000000003D48: D3B58070 05C22950
buffer_load_dwordx4 v0, s[52:55], s71 offen lds // 000000003D50: E05D1000 470D0000
v_mfma_f32_16x16x32_bf16 a[116:119], v[80:83], v[24:27], a[116:119]// 000000003D58: D3B58074 05D23150
s_add_u32 m0, m0, 0x1040 // 000000003D60: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[120:123], v[80:83], v[28:31], a[120:123]// 000000003D68: D3B58078 05E23950
ds_read_b128 v[108:111], v3 offset:320 // 000000003D70: D9FE0140 6C000003
v_mfma_f32_16x16x32_bf16 a[124:127], v[80:83], v[32:35], a[124:127]// 000000003D78: D3B5807C 05F24150
buffer_load_dwordx4 v0, s[52:55], s72 offen lds // 000000003D80: E05D1000 480D0000
v_mfma_f32_16x16x32_bf16 a[128:131], v[84:87], v[4:7], a[128:131]// 000000003D88: D3B58080 06020954
s_add_u32 m0, m0, 0x1040 // 000000003D90: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[132:135], v[84:87], v[8:11], a[132:135]// 000000003D98: D3B58084 06121154
ds_read_b128 v[112:115], v3 offset:448 // 000000003DA0: D9FE01C0 70000003
v_mfma_f32_16x16x32_bf16 a[136:139], v[84:87], v[12:15], a[136:139]// 000000003DA8: D3B58088 06221954
buffer_load_dwordx4 v0, s[52:55], s73 offen lds // 000000003DB0: E05D1000 490D0000
v_mfma_f32_16x16x32_bf16 a[140:143], v[84:87], v[16:19], a[140:143]// 000000003DB8: D3B5808C 06322154
s_add_u32 m0, m0, 0x1040 // 000000003DC0: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[144:147], v[84:87], v[20:23], a[144:147]// 000000003DC8: D3B58090 06422954
ds_read_b128 v[116:119], v3 offset:576 // 000000003DD0: D9FE0240 74000003
v_mfma_f32_16x16x32_bf16 a[148:151], v[84:87], v[24:27], a[148:151]// 000000003DD8: D3B58094 06523154
v_mfma_f32_16x16x32_bf16 a[152:155], v[84:87], v[28:31], a[152:155]// 000000003DE0: D3B58098 06623954
ds_read_b128 v[120:123], v3 offset:704 // 000000003DE8: D9FE02C0 78000003
v_mfma_f32_16x16x32_bf16 a[156:159], v[84:87], v[32:35], a[156:159]// 000000003DF0: D3B5809C 06724154
v_mfma_f32_16x16x32_bf16 a[160:163], v[88:91], v[4:7], a[160:163]// 000000003DF8: D3B580A0 06820958
ds_read_b128 v[124:127], v3 offset:832 // 000000003E00: D9FE0340 7C000003
v_mfma_f32_16x16x32_bf16 a[164:167], v[88:91], v[8:11], a[164:167]// 000000003E08: D3B580A4 06921158
v_mfma_f32_16x16x32_bf16 a[168:171], v[88:91], v[12:15], a[168:171]// 000000003E10: D3B580A8 06A21958
ds_read_b128 v[128:131], v3 offset:960 // 000000003E18: D9FE03C0 80000003
v_mfma_f32_16x16x32_bf16 a[172:175], v[88:91], v[16:19], a[172:175]// 000000003E20: D3B580AC 06B22158
v_mfma_f32_16x16x32_bf16 a[176:179], v[88:91], v[20:23], a[176:179]// 000000003E28: D3B580B0 06C22958
v_mfma_f32_16x16x32_bf16 a[180:183], v[88:91], v[24:27], a[180:183]// 000000003E30: D3B580B4 06D23158
v_mfma_f32_16x16x32_bf16 a[184:187], v[88:91], v[28:31], a[184:187]// 000000003E38: D3B580B8 06E23958
v_mfma_f32_16x16x32_bf16 a[188:191], v[88:91], v[32:35], a[188:191]// 000000003E40: D3B580BC 06F24158
v_mfma_f32_16x16x32_bf16 a[192:195], v[92:95], v[4:7], a[192:195]// 000000003E48: D3B580C0 0702095C
v_mfma_f32_16x16x32_bf16 a[196:199], v[92:95], v[8:11], a[196:199]// 000000003E50: D3B580C4 0712115C
v_mfma_f32_16x16x32_bf16 a[200:203], v[92:95], v[12:15], a[200:203]// 000000003E58: D3B580C8 0722195C
s_waitcnt lgkmcnt(0) // 000000003E60: BF8CC07F
v_mfma_f32_16x16x32_bf16 a[204:207], v[92:95], v[16:19], a[204:207]// 000000003E64: D3B580CC 0732215C
s_barrier // 000000003E6C: BF8A0000
v_mfma_f32_16x16x32_bf16 a[208:211], v[92:95], v[20:23], a[208:211]// 000000003E70: D3B580D0 0742295C
buffer_load_dwordx4 v0, s[52:55], s74 offen lds // 000000003E78: E05D1000 4A0D0000
v_mfma_f32_16x16x32_bf16 a[212:215], v[92:95], v[24:27], a[212:215]// 000000003E80: D3B580D4 0752315C
s_add_u32 m0, m0, 0x1040 // 000000003E88: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[216:219], v[92:95], v[28:31], a[216:219]// 000000003E90: D3B580D8 0762395C
v_mfma_f32_16x16x32_bf16 a[220:223], v[92:95], v[32:35], a[220:223]// 000000003E98: D3B580DC 0772415C
buffer_load_dwordx4 v0, s[52:55], s75 offen lds // 000000003EA0: E05D1000 4B0D0000
v_mfma_f32_16x16x32_bf16 a[224:227], v[96:99], v[4:7], a[224:227]// 000000003EA8: D3B580E0 07820960
s_add_u32 m0, m0, 0x1040 // 000000003EB0: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[228:231], v[96:99], v[8:11], a[228:231]// 000000003EB8: D3B580E4 07921160
v_mfma_f32_16x16x32_bf16 a[232:235], v[96:99], v[12:15], a[232:235]// 000000003EC0: D3B580E8 07A21960
buffer_load_dwordx4 v0, s[52:55], s76 offen lds // 000000003EC8: E05D1000 4C0D0000
v_mfma_f32_16x16x32_bf16 a[236:239], v[96:99], v[16:19], a[236:239]// 000000003ED0: D3B580EC 07B22160
s_mov_b32 m0, s47 // 000000003ED8: BEFC002F
v_mfma_f32_16x16x32_bf16 a[240:243], v[96:99], v[20:23], a[240:243]// 000000003EDC: D3B580F0 07C22960
v_mfma_f32_16x16x32_bf16 a[244:247], v[96:99], v[24:27], a[244:247]// 000000003EE4: D3B580F4 07D23160
buffer_load_dwordx4 v1, s[56:59], 0 offen lds // 000000003EEC: E05D1000 800E0001
v_mfma_f32_16x16x32_bf16 a[248:251], v[96:99], v[28:31], a[248:251]// 000000003EF4: D3B580F8 07E23960
s_add_u32 m0, m0, 0x1040 // 000000003EFC: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[252:255], v[96:99], v[32:35], a[252:255]// 000000003F04: D3B580FC 07F24160
v_mfma_f32_16x16x32_bf16 a[0:3], v[100:103], v[36:39], a[0:3]// 000000003F0C: D3B58000 04024964
buffer_load_dwordx4 v1, s[56:59], s77 offen lds // 000000003F14: E05D1000 4D0E0001
v_mfma_f32_16x16x32_bf16 a[4:7], v[100:103], v[40:43], a[4:7]// 000000003F1C: D3B58004 04125164
s_add_u32 m0, m0, 0x1040 // 000000003F24: 807CFF7C 00001040
s_xor_b32 s46, s48, s46 // 000000003F2C: 882E2E30
v_mfma_f32_16x16x32_bf16 a[8:11], v[100:103], v[44:47], a[8:11]// 000000003F30: D3B58008 04225964
v_mfma_f32_16x16x32_bf16 a[12:15], v[100:103], v[48:51], a[12:15]// 000000003F38: D3B5800C 04326164
v_mfma_f32_16x16x32_bf16 a[16:19], v[100:103], v[52:55], a[16:19]// 000000003F40: D3B58010 04426964
v_mfma_f32_16x16x32_bf16 a[20:23], v[100:103], v[56:59], a[20:23]// 000000003F48: D3B58014 04527164
v_mfma_f32_16x16x32_bf16 a[24:27], v[100:103], v[60:63], a[24:27]// 000000003F50: D3B58018 04627964
v_mfma_f32_16x16x32_bf16 a[28:31], v[100:103], v[64:67], a[28:31]// 000000003F58: D3B5801C 04728164
v_mfma_f32_16x16x32_bf16 a[32:35], v[104:107], v[36:39], a[32:35]// 000000003F60: D3B58020 04824968
v_mfma_f32_16x16x32_bf16 a[36:39], v[104:107], v[40:43], a[36:39]// 000000003F68: D3B58024 04925168
v_mfma_f32_16x16x32_bf16 a[40:43], v[104:107], v[44:47], a[40:43]// 000000003F70: D3B58028 04A25968
v_mfma_f32_16x16x32_bf16 a[44:47], v[104:107], v[48:51], a[44:47]// 000000003F78: D3B5802C 04B26168
v_mfma_f32_16x16x32_bf16 a[48:51], v[104:107], v[52:55], a[48:51]// 000000003F80: D3B58030 04C26968
v_mfma_f32_16x16x32_bf16 a[52:55], v[104:107], v[56:59], a[52:55]// 000000003F88: D3B58034 04D27168
v_mfma_f32_16x16x32_bf16 a[56:59], v[104:107], v[60:63], a[56:59]// 000000003F90: D3B58038 04E27968
v_mfma_f32_16x16x32_bf16 a[60:63], v[104:107], v[64:67], a[60:63]// 000000003F98: D3B5803C 04F28168
v_mfma_f32_16x16x32_bf16 a[64:67], v[108:111], v[36:39], a[64:67]// 000000003FA0: D3B58040 0502496C
v_mfma_f32_16x16x32_bf16 a[68:71], v[108:111], v[40:43], a[68:71]// 000000003FA8: D3B58044 0512516C
v_mfma_f32_16x16x32_bf16 a[72:75], v[108:111], v[44:47], a[72:75]// 000000003FB0: D3B58048 0522596C
v_mfma_f32_16x16x32_bf16 a[76:79], v[108:111], v[48:51], a[76:79]// 000000003FB8: D3B5804C 0532616C
v_mfma_f32_16x16x32_bf16 a[80:83], v[108:111], v[52:55], a[80:83]// 000000003FC0: D3B58050 0542696C
v_xor_b32_e32 v2, v132, v2 // 000000003FC8: 2A040584
v_xor_b32_e32 v3, v133, v3 // 000000003FCC: 2A060785
v_mfma_f32_16x16x32_bf16 a[84:87], v[108:111], v[56:59], a[84:87]// 000000003FD0: D3B58054 0552716C
buffer_load_dwordx4 v1, s[56:59], s78 offen lds // 000000003FD8: E05D1000 4E0E0001
v_mfma_f32_16x16x32_bf16 a[88:91], v[108:111], v[60:63], a[88:91]// 000000003FE0: D3B58058 0562796C
s_add_u32 m0, m0, 0x1040 // 000000003FE8: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[92:95], v[108:111], v[64:67], a[92:95]// 000000003FF0: D3B5805C 0572816C
buffer_load_dwordx4 v1, s[56:59], s79 offen lds // 000000003FF8: E05D1000 4F0E0001
v_mfma_f32_16x16x32_bf16 a[96:99], v[112:115], v[36:39], a[96:99]// 000000004000: D3B58060 05824970
s_add_u32 m0, m0, 0x1040 // 000000004008: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[100:103], v[112:115], v[40:43], a[100:103]// 000000004010: D3B58064 05925170
buffer_load_dwordx4 v1, s[56:59], s80 offen lds // 000000004018: E05D1000 500E0001
v_mfma_f32_16x16x32_bf16 a[104:107], v[112:115], v[44:47], a[104:107]// 000000004020: D3B58068 05A25970
v_mfma_f32_16x16x32_bf16 a[108:111], v[112:115], v[48:51], a[108:111]// 000000004028: D3B5806C 05B26170
s_waitcnt vmcnt(13) // 000000004030: BF8C0F7D
v_mfma_f32_16x16x32_bf16 a[112:115], v[112:115], v[52:55], a[112:115]// 000000004034: D3B58070 05C26970
s_barrier // 00000000403C: BF8A0000
v_mfma_f32_16x16x32_bf16 a[116:119], v[112:115], v[56:59], a[116:119]// 000000004040: D3B58074 05D27170
ds_read_b128 v[4:7], v2 // 000000004048: D9FE0000 04000002
v_mfma_f32_16x16x32_bf16 a[120:123], v[112:115], v[60:63], a[120:123]// 000000004050: D3B58078 05E27970
ds_read_b128 v[8:11], v2 offset:128 // 000000004058: D9FE0080 08000002
s_add_u32 m0, m0, 0x1040 // 000000004060: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[124:127], v[112:115], v[64:67], a[124:127]// 000000004068: D3B5807C 05F28170
ds_read_b128 v[12:15], v2 offset:256 // 000000004070: D9FE0100 0C000002
v_mfma_f32_16x16x32_bf16 a[128:131], v[116:119], v[36:39], a[128:131]// 000000004078: D3B58080 06024974
buffer_load_dwordx4 v1, s[56:59], s81 offen lds // 000000004080: E05D1000 510E0001
v_mfma_f32_16x16x32_bf16 a[132:135], v[116:119], v[40:43], a[132:135]// 000000004088: D3B58084 06125174
ds_read_b128 v[16:19], v2 offset:384 // 000000004090: D9FE0180 10000002
v_mfma_f32_16x16x32_bf16 a[136:139], v[116:119], v[44:47], a[136:139]// 000000004098: D3B58088 06225974
ds_read_b128 v[20:23], v2 offset:512 // 0000000040A0: D9FE0200 14000002
s_add_u32 m0, m0, 0x1040 // 0000000040A8: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[140:143], v[116:119], v[48:51], a[140:143]// 0000000040B0: D3B5808C 06326174
v_mfma_f32_16x16x32_bf16 a[144:147], v[116:119], v[52:55], a[144:147]// 0000000040B8: D3B58090 06426974
buffer_load_dwordx4 v1, s[56:59], s82 offen lds // 0000000040C0: E05D1000 520E0001
v_mfma_f32_16x16x32_bf16 a[148:151], v[116:119], v[56:59], a[148:151]// 0000000040C8: D3B58094 06527174
v_mfma_f32_16x16x32_bf16 a[152:155], v[116:119], v[60:63], a[152:155]// 0000000040D0: D3B58098 06627974
ds_read_b128 v[24:27], v2 offset:640 // 0000000040D8: D9FE0280 18000002
s_add_u32 m0, m0, 0x1040 // 0000000040E0: 807CFF7C 00001040
v_mfma_f32_16x16x32_bf16 a[156:159], v[116:119], v[64:67], a[156:159]// 0000000040E8: D3B5809C 06728174
ds_read_b128 v[28:31], v2 offset:768 // 0000000040F0: D9FE0300 1C000002
v_mfma_f32_16x16x32_bf16 a[160:163], v[120:123], v[36:39], a[160:163]// 0000000040F8: D3B580A0 06824978
ds_read_b128 v[32:35], v2 offset:896 // 000000004100: D9FE0380 20000002
v_mfma_f32_16x16x32_bf16 a[164:167], v[120:123], v[40:43], a[164:167]// 000000004108: D3B580A4 06925178
ds_read_b128 v[68:71], v3 // 000000004110: D9FE0000 44000003
v_mfma_f32_16x16x32_bf16 a[168:171], v[120:123], v[44:47], a[168:171]// 000000004118: D3B580A8 06A25978
ds_read_b128 v[72:75], v3 offset:128 // 000000004120: D9FE0080 48000003
v_mfma_f32_16x16x32_bf16 a[172:175], v[120:123], v[48:51], a[172:175]// 000000004128: D3B580AC 06B26178
v_mfma_f32_16x16x32_bf16 a[176:179], v[120:123], v[52:55], a[176:179]// 000000004130: D3B580B0 06C26978
v_mfma_f32_16x16x32_bf16 a[180:183], v[120:123], v[56:59], a[180:183]// 000000004138: D3B580B4 06D27178
ds_read_b128 v[76:79], v3 offset:256 // 000000004140: D9FE0100 4C000003
v_mfma_f32_16x16x32_bf16 a[184:187], v[120:123], v[60:63], a[184:187]// 000000004148: D3B580B8 06E27978
v_mfma_f32_16x16x32_bf16 a[188:191], v[120:123], v[64:67], a[188:191]// 000000004150: D3B580BC 06F28178
v_mfma_f32_16x16x32_bf16 a[192:195], v[124:127], v[36:39], a[192:195]// 000000004158: D3B580C0 0702497C
ds_read_b128 v[80:83], v3 offset:384 // 000000004160: D9FE0180 50000003
v_mfma_f32_16x16x32_bf16 a[196:199], v[124:127], v[40:43], a[196:199]// 000000004168: D3B580C4 0712517C
v_mfma_f32_16x16x32_bf16 a[200:203], v[124:127], v[44:47], a[200:203]// 000000004170: D3B580C8 0722597C
ds_read_b128 v[84:87], v3 offset:512 // 000000004178: D9FE0200 54000003
v_mfma_f32_16x16x32_bf16 a[204:207], v[124:127], v[48:51], a[204:207]// 000000004180: D3B580CC 0732617C
v_mfma_f32_16x16x32_bf16 a[208:211], v[124:127], v[52:55], a[208:211]// 000000004188: D3B580D0 0742697C
v_mfma_f32_16x16x32_bf16 a[212:215], v[124:127], v[56:59], a[212:215]// 000000004190: D3B580D4 0752717C
ds_read_b128 v[88:91], v3 offset:640 // 000000004198: D9FE0280 58000003
v_mfma_f32_16x16x32_bf16 a[216:219], v[124:127], v[60:63], a[216:219]// 0000000041A0: D3B580D8 0762797C
v_mfma_f32_16x16x32_bf16 a[220:223], v[124:127], v[64:67], a[220:223]// 0000000041A8: D3B580DC 0772817C
v_mfma_f32_16x16x32_bf16 a[224:227], v[128:131], v[36:39], a[224:227]// 0000000041B0: D3B580E0 07824980
ds_read_b128 v[92:95], v3 offset:768 // 0000000041B8: D9FE0300 5C000003
v_mfma_f32_16x16x32_bf16 a[228:231], v[128:131], v[40:43], a[228:231]// 0000000041C0: D3B580E4 07925180
v_mfma_f32_16x16x32_bf16 a[232:235], v[128:131], v[44:47], a[232:235]// 0000000041C8: D3B580E8 07A25980
v_mfma_f32_16x16x32_bf16 a[236:239], v[128:131], v[48:51], a[236:239]// 0000000041D0: D3B580EC 07B26180
ds_read_b128 v[96:99], v3 offset:896 // 0000000041D8: D9FE0380 60000003
v_mfma_f32_16x16x32_bf16 a[240:243], v[128:131], v[52:55], a[240:243]// 0000000041E0: D3B580F0 07C26980
buffer_load_dwordx4 v1, s[56:59], s83 offen lds // 0000000041E8: E05D1000 530E0001
v_mfma_f32_16x16x32_bf16 a[244:247], v[128:131], v[56:59], a[244:247]// 0000000041F0: D3B580F4 07D27180
s_xor_b32 s47, s49, s47 // 0000000041F8: 882F2F31
s_sub_u32 s12, s12, 1 // 0000000041FC: 808C810C
v_mfma_f32_16x16x32_bf16 a[248:251], v[128:131], v[60:63], a[248:251]// 000000004200: D3B580F8 07E27980
s_cmp_eq_i32 s12, 2 // 000000004208: BF00820C
s_waitcnt lgkmcnt(0) // 00000000420C: BF8CC07F
v_mfma_f32_16x16x32_bf16 a[252:255], v[128:131], v[64:67], a[252:255]// 000000004210: D3B580FC 07F28180
s_cbranch_scc0 label_LoopBeginL // 000000004218: BF84FE62
label_LoopEndL:
// -----------------------------------------------------------------------------
// label_LoopEndL: one unrolled iteration that issues no buffer_load.
//
// Entered by fall-through when the s_cbranch_scc0 back-edge directly above is
// not taken (other entry points, if any, are outside this view -- TODO confirm).
// Unlike the loop body above, this block contains only MFMA, ds_read, v_xor,
// s_waitcnt and s_barrier instructions.
//
// Structure (all 128 MFMAs accumulate in place into a[0:255]):
//   Pass 1: a[32*i + 4*j ..] += mfma(v[68+4*i ..], v[4+4*j ..]),   i, j = 0..7
//           interleaved with ds_reads that fetch the pass-2 operands
//           (v[36:67] via v2, v[100:131] via v3, offsets 64, 192, ..., 960).
//   Pass 2: a[32*i + 4*j ..] += mfma(v[100+4*i ..], v[36+4*j ..]), i, j = 0..7
//           interleaved (after a vmcnt(0) wait and a barrier) with ds_reads that
//           refill v[4:35] / v[68:99] (offsets 0, 128, ..., 896) for the code
//           that follows at label_toPGR1.
//
// Per the mnemonic, each v_mfma_f32_16x16x32_bf16 takes bf16 sources and
// accumulates in f32.
//
// NOTE(review): the instruction interleaving looks hand/auto-scheduled to hide
// LDS latency behind MFMA issue; do not reorder without re-checking every
// s_waitcnt dependency.
// -----------------------------------------------------------------------------
// ---- Pass 1: operands v[68:99] x v[4:35]; prefetch pass-2 operands from LDS --
v_mfma_f32_16x16x32_bf16 a[0:3], v[68:71], v[4:7], a[0:3] // 00000000421C: D3B58000 04020944
ds_read_b128 v[36:39], v2 offset:64 // 000000004224: D9FE0040 24000002
v_mfma_f32_16x16x32_bf16 a[4:7], v[68:71], v[8:11], a[4:7] // 00000000422C: D3B58004 04121144
v_mfma_f32_16x16x32_bf16 a[8:11], v[68:71], v[12:15], a[8:11]// 000000004234: D3B58008 04221944
ds_read_b128 v[100:103], v3 offset:64 // 00000000423C: D9FE0040 64000003
v_mfma_f32_16x16x32_bf16 a[12:15], v[68:71], v[16:19], a[12:15]// 000000004244: D3B5800C 04322144
v_mfma_f32_16x16x32_bf16 a[16:19], v[68:71], v[20:23], a[16:19]// 00000000424C: D3B58010 04422944
ds_read_b128 v[40:43], v2 offset:192 // 000000004254: D9FE00C0 28000002
v_mfma_f32_16x16x32_bf16 a[20:23], v[68:71], v[24:27], a[20:23]// 00000000425C: D3B58014 04523144
v_mfma_f32_16x16x32_bf16 a[24:27], v[68:71], v[28:31], a[24:27]// 000000004264: D3B58018 04623944
ds_read_b128 v[44:47], v2 offset:320 // 00000000426C: D9FE0140 2C000002
v_mfma_f32_16x16x32_bf16 a[28:31], v[68:71], v[32:35], a[28:31]// 000000004274: D3B5801C 04724144
v_mfma_f32_16x16x32_bf16 a[32:35], v[72:75], v[4:7], a[32:35]// 00000000427C: D3B58020 04820948
ds_read_b128 v[48:51], v2 offset:448 // 000000004284: D9FE01C0 30000002
v_mfma_f32_16x16x32_bf16 a[36:39], v[72:75], v[8:11], a[36:39]// 00000000428C: D3B58024 04921148
v_mfma_f32_16x16x32_bf16 a[40:43], v[72:75], v[12:15], a[40:43]// 000000004294: D3B58028 04A21948
ds_read_b128 v[52:55], v2 offset:576 // 00000000429C: D9FE0240 34000002
v_mfma_f32_16x16x32_bf16 a[44:47], v[72:75], v[16:19], a[44:47]// 0000000042A4: D3B5802C 04B22148
v_mfma_f32_16x16x32_bf16 a[48:51], v[72:75], v[20:23], a[48:51]// 0000000042AC: D3B58030 04C22948
ds_read_b128 v[56:59], v2 offset:704 // 0000000042B4: D9FE02C0 38000002
v_mfma_f32_16x16x32_bf16 a[52:55], v[72:75], v[24:27], a[52:55]// 0000000042BC: D3B58034 04D23148
v_mfma_f32_16x16x32_bf16 a[56:59], v[72:75], v[28:31], a[56:59]// 0000000042C4: D3B58038 04E23948
ds_read_b128 v[60:63], v2 offset:832 // 0000000042CC: D9FE0340 3C000002
v_mfma_f32_16x16x32_bf16 a[60:63], v[72:75], v[32:35], a[60:63]// 0000000042D4: D3B5803C 04F24148
v_mfma_f32_16x16x32_bf16 a[64:67], v[76:79], v[4:7], a[64:67]// 0000000042DC: D3B58040 0502094C
ds_read_b128 v[64:67], v2 offset:960 // 0000000042E4: D9FE03C0 40000002
v_mfma_f32_16x16x32_bf16 a[68:71], v[76:79], v[8:11], a[68:71]// 0000000042EC: D3B58044 0512114C
v_mfma_f32_16x16x32_bf16 a[72:75], v[76:79], v[12:15], a[72:75]// 0000000042F4: D3B58048 0522194C
ds_read_b128 v[104:107], v3 offset:192 // 0000000042FC: D9FE00C0 68000003
v_mfma_f32_16x16x32_bf16 a[76:79], v[76:79], v[16:19], a[76:79]// 000000004304: D3B5804C 0532214C
v_mfma_f32_16x16x32_bf16 a[80:83], v[76:79], v[20:23], a[80:83]// 00000000430C: D3B58050 0542294C
ds_read_b128 v[108:111], v3 offset:320 // 000000004314: D9FE0140 6C000003
v_mfma_f32_16x16x32_bf16 a[84:87], v[76:79], v[24:27], a[84:87]// 00000000431C: D3B58054 0552314C
v_mfma_f32_16x16x32_bf16 a[88:91], v[76:79], v[28:31], a[88:91]// 000000004324: D3B58058 0562394C
ds_read_b128 v[112:115], v3 offset:448 // 00000000432C: D9FE01C0 70000003
v_mfma_f32_16x16x32_bf16 a[92:95], v[76:79], v[32:35], a[92:95]// 000000004334: D3B5805C 0572414C
v_mfma_f32_16x16x32_bf16 a[96:99], v[80:83], v[4:7], a[96:99]// 00000000433C: D3B58060 05820950
ds_read_b128 v[116:119], v3 offset:576 // 000000004344: D9FE0240 74000003
v_mfma_f32_16x16x32_bf16 a[100:103], v[80:83], v[8:11], a[100:103]// 00000000434C: D3B58064 05921150
v_mfma_f32_16x16x32_bf16 a[104:107], v[80:83], v[12:15], a[104:107]// 000000004354: D3B58068 05A21950
ds_read_b128 v[120:123], v3 offset:704 // 00000000435C: D9FE02C0 78000003
v_mfma_f32_16x16x32_bf16 a[108:111], v[80:83], v[16:19], a[108:111]// 000000004364: D3B5806C 05B22150
v_mfma_f32_16x16x32_bf16 a[112:115], v[80:83], v[20:23], a[112:115]// 00000000436C: D3B58070 05C22950
ds_read_b128 v[124:127], v3 offset:832 // 000000004374: D9FE0340 7C000003
v_mfma_f32_16x16x32_bf16 a[116:119], v[80:83], v[24:27], a[116:119]// 00000000437C: D3B58074 05D23150
v_mfma_f32_16x16x32_bf16 a[120:123], v[80:83], v[28:31], a[120:123]// 000000004384: D3B58078 05E23950
ds_read_b128 v[128:131], v3 offset:960 // 00000000438C: D9FE03C0 80000003
// All 16 pass-2 operand reads are now in flight; the rest of pass 1 is pure MFMA.
v_mfma_f32_16x16x32_bf16 a[124:127], v[80:83], v[32:35], a[124:127]// 000000004394: D3B5807C 05F24150
v_mfma_f32_16x16x32_bf16 a[128:131], v[84:87], v[4:7], a[128:131]// 00000000439C: D3B58080 06020954
v_mfma_f32_16x16x32_bf16 a[132:135], v[84:87], v[8:11], a[132:135]// 0000000043A4: D3B58084 06121154
v_mfma_f32_16x16x32_bf16 a[136:139], v[84:87], v[12:15], a[136:139]// 0000000043AC: D3B58088 06221954
v_mfma_f32_16x16x32_bf16 a[140:143], v[84:87], v[16:19], a[140:143]// 0000000043B4: D3B5808C 06322154
v_mfma_f32_16x16x32_bf16 a[144:147], v[84:87], v[20:23], a[144:147]// 0000000043BC: D3B58090 06422954
v_mfma_f32_16x16x32_bf16 a[148:151], v[84:87], v[24:27], a[148:151]// 0000000043C4: D3B58094 06523154
v_mfma_f32_16x16x32_bf16 a[152:155], v[84:87], v[28:31], a[152:155]// 0000000043CC: D3B58098 06623954
v_mfma_f32_16x16x32_bf16 a[156:159], v[84:87], v[32:35], a[156:159]// 0000000043D4: D3B5809C 06724154
v_mfma_f32_16x16x32_bf16 a[160:163], v[88:91], v[4:7], a[160:163]// 0000000043DC: D3B580A0 06820958
v_mfma_f32_16x16x32_bf16 a[164:167], v[88:91], v[8:11], a[164:167]// 0000000043E4: D3B580A4 06921158
v_mfma_f32_16x16x32_bf16 a[168:171], v[88:91], v[12:15], a[168:171]// 0000000043EC: D3B580A8 06A21958
v_mfma_f32_16x16x32_bf16 a[172:175], v[88:91], v[16:19], a[172:175]// 0000000043F4: D3B580AC 06B22158
v_mfma_f32_16x16x32_bf16 a[176:179], v[88:91], v[20:23], a[176:179]// 0000000043FC: D3B580B0 06C22958
v_mfma_f32_16x16x32_bf16 a[180:183], v[88:91], v[24:27], a[180:183]// 000000004404: D3B580B4 06D23158
v_mfma_f32_16x16x32_bf16 a[184:187], v[88:91], v[28:31], a[184:187]// 00000000440C: D3B580B8 06E23958
v_mfma_f32_16x16x32_bf16 a[188:191], v[88:91], v[32:35], a[188:191]// 000000004414: D3B580BC 06F24158
v_mfma_f32_16x16x32_bf16 a[192:195], v[92:95], v[4:7], a[192:195]// 00000000441C: D3B580C0 0702095C
v_mfma_f32_16x16x32_bf16 a[196:199], v[92:95], v[8:11], a[196:199]// 000000004424: D3B580C4 0712115C
v_mfma_f32_16x16x32_bf16 a[200:203], v[92:95], v[12:15], a[200:203]// 00000000442C: D3B580C8 0722195C
v_mfma_f32_16x16x32_bf16 a[204:207], v[92:95], v[16:19], a[204:207]// 000000004434: D3B580CC 0732215C
v_mfma_f32_16x16x32_bf16 a[208:211], v[92:95], v[20:23], a[208:211]// 00000000443C: D3B580D0 0742295C
v_mfma_f32_16x16x32_bf16 a[212:215], v[92:95], v[24:27], a[212:215]// 000000004444: D3B580D4 0752315C
v_mfma_f32_16x16x32_bf16 a[216:219], v[92:95], v[28:31], a[216:219]// 00000000444C: D3B580D8 0762395C
v_mfma_f32_16x16x32_bf16 a[220:223], v[92:95], v[32:35], a[220:223]// 000000004454: D3B580DC 0772415C
v_mfma_f32_16x16x32_bf16 a[224:227], v[96:99], v[4:7], a[224:227]// 00000000445C: D3B580E0 07820960
v_mfma_f32_16x16x32_bf16 a[228:231], v[96:99], v[8:11], a[228:231]// 000000004464: D3B580E4 07921160
v_mfma_f32_16x16x32_bf16 a[232:235], v[96:99], v[12:15], a[232:235]// 00000000446C: D3B580E8 07A21960
v_mfma_f32_16x16x32_bf16 a[236:239], v[96:99], v[16:19], a[236:239]// 000000004474: D3B580EC 07B22160
v_mfma_f32_16x16x32_bf16 a[240:243], v[96:99], v[20:23], a[240:243]// 00000000447C: D3B580F0 07C22960
v_mfma_f32_16x16x32_bf16 a[244:247], v[96:99], v[24:27], a[244:247]// 000000004484: D3B580F4 07D23160
v_mfma_f32_16x16x32_bf16 a[248:251], v[96:99], v[28:31], a[248:251]// 00000000448C: D3B580F8 07E23960
// Toggle the LDS read addresses: v2 ^= v132, v3 ^= v133. This is issued only
// after every ds_read of this pass, so those reads used the old addresses; the
// ds_reads further down use the toggled ones.
// NOTE(review): presumably this swaps between two LDS buffers -- confirm against
// where v132/v133 are initialised (outside this view).
v_xor_b32_e32 v2, v132, v2 // 000000004494: 2A040584
v_xor_b32_e32 v3, v133, v3 // 000000004498: 2A060785
v_mfma_f32_16x16x32_bf16 a[252:255], v[96:99], v[32:35], a[252:255]// 00000000449C: D3B580FC 07F24160
// Block until every outstanding LDS read has returned: pass 2 consumes
// v[36:67] and v[100:131], which the ds_reads above are still filling.
s_waitcnt lgkmcnt(0) // 0000000044A4: BF8CC07F
// ---- Pass 2: operands v[100:131] x v[36:67], same accumulators a[0:255] ------
v_mfma_f32_16x16x32_bf16 a[0:3], v[100:103], v[36:39], a[0:3]// 0000000044A8: D3B58000 04024964
v_mfma_f32_16x16x32_bf16 a[4:7], v[100:103], v[40:43], a[4:7]// 0000000044B0: D3B58004 04125164
v_mfma_f32_16x16x32_bf16 a[8:11], v[100:103], v[44:47], a[8:11]// 0000000044B8: D3B58008 04225964
v_mfma_f32_16x16x32_bf16 a[12:15], v[100:103], v[48:51], a[12:15]// 0000000044C0: D3B5800C 04326164
v_mfma_f32_16x16x32_bf16 a[16:19], v[100:103], v[52:55], a[16:19]// 0000000044C8: D3B58010 04426964
v_mfma_f32_16x16x32_bf16 a[20:23], v[100:103], v[56:59], a[20:23]// 0000000044D0: D3B58014 04527164
v_mfma_f32_16x16x32_bf16 a[24:27], v[100:103], v[60:63], a[24:27]// 0000000044D8: D3B58018 04627964
v_mfma_f32_16x16x32_bf16 a[28:31], v[100:103], v[64:67], a[28:31]// 0000000044E0: D3B5801C 04728164
v_mfma_f32_16x16x32_bf16 a[32:35], v[104:107], v[36:39], a[32:35]// 0000000044E8: D3B58020 04824968
v_mfma_f32_16x16x32_bf16 a[36:39], v[104:107], v[40:43], a[36:39]// 0000000044F0: D3B58024 04925168
v_mfma_f32_16x16x32_bf16 a[40:43], v[104:107], v[44:47], a[40:43]// 0000000044F8: D3B58028 04A25968
v_mfma_f32_16x16x32_bf16 a[44:47], v[104:107], v[48:51], a[44:47]// 000000004500: D3B5802C 04B26168
v_mfma_f32_16x16x32_bf16 a[48:51], v[104:107], v[52:55], a[48:51]// 000000004508: D3B58030 04C26968
v_mfma_f32_16x16x32_bf16 a[52:55], v[104:107], v[56:59], a[52:55]// 000000004510: D3B58034 04D27168
v_mfma_f32_16x16x32_bf16 a[56:59], v[104:107], v[60:63], a[56:59]// 000000004518: D3B58038 04E27968
v_mfma_f32_16x16x32_bf16 a[60:63], v[104:107], v[64:67], a[60:63]// 000000004520: D3B5803C 04F28168
v_mfma_f32_16x16x32_bf16 a[64:67], v[108:111], v[36:39], a[64:67]// 000000004528: D3B58040 0502496C
v_mfma_f32_16x16x32_bf16 a[68:71], v[108:111], v[40:43], a[68:71]// 000000004530: D3B58044 0512516C
v_mfma_f32_16x16x32_bf16 a[72:75], v[108:111], v[44:47], a[72:75]// 000000004538: D3B58048 0522596C
v_mfma_f32_16x16x32_bf16 a[76:79], v[108:111], v[48:51], a[76:79]// 000000004540: D3B5804C 0532616C
v_mfma_f32_16x16x32_bf16 a[80:83], v[108:111], v[52:55], a[80:83]// 000000004548: D3B58050 0542696C
v_mfma_f32_16x16x32_bf16 a[84:87], v[108:111], v[56:59], a[84:87]// 000000004550: D3B58054 0552716C
v_mfma_f32_16x16x32_bf16 a[88:91], v[108:111], v[60:63], a[88:91]// 000000004558: D3B58058 0562796C
v_mfma_f32_16x16x32_bf16 a[92:95], v[108:111], v[64:67], a[92:95]// 000000004560: D3B5805C 0572816C
v_mfma_f32_16x16x32_bf16 a[96:99], v[112:115], v[36:39], a[96:99]// 000000004568: D3B58060 05824970
v_mfma_f32_16x16x32_bf16 a[100:103], v[112:115], v[40:43], a[100:103]// 000000004570: D3B58064 05925170
v_mfma_f32_16x16x32_bf16 a[104:107], v[112:115], v[44:47], a[104:107]// 000000004578: D3B58068 05A25970
v_mfma_f32_16x16x32_bf16 a[108:111], v[112:115], v[48:51], a[108:111]// 000000004580: D3B5806C 05B26170
v_mfma_f32_16x16x32_bf16 a[112:115], v[112:115], v[52:55], a[112:115]// 000000004588: D3B58070 05C26970
v_mfma_f32_16x16x32_bf16 a[116:119], v[112:115], v[56:59], a[116:119]// 000000004590: D3B58074 05D27170
v_mfma_f32_16x16x32_bf16 a[120:123], v[112:115], v[60:63], a[120:123]// 000000004598: D3B58078 05E27970
v_mfma_f32_16x16x32_bf16 a[124:127], v[112:115], v[64:67], a[124:127]// 0000000045A0: D3B5807C 05F28170
v_mfma_f32_16x16x32_bf16 a[128:131], v[116:119], v[36:39], a[128:131]// 0000000045A8: D3B58080 06024974
v_mfma_f32_16x16x32_bf16 a[132:135], v[116:119], v[40:43], a[132:135]// 0000000045B0: D3B58084 06125174
v_mfma_f32_16x16x32_bf16 a[136:139], v[116:119], v[44:47], a[136:139]// 0000000045B8: D3B58088 06225974
v_mfma_f32_16x16x32_bf16 a[140:143], v[116:119], v[48:51], a[140:143]// 0000000045C0: D3B5808C 06326174
v_mfma_f32_16x16x32_bf16 a[144:147], v[116:119], v[52:55], a[144:147]// 0000000045C8: D3B58090 06426974
v_mfma_f32_16x16x32_bf16 a[148:151], v[116:119], v[56:59], a[148:151]// 0000000045D0: D3B58094 06527174
v_mfma_f32_16x16x32_bf16 a[152:155], v[116:119], v[60:63], a[152:155]// 0000000045D8: D3B58098 06627974
v_mfma_f32_16x16x32_bf16 a[156:159], v[116:119], v[64:67], a[156:159]// 0000000045E0: D3B5809C 06728174
v_mfma_f32_16x16x32_bf16 a[160:163], v[120:123], v[36:39], a[160:163]// 0000000045E8: D3B580A0 06824978
v_mfma_f32_16x16x32_bf16 a[164:167], v[120:123], v[40:43], a[164:167]// 0000000045F0: D3B580A4 06925178
// Drain ALL outstanding vector-memory operations (vmcnt(0)), then synchronise
// the workgroup at the barrier below before reading LDS again.
// NOTE(review): presumably this makes the data written to LDS by the loop's
// "buffer_load ... lds" instructions (from every wave) safe to read -- confirm.
s_waitcnt vmcnt(0) // 0000000045F8: BF8C0F70
v_mfma_f32_16x16x32_bf16 a[168:171], v[120:123], v[44:47], a[168:171]// 0000000045FC: D3B580A8 06A25978
s_barrier // 000000004604: BF8A0000
v_mfma_f32_16x16x32_bf16 a[172:175], v[120:123], v[48:51], a[172:175]// 000000004608: D3B580AC 06B26178
// Refill the pass-1 operand registers (v[4:35] via v2, v[68:99] via v3) using the
// toggled addresses, one ds_read per MFMA. These registers are no longer read by
// any MFMA in this block (pass 1 has finished issuing).
ds_read_b128 v[4:7], v2 // 000000004610: D9FE0000 04000002
v_mfma_f32_16x16x32_bf16 a[176:179], v[120:123], v[52:55], a[176:179]// 000000004618: D3B580B0 06C26978
ds_read_b128 v[68:71], v3 // 000000004620: D9FE0000 44000003
v_mfma_f32_16x16x32_bf16 a[180:183], v[120:123], v[56:59], a[180:183]// 000000004628: D3B580B4 06D27178
ds_read_b128 v[8:11], v2 offset:128 // 000000004630: D9FE0080 08000002
v_mfma_f32_16x16x32_bf16 a[184:187], v[120:123], v[60:63], a[184:187]// 000000004638: D3B580B8 06E27978
ds_read_b128 v[12:15], v2 offset:256 // 000000004640: D9FE0100 0C000002
v_mfma_f32_16x16x32_bf16 a[188:191], v[120:123], v[64:67], a[188:191]// 000000004648: D3B580BC 06F28178
ds_read_b128 v[16:19], v2 offset:384 // 000000004650: D9FE0180 10000002
v_mfma_f32_16x16x32_bf16 a[192:195], v[124:127], v[36:39], a[192:195]// 000000004658: D3B580C0 0702497C
ds_read_b128 v[20:23], v2 offset:512 // 000000004660: D9FE0200 14000002
v_mfma_f32_16x16x32_bf16 a[196:199], v[124:127], v[40:43], a[196:199]// 000000004668: D3B580C4 0712517C
ds_read_b128 v[24:27], v2 offset:640 // 000000004670: D9FE0280 18000002
v_mfma_f32_16x16x32_bf16 a[200:203], v[124:127], v[44:47], a[200:203]// 000000004678: D3B580C8 0722597C
ds_read_b128 v[28:31], v2 offset:768 // 000000004680: D9FE0300 1C000002
v_mfma_f32_16x16x32_bf16 a[204:207], v[124:127], v[48:51], a[204:207]// 000000004688: D3B580CC 0732617C
ds_read_b128 v[32:35], v2 offset:896 // 000000004690: D9FE0380 20000002
v_mfma_f32_16x16x32_bf16 a[208:211], v[124:127], v[52:55], a[208:211]// 000000004698: D3B580D0 0742697C
ds_read_b128 v[72:75], v3 offset:128 // 0000000046A0: D9FE0080 48000003
v_mfma_f32_16x16x32_bf16 a[212:215], v[124:127], v[56:59], a[212:215]// 0000000046A8: D3B580D4 0752717C
ds_read_b128 v[76:79], v3 offset:256 // 0000000046B0: D9FE0100 4C000003
v_mfma_f32_16x16x32_bf16 a[216:219], v[124:127], v[60:63], a[216:219]// 0000000046B8: D3B580D8 0762797C
ds_read_b128 v[80:83], v3 offset:384 // 0000000046C0: D9FE0180 50000003
v_mfma_f32_16x16x32_bf16 a[220:223], v[124:127], v[64:67], a[220:223]// 0000000046C8: D3B580DC 0772817C
ds_read_b128 v[84:87], v3 offset:512 // 0000000046D0: D9FE0200 54000003
v_mfma_f32_16x16x32_bf16 a[224:227], v[128:131], v[36:39], a[224:227]// 0000000046D8: D3B580E0 07824980
ds_read_b128 v[88:91], v3 offset:640 // 0000000046E0: D9FE0280 58000003
v_mfma_f32_16x16x32_bf16 a[228:231], v[128:131], v[40:43], a[228:231]// 0000000046E8: D3B580E4 07925180
ds_read_b128 v[92:95], v3 offset:768 // 0000000046F0: D9FE0300 5C000003
v_mfma_f32_16x16x32_bf16 a[232:235], v[128:131], v[44:47], a[232:235]// 0000000046F8: D3B580E8 07A25980
ds_read_b128 v[96:99], v3 offset:896 // 000000004700: D9FE0380 60000003
// Tail of pass 2. Execution then falls through to the next label.
// NOTE(review): no s_waitcnt lgkmcnt follows the 16 ds_reads above inside this
// block, so whatever consumes v[4:35] / v[68:99] next must wait for them itself
// -- verify that the fall-through path does so before its first MFMA.
v_mfma_f32_16x16x32_bf16 a[236:239], v[128:131], v[48:51], a[236:239]// 000000004708: D3B580EC 07B26180
v_mfma_f32_16x16x32_bf16 a[240:243], v[128:131], v[52:55], a[240:243]// 000000004710: D3B580F0 07C26980
v_mfma_f32_16x16x32_bf16 a[244:247], v[128:131], v[56:59], a[244:247]// 000000004718: D3B580F4 07D27180
v_mfma_f32_16x16x32_bf16 a[248:251], v[128:131], v[60:63], a[248:251]// 000000004720: D3B580F8 07E27980
v_mfma_f32_16x16x32_bf16 a[252:255], v[128:131], v[64:67], a[252:255]// 000000004728: D3B580FC 07F28180
// ---------------------------------------------------------------------------
// label_toPGR1: one fully unrolled accumulation step over all 256 accumulator
// registers a[0:255], performed in two halves of 64 MFMAs each.
//
//   half 1: a[32*i + 4*j .. +3] += mfma(v[68+4*i .. +3], v[4+4*j .. +3]),  i,j = 0..7
//           interleaved with 16 ds_read_b128 that fetch the operands of half 2:
//             v[36:67]   from LDS address v2, offsets 64 + 128*k
//             v[100:131] from LDS address v3, offsets 64 + 128*k
//   half 2: a[32*i + 4*j .. +3] += mfma(v[100+4*i .. +3], v[36+4*j .. +3]), i,j = 0..7
//
// Inputs : v[4:35], v[68:99] (operands of half 1), v2/v3 (LDS read addresses),
//          a[0:255] (running accumulators).
// Outputs: a[0:255]. Clobbers v[36:67], v[100:131], s8, s84, s85, SCC.
// ---------------------------------------------------------------------------
label_toPGR1:
// Scalar prologue. Nothing in this block reads s8, s84 or s85 afterwards, and
// the first s84 value is overwritten before any use. The gaps in the original
// addresses (trailing comments) suggest the instructions that consumed these
// values were stripped from this listing -- NOTE(review): confirm nothing
// later in the file depends on s84/s85 before deleting these.
s_and_b32 s8, s50, 0x3fff // 000000004730: 8608FF32 00003FFF
// s84 = (s2 >= s14 - 1) ? (s24 & 0xff) : 0
s_and_b32 s84, 0xff, s24 // 000000004750: 865418FF 000000FF
s_add_u32 s85, -1, s14 // 000000004758: 80550EC1
s_cmp_ge_u32 s2, s85 // 00000000475C: BF095502
s_cselect_b32 s84, s84, 0 // 000000004760: 85548054
// s84 = (s3 >= s15 - 1) ? (s25 & 0xff) : 0   (replaces the value computed above)
s_and_b32 s84, 0xff, s25 // 00000000476C: 865419FF 000000FF
s_add_u32 s85, -1, s15 // 000000004774: 80550FC1
s_cmp_ge_u32 s3, s85 // 000000004778: BF095503
s_cselect_b32 s84, s84, 0 // 00000000477C: 85548054
// FIX: v[20:35] and v[72:99] are written by the ds_read_b128 instructions issued
// immediately before this label, and no s_waitcnt separates those reads from
// the MFMAs below that consume them. LDS results are only guaranteed to be in
// the VGPRs after lgkmcnt has drained, so wait here before the first consumer.
// This instruction is new: it has no original address/encoding annotation.
s_waitcnt lgkmcnt(0)
// ---- half 1: MFMAs on v[68:99] x v[4:35], with LDS prefetch for half 2 ----
v_mfma_f32_16x16x32_bf16 a[0:3], v[68:71], v[4:7], a[0:3] // 000000004788: D3B58000 04020944
ds_read_b128 v[36:39], v2 offset:64 // 000000004790: D9FE0040 24000002
v_mfma_f32_16x16x32_bf16 a[4:7], v[68:71], v[8:11], a[4:7] // 000000004798: D3B58004 04121144
v_mfma_f32_16x16x32_bf16 a[8:11], v[68:71], v[12:15], a[8:11]// 0000000047A0: D3B58008 04221944
ds_read_b128 v[100:103], v3 offset:64 // 0000000047A8: D9FE0040 64000003
v_mfma_f32_16x16x32_bf16 a[12:15], v[68:71], v[16:19], a[12:15]// 0000000047B0: D3B5800C 04322144
v_mfma_f32_16x16x32_bf16 a[16:19], v[68:71], v[20:23], a[16:19]// 0000000047B8: D3B58010 04422944
ds_read_b128 v[40:43], v2 offset:192 // 0000000047C0: D9FE00C0 28000002
v_mfma_f32_16x16x32_bf16 a[20:23], v[68:71], v[24:27], a[20:23]// 0000000047C8: D3B58014 04523144
v_mfma_f32_16x16x32_bf16 a[24:27], v[68:71], v[28:31], a[24:27]// 0000000047D0: D3B58018 04623944
ds_read_b128 v[44:47], v2 offset:320 // 0000000047D8: D9FE0140 2C000002
v_mfma_f32_16x16x32_bf16 a[28:31], v[68:71], v[32:35], a[28:31]// 0000000047E0: D3B5801C 04724144
v_mfma_f32_16x16x32_bf16 a[32:35], v[72:75], v[4:7], a[32:35]// 0000000047E8: D3B58020 04820948
ds_read_b128 v[48:51], v2 offset:448 // 0000000047F0: D9FE01C0 30000002
v_mfma_f32_16x16x32_bf16 a[36:39], v[72:75], v[8:11], a[36:39]// 0000000047F8: D3B58024 04921148
v_mfma_f32_16x16x32_bf16 a[40:43], v[72:75], v[12:15], a[40:43]// 000000004800: D3B58028 04A21948
ds_read_b128 v[52:55], v2 offset:576 // 000000004808: D9FE0240 34000002
v_mfma_f32_16x16x32_bf16 a[44:47], v[72:75], v[16:19], a[44:47]// 000000004810: D3B5802C 04B22148
v_mfma_f32_16x16x32_bf16 a[48:51], v[72:75], v[20:23], a[48:51]// 000000004818: D3B58030 04C22948
ds_read_b128 v[56:59], v2 offset:704 // 000000004820: D9FE02C0 38000002
v_mfma_f32_16x16x32_bf16 a[52:55], v[72:75], v[24:27], a[52:55]// 000000004828: D3B58034 04D23148
v_mfma_f32_16x16x32_bf16 a[56:59], v[72:75], v[28:31], a[56:59]// 000000004830: D3B58038 04E23948
ds_read_b128 v[60:63], v2 offset:832 // 000000004838: D9FE0340 3C000002
v_mfma_f32_16x16x32_bf16 a[60:63], v[72:75], v[32:35], a[60:63]// 000000004840: D3B5803C 04F24148
v_mfma_f32_16x16x32_bf16 a[64:67], v[76:79], v[4:7], a[64:67]// 000000004848: D3B58040 0502094C
ds_read_b128 v[64:67], v2 offset:960 // 000000004850: D9FE03C0 40000002
v_mfma_f32_16x16x32_bf16 a[68:71], v[76:79], v[8:11], a[68:71]// 000000004858: D3B58044 0512114C
v_mfma_f32_16x16x32_bf16 a[72:75], v[76:79], v[12:15], a[72:75]// 000000004860: D3B58048 0522194C
ds_read_b128 v[104:107], v3 offset:192 // 000000004868: D9FE00C0 68000003
v_mfma_f32_16x16x32_bf16 a[76:79], v[76:79], v[16:19], a[76:79]// 000000004870: D3B5804C 0532214C
v_mfma_f32_16x16x32_bf16 a[80:83], v[76:79], v[20:23], a[80:83]// 000000004878: D3B58050 0542294C
ds_read_b128 v[108:111], v3 offset:320 // 000000004880: D9FE0140 6C000003
v_mfma_f32_16x16x32_bf16 a[84:87], v[76:79], v[24:27], a[84:87]// 000000004888: D3B58054 0552314C
v_mfma_f32_16x16x32_bf16 a[88:91], v[76:79], v[28:31], a[88:91]// 000000004890: D3B58058 0562394C
ds_read_b128 v[112:115], v3 offset:448 // 000000004898: D9FE01C0 70000003
v_mfma_f32_16x16x32_bf16 a[92:95], v[76:79], v[32:35], a[92:95]// 0000000048A0: D3B5805C 0572414C
v_mfma_f32_16x16x32_bf16 a[96:99], v[80:83], v[4:7], a[96:99]// 0000000048A8: D3B58060 05820950
ds_read_b128 v[116:119], v3 offset:576 // 0000000048B0: D9FE0240 74000003
v_mfma_f32_16x16x32_bf16 a[100:103], v[80:83], v[8:11], a[100:103]// 0000000048B8: D3B58064 05921150
v_mfma_f32_16x16x32_bf16 a[104:107], v[80:83], v[12:15], a[104:107]// 0000000048C0: D3B58068 05A21950
ds_read_b128 v[120:123], v3 offset:704 // 0000000048C8: D9FE02C0 78000003
v_mfma_f32_16x16x32_bf16 a[108:111], v[80:83], v[16:19], a[108:111]// 0000000048D0: D3B5806C 05B22150
v_mfma_f32_16x16x32_bf16 a[112:115], v[80:83], v[20:23], a[112:115]// 0000000048D8: D3B58070 05C22950
ds_read_b128 v[124:127], v3 offset:832 // 0000000048E0: D9FE0340 7C000003
v_mfma_f32_16x16x32_bf16 a[116:119], v[80:83], v[24:27], a[116:119]// 0000000048E8: D3B58074 05D23150
v_mfma_f32_16x16x32_bf16 a[120:123], v[80:83], v[28:31], a[120:123]// 0000000048F0: D3B58078 05E23950
ds_read_b128 v[128:131], v3 offset:960 // 0000000048F8: D9FE03C0 80000003
v_mfma_f32_16x16x32_bf16 a[124:127], v[80:83], v[32:35], a[124:127]// 000000004900: D3B5807C 05F24150
v_mfma_f32_16x16x32_bf16 a[128:131], v[84:87], v[4:7], a[128:131]// 000000004908: D3B58080 06020954
v_mfma_f32_16x16x32_bf16 a[132:135], v[84:87], v[8:11], a[132:135]// 000000004910: D3B58084 06121154
v_mfma_f32_16x16x32_bf16 a[136:139], v[84:87], v[12:15], a[136:139]// 000000004918: D3B58088 06221954
v_mfma_f32_16x16x32_bf16 a[140:143], v[84:87], v[16:19], a[140:143]// 000000004920: D3B5808C 06322154
v_mfma_f32_16x16x32_bf16 a[144:147], v[84:87], v[20:23], a[144:147]// 000000004928: D3B58090 06422954
v_mfma_f32_16x16x32_bf16 a[148:151], v[84:87], v[24:27], a[148:151]// 000000004930: D3B58094 06523154
v_mfma_f32_16x16x32_bf16 a[152:155], v[84:87], v[28:31], a[152:155]// 000000004938: D3B58098 06623954
v_mfma_f32_16x16x32_bf16 a[156:159], v[84:87], v[32:35], a[156:159]// 000000004940: D3B5809C 06724154
v_mfma_f32_16x16x32_bf16 a[160:163], v[88:91], v[4:7], a[160:163]// 000000004948: D3B580A0 06820958
v_mfma_f32_16x16x32_bf16 a[164:167], v[88:91], v[8:11], a[164:167]// 000000004950: D3B580A4 06921158
v_mfma_f32_16x16x32_bf16 a[168:171], v[88:91], v[12:15], a[168:171]// 000000004958: D3B580A8 06A21958
v_mfma_f32_16x16x32_bf16 a[172:175], v[88:91], v[16:19], a[172:175]// 000000004960: D3B580AC 06B22158
v_mfma_f32_16x16x32_bf16 a[176:179], v[88:91], v[20:23], a[176:179]// 000000004968: D3B580B0 06C22958
v_mfma_f32_16x16x32_bf16 a[180:183], v[88:91], v[24:27], a[180:183]// 000000004970: D3B580B4 06D23158
v_mfma_f32_16x16x32_bf16 a[184:187], v[88:91], v[28:31], a[184:187]// 000000004978: D3B580B8 06E23958
v_mfma_f32_16x16x32_bf16 a[188:191], v[88:91], v[32:35], a[188:191]// 000000004980: D3B580BC 06F24158
v_mfma_f32_16x16x32_bf16 a[192:195], v[92:95], v[4:7], a[192:195]// 000000004988: D3B580C0 0702095C
v_mfma_f32_16x16x32_bf16 a[196:199], v[92:95], v[8:11], a[196:199]// 000000004990: D3B580C4 0712115C
v_mfma_f32_16x16x32_bf16 a[200:203], v[92:95], v[12:15], a[200:203]// 000000004998: D3B580C8 0722195C
v_mfma_f32_16x16x32_bf16 a[204:207], v[92:95], v[16:19], a[204:207]// 0000000049A0: D3B580CC 0732215C
v_mfma_f32_16x16x32_bf16 a[208:211], v[92:95], v[20:23], a[208:211]// 0000000049A8: D3B580D0 0742295C
v_mfma_f32_16x16x32_bf16 a[212:215], v[92:95], v[24:27], a[212:215]// 0000000049B0: D3B580D4 0752315C
v_mfma_f32_16x16x32_bf16 a[216:219], v[92:95], v[28:31], a[216:219]// 0000000049B8: D3B580D8 0762395C
v_mfma_f32_16x16x32_bf16 a[220:223], v[92:95], v[32:35], a[220:223]// 0000000049C0: D3B580DC 0772415C
v_mfma_f32_16x16x32_bf16 a[224:227], v[96:99], v[4:7], a[224:227]// 0000000049C8: D3B580E0 07820960
v_mfma_f32_16x16x32_bf16 a[228:231], v[96:99], v[8:11], a[228:231]// 0000000049D0: D3B580E4 07921160
v_mfma_f32_16x16x32_bf16 a[232:235], v[96:99], v[12:15], a[232:235]// 0000000049D8: D3B580E8 07A21960
v_mfma_f32_16x16x32_bf16 a[236:239], v[96:99], v[16:19], a[236:239]// 0000000049E0: D3B580EC 07B22160
v_mfma_f32_16x16x32_bf16 a[240:243], v[96:99], v[20:23], a[240:243]// 0000000049E8: D3B580F0 07C22960
v_mfma_f32_16x16x32_bf16 a[244:247], v[96:99], v[24:27], a[244:247]// 0000000049F0: D3B580F4 07D23160
v_mfma_f32_16x16x32_bf16 a[248:251], v[96:99], v[28:31], a[248:251]// 0000000049F8: D3B580F8 07E23960
v_mfma_f32_16x16x32_bf16 a[252:255], v[96:99], v[32:35], a[252:255]// 000000004A00: D3B580FC 07F24160
// All 16 ds_read_b128 issued during half 1 must have landed in v[36:67] and
// v[100:131] before half 2 consumes them.
s_waitcnt lgkmcnt(0) // 000000004A08: BF8CC07F
// ---- half 2: MFMAs on v[100:131] x v[36:67] ----
v_mfma_f32_16x16x32_bf16 a[0:3], v[100:103], v[36:39], a[0:3]// 000000004A0C: D3B58000 04024964
v_mfma_f32_16x16x32_bf16 a[4:7], v[100:103], v[40:43], a[4:7]// 000000004A14: D3B58004 04125164
v_mfma_f32_16x16x32_bf16 a[8:11], v[100:103], v[44:47], a[8:11]// 000000004A1C: D3B58008 04225964
v_mfma_f32_16x16x32_bf16 a[12:15], v[100:103], v[48:51], a[12:15]// 000000004A24: D3B5800C 04326164
v_mfma_f32_16x16x32_bf16 a[16:19], v[100:103], v[52:55], a[16:19]// 000000004A2C: D3B58010 04426964
v_mfma_f32_16x16x32_bf16 a[20:23], v[100:103], v[56:59], a[20:23]// 000000004A34: D3B58014 04527164
v_mfma_f32_16x16x32_bf16 a[24:27], v[100:103], v[60:63], a[24:27]// 000000004A3C: D3B58018 04627964
v_mfma_f32_16x16x32_bf16 a[28:31], v[100:103], v[64:67], a[28:31]// 000000004A44: D3B5801C 04728164
v_mfma_f32_16x16x32_bf16 a[32:35], v[104:107], v[36:39], a[32:35]// 000000004A4C: D3B58020 04824968
v_mfma_f32_16x16x32_bf16 a[36:39], v[104:107], v[40:43], a[36:39]// 000000004A54: D3B58024 04925168
v_mfma_f32_16x16x32_bf16 a[40:43], v[104:107], v[44:47], a[40:43]// 000000004A5C: D3B58028 04A25968
v_mfma_f32_16x16x32_bf16 a[44:47], v[104:107], v[48:51], a[44:47]// 000000004A64: D3B5802C 04B26168
v_mfma_f32_16x16x32_bf16 a[48:51], v[104:107], v[52:55], a[48:51]// 000000004A6C: D3B58030 04C26968
v_mfma_f32_16x16x32_bf16 a[52:55], v[104:107], v[56:59], a[52:55]// 000000004A74: D3B58034 04D27168
v_mfma_f32_16x16x32_bf16 a[56:59], v[104:107], v[60:63], a[56:59]// 000000004A7C: D3B58038 04E27968
v_mfma_f32_16x16x32_bf16 a[60:63], v[104:107], v[64:67], a[60:63]// 000000004A84: D3B5803C 04F28168
v_mfma_f32_16x16x32_bf16 a[64:67], v[108:111], v[36:39], a[64:67]// 000000004A8C: D3B58040 0502496C
v_mfma_f32_16x16x32_bf16 a[68:71], v[108:111], v[40:43], a[68:71]// 000000004A94: D3B58044 0512516C
v_mfma_f32_16x16x32_bf16 a[72:75], v[108:111], v[44:47], a[72:75]// 000000004A9C: D3B58048 0522596C
v_mfma_f32_16x16x32_bf16 a[76:79], v[108:111], v[48:51], a[76:79]// 000000004AA4: D3B5804C 0532616C
v_mfma_f32_16x16x32_bf16 a[80:83], v[108:111], v[52:55], a[80:83]// 000000004AAC: D3B58050 0542696C
v_mfma_f32_16x16x32_bf16 a[84:87], v[108:111], v[56:59], a[84:87]// 000000004AB4: D3B58054 0552716C
v_mfma_f32_16x16x32_bf16 a[88:91], v[108:111], v[60:63], a[88:91]// 000000004ABC: D3B58058 0562796C
v_mfma_f32_16x16x32_bf16 a[92:95], v[108:111], v[64:67], a[92:95]// 000000004AC4: D3B5805C 0572816C
v_mfma_f32_16x16x32_bf16 a[96:99], v[112:115], v[36:39], a[96:99]// 000000004ACC: D3B58060 05824970
v_mfma_f32_16x16x32_bf16 a[100:103], v[112:115], v[40:43], a[100:103]// 000000004AD4: D3B58064 05925170
v_mfma_f32_16x16x32_bf16 a[104:107], v[112:115], v[44:47], a[104:107]// 000000004ADC: D3B58068 05A25970
v_mfma_f32_16x16x32_bf16 a[108:111], v[112:115], v[48:51], a[108:111]// 000000004AE4: D3B5806C 05B26170
v_mfma_f32_16x16x32_bf16 a[112:115], v[112:115], v[52:55], a[112:115]// 000000004AEC: D3B58070 05C26970
v_mfma_f32_16x16x32_bf16 a[116:119], v[112:115], v[56:59], a[116:119]// 000000004AF4: D3B58074 05D27170
v_mfma_f32_16x16x32_bf16 a[120:123], v[112:115], v[60:63], a[120:123]// 000000004AFC: D3B58078 05E27970
v_mfma_f32_16x16x32_bf16 a[124:127], v[112:115], v[64:67], a[124:127]// 000000004B04: D3B5807C 05F28170
v_mfma_f32_16x16x32_bf16 a[128:131], v[116:119], v[36:39], a[128:131]// 000000004B0C: D3B58080 06024974
v_mfma_f32_16x16x32_bf16 a[132:135], v[116:119], v[40:43], a[132:135]// 000000004B14: D3B58084 06125174
v_mfma_f32_16x16x32_bf16 a[136:139], v[116:119], v[44:47], a[136:139]// 000000004B1C: D3B58088 06225974
v_mfma_f32_16x16x32_bf16 a[140:143], v[116:119], v[48:51], a[140:143]// 000000004B24: D3B5808C 06326174
v_mfma_f32_16x16x32_bf16 a[144:147], v[116:119], v[52:55], a[144:147]// 000000004B2C: D3B58090 06426974
v_mfma_f32_16x16x32_bf16 a[148:151], v[116:119], v[56:59], a[148:151]// 000000004B34: D3B58094 06527174
v_mfma_f32_16x16x32_bf16 a[152:155], v[116:119], v[60:63], a[152:155]// 000000004B3C: D3B58098 06627974
v_mfma_f32_16x16x32_bf16 a[156:159], v[116:119], v[64:67], a[156:159]// 000000004B44: D3B5809C 06728174
v_mfma_f32_16x16x32_bf16 a[160:163], v[120:123], v[36:39], a[160:163]// 000000004B4C: D3B580A0 06824978
v_mfma_f32_16x16x32_bf16 a[164:167], v[120:123], v[40:43], a[164:167]// 000000004B54: D3B580A4 06925178
v_mfma_f32_16x16x32_bf16 a[168:171], v[120:123], v[44:47], a[168:171]// 000000004B5C: D3B580A8 06A25978
v_mfma_f32_16x16x32_bf16 a[172:175], v[120:123], v[48:51], a[172:175]// 000000004B64: D3B580AC 06B26178
v_mfma_f32_16x16x32_bf16 a[176:179], v[120:123], v[52:55], a[176:179]// 000000004B6C: D3B580B0 06C26978
v_mfma_f32_16x16x32_bf16 a[180:183], v[120:123], v[56:59], a[180:183]// 000000004B74: D3B580B4 06D27178
v_mfma_f32_16x16x32_bf16 a[184:187], v[120:123], v[60:63], a[184:187]// 000000004B7C: D3B580B8 06E27978
v_mfma_f32_16x16x32_bf16 a[188:191], v[120:123], v[64:67], a[188:191]// 000000004B84: D3B580BC 06F28178
v_mfma_f32_16x16x32_bf16 a[192:195], v[124:127], v[36:39], a[192:195]// 000000004B8C: D3B580C0 0702497C
v_mfma_f32_16x16x32_bf16 a[196:199], v[124:127], v[40:43], a[196:199]// 000000004B94: D3B580C4 0712517C
v_mfma_f32_16x16x32_bf16 a[200:203], v[124:127], v[44:47], a[200:203]// 000000004B9C: D3B580C8 0722597C
v_mfma_f32_16x16x32_bf16 a[204:207], v[124:127], v[48:51], a[204:207]// 000000004BA4: D3B580CC 0732617C
v_mfma_f32_16x16x32_bf16 a[208:211], v[124:127], v[52:55], a[208:211]// 000000004BAC: D3B580D0 0742697C
v_mfma_f32_16x16x32_bf16 a[212:215], v[124:127], v[56:59], a[212:215]// 000000004BB4: D3B580D4 0752717C
v_mfma_f32_16x16x32_bf16 a[216:219], v[124:127], v[60:63], a[216:219]// 000000004BBC: D3B580D8 0762797C
v_mfma_f32_16x16x32_bf16 a[220:223], v[124:127], v[64:67], a[220:223]// 000000004BC4: D3B580DC 0772817C
v_mfma_f32_16x16x32_bf16 a[224:227], v[128:131], v[36:39], a[224:227]// 000000004BCC: D3B580E0 07824980
v_mfma_f32_16x16x32_bf16 a[228:231], v[128:131], v[40:43], a[228:231]// 000000004BD4: D3B580E4 07925180
v_mfma_f32_16x16x32_bf16 a[232:235], v[128:131], v[44:47], a[232:235]// 000000004BDC: D3B580E8 07A25980
v_mfma_f32_16x16x32_bf16 a[236:239], v[128:131], v[48:51], a[236:239]// 000000004BE4: D3B580EC 07B26180
v_mfma_f32_16x16x32_bf16 a[240:243], v[128:131], v[52:55], a[240:243]// 000000004BEC: D3B580F0 07C26980
v_mfma_f32_16x16x32_bf16 a[244:247], v[128:131], v[56:59], a[244:247]// 000000004BF4: D3B580F4 07D27180
v_mfma_f32_16x16x32_bf16 a[248:251], v[128:131], v[60:63], a[248:251]// 000000004BFC: D3B580F8 07E27980
v_mfma_f32_16x16x32_bf16 a[252:255], v[128:131], v[64:67], a[252:255]// 000000004C04: D3B580FC 07F28180
// ---------------------------------------------------------------------------
// label_toPGR1end_OptNLL: derive the per-lane output coordinates used by the
// store sequence that follows.
//
// Input : v134 (copy of v0 taken at kernel entry -- presumably the work-item
//         id; verify), s2/s3 (presumably the workgroup indices along the two
//         tile dimensions; verify), s36 (strideD0), s38 (strideC0).
// Output: v0 = 256*s2 + 8*((v134 & 15) + 16*((v134 >> 6) & 1))
//         v1 = 256*s3 + 8*(16*(v134 >> 7) + 4*((v134 & 63) >> 4))
//         v2 = (v1 before the 256*s3 term is added) * s38
//         v3 = (v1 before the 256*s3 term is added) * s36
// Clobbers v4, v5, s8.
// ---------------------------------------------------------------------------
label_toPGR1end_OptNLL:
// v4 = v134 / 64 (index of the 64-lane group); v5 = 16 * (v4 / 2)
v_lshrrev_b32_e32 v4, 6, v134 // 000000004C0C: 20090C86
v_lshrrev_b32_e32 v5, 1, v4 // 000000004C10: 200A0881
v_mul_lo_u32 v5, 16, v5 // 000000004C14: D2850005 00020A90
// v1 = 4 * ((v134 % 64) / 16)
v_and_b32_e32 v1, 63, v134 // 000000004C1C: 26030CBF
v_lshrrev_b32_e32 v1, 4, v1 // 000000004C20: 20020284
v_lshlrev_b32_e32 v1, 2, v1 // 000000004C24: 24020282
// v1 = (v5 + v1) * 8 : local coordinate along the strided dimension
v_add_lshl_u32 v1, v5, v1, 3 // 000000004C28: D1FE0001 020E0305
// Pre-multiply the local coordinate by the C and D strides.
v_mul_lo_u32 v2, v1, s38 // 000000004C30: D2850002 00004D01
v_mul_lo_u32 v3, v1, s36 // 000000004C38: D2850003 00004901
// v0 = ((v134 % 16) + 16 * (v4 & 1)) * 8 : local coordinate along the other dimension
v_and_b32_e32 v0, 1, v4 // 000000004C40: 26000881
v_mul_lo_u32 v0, 16, v0 // 000000004C44: D2850000 00020090
v_and_b32_e32 v5, 15, v134 // 000000004C4C: 260B0C8F
v_add_lshl_u32 v0, v5, v0, 3 // 000000004C50: D1FE0000 020E0105
// Add the tile origin: 0x100 (256) elements per s2 / s3 step
// (presumably the macro-tile edge length -- verify against the launch geometry).
s_mul_i32 s8, 0x100, s2 // 000000004C58: 920802FF 00000100
v_add_u32_e32 v0, s8, v0 // 000000004C60: 68000008
s_mul_i32 s8, 0x100, s3 // 000000004C64: 920803FF 00000100
v_add_u32_e32 v1, s8, v1 // 000000004C6C: 68020208
label_GW_B0_E0:
v_add_lshl_u32 v11, v3, v0, 1 // 000000004C70: D1FE000B 02060103
v_accvgpr_read_b32 v16, a0 // 000000004C78: D3D84010 18000100
v_accvgpr_read_b32 v17, a4 // 000000004C80: D3D84011 18000104
v_accvgpr_read_b32 v18, a8 // 000000004C88: D3D84012 18000108
v_accvgpr_read_b32 v19, a12 // 000000004C90: D3D84013 1800010C
v_accvgpr_read_b32 v20, a16 // 000000004C98: D3D84014 18000110
v_accvgpr_read_b32 v21, a20 // 000000004CA0: D3D84015 18000114
v_accvgpr_read_b32 v22, a24 // 000000004CA8: D3D84016 18000118
v_accvgpr_read_b32 v23, a28 // 000000004CB0: D3D84017 1800011C
v_accvgpr_read_b32 v24, a32 // 000000004CB8: D3D84018 18000120
v_accvgpr_read_b32 v25, a36 // 000000004CC0: D3D84019 18000124
v_accvgpr_read_b32 v26, a40 // 000000004CC8: D3D8401A 18000128
v_accvgpr_read_b32 v27, a44 // 000000004CD0: D3D8401B 1800012C
v_accvgpr_read_b32 v28, a48 // 000000004CD8: D3D8401C 18000130
v_accvgpr_read_b32 v29, a52 // 000000004CE0: D3D8401D 18000134
v_accvgpr_read_b32 v30, a56 // 000000004CE8: D3D8401E 18000138
v_accvgpr_read_b32 v31, a60 // 000000004CF0: D3D8401F 1800013C
v_accvgpr_read_b32 v32, a64 // 000000004CF8: D3D84020 18000140
v_accvgpr_read_b32 v33, a68 // 000000004D00: D3D84021 18000144
v_accvgpr_read_b32 v34, a72 // 000000004D08: D3D84022 18000148
v_accvgpr_read_b32 v35, a76 // 000000004D10: D3D84023 1800014C
v_accvgpr_read_b32 v36, a80 // 000000004D18: D3D84024 18000150
v_accvgpr_read_b32 v37, a84 // 000000004D20: D3D84025 18000154
v_accvgpr_read_b32 v38, a88 // 000000004D28: D3D84026 18000158
v_accvgpr_read_b32 v39, a92 // 000000004D30: D3D84027 1800015C
v_accvgpr_read_b32 v40, a96 // 000000004D38: D3D84028 18000160
v_accvgpr_read_b32 v41, a100 // 000000004D40: D3D84029 18000164
v_accvgpr_read_b32 v42, a104 // 000000004D48: D3D8402A 18000168
v_accvgpr_read_b32 v43, a108 // 000000004D50: D3D8402B 1800016C
v_accvgpr_read_b32 v44, a112 // 000000004D58: D3D8402C 18000170
v_accvgpr_read_b32 v45, a116 // 000000004D60: D3D8402D 18000174
v_accvgpr_read_b32 v46, a120 // 000000004D68: D3D8402E 18000178
v_accvgpr_read_b32 v47, a124 // 000000004D70: D3D8402F 1800017C
v_accvgpr_read_b32 v48, a128 // 000000004D78: D3D84030 18000180
v_accvgpr_read_b32 v49, a132 // 000000004D80: D3D84031 18000184
v_accvgpr_read_b32 v50, a136 // 000000004D88: D3D84032 18000188
v_accvgpr_read_b32 v51, a140 // 000000004D90: D3D84033 1800018C
v_accvgpr_read_b32 v52, a144 // 000000004D98: D3D84034 18000190
v_accvgpr_read_b32 v53, a148 // 000000004DA0: D3D84035 18000194
v_accvgpr_read_b32 v54, a152 // 000000004DA8: D3D84036 18000198
v_accvgpr_read_b32 v55, a156 // 000000004DB0: D3D84037 1800019C
v_accvgpr_read_b32 v56, a160 // 000000004DB8: D3D84038 180001A0
v_accvgpr_read_b32 v57, a164 // 000000004DC0: D3D84039 180001A4
v_accvgpr_read_b32 v58, a168 // 000000004DC8: D3D8403A 180001A8
v_accvgpr_read_b32 v59, a172 // 000000004DD0: D3D8403B 180001AC
v_accvgpr_read_b32 v60, a176 // 000000004DD8: D3D8403C 180001B0
v_accvgpr_read_b32 v61, a180 // 000000004DE0: D3D8403D 180001B4
v_accvgpr_read_b32 v62, a184 // 000000004DE8: D3D8403E 180001B8
v_accvgpr_read_b32 v63, a188 // 000000004DF0: D3D8403F 180001BC
v_accvgpr_read_b32 v64, a192 // 000000004DF8: D3D84040 180001C0
v_accvgpr_read_b32 v65, a196 // 000000004E00: D3D84041 180001C4
v_accvgpr_read_b32 v66, a200 // 000000004E08: D3D84042 180001C8
v_accvgpr_read_b32 v67, a204 // 000000004E10: D3D84043 180001CC
v_accvgpr_read_b32 v68, a208 // 000000004E18: D3D84044 180001D0
v_accvgpr_read_b32 v69, a212 // 000000004E20: D3D84045 180001D4
v_accvgpr_read_b32 v70, a216 // 000000004E28: D3D84046 180001D8
v_accvgpr_read_b32 v71, a220 // 000000004E30: D3D84047 180001DC
v_accvgpr_read_b32 v72, a224 // 000000004E38: D3D84048 180001E0
v_accvgpr_read_b32 v73, a228 // 000000004E40: D3D84049 180001E4
v_accvgpr_read_b32 v74, a232 // 000000004E48: D3D8404A 180001E8
v_accvgpr_read_b32 v75, a236 // 000000004E50: D3D8404B 180001EC
v_accvgpr_read_b32 v76, a240 // 000000004E58: D3D8404C 180001F0
v_accvgpr_read_b32 v77, a244 // 000000004E60: D3D8404D 180001F4
v_accvgpr_read_b32 v78, a248 // 000000004E68: D3D8404E 180001F8
v_accvgpr_read_b32 v79, a252 // 000000004E70: D3D8404F 180001FC
v_accvgpr_read_b32 v80, a1 // 000000004E78: D3D84050 18000101
v_accvgpr_read_b32 v81, a5 // 000000004E80: D3D84051 18000105
v_accvgpr_read_b32 v82, a9 // 000000004E88: D3D84052 18000109
v_accvgpr_read_b32 v83, a13 // 000000004E90: D3D84053 1800010D
v_accvgpr_read_b32 v84, a17 // 000000004E98: D3D84054 18000111
v_accvgpr_read_b32 v85, a21 // 000000004EA0: D3D84055 18000115
v_accvgpr_read_b32 v86, a25 // 000000004EA8: D3D84056 18000119
v_accvgpr_read_b32 v87, a29 // 000000004EB0: D3D84057 1800011D
v_accvgpr_read_b32 v88, a33 // 000000004EB8: D3D84058 18000121
v_accvgpr_read_b32 v89, a37 // 000000004EC0: D3D84059 18000125
v_accvgpr_read_b32 v90, a41 // 000000004EC8: D3D8405A 18000129
v_accvgpr_read_b32 v91, a45 // 000000004ED0: D3D8405B 1800012D
v_accvgpr_read_b32 v92, a49 // 000000004ED8: D3D8405C 18000131
v_accvgpr_read_b32 v93, a53 // 000000004EE0: D3D8405D 18000135
v_accvgpr_read_b32 v94, a57 // 000000004EE8: D3D8405E 18000139
v_accvgpr_read_b32 v95, a61 // 000000004EF0: D3D8405F 1800013D
v_accvgpr_read_b32 v96, a65 // 000000004EF8: D3D84060 18000141
v_accvgpr_read_b32 v97, a69 // 000000004F00: D3D84061 18000145
v_accvgpr_read_b32 v98, a73 // 000000004F08: D3D84062 18000149
v_accvgpr_read_b32 v99, a77 // 000000004F10: D3D84063 1800014D
v_accvgpr_read_b32 v100, a81 // 000000004F18: D3D84064 18000151
v_accvgpr_read_b32 v101, a85 // 000000004F20: D3D84065 18000155
v_accvgpr_read_b32 v102, a89 // 000000004F28: D3D84066 18000159
v_accvgpr_read_b32 v103, a93 // 000000004F30: D3D84067 1800015D
v_accvgpr_read_b32 v104, a97 // 000000004F38: D3D84068 18000161
v_accvgpr_read_b32 v105, a101 // 000000004F40: D3D84069 18000165
v_accvgpr_read_b32 v106, a105 // 000000004F48: D3D8406A 18000169
v_accvgpr_read_b32 v107, a109 // 000000004F50: D3D8406B 1800016D
v_accvgpr_read_b32 v108, a113 // 000000004F58: D3D8406C 18000171
v_accvgpr_read_b32 v109, a117 // 000000004F60: D3D8406D 18000175
v_accvgpr_read_b32 v110, a121 // 000000004F68: D3D8406E 18000179
v_accvgpr_read_b32 v111, a125 // 000000004F70: D3D8406F 1800017D
v_accvgpr_read_b32 v112, a129 // 000000004F78: D3D84070 18000181
v_accvgpr_read_b32 v113, a133 // 000000004F80: D3D84071 18000185
v_accvgpr_read_b32 v114, a137 // 000000004F88: D3D84072 18000189
v_accvgpr_read_b32 v115, a141 // 000000004F90: D3D84073 1800018D
v_accvgpr_read_b32 v116, a145 // 000000004F98: D3D84074 18000191
v_accvgpr_read_b32 v117, a149 // 000000004FA0: D3D84075 18000195
v_accvgpr_read_b32 v118, a153 // 000000004FA8: D3D84076 18000199
v_accvgpr_read_b32 v119, a157 // 000000004FB0: D3D84077 1800019D
v_accvgpr_read_b32 v120, a161 // 000000004FB8: D3D84078 180001A1
v_accvgpr_read_b32 v121, a165 // 000000004FC0: D3D84079 180001A5
v_accvgpr_read_b32 v122, a169 // 000000004FC8: D3D8407A 180001A9
v_accvgpr_read_b32 v123, a173 // 000000004FD0: D3D8407B 180001AD
v_accvgpr_read_b32 v124, a177 // 000000004FD8: D3D8407C 180001B1
v_accvgpr_read_b32 v125, a181 // 000000004FE0: D3D8407D 180001B5
v_accvgpr_read_b32 v126, a185 // 000000004FE8: D3D8407E 180001B9
v_accvgpr_read_b32 v127, a189 // 000000004FF0: D3D8407F 180001BD
v_accvgpr_read_b32 v136, a193 // 000000004FF8: D3D84088 180001C1
v_accvgpr_read_b32 v137, a197 // 000000005000: D3D84089 180001C5
v_accvgpr_read_b32 v138, a201 // 000000005008: D3D8408A 180001C9
v_accvgpr_read_b32 v139, a205 // 000000005010: D3D8408B 180001CD
v_accvgpr_read_b32 v140, a209 // 000000005018: D3D8408C 180001D1
v_accvgpr_read_b32 v141, a213 // 000000005020: D3D8408D 180001D5
v_accvgpr_read_b32 v142, a217 // 000000005028: D3D8408E 180001D9
v_accvgpr_read_b32 v143, a221 // 000000005030: D3D8408F 180001DD
v_accvgpr_read_b32 v144, a225 // 000000005038: D3D84090 180001E1
v_accvgpr_read_b32 v145, a229 // 000000005040: D3D84091 180001E5
v_accvgpr_read_b32 v146, a233 // 000000005048: D3D84092 180001E9
v_accvgpr_read_b32 v147, a237 // 000000005050: D3D84093 180001ED
v_accvgpr_read_b32 v148, a241 // 000000005058: D3D84094 180001F1
v_accvgpr_read_b32 v149, a245 // 000000005060: D3D84095 180001F5
v_accvgpr_read_b32 v150, a249 // 000000005068: D3D84096 180001F9
v_accvgpr_read_b32 v151, a253 // 000000005070: D3D84097 180001FD
v_accvgpr_read_b32 v152, a2 // 000000005078: D3D84098 18000102
v_accvgpr_read_b32 v153, a6 // 000000005080: D3D84099 18000106
v_accvgpr_read_b32 v154, a10 // 000000005088: D3D8409A 1800010A
v_accvgpr_read_b32 v155, a14 // 000000005090: D3D8409B 1800010E
v_accvgpr_read_b32 v156, a18 // 000000005098: D3D8409C 18000112
v_accvgpr_read_b32 v157, a22 // 0000000050A0: D3D8409D 18000116
v_accvgpr_read_b32 v158, a26 // 0000000050A8: D3D8409E 1800011A
v_accvgpr_read_b32 v159, a30 // 0000000050B0: D3D8409F 1800011E
v_accvgpr_read_b32 v160, a34 // 0000000050B8: D3D840A0 18000122
v_accvgpr_read_b32 v161, a38 // 0000000050C0: D3D840A1 18000126
v_accvgpr_read_b32 v162, a42 // 0000000050C8: D3D840A2 1800012A
v_accvgpr_read_b32 v163, a46 // 0000000050D0: D3D840A3 1800012E
v_accvgpr_read_b32 v164, a50 // 0000000050D8: D3D840A4 18000132
v_accvgpr_read_b32 v165, a54 // 0000000050E0: D3D840A5 18000136
v_accvgpr_read_b32 v166, a58 // 0000000050E8: D3D840A6 1800013A
v_accvgpr_read_b32 v167, a62 // 0000000050F0: D3D840A7 1800013E
v_accvgpr_read_b32 v168, a66 // 0000000050F8: D3D840A8 18000142
v_accvgpr_read_b32 v169, a70 // 000000005100: D3D840A9 18000146
v_accvgpr_read_b32 v170, a74 // 000000005108: D3D840AA 1800014A
v_accvgpr_read_b32 v171, a78 // 000000005110: D3D840AB 1800014E
v_accvgpr_read_b32 v172, a82 // 000000005118: D3D840AC 18000152
v_accvgpr_read_b32 v173, a86 // 000000005120: D3D840AD 18000156
v_accvgpr_read_b32 v174, a90 // 000000005128: D3D840AE 1800015A
v_accvgpr_read_b32 v175, a94 // 000000005130: D3D840AF 1800015E
v_accvgpr_read_b32 v176, a98 // 000000005138: D3D840B0 18000162
v_accvgpr_read_b32 v177, a102 // 000000005140: D3D840B1 18000166
v_accvgpr_read_b32 v178, a106 // 000000005148: D3D840B2 1800016A
v_accvgpr_read_b32 v179, a110 // 000000005150: D3D840B3 1800016E
v_accvgpr_read_b32 v180, a114 // 000000005158: D3D840B4 18000172
v_accvgpr_read_b32 v181, a118 // 000000005160: D3D840B5 18000176
v_accvgpr_read_b32 v182, a122 // 000000005168: D3D840B6 1800017A
v_accvgpr_read_b32 v183, a126 // 000000005170: D3D840B7 1800017E
v_accvgpr_read_b32 v184, a130 // 000000005178: D3D840B8 18000182
v_accvgpr_read_b32 v185, a134 // 000000005180: D3D840B9 18000186
v_accvgpr_read_b32 v186, a138 // 000000005188: D3D840BA 1800018A
v_accvgpr_read_b32 v187, a142 // 000000005190: D3D840BB 1800018E
v_accvgpr_read_b32 v188, a146 // 000000005198: D3D840BC 18000192
v_accvgpr_read_b32 v189, a150 // 0000000051A0: D3D840BD 18000196
v_accvgpr_read_b32 v190, a154 // 0000000051A8: D3D840BE 1800019A
v_accvgpr_read_b32 v191, a158 // 0000000051B0: D3D840BF 1800019E
v_accvgpr_read_b32 v192, a162 // 0000000051B8: D3D840C0 180001A2
v_accvgpr_read_b32 v193, a166 // 0000000051C0: D3D840C1 180001A6
v_accvgpr_read_b32 v194, a170 // 0000000051C8: D3D840C2 180001AA
v_accvgpr_read_b32 v195, a174 // 0000000051D0: D3D840C3 180001AE
v_accvgpr_read_b32 v196, a178 // 0000000051D8: D3D840C4 180001B2
v_accvgpr_read_b32 v197, a182 // 0000000051E0: D3D840C5 180001B6
v_accvgpr_read_b32 v198, a186 // 0000000051E8: D3D840C6 180001BA
v_accvgpr_read_b32 v199, a190 // 0000000051F0: D3D840C7 180001BE
v_accvgpr_read_b32 v200, a194 // 0000000051F8: D3D840C8 180001C2
v_accvgpr_read_b32 v201, a198 // 000000005200: D3D840C9 180001C6
v_accvgpr_read_b32 v202, a202 // 000000005208: D3D840CA 180001CA
v_accvgpr_read_b32 v203, a206 // 000000005210: D3D840CB 180001CE
v_accvgpr_read_b32 v204, a210 // 000000005218: D3D840CC 180001D2
v_accvgpr_read_b32 v205, a214 // 000000005220: D3D840CD 180001D6
v_accvgpr_read_b32 v206, a218 // 000000005228: D3D840CE 180001DA
v_accvgpr_read_b32 v207, a222 // 000000005230: D3D840CF 180001DE
v_accvgpr_read_b32 v208, a226 // 000000005238: D3D840D0 180001E2
v_accvgpr_read_b32 v209, a230 // 000000005240: D3D840D1 180001E6
v_accvgpr_read_b32 v210, a234 // 000000005248: D3D840D2 180001EA
v_accvgpr_read_b32 v211, a238 // 000000005250: D3D840D3 180001EE
v_accvgpr_read_b32 v212, a242 // 000000005258: D3D840D4 180001F2
v_accvgpr_read_b32 v213, a246 // 000000005260: D3D840D5 180001F6
v_accvgpr_read_b32 v214, a250 // 000000005268: D3D840D6 180001FA
v_accvgpr_read_b32 v215, a254 // 000000005270: D3D840D7 180001FE
v_accvgpr_read_b32 v216, a3 // 000000005278: D3D840D8 18000103
v_accvgpr_read_b32 v217, a7 // 000000005280: D3D840D9 18000107
v_accvgpr_read_b32 v218, a11 // 000000005288: D3D840DA 1800010B
v_accvgpr_read_b32 v219, a15 // 000000005290: D3D840DB 1800010F
v_accvgpr_read_b32 v220, a19 // 000000005298: D3D840DC 18000113
v_accvgpr_read_b32 v221, a23 // 0000000052A0: D3D840DD 18000117
v_accvgpr_read_b32 v222, a27 // 0000000052A8: D3D840DE 1800011B
v_accvgpr_read_b32 v223, a31 // 0000000052B0: D3D840DF 1800011F
v_accvgpr_read_b32 v224, a35 // 0000000052B8: D3D840E0 18000123
v_accvgpr_read_b32 v225, a39 // 0000000052C0: D3D840E1 18000127
v_accvgpr_read_b32 v226, a43 // 0000000052C8: D3D840E2 1800012B
v_accvgpr_read_b32 v227, a47 // 0000000052D0: D3D840E3 1800012F
v_accvgpr_read_b32 v228, a51 // 0000000052D8: D3D840E4 18000133
v_accvgpr_read_b32 v229, a55 // 0000000052E0: D3D840E5 18000137
v_accvgpr_read_b32 v230, a59 // 0000000052E8: D3D840E6 1800013B
v_accvgpr_read_b32 v231, a63 // 0000000052F0: D3D840E7 1800013F
v_accvgpr_read_b32 v232, a67 // 0000000052F8: D3D840E8 18000143
v_accvgpr_read_b32 v233, a71 // 000000005300: D3D840E9 18000147
v_accvgpr_read_b32 v234, a75 // 000000005308: D3D840EA 1800014B
v_accvgpr_read_b32 v235, a79 // 000000005310: D3D840EB 1800014F
v_accvgpr_read_b32 v236, a83 // 000000005318: D3D840EC 18000153
v_accvgpr_read_b32 v237, a87 // 000000005320: D3D840ED 18000157
v_accvgpr_read_b32 v238, a91 // 000000005328: D3D840EE 1800015B
v_accvgpr_read_b32 v239, a95 // 000000005330: D3D840EF 1800015F
v_accvgpr_read_b32 v240, a99 // 000000005338: D3D840F0 18000163
v_accvgpr_read_b32 v241, a103 // 000000005340: D3D840F1 18000167
v_accvgpr_read_b32 v242, a107 // 000000005348: D3D840F2 1800016B
v_accvgpr_read_b32 v243, a111 // 000000005350: D3D840F3 1800016F
v_accvgpr_read_b32 v244, a115 // 000000005358: D3D840F4 18000173
v_accvgpr_read_b32 v245, a119 // 000000005360: D3D840F5 18000177
v_accvgpr_read_b32 v246, a123 // 000000005368: D3D840F6 1800017B
v_accvgpr_read_b32 v247, a127 // 000000005370: D3D840F7 1800017F
v_mov_b32_e32 v8, 0xffff0000 // 000000005378: 7E1002FF FFFF0000
v_mov_b32_e32 v9, 0x7fff0000 // 000000005380: 7E1202FF 7FFF0000
v_mov_b32_e32 v10, 0x7fff // 000000005388: 7E1402FF 00007FFF
v_cvt_pk_bf16_f32 v16, v16, v17 // 000000005390: D2680010 00022310
v_cvt_pk_bf16_f32 v17, v18, v19 // 000000005398: D2680011 00022712
v_cvt_pk_bf16_f32 v18, v20, v21 // 0000000053A0: D2680012 00022B14
v_cvt_pk_bf16_f32 v19, v22, v23 // 0000000053A8: D2680013 00022F16
buffer_store_dwordx4 v[16:19], v11, s[16:19], 0 offen nt // 0000000053B0: E07E1000 8004100B
v_cvt_pk_bf16_f32 v24, v24, v25 // 0000000053B8: D2680018 00023318
v_cvt_pk_bf16_f32 v25, v26, v27 // 0000000053C0: D2680019 0002371A
v_cvt_pk_bf16_f32 v26, v28, v29 // 0000000053C8: D268001A 00023B1C
v_cvt_pk_bf16_f32 v27, v30, v31 // 0000000053D0: D268001B 00023F1E
s_lshl_b32 s12, s36, 1 // 0000000053D8: 8E0C8124
s_add_u32 s16, s16, s12 // 0000000053DC: 80100C10
s_addc_u32 s17, s17, 0 // 0000000053E0: 82118011
buffer_store_dwordx4 v[24:27], v11, s[16:19], 0 offen nt // 0000000053E4: E07E1000 8004180B
v_cvt_pk_bf16_f32 v32, v32, v33 // 0000000053EC: D2680020 00024320
v_cvt_pk_bf16_f32 v33, v34, v35 // 0000000053F4: D2680021 00024722
v_cvt_pk_bf16_f32 v34, v36, v37 // 0000000053FC: D2680022 00024B24
v_cvt_pk_bf16_f32 v35, v38, v39 // 000000005404: D2680023 00024F26
s_lshl_b32 s12, s36, 1 // 00000000540C: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005410: 80100C10
s_addc_u32 s17, s17, 0 // 000000005414: 82118011
buffer_store_dwordx4 v[32:35], v11, s[16:19], 0 offen nt // 000000005418: E07E1000 8004200B
v_cvt_pk_bf16_f32 v40, v40, v41 // 000000005420: D2680028 00025328
v_cvt_pk_bf16_f32 v41, v42, v43 // 000000005428: D2680029 0002572A
v_cvt_pk_bf16_f32 v42, v44, v45 // 000000005430: D268002A 00025B2C
v_cvt_pk_bf16_f32 v43, v46, v47 // 000000005438: D268002B 00025F2E
s_lshl_b32 s12, s36, 1 // 000000005440: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005444: 80100C10
s_addc_u32 s17, s17, 0 // 000000005448: 82118011
buffer_store_dwordx4 v[40:43], v11, s[16:19], 0 offen nt // 00000000544C: E07E1000 8004280B
v_cvt_pk_bf16_f32 v48, v48, v49 // 000000005454: D2680030 00026330
v_cvt_pk_bf16_f32 v49, v50, v51 // 00000000545C: D2680031 00026732
v_cvt_pk_bf16_f32 v50, v52, v53 // 000000005464: D2680032 00026B34
v_cvt_pk_bf16_f32 v51, v54, v55 // 00000000546C: D2680033 00026F36
s_lshl_b32 s12, s36, 1 // 000000005474: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005478: 80100C10
s_addc_u32 s17, s17, 0 // 00000000547C: 82118011
buffer_store_dwordx4 v[48:51], v11, s[16:19], 0 offen nt // 000000005480: E07E1000 8004300B
v_cvt_pk_bf16_f32 v56, v56, v57 // 000000005488: D2680038 00027338
v_cvt_pk_bf16_f32 v57, v58, v59 // 000000005490: D2680039 0002773A
v_cvt_pk_bf16_f32 v58, v60, v61 // 000000005498: D268003A 00027B3C
v_cvt_pk_bf16_f32 v59, v62, v63 // 0000000054A0: D268003B 00027F3E
s_lshl_b32 s12, s36, 1 // 0000000054A8: 8E0C8124
s_add_u32 s16, s16, s12 // 0000000054AC: 80100C10
s_addc_u32 s17, s17, 0 // 0000000054B0: 82118011
buffer_store_dwordx4 v[56:59], v11, s[16:19], 0 offen nt // 0000000054B4: E07E1000 8004380B
v_cvt_pk_bf16_f32 v64, v64, v65 // 0000000054BC: D2680040 00028340
v_cvt_pk_bf16_f32 v65, v66, v67 // 0000000054C4: D2680041 00028742
v_cvt_pk_bf16_f32 v66, v68, v69 // 0000000054CC: D2680042 00028B44
v_cvt_pk_bf16_f32 v67, v70, v71 // 0000000054D4: D2680043 00028F46
s_lshl_b32 s12, s36, 1 // 0000000054DC: 8E0C8124
s_add_u32 s16, s16, s12 // 0000000054E0: 80100C10
s_addc_u32 s17, s17, 0 // 0000000054E4: 82118011
buffer_store_dwordx4 v[64:67], v11, s[16:19], 0 offen nt // 0000000054E8: E07E1000 8004400B
v_cvt_pk_bf16_f32 v72, v72, v73 // 0000000054F0: D2680048 00029348
v_cvt_pk_bf16_f32 v73, v74, v75 // 0000000054F8: D2680049 0002974A
v_cvt_pk_bf16_f32 v74, v76, v77 // 000000005500: D268004A 00029B4C
v_cvt_pk_bf16_f32 v75, v78, v79 // 000000005508: D268004B 00029F4E
s_lshl_b32 s12, s36, 1 // 000000005510: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005514: 80100C10
s_addc_u32 s17, s17, 0 // 000000005518: 82118011
buffer_store_dwordx4 v[72:75], v11, s[16:19], 0 offen nt // 00000000551C: E07E1000 8004480B
v_cvt_pk_bf16_f32 v80, v80, v81 // 000000005524: D2680050 0002A350
v_cvt_pk_bf16_f32 v81, v82, v83 // 00000000552C: D2680051 0002A752
v_cvt_pk_bf16_f32 v82, v84, v85 // 000000005534: D2680052 0002AB54
v_cvt_pk_bf16_f32 v83, v86, v87 // 00000000553C: D2680053 0002AF56
s_lshl_b32 s12, s36, 1 // 000000005544: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005548: 80100C10
s_addc_u32 s17, s17, 0 // 00000000554C: 82118011
buffer_store_dwordx4 v[80:83], v11, s[16:19], 0 offen nt // 000000005550: E07E1000 8004500B
v_cvt_pk_bf16_f32 v88, v88, v89 // 000000005558: D2680058 0002B358
v_cvt_pk_bf16_f32 v89, v90, v91 // 000000005560: D2680059 0002B75A
v_cvt_pk_bf16_f32 v90, v92, v93 // 000000005568: D268005A 0002BB5C
v_cvt_pk_bf16_f32 v91, v94, v95 // 000000005570: D268005B 0002BF5E
s_lshl_b32 s12, s36, 1 // 000000005578: 8E0C8124
s_add_u32 s16, s16, s12 // 00000000557C: 80100C10
s_addc_u32 s17, s17, 0 // 000000005580: 82118011
buffer_store_dwordx4 v[88:91], v11, s[16:19], 0 offen nt // 000000005584: E07E1000 8004580B
v_cvt_pk_bf16_f32 v96, v96, v97 // 00000000558C: D2680060 0002C360
v_cvt_pk_bf16_f32 v97, v98, v99 // 000000005594: D2680061 0002C762
v_cvt_pk_bf16_f32 v98, v100, v101 // 00000000559C: D2680062 0002CB64
v_cvt_pk_bf16_f32 v99, v102, v103 // 0000000055A4: D2680063 0002CF66
s_lshl_b32 s12, s36, 1 // 0000000055AC: 8E0C8124
s_add_u32 s16, s16, s12 // 0000000055B0: 80100C10
s_addc_u32 s17, s17, 0 // 0000000055B4: 82118011
buffer_store_dwordx4 v[96:99], v11, s[16:19], 0 offen nt // 0000000055B8: E07E1000 8004600B
v_cvt_pk_bf16_f32 v104, v104, v105 // 0000000055C0: D2680068 0002D368
v_cvt_pk_bf16_f32 v105, v106, v107 // 0000000055C8: D2680069 0002D76A
v_cvt_pk_bf16_f32 v106, v108, v109 // 0000000055D0: D268006A 0002DB6C
v_cvt_pk_bf16_f32 v107, v110, v111 // 0000000055D8: D268006B 0002DF6E
s_lshl_b32 s12, s36, 1 // 0000000055E0: 8E0C8124
s_add_u32 s16, s16, s12 // 0000000055E4: 80100C10
s_addc_u32 s17, s17, 0 // 0000000055E8: 82118011
buffer_store_dwordx4 v[104:107], v11, s[16:19], 0 offen nt // 0000000055EC: E07E1000 8004680B
v_cvt_pk_bf16_f32 v112, v112, v113 // 0000000055F4: D2680070 0002E370
v_cvt_pk_bf16_f32 v113, v114, v115 // 0000000055FC: D2680071 0002E772
v_cvt_pk_bf16_f32 v114, v116, v117 // 000000005604: D2680072 0002EB74
v_cvt_pk_bf16_f32 v115, v118, v119 // 00000000560C: D2680073 0002EF76
s_lshl_b32 s12, s36, 1 // 000000005614: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005618: 80100C10
s_addc_u32 s17, s17, 0 // 00000000561C: 82118011
buffer_store_dwordx4 v[112:115], v11, s[16:19], 0 offen nt // 000000005620: E07E1000 8004700B
v_cvt_pk_bf16_f32 v120, v120, v121 // 000000005628: D2680078 0002F378
v_cvt_pk_bf16_f32 v121, v122, v123 // 000000005630: D2680079 0002F77A
v_cvt_pk_bf16_f32 v122, v124, v125 // 000000005638: D268007A 0002FB7C
v_cvt_pk_bf16_f32 v123, v126, v127 // 000000005640: D268007B 0002FF7E
s_lshl_b32 s12, s36, 1 // 000000005648: 8E0C8124
s_add_u32 s16, s16, s12 // 00000000564C: 80100C10
s_addc_u32 s17, s17, 0 // 000000005650: 82118011
buffer_store_dwordx4 v[120:123], v11, s[16:19], 0 offen nt // 000000005654: E07E1000 8004780B
v_cvt_pk_bf16_f32 v136, v136, v137 // 00000000565C: D2680088 00031388
v_cvt_pk_bf16_f32 v137, v138, v139 // 000000005664: D2680089 0003178A
v_cvt_pk_bf16_f32 v138, v140, v141 // 00000000566C: D268008A 00031B8C
v_cvt_pk_bf16_f32 v139, v142, v143 // 000000005674: D268008B 00031F8E
s_lshl_b32 s12, s36, 1 // 00000000567C: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005680: 80100C10
s_addc_u32 s17, s17, 0 // 000000005684: 82118011
buffer_store_dwordx4 v[136:139], v11, s[16:19], 0 offen nt // 000000005688: E07E1000 8004880B
v_cvt_pk_bf16_f32 v144, v144, v145 // 000000005690: D2680090 00032390
v_cvt_pk_bf16_f32 v145, v146, v147 // 000000005698: D2680091 00032792
v_cvt_pk_bf16_f32 v146, v148, v149 // 0000000056A0: D2680092 00032B94
v_cvt_pk_bf16_f32 v147, v150, v151 // 0000000056A8: D2680093 00032F96
s_lshl_b32 s12, s36, 1 // 0000000056B0: 8E0C8124
s_add_u32 s16, s16, s12 // 0000000056B4: 80100C10
s_addc_u32 s17, s17, 0 // 0000000056B8: 82118011
buffer_store_dwordx4 v[144:147], v11, s[16:19], 0 offen nt // 0000000056BC: E07E1000 8004900B
v_cvt_pk_bf16_f32 v152, v152, v153 // 0000000056C4: D2680098 00033398
v_cvt_pk_bf16_f32 v153, v154, v155 // 0000000056CC: D2680099 0003379A
v_cvt_pk_bf16_f32 v154, v156, v157 // 0000000056D4: D268009A 00033B9C
v_cvt_pk_bf16_f32 v155, v158, v159 // 0000000056DC: D268009B 00033F9E
s_lshl_b32 s12, s36, 1 // 0000000056E4: 8E0C8124
s_add_u32 s16, s16, s12 // 0000000056E8: 80100C10
s_addc_u32 s17, s17, 0 // 0000000056EC: 82118011
buffer_store_dwordx4 v[152:155], v11, s[16:19], 0 offen nt // 0000000056F0: E07E1000 8004980B
v_cvt_pk_bf16_f32 v160, v160, v161 // 0000000056F8: D26800A0 000343A0
v_cvt_pk_bf16_f32 v161, v162, v163 // 000000005700: D26800A1 000347A2
v_cvt_pk_bf16_f32 v162, v164, v165 // 000000005708: D26800A2 00034BA4
v_cvt_pk_bf16_f32 v163, v166, v167 // 000000005710: D26800A3 00034FA6
s_lshl_b32 s12, s36, 1 // 000000005718: 8E0C8124
s_add_u32 s16, s16, s12 // 00000000571C: 80100C10
s_addc_u32 s17, s17, 0 // 000000005720: 82118011
buffer_store_dwordx4 v[160:163], v11, s[16:19], 0 offen nt // 000000005724: E07E1000 8004A00B
v_cvt_pk_bf16_f32 v168, v168, v169 // 00000000572C: D26800A8 000353A8
v_cvt_pk_bf16_f32 v169, v170, v171 // 000000005734: D26800A9 000357AA
v_cvt_pk_bf16_f32 v170, v172, v173 // 00000000573C: D26800AA 00035BAC
v_cvt_pk_bf16_f32 v171, v174, v175 // 000000005744: D26800AB 00035FAE
s_lshl_b32 s12, s36, 1 // 00000000574C: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005750: 80100C10
s_addc_u32 s17, s17, 0 // 000000005754: 82118011
buffer_store_dwordx4 v[168:171], v11, s[16:19], 0 offen nt // 000000005758: E07E1000 8004A80B
v_cvt_pk_bf16_f32 v176, v176, v177 // 000000005760: D26800B0 000363B0
v_cvt_pk_bf16_f32 v177, v178, v179 // 000000005768: D26800B1 000367B2
v_cvt_pk_bf16_f32 v178, v180, v181 // 000000005770: D26800B2 00036BB4
v_cvt_pk_bf16_f32 v179, v182, v183 // 000000005778: D26800B3 00036FB6
s_lshl_b32 s12, s36, 1 // 000000005780: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005784: 80100C10
s_addc_u32 s17, s17, 0 // 000000005788: 82118011
buffer_store_dwordx4 v[176:179], v11, s[16:19], 0 offen nt // 00000000578C: E07E1000 8004B00B
v_cvt_pk_bf16_f32 v184, v184, v185 // 000000005794: D26800B8 000373B8
v_cvt_pk_bf16_f32 v185, v186, v187 // 00000000579C: D26800B9 000377BA
v_cvt_pk_bf16_f32 v186, v188, v189 // 0000000057A4: D26800BA 00037BBC
v_cvt_pk_bf16_f32 v187, v190, v191 // 0000000057AC: D26800BB 00037FBE
s_lshl_b32 s12, s36, 1 // 0000000057B4: 8E0C8124
s_add_u32 s16, s16, s12 // 0000000057B8: 80100C10
s_addc_u32 s17, s17, 0 // 0000000057BC: 82118011
buffer_store_dwordx4 v[184:187], v11, s[16:19], 0 offen nt // 0000000057C0: E07E1000 8004B80B
v_cvt_pk_bf16_f32 v192, v192, v193 // 0000000057C8: D26800C0 000383C0
v_cvt_pk_bf16_f32 v193, v194, v195 // 0000000057D0: D26800C1 000387C2
v_cvt_pk_bf16_f32 v194, v196, v197 // 0000000057D8: D26800C2 00038BC4
v_cvt_pk_bf16_f32 v195, v198, v199 // 0000000057E0: D26800C3 00038FC6
s_lshl_b32 s12, s36, 1 // 0000000057E8: 8E0C8124
s_add_u32 s16, s16, s12 // 0000000057EC: 80100C10
s_addc_u32 s17, s17, 0 // 0000000057F0: 82118011
buffer_store_dwordx4 v[192:195], v11, s[16:19], 0 offen nt // 0000000057F4: E07E1000 8004C00B
v_cvt_pk_bf16_f32 v200, v200, v201 // 0000000057FC: D26800C8 000393C8
v_cvt_pk_bf16_f32 v201, v202, v203 // 000000005804: D26800C9 000397CA
v_cvt_pk_bf16_f32 v202, v204, v205 // 00000000580C: D26800CA 00039BCC
v_cvt_pk_bf16_f32 v203, v206, v207 // 000000005814: D26800CB 00039FCE
s_lshl_b32 s12, s36, 1 // 00000000581C: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005820: 80100C10
s_addc_u32 s17, s17, 0 // 000000005824: 82118011
buffer_store_dwordx4 v[200:203], v11, s[16:19], 0 offen nt // 000000005828: E07E1000 8004C80B
v_cvt_pk_bf16_f32 v208, v208, v209 // 000000005830: D26800D0 0003A3D0
v_cvt_pk_bf16_f32 v209, v210, v211 // 000000005838: D26800D1 0003A7D2
v_cvt_pk_bf16_f32 v210, v212, v213 // 000000005840: D26800D2 0003ABD4
v_cvt_pk_bf16_f32 v211, v214, v215 // 000000005848: D26800D3 0003AFD6
s_lshl_b32 s12, s36, 1 // 000000005850: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005854: 80100C10
s_addc_u32 s17, s17, 0 // 000000005858: 82118011
buffer_store_dwordx4 v[208:211], v11, s[16:19], 0 offen nt // 00000000585C: E07E1000 8004D00B
v_cvt_pk_bf16_f32 v216, v216, v217 // 000000005864: D26800D8 0003B3D8
v_cvt_pk_bf16_f32 v217, v218, v219 // 00000000586C: D26800D9 0003B7DA
v_cvt_pk_bf16_f32 v218, v220, v221 // 000000005874: D26800DA 0003BBDC
v_cvt_pk_bf16_f32 v219, v222, v223 // 00000000587C: D26800DB 0003BFDE
s_lshl_b32 s12, s36, 1 // 000000005884: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005888: 80100C10
s_addc_u32 s17, s17, 0 // 00000000588C: 82118011
buffer_store_dwordx4 v[216:219], v11, s[16:19], 0 offen nt // 000000005890: E07E1000 8004D80B
v_cvt_pk_bf16_f32 v224, v224, v225 // 000000005898: D26800E0 0003C3E0
v_cvt_pk_bf16_f32 v225, v226, v227 // 0000000058A0: D26800E1 0003C7E2
v_cvt_pk_bf16_f32 v226, v228, v229 // 0000000058A8: D26800E2 0003CBE4
v_cvt_pk_bf16_f32 v227, v230, v231 // 0000000058B0: D26800E3 0003CFE6
s_lshl_b32 s12, s36, 1 // 0000000058B8: 8E0C8124
s_add_u32 s16, s16, s12 // 0000000058BC: 80100C10
s_addc_u32 s17, s17, 0 // 0000000058C0: 82118011
buffer_store_dwordx4 v[224:227], v11, s[16:19], 0 offen nt // 0000000058C4: E07E1000 8004E00B
v_cvt_pk_bf16_f32 v232, v232, v233 // 0000000058CC: D26800E8 0003D3E8
v_cvt_pk_bf16_f32 v233, v234, v235 // 0000000058D4: D26800E9 0003D7EA
v_cvt_pk_bf16_f32 v234, v236, v237 // 0000000058DC: D26800EA 0003DBEC
v_cvt_pk_bf16_f32 v235, v238, v239 // 0000000058E4: D26800EB 0003DFEE
s_lshl_b32 s12, s36, 1 // 0000000058EC: 8E0C8124
s_add_u32 s16, s16, s12 // 0000000058F0: 80100C10
s_addc_u32 s17, s17, 0 // 0000000058F4: 82118011
buffer_store_dwordx4 v[232:235], v11, s[16:19], 0 offen nt // 0000000058F8: E07E1000 8004E80B
v_cvt_pk_bf16_f32 v240, v240, v241 // 000000005900: D26800F0 0003E3F0
v_cvt_pk_bf16_f32 v241, v242, v243 // 000000005908: D26800F1 0003E7F2
v_cvt_pk_bf16_f32 v242, v244, v245 // 000000005910: D26800F2 0003EBF4
v_cvt_pk_bf16_f32 v243, v246, v247 // 000000005918: D26800F3 0003EFF6
s_lshl_b32 s12, s36, 1 // 000000005920: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005924: 80100C10
s_addc_u32 s17, s17, 0 // 000000005928: 82118011
buffer_store_dwordx4 v[240:243], v11, s[16:19], 0 offen nt // 00000000592C: E07E1000 8004F00B
s_nop 0 // 000000005934: BF800000
v_accvgpr_read_b32 v16, a131 // 000000005938: D3D84010 18000183
v_accvgpr_read_b32 v17, a135 // 000000005940: D3D84011 18000187
v_accvgpr_read_b32 v18, a139 // 000000005948: D3D84012 1800018B
v_accvgpr_read_b32 v19, a143 // 000000005950: D3D84013 1800018F
v_accvgpr_read_b32 v20, a147 // 000000005958: D3D84014 18000193
v_accvgpr_read_b32 v21, a151 // 000000005960: D3D84015 18000197
v_accvgpr_read_b32 v22, a155 // 000000005968: D3D84016 1800019B
v_accvgpr_read_b32 v23, a159 // 000000005970: D3D84017 1800019F
v_accvgpr_read_b32 v24, a163 // 000000005978: D3D84018 180001A3
v_accvgpr_read_b32 v25, a167 // 000000005980: D3D84019 180001A7
v_accvgpr_read_b32 v26, a171 // 000000005988: D3D8401A 180001AB
v_accvgpr_read_b32 v27, a175 // 000000005990: D3D8401B 180001AF
v_accvgpr_read_b32 v28, a179 // 000000005998: D3D8401C 180001B3
v_accvgpr_read_b32 v29, a183 // 0000000059A0: D3D8401D 180001B7
v_accvgpr_read_b32 v30, a187 // 0000000059A8: D3D8401E 180001BB
v_accvgpr_read_b32 v31, a191 // 0000000059B0: D3D8401F 180001BF
v_accvgpr_read_b32 v32, a195 // 0000000059B8: D3D84020 180001C3
v_accvgpr_read_b32 v33, a199 // 0000000059C0: D3D84021 180001C7
v_accvgpr_read_b32 v34, a203 // 0000000059C8: D3D84022 180001CB
v_accvgpr_read_b32 v35, a207 // 0000000059D0: D3D84023 180001CF
v_accvgpr_read_b32 v36, a211 // 0000000059D8: D3D84024 180001D3
v_accvgpr_read_b32 v37, a215 // 0000000059E0: D3D84025 180001D7
v_accvgpr_read_b32 v38, a219 // 0000000059E8: D3D84026 180001DB
v_accvgpr_read_b32 v39, a223 // 0000000059F0: D3D84027 180001DF
v_accvgpr_read_b32 v40, a227 // 0000000059F8: D3D84028 180001E3
v_accvgpr_read_b32 v41, a231 // 000000005A00: D3D84029 180001E7
v_accvgpr_read_b32 v42, a235 // 000000005A08: D3D8402A 180001EB
v_accvgpr_read_b32 v43, a239 // 000000005A10: D3D8402B 180001EF
v_accvgpr_read_b32 v44, a243 // 000000005A18: D3D8402C 180001F3
v_accvgpr_read_b32 v45, a247 // 000000005A20: D3D8402D 180001F7
v_accvgpr_read_b32 v46, a251 // 000000005A28: D3D8402E 180001FB
v_accvgpr_read_b32 v47, a255 // 000000005A30: D3D8402F 180001FF
v_mov_b32_e32 v8, 0xffff0000 // 000000005A38: 7E1002FF FFFF0000
v_mov_b32_e32 v9, 0x7fff0000 // 000000005A40: 7E1202FF 7FFF0000
v_mov_b32_e32 v10, 0x7fff // 000000005A48: 7E1402FF 00007FFF
v_cvt_pk_bf16_f32 v16, v16, v17 // 000000005A50: D2680010 00022310
v_cvt_pk_bf16_f32 v17, v18, v19 // 000000005A58: D2680011 00022712
v_cvt_pk_bf16_f32 v18, v20, v21 // 000000005A60: D2680012 00022B14
v_cvt_pk_bf16_f32 v19, v22, v23 // 000000005A68: D2680013 00022F16
s_lshl_b32 s12, s36, 1 // 000000005A70: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005A74: 80100C10
s_addc_u32 s17, s17, 0 // 000000005A78: 82118011
buffer_store_dwordx4 v[16:19], v11, s[16:19], 0 offen nt // 000000005A7C: E07E1000 8004100B
v_cvt_pk_bf16_f32 v24, v24, v25 // 000000005A84: D2680018 00023318
v_cvt_pk_bf16_f32 v25, v26, v27 // 000000005A8C: D2680019 0002371A
v_cvt_pk_bf16_f32 v26, v28, v29 // 000000005A94: D268001A 00023B1C
v_cvt_pk_bf16_f32 v27, v30, v31 // 000000005A9C: D268001B 00023F1E
s_lshl_b32 s12, s36, 1 // 000000005AA4: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005AA8: 80100C10
s_addc_u32 s17, s17, 0 // 000000005AAC: 82118011
buffer_store_dwordx4 v[24:27], v11, s[16:19], 0 offen nt // 000000005AB0: E07E1000 8004180B
v_cvt_pk_bf16_f32 v32, v32, v33 // 000000005AB8: D2680020 00024320
v_cvt_pk_bf16_f32 v33, v34, v35 // 000000005AC0: D2680021 00024722
v_cvt_pk_bf16_f32 v34, v36, v37 // 000000005AC8: D2680022 00024B24
v_cvt_pk_bf16_f32 v35, v38, v39 // 000000005AD0: D2680023 00024F26
s_lshl_b32 s12, s36, 1 // 000000005AD8: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005ADC: 80100C10
s_addc_u32 s17, s17, 0 // 000000005AE0: 82118011
buffer_store_dwordx4 v[32:35], v11, s[16:19], 0 offen nt // 000000005AE4: E07E1000 8004200B
v_cvt_pk_bf16_f32 v40, v40, v41 // 000000005AEC: D2680028 00025328
v_cvt_pk_bf16_f32 v41, v42, v43 // 000000005AF4: D2680029 0002572A
v_cvt_pk_bf16_f32 v42, v44, v45 // 000000005AFC: D268002A 00025B2C
v_cvt_pk_bf16_f32 v43, v46, v47 // 000000005B04: D268002B 00025F2E
s_lshl_b32 s12, s36, 1 // 000000005B0C: 8E0C8124
s_add_u32 s16, s16, s12 // 000000005B10: 80100C10
s_addc_u32 s17, s17, 0 // 000000005B14: 82118011
buffer_store_dwordx4 v[40:43], v11, s[16:19], 0 offen nt // 000000005B18: E07E1000 8004280B
s_nop 0 // 000000005B20: BF800000
// Kernel exit point: the store sequence above falls through to this label, and
// s_endpgm terminates the wavefront.
// NOTE(review): the address in the trailing annotation (0x1F5D0) does not follow the
// preceding instruction (s_nop at 0x5B20); it appears to be carried over from the
// original disassembly and is informational only.
end:
s_endpgm // 00000001F5D0: BF810000
// Kernel descriptor for `gemm`, emitted into .rodata and aligned to 64 bytes
// (.p2align 6). The values below must stay in sync with the register and memory
// usage of the kernel body.
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel gemm
# ---- basic memory requirements ----
# 133120 bytes of group segment (= 0x20800, the same value the prologue writes to m0).
.amdhsa_group_segment_fixed_size 133120
.amdhsa_private_segment_fixed_size 0
# 32 bytes of kernel arguments: three 8-byte pointers followed by two 4-byte scalars.
.amdhsa_kernarg_size 32
# ---- register usage (RSRC1) ----
# 504 = 248 (the accum offset set below) + 256 accumulation registers; the store
# sequence reads up to a255 and uses up to v247.
.amdhsa_next_free_vgpr 504
.amdhsa_next_free_sgpr 96
# ---- workgroup / workitem IDs (RSRC2) ----
# All three workgroup-id system SGPRs are requested.
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 1
.amdhsa_system_sgpr_workgroup_id_z 1
# ---- user SGPR enables (descriptor bits >448) ----
# Only the kernarg segment pointer is enabled (2 user SGPRs); the prologue reads the
# kernel arguments through s[0:1]. Kernarg preloading is disabled (length 0).
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_count 2
.amdhsa_user_sgpr_kernarg_preload_length 0
.amdhsa_user_sgpr_kernarg_preload_offset 0
# ---- gfx90a / gfx940 specific (RSRC3) ----
# The accum offset of 248 matches the highest VGPR used by the store sequence (v247) + 1.
.amdhsa_accum_offset 248
.amdhsa_uses_dynamic_stack 0
.amdhsa_tg_split 0
.end_amdhsa_kernel
// Code-object metadata (YAML) describing the `gemm` kernel.
//
// The indentation inside the YAML document is significant: `.args` is a list owned
// by the single `amdhsa.kernels` entry, every argument is a map inside that list, and
// the per-kernel properties (.group_segment_fixed_size ... .wavefront_size) are keys
// of that same kernel entry. With all lines at column 0 the document does not
// describe that structure, so the nesting is written out explicitly here. Keys and
// values are unchanged.
//
// Kernarg layout (32 bytes): three 8-byte global pointers at offsets 0, 8 and 16,
// then two 4-byte by-value u32 scalars at offsets 24 (sz) and 28 (num_wg). These are
// the offsets the kernel prologue loads from (0x0, 0x8..0x17, 0x18, 0x1C).
//
// NOTE(review): the prologue comments label the pointer at offset 0x8 as A and the
// one at 0x10 as B, while the argument names below are B (offset 8) and A (offset 16).
// The names are left as found -- confirm which labelling is intended.
// NOTE(review): .vgpr_count (248) equals .amdhsa_accum_offset and .sgpr_count (88)
// is lower than .amdhsa_next_free_sgpr (96) in the kernel descriptor; values are left
// as found -- confirm whether they should match the descriptor.
.amdgpu_metadata
---
amdhsa.kernels:
  - .args:
      - .address_space: global
        .name: C
        .offset: 0
        .size: 8
        .value_kind: global_buffer
        .value_type: bf16
      - .address_space: global
        .name: B
        .offset: 8
        .size: 8
        .value_kind: global_buffer
        .value_type: bf16
      - .address_space: global
        .name: A
        .offset: 16
        .size: 8
        .value_kind: global_buffer
        .value_type: bf16
      - .name: sz
        .offset: 24
        .size: 4
        .value_kind: by_value
        .value_type: u32
      - .name: num_wg
        .offset: 28
        .size: 4
        .value_kind: by_value
        .value_type: u32
    .group_segment_fixed_size: 133120
    .kernarg_segment_align: 8
    .kernarg_segment_size: 32
    .max_flat_workgroup_size: 256
    .name: gemm
    .private_segment_fixed_size: 0
    .sgpr_count: 88
    .sgpr_spill_count: 0
    .symbol: gemm.kd
    .vgpr_count: 248
    .vgpr_spill_count: 0
    .wavefront_size: 64
amdhsa.version:
  - 1
  - 0
...
.end_amdgpu_metadata