mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-14 17:38:06 -05:00
* Add mmapeak implementation for 7900 XTX * Change indentation * Use a template instead of multiple assembly files * Fix output formatting * Reduce register file bank conflicts * More accurate measurement for quick instructions * Add support for gfx1201 * RDNA4 wmma requires less VGPRs * RDNA4 does not have s_cmpk instructions * Add v_wmma_i32_16x16x32_iu4 for gfx1201 * Add sparse wmma instructions --------- Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
41 lines
818 B
ArmAsm
41 lines
818 B
ArmAsm
; -----------------------------------------------------------------------
; matmul -- counted-loop kernel template.
;
; Syntax:  AMDGPU assembly for the LLVM assembler; this file configures
;          the kernel for wave32.
; Inputs:  none (this file's metadata declares a 0-byte kernarg segment).
; Effect:  runs the payload at the placeholder line 1000000 times, then
;          ends the wave -- provided the payload leaves s1 and s2 alone
;          and does not branch out of the loop.
; Uses:    s1 = iterations remaining, s2 = constant 0, SCC,
;          plus whatever the substituted payload touches.
;
; NOTE(review): the upper-case placeholder token inside the loop is
; presumably filled in by textual substitution from a host script, so it
; should not be repeated anywhere else in this file, comments included --
; confirm against the code that consumes this template.
; -----------------------------------------------------------------------
.text
.globl matmul
.p2align 8                      ; 2^8 = 256-byte alignment for the entry point
.type matmul,@function
matmul:
  s_mov_b32 s1, 1000000         ; s1 = iterations remaining
  s_mov_b32 s2, 0               ; s2 = 0, comparand for the loop-exit test
inner_loop:
  INSTRUCTION
  s_sub_u32 s1, s1, 1           ; one pass done
  s_cmp_lg_i32 s1, s2           ; SCC = (s1 != s2), i.e. iterations remain
  s_cbranch_scc1 inner_loop     ; loop again while SCC is set
  s_endpgm

; -----------------------------------------------------------------------
; Kernel descriptor for matmul, placed in read-only data.
;
; The two next_free_* fields are not hand-written numbers: they are taken
; from the assembler-maintained .amdgcn.next_free_vgpr / .amdgcn.next_free_sgpr
; symbols, so they follow the registers referenced by the kernel text.
; wavefront_size32 = 1 selects wave32 execution, matching the
; ".wavefront_size: 32" entry in this file's metadata.
; -----------------------------------------------------------------------
.rodata
.p2align 6                      ; 2^6 = 64-byte alignment for the descriptor
.amdhsa_kernel matmul
  .amdhsa_next_free_vgpr .amdgcn.next_free_vgpr
  .amdhsa_next_free_sgpr .amdgcn.next_free_sgpr
  .amdhsa_wavefront_size32 1
.end_amdhsa_kernel

; -----------------------------------------------------------------------
; Code-object metadata (YAML between the two directives below).
;
; Declares one kernel, matmul, whose descriptor symbol is matmul.kd.
; It takes no arguments (0-byte kernarg segment, empty .args), uses no
; group or private memory, runs as wave32 and allows up to 1024 work-items
; per workgroup.
;
; The nesting is significant: every ".xxx:" key from .symbol to .args
; belongs to the single list entry that starts at "- .name: matmul".
;
; NOTE(review): .sgpr_count and .vgpr_count are fixed numbers here, while
; the kernel descriptor derives its register counts from assembler symbols
; -- confirm these values cover every payload substituted into the kernel.
;
; No comments are placed inside the YAML body on purpose; the text between
; the directives is kept exactly as data.
; -----------------------------------------------------------------------
.amdgpu_metadata
---
amdhsa.version:
  - 1
  - 0
amdhsa.kernels:
  - .name: matmul
    .symbol: matmul.kd
    .kernarg_segment_size: 0
    .group_segment_fixed_size: 0
    .private_segment_fixed_size: 0
    .kernarg_segment_align: 4
    .wavefront_size: 32
    .sgpr_count: 8
    .vgpr_count: 32
    .max_flat_workgroup_size: 1024
    .args:
...
.end_amdgpu_metadata