mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-06 21:53:53 -05:00
79 lines
1.8 KiB
ArmAsm
79 lines
1.8 KiB
ArmAsm
.text
|
|
.section .text.
|
|
.global gemm
|
|
.p2align 8
|
|
.type gemm,@function
|
|
|
|
gemm:
|
|
INSTRUCTIONS
|
|
|
|
.section .rodata,"a",@progbits
|
|
.p2align 6, 0x0
|
|
.amdhsa_kernel gemm
|
|
# basic memory requirements
|
|
.amdhsa_group_segment_fixed_size 133120
|
|
.amdhsa_private_segment_fixed_size 0
|
|
.amdhsa_kernarg_size 28
|
|
# register usage (RSRC1)
|
|
.amdhsa_next_free_vgpr 504
|
|
.amdhsa_next_free_sgpr 96
|
|
# workgroup / workitem IDs (RSRC2)
|
|
.amdhsa_system_sgpr_workgroup_id_x 1
|
|
.amdhsa_system_sgpr_workgroup_id_y 1
|
|
.amdhsa_system_sgpr_workgroup_id_z 1
|
|
# user SGPRs, we only specify the kernel args ptr in s[0:1]
|
|
.amdhsa_user_sgpr_kernarg_segment_ptr 1
|
|
.amdhsa_user_sgpr_count 2
|
|
.amdhsa_user_sgpr_kernarg_preload_length 0
|
|
.amdhsa_user_sgpr_kernarg_preload_offset 0
|
|
# gfx90a / gfx940 specifics (RSRC3)
|
|
.amdhsa_accum_offset 248
|
|
.amdhsa_uses_dynamic_stack 0
|
|
.amdhsa_tg_split 0
|
|
.end_amdhsa_kernel
|
|
|
|
.amdgpu_metadata
|
|
---
|
|
amdhsa.kernels:
|
|
- .name: gemm
|
|
.symbol: gemm.kd
|
|
.args:
|
|
- .name: C
|
|
.address_space: global
|
|
.offset: 0
|
|
.size: 8
|
|
.value_kind: global_buffer
|
|
.value_type: bf16
|
|
- .name: B
|
|
.address_space: global
|
|
.offset: 8
|
|
.size: 8
|
|
.value_kind: global_buffer
|
|
.value_type: bf16
|
|
- .name: A
|
|
.address_space: global
|
|
.offset: 16
|
|
.size: 8
|
|
.value_kind: global_buffer
|
|
.value_type: bf16
|
|
- .name: sz
|
|
.offset: 24
|
|
.size: 4
|
|
.value_kind: by_value
|
|
.value_type: u32
|
|
.group_segment_fixed_size: 133120
|
|
.private_segment_fixed_size: 0
|
|
.kernarg_segment_align: 8
|
|
.kernarg_segment_size: 28
|
|
.max_flat_workgroup_size: 256
|
|
.sgpr_count: 88
|
|
.sgpr_spill_count: 0
|
|
.vgpr_count: 248
|
|
.vgpr_spill_count: 0
|
|
.wavefront_size: 64
|
|
amdhsa.version:
|
|
- 1
|
|
- 0
|
|
...
|
|
.end_amdgpu_metadata
|