MI300X support (WIP) (#9585)

This commit is contained in:
uuuvn
2025-03-29 16:46:42 +05:00
committed by GitHub
parent 77f0d09ecf
commit 5908b89f71
16 changed files with 313047 additions and 65 deletions

View File

@@ -308,12 +308,23 @@ generate_am() {
sed -i "s\(int64_t)\ \g" $BASE/am/am.py
sed -i "s\AMDGPU_PTE_MTYPE_VG10(2)\AMDGPU_PTE_MTYPE_VG10(0, 2)\g" $BASE/am/am.py # incorrect parsing (TODO: remove when clang2py is gone).
clang2py -k cdefstum \
extra/hip_gpu_driver/kfd_pm4_headers_ai.h \
extra/hip_gpu_driver/soc15d.h \
-o $BASE/am/pm4_soc15.py
fixup $BASE/am/pm4_soc15.py
clang2py -k cdefstum \
extra/hip_gpu_driver/kfd_pm4_headers_ai.h \
extra/hip_gpu_driver/nvd.h \
-o $BASE/am/pm4_nv.py
fixup $BASE/am/pm4_nv.py
clang2py -k cdefstum \
extra/amdpci/headers/vega10_enum.h \
-o $BASE/am/vega10.py
fixup $BASE/am/vega10.py
clang2py -k cdefstum \
extra/amdpci/headers/navi10_enum.h \
-o $BASE/am/navi10.py
@@ -341,6 +352,13 @@ generate_am() {
-o $BASE/am/mp_11_0.py
fixup $BASE/am/mp_11_0.py
clang2py -k cdefstum \
extra/amdpci/headers/gc_9_4_3_offset.h \
extra/amdpci/headers/gc_9_4_3_sh_mask.h \
extra/amdpci/overlay/gc_9_4_3.h \
-o $BASE/am/gc_9_4_3.py
fixup $BASE/am/gc_9_4_3.py
clang2py -k cdefstum \
extra/amdpci/headers/gc_10_3_0_offset.h \
extra/amdpci/headers/gc_10_3_0_sh_mask.h \
@@ -359,6 +377,13 @@ generate_am() {
-o $BASE/am/gc_12_0_0.py
fixup $BASE/am/gc_12_0_0.py
clang2py -k cdefstum \
extra/hip_gpu_driver/sdma_registers.h \
extra/hip_gpu_driver/vega10_sdma_pkt_open.h \
--clang-args="-I/opt/rocm/include -x c++" \
-o $BASE/am/sdma_4_0_0.py
fixup $BASE/am/sdma_4_0_0.py
clang2py -k cdefstum \
extra/hip_gpu_driver/sdma_registers.h \
extra/hip_gpu_driver/navi10_sdma_pkt_open.h \
@@ -403,6 +428,12 @@ generate_am() {
-o $BASE/am/nbif_6_3_1.py
fixup $BASE/am/nbif_6_3_1.py
clang2py -k cdefstum \
extra/amdpci/headers/nbio_7_9_0_offset.h \
extra/amdpci/headers/nbio_7_9_0_sh_mask.h \
-o $BASE/am/nbio_7_9_0.py
fixup $BASE/am/nbio_7_9_0.py
clang2py -k cdefstum \
extra/amdpci/headers/osssys_6_0_0_offset.h \
extra/amdpci/headers/osssys_6_0_0_sh_mask.h \

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,7 @@
// Overlay header for GC 9.4.3 (MI300X): register not present in the public
// offset headers. Offset taken from the MQD (queue descriptor) struct layout.
#define regCOMPUTE_CURRENT_LOGIC_XCC_ID 0x0e25
#define regCOMPUTE_CURRENT_LOGIC_XCC_ID_BASE_IDX 0
// The real field is probably narrower than the full register, but a
// full-width mask is harmless for how this register is read.
#define COMPUTE_CURRENT_LOGIC_XCC_ID__CURRENT_LOGIC_XCC_ID__SHIFT 0x0
#define COMPUTE_CURRENT_LOGIC_XCC_ID__CURRENT_LOGIC_XCC_ID_MASK 0xFFFFFFFFL

View File

@@ -0,0 +1,444 @@
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef SOC15_H
#define SOC15_H
#define GFX9_NUM_GFX_RINGS 1
#define GFX9_NUM_COMPUTE_RINGS 8
/*
* PM4
*/
#define PACKET_TYPE0 0
#define PACKET_TYPE1 1
#define PACKET_TYPE2 2
#define PACKET_TYPE3 3
#define CP_PACKET_GET_TYPE(h) (((h) >> 30) & 3)
#define CP_PACKET_GET_COUNT(h) (((h) >> 16) & 0x3FFF)
#define CP_PACKET0_GET_REG(h) ((h) & 0xFFFF)
#define CP_PACKET3_GET_OPCODE(h) (((h) >> 8) & 0xFF)
#define PACKET0(reg, n) ((PACKET_TYPE0 << 30) | \
((reg) & 0xFFFF) | \
((n) & 0x3FFF) << 16)
#define CP_PACKET2 0x80000000
#define PACKET2_PAD_SHIFT 0
#define PACKET2_PAD_MASK (0x3fffffff << 0)
#define PACKET2(v) (CP_PACKET2 | REG_SET(PACKET2_PAD, (v)))
#define PACKET3(op, n) ((PACKET_TYPE3 << 30) | \
(((op) & 0xFF) << 8) | \
((n) & 0x3FFF) << 16)
#define PACKET3_COMPUTE(op, n) (PACKET3(op, n) | 1 << 1)
#define PACKETJ_CONDITION_CHECK0 0
#define PACKETJ_CONDITION_CHECK1 1
#define PACKETJ_CONDITION_CHECK2 2
#define PACKETJ_CONDITION_CHECK3 3
#define PACKETJ_CONDITION_CHECK4 4
#define PACKETJ_CONDITION_CHECK5 5
#define PACKETJ_CONDITION_CHECK6 6
#define PACKETJ_CONDITION_CHECK7 7
#define PACKETJ_TYPE0 0
#define PACKETJ_TYPE1 1
#define PACKETJ_TYPE2 2
#define PACKETJ_TYPE3 3
#define PACKETJ_TYPE4 4
#define PACKETJ_TYPE5 5
#define PACKETJ_TYPE6 6
#define PACKETJ_TYPE7 7
#define PACKETJ(reg, r, cond, type) ((reg & 0x3FFFF) | \
((r & 0x3F) << 18) | \
((cond & 0xF) << 24) | \
((type & 0xF) << 28))
#define CP_PACKETJ_NOP 0x60000000
#define CP_PACKETJ_GET_REG(x) ((x) & 0x3FFFF)
#define CP_PACKETJ_GET_RES(x) (((x) >> 18) & 0x3F)
#define CP_PACKETJ_GET_COND(x) (((x) >> 24) & 0xF)
#define CP_PACKETJ_GET_TYPE(x) (((x) >> 28) & 0xF)
/* Packet 3 types */
#define PACKET3_NOP 0x10
#define PACKET3_SET_BASE 0x11
#define PACKET3_BASE_INDEX(x) ((x) << 0)
#define CE_PARTITION_BASE 3
#define PACKET3_CLEAR_STATE 0x12
#define PACKET3_INDEX_BUFFER_SIZE 0x13
#define PACKET3_DISPATCH_DIRECT 0x15
#define PACKET3_DISPATCH_INDIRECT 0x16
#define PACKET3_ATOMIC_GDS 0x1D
#define PACKET3_ATOMIC_MEM 0x1E
#define PACKET3_OCCLUSION_QUERY 0x1F
#define PACKET3_SET_PREDICATION 0x20
#define PACKET3_REG_RMW 0x21
#define PACKET3_COND_EXEC 0x22
#define PACKET3_PRED_EXEC 0x23
#define PACKET3_DRAW_INDIRECT 0x24
#define PACKET3_DRAW_INDEX_INDIRECT 0x25
#define PACKET3_INDEX_BASE 0x26
#define PACKET3_DRAW_INDEX_2 0x27
#define PACKET3_CONTEXT_CONTROL 0x28
#define PACKET3_INDEX_TYPE 0x2A
#define PACKET3_DRAW_INDIRECT_MULTI 0x2C
#define PACKET3_DRAW_INDEX_AUTO 0x2D
#define PACKET3_NUM_INSTANCES 0x2F
#define PACKET3_DRAW_INDEX_MULTI_AUTO 0x30
#define PACKET3_INDIRECT_BUFFER_CONST 0x33
#define PACKET3_STRMOUT_BUFFER_UPDATE 0x34
#define PACKET3_DRAW_INDEX_OFFSET_2 0x35
#define PACKET3_DRAW_PREAMBLE 0x36
#define PACKET3_WRITE_DATA 0x37
#define WRITE_DATA_DST_SEL(x) ((x) << 8)
/* 0 - register
* 1 - memory (sync - via GRBM)
* 2 - gl2
* 3 - gds
* 4 - reserved
* 5 - memory (async - direct)
*/
#define WR_ONE_ADDR (1 << 16)
#define WR_CONFIRM (1 << 20)
#define WRITE_DATA_CACHE_POLICY(x) ((x) << 25)
/* 0 - LRU
* 1 - Stream
*/
#define WRITE_DATA_ENGINE_SEL(x) ((x) << 30)
/* 0 - me
* 1 - pfp
* 2 - ce
*/
#define PACKET3_DRAW_INDEX_INDIRECT_MULTI 0x38
#define PACKET3_MEM_SEMAPHORE 0x39
# define PACKET3_SEM_USE_MAILBOX (0x1 << 16)
# define PACKET3_SEM_SEL_SIGNAL_TYPE (0x1 << 20) /* 0 = increment, 1 = write 1 */
# define PACKET3_SEM_SEL_SIGNAL (0x6 << 29)
# define PACKET3_SEM_SEL_WAIT (0x7 << 29)
#define PACKET3_WAIT_REG_MEM 0x3C
#define WAIT_REG_MEM_FUNCTION(x) ((x) << 0)
/* 0 - always
* 1 - <
* 2 - <=
* 3 - ==
* 4 - !=
* 5 - >=
* 6 - >
*/
#define WAIT_REG_MEM_MEM_SPACE(x) ((x) << 4)
/* 0 - reg
* 1 - mem
*/
#define WAIT_REG_MEM_OPERATION(x) ((x) << 6)
/* 0 - wait_reg_mem
* 1 - wr_wait_wr_reg
*/
#define WAIT_REG_MEM_ENGINE(x) ((x) << 8)
/* 0 - me
* 1 - pfp
*/
#define PACKET3_INDIRECT_BUFFER 0x3F
#define INDIRECT_BUFFER_VALID (1 << 23)
#define INDIRECT_BUFFER_CACHE_POLICY(x) ((x) << 28)
/* 0 - LRU
* 1 - Stream
* 2 - Bypass
*/
#define INDIRECT_BUFFER_PRE_ENB(x) ((x) << 21)
#define INDIRECT_BUFFER_PRE_RESUME(x) ((x) << 30)
#define PACKET3_COPY_DATA 0x40
#define PACKET3_PFP_SYNC_ME 0x42
#define PACKET3_COND_WRITE 0x45
#define PACKET3_EVENT_WRITE 0x46
#define EVENT_TYPE(x) ((x) << 0)
#define EVENT_INDEX(x) ((x) << 8)
/* 0 - any non-TS event
* 1 - ZPASS_DONE, PIXEL_PIPE_STAT_*
* 2 - SAMPLE_PIPELINESTAT
* 3 - SAMPLE_STREAMOUTSTAT*
* 4 - *S_PARTIAL_FLUSH
*/
#define PACKET3_RELEASE_MEM 0x49
#define EVENT_TYPE(x) ((x) << 0)
#define EVENT_INDEX(x) ((x) << 8)
#define EOP_TCL1_VOL_ACTION_EN (1 << 12)
#define EOP_TC_VOL_ACTION_EN (1 << 13) /* L2 */
#define EOP_TC_WB_ACTION_EN (1 << 15) /* L2 */
#define EOP_TCL1_ACTION_EN (1 << 16)
#define EOP_TC_ACTION_EN (1 << 17) /* L2 */
#define EOP_TC_NC_ACTION_EN (1 << 19)
#define EOP_TC_MD_ACTION_EN (1 << 21) /* L2 metadata */
#define EOP_EXEC (1 << 28) /* For Trailing Fence */
#define DATA_SEL(x) ((x) << 29)
/* 0 - discard
* 1 - send low 32bit data
* 2 - send 64bit data
* 3 - send 64bit GPU counter value
* 4 - send 64bit sys counter value
*/
#define INT_SEL(x) ((x) << 24)
/* 0 - none
* 1 - interrupt only (DATA_SEL = 0)
* 2 - interrupt when data write is confirmed
*/
#define DST_SEL(x) ((x) << 16)
/* 0 - MC
* 1 - TC/L2
*/
#define PACKET3_PREAMBLE_CNTL 0x4A
# define PACKET3_PREAMBLE_BEGIN_CLEAR_STATE (2 << 28)
# define PACKET3_PREAMBLE_END_CLEAR_STATE (3 << 28)
#define PACKET3_DMA_DATA 0x50
/* 1. header
* 2. CONTROL
* 3. SRC_ADDR_LO or DATA [31:0]
* 4. SRC_ADDR_HI [31:0]
* 5. DST_ADDR_LO [31:0]
* 6. DST_ADDR_HI [7:0]
* 7. COMMAND [30:21] | BYTE_COUNT [20:0]
*/
/* CONTROL */
# define PACKET3_DMA_DATA_ENGINE(x) ((x) << 0)
/* 0 - ME
* 1 - PFP
*/
# define PACKET3_DMA_DATA_SRC_CACHE_POLICY(x) ((x) << 13)
/* 0 - LRU
* 1 - Stream
*/
# define PACKET3_DMA_DATA_DST_SEL(x) ((x) << 20)
/* 0 - DST_ADDR using DAS
* 1 - GDS
* 3 - DST_ADDR using L2
*/
# define PACKET3_DMA_DATA_DST_CACHE_POLICY(x) ((x) << 25)
/* 0 - LRU
* 1 - Stream
*/
# define PACKET3_DMA_DATA_SRC_SEL(x) ((x) << 29)
/* 0 - SRC_ADDR using SAS
* 1 - GDS
* 2 - DATA
* 3 - SRC_ADDR using L2
*/
# define PACKET3_DMA_DATA_CP_SYNC (1 << 31)
/* COMMAND */
# define PACKET3_DMA_DATA_CMD_SAS (1 << 26)
/* 0 - memory
* 1 - register
*/
# define PACKET3_DMA_DATA_CMD_DAS (1 << 27)
/* 0 - memory
* 1 - register
*/
# define PACKET3_DMA_DATA_CMD_SAIC (1 << 28)
# define PACKET3_DMA_DATA_CMD_DAIC (1 << 29)
# define PACKET3_DMA_DATA_CMD_RAW_WAIT (1 << 30)
#define PACKET3_ACQUIRE_MEM 0x58
/* 1. HEADER
* 2. COHER_CNTL [30:0]
* 2.1 ENGINE_SEL [31:31]
* 3. COHER_SIZE [31:0]
* 4. COHER_SIZE_HI [7:0]
* 5. COHER_BASE_LO [31:0]
* 6. COHER_BASE_HI [23:0]
* 7. POLL_INTERVAL [15:0]
*/
/* COHER_CNTL fields for CP_COHER_CNTL */
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_NC_ACTION_ENA(x) ((x) << 3)
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WC_ACTION_ENA(x) ((x) << 4)
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_INV_METADATA_ACTION_ENA(x) ((x) << 5)
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_VOL_ACTION_ENA(x) ((x) << 15)
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WB_ACTION_ENA(x) ((x) << 18)
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_ACTION_ENA(x) ((x) << 22)
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_ACTION_ENA(x) ((x) << 23)
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_CB_ACTION_ENA(x) ((x) << 25)
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_DB_ACTION_ENA(x) ((x) << 26)
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_ACTION_ENA(x) ((x) << 27)
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_VOL_ACTION_ENA(x) ((x) << 28)
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_ICACHE_ACTION_ENA(x) ((x) << 29)
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_WB_ACTION_ENA(x) ((x) << 30)
#define PACKET3_REWIND 0x59
#define PACKET3_LOAD_UCONFIG_REG 0x5E
#define PACKET3_LOAD_SH_REG 0x5F
#define PACKET3_LOAD_CONFIG_REG 0x60
#define PACKET3_LOAD_CONTEXT_REG 0x61
#define PACKET3_SET_CONFIG_REG 0x68
#define PACKET3_SET_CONFIG_REG_START 0x00002000
#define PACKET3_SET_CONFIG_REG_END 0x00002c00
#define PACKET3_SET_CONTEXT_REG 0x69
#define PACKET3_SET_CONTEXT_REG_START 0x0000a000
#define PACKET3_SET_CONTEXT_REG_END 0x0000a400
#define PACKET3_SET_CONTEXT_REG_INDIRECT 0x73
#define PACKET3_SET_SH_REG 0x76
#define PACKET3_SET_SH_REG_START 0x00002c00
#define PACKET3_SET_SH_REG_END 0x00003000
#define PACKET3_SET_SH_REG_OFFSET 0x77
#define PACKET3_SET_QUEUE_REG 0x78
#define PACKET3_SET_UCONFIG_REG 0x79
#define PACKET3_SET_UCONFIG_REG_START 0x0000c000
#define PACKET3_SET_UCONFIG_REG_END 0x0000c400
#define PACKET3_SET_UCONFIG_REG_INDEX_TYPE (2 << 28)
#define PACKET3_SCRATCH_RAM_WRITE 0x7D
#define PACKET3_SCRATCH_RAM_READ 0x7E
#define PACKET3_LOAD_CONST_RAM 0x80
#define PACKET3_WRITE_CONST_RAM 0x81
#define PACKET3_DUMP_CONST_RAM 0x83
#define PACKET3_INCREMENT_CE_COUNTER 0x84
#define PACKET3_INCREMENT_DE_COUNTER 0x85
#define PACKET3_WAIT_ON_CE_COUNTER 0x86
#define PACKET3_WAIT_ON_DE_COUNTER_DIFF 0x88
#define PACKET3_SWITCH_BUFFER 0x8B
#define PACKET3_FRAME_CONTROL 0x90
# define FRAME_TMZ (1 << 0)
# define FRAME_CMD(x) ((x) << 28)
/*
* x=0: tmz_begin
* x=1: tmz_end
*/
#define PACKET3_INVALIDATE_TLBS 0x98
# define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0)
# define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4)
# define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5)
# define PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x) ((x) << 29)
#define PACKET3_SET_RESOURCES 0xA0
/* 1. header
* 2. CONTROL
* 3. QUEUE_MASK_LO [31:0]
* 4. QUEUE_MASK_HI [31:0]
* 5. GWS_MASK_LO [31:0]
* 6. GWS_MASK_HI [31:0]
* 7. OAC_MASK [15:0]
* 8. GDS_HEAP_SIZE [16:11] | GDS_HEAP_BASE [5:0]
*/
# define PACKET3_SET_RESOURCES_VMID_MASK(x) ((x) << 0)
# define PACKET3_SET_RESOURCES_UNMAP_LATENTY(x) ((x) << 16)
# define PACKET3_SET_RESOURCES_QUEUE_TYPE(x) ((x) << 29)
#define PACKET3_MAP_QUEUES 0xA2
/* 1. header
* 2. CONTROL
* 3. CONTROL2
* 4. MQD_ADDR_LO [31:0]
* 5. MQD_ADDR_HI [31:0]
* 6. WPTR_ADDR_LO [31:0]
* 7. WPTR_ADDR_HI [31:0]
*/
/* CONTROL */
# define PACKET3_MAP_QUEUES_QUEUE_SEL(x) ((x) << 4)
# define PACKET3_MAP_QUEUES_VMID(x) ((x) << 8)
# define PACKET3_MAP_QUEUES_QUEUE(x) ((x) << 13)
# define PACKET3_MAP_QUEUES_PIPE(x) ((x) << 16)
# define PACKET3_MAP_QUEUES_ME(x) ((x) << 18)
# define PACKET3_MAP_QUEUES_QUEUE_TYPE(x) ((x) << 21)
# define PACKET3_MAP_QUEUES_ALLOC_FORMAT(x) ((x) << 24)
# define PACKET3_MAP_QUEUES_ENGINE_SEL(x) ((x) << 26)
# define PACKET3_MAP_QUEUES_NUM_QUEUES(x) ((x) << 29)
/* CONTROL2 */
# define PACKET3_MAP_QUEUES_CHECK_DISABLE(x) ((x) << 1)
# define PACKET3_MAP_QUEUES_DOORBELL_OFFSET(x) ((x) << 2)
#define PACKET3_UNMAP_QUEUES 0xA3
/* 1. header
* 2. CONTROL
* 3. CONTROL2
* 4. CONTROL3
* 5. CONTROL4
* 6. CONTROL5
*/
/* CONTROL */
# define PACKET3_UNMAP_QUEUES_ACTION(x) ((x) << 0)
/* 0 - PREEMPT_QUEUES
* 1 - RESET_QUEUES
* 2 - DISABLE_PROCESS_QUEUES
* 3 - PREEMPT_QUEUES_NO_UNMAP
*/
# define PACKET3_UNMAP_QUEUES_QUEUE_SEL(x) ((x) << 4)
# define PACKET3_UNMAP_QUEUES_ENGINE_SEL(x) ((x) << 26)
# define PACKET3_UNMAP_QUEUES_NUM_QUEUES(x) ((x) << 29)
/* CONTROL2a */
# define PACKET3_UNMAP_QUEUES_PASID(x) ((x) << 0)
/* CONTROL2b */
# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(x) ((x) << 2)
/* CONTROL3a */
# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET1(x) ((x) << 2)
/* CONTROL3b */
# define PACKET3_UNMAP_QUEUES_RB_WPTR(x) ((x) << 0)
/* CONTROL4 */
# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET2(x) ((x) << 2)
/* CONTROL5 */
# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET3(x) ((x) << 2)
#define PACKET3_QUERY_STATUS 0xA4
/* 1. header
* 2. CONTROL
* 3. CONTROL2
* 4. ADDR_LO [31:0]
* 5. ADDR_HI [31:0]
* 6. DATA_LO [31:0]
* 7. DATA_HI [31:0]
*/
/* CONTROL */
# define PACKET3_QUERY_STATUS_CONTEXT_ID(x) ((x) << 0)
# define PACKET3_QUERY_STATUS_INTERRUPT_SEL(x) ((x) << 28)
# define PACKET3_QUERY_STATUS_COMMAND(x) ((x) << 30)
/* CONTROL2a */
# define PACKET3_QUERY_STATUS_PASID(x) ((x) << 0)
/* CONTROL2b */
# define PACKET3_QUERY_STATUS_DOORBELL_OFFSET(x) ((x) << 2)
# define PACKET3_QUERY_STATUS_ENG_SEL(x) ((x) << 25)
#define PACKET3_RUN_CLEANER_SHADER 0xD2
/* 1. header
* 2. RESERVED [31:0]
*/
#define VCE_CMD_NO_OP 0x00000000
#define VCE_CMD_END 0x00000001
#define VCE_CMD_IB 0x00000002
#define VCE_CMD_FENCE 0x00000003
#define VCE_CMD_TRAP 0x00000004
#define VCE_CMD_IB_AUTO 0x00000005
#define VCE_CMD_SEMAPHORE 0x00000006
#define VCE_CMD_IB_VM 0x00000102
#define VCE_CMD_WAIT_GE 0x00000106
#define VCE_CMD_UPDATE_PTB 0x00000107
#define VCE_CMD_FLUSH_TLB 0x00000108
#define VCE_CMD_REG_WRITE 0x00000109
#define VCE_CMD_REG_WAIT 0x0000010a
#define HEVC_ENC_CMD_NO_OP 0x00000000
#define HEVC_ENC_CMD_END 0x00000001
#define HEVC_ENC_CMD_FENCE 0x00000003
#define HEVC_ENC_CMD_TRAP 0x00000004
#define HEVC_ENC_CMD_IB_VM 0x00000102
#define HEVC_ENC_CMD_REG_WRITE 0x00000109
#define HEVC_ENC_CMD_REG_WAIT 0x0000010a
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,931 @@
# mypy: ignore-errors
# -*- coding: utf-8 -*-
#
# TARGET arch is: []
# WORD_SIZE is: 8
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes
class AsDictMixin:
    # clang2py-generated helper mixed into every generated Structure/Union:
    # provides a recursive plain-dict view of a ctypes aggregate.
    @classmethod
    def as_dict(cls, self):
        """Return *self* (a ctypes aggregate of type *cls*) as a plain dict.

        Fixed-size arrays become lists, pointers are dereferenced (NULL
        pointers become None), nested aggregates recurse, and generator
        PADDING_* filler fields are skipped.  A value that is not an
        AsDictMixin is returned unchanged.
        """
        result = {}
        if not isinstance(self, AsDictMixin):
            # not a structure, assume it's already a python object
            return self
        if not hasattr(cls, "_fields_"):
            # aggregate with no declared fields: nothing to convert
            return result
        # sys.version_info >= (3, 5)
        # for (field, *_) in cls._fields_: # noqa
        for field_tuple in cls._fields_: # noqa
            # field tuples are (name, type) or (name, type, bit_width)
            field = field_tuple[0]
            if field.startswith('PADDING_'):
                continue
            value = getattr(self, field)
            type_ = type(value)
            # NOTE: branch order matters — arrays are sniffed before pointers
            # because both carry a _type_ attribute.
            if hasattr(value, "_length_") and hasattr(value, "_type_"):
                # array
                if not hasattr(type_, "as_dict"):
                    value = [v for v in value]
                else:
                    type_ = type_._type_
                    value = [type_.as_dict(v) for v in value]
            elif hasattr(value, "contents") and hasattr(value, "_type_"):
                # pointer
                try:
                    if not hasattr(type_, "as_dict"):
                        value = value.contents
                    else:
                        type_ = type_._type_
                        value = type_.as_dict(value.contents)
                except ValueError:
                    # nullptr
                    value = None
            elif isinstance(value, AsDictMixin):
                # other structure
                value = type_.as_dict(value)
            result[field] = value
        return result
class Structure(ctypes.Structure, AsDictMixin):
    """clang2py-generated base class for all generated structs.

    Adds as_dict() (via AsDictMixin), positional construction that skips
    PADDING_* filler fields, and bind() for wiring Python callables into
    function-pointer fields.
    """
    def __init__(self, *args, **kwds):
        # We don't want to use positional arguments fill PADDING_* fields
        args = dict(zip(self.__class__._field_names_(), args))
        args.update(kwds)
        super(Structure, self).__init__(**args)

    @classmethod
    def _field_names_(cls):
        # Yield non-padding field names in declaration order (empty for
        # opaque structs with no _fields_).
        if hasattr(cls, '_fields_'):
            return (f[0] for f in cls._fields_ if not f[0].startswith('PADDING'))
        else:
            return ()

    @classmethod
    def get_type(cls, field):
        # Return the ctypes type declared for *field*, or None if not found.
        for f in cls._fields_:
            if f[0] == field:
                return f[1]
        return None

    @classmethod
    def bind(cls, bound_fields):
        """Construct an instance with callback fields bound to the callables
        in *bound_fields* (name -> callable or None).

        Unbound callback fields get a do-nothing default; raises ValueError
        if *bound_fields* names a field that is not a callback.  NOTE:
        consumes (mutates) *bound_fields*.
        """
        fields = {}
        for name, type_ in cls._fields_:
            if hasattr(type_, "restype"):
                # function-pointer field
                if name in bound_fields:
                    if bound_fields[name] is None:
                        # explicit None binds a NULL function pointer
                        fields[name] = type_()
                    else:
                        # use a closure to capture the callback from the loop scope
                        fields[name] = (
                            type_((lambda callback: lambda *args: callback(*args))(
                                bound_fields[name]))
                        )
                    del bound_fields[name]
                else:
                    # default callback implementation (does nothing)
                    try:
                        default_ = type_(0).restype().value
                    except TypeError:
                        default_ = None
                    fields[name] = type_((
                        lambda default_: lambda *args: default_)(default_))
            else:
                # not a callback function, use default initialization
                if name in bound_fields:
                    fields[name] = bound_fields[name]
                    del bound_fields[name]
                else:
                    fields[name] = type_()
        if len(bound_fields) != 0:
            raise ValueError(
                "Cannot bind the following unknown callback(s) {}.{}".format(
                    cls.__name__, bound_fields.keys()
                ))
        return cls(**fields)
class Union(ctypes.Union, AsDictMixin):
    # clang2py-generated base class for all generated unions; inherits
    # as_dict() support from AsDictMixin.
    pass
F32_MES_PM4_PACKETS_H = True # macro
uint32_t = True # macro
int32_t = True # macro
PM4_MES_HEADER_DEFINED = True # macro
PM4_MEC_RELEASE_MEM_DEFINED = True # macro
PM4_MEC_WRITE_DATA_DEFINED = True # macro
class union_PM4_MES_TYPE_3_HEADER(Union):
pass
class struct_PM4_MES_TYPE_3_HEADER_0(Structure):
pass
struct_PM4_MES_TYPE_3_HEADER_0._pack_ = 1 # source:False
struct_PM4_MES_TYPE_3_HEADER_0._fields_ = [
('reserved1', ctypes.c_uint32, 8),
('opcode', ctypes.c_uint32, 8),
('count', ctypes.c_uint32, 14),
('type', ctypes.c_uint32, 2),
]
union_PM4_MES_TYPE_3_HEADER._pack_ = 1 # source:False
union_PM4_MES_TYPE_3_HEADER._anonymous_ = ('_0',)
union_PM4_MES_TYPE_3_HEADER._fields_ = [
('_0', struct_PM4_MES_TYPE_3_HEADER_0),
('u32All', ctypes.c_uint32),
]
# values for enumeration 'c_uint32'
c_uint32__enumvalues = {
5: 'event_index__mec_release_mem__end_of_pipe',
6: 'event_index__mec_release_mem__shader_done',
}
event_index__mec_release_mem__end_of_pipe = 5
event_index__mec_release_mem__shader_done = 6
c_uint32 = ctypes.c_uint32 # enum
# values for enumeration 'c_uint32'
c_uint32__enumvalues = {
0: 'cache_policy__mec_release_mem__lru',
1: 'cache_policy__mec_release_mem__stream',
}
cache_policy__mec_release_mem__lru = 0
cache_policy__mec_release_mem__stream = 1
c_uint32 = ctypes.c_uint32 # enum
# values for enumeration 'c_uint32'
c_uint32__enumvalues = {
0: 'pq_exe_status__mec_release_mem__default',
1: 'pq_exe_status__mec_release_mem__phase_update',
}
pq_exe_status__mec_release_mem__default = 0
pq_exe_status__mec_release_mem__phase_update = 1
c_uint32 = ctypes.c_uint32 # enum
# values for enumeration 'c_uint32'
c_uint32__enumvalues = {
0: 'dst_sel__mec_release_mem__memory_controller',
1: 'dst_sel__mec_release_mem__tc_l2',
2: 'dst_sel__mec_release_mem__queue_write_pointer_register',
3: 'dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit',
}
dst_sel__mec_release_mem__memory_controller = 0
dst_sel__mec_release_mem__tc_l2 = 1
dst_sel__mec_release_mem__queue_write_pointer_register = 2
dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3
c_uint32 = ctypes.c_uint32 # enum
# values for enumeration 'c_uint32'
c_uint32__enumvalues = {
0: 'int_sel__mec_release_mem__none',
1: 'int_sel__mec_release_mem__send_interrupt_only',
2: 'int_sel__mec_release_mem__send_interrupt_after_write_confirm',
3: 'int_sel__mec_release_mem__send_data_after_write_confirm',
4: 'int_sel__mec_release_mem__unconditionally_send_int_ctxid',
5: 'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare',
6: 'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare',
}
int_sel__mec_release_mem__none = 0
int_sel__mec_release_mem__send_interrupt_only = 1
int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2
int_sel__mec_release_mem__send_data_after_write_confirm = 3
int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4
int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5
int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6
c_uint32 = ctypes.c_uint32 # enum
# values for enumeration 'c_uint32'
c_uint32__enumvalues = {
0: 'data_sel__mec_release_mem__none',
1: 'data_sel__mec_release_mem__send_32_bit_low',
2: 'data_sel__mec_release_mem__send_64_bit_data',
3: 'data_sel__mec_release_mem__send_gpu_clock_counter',
4: 'data_sel__mec_release_mem__send_cp_perfcounter_hi_lo',
5: 'data_sel__mec_release_mem__store_gds_data_to_memory',
}
data_sel__mec_release_mem__none = 0
data_sel__mec_release_mem__send_32_bit_low = 1
data_sel__mec_release_mem__send_64_bit_data = 2
data_sel__mec_release_mem__send_gpu_clock_counter = 3
data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4
data_sel__mec_release_mem__store_gds_data_to_memory = 5
c_uint32 = ctypes.c_uint32 # enum
class struct_pm4_mec_release_mem(Structure):
pass
class union_pm4_mec_release_mem_0(Union):
pass
union_pm4_mec_release_mem_0._pack_ = 1 # source:False
union_pm4_mec_release_mem_0._fields_ = [
('header', union_PM4_MES_TYPE_3_HEADER),
('ordinal1', ctypes.c_uint32),
]
class union_pm4_mec_release_mem_1(Union):
pass
class struct_pm4_mec_release_mem_1_bitfields2(Structure):
pass
struct_pm4_mec_release_mem_1_bitfields2._pack_ = 1 # source:False
struct_pm4_mec_release_mem_1_bitfields2._fields_ = [
('event_type', ctypes.c_uint32, 6),
('reserved1', ctypes.c_uint32, 2),
('event_index', c_uint32, 4),
('tcl1_vol_action_ena', ctypes.c_uint32, 1),
('tc_vol_action_ena', ctypes.c_uint32, 1),
('reserved2', ctypes.c_uint32, 1),
('tc_wb_action_ena', ctypes.c_uint32, 1),
('tcl1_action_ena', ctypes.c_uint32, 1),
('tc_action_ena', ctypes.c_uint32, 1),
('reserved3', ctypes.c_uint32, 1),
('tc_nc_action_ena', ctypes.c_uint32, 1),
('tc_wc_action_ena', ctypes.c_uint32, 1),
('tc_md_action_ena', ctypes.c_uint32, 1),
('reserved4', ctypes.c_uint32, 3),
('cache_policy', c_uint32, 2),
('reserved5', ctypes.c_uint32, 2),
('pq_exe_status', c_uint32, 1),
('reserved6', ctypes.c_uint32, 2),
]
union_pm4_mec_release_mem_1._pack_ = 1 # source:False
union_pm4_mec_release_mem_1._fields_ = [
('bitfields2', struct_pm4_mec_release_mem_1_bitfields2),
('ordinal2', ctypes.c_uint32),
]
class union_pm4_mec_release_mem_2(Union):
pass
class struct_pm4_mec_release_mem_2_bitfields3(Structure):
pass
struct_pm4_mec_release_mem_2_bitfields3._pack_ = 1 # source:False
struct_pm4_mec_release_mem_2_bitfields3._fields_ = [
('reserved7', ctypes.c_uint32, 16),
('dst_sel', c_uint32, 2),
('reserved8', ctypes.c_uint32, 6),
('int_sel', c_uint32, 3),
('reserved9', ctypes.c_uint32, 2),
('data_sel', c_uint32, 3),
]
union_pm4_mec_release_mem_2._pack_ = 1 # source:False
union_pm4_mec_release_mem_2._fields_ = [
('bitfields3', struct_pm4_mec_release_mem_2_bitfields3),
('ordinal3', ctypes.c_uint32),
]
class union_pm4_mec_release_mem_3(Union):
pass
class struct_pm4_mec_release_mem_3_bitfields4(Structure):
pass
struct_pm4_mec_release_mem_3_bitfields4._pack_ = 1 # source:False
struct_pm4_mec_release_mem_3_bitfields4._fields_ = [
('reserved10', ctypes.c_uint32, 2),
('address_lo_32b', ctypes.c_uint32, 30),
]
class struct_pm4_mec_release_mem_3_bitfields4b(Structure):
pass
struct_pm4_mec_release_mem_3_bitfields4b._pack_ = 1 # source:False
struct_pm4_mec_release_mem_3_bitfields4b._fields_ = [
('reserved11', ctypes.c_uint32, 3),
('address_lo_64b', ctypes.c_uint32, 29),
]
union_pm4_mec_release_mem_3._pack_ = 1 # source:False
union_pm4_mec_release_mem_3._fields_ = [
('bitfields4', struct_pm4_mec_release_mem_3_bitfields4),
('bitfields4b', struct_pm4_mec_release_mem_3_bitfields4b),
('reserved12', ctypes.c_uint32),
('ordinal4', ctypes.c_uint32),
]
class union_pm4_mec_release_mem_4(Union):
pass
union_pm4_mec_release_mem_4._pack_ = 1 # source:False
union_pm4_mec_release_mem_4._fields_ = [
('address_hi', ctypes.c_uint32),
('reserved13', ctypes.c_uint32),
('ordinal5', ctypes.c_uint32),
]
class union_pm4_mec_release_mem_5(Union):
pass
class struct_pm4_mec_release_mem_5_bitfields6c(Structure):
pass
struct_pm4_mec_release_mem_5_bitfields6c._pack_ = 1 # source:False
struct_pm4_mec_release_mem_5_bitfields6c._fields_ = [
('dw_offset', ctypes.c_uint32, 16),
('num_dwords', ctypes.c_uint32, 16),
]
union_pm4_mec_release_mem_5._pack_ = 1 # source:False
union_pm4_mec_release_mem_5._fields_ = [
('data_lo', ctypes.c_uint32),
('cmp_data_lo', ctypes.c_uint32),
('bitfields6c', struct_pm4_mec_release_mem_5_bitfields6c),
('reserved14', ctypes.c_uint32),
('ordinal6', ctypes.c_uint32),
]
class union_pm4_mec_release_mem_6(Union):
pass
union_pm4_mec_release_mem_6._pack_ = 1 # source:False
union_pm4_mec_release_mem_6._fields_ = [
('data_hi', ctypes.c_uint32),
('cmp_data_hi', ctypes.c_uint32),
('reserved15', ctypes.c_uint32),
('reserved16', ctypes.c_uint32),
('ordinal7', ctypes.c_uint32),
]
struct_pm4_mec_release_mem._pack_ = 1 # source:False
struct_pm4_mec_release_mem._anonymous_ = ('_0', '_1', '_2', '_3', '_4', '_5', '_6',)
struct_pm4_mec_release_mem._fields_ = [
('_0', union_pm4_mec_release_mem_0),
('_1', union_pm4_mec_release_mem_1),
('_2', union_pm4_mec_release_mem_2),
('_3', union_pm4_mec_release_mem_3),
('_4', union_pm4_mec_release_mem_4),
('_5', union_pm4_mec_release_mem_5),
('_6', union_pm4_mec_release_mem_6),
('int_ctxid', ctypes.c_uint32),
]
# values for enumeration 'WRITE_DATA_dst_sel_enum'
WRITE_DATA_dst_sel_enum__enumvalues = {
0: 'dst_sel___write_data__mem_mapped_register',
2: 'dst_sel___write_data__tc_l2',
3: 'dst_sel___write_data__gds',
5: 'dst_sel___write_data__memory',
6: 'dst_sel___write_data__memory_mapped_adc_persistent_state',
}
dst_sel___write_data__mem_mapped_register = 0
dst_sel___write_data__tc_l2 = 2
dst_sel___write_data__gds = 3
dst_sel___write_data__memory = 5
dst_sel___write_data__memory_mapped_adc_persistent_state = 6
WRITE_DATA_dst_sel_enum = ctypes.c_uint32 # enum
# values for enumeration 'WRITE_DATA_addr_incr_enum'
WRITE_DATA_addr_incr_enum__enumvalues = {
0: 'addr_incr___write_data__increment_address',
1: 'addr_incr___write_data__do_not_increment_address',
}
addr_incr___write_data__increment_address = 0
addr_incr___write_data__do_not_increment_address = 1
WRITE_DATA_addr_incr_enum = ctypes.c_uint32 # enum
# values for enumeration 'WRITE_DATA_wr_confirm_enum'
WRITE_DATA_wr_confirm_enum__enumvalues = {
0: 'wr_confirm___write_data__do_not_wait_for_write_confirmation',
1: 'wr_confirm___write_data__wait_for_write_confirmation',
}
wr_confirm___write_data__do_not_wait_for_write_confirmation = 0
wr_confirm___write_data__wait_for_write_confirmation = 1
WRITE_DATA_wr_confirm_enum = ctypes.c_uint32 # enum
# values for enumeration 'WRITE_DATA_cache_policy_enum'
WRITE_DATA_cache_policy_enum__enumvalues = {
0: 'cache_policy___write_data__lru',
1: 'cache_policy___write_data__stream',
}
cache_policy___write_data__lru = 0
cache_policy___write_data__stream = 1
WRITE_DATA_cache_policy_enum = ctypes.c_uint32 # enum
class struct_pm4_mec_write_data_mmio(Structure):
pass
class union_pm4_mec_write_data_mmio_0(Union):
pass
union_pm4_mec_write_data_mmio_0._pack_ = 1 # source:False
union_pm4_mec_write_data_mmio_0._fields_ = [
('header', union_PM4_MES_TYPE_3_HEADER),
('ordinal1', ctypes.c_uint32),
]
class union_pm4_mec_write_data_mmio_1(Union):
pass
class struct_pm4_mec_write_data_mmio_1_bitfields2(Structure):
pass
struct_pm4_mec_write_data_mmio_1_bitfields2._pack_ = 1 # source:False
struct_pm4_mec_write_data_mmio_1_bitfields2._fields_ = [
('reserved1', ctypes.c_uint32, 8),
('dst_sel', ctypes.c_uint32, 4),
('reserved2', ctypes.c_uint32, 4),
('addr_incr', ctypes.c_uint32, 1),
('reserved3', ctypes.c_uint32, 2),
('resume_vf', ctypes.c_uint32, 1),
('wr_confirm', ctypes.c_uint32, 1),
('reserved4', ctypes.c_uint32, 4),
('cache_policy', ctypes.c_uint32, 2),
('reserved5', ctypes.c_uint32, 5),
]
union_pm4_mec_write_data_mmio_1._pack_ = 1 # source:False
union_pm4_mec_write_data_mmio_1._fields_ = [
('bitfields2', struct_pm4_mec_write_data_mmio_1_bitfields2),
('ordinal2', ctypes.c_uint32),
]
class union_pm4_mec_write_data_mmio_2(Union):
pass
class struct_pm4_mec_write_data_mmio_2_bitfields3(Structure):
pass
struct_pm4_mec_write_data_mmio_2_bitfields3._pack_ = 1 # source:False
struct_pm4_mec_write_data_mmio_2_bitfields3._fields_ = [
('dst_mmreg_addr', ctypes.c_uint32, 18),
('reserved6', ctypes.c_uint32, 14),
]
union_pm4_mec_write_data_mmio_2._pack_ = 1 # source:False
union_pm4_mec_write_data_mmio_2._fields_ = [
('bitfields3', struct_pm4_mec_write_data_mmio_2_bitfields3),
('ordinal3', ctypes.c_uint32),
]
struct_pm4_mec_write_data_mmio._pack_ = 1 # source:False
struct_pm4_mec_write_data_mmio._anonymous_ = ('_0', '_1', '_2',)
struct_pm4_mec_write_data_mmio._fields_ = [
('_0', union_pm4_mec_write_data_mmio_0),
('_1', union_pm4_mec_write_data_mmio_1),
('_2', union_pm4_mec_write_data_mmio_2),
('reserved7', ctypes.c_uint32),
('data', ctypes.c_uint32),
]
# values for enumeration 'c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT'
c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT__enumvalues = {
20: 'CACHE_FLUSH_AND_INV_TS_EVENT',
}
CACHE_FLUSH_AND_INV_TS_EVENT = 20
c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT = ctypes.c_uint32 # enum
SOC15_H = True # macro
GFX9_NUM_GFX_RINGS = 1 # macro
GFX9_NUM_COMPUTE_RINGS = 8 # macro
PACKET_TYPE0 = 0 # macro
PACKET_TYPE1 = 1 # macro
PACKET_TYPE2 = 2 # macro
PACKET_TYPE3 = 3 # macro
def CP_PACKET_GET_TYPE(h): # macro
return (((h)>>30)&3)
def CP_PACKET_GET_COUNT(h): # macro
return (((h)>>16)&0x3FFF)
def CP_PACKET0_GET_REG(h): # macro
return ((h)&0xFFFF)
def CP_PACKET3_GET_OPCODE(h): # macro
return (((h)>>8)&0xFF)
def PACKET0(reg, n): # macro
return ((0<<30)|((reg)&0xFFFF)|((n)&0x3FFF)<<16)
CP_PACKET2 = 0x80000000 # macro
PACKET2_PAD_SHIFT = 0 # macro
PACKET2_PAD_MASK = (0x3fffffff<<0) # macro
# def PACKET2(v): # macro
# return (0x80000000|REG_SET(PACKET2_PAD,(v)))
def PACKET3(op, n): # macro
    """Build a type-3 packet header with opcode `op` and DW count `n`."""
    return (3 << 30) | ((op & 0xFF) << 8) | ((n & 0x3FFF) << 16)
def PACKET3_COMPUTE(op, n): # macro
    """Type-3 header for the compute pipe: PACKET3 with bit 1 set."""
    return PACKET3(op, n) | (1 << 1)
# JPEG-engine (PACKETJ) condition-check codes, placed in header bits 27:24.
PACKETJ_CONDITION_CHECK0 = 0 # macro
PACKETJ_CONDITION_CHECK1 = 1 # macro
PACKETJ_CONDITION_CHECK2 = 2 # macro
PACKETJ_CONDITION_CHECK3 = 3 # macro
PACKETJ_CONDITION_CHECK4 = 4 # macro
PACKETJ_CONDITION_CHECK5 = 5 # macro
PACKETJ_CONDITION_CHECK6 = 6 # macro
PACKETJ_CONDITION_CHECK7 = 7 # macro
# PACKETJ type codes, placed in header bits 31:28.
PACKETJ_TYPE0 = 0 # macro
PACKETJ_TYPE1 = 1 # macro
PACKETJ_TYPE2 = 2 # macro
PACKETJ_TYPE3 = 3 # macro
PACKETJ_TYPE4 = 4 # macro
PACKETJ_TYPE5 = 5 # macro
PACKETJ_TYPE6 = 6 # macro
PACKETJ_TYPE7 = 7 # macro
def PACKETJ(reg, r, cond, type): # macro
    """Build a PACKETJ header: reg in 17:0, r in 23:18, cond in 27:24, type in 31:28."""
    return (reg & 0x3FFFF) | ((r & 0x3F) << 18) | ((cond & 0xF) << 24) | ((type & 0xF) << 28)
CP_PACKETJ_NOP = 0x60000000 # macro
def CP_PACKETJ_GET_REG(x): # macro
    """Return the register field of a PACKETJ header (bits 17:0)."""
    return x & 0x3FFFF
def CP_PACKETJ_GET_RES(x): # macro
    """Return the reserved field of a PACKETJ header (bits 23:18)."""
    return (x >> 18) & 0x3F
def CP_PACKETJ_GET_COND(x): # macro
    """Return the condition field of a PACKETJ header (bits 27:24)."""
    return (x >> 24) & 0xF
def CP_PACKETJ_GET_TYPE(x): # macro
    """Return the type field of a PACKETJ header (bits 31:28)."""
    return (x >> 28) & 0xF
# PM4 type-3 opcodes (pass to PACKET3()).
PACKET3_NOP = 0x10 # macro
PACKET3_SET_BASE = 0x11 # macro
def PACKET3_BASE_INDEX(x): # macro
    """SET_BASE payload: base-index field, placed at bit 0."""
    return x << 0
CE_PARTITION_BASE = 3 # macro
# PM4 type-3 opcodes, 0x12-0x37 (pass to PACKET3()).
PACKET3_CLEAR_STATE = 0x12 # macro
PACKET3_INDEX_BUFFER_SIZE = 0x13 # macro
PACKET3_DISPATCH_DIRECT = 0x15 # macro
PACKET3_DISPATCH_INDIRECT = 0x16 # macro
PACKET3_ATOMIC_GDS = 0x1D # macro
PACKET3_ATOMIC_MEM = 0x1E # macro
PACKET3_OCCLUSION_QUERY = 0x1F # macro
PACKET3_SET_PREDICATION = 0x20 # macro
PACKET3_REG_RMW = 0x21 # macro
PACKET3_COND_EXEC = 0x22 # macro
PACKET3_PRED_EXEC = 0x23 # macro
PACKET3_DRAW_INDIRECT = 0x24 # macro
PACKET3_DRAW_INDEX_INDIRECT = 0x25 # macro
PACKET3_INDEX_BASE = 0x26 # macro
PACKET3_DRAW_INDEX_2 = 0x27 # macro
PACKET3_CONTEXT_CONTROL = 0x28 # macro
PACKET3_INDEX_TYPE = 0x2A # macro
PACKET3_DRAW_INDIRECT_MULTI = 0x2C # macro
PACKET3_DRAW_INDEX_AUTO = 0x2D # macro
PACKET3_NUM_INSTANCES = 0x2F # macro
PACKET3_DRAW_INDEX_MULTI_AUTO = 0x30 # macro
PACKET3_INDIRECT_BUFFER_CONST = 0x33 # macro
PACKET3_STRMOUT_BUFFER_UPDATE = 0x34 # macro
PACKET3_DRAW_INDEX_OFFSET_2 = 0x35 # macro
PACKET3_DRAW_PREAMBLE = 0x36 # macro
PACKET3_WRITE_DATA = 0x37 # macro
def WRITE_DATA_DST_SEL(x): # macro
    """WRITE_DATA control: destination-select field, placed at bit 8."""
    return x << 8
WR_ONE_ADDR = 1 << 16 # macro
WR_CONFIRM = 1 << 20 # macro
def WRITE_DATA_CACHE_POLICY(x): # macro
    """WRITE_DATA control: cache-policy field, placed at bit 25."""
    return x << 25
def WRITE_DATA_ENGINE_SEL(x): # macro
    """WRITE_DATA control: engine-select field, placed at bit 30."""
    return x << 30
PACKET3_DRAW_INDEX_INDIRECT_MULTI = 0x38 # macro
PACKET3_MEM_SEMAPHORE = 0x39 # macro
# MEM_SEMAPHORE control-word flags.
PACKET3_SEM_USE_MAILBOX = (0x1<<16) # macro
PACKET3_SEM_SEL_SIGNAL_TYPE = (0x1<<20) # macro
PACKET3_SEM_SEL_SIGNAL = (0x6<<29) # macro
PACKET3_SEM_SEL_WAIT = (0x7<<29) # macro
PACKET3_WAIT_REG_MEM = 0x3C # macro
def WAIT_REG_MEM_FUNCTION(x): # macro
    """WAIT_REG_MEM control: compare-function field, placed at bit 0."""
    return x << 0
def WAIT_REG_MEM_MEM_SPACE(x): # macro
    """WAIT_REG_MEM control: memory-space field, placed at bit 4."""
    return x << 4
def WAIT_REG_MEM_OPERATION(x): # macro
    """WAIT_REG_MEM control: operation field, placed at bit 6."""
    return x << 6
def WAIT_REG_MEM_ENGINE(x): # macro
    """WAIT_REG_MEM control: engine field, placed at bit 8."""
    return x << 8
PACKET3_INDIRECT_BUFFER = 0x3F # macro
INDIRECT_BUFFER_VALID = 1 << 23 # macro
def INDIRECT_BUFFER_CACHE_POLICY(x): # macro
    """INDIRECT_BUFFER control: cache-policy field, placed at bit 28."""
    return x << 28
def INDIRECT_BUFFER_PRE_ENB(x): # macro
    """INDIRECT_BUFFER control: pre_enb field, placed at bit 21."""
    return x << 21
def INDIRECT_BUFFER_PRE_RESUME(x): # macro
    """INDIRECT_BUFFER control: pre_resume field, placed at bit 30."""
    return x << 30
# PM4 type-3 opcodes, 0x40-0x46.
PACKET3_COPY_DATA = 0x40 # macro
PACKET3_PFP_SYNC_ME = 0x42 # macro
PACKET3_COND_WRITE = 0x45 # macro
PACKET3_EVENT_WRITE = 0x46 # macro
def EVENT_TYPE(x): # macro
    """EVENT_WRITE control: event-type field, placed at bit 0."""
    return x << 0
def EVENT_INDEX(x): # macro
    """EVENT_WRITE control: event-index field, placed at bit 8."""
    return x << 8
PACKET3_RELEASE_MEM = 0x49 # macro
# End-of-pipe (RELEASE_MEM) action-enable flags for the first control dword.
EOP_TCL1_VOL_ACTION_EN = (1<<12) # macro
EOP_TC_VOL_ACTION_EN = (1<<13) # macro
EOP_TC_WB_ACTION_EN = (1<<15) # macro
EOP_TCL1_ACTION_EN = (1<<16) # macro
EOP_TC_ACTION_EN = (1<<17) # macro
EOP_TC_NC_ACTION_EN = (1<<19) # macro
EOP_TC_MD_ACTION_EN = (1<<21) # macro
EOP_EXEC = (1<<28) # macro
def DATA_SEL(x): # macro
    """RELEASE_MEM control: data-select field, placed at bit 29."""
    return x << 29
def INT_SEL(x): # macro
    """RELEASE_MEM control: interrupt-select field, placed at bit 24."""
    return x << 24
def DST_SEL(x): # macro
    """RELEASE_MEM control: destination-select field, placed at bit 16."""
    return x << 16
PACKET3_PREAMBLE_CNTL = 0x4A # macro
# PREAMBLE_CNTL command values (bits 31:28).
PACKET3_PREAMBLE_BEGIN_CLEAR_STATE = (2<<28) # macro
PACKET3_PREAMBLE_END_CLEAR_STATE = (3<<28) # macro
PACKET3_DMA_DATA = 0x50 # macro
def PACKET3_DMA_DATA_ENGINE(x): # macro
    """DMA_DATA control: engine field, placed at bit 0."""
    return x << 0
def PACKET3_DMA_DATA_SRC_CACHE_POLICY(x): # macro
    """DMA_DATA control: source cache-policy field, placed at bit 13."""
    return x << 13
def PACKET3_DMA_DATA_DST_SEL(x): # macro
    """DMA_DATA control: destination-select field, placed at bit 20."""
    return x << 20
def PACKET3_DMA_DATA_DST_CACHE_POLICY(x): # macro
    """DMA_DATA control: destination cache-policy field, placed at bit 25."""
    return x << 25
def PACKET3_DMA_DATA_SRC_SEL(x): # macro
    """DMA_DATA control: source-select field, placed at bit 29."""
    return x << 29
PACKET3_DMA_DATA_CP_SYNC = 1 << 31 # macro
# Command-word flags for the DMA_DATA command dword.
PACKET3_DMA_DATA_CMD_SAS = 1 << 26 # macro
PACKET3_DMA_DATA_CMD_DAS = 1 << 27 # macro
PACKET3_DMA_DATA_CMD_SAIC = 1 << 28 # macro
PACKET3_DMA_DATA_CMD_DAIC = 1 << 29 # macro
PACKET3_DMA_DATA_CMD_RAW_WAIT = 1 << 30 # macro
PACKET3_ACQUIRE_MEM = 0x58 # macro
# ACQUIRE_MEM CP_COHER_CNTL field helpers: each places its action-enable
# value at the named bit of the coherence-control dword.
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_NC_ACTION_ENA(x): # macro
    """tc_nc_action_ena, bit 3."""
    return x << 3
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WC_ACTION_ENA(x): # macro
    """tc_wc_action_ena, bit 4."""
    return x << 4
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_INV_METADATA_ACTION_ENA(x): # macro
    """tc_inv_metadata_action_ena, bit 5."""
    return x << 5
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_VOL_ACTION_ENA(x): # macro
    """tcl1_vol_action_ena, bit 15."""
    return x << 15
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WB_ACTION_ENA(x): # macro
    """tc_wb_action_ena, bit 18."""
    return x << 18
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_ACTION_ENA(x): # macro
    """tcl1_action_ena, bit 22."""
    return x << 22
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_ACTION_ENA(x): # macro
    """tc_action_ena, bit 23."""
    return x << 23
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_CB_ACTION_ENA(x): # macro
    """cb_action_ena, bit 25."""
    return x << 25
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_DB_ACTION_ENA(x): # macro
    """db_action_ena, bit 26."""
    return x << 26
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_ACTION_ENA(x): # macro
    """sh_kcache_action_ena, bit 27."""
    return x << 27
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_VOL_ACTION_ENA(x): # macro
    """sh_kcache_vol_action_ena, bit 28."""
    return x << 28
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_ICACHE_ACTION_ENA(x): # macro
    """sh_icache_action_ena, bit 29."""
    return x << 29
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_WB_ACTION_ENA(x): # macro
    """sh_kcache_wb_action_ena, bit 30."""
    return x << 30
# PM4 type-3 opcodes 0x59-0x8B plus the register-space windows each
# SET_*_REG packet addresses (dword offsets).
PACKET3_REWIND = 0x59 # macro
PACKET3_LOAD_UCONFIG_REG = 0x5E # macro
PACKET3_LOAD_SH_REG = 0x5F # macro
PACKET3_LOAD_CONFIG_REG = 0x60 # macro
PACKET3_LOAD_CONTEXT_REG = 0x61 # macro
PACKET3_SET_CONFIG_REG = 0x68 # macro
PACKET3_SET_CONFIG_REG_START = 0x00002000 # macro
PACKET3_SET_CONFIG_REG_END = 0x00002c00 # macro
PACKET3_SET_CONTEXT_REG = 0x69 # macro
PACKET3_SET_CONTEXT_REG_START = 0x0000a000 # macro
PACKET3_SET_CONTEXT_REG_END = 0x0000a400 # macro
PACKET3_SET_CONTEXT_REG_INDIRECT = 0x73 # macro
PACKET3_SET_SH_REG = 0x76 # macro
PACKET3_SET_SH_REG_START = 0x00002c00 # macro
PACKET3_SET_SH_REG_END = 0x00003000 # macro
PACKET3_SET_SH_REG_OFFSET = 0x77 # macro
PACKET3_SET_QUEUE_REG = 0x78 # macro
PACKET3_SET_UCONFIG_REG = 0x79 # macro
PACKET3_SET_UCONFIG_REG_START = 0x0000c000 # macro
PACKET3_SET_UCONFIG_REG_END = 0x0000c400 # macro
PACKET3_SET_UCONFIG_REG_INDEX_TYPE = (2<<28) # macro
PACKET3_SCRATCH_RAM_WRITE = 0x7D # macro
PACKET3_SCRATCH_RAM_READ = 0x7E # macro
PACKET3_LOAD_CONST_RAM = 0x80 # macro
PACKET3_WRITE_CONST_RAM = 0x81 # macro
PACKET3_DUMP_CONST_RAM = 0x83 # macro
PACKET3_INCREMENT_CE_COUNTER = 0x84 # macro
PACKET3_INCREMENT_DE_COUNTER = 0x85 # macro
PACKET3_WAIT_ON_CE_COUNTER = 0x86 # macro
PACKET3_WAIT_ON_DE_COUNTER_DIFF = 0x88 # macro
PACKET3_SWITCH_BUFFER = 0x8B # macro
PACKET3_FRAME_CONTROL = 0x90 # macro
FRAME_TMZ = 1 << 0 # macro
def FRAME_CMD(x): # macro
    """FRAME_CONTROL: command field, placed at bit 28."""
    return x << 28
PACKET3_INVALIDATE_TLBS = 0x98 # macro
def PACKET3_INVALIDATE_TLBS_DST_SEL(x): # macro
    """INVALIDATE_TLBS: destination-select field, placed at bit 0."""
    return x << 0
def PACKET3_INVALIDATE_TLBS_ALL_HUB(x): # macro
    """INVALIDATE_TLBS: all-hub field, placed at bit 4."""
    return x << 4
def PACKET3_INVALIDATE_TLBS_PASID(x): # macro
    """INVALIDATE_TLBS: PASID field, placed at bit 5."""
    return x << 5
def PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x): # macro
    """INVALIDATE_TLBS: flush-type field, placed at bit 29."""
    return x << 29
PACKET3_SET_RESOURCES = 0xA0 # macro
def PACKET3_SET_RESOURCES_VMID_MASK(x): # macro
    """SET_RESOURCES: VMID-mask field, placed at bit 0."""
    return x << 0
def PACKET3_SET_RESOURCES_UNMAP_LATENTY(x): # macro
    """SET_RESOURCES: unmap-latency field, placed at bit 16."""
    return x << 16
def PACKET3_SET_RESOURCES_QUEUE_TYPE(x): # macro
    """SET_RESOURCES: queue-type field, placed at bit 29."""
    return x << 29
PACKET3_MAP_QUEUES = 0xA2 # macro
# MAP_QUEUES field helpers; each places its value at the named bit.
def PACKET3_MAP_QUEUES_QUEUE_SEL(x): # macro
    """queue_sel, bit 4."""
    return x << 4
def PACKET3_MAP_QUEUES_VMID(x): # macro
    """vmid, bit 8."""
    return x << 8
def PACKET3_MAP_QUEUES_QUEUE(x): # macro
    """queue, bit 13."""
    return x << 13
def PACKET3_MAP_QUEUES_PIPE(x): # macro
    """pipe, bit 16."""
    return x << 16
def PACKET3_MAP_QUEUES_ME(x): # macro
    """me, bit 18."""
    return x << 18
def PACKET3_MAP_QUEUES_QUEUE_TYPE(x): # macro
    """queue_type, bit 21."""
    return x << 21
def PACKET3_MAP_QUEUES_ALLOC_FORMAT(x): # macro
    """alloc_format, bit 24."""
    return x << 24
def PACKET3_MAP_QUEUES_ENGINE_SEL(x): # macro
    """engine_sel, bit 26."""
    return x << 26
def PACKET3_MAP_QUEUES_NUM_QUEUES(x): # macro
    """num_queues, bit 29."""
    return x << 29
def PACKET3_MAP_QUEUES_CHECK_DISABLE(x): # macro
    """check_disable, bit 1 (second dword)."""
    return x << 1
def PACKET3_MAP_QUEUES_DOORBELL_OFFSET(x): # macro
    """doorbell_offset, bit 2 (second dword)."""
    return x << 2
PACKET3_UNMAP_QUEUES = 0xA3 # macro
# UNMAP_QUEUES field helpers; each places its value at the named bit of
# the dword its name refers to.
def PACKET3_UNMAP_QUEUES_ACTION(x): # macro
    """action, bit 0."""
    return x << 0
def PACKET3_UNMAP_QUEUES_QUEUE_SEL(x): # macro
    """queue_sel, bit 4."""
    return x << 4
def PACKET3_UNMAP_QUEUES_ENGINE_SEL(x): # macro
    """engine_sel, bit 26."""
    return x << 26
def PACKET3_UNMAP_QUEUES_NUM_QUEUES(x): # macro
    """num_queues, bit 29."""
    return x << 29
def PACKET3_UNMAP_QUEUES_PASID(x): # macro
    """pasid, bit 0."""
    return x << 0
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(x): # macro
    """doorbell_offset0, bit 2."""
    return x << 2
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET1(x): # macro
    """doorbell_offset1, bit 2."""
    return x << 2
def PACKET3_UNMAP_QUEUES_RB_WPTR(x): # macro
    """rb_wptr, bit 0."""
    return x << 0
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET2(x): # macro
    """doorbell_offset2, bit 2."""
    return x << 2
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET3(x): # macro
    """doorbell_offset3, bit 2."""
    return x << 2
PACKET3_QUERY_STATUS = 0xA4 # macro
def PACKET3_QUERY_STATUS_CONTEXT_ID(x): # macro
    """QUERY_STATUS: context-id field, placed at bit 0."""
    return x << 0
def PACKET3_QUERY_STATUS_INTERRUPT_SEL(x): # macro
    """QUERY_STATUS: interrupt-select field, placed at bit 28."""
    return x << 28
def PACKET3_QUERY_STATUS_COMMAND(x): # macro
    """QUERY_STATUS: command field, placed at bit 30."""
    return x << 30
def PACKET3_QUERY_STATUS_PASID(x): # macro
    """QUERY_STATUS: PASID field, placed at bit 0."""
    return x << 0
def PACKET3_QUERY_STATUS_DOORBELL_OFFSET(x): # macro
    """QUERY_STATUS: doorbell-offset field, placed at bit 2."""
    return x << 2
def PACKET3_QUERY_STATUS_ENG_SEL(x): # macro
    """QUERY_STATUS: engine-select field, placed at bit 25."""
    return x << 25
PACKET3_RUN_CLEANER_SHADER = 0xD2 # macro
# VCE (video-encode engine) command codes.
VCE_CMD_NO_OP = 0x00000000 # macro
VCE_CMD_END = 0x00000001 # macro
VCE_CMD_IB = 0x00000002 # macro
VCE_CMD_FENCE = 0x00000003 # macro
VCE_CMD_TRAP = 0x00000004 # macro
VCE_CMD_IB_AUTO = 0x00000005 # macro
VCE_CMD_SEMAPHORE = 0x00000006 # macro
VCE_CMD_IB_VM = 0x00000102 # macro
VCE_CMD_WAIT_GE = 0x00000106 # macro
VCE_CMD_UPDATE_PTB = 0x00000107 # macro
VCE_CMD_FLUSH_TLB = 0x00000108 # macro
VCE_CMD_REG_WRITE = 0x00000109 # macro
VCE_CMD_REG_WAIT = 0x0000010a # macro
# HEVC encoder command codes.
HEVC_ENC_CMD_NO_OP = 0x00000000 # macro
HEVC_ENC_CMD_END = 0x00000001 # macro
HEVC_ENC_CMD_FENCE = 0x00000003 # macro
HEVC_ENC_CMD_TRAP = 0x00000004 # macro
HEVC_ENC_CMD_IB_VM = 0x00000102 # macro
HEVC_ENC_CMD_REG_WRITE = 0x00000109 # macro
HEVC_ENC_CMD_REG_WAIT = 0x0000010a # macro
# Public API of this autogenerated module. Fixed: the generator emitted
# 'c_uint32' six times; duplicates removed (star-import semantics unchanged).
__all__ = \
    ['CACHE_FLUSH_AND_INV_TS_EVENT', 'CE_PARTITION_BASE',
    'CP_PACKET2', 'CP_PACKETJ_NOP', 'EOP_EXEC', 'EOP_TCL1_ACTION_EN',
    'EOP_TCL1_VOL_ACTION_EN', 'EOP_TC_ACTION_EN',
    'EOP_TC_MD_ACTION_EN', 'EOP_TC_NC_ACTION_EN',
    'EOP_TC_VOL_ACTION_EN', 'EOP_TC_WB_ACTION_EN',
    'F32_MES_PM4_PACKETS_H', 'FRAME_TMZ', 'GFX9_NUM_COMPUTE_RINGS',
    'GFX9_NUM_GFX_RINGS', 'HEVC_ENC_CMD_END', 'HEVC_ENC_CMD_FENCE',
    'HEVC_ENC_CMD_IB_VM', 'HEVC_ENC_CMD_NO_OP',
    'HEVC_ENC_CMD_REG_WAIT', 'HEVC_ENC_CMD_REG_WRITE',
    'HEVC_ENC_CMD_TRAP', 'INDIRECT_BUFFER_VALID', 'PACKET2_PAD_MASK',
    'PACKET2_PAD_SHIFT', 'PACKET3_ACQUIRE_MEM', 'PACKET3_ATOMIC_GDS',
    'PACKET3_ATOMIC_MEM', 'PACKET3_CLEAR_STATE', 'PACKET3_COND_EXEC',
    'PACKET3_COND_WRITE', 'PACKET3_CONTEXT_CONTROL',
    'PACKET3_COPY_DATA', 'PACKET3_DISPATCH_DIRECT',
    'PACKET3_DISPATCH_INDIRECT', 'PACKET3_DMA_DATA',
    'PACKET3_DMA_DATA_CMD_DAIC', 'PACKET3_DMA_DATA_CMD_DAS',
    'PACKET3_DMA_DATA_CMD_RAW_WAIT', 'PACKET3_DMA_DATA_CMD_SAIC',
    'PACKET3_DMA_DATA_CMD_SAS', 'PACKET3_DMA_DATA_CP_SYNC',
    'PACKET3_DRAW_INDEX_2', 'PACKET3_DRAW_INDEX_AUTO',
    'PACKET3_DRAW_INDEX_INDIRECT',
    'PACKET3_DRAW_INDEX_INDIRECT_MULTI',
    'PACKET3_DRAW_INDEX_MULTI_AUTO', 'PACKET3_DRAW_INDEX_OFFSET_2',
    'PACKET3_DRAW_INDIRECT', 'PACKET3_DRAW_INDIRECT_MULTI',
    'PACKET3_DRAW_PREAMBLE', 'PACKET3_DUMP_CONST_RAM',
    'PACKET3_EVENT_WRITE', 'PACKET3_FRAME_CONTROL',
    'PACKET3_INCREMENT_CE_COUNTER', 'PACKET3_INCREMENT_DE_COUNTER',
    'PACKET3_INDEX_BASE', 'PACKET3_INDEX_BUFFER_SIZE',
    'PACKET3_INDEX_TYPE', 'PACKET3_INDIRECT_BUFFER',
    'PACKET3_INDIRECT_BUFFER_CONST', 'PACKET3_INVALIDATE_TLBS',
    'PACKET3_LOAD_CONFIG_REG', 'PACKET3_LOAD_CONST_RAM',
    'PACKET3_LOAD_CONTEXT_REG', 'PACKET3_LOAD_SH_REG',
    'PACKET3_LOAD_UCONFIG_REG', 'PACKET3_MAP_QUEUES',
    'PACKET3_MEM_SEMAPHORE', 'PACKET3_NOP', 'PACKET3_NUM_INSTANCES',
    'PACKET3_OCCLUSION_QUERY', 'PACKET3_PFP_SYNC_ME',
    'PACKET3_PREAMBLE_BEGIN_CLEAR_STATE', 'PACKET3_PREAMBLE_CNTL',
    'PACKET3_PREAMBLE_END_CLEAR_STATE', 'PACKET3_PRED_EXEC',
    'PACKET3_QUERY_STATUS', 'PACKET3_REG_RMW', 'PACKET3_RELEASE_MEM',
    'PACKET3_REWIND', 'PACKET3_RUN_CLEANER_SHADER',
    'PACKET3_SCRATCH_RAM_READ', 'PACKET3_SCRATCH_RAM_WRITE',
    'PACKET3_SEM_SEL_SIGNAL', 'PACKET3_SEM_SEL_SIGNAL_TYPE',
    'PACKET3_SEM_SEL_WAIT', 'PACKET3_SEM_USE_MAILBOX',
    'PACKET3_SET_BASE', 'PACKET3_SET_CONFIG_REG',
    'PACKET3_SET_CONFIG_REG_END', 'PACKET3_SET_CONFIG_REG_START',
    'PACKET3_SET_CONTEXT_REG', 'PACKET3_SET_CONTEXT_REG_END',
    'PACKET3_SET_CONTEXT_REG_INDIRECT',
    'PACKET3_SET_CONTEXT_REG_START', 'PACKET3_SET_PREDICATION',
    'PACKET3_SET_QUEUE_REG', 'PACKET3_SET_RESOURCES',
    'PACKET3_SET_SH_REG', 'PACKET3_SET_SH_REG_END',
    'PACKET3_SET_SH_REG_OFFSET', 'PACKET3_SET_SH_REG_START',
    'PACKET3_SET_UCONFIG_REG', 'PACKET3_SET_UCONFIG_REG_END',
    'PACKET3_SET_UCONFIG_REG_INDEX_TYPE',
    'PACKET3_SET_UCONFIG_REG_START', 'PACKET3_STRMOUT_BUFFER_UPDATE',
    'PACKET3_SWITCH_BUFFER', 'PACKET3_UNMAP_QUEUES',
    'PACKET3_WAIT_ON_CE_COUNTER', 'PACKET3_WAIT_ON_DE_COUNTER_DIFF',
    'PACKET3_WAIT_REG_MEM', 'PACKET3_WRITE_CONST_RAM',
    'PACKET3_WRITE_DATA', 'PACKETJ_CONDITION_CHECK0',
    'PACKETJ_CONDITION_CHECK1', 'PACKETJ_CONDITION_CHECK2',
    'PACKETJ_CONDITION_CHECK3', 'PACKETJ_CONDITION_CHECK4',
    'PACKETJ_CONDITION_CHECK5', 'PACKETJ_CONDITION_CHECK6',
    'PACKETJ_CONDITION_CHECK7', 'PACKETJ_TYPE0', 'PACKETJ_TYPE1',
    'PACKETJ_TYPE2', 'PACKETJ_TYPE3', 'PACKETJ_TYPE4',
    'PACKETJ_TYPE5', 'PACKETJ_TYPE6', 'PACKETJ_TYPE7', 'PACKET_TYPE0',
    'PACKET_TYPE1', 'PACKET_TYPE2', 'PACKET_TYPE3',
    'PM4_MEC_RELEASE_MEM_DEFINED', 'PM4_MEC_WRITE_DATA_DEFINED',
    'PM4_MES_HEADER_DEFINED', 'SOC15_H', 'VCE_CMD_END',
    'VCE_CMD_FENCE', 'VCE_CMD_FLUSH_TLB', 'VCE_CMD_IB',
    'VCE_CMD_IB_AUTO', 'VCE_CMD_IB_VM', 'VCE_CMD_NO_OP',
    'VCE_CMD_REG_WAIT', 'VCE_CMD_REG_WRITE', 'VCE_CMD_SEMAPHORE',
    'VCE_CMD_TRAP', 'VCE_CMD_UPDATE_PTB', 'VCE_CMD_WAIT_GE',
    'WRITE_DATA_addr_incr_enum', 'WRITE_DATA_cache_policy_enum',
    'WRITE_DATA_dst_sel_enum', 'WRITE_DATA_wr_confirm_enum',
    'WR_CONFIRM', 'WR_ONE_ADDR',
    'addr_incr___write_data__do_not_increment_address',
    'addr_incr___write_data__increment_address',
    'c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT', 'c_uint32',
    'cache_policy___write_data__lru',
    'cache_policy___write_data__stream',
    'cache_policy__mec_release_mem__lru',
    'cache_policy__mec_release_mem__stream',
    'data_sel__mec_release_mem__none',
    'data_sel__mec_release_mem__send_32_bit_low',
    'data_sel__mec_release_mem__send_64_bit_data',
    'data_sel__mec_release_mem__send_cp_perfcounter_hi_lo',
    'data_sel__mec_release_mem__send_gpu_clock_counter',
    'data_sel__mec_release_mem__store_gds_data_to_memory',
    'dst_sel___write_data__gds',
    'dst_sel___write_data__mem_mapped_register',
    'dst_sel___write_data__memory',
    'dst_sel___write_data__memory_mapped_adc_persistent_state',
    'dst_sel___write_data__tc_l2',
    'dst_sel__mec_release_mem__memory_controller',
    'dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit',
    'dst_sel__mec_release_mem__queue_write_pointer_register',
    'dst_sel__mec_release_mem__tc_l2',
    'event_index__mec_release_mem__end_of_pipe',
    'event_index__mec_release_mem__shader_done', 'int32_t',
    'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare',
    'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare',
    'int_sel__mec_release_mem__none',
    'int_sel__mec_release_mem__send_data_after_write_confirm',
    'int_sel__mec_release_mem__send_interrupt_after_write_confirm',
    'int_sel__mec_release_mem__send_interrupt_only',
    'int_sel__mec_release_mem__unconditionally_send_int_ctxid',
    'pq_exe_status__mec_release_mem__default',
    'pq_exe_status__mec_release_mem__phase_update',
    'struct_PM4_MES_TYPE_3_HEADER_0', 'struct_pm4_mec_release_mem',
    'struct_pm4_mec_release_mem_1_bitfields2',
    'struct_pm4_mec_release_mem_2_bitfields3',
    'struct_pm4_mec_release_mem_3_bitfields4',
    'struct_pm4_mec_release_mem_3_bitfields4b',
    'struct_pm4_mec_release_mem_5_bitfields6c',
    'struct_pm4_mec_write_data_mmio',
    'struct_pm4_mec_write_data_mmio_1_bitfields2',
    'struct_pm4_mec_write_data_mmio_2_bitfields3', 'uint32_t',
    'union_PM4_MES_TYPE_3_HEADER', 'union_pm4_mec_release_mem_0',
    'union_pm4_mec_release_mem_1', 'union_pm4_mec_release_mem_2',
    'union_pm4_mec_release_mem_3', 'union_pm4_mec_release_mem_4',
    'union_pm4_mec_release_mem_5', 'union_pm4_mec_release_mem_6',
    'union_pm4_mec_write_data_mmio_0',
    'union_pm4_mec_write_data_mmio_1',
    'union_pm4_mec_write_data_mmio_2',
    'wr_confirm___write_data__do_not_wait_for_write_confirmation',
    'wr_confirm___write_data__wait_for_write_confirmation']

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -6,7 +6,7 @@ from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
from tinygrad.ops import sint
from tinygrad.device import Compiled, ProfileEvent, BufferSpec, CPUProgram, PROFILE
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, all_same, flatten, DEBUG, OSX
from tinygrad.renderer.cstyle import AMDRenderer
from tinygrad.renderer.llvmir import AMDLLVMRenderer
from tinygrad.runtime.autogen import kfd, hsa, libc, pci, vfio, sqtt
@@ -32,7 +32,7 @@ class AMDSignal(HCQSignal):
class AMDComputeQueue(HWQueue):
def __init__(self, dev:AMDDevice):
self.soc, self.pm4, self.gc, self.nbio = dev.soc, dev.pm4, dev.gc, dev.nbio
self.dev, self.soc, self.pm4, self.gc, self.nbio = dev, dev.soc, dev.pm4, dev.gc, dev.nbio
super().__init__()
def __del__(self):
@@ -44,6 +44,15 @@ class AMDComputeQueue(HWQueue):
def gfxreg(self, reg:AMDReg): return reg.addr - self.pm4.PACKET3_SET_SH_REG_START
def ucfgreg(self, reg:AMDReg): return reg.addr - self.pm4.PACKET3_SET_UCONFIG_REG_START
@contextlib.contextmanager
def pred_exec(self, xcc_mask:int):
if self.dev.xccs > 1:
self.pkt3(self.pm4.PACKET3_PRED_EXEC, xcc_mask << 24)
prev_len = len(self._q)
yield
if self.dev.xccs > 1:
self._q[prev_len-1] |= (len(self._q) - prev_len)
def sqtt_userdata(self, data, *extra_dwords):
data_ints = [x[0] for x in struct.iter_unpack('<I', bytes(data))] + list(extra_dwords)
for i in range(0, len(data_ints), 2):
@@ -56,33 +65,68 @@ class AMDComputeQueue(HWQueue):
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
def acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
cache_flags_dw = self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
if self.dev.gfxver >= 10:
cache_flags_dw = self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
else:
cp_coher_cntl = self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_ICACHE_ACTION_ENA(gli) | \
self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_ACTION_ENA(glk) | \
self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_ACTION_ENA(1) | \
self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_ACTION_ENA(1) | \
self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WB_ACTION_ENA(1)
self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, cp_coher_cntl, *data64_le(sz), *data64_le(addr), 0x0000000A)
def release_mem(self, address, value, data_sel, int_sel, ctxid=0, cache_flush=False):
cache_flags_dw = 0 if not cache_flush else (self.pm4.PACKET3_RELEASE_MEM_GCR_GLV_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL1_INV \
| self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_WB \
| self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_WB | self.pm4.PACKET3_RELEASE_MEM_GCR_SEQ)
def release_mem(self, address=0x0, value=0, data_sel=0, int_sel=2, ctxid=0, cache_flush=False):
if self.dev.gfxver >= 10:
cache_flags_dw = 0 if not cache_flush else (self.pm4.PACKET3_RELEASE_MEM_GCR_GLV_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL1_INV \
| self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_WB \
| self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_WB | self.pm4.PACKET3_RELEASE_MEM_GCR_SEQ)
event_dw = self.pm4.PACKET3_RELEASE_MEM_EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) \
| self.pm4.PACKET3_RELEASE_MEM_EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)
event_dw = self.pm4.PACKET3_RELEASE_MEM_EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) \
| self.pm4.PACKET3_RELEASE_MEM_EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)
memsel_dw = self.pm4.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | self.pm4.PACKET3_RELEASE_MEM_INT_SEL(int_sel) \
| self.pm4.PACKET3_RELEASE_MEM_DST_SEL(0)
memsel_dw = self.pm4.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | self.pm4.PACKET3_RELEASE_MEM_INT_SEL(int_sel) \
| self.pm4.PACKET3_RELEASE_MEM_DST_SEL(0)
else:
cache_flags_dw = 0 if not cache_flush else (self.pm4.EOP_TC_WB_ACTION_EN | self.pm4.EOP_TC_NC_ACTION_EN)
event_dw = self.pm4.EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) | self.pm4.EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)
memsel_dw = self.pm4.DATA_SEL(data_sel) | self.pm4.INT_SEL(int_sel)
ctxid = 0
self.pkt3(self.pm4.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
def xcc_barrier(self):
if self.dev.xcc_sync is None: return self
assert self.dev.xccs == 8, 'only 8 XCCs supported'
a, b = self.dev.xcc_sync
mem_eq = self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | self.pm4.WAIT_REG_MEM_MEM_SPACE(1)
self.pkt3(self.pm4.PACKET3_ATOMIC_MEM, self.soc.TC_OP_ATOMIC_ADD_RTN_32, *data64_le(a.value_addr), *data64_le(1), *data64_le(0), 10) # a += 1
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, mem_eq, *data64_le(a.value_addr), 0, 0b111, 10) # a == 0 (mod 8) via bitmask
self.pkt3(self.pm4.PACKET3_ATOMIC_MEM, self.soc.TC_OP_ATOMIC_ADD_RTN_32, *data64_le(b.value_addr), *data64_le(1), *data64_le(0), 10) # b += 1
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, mem_eq, *data64_le(b.value_addr), 0, 0b111, 10) # b == 0 (mod 8) via bitmask
return self
def memory_barrier(self):
self.wait_reg_mem(reg_req=self.nbio.regBIF_BX_PF0_GPU_HDP_FLUSH_REQ.addr, reg_done=self.nbio.regBIF_BX_PF0_GPU_HDP_FLUSH_DONE.addr,
value=0xffffffff)
self.acquire_mem()
return self
def xcc_config(self):
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_TG_CHUNK_SIZE), 1)
for xcc_id in range(self.dev.xccs):
with self.pred_exec(xcc_mask=1 << xcc_id):
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_CURRENT_LOGIC_XCC_ID), xcc_id)
return self
def spi_config(self, tracing:bool):
spi_config_cntl = self.gc.regSPI_CONFIG_CNTL.encode(ps_pkr_priority_cntl=3, exp_priority_order=3, gpr_write_priority=0x2c688,
enable_sqg_bop_events=int(tracing), enable_sqg_top_events=int(tracing))
@@ -167,6 +211,7 @@ class AMDComputeQueue(HWQueue):
self.acquire_mem(gli=0, gl2=0)
if prg.enable_private_segment_sgpr:
assert self.dev.xccs == 1, "Only architected flat scratch is suppored on multi-xcc"
scratch_hilo = data64_le(prg.dev.scratch.va_addr)
# sgpr word1 bit31 enables swizzle
# sgpr word3 = 0x14 << 12 | 2 << 28 | 2 << 21 | 1 << 23
@@ -198,42 +243,52 @@ class AMDComputeQueue(HWQueue):
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_RSRC3), 0)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_RSRC3), prg.rsrc3)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
if prg.dev.has_scratch_base_registers:
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8))
if prg.dev.target < 110000: self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.mmCP_COHER_START_DELAY), 0x20)
for xcc_id in range(self.dev.xccs):
with self.pred_exec(xcc_mask=1<<xcc_id):
scratch_base = prg.dev.scratch.va_addr + (prg.dev.scratch.size // self.dev.xccs * xcc_id)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(scratch_base >> 8))
if 100000 <= prg.dev.target < 110000: self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.mmCP_COHER_START_DELAY), 0x20)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_RESTART_X), 0, 0, 0)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xFFFFFFFF, 0xFFFFFFFF)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xFFFFFFFF, 0xFFFFFFFF)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
if prg.dev.target >= 100000:
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_USER_DATA_0), *user_regs)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_RESOURCE_LIMITS), 0)
DISPATCH_INITIATOR = self.gc.regCOMPUTE_DISPATCH_INITIATOR.encode(cs_w32_en=1, force_start_at_000=1, compute_shader_en=1)
gfx10p = {'cs_w32_en': int(prg.wave32)} if prg.dev.target >= 100000 else {}
DISPATCH_INITIATOR = self.gc.regCOMPUTE_DISPATCH_INITIATOR.encode(**gfx10p, force_start_at_000=1, compute_shader_en=1)
self.pkt3(self.pm4.PACKET3_DISPATCH_DIRECT, *global_size, DISPATCH_INITIATOR)
if prg.dev.sqtt_enabled: self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_MARKER) | self.pm4.EVENT_INDEX(0))
self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.CS_PARTIAL_FLUSH) | self.pm4.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
if self.dev.xccs > 1: self.release_mem(cache_flush=True)
self.xcc_barrier()
return self
def wait(self, signal:AMDSignal, value:sint=0):
self.wait_reg_mem(mem=signal.value_addr, value=value, mask=0xffffffff)
self.xcc_barrier()
return self
def timestamp(self, signal:AMDSignal):
self.release_mem(signal.timestamp_addr, 0, self.pm4.data_sel__mec_release_mem__send_gpu_clock_counter, self.pm4.int_sel__mec_release_mem__none)
with self.pred_exec(xcc_mask=0b1):
self.release_mem(signal.timestamp_addr, 0, self.pm4.data_sel__mec_release_mem__send_gpu_clock_counter, self.pm4.int_sel__mec_release_mem__none)
return self
def signal(self, signal:AMDSignal, value:sint=0):
# NOTE: this needs an EOP buffer on the queue or it will NULL pointer
self.release_mem(signal.value_addr, value, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
with self.pred_exec(xcc_mask=0b1):
# NOTE: this needs an EOP buffer on the queue or it will NULL pointer
self.release_mem(signal.value_addr, value, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
return self
def bind(self, dev:AMDDevice):
@@ -249,6 +304,13 @@ class AMDComputeQueue(HWQueue):
def _submit(self, dev:AMDDevice):
cmds = self.indirect_cmd if dev == self.binded_device else self._q
# WORKAROUND: PACKET3_PRED_EXEC doesn't work in rings, only in IBs, create a fake IB inside a ring to work around that
if self.dev.xccs > 1 and dev != self.binded_device:
ib_end = ((dev.compute_queue.put_value + 5) % len(dev.compute_queue.ring)) + len(cmds)
ib_pad = len(dev.compute_queue.ring) - (ib_end - len(cmds)) if ib_end > len(dev.compute_queue.ring) else 0
ib_ptr = mv_address(dev.compute_queue.ring) + ((dev.compute_queue.put_value + 5 + ib_pad) % len(dev.compute_queue.ring)) * 4
cmds = [self.pm4.PACKET3(self.pm4.PACKET3_INDIRECT_BUFFER, 2), *data64_le(ib_ptr), len(cmds) | self.pm4.INDIRECT_BUFFER_VALID,
self.pm4.PACKET3(self.pm4.PACKET3_NOP, ib_pad + len(cmds) - 1), *((0,) * ib_pad), *cmds]
for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value
@@ -257,7 +319,7 @@ class AMDComputeQueue(HWQueue):
class AMDCopyQueue(HWQueue):
def __init__(self, dev, max_copy_size=0x40000000):
self.sdma, self.internal_cmd_sizes, self.max_copy_size = dev.sdma, [], max_copy_size
self.dev, self.sdma, self.internal_cmd_sizes, self.max_copy_size = dev, dev.sdma, [], max_copy_size
super().__init__()
def q(self, *arr):
@@ -277,10 +339,12 @@ class AMDCopyQueue(HWQueue):
return self
def signal(self, signal:AMDSignal, value:sint=0):
self.q(self.sdma.SDMA_OP_FENCE | self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value)
fence_flags = self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3) if self.dev.gfxver >= 10 else 0
self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(signal.value_addr), value)
self.q(self.sdma.SDMA_OP_FENCE, *data64_le(signal.value_addr), value)
if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
self.q(self.sdma.SDMA_OP_FENCE | self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
elif AMDDevice.driverless: self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))
@@ -310,7 +374,7 @@ class AMDCopyQueue(HWQueue):
self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]
def _submit(self, dev:AMDDevice):
if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
if self.binded_device == dev:
# An IB packet must end on a 8 DW boundary.
@@ -361,11 +425,12 @@ class AMDProgram(HCQProgram):
self.dev._ensure_has_local_memory(self.private_segment_size)
code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + rodata_entry) # NOTE: this is wrong, it's not this object
assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
self.wave32: bool = code.kernel_code_properties & 0x400 == 0x400
# Set rsrc1.priv=1 on gfx11 to workaround cwsr.
self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.dev.target < 120000 else 0)
self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15)
self.rsrc3: int = image[rodata_entry+44:rodata_entry+48].cast("I")[0] # NOTE: kernel descriptor, not in amd_kernel_code_t struct
self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + code.kernel_code_entry_byte_offset
if code.kernel_code_entry_byte_offset == 0: self.prog_addr = self.lib_gpu.va_addr + text_entry
# Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
@@ -400,20 +465,29 @@ class ProfileSQTTEvent(ProfileEvent): device:str; se:int; blob:bytes; itrace:boo
@dataclass
class AMDQueueDesc:
ring: memoryview
read_ptr: memoryview
write_ptr: memoryview
doorbell: memoryview
read_ptrs: list[memoryview]
write_ptrs: list[memoryview]
doorbells: list[memoryview]
put_value: int = 0
@property
def read_ptr(self): return min(p[0] for p in self.read_ptrs)
@classmethod
def multi(cls, *queues: AMDQueueDesc):
assert all_same([(mv_address(q.ring), q.put_value) for q in queues]), f"All queues must have the same ring and put_value: {queues}"
return cls(ring=queues[0].ring, put_value=queues[0].put_value, doorbells=flatten(q.doorbells for q in queues),
read_ptrs=flatten(q.read_ptrs for q in queues), write_ptrs=flatten(q.write_ptrs for q in queues))
def signal_doorbell(self, dev):
self.write_ptr[0] = self.put_value
for write_ptr in self.write_ptrs: write_ptr[0] = self.put_value
# Ensure all prior writes are visible to the GPU.
if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
# Flush hdp if queue is in dev mem.
if dev.driverless and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp()
self.doorbell[0] = self.put_value
for doorbell in self.doorbells: doorbell[0] = self.put_value
@dataclass(frozen=True)
class AMDReg(AMDRegBase):
@@ -530,21 +604,20 @@ class KFDIface:
n_devices=len(mem.meta.mapped_gpu_ids))
assert stm.n_success == len(mem.meta.mapped_gpu_ids)
def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
cwsr_ctx = self.alloc(round_up(ctx_save_restore_size + debug_memory_size, mmap.PAGESIZE)) if ctx_save_restore_size else None
def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
queue = kfd.AMDKFD_IOC_CREATE_QUEUE(KFDIface.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE|(xcc_id<<8), queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,
ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=ctx_save_restore_size,
write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
ctx_save_restore_address=cwsr_buffer.va_addr if cwsr_buffer else 0, ctx_save_restore_size=ctx_save_restore_size,
write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8 * (xcc_id + 1))
if not hasattr(self, 'doorbells'):
self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
self.doorbells = cast(HWInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)
return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"),
read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
read_ptrs=[to_mv(queue.read_pointer_address, 8).cast("Q")], write_ptrs=[to_mv(queue.write_pointer_address, 8).cast("Q")],
doorbells=[to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q")])
def sleep(self, tm:int): kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=self.queue_event_arr_ptr, num_events=1, wait_for_all=1, timeout=tm)
@@ -680,7 +753,7 @@ class PCIIface:
paddrs = [(paddr if mem.meta.mapping.system else (paddr+mem.meta.owner.dev_iface.bar_info[0][0]), size) for paddr,size in mem.meta.mapping.paddrs]
self.adev.mm.map_range(mem.va_addr, mem.size, paddrs, system=True, snooped=mem.meta.mapping.snooped, uncached=mem.meta.mapping.uncached)
def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
self.adev.sdma.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0), pipe=0, queue=0)
@@ -688,9 +761,8 @@ class PCIIface:
self.adev.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)
return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"), doorbell=to_mv(self.doorbell_cpu_addr + doorbell_index * 8, 8).cast("Q"),
read_ptr=to_mv(gart.va_addr, 8).cast("Q"), write_ptr=to_mv(gart.va_addr+0x10, 8).cast("Q"))
return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"), doorbells=[to_mv(self.doorbell_cpu_addr + doorbell_index * 8, 8).cast("Q")],
read_ptrs=[to_mv(gart.va_addr, 8).cast("Q")], write_ptrs=[to_mv(gart.va_addr+0x10, 8).cast("Q")])
def sleep(self, timeout):
if PCIIface.vfio and (events_cnt:=len(self.irq_poller.poll(timeout))):
self.irq_fd.read(8 * events_cnt)
@@ -713,41 +785,52 @@ class AMDDevice(HCQCompiled):
self.device_id = int(device.split(":")[1]) if ":" in device else 0
self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id)
self.target = int(self.dev_iface.props['gfx_target_version'])
self.gfxver = self.target // 10000
self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
if self.target < 100300 or self.target >= 130000: raise RuntimeError(f"Unsupported arch: {self.arch}")
if self.target < 90402 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
if DEBUG >= 1: print(f"AMDDevice: opening {self.device_id} with target {self.target} arch {self.arch}")
self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1
self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1
self.has_scratch_base_registers = self.target >= 110000
self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] // self.dev_iface.props.get('num_xcc', 1) - 1
self.max_wave_id = (self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1) if self.target >= 100100 else \
(min((self.max_cu_id+1)*40, self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine'] * 512) - 1)
self.xccs = self.dev_iface.props.get('num_xcc', 1) if getenv("XCCS", 1) else 1
self.has_scratch_base_registers = self.target >= 110000 or self.target == 90402 # this is what llvm refers to as "architected flat scratch"
# https://gitlab.freedesktop.org/agd5f/linux/-/blob/a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8/drivers/gpu/drm/amd/amdkfd/kfd_queue.c#L391
sgrp_size_per_cu, lds_size_per_cu, hwreg_size_per_cu = 0x4000, 0x10000, 0x1000
vgpr_size_per_cu = 0x60000 if self.target in {110000, 110001, 120000, 120001} else 0x40000
vgpr_size_per_cu = 0x60000 if self.target in {110000, 110001, 120000, 120001} else \
0x80000 if (self.target//100)*100 == 90400 or self.target in {90008, 90010} else 0x40000
wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (self.max_cu_id + 1), mmap.PAGESIZE)
ctl_stack_size = round_up(12 * (self.max_cu_id + 1) * (self.max_wave_id + 1) + 8 + 40, mmap.PAGESIZE)
if self.target//10000 == 10: ctl_stack_size = min(ctl_stack_size, 0x7000)
debug_memory_size = round_up((self.max_cu_id + 1) * (self.max_wave_id + 1) * 32, 64)
ctl_stack_size = round_up(12 * (self.max_cu_id + 1) * (self.max_wave_id + 1) + 8 + 40, mmap.PAGESIZE) if self.target >= 100100 else \
round_up((self.max_wave_id + 1) * 8 + 8 + 40, mmap.PAGESIZE)
debug_memory_size = round_up((self.max_cu_id + 1 if self.target >= 100100 else 1) * (self.max_wave_id + 1) * 32, 64)
if self.gfxver == 10: ctl_stack_size = min(ctl_stack_size, 0x7000)
self.soc = importlib.import_module(f"tinygrad.runtime.autogen.am.{({10: 'navi10', 11: 'soc21', 12: 'soc24'}[self.target//10000])}")
self.pm4 = importlib.import_module("tinygrad.runtime.autogen.am.pm4_nv")
self.soc = importlib.import_module(f"tinygrad.runtime.autogen.am.{({9: 'vega10', 10: 'navi10', 11: 'soc21', 12: 'soc24'}[self.gfxver])}")
self.pm4 = importlib.import_module(f"tinygrad.runtime.autogen.am.pm4_{'nv' if self.gfxver >= 10 else 'soc15'}")
self.sdma = import_module('sdma', self.dev_iface.ip_versions[am.SDMA0_HWIP])
self.gc = AMDIP('gc', self.dev_iface.ip_versions[am.GC_HWIP], self.dev_iface.ip_offsets[am.GC_HWIP])
self.nbio = AMDIP('nbio' if self.target < 120000 else 'nbif', self.dev_iface.ip_versions[am.NBIF_HWIP], self.dev_iface.ip_offsets[am.NBIF_HWIP])
pad = (0,) if self.gfxver == 9 else () # ?!?!?!?!??!?!?!
self.nbio = AMDIP('nbio' if self.gfxver < 12 else 'nbif', self.dev_iface.ip_versions[am.NBIF_HWIP], pad+self.dev_iface.ip_offsets[am.NBIF_HWIP])
self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x800000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
max_copy_size = 0x40000000 if self.dev_iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x800000)
super().__init__(device, AMDAllocator(self), AMDLLVMRenderer() if getenv("AMD_LLVM", 0) else AMDRenderer(self.arch),
AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 0) else HIPCompiler(self.arch), functools.partial(AMDProgram, self),
AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self))
AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size))
# Scratch setup
self.max_private_segment_size = 0
self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread
# XCC setup
self.xcc_sync: tuple[AMDSignal, AMDSignal]|None = (AMDSignal(), AMDSignal()) if self.xccs > 1 else None
if self.xccs > 1: AMDComputeQueue(self).xcc_config().submit(self)
# SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them)
self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
if self.sqtt_enabled:
@@ -767,8 +850,11 @@ class AMDDevice(HCQCompiled):
ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
eop_buffer = self.dev_iface.alloc(eop_buffer_size) if eop_buffer_size else None
return self.dev_iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, debug_memory_size=debug_memory_size,
ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)
cwsr_buffer_size = round_up((ctx_save_restore_size + debug_memory_size) * self.dev_iface.props.get('num_xcc', 1), mmap.PAGESIZE)
return AMDQueueDesc.multi(*(self.dev_iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, xcc_id=xcc_id,
ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size,
cwsr_buffer=(self.dev_iface.alloc(cwsr_buffer_size) if ctx_save_restore_size else None))
for xcc_id in range(self.xccs if queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE else 1)))
def _ensure_has_local_memory(self, required):
if self.max_private_segment_size >= required: return
@@ -776,12 +862,13 @@ class AMDDevice(HCQCompiled):
# <gfx103 requires alignment of 1024, >=gfx11 requires 256
wave_scratch_len = round_up(((self.max_wave_id + 1) * required), 256 if self.target >= 110000 else 1024)
self.scratch, ok = self._realloc(getattr(self, 'scratch', None), (self.max_cu_id+1)*self.dev_iface.props['max_slots_scratch_cu']*wave_scratch_len)
scratch_size = (self.max_cu_id+1)*self.dev_iface.props['max_slots_scratch_cu']*wave_scratch_len # per xcc
self.scratch, ok = self._realloc(getattr(self, 'scratch', None), scratch_size*self.xccs)
if ok:
engines = self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine']
waves = wave_scratch_len // (256 if self.target >= 110000 else 1024)
# >=gfx11 wavesize is per SE
wavesize = self.scratch.size // ((wave_scratch_len * engines) if self.target >= 110000 else wave_scratch_len)
wavesize = scratch_size // ((wave_scratch_len * engines) if self.target >= 110000 else wave_scratch_len)
self.tmpring_size = waves << 12 | wavesize
self.max_private_segment_size = required