Mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-08 22:48:25 -05:00)
MI300X support (WIP) (#9585)
@@ -308,12 +308,23 @@ generate_am() {
  sed -i "s\(int64_t)\ \g" $BASE/am/am.py
  sed -i "s\AMDGPU_PTE_MTYPE_VG10(2)\AMDGPU_PTE_MTYPE_VG10(0, 2)\g" $BASE/am/am.py # incorrect parsing (TODO: remove when clang2py is gone).

  clang2py -k cdefstum \
    extra/hip_gpu_driver/kfd_pm4_headers_ai.h \
    extra/hip_gpu_driver/soc15d.h \
    -o $BASE/am/pm4_soc15.py
  fixup $BASE/am/pm4_soc15.py

  clang2py -k cdefstum \
    extra/hip_gpu_driver/kfd_pm4_headers_ai.h \
    extra/hip_gpu_driver/nvd.h \
    -o $BASE/am/pm4_nv.py
  fixup $BASE/am/pm4_nv.py

  clang2py -k cdefstum \
    extra/amdpci/headers/vega10_enum.h \
    -o $BASE/am/vega10.py
  fixup $BASE/am/vega10.py

  clang2py -k cdefstum \
    extra/amdpci/headers/navi10_enum.h \
    -o $BASE/am/navi10.py
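The second sed above works around clang2py dropping the first argument of the two-argument AMDGPU_PTE_MTYPE_VG10() macro; the surrounding expression in the generated am.py is not shown here, but the textual rewrite itself is just:

# before the sed (clang2py mis-parses the macro as single-argument)
AMDGPU_PTE_MTYPE_VG10(2)
# after the sed (two-argument form, matching the kernel header)
AMDGPU_PTE_MTYPE_VG10(0, 2)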
@@ -341,6 +352,13 @@ generate_am() {
    -o $BASE/am/mp_11_0.py
  fixup $BASE/am/mp_11_0.py

  clang2py -k cdefstum \
    extra/amdpci/headers/gc_9_4_3_offset.h \
    extra/amdpci/headers/gc_9_4_3_sh_mask.h \
    extra/amdpci/overlay/gc_9_4_3.h \
    -o $BASE/am/gc_9_4_3.py
  fixup $BASE/am/gc_9_4_3.py

  clang2py -k cdefstum \
    extra/amdpci/headers/gc_10_3_0_offset.h \
    extra/amdpci/headers/gc_10_3_0_sh_mask.h \
@@ -359,6 +377,13 @@ generate_am() {
    -o $BASE/am/gc_12_0_0.py
  fixup $BASE/am/gc_12_0_0.py

  clang2py -k cdefstum \
    extra/hip_gpu_driver/sdma_registers.h \
    extra/hip_gpu_driver/vega10_sdma_pkt_open.h \
    --clang-args="-I/opt/rocm/include -x c++" \
    -o $BASE/am/sdma_4_0_0.py
  fixup $BASE/am/sdma_4_0_0.py

  clang2py -k cdefstum \
    extra/hip_gpu_driver/sdma_registers.h \
    extra/hip_gpu_driver/navi10_sdma_pkt_open.h \
@@ -403,6 +428,12 @@ generate_am() {
    -o $BASE/am/nbif_6_3_1.py
  fixup $BASE/am/nbif_6_3_1.py

  clang2py -k cdefstum \
    extra/amdpci/headers/nbio_7_9_0_offset.h \
    extra/amdpci/headers/nbio_7_9_0_sh_mask.h \
    -o $BASE/am/nbio_7_9_0.py
  fixup $BASE/am/nbio_7_9_0.py

  clang2py -k cdefstum \
    extra/amdpci/headers/osssys_6_0_0_offset.h \
    extra/amdpci/headers/osssys_6_0_0_sh_mask.h \
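The new clang2py invocations land their output under tinygrad/runtime/autogen/am/. A minimal usage sketch, assuming only that clang2py exports the header macros as module-level names (as it does in the generated pm4_soc15.py shown further down):

# minimal sketch; the constants come straight from the headers added in this commit
from tinygrad.runtime.autogen.am import pm4_soc15, gc_9_4_3
print(hex(pm4_soc15.PACKET3_RELEASE_MEM))             # 0x49, from soc15d.h
print(hex(gc_9_4_3.regCOMPUTE_CURRENT_LOGIC_XCC_ID))  # 0x0e25, from the gc_9_4_3 overlay below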
7450   extra/amdpci/headers/gc_9_4_3_offset.h     (new file, diff suppressed: too large)
31647  extra/amdpci/headers/gc_9_4_3_sh_mask.h    (new file, diff suppressed: too large)
10004  extra/amdpci/headers/nbio_7_9_0_offset.h   (new file, diff suppressed: too large)
38900  extra/amdpci/headers/nbio_7_9_0_sh_mask.h  (new file, diff suppressed: too large)
22532  extra/amdpci/headers/vega10_enum.h         (new file, diff suppressed: too large)
7      extra/amdpci/overlay/gc_9_4_3.h            (new file)
@@ -0,0 +1,7 @@
// From MQD struct
#define regCOMPUTE_CURRENT_LOGIC_XCC_ID 0x0e25
#define regCOMPUTE_CURRENT_LOGIC_XCC_ID_BASE_IDX 0
// Mask is probably not full register, doesn't matter though
#define COMPUTE_CURRENT_LOGIC_XCC_ID__CURRENT_LOGIC_XCC_ID__SHIFT 0x0
#define COMPUTE_CURRENT_LOGIC_XCC_ID__CURRENT_LOGIC_XCC_ID_MASK 0xFFFFFFFFL

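The SHIFT/MASK pair in the overlay follows the usual sh_mask.h convention, so reading the field back out of a register value is the standard (value & MASK) >> SHIFT. A hedged sketch with the generated constants (the MMIO read itself is out of scope here):

# hedged sketch: standard sh_mask.h field extraction
from tinygrad.runtime.autogen.am import gc_9_4_3
def current_logic_xcc_id(reg_val:int) -> int:
  return (reg_val & gc_9_4_3.COMPUTE_CURRENT_LOGIC_XCC_ID__CURRENT_LOGIC_XCC_ID_MASK) >> \
         gc_9_4_3.COMPUTE_CURRENT_LOGIC_XCC_ID__CURRENT_LOGIC_XCC_ID__SHIFT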
444    extra/hip_gpu_driver/soc15d.h              (new file)
@@ -0,0 +1,444 @@
|
||||
/*
|
||||
* Copyright 2014 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
#ifndef SOC15_H
|
||||
#define SOC15_H
|
||||
|
||||
#define GFX9_NUM_GFX_RINGS 1
|
||||
#define GFX9_NUM_COMPUTE_RINGS 8
|
||||
|
||||
/*
|
||||
* PM4
|
||||
*/
|
||||
#define PACKET_TYPE0 0
|
||||
#define PACKET_TYPE1 1
|
||||
#define PACKET_TYPE2 2
|
||||
#define PACKET_TYPE3 3
|
||||
|
||||
#define CP_PACKET_GET_TYPE(h) (((h) >> 30) & 3)
|
||||
#define CP_PACKET_GET_COUNT(h) (((h) >> 16) & 0x3FFF)
|
||||
#define CP_PACKET0_GET_REG(h) ((h) & 0xFFFF)
|
||||
#define CP_PACKET3_GET_OPCODE(h) (((h) >> 8) & 0xFF)
|
||||
#define PACKET0(reg, n) ((PACKET_TYPE0 << 30) | \
|
||||
((reg) & 0xFFFF) | \
|
||||
((n) & 0x3FFF) << 16)
|
||||
#define CP_PACKET2 0x80000000
|
||||
#define PACKET2_PAD_SHIFT 0
|
||||
#define PACKET2_PAD_MASK (0x3fffffff << 0)
|
||||
|
||||
#define PACKET2(v) (CP_PACKET2 | REG_SET(PACKET2_PAD, (v)))
|
||||
|
||||
#define PACKET3(op, n) ((PACKET_TYPE3 << 30) | \
|
||||
(((op) & 0xFF) << 8) | \
|
||||
((n) & 0x3FFF) << 16)
|
||||
|
||||
#define PACKET3_COMPUTE(op, n) (PACKET3(op, n) | 1 << 1)
|
||||
|
||||
#define PACKETJ_CONDITION_CHECK0 0
|
||||
#define PACKETJ_CONDITION_CHECK1 1
|
||||
#define PACKETJ_CONDITION_CHECK2 2
|
||||
#define PACKETJ_CONDITION_CHECK3 3
|
||||
#define PACKETJ_CONDITION_CHECK4 4
|
||||
#define PACKETJ_CONDITION_CHECK5 5
|
||||
#define PACKETJ_CONDITION_CHECK6 6
|
||||
#define PACKETJ_CONDITION_CHECK7 7
|
||||
|
||||
#define PACKETJ_TYPE0 0
|
||||
#define PACKETJ_TYPE1 1
|
||||
#define PACKETJ_TYPE2 2
|
||||
#define PACKETJ_TYPE3 3
|
||||
#define PACKETJ_TYPE4 4
|
||||
#define PACKETJ_TYPE5 5
|
||||
#define PACKETJ_TYPE6 6
|
||||
#define PACKETJ_TYPE7 7
|
||||
|
||||
#define PACKETJ(reg, r, cond, type) ((reg & 0x3FFFF) | \
|
||||
((r & 0x3F) << 18) | \
|
||||
((cond & 0xF) << 24) | \
|
||||
((type & 0xF) << 28))
|
||||
|
||||
#define CP_PACKETJ_NOP 0x60000000
|
||||
#define CP_PACKETJ_GET_REG(x) ((x) & 0x3FFFF)
|
||||
#define CP_PACKETJ_GET_RES(x) (((x) >> 18) & 0x3F)
|
||||
#define CP_PACKETJ_GET_COND(x) (((x) >> 24) & 0xF)
|
||||
#define CP_PACKETJ_GET_TYPE(x) (((x) >> 28) & 0xF)
|
||||
|
||||
/* Packet 3 types */
|
||||
#define PACKET3_NOP 0x10
|
||||
#define PACKET3_SET_BASE 0x11
|
||||
#define PACKET3_BASE_INDEX(x) ((x) << 0)
|
||||
#define CE_PARTITION_BASE 3
|
||||
#define PACKET3_CLEAR_STATE 0x12
|
||||
#define PACKET3_INDEX_BUFFER_SIZE 0x13
|
||||
#define PACKET3_DISPATCH_DIRECT 0x15
|
||||
#define PACKET3_DISPATCH_INDIRECT 0x16
|
||||
#define PACKET3_ATOMIC_GDS 0x1D
|
||||
#define PACKET3_ATOMIC_MEM 0x1E
|
||||
#define PACKET3_OCCLUSION_QUERY 0x1F
|
||||
#define PACKET3_SET_PREDICATION 0x20
|
||||
#define PACKET3_REG_RMW 0x21
|
||||
#define PACKET3_COND_EXEC 0x22
|
||||
#define PACKET3_PRED_EXEC 0x23
|
||||
#define PACKET3_DRAW_INDIRECT 0x24
|
||||
#define PACKET3_DRAW_INDEX_INDIRECT 0x25
|
||||
#define PACKET3_INDEX_BASE 0x26
|
||||
#define PACKET3_DRAW_INDEX_2 0x27
|
||||
#define PACKET3_CONTEXT_CONTROL 0x28
|
||||
#define PACKET3_INDEX_TYPE 0x2A
|
||||
#define PACKET3_DRAW_INDIRECT_MULTI 0x2C
|
||||
#define PACKET3_DRAW_INDEX_AUTO 0x2D
|
||||
#define PACKET3_NUM_INSTANCES 0x2F
|
||||
#define PACKET3_DRAW_INDEX_MULTI_AUTO 0x30
|
||||
#define PACKET3_INDIRECT_BUFFER_CONST 0x33
|
||||
#define PACKET3_STRMOUT_BUFFER_UPDATE 0x34
|
||||
#define PACKET3_DRAW_INDEX_OFFSET_2 0x35
|
||||
#define PACKET3_DRAW_PREAMBLE 0x36
|
||||
#define PACKET3_WRITE_DATA 0x37
|
||||
#define WRITE_DATA_DST_SEL(x) ((x) << 8)
|
||||
/* 0 - register
|
||||
* 1 - memory (sync - via GRBM)
|
||||
* 2 - gl2
|
||||
* 3 - gds
|
||||
* 4 - reserved
|
||||
* 5 - memory (async - direct)
|
||||
*/
|
||||
#define WR_ONE_ADDR (1 << 16)
|
||||
#define WR_CONFIRM (1 << 20)
|
||||
#define WRITE_DATA_CACHE_POLICY(x) ((x) << 25)
|
||||
/* 0 - LRU
|
||||
* 1 - Stream
|
||||
*/
|
||||
#define WRITE_DATA_ENGINE_SEL(x) ((x) << 30)
|
||||
/* 0 - me
|
||||
* 1 - pfp
|
||||
* 2 - ce
|
||||
*/
|
||||
#define PACKET3_DRAW_INDEX_INDIRECT_MULTI 0x38
|
||||
#define PACKET3_MEM_SEMAPHORE 0x39
|
||||
# define PACKET3_SEM_USE_MAILBOX (0x1 << 16)
|
||||
# define PACKET3_SEM_SEL_SIGNAL_TYPE (0x1 << 20) /* 0 = increment, 1 = write 1 */
|
||||
# define PACKET3_SEM_SEL_SIGNAL (0x6 << 29)
|
||||
# define PACKET3_SEM_SEL_WAIT (0x7 << 29)
|
||||
#define PACKET3_WAIT_REG_MEM 0x3C
|
||||
#define WAIT_REG_MEM_FUNCTION(x) ((x) << 0)
|
||||
/* 0 - always
|
||||
* 1 - <
|
||||
* 2 - <=
|
||||
* 3 - ==
|
||||
* 4 - !=
|
||||
* 5 - >=
|
||||
* 6 - >
|
||||
*/
|
||||
#define WAIT_REG_MEM_MEM_SPACE(x) ((x) << 4)
|
||||
/* 0 - reg
|
||||
* 1 - mem
|
||||
*/
|
||||
#define WAIT_REG_MEM_OPERATION(x) ((x) << 6)
|
||||
/* 0 - wait_reg_mem
|
||||
* 1 - wr_wait_wr_reg
|
||||
*/
|
||||
#define WAIT_REG_MEM_ENGINE(x) ((x) << 8)
|
||||
/* 0 - me
|
||||
* 1 - pfp
|
||||
*/
|
||||
#define PACKET3_INDIRECT_BUFFER 0x3F
|
||||
#define INDIRECT_BUFFER_VALID (1 << 23)
|
||||
#define INDIRECT_BUFFER_CACHE_POLICY(x) ((x) << 28)
|
||||
/* 0 - LRU
|
||||
* 1 - Stream
|
||||
* 2 - Bypass
|
||||
*/
|
||||
#define INDIRECT_BUFFER_PRE_ENB(x) ((x) << 21)
|
||||
#define INDIRECT_BUFFER_PRE_RESUME(x) ((x) << 30)
|
||||
#define PACKET3_COPY_DATA 0x40
|
||||
#define PACKET3_PFP_SYNC_ME 0x42
|
||||
#define PACKET3_COND_WRITE 0x45
|
||||
#define PACKET3_EVENT_WRITE 0x46
|
||||
#define EVENT_TYPE(x) ((x) << 0)
|
||||
#define EVENT_INDEX(x) ((x) << 8)
|
||||
/* 0 - any non-TS event
|
||||
* 1 - ZPASS_DONE, PIXEL_PIPE_STAT_*
|
||||
* 2 - SAMPLE_PIPELINESTAT
|
||||
* 3 - SAMPLE_STREAMOUTSTAT*
|
||||
* 4 - *S_PARTIAL_FLUSH
|
||||
*/
|
||||
#define PACKET3_RELEASE_MEM 0x49
|
||||
#define EVENT_TYPE(x) ((x) << 0)
|
||||
#define EVENT_INDEX(x) ((x) << 8)
|
||||
#define EOP_TCL1_VOL_ACTION_EN (1 << 12)
|
||||
#define EOP_TC_VOL_ACTION_EN (1 << 13) /* L2 */
|
||||
#define EOP_TC_WB_ACTION_EN (1 << 15) /* L2 */
|
||||
#define EOP_TCL1_ACTION_EN (1 << 16)
|
||||
#define EOP_TC_ACTION_EN (1 << 17) /* L2 */
|
||||
#define EOP_TC_NC_ACTION_EN (1 << 19)
|
||||
#define EOP_TC_MD_ACTION_EN (1 << 21) /* L2 metadata */
|
||||
#define EOP_EXEC (1 << 28) /* For Trailing Fence */
|
||||
|
||||
#define DATA_SEL(x) ((x) << 29)
|
||||
/* 0 - discard
|
||||
* 1 - send low 32bit data
|
||||
* 2 - send 64bit data
|
||||
* 3 - send 64bit GPU counter value
|
||||
* 4 - send 64bit sys counter value
|
||||
*/
|
||||
#define INT_SEL(x) ((x) << 24)
|
||||
/* 0 - none
|
||||
* 1 - interrupt only (DATA_SEL = 0)
|
||||
* 2 - interrupt when data write is confirmed
|
||||
*/
|
||||
#define DST_SEL(x) ((x) << 16)
|
||||
/* 0 - MC
|
||||
* 1 - TC/L2
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#define PACKET3_PREAMBLE_CNTL 0x4A
|
||||
# define PACKET3_PREAMBLE_BEGIN_CLEAR_STATE (2 << 28)
|
||||
# define PACKET3_PREAMBLE_END_CLEAR_STATE (3 << 28)
|
||||
#define PACKET3_DMA_DATA 0x50
|
||||
/* 1. header
|
||||
* 2. CONTROL
|
||||
* 3. SRC_ADDR_LO or DATA [31:0]
|
||||
* 4. SRC_ADDR_HI [31:0]
|
||||
* 5. DST_ADDR_LO [31:0]
|
||||
* 6. DST_ADDR_HI [7:0]
|
||||
* 7. COMMAND [30:21] | BYTE_COUNT [20:0]
|
||||
*/
|
||||
/* CONTROL */
|
||||
# define PACKET3_DMA_DATA_ENGINE(x) ((x) << 0)
|
||||
/* 0 - ME
|
||||
* 1 - PFP
|
||||
*/
|
||||
# define PACKET3_DMA_DATA_SRC_CACHE_POLICY(x) ((x) << 13)
|
||||
/* 0 - LRU
|
||||
* 1 - Stream
|
||||
*/
|
||||
# define PACKET3_DMA_DATA_DST_SEL(x) ((x) << 20)
|
||||
/* 0 - DST_ADDR using DAS
|
||||
* 1 - GDS
|
||||
* 3 - DST_ADDR using L2
|
||||
*/
|
||||
# define PACKET3_DMA_DATA_DST_CACHE_POLICY(x) ((x) << 25)
|
||||
/* 0 - LRU
|
||||
* 1 - Stream
|
||||
*/
|
||||
# define PACKET3_DMA_DATA_SRC_SEL(x) ((x) << 29)
|
||||
/* 0 - SRC_ADDR using SAS
|
||||
* 1 - GDS
|
||||
* 2 - DATA
|
||||
* 3 - SRC_ADDR using L2
|
||||
*/
|
||||
# define PACKET3_DMA_DATA_CP_SYNC (1 << 31)
|
||||
/* COMMAND */
|
||||
# define PACKET3_DMA_DATA_CMD_SAS (1 << 26)
|
||||
/* 0 - memory
|
||||
* 1 - register
|
||||
*/
|
||||
# define PACKET3_DMA_DATA_CMD_DAS (1 << 27)
|
||||
/* 0 - memory
|
||||
* 1 - register
|
||||
*/
|
||||
# define PACKET3_DMA_DATA_CMD_SAIC (1 << 28)
|
||||
# define PACKET3_DMA_DATA_CMD_DAIC (1 << 29)
|
||||
# define PACKET3_DMA_DATA_CMD_RAW_WAIT (1 << 30)
|
||||
#define PACKET3_ACQUIRE_MEM 0x58
|
||||
/* 1. HEADER
|
||||
* 2. COHER_CNTL [30:0]
|
||||
* 2.1 ENGINE_SEL [31:31]
|
||||
* 3. COHER_SIZE [31:0]
|
||||
* 4. COHER_SIZE_HI [7:0]
|
||||
* 5. COHER_BASE_LO [31:0]
|
||||
* 6. COHER_BASE_HI [23:0]
|
||||
* 7. POLL_INTERVAL [15:0]
|
||||
*/
|
||||
/* COHER_CNTL fields for CP_COHER_CNTL */
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_NC_ACTION_ENA(x) ((x) << 3)
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WC_ACTION_ENA(x) ((x) << 4)
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_INV_METADATA_ACTION_ENA(x) ((x) << 5)
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_VOL_ACTION_ENA(x) ((x) << 15)
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WB_ACTION_ENA(x) ((x) << 18)
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_ACTION_ENA(x) ((x) << 22)
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_ACTION_ENA(x) ((x) << 23)
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_CB_ACTION_ENA(x) ((x) << 25)
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_DB_ACTION_ENA(x) ((x) << 26)
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_ACTION_ENA(x) ((x) << 27)
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_VOL_ACTION_ENA(x) ((x) << 28)
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_ICACHE_ACTION_ENA(x) ((x) << 29)
|
||||
#define PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_WB_ACTION_ENA(x) ((x) << 30)
|
||||
#define PACKET3_REWIND 0x59
|
||||
#define PACKET3_LOAD_UCONFIG_REG 0x5E
|
||||
#define PACKET3_LOAD_SH_REG 0x5F
|
||||
#define PACKET3_LOAD_CONFIG_REG 0x60
|
||||
#define PACKET3_LOAD_CONTEXT_REG 0x61
|
||||
#define PACKET3_SET_CONFIG_REG 0x68
|
||||
#define PACKET3_SET_CONFIG_REG_START 0x00002000
|
||||
#define PACKET3_SET_CONFIG_REG_END 0x00002c00
|
||||
#define PACKET3_SET_CONTEXT_REG 0x69
|
||||
#define PACKET3_SET_CONTEXT_REG_START 0x0000a000
|
||||
#define PACKET3_SET_CONTEXT_REG_END 0x0000a400
|
||||
#define PACKET3_SET_CONTEXT_REG_INDIRECT 0x73
|
||||
#define PACKET3_SET_SH_REG 0x76
|
||||
#define PACKET3_SET_SH_REG_START 0x00002c00
|
||||
#define PACKET3_SET_SH_REG_END 0x00003000
|
||||
#define PACKET3_SET_SH_REG_OFFSET 0x77
|
||||
#define PACKET3_SET_QUEUE_REG 0x78
|
||||
#define PACKET3_SET_UCONFIG_REG 0x79
|
||||
#define PACKET3_SET_UCONFIG_REG_START 0x0000c000
|
||||
#define PACKET3_SET_UCONFIG_REG_END 0x0000c400
|
||||
#define PACKET3_SET_UCONFIG_REG_INDEX_TYPE (2 << 28)
|
||||
#define PACKET3_SCRATCH_RAM_WRITE 0x7D
|
||||
#define PACKET3_SCRATCH_RAM_READ 0x7E
|
||||
#define PACKET3_LOAD_CONST_RAM 0x80
|
||||
#define PACKET3_WRITE_CONST_RAM 0x81
|
||||
#define PACKET3_DUMP_CONST_RAM 0x83
|
||||
#define PACKET3_INCREMENT_CE_COUNTER 0x84
|
||||
#define PACKET3_INCREMENT_DE_COUNTER 0x85
|
||||
#define PACKET3_WAIT_ON_CE_COUNTER 0x86
|
||||
#define PACKET3_WAIT_ON_DE_COUNTER_DIFF 0x88
|
||||
#define PACKET3_SWITCH_BUFFER 0x8B
|
||||
#define PACKET3_FRAME_CONTROL 0x90
|
||||
# define FRAME_TMZ (1 << 0)
|
||||
# define FRAME_CMD(x) ((x) << 28)
|
||||
/*
|
||||
* x=0: tmz_begin
|
||||
* x=1: tmz_end
|
||||
*/
|
||||
|
||||
#define PACKET3_INVALIDATE_TLBS 0x98
|
||||
# define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0)
|
||||
# define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4)
|
||||
# define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5)
|
||||
# define PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x) ((x) << 29)
|
||||
#define PACKET3_SET_RESOURCES 0xA0
|
||||
/* 1. header
|
||||
* 2. CONTROL
|
||||
* 3. QUEUE_MASK_LO [31:0]
|
||||
* 4. QUEUE_MASK_HI [31:0]
|
||||
* 5. GWS_MASK_LO [31:0]
|
||||
* 6. GWS_MASK_HI [31:0]
|
||||
* 7. OAC_MASK [15:0]
|
||||
* 8. GDS_HEAP_SIZE [16:11] | GDS_HEAP_BASE [5:0]
|
||||
*/
|
||||
# define PACKET3_SET_RESOURCES_VMID_MASK(x) ((x) << 0)
|
||||
# define PACKET3_SET_RESOURCES_UNMAP_LATENTY(x) ((x) << 16)
|
||||
# define PACKET3_SET_RESOURCES_QUEUE_TYPE(x) ((x) << 29)
|
||||
#define PACKET3_MAP_QUEUES 0xA2
|
||||
/* 1. header
|
||||
* 2. CONTROL
|
||||
* 3. CONTROL2
|
||||
* 4. MQD_ADDR_LO [31:0]
|
||||
* 5. MQD_ADDR_HI [31:0]
|
||||
* 6. WPTR_ADDR_LO [31:0]
|
||||
* 7. WPTR_ADDR_HI [31:0]
|
||||
*/
|
||||
/* CONTROL */
|
||||
# define PACKET3_MAP_QUEUES_QUEUE_SEL(x) ((x) << 4)
|
||||
# define PACKET3_MAP_QUEUES_VMID(x) ((x) << 8)
|
||||
# define PACKET3_MAP_QUEUES_QUEUE(x) ((x) << 13)
|
||||
# define PACKET3_MAP_QUEUES_PIPE(x) ((x) << 16)
|
||||
# define PACKET3_MAP_QUEUES_ME(x) ((x) << 18)
|
||||
# define PACKET3_MAP_QUEUES_QUEUE_TYPE(x) ((x) << 21)
|
||||
# define PACKET3_MAP_QUEUES_ALLOC_FORMAT(x) ((x) << 24)
|
||||
# define PACKET3_MAP_QUEUES_ENGINE_SEL(x) ((x) << 26)
|
||||
# define PACKET3_MAP_QUEUES_NUM_QUEUES(x) ((x) << 29)
|
||||
/* CONTROL2 */
|
||||
# define PACKET3_MAP_QUEUES_CHECK_DISABLE(x) ((x) << 1)
|
||||
# define PACKET3_MAP_QUEUES_DOORBELL_OFFSET(x) ((x) << 2)
|
||||
#define PACKET3_UNMAP_QUEUES 0xA3
|
||||
/* 1. header
|
||||
* 2. CONTROL
|
||||
* 3. CONTROL2
|
||||
* 4. CONTROL3
|
||||
* 5. CONTROL4
|
||||
* 6. CONTROL5
|
||||
*/
|
||||
/* CONTROL */
|
||||
# define PACKET3_UNMAP_QUEUES_ACTION(x) ((x) << 0)
|
||||
/* 0 - PREEMPT_QUEUES
|
||||
* 1 - RESET_QUEUES
|
||||
* 2 - DISABLE_PROCESS_QUEUES
|
||||
* 3 - PREEMPT_QUEUES_NO_UNMAP
|
||||
*/
|
||||
# define PACKET3_UNMAP_QUEUES_QUEUE_SEL(x) ((x) << 4)
|
||||
# define PACKET3_UNMAP_QUEUES_ENGINE_SEL(x) ((x) << 26)
|
||||
# define PACKET3_UNMAP_QUEUES_NUM_QUEUES(x) ((x) << 29)
|
||||
/* CONTROL2a */
|
||||
# define PACKET3_UNMAP_QUEUES_PASID(x) ((x) << 0)
|
||||
/* CONTROL2b */
|
||||
# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(x) ((x) << 2)
|
||||
/* CONTROL3a */
|
||||
# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET1(x) ((x) << 2)
|
||||
/* CONTROL3b */
|
||||
# define PACKET3_UNMAP_QUEUES_RB_WPTR(x) ((x) << 0)
|
||||
/* CONTROL4 */
|
||||
# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET2(x) ((x) << 2)
|
||||
/* CONTROL5 */
|
||||
# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET3(x) ((x) << 2)
|
||||
#define PACKET3_QUERY_STATUS 0xA4
|
||||
/* 1. header
|
||||
* 2. CONTROL
|
||||
* 3. CONTROL2
|
||||
* 4. ADDR_LO [31:0]
|
||||
* 5. ADDR_HI [31:0]
|
||||
* 6. DATA_LO [31:0]
|
||||
* 7. DATA_HI [31:0]
|
||||
*/
|
||||
/* CONTROL */
|
||||
# define PACKET3_QUERY_STATUS_CONTEXT_ID(x) ((x) << 0)
|
||||
# define PACKET3_QUERY_STATUS_INTERRUPT_SEL(x) ((x) << 28)
|
||||
# define PACKET3_QUERY_STATUS_COMMAND(x) ((x) << 30)
|
||||
/* CONTROL2a */
|
||||
# define PACKET3_QUERY_STATUS_PASID(x) ((x) << 0)
|
||||
/* CONTROL2b */
|
||||
# define PACKET3_QUERY_STATUS_DOORBELL_OFFSET(x) ((x) << 2)
|
||||
# define PACKET3_QUERY_STATUS_ENG_SEL(x) ((x) << 25)
|
||||
|
||||
#define PACKET3_RUN_CLEANER_SHADER 0xD2
|
||||
/* 1. header
|
||||
* 2. RESERVED [31:0]
|
||||
*/
|
||||
|
||||
#define VCE_CMD_NO_OP 0x00000000
|
||||
#define VCE_CMD_END 0x00000001
|
||||
#define VCE_CMD_IB 0x00000002
|
||||
#define VCE_CMD_FENCE 0x00000003
|
||||
#define VCE_CMD_TRAP 0x00000004
|
||||
#define VCE_CMD_IB_AUTO 0x00000005
|
||||
#define VCE_CMD_SEMAPHORE 0x00000006
|
||||
|
||||
#define VCE_CMD_IB_VM 0x00000102
|
||||
#define VCE_CMD_WAIT_GE 0x00000106
|
||||
#define VCE_CMD_UPDATE_PTB 0x00000107
|
||||
#define VCE_CMD_FLUSH_TLB 0x00000108
|
||||
#define VCE_CMD_REG_WRITE 0x00000109
|
||||
#define VCE_CMD_REG_WAIT 0x0000010a
|
||||
|
||||
#define HEVC_ENC_CMD_NO_OP 0x00000000
|
||||
#define HEVC_ENC_CMD_END 0x00000001
|
||||
#define HEVC_ENC_CMD_FENCE 0x00000003
|
||||
#define HEVC_ENC_CMD_TRAP 0x00000004
|
||||
#define HEVC_ENC_CMD_IB_VM 0x00000102
|
||||
#define HEVC_ENC_CMD_REG_WRITE 0x00000109
|
||||
#define HEVC_ENC_CMD_REG_WAIT 0x0000010a
|
||||
|
||||
#endif
|
||||
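soc15d.h above is mostly PM4 packet-building macros. A hedged sketch of forming and sanity-checking a type-3 compute header with the generated equivalents in pm4_soc15 (ring submission is omitted, and the count convention noted in the comment is an assumption of the sketch):

# hedged sketch using functions generated from the PACKET3/CP_PACKET macros above
from tinygrad.runtime.autogen.am import pm4_soc15
hdr = pm4_soc15.PACKET3_COMPUTE(pm4_soc15.PACKET3_RELEASE_MEM, 6)  # count: total packet dwords minus 2
assert pm4_soc15.CP_PACKET_GET_TYPE(hdr) == pm4_soc15.PACKET_TYPE3
assert pm4_soc15.CP_PACKET3_GET_OPCODE(hdr) == pm4_soc15.PACKET3_RELEASE_MEM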
3335   extra/hip_gpu_driver/vega10_sdma_pkt_open.h  (new file, diff suppressed: too large)
66438  tinygrad/runtime/autogen/am/gc_9_4_3.py      (new file, diff suppressed: too large)
84562  tinygrad/runtime/autogen/am/nbio_7_9_0.py    (new file, diff suppressed: too large)
931    tinygrad/runtime/autogen/am/pm4_soc15.py     (new file)
@@ -0,0 +1,931 @@
|
||||
# mypy: ignore-errors
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# TARGET arch is: []
|
||||
# WORD_SIZE is: 8
|
||||
# POINTER_SIZE is: 8
|
||||
# LONGDOUBLE_SIZE is: 16
|
||||
#
|
||||
import ctypes
|
||||
|
||||
|
||||
class AsDictMixin:
|
||||
@classmethod
|
||||
def as_dict(cls, self):
|
||||
result = {}
|
||||
if not isinstance(self, AsDictMixin):
|
||||
# not a structure, assume it's already a python object
|
||||
return self
|
||||
if not hasattr(cls, "_fields_"):
|
||||
return result
|
||||
# sys.version_info >= (3, 5)
|
||||
# for (field, *_) in cls._fields_: # noqa
|
||||
for field_tuple in cls._fields_: # noqa
|
||||
field = field_tuple[0]
|
||||
if field.startswith('PADDING_'):
|
||||
continue
|
||||
value = getattr(self, field)
|
||||
type_ = type(value)
|
||||
if hasattr(value, "_length_") and hasattr(value, "_type_"):
|
||||
# array
|
||||
if not hasattr(type_, "as_dict"):
|
||||
value = [v for v in value]
|
||||
else:
|
||||
type_ = type_._type_
|
||||
value = [type_.as_dict(v) for v in value]
|
||||
elif hasattr(value, "contents") and hasattr(value, "_type_"):
|
||||
# pointer
|
||||
try:
|
||||
if not hasattr(type_, "as_dict"):
|
||||
value = value.contents
|
||||
else:
|
||||
type_ = type_._type_
|
||||
value = type_.as_dict(value.contents)
|
||||
except ValueError:
|
||||
# nullptr
|
||||
value = None
|
||||
elif isinstance(value, AsDictMixin):
|
||||
# other structure
|
||||
value = type_.as_dict(value)
|
||||
result[field] = value
|
||||
return result
|
||||
|
||||
|
||||
class Structure(ctypes.Structure, AsDictMixin):
|
||||
|
||||
def __init__(self, *args, **kwds):
|
||||
# We don't want to use positional arguments fill PADDING_* fields
|
||||
|
||||
args = dict(zip(self.__class__._field_names_(), args))
|
||||
args.update(kwds)
|
||||
super(Structure, self).__init__(**args)
|
||||
|
||||
@classmethod
|
||||
def _field_names_(cls):
|
||||
if hasattr(cls, '_fields_'):
|
||||
return (f[0] for f in cls._fields_ if not f[0].startswith('PADDING'))
|
||||
else:
|
||||
return ()
|
||||
|
||||
@classmethod
|
||||
def get_type(cls, field):
|
||||
for f in cls._fields_:
|
||||
if f[0] == field:
|
||||
return f[1]
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def bind(cls, bound_fields):
|
||||
fields = {}
|
||||
for name, type_ in cls._fields_:
|
||||
if hasattr(type_, "restype"):
|
||||
if name in bound_fields:
|
||||
if bound_fields[name] is None:
|
||||
fields[name] = type_()
|
||||
else:
|
||||
# use a closure to capture the callback from the loop scope
|
||||
fields[name] = (
|
||||
type_((lambda callback: lambda *args: callback(*args))(
|
||||
bound_fields[name]))
|
||||
)
|
||||
del bound_fields[name]
|
||||
else:
|
||||
# default callback implementation (does nothing)
|
||||
try:
|
||||
default_ = type_(0).restype().value
|
||||
except TypeError:
|
||||
default_ = None
|
||||
fields[name] = type_((
|
||||
lambda default_: lambda *args: default_)(default_))
|
||||
else:
|
||||
# not a callback function, use default initialization
|
||||
if name in bound_fields:
|
||||
fields[name] = bound_fields[name]
|
||||
del bound_fields[name]
|
||||
else:
|
||||
fields[name] = type_()
|
||||
if len(bound_fields) != 0:
|
||||
raise ValueError(
|
||||
"Cannot bind the following unknown callback(s) {}.{}".format(
|
||||
cls.__name__, bound_fields.keys()
|
||||
))
|
||||
return cls(**fields)
|
||||
|
||||
|
||||
class Union(ctypes.Union, AsDictMixin):
|
||||
pass
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
F32_MES_PM4_PACKETS_H = True # macro
|
||||
uint32_t = True # macro
|
||||
int32_t = True # macro
|
||||
PM4_MES_HEADER_DEFINED = True # macro
|
||||
PM4_MEC_RELEASE_MEM_DEFINED = True # macro
|
||||
PM4_MEC_WRITE_DATA_DEFINED = True # macro
|
||||
class union_PM4_MES_TYPE_3_HEADER(Union):
|
||||
pass
|
||||
|
||||
class struct_PM4_MES_TYPE_3_HEADER_0(Structure):
|
||||
pass
|
||||
|
||||
struct_PM4_MES_TYPE_3_HEADER_0._pack_ = 1 # source:False
|
||||
struct_PM4_MES_TYPE_3_HEADER_0._fields_ = [
|
||||
('reserved1', ctypes.c_uint32, 8),
|
||||
('opcode', ctypes.c_uint32, 8),
|
||||
('count', ctypes.c_uint32, 14),
|
||||
('type', ctypes.c_uint32, 2),
|
||||
]
|
||||
|
||||
union_PM4_MES_TYPE_3_HEADER._pack_ = 1 # source:False
|
||||
union_PM4_MES_TYPE_3_HEADER._anonymous_ = ('_0',)
|
||||
union_PM4_MES_TYPE_3_HEADER._fields_ = [
|
||||
('_0', struct_PM4_MES_TYPE_3_HEADER_0),
|
||||
('u32All', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
|
||||
# values for enumeration 'c_uint32'
|
||||
c_uint32__enumvalues = {
|
||||
5: 'event_index__mec_release_mem__end_of_pipe',
|
||||
6: 'event_index__mec_release_mem__shader_done',
|
||||
}
|
||||
event_index__mec_release_mem__end_of_pipe = 5
|
||||
event_index__mec_release_mem__shader_done = 6
|
||||
c_uint32 = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'c_uint32'
|
||||
c_uint32__enumvalues = {
|
||||
0: 'cache_policy__mec_release_mem__lru',
|
||||
1: 'cache_policy__mec_release_mem__stream',
|
||||
}
|
||||
cache_policy__mec_release_mem__lru = 0
|
||||
cache_policy__mec_release_mem__stream = 1
|
||||
c_uint32 = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'c_uint32'
|
||||
c_uint32__enumvalues = {
|
||||
0: 'pq_exe_status__mec_release_mem__default',
|
||||
1: 'pq_exe_status__mec_release_mem__phase_update',
|
||||
}
|
||||
pq_exe_status__mec_release_mem__default = 0
|
||||
pq_exe_status__mec_release_mem__phase_update = 1
|
||||
c_uint32 = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'c_uint32'
|
||||
c_uint32__enumvalues = {
|
||||
0: 'dst_sel__mec_release_mem__memory_controller',
|
||||
1: 'dst_sel__mec_release_mem__tc_l2',
|
||||
2: 'dst_sel__mec_release_mem__queue_write_pointer_register',
|
||||
3: 'dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit',
|
||||
}
|
||||
dst_sel__mec_release_mem__memory_controller = 0
|
||||
dst_sel__mec_release_mem__tc_l2 = 1
|
||||
dst_sel__mec_release_mem__queue_write_pointer_register = 2
|
||||
dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3
|
||||
c_uint32 = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'c_uint32'
|
||||
c_uint32__enumvalues = {
|
||||
0: 'int_sel__mec_release_mem__none',
|
||||
1: 'int_sel__mec_release_mem__send_interrupt_only',
|
||||
2: 'int_sel__mec_release_mem__send_interrupt_after_write_confirm',
|
||||
3: 'int_sel__mec_release_mem__send_data_after_write_confirm',
|
||||
4: 'int_sel__mec_release_mem__unconditionally_send_int_ctxid',
|
||||
5: 'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare',
|
||||
6: 'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare',
|
||||
}
|
||||
int_sel__mec_release_mem__none = 0
|
||||
int_sel__mec_release_mem__send_interrupt_only = 1
|
||||
int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2
|
||||
int_sel__mec_release_mem__send_data_after_write_confirm = 3
|
||||
int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4
|
||||
int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5
|
||||
int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6
|
||||
c_uint32 = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'c_uint32'
|
||||
c_uint32__enumvalues = {
|
||||
0: 'data_sel__mec_release_mem__none',
|
||||
1: 'data_sel__mec_release_mem__send_32_bit_low',
|
||||
2: 'data_sel__mec_release_mem__send_64_bit_data',
|
||||
3: 'data_sel__mec_release_mem__send_gpu_clock_counter',
|
||||
4: 'data_sel__mec_release_mem__send_cp_perfcounter_hi_lo',
|
||||
5: 'data_sel__mec_release_mem__store_gds_data_to_memory',
|
||||
}
|
||||
data_sel__mec_release_mem__none = 0
|
||||
data_sel__mec_release_mem__send_32_bit_low = 1
|
||||
data_sel__mec_release_mem__send_64_bit_data = 2
|
||||
data_sel__mec_release_mem__send_gpu_clock_counter = 3
|
||||
data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4
|
||||
data_sel__mec_release_mem__store_gds_data_to_memory = 5
|
||||
c_uint32 = ctypes.c_uint32 # enum
|
||||
class struct_pm4_mec_release_mem(Structure):
|
||||
pass
|
||||
|
||||
class union_pm4_mec_release_mem_0(Union):
|
||||
pass
|
||||
|
||||
union_pm4_mec_release_mem_0._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_0._fields_ = [
|
||||
('header', union_PM4_MES_TYPE_3_HEADER),
|
||||
('ordinal1', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_release_mem_1(Union):
|
||||
pass
|
||||
|
||||
class struct_pm4_mec_release_mem_1_bitfields2(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_release_mem_1_bitfields2._pack_ = 1 # source:False
|
||||
struct_pm4_mec_release_mem_1_bitfields2._fields_ = [
|
||||
('event_type', ctypes.c_uint32, 6),
|
||||
('reserved1', ctypes.c_uint32, 2),
|
||||
('event_index', c_uint32, 4),
|
||||
('tcl1_vol_action_ena', ctypes.c_uint32, 1),
|
||||
('tc_vol_action_ena', ctypes.c_uint32, 1),
|
||||
('reserved2', ctypes.c_uint32, 1),
|
||||
('tc_wb_action_ena', ctypes.c_uint32, 1),
|
||||
('tcl1_action_ena', ctypes.c_uint32, 1),
|
||||
('tc_action_ena', ctypes.c_uint32, 1),
|
||||
('reserved3', ctypes.c_uint32, 1),
|
||||
('tc_nc_action_ena', ctypes.c_uint32, 1),
|
||||
('tc_wc_action_ena', ctypes.c_uint32, 1),
|
||||
('tc_md_action_ena', ctypes.c_uint32, 1),
|
||||
('reserved4', ctypes.c_uint32, 3),
|
||||
('cache_policy', c_uint32, 2),
|
||||
('reserved5', ctypes.c_uint32, 2),
|
||||
('pq_exe_status', c_uint32, 1),
|
||||
('reserved6', ctypes.c_uint32, 2),
|
||||
]
|
||||
|
||||
union_pm4_mec_release_mem_1._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_1._fields_ = [
|
||||
('bitfields2', struct_pm4_mec_release_mem_1_bitfields2),
|
||||
('ordinal2', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_release_mem_2(Union):
|
||||
pass
|
||||
|
||||
class struct_pm4_mec_release_mem_2_bitfields3(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_release_mem_2_bitfields3._pack_ = 1 # source:False
|
||||
struct_pm4_mec_release_mem_2_bitfields3._fields_ = [
|
||||
('reserved7', ctypes.c_uint32, 16),
|
||||
('dst_sel', c_uint32, 2),
|
||||
('reserved8', ctypes.c_uint32, 6),
|
||||
('int_sel', c_uint32, 3),
|
||||
('reserved9', ctypes.c_uint32, 2),
|
||||
('data_sel', c_uint32, 3),
|
||||
]
|
||||
|
||||
union_pm4_mec_release_mem_2._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_2._fields_ = [
|
||||
('bitfields3', struct_pm4_mec_release_mem_2_bitfields3),
|
||||
('ordinal3', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_release_mem_3(Union):
|
||||
pass
|
||||
|
||||
class struct_pm4_mec_release_mem_3_bitfields4(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_release_mem_3_bitfields4._pack_ = 1 # source:False
|
||||
struct_pm4_mec_release_mem_3_bitfields4._fields_ = [
|
||||
('reserved10', ctypes.c_uint32, 2),
|
||||
('address_lo_32b', ctypes.c_uint32, 30),
|
||||
]
|
||||
|
||||
class struct_pm4_mec_release_mem_3_bitfields4b(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_release_mem_3_bitfields4b._pack_ = 1 # source:False
|
||||
struct_pm4_mec_release_mem_3_bitfields4b._fields_ = [
|
||||
('reserved11', ctypes.c_uint32, 3),
|
||||
('address_lo_64b', ctypes.c_uint32, 29),
|
||||
]
|
||||
|
||||
union_pm4_mec_release_mem_3._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_3._fields_ = [
|
||||
('bitfields4', struct_pm4_mec_release_mem_3_bitfields4),
|
||||
('bitfields4b', struct_pm4_mec_release_mem_3_bitfields4b),
|
||||
('reserved12', ctypes.c_uint32),
|
||||
('ordinal4', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_release_mem_4(Union):
|
||||
pass
|
||||
|
||||
union_pm4_mec_release_mem_4._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_4._fields_ = [
|
||||
('address_hi', ctypes.c_uint32),
|
||||
('reserved13', ctypes.c_uint32),
|
||||
('ordinal5', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_release_mem_5(Union):
|
||||
pass
|
||||
|
||||
class struct_pm4_mec_release_mem_5_bitfields6c(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_release_mem_5_bitfields6c._pack_ = 1 # source:False
|
||||
struct_pm4_mec_release_mem_5_bitfields6c._fields_ = [
|
||||
('dw_offset', ctypes.c_uint32, 16),
|
||||
('num_dwords', ctypes.c_uint32, 16),
|
||||
]
|
||||
|
||||
union_pm4_mec_release_mem_5._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_5._fields_ = [
|
||||
('data_lo', ctypes.c_uint32),
|
||||
('cmp_data_lo', ctypes.c_uint32),
|
||||
('bitfields6c', struct_pm4_mec_release_mem_5_bitfields6c),
|
||||
('reserved14', ctypes.c_uint32),
|
||||
('ordinal6', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_release_mem_6(Union):
|
||||
pass
|
||||
|
||||
union_pm4_mec_release_mem_6._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_6._fields_ = [
|
||||
('data_hi', ctypes.c_uint32),
|
||||
('cmp_data_hi', ctypes.c_uint32),
|
||||
('reserved15', ctypes.c_uint32),
|
||||
('reserved16', ctypes.c_uint32),
|
||||
('ordinal7', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
struct_pm4_mec_release_mem._pack_ = 1 # source:False
|
||||
struct_pm4_mec_release_mem._anonymous_ = ('_0', '_1', '_2', '_3', '_4', '_5', '_6',)
|
||||
struct_pm4_mec_release_mem._fields_ = [
|
||||
('_0', union_pm4_mec_release_mem_0),
|
||||
('_1', union_pm4_mec_release_mem_1),
|
||||
('_2', union_pm4_mec_release_mem_2),
|
||||
('_3', union_pm4_mec_release_mem_3),
|
||||
('_4', union_pm4_mec_release_mem_4),
|
||||
('_5', union_pm4_mec_release_mem_5),
|
||||
('_6', union_pm4_mec_release_mem_6),
|
||||
('int_ctxid', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
|
||||
# values for enumeration 'WRITE_DATA_dst_sel_enum'
|
||||
WRITE_DATA_dst_sel_enum__enumvalues = {
|
||||
0: 'dst_sel___write_data__mem_mapped_register',
|
||||
2: 'dst_sel___write_data__tc_l2',
|
||||
3: 'dst_sel___write_data__gds',
|
||||
5: 'dst_sel___write_data__memory',
|
||||
6: 'dst_sel___write_data__memory_mapped_adc_persistent_state',
|
||||
}
|
||||
dst_sel___write_data__mem_mapped_register = 0
|
||||
dst_sel___write_data__tc_l2 = 2
|
||||
dst_sel___write_data__gds = 3
|
||||
dst_sel___write_data__memory = 5
|
||||
dst_sel___write_data__memory_mapped_adc_persistent_state = 6
|
||||
WRITE_DATA_dst_sel_enum = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'WRITE_DATA_addr_incr_enum'
|
||||
WRITE_DATA_addr_incr_enum__enumvalues = {
|
||||
0: 'addr_incr___write_data__increment_address',
|
||||
1: 'addr_incr___write_data__do_not_increment_address',
|
||||
}
|
||||
addr_incr___write_data__increment_address = 0
|
||||
addr_incr___write_data__do_not_increment_address = 1
|
||||
WRITE_DATA_addr_incr_enum = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'WRITE_DATA_wr_confirm_enum'
|
||||
WRITE_DATA_wr_confirm_enum__enumvalues = {
|
||||
0: 'wr_confirm___write_data__do_not_wait_for_write_confirmation',
|
||||
1: 'wr_confirm___write_data__wait_for_write_confirmation',
|
||||
}
|
||||
wr_confirm___write_data__do_not_wait_for_write_confirmation = 0
|
||||
wr_confirm___write_data__wait_for_write_confirmation = 1
|
||||
WRITE_DATA_wr_confirm_enum = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'WRITE_DATA_cache_policy_enum'
|
||||
WRITE_DATA_cache_policy_enum__enumvalues = {
|
||||
0: 'cache_policy___write_data__lru',
|
||||
1: 'cache_policy___write_data__stream',
|
||||
}
|
||||
cache_policy___write_data__lru = 0
|
||||
cache_policy___write_data__stream = 1
|
||||
WRITE_DATA_cache_policy_enum = ctypes.c_uint32 # enum
|
||||
class struct_pm4_mec_write_data_mmio(Structure):
|
||||
pass
|
||||
|
||||
class union_pm4_mec_write_data_mmio_0(Union):
|
||||
pass
|
||||
|
||||
union_pm4_mec_write_data_mmio_0._pack_ = 1 # source:False
|
||||
union_pm4_mec_write_data_mmio_0._fields_ = [
|
||||
('header', union_PM4_MES_TYPE_3_HEADER),
|
||||
('ordinal1', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_write_data_mmio_1(Union):
|
||||
pass
|
||||
|
||||
class struct_pm4_mec_write_data_mmio_1_bitfields2(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_write_data_mmio_1_bitfields2._pack_ = 1 # source:False
|
||||
struct_pm4_mec_write_data_mmio_1_bitfields2._fields_ = [
|
||||
('reserved1', ctypes.c_uint32, 8),
|
||||
('dst_sel', ctypes.c_uint32, 4),
|
||||
('reserved2', ctypes.c_uint32, 4),
|
||||
('addr_incr', ctypes.c_uint32, 1),
|
||||
('reserved3', ctypes.c_uint32, 2),
|
||||
('resume_vf', ctypes.c_uint32, 1),
|
||||
('wr_confirm', ctypes.c_uint32, 1),
|
||||
('reserved4', ctypes.c_uint32, 4),
|
||||
('cache_policy', ctypes.c_uint32, 2),
|
||||
('reserved5', ctypes.c_uint32, 5),
|
||||
]
|
||||
|
||||
union_pm4_mec_write_data_mmio_1._pack_ = 1 # source:False
|
||||
union_pm4_mec_write_data_mmio_1._fields_ = [
|
||||
('bitfields2', struct_pm4_mec_write_data_mmio_1_bitfields2),
|
||||
('ordinal2', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_write_data_mmio_2(Union):
|
||||
pass
|
||||
|
||||
class struct_pm4_mec_write_data_mmio_2_bitfields3(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_write_data_mmio_2_bitfields3._pack_ = 1 # source:False
|
||||
struct_pm4_mec_write_data_mmio_2_bitfields3._fields_ = [
|
||||
('dst_mmreg_addr', ctypes.c_uint32, 18),
|
||||
('reserved6', ctypes.c_uint32, 14),
|
||||
]
|
||||
|
||||
union_pm4_mec_write_data_mmio_2._pack_ = 1 # source:False
|
||||
union_pm4_mec_write_data_mmio_2._fields_ = [
|
||||
('bitfields3', struct_pm4_mec_write_data_mmio_2_bitfields3),
|
||||
('ordinal3', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
struct_pm4_mec_write_data_mmio._pack_ = 1 # source:False
|
||||
struct_pm4_mec_write_data_mmio._anonymous_ = ('_0', '_1', '_2',)
|
||||
struct_pm4_mec_write_data_mmio._fields_ = [
|
||||
('_0', union_pm4_mec_write_data_mmio_0),
|
||||
('_1', union_pm4_mec_write_data_mmio_1),
|
||||
('_2', union_pm4_mec_write_data_mmio_2),
|
||||
('reserved7', ctypes.c_uint32),
|
||||
('data', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
|
||||
# values for enumeration 'c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT'
|
||||
c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT__enumvalues = {
|
||||
20: 'CACHE_FLUSH_AND_INV_TS_EVENT',
|
||||
}
|
||||
CACHE_FLUSH_AND_INV_TS_EVENT = 20
|
||||
c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT = ctypes.c_uint32 # enum
|
||||
SOC15_H = True # macro
|
||||
GFX9_NUM_GFX_RINGS = 1 # macro
|
||||
GFX9_NUM_COMPUTE_RINGS = 8 # macro
|
||||
PACKET_TYPE0 = 0 # macro
|
||||
PACKET_TYPE1 = 1 # macro
|
||||
PACKET_TYPE2 = 2 # macro
|
||||
PACKET_TYPE3 = 3 # macro
|
||||
def CP_PACKET_GET_TYPE(h): # macro
|
||||
return (((h)>>30)&3)
|
||||
def CP_PACKET_GET_COUNT(h): # macro
|
||||
return (((h)>>16)&0x3FFF)
|
||||
def CP_PACKET0_GET_REG(h): # macro
|
||||
return ((h)&0xFFFF)
|
||||
def CP_PACKET3_GET_OPCODE(h): # macro
|
||||
return (((h)>>8)&0xFF)
|
||||
def PACKET0(reg, n): # macro
|
||||
return ((0<<30)|((reg)&0xFFFF)|((n)&0x3FFF)<<16)
|
||||
CP_PACKET2 = 0x80000000 # macro
|
||||
PACKET2_PAD_SHIFT = 0 # macro
|
||||
PACKET2_PAD_MASK = (0x3fffffff<<0) # macro
|
||||
# def PACKET2(v): # macro
|
||||
# return (0x80000000|REG_SET(PACKET2_PAD,(v)))
|
||||
def PACKET3(op, n): # macro
|
||||
return ((3<<30)|(((op)&0xFF)<<8)|((n)&0x3FFF)<<16)
|
||||
def PACKET3_COMPUTE(op, n): # macro
|
||||
return (PACKET3(op,n)|1<<1)
|
||||
PACKETJ_CONDITION_CHECK0 = 0 # macro
|
||||
PACKETJ_CONDITION_CHECK1 = 1 # macro
|
||||
PACKETJ_CONDITION_CHECK2 = 2 # macro
|
||||
PACKETJ_CONDITION_CHECK3 = 3 # macro
|
||||
PACKETJ_CONDITION_CHECK4 = 4 # macro
|
||||
PACKETJ_CONDITION_CHECK5 = 5 # macro
|
||||
PACKETJ_CONDITION_CHECK6 = 6 # macro
|
||||
PACKETJ_CONDITION_CHECK7 = 7 # macro
|
||||
PACKETJ_TYPE0 = 0 # macro
|
||||
PACKETJ_TYPE1 = 1 # macro
|
||||
PACKETJ_TYPE2 = 2 # macro
|
||||
PACKETJ_TYPE3 = 3 # macro
|
||||
PACKETJ_TYPE4 = 4 # macro
|
||||
PACKETJ_TYPE5 = 5 # macro
|
||||
PACKETJ_TYPE6 = 6 # macro
|
||||
PACKETJ_TYPE7 = 7 # macro
|
||||
def PACKETJ(reg, r, cond, type): # macro
|
||||
return ((reg&0x3FFFF)|((r&0x3F)<<18)|((cond&0xF)<<24)|((type&0xF)<<28))
|
||||
CP_PACKETJ_NOP = 0x60000000 # macro
|
||||
def CP_PACKETJ_GET_REG(x): # macro
|
||||
return ((x)&0x3FFFF)
|
||||
def CP_PACKETJ_GET_RES(x): # macro
|
||||
return (((x)>>18)&0x3F)
|
||||
def CP_PACKETJ_GET_COND(x): # macro
|
||||
return (((x)>>24)&0xF)
|
||||
def CP_PACKETJ_GET_TYPE(x): # macro
|
||||
return (((x)>>28)&0xF)
|
||||
PACKET3_NOP = 0x10 # macro
|
||||
PACKET3_SET_BASE = 0x11 # macro
|
||||
def PACKET3_BASE_INDEX(x): # macro
|
||||
return ((x)<<0)
|
||||
CE_PARTITION_BASE = 3 # macro
|
||||
PACKET3_CLEAR_STATE = 0x12 # macro
|
||||
PACKET3_INDEX_BUFFER_SIZE = 0x13 # macro
|
||||
PACKET3_DISPATCH_DIRECT = 0x15 # macro
|
||||
PACKET3_DISPATCH_INDIRECT = 0x16 # macro
|
||||
PACKET3_ATOMIC_GDS = 0x1D # macro
|
||||
PACKET3_ATOMIC_MEM = 0x1E # macro
|
||||
PACKET3_OCCLUSION_QUERY = 0x1F # macro
|
||||
PACKET3_SET_PREDICATION = 0x20 # macro
|
||||
PACKET3_REG_RMW = 0x21 # macro
|
||||
PACKET3_COND_EXEC = 0x22 # macro
|
||||
PACKET3_PRED_EXEC = 0x23 # macro
|
||||
PACKET3_DRAW_INDIRECT = 0x24 # macro
|
||||
PACKET3_DRAW_INDEX_INDIRECT = 0x25 # macro
|
||||
PACKET3_INDEX_BASE = 0x26 # macro
|
||||
PACKET3_DRAW_INDEX_2 = 0x27 # macro
|
||||
PACKET3_CONTEXT_CONTROL = 0x28 # macro
|
||||
PACKET3_INDEX_TYPE = 0x2A # macro
|
||||
PACKET3_DRAW_INDIRECT_MULTI = 0x2C # macro
|
||||
PACKET3_DRAW_INDEX_AUTO = 0x2D # macro
|
||||
PACKET3_NUM_INSTANCES = 0x2F # macro
|
||||
PACKET3_DRAW_INDEX_MULTI_AUTO = 0x30 # macro
|
||||
PACKET3_INDIRECT_BUFFER_CONST = 0x33 # macro
|
||||
PACKET3_STRMOUT_BUFFER_UPDATE = 0x34 # macro
|
||||
PACKET3_DRAW_INDEX_OFFSET_2 = 0x35 # macro
|
||||
PACKET3_DRAW_PREAMBLE = 0x36 # macro
|
||||
PACKET3_WRITE_DATA = 0x37 # macro
|
||||
def WRITE_DATA_DST_SEL(x): # macro
|
||||
return ((x)<<8)
|
||||
WR_ONE_ADDR = (1<<16) # macro
|
||||
WR_CONFIRM = (1<<20) # macro
|
||||
def WRITE_DATA_CACHE_POLICY(x): # macro
|
||||
return ((x)<<25)
|
||||
def WRITE_DATA_ENGINE_SEL(x): # macro
|
||||
return ((x)<<30)
|
||||
PACKET3_DRAW_INDEX_INDIRECT_MULTI = 0x38 # macro
|
||||
PACKET3_MEM_SEMAPHORE = 0x39 # macro
|
||||
PACKET3_SEM_USE_MAILBOX = (0x1<<16) # macro
|
||||
PACKET3_SEM_SEL_SIGNAL_TYPE = (0x1<<20) # macro
|
||||
PACKET3_SEM_SEL_SIGNAL = (0x6<<29) # macro
|
||||
PACKET3_SEM_SEL_WAIT = (0x7<<29) # macro
|
||||
PACKET3_WAIT_REG_MEM = 0x3C # macro
|
||||
def WAIT_REG_MEM_FUNCTION(x): # macro
|
||||
return ((x)<<0)
|
||||
def WAIT_REG_MEM_MEM_SPACE(x): # macro
|
||||
return ((x)<<4)
|
||||
def WAIT_REG_MEM_OPERATION(x): # macro
|
||||
return ((x)<<6)
|
||||
def WAIT_REG_MEM_ENGINE(x): # macro
|
||||
return ((x)<<8)
|
||||
PACKET3_INDIRECT_BUFFER = 0x3F # macro
|
||||
INDIRECT_BUFFER_VALID = (1<<23) # macro
|
||||
def INDIRECT_BUFFER_CACHE_POLICY(x): # macro
|
||||
return ((x)<<28)
|
||||
def INDIRECT_BUFFER_PRE_ENB(x): # macro
|
||||
return ((x)<<21)
|
||||
def INDIRECT_BUFFER_PRE_RESUME(x): # macro
|
||||
return ((x)<<30)
|
||||
PACKET3_COPY_DATA = 0x40 # macro
|
||||
PACKET3_PFP_SYNC_ME = 0x42 # macro
|
||||
PACKET3_COND_WRITE = 0x45 # macro
|
||||
PACKET3_EVENT_WRITE = 0x46 # macro
|
||||
def EVENT_TYPE(x): # macro
|
||||
return ((x)<<0)
|
||||
def EVENT_INDEX(x): # macro
|
||||
return ((x)<<8)
|
||||
PACKET3_RELEASE_MEM = 0x49 # macro
|
||||
EOP_TCL1_VOL_ACTION_EN = (1<<12) # macro
|
||||
EOP_TC_VOL_ACTION_EN = (1<<13) # macro
|
||||
EOP_TC_WB_ACTION_EN = (1<<15) # macro
|
||||
EOP_TCL1_ACTION_EN = (1<<16) # macro
|
||||
EOP_TC_ACTION_EN = (1<<17) # macro
|
||||
EOP_TC_NC_ACTION_EN = (1<<19) # macro
|
||||
EOP_TC_MD_ACTION_EN = (1<<21) # macro
|
||||
EOP_EXEC = (1<<28) # macro
|
||||
def DATA_SEL(x): # macro
|
||||
return ((x)<<29)
|
||||
def INT_SEL(x): # macro
|
||||
return ((x)<<24)
|
||||
def DST_SEL(x): # macro
|
||||
return ((x)<<16)
|
||||
PACKET3_PREAMBLE_CNTL = 0x4A # macro
|
||||
PACKET3_PREAMBLE_BEGIN_CLEAR_STATE = (2<<28) # macro
|
||||
PACKET3_PREAMBLE_END_CLEAR_STATE = (3<<28) # macro
|
||||
PACKET3_DMA_DATA = 0x50 # macro
|
||||
def PACKET3_DMA_DATA_ENGINE(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_DMA_DATA_SRC_CACHE_POLICY(x): # macro
|
||||
return ((x)<<13)
|
||||
def PACKET3_DMA_DATA_DST_SEL(x): # macro
|
||||
return ((x)<<20)
|
||||
def PACKET3_DMA_DATA_DST_CACHE_POLICY(x): # macro
|
||||
return ((x)<<25)
|
||||
def PACKET3_DMA_DATA_SRC_SEL(x): # macro
|
||||
return ((x)<<29)
|
||||
PACKET3_DMA_DATA_CP_SYNC = (1<<31) # macro
|
||||
PACKET3_DMA_DATA_CMD_SAS = (1<<26) # macro
|
||||
PACKET3_DMA_DATA_CMD_DAS = (1<<27) # macro
|
||||
PACKET3_DMA_DATA_CMD_SAIC = (1<<28) # macro
|
||||
PACKET3_DMA_DATA_CMD_DAIC = (1<<29) # macro
|
||||
PACKET3_DMA_DATA_CMD_RAW_WAIT = (1<<30) # macro
|
||||
PACKET3_ACQUIRE_MEM = 0x58 # macro
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_NC_ACTION_ENA(x): # macro
|
||||
return ((x)<<3)
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WC_ACTION_ENA(x): # macro
|
||||
return ((x)<<4)
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_INV_METADATA_ACTION_ENA(x): # macro
|
||||
return ((x)<<5)
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_VOL_ACTION_ENA(x): # macro
|
||||
return ((x)<<15)
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WB_ACTION_ENA(x): # macro
|
||||
return ((x)<<18)
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_ACTION_ENA(x): # macro
|
||||
return ((x)<<22)
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_ACTION_ENA(x): # macro
|
||||
return ((x)<<23)
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_CB_ACTION_ENA(x): # macro
|
||||
return ((x)<<25)
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_DB_ACTION_ENA(x): # macro
|
||||
return ((x)<<26)
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_ACTION_ENA(x): # macro
|
||||
return ((x)<<27)
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_VOL_ACTION_ENA(x): # macro
|
||||
return ((x)<<28)
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_ICACHE_ACTION_ENA(x): # macro
|
||||
return ((x)<<29)
|
||||
def PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_WB_ACTION_ENA(x): # macro
|
||||
return ((x)<<30)
|
||||
PACKET3_REWIND = 0x59 # macro
|
||||
PACKET3_LOAD_UCONFIG_REG = 0x5E # macro
|
||||
PACKET3_LOAD_SH_REG = 0x5F # macro
|
||||
PACKET3_LOAD_CONFIG_REG = 0x60 # macro
|
||||
PACKET3_LOAD_CONTEXT_REG = 0x61 # macro
|
||||
PACKET3_SET_CONFIG_REG = 0x68 # macro
|
||||
PACKET3_SET_CONFIG_REG_START = 0x00002000 # macro
|
||||
PACKET3_SET_CONFIG_REG_END = 0x00002c00 # macro
|
||||
PACKET3_SET_CONTEXT_REG = 0x69 # macro
|
||||
PACKET3_SET_CONTEXT_REG_START = 0x0000a000 # macro
|
||||
PACKET3_SET_CONTEXT_REG_END = 0x0000a400 # macro
|
||||
PACKET3_SET_CONTEXT_REG_INDIRECT = 0x73 # macro
|
||||
PACKET3_SET_SH_REG = 0x76 # macro
|
||||
PACKET3_SET_SH_REG_START = 0x00002c00 # macro
|
||||
PACKET3_SET_SH_REG_END = 0x00003000 # macro
|
||||
PACKET3_SET_SH_REG_OFFSET = 0x77 # macro
|
||||
PACKET3_SET_QUEUE_REG = 0x78 # macro
|
||||
PACKET3_SET_UCONFIG_REG = 0x79 # macro
|
||||
PACKET3_SET_UCONFIG_REG_START = 0x0000c000 # macro
|
||||
PACKET3_SET_UCONFIG_REG_END = 0x0000c400 # macro
|
||||
PACKET3_SET_UCONFIG_REG_INDEX_TYPE = (2<<28) # macro
|
||||
PACKET3_SCRATCH_RAM_WRITE = 0x7D # macro
|
||||
PACKET3_SCRATCH_RAM_READ = 0x7E # macro
|
||||
PACKET3_LOAD_CONST_RAM = 0x80 # macro
|
||||
PACKET3_WRITE_CONST_RAM = 0x81 # macro
|
||||
PACKET3_DUMP_CONST_RAM = 0x83 # macro
|
||||
PACKET3_INCREMENT_CE_COUNTER = 0x84 # macro
|
||||
PACKET3_INCREMENT_DE_COUNTER = 0x85 # macro
|
||||
PACKET3_WAIT_ON_CE_COUNTER = 0x86 # macro
|
||||
PACKET3_WAIT_ON_DE_COUNTER_DIFF = 0x88 # macro
|
||||
PACKET3_SWITCH_BUFFER = 0x8B # macro
|
||||
PACKET3_FRAME_CONTROL = 0x90 # macro
|
||||
FRAME_TMZ = (1<<0) # macro
|
||||
def FRAME_CMD(x): # macro
|
||||
return ((x)<<28)
|
||||
PACKET3_INVALIDATE_TLBS = 0x98 # macro
|
||||
def PACKET3_INVALIDATE_TLBS_DST_SEL(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_INVALIDATE_TLBS_ALL_HUB(x): # macro
|
||||
return ((x)<<4)
|
||||
def PACKET3_INVALIDATE_TLBS_PASID(x): # macro
|
||||
return ((x)<<5)
|
||||
def PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x): # macro
|
||||
return ((x)<<29)
|
||||
PACKET3_SET_RESOURCES = 0xA0 # macro
|
||||
def PACKET3_SET_RESOURCES_VMID_MASK(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_SET_RESOURCES_UNMAP_LATENTY(x): # macro
|
||||
return ((x)<<16)
|
||||
def PACKET3_SET_RESOURCES_QUEUE_TYPE(x): # macro
|
||||
return ((x)<<29)
|
||||
PACKET3_MAP_QUEUES = 0xA2 # macro
|
||||
def PACKET3_MAP_QUEUES_QUEUE_SEL(x): # macro
|
||||
return ((x)<<4)
|
||||
def PACKET3_MAP_QUEUES_VMID(x): # macro
|
||||
return ((x)<<8)
|
||||
def PACKET3_MAP_QUEUES_QUEUE(x): # macro
|
||||
return ((x)<<13)
|
||||
def PACKET3_MAP_QUEUES_PIPE(x): # macro
|
||||
return ((x)<<16)
|
||||
def PACKET3_MAP_QUEUES_ME(x): # macro
|
||||
return ((x)<<18)
|
||||
def PACKET3_MAP_QUEUES_QUEUE_TYPE(x): # macro
|
||||
return ((x)<<21)
|
||||
def PACKET3_MAP_QUEUES_ALLOC_FORMAT(x): # macro
|
||||
return ((x)<<24)
|
||||
def PACKET3_MAP_QUEUES_ENGINE_SEL(x): # macro
|
||||
return ((x)<<26)
|
||||
def PACKET3_MAP_QUEUES_NUM_QUEUES(x): # macro
|
||||
return ((x)<<29)
|
||||
def PACKET3_MAP_QUEUES_CHECK_DISABLE(x): # macro
|
||||
return ((x)<<1)
|
||||
def PACKET3_MAP_QUEUES_DOORBELL_OFFSET(x): # macro
|
||||
return ((x)<<2)
|
||||
PACKET3_UNMAP_QUEUES = 0xA3 # macro
|
||||
def PACKET3_UNMAP_QUEUES_ACTION(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_UNMAP_QUEUES_QUEUE_SEL(x): # macro
|
||||
return ((x)<<4)
|
||||
def PACKET3_UNMAP_QUEUES_ENGINE_SEL(x): # macro
|
||||
return ((x)<<26)
|
||||
def PACKET3_UNMAP_QUEUES_NUM_QUEUES(x): # macro
|
||||
return ((x)<<29)
|
||||
def PACKET3_UNMAP_QUEUES_PASID(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(x): # macro
|
||||
return ((x)<<2)
|
||||
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET1(x): # macro
|
||||
return ((x)<<2)
|
||||
def PACKET3_UNMAP_QUEUES_RB_WPTR(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET2(x): # macro
|
||||
return ((x)<<2)
|
||||
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET3(x): # macro
|
||||
return ((x)<<2)
|
||||
PACKET3_QUERY_STATUS = 0xA4 # macro
|
||||
def PACKET3_QUERY_STATUS_CONTEXT_ID(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_QUERY_STATUS_INTERRUPT_SEL(x): # macro
|
||||
return ((x)<<28)
|
||||
def PACKET3_QUERY_STATUS_COMMAND(x): # macro
|
||||
return ((x)<<30)
|
||||
def PACKET3_QUERY_STATUS_PASID(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_QUERY_STATUS_DOORBELL_OFFSET(x): # macro
|
||||
return ((x)<<2)
|
||||
def PACKET3_QUERY_STATUS_ENG_SEL(x): # macro
|
||||
return ((x)<<25)
|
||||
PACKET3_RUN_CLEANER_SHADER = 0xD2 # macro
|
||||
VCE_CMD_NO_OP = 0x00000000 # macro
|
||||
VCE_CMD_END = 0x00000001 # macro
|
||||
VCE_CMD_IB = 0x00000002 # macro
|
||||
VCE_CMD_FENCE = 0x00000003 # macro
|
||||
VCE_CMD_TRAP = 0x00000004 # macro
|
||||
VCE_CMD_IB_AUTO = 0x00000005 # macro
|
||||
VCE_CMD_SEMAPHORE = 0x00000006 # macro
|
||||
VCE_CMD_IB_VM = 0x00000102 # macro
|
||||
VCE_CMD_WAIT_GE = 0x00000106 # macro
|
||||
VCE_CMD_UPDATE_PTB = 0x00000107 # macro
|
||||
VCE_CMD_FLUSH_TLB = 0x00000108 # macro
|
||||
VCE_CMD_REG_WRITE = 0x00000109 # macro
|
||||
VCE_CMD_REG_WAIT = 0x0000010a # macro
|
||||
HEVC_ENC_CMD_NO_OP = 0x00000000 # macro
|
||||
HEVC_ENC_CMD_END = 0x00000001 # macro
|
||||
HEVC_ENC_CMD_FENCE = 0x00000003 # macro
|
||||
HEVC_ENC_CMD_TRAP = 0x00000004 # macro
|
||||
HEVC_ENC_CMD_IB_VM = 0x00000102 # macro
|
||||
HEVC_ENC_CMD_REG_WRITE = 0x00000109 # macro
|
||||
HEVC_ENC_CMD_REG_WAIT = 0x0000010a # macro
|
||||
__all__ = \
|
||||
['CACHE_FLUSH_AND_INV_TS_EVENT', 'CE_PARTITION_BASE',
|
||||
'CP_PACKET2', 'CP_PACKETJ_NOP', 'EOP_EXEC', 'EOP_TCL1_ACTION_EN',
|
||||
'EOP_TCL1_VOL_ACTION_EN', 'EOP_TC_ACTION_EN',
|
||||
'EOP_TC_MD_ACTION_EN', 'EOP_TC_NC_ACTION_EN',
|
||||
'EOP_TC_VOL_ACTION_EN', 'EOP_TC_WB_ACTION_EN',
|
||||
'F32_MES_PM4_PACKETS_H', 'FRAME_TMZ', 'GFX9_NUM_COMPUTE_RINGS',
|
||||
'GFX9_NUM_GFX_RINGS', 'HEVC_ENC_CMD_END', 'HEVC_ENC_CMD_FENCE',
|
||||
'HEVC_ENC_CMD_IB_VM', 'HEVC_ENC_CMD_NO_OP',
|
||||
'HEVC_ENC_CMD_REG_WAIT', 'HEVC_ENC_CMD_REG_WRITE',
|
||||
'HEVC_ENC_CMD_TRAP', 'INDIRECT_BUFFER_VALID', 'PACKET2_PAD_MASK',
|
'PACKET2_PAD_SHIFT', 'PACKET3_ACQUIRE_MEM', 'PACKET3_ATOMIC_GDS',
'PACKET3_ATOMIC_MEM', 'PACKET3_CLEAR_STATE', 'PACKET3_COND_EXEC',
'PACKET3_COND_WRITE', 'PACKET3_CONTEXT_CONTROL',
'PACKET3_COPY_DATA', 'PACKET3_DISPATCH_DIRECT',
'PACKET3_DISPATCH_INDIRECT', 'PACKET3_DMA_DATA',
'PACKET3_DMA_DATA_CMD_DAIC', 'PACKET3_DMA_DATA_CMD_DAS',
'PACKET3_DMA_DATA_CMD_RAW_WAIT', 'PACKET3_DMA_DATA_CMD_SAIC',
'PACKET3_DMA_DATA_CMD_SAS', 'PACKET3_DMA_DATA_CP_SYNC',
'PACKET3_DRAW_INDEX_2', 'PACKET3_DRAW_INDEX_AUTO',
'PACKET3_DRAW_INDEX_INDIRECT',
'PACKET3_DRAW_INDEX_INDIRECT_MULTI',
'PACKET3_DRAW_INDEX_MULTI_AUTO', 'PACKET3_DRAW_INDEX_OFFSET_2',
'PACKET3_DRAW_INDIRECT', 'PACKET3_DRAW_INDIRECT_MULTI',
'PACKET3_DRAW_PREAMBLE', 'PACKET3_DUMP_CONST_RAM',
'PACKET3_EVENT_WRITE', 'PACKET3_FRAME_CONTROL',
'PACKET3_INCREMENT_CE_COUNTER', 'PACKET3_INCREMENT_DE_COUNTER',
'PACKET3_INDEX_BASE', 'PACKET3_INDEX_BUFFER_SIZE',
'PACKET3_INDEX_TYPE', 'PACKET3_INDIRECT_BUFFER',
'PACKET3_INDIRECT_BUFFER_CONST', 'PACKET3_INVALIDATE_TLBS',
'PACKET3_LOAD_CONFIG_REG', 'PACKET3_LOAD_CONST_RAM',
'PACKET3_LOAD_CONTEXT_REG', 'PACKET3_LOAD_SH_REG',
'PACKET3_LOAD_UCONFIG_REG', 'PACKET3_MAP_QUEUES',
'PACKET3_MEM_SEMAPHORE', 'PACKET3_NOP', 'PACKET3_NUM_INSTANCES',
'PACKET3_OCCLUSION_QUERY', 'PACKET3_PFP_SYNC_ME',
'PACKET3_PREAMBLE_BEGIN_CLEAR_STATE', 'PACKET3_PREAMBLE_CNTL',
'PACKET3_PREAMBLE_END_CLEAR_STATE', 'PACKET3_PRED_EXEC',
'PACKET3_QUERY_STATUS', 'PACKET3_REG_RMW', 'PACKET3_RELEASE_MEM',
'PACKET3_REWIND', 'PACKET3_RUN_CLEANER_SHADER',
'PACKET3_SCRATCH_RAM_READ', 'PACKET3_SCRATCH_RAM_WRITE',
'PACKET3_SEM_SEL_SIGNAL', 'PACKET3_SEM_SEL_SIGNAL_TYPE',
'PACKET3_SEM_SEL_WAIT', 'PACKET3_SEM_USE_MAILBOX',
'PACKET3_SET_BASE', 'PACKET3_SET_CONFIG_REG',
'PACKET3_SET_CONFIG_REG_END', 'PACKET3_SET_CONFIG_REG_START',
'PACKET3_SET_CONTEXT_REG', 'PACKET3_SET_CONTEXT_REG_END',
'PACKET3_SET_CONTEXT_REG_INDIRECT',
'PACKET3_SET_CONTEXT_REG_START', 'PACKET3_SET_PREDICATION',
'PACKET3_SET_QUEUE_REG', 'PACKET3_SET_RESOURCES',
'PACKET3_SET_SH_REG', 'PACKET3_SET_SH_REG_END',
'PACKET3_SET_SH_REG_OFFSET', 'PACKET3_SET_SH_REG_START',
'PACKET3_SET_UCONFIG_REG', 'PACKET3_SET_UCONFIG_REG_END',
'PACKET3_SET_UCONFIG_REG_INDEX_TYPE',
'PACKET3_SET_UCONFIG_REG_START', 'PACKET3_STRMOUT_BUFFER_UPDATE',
'PACKET3_SWITCH_BUFFER', 'PACKET3_UNMAP_QUEUES',
'PACKET3_WAIT_ON_CE_COUNTER', 'PACKET3_WAIT_ON_DE_COUNTER_DIFF',
'PACKET3_WAIT_REG_MEM', 'PACKET3_WRITE_CONST_RAM',
'PACKET3_WRITE_DATA', 'PACKETJ_CONDITION_CHECK0',
'PACKETJ_CONDITION_CHECK1', 'PACKETJ_CONDITION_CHECK2',
'PACKETJ_CONDITION_CHECK3', 'PACKETJ_CONDITION_CHECK4',
'PACKETJ_CONDITION_CHECK5', 'PACKETJ_CONDITION_CHECK6',
'PACKETJ_CONDITION_CHECK7', 'PACKETJ_TYPE0', 'PACKETJ_TYPE1',
'PACKETJ_TYPE2', 'PACKETJ_TYPE3', 'PACKETJ_TYPE4',
'PACKETJ_TYPE5', 'PACKETJ_TYPE6', 'PACKETJ_TYPE7', 'PACKET_TYPE0',
'PACKET_TYPE1', 'PACKET_TYPE2', 'PACKET_TYPE3',
'PM4_MEC_RELEASE_MEM_DEFINED', 'PM4_MEC_WRITE_DATA_DEFINED',
'PM4_MES_HEADER_DEFINED', 'SOC15_H', 'VCE_CMD_END',
'VCE_CMD_FENCE', 'VCE_CMD_FLUSH_TLB', 'VCE_CMD_IB',
'VCE_CMD_IB_AUTO', 'VCE_CMD_IB_VM', 'VCE_CMD_NO_OP',
'VCE_CMD_REG_WAIT', 'VCE_CMD_REG_WRITE', 'VCE_CMD_SEMAPHORE',
'VCE_CMD_TRAP', 'VCE_CMD_UPDATE_PTB', 'VCE_CMD_WAIT_GE',
'WRITE_DATA_addr_incr_enum', 'WRITE_DATA_cache_policy_enum',
'WRITE_DATA_dst_sel_enum', 'WRITE_DATA_wr_confirm_enum',
'WR_CONFIRM', 'WR_ONE_ADDR',
'addr_incr___write_data__do_not_increment_address',
'addr_incr___write_data__increment_address',
'c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT', 'c_uint32', 'c_uint32',
'c_uint32', 'c_uint32', 'c_uint32', 'c_uint32',
'cache_policy___write_data__lru',
'cache_policy___write_data__stream',
'cache_policy__mec_release_mem__lru',
'cache_policy__mec_release_mem__stream',
'data_sel__mec_release_mem__none',
'data_sel__mec_release_mem__send_32_bit_low',
'data_sel__mec_release_mem__send_64_bit_data',
'data_sel__mec_release_mem__send_cp_perfcounter_hi_lo',
'data_sel__mec_release_mem__send_gpu_clock_counter',
'data_sel__mec_release_mem__store_gds_data_to_memory',
'dst_sel___write_data__gds',
'dst_sel___write_data__mem_mapped_register',
'dst_sel___write_data__memory',
'dst_sel___write_data__memory_mapped_adc_persistent_state',
'dst_sel___write_data__tc_l2',
'dst_sel__mec_release_mem__memory_controller',
'dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit',
'dst_sel__mec_release_mem__queue_write_pointer_register',
'dst_sel__mec_release_mem__tc_l2',
'event_index__mec_release_mem__end_of_pipe',
'event_index__mec_release_mem__shader_done', 'int32_t',
'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare',
'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare',
'int_sel__mec_release_mem__none',
'int_sel__mec_release_mem__send_data_after_write_confirm',
'int_sel__mec_release_mem__send_interrupt_after_write_confirm',
'int_sel__mec_release_mem__send_interrupt_only',
'int_sel__mec_release_mem__unconditionally_send_int_ctxid',
'pq_exe_status__mec_release_mem__default',
'pq_exe_status__mec_release_mem__phase_update',
'struct_PM4_MES_TYPE_3_HEADER_0', 'struct_pm4_mec_release_mem',
'struct_pm4_mec_release_mem_1_bitfields2',
'struct_pm4_mec_release_mem_2_bitfields3',
'struct_pm4_mec_release_mem_3_bitfields4',
'struct_pm4_mec_release_mem_3_bitfields4b',
'struct_pm4_mec_release_mem_5_bitfields6c',
'struct_pm4_mec_write_data_mmio',
'struct_pm4_mec_write_data_mmio_1_bitfields2',
'struct_pm4_mec_write_data_mmio_2_bitfields3', 'uint32_t',
'union_PM4_MES_TYPE_3_HEADER', 'union_pm4_mec_release_mem_0',
'union_pm4_mec_release_mem_1', 'union_pm4_mec_release_mem_2',
'union_pm4_mec_release_mem_3', 'union_pm4_mec_release_mem_4',
'union_pm4_mec_release_mem_5', 'union_pm4_mec_release_mem_6',
'union_pm4_mec_write_data_mmio_0',
'union_pm4_mec_write_data_mmio_1',
'union_pm4_mec_write_data_mmio_2',
'wr_confirm___write_data__do_not_wait_for_write_confirmation',
'wr_confirm___write_data__wait_for_write_confirmation']
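Editor's note, a minimal sketch (not part of the commit, assuming the repo's autogen modules are importable) of how the runtime picks its PM4 vocabulary: gfx9 parts such as MI300X use the pm4_soc15 symbols generated above, while gfx10 and newer keep using pm4_nv. The selection expression mirrors the one added to ops_amd.py further down in this diff.

import importlib

def pm4_module(gfxver:int):
  # same selection as in AMDDevice.__init__ below
  return importlib.import_module(f"tinygrad.runtime.autogen.am.pm4_{'nv' if gfxver >= 10 else 'soc15'}")

pm4 = pm4_module(9)                 # MI300X is gfx942, i.e. gfxver 9
opcode = pm4.PACKET3_RELEASE_MEM    # one of the names exported in the __all__ list above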
5209
tinygrad/runtime/autogen/am/sdma_4_0_0.py
Normal file
5209
tinygrad/runtime/autogen/am/sdma_4_0_0.py
Normal file
File diff suppressed because it is too large
Load Diff
5209
tinygrad/runtime/autogen/am/sdma_4_4_2.py
Normal file
5209
tinygrad/runtime/autogen/am/sdma_4_4_2.py
Normal file
File diff suppressed because it is too large
Load Diff
36196
tinygrad/runtime/autogen/am/vega10.py
Normal file
36196
tinygrad/runtime/autogen/am/vega10.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -6,7 +6,7 @@ from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
from tinygrad.ops import sint
from tinygrad.device import Compiled, ProfileEvent, BufferSpec, CPUProgram, PROFILE
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, all_same, flatten, DEBUG, OSX
from tinygrad.renderer.cstyle import AMDRenderer
from tinygrad.renderer.llvmir import AMDLLVMRenderer
from tinygrad.runtime.autogen import kfd, hsa, libc, pci, vfio, sqtt
@@ -32,7 +32,7 @@ class AMDSignal(HCQSignal):

class AMDComputeQueue(HWQueue):
  def __init__(self, dev:AMDDevice):
    self.soc, self.pm4, self.gc, self.nbio = dev.soc, dev.pm4, dev.gc, dev.nbio
    self.dev, self.soc, self.pm4, self.gc, self.nbio = dev, dev.soc, dev.pm4, dev.gc, dev.nbio
    super().__init__()

  def __del__(self):
@@ -44,6 +44,15 @@ class AMDComputeQueue(HWQueue):
  def gfxreg(self, reg:AMDReg): return reg.addr - self.pm4.PACKET3_SET_SH_REG_START
  def ucfgreg(self, reg:AMDReg): return reg.addr - self.pm4.PACKET3_SET_UCONFIG_REG_START

  @contextlib.contextmanager
  def pred_exec(self, xcc_mask:int):
    if self.dev.xccs > 1:
      self.pkt3(self.pm4.PACKET3_PRED_EXEC, xcc_mask << 24)
      prev_len = len(self._q)
    yield
    if self.dev.xccs > 1:
      self._q[prev_len-1] |= (len(self._q) - prev_len)

  def sqtt_userdata(self, data, *extra_dwords):
    data_ints = [x[0] for x in struct.iter_unpack('<I', bytes(data))] + list(extra_dwords)
    for i in range(0, len(data_ints), 2):
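Editor's note, a hedged stand-alone sketch (toy queue and opcode values, not the real HWQueue or pm4 module) of what the pred_exec() helper above does to the packet stream: the PRED_EXEC payload carries the XCC mask in its high byte, and once the body has been recorded the number of following dwords to predicate is patched into the packet.

import contextlib

class ToyQueue:
  def __init__(self): self._q, self.xccs = [], 8
  def pkt3(self, op, *args): self._q += [op, *args]          # simplified: the real pkt3 packs a PM4 type-3 header
  @contextlib.contextmanager
  def pred_exec(self, xcc_mask:int, PRED_EXEC_OP=0x1000):    # hypothetical opcode constant, for illustration only
    if self.xccs > 1:
      self.pkt3(PRED_EXEC_OP, xcc_mask << 24)
      prev_len = len(self._q)
    yield
    if self.xccs > 1:
      self._q[prev_len-1] |= (len(self._q) - prev_len)       # patch in the predicated dword count

q = ToyQueue()
with q.pred_exec(xcc_mask=1 << 0):                           # only logical XCC 0 runs the body
  q.pkt3(0x2000, 0x1234)                                     # hypothetical packet
assert q._q[1] == (1 << 24) | 2                              # mask in bits 24+, body length (2 dwords) in the low bits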
@@ -56,33 +65,68 @@ class AMDComputeQueue(HWQueue):
    self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)

  def acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
    cache_flags_dw = self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
      | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
      | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
      | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
      | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
    if self.dev.gfxver >= 10:
      cache_flags_dw = self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
        | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
        | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
        | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
        | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)

    self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
      self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
    else:
      cp_coher_cntl = self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_ICACHE_ACTION_ENA(gli) | \
        self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_ACTION_ENA(glk) | \
        self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_ACTION_ENA(1) | \
        self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_ACTION_ENA(1) | \
        self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WB_ACTION_ENA(1)
      self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, cp_coher_cntl, *data64_le(sz), *data64_le(addr), 0x0000000A)

  def release_mem(self, address, value, data_sel, int_sel, ctxid=0, cache_flush=False):
    cache_flags_dw = 0 if not cache_flush else (self.pm4.PACKET3_RELEASE_MEM_GCR_GLV_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL1_INV \
      | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_WB \
      | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_WB | self.pm4.PACKET3_RELEASE_MEM_GCR_SEQ)
  def release_mem(self, address=0x0, value=0, data_sel=0, int_sel=2, ctxid=0, cache_flush=False):
    if self.dev.gfxver >= 10:
      cache_flags_dw = 0 if not cache_flush else (self.pm4.PACKET3_RELEASE_MEM_GCR_GLV_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL1_INV \
        | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_WB \
        | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_WB | self.pm4.PACKET3_RELEASE_MEM_GCR_SEQ)

    event_dw = self.pm4.PACKET3_RELEASE_MEM_EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) \
      | self.pm4.PACKET3_RELEASE_MEM_EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)
      event_dw = self.pm4.PACKET3_RELEASE_MEM_EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) \
        | self.pm4.PACKET3_RELEASE_MEM_EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)

    memsel_dw = self.pm4.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | self.pm4.PACKET3_RELEASE_MEM_INT_SEL(int_sel) \
      | self.pm4.PACKET3_RELEASE_MEM_DST_SEL(0)
      memsel_dw = self.pm4.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | self.pm4.PACKET3_RELEASE_MEM_INT_SEL(int_sel) \
        | self.pm4.PACKET3_RELEASE_MEM_DST_SEL(0)
    else:
      cache_flags_dw = 0 if not cache_flush else (self.pm4.EOP_TC_WB_ACTION_EN | self.pm4.EOP_TC_NC_ACTION_EN)

      event_dw = self.pm4.EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) | self.pm4.EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)

      memsel_dw = self.pm4.DATA_SEL(data_sel) | self.pm4.INT_SEL(int_sel)

      ctxid = 0

    self.pkt3(self.pm4.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)

  def xcc_barrier(self):
    if self.dev.xcc_sync is None: return self
    assert self.dev.xccs == 8, 'only 8 XCCs supported'
    a, b = self.dev.xcc_sync
    mem_eq = self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | self.pm4.WAIT_REG_MEM_MEM_SPACE(1)
    self.pkt3(self.pm4.PACKET3_ATOMIC_MEM, self.soc.TC_OP_ATOMIC_ADD_RTN_32, *data64_le(a.value_addr), *data64_le(1), *data64_le(0), 10) # a += 1
    self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, mem_eq, *data64_le(a.value_addr), 0, 0b111, 10) # a == 0 (mod 8) via bitmask
    self.pkt3(self.pm4.PACKET3_ATOMIC_MEM, self.soc.TC_OP_ATOMIC_ADD_RTN_32, *data64_le(b.value_addr), *data64_le(1), *data64_le(0), 10) # b += 1
    self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, mem_eq, *data64_le(b.value_addr), 0, 0b111, 10) # b == 0 (mod 8) via bitmask
    return self

  def memory_barrier(self):
    self.wait_reg_mem(reg_req=self.nbio.regBIF_BX_PF0_GPU_HDP_FLUSH_REQ.addr, reg_done=self.nbio.regBIF_BX_PF0_GPU_HDP_FLUSH_DONE.addr,
                      value=0xffffffff)
    self.acquire_mem()
    return self

  def xcc_config(self):
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_TG_CHUNK_SIZE), 1)
    for xcc_id in range(self.dev.xccs):
      with self.pred_exec(xcc_mask=1 << xcc_id):
        self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_CURRENT_LOGIC_XCC_ID), xcc_id)
    return self

  def spi_config(self, tracing:bool):
    spi_config_cntl = self.gc.regSPI_CONFIG_CNTL.encode(ps_pkr_priority_cntl=3, exp_priority_order=3, gpr_write_priority=0x2c688,
                                                        enable_sqg_bop_events=int(tracing), enable_sqg_top_events=int(tracing))
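Editor's note, a CPU-side analogue (not the driver code) of the two-counter xcc_barrier above: each of the 8 XCCs atomically bumps counter a and waits until it is a multiple of 8, then repeats with counter b. The second counter keeps a fast participant from racing into the next barrier and disturbing a before the slow ones have observed it reach the multiple of 8.

import threading, time

class TwoCounterBarrier:
  def __init__(self, n:int=8):
    self.n, self.counters, self.lock = n, [0, 0], threading.Lock()
  def _phase(self, idx:int):
    with self.lock: self.counters[idx] += 1                   # like PACKET3_ATOMIC_MEM add-return
    while self.counters[idx] % self.n != 0: time.sleep(0)     # like WAIT_REG_MEM with mask 0b111 (mod 8)
  def wait(self):
    self._phase(0)  # a += 1; wait until a % 8 == 0
    self._phase(1)  # b += 1; wait until b % 8 == 0

barrier = TwoCounterBarrier(8)
threads = [threading.Thread(target=barrier.wait) for _ in range(8)]
for t in threads: t.start()
for t in threads: t.join()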
@@ -167,6 +211,7 @@ class AMDComputeQueue(HWQueue):
    self.acquire_mem(gli=0, gl2=0)

    if prg.enable_private_segment_sgpr:
      assert self.dev.xccs == 1, "Only architected flat scratch is supported on multi-xcc"
      scratch_hilo = data64_le(prg.dev.scratch.va_addr)
      # sgpr word1 bit31 enables swizzle
      # sgpr word3 = 0x14 << 12 | 2 << 28 | 2 << 21 | 1 << 23
@@ -198,42 +243,52 @@ class AMDComputeQueue(HWQueue):

    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_RSRC3), 0)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_RSRC3), prg.rsrc3)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
    if prg.dev.has_scratch_base_registers:
      self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8))
    if prg.dev.target < 110000: self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.mmCP_COHER_START_DELAY), 0x20)
      for xcc_id in range(self.dev.xccs):
        with self.pred_exec(xcc_mask=1<<xcc_id):
          scratch_base = prg.dev.scratch.va_addr + (prg.dev.scratch.size // self.dev.xccs * xcc_id)
          self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(scratch_base >> 8))
    if 100000 <= prg.dev.target < 110000: self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.mmCP_COHER_START_DELAY), 0x20)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_RESTART_X), 0, 0, 0)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xFFFFFFFF, 0xFFFFFFFF)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xFFFFFFFF, 0xFFFFFFFF)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
    if prg.dev.target >= 100000:
      self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_USER_DATA_0), *user_regs)

    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_RESOURCE_LIMITS), 0)

    DISPATCH_INITIATOR = self.gc.regCOMPUTE_DISPATCH_INITIATOR.encode(cs_w32_en=1, force_start_at_000=1, compute_shader_en=1)
    gfx10p = {'cs_w32_en': int(prg.wave32)} if prg.dev.target >= 100000 else {}
    DISPATCH_INITIATOR = self.gc.regCOMPUTE_DISPATCH_INITIATOR.encode(**gfx10p, force_start_at_000=1, compute_shader_en=1)
    self.pkt3(self.pm4.PACKET3_DISPATCH_DIRECT, *global_size, DISPATCH_INITIATOR)
    if prg.dev.sqtt_enabled: self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_MARKER) | self.pm4.EVENT_INDEX(0))
    self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.CS_PARTIAL_FLUSH) | self.pm4.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
    if self.dev.xccs > 1: self.release_mem(cache_flush=True)
    self.xcc_barrier()
    return self

  def wait(self, signal:AMDSignal, value:sint=0):
    self.wait_reg_mem(mem=signal.value_addr, value=value, mask=0xffffffff)
    self.xcc_barrier()
    return self

  def timestamp(self, signal:AMDSignal):
    self.release_mem(signal.timestamp_addr, 0, self.pm4.data_sel__mec_release_mem__send_gpu_clock_counter, self.pm4.int_sel__mec_release_mem__none)
    with self.pred_exec(xcc_mask=0b1):
      self.release_mem(signal.timestamp_addr, 0, self.pm4.data_sel__mec_release_mem__send_gpu_clock_counter, self.pm4.int_sel__mec_release_mem__none)
    return self

  def signal(self, signal:AMDSignal, value:sint=0):
    # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
    self.release_mem(signal.value_addr, value, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
                     self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
    with self.pred_exec(xcc_mask=0b1):
      # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
      self.release_mem(signal.value_addr, value, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
                       self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)

    if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
      self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
                       self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
      if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
        self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
                         self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
    return self

  def bind(self, dev:AMDDevice):
@@ -249,6 +304,13 @@ class AMDComputeQueue(HWQueue):

  def _submit(self, dev:AMDDevice):
    cmds = self.indirect_cmd if dev == self.binded_device else self._q
    # WORKAROUND: PACKET3_PRED_EXEC doesn't work in rings, only in IBs, create a fake IB inside a ring to work around that
    if self.dev.xccs > 1 and dev != self.binded_device:
      ib_end = ((dev.compute_queue.put_value + 5) % len(dev.compute_queue.ring)) + len(cmds)
      ib_pad = len(dev.compute_queue.ring) - (ib_end - len(cmds)) if ib_end > len(dev.compute_queue.ring) else 0
      ib_ptr = mv_address(dev.compute_queue.ring) + ((dev.compute_queue.put_value + 5 + ib_pad) % len(dev.compute_queue.ring)) * 4
      cmds = [self.pm4.PACKET3(self.pm4.PACKET3_INDIRECT_BUFFER, 2), *data64_le(ib_ptr), len(cmds) | self.pm4.INDIRECT_BUFFER_VALID,
              self.pm4.PACKET3(self.pm4.PACKET3_NOP, ib_pad + len(cmds) - 1), *((0,) * ib_pad), *cmds]

    for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value
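Editor's note, a toy re-run (hypothetical numbers, ring length in dwords) of the fake-IB placement arithmetic above: the 5 prefix dwords are the INDIRECT_BUFFER packet plus the NOP header, and when the IB body would wrap around the ring it is pushed past the wrap point with NOP padding so the indirect buffer stays contiguous.

def fake_ib_layout(put_value:int, ring_len:int, ncmds:int):
  ib_end = ((put_value + 5) % ring_len) + ncmds
  ib_pad = ring_len - (ib_end - ncmds) if ib_end > ring_len else 0
  ib_start_dw = (put_value + 5 + ib_pad) % ring_len   # where the IB body lands in the ring
  return ib_pad, ib_start_dw

assert fake_ib_layout(put_value=14, ring_len=16, ncmds=4) == (0, 3)   # fits after the 5-dword prefix
assert fake_ib_layout(put_value=8, ring_len=16, ncmds=6) == (3, 0)    # would wrap, so pad 3 dwords and start at 0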
@@ -257,7 +319,7 @@ class AMDComputeQueue(HWQueue):

class AMDCopyQueue(HWQueue):
  def __init__(self, dev, max_copy_size=0x40000000):
    self.sdma, self.internal_cmd_sizes, self.max_copy_size = dev.sdma, [], max_copy_size
    self.dev, self.sdma, self.internal_cmd_sizes, self.max_copy_size = dev, dev.sdma, [], max_copy_size
    super().__init__()

  def q(self, *arr):
@@ -277,10 +339,12 @@ class AMDCopyQueue(HWQueue):
    return self

  def signal(self, signal:AMDSignal, value:sint=0):
    self.q(self.sdma.SDMA_OP_FENCE | self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value)
    fence_flags = self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3) if self.dev.gfxver >= 10 else 0
    self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(signal.value_addr), value)
    self.q(self.sdma.SDMA_OP_FENCE, *data64_le(signal.value_addr), value)

    if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
      self.q(self.sdma.SDMA_OP_FENCE | self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
      self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
      self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
    elif AMDDevice.driverless: self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))
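Editor's note, a hedged sketch of the SDMA fence emission above: the 64-bit signal address is split into low/high dwords with data64_le, and on gfx10+ the fence header additionally carries MTYPE(3), while gfx9 (MI300X) leaves those bits at 0. The sdma argument stands for the autogen SDMA module already selected on the device.

from tinygrad.helpers import data64_le

def sdma_fence_words(sdma, gfxver:int, addr:int, value:int) -> list[int]:
  fence_flags = sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3) if gfxver >= 10 else 0
  return [sdma.SDMA_OP_FENCE | fence_flags, *data64_le(addr), value]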
@@ -310,7 +374,7 @@ class AMDCopyQueue(HWQueue):
    self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]

  def _submit(self, dev:AMDDevice):
    if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
    if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")

    if self.binded_device == dev:
      # An IB packet must end on a 8 DW boundary.
@@ -361,11 +425,12 @@ class AMDProgram(HCQProgram):
      self.dev._ensure_has_local_memory(self.private_segment_size)

    code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + rodata_entry) # NOTE: this is wrong, it's not this object
    assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
    self.wave32: bool = code.kernel_code_properties & 0x400 == 0x400

    # Set rsrc1.priv=1 on gfx11 to workaround cwsr.
    self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.dev.target < 120000 else 0)
    self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15)
    self.rsrc3: int = image[rodata_entry+44:rodata_entry+48].cast("I")[0] # NOTE: kernel descriptor, not in amd_kernel_code_t struct
    self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + code.kernel_code_entry_byte_offset
    if code.kernel_code_entry_byte_offset == 0: self.prog_addr = self.lib_gpu.va_addr + text_entry
    # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
@@ -400,20 +465,29 @@ class ProfileSQTTEvent(ProfileEvent): device:str; se:int; blob:bytes; itrace:boo
@dataclass
class AMDQueueDesc:
  ring: memoryview
  read_ptr: memoryview
  write_ptr: memoryview
  doorbell: memoryview
  read_ptrs: list[memoryview]
  write_ptrs: list[memoryview]
  doorbells: list[memoryview]
  put_value: int = 0

  @property
  def read_ptr(self): return min(p[0] for p in self.read_ptrs)

  @classmethod
  def multi(cls, *queues: AMDQueueDesc):
    assert all_same([(mv_address(q.ring), q.put_value) for q in queues]), f"All queues must have the same ring and put_value: {queues}"
    return cls(ring=queues[0].ring, put_value=queues[0].put_value, doorbells=flatten(q.doorbells for q in queues),
               read_ptrs=flatten(q.read_ptrs for q in queues), write_ptrs=flatten(q.write_ptrs for q in queues))

  def signal_doorbell(self, dev):
    self.write_ptr[0] = self.put_value
    for write_ptr in self.write_ptrs: write_ptr[0] = self.put_value

    # Ensure all prior writes are visible to the GPU.
    if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)

    # Flush hdp if queue is in dev mem.
    if dev.driverless and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp()
    self.doorbell[0] = self.put_value
    for doorbell in self.doorbells: doorbell[0] = self.put_value

@dataclass(frozen=True)
class AMDReg(AMDRegBase):
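Editor's note, a simplified stand-in (not the real class) for the AMDQueueDesc change above: one descriptor per XCC collapses into a single object, writes fan out to every write pointer and doorbell, and read_ptr reports the slowest XCC.

from dataclasses import dataclass

@dataclass
class ToyQueueDesc:
  read_ptrs: list[list[int]]
  write_ptrs: list[list[int]]
  doorbells: list[list[int]]
  put_value: int = 0

  @property
  def read_ptr(self): return min(p[0] for p in self.read_ptrs)

  def signal_doorbell(self):
    for wp in self.write_ptrs: wp[0] = self.put_value   # real code also issues a seq-cst fence and an HDP flush here
    for db in self.doorbells: db[0] = self.put_value

q = ToyQueueDesc(read_ptrs=[[16], [8]], write_ptrs=[[0], [0]], doorbells=[[0], [0]], put_value=32)
q.signal_doorbell()
assert q.read_ptr == 8 and all(db[0] == 32 for db in q.doorbells)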
@@ -530,21 +604,20 @@ class KFDIface:
                                      n_devices=len(mem.meta.mapped_gpu_ids))
    assert stm.n_success == len(mem.meta.mapped_gpu_ids)

  def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
    cwsr_ctx = self.alloc(round_up(ctx_save_restore_size + debug_memory_size, mmap.PAGESIZE)) if ctx_save_restore_size else None
  def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
    queue = kfd.AMDKFD_IOC_CREATE_QUEUE(KFDIface.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
      queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
      queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE|(xcc_id<<8), queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
      eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,
      ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=ctx_save_restore_size,
      write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
      ctx_save_restore_address=cwsr_buffer.va_addr if cwsr_buffer else 0, ctx_save_restore_size=ctx_save_restore_size,
      write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8 * (xcc_id + 1))

    if not hasattr(self, 'doorbells'):
      self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
      self.doorbells = cast(HWInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)

    return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"),
                        read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
                        doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
                        read_ptrs=[to_mv(queue.read_pointer_address, 8).cast("Q")], write_ptrs=[to_mv(queue.write_pointer_address, 8).cast("Q")],
                        doorbells=[to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q")])

  def sleep(self, tm:int): kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=self.queue_event_arr_ptr, num_events=1, wait_for_all=1, timeout=tm)

@@ -680,7 +753,7 @@ class PCIIface:
    paddrs = [(paddr if mem.meta.mapping.system else (paddr+mem.meta.owner.dev_iface.bar_info[0][0]), size) for paddr,size in mem.meta.mapping.paddrs]
    self.adev.mm.map_range(mem.va_addr, mem.size, paddrs, system=True, snooped=mem.meta.mapping.snooped, uncached=mem.meta.mapping.uncached)

  def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
  def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
    if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
      self.adev.sdma.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
                                doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0), pipe=0, queue=0)
@@ -688,9 +761,8 @@ class PCIIface:
      self.adev.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
                               eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)

    return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"), doorbell=to_mv(self.doorbell_cpu_addr + doorbell_index * 8, 8).cast("Q"),
                        read_ptr=to_mv(gart.va_addr, 8).cast("Q"), write_ptr=to_mv(gart.va_addr+0x10, 8).cast("Q"))

    return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"), doorbells=[to_mv(self.doorbell_cpu_addr + doorbell_index * 8, 8).cast("Q")],
                        read_ptrs=[to_mv(gart.va_addr, 8).cast("Q")], write_ptrs=[to_mv(gart.va_addr+0x10, 8).cast("Q")])
  def sleep(self, timeout):
    if PCIIface.vfio and (events_cnt:=len(self.irq_poller.poll(timeout))):
      self.irq_fd.read(8 * events_cnt)
@@ -713,41 +785,52 @@ class AMDDevice(HCQCompiled):
    self.device_id = int(device.split(":")[1]) if ":" in device else 0
    self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id)
    self.target = int(self.dev_iface.props['gfx_target_version'])
    self.gfxver = self.target // 10000
    self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
    if self.target < 100300 or self.target >= 130000: raise RuntimeError(f"Unsupported arch: {self.arch}")
    if self.target < 90402 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
    if DEBUG >= 1: print(f"AMDDevice: opening {self.device_id} with target {self.target} arch {self.arch}")

    self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1
    self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1
    self.has_scratch_base_registers = self.target >= 110000
    self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] // self.dev_iface.props.get('num_xcc', 1) - 1
    self.max_wave_id = (self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1) if self.target >= 100100 else \
                       (min((self.max_cu_id+1)*40, self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine'] * 512) - 1)
    self.xccs = self.dev_iface.props.get('num_xcc', 1) if getenv("XCCS", 1) else 1
    self.has_scratch_base_registers = self.target >= 110000 or self.target == 90402 # this is what llvm refers to as "architected flat scratch"

    # https://gitlab.freedesktop.org/agd5f/linux/-/blob/a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8/drivers/gpu/drm/amd/amdkfd/kfd_queue.c#L391
    sgrp_size_per_cu, lds_size_per_cu, hwreg_size_per_cu = 0x4000, 0x10000, 0x1000
    vgpr_size_per_cu = 0x60000 if self.target in {110000, 110001, 120000, 120001} else 0x40000
    vgpr_size_per_cu = 0x60000 if self.target in {110000, 110001, 120000, 120001} else \
                       0x80000 if (self.target//100)*100 == 90400 or self.target in {90008, 90010} else 0x40000
    wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (self.max_cu_id + 1), mmap.PAGESIZE)
    ctl_stack_size = round_up(12 * (self.max_cu_id + 1) * (self.max_wave_id + 1) + 8 + 40, mmap.PAGESIZE)
    if self.target//10000 == 10: ctl_stack_size = min(ctl_stack_size, 0x7000)
    debug_memory_size = round_up((self.max_cu_id + 1) * (self.max_wave_id + 1) * 32, 64)
    ctl_stack_size = round_up(12 * (self.max_cu_id + 1) * (self.max_wave_id + 1) + 8 + 40, mmap.PAGESIZE) if self.target >= 100100 else \
                     round_up((self.max_wave_id + 1) * 8 + 8 + 40, mmap.PAGESIZE)
    debug_memory_size = round_up((self.max_cu_id + 1 if self.target >= 100100 else 1) * (self.max_wave_id + 1) * 32, 64)
    if self.gfxver == 10: ctl_stack_size = min(ctl_stack_size, 0x7000)

    self.soc = importlib.import_module(f"tinygrad.runtime.autogen.am.{({10: 'navi10', 11: 'soc21', 12: 'soc24'}[self.target//10000])}")
    self.pm4 = importlib.import_module("tinygrad.runtime.autogen.am.pm4_nv")
    self.soc = importlib.import_module(f"tinygrad.runtime.autogen.am.{({9: 'vega10', 10: 'navi10', 11: 'soc21', 12: 'soc24'}[self.gfxver])}")
    self.pm4 = importlib.import_module(f"tinygrad.runtime.autogen.am.pm4_{'nv' if self.gfxver >= 10 else 'soc15'}")
    self.sdma = import_module('sdma', self.dev_iface.ip_versions[am.SDMA0_HWIP])
    self.gc = AMDIP('gc', self.dev_iface.ip_versions[am.GC_HWIP], self.dev_iface.ip_offsets[am.GC_HWIP])
    self.nbio = AMDIP('nbio' if self.target < 120000 else 'nbif', self.dev_iface.ip_versions[am.NBIF_HWIP], self.dev_iface.ip_offsets[am.NBIF_HWIP])
    pad = (0,) if self.gfxver == 9 else () # ?!?!?!?!??!?!?!
    self.nbio = AMDIP('nbio' if self.gfxver < 12 else 'nbif', self.dev_iface.ip_versions[am.NBIF_HWIP], pad+self.dev_iface.ip_offsets[am.NBIF_HWIP])

    self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x800000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
                                           eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)

    max_copy_size = 0x40000000 if self.dev_iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
    self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x800000)

    super().__init__(device, AMDAllocator(self), AMDLLVMRenderer() if getenv("AMD_LLVM", 0) else AMDRenderer(self.arch),
                     AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 0) else HIPCompiler(self.arch), functools.partial(AMDProgram, self),
                     AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self))
                     AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size))

    # Scratch setup
    self.max_private_segment_size = 0
    self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread

    # XCC setup
    self.xcc_sync: tuple[AMDSignal, AMDSignal]|None = (AMDSignal(), AMDSignal()) if self.xccs > 1 else None
    if self.xccs > 1: AMDComputeQueue(self).xcc_config().submit(self)

    # SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them)
    self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
    if self.sqtt_enabled:
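Editor's note, a worked example of the gfx_target_version to arch-string mapping used above (hypothetical standalone helper, same format expression as the code): MI300X reports target 90402, which renders as "gfx942" with gfxver 9.

def arch_from_target(target:int) -> str:
  return "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)

assert arch_from_target(90402) == "gfx942"    # MI300X, gfxver 9
assert arch_from_target(110000) == "gfx1100"  # gfxver 11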
@@ -767,8 +850,11 @@ class AMDDevice(HCQCompiled):
    ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
    gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
    eop_buffer = self.dev_iface.alloc(eop_buffer_size) if eop_buffer_size else None
    return self.dev_iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, debug_memory_size=debug_memory_size,
                                       ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)
    cwsr_buffer_size = round_up((ctx_save_restore_size + debug_memory_size) * self.dev_iface.props.get('num_xcc', 1), mmap.PAGESIZE)
    return AMDQueueDesc.multi(*(self.dev_iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, xcc_id=xcc_id,
                                                            ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size,
                                                            cwsr_buffer=(self.dev_iface.alloc(cwsr_buffer_size) if ctx_save_restore_size else None))
                                for xcc_id in range(self.xccs if queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE else 1)))

  def _ensure_has_local_memory(self, required):
    if self.max_private_segment_size >= required: return
@@ -776,12 +862,13 @@ class AMDDevice(HCQCompiled):
    # <gfx103 requires alignment of 1024, >=gfx11 requires 256
    wave_scratch_len = round_up(((self.max_wave_id + 1) * required), 256 if self.target >= 110000 else 1024)

    self.scratch, ok = self._realloc(getattr(self, 'scratch', None), (self.max_cu_id+1)*self.dev_iface.props['max_slots_scratch_cu']*wave_scratch_len)
    scratch_size = (self.max_cu_id+1)*self.dev_iface.props['max_slots_scratch_cu']*wave_scratch_len # per xcc
    self.scratch, ok = self._realloc(getattr(self, 'scratch', None), scratch_size*self.xccs)
    if ok:
      engines = self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine']
      waves = wave_scratch_len // (256 if self.target >= 110000 else 1024)
      # >=gfx11 wavesize is per SE
      wavesize = self.scratch.size // ((wave_scratch_len * engines) if self.target >= 110000 else wave_scratch_len)
      wavesize = scratch_size // ((wave_scratch_len * engines) if self.target >= 110000 else wave_scratch_len)
      self.tmpring_size = waves << 12 | wavesize
      self.max_private_segment_size = required
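Editor's note, a worked example of the COMPUTE_TMPRING_SIZE packing above (toy numbers): the WAVES count is shifted into bits 12 and up while WAVESIZE occupies the low bits.

waves, wavesize = 32, 0x100
tmpring_size = waves << 12 | wavesize
assert tmpring_size == 0x20100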