kfd free buffers (#4027)

* kfd free buffers

* unmap

* all test passes

* better pm4

* forgot these

* invalidate only range

* better cache

* forgot

* comments

* fixes
This commit is contained in:
nimlgen
2024-04-02 01:50:58 +03:00
committed by GitHub
parent 77a68fc52f
commit d6ba44bc1e
5 changed files with 1451 additions and 30 deletions

View File

@@ -79,16 +79,21 @@ generate_hsa() {
/opt/rocm/include/hsa/amd_hsa_signal.h \
/opt/rocm/include/hsa/amd_hsa_queue.h \
/opt/rocm/include/hsa/hsa_ext_finalize.h /opt/rocm/include/hsa/hsa_ext_image.h \
/opt/rocm/include/hsa/hsa_ven_amd_aqlprofile.h \
--clang-args="-I/opt/rocm/include" \
-o $BASE/hsa.py -l /opt/rocm/lib/libhsa-runtime64.so
# clang2py broken when pass -x c++ to prev headers
clang2py extra/hip_gpu_driver/sdma_registers.h \
--clang-args="-I/opt/rocm/include -x c++" \
-o $BASE/amd_sdma.py -l /opt/rocm/lib/libhsa-runtime64.so
-o $BASE/amd_gpu.py -l /opt/rocm/lib/libhsa-runtime64.so
sed 's/^\(.*\)\(\s*\/\*\)\(.*\)$/\1 #\2\3/; s/^\(\s*\*\)\(.*\)$/#\1\2/' extra/hip_gpu_driver/nvd.h >> $BASE/amd_gpu.py # comments
sed -i 's/#\s*define\s*\([^ \t]*\)(\([^)]*\))\s*\(.*\)/def \1(\2): return \3/' $BASE/amd_gpu.py # #define name(x) (smth) -> def name(x): return (smth)
sed -i '/#\s*define\s\+\([^ \t]\+\)\s\+\([^ ]\+\)/s//\1 = \2/' $BASE/amd_gpu.py # #define name val -> name = val
fixup $BASE/hsa.py
fixup $BASE/amd_sdma.py
fixup $BASE/amd_gpu.py
sed -i "s\import ctypes\import ctypes, os\g" $BASE/hsa.py
sed -i "s\'/opt/rocm/\os.getenv('ROCM_PATH', '/opt/rocm/')+'/\g" $BASE/hsa.py
python3 -c "import tinygrad.runtime.autogen.hsa"

470
extra/hip_gpu_driver/nvd.h Normal file
View File

@@ -0,0 +1,470 @@
/*
* Copyright 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef NVD_H
#define NVD_H
/**
* Navi's PM4 definitions
*/
#define PACKET_TYPE0 0
#define PACKET_TYPE1 1
#define PACKET_TYPE2 2
#define PACKET_TYPE3 3
#define CP_PACKET_GET_TYPE(h) (((h) >> 30) & 3)
#define CP_PACKET_GET_COUNT(h) (((h) >> 16) & 0x3FFF)
#define CP_PACKET0_GET_REG(h) ((h) & 0xFFFF)
#define CP_PACKET3_GET_OPCODE(h) (((h) >> 8) & 0xFF)
#define PACKET0(reg, n) ((PACKET_TYPE0 << 30) | \
((reg) & 0xFFFF) | \
((n) & 0x3FFF) << 16)
#define CP_PACKET2 0x80000000
#define PACKET2_PAD_SHIFT 0
#define PACKET2_PAD_MASK (0x3fffffff << 0)
#define PACKET2(v) (CP_PACKET2 | REG_SET(PACKET2_PAD, (v)))
#define PACKET3(op, n) ((PACKET_TYPE3 << 30) | \
(((op) & 0xFF) << 8) | \
((n) & 0x3FFF) << 16)
#define PACKET3_COMPUTE(op, n) (PACKET3(op, n) | 1 << 1)
/* Packet 3 types */
#define PACKET3_NOP 0x10
#define PACKET3_SET_BASE 0x11
#define PACKET3_BASE_INDEX(x) ((x) << 0)
#define CE_PARTITION_BASE 3
#define PACKET3_CLEAR_STATE 0x12
#define PACKET3_INDEX_BUFFER_SIZE 0x13
#define PACKET3_DISPATCH_DIRECT 0x15
#define PACKET3_DISPATCH_INDIRECT 0x16
#define PACKET3_INDIRECT_BUFFER_END 0x17
#define PACKET3_INDIRECT_BUFFER_CNST_END 0x19
#define PACKET3_ATOMIC_GDS 0x1D
#define PACKET3_ATOMIC_MEM 0x1E
#define PACKET3_OCCLUSION_QUERY 0x1F
#define PACKET3_SET_PREDICATION 0x20
#define PACKET3_REG_RMW 0x21
#define PACKET3_COND_EXEC 0x22
#define PACKET3_PRED_EXEC 0x23
#define PACKET3_DRAW_INDIRECT 0x24
#define PACKET3_DRAW_INDEX_INDIRECT 0x25
#define PACKET3_INDEX_BASE 0x26
#define PACKET3_DRAW_INDEX_2 0x27
#define PACKET3_CONTEXT_CONTROL 0x28
#define PACKET3_INDEX_TYPE 0x2A
#define PACKET3_DRAW_INDIRECT_MULTI 0x2C
#define PACKET3_DRAW_INDEX_AUTO 0x2D
#define PACKET3_NUM_INSTANCES 0x2F
#define PACKET3_DRAW_INDEX_MULTI_AUTO 0x30
#define PACKET3_INDIRECT_BUFFER_PRIV 0x32
#define PACKET3_INDIRECT_BUFFER_CNST 0x33
#define PACKET3_COND_INDIRECT_BUFFER_CNST 0x33
#define PACKET3_STRMOUT_BUFFER_UPDATE 0x34
#define PACKET3_DRAW_INDEX_OFFSET_2 0x35
#define PACKET3_DRAW_PREAMBLE 0x36
#define PACKET3_WRITE_DATA 0x37
#define WRITE_DATA_DST_SEL(x) ((x) << 8)
/* 0 - register
* 1 - memory (sync - via GRBM)
* 2 - gl2
* 3 - gds
* 4 - reserved
* 5 - memory (async - direct)
*/
#define WR_ONE_ADDR (1 << 16)
#define WR_CONFIRM (1 << 20)
#define WRITE_DATA_CACHE_POLICY(x) ((x) << 25)
/* 0 - LRU
* 1 - Stream
*/
#define WRITE_DATA_ENGINE_SEL(x) ((x) << 30)
/* 0 - me
* 1 - pfp
* 2 - ce
*/
#define PACKET3_DRAW_INDEX_INDIRECT_MULTI 0x38
#define PACKET3_MEM_SEMAPHORE 0x39
# define PACKET3_SEM_USE_MAILBOX (0x1 << 16)
# define PACKET3_SEM_SEL_SIGNAL_TYPE (0x1 << 20) /* 0 = increment, 1 = write 1 */
# define PACKET3_SEM_SEL_SIGNAL (0x6 << 29)
# define PACKET3_SEM_SEL_WAIT (0x7 << 29)
#define PACKET3_DRAW_INDEX_MULTI_INST 0x3A
#define PACKET3_COPY_DW 0x3B
#define PACKET3_WAIT_REG_MEM 0x3C
#define WAIT_REG_MEM_FUNCTION(x) ((x) << 0)
/* 0 - always
* 1 - <
* 2 - <=
* 3 - ==
* 4 - !=
* 5 - >=
* 6 - >
*/
#define WAIT_REG_MEM_MEM_SPACE(x) ((x) << 4)
/* 0 - reg
* 1 - mem
*/
#define WAIT_REG_MEM_OPERATION(x) ((x) << 6)
/* 0 - wait_reg_mem
* 1 - wr_wait_wr_reg
*/
#define WAIT_REG_MEM_ENGINE(x) ((x) << 8)
/* 0 - me
* 1 - pfp
*/
#define PACKET3_INDIRECT_BUFFER 0x3F
#define INDIRECT_BUFFER_VALID (1 << 23)
#define INDIRECT_BUFFER_CACHE_POLICY(x) ((x) << 28)
/* 0 - LRU
* 1 - Stream
* 2 - Bypass
*/
#define INDIRECT_BUFFER_PRE_ENB(x) ((x) << 21)
#define INDIRECT_BUFFER_PRE_RESUME(x) ((x) << 30)
#define PACKET3_COND_INDIRECT_BUFFER 0x3F
#define PACKET3_COPY_DATA 0x40
#define PACKET3_CP_DMA 0x41
#define PACKET3_PFP_SYNC_ME 0x42
#define PACKET3_SURFACE_SYNC 0x43
#define PACKET3_ME_INITIALIZE 0x44
#define PACKET3_COND_WRITE 0x45
#define PACKET3_EVENT_WRITE 0x46
#define EVENT_TYPE(x) ((x) << 0)
#define EVENT_INDEX(x) ((x) << 8)
/* 0 - any non-TS event
* 1 - ZPASS_DONE, PIXEL_PIPE_STAT_*
* 2 - SAMPLE_PIPELINESTAT
* 3 - SAMPLE_STREAMOUTSTAT*
* 4 - *S_PARTIAL_FLUSH
*/
#define PACKET3_EVENT_WRITE_EOP 0x47
#define PACKET3_EVENT_WRITE_EOS 0x48
#define PACKET3_RELEASE_MEM 0x49
#define PACKET3_RELEASE_MEM_EVENT_TYPE(x) ((x) << 0)
#define PACKET3_RELEASE_MEM_EVENT_INDEX(x) ((x) << 8)
#define PACKET3_RELEASE_MEM_GCR_GLM_WB (1 << 12)
#define PACKET3_RELEASE_MEM_GCR_GLM_INV (1 << 13)
#define PACKET3_RELEASE_MEM_GCR_GLV_INV (1 << 14)
#define PACKET3_RELEASE_MEM_GCR_GL1_INV (1 << 15)
#define PACKET3_RELEASE_MEM_GCR_GL2_US (1 << 16)
#define PACKET3_RELEASE_MEM_GCR_GL2_RANGE (1 << 17)
#define PACKET3_RELEASE_MEM_GCR_GL2_DISCARD (1 << 19)
#define PACKET3_RELEASE_MEM_GCR_GL2_INV (1 << 20)
#define PACKET3_RELEASE_MEM_GCR_GL2_WB (1 << 21)
#define PACKET3_RELEASE_MEM_GCR_SEQ (1 << 22)
#define PACKET3_RELEASE_MEM_CACHE_POLICY(x) ((x) << 25)
/* 0 - cache_policy__me_release_mem__lru
* 1 - cache_policy__me_release_mem__stream
* 2 - cache_policy__me_release_mem__noa
* 3 - cache_policy__me_release_mem__bypass
*/
#define PACKET3_RELEASE_MEM_EXECUTE (1 << 28)
#define PACKET3_RELEASE_MEM_DATA_SEL(x) ((x) << 29)
/* 0 - discard
* 1 - send low 32bit data
* 2 - send 64bit data
* 3 - send 64bit GPU counter value
* 4 - send 64bit sys counter value
*/
#define PACKET3_RELEASE_MEM_INT_SEL(x) ((x) << 24)
/* 0 - none
* 1 - interrupt only (DATA_SEL = 0)
* 2 - interrupt when data write is confirmed
*/
#define PACKET3_RELEASE_MEM_DST_SEL(x) ((x) << 16)
/* 0 - MC
* 1 - TC/L2
*/
#define PACKET3_PREAMBLE_CNTL 0x4A
# define PACKET3_PREAMBLE_BEGIN_CLEAR_STATE (2 << 28)
# define PACKET3_PREAMBLE_END_CLEAR_STATE (3 << 28)
#define PACKET3_DMA_DATA 0x50
/* 1. header
* 2. CONTROL
* 3. SRC_ADDR_LO or DATA [31:0]
* 4. SRC_ADDR_HI [31:0]
* 5. DST_ADDR_LO [31:0]
* 6. DST_ADDR_HI [7:0]
* 7. COMMAND [31:26] | BYTE_COUNT [25:0]
*/
/* CONTROL */
# define PACKET3_DMA_DATA_ENGINE(x) ((x) << 0)
/* 0 - ME
* 1 - PFP
*/
# define PACKET3_DMA_DATA_SRC_CACHE_POLICY(x) ((x) << 13)
/* 0 - LRU
* 1 - Stream
*/
# define PACKET3_DMA_DATA_DST_SEL(x) ((x) << 20)
/* 0 - DST_ADDR using DAS
* 1 - GDS
* 3 - DST_ADDR using L2
*/
# define PACKET3_DMA_DATA_DST_CACHE_POLICY(x) ((x) << 25)
/* 0 - LRU
* 1 - Stream
*/
# define PACKET3_DMA_DATA_SRC_SEL(x) ((x) << 29)
/* 0 - SRC_ADDR using SAS
* 1 - GDS
* 2 - DATA
* 3 - SRC_ADDR using L2
*/
# define PACKET3_DMA_DATA_CP_SYNC (1 << 31)
/* COMMAND */
# define PACKET3_DMA_DATA_CMD_SAS (1 << 26)
/* 0 - memory
* 1 - register
*/
# define PACKET3_DMA_DATA_CMD_DAS (1 << 27)
/* 0 - memory
* 1 - register
*/
# define PACKET3_DMA_DATA_CMD_SAIC (1 << 28)
# define PACKET3_DMA_DATA_CMD_DAIC (1 << 29)
# define PACKET3_DMA_DATA_CMD_RAW_WAIT (1 << 30)
#define PACKET3_CONTEXT_REG_RMW 0x51
#define PACKET3_GFX_CNTX_UPDATE 0x52
#define PACKET3_BLK_CNTX_UPDATE 0x53
#define PACKET3_INCR_UPDT_STATE 0x55
#define PACKET3_ACQUIRE_MEM 0x58
/* 1. HEADER
* 2. COHER_CNTL [30:0]
* 2.1 ENGINE_SEL [31:31]
* 3. COHER_SIZE [31:0]
* 4. COHER_SIZE_HI [7:0]
* 5. COHER_BASE_LO [31:0]
* 6. COHER_BASE_HI [23:0]
* 7. POLL_INTERVAL [15:0]
* 8. GCR_CNTL [18:0]
*/
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(x) ((x) << 0)
/*
* 0:NOP
* 1:ALL
* 2:RANGE
* 3:FIRST_LAST
*/
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_RANGE(x) ((x) << 2)
/*
* 0:ALL
* 1:reserved
* 2:RANGE
* 3:FIRST_LAST
*/
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(x) ((x) << 4)
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(x) ((x) << 5)
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(x) ((x) << 6)
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(x) ((x) << 7)
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(x) ((x) << 8)
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(x) ((x) << 9)
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_US(x) ((x) << 10)
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_RANGE(x) ((x) << 11)
/*
* 0:ALL
* 1:VOL
* 2:RANGE
* 3:FIRST_LAST
*/
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_DISCARD(x) ((x) << 13)
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(x) ((x) << 14)
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(x) ((x) << 15)
#define PACKET3_ACQUIRE_MEM_GCR_CNTL_SEQ(x) ((x) << 16)
/*
* 0: PARALLEL
* 1: FORWARD
* 2: REVERSE
*/
#define PACKET3_ACQUIRE_MEM_GCR_RANGE_IS_PA (1 << 18)
#define PACKET3_REWIND 0x59
#define PACKET3_INTERRUPT 0x5A
#define PACKET3_GEN_PDEPTE 0x5B
#define PACKET3_INDIRECT_BUFFER_PASID 0x5C
#define PACKET3_PRIME_UTCL2 0x5D
#define PACKET3_LOAD_UCONFIG_REG 0x5E
#define PACKET3_LOAD_SH_REG 0x5F
#define PACKET3_LOAD_CONFIG_REG 0x60
#define PACKET3_LOAD_CONTEXT_REG 0x61
#define PACKET3_LOAD_COMPUTE_STATE 0x62
#define PACKET3_LOAD_SH_REG_INDEX 0x63
#define PACKET3_SET_CONFIG_REG 0x68
#define PACKET3_SET_CONFIG_REG_START 0x00002000
#define PACKET3_SET_CONFIG_REG_END 0x00002c00
#define PACKET3_SET_CONTEXT_REG 0x69
#define PACKET3_SET_CONTEXT_REG_START 0x0000a000
#define PACKET3_SET_CONTEXT_REG_END 0x0000a400
#define PACKET3_SET_CONTEXT_REG_INDEX 0x6A
#define PACKET3_SET_VGPR_REG_DI_MULTI 0x71
#define PACKET3_SET_SH_REG_DI 0x72
#define PACKET3_SET_CONTEXT_REG_INDIRECT 0x73
#define PACKET3_SET_SH_REG_DI_MULTI 0x74
#define PACKET3_GFX_PIPE_LOCK 0x75
#define PACKET3_SET_SH_REG 0x76
#define PACKET3_SET_SH_REG_START 0x00002c00
#define PACKET3_SET_SH_REG_END 0x00003000
#define PACKET3_SET_SH_REG_OFFSET 0x77
#define PACKET3_SET_QUEUE_REG 0x78
#define PACKET3_SET_UCONFIG_REG 0x79
#define PACKET3_SET_UCONFIG_REG_START 0x0000c000
#define PACKET3_SET_UCONFIG_REG_END 0x0000c400
#define PACKET3_SET_UCONFIG_REG_INDEX 0x7A
#define PACKET3_FORWARD_HEADER 0x7C
#define PACKET3_SCRATCH_RAM_WRITE 0x7D
#define PACKET3_SCRATCH_RAM_READ 0x7E
#define PACKET3_LOAD_CONST_RAM 0x80
#define PACKET3_WRITE_CONST_RAM 0x81
#define PACKET3_DUMP_CONST_RAM 0x83
#define PACKET3_INCREMENT_CE_COUNTER 0x84
#define PACKET3_INCREMENT_DE_COUNTER 0x85
#define PACKET3_WAIT_ON_CE_COUNTER 0x86
#define PACKET3_WAIT_ON_DE_COUNTER_DIFF 0x88
#define PACKET3_SWITCH_BUFFER 0x8B
#define PACKET3_DISPATCH_DRAW_PREAMBLE 0x8C
#define PACKET3_DISPATCH_DRAW_PREAMBLE_ACE 0x8C
#define PACKET3_DISPATCH_DRAW 0x8D
#define PACKET3_DISPATCH_DRAW_ACE 0x8D
#define PACKET3_GET_LOD_STATS 0x8E
#define PACKET3_DRAW_MULTI_PREAMBLE 0x8F
#define PACKET3_FRAME_CONTROL 0x90
# define FRAME_TMZ (1 << 0)
# define FRAME_CMD(x) ((x) << 28)
/*
* x=0: tmz_begin
* x=1: tmz_end
*/
#define PACKET3_INDEX_ATTRIBUTES_INDIRECT 0x91
#define PACKET3_WAIT_REG_MEM64 0x93
#define PACKET3_COND_PREEMPT 0x94
#define PACKET3_HDP_FLUSH 0x95
#define PACKET3_COPY_DATA_RB 0x96
#define PACKET3_INVALIDATE_TLBS 0x98
# define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0)
# define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4)
# define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5)
#define PACKET3_AQL_PACKET 0x99
#define PACKET3_DMA_DATA_FILL_MULTI 0x9A
#define PACKET3_SET_SH_REG_INDEX 0x9B
#define PACKET3_DRAW_INDIRECT_COUNT_MULTI 0x9C
#define PACKET3_DRAW_INDEX_INDIRECT_COUNT_MULTI 0x9D
#define PACKET3_DUMP_CONST_RAM_OFFSET 0x9E
#define PACKET3_LOAD_CONTEXT_REG_INDEX 0x9F
#define PACKET3_SET_RESOURCES 0xA0
/* 1. header
* 2. CONTROL
* 3. QUEUE_MASK_LO [31:0]
* 4. QUEUE_MASK_HI [31:0]
* 5. GWS_MASK_LO [31:0]
* 6. GWS_MASK_HI [31:0]
* 7. OAC_MASK [15:0]
* 8. GDS_HEAP_SIZE [16:11] | GDS_HEAP_BASE [5:0]
*/
# define PACKET3_SET_RESOURCES_VMID_MASK(x) ((x) << 0)
# define PACKET3_SET_RESOURCES_UNMAP_LATENTY(x) ((x) << 16)
# define PACKET3_SET_RESOURCES_QUEUE_TYPE(x) ((x) << 29)
#define PACKET3_MAP_PROCESS 0xA1
#define PACKET3_MAP_QUEUES 0xA2
/* 1. header
* 2. CONTROL
* 3. CONTROL2
* 4. MQD_ADDR_LO [31:0]
* 5. MQD_ADDR_HI [31:0]
* 6. WPTR_ADDR_LO [31:0]
* 7. WPTR_ADDR_HI [31:0]
*/
/* CONTROL */
# define PACKET3_MAP_QUEUES_QUEUE_SEL(x) ((x) << 4)
# define PACKET3_MAP_QUEUES_VMID(x) ((x) << 8)
# define PACKET3_MAP_QUEUES_QUEUE(x) ((x) << 13)
# define PACKET3_MAP_QUEUES_PIPE(x) ((x) << 16)
# define PACKET3_MAP_QUEUES_ME(x) ((x) << 18)
# define PACKET3_MAP_QUEUES_QUEUE_TYPE(x) ((x) << 21)
# define PACKET3_MAP_QUEUES_ALLOC_FORMAT(x) ((x) << 24)
# define PACKET3_MAP_QUEUES_ENGINE_SEL(x) ((x) << 26)
# define PACKET3_MAP_QUEUES_NUM_QUEUES(x) ((x) << 29)
/* CONTROL2 */
# define PACKET3_MAP_QUEUES_CHECK_DISABLE(x) ((x) << 1)
# define PACKET3_MAP_QUEUES_DOORBELL_OFFSET(x) ((x) << 2)
#define PACKET3_UNMAP_QUEUES 0xA3
/* 1. header
* 2. CONTROL
* 3. CONTROL2
* 4. CONTROL3
* 5. CONTROL4
* 6. CONTROL5
*/
/* CONTROL */
# define PACKET3_UNMAP_QUEUES_ACTION(x) ((x) << 0)
/* 0 - PREEMPT_QUEUES
* 1 - RESET_QUEUES
* 2 - DISABLE_PROCESS_QUEUES
* 3 - PREEMPT_QUEUES_NO_UNMAP
*/
# define PACKET3_UNMAP_QUEUES_QUEUE_SEL(x) ((x) << 4)
# define PACKET3_UNMAP_QUEUES_ENGINE_SEL(x) ((x) << 26)
# define PACKET3_UNMAP_QUEUES_NUM_QUEUES(x) ((x) << 29)
/* CONTROL2a */
# define PACKET3_UNMAP_QUEUES_PASID(x) ((x) << 0)
/* CONTROL2b */
# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(x) ((x) << 2)
/* CONTROL3a */
# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET1(x) ((x) << 2)
/* CONTROL3b */
# define PACKET3_UNMAP_QUEUES_RB_WPTR(x) ((x) << 0)
/* CONTROL4 */
# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET2(x) ((x) << 2)
/* CONTROL5 */
# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET3(x) ((x) << 2)
#define PACKET3_QUERY_STATUS 0xA4
/* 1. header
* 2. CONTROL
* 3. CONTROL2
* 4. ADDR_LO [31:0]
* 5. ADDR_HI [31:0]
* 6. DATA_LO [31:0]
* 7. DATA_HI [31:0]
*/
/* CONTROL */
# define PACKET3_QUERY_STATUS_CONTEXT_ID(x) ((x) << 0)
# define PACKET3_QUERY_STATUS_INTERRUPT_SEL(x) ((x) << 28)
# define PACKET3_QUERY_STATUS_COMMAND(x) ((x) << 30)
/* CONTROL2a */
# define PACKET3_QUERY_STATUS_PASID(x) ((x) << 0)
/* CONTROL2b */
# define PACKET3_QUERY_STATUS_DOORBELL_OFFSET(x) ((x) << 2)
# define PACKET3_QUERY_STATUS_ENG_SEL(x) ((x) << 25)
#define PACKET3_RUN_LIST 0xA5
#define PACKET3_MAP_PROCESS_VM 0xA6
/* GFX11 */
#define PACKET3_SET_Q_PREEMPTION_MODE 0xF0
# define PACKET3_SET_Q_PREEMPTION_MODE_IB_VMID(x) ((x) << 0)
# define PACKET3_SET_Q_PREEMPTION_MODE_INIT_SHADOW_MEM (1 << 0)
#endif

View File

@@ -1428,3 +1428,473 @@ __all__ = \
'union_SDMA_PKT_TIMESTAMP_TAG_HEADER_UNION',
'union_SDMA_PKT_TRAP_TAG_HEADER_UNION',
'union_SDMA_PKT_TRAP_TAG_INT_CONTEXT_UNION']
#/*
# * Copyright 2019 Advanced Micro Devices, Inc.
# *
# * Permission is hereby granted, free of charge, to any person obtaining a
# * copy of this software and associated documentation files (the "Software"),
# * to deal in the Software without restriction, including without limitation
# * the rights to use, copy, modify, merge, publish, distribute, sublicense,
# * and/or sell copies of the Software, and to permit persons to whom the
# * Software is furnished to do so, subject to the following conditions:
# *
# * The above copyright notice and this permission notice shall be included in
# * all copies or substantial portions of the Software.
# *
# * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
# * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# * OTHER DEALINGS IN THE SOFTWARE.
# *
# */
#ifndef NVD_H
#define NVD_H
#/**
# * Navi's PM4 definitions
# */
# PM4 packet type codes; the header accessors below read them from bits [31:30].
PACKET_TYPE0 = 0
PACKET_TYPE1 = 1
PACKET_TYPE2 = 2
PACKET_TYPE3 = 3


def CP_PACKET_GET_TYPE(h):
  """Return the 2-bit packet type held in bits [31:30] of header `h`."""
  return (h >> 30) & 3


def CP_PACKET_GET_COUNT(h):
  """Return the 14-bit count field held in bits [29:16] of header `h`."""
  return (h >> 16) & 0x3FFF


def CP_PACKET0_GET_REG(h):
  """Return the 16-bit register field held in bits [15:0] of header `h`."""
  return h & 0xFFFF


def CP_PACKET3_GET_OPCODE(h):
  """Return the 8-bit opcode field held in bits [15:8] of header `h`."""
  return (h >> 8) & 0xFF


def PACKET0(reg, n):
  """Build a type-0 header from a register field `reg` and count field `n`."""
  return (PACKET_TYPE0 << 30) | (reg & 0xFFFF) | ((n & 0x3FFF) << 16)


CP_PACKET2 = 0x80000000
PACKET2_PAD_SHIFT = 0
PACKET2_PAD_MASK = (0x3fffffff << 0)


def PACKET2(v):
  """Build a type-2 header carrying `v` in the PACKET2_PAD field.

  The C header spells this as REG_SET(PACKET2_PAD, v), a token-pasting macro
  that does not exist in this module (calling it raised NameError). It is
  expanded here to the equivalent shift-and-mask on the PACKET2_PAD_* values.
  """
  return CP_PACKET2 | ((v << PACKET2_PAD_SHIFT) & PACKET2_PAD_MASK)


def PACKET3(op, n):
  """Build a type-3 header from opcode `op` (bits [15:8]) and count `n` (bits [29:16])."""
  return (PACKET_TYPE3 << 30) | ((op & 0xFF) << 8) | ((n & 0x3FFF) << 16)


def PACKET3_COMPUTE(op, n):
  """Build a type-3 header as PACKET3 does, with bit 1 additionally set."""
  return PACKET3(op, n) | (1 << 1)
#/* Packet 3 types */
PACKET3_NOP = 0x10
PACKET3_SET_BASE = 0x11
def PACKET3_BASE_INDEX(x): return ((x) << 0)
CE_PARTITION_BASE = 3
PACKET3_CLEAR_STATE = 0x12
PACKET3_INDEX_BUFFER_SIZE = 0x13
PACKET3_DISPATCH_DIRECT = 0x15
PACKET3_DISPATCH_INDIRECT = 0x16
PACKET3_INDIRECT_BUFFER_END = 0x17
PACKET3_INDIRECT_BUFFER_CNST_END = 0x19
PACKET3_ATOMIC_GDS = 0x1D
PACKET3_ATOMIC_MEM = 0x1E
PACKET3_OCCLUSION_QUERY = 0x1F
PACKET3_SET_PREDICATION = 0x20
PACKET3_REG_RMW = 0x21
PACKET3_COND_EXEC = 0x22
PACKET3_PRED_EXEC = 0x23
PACKET3_DRAW_INDIRECT = 0x24
PACKET3_DRAW_INDEX_INDIRECT = 0x25
PACKET3_INDEX_BASE = 0x26
PACKET3_DRAW_INDEX_2 = 0x27
PACKET3_CONTEXT_CONTROL = 0x28
PACKET3_INDEX_TYPE = 0x2A
PACKET3_DRAW_INDIRECT_MULTI = 0x2C
PACKET3_DRAW_INDEX_AUTO = 0x2D
PACKET3_NUM_INSTANCES = 0x2F
PACKET3_DRAW_INDEX_MULTI_AUTO = 0x30
PACKET3_INDIRECT_BUFFER_PRIV = 0x32
PACKET3_INDIRECT_BUFFER_CNST = 0x33
PACKET3_COND_INDIRECT_BUFFER_CNST = 0x33
PACKET3_STRMOUT_BUFFER_UPDATE = 0x34
PACKET3_DRAW_INDEX_OFFSET_2 = 0x35
PACKET3_DRAW_PREAMBLE = 0x36
PACKET3_WRITE_DATA = 0x37
def WRITE_DATA_DST_SEL(x): return ((x) << 8)
#/* 0 - register
# * 1 - memory (sync - via GRBM)
# * 2 - gl2
# * 3 - gds
# * 4 - reserved
# * 5 - memory (async - direct)
# */
WR_ONE_ADDR = (1 << 16)
WR_CONFIRM = (1 << 20)
def WRITE_DATA_CACHE_POLICY(x): return ((x) << 25)
#/* 0 - LRU
# * 1 - Stream
# */
def WRITE_DATA_ENGINE_SEL(x): return ((x) << 30)
#/* 0 - me
# * 1 - pfp
# * 2 - ce
# */
PACKET3_DRAW_INDEX_INDIRECT_MULTI = 0x38
PACKET3_MEM_SEMAPHORE = 0x39
PACKET3_SEM_USE_MAILBOX = (0x1 << 16)
PACKET3_SEM_SEL_SIGNAL_TYPE = (0x1 << 20) #/* 0 = increment, 1 = write 1 */
PACKET3_SEM_SEL_SIGNAL = (0x6 << 29)
PACKET3_SEM_SEL_WAIT = (0x7 << 29)
PACKET3_DRAW_INDEX_MULTI_INST = 0x3A
PACKET3_COPY_DW = 0x3B
PACKET3_WAIT_REG_MEM = 0x3C
def WAIT_REG_MEM_FUNCTION(x): return ((x) << 0)
#/* 0 - always
# * 1 - <
# * 2 - <=
# * 3 - ==
# * 4 - !=
# * 5 - >=
# * 6 - >
# */
def WAIT_REG_MEM_MEM_SPACE(x): return ((x) << 4)
#/* 0 - reg
# * 1 - mem
# */
def WAIT_REG_MEM_OPERATION(x): return ((x) << 6)
#/* 0 - wait_reg_mem
# * 1 - wr_wait_wr_reg
# */
def WAIT_REG_MEM_ENGINE(x): return ((x) << 8)
#/* 0 - me
# * 1 - pfp
# */
PACKET3_INDIRECT_BUFFER = 0x3F
INDIRECT_BUFFER_VALID = (1 << 23)
def INDIRECT_BUFFER_CACHE_POLICY(x): return ((x) << 28)
#/* 0 - LRU
# * 1 - Stream
# * 2 - Bypass
# */
def INDIRECT_BUFFER_PRE_ENB(x): return ((x) << 21)
def INDIRECT_BUFFER_PRE_RESUME(x): return ((x) << 30)
PACKET3_COND_INDIRECT_BUFFER = 0x3F
PACKET3_COPY_DATA = 0x40
PACKET3_CP_DMA = 0x41
PACKET3_PFP_SYNC_ME = 0x42
PACKET3_SURFACE_SYNC = 0x43
PACKET3_ME_INITIALIZE = 0x44
PACKET3_COND_WRITE = 0x45
PACKET3_EVENT_WRITE = 0x46
def EVENT_TYPE(x): return ((x) << 0)
def EVENT_INDEX(x): return ((x) << 8)
#/* 0 - any non-TS event
# * 1 - ZPASS_DONE, PIXEL_PIPE_STAT_*
# * 2 - SAMPLE_PIPELINESTAT
# * 3 - SAMPLE_STREAMOUTSTAT*
# * 4 - *S_PARTIAL_FLUSH
# */
PACKET3_EVENT_WRITE_EOP = 0x47
PACKET3_EVENT_WRITE_EOS = 0x48
PACKET3_RELEASE_MEM = 0x49
def PACKET3_RELEASE_MEM_EVENT_TYPE(x): return ((x) << 0)
def PACKET3_RELEASE_MEM_EVENT_INDEX(x): return ((x) << 8)
PACKET3_RELEASE_MEM_GCR_GLM_WB = (1 << 12)
PACKET3_RELEASE_MEM_GCR_GLM_INV = (1 << 13)
PACKET3_RELEASE_MEM_GCR_GLV_INV = (1 << 14)
PACKET3_RELEASE_MEM_GCR_GL1_INV = (1 << 15)
PACKET3_RELEASE_MEM_GCR_GL2_US = (1 << 16)
PACKET3_RELEASE_MEM_GCR_GL2_RANGE = (1 << 17)
PACKET3_RELEASE_MEM_GCR_GL2_DISCARD = (1 << 19)
PACKET3_RELEASE_MEM_GCR_GL2_INV = (1 << 20)
PACKET3_RELEASE_MEM_GCR_GL2_WB = (1 << 21)
PACKET3_RELEASE_MEM_GCR_SEQ = (1 << 22)
def PACKET3_RELEASE_MEM_CACHE_POLICY(x): return ((x) << 25)
#/* 0 - cache_policy__me_release_mem__lru
# * 1 - cache_policy__me_release_mem__stream
# * 2 - cache_policy__me_release_mem__noa
# * 3 - cache_policy__me_release_mem__bypass
# */
PACKET3_RELEASE_MEM_EXECUTE = (1 << 28)
def PACKET3_RELEASE_MEM_DATA_SEL(x): return ((x) << 29)
#/* 0 - discard
# * 1 - send low 32bit data
# * 2 - send 64bit data
# * 3 - send 64bit GPU counter value
# * 4 - send 64bit sys counter value
# */
def PACKET3_RELEASE_MEM_INT_SEL(x): return ((x) << 24)
#/* 0 - none
# * 1 - interrupt only (DATA_SEL = 0)
# * 2 - interrupt when data write is confirmed
# */
def PACKET3_RELEASE_MEM_DST_SEL(x): return ((x) << 16)
#/* 0 - MC
# * 1 - TC/L2
# */
PACKET3_PREAMBLE_CNTL = 0x4A
PACKET3_PREAMBLE_BEGIN_CLEAR_STATE = (2 << 28)
PACKET3_PREAMBLE_END_CLEAR_STATE = (3 << 28)
PACKET3_DMA_DATA = 0x50
#/* 1. header
# * 2. CONTROL
# * 3. SRC_ADDR_LO or DATA [31:0]
# * 4. SRC_ADDR_HI [31:0]
# * 5. DST_ADDR_LO [31:0]
# * 6. DST_ADDR_HI [7:0]
# * 7. COMMAND [31:26] | BYTE_COUNT [25:0]
# */
#/* CONTROL */
def PACKET3_DMA_DATA_ENGINE(x): return ((x) << 0)
#/* 0 - ME
# * 1 - PFP
# */
def PACKET3_DMA_DATA_SRC_CACHE_POLICY(x): return ((x) << 13)
#/* 0 - LRU
# * 1 - Stream
# */
def PACKET3_DMA_DATA_DST_SEL(x): return ((x) << 20)
#/* 0 - DST_ADDR using DAS
# * 1 - GDS
# * 3 - DST_ADDR using L2
# */
def PACKET3_DMA_DATA_DST_CACHE_POLICY(x): return ((x) << 25)
#/* 0 - LRU
# * 1 - Stream
# */
def PACKET3_DMA_DATA_SRC_SEL(x): return ((x) << 29)
#/* 0 - SRC_ADDR using SAS
# * 1 - GDS
# * 2 - DATA
# * 3 - SRC_ADDR using L2
# */
PACKET3_DMA_DATA_CP_SYNC = (1 << 31)
#/* COMMAND */
PACKET3_DMA_DATA_CMD_SAS = (1 << 26)
#/* 0 - memory
# * 1 - register
# */
PACKET3_DMA_DATA_CMD_DAS = (1 << 27)
#/* 0 - memory
# * 1 - register
# */
PACKET3_DMA_DATA_CMD_SAIC = (1 << 28)
PACKET3_DMA_DATA_CMD_DAIC = (1 << 29)
PACKET3_DMA_DATA_CMD_RAW_WAIT = (1 << 30)
PACKET3_CONTEXT_REG_RMW = 0x51
PACKET3_GFX_CNTX_UPDATE = 0x52
PACKET3_BLK_CNTX_UPDATE = 0x53
PACKET3_INCR_UPDT_STATE = 0x55
PACKET3_ACQUIRE_MEM = 0x58
#/* 1. HEADER
# * 2. COHER_CNTL [30:0]
# * 2.1 ENGINE_SEL [31:31]
# * 3. COHER_SIZE [31:0]
# * 4. COHER_SIZE_HI [7:0]
# * 5. COHER_BASE_LO [31:0]
# * 6. COHER_BASE_HI [23:0]
# * 7. POLL_INTERVAL [15:0]
# * 8. GCR_CNTL [18:0]
# */
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(x): return ((x) << 0)
#/*
# * 0:NOP
# * 1:ALL
# * 2:RANGE
# * 3:FIRST_LAST
# */
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_RANGE(x): return ((x) << 2)
#/*
# * 0:ALL
# * 1:reserved
# * 2:RANGE
# * 3:FIRST_LAST
# */
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(x): return ((x) << 4)
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(x): return ((x) << 5)
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(x): return ((x) << 6)
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(x): return ((x) << 7)
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(x): return ((x) << 8)
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(x): return ((x) << 9)
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_US(x): return ((x) << 10)
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_RANGE(x): return ((x) << 11)
#/*
# * 0:ALL
# * 1:VOL
# * 2:RANGE
# * 3:FIRST_LAST
# */
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_DISCARD(x): return ((x) << 13)
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(x): return ((x) << 14)
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(x): return ((x) << 15)
def PACKET3_ACQUIRE_MEM_GCR_CNTL_SEQ(x): return ((x) << 16)
#/*
# * 0: PARALLEL
# * 1: FORWARD
# * 2: REVERSE
# */
PACKET3_ACQUIRE_MEM_GCR_RANGE_IS_PA = (1 << 18)
PACKET3_REWIND = 0x59
PACKET3_INTERRUPT = 0x5A
PACKET3_GEN_PDEPTE = 0x5B
PACKET3_INDIRECT_BUFFER_PASID = 0x5C
PACKET3_PRIME_UTCL2 = 0x5D
PACKET3_LOAD_UCONFIG_REG = 0x5E
PACKET3_LOAD_SH_REG = 0x5F
PACKET3_LOAD_CONFIG_REG = 0x60
PACKET3_LOAD_CONTEXT_REG = 0x61
PACKET3_LOAD_COMPUTE_STATE = 0x62
PACKET3_LOAD_SH_REG_INDEX = 0x63
PACKET3_SET_CONFIG_REG = 0x68
PACKET3_SET_CONFIG_REG_START = 0x00002000
PACKET3_SET_CONFIG_REG_END = 0x00002c00
PACKET3_SET_CONTEXT_REG = 0x69
PACKET3_SET_CONTEXT_REG_START = 0x0000a000
PACKET3_SET_CONTEXT_REG_END = 0x0000a400
PACKET3_SET_CONTEXT_REG_INDEX = 0x6A
PACKET3_SET_VGPR_REG_DI_MULTI = 0x71
PACKET3_SET_SH_REG_DI = 0x72
PACKET3_SET_CONTEXT_REG_INDIRECT = 0x73
PACKET3_SET_SH_REG_DI_MULTI = 0x74
PACKET3_GFX_PIPE_LOCK = 0x75
PACKET3_SET_SH_REG = 0x76
PACKET3_SET_SH_REG_START = 0x00002c00
PACKET3_SET_SH_REG_END = 0x00003000
PACKET3_SET_SH_REG_OFFSET = 0x77
PACKET3_SET_QUEUE_REG = 0x78
PACKET3_SET_UCONFIG_REG = 0x79
PACKET3_SET_UCONFIG_REG_START = 0x0000c000
PACKET3_SET_UCONFIG_REG_END = 0x0000c400
PACKET3_SET_UCONFIG_REG_INDEX = 0x7A
PACKET3_FORWARD_HEADER = 0x7C
PACKET3_SCRATCH_RAM_WRITE = 0x7D
PACKET3_SCRATCH_RAM_READ = 0x7E
PACKET3_LOAD_CONST_RAM = 0x80
PACKET3_WRITE_CONST_RAM = 0x81
PACKET3_DUMP_CONST_RAM = 0x83
PACKET3_INCREMENT_CE_COUNTER = 0x84
PACKET3_INCREMENT_DE_COUNTER = 0x85
PACKET3_WAIT_ON_CE_COUNTER = 0x86
PACKET3_WAIT_ON_DE_COUNTER_DIFF = 0x88
PACKET3_SWITCH_BUFFER = 0x8B
PACKET3_DISPATCH_DRAW_PREAMBLE = 0x8C
PACKET3_DISPATCH_DRAW_PREAMBLE_ACE = 0x8C
PACKET3_DISPATCH_DRAW = 0x8D
PACKET3_DISPATCH_DRAW_ACE = 0x8D
PACKET3_GET_LOD_STATS = 0x8E
PACKET3_DRAW_MULTI_PREAMBLE = 0x8F
PACKET3_FRAME_CONTROL = 0x90
FRAME_TMZ = (1 << 0)
def FRAME_CMD(x): return ((x) << 28)
#/*
# * x=0: tmz_begin
# * x=1: tmz_end
# */
PACKET3_INDEX_ATTRIBUTES_INDIRECT = 0x91
PACKET3_WAIT_REG_MEM64 = 0x93
PACKET3_COND_PREEMPT = 0x94
PACKET3_HDP_FLUSH = 0x95
PACKET3_COPY_DATA_RB = 0x96
PACKET3_INVALIDATE_TLBS = 0x98
def PACKET3_INVALIDATE_TLBS_DST_SEL(x): return ((x) << 0)
def PACKET3_INVALIDATE_TLBS_ALL_HUB(x): return ((x) << 4)
def PACKET3_INVALIDATE_TLBS_PASID(x): return ((x) << 5)
PACKET3_AQL_PACKET = 0x99
PACKET3_DMA_DATA_FILL_MULTI = 0x9A
PACKET3_SET_SH_REG_INDEX = 0x9B
PACKET3_DRAW_INDIRECT_COUNT_MULTI = 0x9C
PACKET3_DRAW_INDEX_INDIRECT_COUNT_MULTI = 0x9D
PACKET3_DUMP_CONST_RAM_OFFSET = 0x9E
PACKET3_LOAD_CONTEXT_REG_INDEX = 0x9F
PACKET3_SET_RESOURCES = 0xA0
#/* 1. header
# * 2. CONTROL
# * 3. QUEUE_MASK_LO [31:0]
# * 4. QUEUE_MASK_HI [31:0]
# * 5. GWS_MASK_LO [31:0]
# * 6. GWS_MASK_HI [31:0]
# * 7. OAC_MASK [15:0]
# * 8. GDS_HEAP_SIZE [16:11] | GDS_HEAP_BASE [5:0]
# */
def PACKET3_SET_RESOURCES_VMID_MASK(x): return ((x) << 0)
def PACKET3_SET_RESOURCES_UNMAP_LATENTY(x): return ((x) << 16)
def PACKET3_SET_RESOURCES_QUEUE_TYPE(x): return ((x) << 29)
PACKET3_MAP_PROCESS = 0xA1
PACKET3_MAP_QUEUES = 0xA2
#/* 1. header
# * 2. CONTROL
# * 3. CONTROL2
# * 4. MQD_ADDR_LO [31:0]
# * 5. MQD_ADDR_HI [31:0]
# * 6. WPTR_ADDR_LO [31:0]
# * 7. WPTR_ADDR_HI [31:0]
# */
#/* CONTROL */
def PACKET3_MAP_QUEUES_QUEUE_SEL(x): return ((x) << 4)
def PACKET3_MAP_QUEUES_VMID(x): return ((x) << 8)
def PACKET3_MAP_QUEUES_QUEUE(x): return ((x) << 13)
def PACKET3_MAP_QUEUES_PIPE(x): return ((x) << 16)
def PACKET3_MAP_QUEUES_ME(x): return ((x) << 18)
def PACKET3_MAP_QUEUES_QUEUE_TYPE(x): return ((x) << 21)
def PACKET3_MAP_QUEUES_ALLOC_FORMAT(x): return ((x) << 24)
def PACKET3_MAP_QUEUES_ENGINE_SEL(x): return ((x) << 26)
def PACKET3_MAP_QUEUES_NUM_QUEUES(x): return ((x) << 29)
#/* CONTROL2 */
def PACKET3_MAP_QUEUES_CHECK_DISABLE(x): return ((x) << 1)
def PACKET3_MAP_QUEUES_DOORBELL_OFFSET(x): return ((x) << 2)
PACKET3_UNMAP_QUEUES = 0xA3
#/* 1. header
# * 2. CONTROL
# * 3. CONTROL2
# * 4. CONTROL3
# * 5. CONTROL4
# * 6. CONTROL5
# */
#/* CONTROL */
def PACKET3_UNMAP_QUEUES_ACTION(x): return ((x) << 0)
#/* 0 - PREEMPT_QUEUES
# * 1 - RESET_QUEUES
# * 2 - DISABLE_PROCESS_QUEUES
# * 3 - PREEMPT_QUEUES_NO_UNMAP
# */
def PACKET3_UNMAP_QUEUES_QUEUE_SEL(x): return ((x) << 4)
def PACKET3_UNMAP_QUEUES_ENGINE_SEL(x): return ((x) << 26)
def PACKET3_UNMAP_QUEUES_NUM_QUEUES(x): return ((x) << 29)
#/* CONTROL2a */
def PACKET3_UNMAP_QUEUES_PASID(x): return ((x) << 0)
#/* CONTROL2b */
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(x): return ((x) << 2)
#/* CONTROL3a */
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET1(x): return ((x) << 2)
#/* CONTROL3b */
def PACKET3_UNMAP_QUEUES_RB_WPTR(x): return ((x) << 0)
#/* CONTROL4 */
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET2(x): return ((x) << 2)
#/* CONTROL5 */
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET3(x): return ((x) << 2)
PACKET3_QUERY_STATUS = 0xA4
#/* 1. header
# * 2. CONTROL
# * 3. CONTROL2
# * 4. ADDR_LO [31:0]
# * 5. ADDR_HI [31:0]
# * 6. DATA_LO [31:0]
# * 7. DATA_HI [31:0]
# */
#/* CONTROL */
def PACKET3_QUERY_STATUS_CONTEXT_ID(x): return ((x) << 0)
def PACKET3_QUERY_STATUS_INTERRUPT_SEL(x): return ((x) << 28)
def PACKET3_QUERY_STATUS_COMMAND(x): return ((x) << 30)
#/* CONTROL2a */
def PACKET3_QUERY_STATUS_PASID(x): return ((x) << 0)
#/* CONTROL2b */
def PACKET3_QUERY_STATUS_DOORBELL_OFFSET(x): return ((x) << 2)
def PACKET3_QUERY_STATUS_ENG_SEL(x): return ((x) << 25)
PACKET3_RUN_LIST = 0xA5
PACKET3_MAP_PROCESS_VM = 0xA6
#/* GFX11 */
PACKET3_SET_Q_PREEMPTION_MODE = 0xF0
def PACKET3_SET_Q_PREEMPTION_MODE_IB_VMID(x): return ((x) << 0)
PACKET3_SET_Q_PREEMPTION_MODE_INIT_SHADOW_MEM = (1 << 0)
#endif

View File

@@ -146,6 +146,16 @@ if ctypes.sizeof(ctypes.c_longdouble) == 16:
else:
c_long_double_t = ctypes.c_ubyte*16
class FunctionFactoryStub:
    """Non-working placeholder used in place of a loaded shared library.

    Every attribute lookup returns a ctypes function prototype instead of a
    symbol resolved from a real library.
    """

    def __getattr__(self, _):
        # The requested name is ignored; an identity callable is passed as
        # the prototype's result type.
        return ctypes.CFUNCTYPE(lambda y: y)
# libraries['FIXME_STUB'] explanation
# As you did not list (-l libraryname.so) a library that exports this function,
# this is a non-working stub instead.
# You can either re-run clang2py with -l /path/to/library.so
# or manually fix this by commenting the ctypes.CDLL loading.
_libraries['FIXME_STUB'] = FunctionFactoryStub() # ctypes.CDLL('FIXME_STUB')
@@ -3834,6 +3844,327 @@ struct_hsa_ext_finalizer_1_00_pfn_s._fields_ = [
]
hsa_ext_finalizer_1_00_pfn_t = struct_hsa_ext_finalizer_1_00_pfn_s
# uint32_t hsa_ven_amd_aqlprofile_version_major(void)
# Looked up on the 'FIXME_STUB' library entry; if the lookup raises
# AttributeError the name is simply left undefined.
try:
    hsa_ven_amd_aqlprofile_version_major = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_version_major
except AttributeError:
    pass
else:
    hsa_ven_amd_aqlprofile_version_major.restype = uint32_t
    hsa_ven_amd_aqlprofile_version_major.argtypes = []

# uint32_t hsa_ven_amd_aqlprofile_version_minor(void)
try:
    hsa_ven_amd_aqlprofile_version_minor = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_version_minor
except AttributeError:
    pass
else:
    hsa_ven_amd_aqlprofile_version_minor.restype = uint32_t
    hsa_ven_amd_aqlprofile_version_minor.argtypes = []
# Enumeration 'c__EA_hsa_ven_amd_aqlprofile_event_type_t': integer constants
# first, then the value -> name lookup table built from them.
HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC = 0
HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE = 1

c__EA_hsa_ven_amd_aqlprofile_event_type_t__enumvalues = {
    HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC: 'HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC',
    HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE: 'HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE',
}

# The enum is represented as a 32-bit unsigned integer in ctypes signatures.
c__EA_hsa_ven_amd_aqlprofile_event_type_t = ctypes.c_uint32 # enum

# Typedef-style aliases.
hsa_ven_amd_aqlprofile_event_type_t = c__EA_hsa_ven_amd_aqlprofile_event_type_t
hsa_ven_amd_aqlprofile_event_type_t__enumvalues = c__EA_hsa_ven_amd_aqlprofile_event_type_t__enumvalues
# Enumeration 'c__EA_hsa_ven_amd_aqlprofile_block_name_t': integer constants
# first, then the value -> name lookup table built from them.
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC = 0
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF = 1
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS = 2
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM = 3
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE = 4
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI = 5
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ = 6
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS = 7
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM = 8
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX = 9
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA = 10
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA = 11
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC = 12
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP = 13
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD = 14
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB = 15
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB = 16
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM = 17
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ = 18
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2 = 19
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR = 20
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC = 21
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2 = 22
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA = 23
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB = 24
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA = 25
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A = 26
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C = 27
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A = 28
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C = 29
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR = 30
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS = 31
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC = 32
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA = 33
HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER = 34

c__EA_hsa_ven_amd_aqlprofile_block_name_t__enumvalues = {
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC',
    HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA',
    HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER: 'HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER',
}

# The enum is represented as a 32-bit unsigned integer in ctypes signatures.
c__EA_hsa_ven_amd_aqlprofile_block_name_t = ctypes.c_uint32 # enum

# Typedef-style aliases.
hsa_ven_amd_aqlprofile_block_name_t = c__EA_hsa_ven_amd_aqlprofile_block_name_t
hsa_ven_amd_aqlprofile_block_name_t__enumvalues = c__EA_hsa_ven_amd_aqlprofile_block_name_t__enumvalues
class struct_c__SA_hsa_ven_amd_aqlprofile_event_t(Structure):
    # ctypes layout of one profiling event: a block-name enum value followed
    # by two 32-bit unsigned integers.
    _pack_ = 1 # source:False
    _fields_ = [
        ('block_name', hsa_ven_amd_aqlprofile_block_name_t),
        ('block_index', ctypes.c_uint32),
        ('counter_id', ctypes.c_uint32),
    ]


# Typedef-style alias.
hsa_ven_amd_aqlprofile_event_t = struct_c__SA_hsa_ven_amd_aqlprofile_event_t
# hsa_status_t hsa_ven_amd_aqlprofile_validate_event(hsa_agent_t, event_t *, bool *)
# Looked up on the 'FIXME_STUB' library entry; if the lookup raises
# AttributeError the name is simply left undefined.
try:
    hsa_ven_amd_aqlprofile_validate_event = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_validate_event
except AttributeError:
    pass
else:
    hsa_ven_amd_aqlprofile_validate_event.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_validate_event.argtypes = [
        hsa_agent_t,
        ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_event_t),
        ctypes.POINTER(ctypes.c_bool),
    ]
# Enumeration 'c__EA_hsa_ven_amd_aqlprofile_parameter_name_t': integer
# constants first, then the value -> name lookup table built from them.
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET = 0
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK = 1
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK = 2
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK = 3
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2 = 4
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK = 5
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE = 6
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT = 7

c__EA_hsa_ven_amd_aqlprofile_parameter_name_t__enumvalues = {
    HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET',
    HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK',
    HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK',
    HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK',
    HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2',
    HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK',
    HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE',
    HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT',
}

# The enum is represented as a 32-bit unsigned integer in ctypes signatures.
c__EA_hsa_ven_amd_aqlprofile_parameter_name_t = ctypes.c_uint32 # enum

# Typedef-style aliases.
hsa_ven_amd_aqlprofile_parameter_name_t = c__EA_hsa_ven_amd_aqlprofile_parameter_name_t
hsa_ven_amd_aqlprofile_parameter_name_t__enumvalues = c__EA_hsa_ven_amd_aqlprofile_parameter_name_t__enumvalues
class struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t(Structure):
    # ctypes layout of one profiling parameter: a parameter-name enum value
    # paired with a 32-bit unsigned value.
    _pack_ = 1 # source:False
    _fields_ = [
        ('parameter_name', hsa_ven_amd_aqlprofile_parameter_name_t),
        ('value', ctypes.c_uint32),
    ]


# Typedef-style alias.
hsa_ven_amd_aqlprofile_parameter_t = struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t
class struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t(Structure):
    # ctypes layout of a buffer descriptor: an untyped pointer, a 32-bit size
    # and 4 explicit padding bytes.
    _pack_ = 1 # source:False
    _fields_ = [
        ('ptr', ctypes.POINTER(None)),
        ('size', ctypes.c_uint32),
        ('PADDING_0', ctypes.c_ubyte * 4),
    ]


# Typedef-style alias.
hsa_ven_amd_aqlprofile_descriptor_t = struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t
class struct_c__SA_hsa_ven_amd_aqlprofile_profile_t(Structure):
    # ctypes layout of a profile object: agent, event type, event and
    # parameter arrays (pointer + count each) and two buffer descriptors.
    # The PADDING_* members are explicit filler bytes between fields.
    _pack_ = 1 # source:False
    _fields_ = [
        ('agent', hsa_agent_t),
        ('type', hsa_ven_amd_aqlprofile_event_type_t),
        ('PADDING_0', ctypes.c_ubyte * 4),
        ('events', ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_event_t)),
        ('event_count', ctypes.c_uint32),
        ('PADDING_1', ctypes.c_ubyte * 4),
        ('parameters', ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t)),
        ('parameter_count', ctypes.c_uint32),
        ('PADDING_2', ctypes.c_ubyte * 4),
        ('output_buffer', hsa_ven_amd_aqlprofile_descriptor_t),
        ('command_buffer', hsa_ven_amd_aqlprofile_descriptor_t),
    ]


# Typedef-style alias.
hsa_ven_amd_aqlprofile_profile_t = struct_c__SA_hsa_ven_amd_aqlprofile_profile_t
class struct_c__SA_hsa_ext_amd_aql_pm4_packet_t(Structure):
    # ctypes layout of an AQL packet carrying PM4 commands: a 16-bit header,
    # 27 16-bit command words and a completion signal handle.
    _pack_ = 1 # source:False
    _fields_ = [
        ('header', ctypes.c_uint16),
        ('pm4_command', ctypes.c_uint16 * 27),
        ('completion_signal', hsa_signal_t),
    ]


# Typedef-style alias.
hsa_ext_amd_aql_pm4_packet_t = struct_c__SA_hsa_ext_amd_aql_pm4_packet_t
# start / stop / read share one signature:
#   hsa_status_t fn(profile_t *, hsa_ext_amd_aql_pm4_packet_t *)
# Each is looked up on the 'FIXME_STUB' library entry; if the lookup raises
# AttributeError the name is simply left undefined.
try:
    hsa_ven_amd_aqlprofile_start = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_start
except AttributeError:
    pass
else:
    hsa_ven_amd_aqlprofile_start.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_start.argtypes = [
        ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t),
        ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t),
    ]

try:
    hsa_ven_amd_aqlprofile_stop = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_stop
except AttributeError:
    pass
else:
    hsa_ven_amd_aqlprofile_stop.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_stop.argtypes = [
        ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t),
        ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t),
    ]

try:
    hsa_ven_amd_aqlprofile_read = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_read
except AttributeError:
    pass
else:
    hsa_ven_amd_aqlprofile_read.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_read.argtypes = [
        ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t),
        ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t),
    ]
HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE = 192 # Variable ctypes.c_uint32

# hsa_status_t hsa_ven_amd_aqlprofile_legacy_get_pm4(hsa_ext_amd_aql_pm4_packet_t *, void *)
# Looked up on the 'FIXME_STUB' library entry; if the lookup raises
# AttributeError the name is simply left undefined.
try:
    hsa_ven_amd_aqlprofile_legacy_get_pm4 = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_legacy_get_pm4
except AttributeError:
    pass
else:
    hsa_ven_amd_aqlprofile_legacy_get_pm4.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_legacy_get_pm4.argtypes = [
        ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t),
        ctypes.POINTER(None),
    ]
class struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data(Structure):
    # Counter variant of the payload: the event description, 4 padding bytes
    # and a 64-bit result.
    _pack_ = 1 # source:False
    _fields_ = [
        ('event', hsa_ven_amd_aqlprofile_event_t),
        ('PADDING_0', ctypes.c_ubyte * 4),
        ('result', ctypes.c_uint64),
    ]


class union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0(Union):
    # Payload union: either counter data or a trace buffer descriptor; the
    # trailing PADDING_0 member is an 8-byte filler.
    _pack_ = 1 # source:False
    _fields_ = [
        ('pmc_data', struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data),
        ('trace_data', hsa_ven_amd_aqlprofile_descriptor_t),
        ('PADDING_0', ctypes.c_ubyte * 8),
    ]


class struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t(Structure):
    # One info record: a sample id followed by the payload union. '_0' is
    # listed in _anonymous_, so the union members are reachable directly on
    # the struct instance.
    _pack_ = 1 # source:False
    _anonymous_ = ('_0',)
    _fields_ = [
        ('sample_id', ctypes.c_uint32),
        ('PADDING_0', ctypes.c_ubyte * 4),
        ('_0', union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0),
    ]


# Typedef-style alias.
hsa_ven_amd_aqlprofile_info_data_t = struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t
class struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t(Structure):
    # ctypes layout of an id query: a char pointer plus two 32-bit unsigned
    # integers (id and instance count).
    _pack_ = 1 # source:False
    _fields_ = [
        ('name', ctypes.POINTER(ctypes.c_char)),
        ('id', ctypes.c_uint32),
        ('instance_count', ctypes.c_uint32),
    ]


# Typedef-style alias.
hsa_ven_amd_aqlprofile_id_query_t = struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t
# Enumeration 'c__EA_hsa_ven_amd_aqlprofile_info_type_t': integer constants
# first, then the value -> name lookup table built from them.
HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE = 0
HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE = 1
HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA = 2
HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA = 3
HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS = 4
HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID = 5
HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD = 6
HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD = 7

c__EA_hsa_ven_amd_aqlprofile_info_type_t__enumvalues = {
    HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE: 'HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE',
    HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE: 'HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE',
    HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA: 'HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA',
    HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA: 'HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA',
    HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS: 'HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS',
    HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID: 'HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID',
    HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD: 'HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD',
    HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD: 'HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD',
}

# The enum is represented as a 32-bit unsigned integer in ctypes signatures.
c__EA_hsa_ven_amd_aqlprofile_info_type_t = ctypes.c_uint32 # enum

# Typedef-style aliases.
hsa_ven_amd_aqlprofile_info_type_t = c__EA_hsa_ven_amd_aqlprofile_info_type_t
hsa_ven_amd_aqlprofile_info_type_t__enumvalues = c__EA_hsa_ven_amd_aqlprofile_info_type_t__enumvalues
# Callback prototype: (info type, info_data_t *, void *) -> status enum.
hsa_ven_amd_aqlprofile_data_callback_t = ctypes.CFUNCTYPE(
    c__EA_hsa_status_t,
    c__EA_hsa_ven_amd_aqlprofile_info_type_t,
    ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t),
    ctypes.POINTER(None),
)

# The bindings below are looked up on the 'FIXME_STUB' library entry; if a
# lookup raises AttributeError that name is simply left undefined.

# hsa_status_t hsa_ven_amd_aqlprofile_get_info(profile_t *, info_type_t, void *)
try:
    hsa_ven_amd_aqlprofile_get_info = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_get_info
except AttributeError:
    pass
else:
    hsa_ven_amd_aqlprofile_get_info.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_get_info.argtypes = [
        ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t),
        hsa_ven_amd_aqlprofile_info_type_t,
        ctypes.POINTER(None),
    ]

# hsa_status_t hsa_ven_amd_aqlprofile_iterate_data(profile_t *, data_callback_t, void *)
try:
    hsa_ven_amd_aqlprofile_iterate_data = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_iterate_data
except AttributeError:
    pass
else:
    hsa_ven_amd_aqlprofile_iterate_data.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_iterate_data.argtypes = [
        ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t),
        hsa_ven_amd_aqlprofile_data_callback_t,
        ctypes.POINTER(None),
    ]

# hsa_status_t hsa_ven_amd_aqlprofile_error_string(char **)
try:
    hsa_ven_amd_aqlprofile_error_string = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_error_string
except AttributeError:
    pass
else:
    hsa_ven_amd_aqlprofile_error_string.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_error_string.argtypes = [
        ctypes.POINTER(ctypes.POINTER(ctypes.c_char)),
    ]

kAqlProfileLib = 'libhsa-amd-aqlprofile64.so' # Variable ctypes.c_char * 27
class struct_hsa_ven_amd_aqlprofile_1_00_pfn_s(Structure):
    # Table of function pointers, one member per aqlprofile entry point; each
    # member's CFUNCTYPE spells out that entry point's return and argument types.
    _pack_ = 1 # source:False
    _fields_ = [
        ('hsa_ven_amd_aqlprofile_version_major', ctypes.CFUNCTYPE(ctypes.c_uint32)),
        ('hsa_ven_amd_aqlprofile_version_minor', ctypes.CFUNCTYPE(ctypes.c_uint32)),
        ('hsa_ven_amd_aqlprofile_error_string', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(ctypes.POINTER(ctypes.c_char)))),
        ('hsa_ven_amd_aqlprofile_validate_event', ctypes.CFUNCTYPE(c__EA_hsa_status_t, struct_hsa_agent_s, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_event_t), ctypes.POINTER(ctypes.c_bool))),
        ('hsa_ven_amd_aqlprofile_start', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t))),
        ('hsa_ven_amd_aqlprofile_stop', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t))),
        ('hsa_ven_amd_aqlprofile_read', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t))),
        ('hsa_ven_amd_aqlprofile_legacy_get_pm4', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t), ctypes.POINTER(None))),
        ('hsa_ven_amd_aqlprofile_get_info', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), c__EA_hsa_ven_amd_aqlprofile_info_type_t, ctypes.POINTER(None))),
        ('hsa_ven_amd_aqlprofile_iterate_data', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.CFUNCTYPE(c__EA_hsa_status_t, c__EA_hsa_ven_amd_aqlprofile_info_type_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t), ctypes.POINTER(None)), ctypes.POINTER(None))),
    ]


# Both typedef-style aliases name the same table class.
hsa_ven_amd_aqlprofile_1_00_pfn_t = struct_hsa_ven_amd_aqlprofile_1_00_pfn_s
hsa_ven_amd_aqlprofile_pfn_t = struct_hsa_ven_amd_aqlprofile_1_00_pfn_s
__all__ = \
['AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT',
@@ -4221,13 +4552,68 @@ __all__ = \
'HSA_SYSTEM_INFO_VERSION_MAJOR', 'HSA_SYSTEM_INFO_VERSION_MINOR',
'HSA_VARIABLE_ALLOCATION_AGENT',
'HSA_VARIABLE_ALLOCATION_PROGRAM', 'HSA_VARIABLE_SEGMENT_GLOBAL',
'HSA_VARIABLE_SEGMENT_READONLY', 'HSA_WAIT_STATE_ACTIVE',
'HSA_WAIT_STATE_BLOCKED', 'HSA_WAVEFRONT_INFO_SIZE',
'MEMORY_TYPE_NONE', 'MEMORY_TYPE_PINNED',
'amd_queue_properties32_t', 'amd_queue_properties_t',
'amd_queue_t', 'amd_signal_kind64_t', 'amd_signal_kind_t',
'amd_signal_t', 'c__EA_hsa_access_permission_t',
'c__EA_hsa_agent_feature_t', 'c__EA_hsa_agent_info_t',
'HSA_VARIABLE_SEGMENT_READONLY',
'HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC',
'HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC',
'HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE',
'HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS',
'HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID',
'HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE',
'HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD',
'HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD',
'HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA',
'HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE',
'HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA',
'HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK',
'HSA_WAIT_STATE_ACTIVE', 'HSA_WAIT_STATE_BLOCKED',
'HSA_WAVEFRONT_INFO_SIZE', 'MEMORY_TYPE_NONE',
'MEMORY_TYPE_PINNED', 'amd_queue_properties32_t',
'amd_queue_properties_t', 'amd_queue_t', 'amd_signal_kind64_t',
'amd_signal_kind_t', 'amd_signal_t',
'c__EA_hsa_access_permission_t', 'c__EA_hsa_agent_feature_t',
'c__EA_hsa_agent_info_t',
'c__EA_hsa_amd_agent_memory_pool_info_t',
'c__EA_hsa_amd_copy_direction_t',
'c__EA_hsa_amd_hw_exception_reset_cause_t',
@@ -4267,6 +4653,10 @@ __all__ = \
'c__EA_hsa_status_t', 'c__EA_hsa_symbol_kind_t',
'c__EA_hsa_symbol_linkage_t', 'c__EA_hsa_system_info_t',
'c__EA_hsa_variable_allocation_t', 'c__EA_hsa_variable_segment_t',
'c__EA_hsa_ven_amd_aqlprofile_block_name_t',
'c__EA_hsa_ven_amd_aqlprofile_event_type_t',
'c__EA_hsa_ven_amd_aqlprofile_info_type_t',
'c__EA_hsa_ven_amd_aqlprofile_parameter_name_t',
'c__EA_hsa_wait_state_t', 'c__EA_hsa_wavefront_info_t',
'c__Ea_HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS',
'c__Ea_HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED',
@@ -4418,7 +4808,8 @@ __all__ = \
'hsa_executable_symbol_info_t__enumvalues',
'hsa_executable_symbol_t', 'hsa_executable_t',
'hsa_executable_validate', 'hsa_executable_validate_alt',
'hsa_ext_control_directives_t', 'hsa_ext_finalizer_1_00_pfn_t',
'hsa_ext_amd_aql_pm4_packet_t', 'hsa_ext_control_directives_t',
'hsa_ext_finalizer_1_00_pfn_t',
'hsa_ext_finalizer_call_convention_t',
'hsa_ext_finalizer_call_convention_t__enumvalues',
'hsa_ext_image_capability_t',
@@ -4562,11 +4953,45 @@ __all__ = \
'hsa_system_major_extension_supported',
'hsa_variable_allocation_t',
'hsa_variable_allocation_t__enumvalues', 'hsa_variable_segment_t',
'hsa_variable_segment_t__enumvalues', 'hsa_wait_state_t',
'hsa_variable_segment_t__enumvalues',
'hsa_ven_amd_aqlprofile_1_00_pfn_t',
'hsa_ven_amd_aqlprofile_block_name_t',
'hsa_ven_amd_aqlprofile_block_name_t__enumvalues',
'hsa_ven_amd_aqlprofile_data_callback_t',
'hsa_ven_amd_aqlprofile_descriptor_t',
'hsa_ven_amd_aqlprofile_error_string',
'hsa_ven_amd_aqlprofile_event_t',
'hsa_ven_amd_aqlprofile_event_type_t',
'hsa_ven_amd_aqlprofile_event_type_t__enumvalues',
'hsa_ven_amd_aqlprofile_get_info',
'hsa_ven_amd_aqlprofile_id_query_t',
'hsa_ven_amd_aqlprofile_info_data_t',
'hsa_ven_amd_aqlprofile_info_type_t',
'hsa_ven_amd_aqlprofile_info_type_t__enumvalues',
'hsa_ven_amd_aqlprofile_iterate_data',
'hsa_ven_amd_aqlprofile_legacy_get_pm4',
'hsa_ven_amd_aqlprofile_parameter_name_t',
'hsa_ven_amd_aqlprofile_parameter_name_t__enumvalues',
'hsa_ven_amd_aqlprofile_parameter_t',
'hsa_ven_amd_aqlprofile_pfn_t',
'hsa_ven_amd_aqlprofile_profile_t', 'hsa_ven_amd_aqlprofile_read',
'hsa_ven_amd_aqlprofile_start', 'hsa_ven_amd_aqlprofile_stop',
'hsa_ven_amd_aqlprofile_validate_event',
'hsa_ven_amd_aqlprofile_version_major',
'hsa_ven_amd_aqlprofile_version_minor', 'hsa_wait_state_t',
'hsa_wait_state_t__enumvalues', 'hsa_wavefront_get_info',
'hsa_wavefront_info_t', 'hsa_wavefront_info_t__enumvalues',
'hsa_wavefront_t', 'int32_t', 'size_t', 'struct_BrigModuleHeader',
'struct_amd_queue_s', 'struct_amd_signal_s',
'hsa_wavefront_t', 'int32_t', 'kAqlProfileLib', 'size_t',
'struct_BrigModuleHeader', 'struct_amd_queue_s',
'struct_amd_signal_s',
'struct_c__SA_hsa_ext_amd_aql_pm4_packet_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_event_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data',
'struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_profile_t',
'struct_hsa_agent_dispatch_packet_s', 'struct_hsa_agent_s',
'struct_hsa_amd_barrier_value_packet_s', 'struct_hsa_amd_event_s',
'struct_hsa_amd_gpu_hw_exception_info_s',
@@ -4599,6 +5024,8 @@ __all__ = \
'struct_hsa_loaded_code_object_s', 'struct_hsa_pitched_ptr_s',
'struct_hsa_queue_s', 'struct_hsa_region_s',
'struct_hsa_signal_group_s', 'struct_hsa_signal_s',
'struct_hsa_ven_amd_aqlprofile_1_00_pfn_s',
'struct_hsa_wavefront_s', 'uint16_t', 'uint32_t', 'uint64_t',
'union_amd_signal_s_0', 'union_amd_signal_s_1',
'union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0',
'union_hsa_amd_event_s_0']

View File

@@ -1,18 +1,20 @@
from __future__ import annotations
from typing import Tuple
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct, errno
from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up
from tinygrad.renderer.cstyle import HIPRenderer
from tinygrad.runtime.driver.hip_comgr import compile_hip
import tinygrad.runtime.autogen.kfd as kfd
import tinygrad.runtime.autogen.hsa as hsa
import tinygrad.runtime.autogen.amd_sdma as amd_sdma
import tinygrad.runtime.autogen.amd_gpu as amd_gpu
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401
libc = ctypes.CDLL("libc.so.6")
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p
libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
libc.munmap.restype = ctypes.c_int
def node_sysfs_path(node_id, file): return f"/sys/devices/virtual/kfd/kfd/topology/nodes/{node_id}/{file}"
@@ -40,7 +42,7 @@ kio = ioctls_from_header()
def create_sdma_packets():
# TODO: clean up this, if we want to keep it
structs = {}
for name,pkt in [(name,s) for name,s in amd_sdma.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
for name,pkt in [(name,s) for name,s in amd_gpu.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
names = set()
fields = []
for pkt_fields in pkt._fields_:
@@ -72,6 +74,8 @@ class KFDCompiler(Compiler):
AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
SDMA_MAX_COPY_SIZE = 0x400000
VENDOR_HEADER = hsa.HSA_PACKET_TYPE_VENDOR_SPECIFIC << hsa.HSA_PACKET_HEADER_TYPE
DISPATCH_KERNEL_SETUP = 3 << hsa.HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
DISPATCH_KERNEL_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
@@ -96,6 +100,8 @@ class KFDProgram:
for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]
self.device._submit_cache_inv(gli=2)
entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
self.handle = self.lib_gpu.va_addr + entry_point
self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
@@ -105,7 +111,8 @@ class KFDProgram:
f"{self.private_segment_size=} > {self.device.max_private_segment_size=}"
# NOTE: no programs are ever freed
def __del__(self): kio.free_memory_of_gpu(KFDDevice.kfd, handle=self.lib_gpu.handle)
def __del__(self):
if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
if not hasattr(self, "args_struct_t"):
@@ -150,8 +157,15 @@ class KFDAllocator(LRUAllocator):
super().__init__()
def _alloc(self, size:int, options:BufferOptions):
if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
try:
if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
except OSError as e:
if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
else: raise
def _free(self, gpumem, options:BufferOptions):
  # Hand the buffer back to the owning device for release; `options` is not
  # consulted here.
  owner = self.device
  owner._gpu_free(gpumem)
def copyin(self, dest, src: memoryview):
# TODO: need to make the address visible to gpu and pass it directly to sdma.
@@ -194,11 +208,18 @@ class KFDDevice(Compiled):
assert buf != 0xffffffffffffffff
assert addr == buf == mem.va_addr
if map_to_gpu:
arr = (ctypes.c_int32 * 1)(self.gpu_id)
stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
mem.__setattr__("mapped_gpu_ids", (ctypes.c_int32 * 1)(self.gpu_id))
stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(gpus:=mem.mapped_gpu_ids), n_devices=len(gpus))
assert stm.n_success == 1
return mem
def _gpu_free(self, mem):
  # Release a buffer in three steps:
  #   1. unmap it from the GPUs recorded in `mem.mapped_gpu_ids`, if that
  #      attribute is present,
  #   2. drop the CPU-side mapping of [va_addr, va_addr + size),
  #   3. free the KFD memory handle.
  gpus = getattr(mem, "mapped_gpu_ids", None)
  if gpus is not None:
    n_gpus = len(gpus)
    stm = kio.unmap_memory_from_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(gpus), n_devices=n_gpus)
    # every recorded device must have been unmapped
    assert stm.n_success == n_gpus
  libc.munmap(mem.va_addr, mem.size)
  kio.free_memory_of_gpu(self.kfd, handle=mem.handle)
def __init__(self, device:str=""):
if KFDDevice.kfd == -1: KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
self.device_id = int(device.split(":")[1]) if ":" in device else 0
@@ -215,6 +236,7 @@ class KFDDevice(Compiled):
self.signals_page = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.kernargs = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.completion_signal = hsa.amd_signal_t.from_address(self.signals_page.va_addr)
@@ -274,12 +296,18 @@ class KFDDevice(Compiled):
# prebuilt packets
self.sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0)
self.sdma_cache_inv = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
self.sdma_cache_inv = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1,
GCR_CONTROL_GL2_RANGE=0)
self.sdma_cache_wb = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
self.sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
GCR_CONTROL_GL2_RANGE=0)
pm4_indirect_cmd = (ctypes.c_uint32*13)(amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), self.pm4_indirect_buf.va_addr & 0xffffffff,
(self.pm4_indirect_buf.va_addr>>32) & 0xffffffff, 8 | amd_gpu.INDIRECT_BUFFER_VALID, 0xa)
ctypes.memmove(ctypes.addressof(pm4_cmds:=(ctypes.c_uint16*27)(1))+2, ctypes.addressof(pm4_indirect_cmd), ctypes.sizeof(pm4_indirect_cmd))
self.pm4_packet = hsa.hsa_ext_amd_aql_pm4_packet_t(header=VENDOR_HEADER, pm4_command=pm4_cmds,
completion_signal=hsa.hsa_signal_t(ctypes.addressof(self.completion_signal)))
# Helpers
map_uptr2gpu_struct_t = init_c_struct_t(tuple(kfd.struct_kfd_ioctl_svm_args._fields_[:-1]+[('attrs', kfd.struct_kfd_ioctl_svm_attribute*2)])) # type: ignore
self.map_uptr2gpu_struct = map_uptr2gpu_struct_t(nattr=2, op=0x0)
@@ -302,11 +330,11 @@ class KFDDevice(Compiled):
# NOTE: we check only low 32 bits to be zeroed, we don't use higher values for signals
for sig in wait_signals:
poll_addr = ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'value').offset
blit_sdma_command(sdma_pkts.poll_regmem(op=amd_sdma.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3, addr=poll_addr,
blit_sdma_command(sdma_pkts.poll_regmem(op=amd_gpu.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3, addr=poll_addr,
value=0, mask=0xffffffff, interval=0x04, retry_count=0xfff))
if completion_signal is not None:
blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
blit_sdma_command(sdma_pkts.timestamp(op=amd_gpu.SDMA_OP_TIMESTAMP, sub_op=amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'start_ts').offset))
blit_sdma_command(self.sdma_flush_hdp_pkt)
blit_sdma_command(self.sdma_cache_inv)
@@ -315,22 +343,43 @@ class KFDDevice(Compiled):
copies_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
for _ in range(copies_commands):
step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
blit_sdma_command(sdma_pkts.copy_linear(op=amd_sdma.SDMA_OP_COPY, sub_op=amd_sdma.SDMA_SUBOP_COPY_LINEAR,
blit_sdma_command(sdma_pkts.copy_linear(op=amd_gpu.SDMA_OP_COPY, sub_op=amd_gpu.SDMA_SUBOP_COPY_LINEAR,
count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied))
copied += step_copy_size
blit_sdma_command(self.sdma_cache_wb)
if completion_signal is not None:
blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
blit_sdma_command(sdma_pkts.timestamp(op=amd_gpu.SDMA_OP_TIMESTAMP, sub_op=amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'end_ts').offset))
if completion_signal is not None:
signal_addr = ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset
blit_sdma_command(sdma_pkts.atomic(op=amd_sdma.SDMA_OP_ATOMIC, operation=amd_sdma.SDMA_ATOMIC_ADD64, addr=signal_addr, src_data=(1<<64)-1))
blit_sdma_command(sdma_pkts.atomic(op=amd_gpu.SDMA_OP_ATOMIC, operation=amd_gpu.SDMA_ATOMIC_ADD64, addr=signal_addr, src_data=(1<<64)-1))
if completion_signal.event_mailbox_ptr != 0:
blit_sdma_command(sdma_pkts.fence(op=amd_sdma.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr,
blit_sdma_command(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr,
data=completion_signal.event_id))
blit_sdma_command(sdma_pkts.trap(op=amd_sdma.SDMA_OP_TRAP, int_ctx=completion_signal.event_id))
blit_sdma_command(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=completion_signal.event_id))
self.sdma_write_pointer[0] = self.sdma_doorbell_value
self.sdma_doorbell[0] = self.sdma_doorbell_value
def _submit_cache_inv(self, addr=0x0, sz=(1 << 64)-1, gli=0, glv=0, glk=0, gl1=0, gl2=0):
# Build a PM4 ACQUIRE_MEM command, submit it through the AQL ring and block until it has run.
#
# Parameters:
#   addr, sz  - encoded into the command words below; with the defaults addr is 0 and sz is 2**64-1.
#               NOTE(review): presumably "whole address space" -- confirm against the ACQUIRE_MEM packet spec.
#   gli, glv, glk, gl1, gl2 - each is passed to the matching amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_*_INV
#               helper; the results are OR-ed into the last command word. All default to 0.
#
# Side effects: writes into the PM4 indirect buffer and the AQL ring, updates
# self.amd_aql_queue.write_dispatch_id, writes self.aql_doorbell[0] and increments self.aql_doorbell_value.
# Raises AssertionError if the queue's read and write dispatch ids differ after the wait.

# View 0x1000 bytes at the indirect buffer's address, cast to format "I" so it can be filled item by item.
pm4_buffer_view = to_mv(self.pm4_indirect_buf.va_addr, 0x1000).cast("I")
# Command words: header, 0, sz bits [31:0], sz bits [39:32], addr bits [31:0], addr bits [55:32], 0,
# then the OR of the five GCR_CNTL invalidate fields. Higher bits of sz and addr are masked off.
pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0,
sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
# Copy the command words into the start of the indirect buffer.
for i, value in enumerate(pm4_cmd): pm4_buffer_view[i] = value

# Copy the prebuilt self.pm4_packet (AQL_PACKET_SIZE bytes) into the ring at the byte offset derived from
# the current doorbell value; the modulo by the ring size wraps the offset back to the start of the ring.
ctypes.memmove(self.aql_ring.va_addr + (self.aql_doorbell_value * AQL_PACKET_SIZE) % self.aql_ring.size,
ctypes.addressof(self.pm4_packet), AQL_PACKET_SIZE)
# Publish the new write index first, then write the doorbell with the index of the packet just placed,
# and only afterwards advance the local counter for the next submission.
self.amd_aql_queue.write_dispatch_id = self.aql_doorbell_value + 1
self.aql_doorbell[0] = self.aql_doorbell_value
self.aql_doorbell_value += 1

# Wait on the single event identified by self.completion_signal.event_id.
# NOTE(review): timeout=1000 -- units are not visible here (presumably milliseconds); confirm against the KFD ioctl.
evt_arr = (kfd.struct_kfd_event_data * 1)()
evt_arr[0].event_id = self.completion_signal.event_id
kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)

# Check that the read index matches the write index. The result of wait_events is not inspected, so this
# assert is the only check on the submission here (and it is removed when Python runs with -O).
assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"