kfd free buffers (#4027)

* kfd free buffers

* unmap

* all test passes

* better pm4

* forgot these

* invalidate only range

* better cache

* forgot

* comments

* fixes
This commit is contained in:
nimlgen
2024-04-02 01:50:58 +03:00
committed by GitHub
parent 77a68fc52f
commit d6ba44bc1e
5 changed files with 1451 additions and 30 deletions

View File

@@ -1428,3 +1428,473 @@ __all__ = \
'union_SDMA_PKT_TIMESTAMP_TAG_HEADER_UNION',
'union_SDMA_PKT_TRAP_TAG_HEADER_UNION',
'union_SDMA_PKT_TRAP_TAG_INT_CONTEXT_UNION']
#/*
# * Copyright 2019 Advanced Micro Devices, Inc.
# *
# * Permission is hereby granted, free of charge, to any person obtaining a
# * copy of this software and associated documentation files (the "Software"),
# * to deal in the Software without restriction, including without limitation
# * the rights to use, copy, modify, merge, publish, distribute, sublicense,
# * and/or sell copies of the Software, and to permit persons to whom the
# * Software is furnished to do so, subject to the following conditions:
# *
# * The above copyright notice and this permission notice shall be included in
# * all copies or substantial portions of the Software.
# *
# * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
# * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# * OTHER DEALINGS IN THE SOFTWARE.
# *
# */
#ifndef NVD_H
#define NVD_H
#/**
# * Navi's PM4 definitions
# */
# PM4 packet header helpers.  A header is one 32-bit word; the accessors
# below show the field positions: type in bits [31:30], count in bits
# [29:16], and a type-specific payload in the low bits.
PACKET_TYPE0 = 0
PACKET_TYPE1 = 1
PACKET_TYPE2 = 2
PACKET_TYPE3 = 3


def CP_PACKET_GET_TYPE(h):
    """Return the packet type held in bits [31:30] of header ``h``."""
    return (h >> 30) & 3


def CP_PACKET_GET_COUNT(h):
    """Return the 14-bit count field held in bits [29:16] of header ``h``."""
    return (h >> 16) & 0x3FFF


def CP_PACKET0_GET_REG(h):
    """Return the low 16 bits of header ``h`` (the type-0 register field)."""
    return h & 0xFFFF


def CP_PACKET3_GET_OPCODE(h):
    """Return the 8-bit opcode held in bits [15:8] of header ``h``."""
    return (h >> 8) & 0xFF


def PACKET0(reg, n):
    """Build a type-0 header from register field ``reg`` and count ``n``.

    ``reg`` is masked to 16 bits and ``n`` to 14 bits before being packed.
    """
    return (PACKET_TYPE0 << 30) | (reg & 0xFFFF) | ((n & 0x3FFF) << 16)


CP_PACKET2 = 0x80000000
PACKET2_PAD_SHIFT = 0
PACKET2_PAD_MASK = 0x3fffffff << 0


def PACKET2(v):
    """Build a type-2 header whose pad field carries ``v``.

    ``v`` is shifted by ``PACKET2_PAD_SHIFT`` and masked with
    ``PACKET2_PAD_MASK`` before being OR-ed into ``CP_PACKET2``.
    """
    # The translated header spelled this as REG_SET(PACKET2_PAD, v).  Neither
    # REG_SET nor PACKET2_PAD exists as a Python name here (presumably a C
    # macro that pastes the _SHIFT/_MASK suffixes), so calling it raised
    # NameError.  The field insertion is written out explicitly instead.
    return CP_PACKET2 | ((v << PACKET2_PAD_SHIFT) & PACKET2_PAD_MASK)


def PACKET3(op, n):
    """Build a type-3 header from opcode ``op`` and count ``n``.

    ``op`` is masked to 8 bits (placed at bits [15:8]) and ``n`` to 14 bits
    (placed at bits [29:16]).
    """
    return (PACKET_TYPE3 << 30) | ((op & 0xFF) << 8) | ((n & 0x3FFF) << 16)


def PACKET3_COMPUTE(op, n):
    """Build a type-3 header like ``PACKET3`` and additionally set bit 1."""
    return PACKET3(op, n) | (1 << 1)
#/* Packet 3 types */
# Type-3 packet opcode values (first part of the table; the numbering is not
# contiguous, unlisted values are simply not defined here).
PACKET3_NOP = 0x10
PACKET3_SET_BASE = 0x11
# Helper listed under SET_BASE: shifts x by zero bits, i.e. leaves it unchanged.
def PACKET3_BASE_INDEX(x): return ((x) << 0)
CE_PARTITION_BASE = 3
PACKET3_CLEAR_STATE = 0x12
PACKET3_INDEX_BUFFER_SIZE = 0x13
PACKET3_DISPATCH_DIRECT = 0x15
PACKET3_DISPATCH_INDIRECT = 0x16
PACKET3_INDIRECT_BUFFER_END = 0x17
PACKET3_INDIRECT_BUFFER_CNST_END = 0x19
PACKET3_ATOMIC_GDS = 0x1D
PACKET3_ATOMIC_MEM = 0x1E
PACKET3_OCCLUSION_QUERY = 0x1F
PACKET3_SET_PREDICATION = 0x20
PACKET3_REG_RMW = 0x21
PACKET3_COND_EXEC = 0x22
PACKET3_PRED_EXEC = 0x23
PACKET3_DRAW_INDIRECT = 0x24
PACKET3_DRAW_INDEX_INDIRECT = 0x25
PACKET3_INDEX_BASE = 0x26
PACKET3_DRAW_INDEX_2 = 0x27
PACKET3_CONTEXT_CONTROL = 0x28
PACKET3_INDEX_TYPE = 0x2A
PACKET3_DRAW_INDIRECT_MULTI = 0x2C
PACKET3_DRAW_INDEX_AUTO = 0x2D
PACKET3_NUM_INSTANCES = 0x2F
PACKET3_DRAW_INDEX_MULTI_AUTO = 0x30
PACKET3_INDIRECT_BUFFER_PRIV = 0x32
PACKET3_INDIRECT_BUFFER_CNST = 0x33
# Same value as PACKET3_INDIRECT_BUFFER_CNST above.
PACKET3_COND_INDIRECT_BUFFER_CNST = 0x33
PACKET3_STRMOUT_BUFFER_UPDATE = 0x34
PACKET3_DRAW_INDEX_OFFSET_2 = 0x35
PACKET3_DRAW_PREAMBLE = 0x36
# WRITE_DATA packet: opcode plus helpers that position control-word fields.
PACKET3_WRITE_DATA = 0x37


def WRITE_DATA_DST_SEL(x):
    """Place destination selector ``x`` at bit 8.

    Selector values listed by the source header:
      0 - register
      1 - memory (sync - via GRBM)
      2 - gl2
      3 - gds
      4 - reserved
      5 - memory (async - direct)
    """
    return x << 8


WR_ONE_ADDR = 1 << 16
WR_CONFIRM = 1 << 20


def WRITE_DATA_CACHE_POLICY(x):
    """Place cache policy ``x`` at bit 25 (0 - LRU, 1 - Stream)."""
    return x << 25


def WRITE_DATA_ENGINE_SEL(x):
    """Place engine selector ``x`` at bit 30 (0 - me, 1 - pfp, 2 - ce)."""
    return x << 30
# Further type-3 opcodes.  The PACKET3_SEM_* values are listed directly under
# the MEM_SEMAPHORE opcode; they are pre-shifted bit patterns, not opcodes.
PACKET3_DRAW_INDEX_INDIRECT_MULTI = 0x38
PACKET3_MEM_SEMAPHORE = 0x39
PACKET3_SEM_USE_MAILBOX = (0x1 << 16)
PACKET3_SEM_SEL_SIGNAL_TYPE = (0x1 << 20) #/* 0 = increment, 1 = write 1 */
# The two selectors below occupy bits [31:29] (values 6 and 7 shifted by 29).
PACKET3_SEM_SEL_SIGNAL = (0x6 << 29)
PACKET3_SEM_SEL_WAIT = (0x7 << 29)
PACKET3_DRAW_INDEX_MULTI_INST = 0x3A
PACKET3_COPY_DW = 0x3B
# WAIT_REG_MEM packet: opcode plus helpers that position control-word fields.
PACKET3_WAIT_REG_MEM = 0x3C


def WAIT_REG_MEM_FUNCTION(x):
    """Place compare function ``x`` at bit 0.

    Function codes listed by the source header:
      0 - always
      1 - <
      2 - <=
      3 - ==
      4 - !=
      5 - >=
      6 - >
    """
    return x << 0


def WAIT_REG_MEM_MEM_SPACE(x):
    """Place memory-space selector ``x`` at bit 4 (0 - reg, 1 - mem)."""
    return x << 4


def WAIT_REG_MEM_OPERATION(x):
    """Place operation ``x`` at bit 6 (0 - wait_reg_mem, 1 - wr_wait_wr_reg)."""
    return x << 6


def WAIT_REG_MEM_ENGINE(x):
    """Place engine selector ``x`` at bit 8 (0 - me, 1 - pfp)."""
    return x << 8
# INDIRECT_BUFFER packet: opcode, a pre-shifted flag, and field helpers.
PACKET3_INDIRECT_BUFFER = 0x3F
INDIRECT_BUFFER_VALID = 1 << 23


def INDIRECT_BUFFER_CACHE_POLICY(x):
    """Place cache policy ``x`` at bit 28 (0 - LRU, 1 - Stream, 2 - Bypass)."""
    return x << 28


def INDIRECT_BUFFER_PRE_ENB(x):
    """Place ``x`` at bit 21."""
    return x << 21


def INDIRECT_BUFFER_PRE_RESUME(x):
    """Place ``x`` at bit 30."""
    return x << 30


# Same opcode value as PACKET3_INDIRECT_BUFFER.
PACKET3_COND_INDIRECT_BUFFER = 0x3F
# Type-3 opcodes 0x40-0x45 (plain values, no field helpers attached).
PACKET3_COPY_DATA = 0x40
PACKET3_CP_DMA = 0x41
PACKET3_PFP_SYNC_ME = 0x42
PACKET3_SURFACE_SYNC = 0x43
PACKET3_ME_INITIALIZE = 0x44
PACKET3_COND_WRITE = 0x45
# EVENT_WRITE packet: opcode plus helpers for its event fields.
PACKET3_EVENT_WRITE = 0x46


def EVENT_TYPE(x):
    """Place event type ``x`` at bit 0."""
    return x << 0


def EVENT_INDEX(x):
    """Place event index ``x`` at bit 8.

    Index values listed by the source header:
      0 - any non-TS event
      1 - ZPASS_DONE, PIXEL_PIPE_STAT_*
      2 - SAMPLE_PIPELINESTAT
      3 - SAMPLE_STREAMOUTSTAT*
      4 - *S_PARTIAL_FLUSH
    """
    return x << 8


PACKET3_EVENT_WRITE_EOP = 0x47
PACKET3_EVENT_WRITE_EOS = 0x48
# RELEASE_MEM packet: opcode, pre-shifted GCR flag bits, and field helpers.
PACKET3_RELEASE_MEM = 0x49


def PACKET3_RELEASE_MEM_EVENT_TYPE(x):
    """Place event type ``x`` at bit 0."""
    return x << 0


def PACKET3_RELEASE_MEM_EVENT_INDEX(x):
    """Place event index ``x`` at bit 8."""
    return x << 8


# Single-bit flags occupying bits 12-22 (bit 18 is not assigned here).
PACKET3_RELEASE_MEM_GCR_GLM_WB = 1 << 12
PACKET3_RELEASE_MEM_GCR_GLM_INV = 1 << 13
PACKET3_RELEASE_MEM_GCR_GLV_INV = 1 << 14
PACKET3_RELEASE_MEM_GCR_GL1_INV = 1 << 15
PACKET3_RELEASE_MEM_GCR_GL2_US = 1 << 16
PACKET3_RELEASE_MEM_GCR_GL2_RANGE = 1 << 17
PACKET3_RELEASE_MEM_GCR_GL2_DISCARD = 1 << 19
PACKET3_RELEASE_MEM_GCR_GL2_INV = 1 << 20
PACKET3_RELEASE_MEM_GCR_GL2_WB = 1 << 21
PACKET3_RELEASE_MEM_GCR_SEQ = 1 << 22


def PACKET3_RELEASE_MEM_CACHE_POLICY(x):
    """Place cache policy ``x`` at bit 25.

    Policy values listed by the source header:
      0 - cache_policy__me_release_mem__lru
      1 - cache_policy__me_release_mem__stream
      2 - cache_policy__me_release_mem__noa
      3 - cache_policy__me_release_mem__bypass
    """
    return x << 25


PACKET3_RELEASE_MEM_EXECUTE = 1 << 28


def PACKET3_RELEASE_MEM_DATA_SEL(x):
    """Place data selector ``x`` at bit 29.

    Selector values listed by the source header:
      0 - discard
      1 - send low 32bit data
      2 - send 64bit data
      3 - send 64bit GPU counter value
      4 - send 64bit sys counter value
    """
    return x << 29


def PACKET3_RELEASE_MEM_INT_SEL(x):
    """Place interrupt selector ``x`` at bit 24.

    Selector values listed by the source header:
      0 - none
      1 - interrupt only (DATA_SEL = 0)
      2 - interrupt when data write is confirmed
    """
    return x << 24


def PACKET3_RELEASE_MEM_DST_SEL(x):
    """Place destination selector ``x`` at bit 16 (0 - MC, 1 - TC/L2)."""
    return x << 16
# PREAMBLE_CNTL opcode followed by two pre-shifted command values
# (2 and 3 placed at bit 28).
PACKET3_PREAMBLE_CNTL = 0x4A
PACKET3_PREAMBLE_BEGIN_CLEAR_STATE = (2 << 28)
PACKET3_PREAMBLE_END_CLEAR_STATE = (3 << 28)
# DMA_DATA packet.  Dword layout as given by the source header:
#   1. header
#   2. CONTROL
#   3. SRC_ADDR_LO or DATA [31:0]
#   4. SRC_ADDR_HI [31:0]
#   5. DST_ADDR_LO [31:0]
#   6. DST_ADDR_HI [7:0]
#   7. COMMAND [31:26] | BYTE_COUNT [25:0]
PACKET3_DMA_DATA = 0x50


# --- CONTROL dword fields ---

def PACKET3_DMA_DATA_ENGINE(x):
    """Place engine selector ``x`` at bit 0 (0 - ME, 1 - PFP)."""
    return x << 0


def PACKET3_DMA_DATA_SRC_CACHE_POLICY(x):
    """Place source cache policy ``x`` at bit 13 (0 - LRU, 1 - Stream)."""
    return x << 13


def PACKET3_DMA_DATA_DST_SEL(x):
    """Place destination selector ``x`` at bit 20.

    Selector values listed by the source header:
      0 - DST_ADDR using DAS
      1 - GDS
      3 - DST_ADDR using L2
    """
    return x << 20


def PACKET3_DMA_DATA_DST_CACHE_POLICY(x):
    """Place destination cache policy ``x`` at bit 25 (0 - LRU, 1 - Stream)."""
    return x << 25


def PACKET3_DMA_DATA_SRC_SEL(x):
    """Place source selector ``x`` at bit 29.

    Selector values listed by the source header:
      0 - SRC_ADDR using SAS
      1 - GDS
      2 - DATA
      3 - SRC_ADDR using L2
    """
    return x << 29


PACKET3_DMA_DATA_CP_SYNC = 1 << 31

# --- COMMAND dword flags ---
# SAS / DAS: the source header annotates both as 0 - memory, 1 - register.
PACKET3_DMA_DATA_CMD_SAS = 1 << 26
PACKET3_DMA_DATA_CMD_DAS = 1 << 27
PACKET3_DMA_DATA_CMD_SAIC = 1 << 28
PACKET3_DMA_DATA_CMD_DAIC = 1 << 29
PACKET3_DMA_DATA_CMD_RAW_WAIT = 1 << 30
# Type-3 opcodes 0x51-0x55 (0x54 is not defined here).
PACKET3_CONTEXT_REG_RMW = 0x51
PACKET3_GFX_CNTX_UPDATE = 0x52
PACKET3_BLK_CNTX_UPDATE = 0x53
PACKET3_INCR_UPDT_STATE = 0x55
# ACQUIRE_MEM packet.  Dword layout as listed by the source header, with the
# numbering normalized (the original comment repeated "2." and skipped "6."):
#   1.  HEADER
#   2.  COHER_CNTL [30:0]
#   2.1 ENGINE_SEL [31:31]
#   3.  COHER_SIZE [31:0]
#   4.  COHER_SIZE_HI [7:0]
#   5.  COHER_BASE_LO [31:0]
#   6.  COHER_BASE_HI [23:0]
#   7.  POLL_INTERVAL [15:0]
#   8.  GCR_CNTL [18:0]
# Every helper below positions one field of the GCR_CNTL dword.
PACKET3_ACQUIRE_MEM = 0x58


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(x):
    """Place ``x`` at bit 0 (0:NOP, 1:ALL, 2:RANGE, 3:FIRST_LAST)."""
    return x << 0


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_RANGE(x):
    """Place ``x`` at bit 2 (0:ALL, 1:reserved, 2:RANGE, 3:FIRST_LAST)."""
    return x << 2


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(x):
    """Place ``x`` at bit 4."""
    return x << 4


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(x):
    """Place ``x`` at bit 5."""
    return x << 5


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(x):
    """Place ``x`` at bit 6."""
    return x << 6


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(x):
    """Place ``x`` at bit 7."""
    return x << 7


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(x):
    """Place ``x`` at bit 8."""
    return x << 8


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(x):
    """Place ``x`` at bit 9."""
    return x << 9


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_US(x):
    """Place ``x`` at bit 10."""
    return x << 10


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_RANGE(x):
    """Place ``x`` at bit 11 (0:ALL, 1:VOL, 2:RANGE, 3:FIRST_LAST)."""
    return x << 11


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_DISCARD(x):
    """Place ``x`` at bit 13."""
    return x << 13


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(x):
    """Place ``x`` at bit 14."""
    return x << 14


def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(x):
    """Place ``x`` at bit 15."""
    return x << 15


def PACKET3_ACQUIRE_MEM_GCR_CNTL_SEQ(x):
    """Place ``x`` at bit 16 (0: PARALLEL, 1: FORWARD, 2: REVERSE)."""
    return x << 16


PACKET3_ACQUIRE_MEM_GCR_RANGE_IS_PA = 1 << 18
# Type-3 opcodes 0x59-0x8F.  The *_START / *_END names that follow some
# SET_*_REG opcodes are plain constants holding register offsets, not opcodes.
PACKET3_REWIND = 0x59
PACKET3_INTERRUPT = 0x5A
PACKET3_GEN_PDEPTE = 0x5B
PACKET3_INDIRECT_BUFFER_PASID = 0x5C
PACKET3_PRIME_UTCL2 = 0x5D
PACKET3_LOAD_UCONFIG_REG = 0x5E
PACKET3_LOAD_SH_REG = 0x5F
PACKET3_LOAD_CONFIG_REG = 0x60
PACKET3_LOAD_CONTEXT_REG = 0x61
PACKET3_LOAD_COMPUTE_STATE = 0x62
PACKET3_LOAD_SH_REG_INDEX = 0x63
PACKET3_SET_CONFIG_REG = 0x68
PACKET3_SET_CONFIG_REG_START = 0x00002000
PACKET3_SET_CONFIG_REG_END = 0x00002c00
PACKET3_SET_CONTEXT_REG = 0x69
PACKET3_SET_CONTEXT_REG_START = 0x0000a000
PACKET3_SET_CONTEXT_REG_END = 0x0000a400
PACKET3_SET_CONTEXT_REG_INDEX = 0x6A
PACKET3_SET_VGPR_REG_DI_MULTI = 0x71
PACKET3_SET_SH_REG_DI = 0x72
PACKET3_SET_CONTEXT_REG_INDIRECT = 0x73
PACKET3_SET_SH_REG_DI_MULTI = 0x74
PACKET3_GFX_PIPE_LOCK = 0x75
PACKET3_SET_SH_REG = 0x76
# The SH register range starts where the CONFIG register range ends (0x2c00).
PACKET3_SET_SH_REG_START = 0x00002c00
PACKET3_SET_SH_REG_END = 0x00003000
PACKET3_SET_SH_REG_OFFSET = 0x77
PACKET3_SET_QUEUE_REG = 0x78
PACKET3_SET_UCONFIG_REG = 0x79
PACKET3_SET_UCONFIG_REG_START = 0x0000c000
PACKET3_SET_UCONFIG_REG_END = 0x0000c400
PACKET3_SET_UCONFIG_REG_INDEX = 0x7A
PACKET3_FORWARD_HEADER = 0x7C
PACKET3_SCRATCH_RAM_WRITE = 0x7D
PACKET3_SCRATCH_RAM_READ = 0x7E
PACKET3_LOAD_CONST_RAM = 0x80
PACKET3_WRITE_CONST_RAM = 0x81
PACKET3_DUMP_CONST_RAM = 0x83
PACKET3_INCREMENT_CE_COUNTER = 0x84
PACKET3_INCREMENT_DE_COUNTER = 0x85
PACKET3_WAIT_ON_CE_COUNTER = 0x86
PACKET3_WAIT_ON_DE_COUNTER_DIFF = 0x88
PACKET3_SWITCH_BUFFER = 0x8B
# Each *_ACE name below shares its value with the opcode on the line before it.
PACKET3_DISPATCH_DRAW_PREAMBLE = 0x8C
PACKET3_DISPATCH_DRAW_PREAMBLE_ACE = 0x8C
PACKET3_DISPATCH_DRAW = 0x8D
PACKET3_DISPATCH_DRAW_ACE = 0x8D
PACKET3_GET_LOD_STATS = 0x8E
PACKET3_DRAW_MULTI_PREAMBLE = 0x8F
# FRAME_CONTROL packet: opcode, a flag bit, and the command-field helper.
PACKET3_FRAME_CONTROL = 0x90
FRAME_TMZ = 1 << 0


def FRAME_CMD(x):
    """Place frame command ``x`` at bit 28 (x=0: tmz_begin, x=1: tmz_end)."""
    return x << 28


# Opcodes 0x91-0x96 (0x92 is not defined here).
PACKET3_INDEX_ATTRIBUTES_INDIRECT = 0x91
PACKET3_WAIT_REG_MEM64 = 0x93
PACKET3_COND_PREEMPT = 0x94
PACKET3_HDP_FLUSH = 0x95
PACKET3_COPY_DATA_RB = 0x96
# INVALIDATE_TLBS packet: opcode plus helpers that position its fields.
PACKET3_INVALIDATE_TLBS = 0x98


def PACKET3_INVALIDATE_TLBS_DST_SEL(x):
    """Place destination selector ``x`` at bit 0."""
    return x << 0


def PACKET3_INVALIDATE_TLBS_ALL_HUB(x):
    """Place ``x`` at bit 4."""
    return x << 4


def PACKET3_INVALIDATE_TLBS_PASID(x):
    """Place ``x`` at bit 5."""
    return x << 5


# Opcodes 0x99-0x9F.
PACKET3_AQL_PACKET = 0x99
PACKET3_DMA_DATA_FILL_MULTI = 0x9A
PACKET3_SET_SH_REG_INDEX = 0x9B
PACKET3_DRAW_INDIRECT_COUNT_MULTI = 0x9C
PACKET3_DRAW_INDEX_INDIRECT_COUNT_MULTI = 0x9D
PACKET3_DUMP_CONST_RAM_OFFSET = 0x9E
PACKET3_LOAD_CONTEXT_REG_INDEX = 0x9F
# SET_RESOURCES packet.  Dword layout as given by the source header:
#   1. header
#   2. CONTROL
#   3. QUEUE_MASK_LO [31:0]
#   4. QUEUE_MASK_HI [31:0]
#   5. GWS_MASK_LO [31:0]
#   6. GWS_MASK_HI [31:0]
#   7. OAC_MASK [15:0]
#   8. GDS_HEAP_SIZE [16:11] | GDS_HEAP_BASE [5:0]
PACKET3_SET_RESOURCES = 0xA0


def PACKET3_SET_RESOURCES_VMID_MASK(x):
    """Place ``x`` at bit 0."""
    return x << 0


def PACKET3_SET_RESOURCES_UNMAP_LATENTY(x):
    """Place ``x`` at bit 16 (the name's spelling is kept for callers)."""
    return x << 16


def PACKET3_SET_RESOURCES_QUEUE_TYPE(x):
    """Place ``x`` at bit 29."""
    return x << 29
PACKET3_MAP_PROCESS = 0xA1

# MAP_QUEUES packet.  Dword layout as given by the source header:
#   1. header
#   2. CONTROL
#   3. CONTROL2
#   4. MQD_ADDR_LO [31:0]
#   5. MQD_ADDR_HI [31:0]
#   6. WPTR_ADDR_LO [31:0]
#   7. WPTR_ADDR_HI [31:0]
PACKET3_MAP_QUEUES = 0xA2


# --- CONTROL dword fields ---

def PACKET3_MAP_QUEUES_QUEUE_SEL(x):
    """Place ``x`` at bit 4."""
    return x << 4


def PACKET3_MAP_QUEUES_VMID(x):
    """Place ``x`` at bit 8."""
    return x << 8


def PACKET3_MAP_QUEUES_QUEUE(x):
    """Place ``x`` at bit 13."""
    return x << 13


def PACKET3_MAP_QUEUES_PIPE(x):
    """Place ``x`` at bit 16."""
    return x << 16


def PACKET3_MAP_QUEUES_ME(x):
    """Place ``x`` at bit 18."""
    return x << 18


def PACKET3_MAP_QUEUES_QUEUE_TYPE(x):
    """Place ``x`` at bit 21."""
    return x << 21


def PACKET3_MAP_QUEUES_ALLOC_FORMAT(x):
    """Place ``x`` at bit 24."""
    return x << 24


def PACKET3_MAP_QUEUES_ENGINE_SEL(x):
    """Place ``x`` at bit 26."""
    return x << 26


def PACKET3_MAP_QUEUES_NUM_QUEUES(x):
    """Place ``x`` at bit 29."""
    return x << 29


# --- CONTROL2 dword fields ---

def PACKET3_MAP_QUEUES_CHECK_DISABLE(x):
    """Place ``x`` at bit 1."""
    return x << 1


def PACKET3_MAP_QUEUES_DOORBELL_OFFSET(x):
    """Place ``x`` at bit 2."""
    return x << 2
# UNMAP_QUEUES packet.  Dword layout as given by the source header:
#   1. header
#   2. CONTROL
#   3. CONTROL2
#   4. CONTROL3
#   5. CONTROL4
#   6. CONTROL5
PACKET3_UNMAP_QUEUES = 0xA3


# --- CONTROL dword fields ---

def PACKET3_UNMAP_QUEUES_ACTION(x):
    """Place action ``x`` at bit 0.

    Action values listed by the source header:
      0 - PREEMPT_QUEUES
      1 - RESET_QUEUES
      2 - DISABLE_PROCESS_QUEUES
      3 - PREEMPT_QUEUES_NO_UNMAP
    """
    return x << 0


def PACKET3_UNMAP_QUEUES_QUEUE_SEL(x):
    """Place ``x`` at bit 4."""
    return x << 4


def PACKET3_UNMAP_QUEUES_ENGINE_SEL(x):
    """Place ``x`` at bit 26."""
    return x << 26


def PACKET3_UNMAP_QUEUES_NUM_QUEUES(x):
    """Place ``x`` at bit 29."""
    return x << 29


# --- CONTROL2a ---

def PACKET3_UNMAP_QUEUES_PASID(x):
    """Place ``x`` at bit 0."""
    return x << 0


# --- CONTROL2b ---

def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(x):
    """Place ``x`` at bit 2."""
    return x << 2


# --- CONTROL3a ---

def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET1(x):
    """Place ``x`` at bit 2."""
    return x << 2


# --- CONTROL3b ---

def PACKET3_UNMAP_QUEUES_RB_WPTR(x):
    """Place ``x`` at bit 0."""
    return x << 0


# --- CONTROL4 ---

def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET2(x):
    """Place ``x`` at bit 2."""
    return x << 2


# --- CONTROL5 ---

def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET3(x):
    """Place ``x`` at bit 2."""
    return x << 2
# QUERY_STATUS packet.  Dword layout as given by the source header:
#   1. header
#   2. CONTROL
#   3. CONTROL2
#   4. ADDR_LO [31:0]
#   5. ADDR_HI [31:0]
#   6. DATA_LO [31:0]
#   7. DATA_HI [31:0]
PACKET3_QUERY_STATUS = 0xA4


# --- CONTROL dword fields ---

def PACKET3_QUERY_STATUS_CONTEXT_ID(x):
    """Place ``x`` at bit 0."""
    return x << 0


def PACKET3_QUERY_STATUS_INTERRUPT_SEL(x):
    """Place ``x`` at bit 28."""
    return x << 28


def PACKET3_QUERY_STATUS_COMMAND(x):
    """Place ``x`` at bit 30."""
    return x << 30


# --- CONTROL2a ---

def PACKET3_QUERY_STATUS_PASID(x):
    """Place ``x`` at bit 0."""
    return x << 0


# --- CONTROL2b ---

def PACKET3_QUERY_STATUS_DOORBELL_OFFSET(x):
    """Place ``x`` at bit 2."""
    return x << 2


def PACKET3_QUERY_STATUS_ENG_SEL(x):
    """Place ``x`` at bit 25."""
    return x << 25


PACKET3_RUN_LIST = 0xA5
PACKET3_MAP_PROCESS_VM = 0xA6
#/* GFX11 */
# SET_Q_PREEMPTION_MODE packet (listed under the GFX11 heading).
PACKET3_SET_Q_PREEMPTION_MODE = 0xF0


def PACKET3_SET_Q_PREEMPTION_MODE_IB_VMID(x):
    """Place ``x`` at bit 0."""
    return x << 0


PACKET3_SET_Q_PREEMPTION_MODE_INIT_SHADOW_MEM = 1 << 0
#endif

View File

@@ -146,6 +146,16 @@ if ctypes.sizeof(ctypes.c_longdouble) == 16:
else:
c_long_double_t = ctypes.c_ubyte*16
class FunctionFactoryStub:
    """Placeholder for a shared library that was not supplied to the generator.

    Any attribute lookup succeeds and yields a fresh ``ctypes.CFUNCTYPE``
    prototype class, so the generated ``.restype`` / ``.argtypes``
    assignments on the looked-up name do not fail.  The result is a function
    *type*, not a bound foreign function, so it cannot be used to call into
    native code.
    """

    def __getattr__(self, _):
        # The lambda is passed as the prototype's result type; ctypes accepts
        # a plain callable there.  The requested attribute name is ignored.
        return ctypes.CFUNCTYPE(lambda y:y)
# libraries['FIXME_STUB'] explanation
# No library exporting these functions was listed (-l libraryname.so) when the
# bindings were generated, so a non-working stub is registered instead.
# You can either re-run clang2py with -l /path/to/library.so
# or fix this manually by replacing the stub with the ctypes.CDLL load that is
# left in the trailing comment below.
_libraries['FIXME_STUB'] = FunctionFactoryStub() # ctypes.CDLL('FIXME_STUB')
@@ -3834,6 +3844,327 @@ struct_hsa_ext_finalizer_1_00_pfn_s._fields_ = [
]
hsa_ext_finalizer_1_00_pfn_t = struct_hsa_ext_finalizer_1_00_pfn_s
# Look up the two aqlprofile version functions on the 'FIXME_STUB' library
# entry and declare their prototypes (no arguments, uint32_t result).  A name
# is left unbound if its attribute lookup raises AttributeError.
try:
    hsa_ven_amd_aqlprofile_version_major = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_version_major
    hsa_ven_amd_aqlprofile_version_major.restype = uint32_t
    hsa_ven_amd_aqlprofile_version_major.argtypes = []
except AttributeError:
    pass
try:
    hsa_ven_amd_aqlprofile_version_minor = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_version_minor
    hsa_ven_amd_aqlprofile_version_minor.restype = uint32_t
    hsa_ven_amd_aqlprofile_version_minor.argtypes = []
except AttributeError:
    pass
# values for enumeration 'c__EA_hsa_ven_amd_aqlprofile_event_type_t'
# Maps each enumerator's integer value to its name.
c__EA_hsa_ven_amd_aqlprofile_event_type_t__enumvalues = {
0: 'HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC',
1: 'HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE',
}
# The enumerators themselves, as module-level integer constants.
HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC = 0
HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE = 1
# The enum is represented as a 32-bit unsigned integer; the unprefixed names
# below are aliases of the generated c__EA_* names.
c__EA_hsa_ven_amd_aqlprofile_event_type_t = ctypes.c_uint32 # enum
hsa_ven_amd_aqlprofile_event_type_t = c__EA_hsa_ven_amd_aqlprofile_event_type_t
hsa_ven_amd_aqlprofile_event_type_t__enumvalues = c__EA_hsa_ven_amd_aqlprofile_event_type_t__enumvalues
# values for enumeration 'c__EA_hsa_ven_amd_aqlprofile_block_name_t'
# Maps each enumerator's integer value to its name.  The final entry (34) is
# the BLOCKS_NUMBER enumerator, one past the last BLOCK_NAME_* value.
c__EA_hsa_ven_amd_aqlprofile_block_name_t__enumvalues = {
0: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC',
1: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF',
2: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS',
3: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM',
4: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE',
5: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI',
6: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ',
7: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS',
8: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM',
9: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX',
10: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA',
11: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA',
12: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC',
13: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP',
14: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD',
15: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB',
16: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB',
17: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM',
18: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ',
19: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2',
20: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR',
21: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC',
22: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2',
23: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA',
24: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB',
25: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA',
26: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A',
27: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C',
28: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A',
29: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C',
30: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR',
31: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS',
32: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC',
33: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA',
34: 'HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER',
}
# The enumerators themselves, as module-level integer constants.
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC = 0
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF = 1
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS = 2
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM = 3
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE = 4
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI = 5
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ = 6
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS = 7
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM = 8
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX = 9
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA = 10
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA = 11
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC = 12
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP = 13
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD = 14
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB = 15
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB = 16
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM = 17
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ = 18
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2 = 19
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR = 20
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC = 21
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2 = 22
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA = 23
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB = 24
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA = 25
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A = 26
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C = 27
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A = 28
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C = 29
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR = 30
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS = 31
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC = 32
HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA = 33
HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER = 34
# The enum is represented as a 32-bit unsigned integer; the unprefixed names
# below are aliases of the generated c__EA_* names.
c__EA_hsa_ven_amd_aqlprofile_block_name_t = ctypes.c_uint32 # enum
hsa_ven_amd_aqlprofile_block_name_t = c__EA_hsa_ven_amd_aqlprofile_block_name_t
hsa_ven_amd_aqlprofile_block_name_t__enumvalues = c__EA_hsa_ven_amd_aqlprofile_block_name_t__enumvalues
# ctypes layout for hsa_ven_amd_aqlprofile_event_t.  The class is declared
# empty first; its packing and field list are attached afterwards.
class struct_c__SA_hsa_ven_amd_aqlprofile_event_t(Structure):
    pass

struct_c__SA_hsa_ven_amd_aqlprofile_event_t._pack_ = 1 # source:False
struct_c__SA_hsa_ven_amd_aqlprofile_event_t._fields_ = [
    ('block_name', hsa_ven_amd_aqlprofile_block_name_t),
    ('block_index', ctypes.c_uint32),
    ('counter_id', ctypes.c_uint32),
]
hsa_ven_amd_aqlprofile_event_t = struct_c__SA_hsa_ven_amd_aqlprofile_event_t
# Look up hsa_ven_amd_aqlprofile_validate_event on the 'FIXME_STUB' library
# entry and declare its prototype: (agent, event*, bool*) -> hsa_status_t.
# The name is left unbound if the attribute lookup raises AttributeError.
try:
    hsa_ven_amd_aqlprofile_validate_event = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_validate_event
    hsa_ven_amd_aqlprofile_validate_event.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_validate_event.argtypes = [hsa_agent_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_event_t), ctypes.POINTER(ctypes.c_bool)]
except AttributeError:
    pass
# values for enumeration 'c__EA_hsa_ven_amd_aqlprofile_parameter_name_t'
# Maps each enumerator's integer value to its name.
c__EA_hsa_ven_amd_aqlprofile_parameter_name_t__enumvalues = {
0: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET',
1: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK',
2: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK',
3: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK',
4: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2',
5: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK',
6: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE',
7: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT',
}
# The enumerators themselves, as module-level integer constants.
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET = 0
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK = 1
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK = 2
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK = 3
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2 = 4
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK = 5
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE = 6
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT = 7
# The enum is represented as a 32-bit unsigned integer; the unprefixed names
# below are aliases of the generated c__EA_* names.
c__EA_hsa_ven_amd_aqlprofile_parameter_name_t = ctypes.c_uint32 # enum
hsa_ven_amd_aqlprofile_parameter_name_t = c__EA_hsa_ven_amd_aqlprofile_parameter_name_t
hsa_ven_amd_aqlprofile_parameter_name_t__enumvalues = c__EA_hsa_ven_amd_aqlprofile_parameter_name_t__enumvalues
# ctypes layout for hsa_ven_amd_aqlprofile_parameter_t: a parameter-name enum
# value paired with a 32-bit value.
class struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t(Structure):
    pass

struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t._pack_ = 1 # source:False
struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t._fields_ = [
    ('parameter_name', hsa_ven_amd_aqlprofile_parameter_name_t),
    ('value', ctypes.c_uint32),
]
hsa_ven_amd_aqlprofile_parameter_t = struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t
# ctypes layout for hsa_ven_amd_aqlprofile_descriptor_t: an untyped pointer
# and a 32-bit size.  Because _pack_ is 1, the trailing padding is spelled
# out as an explicit 4-byte field.
class struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t(Structure):
    pass

struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t._pack_ = 1 # source:False
struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t._fields_ = [
    ('ptr', ctypes.POINTER(None)),
    ('size', ctypes.c_uint32),
    ('PADDING_0', ctypes.c_ubyte * 4),
]
hsa_ven_amd_aqlprofile_descriptor_t = struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t
# ctypes layout for hsa_ven_amd_aqlprofile_profile_t.  It holds the agent,
# the event type, pointer+count pairs for events and parameters, and two
# buffer descriptors.  Because _pack_ is 1, padding after each 32-bit member
# is spelled out as explicit PADDING_n fields.
class struct_c__SA_hsa_ven_amd_aqlprofile_profile_t(Structure):
    pass

struct_c__SA_hsa_ven_amd_aqlprofile_profile_t._pack_ = 1 # source:False
struct_c__SA_hsa_ven_amd_aqlprofile_profile_t._fields_ = [
    ('agent', hsa_agent_t),
    ('type', hsa_ven_amd_aqlprofile_event_type_t),
    ('PADDING_0', ctypes.c_ubyte * 4),
    ('events', ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_event_t)),
    ('event_count', ctypes.c_uint32),
    ('PADDING_1', ctypes.c_ubyte * 4),
    ('parameters', ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t)),
    ('parameter_count', ctypes.c_uint32),
    ('PADDING_2', ctypes.c_ubyte * 4),
    ('output_buffer', hsa_ven_amd_aqlprofile_descriptor_t),
    ('command_buffer', hsa_ven_amd_aqlprofile_descriptor_t),
]
hsa_ven_amd_aqlprofile_profile_t = struct_c__SA_hsa_ven_amd_aqlprofile_profile_t
# ctypes layout for hsa_ext_amd_aql_pm4_packet_t: a 16-bit header, 27 16-bit
# words of PM4 command data, and a completion signal handle.
class struct_c__SA_hsa_ext_amd_aql_pm4_packet_t(Structure):
    pass

struct_c__SA_hsa_ext_amd_aql_pm4_packet_t._pack_ = 1 # source:False
struct_c__SA_hsa_ext_amd_aql_pm4_packet_t._fields_ = [
    ('header', ctypes.c_uint16),
    ('pm4_command', ctypes.c_uint16 * 27),
    ('completion_signal', hsa_signal_t),
]
hsa_ext_amd_aql_pm4_packet_t = struct_c__SA_hsa_ext_amd_aql_pm4_packet_t
# Look up the start / stop / read entry points on the 'FIXME_STUB' library
# entry.  All three share one prototype: (profile*, pm4_packet*) ->
# hsa_status_t.  A name is left unbound if its lookup raises AttributeError.
try:
    hsa_ven_amd_aqlprofile_start = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_start
    hsa_ven_amd_aqlprofile_start.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_start.argtypes = [ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t)]
except AttributeError:
    pass
try:
    hsa_ven_amd_aqlprofile_stop = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_stop
    hsa_ven_amd_aqlprofile_stop.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_stop.argtypes = [ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t)]
except AttributeError:
    pass
try:
    hsa_ven_amd_aqlprofile_read = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_read
    hsa_ven_amd_aqlprofile_read.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_read.argtypes = [ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t)]
except AttributeError:
    pass
HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE = 192 # Variable ctypes.c_uint32
# Look up hsa_ven_amd_aqlprofile_legacy_get_pm4 on the 'FIXME_STUB' library
# entry and declare its prototype: (pm4_packet*, void*) -> hsa_status_t.
# The name is left unbound if the attribute lookup raises AttributeError.
try:
    hsa_ven_amd_aqlprofile_legacy_get_pm4 = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_legacy_get_pm4
    hsa_ven_amd_aqlprofile_legacy_get_pm4.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_legacy_get_pm4.argtypes = [ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t), ctypes.POINTER(None)]
except AttributeError:
    pass
# ctypes layout for hsa_ven_amd_aqlprofile_info_data_t.  Three classes are
# declared empty first so they can refer to one another, then filled in from
# the innermost outwards: the pmc_data struct, the union that contains it,
# and finally the outer struct that embeds the union anonymously.
class struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t(Structure):
    pass

class union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0(Union):
    pass

class struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data(Structure):
    pass

struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data._pack_ = 1 # source:False
struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data._fields_ = [
    ('event', hsa_ven_amd_aqlprofile_event_t),
    ('PADDING_0', ctypes.c_ubyte * 4),
    ('result', ctypes.c_uint64),
]

union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0._pack_ = 1 # source:False
union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0._fields_ = [
    ('pmc_data', struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data),
    ('trace_data', hsa_ven_amd_aqlprofile_descriptor_t),
    ('PADDING_0', ctypes.c_ubyte * 8),
]

struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t._pack_ = 1 # source:False
# '_0' is listed as anonymous, so the union's members (pmc_data, trace_data)
# are reachable directly on the outer structure.
struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t._anonymous_ = ('_0',)
struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t._fields_ = [
    ('sample_id', ctypes.c_uint32),
    ('PADDING_0', ctypes.c_ubyte * 4),
    ('_0', union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0),
]
hsa_ven_amd_aqlprofile_info_data_t = struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t
# ctypes layout for hsa_ven_amd_aqlprofile_id_query_t: a char pointer 'name'
# plus two 32-bit fields, 'id' and 'instance_count'.
class struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t(Structure):
    pass

struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t._pack_ = 1 # source:False
struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t._fields_ = [
    ('name', ctypes.POINTER(ctypes.c_char)),
    ('id', ctypes.c_uint32),
    ('instance_count', ctypes.c_uint32),
]
hsa_ven_amd_aqlprofile_id_query_t = struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t
# values for enumeration 'c__EA_hsa_ven_amd_aqlprofile_info_type_t'
# Maps each enumerator's integer value to its name.
c__EA_hsa_ven_amd_aqlprofile_info_type_t__enumvalues = {
0: 'HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE',
1: 'HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE',
2: 'HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA',
3: 'HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA',
4: 'HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS',
5: 'HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID',
6: 'HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD',
7: 'HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD',
}
# The enumerators themselves, as module-level integer constants.
HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE = 0
HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE = 1
HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA = 2
HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA = 3
HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS = 4
HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID = 5
HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD = 6
HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD = 7
# The enum is represented as a 32-bit unsigned integer; the unprefixed names
# below are aliases of the generated c__EA_* names.
c__EA_hsa_ven_amd_aqlprofile_info_type_t = ctypes.c_uint32 # enum
hsa_ven_amd_aqlprofile_info_type_t = c__EA_hsa_ven_amd_aqlprofile_info_type_t
hsa_ven_amd_aqlprofile_info_type_t__enumvalues = c__EA_hsa_ven_amd_aqlprofile_info_type_t__enumvalues
# Callback prototype taken by iterate_data: (info_type, info_data*, void*) ->
# hsa_status_t.
hsa_ven_amd_aqlprofile_data_callback_t = ctypes.CFUNCTYPE(c__EA_hsa_status_t, c__EA_hsa_ven_amd_aqlprofile_info_type_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t), ctypes.POINTER(None))
# Look up get_info / iterate_data / error_string on the 'FIXME_STUB' library
# entry and declare their prototypes.  A name is left unbound if its lookup
# raises AttributeError.
try:
    hsa_ven_amd_aqlprofile_get_info = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_get_info
    hsa_ven_amd_aqlprofile_get_info.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_get_info.argtypes = [ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), hsa_ven_amd_aqlprofile_info_type_t, ctypes.POINTER(None)]
except AttributeError:
    pass
try:
    hsa_ven_amd_aqlprofile_iterate_data = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_iterate_data
    hsa_ven_amd_aqlprofile_iterate_data.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_iterate_data.argtypes = [ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), hsa_ven_amd_aqlprofile_data_callback_t, ctypes.POINTER(None)]
except AttributeError:
    pass
try:
    hsa_ven_amd_aqlprofile_error_string = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_error_string
    hsa_ven_amd_aqlprofile_error_string.restype = hsa_status_t
    hsa_ven_amd_aqlprofile_error_string.argtypes = [ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
except AttributeError:
    pass
kAqlProfileLib = 'libhsa-amd-aqlprofile64.so' # Variable ctypes.c_char * 27
# Function-pointer table for the aqlprofile 1.00 interface: one CFUNCTYPE
# member per entry point, with prototypes matching the individual bindings
# declared earlier in this module.
class struct_hsa_ven_amd_aqlprofile_1_00_pfn_s(Structure):
    pass

struct_hsa_ven_amd_aqlprofile_1_00_pfn_s._pack_ = 1 # source:False
struct_hsa_ven_amd_aqlprofile_1_00_pfn_s._fields_ = [
    ('hsa_ven_amd_aqlprofile_version_major', ctypes.CFUNCTYPE(ctypes.c_uint32)),
    ('hsa_ven_amd_aqlprofile_version_minor', ctypes.CFUNCTYPE(ctypes.c_uint32)),
    ('hsa_ven_amd_aqlprofile_error_string', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(ctypes.POINTER(ctypes.c_char)))),
    ('hsa_ven_amd_aqlprofile_validate_event', ctypes.CFUNCTYPE(c__EA_hsa_status_t, struct_hsa_agent_s, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_event_t), ctypes.POINTER(ctypes.c_bool))),
    ('hsa_ven_amd_aqlprofile_start', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t))),
    ('hsa_ven_amd_aqlprofile_stop', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t))),
    ('hsa_ven_amd_aqlprofile_read', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t))),
    ('hsa_ven_amd_aqlprofile_legacy_get_pm4', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t), ctypes.POINTER(None))),
    ('hsa_ven_amd_aqlprofile_get_info', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), c__EA_hsa_ven_amd_aqlprofile_info_type_t, ctypes.POINTER(None))),
    ('hsa_ven_amd_aqlprofile_iterate_data', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.CFUNCTYPE(c__EA_hsa_status_t, c__EA_hsa_ven_amd_aqlprofile_info_type_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t), ctypes.POINTER(None)), ctypes.POINTER(None))),
]
# Both public aliases name the same 1.00 table.
hsa_ven_amd_aqlprofile_1_00_pfn_t = struct_hsa_ven_amd_aqlprofile_1_00_pfn_s
hsa_ven_amd_aqlprofile_pfn_t = struct_hsa_ven_amd_aqlprofile_1_00_pfn_s
__all__ = \
['AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT',
@@ -4221,13 +4552,68 @@ __all__ = \
'HSA_SYSTEM_INFO_VERSION_MAJOR', 'HSA_SYSTEM_INFO_VERSION_MINOR',
'HSA_VARIABLE_ALLOCATION_AGENT',
'HSA_VARIABLE_ALLOCATION_PROGRAM', 'HSA_VARIABLE_SEGMENT_GLOBAL',
'HSA_VARIABLE_SEGMENT_READONLY', 'HSA_WAIT_STATE_ACTIVE',
'HSA_WAIT_STATE_BLOCKED', 'HSA_WAVEFRONT_INFO_SIZE',
'MEMORY_TYPE_NONE', 'MEMORY_TYPE_PINNED',
'amd_queue_properties32_t', 'amd_queue_properties_t',
'amd_queue_t', 'amd_signal_kind64_t', 'amd_signal_kind_t',
'amd_signal_t', 'c__EA_hsa_access_permission_t',
'c__EA_hsa_agent_feature_t', 'c__EA_hsa_agent_info_t',
'HSA_VARIABLE_SEGMENT_READONLY',
'HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD',
'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC',
'HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC',
'HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE',
'HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS',
'HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID',
'HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE',
'HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD',
'HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD',
'HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA',
'HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE',
'HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA',
'HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2',
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK',
'HSA_WAIT_STATE_ACTIVE', 'HSA_WAIT_STATE_BLOCKED',
'HSA_WAVEFRONT_INFO_SIZE', 'MEMORY_TYPE_NONE',
'MEMORY_TYPE_PINNED', 'amd_queue_properties32_t',
'amd_queue_properties_t', 'amd_queue_t', 'amd_signal_kind64_t',
'amd_signal_kind_t', 'amd_signal_t',
'c__EA_hsa_access_permission_t', 'c__EA_hsa_agent_feature_t',
'c__EA_hsa_agent_info_t',
'c__EA_hsa_amd_agent_memory_pool_info_t',
'c__EA_hsa_amd_copy_direction_t',
'c__EA_hsa_amd_hw_exception_reset_cause_t',
@@ -4267,6 +4653,10 @@ __all__ = \
'c__EA_hsa_status_t', 'c__EA_hsa_symbol_kind_t',
'c__EA_hsa_symbol_linkage_t', 'c__EA_hsa_system_info_t',
'c__EA_hsa_variable_allocation_t', 'c__EA_hsa_variable_segment_t',
'c__EA_hsa_ven_amd_aqlprofile_block_name_t',
'c__EA_hsa_ven_amd_aqlprofile_event_type_t',
'c__EA_hsa_ven_amd_aqlprofile_info_type_t',
'c__EA_hsa_ven_amd_aqlprofile_parameter_name_t',
'c__EA_hsa_wait_state_t', 'c__EA_hsa_wavefront_info_t',
'c__Ea_HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS',
'c__Ea_HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED',
@@ -4418,7 +4808,8 @@ __all__ = \
'hsa_executable_symbol_info_t__enumvalues',
'hsa_executable_symbol_t', 'hsa_executable_t',
'hsa_executable_validate', 'hsa_executable_validate_alt',
'hsa_ext_control_directives_t', 'hsa_ext_finalizer_1_00_pfn_t',
'hsa_ext_amd_aql_pm4_packet_t', 'hsa_ext_control_directives_t',
'hsa_ext_finalizer_1_00_pfn_t',
'hsa_ext_finalizer_call_convention_t',
'hsa_ext_finalizer_call_convention_t__enumvalues',
'hsa_ext_image_capability_t',
@@ -4562,11 +4953,45 @@ __all__ = \
'hsa_system_major_extension_supported',
'hsa_variable_allocation_t',
'hsa_variable_allocation_t__enumvalues', 'hsa_variable_segment_t',
'hsa_variable_segment_t__enumvalues', 'hsa_wait_state_t',
'hsa_variable_segment_t__enumvalues',
'hsa_ven_amd_aqlprofile_1_00_pfn_t',
'hsa_ven_amd_aqlprofile_block_name_t',
'hsa_ven_amd_aqlprofile_block_name_t__enumvalues',
'hsa_ven_amd_aqlprofile_data_callback_t',
'hsa_ven_amd_aqlprofile_descriptor_t',
'hsa_ven_amd_aqlprofile_error_string',
'hsa_ven_amd_aqlprofile_event_t',
'hsa_ven_amd_aqlprofile_event_type_t',
'hsa_ven_amd_aqlprofile_event_type_t__enumvalues',
'hsa_ven_amd_aqlprofile_get_info',
'hsa_ven_amd_aqlprofile_id_query_t',
'hsa_ven_amd_aqlprofile_info_data_t',
'hsa_ven_amd_aqlprofile_info_type_t',
'hsa_ven_amd_aqlprofile_info_type_t__enumvalues',
'hsa_ven_amd_aqlprofile_iterate_data',
'hsa_ven_amd_aqlprofile_legacy_get_pm4',
'hsa_ven_amd_aqlprofile_parameter_name_t',
'hsa_ven_amd_aqlprofile_parameter_name_t__enumvalues',
'hsa_ven_amd_aqlprofile_parameter_t',
'hsa_ven_amd_aqlprofile_pfn_t',
'hsa_ven_amd_aqlprofile_profile_t', 'hsa_ven_amd_aqlprofile_read',
'hsa_ven_amd_aqlprofile_start', 'hsa_ven_amd_aqlprofile_stop',
'hsa_ven_amd_aqlprofile_validate_event',
'hsa_ven_amd_aqlprofile_version_major',
'hsa_ven_amd_aqlprofile_version_minor', 'hsa_wait_state_t',
'hsa_wait_state_t__enumvalues', 'hsa_wavefront_get_info',
'hsa_wavefront_info_t', 'hsa_wavefront_info_t__enumvalues',
'hsa_wavefront_t', 'int32_t', 'size_t', 'struct_BrigModuleHeader',
'struct_amd_queue_s', 'struct_amd_signal_s',
'hsa_wavefront_t', 'int32_t', 'kAqlProfileLib', 'size_t',
'struct_BrigModuleHeader', 'struct_amd_queue_s',
'struct_amd_signal_s',
'struct_c__SA_hsa_ext_amd_aql_pm4_packet_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_event_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data',
'struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_profile_t',
'struct_hsa_agent_dispatch_packet_s', 'struct_hsa_agent_s',
'struct_hsa_amd_barrier_value_packet_s', 'struct_hsa_amd_event_s',
'struct_hsa_amd_gpu_hw_exception_info_s',
@@ -4599,6 +5024,8 @@ __all__ = \
'struct_hsa_loaded_code_object_s', 'struct_hsa_pitched_ptr_s',
'struct_hsa_queue_s', 'struct_hsa_region_s',
'struct_hsa_signal_group_s', 'struct_hsa_signal_s',
'struct_hsa_ven_amd_aqlprofile_1_00_pfn_s',
'struct_hsa_wavefront_s', 'uint16_t', 'uint32_t', 'uint64_t',
'union_amd_signal_s_0', 'union_amd_signal_s_1',
'union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0',
'union_hsa_amd_event_s_0']

View File

@@ -1,18 +1,20 @@
from __future__ import annotations
from typing import Tuple
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct, errno
from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up
from tinygrad.renderer.cstyle import HIPRenderer
from tinygrad.runtime.driver.hip_comgr import compile_hip
import tinygrad.runtime.autogen.kfd as kfd
import tinygrad.runtime.autogen.hsa as hsa
import tinygrad.runtime.autogen.amd_sdma as amd_sdma
import tinygrad.runtime.autogen.amd_gpu as amd_gpu
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401
# Raw libc bindings used to map and unmap memory; C signatures are declared explicitly for ctypes.
libc = ctypes.CDLL("libc.so.6")
# void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset)
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
# ctypes assumes a C int return value unless told otherwise, which would truncate a 64-bit pointer
libc.mmap.restype = ctypes.c_void_p
# int munmap(void *addr, size_t length)
libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
libc.munmap.restype = ctypes.c_int
def node_sysfs_path(node_id, file):
  """Return the sysfs path of `file` inside the KFD topology directory of node `node_id`."""
  return f"/sys/devices/virtual/kfd/kfd/topology/nodes/{node_id}/{file}"
@@ -40,7 +42,7 @@ kio = ioctls_from_header()
def create_sdma_packets():
# TODO: clean this up if we decide to keep it
structs = {}
for name,pkt in [(name,s) for name,s in amd_sdma.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
for name,pkt in [(name,s) for name,s in amd_gpu.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
names = set()
fields = []
for pkt_fields in pkt._fields_:
@@ -72,6 +74,8 @@ class KFDCompiler(Compiler):
# Size in bytes of one AQL packet (taken from the kernel dispatch packet struct); used to index slots in the AQL ring.
AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
# Largest byte count put into a single SDMA linear-copy packet; bigger copies are split into chunks of this size.
SDMA_MAX_COPY_SIZE = 0x400000
# AQL header word whose packet-type field is set to "vendor specific"; used as the header of the PM4 packet.
VENDOR_HEADER = hsa.HSA_PACKET_TYPE_VENDOR_SPECIFIC << hsa.HSA_PACKET_HEADER_TYPE
# Setup word for kernel dispatch packets: the value 3 placed in the dimensions field.
DISPATCH_KERNEL_SETUP = 3 << hsa.HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
DISPATCH_KERNEL_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
@@ -96,6 +100,8 @@ class KFDProgram:
for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]
self.device._submit_cache_inv(gli=2)
entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
self.handle = self.lib_gpu.va_addr + entry_point
self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
@@ -105,7 +111,8 @@ class KFDProgram:
f"{self.private_segment_size=} > {self.device.max_private_segment_size=}"
# NOTE: no programs are ever freed
def __del__(self):
  """Release the GPU buffer that holds the loaded program.

  The buffer is handed to the owning device's `_gpu_free`. Nothing is done when
  `lib_gpu` was never assigned on this instance.
  """
  # the hasattr guard keeps finalization of a partially constructed instance from raising
  if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
if not hasattr(self, "args_struct_t"):
@@ -150,8 +157,15 @@ class KFDAllocator(LRUAllocator):
super().__init__()
def _alloc(self, size:int, options:BufferOptions):
  """Allocate `size` bytes through the device for a new buffer.

  Buffers with `options.host` set are requested with the USERPTR flag, all others
  with the VRAM flag; both are requested with `public=True`.

  Raises:
    MemoryError: the device allocation failed with errno ENOMEM.
    OSError: any other OS-level failure, re-raised unchanged.
  """
  flags = kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR if options.host else kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM
  try: return self.device._gpu_alloc(size, flags, public=True)
  except OSError as e:
    # translate out-of-memory into the builtin exception, keeping the original error as the cause
    if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
    raise
def _free(self, gpumem, options:BufferOptions):
  """Return `gpumem` to the owning device; `options` is not consulted."""
  owner = self.device
  owner._gpu_free(gpumem)
def copyin(self, dest, src: memoryview):
# TODO: need to make the address visible to gpu and pass it directly to sdma.
@@ -194,11 +208,18 @@ class KFDDevice(Compiled):
assert buf != 0xffffffffffffffff
assert addr == buf == mem.va_addr
if map_to_gpu:
arr = (ctypes.c_int32 * 1)(self.gpu_id)
stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
mem.__setattr__("mapped_gpu_ids", (ctypes.c_int32 * 1)(self.gpu_id))
stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(gpus:=mem.mapped_gpu_ids), n_devices=len(gpus))
assert stm.n_success == 1
return mem
def _gpu_free(self, mem):
  """Release a KFD allocation in three steps.

  1. If `mem` carries a `mapped_gpu_ids` array, unmap it from every GPU listed there.
  2. munmap the address range [mem.va_addr, mem.va_addr + mem.size).
  3. Free the KFD memory handle.
  """
  mapped = getattr(mem, "mapped_gpu_ids", None)
  if mapped is not None:
    n_gpus = len(mapped)
    res = kio.unmap_memory_from_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(mapped), n_devices=n_gpus)
    # every GPU in the list must have been unmapped
    assert res.n_success == n_gpus
  libc.munmap(mem.va_addr, mem.size)
  kio.free_memory_of_gpu(self.kfd, handle=mem.handle)
def __init__(self, device:str=""):
if KFDDevice.kfd == -1: KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
self.device_id = int(device.split(":")[1]) if ":" in device else 0
@@ -215,6 +236,7 @@ class KFDDevice(Compiled):
self.signals_page = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.kernargs = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.completion_signal = hsa.amd_signal_t.from_address(self.signals_page.va_addr)
@@ -274,12 +296,18 @@ class KFDDevice(Compiled):
# prebuilt packets
self.sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0)
self.sdma_cache_inv = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
self.sdma_cache_inv = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1,
GCR_CONTROL_GL2_RANGE=0)
self.sdma_cache_wb = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
self.sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
GCR_CONTROL_GL2_RANGE=0)
pm4_indirect_cmd = (ctypes.c_uint32*13)(amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), self.pm4_indirect_buf.va_addr & 0xffffffff,
(self.pm4_indirect_buf.va_addr>>32) & 0xffffffff, 8 | amd_gpu.INDIRECT_BUFFER_VALID, 0xa)
ctypes.memmove(ctypes.addressof(pm4_cmds:=(ctypes.c_uint16*27)(1))+2, ctypes.addressof(pm4_indirect_cmd), ctypes.sizeof(pm4_indirect_cmd))
self.pm4_packet = hsa.hsa_ext_amd_aql_pm4_packet_t(header=VENDOR_HEADER, pm4_command=pm4_cmds,
completion_signal=hsa.hsa_signal_t(ctypes.addressof(self.completion_signal)))
# Helpers
map_uptr2gpu_struct_t = init_c_struct_t(tuple(kfd.struct_kfd_ioctl_svm_args._fields_[:-1]+[('attrs', kfd.struct_kfd_ioctl_svm_attribute*2)])) # type: ignore
self.map_uptr2gpu_struct = map_uptr2gpu_struct_t(nattr=2, op=0x0)
@@ -302,11 +330,11 @@ class KFDDevice(Compiled):
# NOTE: we check only low 32 bits to be zeroed, we don't use higher values for signals
for sig in wait_signals:
poll_addr = ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'value').offset
blit_sdma_command(sdma_pkts.poll_regmem(op=amd_sdma.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3, addr=poll_addr,
blit_sdma_command(sdma_pkts.poll_regmem(op=amd_gpu.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3, addr=poll_addr,
value=0, mask=0xffffffff, interval=0x04, retry_count=0xfff))
if completion_signal is not None:
blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
blit_sdma_command(sdma_pkts.timestamp(op=amd_gpu.SDMA_OP_TIMESTAMP, sub_op=amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'start_ts').offset))
blit_sdma_command(self.sdma_flush_hdp_pkt)
blit_sdma_command(self.sdma_cache_inv)
@@ -315,22 +343,43 @@ class KFDDevice(Compiled):
copies_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
for _ in range(copies_commands):
step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
blit_sdma_command(sdma_pkts.copy_linear(op=amd_sdma.SDMA_OP_COPY, sub_op=amd_sdma.SDMA_SUBOP_COPY_LINEAR,
blit_sdma_command(sdma_pkts.copy_linear(op=amd_gpu.SDMA_OP_COPY, sub_op=amd_gpu.SDMA_SUBOP_COPY_LINEAR,
count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied))
copied += step_copy_size
blit_sdma_command(self.sdma_cache_wb)
if completion_signal is not None:
blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
blit_sdma_command(sdma_pkts.timestamp(op=amd_gpu.SDMA_OP_TIMESTAMP, sub_op=amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'end_ts').offset))
if completion_signal is not None:
signal_addr = ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset
blit_sdma_command(sdma_pkts.atomic(op=amd_sdma.SDMA_OP_ATOMIC, operation=amd_sdma.SDMA_ATOMIC_ADD64, addr=signal_addr, src_data=(1<<64)-1))
blit_sdma_command(sdma_pkts.atomic(op=amd_gpu.SDMA_OP_ATOMIC, operation=amd_gpu.SDMA_ATOMIC_ADD64, addr=signal_addr, src_data=(1<<64)-1))
if completion_signal.event_mailbox_ptr != 0:
blit_sdma_command(sdma_pkts.fence(op=amd_sdma.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr,
blit_sdma_command(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr,
data=completion_signal.event_id))
blit_sdma_command(sdma_pkts.trap(op=amd_sdma.SDMA_OP_TRAP, int_ctx=completion_signal.event_id))
blit_sdma_command(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=completion_signal.event_id))
self.sdma_write_pointer[0] = self.sdma_doorbell_value
self.sdma_doorbell[0] = self.sdma_doorbell_value
def _submit_cache_inv(self, addr=0x0, sz=(1 << 64)-1, gli=0, glv=0, glk=0, gl1=0, gl2=0):
  """Write an ACQUIRE_MEM PM4 packet into the indirect buffer, dispatch it via the AQL queue and wait for completion.

  Args:
    addr: range start encoded in the packet; only the low 56 bits are encoded (32 + 24).
    sz: range size encoded in the packet; only the low 40 bits are encoded (32 + 8).
    gli, glv, glk, gl1, gl2: values forwarded to the matching PACKET3_ACQUIRE_MEM_GCR_CNTL_*_INV field helpers.

  The prebuilt `self.pm4_packet` is copied into the next AQL ring slot, the doorbell is rung, and the call
  blocks on the completion signal's event. It asserts afterwards that the queue's read index caught up with
  the write index.
  """
  ib_words = to_mv(self.pm4_indirect_buf.va_addr, 0x1000).cast("I")
  header = amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6)
  size_lo, size_hi = sz & 0xffffffff, (sz >> 32) & 0xff
  addr_lo, addr_hi = addr & 0xffffffff, (addr >> 32) & 0xffffff
  gcr_cntl = amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
             amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
             amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)
  acquire_mem = (header, 0, size_lo, size_hi, addr_lo, addr_hi, 0, gcr_cntl)
  for idx, word in enumerate(acquire_mem): ib_words[idx] = word

  # place the prebuilt PM4 packet in the ring slot selected by the current doorbell value (wraps at ring size)
  dispatch_id = self.aql_doorbell_value
  slot_addr = self.aql_ring.va_addr + (dispatch_id * AQL_PACKET_SIZE) % self.aql_ring.size
  ctypes.memmove(slot_addr, ctypes.addressof(self.pm4_packet), AQL_PACKET_SIZE)

  # publish the new write index first, then ring the doorbell
  self.amd_aql_queue.write_dispatch_id = dispatch_id + 1
  self.aql_doorbell[0] = dispatch_id
  self.aql_doorbell_value += 1

  # block until the completion signal's event fires (timeout=1000; unit not visible here, presumably ms, confirm)
  events = (kfd.struct_kfd_event_data * 1)()
  events[0].event_id = self.completion_signal.event_id
  kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(events), num_events=1, wait_for_all=1, timeout=1000)
  assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"