From d6ba44bc1e905ddbd06cb2d168de93669fac4c15 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Tue, 2 Apr 2024 01:50:58 +0300 Subject: [PATCH] kfd free buffers (#4027) * kfd free buffers * unmap * all test passes * better pm4 * forgot these * invalidate only range * better cache * forgot * comments * fixes --- autogen_stubs.sh | 9 +- extra/hip_gpu_driver/nvd.h | 470 ++++++++++++++++++ .../autogen/{amd_sdma.py => amd_gpu.py} | 470 ++++++++++++++++++ tinygrad/runtime/autogen/hsa.py | 449 ++++++++++++++++- tinygrad/runtime/ops_kfd.py | 83 +++- 5 files changed, 1451 insertions(+), 30 deletions(-) create mode 100644 extra/hip_gpu_driver/nvd.h rename tinygrad/runtime/autogen/{amd_sdma.py => amd_gpu.py} (77%) diff --git a/autogen_stubs.sh b/autogen_stubs.sh index 8f7341f3d8..9faf0efabd 100755 --- a/autogen_stubs.sh +++ b/autogen_stubs.sh @@ -79,16 +79,21 @@ generate_hsa() { /opt/rocm/include/hsa/amd_hsa_signal.h \ /opt/rocm/include/hsa/amd_hsa_queue.h \ /opt/rocm/include/hsa/hsa_ext_finalize.h /opt/rocm/include/hsa/hsa_ext_image.h \ + /opt/rocm/include/hsa/hsa_ven_amd_aqlprofile.h \ --clang-args="-I/opt/rocm/include" \ -o $BASE/hsa.py -l /opt/rocm/lib/libhsa-runtime64.so # clang2py broken when pass -x c++ to prev headers clang2py extra/hip_gpu_driver/sdma_registers.h \ --clang-args="-I/opt/rocm/include -x c++" \ - -o $BASE/amd_sdma.py -l /opt/rocm/lib/libhsa-runtime64.so + -o $BASE/amd_gpu.py -l /opt/rocm/lib/libhsa-runtime64.so + + sed 's/^\(.*\)\(\s*\/\*\)\(.*\)$/\1 #\2\3/; s/^\(\s*\*\)\(.*\)$/#\1\2/' extra/hip_gpu_driver/nvd.h >> $BASE/amd_gpu.py # comments + sed -i 's/#\s*define\s*\([^ \t]*\)(\([^)]*\))\s*\(.*\)/def \1(\2): return \3/' $BASE/amd_gpu.py # #define name(x) (smth) -> def name(x): return (smth) + sed -i '/#\s*define\s\+\([^ \t]\+\)\s\+\([^ ]\+\)/s//\1 = \2/' $BASE/amd_gpu.py # #define name val -> name = val fixup $BASE/hsa.py - fixup $BASE/amd_sdma.py + fixup $BASE/amd_gpu.py sed -i "s\import ctypes\import ctypes, os\g" $BASE/hsa.py sed -i "s\'/opt/rocm/\os.getenv('ROCM_PATH', '/opt/rocm/')+'/\g" $BASE/hsa.py python3 -c "import tinygrad.runtime.autogen.hsa" diff --git a/extra/hip_gpu_driver/nvd.h b/extra/hip_gpu_driver/nvd.h new file mode 100644 index 0000000000..631dafb922 --- /dev/null +++ b/extra/hip_gpu_driver/nvd.h @@ -0,0 +1,470 @@ +/* + * Copyright 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef NVD_H +#define NVD_H + +/** + * Navi's PM4 definitions + */ +#define PACKET_TYPE0 0 +#define PACKET_TYPE1 1 +#define PACKET_TYPE2 2 +#define PACKET_TYPE3 3 + +#define CP_PACKET_GET_TYPE(h) (((h) >> 30) & 3) +#define CP_PACKET_GET_COUNT(h) (((h) >> 16) & 0x3FFF) +#define CP_PACKET0_GET_REG(h) ((h) & 0xFFFF) +#define CP_PACKET3_GET_OPCODE(h) (((h) >> 8) & 0xFF) +#define PACKET0(reg, n) ((PACKET_TYPE0 << 30) | \ + ((reg) & 0xFFFF) | \ + ((n) & 0x3FFF) << 16) +#define CP_PACKET2 0x80000000 +#define PACKET2_PAD_SHIFT 0 +#define PACKET2_PAD_MASK (0x3fffffff << 0) + +#define PACKET2(v) (CP_PACKET2 | REG_SET(PACKET2_PAD, (v))) + +#define PACKET3(op, n) ((PACKET_TYPE3 << 30) | \ + (((op) & 0xFF) << 8) | \ + ((n) & 0x3FFF) << 16) + +#define PACKET3_COMPUTE(op, n) (PACKET3(op, n) | 1 << 1) + +/* Packet 3 types */ +#define PACKET3_NOP 0x10 +#define PACKET3_SET_BASE 0x11 +#define PACKET3_BASE_INDEX(x) ((x) << 0) +#define CE_PARTITION_BASE 3 +#define PACKET3_CLEAR_STATE 0x12 +#define PACKET3_INDEX_BUFFER_SIZE 0x13 +#define PACKET3_DISPATCH_DIRECT 0x15 +#define PACKET3_DISPATCH_INDIRECT 0x16 +#define PACKET3_INDIRECT_BUFFER_END 0x17 +#define PACKET3_INDIRECT_BUFFER_CNST_END 0x19 +#define PACKET3_ATOMIC_GDS 0x1D +#define PACKET3_ATOMIC_MEM 0x1E +#define PACKET3_OCCLUSION_QUERY 0x1F +#define PACKET3_SET_PREDICATION 0x20 +#define PACKET3_REG_RMW 0x21 +#define PACKET3_COND_EXEC 0x22 +#define PACKET3_PRED_EXEC 0x23 +#define PACKET3_DRAW_INDIRECT 0x24 +#define PACKET3_DRAW_INDEX_INDIRECT 0x25 +#define PACKET3_INDEX_BASE 0x26 +#define PACKET3_DRAW_INDEX_2 0x27 +#define PACKET3_CONTEXT_CONTROL 0x28 +#define PACKET3_INDEX_TYPE 0x2A +#define PACKET3_DRAW_INDIRECT_MULTI 0x2C +#define PACKET3_DRAW_INDEX_AUTO 0x2D +#define PACKET3_NUM_INSTANCES 0x2F +#define PACKET3_DRAW_INDEX_MULTI_AUTO 0x30 +#define PACKET3_INDIRECT_BUFFER_PRIV 0x32 +#define PACKET3_INDIRECT_BUFFER_CNST 0x33 +#define PACKET3_COND_INDIRECT_BUFFER_CNST 0x33 +#define PACKET3_STRMOUT_BUFFER_UPDATE 0x34 +#define PACKET3_DRAW_INDEX_OFFSET_2 0x35 +#define PACKET3_DRAW_PREAMBLE 0x36 +#define PACKET3_WRITE_DATA 0x37 +#define WRITE_DATA_DST_SEL(x) ((x) << 8) + /* 0 - register + * 1 - memory (sync - via GRBM) + * 2 - gl2 + * 3 - gds + * 4 - reserved + * 5 - memory (async - direct) + */ +#define WR_ONE_ADDR (1 << 16) +#define WR_CONFIRM (1 << 20) +#define WRITE_DATA_CACHE_POLICY(x) ((x) << 25) + /* 0 - LRU + * 1 - Stream + */ +#define WRITE_DATA_ENGINE_SEL(x) ((x) << 30) + /* 0 - me + * 1 - pfp + * 2 - ce + */ +#define PACKET3_DRAW_INDEX_INDIRECT_MULTI 0x38 +#define PACKET3_MEM_SEMAPHORE 0x39 +# define PACKET3_SEM_USE_MAILBOX (0x1 << 16) +# define PACKET3_SEM_SEL_SIGNAL_TYPE (0x1 << 20) /* 0 = increment, 1 = write 1 */ +# define PACKET3_SEM_SEL_SIGNAL (0x6 << 29) +# define PACKET3_SEM_SEL_WAIT (0x7 << 29) +#define PACKET3_DRAW_INDEX_MULTI_INST 0x3A +#define PACKET3_COPY_DW 0x3B +#define PACKET3_WAIT_REG_MEM 0x3C +#define WAIT_REG_MEM_FUNCTION(x) ((x) << 0) + /* 0 - always + * 1 - < + * 2 - <= + * 3 - == + * 4 - != + * 5 - >= + * 6 - > + */ +#define WAIT_REG_MEM_MEM_SPACE(x) ((x) << 4) + /* 0 - reg + * 1 - mem + */ +#define WAIT_REG_MEM_OPERATION(x) ((x) << 6) + /* 0 - wait_reg_mem + * 1 - wr_wait_wr_reg + */ +#define WAIT_REG_MEM_ENGINE(x) ((x) << 8) + /* 0 - me + * 1 - pfp + */ +#define PACKET3_INDIRECT_BUFFER 0x3F +#define INDIRECT_BUFFER_VALID (1 << 23) +#define INDIRECT_BUFFER_CACHE_POLICY(x) ((x) << 28) + /* 0 - LRU + * 1 - Stream + * 2 - Bypass + */ +#define INDIRECT_BUFFER_PRE_ENB(x) ((x) << 21) +#define INDIRECT_BUFFER_PRE_RESUME(x) ((x) << 30) +#define PACKET3_COND_INDIRECT_BUFFER 0x3F +#define PACKET3_COPY_DATA 0x40 +#define PACKET3_CP_DMA 0x41 +#define PACKET3_PFP_SYNC_ME 0x42 +#define PACKET3_SURFACE_SYNC 0x43 +#define PACKET3_ME_INITIALIZE 0x44 +#define PACKET3_COND_WRITE 0x45 +#define PACKET3_EVENT_WRITE 0x46 +#define EVENT_TYPE(x) ((x) << 0) +#define EVENT_INDEX(x) ((x) << 8) + /* 0 - any non-TS event + * 1 - ZPASS_DONE, PIXEL_PIPE_STAT_* + * 2 - SAMPLE_PIPELINESTAT + * 3 - SAMPLE_STREAMOUTSTAT* + * 4 - *S_PARTIAL_FLUSH + */ +#define PACKET3_EVENT_WRITE_EOP 0x47 +#define PACKET3_EVENT_WRITE_EOS 0x48 +#define PACKET3_RELEASE_MEM 0x49 +#define PACKET3_RELEASE_MEM_EVENT_TYPE(x) ((x) << 0) +#define PACKET3_RELEASE_MEM_EVENT_INDEX(x) ((x) << 8) +#define PACKET3_RELEASE_MEM_GCR_GLM_WB (1 << 12) +#define PACKET3_RELEASE_MEM_GCR_GLM_INV (1 << 13) +#define PACKET3_RELEASE_MEM_GCR_GLV_INV (1 << 14) +#define PACKET3_RELEASE_MEM_GCR_GL1_INV (1 << 15) +#define PACKET3_RELEASE_MEM_GCR_GL2_US (1 << 16) +#define PACKET3_RELEASE_MEM_GCR_GL2_RANGE (1 << 17) +#define PACKET3_RELEASE_MEM_GCR_GL2_DISCARD (1 << 19) +#define PACKET3_RELEASE_MEM_GCR_GL2_INV (1 << 20) +#define PACKET3_RELEASE_MEM_GCR_GL2_WB (1 << 21) +#define PACKET3_RELEASE_MEM_GCR_SEQ (1 << 22) +#define PACKET3_RELEASE_MEM_CACHE_POLICY(x) ((x) << 25) + /* 0 - cache_policy__me_release_mem__lru + * 1 - cache_policy__me_release_mem__stream + * 2 - cache_policy__me_release_mem__noa + * 3 - cache_policy__me_release_mem__bypass + */ +#define PACKET3_RELEASE_MEM_EXECUTE (1 << 28) + +#define PACKET3_RELEASE_MEM_DATA_SEL(x) ((x) << 29) + /* 0 - discard + * 1 - send low 32bit data + * 2 - send 64bit data + * 3 - send 64bit GPU counter value + * 4 - send 64bit sys counter value + */ +#define PACKET3_RELEASE_MEM_INT_SEL(x) ((x) << 24) + /* 0 - none + * 1 - interrupt only (DATA_SEL = 0) + * 2 - interrupt when data write is confirmed + */ +#define PACKET3_RELEASE_MEM_DST_SEL(x) ((x) << 16) + /* 0 - MC + * 1 - TC/L2 + */ + + + +#define PACKET3_PREAMBLE_CNTL 0x4A +# define PACKET3_PREAMBLE_BEGIN_CLEAR_STATE (2 << 28) +# define PACKET3_PREAMBLE_END_CLEAR_STATE (3 << 28) +#define PACKET3_DMA_DATA 0x50 +/* 1. header + * 2. CONTROL + * 3. SRC_ADDR_LO or DATA [31:0] + * 4. SRC_ADDR_HI [31:0] + * 5. DST_ADDR_LO [31:0] + * 6. DST_ADDR_HI [7:0] + * 7. COMMAND [31:26] | BYTE_COUNT [25:0] + */ +/* CONTROL */ +# define PACKET3_DMA_DATA_ENGINE(x) ((x) << 0) + /* 0 - ME + * 1 - PFP + */ +# define PACKET3_DMA_DATA_SRC_CACHE_POLICY(x) ((x) << 13) + /* 0 - LRU + * 1 - Stream + */ +# define PACKET3_DMA_DATA_DST_SEL(x) ((x) << 20) + /* 0 - DST_ADDR using DAS + * 1 - GDS + * 3 - DST_ADDR using L2 + */ +# define PACKET3_DMA_DATA_DST_CACHE_POLICY(x) ((x) << 25) + /* 0 - LRU + * 1 - Stream + */ +# define PACKET3_DMA_DATA_SRC_SEL(x) ((x) << 29) + /* 0 - SRC_ADDR using SAS + * 1 - GDS + * 2 - DATA + * 3 - SRC_ADDR using L2 + */ +# define PACKET3_DMA_DATA_CP_SYNC (1 << 31) +/* COMMAND */ +# define PACKET3_DMA_DATA_CMD_SAS (1 << 26) + /* 0 - memory + * 1 - register + */ +# define PACKET3_DMA_DATA_CMD_DAS (1 << 27) + /* 0 - memory + * 1 - register + */ +# define PACKET3_DMA_DATA_CMD_SAIC (1 << 28) +# define PACKET3_DMA_DATA_CMD_DAIC (1 << 29) +# define PACKET3_DMA_DATA_CMD_RAW_WAIT (1 << 30) +#define PACKET3_CONTEXT_REG_RMW 0x51 +#define PACKET3_GFX_CNTX_UPDATE 0x52 +#define PACKET3_BLK_CNTX_UPDATE 0x53 +#define PACKET3_INCR_UPDT_STATE 0x55 +#define PACKET3_ACQUIRE_MEM 0x58 +/* 1. HEADER + * 2. COHER_CNTL [30:0] + * 2.1 ENGINE_SEL [31:31] + * 2. COHER_SIZE [31:0] + * 3. COHER_SIZE_HI [7:0] + * 4. COHER_BASE_LO [31:0] + * 5. COHER_BASE_HI [23:0] + * 7. POLL_INTERVAL [15:0] + * 8. GCR_CNTL [18:0] + */ +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(x) ((x) << 0) + /* + * 0:NOP + * 1:ALL + * 2:RANGE + * 3:FIRST_LAST + */ +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_RANGE(x) ((x) << 2) + /* + * 0:ALL + * 1:reserved + * 2:RANGE + * 3:FIRST_LAST + */ +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(x) ((x) << 4) +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(x) ((x) << 5) +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(x) ((x) << 6) +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(x) ((x) << 7) +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(x) ((x) << 8) +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(x) ((x) << 9) +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_US(x) ((x) << 10) +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_RANGE(x) ((x) << 11) + /* + * 0:ALL + * 1:VOL + * 2:RANGE + * 3:FIRST_LAST + */ +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_DISCARD(x) ((x) << 13) +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(x) ((x) << 14) +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(x) ((x) << 15) +#define PACKET3_ACQUIRE_MEM_GCR_CNTL_SEQ(x) ((x) << 16) + /* + * 0: PARALLEL + * 1: FORWARD + * 2: REVERSE + */ +#define PACKET3_ACQUIRE_MEM_GCR_RANGE_IS_PA (1 << 18) +#define PACKET3_REWIND 0x59 +#define PACKET3_INTERRUPT 0x5A +#define PACKET3_GEN_PDEPTE 0x5B +#define PACKET3_INDIRECT_BUFFER_PASID 0x5C +#define PACKET3_PRIME_UTCL2 0x5D +#define PACKET3_LOAD_UCONFIG_REG 0x5E +#define PACKET3_LOAD_SH_REG 0x5F +#define PACKET3_LOAD_CONFIG_REG 0x60 +#define PACKET3_LOAD_CONTEXT_REG 0x61 +#define PACKET3_LOAD_COMPUTE_STATE 0x62 +#define PACKET3_LOAD_SH_REG_INDEX 0x63 +#define PACKET3_SET_CONFIG_REG 0x68 +#define PACKET3_SET_CONFIG_REG_START 0x00002000 +#define PACKET3_SET_CONFIG_REG_END 0x00002c00 +#define PACKET3_SET_CONTEXT_REG 0x69 +#define PACKET3_SET_CONTEXT_REG_START 0x0000a000 +#define PACKET3_SET_CONTEXT_REG_END 0x0000a400 +#define PACKET3_SET_CONTEXT_REG_INDEX 0x6A +#define PACKET3_SET_VGPR_REG_DI_MULTI 0x71 +#define PACKET3_SET_SH_REG_DI 0x72 +#define PACKET3_SET_CONTEXT_REG_INDIRECT 0x73 +#define PACKET3_SET_SH_REG_DI_MULTI 0x74 +#define PACKET3_GFX_PIPE_LOCK 0x75 +#define PACKET3_SET_SH_REG 0x76 +#define PACKET3_SET_SH_REG_START 0x00002c00 +#define PACKET3_SET_SH_REG_END 0x00003000 +#define PACKET3_SET_SH_REG_OFFSET 0x77 +#define PACKET3_SET_QUEUE_REG 0x78 +#define PACKET3_SET_UCONFIG_REG 0x79 +#define PACKET3_SET_UCONFIG_REG_START 0x0000c000 +#define PACKET3_SET_UCONFIG_REG_END 0x0000c400 +#define PACKET3_SET_UCONFIG_REG_INDEX 0x7A +#define PACKET3_FORWARD_HEADER 0x7C +#define PACKET3_SCRATCH_RAM_WRITE 0x7D +#define PACKET3_SCRATCH_RAM_READ 0x7E +#define PACKET3_LOAD_CONST_RAM 0x80 +#define PACKET3_WRITE_CONST_RAM 0x81 +#define PACKET3_DUMP_CONST_RAM 0x83 +#define PACKET3_INCREMENT_CE_COUNTER 0x84 +#define PACKET3_INCREMENT_DE_COUNTER 0x85 +#define PACKET3_WAIT_ON_CE_COUNTER 0x86 +#define PACKET3_WAIT_ON_DE_COUNTER_DIFF 0x88 +#define PACKET3_SWITCH_BUFFER 0x8B +#define PACKET3_DISPATCH_DRAW_PREAMBLE 0x8C +#define PACKET3_DISPATCH_DRAW_PREAMBLE_ACE 0x8C +#define PACKET3_DISPATCH_DRAW 0x8D +#define PACKET3_DISPATCH_DRAW_ACE 0x8D +#define PACKET3_GET_LOD_STATS 0x8E +#define PACKET3_DRAW_MULTI_PREAMBLE 0x8F +#define PACKET3_FRAME_CONTROL 0x90 +# define FRAME_TMZ (1 << 0) +# define FRAME_CMD(x) ((x) << 28) + /* + * x=0: tmz_begin + * x=1: tmz_end + */ +#define PACKET3_INDEX_ATTRIBUTES_INDIRECT 0x91 +#define PACKET3_WAIT_REG_MEM64 0x93 +#define PACKET3_COND_PREEMPT 0x94 +#define PACKET3_HDP_FLUSH 0x95 +#define PACKET3_COPY_DATA_RB 0x96 +#define PACKET3_INVALIDATE_TLBS 0x98 +# define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0) +# define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4) +# define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5) +#define PACKET3_AQL_PACKET 0x99 +#define PACKET3_DMA_DATA_FILL_MULTI 0x9A +#define PACKET3_SET_SH_REG_INDEX 0x9B +#define PACKET3_DRAW_INDIRECT_COUNT_MULTI 0x9C +#define PACKET3_DRAW_INDEX_INDIRECT_COUNT_MULTI 0x9D +#define PACKET3_DUMP_CONST_RAM_OFFSET 0x9E +#define PACKET3_LOAD_CONTEXT_REG_INDEX 0x9F +#define PACKET3_SET_RESOURCES 0xA0 +/* 1. header + * 2. CONTROL + * 3. QUEUE_MASK_LO [31:0] + * 4. QUEUE_MASK_HI [31:0] + * 5. GWS_MASK_LO [31:0] + * 6. GWS_MASK_HI [31:0] + * 7. OAC_MASK [15:0] + * 8. GDS_HEAP_SIZE [16:11] | GDS_HEAP_BASE [5:0] + */ +# define PACKET3_SET_RESOURCES_VMID_MASK(x) ((x) << 0) +# define PACKET3_SET_RESOURCES_UNMAP_LATENTY(x) ((x) << 16) +# define PACKET3_SET_RESOURCES_QUEUE_TYPE(x) ((x) << 29) +#define PACKET3_MAP_PROCESS 0xA1 +#define PACKET3_MAP_QUEUES 0xA2 +/* 1. header + * 2. CONTROL + * 3. CONTROL2 + * 4. MQD_ADDR_LO [31:0] + * 5. MQD_ADDR_HI [31:0] + * 6. WPTR_ADDR_LO [31:0] + * 7. WPTR_ADDR_HI [31:0] + */ +/* CONTROL */ +# define PACKET3_MAP_QUEUES_QUEUE_SEL(x) ((x) << 4) +# define PACKET3_MAP_QUEUES_VMID(x) ((x) << 8) +# define PACKET3_MAP_QUEUES_QUEUE(x) ((x) << 13) +# define PACKET3_MAP_QUEUES_PIPE(x) ((x) << 16) +# define PACKET3_MAP_QUEUES_ME(x) ((x) << 18) +# define PACKET3_MAP_QUEUES_QUEUE_TYPE(x) ((x) << 21) +# define PACKET3_MAP_QUEUES_ALLOC_FORMAT(x) ((x) << 24) +# define PACKET3_MAP_QUEUES_ENGINE_SEL(x) ((x) << 26) +# define PACKET3_MAP_QUEUES_NUM_QUEUES(x) ((x) << 29) +/* CONTROL2 */ +# define PACKET3_MAP_QUEUES_CHECK_DISABLE(x) ((x) << 1) +# define PACKET3_MAP_QUEUES_DOORBELL_OFFSET(x) ((x) << 2) +#define PACKET3_UNMAP_QUEUES 0xA3 +/* 1. header + * 2. CONTROL + * 3. CONTROL2 + * 4. CONTROL3 + * 5. CONTROL4 + * 6. CONTROL5 + */ +/* CONTROL */ +# define PACKET3_UNMAP_QUEUES_ACTION(x) ((x) << 0) + /* 0 - PREEMPT_QUEUES + * 1 - RESET_QUEUES + * 2 - DISABLE_PROCESS_QUEUES + * 3 - PREEMPT_QUEUES_NO_UNMAP + */ +# define PACKET3_UNMAP_QUEUES_QUEUE_SEL(x) ((x) << 4) +# define PACKET3_UNMAP_QUEUES_ENGINE_SEL(x) ((x) << 26) +# define PACKET3_UNMAP_QUEUES_NUM_QUEUES(x) ((x) << 29) +/* CONTROL2a */ +# define PACKET3_UNMAP_QUEUES_PASID(x) ((x) << 0) +/* CONTROL2b */ +# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(x) ((x) << 2) +/* CONTROL3a */ +# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET1(x) ((x) << 2) +/* CONTROL3b */ +# define PACKET3_UNMAP_QUEUES_RB_WPTR(x) ((x) << 0) +/* CONTROL4 */ +# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET2(x) ((x) << 2) +/* CONTROL5 */ +# define PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET3(x) ((x) << 2) +#define PACKET3_QUERY_STATUS 0xA4 +/* 1. header + * 2. CONTROL + * 3. CONTROL2 + * 4. ADDR_LO [31:0] + * 5. ADDR_HI [31:0] + * 6. DATA_LO [31:0] + * 7. DATA_HI [31:0] + */ +/* CONTROL */ +# define PACKET3_QUERY_STATUS_CONTEXT_ID(x) ((x) << 0) +# define PACKET3_QUERY_STATUS_INTERRUPT_SEL(x) ((x) << 28) +# define PACKET3_QUERY_STATUS_COMMAND(x) ((x) << 30) +/* CONTROL2a */ +# define PACKET3_QUERY_STATUS_PASID(x) ((x) << 0) +/* CONTROL2b */ +# define PACKET3_QUERY_STATUS_DOORBELL_OFFSET(x) ((x) << 2) +# define PACKET3_QUERY_STATUS_ENG_SEL(x) ((x) << 25) +#define PACKET3_RUN_LIST 0xA5 +#define PACKET3_MAP_PROCESS_VM 0xA6 +/* GFX11 */ +#define PACKET3_SET_Q_PREEMPTION_MODE 0xF0 +# define PACKET3_SET_Q_PREEMPTION_MODE_IB_VMID(x) ((x) << 0) +# define PACKET3_SET_Q_PREEMPTION_MODE_INIT_SHADOW_MEM (1 << 0) + +#endif diff --git a/tinygrad/runtime/autogen/amd_sdma.py b/tinygrad/runtime/autogen/amd_gpu.py similarity index 77% rename from tinygrad/runtime/autogen/amd_sdma.py rename to tinygrad/runtime/autogen/amd_gpu.py index 42a13a28f5..1d7aac9171 100644 --- a/tinygrad/runtime/autogen/amd_sdma.py +++ b/tinygrad/runtime/autogen/amd_gpu.py @@ -1428,3 +1428,473 @@ __all__ = \ 'union_SDMA_PKT_TIMESTAMP_TAG_HEADER_UNION', 'union_SDMA_PKT_TRAP_TAG_HEADER_UNION', 'union_SDMA_PKT_TRAP_TAG_INT_CONTEXT_UNION'] + #/* +# * Copyright 2019 Advanced Micro Devices, Inc. +# * +# * Permission is hereby granted, free of charge, to any person obtaining a +# * copy of this software and associated documentation files (the "Software"), +# * to deal in the Software without restriction, including without limitation +# * the rights to use, copy, modify, merge, publish, distribute, sublicense, +# * and/or sell copies of the Software, and to permit persons to whom the +# * Software is furnished to do so, subject to the following conditions: +# * +# * The above copyright notice and this permission notice shall be included in +# * all copies or substantial portions of the Software. +# * +# * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +# * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# * OTHER DEALINGS IN THE SOFTWARE. +# * +# */ + +#ifndef NVD_H +#define NVD_H + + #/** +# * Navi's PM4 definitions +# */ +PACKET_TYPE0 = 0 +PACKET_TYPE1 = 1 +PACKET_TYPE2 = 2 +PACKET_TYPE3 = 3 + +def CP_PACKET_GET_TYPE(h): return (((h) >> 30) & 3) +def CP_PACKET_GET_COUNT(h): return (((h) >> 16) & 0x3FFF) +def CP_PACKET0_GET_REG(h): return ((h) & 0xFFFF) +def CP_PACKET3_GET_OPCODE(h): return (((h) >> 8) & 0xFF) +def PACKET0(reg, n): return ((PACKET_TYPE0 << 30) | \ + ((reg) & 0xFFFF) | \ + ((n) & 0x3FFF) << 16) +CP_PACKET2 = 0x80000000 +PACKET2_PAD_SHIFT = 0 +PACKET2_PAD_MASK = (0x3fffffff << 0) + +def PACKET2(v): return (CP_PACKET2 | REG_SET(PACKET2_PAD, (v))) + +def PACKET3(op, n): return ((PACKET_TYPE3 << 30) | \ + (((op) & 0xFF) << 8) | \ + ((n) & 0x3FFF) << 16) + +def PACKET3_COMPUTE(op, n): return (PACKET3(op, n) | 1 << 1) + + #/* Packet 3 types */ +PACKET3_NOP = 0x10 +PACKET3_SET_BASE = 0x11 +def PACKET3_BASE_INDEX(x): return ((x) << 0) +CE_PARTITION_BASE = 3 +PACKET3_CLEAR_STATE = 0x12 +PACKET3_INDEX_BUFFER_SIZE = 0x13 +PACKET3_DISPATCH_DIRECT = 0x15 +PACKET3_DISPATCH_INDIRECT = 0x16 +PACKET3_INDIRECT_BUFFER_END = 0x17 +PACKET3_INDIRECT_BUFFER_CNST_END = 0x19 +PACKET3_ATOMIC_GDS = 0x1D +PACKET3_ATOMIC_MEM = 0x1E +PACKET3_OCCLUSION_QUERY = 0x1F +PACKET3_SET_PREDICATION = 0x20 +PACKET3_REG_RMW = 0x21 +PACKET3_COND_EXEC = 0x22 +PACKET3_PRED_EXEC = 0x23 +PACKET3_DRAW_INDIRECT = 0x24 +PACKET3_DRAW_INDEX_INDIRECT = 0x25 +PACKET3_INDEX_BASE = 0x26 +PACKET3_DRAW_INDEX_2 = 0x27 +PACKET3_CONTEXT_CONTROL = 0x28 +PACKET3_INDEX_TYPE = 0x2A +PACKET3_DRAW_INDIRECT_MULTI = 0x2C +PACKET3_DRAW_INDEX_AUTO = 0x2D +PACKET3_NUM_INSTANCES = 0x2F +PACKET3_DRAW_INDEX_MULTI_AUTO = 0x30 +PACKET3_INDIRECT_BUFFER_PRIV = 0x32 +PACKET3_INDIRECT_BUFFER_CNST = 0x33 +PACKET3_COND_INDIRECT_BUFFER_CNST = 0x33 +PACKET3_STRMOUT_BUFFER_UPDATE = 0x34 +PACKET3_DRAW_INDEX_OFFSET_2 = 0x35 +PACKET3_DRAW_PREAMBLE = 0x36 +PACKET3_WRITE_DATA = 0x37 +def WRITE_DATA_DST_SEL(x): return ((x) << 8) + #/* 0 - register +# * 1 - memory (sync - via GRBM) +# * 2 - gl2 +# * 3 - gds +# * 4 - reserved +# * 5 - memory (async - direct) +# */ +WR_ONE_ADDR = (1 << 16) +WR_CONFIRM = (1 << 20) +def WRITE_DATA_CACHE_POLICY(x): return ((x) << 25) + #/* 0 - LRU +# * 1 - Stream +# */ +def WRITE_DATA_ENGINE_SEL(x): return ((x) << 30) + #/* 0 - me +# * 1 - pfp +# * 2 - ce +# */ +PACKET3_DRAW_INDEX_INDIRECT_MULTI = 0x38 +PACKET3_MEM_SEMAPHORE = 0x39 +PACKET3_SEM_USE_MAILBOX = (0x1 << 16) +PACKET3_SEM_SEL_SIGNAL_TYPE = (0x1 << 20) #/* 0 = increment, 1 = write 1 */ +PACKET3_SEM_SEL_SIGNAL = (0x6 << 29) +PACKET3_SEM_SEL_WAIT = (0x7 << 29) +PACKET3_DRAW_INDEX_MULTI_INST = 0x3A +PACKET3_COPY_DW = 0x3B +PACKET3_WAIT_REG_MEM = 0x3C +def WAIT_REG_MEM_FUNCTION(x): return ((x) << 0) + #/* 0 - always +# * 1 - < +# * 2 - <= +# * 3 - == +# * 4 - != +# * 5 - >= +# * 6 - > +# */ +def WAIT_REG_MEM_MEM_SPACE(x): return ((x) << 4) + #/* 0 - reg +# * 1 - mem +# */ +def WAIT_REG_MEM_OPERATION(x): return ((x) << 6) + #/* 0 - wait_reg_mem +# * 1 - wr_wait_wr_reg +# */ +def WAIT_REG_MEM_ENGINE(x): return ((x) << 8) + #/* 0 - me +# * 1 - pfp +# */ +PACKET3_INDIRECT_BUFFER = 0x3F +INDIRECT_BUFFER_VALID = (1 << 23) +def INDIRECT_BUFFER_CACHE_POLICY(x): return ((x) << 28) + #/* 0 - LRU +# * 1 - Stream +# * 2 - Bypass +# */ +def INDIRECT_BUFFER_PRE_ENB(x): return ((x) << 21) +def INDIRECT_BUFFER_PRE_RESUME(x): return ((x) << 30) +PACKET3_COND_INDIRECT_BUFFER = 0x3F +PACKET3_COPY_DATA = 0x40 +PACKET3_CP_DMA = 0x41 +PACKET3_PFP_SYNC_ME = 0x42 +PACKET3_SURFACE_SYNC = 0x43 +PACKET3_ME_INITIALIZE = 0x44 +PACKET3_COND_WRITE = 0x45 +PACKET3_EVENT_WRITE = 0x46 +def EVENT_TYPE(x): return ((x) << 0) +def EVENT_INDEX(x): return ((x) << 8) + #/* 0 - any non-TS event +# * 1 - ZPASS_DONE, PIXEL_PIPE_STAT_* +# * 2 - SAMPLE_PIPELINESTAT +# * 3 - SAMPLE_STREAMOUTSTAT* +# * 4 - *S_PARTIAL_FLUSH +# */ +PACKET3_EVENT_WRITE_EOP = 0x47 +PACKET3_EVENT_WRITE_EOS = 0x48 +PACKET3_RELEASE_MEM = 0x49 +def PACKET3_RELEASE_MEM_EVENT_TYPE(x): return ((x) << 0) +def PACKET3_RELEASE_MEM_EVENT_INDEX(x): return ((x) << 8) +PACKET3_RELEASE_MEM_GCR_GLM_WB = (1 << 12) +PACKET3_RELEASE_MEM_GCR_GLM_INV = (1 << 13) +PACKET3_RELEASE_MEM_GCR_GLV_INV = (1 << 14) +PACKET3_RELEASE_MEM_GCR_GL1_INV = (1 << 15) +PACKET3_RELEASE_MEM_GCR_GL2_US = (1 << 16) +PACKET3_RELEASE_MEM_GCR_GL2_RANGE = (1 << 17) +PACKET3_RELEASE_MEM_GCR_GL2_DISCARD = (1 << 19) +PACKET3_RELEASE_MEM_GCR_GL2_INV = (1 << 20) +PACKET3_RELEASE_MEM_GCR_GL2_WB = (1 << 21) +PACKET3_RELEASE_MEM_GCR_SEQ = (1 << 22) +def PACKET3_RELEASE_MEM_CACHE_POLICY(x): return ((x) << 25) + #/* 0 - cache_policy__me_release_mem__lru +# * 1 - cache_policy__me_release_mem__stream +# * 2 - cache_policy__me_release_mem__noa +# * 3 - cache_policy__me_release_mem__bypass +# */ +PACKET3_RELEASE_MEM_EXECUTE = (1 << 28) + +def PACKET3_RELEASE_MEM_DATA_SEL(x): return ((x) << 29) + #/* 0 - discard +# * 1 - send low 32bit data +# * 2 - send 64bit data +# * 3 - send 64bit GPU counter value +# * 4 - send 64bit sys counter value +# */ +def PACKET3_RELEASE_MEM_INT_SEL(x): return ((x) << 24) + #/* 0 - none +# * 1 - interrupt only (DATA_SEL = 0) +# * 2 - interrupt when data write is confirmed +# */ +def PACKET3_RELEASE_MEM_DST_SEL(x): return ((x) << 16) + #/* 0 - MC +# * 1 - TC/L2 +# */ + + + +PACKET3_PREAMBLE_CNTL = 0x4A +PACKET3_PREAMBLE_BEGIN_CLEAR_STATE = (2 << 28) +PACKET3_PREAMBLE_END_CLEAR_STATE = (3 << 28) +PACKET3_DMA_DATA = 0x50 + #/* 1. header +# * 2. CONTROL +# * 3. SRC_ADDR_LO or DATA [31:0] +# * 4. SRC_ADDR_HI [31:0] +# * 5. DST_ADDR_LO [31:0] +# * 6. DST_ADDR_HI [7:0] +# * 7. COMMAND [31:26] | BYTE_COUNT [25:0] +# */ + #/* CONTROL */ +def PACKET3_DMA_DATA_ENGINE(x): return ((x) << 0) + #/* 0 - ME +# * 1 - PFP +# */ +def PACKET3_DMA_DATA_SRC_CACHE_POLICY(x): return ((x) << 13) + #/* 0 - LRU +# * 1 - Stream +# */ +def PACKET3_DMA_DATA_DST_SEL(x): return ((x) << 20) + #/* 0 - DST_ADDR using DAS +# * 1 - GDS +# * 3 - DST_ADDR using L2 +# */ +def PACKET3_DMA_DATA_DST_CACHE_POLICY(x): return ((x) << 25) + #/* 0 - LRU +# * 1 - Stream +# */ +def PACKET3_DMA_DATA_SRC_SEL(x): return ((x) << 29) + #/* 0 - SRC_ADDR using SAS +# * 1 - GDS +# * 2 - DATA +# * 3 - SRC_ADDR using L2 +# */ +PACKET3_DMA_DATA_CP_SYNC = (1 << 31) + #/* COMMAND */ +PACKET3_DMA_DATA_CMD_SAS = (1 << 26) + #/* 0 - memory +# * 1 - register +# */ +PACKET3_DMA_DATA_CMD_DAS = (1 << 27) + #/* 0 - memory +# * 1 - register +# */ +PACKET3_DMA_DATA_CMD_SAIC = (1 << 28) +PACKET3_DMA_DATA_CMD_DAIC = (1 << 29) +PACKET3_DMA_DATA_CMD_RAW_WAIT = (1 << 30) +PACKET3_CONTEXT_REG_RMW = 0x51 +PACKET3_GFX_CNTX_UPDATE = 0x52 +PACKET3_BLK_CNTX_UPDATE = 0x53 +PACKET3_INCR_UPDT_STATE = 0x55 +PACKET3_ACQUIRE_MEM = 0x58 + #/* 1. HEADER +# * 2. COHER_CNTL [30:0] +# * 2.1 ENGINE_SEL [31:31] +# * 2. COHER_SIZE [31:0] +# * 3. COHER_SIZE_HI [7:0] +# * 4. COHER_BASE_LO [31:0] +# * 5. COHER_BASE_HI [23:0] +# * 7. POLL_INTERVAL [15:0] +# * 8. GCR_CNTL [18:0] +# */ +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(x): return ((x) << 0) + #/* +# * 0:NOP +# * 1:ALL +# * 2:RANGE +# * 3:FIRST_LAST +# */ +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_RANGE(x): return ((x) << 2) + #/* +# * 0:ALL +# * 1:reserved +# * 2:RANGE +# * 3:FIRST_LAST +# */ +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(x): return ((x) << 4) +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(x): return ((x) << 5) +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(x): return ((x) << 6) +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(x): return ((x) << 7) +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(x): return ((x) << 8) +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(x): return ((x) << 9) +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_US(x): return ((x) << 10) +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_RANGE(x): return ((x) << 11) + #/* +# * 0:ALL +# * 1:VOL +# * 2:RANGE +# * 3:FIRST_LAST +# */ +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_DISCARD(x): return ((x) << 13) +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(x): return ((x) << 14) +def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(x): return ((x) << 15) +def PACKET3_ACQUIRE_MEM_GCR_CNTL_SEQ(x): return ((x) << 16) + #/* +# * 0: PARALLEL +# * 1: FORWARD +# * 2: REVERSE +# */ +PACKET3_ACQUIRE_MEM_GCR_RANGE_IS_PA = (1 << 18) +PACKET3_REWIND = 0x59 +PACKET3_INTERRUPT = 0x5A +PACKET3_GEN_PDEPTE = 0x5B +PACKET3_INDIRECT_BUFFER_PASID = 0x5C +PACKET3_PRIME_UTCL2 = 0x5D +PACKET3_LOAD_UCONFIG_REG = 0x5E +PACKET3_LOAD_SH_REG = 0x5F +PACKET3_LOAD_CONFIG_REG = 0x60 +PACKET3_LOAD_CONTEXT_REG = 0x61 +PACKET3_LOAD_COMPUTE_STATE = 0x62 +PACKET3_LOAD_SH_REG_INDEX = 0x63 +PACKET3_SET_CONFIG_REG = 0x68 +PACKET3_SET_CONFIG_REG_START = 0x00002000 +PACKET3_SET_CONFIG_REG_END = 0x00002c00 +PACKET3_SET_CONTEXT_REG = 0x69 +PACKET3_SET_CONTEXT_REG_START = 0x0000a000 +PACKET3_SET_CONTEXT_REG_END = 0x0000a400 +PACKET3_SET_CONTEXT_REG_INDEX = 0x6A +PACKET3_SET_VGPR_REG_DI_MULTI = 0x71 +PACKET3_SET_SH_REG_DI = 0x72 +PACKET3_SET_CONTEXT_REG_INDIRECT = 0x73 +PACKET3_SET_SH_REG_DI_MULTI = 0x74 +PACKET3_GFX_PIPE_LOCK = 0x75 +PACKET3_SET_SH_REG = 0x76 +PACKET3_SET_SH_REG_START = 0x00002c00 +PACKET3_SET_SH_REG_END = 0x00003000 +PACKET3_SET_SH_REG_OFFSET = 0x77 +PACKET3_SET_QUEUE_REG = 0x78 +PACKET3_SET_UCONFIG_REG = 0x79 +PACKET3_SET_UCONFIG_REG_START = 0x0000c000 +PACKET3_SET_UCONFIG_REG_END = 0x0000c400 +PACKET3_SET_UCONFIG_REG_INDEX = 0x7A +PACKET3_FORWARD_HEADER = 0x7C +PACKET3_SCRATCH_RAM_WRITE = 0x7D +PACKET3_SCRATCH_RAM_READ = 0x7E +PACKET3_LOAD_CONST_RAM = 0x80 +PACKET3_WRITE_CONST_RAM = 0x81 +PACKET3_DUMP_CONST_RAM = 0x83 +PACKET3_INCREMENT_CE_COUNTER = 0x84 +PACKET3_INCREMENT_DE_COUNTER = 0x85 +PACKET3_WAIT_ON_CE_COUNTER = 0x86 +PACKET3_WAIT_ON_DE_COUNTER_DIFF = 0x88 +PACKET3_SWITCH_BUFFER = 0x8B +PACKET3_DISPATCH_DRAW_PREAMBLE = 0x8C +PACKET3_DISPATCH_DRAW_PREAMBLE_ACE = 0x8C +PACKET3_DISPATCH_DRAW = 0x8D +PACKET3_DISPATCH_DRAW_ACE = 0x8D +PACKET3_GET_LOD_STATS = 0x8E +PACKET3_DRAW_MULTI_PREAMBLE = 0x8F +PACKET3_FRAME_CONTROL = 0x90 +FRAME_TMZ = (1 << 0) +def FRAME_CMD(x): return ((x) << 28) + #/* +# * x=0: tmz_begin +# * x=1: tmz_end +# */ +PACKET3_INDEX_ATTRIBUTES_INDIRECT = 0x91 +PACKET3_WAIT_REG_MEM64 = 0x93 +PACKET3_COND_PREEMPT = 0x94 +PACKET3_HDP_FLUSH = 0x95 +PACKET3_COPY_DATA_RB = 0x96 +PACKET3_INVALIDATE_TLBS = 0x98 +def PACKET3_INVALIDATE_TLBS_DST_SEL(x): return ((x) << 0) +def PACKET3_INVALIDATE_TLBS_ALL_HUB(x): return ((x) << 4) +def PACKET3_INVALIDATE_TLBS_PASID(x): return ((x) << 5) +PACKET3_AQL_PACKET = 0x99 +PACKET3_DMA_DATA_FILL_MULTI = 0x9A +PACKET3_SET_SH_REG_INDEX = 0x9B +PACKET3_DRAW_INDIRECT_COUNT_MULTI = 0x9C +PACKET3_DRAW_INDEX_INDIRECT_COUNT_MULTI = 0x9D +PACKET3_DUMP_CONST_RAM_OFFSET = 0x9E +PACKET3_LOAD_CONTEXT_REG_INDEX = 0x9F +PACKET3_SET_RESOURCES = 0xA0 + #/* 1. header +# * 2. CONTROL +# * 3. QUEUE_MASK_LO [31:0] +# * 4. QUEUE_MASK_HI [31:0] +# * 5. GWS_MASK_LO [31:0] +# * 6. GWS_MASK_HI [31:0] +# * 7. OAC_MASK [15:0] +# * 8. GDS_HEAP_SIZE [16:11] | GDS_HEAP_BASE [5:0] +# */ +def PACKET3_SET_RESOURCES_VMID_MASK(x): return ((x) << 0) +def PACKET3_SET_RESOURCES_UNMAP_LATENTY(x): return ((x) << 16) +def PACKET3_SET_RESOURCES_QUEUE_TYPE(x): return ((x) << 29) +PACKET3_MAP_PROCESS = 0xA1 +PACKET3_MAP_QUEUES = 0xA2 + #/* 1. header +# * 2. CONTROL +# * 3. CONTROL2 +# * 4. MQD_ADDR_LO [31:0] +# * 5. MQD_ADDR_HI [31:0] +# * 6. WPTR_ADDR_LO [31:0] +# * 7. WPTR_ADDR_HI [31:0] +# */ + #/* CONTROL */ +def PACKET3_MAP_QUEUES_QUEUE_SEL(x): return ((x) << 4) +def PACKET3_MAP_QUEUES_VMID(x): return ((x) << 8) +def PACKET3_MAP_QUEUES_QUEUE(x): return ((x) << 13) +def PACKET3_MAP_QUEUES_PIPE(x): return ((x) << 16) +def PACKET3_MAP_QUEUES_ME(x): return ((x) << 18) +def PACKET3_MAP_QUEUES_QUEUE_TYPE(x): return ((x) << 21) +def PACKET3_MAP_QUEUES_ALLOC_FORMAT(x): return ((x) << 24) +def PACKET3_MAP_QUEUES_ENGINE_SEL(x): return ((x) << 26) +def PACKET3_MAP_QUEUES_NUM_QUEUES(x): return ((x) << 29) + #/* CONTROL2 */ +def PACKET3_MAP_QUEUES_CHECK_DISABLE(x): return ((x) << 1) +def PACKET3_MAP_QUEUES_DOORBELL_OFFSET(x): return ((x) << 2) +PACKET3_UNMAP_QUEUES = 0xA3 + #/* 1. header +# * 2. CONTROL +# * 3. CONTROL2 +# * 4. CONTROL3 +# * 5. CONTROL4 +# * 6. CONTROL5 +# */ + #/* CONTROL */ +def PACKET3_UNMAP_QUEUES_ACTION(x): return ((x) << 0) + #/* 0 - PREEMPT_QUEUES +# * 1 - RESET_QUEUES +# * 2 - DISABLE_PROCESS_QUEUES +# * 3 - PREEMPT_QUEUES_NO_UNMAP +# */ +def PACKET3_UNMAP_QUEUES_QUEUE_SEL(x): return ((x) << 4) +def PACKET3_UNMAP_QUEUES_ENGINE_SEL(x): return ((x) << 26) +def PACKET3_UNMAP_QUEUES_NUM_QUEUES(x): return ((x) << 29) + #/* CONTROL2a */ +def PACKET3_UNMAP_QUEUES_PASID(x): return ((x) << 0) + #/* CONTROL2b */ +def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(x): return ((x) << 2) + #/* CONTROL3a */ +def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET1(x): return ((x) << 2) + #/* CONTROL3b */ +def PACKET3_UNMAP_QUEUES_RB_WPTR(x): return ((x) << 0) + #/* CONTROL4 */ +def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET2(x): return ((x) << 2) + #/* CONTROL5 */ +def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET3(x): return ((x) << 2) +PACKET3_QUERY_STATUS = 0xA4 + #/* 1. header +# * 2. CONTROL +# * 3. CONTROL2 +# * 4. ADDR_LO [31:0] +# * 5. ADDR_HI [31:0] +# * 6. DATA_LO [31:0] +# * 7. DATA_HI [31:0] +# */ + #/* CONTROL */ +def PACKET3_QUERY_STATUS_CONTEXT_ID(x): return ((x) << 0) +def PACKET3_QUERY_STATUS_INTERRUPT_SEL(x): return ((x) << 28) +def PACKET3_QUERY_STATUS_COMMAND(x): return ((x) << 30) + #/* CONTROL2a */ +def PACKET3_QUERY_STATUS_PASID(x): return ((x) << 0) + #/* CONTROL2b */ +def PACKET3_QUERY_STATUS_DOORBELL_OFFSET(x): return ((x) << 2) +def PACKET3_QUERY_STATUS_ENG_SEL(x): return ((x) << 25) +PACKET3_RUN_LIST = 0xA5 +PACKET3_MAP_PROCESS_VM = 0xA6 + #/* GFX11 */ +PACKET3_SET_Q_PREEMPTION_MODE = 0xF0 +def PACKET3_SET_Q_PREEMPTION_MODE_IB_VMID(x): return ((x) << 0) +PACKET3_SET_Q_PREEMPTION_MODE_INIT_SHADOW_MEM = (1 << 0) + +#endif diff --git a/tinygrad/runtime/autogen/hsa.py b/tinygrad/runtime/autogen/hsa.py index 86a27530ba..586eddfe9a 100644 --- a/tinygrad/runtime/autogen/hsa.py +++ b/tinygrad/runtime/autogen/hsa.py @@ -146,6 +146,16 @@ if ctypes.sizeof(ctypes.c_longdouble) == 16: else: c_long_double_t = ctypes.c_ubyte*16 +class FunctionFactoryStub: + def __getattr__(self, _): + return ctypes.CFUNCTYPE(lambda y:y) + +# libraries['FIXME_STUB'] explanation +# As you did not list (-l libraryname.so) a library that exports this function +# This is a non-working stub instead. +# You can either re-run clan2py with -l /path/to/library.so +# Or manually fix this by comment the ctypes.CDLL loading +_libraries['FIXME_STUB'] = FunctionFactoryStub() # ctypes.CDLL('FIXME_STUB') @@ -3834,6 +3844,327 @@ struct_hsa_ext_finalizer_1_00_pfn_s._fields_ = [ ] hsa_ext_finalizer_1_00_pfn_t = struct_hsa_ext_finalizer_1_00_pfn_s +try: + hsa_ven_amd_aqlprofile_version_major = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_version_major + hsa_ven_amd_aqlprofile_version_major.restype = uint32_t + hsa_ven_amd_aqlprofile_version_major.argtypes = [] +except AttributeError: + pass +try: + hsa_ven_amd_aqlprofile_version_minor = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_version_minor + hsa_ven_amd_aqlprofile_version_minor.restype = uint32_t + hsa_ven_amd_aqlprofile_version_minor.argtypes = [] +except AttributeError: + pass + +# values for enumeration 'c__EA_hsa_ven_amd_aqlprofile_event_type_t' +c__EA_hsa_ven_amd_aqlprofile_event_type_t__enumvalues = { + 0: 'HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC', + 1: 'HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE', +} +HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC = 0 +HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE = 1 +c__EA_hsa_ven_amd_aqlprofile_event_type_t = ctypes.c_uint32 # enum +hsa_ven_amd_aqlprofile_event_type_t = c__EA_hsa_ven_amd_aqlprofile_event_type_t +hsa_ven_amd_aqlprofile_event_type_t__enumvalues = c__EA_hsa_ven_amd_aqlprofile_event_type_t__enumvalues + +# values for enumeration 'c__EA_hsa_ven_amd_aqlprofile_block_name_t' +c__EA_hsa_ven_amd_aqlprofile_block_name_t__enumvalues = { + 0: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC', + 1: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF', + 2: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS', + 3: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM', + 4: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE', + 5: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI', + 6: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ', + 7: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS', + 8: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM', + 9: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX', + 10: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA', + 11: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA', + 12: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC', + 13: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP', + 14: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD', + 15: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB', + 16: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB', + 17: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM', + 18: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ', + 19: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2', + 20: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR', + 21: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC', + 22: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2', + 23: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA', + 24: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB', + 25: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA', + 26: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A', + 27: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C', + 28: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A', + 29: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C', + 30: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR', + 31: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS', + 32: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC', + 33: 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA', + 34: 'HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER', +} +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC = 0 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF = 1 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS = 2 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM = 3 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE = 4 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI = 5 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ = 6 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS = 7 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM = 8 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX = 9 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA = 10 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA = 11 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC = 12 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP = 13 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD = 14 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB = 15 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB = 16 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM = 17 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ = 18 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2 = 19 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR = 20 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC = 21 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2 = 22 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA = 23 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB = 24 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA = 25 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A = 26 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C = 27 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A = 28 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C = 29 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR = 30 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS = 31 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC = 32 +HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA = 33 +HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER = 34 +c__EA_hsa_ven_amd_aqlprofile_block_name_t = ctypes.c_uint32 # enum +hsa_ven_amd_aqlprofile_block_name_t = c__EA_hsa_ven_amd_aqlprofile_block_name_t +hsa_ven_amd_aqlprofile_block_name_t__enumvalues = c__EA_hsa_ven_amd_aqlprofile_block_name_t__enumvalues +class struct_c__SA_hsa_ven_amd_aqlprofile_event_t(Structure): + pass + +struct_c__SA_hsa_ven_amd_aqlprofile_event_t._pack_ = 1 # source:False +struct_c__SA_hsa_ven_amd_aqlprofile_event_t._fields_ = [ + ('block_name', hsa_ven_amd_aqlprofile_block_name_t), + ('block_index', ctypes.c_uint32), + ('counter_id', ctypes.c_uint32), +] + +hsa_ven_amd_aqlprofile_event_t = struct_c__SA_hsa_ven_amd_aqlprofile_event_t +try: + hsa_ven_amd_aqlprofile_validate_event = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_validate_event + hsa_ven_amd_aqlprofile_validate_event.restype = hsa_status_t + hsa_ven_amd_aqlprofile_validate_event.argtypes = [hsa_agent_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_event_t), ctypes.POINTER(ctypes.c_bool)] +except AttributeError: + pass + +# values for enumeration 'c__EA_hsa_ven_amd_aqlprofile_parameter_name_t' +c__EA_hsa_ven_amd_aqlprofile_parameter_name_t__enumvalues = { + 0: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET', + 1: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK', + 2: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK', + 3: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK', + 4: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2', + 5: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK', + 6: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE', + 7: 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT', +} +HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET = 0 +HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK = 1 +HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK = 2 +HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK = 3 +HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2 = 4 +HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK = 5 +HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE = 6 +HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT = 7 +c__EA_hsa_ven_amd_aqlprofile_parameter_name_t = ctypes.c_uint32 # enum +hsa_ven_amd_aqlprofile_parameter_name_t = c__EA_hsa_ven_amd_aqlprofile_parameter_name_t +hsa_ven_amd_aqlprofile_parameter_name_t__enumvalues = c__EA_hsa_ven_amd_aqlprofile_parameter_name_t__enumvalues +class struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t(Structure): + pass + +struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t._pack_ = 1 # source:False +struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t._fields_ = [ + ('parameter_name', hsa_ven_amd_aqlprofile_parameter_name_t), + ('value', ctypes.c_uint32), +] + +hsa_ven_amd_aqlprofile_parameter_t = struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t +class struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t(Structure): + pass + +struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t._pack_ = 1 # source:False +struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t._fields_ = [ + ('ptr', ctypes.POINTER(None)), + ('size', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +hsa_ven_amd_aqlprofile_descriptor_t = struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t +class struct_c__SA_hsa_ven_amd_aqlprofile_profile_t(Structure): + pass + +struct_c__SA_hsa_ven_amd_aqlprofile_profile_t._pack_ = 1 # source:False +struct_c__SA_hsa_ven_amd_aqlprofile_profile_t._fields_ = [ + ('agent', hsa_agent_t), + ('type', hsa_ven_amd_aqlprofile_event_type_t), + ('PADDING_0', ctypes.c_ubyte * 4), + ('events', ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_event_t)), + ('event_count', ctypes.c_uint32), + ('PADDING_1', ctypes.c_ubyte * 4), + ('parameters', ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t)), + ('parameter_count', ctypes.c_uint32), + ('PADDING_2', ctypes.c_ubyte * 4), + ('output_buffer', hsa_ven_amd_aqlprofile_descriptor_t), + ('command_buffer', hsa_ven_amd_aqlprofile_descriptor_t), +] + +hsa_ven_amd_aqlprofile_profile_t = struct_c__SA_hsa_ven_amd_aqlprofile_profile_t +class struct_c__SA_hsa_ext_amd_aql_pm4_packet_t(Structure): + pass + +struct_c__SA_hsa_ext_amd_aql_pm4_packet_t._pack_ = 1 # source:False +struct_c__SA_hsa_ext_amd_aql_pm4_packet_t._fields_ = [ + ('header', ctypes.c_uint16), + ('pm4_command', ctypes.c_uint16 * 27), + ('completion_signal', hsa_signal_t), +] + +hsa_ext_amd_aql_pm4_packet_t = struct_c__SA_hsa_ext_amd_aql_pm4_packet_t +try: + hsa_ven_amd_aqlprofile_start = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_start + hsa_ven_amd_aqlprofile_start.restype = hsa_status_t + hsa_ven_amd_aqlprofile_start.argtypes = [ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t)] +except AttributeError: + pass +try: + hsa_ven_amd_aqlprofile_stop = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_stop + hsa_ven_amd_aqlprofile_stop.restype = hsa_status_t + hsa_ven_amd_aqlprofile_stop.argtypes = [ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t)] +except AttributeError: + pass +try: + hsa_ven_amd_aqlprofile_read = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_read + hsa_ven_amd_aqlprofile_read.restype = hsa_status_t + hsa_ven_amd_aqlprofile_read.argtypes = [ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t)] +except AttributeError: + pass +HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE = 192 # Variable ctypes.c_uint32 +try: + hsa_ven_amd_aqlprofile_legacy_get_pm4 = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_legacy_get_pm4 + hsa_ven_amd_aqlprofile_legacy_get_pm4.restype = hsa_status_t + hsa_ven_amd_aqlprofile_legacy_get_pm4.argtypes = [ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t), ctypes.POINTER(None)] +except AttributeError: + pass +class struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t(Structure): + pass + +class union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0(Union): + pass + +class struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data(Structure): + pass + +struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data._pack_ = 1 # source:False +struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data._fields_ = [ + ('event', hsa_ven_amd_aqlprofile_event_t), + ('PADDING_0', ctypes.c_ubyte * 4), + ('result', ctypes.c_uint64), +] + +union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0._pack_ = 1 # source:False +union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0._fields_ = [ + ('pmc_data', struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data), + ('trace_data', hsa_ven_amd_aqlprofile_descriptor_t), + ('PADDING_0', ctypes.c_ubyte * 8), +] + +struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t._pack_ = 1 # source:False +struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t._anonymous_ = ('_0',) +struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t._fields_ = [ + ('sample_id', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('_0', union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0), +] + +hsa_ven_amd_aqlprofile_info_data_t = struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t +class struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t(Structure): + pass + +struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t._pack_ = 1 # source:False +struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t._fields_ = [ + ('name', ctypes.POINTER(ctypes.c_char)), + ('id', ctypes.c_uint32), + ('instance_count', ctypes.c_uint32), +] + +hsa_ven_amd_aqlprofile_id_query_t = struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t + +# values for enumeration 'c__EA_hsa_ven_amd_aqlprofile_info_type_t' +c__EA_hsa_ven_amd_aqlprofile_info_type_t__enumvalues = { + 0: 'HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE', + 1: 'HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE', + 2: 'HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA', + 3: 'HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA', + 4: 'HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS', + 5: 'HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID', + 6: 'HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD', + 7: 'HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD', +} +HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE = 0 +HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE = 1 +HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA = 2 +HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA = 3 +HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS = 4 +HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID = 5 +HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD = 6 +HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD = 7 +c__EA_hsa_ven_amd_aqlprofile_info_type_t = ctypes.c_uint32 # enum +hsa_ven_amd_aqlprofile_info_type_t = c__EA_hsa_ven_amd_aqlprofile_info_type_t +hsa_ven_amd_aqlprofile_info_type_t__enumvalues = c__EA_hsa_ven_amd_aqlprofile_info_type_t__enumvalues +hsa_ven_amd_aqlprofile_data_callback_t = ctypes.CFUNCTYPE(c__EA_hsa_status_t, c__EA_hsa_ven_amd_aqlprofile_info_type_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t), ctypes.POINTER(None)) +try: + hsa_ven_amd_aqlprofile_get_info = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_get_info + hsa_ven_amd_aqlprofile_get_info.restype = hsa_status_t + hsa_ven_amd_aqlprofile_get_info.argtypes = [ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), hsa_ven_amd_aqlprofile_info_type_t, ctypes.POINTER(None)] +except AttributeError: + pass +try: + hsa_ven_amd_aqlprofile_iterate_data = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_iterate_data + hsa_ven_amd_aqlprofile_iterate_data.restype = hsa_status_t + hsa_ven_amd_aqlprofile_iterate_data.argtypes = [ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), hsa_ven_amd_aqlprofile_data_callback_t, ctypes.POINTER(None)] +except AttributeError: + pass +try: + hsa_ven_amd_aqlprofile_error_string = _libraries['FIXME_STUB'].hsa_ven_amd_aqlprofile_error_string + hsa_ven_amd_aqlprofile_error_string.restype = hsa_status_t + hsa_ven_amd_aqlprofile_error_string.argtypes = [ctypes.POINTER(ctypes.POINTER(ctypes.c_char))] +except AttributeError: + pass +kAqlProfileLib = 'libhsa-amd-aqlprofile64.so' # Variable ctypes.c_char * 27 +class struct_hsa_ven_amd_aqlprofile_1_00_pfn_s(Structure): + pass + +struct_hsa_ven_amd_aqlprofile_1_00_pfn_s._pack_ = 1 # source:False +struct_hsa_ven_amd_aqlprofile_1_00_pfn_s._fields_ = [ + ('hsa_ven_amd_aqlprofile_version_major', ctypes.CFUNCTYPE(ctypes.c_uint32)), + ('hsa_ven_amd_aqlprofile_version_minor', ctypes.CFUNCTYPE(ctypes.c_uint32)), + ('hsa_ven_amd_aqlprofile_error_string', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(ctypes.POINTER(ctypes.c_char)))), + ('hsa_ven_amd_aqlprofile_validate_event', ctypes.CFUNCTYPE(c__EA_hsa_status_t, struct_hsa_agent_s, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_event_t), ctypes.POINTER(ctypes.c_bool))), + ('hsa_ven_amd_aqlprofile_start', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t))), + ('hsa_ven_amd_aqlprofile_stop', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t))), + ('hsa_ven_amd_aqlprofile_read', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t))), + ('hsa_ven_amd_aqlprofile_legacy_get_pm4', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ext_amd_aql_pm4_packet_t), ctypes.POINTER(None))), + ('hsa_ven_amd_aqlprofile_get_info', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), c__EA_hsa_ven_amd_aqlprofile_info_type_t, ctypes.POINTER(None))), + ('hsa_ven_amd_aqlprofile_iterate_data', ctypes.CFUNCTYPE(c__EA_hsa_status_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_profile_t), ctypes.CFUNCTYPE(c__EA_hsa_status_t, c__EA_hsa_ven_amd_aqlprofile_info_type_t, ctypes.POINTER(struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t), ctypes.POINTER(None)), ctypes.POINTER(None))), +] + +hsa_ven_amd_aqlprofile_1_00_pfn_t = struct_hsa_ven_amd_aqlprofile_1_00_pfn_s +hsa_ven_amd_aqlprofile_pfn_t = struct_hsa_ven_amd_aqlprofile_1_00_pfn_s __all__ = \ ['AMD_QUEUE_PROPERTIES_ENABLE_PROFILING', 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT', @@ -4221,13 +4552,68 @@ __all__ = \ 'HSA_SYSTEM_INFO_VERSION_MAJOR', 'HSA_SYSTEM_INFO_VERSION_MINOR', 'HSA_VARIABLE_ALLOCATION_AGENT', 'HSA_VARIABLE_ALLOCATION_PROGRAM', 'HSA_VARIABLE_SEGMENT_GLOBAL', - 'HSA_VARIABLE_SEGMENT_READONLY', 'HSA_WAIT_STATE_ACTIVE', - 'HSA_WAIT_STATE_BLOCKED', 'HSA_WAVEFRONT_INFO_SIZE', - 'MEMORY_TYPE_NONE', 'MEMORY_TYPE_PINNED', - 'amd_queue_properties32_t', 'amd_queue_properties_t', - 'amd_queue_t', 'amd_signal_kind64_t', 'amd_signal_kind_t', - 'amd_signal_t', 'c__EA_hsa_access_permission_t', - 'c__EA_hsa_agent_feature_t', 'c__EA_hsa_agent_info_t', + 'HSA_VARIABLE_SEGMENT_READONLY', + 'HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD', + 'HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC', + 'HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC', + 'HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE', + 'HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS', + 'HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID', + 'HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE', + 'HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD', + 'HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD', + 'HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA', + 'HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE', + 'HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA', + 'HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE', + 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET', + 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT', + 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK', + 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE', + 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK', + 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK', + 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2', + 'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK', + 'HSA_WAIT_STATE_ACTIVE', 'HSA_WAIT_STATE_BLOCKED', + 'HSA_WAVEFRONT_INFO_SIZE', 'MEMORY_TYPE_NONE', + 'MEMORY_TYPE_PINNED', 'amd_queue_properties32_t', + 'amd_queue_properties_t', 'amd_queue_t', 'amd_signal_kind64_t', + 'amd_signal_kind_t', 'amd_signal_t', + 'c__EA_hsa_access_permission_t', 'c__EA_hsa_agent_feature_t', + 'c__EA_hsa_agent_info_t', 'c__EA_hsa_amd_agent_memory_pool_info_t', 'c__EA_hsa_amd_copy_direction_t', 'c__EA_hsa_amd_hw_exception_reset_cause_t', @@ -4267,6 +4653,10 @@ __all__ = \ 'c__EA_hsa_status_t', 'c__EA_hsa_symbol_kind_t', 'c__EA_hsa_symbol_linkage_t', 'c__EA_hsa_system_info_t', 'c__EA_hsa_variable_allocation_t', 'c__EA_hsa_variable_segment_t', + 'c__EA_hsa_ven_amd_aqlprofile_block_name_t', + 'c__EA_hsa_ven_amd_aqlprofile_event_type_t', + 'c__EA_hsa_ven_amd_aqlprofile_info_type_t', + 'c__EA_hsa_ven_amd_aqlprofile_parameter_name_t', 'c__EA_hsa_wait_state_t', 'c__EA_hsa_wavefront_info_t', 'c__Ea_HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS', 'c__Ea_HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED', @@ -4418,7 +4808,8 @@ __all__ = \ 'hsa_executable_symbol_info_t__enumvalues', 'hsa_executable_symbol_t', 'hsa_executable_t', 'hsa_executable_validate', 'hsa_executable_validate_alt', - 'hsa_ext_control_directives_t', 'hsa_ext_finalizer_1_00_pfn_t', + 'hsa_ext_amd_aql_pm4_packet_t', 'hsa_ext_control_directives_t', + 'hsa_ext_finalizer_1_00_pfn_t', 'hsa_ext_finalizer_call_convention_t', 'hsa_ext_finalizer_call_convention_t__enumvalues', 'hsa_ext_image_capability_t', @@ -4562,11 +4953,45 @@ __all__ = \ 'hsa_system_major_extension_supported', 'hsa_variable_allocation_t', 'hsa_variable_allocation_t__enumvalues', 'hsa_variable_segment_t', - 'hsa_variable_segment_t__enumvalues', 'hsa_wait_state_t', + 'hsa_variable_segment_t__enumvalues', + 'hsa_ven_amd_aqlprofile_1_00_pfn_t', + 'hsa_ven_amd_aqlprofile_block_name_t', + 'hsa_ven_amd_aqlprofile_block_name_t__enumvalues', + 'hsa_ven_amd_aqlprofile_data_callback_t', + 'hsa_ven_amd_aqlprofile_descriptor_t', + 'hsa_ven_amd_aqlprofile_error_string', + 'hsa_ven_amd_aqlprofile_event_t', + 'hsa_ven_amd_aqlprofile_event_type_t', + 'hsa_ven_amd_aqlprofile_event_type_t__enumvalues', + 'hsa_ven_amd_aqlprofile_get_info', + 'hsa_ven_amd_aqlprofile_id_query_t', + 'hsa_ven_amd_aqlprofile_info_data_t', + 'hsa_ven_amd_aqlprofile_info_type_t', + 'hsa_ven_amd_aqlprofile_info_type_t__enumvalues', + 'hsa_ven_amd_aqlprofile_iterate_data', + 'hsa_ven_amd_aqlprofile_legacy_get_pm4', + 'hsa_ven_amd_aqlprofile_parameter_name_t', + 'hsa_ven_amd_aqlprofile_parameter_name_t__enumvalues', + 'hsa_ven_amd_aqlprofile_parameter_t', + 'hsa_ven_amd_aqlprofile_pfn_t', + 'hsa_ven_amd_aqlprofile_profile_t', 'hsa_ven_amd_aqlprofile_read', + 'hsa_ven_amd_aqlprofile_start', 'hsa_ven_amd_aqlprofile_stop', + 'hsa_ven_amd_aqlprofile_validate_event', + 'hsa_ven_amd_aqlprofile_version_major', + 'hsa_ven_amd_aqlprofile_version_minor', 'hsa_wait_state_t', 'hsa_wait_state_t__enumvalues', 'hsa_wavefront_get_info', 'hsa_wavefront_info_t', 'hsa_wavefront_info_t__enumvalues', - 'hsa_wavefront_t', 'int32_t', 'size_t', 'struct_BrigModuleHeader', - 'struct_amd_queue_s', 'struct_amd_signal_s', + 'hsa_wavefront_t', 'int32_t', 'kAqlProfileLib', 'size_t', + 'struct_BrigModuleHeader', 'struct_amd_queue_s', + 'struct_amd_signal_s', + 'struct_c__SA_hsa_ext_amd_aql_pm4_packet_t', + 'struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t', + 'struct_c__SA_hsa_ven_amd_aqlprofile_event_t', + 'struct_c__SA_hsa_ven_amd_aqlprofile_id_query_t', + 'struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t', + 'struct_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0_pmc_data', + 'struct_c__SA_hsa_ven_amd_aqlprofile_parameter_t', + 'struct_c__SA_hsa_ven_amd_aqlprofile_profile_t', 'struct_hsa_agent_dispatch_packet_s', 'struct_hsa_agent_s', 'struct_hsa_amd_barrier_value_packet_s', 'struct_hsa_amd_event_s', 'struct_hsa_amd_gpu_hw_exception_info_s', @@ -4599,6 +5024,8 @@ __all__ = \ 'struct_hsa_loaded_code_object_s', 'struct_hsa_pitched_ptr_s', 'struct_hsa_queue_s', 'struct_hsa_region_s', 'struct_hsa_signal_group_s', 'struct_hsa_signal_s', + 'struct_hsa_ven_amd_aqlprofile_1_00_pfn_s', 'struct_hsa_wavefront_s', 'uint16_t', 'uint32_t', 'uint64_t', 'union_amd_signal_s_0', 'union_amd_signal_s_1', + 'union_c__SA_hsa_ven_amd_aqlprofile_info_data_t_0', 'union_hsa_amd_event_s_0'] diff --git a/tinygrad/runtime/ops_kfd.py b/tinygrad/runtime/ops_kfd.py index 8e8a12c187..afba9a2b15 100644 --- a/tinygrad/runtime/ops_kfd.py +++ b/tinygrad/runtime/ops_kfd.py @@ -1,18 +1,20 @@ from __future__ import annotations from typing import Tuple -import os, fcntl, ctypes, functools, re, pathlib, mmap, struct +import os, fcntl, ctypes, functools, re, pathlib, mmap, struct, errno from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up from tinygrad.renderer.cstyle import HIPRenderer from tinygrad.runtime.driver.hip_comgr import compile_hip import tinygrad.runtime.autogen.kfd as kfd import tinygrad.runtime.autogen.hsa as hsa -import tinygrad.runtime.autogen.amd_sdma as amd_sdma +import tinygrad.runtime.autogen.amd_gpu as amd_gpu if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 libc = ctypes.CDLL("libc.so.6") libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long] libc.mmap.restype = ctypes.c_void_p +libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t] +libc.munmap.restype = ctypes.c_int def node_sysfs_path(node_id, file): return f"/sys/devices/virtual/kfd/kfd/topology/nodes/{node_id}/{file}" @@ -40,7 +42,7 @@ kio = ioctls_from_header() def create_sdma_packets(): # TODO: clean up this, if we want to keep it structs = {} - for name,pkt in [(name,s) for name,s in amd_sdma.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]: + for name,pkt in [(name,s) for name,s in amd_gpu.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]: names = set() fields = [] for pkt_fields in pkt._fields_: @@ -72,6 +74,8 @@ class KFDCompiler(Compiler): AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) SDMA_MAX_COPY_SIZE = 0x400000 +VENDOR_HEADER = hsa.HSA_PACKET_TYPE_VENDOR_SPECIFIC << hsa.HSA_PACKET_HEADER_TYPE + DISPATCH_KERNEL_SETUP = 3 << hsa.HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS DISPATCH_KERNEL_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE @@ -96,6 +100,8 @@ class KFDProgram: for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections: if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size] + self.device._submit_cache_inv(gli=2) + entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC) self.handle = self.lib_gpu.va_addr + entry_point self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4] @@ -105,7 +111,8 @@ class KFDProgram: f"{self.private_segment_size=} > {self.device.max_private_segment_size=}" # NOTE: no programs are ever freed - def __del__(self): kio.free_memory_of_gpu(KFDDevice.kfd, handle=self.lib_gpu.handle) + def __del__(self): + if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu) def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False): if not hasattr(self, "args_struct_t"): @@ -150,8 +157,15 @@ class KFDAllocator(LRUAllocator): super().__init__() def _alloc(self, size:int, options:BufferOptions): - if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) - else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True) + try: + if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) + else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True) + except OSError as e: + if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e + else: raise + + def _free(self, gpumem, options:BufferOptions): + self.device._gpu_free(gpumem) def copyin(self, dest, src: memoryview): # TODO: need to make the address visible to gpu and pass it directly to sdma. @@ -194,11 +208,18 @@ class KFDDevice(Compiled): assert buf != 0xffffffffffffffff assert addr == buf == mem.va_addr if map_to_gpu: - arr = (ctypes.c_int32 * 1)(self.gpu_id) - stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1) + mem.__setattr__("mapped_gpu_ids", (ctypes.c_int32 * 1)(self.gpu_id)) + stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(gpus:=mem.mapped_gpu_ids), n_devices=len(gpus)) assert stm.n_success == 1 return mem + def _gpu_free(self, mem): + if (gpus:=getattr(mem, "mapped_gpu_ids", None)) is not None: + stm = kio.unmap_memory_from_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(gpus), n_devices=len(gpus)) + assert stm.n_success == len(gpus) + libc.munmap(mem.va_addr, mem.size) + kio.free_memory_of_gpu(self.kfd, handle=mem.handle) + def __init__(self, device:str=""): if KFDDevice.kfd == -1: KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR) self.device_id = int(device.split(":")[1]) if ":" in device else 0 @@ -215,6 +236,7 @@ class KFDDevice(Compiled): self.signals_page = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True) self.gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) self.kernargs = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) + self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True) self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) self.completion_signal = hsa.amd_signal_t.from_address(self.signals_page.va_addr) @@ -274,12 +296,18 @@ class KFDDevice(Compiled): # prebuilt packets self.sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0) - self.sdma_cache_inv = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1, + self.sdma_cache_inv = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1, GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1, GCR_CONTROL_GL2_RANGE=0) - self.sdma_cache_wb = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1, + self.sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1, GCR_CONTROL_GL2_RANGE=0) + pm4_indirect_cmd = (ctypes.c_uint32*13)(amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), self.pm4_indirect_buf.va_addr & 0xffffffff, + (self.pm4_indirect_buf.va_addr>>32) & 0xffffffff, 8 | amd_gpu.INDIRECT_BUFFER_VALID, 0xa) + ctypes.memmove(ctypes.addressof(pm4_cmds:=(ctypes.c_uint16*27)(1))+2, ctypes.addressof(pm4_indirect_cmd), ctypes.sizeof(pm4_indirect_cmd)) + self.pm4_packet = hsa.hsa_ext_amd_aql_pm4_packet_t(header=VENDOR_HEADER, pm4_command=pm4_cmds, + completion_signal=hsa.hsa_signal_t(ctypes.addressof(self.completion_signal))) + # Helpers map_uptr2gpu_struct_t = init_c_struct_t(tuple(kfd.struct_kfd_ioctl_svm_args._fields_[:-1]+[('attrs', kfd.struct_kfd_ioctl_svm_attribute*2)])) # type: ignore self.map_uptr2gpu_struct = map_uptr2gpu_struct_t(nattr=2, op=0x0) @@ -302,11 +330,11 @@ class KFDDevice(Compiled): # NOTE: we check only low 32 bits to be zeroed, we don't use higher values for signals for sig in wait_signals: poll_addr = ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'value').offset - blit_sdma_command(sdma_pkts.poll_regmem(op=amd_sdma.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3, addr=poll_addr, + blit_sdma_command(sdma_pkts.poll_regmem(op=amd_gpu.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3, addr=poll_addr, value=0, mask=0xffffffff, interval=0x04, retry_count=0xfff)) if completion_signal is not None: - blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL, + blit_sdma_command(sdma_pkts.timestamp(op=amd_gpu.SDMA_OP_TIMESTAMP, sub_op=amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL, addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'start_ts').offset)) blit_sdma_command(self.sdma_flush_hdp_pkt) blit_sdma_command(self.sdma_cache_inv) @@ -315,22 +343,43 @@ class KFDDevice(Compiled): copies_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE for _ in range(copies_commands): step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE) - blit_sdma_command(sdma_pkts.copy_linear(op=amd_sdma.SDMA_OP_COPY, sub_op=amd_sdma.SDMA_SUBOP_COPY_LINEAR, + blit_sdma_command(sdma_pkts.copy_linear(op=amd_gpu.SDMA_OP_COPY, sub_op=amd_gpu.SDMA_SUBOP_COPY_LINEAR, count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied)) copied += step_copy_size blit_sdma_command(self.sdma_cache_wb) if completion_signal is not None: - blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL, + blit_sdma_command(sdma_pkts.timestamp(op=amd_gpu.SDMA_OP_TIMESTAMP, sub_op=amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL, addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'end_ts').offset)) if completion_signal is not None: signal_addr = ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset - blit_sdma_command(sdma_pkts.atomic(op=amd_sdma.SDMA_OP_ATOMIC, operation=amd_sdma.SDMA_ATOMIC_ADD64, addr=signal_addr, src_data=(1<<64)-1)) + blit_sdma_command(sdma_pkts.atomic(op=amd_gpu.SDMA_OP_ATOMIC, operation=amd_gpu.SDMA_ATOMIC_ADD64, addr=signal_addr, src_data=(1<<64)-1)) if completion_signal.event_mailbox_ptr != 0: - blit_sdma_command(sdma_pkts.fence(op=amd_sdma.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr, + blit_sdma_command(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr, data=completion_signal.event_id)) - blit_sdma_command(sdma_pkts.trap(op=amd_sdma.SDMA_OP_TRAP, int_ctx=completion_signal.event_id)) + blit_sdma_command(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=completion_signal.event_id)) self.sdma_write_pointer[0] = self.sdma_doorbell_value self.sdma_doorbell[0] = self.sdma_doorbell_value + + def _submit_cache_inv(self, addr=0x0, sz=(1 << 64)-1, gli=0, glv=0, glk=0, gl1=0, gl2=0): + pm4_buffer_view = to_mv(self.pm4_indirect_buf.va_addr, 0x1000).cast("I") + pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, + sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0, + amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \ + amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \ + amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)] + for i, value in enumerate(pm4_cmd): pm4_buffer_view[i] = value + ctypes.memmove(self.aql_ring.va_addr + (self.aql_doorbell_value * AQL_PACKET_SIZE) % self.aql_ring.size, + ctypes.addressof(self.pm4_packet), AQL_PACKET_SIZE) + + self.amd_aql_queue.write_dispatch_id = self.aql_doorbell_value + 1 + self.aql_doorbell[0] = self.aql_doorbell_value + self.aql_doorbell_value += 1 + + evt_arr = (kfd.struct_kfd_event_data * 1)() + evt_arr[0].event_id = self.completion_signal.event_id + kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000) + + assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"