amd pkt3 refactor (#7923)

* amd pkt3 refactor

* replace this

* linter

* fix

* cmt

* fast

* simpler

* linter

* smth

* missing
Author: nimlgen
Date: 2024-11-28 11:06:37 +03:00
Committed by: GitHub
Parent: e3fe7023b0
Commit: 81d415be03
5 changed files with 62284 additions and 64 deletions


@@ -162,6 +162,8 @@ generate_amd() {
   clang2py -k cdefstum \
     extra/hip_gpu_driver/sdma_registers.h \
     extra/hip_gpu_driver/nvd.h \
+    extra/hip_gpu_driver/kfd_pm4_headers_ai.h \
+    extra/hip_gpu_driver/soc21_enum.h \
     extra/hip_gpu_driver/sdma_v6_0_0_pkt_open.h \
     extra/hip_gpu_driver/gc_11_0_0_offset.h \
     extra/hip_gpu_driver/gc_10_3_0_offset.h \
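
For context: clang2py (from ctypeslib) compiles each header in this list into Python ctypes bindings, so the enumerators and defines in the newly added PM4 headers surface as plain module-level integers in the generated amd_gpu module. A minimal sketch of the shape of that output, with values copied from the header below (illustrative, not the literal generated file):

# Illustrative shape of the clang2py output for the new PM4 enums:
# every C enumerator becomes a module-level int in the autogen module.
event_index__mec_release_mem__end_of_pipe = 5
event_index__mec_release_mem__shader_done = 6
data_sel__mec_release_mem__send_32_bit_low = 1
data_sel__mec_release_mem__send_gpu_clock_counter = 3
int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2
CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014

This is what lets the Python refactor below replace hand-copied magic numbers with symbolic names like amd_gpu.event_index__mec_release_mem__end_of_pipe.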


@@ -0,0 +1,247 @@
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright 2016-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#ifndef F32_MES_PM4_PACKETS_H
#define F32_MES_PM4_PACKETS_H
#define uint32_t unsigned int
#define int32_t int
#ifndef PM4_MES_HEADER_DEFINED
#define PM4_MES_HEADER_DEFINED
union PM4_MES_TYPE_3_HEADER {
	struct {
		uint32_t reserved1 : 8; /* < reserved */
		uint32_t opcode    : 8; /* < IT opcode */
		uint32_t count     : 14;/* < number of DWORDs - 1 in the
					 * information body.
					 */
		uint32_t type      : 2; /* < packet identifier.
					 * It should be 3 for type 3 packets
					 */
	};
	uint32_t u32All;
};
#endif /* PM4_MES_HEADER_DEFINED */
#ifndef PM4_MEC_RELEASE_MEM_DEFINED
#define PM4_MEC_RELEASE_MEM_DEFINED
enum mec_release_mem_event_index_enum {
	event_index__mec_release_mem__end_of_pipe = 5,
	event_index__mec_release_mem__shader_done = 6
};

enum mec_release_mem_cache_policy_enum {
	cache_policy__mec_release_mem__lru = 0,
	cache_policy__mec_release_mem__stream = 1
};

enum mec_release_mem_pq_exe_status_enum {
	pq_exe_status__mec_release_mem__default = 0,
	pq_exe_status__mec_release_mem__phase_update = 1
};

enum mec_release_mem_dst_sel_enum {
	dst_sel__mec_release_mem__memory_controller = 0,
	dst_sel__mec_release_mem__tc_l2 = 1,
	dst_sel__mec_release_mem__queue_write_pointer_register = 2,
	dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3
};

enum mec_release_mem_int_sel_enum {
	int_sel__mec_release_mem__none = 0,
	int_sel__mec_release_mem__send_interrupt_only = 1,
	int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2,
	int_sel__mec_release_mem__send_data_after_write_confirm = 3,
	int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4,
	int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5,
	int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6
};

enum mec_release_mem_data_sel_enum {
	data_sel__mec_release_mem__none = 0,
	data_sel__mec_release_mem__send_32_bit_low = 1,
	data_sel__mec_release_mem__send_64_bit_data = 2,
	data_sel__mec_release_mem__send_gpu_clock_counter = 3,
	data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4,
	data_sel__mec_release_mem__store_gds_data_to_memory = 5
};
struct pm4_mec_release_mem {
	union {
		union PM4_MES_TYPE_3_HEADER header; /*header */
		unsigned int ordinal1;
	};

	union {
		struct {
			unsigned int event_type:6;
			unsigned int reserved1:2;
			enum mec_release_mem_event_index_enum event_index:4;
			unsigned int tcl1_vol_action_ena:1;
			unsigned int tc_vol_action_ena:1;
			unsigned int reserved2:1;
			unsigned int tc_wb_action_ena:1;
			unsigned int tcl1_action_ena:1;
			unsigned int tc_action_ena:1;
			uint32_t reserved3:1;
			uint32_t tc_nc_action_ena:1;
			uint32_t tc_wc_action_ena:1;
			uint32_t tc_md_action_ena:1;
			uint32_t reserved4:3;
			enum mec_release_mem_cache_policy_enum cache_policy:2;
			uint32_t reserved5:2;
			enum mec_release_mem_pq_exe_status_enum pq_exe_status:1;
			uint32_t reserved6:2;
		} bitfields2;
		unsigned int ordinal2;
	};

	union {
		struct {
			uint32_t reserved7:16;
			enum mec_release_mem_dst_sel_enum dst_sel:2;
			uint32_t reserved8:6;
			enum mec_release_mem_int_sel_enum int_sel:3;
			uint32_t reserved9:2;
			enum mec_release_mem_data_sel_enum data_sel:3;
		} bitfields3;
		unsigned int ordinal3;
	};

	union {
		struct {
			uint32_t reserved10:2;
			unsigned int address_lo_32b:30;
		} bitfields4;
		struct {
			uint32_t reserved11:3;
			uint32_t address_lo_64b:29;
		} bitfields4b;
		uint32_t reserved12;
		unsigned int ordinal4;
	};

	union {
		uint32_t address_hi;
		uint32_t reserved13;
		uint32_t ordinal5;
	};

	union {
		uint32_t data_lo;
		uint32_t cmp_data_lo;
		struct {
			uint32_t dw_offset:16;
			uint32_t num_dwords:16;
		} bitfields6c;
		uint32_t reserved14;
		uint32_t ordinal6;
	};

	union {
		uint32_t data_hi;
		uint32_t cmp_data_hi;
		uint32_t reserved15;
		uint32_t reserved16;
		uint32_t ordinal7;
	};

	uint32_t int_ctxid;
};
#endif
#ifndef PM4_MEC_WRITE_DATA_DEFINED
#define PM4_MEC_WRITE_DATA_DEFINED
enum WRITE_DATA_dst_sel_enum {
	dst_sel___write_data__mem_mapped_register = 0,
	dst_sel___write_data__tc_l2 = 2,
	dst_sel___write_data__gds = 3,
	dst_sel___write_data__memory = 5,
	dst_sel___write_data__memory_mapped_adc_persistent_state = 6,
};

enum WRITE_DATA_addr_incr_enum {
	addr_incr___write_data__increment_address = 0,
	addr_incr___write_data__do_not_increment_address = 1
};

enum WRITE_DATA_wr_confirm_enum {
	wr_confirm___write_data__do_not_wait_for_write_confirmation = 0,
	wr_confirm___write_data__wait_for_write_confirmation = 1
};

enum WRITE_DATA_cache_policy_enum {
	cache_policy___write_data__lru = 0,
	cache_policy___write_data__stream = 1
};

struct pm4_mec_write_data_mmio {
	union {
		union PM4_MES_TYPE_3_HEADER header; /*header */
		unsigned int ordinal1;
	};

	union {
		struct {
			unsigned int reserved1:8;
			unsigned int dst_sel:4;
			unsigned int reserved2:4;
			unsigned int addr_incr:1;
			unsigned int reserved3:2;
			unsigned int resume_vf:1;
			unsigned int wr_confirm:1;
			unsigned int reserved4:4;
			unsigned int cache_policy:2;
			unsigned int reserved5:5;
		} bitfields2;
		unsigned int ordinal2;
	};

	union {
		struct {
			unsigned int dst_mmreg_addr:18;
			unsigned int reserved6:14;
		} bitfields3;
		unsigned int ordinal3;
	};

	uint32_t reserved7;

	uint32_t data;
};
#endif
enum {
	CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014
};
#endif
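
To make the bit layouts above concrete: a small Python sketch that packs a PM4 type-3 header exactly per the PM4_MES_TYPE_3_HEADER bitfields (reserved1 in bits 0-7, opcode in bits 8-15, count in bits 16-29, type in bits 30-31), then emits the DWORD sequence of a pm4_mec_write_data_mmio packet. The 0x37 opcode for WRITE_DATA is an assumption from the conventional PM4 encoding; the real queue code goes through the autogen PACKET3 macro instead.

# Sketch: pack a type-3 header per the PM4_MES_TYPE_3_HEADER bitfields above.
def pm4_type3_header(opcode: int, count: int) -> int:
  # count = number of DWORDs - 1 in the information body; type = 3
  return (3 << 30) | ((count & 0x3FFF) << 16) | ((opcode & 0xFF) << 8)

# Sketch: DWORDs of a pm4_mec_write_data_mmio packet that writes `value` to
# the MMIO register at `reg`, following bitfields2/bitfields3 above.
PACKET3_WRITE_DATA = 0x37  # assumed IT opcode for WRITE_DATA
def write_data_mmio(reg: int, value: int) -> list:
  ctl = (0 << 8) | (1 << 20)  # dst_sel=mem_mapped_register (bits 8-11), wr_confirm=1 (bit 20)
  return [pm4_type3_header(PACKET3_WRITE_DATA, 3), ctl, reg & 0x3FFFF, 0, value]

Note the count convention: the body here is 4 DWORDs, so count is 3 - the same len(vals) - 1 rule the new pkt3 helper below applies.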

File diff suppressed because it is too large.

File diff suppressed because it is too large.
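
The suppressed diffs are presumably the large regenerated autogen files. What matters for the refactor is that clang2py renders each PM4 struct above as a ctypes structure with bitfields; a hand-written approximation of that rendering for the second DWORD of pm4_mec_release_mem (subset of fields, names illustrative):

import ctypes

# Approximation of the generated binding for ordinal2 of pm4_mec_release_mem.
class _release_mem_bitfields2(ctypes.LittleEndianStructure):
  _fields_ = [("event_type", ctypes.c_uint32, 6),
              ("reserved1", ctypes.c_uint32, 2),
              ("event_index", ctypes.c_uint32, 4),
              ("rest", ctypes.c_uint32, 20)]  # cache-action/policy bits elided

class _release_mem_ordinal2(ctypes.Union):
  _fields_ = [("bitfields2", _release_mem_bitfields2), ("ordinal2", ctypes.c_uint32)]

w = _release_mem_ordinal2()
w.bitfields2.event_type = 0x14   # CACHE_FLUSH_AND_INV_TS_EVENT
w.bitfields2.event_index = 5     # end_of_pipe
assert w.ordinal2 == (5 << 8) | 0x14  # bitfields pack LSB-first on little-endian hosts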


@@ -19,10 +19,7 @@ def is_usable_gpu(gpu_id):
 regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
-# VGT_EVENT_TYPE in navi10_enum.h
-CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
-WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
 EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
 WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
 COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
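
The surviving constants feed the FUNCTION field of WAIT_REG_MEM, which selects the comparison the CP applies on each poll (3 is ==, 5 is >=). A sketch of how the control DWORD is assembled; the shift positions are assumptions matching the nvd.h-style WAIT_REG_MEM_* macros used in the diff below:

# Assumed field offsets of the WAIT_REG_MEM control DWORD:
WAIT_REG_MEM_FUNCTION_GEQ = 5  # >=
def wait_reg_mem_info(function: int, mem_space: int, operation: int, engine: int = 0) -> int:
  return (function << 0) | (mem_space << 4) | (operation << 6) | (engine << 8)

# Memory poll until *addr >= value, as the refactored wait_reg_mem emits:
info = wait_reg_mem_info(function=WAIT_REG_MEM_FUNCTION_GEQ, mem_space=1, operation=0)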
@@ -60,37 +57,41 @@ class AMDComputeQueue(HWQueue):
     if self.binded_device is not None:
       self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True, uncached=True))
-  def _acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) | \
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
-               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
+  def pkt3(self, cmd, *vals): self.q += [amd_gpu.PACKET3(cmd, len(vals) - 1), *vals]
-  def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
-    cache_flush_flags = 0
+  def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None):
+    wrm_info_dw = amd_gpu.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | amd_gpu.WAIT_REG_MEM_OPERATION(int(mem is None)) \
+                  | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0)
-    if cache_flush:
-      cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
-        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
-        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
+    self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
-    # event_index__mec_release_mem__end_of_pipe = 5
-    # event_index__mec_release_mem__shader_done = 6
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
-               amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
-               amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
-               *data64_le(address), *data64_le(value), cst]
+  def acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
+    cache_flags_dw = amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
+                     | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
+                     | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
+                     | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
+                     | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
+    self.pkt3(amd_gpu.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
+  def release_mem(self, address, value, data_sel, int_sel, ctxid=0, cache_flush=False):
+    cache_flags_dw = 0 if not cache_flush else (amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV \
+                     | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB \
+                     | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ)
+    event_dw = amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(amd_gpu.CACHE_FLUSH_AND_INV_TS_EVENT) \
+               | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(amd_gpu.event_index__mec_release_mem__end_of_pipe)
+    memsel_dw = amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0)
+    self.pkt3(amd_gpu.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
   def _memory_barrier(self):
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
-      amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
-      nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
-    self._acquire_mem()
+    self.wait_reg_mem(reg_req=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ), reg_done=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), value=0xffffffff)
+    self.acquire_mem()
   def _exec(self, prg:AMDProgram, args_state:AMDArgsState, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
-    self._acquire_mem(gli=0, gl2=0)
+    self.acquire_mem(gli=0, gl2=0)
     cmd_idx = self._cur_cmd_idx()
     user_regs = [*data64_le(prg.dev.scratch.va_addr), 0xffffffff, 0xc00000] if prg.enable_private_segment_sgpr else []
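
With pkt3, acquire_mem, release_mem, and wait_reg_mem in place, queue construction reads like a small packet DSL. An illustrative (not verbatim) composition, assuming a queue q and a signal address sig_addr:

# Illustrative composition of the refactored helpers:
q.acquire_mem()                        # invalidate/writeback GPU caches up front
q.wait_reg_mem(mem=sig_addr, value=7)  # CP polls memory until *sig_addr >= 7
q.release_mem(sig_addr, 8, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
              amd_gpu.int_sel__mec_release_mem__none)  # end-of-pipe: write 8 once work drains
q.pkt3(amd_gpu.PACKET3_NOP, 0)         # the raw escape hatch stays a one-liner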
@@ -103,27 +104,26 @@ class AMDComputeQueue(HWQueue):
       self.cmd_idx_to_dispatch_packet[cmd_idx] = dp
     user_regs += [*data64_le(args_state.ptr)]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8)]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size]
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
     if prg.dev.has_scratch_base_registers:
-      self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2),
-                 gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8)]
-    if prg.dev.target < 110000: self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0)] + [0xFFFFFFFF] * 2
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2)] + [0xFFFFFFFF] * 2
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_regs)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_regs
+      self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8))
+    if prg.dev.target < 110000: self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xFFFFFFFF, 0xFFFFFFFF)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xFFFFFFFF, 0xFFFFFFFF)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0), *user_regs)
     self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 5 # +1 to skip PACKET3_SET_SH_REG + reg + 3 zeros.
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0)
+    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0)
     self.cmd_idx_to_global_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 1 # +1 to skip PACKET3_DISPATCH_DIRECT.
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]
+    self.pkt3(amd_gpu.PACKET3_DISPATCH_DIRECT, *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN)
+    self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.CS_PARTIAL_FLUSH) | amd_gpu.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
   def _update_exec(self, cmd_idx, global_size, local_size):
     if local_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_local_offset[cmd_idx], data=local_size)
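
The cmd_idx_to_local_offset/cmd_idx_to_global_offset bookkeeping is what lets _update_exec rewrite dispatch dimensions in place instead of re-encoding the stream. Schematically (assuming _patch overwrites queue DWORDs at the command's start plus the recorded offset):

# Schematic of what the recorded offsets point at (offsets are relative to
# the command's first DWORD, per self.cmds_offset[cmd_idx]):
#   ..., SET_SH_REG hdr, regCOMPUTE_START_X, 0, 0, 0, lx, ly, lz, 0, 0, ...
#        local_offset lands on lx: header + reg + 3 zeros = +5
#   ..., DISPATCH_DIRECT hdr, gx, gy, gz, dispatch_initiator
#        global_offset lands on gx: header = +1
# A hypothetical _patch with these semantics:
def patch(q, cmd_start, offset, data):
  q[cmd_start + offset : cmd_start + offset + len(data)] = list(data)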
@@ -134,20 +134,19 @@ class AMDComputeQueue(HWQueue):
     if global_size is not None:
       dp.grid_size_x,dp.grid_size_y,dp.grid_size_z = [g*l for g,l in zip(global_size,[dp.workgroup_size_x,dp.workgroup_size_y,dp.workgroup_size_z])]
-  def _wait(self, signal:AMDSignal, value=0):
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
-               amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
-               amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal.value_addr), value, 0xffffffff, 4]
+  def _wait(self, signal:AMDSignal, value=0): self.wait_reg_mem(mem=signal.value_addr, value=value, mask=0xffffffff)
   def _timestamp(self, signal:AMDSignal):
-    self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal.timestamp_addr)
+    self.release_mem(signal.timestamp_addr, 0, amd_gpu.data_sel__mec_release_mem__send_gpu_clock_counter, amd_gpu.int_sel__mec_release_mem__none)
   def _signal(self, signal:AMDSignal, value=0):
     # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
-    self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.value_addr, value=value, cache_flush=True)
+    self.release_mem(signal.value_addr, value, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
+                     amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
     if signal.is_timeline:
-      self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._event_mailbox_ptr,
-                        value=signal._event.event_id, cst=signal._event.event_id, cache_flush=False)
+      self.release_mem(signal._event_mailbox_ptr, signal._event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
+                       amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=signal._event.event_id)
   def _update_wait(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None):
     if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal.value_addr))
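
A closing note on why _wait can always poll with >=: signal values here are effectively monotonic timeline counters, so a waiter armed for N must also fire when the GPU has already advanced past N. A host-side mirror of that contract (hypothetical helper names):

# Host-side mirror of the signal/wait ordering contract (hypothetical names):
def gpu_signal(sig, value):
  sig["value"] = value                    # release_mem: end-of-pipe write (+ IRQ)

def gpu_wait(sig, value):
  while not sig["value"] >= value: pass   # wait_reg_mem: FUNCTION = GEQ, mask 0xffffffff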