mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-08 22:48:25 -05:00
amd pkt3 refactor (#7923)
* amd pkt3 refactor * replace this * linter * fix * cmt * fast * simpler * linter * smth * missing
This commit is contained in:
@@ -162,6 +162,8 @@ generate_amd() {
|
||||
clang2py -k cdefstum \
|
||||
extra/hip_gpu_driver/sdma_registers.h \
|
||||
extra/hip_gpu_driver/nvd.h \
|
||||
extra/hip_gpu_driver/kfd_pm4_headers_ai.h \
|
||||
extra/hip_gpu_driver/soc21_enum.h \
|
||||
extra/hip_gpu_driver/sdma_v6_0_0_pkt_open.h \
|
||||
extra/hip_gpu_driver/gc_11_0_0_offset.h \
|
||||
extra/hip_gpu_driver/gc_10_3_0_offset.h \
|
||||
|
||||
247
extra/hip_gpu_driver/kfd_pm4_headers_ai.h
Normal file
247
extra/hip_gpu_driver/kfd_pm4_headers_ai.h
Normal file
@@ -0,0 +1,247 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
|
||||
/*
|
||||
* Copyright 2016-2022 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef F32_MES_PM4_PACKETS_H
|
||||
#define F32_MES_PM4_PACKETS_H
|
||||
|
||||
#define uint32_t unsigned int
|
||||
#define int32_t int
|
||||
|
||||
#ifndef PM4_MES_HEADER_DEFINED
|
||||
#define PM4_MES_HEADER_DEFINED
|
||||
/*
 * Header word of a PM4 type-3 packet. The same 32-bit value can be accessed
 * field by field through the anonymous bitfield struct or as a whole through
 * u32All. The declared widths (8 + 8 + 14 + 2) add up to 32 bits.
 * NOTE(review): the physical bit order of bitfields is compiler/ABI defined;
 * presumably reserved1 occupies the low byte - confirm for the target ABI.
 */
union PM4_MES_TYPE_3_HEADER {
|
||||
struct {
|
||||
uint32_t reserved1 : 8; /* < reserved */
|
||||
uint32_t opcode : 8; /* < IT opcode */
|
||||
uint32_t count : 14;/* < number of DWORDs - 1 in the
|
||||
* information body.
|
||||
*/
|
||||
uint32_t type : 2; /* < packet identifier.
|
||||
* It should be 3 for type 3 packets
|
||||
*/
|
||||
};
|
||||
uint32_t u32All;
|
||||
};
|
||||
#endif /* PM4_MES_HEADER_DEFINED */
|
||||
|
||||
#ifndef PM4_MEC_RELEASE_MEM_DEFINED
|
||||
#define PM4_MEC_RELEASE_MEM_DEFINED
|
||||
|
||||
/*
 * Values for the 4-bit event_index field in bitfields2 of
 * struct pm4_mec_release_mem. Only the two encodings below are declared.
 */
enum mec_release_mem_event_index_enum {
|
||||
event_index__mec_release_mem__end_of_pipe = 5,
|
||||
event_index__mec_release_mem__shader_done = 6
|
||||
};
|
||||
|
||||
/*
 * Values for the 2-bit cache_policy field in bitfields2 of
 * struct pm4_mec_release_mem.
 */
enum mec_release_mem_cache_policy_enum {
|
||||
cache_policy__mec_release_mem__lru = 0,
|
||||
cache_policy__mec_release_mem__stream = 1
|
||||
};
|
||||
|
||||
/*
 * Values for the 1-bit pq_exe_status field in bitfields2 of
 * struct pm4_mec_release_mem.
 */
enum mec_release_mem_pq_exe_status_enum {
|
||||
pq_exe_status__mec_release_mem__default = 0,
|
||||
pq_exe_status__mec_release_mem__phase_update = 1
|
||||
};
|
||||
|
||||
/*
 * Values for the 2-bit dst_sel field in bitfields3 of
 * struct pm4_mec_release_mem. All four encodings (0..3) are declared.
 */
enum mec_release_mem_dst_sel_enum {
|
||||
dst_sel__mec_release_mem__memory_controller = 0,
|
||||
dst_sel__mec_release_mem__tc_l2 = 1,
|
||||
dst_sel__mec_release_mem__queue_write_pointer_register = 2,
|
||||
dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3
|
||||
};
|
||||
|
||||
/*
 * Values for the 3-bit int_sel field in bitfields3 of
 * struct pm4_mec_release_mem. Encodings 0..6 are declared; 7 is not.
 */
enum mec_release_mem_int_sel_enum {
|
||||
int_sel__mec_release_mem__none = 0,
|
||||
int_sel__mec_release_mem__send_interrupt_only = 1,
|
||||
int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2,
|
||||
int_sel__mec_release_mem__send_data_after_write_confirm = 3,
|
||||
int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4,
|
||||
int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5,
|
||||
int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6
|
||||
};
|
||||
|
||||
/*
 * Values for the 3-bit data_sel field in bitfields3 of
 * struct pm4_mec_release_mem. Encodings 0..5 are declared; 6 and 7 are not.
 */
enum mec_release_mem_data_sel_enum {
|
||||
data_sel__mec_release_mem__none = 0,
|
||||
data_sel__mec_release_mem__send_32_bit_low = 1,
|
||||
data_sel__mec_release_mem__send_64_bit_data = 2,
|
||||
data_sel__mec_release_mem__send_gpu_clock_counter = 3,
|
||||
data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4,
|
||||
data_sel__mec_release_mem__store_gds_data_to_memory = 5
|
||||
};
|
||||
|
||||
/*
 * Layout of a RELEASE_MEM packet: eight 32-bit words in total (the header
 * plus seven body words). Each of the first seven words is an anonymous
 * union that offers a raw ordinalN view next to one or more named views of
 * the same word; the last word, int_ctxid, is a plain member.
 * Every bitfield struct below declares widths that sum to exactly 32 bits.
 * NOTE(review): this assumes unsigned int and enum bitfields pack into a
 * single 32-bit unit, which is compiler/ABI dependent - confirm.
 */
struct pm4_mec_release_mem {
|
||||
/* word 1: packet header */
union {
|
||||
union PM4_MES_TYPE_3_HEADER header; /*header */
|
||||
unsigned int ordinal1;
|
||||
};
|
||||
|
||||
/* word 2: event type/index, per-cache action enables, cache policy
 * (6+2+4+1+1+1+1+1+1+1+1+1+1+3+2+2+1+2 = 32 bits)
 */
union {
|
||||
struct {
|
||||
unsigned int event_type:6;
|
||||
unsigned int reserved1:2;
|
||||
enum mec_release_mem_event_index_enum event_index:4;
|
||||
unsigned int tcl1_vol_action_ena:1;
|
||||
unsigned int tc_vol_action_ena:1;
|
||||
unsigned int reserved2:1;
|
||||
unsigned int tc_wb_action_ena:1;
|
||||
unsigned int tcl1_action_ena:1;
|
||||
unsigned int tc_action_ena:1;
|
||||
uint32_t reserved3:1;
|
||||
uint32_t tc_nc_action_ena:1;
|
||||
uint32_t tc_wc_action_ena:1;
|
||||
uint32_t tc_md_action_ena:1;
|
||||
uint32_t reserved4:3;
|
||||
enum mec_release_mem_cache_policy_enum cache_policy:2;
|
||||
uint32_t reserved5:2;
|
||||
enum mec_release_mem_pq_exe_status_enum pq_exe_status:1;
|
||||
uint32_t reserved6:2;
|
||||
} bitfields2;
|
||||
unsigned int ordinal2;
|
||||
};
|
||||
|
||||
/* word 3: destination, interrupt and data selectors
 * (16+2+6+3+2+3 = 32 bits)
 */
union {
|
||||
struct {
|
||||
uint32_t reserved7:16;
|
||||
enum mec_release_mem_dst_sel_enum dst_sel:2;
|
||||
uint32_t reserved8:6;
|
||||
enum mec_release_mem_int_sel_enum int_sel:3;
|
||||
uint32_t reserved9:2;
|
||||
enum mec_release_mem_data_sel_enum data_sel:3;
|
||||
} bitfields3;
|
||||
unsigned int ordinal3;
|
||||
};
|
||||
|
||||
/* word 4: low address bits, in two alternative views: 2 reserved bits +
 * 30 address bits, or 3 reserved bits + 29 address bits.
 * NOTE(review): the _32b/_64b suffixes presumably refer to the width of
 * the data being written (4- vs 8-byte alignment) - confirm.
 */
union {
|
||||
struct {
|
||||
uint32_t reserved10:2;
|
||||
unsigned int address_lo_32b:30;
|
||||
} bitfields4;
|
||||
struct {
|
||||
uint32_t reserved11:3;
|
||||
uint32_t address_lo_64b:29;
|
||||
} bitfields4b;
|
||||
uint32_t reserved12;
|
||||
unsigned int ordinal4;
|
||||
};
|
||||
|
||||
/* word 5: high address bits */
union {
|
||||
uint32_t address_hi;
|
||||
uint32_t reserved13;
|
||||
uint32_t ordinal5;
|
||||
};
|
||||
|
||||
/* word 6: low data word, with alternative views of the same storage
 * (cmp_data_lo, or a dw_offset/num_dwords pair of 16 bits each)
 */
union {
|
||||
uint32_t data_lo;
|
||||
uint32_t cmp_data_lo;
|
||||
struct {
|
||||
uint32_t dw_offset:16;
|
||||
uint32_t num_dwords:16;
|
||||
} bitfields6c;
|
||||
uint32_t reserved14;
|
||||
uint32_t ordinal6;
|
||||
};
|
||||
|
||||
/* word 7: high data word (or cmp_data_hi view) */
union {
|
||||
uint32_t data_hi;
|
||||
uint32_t cmp_data_hi;
|
||||
uint32_t reserved15;
|
||||
uint32_t reserved16;
|
||||
uint32_t ordinal7;
|
||||
};
|
||||
|
||||
/* word 8: plain member, not wrapped in a union */
uint32_t int_ctxid;
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef PM4_MEC_WRITE_DATA_DEFINED
|
||||
#define PM4_MEC_WRITE_DATA_DEFINED
|
||||
|
||||
/*
 * Destination selector encodings for a WRITE_DATA packet. The declared
 * values are sparse (0, 2, 3, 5, 6); 1 and 4 have no enumerator here.
 * NOTE(review): struct pm4_mec_write_data_mmio declares its dst_sel field
 * as a plain 4-bit unsigned int, so presumably these are the values meant
 * for that field - confirm.
 */
enum WRITE_DATA_dst_sel_enum {
|
||||
dst_sel___write_data__mem_mapped_register = 0,
|
||||
dst_sel___write_data__tc_l2 = 2,
|
||||
dst_sel___write_data__gds = 3,
|
||||
dst_sel___write_data__memory = 5,
|
||||
dst_sel___write_data__memory_mapped_adc_persistent_state = 6,
|
||||
};
|
||||
|
||||
/*
 * Address-increment encodings for a WRITE_DATA packet.
 * NOTE(review): presumably the values for the 1-bit addr_incr field of
 * struct pm4_mec_write_data_mmio, which is declared as unsigned int - confirm.
 */
enum WRITE_DATA_addr_incr_enum {
|
||||
addr_incr___write_data__increment_address = 0,
|
||||
addr_incr___write_data__do_not_increment_address = 1
|
||||
};
|
||||
|
||||
/*
 * Write-confirmation encodings for a WRITE_DATA packet.
 * NOTE(review): presumably the values for the 1-bit wr_confirm field of
 * struct pm4_mec_write_data_mmio, which is declared as unsigned int - confirm.
 */
enum WRITE_DATA_wr_confirm_enum {
|
||||
wr_confirm___write_data__do_not_wait_for_write_confirmation = 0,
|
||||
wr_confirm___write_data__wait_for_write_confirmation = 1
|
||||
};
|
||||
|
||||
/*
 * Cache-policy encodings for a WRITE_DATA packet.
 * NOTE(review): presumably the values for the 2-bit cache_policy field of
 * struct pm4_mec_write_data_mmio, which is declared as unsigned int - confirm.
 */
enum WRITE_DATA_cache_policy_enum {
|
||||
cache_policy___write_data__lru = 0,
|
||||
cache_policy___write_data__stream = 1
|
||||
};
|
||||
|
||||
|
||||
/*
 * Layout of a WRITE_DATA packet in its "mmio" form: five 32-bit words.
 * The first three words are anonymous unions pairing a raw ordinalN view
 * with a named view; reserved7 and data are plain members.
 * Both bitfield structs declare widths that sum to exactly 32 bits.
 * NOTE(review): this assumes unsigned int bitfields pack into a single
 * 32-bit unit, which is compiler/ABI dependent - confirm.
 */
struct pm4_mec_write_data_mmio {
|
||||
/* word 1: packet header */
union {
|
||||
union PM4_MES_TYPE_3_HEADER header; /*header */
|
||||
unsigned int ordinal1;
|
||||
};
|
||||
|
||||
/* word 2: control bits (8+4+4+1+2+1+1+4+2+5 = 32 bits) */
union {
|
||||
struct {
|
||||
unsigned int reserved1:8;
|
||||
unsigned int dst_sel:4;
|
||||
unsigned int reserved2:4;
|
||||
unsigned int addr_incr:1;
|
||||
unsigned int reserved3:2;
|
||||
unsigned int resume_vf:1;
|
||||
unsigned int wr_confirm:1;
|
||||
unsigned int reserved4:4;
|
||||
unsigned int cache_policy:2;
|
||||
unsigned int reserved5:5;
|
||||
} bitfields2;
|
||||
unsigned int ordinal2;
|
||||
};
|
||||
|
||||
/* word 3: 18-bit destination register address + 14 reserved bits */
union {
|
||||
struct {
|
||||
unsigned int dst_mmreg_addr:18;
|
||||
unsigned int reserved6:14;
|
||||
} bitfields3;
|
||||
unsigned int ordinal3;
|
||||
};
|
||||
|
||||
/* word 4: reserved */
uint32_t reserved7;
|
||||
|
||||
/* word 5: single data word */
uint32_t data;
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Event type constant (0x14). The Python driver code in this commit passes
 * it to PACKET3_RELEASE_MEM_EVENT_TYPE, and the Python constant it replaces
 * was commented as a VGT_EVENT_TYPE value from navi10_enum.h.
 */
enum {
|
||||
CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014
|
||||
};
|
||||
#endif
|
||||
|
||||
22477
extra/hip_gpu_driver/soc21_enum.h
Normal file
22477
extra/hip_gpu_driver/soc21_enum.h
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -19,10 +19,7 @@ def is_usable_gpu(gpu_id):
|
||||
|
||||
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
|
||||
|
||||
# VGT_EVENT_TYPE in navi10_enum.h
|
||||
CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
|
||||
|
||||
WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
|
||||
EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
|
||||
WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
|
||||
|
||||
COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
|
||||
@@ -60,37 +57,41 @@ class AMDComputeQueue(HWQueue):
|
||||
if self.binded_device is not None:
|
||||
self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True, uncached=True))
|
||||
|
||||
def _acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) | \
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
|
||||
def pkt3(self, cmd, *vals):
  """Append one PM4 type-3 packet to the queue.

  The header is built with a count field of ``len(vals) - 1`` and is followed
  by the body words in ``vals``, in the order given.
  """
  header = amd_gpu.PACKET3(cmd, len(vals) - 1)
  self.q += [header, *vals]
|
||||
|
||||
def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
|
||||
cache_flush_flags = 0
|
||||
def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None):
|
||||
wrm_info_dw = amd_gpu.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | amd_gpu.WAIT_REG_MEM_OPERATION(int(mem is None)) \
|
||||
| amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0)
|
||||
|
||||
if cache_flush:
|
||||
cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
|
||||
amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
|
||||
amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
|
||||
self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
|
||||
|
||||
# event_index__mec_release_mem__end_of_pipe = 5
|
||||
# event_index__mec_release_mem__shader_done = 6
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
|
||||
amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
|
||||
amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
|
||||
*data64_le(address), *data64_le(value), cst]
|
||||
def acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
  """Queue a PACKET3_ACQUIRE_MEM packet covering ``sz`` bytes at ``addr``.

  Each ``gl*`` argument is passed to the matching GCR_CNTL field encoder(s):
  ``gli``, ``glv`` and ``gl1`` feed an INV field only, while ``glm``, ``glk``
  and ``gl2`` feed both an INV and a WB field.
  """
  # (field encoder, value) pairs, OR-ed into the control word in this order
  gcr_fields = (
    (amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV, gli),
    (amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV, glm),
    (amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB, glm),
    (amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV, glk),
    (amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB, glk),
    (amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV, glv),
    (amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV, gl1),
    (amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV, gl2),
    (amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB, gl2),
  )
  cache_flags_dw = 0
  for encode, enable in gcr_fields:
    cache_flags_dw |= encode(enable)

  # body: 0, size (64-bit, little-endian halves), address (same), 0, cache control word
  self.pkt3(amd_gpu.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
|
||||
|
||||
def release_mem(self, address, value, data_sel, int_sel, ctxid=0, cache_flush=False):
  """Queue a PACKET3_RELEASE_MEM packet that writes ``value`` to ``address``.

  The event is always CACHE_FLUSH_AND_INV_TS_EVENT with the end-of-pipe event
  index, and the destination select is always 0. ``data_sel`` and ``int_sel``
  are encoded into the second body word; ``ctxid`` is the final word. When
  ``cache_flush`` is true, the GCR invalidate/write-back flags are OR-ed into
  the event word.
  """
  cache_flags_dw = 0
  if cache_flush:
    cache_flags_dw = (amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV
                      | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV
                      | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV
                      | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB
                      | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV
                      | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB
                      | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ)

  event_type_bits = amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(amd_gpu.CACHE_FLUSH_AND_INV_TS_EVENT)
  event_index_bits = amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(amd_gpu.event_index__mec_release_mem__end_of_pipe)
  event_dw = event_type_bits | event_index_bits

  memsel_dw = amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0)

  self.pkt3(amd_gpu.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
|
||||
|
||||
def _memory_barrier(self):
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
|
||||
amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
|
||||
nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
|
||||
self._acquire_mem()
|
||||
self.wait_reg_mem(reg_req=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ), reg_done=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), value=0xffffffff)
|
||||
self.acquire_mem()
|
||||
|
||||
def _exec(self, prg:AMDProgram, args_state:AMDArgsState, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
|
||||
self._acquire_mem(gli=0, gl2=0)
|
||||
self.acquire_mem(gli=0, gl2=0)
|
||||
|
||||
cmd_idx = self._cur_cmd_idx()
|
||||
user_regs = [*data64_le(prg.dev.scratch.va_addr), 0xffffffff, 0xc00000] if prg.enable_private_segment_sgpr else []
|
||||
@@ -103,27 +104,26 @@ class AMDComputeQueue(HWQueue):
|
||||
self.cmd_idx_to_dispatch_packet[cmd_idx] = dp
|
||||
user_regs += [*data64_le(args_state.ptr)]
|
||||
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8)]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size]
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0)
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
|
||||
if prg.dev.has_scratch_base_registers:
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2),
|
||||
gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8)]
|
||||
if prg.dev.target < 110000: self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0)] + [0xFFFFFFFF] * 2
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2)] + [0xFFFFFFFF] * 2
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_regs)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_regs
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8))
|
||||
if prg.dev.target < 110000: self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20)
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0)
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xFFFFFFFF, 0xFFFFFFFF)
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xFFFFFFFF, 0xFFFFFFFF)
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0), *user_regs)
|
||||
|
||||
self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 5 # +1 to skip PACKET3_SET_SH_REG + reg + 3 zeros.
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0)
|
||||
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0)
|
||||
|
||||
self.cmd_idx_to_global_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 1 # +1 to skip PACKET3_DISPATCH_DIRECT.
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]
|
||||
self.pkt3(amd_gpu.PACKET3_DISPATCH_DIRECT, *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN)
|
||||
self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.CS_PARTIAL_FLUSH) | amd_gpu.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
|
||||
|
||||
def _update_exec(self, cmd_idx, global_size, local_size):
|
||||
if local_size is not None: self._patch(cmd_idx, offset=self.cmd_idx_to_local_offset[cmd_idx], data=local_size)
|
||||
@@ -134,20 +134,19 @@ class AMDComputeQueue(HWQueue):
|
||||
if global_size is not None:
|
||||
dp.grid_size_x,dp.grid_size_y,dp.grid_size_z = [g*l for g,l in zip(global_size,[dp.workgroup_size_x,dp.workgroup_size_y,dp.workgroup_size_z])]
|
||||
|
||||
def _wait(self, signal:AMDSignal, value=0):
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
|
||||
amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
|
||||
amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal.value_addr), value, 0xffffffff, 4]
|
||||
def _wait(self, signal:AMDSignal, value=0):
  """Queue a wait on the memory word at ``signal.value_addr`` against ``value``, with a full 32-bit mask."""
  self.wait_reg_mem(mem=signal.value_addr, value=value, mask=0xffffffff)
|
||||
|
||||
def _timestamp(self, signal:AMDSignal):
|
||||
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal.timestamp_addr)
|
||||
self.release_mem(signal.timestamp_addr, 0, amd_gpu.data_sel__mec_release_mem__send_gpu_clock_counter, amd_gpu.int_sel__mec_release_mem__none)
|
||||
|
||||
def _signal(self, signal:AMDSignal, value=0):
|
||||
# NOTE: this needs an EOP buffer on the queue or it will NULL pointer
|
||||
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.value_addr, value=value, cache_flush=True)
|
||||
self.release_mem(signal.value_addr, value, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
|
||||
amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
|
||||
|
||||
if signal.is_timeline:
|
||||
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._event_mailbox_ptr,
|
||||
value=signal._event.event_id, cst=signal._event.event_id, cache_flush=False)
|
||||
self.release_mem(signal._event_mailbox_ptr, signal._event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
|
||||
amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=signal._event.event_id)
|
||||
|
||||
def _update_wait(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None):
|
||||
if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal.value_addr))
|
||||
|
||||
Reference in New Issue
Block a user