mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-23 05:48:08 -05:00
qcom refactor CP_LOAD_STATE6_FRAG with qreg (#6354)
* qcom refactor CP_LOAD_STATE6_FRAG with qreg * qreg for wait and signal * make qreg work for false flags
This commit is contained in:
committed by
GitHub
parent
326a77336e
commit
a86e7d598e
@@ -11,7 +11,7 @@ if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # p
|
||||
|
||||
def _qreg_exec(reg, __val=0, **kwargs):
|
||||
for k, v in kwargs.items():
|
||||
__val |= (getattr(adreno, reg[4:] + "_" + k.upper())) if isinstance(v, bool) else (v << getattr(adreno, reg[4:] + "_" + k.upper() + "__SHIFT"))
|
||||
__val |= (getattr(adreno, f'{reg[4:]}_{k.upper()}') if v else 0) if type(v) is bool else (v << getattr(adreno, f'{reg[4:]}_{k.upper()}__SHIFT'))
|
||||
return __val
|
||||
qreg: Any = type("QREG", (object,), {name[4:].lower(): functools.partial(_qreg_exec, name) for name in adreno.__dict__.keys() if name[:4] == 'REG_'})
|
||||
|
||||
@@ -53,8 +53,8 @@ class QCOMComputeQueue(HWComputeQueue):
|
||||
|
||||
def _signal(self, signal, value=0, ts=False):
|
||||
if QCOMDevice.gpu_id < 700:
|
||||
self.cmd(adreno.CP_EVENT_WRITE, adreno.CACHE_FLUSH_TS | (0 if not ts else adreno.CP_EVENT_WRITE_0_TIMESTAMP),
|
||||
*data64_le(mv_address(signal._signal) + (0 if not ts else 8)), value & 0xFFFFFFFF)
|
||||
self.cmd(adreno.CP_EVENT_WRITE, qreg.cp_event_write_0(event=adreno.CACHE_FLUSH_TS, timestamp=ts),
|
||||
*data64_le(mv_address(signal._signal) + (0 if not ts else 8)), qreg.cp_event_write_3(value&0xFFFFFFFF))
|
||||
self.cmd(adreno.CP_EVENT_WRITE, adreno.CACHE_INVALIDATE)
|
||||
else:
|
||||
# TODO: support devices starting with 8 Gen 1. Also, 700th series have convenient CP_GLOBAL_TIMESTAMP and CP_LOCAL_TIMESTAMP
|
||||
@@ -63,8 +63,8 @@ class QCOMComputeQueue(HWComputeQueue):
|
||||
def _timestamp(self, signal): return self._signal(signal, 0, ts=True)
|
||||
|
||||
def _wait(self, signal, value=0):
|
||||
self.cmd(adreno.CP_WAIT_REG_MEM, adreno.WRITE_GE | adreno.CP_WAIT_REG_MEM_0_POLL(adreno.POLL_MEMORY),
|
||||
*data64_le(mv_address(signal._signal)), value & 0xFFFFFFFF, 0xFFFFFFFF, 32) # busy wait for 32 cycles
|
||||
self.cmd(adreno.CP_WAIT_REG_MEM, qreg.cp_wait_reg_mem_0(function=adreno.WRITE_GE, poll=adreno.POLL_MEMORY),*data64_le(mv_address(signal._signal)),
|
||||
qreg.cp_wait_reg_mem_3(ref=value&0xFFFFFFFF), qreg.cp_wait_reg_mem_4(mask=0xFFFFFFFF), qreg.cp_wait_reg_mem_5(delay_loop_cycles=32))
|
||||
|
||||
def _update_signal(self, cmd_idx, signal, value):
|
||||
if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(mv_address(signal._signal)))
|
||||
@@ -95,7 +95,7 @@ class QCOMComputeQueue(HWComputeQueue):
|
||||
@hcq_command
|
||||
def setup(self):
|
||||
self.cmd(adreno.CP_WAIT_FOR_IDLE)
|
||||
self.cmd(adreno.CP_SET_MARKER, adreno.RM6_COMPUTE)
|
||||
self.cmd(adreno.CP_SET_MARKER, qreg.a6xx_cp_set_marker_0(mode=adreno.RM6_COMPUTE))
|
||||
self.reg(adreno.REG_A6XX_HLSQ_INVALIDATE_CMD, qreg.a6xx_hlsq_invalidate_cmd(cs_state=True, cs_ibo=True))
|
||||
self.reg(adreno.REG_A6XX_HLSQ_INVALIDATE_CMD, qreg.a6xx_hlsq_invalidate_cmd())
|
||||
self.reg(adreno.REG_A6XX_SP_CS_TEX_COUNT, qreg.a6xx_sp_cs_tex_count(0xff))
|
||||
@@ -119,12 +119,12 @@ class QCOMComputeQueue(HWComputeQueue):
|
||||
qreg.a6xx_sp_cs_unknown_a9b1(unk5=True, unk6=True, shared_size=prg.shared_size), 0, prg.prg_offset, *data64_le(prg.lib_gpu.va_addr),
|
||||
qreg.a6xx_sp_cs_pvt_mem_param(memsizeperitem=prg.pvtmem_size_per_item), *data64_le(prg.device._stack.va_addr),
|
||||
qreg.a6xx_sp_cs_pvt_mem_size(totalpvtmemsize=prg.pvtmem_size_total))
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG,
|
||||
adreno.CP_LOAD_STATE6_0_STATE_TYPE(adreno.ST_CONSTANTS) | adreno.CP_LOAD_STATE6_0_STATE_SRC(adreno.SS6_INDIRECT)
|
||||
| adreno.CP_LOAD_STATE6_0_STATE_BLOCK(adreno.SB6_CS_SHADER) | adreno.CP_LOAD_STATE6_0_NUM_UNIT(prg.kernargs_alloc_size // 4),
|
||||
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_SHADER, num_unit=prg.kernargs_alloc_size // 4),
|
||||
*data64_le(args_state.ptr))
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, adreno.CP_LOAD_STATE6_0_STATE_TYPE(adreno.ST_SHADER) | adreno.CP_LOAD_STATE6_0_STATE_SRC(adreno.SS6_INDIRECT)
|
||||
| adreno.CP_LOAD_STATE6_0_STATE_BLOCK(adreno.SB6_CS_SHADER) | adreno.CP_LOAD_STATE6_0_NUM_UNIT(round_up(prg.image_size, 128) // 128),
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_SHADER, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_SHADER, num_unit=round_up(prg.image_size, 128) // 128),
|
||||
*data64_le(prg.lib_gpu.va_addr))
|
||||
self.reg(adreno.REG_A6XX_HLSQ_CONTROL_2_REG, 0xfcfcfcfc, 0xfcfcfcfc, 0xfcfcfcfc, 0xfc,
|
||||
qreg.a6xx_hlsq_cs_cntl(constlen=prg.kernargs_alloc_size // 4, enabled=True))
|
||||
@@ -132,25 +132,21 @@ class QCOMComputeQueue(HWComputeQueue):
|
||||
self.reg(adreno.REG_A6XX_SP_CS_INSTRLEN, qreg.a6xx_sp_cs_instrlen(prg.image_size // 4))
|
||||
|
||||
if hasattr(args_state, 'samplers_ptr'):
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG,
|
||||
adreno.CP_LOAD_STATE6_0_STATE_TYPE(adreno.ST_SHADER) | adreno.CP_LOAD_STATE6_0_STATE_SRC(adreno.SS6_INDIRECT)
|
||||
| adreno.CP_LOAD_STATE6_0_STATE_BLOCK(adreno.SB6_CS_TEX) | adreno.CP_LOAD_STATE6_0_NUM_UNIT(args_state.samplers_cnt),
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_SHADER, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_TEX, num_unit=args_state.samplers_cnt),
|
||||
*data64_le(args_state.samplers_ptr.va_addr))
|
||||
|
||||
self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.samplers_ptr.va_addr))
|
||||
self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.device._border_color_base()))
|
||||
|
||||
if hasattr(args_state, 'descriptors_ptr'):
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG,
|
||||
adreno.CP_LOAD_STATE6_0_STATE_TYPE(adreno.ST_CONSTANTS) | adreno.CP_LOAD_STATE6_0_STATE_SRC(adreno.SS6_INDIRECT)
|
||||
| adreno.CP_LOAD_STATE6_0_STATE_BLOCK(adreno.SB6_CS_TEX) | adreno.CP_LOAD_STATE6_0_NUM_UNIT(args_state.descriptors_cnt),
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_TEX, num_unit=args_state.descriptors_cnt),
|
||||
*data64_le(args_state.descriptors_ptr.va_addr))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.descriptors_ptr.va_addr))
|
||||
|
||||
if hasattr(args_state, 'ibos_ptr'):
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG,
|
||||
adreno.CP_LOAD_STATE6_0_STATE_TYPE(adreno.ST6_IBO) | adreno.CP_LOAD_STATE6_0_STATE_SRC(adreno.SS6_INDIRECT)
|
||||
| adreno.CP_LOAD_STATE6_0_STATE_BLOCK(adreno.SB6_CS_SHADER) | adreno.CP_LOAD_STATE6_0_NUM_UNIT(args_state.ibos_cnt),
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST6_IBO, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_SHADER, num_unit=args_state.ibos_cnt),
|
||||
*data64_le(args_state.ibos_ptr.va_addr))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_IBO, *data64_le(args_state.ibos_ptr.va_addr))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user