mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-02-15 17:15:48 -05:00
amd more accurate cache managment (#5631)
* amd more accurate cache managment * fix amd * add memory_barrier + copies tests * tranfer test as well * linter
This commit is contained in:
@@ -101,7 +101,7 @@ class AMDComputeQueue(HWComputeQueue):
|
||||
self.binded_device.synchronize()
|
||||
self.binded_device._gpu_free(self.hw_page)
|
||||
|
||||
def _invalidate_cache(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
|
||||
def _acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
|
||||
@@ -109,14 +109,29 @@ class AMDComputeQueue(HWComputeQueue):
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
|
||||
|
||||
def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
|
||||
cache_flush_flags = 0
|
||||
|
||||
if cache_flush:
|
||||
cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
|
||||
amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
|
||||
amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
|
||||
|
||||
# event_index__mec_release_mem__end_of_pipe = 5
|
||||
# event_index__mec_release_mem__shader_done = 6
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
|
||||
amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
|
||||
amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
|
||||
*data64_le(address), *data64_le(value), cst]
|
||||
|
||||
def _memory_barrier(self):
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
|
||||
amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
|
||||
nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
|
||||
self._invalidate_cache()
|
||||
self._acquire_mem()
|
||||
|
||||
def _exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
|
||||
self._invalidate_cache()
|
||||
self._acquire_mem(gli=0, gl2=0)
|
||||
|
||||
user_data = [*data64_le(kernargs)]
|
||||
if hasattr(prg, 'dispatch_packet_offset'):
|
||||
@@ -154,21 +169,6 @@ class AMDComputeQueue(HWComputeQueue):
|
||||
amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
|
||||
amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal._value_addr), value, 0xffffffff, 4]
|
||||
|
||||
def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
|
||||
cache_flush_flags = 0
|
||||
|
||||
if cache_flush:
|
||||
cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
|
||||
amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
|
||||
amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
|
||||
|
||||
# event_index__mec_release_mem__end_of_pipe = 5
|
||||
# event_index__mec_release_mem__shader_done = 6
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
|
||||
amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
|
||||
amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
|
||||
*data64_le(address), *data64_le(value), cst]
|
||||
|
||||
def _timestamp(self, signal): self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal._timestamp_addr)
|
||||
|
||||
def _signal(self, signal, value=0):
|
||||
@@ -176,7 +176,7 @@ class AMDComputeQueue(HWComputeQueue):
|
||||
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._value_addr, value=value, cache_flush=True)
|
||||
if signal._event_mailbox_ptr != 0:
|
||||
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._event_mailbox_ptr,
|
||||
value=signal._event_id, cst=signal._event_id, cache_flush=True)
|
||||
value=signal._event_id, cst=signal._event_id, cache_flush=False)
|
||||
|
||||
def _update_wait(self, cmd_idx, signal=None, value=None):
|
||||
if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal._value_addr))
|
||||
@@ -220,10 +220,6 @@ class AMDCopyQueue(HWCopyQueue):
|
||||
self.internal_cmd_sizes.append(len(arr))
|
||||
|
||||
def _copy(self, dest, src, copy_size):
|
||||
# Invalidate cache inv
|
||||
self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLM_INV | amd_gpu.SDMA_GCR_GLK_INV | amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GLV_INV | \
|
||||
amd_gpu.SDMA_GCR_GL1_INV | amd_gpu.SDMA_GCR_GL2_WB | amd_gpu.SDMA_GCR_GL2_INV, 0, 0])
|
||||
|
||||
copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
|
||||
self.copy_cmds_per_copy[len(self) - 1] = copy_commands
|
||||
for _ in range(copy_commands):
|
||||
@@ -234,13 +230,10 @@ class AMDCopyQueue(HWCopyQueue):
|
||||
|
||||
copied += step_copy_size
|
||||
|
||||
# Invalidate cache wb
|
||||
self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GL2_WB, 0, 0])
|
||||
|
||||
def _update_copy(self, cmd_idx, dest=None, src=None):
|
||||
for i in range(self.copy_cmds_per_copy[cmd_idx]):
|
||||
if src is not None: self._patch(cmd_idx, offset=8+i*7, data=[*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
|
||||
if dest is not None: self._patch(cmd_idx, offset=10+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
|
||||
if src is not None: self._patch(cmd_idx, offset=3+i*7, data=[*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
|
||||
if dest is not None: self._patch(cmd_idx, offset=5+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
|
||||
|
||||
def _signal(self, signal, value=0):
|
||||
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._value_addr), value])
|
||||
|
||||
Reference in New Issue
Block a user