tiny amd cleanups (#5420)

This commit is contained in:
nimlgen
2024-07-12 22:54:42 +03:00
committed by GitHub
parent b17e4adb3a
commit f4944ced09

View File

@@ -21,9 +21,8 @@ def is_usable_gpu(gpu_id):
except OSError:
return False
def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
made = made_struct or user_struct(**kwargs)
ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
def kfd_ioctl(idir, nr, user_struct, fd, **kwargs):
ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made := user_struct(**kwargs))<<16) | (ord('K')<<8) | nr, made)
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
return made
@@ -43,7 +42,6 @@ def ioctls_from_header():
kio = ioctls_from_header()
SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 65536
SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107
@@ -59,6 +57,8 @@ COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
def data64_le(data): return (data & 0xFFFFFFFF, data >> 32)
def signal_value_addr(signal): return ctypes.addressof(signal) + getattr(hsa.amd_signal_t, 'value').offset
def signal_ts_addr(signal): return ctypes.addressof(signal) + getattr(hsa.amd_signal_t, 'start_ts').offset
def disasm(lib):
asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
@@ -131,10 +131,9 @@ class AMDComputeQueue(HWComputeQueue):
dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
def _wait(self, signal:hsa.amd_signal_t, value=0):
addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(addr), value, 0xffffffff, 4]
amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal_value_addr(signal)), value, 0xffffffff, 4]
def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
cache_flush_flags = 0
@@ -151,24 +150,21 @@ class AMDComputeQueue(HWComputeQueue):
amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
*data64_le(address), *data64_le(value), cst]
def _timestamp(self, signal):
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0,
address=ctypes.addressof(signal) + getattr(hsa.amd_signal_t, 'start_ts').offset)
def _timestamp(self, signal): self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal_ts_addr(signal))
def _signal(self, signal:hsa.amd_signal_t, value=0):
# NOTE: this needs an EOP buffer on the queue or it will NULL pointer
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
value=value, cache_flush=True)
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal_value_addr(signal), value=value, cache_flush=True)
if signal.event_mailbox_ptr != 0:
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.event_mailbox_ptr,
value=signal.event_id, cst=signal.event_id, cache_flush=True)
def _update_wait(self, cmd_idx, signal=None, value=None):
if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET))
if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal_value_addr(signal)))
if value is not None: self._patch(cmd_idx, offset=4, data=[value])
def _update_signal(self, cmd_idx, signal=None, value=None):
if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET))
if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal_value_addr(signal)))
if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))
# Check if the signal command has mailptr part
@@ -228,7 +224,7 @@ class AMDCopyQueue(HWCopyQueue):
if dest is not None: self._patch(cmd_idx, offset=10+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
def _signal(self, signal: hsa.amd_signal_t, value=0):
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value])
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal_value_addr(signal)), value])
if signal.event_mailbox_ptr != 0:
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.event_mailbox_ptr), signal.event_id])
@@ -236,17 +232,17 @@ class AMDCopyQueue(HWCopyQueue):
def _wait(self, signal: hsa.amd_signal_t, value=0):
self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value, 0xffffffff,
amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal_value_addr(signal)), value, 0xffffffff,
amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])
def _update_signal(self, cmd_idx, signal=None, value=None): return self._update_wait(cmd_idx, signal, value) # the same offsets and commands
def _update_wait(self, cmd_idx, signal=None, value=None):
if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET))
if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal_value_addr(signal)))
if value is not None: self._patch(cmd_idx, offset=3, data=[value])
def _timestamp(self, signal:hsa.amd_signal_t):
self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
*data64_le(ctypes.addressof(signal) + getattr(hsa.amd_signal_t, 'start_ts').offset)])
*data64_le(signal_ts_addr(signal))])
def _submit(self, device):
if device.sdma_queue.put_value - device.sdma_queue.read_ptr[0] > device.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")