mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-02-16 01:26:29 -05:00
tiny amd cleanups (#5420)
This commit is contained in:
@@ -21,9 +21,8 @@ def is_usable_gpu(gpu_id):
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
|
||||
made = made_struct or user_struct(**kwargs)
|
||||
ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
|
||||
def kfd_ioctl(idir, nr, user_struct, fd, **kwargs):
  """Build a `user_struct` from `kwargs`, pass it to an ioctl on `fd`, and return it.

  The request number is assembled from the direction bits `idir` (shifted by 30),
  the byte size of the struct (shifted by 16), the type character 'K'
  (shifted by 8) and the command number `nr`.
  NOTE(review): this looks like the Linux _IOC request layout -- confirm.

  Raises RuntimeError when the ioctl call returns a non-zero value.
  """
  made = user_struct(**kwargs)
  request = (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr
  ret = fcntl.ioctl(fd, request, made)
  if ret != 0:
    raise RuntimeError(f"ioctl returned {ret}")
  return made
|
||||
|
||||
@@ -43,7 +42,6 @@ def ioctls_from_header():
|
||||
kio = ioctls_from_header()
|
||||
|
||||
SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 65536
|
||||
SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
|
||||
|
||||
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
|
||||
regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107
|
||||
@@ -59,6 +57,8 @@ COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
|
||||
def gfxreg(reg):
  """Return `reg` shifted by 0x00001260 and rebased against amd_gpu.PACKET3_SET_SH_REG_START.

  NOTE(review): presumably this turns a GFX register index into the offset
  expected by SET_SH_REG packets -- confirm against the callers.
  """
  shifted = reg + 0x00001260
  return shifted - amd_gpu.PACKET3_SET_SH_REG_START
|
||||
def nbioreg(reg):
  """Return `reg` offset by the NBIO base (NBIO_BASE__INST0_SEG2)."""
  nbio_base_inst0_seg2 = 0x00000d20
  return reg + nbio_base_inst0_seg2
|
||||
def data64_le(data):
  """Split an integer into a `(low, high)` tuple, low word first.

  `low` is the bottom 32 bits of `data`; `high` is everything above bit 31
  (it is not masked, so it only fits in 32 bits when `data` fits in 64).
  """
  low_dword = data & 0xFFFFFFFF
  high_dword = data >> 32
  return (low_dword, high_dword)
|
||||
def signal_value_addr(signal):
  """Return the memory address of the `value` field of `signal`.

  Computed as the base address of the ctypes object plus the offset of
  `value` within hsa.amd_signal_t.
  """
  base_addr = ctypes.addressof(signal)
  return base_addr + hsa.amd_signal_t.value.offset
|
||||
def signal_ts_addr(signal):
  """Return the memory address of the `start_ts` field of `signal`.

  Computed as the base address of the ctypes object plus the offset of
  `start_ts` within hsa.amd_signal_t.
  """
  base_addr = ctypes.addressof(signal)
  return base_addr + hsa.amd_signal_t.start_ts.offset
|
||||
|
||||
def disasm(lib):
|
||||
asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
|
||||
@@ -131,10 +131,9 @@ class AMDComputeQueue(HWComputeQueue):
|
||||
dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
|
||||
|
||||
def _wait(self, signal:hsa.amd_signal_t, value=0):
|
||||
addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
|
||||
amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
|
||||
amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(addr), value, 0xffffffff, 4]
|
||||
amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal_value_addr(signal)), value, 0xffffffff, 4]
|
||||
|
||||
def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
|
||||
cache_flush_flags = 0
|
||||
@@ -151,24 +150,21 @@ class AMDComputeQueue(HWComputeQueue):
|
||||
amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
|
||||
*data64_le(address), *data64_le(value), cst]
|
||||
|
||||
def _timestamp(self, signal):
|
||||
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0,
|
||||
address=ctypes.addressof(signal) + getattr(hsa.amd_signal_t, 'start_ts').offset)
|
||||
def _timestamp(self, signal):
  """Queue a release-mem command targeting the `start_ts` address of `signal`.

  Uses the CACHE_FLUSH_AND_INV_TS_EVENT event with mem_data_sel=3 and
  mem_int_sel=0.
  NOTE(review): mem_data_sel=3 presumably selects a timestamp write -- confirm.
  """
  self._release_mem(
    CACHE_FLUSH_AND_INV_TS_EVENT,
    mem_data_sel=3,
    mem_int_sel=0,
    address=signal_ts_addr(signal),
  )
|
||||
|
||||
def _signal(self, signal:hsa.amd_signal_t, value=0):
|
||||
# NOTE: this needs an EOP buffer on the queue, or it will hit a NULL pointer
|
||||
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
|
||||
value=value, cache_flush=True)
|
||||
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal_value_addr(signal), value=value, cache_flush=True)
|
||||
if signal.event_mailbox_ptr != 0:
|
||||
self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.event_mailbox_ptr,
|
||||
value=signal.event_id, cst=signal.event_id, cache_flush=True)
|
||||
|
||||
def _update_wait(self, cmd_idx, signal=None, value=None):
|
||||
if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET))
|
||||
if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal_value_addr(signal)))
|
||||
if value is not None: self._patch(cmd_idx, offset=4, data=[value])
|
||||
|
||||
def _update_signal(self, cmd_idx, signal=None, value=None):
|
||||
if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET))
|
||||
if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal_value_addr(signal)))
|
||||
if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))
|
||||
|
||||
# Check if the signal command has the mailbox pointer (event_mailbox_ptr) part
|
||||
@@ -228,7 +224,7 @@ class AMDCopyQueue(HWCopyQueue):
|
||||
if dest is not None: self._patch(cmd_idx, offset=10+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
|
||||
|
||||
def _signal(self, signal: hsa.amd_signal_t, value=0):
|
||||
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value])
|
||||
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal_value_addr(signal)), value])
|
||||
|
||||
if signal.event_mailbox_ptr != 0:
|
||||
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.event_mailbox_ptr), signal.event_id])
|
||||
@@ -236,17 +232,17 @@ class AMDCopyQueue(HWCopyQueue):
|
||||
|
||||
def _wait(self, signal: hsa.amd_signal_t, value=0):
|
||||
self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
|
||||
amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value, 0xffffffff,
|
||||
amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal_value_addr(signal)), value, 0xffffffff,
|
||||
amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])
|
||||
|
||||
def _update_signal(self, cmd_idx, signal=None, value=None):
  """Patch a previously queued signal command; delegates to `_update_wait`.

  Signal and wait commands use the same offsets and commands, so the wait
  patching logic is reused unchanged and its result is returned.
  """
  patched = self._update_wait(cmd_idx, signal, value)
  return patched
|
||||
def _update_wait(self, cmd_idx, signal=None, value=None):
|
||||
if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET))
|
||||
if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal_value_addr(signal)))
|
||||
if value is not None: self._patch(cmd_idx, offset=3, data=[value])
|
||||
|
||||
def _timestamp(self, signal:hsa.amd_signal_t):
|
||||
self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
|
||||
*data64_le(ctypes.addressof(signal) + getattr(hsa.amd_signal_t, 'start_ts').offset)])
|
||||
*data64_le(signal_ts_addr(signal))])
|
||||
|
||||
def _submit(self, device):
|
||||
if device.sdma_queue.put_value - device.sdma_queue.read_ptr[0] > device.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
|
||||
|
||||
Reference in New Issue
Block a user