From d3660ccc51f4b78c6048d5efe71b762942b7308b Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Fri, 29 Nov 2024 18:20:07 +0300 Subject: [PATCH] prereqs for hcq updates removal (#7959) * hcq signals touch ups * hcq compiled has device id * helpers * prreq hcq api * oops --- tinygrad/helpers.py | 6 ++-- tinygrad/runtime/ops_amd.py | 50 +++++++++++++------------- tinygrad/runtime/ops_nv.py | 63 +++++++++++++++++---------------- tinygrad/runtime/ops_qcom.py | 16 ++++----- tinygrad/runtime/support/hcq.py | 19 +++++----- 5 files changed, 79 insertions(+), 75 deletions(-) diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 507cecff61..52ec6e37b1 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -45,8 +45,8 @@ def ceildiv(num, amt): ret = -(num//-amt) return ret if not isinstance(ret, float) else int(ret) def round_up(num:int, amt:int) -> int: return (num+amt-1)//amt * amt -def data64(data: int) -> Tuple[int, int]: return (data >> 32, data & 0xFFFFFFFF) -def data64_le(data: int) -> Tuple[int, int]: return (data & 0xFFFFFFFF, data >> 32) +def data64(data:Any) -> Tuple[Any, Any]: return (data >> 32, data & 0xFFFFFFFF) # Any is sint +def data64_le(data:Any) -> Tuple[Any, Any]: return (data & 0xFFFFFFFF, data >> 32) # Any is sint def merge_dicts(ds:Iterable[Dict[T,U]]) -> Dict[T,U]: kvs = set([(k,v) for d in ds for k,v in d.items()]) assert len(kvs) == len(set(kv[0] for kv in kvs)), f"cannot merge, {kvs} contains different values for the same key" @@ -268,7 +268,7 @@ def cpu_objdump(lib, objdump_tool='objdump'): # TODO: make this work with read only memoryviews (if possible) def from_mv(mv:memoryview, to_type=ctypes.c_char): return ctypes.cast(ctypes.addressof(to_type.from_buffer(mv)), ctypes.POINTER(to_type * len(mv))).contents -def to_mv(ptr, sz) -> memoryview: return memoryview(ctypes.cast(ptr, ctypes.POINTER(ctypes.c_uint8 * sz)).contents).cast("B") +def to_mv(ptr:int, sz:int) -> memoryview: return memoryview(ctypes.cast(ptr, ctypes.POINTER(ctypes.c_uint8 * sz)).contents).cast("B") def mv_address(mv:memoryview): return ctypes.addressof(ctypes.c_char.from_buffer(mv)) def to_char_p_p(options: List[bytes], to_type=ctypes.c_char): return (ctypes.POINTER(to_type) * len(options))(*[ctypes.cast(ctypes.create_string_buffer(o), ctypes.POINTER(to_type)) for o in options]) # noqa: E501 @functools.lru_cache(maxsize=None) diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 5402f5b470..fea52ba38f 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -28,8 +28,8 @@ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2 class AMDSignal(HCQSignal): - def __init__(self, value=0, timeline_for_device:Optional[AMDDevice]=None): - super().__init__(AMDDevice.signals_pool.pop(), value, timeline_for_device, timestamp_divider=100, value_off=0, timestamp_off=8) + def __init__(self, base_addr:Optional[int]=None, **kwargs): + super().__init__(AMDDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=100) def __del__(self): AMDDevice.signals_pool.append(self.base_addr) @@ -47,7 +47,7 @@ class AMDComputeQueue(HWQueue): if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True, uncached=True)) - def pkt3(self, cmd, *vals): self.q += [amd_gpu.PACKET3(cmd, len(vals) - 1), *vals] + def pkt3(self, cmd, *vals): self.q(amd_gpu.PACKET3(cmd, len(vals) - 1), *vals) def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None): wrm_info_dw = amd_gpu.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | amd_gpu.WAIT_REG_MEM_OPERATION(int(mem is None)) \ @@ -107,11 +107,11 @@ class AMDComputeQueue(HWQueue): self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF) self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0), *user_regs) - self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 5 # +1 to skip PACKET3_SET_SH_REG + reg + 3 zeros. + self.cmd_idx_to_local_offset[cmd_idx] = len(self._q) - self.cmds_offset[cmd_idx] + 5 # +1 to skip PACKET3_SET_SH_REG + reg + 3 zeros. self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0) self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0) - self.cmd_idx_to_global_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 1 # +1 to skip PACKET3_DISPATCH_DIRECT. + self.cmd_idx_to_global_offset[cmd_idx] = len(self._q) - self.cmds_offset[cmd_idx] + 1 # +1 to skip PACKET3_DISPATCH_DIRECT. self.pkt3(amd_gpu.PACKET3_DISPATCH_DIRECT, *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN) self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.CS_PARTIAL_FLUSH) | amd_gpu.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH)) @@ -148,16 +148,16 @@ class AMDComputeQueue(HWQueue): def bind(self, dev:AMDDevice): self.binded_device = dev - self.hw_page = dev.allocator.alloc(len(self.q) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True)) + self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True)) hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I") - for i, value in enumerate(self.q): hw_view[i] = value + for i, value in enumerate(self._q): hw_view[i] = value self.indirect_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr), - len(self.q) | amd_gpu.INDIRECT_BUFFER_VALID] - self.q = hw_view # type: ignore + len(self._q) | amd_gpu.INDIRECT_BUFFER_VALID] + self._q = hw_view # type: ignore def _submit(self, dev:AMDDevice): - cmds = self.indirect_cmd if dev == self.binded_device else self.q + cmds = self.indirect_cmd if dev == self.binded_device else self._q for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value @@ -171,8 +171,8 @@ class AMDCopyQueue(HWQueue): self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {} super().__init__() - def _q(self, arr): - self.q += arr + def q(self, *arr): + super().q(*arr) self.internal_cmd_sizes.append(len(arr)) def _copy(self, dest, src, copy_size): @@ -181,8 +181,8 @@ class AMDCopyQueue(HWQueue): for _ in range(copy_commands): step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE) - self._q([amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR), - amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied)]) + self.q(amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR), + amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied)) copied += step_copy_size @@ -192,16 +192,16 @@ class AMDCopyQueue(HWQueue): if dest is not None: self._patch(cmd_idx, offset=5+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)]) def _signal(self, signal:AMDSignal, value=0): - self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value]) + self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value) if (dev:=signal.timeline_for_device) is not None: - self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id]) - self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id)]) + self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id) + self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id)) def _wait(self, signal:AMDSignal, value=0): - self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \ - amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff, - amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)]) + self.q(amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \ + amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff, + amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)) def _update_signal(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None): return self._update_wait(cmd_idx, signal, value) # the same offsets and commands @@ -211,8 +211,8 @@ class AMDCopyQueue(HWQueue): if value is not None: self._patch(cmd_idx, offset=3, data=[value]) def _timestamp(self, signal:AMDSignal): - self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL), - *data64_le(signal.timestamp_addr)]) + self.q(amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL), + *data64_le(signal.timestamp_addr)) def _submit(self, dev:AMDDevice): if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun") @@ -223,15 +223,15 @@ class AMDCopyQueue(HWQueue): tail_blit_dword += cmdsz start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4 - dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', self.q[:tail_blit_dword]) + dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', self._q[:tail_blit_dword]) dev.sdma_queue.put_value += tail_blit_dword * 4 - if (rem_packet_cnt := len(self.q) - tail_blit_dword) > 0: + if (rem_packet_cnt := len(self._q) - tail_blit_dword) > 0: zero_fill = dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes ctypes.memset(mv_address(dev.sdma_queue.ring) + (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes), 0, zero_fill) dev.sdma_queue.put_value += zero_fill - dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', self.q[tail_blit_dword:]) + dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', self._q[tail_blit_dword:]) dev.sdma_queue.put_value += rem_packet_cnt * 4 dev.sdma_queue.write_ptr[0] = dev.sdma_queue.put_value diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 63565852b4..b164c5c4df 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -74,8 +74,9 @@ assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4 def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2) class NVSignal(HCQSignal): - def __init__(self, value=0, timeline_for_device:Optional[NVDevice]=None): - super().__init__(NVDevice.signals_pool.pop(), value, timeline_for_device, timestamp_divider=1000, value_off=0, timestamp_off=8) + def __init__(self, base_addr:Optional[int]=None, **kwargs): + super().__init__(NVDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=1000, value_off=0, timestamp_off=8) + def __del__(self): NVDevice.signals_pool.append(self.base_addr) class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']): @@ -84,45 +85,45 @@ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']): @hcq_command def setup(self, compute_class=None, copy_class=None, local_mem_window=None, shared_mem_window=None, local_mem=None, local_mem_tpc_bytes=None): - if compute_class: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), compute_class] - if copy_class: self.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), copy_class] - if local_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *data64(local_mem_window)] - if shared_mem_window: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *data64(shared_mem_window)] - if local_mem: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *data64(local_mem)] - if local_mem_tpc_bytes: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *data64(local_mem_tpc_bytes), 0xff] + if compute_class: self.q(nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), compute_class) + if copy_class: self.q(nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), copy_class) + if local_mem_window: self.q(nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *data64(local_mem_window)) + if shared_mem_window: self.q(nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *data64(shared_mem_window)) + if local_mem: self.q(nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *data64(local_mem)) + if local_mem_tpc_bytes: self.q(nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *data64(local_mem_tpc_bytes), 0xff) def _wait(self, signal:NVSignal, value=0): - self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.value_addr), *data64_le(value), - (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT + self.q(nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.value_addr), *data64_le(value), + (3 << 0) | (1 << 24)) # ACQUIRE | PAYLOAD_SIZE_64BIT def _update_wait(self, cmd_idx, signal=None, value=None): - if signal is not None: self.q[(sigoff:=self.cmds_offset[cmd_idx]+1):sigoff+2] = array.array('I', data64_le(signal.value_addr)) - if value is not None: self.q[(valoff:=self.cmds_offset[cmd_idx]+3):valoff+2] = array.array('I', data64_le(value)) + if signal is not None: self._q[(sigoff:=self.cmds_offset[cmd_idx]+1):sigoff+2] = array.array('I', data64_le(signal.value_addr)) + if value is not None: self._q[(valoff:=self.cmds_offset[cmd_idx]+3):valoff+2] = array.array('I', data64_le(value)) def _timestamp(self, signal): return self._signal(signal, 0) def bind(self, dev:NVDevice): self.binded_device = dev - self.hw_page = dev.allocator.alloc(len(self.q) * 4, BufferSpec(cpu_access=True, nolru=True)) + self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True)) hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I") - for i, value in enumerate(self.q): hw_view[i] = value + for i, value in enumerate(self._q): hw_view[i] = value # From now on, the queue is on the device for faster submission. - self.q = hw_view # type: ignore + self._q = hw_view # type: ignore def _submit_to_gpfifo(self, dev:NVDevice, gpfifo:GPFifo): if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr else: - if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.size: - assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.va_addr + len(self.q) * 4 or \ + if dev.cmdq_wptr + len(self._q) * 4 > dev.cmdq_page.size: + assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.va_addr + len(self._q) * 4 or \ gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun" dev.cmdq_wptr = 0 - dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q) + dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self._q)] = array.array('I', self._q) cmdq_addr = dev.cmdq_page.va_addr+dev.cmdq_wptr - dev.cmdq_wptr += len(self.q) * 4 + dev.cmdq_wptr += len(self._q) * 4 - gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41) + gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self._q) << 42) | (1 << 41) gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count dev.gpu_mmio[0x90 // 4] = gpfifo.token gpfifo.put_value += 1 @@ -132,7 +133,7 @@ class NVComputeQueue(NVCommandQueue): self.cmd_idx_to_qmd, self.cmd_idx_to_signal_id, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}, {} super().__init__() - def _memory_barrier(self): self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)] + def _memory_barrier(self): self.q(nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)) def _exec(self, prg:NVProgram, args_state:NVArgsState, global_size, local_size): ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4) @@ -147,8 +148,8 @@ class NVComputeQueue(NVCommandQueue): qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr) if (prev_qmd:=self.cmd_idx_to_qmd.get(self._cur_cmd_idx() - 1)) is None: - self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8] - self.q += [nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9] + self.q(nvmethod(1, nv_gpu.NVC6C0_SEND_PCAS_A, 0x1), qmd_addr >> 8) + self.q(nvmethod(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 0x1), 9) else: prev_qmd.dependent_qmd0_pointer = qmd_addr >> 8 prev_qmd.dependent_qmd0_action = 1 @@ -171,9 +172,9 @@ class NVComputeQueue(NVCommandQueue): self.cmd_idx_to_signal_id[self._cur_cmd_idx()] = i return - self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.value_addr), *data64_le(value), - (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP - self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0] + self.q(nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.value_addr), *data64_le(value), + (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)) # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP + self.q(nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0) def _update_signal(self, cmd_idx, signal:Optional[NVSignal]=None, value=None): if (qmd:=self.cmd_idx_to_qmd.get(cmd_idx)) is None: return super()._update_wait(cmd_idx, signal, value) # reuse wait, same offsets to update. @@ -184,17 +185,17 @@ class NVComputeQueue(NVCommandQueue): class NVCopyQueue(NVCommandQueue): def _copy(self, dest, src, copy_size): - self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *data64(src), *data64(dest)] - self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size] - self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH + self.q(nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *data64(src), *data64(dest)) + self.q(nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size) + self.q(nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182) # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH def _update_copy(self, cmd_idx, dest=None, src=None): if dest is not None: self._patch(cmd_idx, offset=3, data=data64(dest)) if src is not None: self._patch(cmd_idx, offset=1, data=data64(src)) def _signal(self, signal, value=0): - self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 3), *data64(signal.value_addr), value] - self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14] + self.q(nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 3), *data64(signal.value_addr), value) + self.q(nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14) def _update_signal(self, cmd_idx, signal=None, value=None): if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.value_addr)) diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index aec2851d83..3ade2d7432 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -36,8 +36,8 @@ class QCOMCompiler(CLCompiler): def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disasm')(lib) class QCOMSignal(HCQSignal): - def __init__(self, value=0, timeline_for_device:Optional[QCOMDevice]=None): - super().__init__(QCOMDevice.signals_pool.pop(), value, timeline_for_device, timestamp_divider=19.2) + def __init__(self, base_addr:Optional[int]=None, **kwargs): + super().__init__(QCOMDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=19.2) def __del__(self): QCOMDevice.signals_pool.append(self.base_addr) @@ -55,9 +55,9 @@ class QCOMComputeQueue(HWQueue): def __del__(self): if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True)) - def cmd(self, opcode: int, *vals: int): self.q += [pkt7_hdr(opcode, len(vals)), *vals] + def cmd(self, opcode: int, *vals: int): self.q(pkt7_hdr(opcode, len(vals)), *vals) - def reg(self, reg: int, *vals: int): self.q += [pkt4_hdr(reg, len(vals)), *vals] + def reg(self, reg: int, *vals: int): self.q(pkt4_hdr(reg, len(vals)), *vals) def _cache_flush(self, write_back=True, invalidate=False, sync=True, memsync=False): # TODO: 7xx support. @@ -93,18 +93,18 @@ class QCOMComputeQueue(HWQueue): if value is not None: self._patch(cmd_idx, offset=4, data=[value & 0xFFFFFFFF]) def _build_gpu_command(self, dev:QCOMDevice, hw_addr=None): - to_mv((hw_page_addr:=hw_addr or dev._alloc_cmd_buf(len(self.q) * 4)), len(self.q) * 4).cast('I')[:] = array.array('I', self.q) - obj = kgsl.struct_kgsl_command_object(gpuaddr=hw_page_addr, size=len(self.q) * 4, flags=kgsl.KGSL_CMDLIST_IB) + to_mv((hw_page_addr:=hw_addr or dev._alloc_cmd_buf(len(self._q) * 4)), len(self._q) * 4).cast('I')[:] = array.array('I', self._q) + obj = kgsl.struct_kgsl_command_object(gpuaddr=hw_page_addr, size=len(self._q) * 4, flags=kgsl.KGSL_CMDLIST_IB) submit_req = kgsl.struct_kgsl_gpu_command(cmdlist=ctypes.addressof(obj), numcmds=1, context_id=dev.ctx, cmdsize=ctypes.sizeof(kgsl.struct_kgsl_command_object)) return submit_req, obj def bind(self, dev:QCOMDevice): self.binded_device = dev - self.hw_page = dev.allocator.alloc(len(self.q) * 4, BufferSpec(cpu_access=True, nolru=True)) + self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True)) self.submit_req, self.obj = self._build_gpu_command(self.binded_device, self.hw_page.va_addr) # From now on, the queue is on the device for faster submission. - self.q = to_mv(self.obj.gpuaddr, len(self.q) * 4).cast("I") # type: ignore + self._q = to_mv(self.obj.gpuaddr, len(self._q) * 4).cast("I") # type: ignore def _submit(self, dev:QCOMDevice): if self.binded_device == dev: submit_req = self.submit_req diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 0ceb2aadf9..bfd86bee00 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -26,9 +26,9 @@ def hcq_command(func: Callable[Concatenate[QueueType, P], None]) -> Callable[Con """ @functools.wraps(func) def __wrapper(self:QueueType, *args:P.args, **kwargs:P.kwargs) -> QueueType: - self.cmds_offset.append(len(self.q)) + self.cmds_offset.append(len(self._q)) func(self, *args, **kwargs) - self.cmds_len.append(len(self.q) - self.cmds_offset[-1]) + self.cmds_len.append(len(self._q) - self.cmds_offset[-1]) self.cmds_meta.append(func.__name__) return self return __wrapper @@ -39,9 +39,11 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]): Both compute and copy queues should have the following commands implemented. """ - def __init__(self): self.q, self.binded_device, self.cmds_offset, self.cmds_len, self.cmds_meta = [], None, [], [], [] + def __init__(self): self._q, self.binded_device, self.cmds_offset, self.cmds_len, self.cmds_meta = [], None, [], [], [] + def q(self, *args) -> None: self._q.extend(args) + def __len__(self): return len(self.cmds_offset) - def _patch(self, cmd_idx, offset, data): self.q[(st:=self.cmds_offset[cmd_idx]+offset):st+len(data)] = array.array('I', data) + def _patch(self, cmd_idx, offset, data): self._q[(st:=self.cmds_offset[cmd_idx]+offset):st+len(data)] = array.array('I', data) def _cur_cmd_idx(self) -> int: """ Returns the index of the command currently being enqueued. @@ -135,7 +137,7 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]): Args: dev: The device to submit the queue to """ - if self.q: self._submit(dev) + if self._q: self._submit(dev) return self def _submit(self, dev:DeviceType): raise NotImplementedError("backend should overload this function") @@ -209,7 +211,7 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]): raise NotImplementedError("backend should overload this function") class HCQSignal(Generic[DeviceType]): - def __init__(self, base_addr:int, value:int=0, timeline_for_device:Optional[DeviceType]=None, timestamp_divider=1, value_off=0, timestamp_off=8): + def __init__(self, base_addr:int=0, value:int=0, timeline_for_device:Optional[DeviceType]=None, timestamp_divider=1, value_off=0, timestamp_off=8): self.base_addr, self.value_addr, self.timestamp_addr = base_addr, base_addr+value_off, base_addr+timestamp_off self.timestamp_divider:decimal.Decimal = decimal.Decimal(timestamp_divider) self.timeline_for_device:Optional[DeviceType] = timeline_for_device @@ -371,10 +373,11 @@ class HCQCompiled(Compiled, Generic[SignalType]): def __init__(self, device:str, allocator:HCQAllocator, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType], comp_queue_t:Type[HWQueue], copy_queue_t:Optional[Type[HWQueue]]): + self.device_id:int = int(device.split(":")[1]) if ":" in device else 0 self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t self.timeline_value:int = 1 - self.timeline_signal:SignalType = self.signal_t(0, timeline_for_device=self) - self._shadow_timeline_signal:SignalType = self.signal_t(0, timeline_for_device=self) + self.timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self) + self._shadow_timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self) self.sig_prof_records:List[Tuple[HCQSignal, HCQSignal, str, bool]] = [] self.raw_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[Dict]]] = [] self.dep_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = []