diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 9ebcb56659..379a60df73 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -32,27 +32,24 @@ def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2 class AMDSignal(HCQSignal): def __init__(self, value=0, is_timeline=False): - self._signal = AMDDevice.signals_pool.pop() - self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8 + super().__init__(AMDDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(100), value_off=0, timestamp_off=8) + if is_timeline: self._event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1) self._event_mailbox_ptr = AMDDevice.event_page.va_addr + self._event.event_slot_index*8 self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event.event_id) - else: self._event_mailbox_ptr = 0 - super().__init__(value) - def __del__(self): AMDDevice.signals_pool.append(self._signal) - def _get_value(self) -> int: return self._signal[0] - def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(100) - def _set_value(self, new_value:int): self._signal[0] = new_value + + def __del__(self): AMDDevice.signals_pool.append(self.base_addr) + def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)): start_time = time.time() * 1000 while (time_spent:=time.time() * 1000 - start_time) < timeout: - if self._signal[0] >= value: return + if self.value >= value: return # Wait active for 5s, then going to sleep. - if time_spent > 5000 and self._event_mailbox_ptr != 0: + if time_spent > 5000 and self.is_timeline: kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000) - raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!") + raise RuntimeError(f"wait_signal: not set to {value}, but {self.value}, {timeout} ms TIMEOUT!") class AMDComputeQueue(HWQueue): def __init__(self): @@ -140,24 +137,24 @@ class AMDComputeQueue(HWQueue): def _wait(self, signal:AMDSignal, value=0): self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \ - amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal._value_addr), value, 0xffffffff, 4] + amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal.value_addr), value, 0xffffffff, 4] def _timestamp(self, signal:AMDSignal): - self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal._timestamp_addr) + self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal.timestamp_addr) def _signal(self, signal:AMDSignal, value=0): # NOTE: this needs an EOP buffer on the queue or it will NULL pointer - self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._value_addr, value=value, cache_flush=True) - if signal._event_mailbox_ptr != 0: + self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.value_addr, value=value, cache_flush=True) + if signal.is_timeline: self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._event_mailbox_ptr, value=signal._event.event_id, cst=signal._event.event_id, cache_flush=False) def _update_wait(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None): - if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal._value_addr)) + if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal.value_addr)) if value is not None: self._patch(cmd_idx, offset=4, data=[value]) def _update_signal(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None): - if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal._value_addr)) + if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal.value_addr)) if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value)) # Check if the signal command has mailptr part @@ -210,27 +207,27 @@ class AMDCopyQueue(HWQueue): if dest is not None: self._patch(cmd_idx, offset=5+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)]) def _signal(self, signal:AMDSignal, value=0): - self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._value_addr), value]) + self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value]) - if signal._event_mailbox_ptr != 0: + if signal.is_timeline: self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._event_mailbox_ptr), signal._event.event_id]) self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal._event.event_id)]) def _wait(self, signal:AMDSignal, value=0): self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \ - amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal._value_addr), value, 0xffffffff, + amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff, amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)]) def _update_signal(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None): return self._update_wait(cmd_idx, signal, value) # the same offsets and commands def _update_wait(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None): - if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal._value_addr)) + if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal.value_addr)) if value is not None: self._patch(cmd_idx, offset=3, data=[value]) def _timestamp(self, signal:AMDSignal): self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL), - *data64_le(signal._timestamp_addr)]) + *data64_le(signal.timestamp_addr)]) def _submit(self, dev:AMDDevice): if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun") @@ -333,7 +330,7 @@ class AMDDevice(HCQCompiled): kfd:int = -1 event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args] signals_page:Any = None - signals_pool:List[memoryview] = [] + signals_pool:List[int] = [] gpus:List[pathlib.Path] = [] def _gpu_map(self, mem): @@ -399,7 +396,7 @@ class AMDDevice(HCQCompiled): if AMDDevice.event_page is None: AMDDevice.signals_page = self._gpu_alloc(16 * 65536, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) - AMDDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, AMDDevice.signals_page.size, 16)] + AMDDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)] kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle) else: self._gpu_map(AMDDevice.signals_page) diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index c1bc069bc5..e82e0a0517 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -75,13 +75,8 @@ def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc class NVSignal(HCQSignal): def __init__(self, value=0, is_timeline=False): - self._signal = NVDevice.signals_pool.pop() - self.signal_addr = mv_address(self._signal) - super().__init__(value) - def __del__(self): NVDevice.signals_pool.append(self._signal) - def _get_value(self) -> int: return self._signal[0] - def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(1000) - def _set_value(self, new_value:int): self._signal[0] = new_value + super().__init__(NVDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(1000), value_off=0, timestamp_off=8) + def __del__(self): NVDevice.signals_pool.append(self.base_addr) class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']): def __del__(self): @@ -97,11 +92,11 @@ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']): if local_mem_tpc_bytes: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *data64(local_mem_tpc_bytes), 0xff] def _wait(self, signal:NVSignal, value=0): - self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value), + self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.value_addr), *data64_le(value), (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT def _update_wait(self, cmd_idx, signal=None, value=None): - if signal is not None: self.q[(sigoff:=self.cmds_offset[cmd_idx]+1):sigoff+2] = array.array('I', data64_le(signal.signal_addr)) + if signal is not None: self.q[(sigoff:=self.cmds_offset[cmd_idx]+1):sigoff+2] = array.array('I', data64_le(signal.value_addr)) if value is not None: self.q[(valoff:=self.cmds_offset[cmd_idx]+3):valoff+2] = array.array('I', data64_le(value)) def _timestamp(self, signal): return self._signal(signal, 0) @@ -170,19 +165,19 @@ class NVComputeQueue(NVCommandQueue): for i in range(2): if getattr(prev_qmd, f'release{i}_enable') == 0: setattr(prev_qmd, f'release{i}_enable', 1) - setattr(prev_qmd, f'release{i}_address', signal.signal_addr) + setattr(prev_qmd, f'release{i}_address', signal.value_addr) setattr(prev_qmd, f'release{i}_payload', value) self.cmd_idx_to_qmd[self._cur_cmd_idx()] = prev_qmd self.cmd_idx_to_signal_id[self._cur_cmd_idx()] = i return - self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value), + self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.value_addr), *data64_le(value), (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0] def _update_signal(self, cmd_idx, signal:Optional[NVSignal]=None, value=None): if (qmd:=self.cmd_idx_to_qmd.get(cmd_idx)) is None: return super()._update_wait(cmd_idx, signal, value) # reuse wait, same offsets to update. - if signal is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_address', signal.signal_addr) + if signal is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_address', signal.value_addr) if value is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_payload', value) def _submit(self, dev): self._submit_to_gpfifo(dev, cast(NVDevice, dev).compute_gpfifo) @@ -198,11 +193,11 @@ class NVCopyQueue(NVCommandQueue): if src is not None: self._patch(cmd_idx, offset=1, data=data64(src)) def _signal(self, signal, value=0): - self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 3), *data64(signal.signal_addr), value] + self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 3), *data64(signal.value_addr), value] self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14] def _update_signal(self, cmd_idx, signal=None, value=None): - if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.signal_addr)) + if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.value_addr)) if value is not None: self._patch(cmd_idx, offset=3, data=[value]) def _submit(self, dev): self._submit_to_gpfifo(dev, cast(NVDevice, dev).dma_gpfifo) @@ -316,7 +311,7 @@ class NVDevice(HCQCompiled[NVSignal]): fd_uvm: int = -1 gpus_info: Union[List, ctypes.Array] = [] signals_page: Any = None - signals_pool: List[Any] = [] + signals_pool: List[int] = [] low_uvm_vaddr: int = 0x1000000000 # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings uvm_vaddr: int = 0x2000000000 # 0x2000000000+ host_object_enumerator: int = 0x1000 @@ -467,7 +462,7 @@ class NVDevice(HCQCompiled[NVSignal]): if NVDevice.signals_page is None: NVDevice.signals_page = self._gpu_system_alloc(16 * 65536, map_to_cpu=True) - NVDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.size, 16)] + NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)] else: self._gpu_map(NVDevice.signals_page) channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS) diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index 26478ee26a..1de13f737b 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -36,13 +36,8 @@ class QCOMCompiler(CLCompiler): def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disasm')(lib) class QCOMSignal(HCQSignal): - def __init__(self, value=0, is_timeline=False): - self._signal = QCOMDevice.signals_pool.pop() - super().__init__(value) - def __del__(self): QCOMDevice.signals_pool.append(self._signal) - def _get_value(self) -> int: return self._signal[0] - def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(19.2) # based on the 19.2MHz always-on timer - def _set_value(self, new_value:int): self._signal[0] = new_value + def __init__(self, value=0, is_timeline=False): super().__init__(QCOMDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(19.2)) + def __del__(self): QCOMDevice.signals_pool.append(self.base_addr) class QCOMComputeQueue(HWQueue): def __init__(self): @@ -69,7 +64,7 @@ class QCOMComputeQueue(HWQueue): self.cmd(adreno.CP_WAIT_FOR_IDLE) if QCOMDevice.gpu_id < 700: self.cmd(adreno.CP_EVENT_WRITE, qreg.cp_event_write_0(event=adreno.CACHE_FLUSH_TS, timestamp=ts), - *data64_le(mv_address(signal._signal) + (0 if not ts else 8)), qreg.cp_event_write_3(value & 0xFFFFFFFF)) + *data64_le(signal.timestamp_addr if ts else signal.value_addr), qreg.cp_event_write_3(value & 0xFFFFFFFF)) self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False) else: # TODO: support devices starting with 8 Gen 1. Also, 700th series have convenient CP_GLOBAL_TIMESTAMP and CP_LOCAL_TIMESTAMP @@ -78,15 +73,15 @@ class QCOMComputeQueue(HWQueue): def _timestamp(self, signal:QCOMSignal): return self._signal(signal, 0, ts=True) def _wait(self, signal:QCOMSignal, value=0): - self.cmd(adreno.CP_WAIT_REG_MEM, qreg.cp_wait_reg_mem_0(function=adreno.WRITE_GE, poll=adreno.POLL_MEMORY),*data64_le(mv_address(signal._signal)), + self.cmd(adreno.CP_WAIT_REG_MEM, qreg.cp_wait_reg_mem_0(function=adreno.WRITE_GE, poll=adreno.POLL_MEMORY),*data64_le(signal.value_addr), qreg.cp_wait_reg_mem_3(ref=value&0xFFFFFFFF), qreg.cp_wait_reg_mem_4(mask=0xFFFFFFFF), qreg.cp_wait_reg_mem_5(delay_loop_cycles=32)) def _update_signal(self, cmd_idx, signal:Optional[QCOMSignal], value): - if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(mv_address(signal._signal))) + if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal.value_addr)) if value is not None: self._patch(cmd_idx, offset=5, data=[value & 0xFFFFFFFF]) def _update_wait(self, cmd_idx, signal:Optional[QCOMSignal], value): - if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(mv_address(signal._signal))) + if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal.value_addr)) if value is not None: self._patch(cmd_idx, offset=4, data=[value & 0xFFFFFFFF]) def _build_gpu_command(self, dev:QCOMDevice, hw_addr=None): @@ -343,7 +338,7 @@ class QCOMAllocator(HCQAllocator): class QCOMDevice(HCQCompiled): signals_page: Any = None - signals_pool: List[Any] = [] + signals_pool: List[int] = [] gpu_id: int = 0 dummy_addr: int = 0 @@ -351,7 +346,7 @@ class QCOMDevice(HCQCompiled): self.fd = os.open('/dev/kgsl-3d0', os.O_RDWR) QCOMDevice.dummy_addr = self._gpu_alloc(0x1000).va_addr QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True) - QCOMDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, self.signals_page.size, 16)] + QCOMDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, self.signals_page.size, 16)] info, self.ctx, self.cmd_buf, self.cmd_buf_ptr, self.last_cmd = self._info(), self._ctx_create(), self._gpu_alloc(16 << 20), 0,0 QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF) if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}") diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 01aaaa491e..32c48f9ccc 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -1,7 +1,7 @@ from __future__ import annotations from typing import List, Optional, Dict, Tuple, cast, Protocol, Type, Union, TypeVar, Generic, Callable, ParamSpec, Concatenate import contextlib, decimal, statistics, random, json, atexit, time, array, ctypes, functools -from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv +from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv from tinygrad.renderer import Renderer from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator @@ -209,16 +209,16 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]): raise NotImplementedError("backend should overload this function") class HCQSignal: - def __init__(self, value:int=0, is_timeline:bool=False): self._set_value(value) + def __init__(self, base_addr:int, value:int=0, is_timeline:bool=False, timestamp_divider=decimal.Decimal(1), value_off=0, timestamp_off=8): + self.base_addr, self.value_addr, self.timestamp_addr, self.ts_divider = base_addr, base_addr+value_off, base_addr+timestamp_off, timestamp_divider + self.value_mv, self.timestamp_mv, self.is_timeline = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q'), is_timeline + self.value_mv[0] = value @property - def value(self) -> int: return self._get_value() + def value(self) -> int: return self.value_mv[0] @value.setter - def value(self, new_value:int): self._set_value(new_value) - - def _get_value(self) -> int: raise NotImplementedError("_get_value() method must be implemented") - def _set_value(self, new_value:int): raise NotImplementedError("_set_value() method must be implemented") + def value(self, new_value:int): self.value_mv[0] = new_value @property def timestamp(self) -> decimal.Decimal: @@ -230,8 +230,7 @@ class HCQSignal: Returns: The timestamp in microseconds. """ - return self._get_timestamp() - def _get_timestamp(self) -> decimal.Decimal: raise NotImplementedError("_get_timestamp() method must be implemented") + return self.timestamp_mv[0] / self.ts_divider def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)): """