diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 2cfd189730..5402f5b470 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -1,6 +1,6 @@ from __future__ import annotations from typing import Tuple, List, Any, Optional -import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal, sys +import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, array, contextlib, sys assert sys.platform != 'win32' from dataclasses import dataclass from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram @@ -28,25 +28,15 @@ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2 class AMDSignal(HCQSignal): - def __init__(self, value=0, is_timeline=False): - super().__init__(AMDDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(100), value_off=0, timestamp_off=8) - - if is_timeline: - self._event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1) - self._event_mailbox_ptr = AMDDevice.event_page.va_addr + self._event.event_slot_index*8 - self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event.event_id) + def __init__(self, value=0, timeline_for_device:Optional[AMDDevice]=None): + super().__init__(AMDDevice.signals_pool.pop(), value, timeline_for_device, timestamp_divider=100, value_off=0, timestamp_off=8) def __del__(self): AMDDevice.signals_pool.append(self.base_addr) - def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)): - start_time = time.time() * 1000 - while (time_spent:=time.time() * 1000 - start_time) < timeout: - if self.value >= value: return - - # Wait active for 5s, then going to sleep. 
- if time_spent > 5000 and self.is_timeline: - kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000) - raise RuntimeError(f"wait_signal: not set to {value}, but {self.value}, {timeout} ms TIMEOUT!") + def _sleep(self, time_spent_waiting_ms:int): + # Reasonable to sleep for long workloads (which take more than 2s) and only timeline signals. + if time_spent_waiting_ms > 2000 and self.timeline_for_device is not None: + kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=self.timeline_for_device.queue_event_arr_ptr, num_events=1, wait_for_all=1, timeout=200) class AMDComputeQueue(HWQueue): def __init__(self): @@ -144,9 +134,9 @@ class AMDComputeQueue(HWQueue): self.release_mem(signal.value_addr, value, amd_gpu.data_sel__mec_release_mem__send_32_bit_low, amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True) - if signal.is_timeline: - self.release_mem(signal._event_mailbox_ptr, signal._event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low, - amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=signal._event.event_id) + if (dev:=signal.timeline_for_device) is not None: + self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low, + amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id) def _update_wait(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None): if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal.value_addr)) @@ -156,10 +146,6 @@ class AMDComputeQueue(HWQueue): if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal.value_addr)) if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value)) - # Check if the signal command has mailptr part - if signal is not None and self.cmds_len[cmd_idx] > 8: - self._patch(cmd_idx, offset=11, 
data=[*data64_le(signal._event_mailbox_ptr), *data64_le(signal._event.event_id), signal._event.event_id]) - def bind(self, dev:AMDDevice): self.binded_device = dev self.hw_page = dev.allocator.alloc(len(self.q) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True)) @@ -208,9 +194,9 @@ class AMDCopyQueue(HWQueue): def _signal(self, signal:AMDSignal, value=0): self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value]) - if signal.is_timeline: - self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._event_mailbox_ptr), signal._event.event_id]) - self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal._event.event_id)]) + if (dev:=signal.timeline_for_device) is not None: + self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id]) + self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id)]) def _wait(self, signal:AMDSignal, value=0): self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \ @@ -423,6 +409,11 @@ class AMDDevice(HCQCompiled): eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size) self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000) + self.queue_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_SIGNAL, auto_reset=1) + self.queue_event_mailbox_ptr = AMDDevice.event_page.va_addr + self.queue_event.event_slot_index * 8 + self.queue_event_arr = (kfd.struct_kfd_event_data)(event_id=self.queue_event.event_id) + self.queue_event_arr_ptr = ctypes.addressof(self.queue_event_arr) + self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY) self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION) diff --git 
a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index e82e0a0517..63565852b4 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -1,5 +1,5 @@ from __future__ import annotations -import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, decimal, sys +import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys assert sys.platform != 'win32' from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional from dataclasses import dataclass @@ -74,8 +74,8 @@ assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4 def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2) class NVSignal(HCQSignal): - def __init__(self, value=0, is_timeline=False): - super().__init__(NVDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(1000), value_off=0, timestamp_off=8) + def __init__(self, value=0, timeline_for_device:Optional[NVDevice]=None): + super().__init__(NVDevice.signals_pool.pop(), value, timeline_for_device, timestamp_divider=1000, value_off=0, timestamp_off=8) def __del__(self): NVDevice.signals_pool.append(self.base_addr) class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']): diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index 1de13f737b..aec2851d83 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -1,5 +1,5 @@ from __future__ import annotations -import os, ctypes, functools, mmap, struct, array, decimal, math, sys +import os, ctypes, functools, mmap, struct, array, math, sys assert sys.platform != 'win32' from types import SimpleNamespace from typing import Tuple, List, Any, cast, Optional @@ -36,9 +36,17 @@ class QCOMCompiler(CLCompiler): def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disasm')(lib) class QCOMSignal(HCQSignal): - def __init__(self, value=0, is_timeline=False): super().__init__(QCOMDevice.signals_pool.pop(), value, is_timeline, 
decimal.Decimal(19.2)) + def __init__(self, value=0, timeline_for_device:Optional[QCOMDevice]=None): + super().__init__(QCOMDevice.signals_pool.pop(), value, timeline_for_device, timestamp_divider=19.2) + def __del__(self): QCOMDevice.signals_pool.append(self.base_addr) + def _sleep(self, time_spent_waiting_ms:int): + # Sleep only for timeline signals. Do it immediately to free the CPU. + if self.timeline_for_device is not None: + kgsl.IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(self.timeline_for_device.fd, context_id=self.timeline_for_device.ctx, + timestamp=self.timeline_for_device.last_cmd, timeout=0xffffffff) + class QCOMComputeQueue(HWQueue): def __init__(self): self.cmd_idx_to_dims = {} @@ -397,5 +405,3 @@ class QCOMDevice(HCQCompiled): self.synchronize() self._gpu_free(self._stack) self._stack = self._gpu_alloc(sz) - - def _syncdev(self): kgsl.IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(self.fd, context_id=self.ctx, timestamp=self.last_cmd, timeout=0xffffffff) diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 32c48f9ccc..0ceb2aadf9 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -208,10 +208,13 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]): def _update_copy(self, cmd_idx:int, dest:Optional[int], src:Optional[int]): raise NotImplementedError("backend should overload this function") -class HCQSignal: - def __init__(self, base_addr:int, value:int=0, is_timeline:bool=False, timestamp_divider=decimal.Decimal(1), value_off=0, timestamp_off=8): - self.base_addr, self.value_addr, self.timestamp_addr, self.ts_divider = base_addr, base_addr+value_off, base_addr+timestamp_off, timestamp_divider - self.value_mv, self.timestamp_mv, self.is_timeline = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q'), is_timeline +class HCQSignal(Generic[DeviceType]): + def __init__(self, base_addr:int, value:int=0, timeline_for_device:Optional[DeviceType]=None, 
timestamp_divider=1, value_off=0, timestamp_off=8): + self.base_addr, self.value_addr, self.timestamp_addr = base_addr, base_addr+value_off, base_addr+timestamp_off + self.timestamp_divider:decimal.Decimal = decimal.Decimal(timestamp_divider) + self.timeline_for_device:Optional[DeviceType] = timeline_for_device + + self.value_mv, self.timestamp_mv = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q') self.value_mv[0] = value @property @@ -230,7 +233,12 @@ class HCQSignal: Returns: The timestamp in microseconds. """ - return self.timestamp_mv[0] / self.ts_divider + return self.timestamp_mv[0] / self.timestamp_divider + + def _sleep(self, time_spent_waiting_ms:int): + """ + Optional function which can implement sleep functionality for the signal. + """ def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)): """ @@ -240,9 +248,10 @@ class HCQSignal: value: The value to wait for. timeout: Maximum time to wait in milliseconds. Defaults to 10s. """ - start_time = time.time() * 1000 - while time.time() * 1000 - start_time < timeout: + start_time = int(time.time() * 1000) + while (time_spent:=int(time.time() * 1000) - start_time) < timeout: if self.value >= value: return + self._sleep(time_spent) raise RuntimeError(f"Wait timeout: {timeout} ms! 
(the signal is not set to {value}, but {self.value})") @contextlib.contextmanager @@ -364,8 +373,8 @@ class HCQCompiled(Compiled, Generic[SignalType]): comp_queue_t:Type[HWQueue], copy_queue_t:Optional[Type[HWQueue]]): self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t self.timeline_value:int = 1 - self.timeline_signal:SignalType = self.signal_t(0, is_timeline=True) - self._shadow_timeline_signal:SignalType = self.signal_t(0, is_timeline=True) + self.timeline_signal:SignalType = self.signal_t(0, timeline_for_device=self) + self._shadow_timeline_signal:SignalType = self.signal_t(0, timeline_for_device=self) self.sig_prof_records:List[Tuple[HCQSignal, HCQSignal, str, bool]] = [] self.raw_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[Dict]]] = [] self.dep_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = [] @@ -379,7 +388,7 @@ class HCQCompiled(Compiled, Generic[SignalType]): self.devices.append(self) def synchronize(self): - try: self.timeline_signal.wait(self.timeline_value - 1) if not hasattr(self, '_syncdev') else self._syncdev() + try: self.timeline_signal.wait(self.timeline_value - 1) except RuntimeError as e: if hasattr(self, 'on_device_hang'): self.on_device_hang() else: raise e