hcq signal add sleep (#7955)

* hcqsignal sleep

* fixes

* typing

* time ms is int
This commit is contained in:
nimlgen
2024-11-29 14:04:45 +03:00
committed by GitHub
parent 30f0e95fbd
commit 309dcb1044
4 changed files with 50 additions and 44 deletions

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Tuple, List, Any, Optional
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal, sys
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, array, contextlib, sys
assert sys.platform != 'win32'
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram
@@ -28,25 +28,15 @@ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
class AMDSignal(HCQSignal):
def __init__(self, value=0, is_timeline=False):
super().__init__(AMDDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(100), value_off=0, timestamp_off=8)
if is_timeline:
self._event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1)
self._event_mailbox_ptr = AMDDevice.event_page.va_addr + self._event.event_slot_index*8
self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event.event_id)
def __init__(self, value=0, timeline_for_device:Optional[AMDDevice]=None):
super().__init__(AMDDevice.signals_pool.pop(), value, timeline_for_device, timestamp_divider=100, value_off=0, timestamp_off=8)
def __del__(self): AMDDevice.signals_pool.append(self.base_addr)
def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)):
start_time = time.time() * 1000
while (time_spent:=time.time() * 1000 - start_time) < timeout:
if self.value >= value: return
# Wait active for 5s, then going to sleep.
if time_spent > 5000 and self.is_timeline:
kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
raise RuntimeError(f"wait_signal: not set to {value}, but {self.value}, {timeout} ms TIMEOUT!")
# Sleep hook for AMD signals: may block on the owning device's queue event via AMDKFD_IOC_WAIT_EVENTS.
# time_spent_waiting_ms: how long the caller has already been waiting on this signal, in milliseconds.
def _sleep(self, time_spent_waiting_ms:int):
# Sleeping is only reasonable for long workloads (more than 2s spent waiting) and only for timeline signals.
if time_spent_waiting_ms > 2000 and self.timeline_for_device is not None:
# Waits on the single event at the device's queue_event_arr_ptr.
# NOTE(review): timeout=200 is presumably in milliseconds -- confirm against the KFD wait-events ioctl definition.
kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=self.timeline_for_device.queue_event_arr_ptr, num_events=1, wait_for_all=1, timeout=200)
class AMDComputeQueue(HWQueue):
def __init__(self):
@@ -144,9 +134,9 @@ class AMDComputeQueue(HWQueue):
self.release_mem(signal.value_addr, value, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
if signal.is_timeline:
self.release_mem(signal._event_mailbox_ptr, signal._event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=signal._event.event_id)
if (dev:=signal.timeline_for_device) is not None:
self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
def _update_wait(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None):
if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal.value_addr))
@@ -156,10 +146,6 @@ class AMDComputeQueue(HWQueue):
if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal.value_addr))
if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))
# Check if the signal command has mailptr part
if signal is not None and self.cmds_len[cmd_idx] > 8:
self._patch(cmd_idx, offset=11, data=[*data64_le(signal._event_mailbox_ptr), *data64_le(signal._event.event_id), signal._event.event_id])
def bind(self, dev:AMDDevice):
self.binded_device = dev
self.hw_page = dev.allocator.alloc(len(self.q) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
@@ -208,9 +194,9 @@ class AMDCopyQueue(HWQueue):
def _signal(self, signal:AMDSignal, value=0):
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value])
if signal.is_timeline:
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._event_mailbox_ptr), signal._event.event_id])
self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal._event.event_id)])
if (dev:=signal.timeline_for_device) is not None:
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id])
self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id)])
def _wait(self, signal:AMDSignal, value=0):
self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
@@ -423,6 +409,11 @@ class AMDDevice(HCQCompiled):
eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size)
self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
self.queue_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_SIGNAL, auto_reset=1)
self.queue_event_mailbox_ptr = AMDDevice.event_page.va_addr + self.queue_event.event_slot_index * 8
self.queue_event_arr = (kfd.struct_kfd_event_data)(event_id=self.queue_event.event_id)
self.queue_event_arr_ptr = ctypes.addressof(self.queue_event_arr)
self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)

View File

@@ -1,5 +1,5 @@
from __future__ import annotations
import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, decimal, sys
import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys
assert sys.platform != 'win32'
from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional
from dataclasses import dataclass
@@ -74,8 +74,8 @@ assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
class NVSignal(HCQSignal):
def __init__(self, value=0, is_timeline=False):
super().__init__(NVDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(1000), value_off=0, timestamp_off=8)
def __init__(self, value=0, timeline_for_device:Optional[NVDevice]=None):
super().__init__(NVDevice.signals_pool.pop(), value, timeline_for_device, timestamp_divider=1000, value_off=0, timestamp_off=8)
def __del__(self): NVDevice.signals_pool.append(self.base_addr)
class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):

View File

@@ -1,5 +1,5 @@
from __future__ import annotations
import os, ctypes, functools, mmap, struct, array, decimal, math, sys
import os, ctypes, functools, mmap, struct, array, math, sys
assert sys.platform != 'win32'
from types import SimpleNamespace
from typing import Tuple, List, Any, cast, Optional
@@ -36,9 +36,17 @@ class QCOMCompiler(CLCompiler):
def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disasm')(lib)
class QCOMSignal(HCQSignal):
def __init__(self, value=0, is_timeline=False): super().__init__(QCOMDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(19.2))
def __init__(self, value=0, timeline_for_device:Optional[QCOMDevice]=None):
super().__init__(QCOMDevice.signals_pool.pop(), value, timeline_for_device, timestamp_divider=19.2)
def __del__(self): QCOMDevice.signals_pool.append(self.base_addr)
# Sleep hook for QCOM signals: blocks in the KGSL wait-timestamp ioctl on the owning device's context.
# time_spent_waiting_ms: elapsed wait time in milliseconds; not consulted here, so the wait is issued regardless of it.
def _sleep(self, time_spent_waiting_ms:int):
# Sleep only for timeline signals. Do it immediately to free the CPU.
if self.timeline_for_device is not None:
# Waits for the device's last_cmd timestamp on its context.
# NOTE(review): timeout=0xffffffff looks like an "effectively unbounded" wait -- confirm against the KGSL ioctl definition.
kgsl.IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(self.timeline_for_device.fd, context_id=self.timeline_for_device.ctx,
timestamp=self.timeline_for_device.last_cmd, timeout=0xffffffff)
class QCOMComputeQueue(HWQueue):
def __init__(self):
self.cmd_idx_to_dims = {}
@@ -397,5 +405,3 @@ class QCOMDevice(HCQCompiled):
self.synchronize()
self._gpu_free(self._stack)
self._stack = self._gpu_alloc(sz)
def _syncdev(self): kgsl.IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(self.fd, context_id=self.ctx, timestamp=self.last_cmd, timeout=0xffffffff)

View File

@@ -208,10 +208,13 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
def _update_copy(self, cmd_idx:int, dest:Optional[int], src:Optional[int]):
raise NotImplementedError("backend should overload this function")
class HCQSignal:
def __init__(self, base_addr:int, value:int=0, is_timeline:bool=False, timestamp_divider=decimal.Decimal(1), value_off=0, timestamp_off=8):
self.base_addr, self.value_addr, self.timestamp_addr, self.ts_divider = base_addr, base_addr+value_off, base_addr+timestamp_off, timestamp_divider
self.value_mv, self.timestamp_mv, self.is_timeline = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q'), is_timeline
class HCQSignal(Generic[DeviceType]):
def __init__(self, base_addr:int, value:int=0, timeline_for_device:Optional[DeviceType]=None, timestamp_divider=1, value_off=0, timestamp_off=8):
self.base_addr, self.value_addr, self.timestamp_addr = base_addr, base_addr+value_off, base_addr+timestamp_off
self.timestamp_divider:decimal.Decimal = decimal.Decimal(timestamp_divider)
self.timeline_for_device:Optional[DeviceType] = timeline_for_device
self.value_mv, self.timestamp_mv = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q')
self.value_mv[0] = value
@property
@@ -230,7 +233,12 @@ class HCQSignal:
Returns:
The timestamp in microseconds.
"""
return self.timestamp_mv[0] / self.ts_divider
return self.timestamp_mv[0] / self.timestamp_divider
# Hook called by wait() on each polling iteration in which the signal has not yet reached the requested value.
# time_spent_waiting_ms: milliseconds elapsed since wait() started polling.
# The base implementation has no body beyond its docstring, so it does nothing; subclasses may override it to block.
def _sleep(self, time_spent_waiting_ms:int):
"""
Optional function which can implement sleep functionality for the signal.
"""
def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)):
"""
@@ -240,9 +248,10 @@ class HCQSignal:
value: The value to wait for.
timeout: Maximum time to wait in milliseconds. Defaults to HCQDEV_WAIT_TIMEOUT_MS, or 30000 ms (30s) when unset.
"""
start_time = time.time() * 1000
while time.time() * 1000 - start_time < timeout:
start_time = int(time.time() * 1000)
while (time_spent:=int(time.time() * 1000) - start_time) < timeout:
if self.value >= value: return
self._sleep(time_spent)
raise RuntimeError(f"Wait timeout: {timeout} ms! (the signal is not set to {value}, but {self.value})")
@contextlib.contextmanager
@@ -364,8 +373,8 @@ class HCQCompiled(Compiled, Generic[SignalType]):
comp_queue_t:Type[HWQueue], copy_queue_t:Optional[Type[HWQueue]]):
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
self.timeline_value:int = 1
self.timeline_signal:SignalType = self.signal_t(0, is_timeline=True)
self._shadow_timeline_signal:SignalType = self.signal_t(0, is_timeline=True)
self.timeline_signal:SignalType = self.signal_t(0, timeline_for_device=self)
self._shadow_timeline_signal:SignalType = self.signal_t(0, timeline_for_device=self)
self.sig_prof_records:List[Tuple[HCQSignal, HCQSignal, str, bool]] = []
self.raw_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[Dict]]] = []
self.dep_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = []
@@ -379,7 +388,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
self.devices.append(self)
def synchronize(self):
try: self.timeline_signal.wait(self.timeline_value - 1) if not hasattr(self, '_syncdev') else self._syncdev()
try: self.timeline_signal.wait(self.timeline_value - 1)
except RuntimeError as e:
if hasattr(self, 'on_device_hang'): self.on_device_hang()
else: raise e