mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-02-16 01:26:29 -05:00
hcq signal add sleep (#7955)
* hcqsignal sleep * fixes * typing * time ms is int
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
from typing import Tuple, List, Any, Optional
|
||||
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal, sys
|
||||
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, array, contextlib, sys
|
||||
assert sys.platform != 'win32'
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram
|
||||
@@ -28,25 +28,15 @@ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
|
||||
def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
|
||||
|
||||
class AMDSignal(HCQSignal):
|
||||
def __init__(self, value=0, is_timeline=False):
|
||||
super().__init__(AMDDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(100), value_off=0, timestamp_off=8)
|
||||
|
||||
if is_timeline:
|
||||
self._event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1)
|
||||
self._event_mailbox_ptr = AMDDevice.event_page.va_addr + self._event.event_slot_index*8
|
||||
self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event.event_id)
|
||||
def __init__(self, value=0, timeline_for_device:Optional[AMDDevice]=None):
|
||||
super().__init__(AMDDevice.signals_pool.pop(), value, timeline_for_device, timestamp_divider=100, value_off=0, timestamp_off=8)
|
||||
|
||||
def __del__(self): AMDDevice.signals_pool.append(self.base_addr)
|
||||
|
||||
def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)):
|
||||
start_time = time.time() * 1000
|
||||
while (time_spent:=time.time() * 1000 - start_time) < timeout:
|
||||
if self.value >= value: return
|
||||
|
||||
# Wait active for 5s, then going to sleep.
|
||||
if time_spent > 5000 and self.is_timeline:
|
||||
kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
|
||||
raise RuntimeError(f"wait_signal: not set to {value}, but {self.value}, {timeout} ms TIMEOUT!")
|
||||
def _sleep(self, time_spent_waiting_ms:int):
|
||||
# Reasonable to sleep only for long workloads (which take more than 2s) and only for timeline signals.
|
||||
if time_spent_waiting_ms > 2000 and self.timeline_for_device is not None:
|
||||
kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=self.timeline_for_device.queue_event_arr_ptr, num_events=1, wait_for_all=1, timeout=200)
|
||||
|
||||
class AMDComputeQueue(HWQueue):
|
||||
def __init__(self):
|
||||
@@ -144,9 +134,9 @@ class AMDComputeQueue(HWQueue):
|
||||
self.release_mem(signal.value_addr, value, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
|
||||
amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
|
||||
|
||||
if signal.is_timeline:
|
||||
self.release_mem(signal._event_mailbox_ptr, signal._event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
|
||||
amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=signal._event.event_id)
|
||||
if (dev:=signal.timeline_for_device) is not None:
|
||||
self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
|
||||
amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
|
||||
|
||||
def _update_wait(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None):
|
||||
if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal.value_addr))
|
||||
@@ -156,10 +146,6 @@ class AMDComputeQueue(HWQueue):
|
||||
if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal.value_addr))
|
||||
if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))
|
||||
|
||||
# Check if the signal command has mailptr part
|
||||
if signal is not None and self.cmds_len[cmd_idx] > 8:
|
||||
self._patch(cmd_idx, offset=11, data=[*data64_le(signal._event_mailbox_ptr), *data64_le(signal._event.event_id), signal._event.event_id])
|
||||
|
||||
def bind(self, dev:AMDDevice):
|
||||
self.binded_device = dev
|
||||
self.hw_page = dev.allocator.alloc(len(self.q) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
|
||||
@@ -208,9 +194,9 @@ class AMDCopyQueue(HWQueue):
|
||||
def _signal(self, signal:AMDSignal, value=0):
|
||||
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value])
|
||||
|
||||
if signal.is_timeline:
|
||||
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._event_mailbox_ptr), signal._event.event_id])
|
||||
self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal._event.event_id)])
|
||||
if (dev:=signal.timeline_for_device) is not None:
|
||||
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id])
|
||||
self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id)])
|
||||
|
||||
def _wait(self, signal:AMDSignal, value=0):
|
||||
self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
|
||||
@@ -423,6 +409,11 @@ class AMDDevice(HCQCompiled):
|
||||
eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size)
|
||||
self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
|
||||
|
||||
self.queue_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_SIGNAL, auto_reset=1)
|
||||
self.queue_event_mailbox_ptr = AMDDevice.event_page.va_addr + self.queue_event.event_slot_index * 8
|
||||
self.queue_event_arr = (kfd.struct_kfd_event_data)(event_id=self.queue_event.event_id)
|
||||
self.queue_event_arr_ptr = ctypes.addressof(self.queue_event_arr)
|
||||
|
||||
self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
|
||||
self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from __future__ import annotations
|
||||
import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, decimal, sys
|
||||
import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys
|
||||
assert sys.platform != 'win32'
|
||||
from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional
|
||||
from dataclasses import dataclass
|
||||
@@ -74,8 +74,8 @@ assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
|
||||
def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
|
||||
|
||||
class NVSignal(HCQSignal):
|
||||
def __init__(self, value=0, is_timeline=False):
|
||||
super().__init__(NVDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(1000), value_off=0, timestamp_off=8)
|
||||
def __init__(self, value=0, timeline_for_device:Optional[NVDevice]=None):
|
||||
super().__init__(NVDevice.signals_pool.pop(), value, timeline_for_device, timestamp_divider=1000, value_off=0, timestamp_off=8)
|
||||
def __del__(self): NVDevice.signals_pool.append(self.base_addr)
|
||||
|
||||
class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from __future__ import annotations
|
||||
import os, ctypes, functools, mmap, struct, array, decimal, math, sys
|
||||
import os, ctypes, functools, mmap, struct, array, math, sys
|
||||
assert sys.platform != 'win32'
|
||||
from types import SimpleNamespace
|
||||
from typing import Tuple, List, Any, cast, Optional
|
||||
@@ -36,9 +36,17 @@ class QCOMCompiler(CLCompiler):
|
||||
def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disasm')(lib)
|
||||
|
||||
class QCOMSignal(HCQSignal):
|
||||
def __init__(self, value=0, is_timeline=False): super().__init__(QCOMDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(19.2))
|
||||
def __init__(self, value=0, timeline_for_device:Optional[QCOMDevice]=None):
|
||||
super().__init__(QCOMDevice.signals_pool.pop(), value, timeline_for_device, timestamp_divider=19.2)
|
||||
|
||||
def __del__(self): QCOMDevice.signals_pool.append(self.base_addr)
|
||||
|
||||
def _sleep(self, time_spent_waiting_ms:int):
|
||||
# Sleep only for timeline signals. Do it immediately to free the CPU.
|
||||
if self.timeline_for_device is not None:
|
||||
kgsl.IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(self.timeline_for_device.fd, context_id=self.timeline_for_device.ctx,
|
||||
timestamp=self.timeline_for_device.last_cmd, timeout=0xffffffff)
|
||||
|
||||
class QCOMComputeQueue(HWQueue):
|
||||
def __init__(self):
|
||||
self.cmd_idx_to_dims = {}
|
||||
@@ -397,5 +405,3 @@ class QCOMDevice(HCQCompiled):
|
||||
self.synchronize()
|
||||
self._gpu_free(self._stack)
|
||||
self._stack = self._gpu_alloc(sz)
|
||||
|
||||
def _syncdev(self): kgsl.IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(self.fd, context_id=self.ctx, timestamp=self.last_cmd, timeout=0xffffffff)
|
||||
|
||||
@@ -208,10 +208,13 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
|
||||
def _update_copy(self, cmd_idx:int, dest:Optional[int], src:Optional[int]):
|
||||
raise NotImplementedError("backend should overload this function")
|
||||
|
||||
class HCQSignal:
|
||||
def __init__(self, base_addr:int, value:int=0, is_timeline:bool=False, timestamp_divider=decimal.Decimal(1), value_off=0, timestamp_off=8):
|
||||
self.base_addr, self.value_addr, self.timestamp_addr, self.ts_divider = base_addr, base_addr+value_off, base_addr+timestamp_off, timestamp_divider
|
||||
self.value_mv, self.timestamp_mv, self.is_timeline = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q'), is_timeline
|
||||
class HCQSignal(Generic[DeviceType]):
|
||||
def __init__(self, base_addr:int, value:int=0, timeline_for_device:Optional[DeviceType]=None, timestamp_divider=1, value_off=0, timestamp_off=8):
|
||||
self.base_addr, self.value_addr, self.timestamp_addr = base_addr, base_addr+value_off, base_addr+timestamp_off
|
||||
self.timestamp_divider:decimal.Decimal = decimal.Decimal(timestamp_divider)
|
||||
self.timeline_for_device:Optional[DeviceType] = timeline_for_device
|
||||
|
||||
self.value_mv, self.timestamp_mv = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q')
|
||||
self.value_mv[0] = value
|
||||
|
||||
@property
|
||||
@@ -230,7 +233,12 @@ class HCQSignal:
|
||||
Returns:
|
||||
The timestamp in microseconds.
|
||||
"""
|
||||
return self.timestamp_mv[0] / self.ts_divider
|
||||
return self.timestamp_mv[0] / self.timestamp_divider
|
||||
|
||||
def _sleep(self, time_spent_waiting_ms:int):
|
||||
"""
|
||||
Optional function which can implement sleep functionality for the signal.
|
||||
"""
|
||||
|
||||
def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)):
|
||||
"""
|
||||
@@ -240,9 +248,10 @@ class HCQSignal:
|
||||
value: The value to wait for.
|
||||
timeout: Maximum time to wait in milliseconds. Defaults to 30s (30000 ms), overridable via the HCQDEV_WAIT_TIMEOUT_MS environment variable.
|
||||
"""
|
||||
start_time = time.time() * 1000
|
||||
while time.time() * 1000 - start_time < timeout:
|
||||
start_time = int(time.time() * 1000)
|
||||
while (time_spent:=int(time.time() * 1000) - start_time) < timeout:
|
||||
if self.value >= value: return
|
||||
self._sleep(time_spent)
|
||||
raise RuntimeError(f"Wait timeout: {timeout} ms! (the signal is not set to {value}, but {self.value})")
|
||||
|
||||
@contextlib.contextmanager
|
||||
@@ -364,8 +373,8 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
comp_queue_t:Type[HWQueue], copy_queue_t:Optional[Type[HWQueue]]):
|
||||
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
|
||||
self.timeline_value:int = 1
|
||||
self.timeline_signal:SignalType = self.signal_t(0, is_timeline=True)
|
||||
self._shadow_timeline_signal:SignalType = self.signal_t(0, is_timeline=True)
|
||||
self.timeline_signal:SignalType = self.signal_t(0, timeline_for_device=self)
|
||||
self._shadow_timeline_signal:SignalType = self.signal_t(0, timeline_for_device=self)
|
||||
self.sig_prof_records:List[Tuple[HCQSignal, HCQSignal, str, bool]] = []
|
||||
self.raw_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[Dict]]] = []
|
||||
self.dep_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = []
|
||||
@@ -379,7 +388,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
self.devices.append(self)
|
||||
|
||||
def synchronize(self):
|
||||
try: self.timeline_signal.wait(self.timeline_value - 1) if not hasattr(self, '_syncdev') else self._syncdev()
|
||||
try: self.timeline_signal.wait(self.timeline_value - 1)
|
||||
except RuntimeError as e:
|
||||
if hasattr(self, 'on_device_hang'): self.on_device_hang()
|
||||
else: raise e
|
||||
|
||||
Reference in New Issue
Block a user