hcq signal tiny refactor (#7913)

* hcq signal tiny refactor

* no mv

* fix

* fix2

* fix3
Author: nimlgen
Date: 2024-11-26 21:48:38 +03:00
Committed by: GitHub
Parent: 345457f518
Commit: 84f96e48a1

4 changed files with 48 additions and 62 deletions

tinygrad/runtime/ops_amd.py

@@ -32,27 +32,24 @@ def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2

 class AMDSignal(HCQSignal):
   def __init__(self, value=0, is_timeline=False):
-    self._signal = AMDDevice.signals_pool.pop()
-    self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8
+    super().__init__(AMDDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(100), value_off=0, timestamp_off=8)
     if is_timeline:
       self._event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1)
       self._event_mailbox_ptr = AMDDevice.event_page.va_addr + self._event.event_slot_index*8
       self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event.event_id)
-    else: self._event_mailbox_ptr = 0
-    super().__init__(value)
-  def __del__(self): AMDDevice.signals_pool.append(self._signal)
-  def _get_value(self) -> int: return self._signal[0]
-  def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(100)
-  def _set_value(self, new_value:int): self._signal[0] = new_value
+  def __del__(self): AMDDevice.signals_pool.append(self.base_addr)

   def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)):
     start_time = time.time() * 1000
     while (time_spent:=time.time() * 1000 - start_time) < timeout:
-      if self._signal[0] >= value: return
+      if self.value >= value: return
       # Wait active for 5s, then going to sleep.
-      if time_spent > 5000 and self._event_mailbox_ptr != 0:
+      if time_spent > 5000 and self.is_timeline:
         kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
-    raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!")
+    raise RuntimeError(f"wait_signal: not set to {value}, but {self.value}, {timeout} ms TIMEOUT!")

 class AMDComputeQueue(HWQueue):
   def __init__(self):
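
`wait` keeps AMD's hybrid strategy, now phrased through the base class: spin on `self.value` (a read of mapped memory) for up to 5 seconds, then park in the kernel via `AMDKFD_IOC_WAIT_EVENTS` so a long wait stops burning a CPU core. A minimal sketch of that pattern, where `read_value` and `kernel_wait` are hypothetical stand-ins for the mapped-memory read and the blocking ioctl:

```python
import time

SPIN_MS = 5000  # poll budget before falling back to blocking waits

def hybrid_wait(read_value, kernel_wait, target: int, timeout_ms: int = 30000):
  # read_value: () -> int, reads the signal's mapped value
  # kernel_wait: (ms) -> None, blocks in the kernel until the event fires or ms elapse
  start = time.time() * 1000
  while (spent := time.time() * 1000 - start) < timeout_ms:
    if read_value() >= target: return
    # short waits stay on the CPU; after SPIN_MS, sleep in 1s slices between polls
    if spent > SPIN_MS: kernel_wait(1000)
  raise RuntimeError(f"signal not set to {target} within {timeout_ms} ms")
```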
@@ -140,24 +137,24 @@ class AMDComputeQueue(HWQueue):

   def _wait(self, signal:AMDSignal, value=0):
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
                amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
-               amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal._value_addr), value, 0xffffffff, 4]
+               amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal.value_addr), value, 0xffffffff, 4]

   def _timestamp(self, signal:AMDSignal):
-    self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal._timestamp_addr)
+    self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal.timestamp_addr)

   def _signal(self, signal:AMDSignal, value=0):
     # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
-    self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._value_addr, value=value, cache_flush=True)
-    if signal._event_mailbox_ptr != 0:
+    self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.value_addr, value=value, cache_flush=True)
+    if signal.is_timeline:
       self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._event_mailbox_ptr,
                         value=signal._event.event_id, cst=signal._event.event_id, cache_flush=False)

   def _update_wait(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None):
-    if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal._value_addr))
+    if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal.value_addr))
     if value is not None: self._patch(cmd_idx, offset=4, data=[value])

   def _update_signal(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None):
-    if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal._value_addr))
+    if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal.value_addr))
     if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))
     # Check if the signal command has mailptr part
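
Both `_update_wait` and `_update_signal` depend on the PM4 packets having fixed layouts: a recorded queue can be retargeted by rewriting the dwords at known offsets from the command's start. A toy sketch of the idea, assuming a queue kept as a flat list of 32-bit words (the real `_patch` helper isn't shown in this diff, and the header value below is a placeholder):

```python
def data64_le(addr: int) -> tuple:
  # split a 64-bit address into (low, high) 32-bit dwords
  return (addr & 0xFFFFFFFF, addr >> 32)

class PatchableQueue:
  def __init__(self): self.q, self.cmds_offset = [], []
  def record_wait(self, addr: int, value: int):
    self.cmds_offset.append(len(self.q))
    self.q += [0xC0DE0000, *data64_le(addr), value]  # placeholder header, addr_lo, addr_hi, value
  def update_wait(self, cmd_idx: int, addr=None, value=None):
    base = self.cmds_offset[cmd_idx]
    if addr is not None: self.q[base+1:base+3] = list(data64_le(addr))  # same offsets record_wait used
    if value is not None: self.q[base+3] = value
```

The offset arguments in the diff (2 and 4 for waits, 3 and 5 for signals) encode exactly where those fields sit inside each packet.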
@@ -210,27 +207,27 @@ class AMDCopyQueue(HWQueue):
       if dest is not None: self._patch(cmd_idx, offset=5+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])

   def _signal(self, signal:AMDSignal, value=0):
-    self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._value_addr), value])
-    if signal._event_mailbox_ptr != 0:
+    self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value])
+    if signal.is_timeline:
       self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._event_mailbox_ptr), signal._event.event_id])
       self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal._event.event_id)])

   def _wait(self, signal:AMDSignal, value=0):
     self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
-      amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal._value_addr), value, 0xffffffff,
+      amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff,
       amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])

   def _update_signal(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None):
     return self._update_wait(cmd_idx, signal, value) # the same offsets and commands

   def _update_wait(self, cmd_idx, signal:Optional[AMDSignal]=None, value=None):
-    if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal._value_addr))
+    if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal.value_addr))
     if value is not None: self._patch(cmd_idx, offset=3, data=[value])

   def _timestamp(self, signal:AMDSignal):
     self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
-             *data64_le(signal._timestamp_addr)])
+             *data64_le(signal.timestamp_addr)])

   def _submit(self, dev:AMDDevice):
     if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
@@ -333,7 +330,7 @@ class AMDDevice(HCQCompiled):
   kfd:int = -1
   event_page:Any = None  # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
   signals_page:Any = None
-  signals_pool:List[memoryview] = []
+  signals_pool:List[int] = []
   gpus:List[pathlib.Path] = []

   def _gpu_map(self, mem):
@@ -399,7 +396,7 @@ class AMDDevice(HCQCompiled):
     if AMDDevice.event_page is None:
       AMDDevice.signals_page = self._gpu_alloc(16 * 65536, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
       AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
-      AMDDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, AMDDevice.signals_page.size, 16)]
+      AMDDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
       kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
     else:
       self._gpu_map(AMDDevice.signals_page)
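
The pool change matches the "no mv" note in the commit message: `signals_pool` now hands out plain GPU virtual addresses of 16-byte slots instead of pre-built `memoryview`s, and `HCQSignal` builds its own views. A sketch of the carve-up, with `va_addr` and `size` as hypothetical stand-ins for the real `_gpu_alloc` result:

```python
SLOT = 16  # bytes per signal: a u64 value followed by a u64 timestamp

va_addr, size = 0x7f00_0000_0000, 16 * 65536  # hypothetical mapped page block

# one integer address per slot
signals_pool = [va_addr + off for off in range(0, size, SLOT)]

base = signals_pool.pop()   # AMDSignal.__init__ claims a slot
signals_pool.append(base)   # __del__ recycles it by address
```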

tinygrad/runtime/ops_nv.py

@@ -75,13 +75,8 @@ def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc

 class NVSignal(HCQSignal):
   def __init__(self, value=0, is_timeline=False):
-    self._signal = NVDevice.signals_pool.pop()
-    self.signal_addr = mv_address(self._signal)
-    super().__init__(value)
-  def __del__(self): NVDevice.signals_pool.append(self._signal)
-  def _get_value(self) -> int: return self._signal[0]
-  def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(1000)
-  def _set_value(self, new_value:int): self._signal[0] = new_value
+    super().__init__(NVDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(1000), value_off=0, timestamp_off=8)
+  def __del__(self): NVDevice.signals_pool.append(self.base_addr)

 class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
   def __del__(self):
@@ -97,11 +92,11 @@ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
     if local_mem_tpc_bytes: self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *data64(local_mem_tpc_bytes), 0xff]

   def _wait(self, signal:NVSignal, value=0):
-    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
+    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.value_addr), *data64_le(value),
               (3 << 0) | (1 << 24)] # ACQUIRE | PAYLOAD_SIZE_64BIT

   def _update_wait(self, cmd_idx, signal=None, value=None):
-    if signal is not None: self.q[(sigoff:=self.cmds_offset[cmd_idx]+1):sigoff+2] = array.array('I', data64_le(signal.signal_addr))
+    if signal is not None: self.q[(sigoff:=self.cmds_offset[cmd_idx]+1):sigoff+2] = array.array('I', data64_le(signal.value_addr))
     if value is not None: self.q[(valoff:=self.cmds_offset[cmd_idx]+3):valoff+2] = array.array('I', data64_le(value))

   def _timestamp(self, signal): return self._signal(signal, 0)
@@ -170,19 +165,19 @@ class NVComputeQueue(NVCommandQueue):
     for i in range(2):
       if getattr(prev_qmd, f'release{i}_enable') == 0:
         setattr(prev_qmd, f'release{i}_enable', 1)
-        setattr(prev_qmd, f'release{i}_address', signal.signal_addr)
+        setattr(prev_qmd, f'release{i}_address', signal.value_addr)
         setattr(prev_qmd, f'release{i}_payload', value)
         self.cmd_idx_to_qmd[self._cur_cmd_idx()] = prev_qmd
         self.cmd_idx_to_signal_id[self._cur_cmd_idx()] = i
         return

-    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.signal_addr), *data64_le(value),
+    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *data64_le(signal.value_addr), *data64_le(value),
               (1 << 0) | (1 << 20) | (1 << 24) | (1 << 25)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
     self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]

   def _update_signal(self, cmd_idx, signal:Optional[NVSignal]=None, value=None):
     if (qmd:=self.cmd_idx_to_qmd.get(cmd_idx)) is None: return super()._update_wait(cmd_idx, signal, value) # reuse wait, same offsets to update.
-    if signal is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_address', signal.signal_addr)
+    if signal is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_address', signal.value_addr)
     if value is not None: setattr(qmd, f'release{self.cmd_idx_to_signal_id[cmd_idx]}_payload', value)

   def _submit(self, dev): self._submit_to_gpfifo(dev, cast(NVDevice, dev).compute_gpfifo)
@@ -198,11 +193,11 @@ class NVCopyQueue(NVCommandQueue):
     if src is not None: self._patch(cmd_idx, offset=1, data=data64(src))

   def _signal(self, signal, value=0):
-    self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 3), *data64(signal.signal_addr), value]
+    self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 3), *data64(signal.value_addr), value]
     self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]

   def _update_signal(self, cmd_idx, signal=None, value=None):
-    if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.signal_addr))
+    if signal is not None: self._patch(cmd_idx, offset=1, data=data64(signal.value_addr))
     if value is not None: self._patch(cmd_idx, offset=3, data=[value])

   def _submit(self, dev): self._submit_to_gpfifo(dev, cast(NVDevice, dev).dma_gpfifo)
@@ -316,7 +311,7 @@ class NVDevice(HCQCompiled[NVSignal]):
   fd_uvm: int = -1
   gpus_info: Union[List, ctypes.Array] = []
   signals_page: Any = None
-  signals_pool: List[Any] = []
+  signals_pool: List[int] = []
   low_uvm_vaddr: int = 0x1000000000 # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
   uvm_vaddr: int = 0x2000000000 # 0x2000000000+
   host_object_enumerator: int = 0x1000
@@ -467,7 +462,7 @@ class NVDevice(HCQCompiled[NVSignal]):
     if NVDevice.signals_page is None:
       NVDevice.signals_page = self._gpu_system_alloc(16 * 65536, map_to_cpu=True)
-      NVDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.size, 16)]
+      NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)]
     else: self._gpu_map(NVDevice.signals_page)

     channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
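
On NV, `_signal` first tries to fold the release into a free `release0`/`release1` slot of the previous QMD before falling back to a standalone semaphore method, and the claimed slot id is remembered per command index so `_update_signal` can patch the same slot later. A sketch of that slot-claiming logic, with `FakeQMD` as an illustrative stand-in for the hardware descriptor:

```python
from typing import Optional

class FakeQMD:  # illustrative stand-in for the queue meta descriptor
  def __init__(self):
    for i in range(2):
      setattr(self, f'release{i}_enable', 0)
      setattr(self, f'release{i}_address', 0)
      setattr(self, f'release{i}_payload', 0)

def claim_release_slot(qmd: FakeQMD, addr: int, payload: int) -> Optional[int]:
  for i in range(2):
    if getattr(qmd, f'release{i}_enable') == 0:
      setattr(qmd, f'release{i}_enable', 1)
      setattr(qmd, f'release{i}_address', addr)   # signal.value_addr after this refactor
      setattr(qmd, f'release{i}_payload', payload)
      return i  # caller records this slot id for later _update_signal patching
  return None   # both slots busy: emit a standalone semaphore release packet instead
```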

tinygrad/runtime/ops_qcom.py

@@ -36,13 +36,8 @@ class QCOMCompiler(CLCompiler):
   def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disasm')(lib)

 class QCOMSignal(HCQSignal):
-  def __init__(self, value=0, is_timeline=False):
-    self._signal = QCOMDevice.signals_pool.pop()
-    super().__init__(value)
-  def __del__(self): QCOMDevice.signals_pool.append(self._signal)
-  def _get_value(self) -> int: return self._signal[0]
-  def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(19.2) # based on the 19.2MHz always-on timer
-  def _set_value(self, new_value:int): self._signal[0] = new_value
+  def __init__(self, value=0, is_timeline=False): super().__init__(QCOMDevice.signals_pool.pop(), value, is_timeline, decimal.Decimal(19.2))
+  def __del__(self): QCOMDevice.signals_pool.append(self.base_addr)

 class QCOMComputeQueue(HWQueue):
   def __init__(self):
@@ -69,7 +64,7 @@ class QCOMComputeQueue(HWQueue):
     self.cmd(adreno.CP_WAIT_FOR_IDLE)
     if QCOMDevice.gpu_id < 700:
       self.cmd(adreno.CP_EVENT_WRITE, qreg.cp_event_write_0(event=adreno.CACHE_FLUSH_TS, timestamp=ts),
-               *data64_le(mv_address(signal._signal) + (0 if not ts else 8)), qreg.cp_event_write_3(value & 0xFFFFFFFF))
+               *data64_le(signal.timestamp_addr if ts else signal.value_addr), qreg.cp_event_write_3(value & 0xFFFFFFFF))
       self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False)
     else:
       # TODO: support devices starting with 8 Gen 1. Also, 700th series have convenient CP_GLOBAL_TIMESTAMP and CP_LOCAL_TIMESTAMP
@@ -78,15 +73,15 @@ class QCOMComputeQueue(HWQueue):
   def _timestamp(self, signal:QCOMSignal): return self._signal(signal, 0, ts=True)

   def _wait(self, signal:QCOMSignal, value=0):
-    self.cmd(adreno.CP_WAIT_REG_MEM, qreg.cp_wait_reg_mem_0(function=adreno.WRITE_GE, poll=adreno.POLL_MEMORY),*data64_le(mv_address(signal._signal)),
+    self.cmd(adreno.CP_WAIT_REG_MEM, qreg.cp_wait_reg_mem_0(function=adreno.WRITE_GE, poll=adreno.POLL_MEMORY),*data64_le(signal.value_addr),
             qreg.cp_wait_reg_mem_3(ref=value&0xFFFFFFFF), qreg.cp_wait_reg_mem_4(mask=0xFFFFFFFF), qreg.cp_wait_reg_mem_5(delay_loop_cycles=32))

   def _update_signal(self, cmd_idx, signal:Optional[QCOMSignal], value):
-    if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(mv_address(signal._signal)))
+    if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal.value_addr))
     if value is not None: self._patch(cmd_idx, offset=5, data=[value & 0xFFFFFFFF])

   def _update_wait(self, cmd_idx, signal:Optional[QCOMSignal], value):
-    if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(mv_address(signal._signal)))
+    if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal.value_addr))
     if value is not None: self._patch(cmd_idx, offset=4, data=[value & 0xFFFFFFFF])

   def _build_gpu_command(self, dev:QCOMDevice, hw_addr=None):
@@ -343,7 +338,7 @@ class QCOMAllocator(HCQAllocator):

 class QCOMDevice(HCQCompiled):
   signals_page: Any = None
-  signals_pool: List[Any] = []
+  signals_pool: List[int] = []
   gpu_id: int = 0
   dummy_addr: int = 0
@@ -351,7 +346,7 @@ class QCOMDevice(HCQCompiled):
     self.fd = os.open('/dev/kgsl-3d0', os.O_RDWR)
     QCOMDevice.dummy_addr = self._gpu_alloc(0x1000).va_addr
     QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True)
-    QCOMDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, self.signals_page.size, 16)]
+    QCOMDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, self.signals_page.size, 16)]
     info, self.ctx, self.cmd_buf, self.cmd_buf_ptr, self.last_cmd = self._info(), self._ctx_create(), self._gpu_alloc(16 << 20), 0, 0
     QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF)
     if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}")
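
All three backends now express their timestamp clocks as a single `timestamp_divider` in ticks per microsecond: 100 for AMD, 1000 for NV, and 19.2 for QCOM (the removed comment ties that to Adreno's 19.2 MHz always-on timer). A worked conversion, mirroring the division the base class performs:

```python
import decimal

def ticks_to_us(ticks: int, ticks_per_us: decimal.Decimal) -> decimal.Decimal:
  return decimal.Decimal(ticks) / ticks_per_us  # what HCQSignal.timestamp does

assert ticks_to_us(19_200_000, decimal.Decimal('19.2')) == 1_000_000   # QCOM: 1s of 19.2 MHz ticks
assert ticks_to_us(100_000_000, decimal.Decimal(100)) == 1_000_000     # AMD: 100 ticks per us
assert ticks_to_us(1_000_000_000, decimal.Decimal(1000)) == 1_000_000  # NV: 1000 ticks per us
```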

tinygrad/runtime/support/hcq.py

@@ -1,7 +1,7 @@
 from __future__ import annotations
 from typing import List, Optional, Dict, Tuple, cast, Protocol, Type, Union, TypeVar, Generic, Callable, ParamSpec, Concatenate
 import contextlib, decimal, statistics, random, json, atexit, time, array, ctypes, functools
-from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv
+from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv
 from tinygrad.renderer import Renderer
 from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator
@@ -209,16 +209,16 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
     raise NotImplementedError("backend should overload this function")

 class HCQSignal:
-  def __init__(self, value:int=0, is_timeline:bool=False): self._set_value(value)
+  def __init__(self, base_addr:int, value:int=0, is_timeline:bool=False, timestamp_divider=decimal.Decimal(1), value_off=0, timestamp_off=8):
+    self.base_addr, self.value_addr, self.timestamp_addr, self.ts_divider = base_addr, base_addr+value_off, base_addr+timestamp_off, timestamp_divider
+    self.value_mv, self.timestamp_mv, self.is_timeline = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q'), is_timeline
+    self.value_mv[0] = value

   @property
-  def value(self) -> int: return self._get_value()
+  def value(self) -> int: return self.value_mv[0]

   @value.setter
-  def value(self, new_value:int): self._set_value(new_value)
-  def _get_value(self) -> int: raise NotImplementedError("_get_value() method must be implemented")
-  def _set_value(self, new_value:int): raise NotImplementedError("_set_value() method must be implemented")
+  def value(self, new_value:int): self.value_mv[0] = new_value

   @property
   def timestamp(self) -> decimal.Decimal:
@@ -230,8 +230,7 @@ class HCQSignal:
     Returns:
       The timestamp in microseconds.
     """
-    return self._get_timestamp()
-  def _get_timestamp(self) -> decimal.Decimal: raise NotImplementedError("_get_timestamp() method must be implemented")
+    return self.timestamp_mv[0] / self.ts_divider

   def wait(self, value:int, timeout:int=getenv("HCQDEV_WAIT_TIMEOUT_MS", 30000)):
     """