docs: hcq add types (#5495)

* docs: hcq add types

* linter
This commit is contained in:
nimlgen
2024-07-15 22:14:48 +03:00
committed by GitHub
parent aab1e8c6dc
commit 8dfd11c1d8
4 changed files with 42 additions and 32 deletions

View File

@@ -2,7 +2,7 @@
## Overview
The main aspect of HCQ-compatible runtimes is how they interact with devices. In HCQ, all interactions with devices occur in a hardware-friendly manner using [command queues](#commandqueues). This approach allows commands to be issued directly to devices, bypassing runtime overhead such as HIP, or CUDA. Additionally, by using the HCQ API, these runtimes can benefit from various optimizations and features, including [HCQGraph](#hcqgraph) and built-in profiling capabilities.
The main aspect of HCQ-compatible runtimes is how they interact with devices. In HCQ, all interactions with devices occur in a hardware-friendly manner using [command queues](#commandqueues). This approach allows commands to be issued directly to devices, bypassing runtime overhead such as HIP or CUDA. Additionally, by using the HCQ API, these runtimes can benefit from various optimizations and features, including [HCQGraph](#hcqgraph) and built-in profiling capabilities.
### Command Queues
@@ -116,6 +116,15 @@ Backends must adhere to the `HCQCompatAllocRes` protocol when returning allocati
members: true
show_source: false
### HCQ Compatible Program
The `HCQCompatProgram` is a helper base class for defining programs compatible with HCQ-compatible devices. Currently, the arguments consist of pointers to buffers, followed by `vals` fields. The convention expects a packed struct containing the passed pointers, followed by `vals` located at `kernargs_args_offset`.
::: tinygrad.device.HCQCompatProgram
options:
members: true
show_source: false
### Synchronization
HCQ-compatible devices use a global timeline signal for synchronizing all operations. This mechanism ensures proper ordering and completion of tasks across the device. By convention, `self.timeline_value` points to the next value to signal. So, to wait for all previous operations on the device to complete, wait for `self.timeline_value - 1` value. The following Python code demonstrates the typical usage of signals to synchronize execution to other operations on the device:

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
import multiprocessing
from dataclasses import dataclass
from collections import defaultdict
from typing import List, Optional, Dict, Tuple, Any, cast, Protocol
from typing import List, Optional, Dict, Tuple, Any, cast, Protocol, Type
import importlib, inspect, functools, pathlib, os, ctypes, atexit, time, contextlib, array
from tinygrad.helpers import getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, ProfileLogger, PROFILE
from tinygrad.dtype import DType, ImageDType
@@ -225,7 +225,7 @@ class HWCommandQueue:
def _patch(self, cmd_idx, offset, data): self.q[(st:=self.cmds_offset[cmd_idx]+offset):st+len(data)] = array.array('I', data)
@hcq_command
def signal(self, signal, value):
def signal(self, signal:Any, value:int):
"""
Enqueues a signal command which sets the signal to the given value, ensuring all previous operations are completed.
@@ -234,10 +234,10 @@ class HWCommandQueue:
value: The value to set the signal to
"""
self._signal(signal, value)
def _signal(self, signal, value): raise NotImplementedError("backend should overload this function")
def _signal(self, signal:Any, value:int): raise NotImplementedError("backend should overload this function")
@hcq_command
def wait(self, signal, value):
def wait(self, signal:Any, value:int):
"""
Enqueues a wait command which halts execution until the signal is greater than or equal to a specific value.
@@ -249,7 +249,7 @@ class HWCommandQueue:
def _wait(self, signal, value): raise NotImplementedError("backend should overload this function")
@hcq_command
def timestamp(self, signal):
def timestamp(self, signal:Any):
"""
Enqueues a timestamp command which records the current time in a signal after all previously enqueued commands are completed.
@@ -259,7 +259,7 @@ class HWCommandQueue:
self._timestamp(signal)
def _timestamp(self, signal): raise NotImplementedError("backend should overload this function")
def update_signal(self, cmd_idx, signal=None, value=None):
def update_signal(self, cmd_idx:int, signal:Optional[Any]=None, value:Optional[int]=None):
"""
Updates a previously queued signal command.
@@ -271,9 +271,9 @@ class HWCommandQueue:
if self.cmds_meta[cmd_idx] != "signal": raise RuntimeError("called update_signal not on a signal command")
self._update_signal(cmd_idx, signal, value)
return self
def _update_signal(self, cmd_idx, signal, value): raise NotImplementedError("backend should overload this function")
def _update_signal(self, cmd_idx:int, signal:Optional[Any], value:Optional[int]): raise NotImplementedError("backend should overload this function")
def update_wait(self, cmd_idx, signal=None, value=None):
def update_wait(self, cmd_idx:int, signal:Optional[Any]=None, value:Optional[int]=None):
"""
Updates a previously queued wait command.
@@ -285,7 +285,7 @@ class HWCommandQueue:
if self.cmds_meta[cmd_idx] != "wait": raise RuntimeError("called update_wait not on a wait command")
self._update_wait(cmd_idx, signal, value)
return self
def _update_wait(self, cmd_idx, signal, value): raise NotImplementedError("backend should overload this function")
def _update_wait(self, cmd_idx:int, signal:Optional[Any], value:Optional[int]): raise NotImplementedError("backend should overload this function")
def submit(self, device:HCQCompatCompiled):
"""
@@ -308,7 +308,7 @@ class HWComputeQueue(HWCommandQueue):
def _memory_barrier(self): pass
@hcq_command
def exec(self, prg, kernargs, global_size, local_size):
def exec(self, prg:HCQCompatProgram, kernargs:int, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int]):
"""
Enqueues an execution command for a kernel program.
@@ -321,7 +321,7 @@ class HWComputeQueue(HWCommandQueue):
self._exec(prg, kernargs, global_size, local_size)
def _exec(self, prg, kernargs, global_size, local_size): raise NotImplementedError("backend should overload this function")
def update_exec(self, cmd_idx, global_size, local_size):
def update_exec(self, cmd_idx:int, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int]):
"""
Updates a previously queued execution command.
@@ -337,7 +337,7 @@ class HWComputeQueue(HWCommandQueue):
class HWCopyQueue(HWCommandQueue):
@hcq_command
def copy(self, dest, src, copy_size):
def copy(self, dest:HCQCompatAllocRes, src:HCQCompatAllocRes, copy_size:int):
"""
Enqueues a copy command to transfer data.
@@ -347,9 +347,9 @@ class HWCopyQueue(HWCommandQueue):
copy_size: The size of data to copy
"""
self._copy(dest, src, copy_size)
def _copy(self, dest, src, copy_size): raise NotImplementedError("backend should overload this function")
def _copy(self, dest:HCQCompatAllocRes, src:HCQCompatAllocRes, copy_size:int): raise NotImplementedError("backend should overload this function")
def update_copy(self, cmd_idx, dest=None, src=None):
def update_copy(self, cmd_idx:int, dest:Optional[HCQCompatAllocRes]=None, src:Optional[HCQCompatAllocRes]=None):
"""
Updates a previously queued copy command.
@@ -378,7 +378,7 @@ def hcq_profile(dev, enabled, desc, queue_type=None, queue=None):
if enabled and PROFILE: dev.sig_prof_records.append((st, en, desc, queue_type is dev.hw_copy_queue_t))
class HCQCompatProgram:
def __init__(self, kernargs_alloc_size, kernargs_args_offset=0):
def __init__(self, kernargs_alloc_size:int, kernargs_args_offset:int=0):
self.kernargs_alloc_size, self.kernargs_args_offset = kernargs_alloc_size, kernargs_args_offset
def fill_kernargs(self, kernargs_ptr:int, bufs:Tuple[Any, ...], vals:Tuple[int, ...]=()): raise NotImplementedError("need fill_kernargs")
@@ -387,7 +387,8 @@ class HCQCompatCompiled(Compiled):
A base class for devices compatible with the HCQ (Hardware Command Queue) API.
"""
def __init__(self, device:str, allocator:Allocator, renderer:Renderer, compiler:Compiler, runtime, comp_queue_t, copy_queue_t, timeline_signals):
def __init__(self, device:str, allocator:Allocator, renderer:Renderer, compiler:Compiler, runtime,
comp_queue_t:Type[HWComputeQueue], copy_queue_t:Type[HWCopyQueue], timeline_signals:Tuple[Any, Any]):
self.hw_compute_queue_t, self.hw_copy_queue_t = comp_queue_t, copy_queue_t
self.timeline_value:int = 1
self.timeline_signal, self._shadow_timeline_signal = timeline_signals
@@ -399,7 +400,7 @@ class HCQCompatCompiled(Compiled):
super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
@classmethod
def _read_signal(cls, signal:Any) -> Any:
def _read_signal(cls, signal:Any) -> int:
"""
Read a value for a signal.
"""
@@ -413,34 +414,34 @@ class HCQCompatCompiled(Compiled):
raise NotImplementedError("_read_timestamp needs to be implemented")
@classmethod
def _set_signal(cls, signal:Any, value:Any) -> None:
def _set_signal(cls, signal:Any, value:int):
"""
Set a value for a signal.
"""
raise NotImplementedError("_set_signal needs to be implemented")
@classmethod
def _alloc_signal(cls, value:Any = 0, **kwargs) -> Any:
def _alloc_signal(cls, value:int = 0, **kwargs) -> Any:
"""
Allocate a new signal.
"""
raise NotImplementedError("_alloc_signal needs to be implemented")
@classmethod
def _free_signal(cls, signal:Any) -> None:
def _free_signal(cls, signal:Any):
"""
Free a signal.
"""
raise NotImplementedError("_free_signal needs to be implemented")
@classmethod
def _wait_signal(cls, signal:Any, value:Any = 0, timeout:int = 10000) -> None:
def _wait_signal(cls, signal:Any, value:int = 0, timeout:int = 10000):
"""
Wait for a signal to reach a specific value. Signals
"""
raise NotImplementedError("_wait_signal needs to be implemented")
def _gpu2cpu_time(self, gpu_time:float, is_copy:bool) -> float:
def _gpu2cpu_time(self, gpu_time:int, is_copy:bool) -> float:
"""
Convert GPU time to CPU time. `is_copy` flag indicating if this is a copy queue.
"""
@@ -475,7 +476,7 @@ class HCQCompatCompiled(Compiled):
cast(HCQCompatAllocator, self.allocator).b_timeline = [0] * len(cast(HCQCompatAllocator, self.allocator).b)
# Protocol for hcq compatible allocators for allocated buffers to contain VA address and it's size.
class HCQCompatAllocRes(Protocol): va_addr: int; size: int # noqa: E702
class HCQCompatAllocRes(Protocol): va_addr:int; size:int # noqa: E702
class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
"""
@@ -484,15 +485,15 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
This class implements basic copy operations following the HCQ API, utilizing both `HWComputeQueue` and `HWCopyQueue`.
"""
def __init__(self, device, batch_size=(2 << 20), batch_cnt=32):
self.device = device
def __init__(self, device:HCQCompatCompiled, batch_size:int=(2 << 20), batch_cnt:int=32):
self.device:Any = device
self.b = [self._alloc(batch_size, BufferOptions(host=True)) for _ in range(batch_cnt)]
self.b_timeline, self.b_next = [0] * len(self.b), 0
super().__init__()
def _alloc(self, size:int, options:BufferOptions) -> HCQCompatAllocRes: raise NotImplementedError("need hcq compat alloc")
def copyin(self, dest: HCQCompatAllocRes, src: memoryview):
def copyin(self, dest:HCQCompatAllocRes, src:memoryview):
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"CPU -> {self.device.dname}", enabled=PROFILE):
for i in range(0, src.nbytes, self.b[0].size):
self.b_next = (self.b_next + 1) % len(self.b)
@@ -504,7 +505,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
self.b_timeline[self.b_next] = self.device.timeline_value
self.device.timeline_value += 1
def copy_from_disk(self, dest: HCQCompatAllocRes, src, size):
def copy_from_disk(self, dest:HCQCompatAllocRes, src, size):
def _get_temp_buf():
# Check if the next buffer is safe to be used (its signal has passed) and reserve it.
if self.b_timeline[(self.b_next + 1) % len(self.b)] <= self.device._read_signal(self.device.timeline_signal):
@@ -520,7 +521,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
self.b_timeline[batch_info[1]] = self.device.timeline_value
self.device.timeline_value += 1
def copyout(self, dest:memoryview, src: HCQCompatAllocRes):
def copyout(self, dest:memoryview, src:HCQCompatAllocRes):
self.device.synchronize()
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE):
@@ -533,7 +534,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
def transfer(self, dest: HCQCompatAllocRes, src: HCQCompatAllocRes, sz: int, src_dev, dest_dev):
def transfer(self, dest:HCQCompatAllocRes, src:HCQCompatAllocRes, sz:int, src_dev, dest_dev):
src_dev._gpu_map(dest)
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE):

View File

@@ -487,7 +487,7 @@ class AMDDevice(HCQCompatCompiled):
self.compute_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x100000, ctx_save_restore_size=0x2C02000, eop_buffer_size=0x1000)
self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
timeline_signals=[self._alloc_signal(sync_event=sync_event), self._alloc_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1))]
timeline_signals=(self._alloc_signal(sync_event=sync_event), self._alloc_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1)))
super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
AMDComputeQueue, AMDCopyQueue, timeline_signals)

View File

@@ -546,7 +546,7 @@ class NVDevice(HCQCompatCompiled):
compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler)
super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch),
functools.partial(NVProgram, self), NVComputeQueue, NVCopyQueue, timeline_signals=[self._alloc_signal(), self._alloc_signal()])
functools.partial(NVProgram, self), NVComputeQueue, NVCopyQueue, timeline_signals=(self._alloc_signal(), self._alloc_signal()))
self._setup_gpfifos()
NVDevice.devices.append(self)