mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-07 03:00:26 -04:00
unify to HWQueue [pr] (#7812)
* unify to HWCommandQueue [pr] * all is HWQueue
This commit is contained in:
@@ -6,19 +6,19 @@ The main aspect of HCQ-compatible runtimes is how they interact with devices. In
|
||||
|
||||
### Command Queues
|
||||
|
||||
To interact with devices, there are 2 types of queues: `HWComputeQueue` and `HWCopyQueue`. Commands which are defined in a base `HWCommandQueue` class should be supported by both queues. These methods are timestamp and synchronization methods like [signal](#tinygrad.runtime.support.hcq.HWCommandQueue.signal) and [wait](#tinygrad.runtime.support.hcq.HWCommandQueue.wait).
|
||||
To interact with devices, you create a `HWQueue`. Some methods are required on every queue, such as the timestamp and synchronization methods [signal](#tinygrad.runtime.support.hcq.HWQueue.signal) and [wait](#tinygrad.runtime.support.hcq.HWQueue.wait), while others depend on whether the queue is a compute or a copy queue.
|
||||
|
||||
For example, the following Python code enqueues a wait, execute, and signal command on the HCQ-compatible device:
|
||||
```python
|
||||
HWComputeQueue().wait(signal_to_wait, value_to_wait) \
|
||||
.exec(program, args_state, global_dims, local_dims) \
|
||||
.signal(signal_to_fire, value_to_fire) \
|
||||
.submit(your_device)
|
||||
HWQueue().wait(signal_to_wait, value_to_wait) \
|
||||
.exec(program, args_state, global_dims, local_dims) \
|
||||
.signal(signal_to_fire, value_to_fire) \
|
||||
.submit(your_device)
|
||||
```
|
||||
|
||||
Each runtime should implement the required functions that are defined in the `HWCommandQueue`, `HWComputeQueue`, and `HWCopyQueue` classes.
|
||||
Each runtime should implement the required functions that are defined in the `HWQueue` class.
|
||||
|
||||
::: tinygrad.runtime.support.hcq.HWCommandQueue
|
||||
::: tinygrad.runtime.support.hcq.HWQueue
|
||||
options:
|
||||
members: [
|
||||
"signal",
|
||||
@@ -28,21 +28,9 @@ Each runtime should implement the required functions that are defined in the `HW
|
||||
"update_wait",
|
||||
"bind",
|
||||
"submit",
|
||||
]
|
||||
show_source: false
|
||||
|
||||
::: tinygrad.runtime.support.hcq.HWComputeQueue
|
||||
options:
|
||||
members: [
|
||||
"memory_barrier",
|
||||
"exec",
|
||||
"update_exec",
|
||||
]
|
||||
show_source: false
|
||||
|
||||
::: tinygrad.runtime.support.hcq.HWCopyQueue
|
||||
options:
|
||||
members: [
|
||||
"copy",
|
||||
"update_copy",
|
||||
]
|
||||
@@ -82,9 +70,9 @@ The following Python code demonstrates the usage of signals:
|
||||
```python
|
||||
signal = your_device.signal_t()
|
||||
|
||||
HWComputeQueue().timestamp(signal) \
|
||||
.signal(signal, value_to_fire) \
|
||||
.submit(your_device)
|
||||
HWQueue().timestamp(signal) \
|
||||
.signal(signal, value_to_fire) \
|
||||
.submit(your_device)
|
||||
|
||||
signal.wait(value_to_fire)
|
||||
signaled_value = signal.value # should be the same as `value_to_fire`
|
||||
@@ -134,17 +122,17 @@ Backends must adhere to the `HCQBuffer` protocol when returning allocation resul
|
||||
members: true
|
||||
show_source: false
|
||||
|
||||
**Lifetime**: The `HCQArgsState` is passed to `HWComputeQueue.exec` and is guaranteed not to be freed until `HWComputeQueue.submit` for the same queue is called.
|
||||
**Lifetime**: The `HCQArgsState` is passed to `HWQueue.exec` and is guaranteed not to be freed until `HWQueue.submit` for the same queue is called.
|
||||
|
||||
### Synchronization
|
||||
|
||||
HCQ-compatible devices use a global timeline signal for synchronizing all operations. This mechanism ensures proper ordering and completion of tasks across the device. By convention, `self.timeline_value` points to the next value to signal. So, to wait for all previous operations on the device to complete, wait for the value `self.timeline_value - 1`. The following Python code demonstrates the typical usage of signals to synchronize execution with other operations on the device:
|
||||
|
||||
```python
|
||||
HWComputeQueue().wait(your_device.timeline_signal, your_device.timeline_value - 1) \
|
||||
.exec(...)
|
||||
.signal(your_device.timeline_signal, your_device.timeline_value) \
|
||||
.submit(your_device)
|
||||
HWQueue().wait(your_device.timeline_signal, your_device.timeline_value - 1) \
|
||||
.exec(...) \
|
||||
.signal(your_device.timeline_signal, your_device.timeline_value) \
|
||||
.submit(your_device)
|
||||
your_device.timeline_value += 1
|
||||
|
||||
# Optionally wait for execution
|
||||
@@ -153,5 +141,5 @@ your_device.timeline_signal.wait(your_device.timeline_value - 1)
|
||||
|
||||
## HCQGraph
|
||||
|
||||
[HCQGraph](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/graph/hcq.py) is a core feature that implements `GraphRunner` for HCQ-compatible devices. `HCQGraph` builds a static `HWComputeQueue` and `HWCopyQueue` for all operations per device. To optimize enqueue time, only the necessary parts of the queues are updated for each run using the update APIs of the queues, avoiding a complete rebuild.
|
||||
[HCQGraph](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/graph/hcq.py) is a core feature that implements `GraphRunner` for HCQ-compatible devices. `HCQGraph` builds static compute and copy `HWQueue`s for all operations per device. To optimize enqueue time, only the necessary parts of the queues are updated for each run using the update APIs of the queues, avoiding a complete rebuild.
|
||||
Optionally, queues can implement a `bind` API, which allows further optimization by eliminating the need to copy the queues into the device ring.
|
||||
|
||||
10
test/external/external_test_hcq.py
vendored
10
test/external/external_test_hcq.py
vendored
@@ -33,16 +33,16 @@ class TestHCQ(unittest.TestCase):
|
||||
ctypes.memmove(TestHCQ.d0.kernargs_ptr+TestHCQ.kernargs_size+TestHCQ.kernargs_off, TestHCQ.addr2, len(TestHCQ.addr2))
|
||||
|
||||
if Device.DEFAULT == "AMD":
|
||||
from tinygrad.runtime.ops_amd import HWCopyQueue, HWPM4Queue
|
||||
from tinygrad.runtime.ops_amd import HWQueue, HWPM4Queue
|
||||
TestHCQ.compute_queue = HWPM4Queue
|
||||
TestHCQ.copy_queue = HWCopyQueue
|
||||
TestHCQ.copy_queue = HWQueue
|
||||
elif Device.DEFAULT == "NV":
|
||||
from tinygrad.runtime.ops_nv import HWCopyQueue, HWComputeQueue
|
||||
from tinygrad.runtime.ops_nv import HWQueue, HWQueue
|
||||
# nv need to copy constbuffer there as well
|
||||
to_mv(TestHCQ.d0.kernargs_ptr, 0x160).cast('I')[:] = array.array('I', TestHCQ.runner.clprg.constbuffer_0)
|
||||
to_mv(TestHCQ.d0.kernargs_ptr+TestHCQ.kernargs_size, 0x160).cast('I')[:] = array.array('I', TestHCQ.runner.clprg.constbuffer_0)
|
||||
TestHCQ.compute_queue = HWComputeQueue
|
||||
TestHCQ.copy_queue = HWCopyQueue
|
||||
TestHCQ.compute_queue = HWQueue
|
||||
TestHCQ.copy_queue = HWQueue
|
||||
|
||||
def setUp(self):
|
||||
TestHCQ.d0.synchronize()
|
||||
|
||||
4
test/external/external_test_nv.py
vendored
4
test/external/external_test_nv.py
vendored
@@ -2,7 +2,7 @@ import unittest, struct, array, ctypes
|
||||
from tinygrad import Device, dtypes, Tensor
|
||||
from tinygrad.helpers import to_mv
|
||||
from tinygrad.engine.schedule import create_schedule
|
||||
from tinygrad.runtime.ops_nv import NVDevice, HWComputeQueue
|
||||
from tinygrad.runtime.ops_nv import NVDevice, HWQueue
|
||||
from tinygrad.engine.search import Opt, OptOps
|
||||
from test.test_linearizer_failures import helper_test_lin
|
||||
from tinygrad.engine.realize import get_runner, CompiledRunner
|
||||
@@ -55,7 +55,7 @@ class TestNV(unittest.TestCase):
|
||||
to_mv(kernargs, 0x160).cast('I')[:] = array.array('I', TestNV.d0_runner.clprg.constbuffer_0)
|
||||
ctypes.memmove(kernargs + TestNV.d0_runner.clprg.kernargs_offset, TestNV.addr, len(TestNV.addr))
|
||||
|
||||
q = HWComputeQueue()
|
||||
q = HWQueue()
|
||||
q.exec(TestNV.d0_runner.clprg, kernargs, TestNV.d0_runner.global_size, TestNV.d0_runner.local_size)
|
||||
q.signal(TestNV.d0.timeline_signal, TestNV.d0.timeline_value).submit(TestNV.d0)
|
||||
TestNV.d0._wait_signal(TestNV.d0.timeline_signal, TestNV.d0.timeline_value)
|
||||
|
||||
6
test/external/fuzz_kfd.py
vendored
6
test/external/fuzz_kfd.py
vendored
@@ -3,7 +3,7 @@ import random
|
||||
from tqdm import trange
|
||||
from typing import List
|
||||
from tinygrad import Device
|
||||
from tinygrad.runtime.ops_amd import AMDDevice, HWCopyQueue, HWComputeQueue
|
||||
from tinygrad.runtime.ops_amd import AMDDevice, HWQueue
|
||||
|
||||
if __name__ == "__main__":
|
||||
dev: List[AMDDevice] = [Device[f"KFD:{i}"] for i in range(6)]
|
||||
@@ -15,9 +15,9 @@ if __name__ == "__main__":
|
||||
d1, b1 = random.choice(buffers)
|
||||
d2, b2 = random.choice(buffers)
|
||||
d1._gpu_map(b2)
|
||||
q = HWComputeQueue()
|
||||
q = HWQueue()
|
||||
q.signal(sig:=AMDDevice._alloc_signal(10))
|
||||
qc = HWCopyQueue()
|
||||
qc = HWQueue()
|
||||
qc.wait(sig)
|
||||
qc.copy(b1.va_addr, b2.va_addr, min(b1.size, b2.size))
|
||||
d1.completion_signal.value = 1
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import collections, time
|
||||
from typing import List, Any, Dict, cast, Optional, Tuple, Set
|
||||
from tinygrad.helpers import round_up, PROFILE, memsize_to_str
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQSignal, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, HCQArgsState
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQSignal, HCQBuffer, HWQueue, HCQArgsState
|
||||
from tinygrad.device import Buffer, BufferOptions, Compiled, Device
|
||||
from tinygrad.ops import Variable
|
||||
from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
|
||||
@@ -33,10 +33,10 @@ class HCQGraph(MultiGraphRunner):
|
||||
# graph-related tasks. This synchronization uses a global timeline signal per device. Within the graph, the compute queue coordinates with
|
||||
# global operations and sets a kickoff signal. Any queue accessing a buffer from another device waits for this signal from the device’s
|
||||
# compute queue to ensure exclusive access. The compute queue signals the completion of the graph, synchronizing with the device's copy queue.
|
||||
self.ji_schedule: Dict[int, Tuple[HCQCompiled, HWCommandQueue, List, List, HCQSignal, Optional[int]]] = {}
|
||||
self.ji_schedule: Dict[int, Tuple[HCQCompiled, HWQueue, List, List, HCQSignal, Optional[int]]] = {}
|
||||
|
||||
self.comp_queues: Dict[HCQCompiled, HWComputeQueue] = {dev: dev.hw_compute_queue_t() for dev in self.devices}
|
||||
self.copy_queues: Dict[HCQCompiled, HWCopyQueue] = {} # lazy allocation
|
||||
self.comp_queues: Dict[HCQCompiled, HWQueue] = {dev: dev.hw_compute_queue_t() for dev in self.devices}
|
||||
self.copy_queues: Dict[HCQCompiled, HWQueue] = {} # lazy allocation
|
||||
|
||||
self.signals: Dict[Any, HCQSignal] = {**{dev: dev.signal_t(value=0) for dev in self.devices}, **{"CPU": self.devices[0].signal_t(value=0)}}
|
||||
self.kickoff_value: int = 0
|
||||
@@ -44,9 +44,9 @@ class HCQGraph(MultiGraphRunner):
|
||||
self.prof_signals: List[HCQSignal] = [self.devices[0].signal_t() for i in range(len(self.jit_cache) * 2)] if PROFILE else []
|
||||
self.prof_records: List[Tuple[Tuple[int, bool], Tuple[int, bool], HCQCompiled, str, bool, List[int], Optional[Dict]]] = []
|
||||
|
||||
last_j: Dict[HWCommandQueue, Optional[int]] = collections.defaultdict(lambda: None)
|
||||
queue_access: Dict[HWCommandQueue, Dict[HWCommandQueue, Optional[int]]] = collections.defaultdict(lambda: collections.defaultdict(lambda: None))
|
||||
dev_access: Dict[HWCommandQueue, Set[HCQCompiled]] = collections.defaultdict(set)
|
||||
last_j: Dict[HWQueue, Optional[int]] = collections.defaultdict(lambda: None)
|
||||
queue_access: Dict[HWQueue, Dict[HWQueue, Optional[int]]] = collections.defaultdict(lambda: collections.defaultdict(lambda: None))
|
||||
dev_access: Dict[HWQueue, Set[HCQCompiled]] = collections.defaultdict(set)
|
||||
|
||||
for dev, queue in self.comp_queues.items(): dev_access[queue].add(dev)
|
||||
|
||||
@@ -101,7 +101,7 @@ class HCQGraph(MultiGraphRunner):
|
||||
# Build hardware queues.
|
||||
self.op_cmd_idx: Dict[int, Tuple[Any, int]] = {}
|
||||
self.copy_to_devs: Dict[HCQCompiled, Set[HCQCompiled]] = {dev: set() for dev in self.devices}
|
||||
self.kickoff_wait_cmds: Dict[HWCommandQueue, List] = {q: list() for q in list(self.comp_queues.values()) + list(self.copy_queues.values())}
|
||||
self.kickoff_wait_cmds: Dict[HWQueue, List] = {q: list() for q in list(self.comp_queues.values()) + list(self.copy_queues.values())}
|
||||
|
||||
for dev in self.devices:
|
||||
self.comp_queues[dev].memory_barrier().wait(dev.timeline_signal, dev.timeline_value - 1) \
|
||||
@@ -118,11 +118,11 @@ class HCQGraph(MultiGraphRunner):
|
||||
|
||||
# Encode main commands based on ji type.
|
||||
if isinstance(ji.prg, CompiledRunner):
|
||||
cast(HWComputeQueue, enqueue_queue).exec(ji.prg.clprg, self.ji_args[j], *ji.prg.p.launch_dims(var_vals))
|
||||
enqueue_queue.exec(ji.prg.clprg, self.ji_args[j], *ji.prg.p.launch_dims(var_vals))
|
||||
elif isinstance(ji.prg, BufferXfer):
|
||||
dest, src = [cast(Buffer, x) for x in ji.bufs[0:2]]
|
||||
cast(HCQAllocator, Device[src.device].allocator).map(dest._buf)
|
||||
cast(HWCopyQueue, enqueue_queue).copy(dest._buf.va_addr, src._buf.va_addr, dest.nbytes)
|
||||
enqueue_queue.copy(dest._buf.va_addr, src._buf.va_addr, dest.nbytes)
|
||||
self.copy_to_devs[cast(HCQCompiled, Device[dest.device])].add(cast(HCQCompiled, Device[src.device]))
|
||||
self.op_cmd_idx[j] = (enqueue_queue, len(enqueue_queue) - 1)
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ from typing import Tuple, List, Any, Optional
|
||||
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal, sys
|
||||
assert sys.platform != 'win32'
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, HCQSignal, HCQProgram
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram
|
||||
from tinygrad.device import BufferOptions
|
||||
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address
|
||||
from tinygrad.renderer.cstyle import AMDRenderer
|
||||
@@ -54,7 +54,7 @@ class AMDSignal(HCQSignal):
|
||||
kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
|
||||
raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!")
|
||||
|
||||
class AMDComputeQueue(HWComputeQueue):
|
||||
class AMDComputeQueue(HWQueue): # pylint: disable=abstract-method
|
||||
def __init__(self):
|
||||
self.cmd_idx_to_local_offset, self.cmd_idx_to_global_offset, self.cmd_idx_to_dispatch_packet = {}, {}, {}
|
||||
super().__init__()
|
||||
@@ -184,7 +184,7 @@ class AMDComputeQueue(HWComputeQueue):
|
||||
dev.compute_queue.doorbell[0] = dev.compute_queue.put_value
|
||||
|
||||
SDMA_MAX_COPY_SIZE = 0x400000
|
||||
class AMDCopyQueue(HWCopyQueue):
|
||||
class AMDCopyQueue(HWQueue): # pylint: disable=abstract-method
|
||||
def __init__(self):
|
||||
self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {}
|
||||
super().__init__()
|
||||
|
||||
@@ -3,7 +3,7 @@ import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, decima
|
||||
assert sys.platform != 'win32'
|
||||
from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, hcq_command
|
||||
from tinygrad.runtime.support.hcq import HCQArgsState, HCQProgram, HCQSignal
|
||||
from tinygrad.device import BufferOptions
|
||||
from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
|
||||
@@ -83,7 +83,7 @@ class NVSignal(HCQSignal):
|
||||
def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(1000)
|
||||
def _set_value(self, new_value:int): self._signal[0] = new_value
|
||||
|
||||
class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method
|
||||
class NVCommandQueue(HWQueue): # pylint: disable=abstract-method
|
||||
def __del__(self):
|
||||
if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True))
|
||||
|
||||
@@ -132,7 +132,7 @@ class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method
|
||||
dev.gpu_mmio[0x90 // 4] = gpfifo.token
|
||||
gpfifo.put_value += 1
|
||||
|
||||
class NVComputeQueue(NVCommandQueue, HWComputeQueue):
|
||||
class NVComputeQueue(NVCommandQueue, HWQueue): # pylint: disable=abstract-method
|
||||
def __init__(self):
|
||||
self.cmd_idx_to_qmd, self.cmd_idx_to_signal_id, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}, {}
|
||||
super().__init__()
|
||||
@@ -187,7 +187,7 @@ class NVComputeQueue(NVCommandQueue, HWComputeQueue):
|
||||
|
||||
def _submit(self, dev): self._submit_to_gpfifo(dev, cast(NVDevice, dev).compute_gpfifo)
|
||||
|
||||
class NVCopyQueue(NVCommandQueue, HWCopyQueue):
|
||||
class NVCopyQueue(NVCommandQueue, HWQueue): # pylint: disable=abstract-method
|
||||
def _copy(self, dest, src, copy_size):
|
||||
self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *data64(src), *data64(dest)]
|
||||
self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
|
||||
|
||||
@@ -4,7 +4,7 @@ assert sys.platform != 'win32'
|
||||
from types import SimpleNamespace
|
||||
from typing import Tuple, List, Any, cast, Optional
|
||||
from tinygrad.device import BufferOptions
|
||||
from tinygrad.runtime.support.hcq import HCQBuffer, HWComputeQueue, HCQProgram, HCQCompiled, HCQSignal, HCQAllocator, HCQArgsState
|
||||
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQSignal, HCQAllocator, HCQArgsState
|
||||
from tinygrad.runtime.autogen import kgsl, adreno, libc
|
||||
from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
|
||||
from tinygrad.renderer.cstyle import QCOMRenderer
|
||||
@@ -44,7 +44,7 @@ class QCOMSignal(HCQSignal):
|
||||
def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(19.2) # based on the 19.2MHz always-on timer
|
||||
def _set_value(self, new_value:int): self._signal[0] = new_value
|
||||
|
||||
class QCOMComputeQueue(HWComputeQueue):
|
||||
class QCOMComputeQueue(HWQueue): # pylint: disable=abstract-method
|
||||
def __init__(self):
|
||||
self.cmd_idx_to_dims = {}
|
||||
super().__init__()
|
||||
|
||||
@@ -17,7 +17,7 @@ def hcq_command(func: Callable[..., None]) -> Callable[..., Any]:
|
||||
def command_method(self, ...): ...
|
||||
```
|
||||
"""
|
||||
def __wrapper(self:HWCommandQueue, *args, **kwargs):
|
||||
def __wrapper(self:HWQueue, *args, **kwargs):
|
||||
self.cmds_offset.append(len(self.q))
|
||||
func(self, *args, **kwargs)
|
||||
self.cmds_len.append(len(self.q) - self.cmds_offset[-1])
|
||||
@@ -30,7 +30,7 @@ DeviceType = TypeVar('DeviceType', bound='HCQCompiled')
|
||||
ProgramType = TypeVar('ProgramType', bound='HCQProgram')
|
||||
ArgsStateType = TypeVar('ArgsStateType', bound='HCQArgsState')
|
||||
|
||||
class HWCommandQueue(Generic[SignalType, DeviceType]):
|
||||
class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
|
||||
"""
|
||||
A base class for hardware command queues in the HCQ (Hardware Command Queue) API.
|
||||
Both compute and copy queues should have the following commands implemented.
|
||||
@@ -136,11 +136,12 @@ class HWCommandQueue(Generic[SignalType, DeviceType]):
|
||||
return self
|
||||
def _submit(self, dev:DeviceType): raise NotImplementedError("backend should overload this function")
|
||||
|
||||
class HWComputeQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
|
||||
# *** commands for compute queues ***
|
||||
|
||||
@hcq_command
|
||||
def memory_barrier(self):
|
||||
"""
|
||||
Enqueues a memory barrier command to ensure memory coherence between agents.
|
||||
Enqueues a memory barrier command to ensure memory coherence between agents. Only on compute queues.
|
||||
"""
|
||||
self._memory_barrier()
|
||||
def _memory_barrier(self): pass
|
||||
@@ -148,7 +149,7 @@ class HWComputeQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType,
|
||||
@hcq_command
|
||||
def exec(self, prg:ProgramType, args_state:ArgsStateType, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int]):
|
||||
"""
|
||||
Enqueues an execution command for a kernel program.
|
||||
Enqueues an execution command for a kernel program. Only on compute queues.
|
||||
|
||||
Args:
|
||||
prg: The program to execute
|
||||
@@ -162,7 +163,7 @@ class HWComputeQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType,
|
||||
|
||||
def update_exec(self, cmd_idx:int, global_size:Optional[Tuple[int,int,int]]=None, local_size:Optional[Tuple[int,int,int]]=None):
|
||||
"""
|
||||
Updates a previously queued execution command.
|
||||
Updates a previously queued execution command. Only on compute queues.
|
||||
|
||||
Args:
|
||||
cmd_idx: Index of the execution command to update
|
||||
@@ -174,11 +175,12 @@ class HWComputeQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType,
|
||||
return self
|
||||
def _update_exec(self, cmd_idx, global_size, local_size): raise NotImplementedError("backend should overload this function")
|
||||
|
||||
class HWCopyQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType, DeviceType]):
|
||||
# *** commands for copy queues ***
|
||||
|
||||
@hcq_command
|
||||
def copy(self, dest:HCQBuffer, src:HCQBuffer, copy_size:int):
|
||||
"""
|
||||
Enqueues a copy command to transfer data.
|
||||
Enqueues a copy command to transfer data. Only on copy queues.
|
||||
|
||||
Args:
|
||||
dest: The destination of the copy
|
||||
@@ -190,7 +192,7 @@ class HWCopyQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType, De
|
||||
|
||||
def update_copy(self, cmd_idx:int, dest:Optional[HCQBuffer]=None, src:Optional[HCQBuffer]=None):
|
||||
"""
|
||||
Updates a previously queued copy command.
|
||||
Updates a previously queued copy command. Only on copy queues.
|
||||
|
||||
Args:
|
||||
cmd_idx: Index of the copy command to update
|
||||
@@ -355,7 +357,7 @@ class HCQCompiled(Compiled):
|
||||
gpu2cpu_compute_time_diff: decimal.Decimal = decimal.Decimal('nan')
|
||||
|
||||
def __init__(self, device:str, allocator:HCQAllocator, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[HCQSignal],
|
||||
comp_queue_t:Type[HWComputeQueue], copy_queue_t:Optional[Type[HWCopyQueue]]):
|
||||
comp_queue_t:Type[HWQueue], copy_queue_t:Optional[Type[HWQueue]]):
|
||||
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
|
||||
self.timeline_value:int = 1
|
||||
self.timeline_signal, self._shadow_timeline_signal = self.signal_t(0, is_timeline=True), self.signal_t(0, is_timeline=True)
|
||||
@@ -393,7 +395,7 @@ class HCQCompiled(Compiled):
|
||||
def _ensure_shared_time_base(self):
|
||||
if not self.gpu2cpu_compute_time_diff.is_nan(): return
|
||||
|
||||
def _sync_cpu_queue(d:HCQCompiled, q_t:Type[HWCommandQueue]):
|
||||
def _sync_cpu_queue(d:HCQCompiled, q_t:Type[HWQueue]):
|
||||
q_t().timestamp(d.timeline_signal).signal(d.timeline_signal, d.timeline_value).submit(d)
|
||||
d.timeline_value += 1
|
||||
st = time.perf_counter_ns()
|
||||
@@ -411,7 +413,7 @@ class HCQCompiled(Compiled):
|
||||
if q == d.hw_compute_queue_t: d.gpu2cpu_compute_time_diff = statistics.median(l)
|
||||
if q == d.hw_copy_queue_t: d.gpu2cpu_copy_time_diff = statistics.median(l)
|
||||
|
||||
def _sync_gpu_to_gpu_queue(d1:HCQCompiled, d2:HCQCompiled, q1_t:Type[HWCommandQueue], q2_t:Type[HWCommandQueue]):
|
||||
def _sync_gpu_to_gpu_queue(d1:HCQCompiled, d2:HCQCompiled, q1_t:Type[HWQueue], q2_t:Type[HWQueue]):
|
||||
q1_t().signal(d1.timeline_signal, d1.timeline_value).wait(d2.timeline_signal, d2.timeline_value) \
|
||||
.timestamp(d1.timeline_signal).signal(d1.timeline_signal, d1.timeline_value+1).submit(d1)
|
||||
q2_t().signal(d2.timeline_signal, d2.timeline_value).wait(d1.timeline_signal, d1.timeline_value) \
|
||||
@@ -473,7 +475,7 @@ class HCQAllocator(LRUAllocator, Generic[DeviceType]): # pylint: disable=abstrac
|
||||
"""
|
||||
A base allocator class compatible with the HCQ (Hardware Command Queue) API.
|
||||
|
||||
This class implements basic copy operations following the HCQ API, utilizing both `HWComputeQueue` and `HWCopyQueue`.
|
||||
This class implements basic copy operations following the HCQ API, utilizing both types of `HWQueue`.
|
||||
"""
|
||||
|
||||
def __init__(self, dev:DeviceType, batch_size:int=(2 << 20), batch_cnt:int=32):
|
||||
|
||||
Reference in New Issue
Block a user