diff --git a/docs/developer/hcq.md b/docs/developer/hcq.md index a00dcec0d7..c6ac8aa9c6 100644 --- a/docs/developer/hcq.md +++ b/docs/developer/hcq.md @@ -6,19 +6,19 @@ The main aspect of HCQ-compatible runtimes is how they interact with devices. In ### Command Queues -To interact with devices, there are 2 types of queues: `HWComputeQueue` and `HWCopyQueue`. Commands which are defined in a base `HWCommandQueue` class should be supported by both queues. These methods are timestamp and synchronization methods like [signal](#tinygrad.runtime.support.hcq.HWCommandQueue.signal) and [wait](#tinygrad.runtime.support.hcq.HWCommandQueue.wait). +To interact with devices, you create a `HWQueue`. Some methods are required for every queue, such as the timestamp and synchronization methods [signal](#tinygrad.runtime.support.hcq.HWQueue.signal) and [wait](#tinygrad.runtime.support.hcq.HWQueue.wait), while others depend on whether the queue is a compute queue or a copy queue. For example, the following Python code enqueues a wait, execute, and signal command on the HCQ-compatible device: ```python -HWComputeQueue().wait(signal_to_wait, value_to_wait) \ - .exec(program, args_state, global_dims, local_dims) \ - .signal(signal_to_fire, value_to_fire) \ - .submit(your_device) +HWQueue().wait(signal_to_wait, value_to_wait) \ + .exec(program, args_state, global_dims, local_dims) \ + .signal(signal_to_fire, value_to_fire) \ + .submit(your_device) ``` -Each runtime should implement the required functions that are defined in the `HWCommandQueue`, `HWComputeQueue`, and `HWCopyQueue` classes. +Each runtime should implement the required functions that are defined in the `HWQueue` class. 
-::: tinygrad.runtime.support.hcq.HWCommandQueue +::: tinygrad.runtime.support.hcq.HWQueue options: members: [ "signal", @@ -28,21 +28,9 @@ Each runtime should implement the required functions that are defined in the `HW "update_wait", "bind", "submit", - ] - show_source: false - -::: tinygrad.runtime.support.hcq.HWComputeQueue - options: - members: [ "memory_barrier", "exec", "update_exec", - ] - show_source: false - -::: tinygrad.runtime.support.hcq.HWCopyQueue - options: - members: [ "copy", "update_copy", ] @@ -82,9 +70,9 @@ The following Python code demonstrates the usage of signals: ```python signal = your_device.signal_t() -HWComputeQueue().timestamp(signal) \ - .signal(signal, value_to_fire) \ - .submit(your_device) +HWQueue().timestamp(signal) \ + .signal(signal, value_to_fire) \ + .submit(your_device) signal.wait(value_to_fire) signaled_value = signal.value # should be the same as `value_to_fire` @@ -134,17 +122,17 @@ Backends must adhere to the `HCQBuffer` protocol when returning allocation resul members: true show_source: false -**Lifetime**: The `HCQArgsState` is passed to `HWComputeQueue.exec` and is guaranteed not to be freed until `HWComputeQueue.submit` for the same queue is called. +**Lifetime**: The `HCQArgsState` is passed to `HWQueue.exec` and is guaranteed not to be freed until `HWQueue.submit` for the same queue is called. ### Synchronization HCQ-compatible devices use a global timeline signal for synchronizing all operations. This mechanism ensures proper ordering and completion of tasks across the device. By convention, `self.timeline_value` points to the next value to signal. So, to wait for all previous operations on the device to complete, wait for `self.timeline_value - 1` value. The following Python code demonstrates the typical usage of signals to synchronize execution to other operations on the device: ```python -HWComputeQueue().wait(your_device.timeline_signal, your_device.timeline_value - 1) \ - .exec(...) 
- .signal(your_device.timeline_signal, your_device.timeline_value) \ - .submit(your_device) +HWQueue().wait(your_device.timeline_signal, your_device.timeline_value - 1) \ + .exec(...) \ + .signal(your_device.timeline_signal, your_device.timeline_value) \ + .submit(your_device) your_device.timeline_value += 1 # Optionally wait for execution @@ -153,5 +141,5 @@ your_device.timeline_signal.wait(your_device.timeline_value - 1) ## HCQGraph -[HCQGraph](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/graph/hcq.py) is a core feature that implements `GraphRunner` for HCQ-compatible devices. `HCQGraph` builds a static `HWComputeQueue` and `HWCopyQueue` for all operations per device. To optimize enqueue time, only the necessary parts of the queues are updated for each run using the update APIs of the queues, avoiding a complete rebuild. +[HCQGraph](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/graph/hcq.py) is a core feature that implements `GraphRunner` for HCQ-compatible devices. `HCQGraph` builds static compute and copy `HWQueue`s for all operations per device. To optimize enqueue time, only the necessary parts of the queues are updated for each run using the update APIs of the queues, avoiding a complete rebuild. Optionally, queues can implement a `bind` API, which allows further optimization by eliminating the need to copy the queues into the device ring. 
diff --git a/test/external/external_test_hcq.py b/test/external/external_test_hcq.py index 8cd7a74463..62466180d8 100644 --- a/test/external/external_test_hcq.py +++ b/test/external/external_test_hcq.py @@ -33,16 +33,16 @@ class TestHCQ(unittest.TestCase): ctypes.memmove(TestHCQ.d0.kernargs_ptr+TestHCQ.kernargs_size+TestHCQ.kernargs_off, TestHCQ.addr2, len(TestHCQ.addr2)) if Device.DEFAULT == "AMD": - from tinygrad.runtime.ops_amd import HWCopyQueue, HWPM4Queue + from tinygrad.runtime.ops_amd import HWQueue, HWPM4Queue TestHCQ.compute_queue = HWPM4Queue - TestHCQ.copy_queue = HWCopyQueue + TestHCQ.copy_queue = HWQueue elif Device.DEFAULT == "NV": - from tinygrad.runtime.ops_nv import HWCopyQueue, HWComputeQueue + from tinygrad.runtime.ops_nv import HWQueue, HWQueue # nv need to copy constbuffer there as well to_mv(TestHCQ.d0.kernargs_ptr, 0x160).cast('I')[:] = array.array('I', TestHCQ.runner.clprg.constbuffer_0) to_mv(TestHCQ.d0.kernargs_ptr+TestHCQ.kernargs_size, 0x160).cast('I')[:] = array.array('I', TestHCQ.runner.clprg.constbuffer_0) - TestHCQ.compute_queue = HWComputeQueue - TestHCQ.copy_queue = HWCopyQueue + TestHCQ.compute_queue = HWQueue + TestHCQ.copy_queue = HWQueue def setUp(self): TestHCQ.d0.synchronize() diff --git a/test/external/external_test_nv.py b/test/external/external_test_nv.py index 87d3d682fe..c4e633e1ef 100644 --- a/test/external/external_test_nv.py +++ b/test/external/external_test_nv.py @@ -2,7 +2,7 @@ import unittest, struct, array, ctypes from tinygrad import Device, dtypes, Tensor from tinygrad.helpers import to_mv from tinygrad.engine.schedule import create_schedule -from tinygrad.runtime.ops_nv import NVDevice, HWComputeQueue +from tinygrad.runtime.ops_nv import NVDevice, HWQueue from tinygrad.engine.search import Opt, OptOps from test.test_linearizer_failures import helper_test_lin from tinygrad.engine.realize import get_runner, CompiledRunner @@ -55,7 +55,7 @@ class TestNV(unittest.TestCase): to_mv(kernargs, 0x160).cast('I')[:] = 
array.array('I', TestNV.d0_runner.clprg.constbuffer_0) ctypes.memmove(kernargs + TestNV.d0_runner.clprg.kernargs_offset, TestNV.addr, len(TestNV.addr)) - q = HWComputeQueue() + q = HWQueue() q.exec(TestNV.d0_runner.clprg, kernargs, TestNV.d0_runner.global_size, TestNV.d0_runner.local_size) q.signal(TestNV.d0.timeline_signal, TestNV.d0.timeline_value).submit(TestNV.d0) TestNV.d0._wait_signal(TestNV.d0.timeline_signal, TestNV.d0.timeline_value) diff --git a/test/external/fuzz_kfd.py b/test/external/fuzz_kfd.py index 527ef20d0e..29c2868381 100644 --- a/test/external/fuzz_kfd.py +++ b/test/external/fuzz_kfd.py @@ -3,7 +3,7 @@ import random from tqdm import trange from typing import List from tinygrad import Device -from tinygrad.runtime.ops_amd import AMDDevice, HWCopyQueue, HWComputeQueue +from tinygrad.runtime.ops_amd import AMDDevice, HWQueue if __name__ == "__main__": dev: List[AMDDevice] = [Device[f"KFD:{i}"] for i in range(6)] @@ -15,9 +15,9 @@ if __name__ == "__main__": d1, b1 = random.choice(buffers) d2, b2 = random.choice(buffers) d1._gpu_map(b2) - q = HWComputeQueue() + q = HWQueue() q.signal(sig:=AMDDevice._alloc_signal(10)) - qc = HWCopyQueue() + qc = HWQueue() qc.wait(sig) qc.copy(b1.va_addr, b2.va_addr, min(b1.size, b2.size)) d1.completion_signal.value = 1 diff --git a/tinygrad/runtime/graph/hcq.py b/tinygrad/runtime/graph/hcq.py index 7b29eed0ca..4404bec386 100644 --- a/tinygrad/runtime/graph/hcq.py +++ b/tinygrad/runtime/graph/hcq.py @@ -1,7 +1,7 @@ import collections, time from typing import List, Any, Dict, cast, Optional, Tuple, Set from tinygrad.helpers import round_up, PROFILE, memsize_to_str -from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQSignal, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, HCQArgsState +from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQSignal, HCQBuffer, HWQueue, HCQArgsState from tinygrad.device import Buffer, BufferOptions, Compiled, Device from tinygrad.ops import 
Variable from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner @@ -33,10 +33,10 @@ class HCQGraph(MultiGraphRunner): # graph-related tasks. This synchronization uses a global timeline signal per device. Within the graph, the compute queue coordinates with # global operations and sets a kickoff signal. Any queue accessing a buffer from another device waits for this signal from the device’s # compute queue to ensure exclusive access. The compute queue signals the completion of the graph, synchronizing with the device's copy queue. - self.ji_schedule: Dict[int, Tuple[HCQCompiled, HWCommandQueue, List, List, HCQSignal, Optional[int]]] = {} + self.ji_schedule: Dict[int, Tuple[HCQCompiled, HWQueue, List, List, HCQSignal, Optional[int]]] = {} - self.comp_queues: Dict[HCQCompiled, HWComputeQueue] = {dev: dev.hw_compute_queue_t() for dev in self.devices} - self.copy_queues: Dict[HCQCompiled, HWCopyQueue] = {} # lazy allocation + self.comp_queues: Dict[HCQCompiled, HWQueue] = {dev: dev.hw_compute_queue_t() for dev in self.devices} + self.copy_queues: Dict[HCQCompiled, HWQueue] = {} # lazy allocation self.signals: Dict[Any, HCQSignal] = {**{dev: dev.signal_t(value=0) for dev in self.devices}, **{"CPU": self.devices[0].signal_t(value=0)}} self.kickoff_value: int = 0 @@ -44,9 +44,9 @@ class HCQGraph(MultiGraphRunner): self.prof_signals: List[HCQSignal] = [self.devices[0].signal_t() for i in range(len(self.jit_cache) * 2)] if PROFILE else [] self.prof_records: List[Tuple[Tuple[int, bool], Tuple[int, bool], HCQCompiled, str, bool, List[int], Optional[Dict]]] = [] - last_j: Dict[HWCommandQueue, Optional[int]] = collections.defaultdict(lambda: None) - queue_access: Dict[HWCommandQueue, Dict[HWCommandQueue, Optional[int]]] = collections.defaultdict(lambda: collections.defaultdict(lambda: None)) - dev_access: Dict[HWCommandQueue, Set[HCQCompiled]] = collections.defaultdict(set) + last_j: Dict[HWQueue, Optional[int]] = collections.defaultdict(lambda: None) + 
queue_access: Dict[HWQueue, Dict[HWQueue, Optional[int]]] = collections.defaultdict(lambda: collections.defaultdict(lambda: None)) + dev_access: Dict[HWQueue, Set[HCQCompiled]] = collections.defaultdict(set) for dev, queue in self.comp_queues.items(): dev_access[queue].add(dev) @@ -101,7 +101,7 @@ class HCQGraph(MultiGraphRunner): # Build hardware queues. self.op_cmd_idx: Dict[int, Tuple[Any, int]] = {} self.copy_to_devs: Dict[HCQCompiled, Set[HCQCompiled]] = {dev: set() for dev in self.devices} - self.kickoff_wait_cmds: Dict[HWCommandQueue, List] = {q: list() for q in list(self.comp_queues.values()) + list(self.copy_queues.values())} + self.kickoff_wait_cmds: Dict[HWQueue, List] = {q: list() for q in list(self.comp_queues.values()) + list(self.copy_queues.values())} for dev in self.devices: self.comp_queues[dev].memory_barrier().wait(dev.timeline_signal, dev.timeline_value - 1) \ @@ -118,11 +118,11 @@ class HCQGraph(MultiGraphRunner): # Encode main commands based on ji type. if isinstance(ji.prg, CompiledRunner): - cast(HWComputeQueue, enqueue_queue).exec(ji.prg.clprg, self.ji_args[j], *ji.prg.p.launch_dims(var_vals)) + enqueue_queue.exec(ji.prg.clprg, self.ji_args[j], *ji.prg.p.launch_dims(var_vals)) elif isinstance(ji.prg, BufferXfer): dest, src = [cast(Buffer, x) for x in ji.bufs[0:2]] cast(HCQAllocator, Device[src.device].allocator).map(dest._buf) - cast(HWCopyQueue, enqueue_queue).copy(dest._buf.va_addr, src._buf.va_addr, dest.nbytes) + enqueue_queue.copy(dest._buf.va_addr, src._buf.va_addr, dest.nbytes) self.copy_to_devs[cast(HCQCompiled, Device[dest.device])].add(cast(HCQCompiled, Device[src.device])) self.op_cmd_idx[j] = (enqueue_queue, len(enqueue_queue) - 1) diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 05587d2c20..4d2297a2c0 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -3,7 +3,7 @@ from typing import Tuple, List, Any, Optional import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, 
time, array, contextlib, decimal, sys assert sys.platform != 'win32' from dataclasses import dataclass -from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, HCQSignal, HCQProgram +from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram from tinygrad.device import BufferOptions from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address from tinygrad.renderer.cstyle import AMDRenderer @@ -54,7 +54,7 @@ class AMDSignal(HCQSignal): kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000) raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!") -class AMDComputeQueue(HWComputeQueue): +class AMDComputeQueue(HWQueue): # pylint: disable=abstract-method def __init__(self): self.cmd_idx_to_local_offset, self.cmd_idx_to_global_offset, self.cmd_idx_to_dispatch_packet = {}, {}, {} super().__init__() @@ -184,7 +184,7 @@ class AMDComputeQueue(HWComputeQueue): dev.compute_queue.doorbell[0] = dev.compute_queue.put_value SDMA_MAX_COPY_SIZE = 0x400000 -class AMDCopyQueue(HWCopyQueue): +class AMDCopyQueue(HWQueue): # pylint: disable=abstract-method def __init__(self): self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {} super().__init__() diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index fade7cba32..3037f5e4e3 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -3,7 +3,7 @@ import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, decima assert sys.platform != 'win32' from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional from dataclasses import dataclass -from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command +from tinygrad.runtime.support.hcq import 
HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, hcq_command from tinygrad.runtime.support.hcq import HCQArgsState, HCQProgram, HCQSignal from tinygrad.device import BufferOptions from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod @@ -83,7 +83,7 @@ class NVSignal(HCQSignal): def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(1000) def _set_value(self, new_value:int): self._signal[0] = new_value -class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method +class NVCommandQueue(HWQueue): # pylint: disable=abstract-method def __del__(self): if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True)) @@ -132,7 +132,7 @@ class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method dev.gpu_mmio[0x90 // 4] = gpfifo.token gpfifo.put_value += 1 -class NVComputeQueue(NVCommandQueue, HWComputeQueue): +class NVComputeQueue(NVCommandQueue, HWQueue): # pylint: disable=abstract-method def __init__(self): self.cmd_idx_to_qmd, self.cmd_idx_to_signal_id, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}, {} super().__init__() @@ -187,7 +187,7 @@ class NVComputeQueue(NVCommandQueue, HWComputeQueue): def _submit(self, dev): self._submit_to_gpfifo(dev, cast(NVDevice, dev).compute_gpfifo) -class NVCopyQueue(NVCommandQueue, HWCopyQueue): +class NVCopyQueue(NVCommandQueue, HWQueue): # pylint: disable=abstract-method def _copy(self, dest, src, copy_size): self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *data64(src), *data64(dest)] self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size] diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index 3e1b15a6fd..d96b9b3d31 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -4,7 +4,7 @@ assert sys.platform != 'win32' from types import 
SimpleNamespace from typing import Tuple, List, Any, cast, Optional from tinygrad.device import BufferOptions -from tinygrad.runtime.support.hcq import HCQBuffer, HWComputeQueue, HCQProgram, HCQCompiled, HCQSignal, HCQAllocator, HCQArgsState +from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQSignal, HCQAllocator, HCQArgsState from tinygrad.runtime.autogen import kgsl, adreno, libc from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice from tinygrad.renderer.cstyle import QCOMRenderer @@ -44,7 +44,7 @@ class QCOMSignal(HCQSignal): def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(19.2) # based on the 19.2MHz always-on timer def _set_value(self, new_value:int): self._signal[0] = new_value -class QCOMComputeQueue(HWComputeQueue): +class QCOMComputeQueue(HWQueue): # pylint: disable=abstract-method def __init__(self): self.cmd_idx_to_dims = {} super().__init__() diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index a671a34106..5514849115 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -17,7 +17,7 @@ def hcq_command(func: Callable[..., None]) -> Callable[..., Any]: def command_method(self, ...): ... ``` """ - def __wrapper(self:HWCommandQueue, *args, **kwargs): + def __wrapper(self:HWQueue, *args, **kwargs): self.cmds_offset.append(len(self.q)) func(self, *args, **kwargs) self.cmds_len.append(len(self.q) - self.cmds_offset[-1]) @@ -30,7 +30,7 @@ DeviceType = TypeVar('DeviceType', bound='HCQCompiled') ProgramType = TypeVar('ProgramType', bound='HCQProgram') ArgsStateType = TypeVar('ArgsStateType', bound='HCQArgsState') -class HWCommandQueue(Generic[SignalType, DeviceType]): +class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]): """ A base class for hardware command queues in the HCQ (Hardware Command Queue) API. Both compute and copy queues should have the following commands implemented. 
@@ -136,11 +136,12 @@ class HWCommandQueue(Generic[SignalType, DeviceType]): return self def _submit(self, dev:DeviceType): raise NotImplementedError("backend should overload this function") -class HWComputeQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType, DeviceType, ProgramType, ArgsStateType]): + # *** commands for compute queues *** + @hcq_command def memory_barrier(self): """ - Enqueues a memory barrier command to ensure memory coherence between agents. + Enqueues a memory barrier command to ensure memory coherence between agents. Only on compute queues. """ self._memory_barrier() def _memory_barrier(self): pass @@ -148,7 +149,7 @@ class HWComputeQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType, @hcq_command def exec(self, prg:ProgramType, args_state:ArgsStateType, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int]): """ - Enqueues an execution command for a kernel program. + Enqueues an execution command for a kernel program. Only on compute queues. Args: prg: The program to execute @@ -162,7 +163,7 @@ class HWComputeQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType, def update_exec(self, cmd_idx:int, global_size:Optional[Tuple[int,int,int]]=None, local_size:Optional[Tuple[int,int,int]]=None): """ - Updates a previously queued execution command. + Updates a previously queued execution command. Only on compute queues. Args: cmd_idx: Index of the execution command to update @@ -174,11 +175,12 @@ class HWComputeQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType, return self def _update_exec(self, cmd_idx, global_size, local_size): raise NotImplementedError("backend should overload this function") -class HWCopyQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType, DeviceType]): + # *** commands for copy queues *** + @hcq_command def copy(self, dest:HCQBuffer, src:HCQBuffer, copy_size:int): """ - Enqueues a copy command to transfer data. + Enqueues a copy command to transfer data. 
Only on copy queues. Args: dest: The destination of the copy @@ -190,7 +192,7 @@ class HWCopyQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType, De def update_copy(self, cmd_idx:int, dest:Optional[HCQBuffer]=None, src:Optional[HCQBuffer]=None): """ - Updates a previously queued copy command. + Updates a previously queued copy command. Only on copy queues. Args: cmd_idx: Index of the copy command to update @@ -355,7 +357,7 @@ class HCQCompiled(Compiled): gpu2cpu_compute_time_diff: decimal.Decimal = decimal.Decimal('nan') def __init__(self, device:str, allocator:HCQAllocator, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[HCQSignal], - comp_queue_t:Type[HWComputeQueue], copy_queue_t:Optional[Type[HWCopyQueue]]): + comp_queue_t:Type[HWQueue], copy_queue_t:Optional[Type[HWQueue]]): self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t self.timeline_value:int = 1 self.timeline_signal, self._shadow_timeline_signal = self.signal_t(0, is_timeline=True), self.signal_t(0, is_timeline=True) @@ -393,7 +395,7 @@ class HCQCompiled(Compiled): def _ensure_shared_time_base(self): if not self.gpu2cpu_compute_time_diff.is_nan(): return - def _sync_cpu_queue(d:HCQCompiled, q_t:Type[HWCommandQueue]): + def _sync_cpu_queue(d:HCQCompiled, q_t:Type[HWQueue]): q_t().timestamp(d.timeline_signal).signal(d.timeline_signal, d.timeline_value).submit(d) d.timeline_value += 1 st = time.perf_counter_ns() @@ -411,7 +413,7 @@ class HCQCompiled(Compiled): if q == d.hw_compute_queue_t: d.gpu2cpu_compute_time_diff = statistics.median(l) if q == d.hw_copy_queue_t: d.gpu2cpu_copy_time_diff = statistics.median(l) - def _sync_gpu_to_gpu_queue(d1:HCQCompiled, d2:HCQCompiled, q1_t:Type[HWCommandQueue], q2_t:Type[HWCommandQueue]): + def _sync_gpu_to_gpu_queue(d1:HCQCompiled, d2:HCQCompiled, q1_t:Type[HWQueue], q2_t:Type[HWQueue]): q1_t().signal(d1.timeline_signal, d1.timeline_value).wait(d2.timeline_signal, d2.timeline_value) \ 
.timestamp(d1.timeline_signal).signal(d1.timeline_signal, d1.timeline_value+1).submit(d1) q2_t().signal(d2.timeline_signal, d2.timeline_value).wait(d1.timeline_signal, d1.timeline_value) \ @@ -473,7 +475,7 @@ class HCQAllocator(LRUAllocator, Generic[DeviceType]): # pylint: disable=abstrac """ A base allocator class compatible with the HCQ (Hardware Command Queue) API. - This class implements basic copy operations following the HCQ API, utilizing both `HWComputeQueue` and `HWCopyQueue`. + This class implements basic copy operations following the HCQ API, utilizing both types of `HWQueue`. """ def __init__(self, dev:DeviceType, batch_size:int=(2 << 20), batch_cnt:int=32):