diff --git a/docs/developer/hcq.md b/docs/developer/hcq.md
index a00dcec0d7..c6ac8aa9c6 100644
--- a/docs/developer/hcq.md
+++ b/docs/developer/hcq.md
@@ -6,19 +6,19 @@ The main aspect of HCQ-compatible runtimes is how they interact with devices. In
 
 ### Command Queues
 
-To interact with devices, there are 2 types of queues: `HWComputeQueue` and `HWCopyQueue`. Commands which are defined in a base `HWCommandQueue` class should be supported by both queues. These methods are timestamp and synchronization methods like [signal](#tinygrad.runtime.support.hcq.HWCommandQueue.signal) and [wait](#tinygrad.runtime.support.hcq.HWCommandQueue.wait).
+To interact with devices, you create an `HWQueue`. Some methods are required on every queue, namely the timestamp and synchronization methods (for example [signal](#tinygrad.runtime.support.hcq.HWQueue.signal) and [wait](#tinygrad.runtime.support.hcq.HWQueue.wait)), while others depend on whether the queue is a compute or a copy queue.
 
 For example, the following Python code enqueues a wait, execute, and signal command on the HCQ-compatible device:
 ```python
-HWComputeQueue().wait(signal_to_wait, value_to_wait) \
-                .exec(program, args_state, global_dims, local_dims) \
-                .signal(signal_to_fire, value_to_fire) \
-                .submit(your_device)
+HWQueue().wait(signal_to_wait, value_to_wait) \
+         .exec(program, args_state, global_dims, local_dims) \
+         .signal(signal_to_fire, value_to_fire) \
+         .submit(your_device)
 ```
 
-Each runtime should implement the required functions that are defined in the `HWCommandQueue`, `HWComputeQueue`, and `HWCopyQueue` classes.
+Each runtime should implement the required functions that are defined in the `HWQueue` class.
 
-::: tinygrad.runtime.support.hcq.HWCommandQueue
+::: tinygrad.runtime.support.hcq.HWQueue
     options:
         members: [
             "signal",
@@ -28,21 +28,9 @@ Each runtime should implement the required functions that are defined in the `HW
             "update_wait",
             "bind",
             "submit",
-        ]
-        show_source: false
-
-::: tinygrad.runtime.support.hcq.HWComputeQueue
-    options:
-        members: [
             "memory_barrier",
             "exec",
             "update_exec",
-        ]
-        show_source: false
-
-::: tinygrad.runtime.support.hcq.HWCopyQueue
-    options:
-        members: [
             "copy",
             "update_copy",
         ]
@@ -82,9 +70,9 @@ The following Python code demonstrates the usage of signals:
 ```python
 signal = your_device.signal_t()
 
-HWComputeQueue().timestamp(signal) \
-                .signal(signal, value_to_fire) \
-                .submit(your_device)
+HWQueue().timestamp(signal) \
+         .signal(signal, value_to_fire) \
+         .submit(your_device)
 
 signal.wait(value_to_fire)
 signaled_value = signal.value # should be the same as `value_to_fire`
@@ -134,17 +122,17 @@ Backends must adhere to the `HCQBuffer` protocol when returning allocation resul
         members: true
         show_source: false
 
-**Lifetime**: The `HCQArgsState` is passed to `HWComputeQueue.exec` and is guaranteed not to be freed until `HWComputeQueue.submit` for the same queue is called.
+**Lifetime**: The `HCQArgsState` is passed to `HWQueue.exec` and is guaranteed not to be freed until `HWQueue.submit` for the same queue is called.
 
 ### Synchronization
 
 HCQ-compatible devices use a global timeline signal for synchronizing all operations. This mechanism ensures proper ordering and completion of tasks across the device. By convention, `self.timeline_value` points to the next value to signal. So, to wait for all previous operations on the device to complete, wait for `self.timeline_value - 1` value. The following Python code demonstrates the typical usage of signals to synchronize execution to other operations on the device:
 
 ```python
-HWComputeQueue().wait(your_device.timeline_signal, your_device.timeline_value - 1) \
-                .exec(...)
-                .signal(your_device.timeline_signal, your_device.timeline_value) \
-                .submit(your_device)
+HWQueue().wait(your_device.timeline_signal, your_device.timeline_value - 1) \
+         .exec(...) \
+         .signal(your_device.timeline_signal, your_device.timeline_value) \
+         .submit(your_device)
 your_device.timeline_value += 1
 
 # Optionally wait for execution
@@ -153,5 +141,5 @@ your_device.timeline_signal.wait(your_device.timeline_value - 1)
 
 ## HCQGraph
 
-[HCQGraph](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/graph/hcq.py) is a core feature that implements `GraphRunner` for HCQ-compatible devices. `HCQGraph` builds a static `HWComputeQueue` and `HWCopyQueue` for all operations per device. To optimize enqueue time, only the necessary parts of the queues are updated for each run using the update APIs of the queues, avoiding a complete rebuild.
+[HCQGraph](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/graph/hcq.py) is a core feature that implements `GraphRunner` for HCQ-compatible devices. `HCQGraph` builds static compute and copy `HWQueue`s for all operations per device. To optimize enqueue time, only the necessary parts of the queues are updated for each run using the update APIs of the queues, avoiding a complete rebuild.
 Optionally, queues can implement a `bind` API, which allows further optimization by eliminating the need to copy the queues into the device ring.
diff --git a/test/external/external_test_hcq.py b/test/external/external_test_hcq.py
index 8cd7a74463..62466180d8 100644
--- a/test/external/external_test_hcq.py
+++ b/test/external/external_test_hcq.py
@@ -33,16 +33,16 @@ class TestHCQ(unittest.TestCase):
     ctypes.memmove(TestHCQ.d0.kernargs_ptr+TestHCQ.kernargs_size+TestHCQ.kernargs_off, TestHCQ.addr2, len(TestHCQ.addr2))
 
     if Device.DEFAULT == "AMD":
-      from tinygrad.runtime.ops_amd import HWCopyQueue, HWPM4Queue
+      from tinygrad.runtime.ops_amd import HWQueue, HWPM4Queue
       TestHCQ.compute_queue = HWPM4Queue
-      TestHCQ.copy_queue = HWCopyQueue
+      TestHCQ.copy_queue = HWQueue
     elif Device.DEFAULT == "NV":
-      from tinygrad.runtime.ops_nv import HWCopyQueue, HWComputeQueue
+      from tinygrad.runtime.ops_nv import HWQueue, HWQueue
       # nv need to copy constbuffer there as well
       to_mv(TestHCQ.d0.kernargs_ptr, 0x160).cast('I')[:] = array.array('I', TestHCQ.runner.clprg.constbuffer_0)
       to_mv(TestHCQ.d0.kernargs_ptr+TestHCQ.kernargs_size, 0x160).cast('I')[:] = array.array('I', TestHCQ.runner.clprg.constbuffer_0)
-      TestHCQ.compute_queue = HWComputeQueue
-      TestHCQ.copy_queue = HWCopyQueue
+      TestHCQ.compute_queue = HWQueue
+      TestHCQ.copy_queue = HWQueue
 
   def setUp(self):
     TestHCQ.d0.synchronize()
diff --git a/test/external/external_test_nv.py b/test/external/external_test_nv.py
index 87d3d682fe..c4e633e1ef 100644
--- a/test/external/external_test_nv.py
+++ b/test/external/external_test_nv.py
@@ -2,7 +2,7 @@ import unittest, struct, array, ctypes
 from tinygrad import Device, dtypes, Tensor
 from tinygrad.helpers import to_mv
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.runtime.ops_nv import NVDevice, HWComputeQueue
+from tinygrad.runtime.ops_nv import NVDevice, HWQueue
 from tinygrad.engine.search import Opt, OptOps
 from test.test_linearizer_failures import helper_test_lin
 from tinygrad.engine.realize import get_runner, CompiledRunner
@@ -55,7 +55,7 @@ class TestNV(unittest.TestCase):
     to_mv(kernargs, 0x160).cast('I')[:] = array.array('I', TestNV.d0_runner.clprg.constbuffer_0)
     ctypes.memmove(kernargs + TestNV.d0_runner.clprg.kernargs_offset, TestNV.addr, len(TestNV.addr))
 
-    q = HWComputeQueue()
+    q = HWQueue()
     q.exec(TestNV.d0_runner.clprg, kernargs, TestNV.d0_runner.global_size, TestNV.d0_runner.local_size)
     q.signal(TestNV.d0.timeline_signal, TestNV.d0.timeline_value).submit(TestNV.d0)
     TestNV.d0._wait_signal(TestNV.d0.timeline_signal, TestNV.d0.timeline_value)
diff --git a/test/external/fuzz_kfd.py b/test/external/fuzz_kfd.py
index 527ef20d0e..29c2868381 100644
--- a/test/external/fuzz_kfd.py
+++ b/test/external/fuzz_kfd.py
@@ -3,7 +3,7 @@ import random
 from tqdm import trange
 from typing import List
 from tinygrad import Device
-from tinygrad.runtime.ops_amd import AMDDevice, HWCopyQueue, HWComputeQueue
+from tinygrad.runtime.ops_amd import AMDDevice, HWQueue
 
 if __name__ == "__main__":
   dev: List[AMDDevice] = [Device[f"KFD:{i}"] for i in range(6)]
@@ -15,9 +15,9 @@ if __name__ == "__main__":
     d1, b1 = random.choice(buffers)
     d2, b2 = random.choice(buffers)
     d1._gpu_map(b2)
-    q = HWComputeQueue()
+    q = HWQueue()
     q.signal(sig:=AMDDevice._alloc_signal(10))
-    qc = HWCopyQueue()
+    qc = HWQueue()
     qc.wait(sig)
     qc.copy(b1.va_addr, b2.va_addr, min(b1.size, b2.size))
     d1.completion_signal.value = 1
diff --git a/tinygrad/runtime/graph/hcq.py b/tinygrad/runtime/graph/hcq.py
index 7b29eed0ca..4404bec386 100644
--- a/tinygrad/runtime/graph/hcq.py
+++ b/tinygrad/runtime/graph/hcq.py
@@ -1,7 +1,7 @@
 import collections, time
 from typing import List, Any, Dict, cast, Optional, Tuple, Set
 from tinygrad.helpers import round_up, PROFILE, memsize_to_str
-from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQSignal, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, HCQArgsState
+from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQSignal, HCQBuffer, HWQueue, HCQArgsState
 from tinygrad.device import Buffer, BufferOptions, Compiled, Device
 from tinygrad.ops import Variable
 from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
@@ -33,10 +33,10 @@ class HCQGraph(MultiGraphRunner):
     # graph-related tasks. This synchronization uses a global timeline signal per device. Within the graph, the compute queue coordinates with
     # global operations and sets a kickoff signal. Any queue accessing a buffer from another device waits for this signal from the device’s
     # compute queue to ensure exclusive access. The compute queue signals the completion of the graph, synchronizing with the device's copy queue.
-    self.ji_schedule: Dict[int, Tuple[HCQCompiled, HWCommandQueue, List, List, HCQSignal, Optional[int]]] = {}
+    self.ji_schedule: Dict[int, Tuple[HCQCompiled, HWQueue, List, List, HCQSignal, Optional[int]]] = {}
 
-    self.comp_queues: Dict[HCQCompiled, HWComputeQueue] = {dev: dev.hw_compute_queue_t() for dev in self.devices}
-    self.copy_queues: Dict[HCQCompiled, HWCopyQueue] = {} # lazy allocation
+    self.comp_queues: Dict[HCQCompiled, HWQueue] = {dev: dev.hw_compute_queue_t() for dev in self.devices}
+    self.copy_queues: Dict[HCQCompiled, HWQueue] = {} # lazy allocation
 
     self.signals: Dict[Any, HCQSignal] = {**{dev: dev.signal_t(value=0) for dev in self.devices}, **{"CPU": self.devices[0].signal_t(value=0)}}
     self.kickoff_value: int = 0
@@ -44,9 +44,9 @@ class HCQGraph(MultiGraphRunner):
     self.prof_signals: List[HCQSignal] = [self.devices[0].signal_t() for i in range(len(self.jit_cache) * 2)] if PROFILE else []
     self.prof_records: List[Tuple[Tuple[int, bool], Tuple[int, bool], HCQCompiled, str, bool, List[int], Optional[Dict]]] = []
 
-    last_j: Dict[HWCommandQueue, Optional[int]] = collections.defaultdict(lambda: None)
-    queue_access: Dict[HWCommandQueue, Dict[HWCommandQueue, Optional[int]]] = collections.defaultdict(lambda: collections.defaultdict(lambda: None))
-    dev_access: Dict[HWCommandQueue, Set[HCQCompiled]] = collections.defaultdict(set)
+    last_j: Dict[HWQueue, Optional[int]] = collections.defaultdict(lambda: None)
+    queue_access: Dict[HWQueue, Dict[HWQueue, Optional[int]]] = collections.defaultdict(lambda: collections.defaultdict(lambda: None))
+    dev_access: Dict[HWQueue, Set[HCQCompiled]] = collections.defaultdict(set)
 
     for dev, queue in self.comp_queues.items(): dev_access[queue].add(dev)
 
@@ -101,7 +101,7 @@ class HCQGraph(MultiGraphRunner):
     # Build hardware queues.
     self.op_cmd_idx: Dict[int, Tuple[Any, int]] = {}
     self.copy_to_devs: Dict[HCQCompiled, Set[HCQCompiled]] = {dev: set() for dev in self.devices}
-    self.kickoff_wait_cmds: Dict[HWCommandQueue, List] = {q: list() for q in list(self.comp_queues.values()) + list(self.copy_queues.values())}
+    self.kickoff_wait_cmds: Dict[HWQueue, List] = {q: list() for q in list(self.comp_queues.values()) + list(self.copy_queues.values())}
 
     for dev in self.devices:
       self.comp_queues[dev].memory_barrier().wait(dev.timeline_signal, dev.timeline_value - 1) \
@@ -118,11 +118,11 @@ class HCQGraph(MultiGraphRunner):
 
       # Encode main commands based on ji type.
       if isinstance(ji.prg, CompiledRunner):
-        cast(HWComputeQueue, enqueue_queue).exec(ji.prg.clprg, self.ji_args[j], *ji.prg.p.launch_dims(var_vals))
+        enqueue_queue.exec(ji.prg.clprg, self.ji_args[j], *ji.prg.p.launch_dims(var_vals))
       elif isinstance(ji.prg, BufferXfer):
         dest, src = [cast(Buffer, x) for x in ji.bufs[0:2]]
         cast(HCQAllocator, Device[src.device].allocator).map(dest._buf)
-        cast(HWCopyQueue, enqueue_queue).copy(dest._buf.va_addr, src._buf.va_addr, dest.nbytes)
+        enqueue_queue.copy(dest._buf.va_addr, src._buf.va_addr, dest.nbytes)
         self.copy_to_devs[cast(HCQCompiled, Device[dest.device])].add(cast(HCQCompiled, Device[src.device]))
       self.op_cmd_idx[j] = (enqueue_queue, len(enqueue_queue) - 1)
 
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index 05587d2c20..4d2297a2c0 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -3,7 +3,7 @@ from typing import Tuple, List, Any, Optional
 import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal, sys
 assert sys.platform != 'win32'
 from dataclasses import dataclass
-from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, HCQSignal, HCQProgram
+from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram
 from tinygrad.device import BufferOptions
 from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address
 from tinygrad.renderer.cstyle import AMDRenderer
@@ -54,7 +54,7 @@ class AMDSignal(HCQSignal):
         kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
     raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!")
 
-class AMDComputeQueue(HWComputeQueue):
+class AMDComputeQueue(HWQueue):   # pylint: disable=abstract-method
   def __init__(self):
     self.cmd_idx_to_local_offset, self.cmd_idx_to_global_offset, self.cmd_idx_to_dispatch_packet = {}, {}, {}
     super().__init__()
@@ -184,7 +184,7 @@ class AMDComputeQueue(HWComputeQueue):
     dev.compute_queue.doorbell[0] = dev.compute_queue.put_value
 
 SDMA_MAX_COPY_SIZE = 0x400000
-class AMDCopyQueue(HWCopyQueue):
+class AMDCopyQueue(HWQueue):   # pylint: disable=abstract-method
   def __init__(self):
     self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {}
     super().__init__()
diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index fade7cba32..3037f5e4e3 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -3,7 +3,7 @@ import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, decima
 assert sys.platform != 'win32'
 from typing import Tuple, List, Any, cast, Union, Dict, Type, Optional
 from dataclasses import dataclass
-from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command
+from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, hcq_command
 from tinygrad.runtime.support.hcq import HCQArgsState, HCQProgram, HCQSignal
 from tinygrad.device import BufferOptions
 from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
@@ -83,7 +83,7 @@ class NVSignal(HCQSignal):
   def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(1000)
   def _set_value(self, new_value:int): self._signal[0] = new_value
 
-class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method
+class NVCommandQueue(HWQueue): # pylint: disable=abstract-method
   def __del__(self):
     if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True))
 
@@ -132,7 +132,7 @@ class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method
     dev.gpu_mmio[0x90 // 4] = gpfifo.token
     gpfifo.put_value += 1
 
-class NVComputeQueue(NVCommandQueue, HWComputeQueue):
+class NVComputeQueue(NVCommandQueue, HWQueue):   # pylint: disable=abstract-method
   def __init__(self):
     self.cmd_idx_to_qmd, self.cmd_idx_to_signal_id, self.cmd_idx_to_global_dims, self.cmd_idx_to_local_dims = {}, {}, {}, {}
     super().__init__()
@@ -187,7 +187,7 @@ class NVComputeQueue(NVCommandQueue, HWComputeQueue):
 
   def _submit(self, dev): self._submit_to_gpfifo(dev, cast(NVDevice, dev).compute_gpfifo)
 
-class NVCopyQueue(NVCommandQueue, HWCopyQueue):
+class NVCopyQueue(NVCommandQueue, HWQueue):   # pylint: disable=abstract-method
   def _copy(self, dest, src, copy_size):
     self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *data64(src), *data64(dest)]
     self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py
index 3e1b15a6fd..d96b9b3d31 100644
--- a/tinygrad/runtime/ops_qcom.py
+++ b/tinygrad/runtime/ops_qcom.py
@@ -4,7 +4,7 @@ assert sys.platform != 'win32'
 from types import SimpleNamespace
 from typing import Tuple, List, Any, cast, Optional
 from tinygrad.device import BufferOptions
-from tinygrad.runtime.support.hcq import HCQBuffer, HWComputeQueue, HCQProgram, HCQCompiled, HCQSignal, HCQAllocator, HCQArgsState
+from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQSignal, HCQAllocator, HCQArgsState
 from tinygrad.runtime.autogen import kgsl, adreno, libc
 from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
 from tinygrad.renderer.cstyle import QCOMRenderer
@@ -44,7 +44,7 @@ class QCOMSignal(HCQSignal):
   def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(19.2) # based on the 19.2MHz always-on timer
   def _set_value(self, new_value:int): self._signal[0] = new_value
 
-class QCOMComputeQueue(HWComputeQueue):
+class QCOMComputeQueue(HWQueue):   # pylint: disable=abstract-method
   def __init__(self):
     self.cmd_idx_to_dims = {}
     super().__init__()
diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py
index a671a34106..5514849115 100644
--- a/tinygrad/runtime/support/hcq.py
+++ b/tinygrad/runtime/support/hcq.py
@@ -17,7 +17,7 @@ def hcq_command(func: Callable[..., None]) -> Callable[..., Any]:
       def command_method(self, ...): ...
     ```
   """
-  def __wrapper(self:HWCommandQueue, *args, **kwargs):
+  def __wrapper(self:HWQueue, *args, **kwargs):
     self.cmds_offset.append(len(self.q))
     func(self, *args, **kwargs)
     self.cmds_len.append(len(self.q) - self.cmds_offset[-1])
@@ -30,7 +30,7 @@ DeviceType = TypeVar('DeviceType', bound='HCQCompiled')
 ProgramType = TypeVar('ProgramType', bound='HCQProgram')
 ArgsStateType = TypeVar('ArgsStateType', bound='HCQArgsState')
 
-class HWCommandQueue(Generic[SignalType, DeviceType]):
+class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
   """
   A base class for hardware command queues in the HCQ (Hardware Command Queue) API.
   Both compute and copy queues should have the following commands implemented.
@@ -136,11 +136,12 @@ class HWCommandQueue(Generic[SignalType, DeviceType]):
     return self
   def _submit(self, dev:DeviceType): raise NotImplementedError("backend should overload this function")
 
-class HWComputeQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
+  # *** commands for compute queues ***
+
   @hcq_command
   def memory_barrier(self):
     """
-    Enqueues a memory barrier command to ensure memory coherence between agents.
+    Enqueues a memory barrier command to ensure memory coherence between agents. Only on compute queues.
     """
     self._memory_barrier()
   def _memory_barrier(self): pass
@@ -148,7 +149,7 @@ class HWComputeQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType,
   @hcq_command
   def exec(self, prg:ProgramType, args_state:ArgsStateType, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int]):
     """
-    Enqueues an execution command for a kernel program.
+    Enqueues an execution command for a kernel program. Only on compute queues.
 
     Args:
       prg: The program to execute
@@ -162,7 +163,7 @@ class HWComputeQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType,
 
   def update_exec(self, cmd_idx:int, global_size:Optional[Tuple[int,int,int]]=None, local_size:Optional[Tuple[int,int,int]]=None):
     """
-    Updates a previously queued execution command.
+    Updates a previously queued execution command. Only on compute queues.
 
     Args:
       cmd_idx: Index of the execution command to update
@@ -174,11 +175,12 @@ class HWComputeQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType,
     return self
   def _update_exec(self, cmd_idx, global_size, local_size): raise NotImplementedError("backend should overload this function")
 
-class HWCopyQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType, DeviceType]):
+  # *** commands for copy queues ***
+
   @hcq_command
   def copy(self, dest:HCQBuffer, src:HCQBuffer, copy_size:int):
     """
-    Enqueues a copy command to transfer data.
+    Enqueues a copy command to transfer data. Only on copy queues.
 
     Args:
       dest: The destination of the copy
@@ -190,7 +192,7 @@ class HWCopyQueue(HWCommandQueue[SignalType, DeviceType], Generic[SignalType, De
 
   def update_copy(self, cmd_idx:int, dest:Optional[HCQBuffer]=None, src:Optional[HCQBuffer]=None):
     """
-    Updates a previously queued copy command.
+    Updates a previously queued copy command. Only on copy queues.
 
     Args:
       cmd_idx: Index of the copy command to update
@@ -355,7 +357,7 @@ class HCQCompiled(Compiled):
   gpu2cpu_compute_time_diff: decimal.Decimal = decimal.Decimal('nan')
 
   def __init__(self, device:str, allocator:HCQAllocator, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[HCQSignal],
-               comp_queue_t:Type[HWComputeQueue], copy_queue_t:Optional[Type[HWCopyQueue]]):
+               comp_queue_t:Type[HWQueue], copy_queue_t:Optional[Type[HWQueue]]):
     self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
     self.timeline_value:int = 1
     self.timeline_signal, self._shadow_timeline_signal = self.signal_t(0, is_timeline=True), self.signal_t(0, is_timeline=True)
@@ -393,7 +395,7 @@ class HCQCompiled(Compiled):
   def _ensure_shared_time_base(self):
     if not self.gpu2cpu_compute_time_diff.is_nan(): return
 
-    def _sync_cpu_queue(d:HCQCompiled, q_t:Type[HWCommandQueue]):
+    def _sync_cpu_queue(d:HCQCompiled, q_t:Type[HWQueue]):
       q_t().timestamp(d.timeline_signal).signal(d.timeline_signal, d.timeline_value).submit(d)
       d.timeline_value += 1
       st = time.perf_counter_ns()
@@ -411,7 +413,7 @@ class HCQCompiled(Compiled):
       if q == d.hw_compute_queue_t: d.gpu2cpu_compute_time_diff = statistics.median(l)
       if q == d.hw_copy_queue_t: d.gpu2cpu_copy_time_diff = statistics.median(l)
 
-    def _sync_gpu_to_gpu_queue(d1:HCQCompiled, d2:HCQCompiled, q1_t:Type[HWCommandQueue], q2_t:Type[HWCommandQueue]):
+    def _sync_gpu_to_gpu_queue(d1:HCQCompiled, d2:HCQCompiled, q1_t:Type[HWQueue], q2_t:Type[HWQueue]):
       q1_t().signal(d1.timeline_signal, d1.timeline_value).wait(d2.timeline_signal, d2.timeline_value) \
             .timestamp(d1.timeline_signal).signal(d1.timeline_signal, d1.timeline_value+1).submit(d1)
       q2_t().signal(d2.timeline_signal, d2.timeline_value).wait(d1.timeline_signal, d1.timeline_value) \
@@ -473,7 +475,7 @@ class HCQAllocator(LRUAllocator, Generic[DeviceType]): # pylint: disable=abstrac
   """
   A base allocator class compatible with the HCQ (Hardware Command Queue) API.
 
-  This class implements basic copy operations following the HCQ API, utilizing both `HWComputeQueue` and `HWCopyQueue`.
+  This class implements basic copy operations following the HCQ API, utilizing both compute and copy `HWQueue`s.
   """
 
   def __init__(self, dev:DeviceType, batch_size:int=(2 << 20), batch_cnt:int=32):