diff --git a/docs/developer/hcq.md b/docs/developer/hcq.md
index 982825b652..4429da82b1 100644
--- a/docs/developer/hcq.md
+++ b/docs/developer/hcq.md
@@ -134,6 +134,8 @@ Backends must adhere to the `HCQBuffer` protocol when returning allocation resul
         members: true
         show_source: false
 
+**Lifetime**: The `HCQArgsState` passed to `HWComputeQueue.exec` is guaranteed not to be freed until `HWComputeQueue.submit` is called on the same queue.
+
 ### Synchronization
 
 HCQ-compatible devices use a global timeline signal for synchronizing all operations. This mechanism ensures proper ordering and completion of tasks across the device. By convention, `self.timeline_value` points to the next value to signal. So, to wait for all previous operations on the device to complete, wait for `self.timeline_value - 1` value. The following Python code demonstrates the typical usage of signals to synchronize execution to other operations on the device:
diff --git a/tinygrad/device.py b/tinygrad/device.py
index dfaa7cc7c7..27d36cfec1 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -468,10 +468,11 @@ class HCQProgram:
       Execution time of the kernel if 'wait' is True, otherwise None.
     """
 
+    kernargs = self.fill_kernargs(bufs, vals)
     q = self.device.hw_compute_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
 
     with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
-      q.exec(self, self.fill_kernargs(bufs, vals), global_size, local_size)
+      q.exec(self, kernargs, global_size, local_size)
 
     q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
     self.device.timeline_value += 1