diff --git a/docs/developer/hcq.md b/docs/developer/hcq.md index 982825b652..4429da82b1 100644 --- a/docs/developer/hcq.md +++ b/docs/developer/hcq.md @@ -134,6 +134,8 @@ Backends must adhere to the `HCQBuffer` protocol when returning allocation resul members: true show_source: false +**Lifetime**: The `HCQArgsState` is passed to `HWComputeQueue.exec` and is guaranteed not to be freed until `HWComputeQueue.submit` for the same queue is called. + ### Synchronization HCQ-compatible devices use a global timeline signal for synchronizing all operations. This mechanism ensures proper ordering and completion of tasks across the device. By convention, `self.timeline_value` points to the next value to signal. So, to wait for all previous operations on the device to complete, wait for `self.timeline_value - 1` value. The following Python code demonstrates the typical usage of signals to synchronize execution to other operations on the device: diff --git a/tinygrad/device.py b/tinygrad/device.py index dfaa7cc7c7..27d36cfec1 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -468,10 +468,11 @@ class HCQProgram: Execution time of the kernel if 'wait' is True, otherwise None. """ + kernargs = self.fill_kernargs(bufs, vals) q = self.device.hw_compute_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier() with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en): - q.exec(self, self.fill_kernargs(bufs, vals), global_size, local_size) + q.exec(self, kernargs, global_size, local_size) q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device) self.device.timeline_value += 1