hcq: move synchronize out to the base class (#5634)

2026-01-22 21:38:10 -05:00 · 2024-07-22 20:36:04 +03:00
parent 26fc4610a0
commit ee633c1988
3 changed files with 6 additions and 15 deletions
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -443,6 +443,12 @@ class HCQCompiled(Compiled):
    self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferOptions(cpu_access=True))
    self.kernargs_ptr:int = self.kernargs_page.va_addr

+  def synchronize(self):
+    self.timeline_signal.wait(self.timeline_value - 1)
+
+    if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
+    if PROFILE: self._prof_process_events()
+
  def _gpu2cpu_time(self, gpu_time:float, is_copy:bool) -> float:
    """
    Translates local gpu time (timestamp) into global cpu time.
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -334,7 +334,6 @@ class AMDProgram(HCQProgram):
      q.exec(self, kernargs_ptr, global_size, local_size)

    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-
    self.device.timeline_value += 1

    if wait:
@@ -471,12 +470,6 @@ class AMDDevice(HCQCompiled):
                        read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
                        doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))

-  def synchronize(self):
-    self.timeline_signal.wait(self.timeline_value - 1)
-
-    if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
-    if PROFILE: self._prof_process_events()
-
  def invalidate_cache(self):
    AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.timeline_value).submit(self)
    self.timeline_value += 1
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -314,7 +314,6 @@ class NVProgram(HCQProgram):
      q.exec(self, kernargs_ptr, global_size, local_size)

    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-
    self.device.timeline_value += 1

    if wait:
@@ -526,13 +525,6 @@ class NVDevice(HCQCompiled):
    self._setup_gpfifos()
    NVDevice.devices.append(self)

-  def synchronize(self):
-    self.timeline_signal.wait(self.timeline_value - 1)
-    self.cmdq_wptr = 0
-
-    if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
-    if PROFILE: self._prof_process_events()
-
  def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400) -> GPFifo:
    notifier = self._gpu_system_alloc(48 << 20)
    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,