diff --git a/tinygrad/device.py b/tinygrad/device.py
index 9c72669d3d..02ac91fcf4 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -443,6 +443,19 @@ class HCQCompiled(Compiled):
     self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferOptions(cpu_access=True))
     self.kernargs_ptr:int = self.kernargs_page.va_addr
 
+  def synchronize(self):
+    """
+    Waits on the device timeline signal for the last submitted value, then runs post-wait housekeeping.
+
+    Submitters signal `timeline_value` and then increment it, so the last value in flight is `timeline_value - 1`.
+    """
+    self.timeline_signal.wait(self.timeline_value - 1)
+
+    # once timeline_value passes 1 << 31, hand off to _wrap_timeline_signal
+    if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
+    # profiling events are only processed when PROFILE is set
+    if PROFILE: self._prof_process_events()
+
   def _gpu2cpu_time(self, gpu_time:float, is_copy:bool) -> float:
     """
     Translates local gpu time (timestamp) into global cpu time.
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index dbe5848118..1c4d2fd4ed 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -334,7 +334,6 @@ class AMDProgram(HCQProgram):
       q.exec(self, kernargs_ptr, global_size, local_size)
 
     q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-
     self.device.timeline_value += 1
 
     if wait:
@@ -471,12 +470,6 @@ class AMDDevice(HCQCompiled):
                         read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
                         doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
 
-  def synchronize(self):
-    self.timeline_signal.wait(self.timeline_value - 1)
-
-    if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
-    if PROFILE: self._prof_process_events()
-
   def invalidate_cache(self):
     AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.timeline_value).submit(self)
     self.timeline_value += 1
diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index 8179c22f08..d7e9dd2783 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -314,7 +314,6 @@ class NVProgram(HCQProgram):
       q.exec(self, kernargs_ptr, global_size, local_size)
 
     q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
-
     self.device.timeline_value += 1
 
     if wait:
@@ -526,13 +525,6 @@ class NVDevice(HCQCompiled):
     self._setup_gpfifos()
     NVDevice.devices.append(self)
 
-  def synchronize(self):
-    self.timeline_signal.wait(self.timeline_value - 1)
-    self.cmdq_wptr = 0
-
-    if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
-    if PROFILE: self._prof_process_events()
-
   def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400) -> GPFifo:
     notifier = self._gpu_system_alloc(48 << 20)
     params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,