diff --git a/tinygrad/device.py b/tinygrad/device.py index 9c72669d3d..02ac91fcf4 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -443,6 +443,12 @@ class HCQCompiled(Compiled): self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferOptions(cpu_access=True)) self.kernargs_ptr:int = self.kernargs_page.va_addr + def synchronize(self): + self.timeline_signal.wait(self.timeline_value - 1) + + if self.timeline_value > (1 << 31): self._wrap_timeline_signal() + if PROFILE: self._prof_process_events() + def _gpu2cpu_time(self, gpu_time:float, is_copy:bool) -> float: """ Translates local gpu time (timestamp) into global cpu time. diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index dbe5848118..1c4d2fd4ed 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -334,7 +334,6 @@ class AMDProgram(HCQProgram): q.exec(self, kernargs_ptr, global_size, local_size) q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device) - self.device.timeline_value += 1 if wait: @@ -471,12 +470,6 @@ class AMDDevice(HCQCompiled): read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"), doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q")) - def synchronize(self): - self.timeline_signal.wait(self.timeline_value - 1) - - if self.timeline_value > (1 << 31): self._wrap_timeline_signal() - if PROFILE: self._prof_process_events() - def invalidate_cache(self): AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.timeline_value).submit(self) self.timeline_value += 1 diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 8179c22f08..d7e9dd2783 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -314,7 +314,6 @@ class NVProgram(HCQProgram): q.exec(self, kernargs_ptr, global_size, local_size) q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device) - self.device.timeline_value += 1 if wait: @@ -526,13 +525,6 @@ class NVDevice(HCQCompiled): self._setup_gpfifos() NVDevice.devices.append(self) - def synchronize(self): - self.timeline_signal.wait(self.timeline_value - 1) - self.cmdq_wptr = 0 - - if self.timeline_value > (1 << 31): self._wrap_timeline_signal() - if PROFILE: self._prof_process_events() - def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400) -> GPFifo: notifier = self._gpu_system_alloc(48 << 20) params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,