diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index a286a2d57a..efc43ab136 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -386,9 +386,6 @@ class AMDCopyQueue(HWQueue): self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)] def _submit(self, dev:AMDDevice): - # usb devices run in single-step mode, so they can't overrun the queue. - if not dev.is_usb() and dev.sdma_queue.put_value - dev.sdma_queue.read_ptr > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun") - if self.binded_device == dev: # An IB packet must end on a 8 DW boundary. add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8 @@ -404,7 +401,12 @@ class AMDCopyQueue(HWQueue): tail_blit_dword += cmdsz # Force align of submits to hit our usb layer write cache. - if dev.is_usb() and (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0: tail_blit_dword = 0 + if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0 and dev.is_usb(): tail_blit_dword = 0 + + # USB devices run in single-step mode, so they can't overrun the queue. + total_bytes = (tail_blit_dword * 4 if rem_packet_cnt == 0 else -dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) + rem_packet_cnt * 4 + assert total_bytes < dev.sdma_queue.ring.nbytes, "SDMA queue overrun" + while not dev.is_usb() and dev.sdma_queue.put_value + total_bytes - dev.sdma_queue.read_ptr > dev.sdma_queue.ring.nbytes: pass start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4 dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword]) @@ -718,6 +720,8 @@ class USBIface(PCIIface): return AMDQueueDesc(ring=ring.cpu_view().view(fmt='I'), doorbells=[self.dev_impl.doorbell64.view(doorbell_index * 8, 8, fmt='Q')], read_ptrs=[gart.cpu_view().view(size=8, fmt='Q')], write_ptrs=[gart.cpu_view().view(offset=0x10, size=8, fmt='Q')]) + def sleep(self, timeout): pass + class AMDDevice(HCQCompiled): devices: ClassVar[list[HCQCompiled]] = [] signal_pages: ClassVar[list[HCQBuffer]] = [] @@ -771,11 +775,11 @@ class AMDDevice(HCQCompiled): nbio_pad = (0,) if self.target[0] == 9 else () self.nbio = AMDIP(nbio_name, self.iface.ip_versions[am.NBIF_HWIP], nbio_pad+self.iface.ip_offsets[am.NBIF_HWIP]) - self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x2000 if self.is_usb() else 0x800000, eop_buffer_size=0x1000, + self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x2000 if self.is_usb() else (16 << 20), eop_buffer_size=0x1000, ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size) max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000 - self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else 0x800000) + self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20)) super().__init__(device, AMDAllocator(self), AMDLLVMRenderer(self.arch) if getenv("AMD_LLVM", 1) else AMDRenderer(self.arch), AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 1) else HIPCompiler(self.arch), functools.partial(AMDProgram, self),