amd: sdma queue overrun fix (#11012)

* amd: sdma queue overrun fix

* add ()

* fix

* bug

* this is correct
This commit is contained in:
nimlgen
2025-06-28 01:42:03 +03:00
committed by GitHub
parent f2548afeb5
commit e53673a0b2

View File

@@ -386,9 +386,6 @@ class AMDCopyQueue(HWQueue):
self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]
def _submit(self, dev:AMDDevice):
# usb devices run in single-step mode, so they can't overrun the queue.
if not dev.is_usb() and dev.sdma_queue.put_value - dev.sdma_queue.read_ptr > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
if self.binded_device == dev:
# An IB packet must end on a 8 DW boundary.
add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8
@@ -404,7 +401,12 @@ class AMDCopyQueue(HWQueue):
tail_blit_dword += cmdsz
# Force align of submits to hit our usb layer write cache.
if dev.is_usb() and (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0: tail_blit_dword = 0
if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0 and dev.is_usb(): tail_blit_dword = 0
# USB devices run in single-step mode, so they can't overrun the queue.
total_bytes = (tail_blit_dword * 4 if rem_packet_cnt == 0 else -dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) + rem_packet_cnt * 4
assert total_bytes < dev.sdma_queue.ring.nbytes, "SDMA queue overrun"
while not dev.is_usb() and dev.sdma_queue.put_value + total_bytes - dev.sdma_queue.read_ptr > dev.sdma_queue.ring.nbytes: pass
start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
@@ -718,6 +720,8 @@ class USBIface(PCIIface):
return AMDQueueDesc(ring=ring.cpu_view().view(fmt='I'), doorbells=[self.dev_impl.doorbell64.view(doorbell_index * 8, 8, fmt='Q')],
read_ptrs=[gart.cpu_view().view(size=8, fmt='Q')], write_ptrs=[gart.cpu_view().view(offset=0x10, size=8, fmt='Q')])
def sleep(self, timeout): pass
class AMDDevice(HCQCompiled):
devices: ClassVar[list[HCQCompiled]] = []
signal_pages: ClassVar[list[HCQBuffer]] = []
@@ -771,11 +775,11 @@ class AMDDevice(HCQCompiled):
nbio_pad = (0,) if self.target[0] == 9 else ()
self.nbio = AMDIP(nbio_name, self.iface.ip_versions[am.NBIF_HWIP], nbio_pad+self.iface.ip_offsets[am.NBIF_HWIP])
self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x2000 if self.is_usb() else 0x800000, eop_buffer_size=0x1000,
self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x2000 if self.is_usb() else (16 << 20), eop_buffer_size=0x1000,
ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else 0x800000)
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20))
super().__init__(device, AMDAllocator(self), AMDLLVMRenderer(self.arch) if getenv("AMD_LLVM", 1) else AMDRenderer(self.arch),
AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 1) else HIPCompiler(self.arch), functools.partial(AMDProgram, self),