mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-23 13:58:00 -05:00
amd: sdma queue overrun fix (#11012)
* amd: sdma queue overrun fix * add () * fix * bug * this is correct
This commit is contained in:
@@ -386,9 +386,6 @@ class AMDCopyQueue(HWQueue):
|
||||
self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]
|
||||
|
||||
def _submit(self, dev:AMDDevice):
|
||||
# usb devices run in single-step mode, so they can't overrun the queue.
|
||||
if not dev.is_usb() and dev.sdma_queue.put_value - dev.sdma_queue.read_ptr > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
|
||||
|
||||
if self.binded_device == dev:
|
||||
# An IB packet must end on a 8 DW boundary.
|
||||
add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8
|
||||
@@ -404,7 +401,12 @@ class AMDCopyQueue(HWQueue):
|
||||
tail_blit_dword += cmdsz
|
||||
|
||||
# Force align of submits to hit our usb layer write cache.
|
||||
if dev.is_usb() and (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0: tail_blit_dword = 0
|
||||
if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0 and dev.is_usb(): tail_blit_dword = 0
|
||||
|
||||
# USB devices run in single-step mode, so they can't overrun the queue.
|
||||
total_bytes = (tail_blit_dword * 4 if rem_packet_cnt == 0 else -dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) + rem_packet_cnt * 4
|
||||
assert total_bytes < dev.sdma_queue.ring.nbytes, "SDMA queue overrun"
|
||||
while not dev.is_usb() and dev.sdma_queue.put_value + total_bytes - dev.sdma_queue.read_ptr > dev.sdma_queue.ring.nbytes: pass
|
||||
|
||||
start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
|
||||
dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
|
||||
@@ -718,6 +720,8 @@ class USBIface(PCIIface):
|
||||
return AMDQueueDesc(ring=ring.cpu_view().view(fmt='I'), doorbells=[self.dev_impl.doorbell64.view(doorbell_index * 8, 8, fmt='Q')],
|
||||
read_ptrs=[gart.cpu_view().view(size=8, fmt='Q')], write_ptrs=[gart.cpu_view().view(offset=0x10, size=8, fmt='Q')])
|
||||
|
||||
def sleep(self, timeout): pass
|
||||
|
||||
class AMDDevice(HCQCompiled):
|
||||
devices: ClassVar[list[HCQCompiled]] = []
|
||||
signal_pages: ClassVar[list[HCQBuffer]] = []
|
||||
@@ -771,11 +775,11 @@ class AMDDevice(HCQCompiled):
|
||||
nbio_pad = (0,) if self.target[0] == 9 else ()
|
||||
self.nbio = AMDIP(nbio_name, self.iface.ip_versions[am.NBIF_HWIP], nbio_pad+self.iface.ip_offsets[am.NBIF_HWIP])
|
||||
|
||||
self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x2000 if self.is_usb() else 0x800000, eop_buffer_size=0x1000,
|
||||
self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x2000 if self.is_usb() else (16 << 20), eop_buffer_size=0x1000,
|
||||
ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
|
||||
|
||||
max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
|
||||
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else 0x800000)
|
||||
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20))
|
||||
|
||||
super().__init__(device, AMDAllocator(self), AMDLLVMRenderer(self.arch) if getenv("AMD_LLVM", 1) else AMDRenderer(self.arch),
|
||||
AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 1) else HIPCompiler(self.arch), functools.partial(AMDProgram, self),
|
||||
|
||||
Reference in New Issue
Block a user