usbgpu: move queues to controller (#10194)

This commit is contained in:
nimlgen
2025-05-07 16:41:16 +03:00
committed by GitHub
parent 0fbe494c6b
commit c603b86d69

View File

@@ -401,6 +401,9 @@ class AMDCopyQueue(HWQueue):
if (tail_blit_dword + cmdsz) * 4 >= dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes: break
tail_blit_dword += cmdsz
# Force align of submits to hit our usb layer write cache.
if dev.is_usb() and (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0: tail_blit_dword = 0
start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
dev.sdma_queue.put_value += tail_blit_dword * 4
@@ -502,7 +505,7 @@ class AMDQueueDesc:
if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
# Flush hdp if queue is in dev mem.
if dev.is_am() and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp()
if dev.is_am() and not dev.is_usb() and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp()
for doorbell in self.doorbells: doorbell[0] = self.put_value
@dataclass(frozen=True)
@@ -814,12 +817,19 @@ class USBIface(PCIIface):
USBMMIOInterface(self.usb, *self.bars[5], fmt='I'))
# special regions
copy_vaddr = self.adev.mm.alloc_vaddr(size=0x1000, align=0x1000)
self.copy_region = self.adev.mm.map_range(copy_vaddr, 0x1000, [(0x200000, 0x1000)], system=True, snooped=False, uncached=True)
self.copy_bufs = [HCQBuffer(copy_vaddr, 0x1000, meta=AMAllocationMeta(self.dev, [self.dev], self.copy_region, has_cpu_mapping=False),
view=USBMMIOInterface(self.usb, 0xf000, 0x1000, fmt='B', pcimem=False))]
self.copy_bufs = [self._new_dma_region(ctrl_addr=0xf000, sys_addr=0x200000, size=0x1000)]
self.sys_buf, self.sys_next_off = self._new_dma_region(ctrl_addr=0xa000, sys_addr=0x820000, size=0x1000), 0
def _new_dma_region(self, ctrl_addr, sys_addr, size):
region = self.adev.mm.map_range(vaddr:=self.adev.mm.alloc_vaddr(size=size), size, [(sys_addr, size)], system=True, snooped=False, uncached=True)
return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], region, has_cpu_mapping=False),
view=USBMMIOInterface(self.usb, ctrl_addr, size, fmt='B', pcimem=False))
def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
if (host or (uncached and cpu_access)) and self.sys_next_off + size < self.sys_buf.size:
self.sys_next_off += size
return self.sys_buf.offset(self.sys_next_off - size, size)
am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)
return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=False),
view=USBMMIOInterface(self.usb, self.bars[0][0] + am_mapping.paddrs[0][0], size, fmt='B') if cpu_access else None)
@@ -832,10 +842,8 @@ class USBIface(PCIIface):
self.adev.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)
return AMDQueueDesc(ring=self.adev.vram.view(ring.meta.mapping.paddrs[0][0], 0x8000, fmt='I'),
doorbells=[self.adev.doorbell64.view(doorbell_index * 8, 8, fmt='Q')],
read_ptrs=[self.adev.vram.view(gart.meta.mapping.paddrs[0][0], 8, fmt='Q')],
write_ptrs=[self.adev.vram.view(gart.meta.mapping.paddrs[0][0]+0x10, 8, fmt='Q')])
return AMDQueueDesc(ring=ring.cpu_view().view(fmt='I'), doorbells=[self.adev.doorbell64.view(doorbell_index * 8, 8, fmt='Q')],
read_ptrs=[gart.cpu_view().view(size=8, fmt='Q')], write_ptrs=[gart.cpu_view().view(offset=0x10, size=8, fmt='Q')])
class AMDDevice(HCQCompiled):
devices: ClassVar[list[HCQCompiled]] = []
@@ -892,12 +900,12 @@ class AMDDevice(HCQCompiled):
ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
max_copy_size = 0x40000000 if self.dev_iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x8000 if self.is_usb() else 0x800000)
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else 0x800000)
super().__init__(device, AMDAllocator(self), AMDLLVMRenderer(self.arch) if getenv("AMD_LLVM", 0) else AMDRenderer(self.arch),
AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 0) else HIPCompiler(self.arch), functools.partial(AMDProgram, self),
AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size),
kernargs_size=(8 << 10) if self.is_usb() else (16 << 20))
kernargs_size=(8 << 10) if self.is_usb() else (16 << 20), sigalloc_size=0x100 if self.is_usb() else 0x1000)
# Scratch setup
self.max_private_segment_size = 0
@@ -927,7 +935,7 @@ class AMDDevice(HCQCompiled):
def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
gart = self.dev_iface.alloc(0x100, uncached=True, cpu_access=True)
cwsr_buffer_size = round_up((ctx_save_restore_size + debug_memory_size) * self.dev_iface.props.get('num_xcc', 1), mmap.PAGESIZE)
cwsr_buffer = self.dev_iface.alloc(cwsr_buffer_size) if ctx_save_restore_size else None