mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
usbgpu: move queues to controller (#10194)
This commit is contained in:
@@ -401,6 +401,9 @@ class AMDCopyQueue(HWQueue):
|
||||
if (tail_blit_dword + cmdsz) * 4 >= dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes: break
|
||||
tail_blit_dword += cmdsz
|
||||
|
||||
# Force align of submits to hit our usb layer write cache.
|
||||
if dev.is_usb() and (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0: tail_blit_dword = 0
|
||||
|
||||
start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
|
||||
dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
|
||||
dev.sdma_queue.put_value += tail_blit_dword * 4
|
||||
@@ -502,7 +505,7 @@ class AMDQueueDesc:
|
||||
if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
|
||||
|
||||
# Flush hdp if queue is in dev mem.
|
||||
if dev.is_am() and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp()
|
||||
if dev.is_am() and not dev.is_usb() and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp()
|
||||
for doorbell in self.doorbells: doorbell[0] = self.put_value
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -814,12 +817,19 @@ class USBIface(PCIIface):
|
||||
USBMMIOInterface(self.usb, *self.bars[5], fmt='I'))
|
||||
|
||||
# special regions
|
||||
copy_vaddr = self.adev.mm.alloc_vaddr(size=0x1000, align=0x1000)
|
||||
self.copy_region = self.adev.mm.map_range(copy_vaddr, 0x1000, [(0x200000, 0x1000)], system=True, snooped=False, uncached=True)
|
||||
self.copy_bufs = [HCQBuffer(copy_vaddr, 0x1000, meta=AMAllocationMeta(self.dev, [self.dev], self.copy_region, has_cpu_mapping=False),
|
||||
view=USBMMIOInterface(self.usb, 0xf000, 0x1000, fmt='B', pcimem=False))]
|
||||
self.copy_bufs = [self._new_dma_region(ctrl_addr=0xf000, sys_addr=0x200000, size=0x1000)]
|
||||
self.sys_buf, self.sys_next_off = self._new_dma_region(ctrl_addr=0xa000, sys_addr=0x820000, size=0x1000), 0
|
||||
|
||||
def _new_dma_region(self, ctrl_addr, sys_addr, size):
|
||||
region = self.adev.mm.map_range(vaddr:=self.adev.mm.alloc_vaddr(size=size), size, [(sys_addr, size)], system=True, snooped=False, uncached=True)
|
||||
return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], region, has_cpu_mapping=False),
|
||||
view=USBMMIOInterface(self.usb, ctrl_addr, size, fmt='B', pcimem=False))
|
||||
|
||||
def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
|
||||
if (host or (uncached and cpu_access)) and self.sys_next_off + size < self.sys_buf.size:
|
||||
self.sys_next_off += size
|
||||
return self.sys_buf.offset(self.sys_next_off - size, size)
|
||||
|
||||
am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)
|
||||
return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=False),
|
||||
view=USBMMIOInterface(self.usb, self.bars[0][0] + am_mapping.paddrs[0][0], size, fmt='B') if cpu_access else None)
|
||||
@@ -832,10 +842,8 @@ class USBIface(PCIIface):
|
||||
self.adev.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
|
||||
eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)
|
||||
|
||||
return AMDQueueDesc(ring=self.adev.vram.view(ring.meta.mapping.paddrs[0][0], 0x8000, fmt='I'),
|
||||
doorbells=[self.adev.doorbell64.view(doorbell_index * 8, 8, fmt='Q')],
|
||||
read_ptrs=[self.adev.vram.view(gart.meta.mapping.paddrs[0][0], 8, fmt='Q')],
|
||||
write_ptrs=[self.adev.vram.view(gart.meta.mapping.paddrs[0][0]+0x10, 8, fmt='Q')])
|
||||
return AMDQueueDesc(ring=ring.cpu_view().view(fmt='I'), doorbells=[self.adev.doorbell64.view(doorbell_index * 8, 8, fmt='Q')],
|
||||
read_ptrs=[gart.cpu_view().view(size=8, fmt='Q')], write_ptrs=[gart.cpu_view().view(offset=0x10, size=8, fmt='Q')])
|
||||
|
||||
class AMDDevice(HCQCompiled):
|
||||
devices: ClassVar[list[HCQCompiled]] = []
|
||||
@@ -892,12 +900,12 @@ class AMDDevice(HCQCompiled):
|
||||
ctx_save_restore_size=0 if self.is_am() else wg_data_size + ctl_stack_size, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
|
||||
|
||||
max_copy_size = 0x40000000 if self.dev_iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
|
||||
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x8000 if self.is_usb() else 0x800000)
|
||||
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else 0x800000)
|
||||
|
||||
super().__init__(device, AMDAllocator(self), AMDLLVMRenderer(self.arch) if getenv("AMD_LLVM", 0) else AMDRenderer(self.arch),
|
||||
AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 0) else HIPCompiler(self.arch), functools.partial(AMDProgram, self),
|
||||
AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size),
|
||||
kernargs_size=(8 << 10) if self.is_usb() else (16 << 20))
|
||||
kernargs_size=(8 << 10) if self.is_usb() else (16 << 20), sigalloc_size=0x100 if self.is_usb() else 0x1000)
|
||||
|
||||
# Scratch setup
|
||||
self.max_private_segment_size = 0
|
||||
@@ -927,7 +935,7 @@ class AMDDevice(HCQCompiled):
|
||||
|
||||
def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
|
||||
ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
|
||||
gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
|
||||
gart = self.dev_iface.alloc(0x100, uncached=True, cpu_access=True)
|
||||
|
||||
cwsr_buffer_size = round_up((ctx_save_restore_size + debug_memory_size) * self.dev_iface.props.get('num_xcc', 1), mmap.PAGESIZE)
|
||||
cwsr_buffer = self.dev_iface.alloc(cwsr_buffer_size) if ctx_save_restore_size else None
|
||||
|
||||
Reference in New Issue
Block a user