diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index b6a4925795..1655493e16 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -925,7 +925,8 @@ class USBIface(PCIIface): return HCQBuffer(vaddr, size, meta=PCIAllocationMeta(region, has_cpu_mapping=False), view=self.pci_dev.dma_view(ctrl_addr, size), owner=self.dev) def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, force_devmem=False, **kwargs) -> HCQBuffer: - if (host or (uncached and cpu_access)) and self.sys_next_off + size < self.sys_buf.size: + # custom usb allocates uncached and cpu_access in vram. vram writes are faster than sram writes + if (host or (not self.pci_dev.usb.usb.is_custom and uncached and cpu_access)) and self.sys_next_off + size < self.sys_buf.size: self.sys_next_off += size return self.sys_buf.offset(self.sys_next_off - size, size) diff --git a/tinygrad/runtime/support/usb.py b/tinygrad/runtime/support/usb.py index 5a127fe936..8d933d9d92 100644 --- a/tinygrad/runtime/support/usb.py +++ b/tinygrad/runtime/support/usb.py @@ -175,8 +175,7 @@ class CustomASM24Controller: self._pci_cacheable: list[tuple[int, int]] = [] self._pci_cache: dict[int, int|None] = {} - # Pre-allocate buffers for _f0_out (12 bytes) and _f0_in (8 bytes) - self._f0_out_buf, self._f0_out_mv = alloc_cbuffer(12) + self._f0_out_buf, self._f0_out_mv = alloc_cbuffer(0x1000) # for f0 and e4, allocate big enough for e4 self._f0_in_buf, _ = alloc_cbuffer(8) # Verify custom firmware is running and PCIe link is up (LTSSM=0x78).