diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index afa4802a80..102657e1d2 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -820,8 +820,8 @@ class USBIface(PCIIface): self.usb._pci_cacheable += [self.bars[2]] # doorbell region is cacheable # special regions - self.copy_bufs = [self._new_dma_region(ctrl_addr=0xf000, sys_addr=0x200000, size=0x4000)] - self.sys_buf, self.sys_next_off = self._new_dma_region(ctrl_addr=0xa000, sys_addr=0x820000, size=0x1000), 0 + self.copy_bufs = [self._new_dma_region(ctrl_addr=0xf000, sys_addr=0x200000, size=0x80000)] + self.sys_buf, self.sys_next_off = self._new_dma_region(ctrl_addr=0xa000, sys_addr=0x820000, size=0x1000), 0x800 def _new_dma_region(self, ctrl_addr, sys_addr, size): region = self.adev.mm.map_range(vaddr:=self.adev.mm.alloc_vaddr(size=size), size, [(sys_addr, size)], system=True, snooped=False, uncached=True) diff --git a/tinygrad/runtime/support/usb.py b/tinygrad/runtime/support/usb.py index d99d1e3334..43fd0674bc 100644 --- a/tinygrad/runtime/support/usb.py +++ b/tinygrad/runtime/support/usb.py @@ -119,8 +119,8 @@ class ASM24Controller: self._pci_cache: dict[int, int|None] = {} # Init controller. - self.exec_ops([WriteOp(0x54b, b' '), WriteOp(0x5a8, b'\x02'), WriteOp(0x5f8, b'\x04'), WriteOp(0x7ec, b'\x01\x00\x00\x00'), - WriteOp(0xc422, b'\x02'), WriteOp(0x0, b'\x33')]) + self.exec_ops([WriteOp(0x54b, b' '), WriteOp(0x54e, b'\x04'), WriteOp(0x5a8, b'\x02'), WriteOp(0x5f8, b'\x04'), + WriteOp(0x7ec, b'\x01\x00\x00\x00'), WriteOp(0xc422, b'\x02'), WriteOp(0x0, b'\x33')]) def exec_ops(self, ops:Sequence[WriteOp|ReadOp|ScsiWriteOp]): cdbs:list[bytes] = [] @@ -152,7 +152,14 @@ class ASM24Controller: def write(self, base_addr:int, data:bytes, ignore_cache:bool=True): return self.exec_ops([WriteOp(base_addr, data, ignore_cache)]) def scsi_write(self, buf:bytes, lba:int=0): - self.exec_ops([ScsiWriteOp(buf, lba), WriteOp(0x171, b'\xff\xff\xff', ignore_cache=True), WriteOp(0xce6e, b'\x00\x00', ignore_cache=True)]) + if len(buf) > 0x4000: buf += b'\x00' * (round_up(len(buf), 0x10000) - len(buf)) + + for i in range(0, len(buf), 0x10000): + self.exec_ops([ScsiWriteOp(buf[i:i+0x10000], lba), WriteOp(0x171, b'\xff\xff\xff', ignore_cache=True)]) + self.exec_ops([WriteOp(0xce6e, b'\x00\x00', ignore_cache=True)]) + + if len(buf) > 0x4000: + for i in range(4): self.exec_ops([WriteOp(0xce40 + i, b'\x00', ignore_cache=True)]) def read(self, base_addr:int, length:int, stride:int=0xff) -> bytes: parts = self.exec_ops([ReadOp(base_addr + off, min(stride, length - off)) for off in range(0, length, stride)]) @@ -253,7 +260,7 @@ class USBMMIOInterface(MMIOInterface): if not self.pcimem: # Fast path for writing into buffer 0xf000 - use_cache = 0xa000 <= self.addr <= 0xb200 + use_cache = 0xa800 <= self.addr <= 0xb000 return self.usb.scsi_write(bytes(data)) if self.addr == 0xf000 else self.usb.write(self.addr + off, bytes(data), ignore_cache=not use_cache) _, acc_sz = self._acc_size(len(data) * struct.calcsize(self.fmt))