mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
usbgpu: rebar (#10275)
* usbgpu: rebar * cache back * revert this * fix * ugh * tt
This commit is contained in:
@@ -813,7 +813,7 @@ class USBIface(PCIIface):
|
||||
def __init__(self, dev, dev_id):
|
||||
self.dev = dev
|
||||
self.usb = ASM24Controller()
|
||||
self.bars = setup_pci_bars(self.usb, gpu_bus=4, mem_base=0x40000000, pref_mem_base=0x10000000)
|
||||
self.bars = setup_pci_bars(self.usb, gpu_bus=4, mem_base=0x10000000, pref_mem_base=(32 << 30))
|
||||
|
||||
self._setup_adev(f"usb:{dev_id}", USBMMIOInterface(self.usb, *self.bars[0], fmt='B'), USBMMIOInterface(self.usb, *self.bars[2], fmt='Q'),
|
||||
USBMMIOInterface(self.usb, *self.bars[5], fmt='I'))
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from __future__ import annotations
|
||||
import ctypes, collections, time, dataclasses, functools, fcntl, os, hashlib, array
|
||||
import ctypes, collections, time, dataclasses, functools, fcntl, os, hashlib
|
||||
from tinygrad.helpers import mv_address, getenv, round_up, DEBUG, temp, fetch
|
||||
from tinygrad.runtime.autogen.am import am, mp_11_0
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface
|
||||
@@ -350,15 +350,6 @@ class AMDev:
|
||||
if ((rval:=reg.read()) & mask) == value: return rval
|
||||
raise RuntimeError(f'wait_reg timeout reg=0x{reg.addr:X} mask=0x{mask:X} value=0x{value:X} last_val=0x{rval}')
|
||||
|
||||
def _read_vram(self, addr, size) -> bytes:
  """Read `size` bytes starting at `addr`, one word per register round-trip.

  Every 4-byte step issues two register writes (0x06, then 0x00) followed by one read of
  register 0x01. The values read are packed as unsigned C ints (array typecode 'I') in
  native byte order and returned as bytes.

  `addr` and `size` must both be multiples of 4; otherwise the assert below fires.
  """
  assert addr % 4 == 0 and size % 4 == 0, f"Invalid address {addr:#x} or size {size:#x}"

  def _fetch_word(word_addr):
    # Bits 31 and up of the address go to register 0x06; the low 31 bits go to register 0x00
    # with bit 31 forced on. The word is then read back from register 0x01.
    # NOTE(review): this looks like an index-high / index / data register triple - confirm
    # the register names against the hardware headers.
    self.wreg(0x06, word_addr >> 31)
    self.wreg(0x00, (word_addr & 0x7FFFFFFF) | 0x80000000)
    return self.rreg(0x01)

  words = [_fetch_word(word_addr) for word_addr in range(addr, addr + size, 4)]
  return array.array('I', words).tobytes()
|
||||
|
||||
def _run_discovery(self):
|
||||
# NOTE: Fixed register to query memory size without known ip bases to find the discovery table.
|
||||
# The table is located at the end of VRAM - 64KB and is 10KB in size.
|
||||
@@ -366,8 +357,7 @@ class AMDev:
|
||||
self.vram_size = self.rreg(mmRCC_CONFIG_MEMSIZE) << 20
|
||||
tmr_offset, tmr_size = self.vram_size - (64 << 10), (10 << 10)
|
||||
|
||||
disc_tbl = self._read_vram(tmr_offset, tmr_size) if self.vram.nbytes < self.vram_size else self.vram.view(tmr_offset, tmr_size)[:]
|
||||
self.bhdr = am.struct_binary_header.from_buffer(bytearray(disc_tbl))
|
||||
self.bhdr = am.struct_binary_header.from_buffer(bytearray(self.vram.view(tmr_offset, tmr_size)[:]))
|
||||
ihdr = am.struct_ip_discovery_header.from_address(ctypes.addressof(self.bhdr) + self.bhdr.table_list[am.IP_DISCOVERY].offset)
|
||||
assert ihdr.signature == am.DISCOVERY_TABLE_SIGNATURE and not ihdr.base_addr_64_bit, f"0x{ihdr.signature:X} != 0x{am.DISCOVERY_TABLE_SIGNATURE:X}"
|
||||
|
||||
|
||||
@@ -42,29 +42,46 @@ def setup_pci_bars(usb:ASM24Controller, gpu_bus:int, mem_base:int, pref_mem_base
|
||||
buses = (0 << 0) | ((bus+1) << 8) | ((gpu_bus) << 16)
|
||||
usb.pcie_cfg_req(pci.PCI_PRIMARY_BUS, bus=bus, dev=0, fn=0, value=buses, size=4)
|
||||
|
||||
usb.pcie_cfg_req(pci.PCI_MEMORY_BASE, bus=bus, dev=0, fn=0, value=mem_base>>16, size=2)
|
||||
usb.pcie_cfg_req(pci.PCI_MEMORY_LIMIT, bus=bus, dev=0, fn=0, value=0xf000, size=2)
|
||||
usb.pcie_cfg_req(pci.PCI_PREF_MEMORY_BASE, bus=bus, dev=0, fn=0, value=pref_mem_base>>16, size=2)
|
||||
usb.pcie_cfg_req(pci.PCI_PREF_MEMORY_LIMIT, bus=bus, dev=0, fn=0, value=mem_base>>16, size=2)
|
||||
usb.pcie_cfg_req(pci.PCI_MEMORY_BASE, bus=bus, dev=0, fn=0, value=(mem_base>>16) & 0xffff, size=2)
|
||||
usb.pcie_cfg_req(pci.PCI_MEMORY_LIMIT, bus=bus, dev=0, fn=0, value=0xffff, size=2)
|
||||
usb.pcie_cfg_req(pci.PCI_PREF_MEMORY_BASE, bus=bus, dev=0, fn=0, value=(pref_mem_base>>16) & 0xffff, size=2)
|
||||
usb.pcie_cfg_req(pci.PCI_PREF_MEMORY_LIMIT, bus=bus, dev=0, fn=0, value=0xffff, size=2)
|
||||
usb.pcie_cfg_req(pci.PCI_PREF_BASE_UPPER32, bus=bus, dev=0, fn=0, value=pref_mem_base >> 32, size=4)
|
||||
usb.pcie_cfg_req(pci.PCI_PREF_LIMIT_UPPER32, bus=bus, dev=0, fn=0, value=0xffffffff, size=4)
|
||||
|
||||
usb.pcie_cfg_req(pci.PCI_COMMAND, bus=bus, dev=0, fn=0, value=pci.PCI_COMMAND_IO | pci.PCI_COMMAND_MEMORY | pci.PCI_COMMAND_MASTER, size=1)
|
||||
|
||||
# resize bar 0
|
||||
cap_ptr = 0x100
|
||||
while cap_ptr:
|
||||
if pci.PCI_EXT_CAP_ID(hdr:=usb.pcie_cfg_req(cap_ptr, bus=gpu_bus, dev=0, fn=0, size=4)) == pci.PCI_EXT_CAP_ID_REBAR:
|
||||
cap = usb.pcie_cfg_req(cap_ptr + 0x04, bus=gpu_bus, dev=0, fn=0, size=4)
|
||||
new_ctrl = (usb.pcie_cfg_req(cap_ptr + 0x08, bus=gpu_bus, dev=0, fn=0, size=4) & ~0x1F00) | ((int(cap >> 4).bit_length() - 1) << 8)
|
||||
usb.pcie_cfg_req(cap_ptr + 0x08, bus=gpu_bus, dev=0, fn=0, value=new_ctrl, size=4)
|
||||
|
||||
cap_ptr = pci.PCI_EXT_CAP_NEXT(hdr)
|
||||
|
||||
mem_space_addr, bar_off, bars = [mem_base, pref_mem_base], 0, {}
|
||||
while bar_off < 24:
|
||||
cfg = usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off, bus=gpu_bus, dev=0, fn=0, size=4)
|
||||
bar_mem, bar_space = bool(cfg & pci.PCI_BASE_ADDRESS_MEM_PREFETCH), cfg & pci.PCI_BASE_ADDRESS_SPACE
|
||||
bar_mem, bar_64 = bool(cfg & pci.PCI_BASE_ADDRESS_MEM_PREFETCH), cfg & pci.PCI_BASE_ADDRESS_MEM_TYPE_64
|
||||
|
||||
if bar_space == pci.PCI_BASE_ADDRESS_SPACE_MEMORY:
|
||||
if (cfg & pci.PCI_BASE_ADDRESS_SPACE) == pci.PCI_BASE_ADDRESS_SPACE_MEMORY:
|
||||
usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off, bus=gpu_bus, dev=0, fn=0, value=0xffffffff, size=4)
|
||||
bar_size = 0xffffffff - (usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off, bus=gpu_bus, dev=0, fn=0, size=4) & 0xfffffff0) + 1
|
||||
lo = (usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off, bus=gpu_bus, dev=0, fn=0, size=4) & 0xfffffff0)
|
||||
|
||||
if bar_64: usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off + 4, bus=gpu_bus, dev=0, fn=0, value=0xffffffff, size=4)
|
||||
hi = (usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off + 4, bus=gpu_bus, dev=0, fn=0, size=4) if bar_64 else 0)
|
||||
|
||||
bar_size = ((~(((hi << 32) | lo) & ~0xf)) + 1) & (0xffffffffffffffff if bar_64 else 0xffffffff)
|
||||
|
||||
usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off, bus=gpu_bus, dev=0, fn=0, value=mem_space_addr[bar_mem] & 0xffffffff, size=4)
|
||||
if bar_64: usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off + 4, bus=gpu_bus, dev=0, fn=0, value=mem_space_addr[bar_mem] >> 32, size=4)
|
||||
|
||||
usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off, bus=gpu_bus, dev=0, fn=0, value=mem_space_addr[bar_mem], size=4)
|
||||
bars[bar_off // 4] = (mem_space_addr[bar_mem], bar_size)
|
||||
mem_space_addr[bar_mem] += round_up(bar_size, 2 << 20)
|
||||
|
||||
# 64bit bar, zero out the upper 32 bits
|
||||
if bar_space == pci.PCI_BASE_ADDRESS_MEM_TYPE_64: usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off + 4, bus=gpu_bus, dev=0, fn=0, value=0,size=4)
|
||||
bar_off += 8 if cfg & pci.PCI_BASE_ADDRESS_MEM_TYPE_64 else 4
|
||||
bar_off += 8 if bar_64 else 4
|
||||
|
||||
usb.pcie_cfg_req(pci.PCI_COMMAND, bus=gpu_bus, dev=0, fn=0, value=pci.PCI_COMMAND_IO | pci.PCI_COMMAND_MEMORY | pci.PCI_COMMAND_MASTER, size=1)
|
||||
return bars
|
||||
|
||||
@@ -167,17 +167,17 @@ class ASM24Controller:
|
||||
|
||||
def _is_pci_cacheable(self, addr:int) -> bool:
  """Return True if `addr` falls inside any region listed in `self._pci_cacheable`.

  Each entry is unpacked as a `(base, size)` pair and covers the half-open range
  `[base, base + size)`.
  """
  # The upper bound is exclusive: `base + size` is the first address past the region, and the
  # previous inclusive comparison wrongly counted it as part of the region.
  # NOTE(review): assumes the second tuple element is a length in bytes - confirm where
  # _pci_cacheable is populated.
  return any(base <= addr < base + size for base, size in self._pci_cacheable)
|
||||
def pcie_prep_request(self, fmt_type:int, address:int, value:int|None=None, size:int=4) -> list[WriteOp]:
|
||||
if fmt_type == 0x40 and size == 4 and self._is_pci_cacheable(address) and self._pci_cache.get(address) == value: return []
|
||||
if fmt_type == 0x60 and size == 4 and self._is_pci_cacheable(address) and self._pci_cache.get(address) == value: return []
|
||||
|
||||
assert fmt_type >> 8 == 0 and size > 0 and size <= 4, f"Invalid fmt_type {fmt_type} or size {size}"
|
||||
if DEBUG >= 3: print("pcie_request", hex(fmt_type), hex(address), value, size)
|
||||
|
||||
masked_address, offset = address & 0xFFFFFFFC, address & 0x3
|
||||
assert size + offset <= 4 and (value is None or value >> (8 * size) == 0)
|
||||
self._pci_cache[masked_address] = value if size == 4 and fmt_type == 0x40 else None
|
||||
self._pci_cache[masked_address] = value if size == 4 and fmt_type == 0x60 else None
|
||||
|
||||
return ([WriteOp(0xB220, struct.pack('>I', value << (8 * offset)), ignore_cache=False)] if value is not None else []) + \
|
||||
[WriteOp(0xB218, struct.pack('>I', masked_address), ignore_cache=False),
|
||||
[WriteOp(0xB218, struct.pack('>I', masked_address), ignore_cache=False), WriteOp(0xB21c, struct.pack('>I', address>>32), ignore_cache=False),
|
||||
WriteOp(0xB217, bytes([((1 << size) - 1) << offset]), ignore_cache=False), WriteOp(0xB210, bytes([fmt_type]), ignore_cache=False),
|
||||
WriteOp(0xB254, b"\x0f", ignore_cache=True), WriteOp(0xB296, b"\x04", ignore_cache=True)]
|
||||
|
||||
@@ -206,7 +206,7 @@ class ASM24Controller:
|
||||
|
||||
# Handle completion errors or inconsistencies
|
||||
if status or ((fmt_type & 0xbe == 0x04) and (((value is None) and (not (b284 & 0x01))) or ((value is not None) and (b284 & 0x01)))):
|
||||
status_map = {0b001: "Unsupported Request: invalid address/function (target might not be reachable)",
|
||||
status_map = {0b001: f"Unsupported Request: invalid address/function (target might not be reachable): {address:#x}",
|
||||
0b100: "Completer Abort: abort due to internal error", 0b010: "Configuration Request Retry Status: configuration space busy"}
|
||||
raise RuntimeError(f"TLP status: {status_map.get(status, 'Reserved (0b{:03b})'.format(status))}")
|
||||
|
||||
@@ -219,10 +219,10 @@ class ASM24Controller:
|
||||
address = (bus << 24) | (dev << 19) | (fn << 16) | (byte_addr & 0xfff)
|
||||
return self.pcie_request(fmt_type, address, value, size)
|
||||
|
||||
def pcie_mem_req(self, address, value=None, size=4): return self.pcie_request(0x40 if value is not None else 0x0, address, value, size)
|
||||
def pcie_mem_req(self, address, value=None, size=4): return self.pcie_request(0x60 if value is not None else 0x20, address, value, size)
|
||||
|
||||
def pcie_mem_write(self, address, values, size):
|
||||
ops = [self.pcie_prep_request(0x40, address + i * size, value, size) for i, value in enumerate(values)]
|
||||
ops = [self.pcie_prep_request(0x60, address + i * size, value, size) for i, value in enumerate(values)]
|
||||
|
||||
# Send in batches of 4
|
||||
for i in range(0, len(ops), 4): self.exec_ops(list(itertools.chain.from_iterable(ops[i:i+4])))
|
||||
|
||||
Reference in New Issue
Block a user