diff --git a/tinygrad/runtime/support/nv/ip.py b/tinygrad/runtime/support/nv/ip.py index 0332bea173..e74206c7d3 100644 --- a/tinygrad/runtime/support/nv/ip.py +++ b/tinygrad/runtime/support/nv/ip.py @@ -150,7 +150,7 @@ class NV_FLCN(NV_IP): patched_image[(cmd_off:=self.desc_v3.IMEMLoadSize+dmem.cmd_in_buffer_offset) : cmd_off+len(cmd)] = cmd patched_image[(sig_off:=self.desc_v3.IMEMLoadSize+self.desc_v3.PKCDataOffset) : sig_off+0x180] = signature[-0x180:] - return System.alloc_sysmem(len(patched_image), contiguous=True, data=patched_image) + return self.nvdev._alloc_sysmem(len(patched_image), contiguous=True, data=patched_image) _, self.frts_image_sysmem = __patch(0x15, bytes(frts_cmd)) @@ -163,7 +163,7 @@ class NV_FLCN(NV_IP): patched_image = bytearray(image) patched_image[patch_loc:patch_loc+sig_len] = sig[:sig_len] - _, self.booter_image_sysmem = System.alloc_sysmem(len(patched_image), contiguous=True, data=patched_image) + _, self.booter_image_sysmem = self.nvdev._alloc_sysmem(len(patched_image), contiguous=True, data=patched_image) _, _, self.booter_data_off, self.booter_data_sz, _, self.booter_code_off, self.booter_code_sz, _, _ = struct.unpack("9I", header) def init_hw(self): @@ -287,7 +287,7 @@ class NV_FLCN_COT(NV_IP): self.fmc_booter_hash = memoryview(self.nvdev.extract_fw("kgspBinArchiveGspRmFmcGfwProdSigned", "ucode_hash_data")).cast('I') self.fmc_booter_sig = memoryview(self.nvdev.extract_fw("kgspBinArchiveGspRmFmcGfwProdSigned", "ucode_sig_data")).cast('I') self.fmc_booter_pkey = memoryview(self.nvdev.extract_fw("kgspBinArchiveGspRmFmcGfwProdSigned", "ucode_pkey_data") + b'\x00\x00\x00').cast('I') - _, self.fmc_booter_sysmem = System.alloc_sysmem(len(self.fmc_booter_image), contiguous=True, data=self.fmc_booter_image) + _, self.fmc_booter_sysmem = self.nvdev._alloc_sysmem(len(self.fmc_booter_image), contiguous=True, data=self.fmc_booter_image) def init_hw(self): self.falcon = 0x00110000 @@ -344,7 +344,7 @@ class NV_GSP(NV_IP): # Alloc queues pte_cnt = ((queue_pte_cnt:=(queue_size * 2) // 0x1000)) + round_up(queue_pte_cnt * 8, 0x1000) // 0x1000 pt_size = round_up(pte_cnt * 8, 0x1000) - queues_view, queues_sysmem = System.alloc_sysmem(pt_size + queue_size * 2, contiguous=False) + queues_view, queues_sysmem = self.nvdev._alloc_sysmem(pt_size + queue_size * 2, contiguous=False) # Fill up ptes for i, sysmem in enumerate(queues_sysmem): queues_view.view(i * 0x8, 0x8, fmt='Q')[0] = sysmem @@ -364,8 +364,8 @@ class NV_GSP(NV_IP): self.cmd_q = NVRpcQueue(self, self.cmd_q_va, None) def init_libos_args(self): - _, logbuf_sysmem = System.alloc_sysmem((2 << 20), contiguous=True) - libos_args_view, self.libos_args_sysmem = System.alloc_sysmem(0x1000, contiguous=True) + _, logbuf_sysmem = self.nvdev._alloc_sysmem((2 << 20), contiguous=True) + libos_args_view, self.libos_args_sysmem = self.nvdev._alloc_sysmem(0x1000, contiguous=True) libos_structs = (nv.LibosMemoryRegionInitArgument * 6).from_address(libos_args_view.addr) for i, name in enumerate(["INIT", "INTR", "RM", "MNOC", "KRNL"]): @@ -387,7 +387,7 @@ class NV_GSP(NV_IP): for i in range(3, 0, -1): npages[i-1] = ((npages[i] - 1) >> (nv.LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2 - 3)) + 1 offsets = [sum(npages[:i]) * 0x1000 for i in range(4)] - radix_view, self.gsp_radix3_sysmem = System.alloc_sysmem(offsets[-1] + len(self.gsp_image), contiguous=False) + radix_view, self.gsp_radix3_sysmem = self.nvdev._alloc_sysmem(offsets[-1] + len(self.gsp_image), contiguous=False) # Copy image radix_view.view(offsets[-1], len(self.gsp_image))[:] = self.gsp_image @@ -398,12 +398,12 @@ class NV_GSP(NV_IP): radix_view.view(offsets[i], npages[i+1] * 8, fmt='Q')[:] = array.array('Q', self.gsp_radix3_sysmem[cur_offset:cur_offset+npages[i+1]]) # Copy signature - _, self.gsp_signature_sysmem = System.alloc_sysmem(len(signature), contiguous=True, data=signature) + _, self.gsp_signature_sysmem = self.nvdev._alloc_sysmem(len(signature), contiguous=True, data=signature) def init_boot_binary_image(self): self.booter_image = self.nvdev.extract_fw("kgspBinArchiveGspRmBoot", "ucode_image_prod_data") self.booter_desc = nv.RM_RISCV_UCODE_DESC.from_buffer_copy(self.nvdev.extract_fw("kgspBinArchiveGspRmBoot", "ucode_desc_prod_data")) - _, self.booter_sysmem = System.alloc_sysmem(len(self.booter_image), contiguous=True, data=self.booter_image) + _, self.booter_sysmem = self.nvdev._alloc_sysmem(len(self.booter_image), contiguous=True, data=self.booter_image) def init_wpr_meta(self): self.init_gsp_image() @@ -499,7 +499,7 @@ class NV_GSP(NV_IP): params.ramfcMem = nv_gpu.NV_MEMORY_DESC_PARAMS(base=ramfc_alloc.paddrs[0][0], size=0x200, addressSpace=2, cacheAttrib=0) params.instanceMem = nv_gpu.NV_MEMORY_DESC_PARAMS(base=ramfc_alloc.paddrs[0][0], size=0x1000, addressSpace=2, cacheAttrib=0) - _, method_sysmem = System.alloc_sysmem(0x5000, contiguous=True) + _, method_sysmem = self.nvdev._alloc_sysmem(0x5000, contiguous=True) params.mthdbufMem = nv_gpu.NV_MEMORY_DESC_PARAMS(base=method_sysmem[0], size=0x5000, addressSpace=1, cacheAttrib=0) if client is not None and client != self.priv_root and params.hObjectError != 0: diff --git a/tinygrad/runtime/support/nv/nvdev.py b/tinygrad/runtime/support/nv/nvdev.py index 0f06017f25..400f8b958b 100644 --- a/tinygrad/runtime/support/nv/nvdev.py +++ b/tinygrad/runtime/support/nv/nvdev.py @@ -3,7 +3,7 @@ import ctypes, time, functools, re, gzip, struct from tinygrad.helpers import getenv, DEBUG, fetch, getbits from tinygrad.runtime.support.memory import TLSFAllocator, MemoryManager, AddrSpace from tinygrad.runtime.support.nv.ip import NV_FLCN, NV_FLCN_COT, NV_GSP -from tinygrad.runtime.support.system import System, PCIDevice, PCIDevImplBase +from tinygrad.runtime.support.system import PCIDevice, PCIDevImplBase, MMIOInterface NV_DEBUG = getenv("NV_DEBUG", 0) @@ -140,8 +140,13 @@ class NVDev(PCIDevImplBase): self.mm = NVMemoryManager(self, self.vram_size, boot_size=(2 << 20), pt_t=NVPageTableEntry, va_bits=bits, va_shifts=shifts, va_base=0, palloc_ranges=[(x, x) for x in [512 << 20, 2 << 20, 4 << 10]], reserve_ptable=not self.large_bar) + def _alloc_sysmem(self, size:int, vaddr:int=0, contiguous:bool=False, data:bytes|None=None) -> tuple[MMIOInterface, list[int]]: + view, paddrs = self.pci_dev.alloc_sysmem(size, vaddr, contiguous=contiguous) + if data is not None: view[:size] = data + return view, paddrs + def _alloc_boot_struct(self, struct:ctypes.Structure) -> tuple[ctypes.Structure, int]: - view, paddrs = System.alloc_sysmem(sz:=ctypes.sizeof(type(struct)), contiguous=True) + view, paddrs = self._alloc_sysmem(sz:=ctypes.sizeof(type(struct)), contiguous=True) view[:sz] = bytes(struct) return type(struct).from_address(view.addr), paddrs[0] diff --git a/tinygrad/runtime/support/system.py b/tinygrad/runtime/support/system.py index 0da350b241..d307e8727a 100644 --- a/tinygrad/runtime/support/system.py +++ b/tinygrad/runtime/support/system.py @@ -79,20 +79,6 @@ class _System: self.pagemap.seek(vaddr // mmap.PAGESIZE * 8) return [(x & ((1<<55) - 1)) * mmap.PAGESIZE for x in array.array('Q', self.pagemap.read(size//mmap.PAGESIZE*8, binary=True))] - def alloc_sysmem(self, size:int, vaddr:int=0, contiguous:bool=False, data:bytes|None=None) -> tuple[MMIOInterface, list[int]]: - if OSX: - sysmem_view = System.iokit_pci_memmap(round_up(size, mmap.PAGESIZE)) - paddrs = list(itertools.takewhile(lambda p: p[1] != 0, zip(sysmem_view.view(fmt='Q')[0::2], sysmem_view.view(fmt='Q')[1::2]))) - assert not contiguous or len(paddrs) == 1, "not contiguous, but required" - else: - assert not contiguous or size <= (2 << 20), "Contiguous allocation is only supported for sizes up to 2MB" - flags = (libc.MAP_HUGETLB if contiguous and (size:=round_up(size, mmap.PAGESIZE)) > mmap.PAGESIZE else 0) | (MAP_FIXED if vaddr else 0) - va = FileIOInterface.anon_mmap(vaddr, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS|MAP_POPULATE|MAP_LOCKED|flags, 0) - sysmem_view, paddrs = MMIOInterface(va, size), [(x, mmap.PAGESIZE) for x in self.system_paddrs(va, size)] - - if data is not None: sysmem_view[:len(data)] = data - return sysmem_view, [p + i for p, sz in paddrs for i in range(0, sz, 0x1000)][:ceildiv(size, 0x1000)] - def pci_scan_bus(self, target_vendor:int, target_devices:list[tuple[int, list[int]]], base_class:int|None=None) -> list[str]: result = [] for pcibus in FileIOInterface("/sys/bus/pci/devices").listdir(): @@ -210,6 +196,12 @@ class PCIDevice: res = FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines() self.bar_info = {j:PCIBarInfo(int(s,16), int(e,16)-int(s,16)+1) for j,(s,e,_) in enumerate(l.split() for l in res)} + def alloc_sysmem(self, size:int, vaddr:int=0, contiguous:bool=False) -> tuple[MMIOInterface, list[int]]: + assert not contiguous or size <= (2 << 20), "Contiguous allocation is only supported for sizes up to 2MB" + flags = (libc.MAP_HUGETLB if contiguous and (size:=round_up(size, mmap.PAGESIZE)) > mmap.PAGESIZE else 0) | (MAP_FIXED if vaddr else 0) + va = FileIOInterface.anon_mmap(vaddr, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS|MAP_POPULATE|MAP_LOCKED|flags, 0) + sysmem_view, paddrs = MMIOInterface(va, size), [(x, mmap.PAGESIZE) for x in System.system_paddrs(va, size)] + return sysmem_view, [p + i for p, sz in paddrs for i in range(0, sz, 0x1000)][:ceildiv(size, 0x1000)] def read_config(self, offset:int, size:int): return int.from_bytes(self.cfg_fd.read(size, binary=True, offset=offset), byteorder='little') def write_config(self, offset:int, value:int, size:int): self.cfg_fd.write(value.to_bytes(size, byteorder='little'), binary=True, offset=offset) def map_bar(self, bar:int, off:int=0, addr:int=0, size:int|None=None, fmt='B') -> MMIOInterface: @@ -223,6 +215,11 @@ class APLPCIDevice(PCIDevice): self.lock_fd = System.flock_acquire(f"{devpref.lower()}_{pcibus.lower()}.lock") self.pcibus, self.bars = pcibus, {b: System.iokit_pci_memmap(b) for b in bars} self.bar_info = {b:PCIBarInfo(0, self.bars[b].nbytes-1 if b in self.bars else 0) for b in range(6)} # NOTE: fake bar info for nv. + def alloc_sysmem(self, size:int, vaddr:int=0, contiguous:bool=False) -> tuple[MMIOInterface, list[int]]: + sysmem_view = System.iokit_pci_memmap(round_up(size, mmap.PAGESIZE)) + paddrs = list(itertools.takewhile(lambda p: p[1] != 0, zip(sysmem_view.view(fmt='Q')[0::2], sysmem_view.view(fmt='Q')[1::2]))) + assert not contiguous or len(paddrs) == 1, "not contiguous, but required" + return sysmem_view, [p + i for p, sz in paddrs for i in range(0, sz, 0x1000)][:ceildiv(size, 0x1000)] def map_bar(self, bar:int, off:int=0, addr:int=0, size:int|None=None, fmt='B') -> MMIOInterface: return self.bars[bar].view(off, size, fmt) def read_config(self, offset:int, size:int): return System.iokit_pci_rpc(__TinyGPURPCReadCfg:=0, offset, size)[0] def write_config(self, offset:int, value:int, size:int): System.iokit_pci_rpc(__TinyGPURPCWriteCfg:=1, offset, size, value) @@ -261,7 +258,7 @@ class LNXPCIIfaceBase: should_use_sysmem = host or ((cpu_access if OSX else (uncached and cpu_access)) and not force_devmem) if should_use_sysmem: vaddr = self.dev_impl.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE) - memview, paddrs = System.alloc_sysmem(size, vaddr=vaddr, contiguous=contiguous) + memview, paddrs = self.pci_dev.alloc_sysmem(size, vaddr=vaddr, contiguous=contiguous) mapping = self.dev_impl.mm.map_range(vaddr, size, [(paddr, 0x1000) for paddr in paddrs], aspace=AddrSpace.SYS, snooped=True, uncached=True) return HCQBuffer(vaddr, size, meta=PCIAllocationMeta(mapping, has_cpu_mapping=True, hMemory=paddrs[0]), view=memview, owner=self.dev)