diff --git a/autogen_stubs.sh b/autogen_stubs.sh index 950ec4b724..27c7fac672 100755 --- a/autogen_stubs.sh +++ b/autogen_stubs.sh @@ -225,7 +225,7 @@ generate_libc() { sed -i "s\import ctypes\import ctypes, ctypes.util, os\g" $BASE/libc.py sed -i "s\FIXME_STUB\libc\g" $BASE/libc.py - sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path)\g" $BASE/libc.py + sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path, use_errno=True)\g" $BASE/libc.py fixup $BASE/libc.py } diff --git a/extra/amdpci/am_smi.py b/extra/amdpci/am_smi.py index 4eb9907698..20bd1d8118 100755 --- a/extra/amdpci/am_smi.py +++ b/extra/amdpci/am_smi.py @@ -8,7 +8,7 @@ from tinygrad.runtime.support.hcq import MMIOInterface from tinygrad.runtime.support.am.amdev import AMDev, AMMemoryManager, AMPageTableEntry from tinygrad.runtime.support.am.ip import AM_SOC, AM_GMC, AM_IH, AM_PSP, AM_SMU, AM_GFX, AM_SDMA -AM_VERSION = 0xA0000004 +AM_VERSION = 0xA0000005 def bold(s): return f"\033[1m{s}\033[0m" diff --git a/tinygrad/runtime/autogen/libc.py b/tinygrad/runtime/autogen/libc.py index fe955160ba..315c7c54f8 100644 --- a/tinygrad/runtime/autogen/libc.py +++ b/tinygrad/runtime/autogen/libc.py @@ -27,7 +27,7 @@ class FunctionFactoryStub: # You can either re-run clan2py with -l /path/to/library.so # Or manually fix this by comment the ctypes.CDLL loading _libraries = {} -_libraries['libc'] = None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path) # ctypes.CDLL('libc') +_libraries['libc'] = None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path, use_errno=True) # ctypes.CDLL('libc') def string_cast(char_pointer, encoding='utf-8', errors='strict'): value = ctypes.cast(char_pointer, ctypes.c_char_p).value if value is not None and encoding is not None: diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index f4c5d8be9e..0d8643ae0f 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -571,7 +571,6 @@ class KFDIface: if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR: buf = addr = FileIOInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0) else: buf, addr = 0, FileIOInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0) - assert addr != 0xffffffffffffffff try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf) @@ -711,6 +710,9 @@ class PCIIface: self._setup_adev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2, fmt='Q'), self._map_pci_range(5, fmt='I')) self.doorbell_cpu_addr = dbell.addr + if first_dev: + FileIOInterface.anon_mmap((alloc:=self.adev.mm.va_allocator).base, alloc.size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, 0) + pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND) @@ -729,20 +731,18 @@ class PCIIface: def _map_pci_range(self, bar, off=0, addr=0, size=None, fmt='B'): fd, sz = self.bar_fds[bar], size or (self.bar_info[bar][1] - self.bar_info[bar][0] + 1) libc.madvise(loc:=fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz, libc.MADV_DONTFORK) - assert loc != 0xffffffffffffffff, f"Failed to mmap {size} bytes at {hex(addr)}" return MMIOInterface(loc, sz, fmt=fmt) def alloc(self, size:int, host=False, uncached=False, cpu_access=False): if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory. vaddr = self.adev.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE) va = FileIOInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0) - assert va != 0xffffffffffffffff, f"Failed to mmap {size} bytes at {hex(vaddr)}" # Read pagemap to get the physical address of each page. The pages are locked. self.pagemap.seek(va // mmap.PAGESIZE * 8) paddrs = [((x & ((1<<55) - 1)) * mmap.PAGESIZE, mmap.PAGESIZE) for x in array.array('Q', self.pagemap.read(size//mmap.PAGESIZE*8, binary=True))] am_mapping = self.adev.mm.map_range(vaddr, size, paddrs, system=True, snooped=True, uncached=True) - return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=cpu_access), + return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=True), view=MMIOInterface(am_mapping.va_addr, size, fmt='B')) am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access) diff --git a/tinygrad/runtime/support/am/amdev.py b/tinygrad/runtime/support/am/amdev.py index 1bbdfbd89a..7d818a9555 100644 --- a/tinygrad/runtime/support/am/amdev.py +++ b/tinygrad/runtime/support/am/amdev.py @@ -152,7 +152,7 @@ class AMPageTableTraverseContext: self.level_up() class AMMemoryManager: - va_allocator = TLSFAllocator(512 * (1 << 30), base=0x7F0000000000) # global for all devices. + va_allocator = TLSFAllocator(512 * (1 << 30), base=0x200000000000) # global for all devices. def __init__(self, adev:AMDev, vram_size:int): self.adev, self.vram_size = adev, vram_size @@ -265,7 +265,7 @@ class AMDev: # all blocks that are initialized only during the initial AM boot. # To determine if the GPU is in the third state, AM uses regSCRATCH_REG7 as a flag. self.is_booting, self.smi_dev = True, False # During boot only boot memory can be allocated. This flag is to validate this. - self.partial_boot = (self.reg("regSCRATCH_REG7").read() == (am_version:=0xA0000004)) and (getenv("AM_RESET", 0) != 1) + self.partial_boot = (self.reg("regSCRATCH_REG7").read() == (am_version:=0xA0000005)) and (getenv("AM_RESET", 0) != 1) # Memory manager & firmware self.mm = AMMemoryManager(self, self.vram_size) diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index d24c2fb963..8dcf1548b5 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -26,7 +26,10 @@ class FileIOInterface: def __del__(self): if hasattr(self, 'fd'): os.close(self.fd) def ioctl(self, request, arg): return fcntl.ioctl(self.fd, request, arg) - def mmap(self, start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, self.fd, offset) + def mmap(self, start, sz, prot, flags, offset): + x = libc.mmap(start, sz, prot, flags, self.fd, offset) + if x == 0xffffffffffffffff: raise OSError(f"Failed to mmap {sz} bytes at {hex(start)}: {os.strerror(ctypes.get_errno())}") + return x def read(self, size=None, binary=False, offset=None): if offset is not None: self.seek(offset) with open(self.fd, "rb" if binary else "r", closefd=False) as file: return file.read(size) @@ -36,7 +39,10 @@ class FileIOInterface: def listdir(self): return os.listdir(self.path) def seek(self, offset): os.lseek(self.fd, offset, os.SEEK_SET) @staticmethod - def anon_mmap(start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, -1, offset) + def anon_mmap(start, sz, prot, flags, offset): + x = libc.mmap(start, sz, prot, flags, -1, offset) + if x == 0xffffffffffffffff: raise OSError(f"Failed to mmap {sz} bytes at {hex(start)}: {os.strerror(ctypes.get_errno())}") + return x @staticmethod def munmap(buf, sz): return libc.munmap(buf, sz) @staticmethod