am: reserve address space (#10564)

* am: reserve address space

* f

* cc

* errno

* fix

* always has cpu mapping
This commit is contained in:
nimlgen
2025-05-30 19:31:03 +03:00
committed by GitHub
parent e0305e54fc
commit 883bb4541c
6 changed files with 17 additions and 11 deletions

View File

@@ -225,7 +225,7 @@ generate_libc() {
sed -i "s\import ctypes\import ctypes, ctypes.util, os\g" $BASE/libc.py
sed -i "s\FIXME_STUB\libc\g" $BASE/libc.py
sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path)\g" $BASE/libc.py
sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path, use_errno=True)\g" $BASE/libc.py
fixup $BASE/libc.py
}

View File

@@ -8,7 +8,7 @@ from tinygrad.runtime.support.hcq import MMIOInterface
from tinygrad.runtime.support.am.amdev import AMDev, AMMemoryManager, AMPageTableEntry
from tinygrad.runtime.support.am.ip import AM_SOC, AM_GMC, AM_IH, AM_PSP, AM_SMU, AM_GFX, AM_SDMA
AM_VERSION = 0xA0000004
AM_VERSION = 0xA0000005
def bold(s):
  """Return `s` rendered into a string that is wrapped in ANSI bold-on and reset escape codes."""
  # "\033[1m" switches bold on; "\033[0m" resets attributes after the text.
  wrapped = f"\033[1m{s}\033[0m"
  return wrapped

View File

@@ -27,7 +27,7 @@ class FunctionFactoryStub:
# You can either re-run clang2py with -l /path/to/library.so
# Or manually fix this by commenting out the ctypes.CDLL loading
_libraries = {}
_libraries['libc'] = None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path) # ctypes.CDLL('libc')
_libraries['libc'] = None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path, use_errno=True) # ctypes.CDLL('libc')
def string_cast(char_pointer, encoding='utf-8', errors='strict'):
value = ctypes.cast(char_pointer, ctypes.c_char_p).value
if value is not None and encoding is not None:

View File

@@ -571,7 +571,6 @@ class KFDIface:
if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
buf = addr = FileIOInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
else: buf, addr = 0, FileIOInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0)
assert addr != 0xffffffffffffffff
try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
flags=flags, mmap_offset=buf)
@@ -711,6 +710,9 @@ class PCIIface:
self._setup_adev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2, fmt='Q'), self._map_pci_range(5, fmt='I'))
self.doorbell_cpu_addr = dbell.addr
if first_dev:
FileIOInterface.anon_mmap((alloc:=self.adev.mm.va_allocator).base, alloc.size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, 0)
pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND)
@@ -729,20 +731,18 @@ class PCIIface:
def _map_pci_range(self, bar, off=0, addr=0, size=None, fmt='B'):
fd, sz = self.bar_fds[bar], size or (self.bar_info[bar][1] - self.bar_info[bar][0] + 1)
libc.madvise(loc:=fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz, libc.MADV_DONTFORK)
assert loc != 0xffffffffffffffff, f"Failed to mmap {size} bytes at {hex(addr)}"
return MMIOInterface(loc, sz, fmt=fmt)
def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory.
vaddr = self.adev.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE)
va = FileIOInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0)
assert va != 0xffffffffffffffff, f"Failed to mmap {size} bytes at {hex(vaddr)}"
# Read pagemap to get the physical address of each page. The pages are locked.
self.pagemap.seek(va // mmap.PAGESIZE * 8)
paddrs = [((x & ((1<<55) - 1)) * mmap.PAGESIZE, mmap.PAGESIZE) for x in array.array('Q', self.pagemap.read(size//mmap.PAGESIZE*8, binary=True))]
am_mapping = self.adev.mm.map_range(vaddr, size, paddrs, system=True, snooped=True, uncached=True)
return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=cpu_access),
return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=True),
view=MMIOInterface(am_mapping.va_addr, size, fmt='B'))
am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)

View File

@@ -152,7 +152,7 @@ class AMPageTableTraverseContext:
self.level_up()
class AMMemoryManager:
va_allocator = TLSFAllocator(512 * (1 << 30), base=0x7F0000000000) # global for all devices.
va_allocator = TLSFAllocator(512 * (1 << 30), base=0x200000000000) # global for all devices.
def __init__(self, adev:AMDev, vram_size:int):
self.adev, self.vram_size = adev, vram_size
@@ -265,7 +265,7 @@ class AMDev:
# all blocks that are initialized only during the initial AM boot.
# To determine if the GPU is in the third state, AM uses regSCRATCH_REG7 as a flag.
self.is_booting, self.smi_dev = True, False # During boot only boot memory can be allocated. This flag is to validate this.
self.partial_boot = (self.reg("regSCRATCH_REG7").read() == (am_version:=0xA0000004)) and (getenv("AM_RESET", 0) != 1)
self.partial_boot = (self.reg("regSCRATCH_REG7").read() == (am_version:=0xA0000005)) and (getenv("AM_RESET", 0) != 1)
# Memory manager & firmware
self.mm = AMMemoryManager(self, self.vram_size)

View File

@@ -26,7 +26,10 @@ class FileIOInterface:
def __del__(self):
  """Close the descriptor stored in `self.fd` when the object is finalized."""
  # Nothing to release when no `fd` attribute exists on this instance.
  if not hasattr(self, 'fd'): return
  os.close(self.fd)
def ioctl(self, request, arg):
  """Run `fcntl.ioctl` for `request` with `arg` on `self.fd` and return whatever it returns."""
  result = fcntl.ioctl(self.fd, request, arg)
  return result
def mmap(self, start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, self.fd, offset)
def mmap(self, start, sz, prot, flags, offset):
  """Call `libc.mmap` for `sz` bytes backed by `self.fd` and return the resulting address.

  Raises:
    OSError: when libc returns the all-ones value; the message carries the requested size,
      the requested start address and the text for the errno reported through ctypes.
  """
  addr = libc.mmap(start, sz, prot, flags, self.fd, offset)
  # Success path first: anything other than the all-ones failure value is handed back as-is.
  if addr != 0xffffffffffffffff: return addr
  raise OSError(f"Failed to mmap {sz} bytes at {hex(start)}: {os.strerror(ctypes.get_errno())}")
def read(self, size=None, binary=False, offset=None):
if offset is not None: self.seek(offset)
with open(self.fd, "rb" if binary else "r", closefd=False) as file: return file.read(size)
@@ -36,7 +39,10 @@ class FileIOInterface:
def listdir(self):
  """Return the entry names of the directory at `self.path`, as produced by `os.listdir`."""
  entries = os.listdir(self.path)
  return entries
def seek(self, offset):
  """Move the position of `self.fd` to the absolute byte `offset`, counted from the file start."""
  whence = os.SEEK_SET
  os.lseek(self.fd, offset, whence)
@staticmethod
def anon_mmap(start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, -1, offset)
def anon_mmap(start, sz, prot, flags, offset):
  """Call `libc.mmap` for `sz` bytes with -1 as the descriptor and return the resulting address.

  Raises:
    OSError: when libc returns the all-ones value; the message carries the requested size,
      the requested start address and the text for the errno reported through ctypes.
  """
  addr = libc.mmap(start, sz, prot, flags, -1, offset)
  # Success path first: anything other than the all-ones failure value is handed back as-is.
  if addr != 0xffffffffffffffff: return addr
  raise OSError(f"Failed to mmap {sz} bytes at {hex(start)}: {os.strerror(ctypes.get_errno())}")
@staticmethod
def munmap(buf, sz):
  """Pass `buf` and `sz` to `libc.munmap` and return libc's result unchanged."""
  status = libc.munmap(buf, sz)
  return status
@staticmethod