mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 15:08:02 -05:00
hw interface abstraction (#8524)
* use HWInterface in autogen * mockgpu * HWInterface * more HWInterface * fix * fix * old code * fix * implicit field definition * add offset check to mockgpu too * refactor * forgot to pass flags + read rewrite * test * play with vfio * nv: this should be kept * try this * vfio * rm overwrite=True * linetr * do not reinit kfd * minor * mypy * mock * init them once --------- Co-authored-by: patrini32 <patrini23@proton.me>
This commit is contained in:
@@ -79,6 +79,10 @@ generate_kfd() {
|
||||
|
||||
fixup $BASE/kfd.py
|
||||
sed -i "s\import ctypes\import ctypes, os\g" $BASE/kfd.py
|
||||
sed -i "s\import fcntl, functools\import functools" $BASE/kfd.py
|
||||
sed -i "s\import ctypes,os\a from tinygrad.runtime.support import HWInterface\g" $BASE/kfd.py
|
||||
sed -i "s\def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, **kwargs):\def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, **kwargs):\g" $BASE/kfd.py
|
||||
sed -i "s\fcntl.ioctl(__fd, (__idir<<30)\__fd.ioctl((__idir<<30)\g" $BASE/kfd.py
|
||||
python3 -c "import tinygrad.runtime.autogen.kfd"
|
||||
}
|
||||
|
||||
@@ -263,6 +267,10 @@ generate_vfio() {
|
||||
/usr/include/linux/vfio.h \
|
||||
-o $BASE/vfio.py
|
||||
fixup $BASE/vfio.py
|
||||
sed -i "s\import ctypes\import ctypes, os\g" $BASE/vfio.py
|
||||
sed -i "s\import fcntl, functools\import functools" $BASE/vfio.py
|
||||
sed -i "s\import ctypes,os\a from tinygrad.runtime.support import HWInterface\g" $BASE/vfio.py
|
||||
sed -i "s\fcntl.ioctl(__fd, (__idir<<30)\return __fd.ioctl((__idir<<30)\g" $BASE/vfio.py
|
||||
}
|
||||
|
||||
generate_am() {
|
||||
|
||||
@@ -1,95 +1,36 @@
|
||||
import ctypes, struct, os
|
||||
from typing import Any
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.helpers import round_up
|
||||
|
||||
class VirtFileDesc:
  """Base class for a virtual file descriptor.

  Stores the descriptor number and a current offset. Operations a subclass does not
  override raise NotImplementedError, except `getdents` (returns -1) and `close` (returns 0).
  """
  def __init__(self, fd):
    self.fd = fd
    self.off = 0

  def read(self, fd, buf, sz):
    raise NotImplementedError()

  def ioctl(self, fd, req, argp):
    raise NotImplementedError()

  def mmap(self, st, sz, prot, flags, fd, off):
    raise NotImplementedError()

  def write(self, fd, buf, sz):
    raise NotImplementedError()

  def lseek(self, fd, off, whence):
    raise NotImplementedError()

  def fstat(self, fd, buf):
    raise NotImplementedError()

  def getdents(self, fd, buf, sz):
    # default: report failure
    return -1

  def close(self, fd):
    # default: nothing to release
    return 0
|
||||
|
||||
class TextFileDesc(VirtFileDesc):
  """Virtual descriptor for a text file whose contents are held in memory.

  Two views of the same text are kept: `content` (the str, sliced by `read_contents`) and a
  NUL-terminated ctypes byte buffer used by the raw `read` path, which needs an object that
  `ctypes.addressof` accepts.
  """
  def __init__(self, fd, text):
    super().__init__(fd)
    # Keep the byte buffer under its own name; storing it in `content` and then overwriting
    # `content` with the str left `read` calling addressof() on a str.
    self._raw = ctypes.create_string_buffer(text.encode())
    self.sz = len(self._raw) - 1  # encoded length without the trailing NUL
    self.content = text

  def ioctl(self, fd, req, argp): return 0
  def write(self, fd, buf, sz): return -1

  def read(self, fd, buf, sz):
    """Copy up to `sz` bytes from the current offset into `buf`; returns the number of bytes copied."""
    # clamp at 0 so an offset past the end never turns into a negative memmove length
    rdsz = max(0, min(sz, self.sz - self.off))
    ctypes.memmove(buf, ctypes.addressof(self._raw) + self.off, rdsz)
    self.off += rdsz
    return rdsz

  def lseek(self, fd, off, whence):
    """Move the offset according to `whence`; returns 0, or -1 for an unknown `whence`."""
    if whence == os.SEEK_SET: self.off = off
    elif whence == os.SEEK_CUR: self.off += off
    elif whence == os.SEEK_END: self.off = self.sz + off
    else: return -1
    return 0

  def fstat(self, fd, buf):
    # NOTE(review): copies 88 bytes while DirFileDesc.fstat copies 96 of the same packed struct - confirm which is intended
    ctypes.memmove(buf, VirtFile.build_fstat(st_size=self.sz), 88)
    return 0

  def read_contents(self, size=None):
    """Return the next `size` characters of `content` (everything when `size` is falsy) and advance the offset."""
    count = size or len(self.content)
    ret = self.content[self.off:self.off+count]
    self.off += count
    return ret

  def seek(self, offset): self.off += offset  # relative to the current offset
|
||||
class DirFileDesc(VirtFileDesc):
  """Virtual descriptor for a directory: a pre-built buffer of dirent records plus the list of names."""
  def __init__(self, fd, child_names):
    super().__init__(fd)
    child_names = ['.', '..'] + child_names

    # one record per entry, inode numbers start at 1
    records = b''.join(VirtFile.build_dirent(ino, 0, name) for ino, name in enumerate(child_names, start=1))
    self.content = ctypes.create_string_buffer(records)
    self.sz = len(self.content) - 1  # create_string_buffer appends one NUL byte
    self.child_names = child_names

  def ioctl(self, fd, req, argp): return 0
  def write(self, fd, buf, sz): return -1
  def read(self, fd, buf, sz): return -1

  def lseek(self, fd, off, whence):
    """Move the offset according to `whence`; returns 0, or -1 for an unknown `whence`."""
    if whence == os.SEEK_SET: self.off = off
    elif whence == os.SEEK_CUR: self.off += off
    elif whence == os.SEEK_END: self.off = self.sz + off
    else: return -1
    return 0

  def getdents(self, fd, buf, sz):
    """Copy the records that have not been returned yet into `buf`.

    Returns the number of bytes copied, 0 once everything was consumed, or -1 when `sz` is too
    small to hold the remaining records.
    """
    # Only the bytes after the current offset are left; copying the full size from
    # `content + off` would read past the end of the buffer whenever off != 0.
    remaining = self.sz - self.off
    if remaining <= 0: return 0
    if sz < remaining: return -1
    ctypes.memmove(buf, ctypes.addressof(self.content) + self.off, remaining)
    self.off = self.sz
    return remaining

  def fstat(self, fd, buf):
    ctypes.memmove(buf, VirtFile.build_fstat(st_mode=0o40755), 96)
    return 0

  def list_contents(self): return self.child_names
|
||||
|
||||
@dataclass(frozen=True)
class VirtFile:
  """A path served by a mock driver, paired with the descriptor class used to open it."""
  path: str
  fdcls: Any # TODO: fix this Union[VirtFileDesc, functools.partial[VirtFileDesc]]

  @staticmethod
  def build_fstat(st_dev=0x20, st_ino=0x100000, st_mode=0o100777, st_nlink=1, st_uid=0, st_gid=0, st_rdev=0, st_size=0,
                  st_blksize=4096, st_blocks=0, st_atime=0, st_mtime=0, st_ctime=0):
    """Pack the given stat fields into a 96-byte record (native alignment) and return it as bytes."""
    fmt_string = 'QQQIIIQQiQqqq'
    ssz = struct.calcsize(fmt_string)
    assert ssz == 96, f"{ssz} != 96"
    # note the packing order: st_nlink precedes st_mode
    return struct.pack(fmt_string, st_dev, st_ino, st_nlink, st_mode, st_uid, st_gid,
                       st_rdev, st_size, st_blksize, st_blocks, st_atime, st_mtime, st_ctime)

  @staticmethod
  def build_dirent(d_ino, d_off, d_name, d_type=None):
    """Build one directory record: 19-byte header, NUL-terminated name, zero padding to a multiple of 8.

    `d_type` defaults to 4 (the value that was previously hard-coded) when left as None.
    """
    # Size the record from the encoded name: a non-ASCII name is longer in bytes than in characters.
    d_name_bytes = d_name.encode()
    d_reclen = round_up(19 + len(d_name_bytes) + 1, 8)
    type_byte = bytes([4 if d_type is None else d_type])
    # inode number, offset, record length, type
    packed_data = struct.pack('QQHc', d_ino, d_off, d_reclen, type_byte)
    return (packed_data + d_name_bytes).ljust(d_reclen, b'\x00')
|
||||
|
||||
class VirtDriver:
|
||||
def __init__(self):
|
||||
self.tracked_files = []
|
||||
|
||||
@@ -1,160 +1,17 @@
|
||||
import ctypes, ctypes.util, struct, platform, time, os, builtins, atexit
|
||||
import ctypes, ctypes.util, time, os, builtins, fcntl
|
||||
from tinygrad.runtime.support.hcq import HWInterface
|
||||
from test.mockgpu.nv.nvdriver import NVDriver
|
||||
from test.mockgpu.amd.amddriver import AMDDriver
|
||||
from tinygrad.helpers import to_mv
|
||||
start = time.perf_counter()
|
||||
|
||||
# *** ioctl lib ***
libc = ctypes.CDLL(ctypes.util.find_library("c"))

# explicit prototypes for the functions that take or return pointers
libc.mmap.restype, libc.mmap.argtypes = ctypes.c_void_p, [ctypes.c_void_p, ctypes.c_size_t] + [ctypes.c_int] * 3 + [ctypes.c_long]
libc.munmap.restype, libc.munmap.argtypes = ctypes.c_int, [ctypes.c_void_p, ctypes.c_size_t]
libc.fdopendir.restype, libc.fdopendir.argtypes = ctypes.c_void_p, [ctypes.c_int]
|
||||
|
||||
# platform.processor calls `uname -p` which can return `unknown` on some systems
processor = os.getenv("IOCTL_PROCESSOR") or platform.processor()

# per-architecture syscall numbers, in the order they are unpacked below; an unlisted processor raises KeyError
_SYSCALL_NUMBERS = {
  "aarch64": (None, 57, 63, 29, 222, 62, 79, 61),
  "x86_64": (2, 3, 0, 16, 9, 8, 262, 217),
}
(OPEN_SYSCALL, CLOSE_SYSCALL, READ_SYSCALL, IOCTL_SYSCALL,
 MMAP_SYSCALL, LSEEK_SYSCALL, NEWFSTATAT_SYSCALL, GETDENTS64_SYSCALL) = _SYSCALL_NUMBERS[processor]
|
||||
|
||||
def install_hook(c_function, python_function):
  """Patch the machine code of `c_function` so that it jumps to `python_function`.

  The first bytes of the C function are saved, overwritten with a trampoline, and restored by an
  atexit handler. Only x86_64 is handled; any other `processor` raises Exception.
  """
  def _addr_slot(fn): return ctypes.cast(ctypes.byref(fn), ctypes.POINTER(ctypes.c_ulong))

  python_function_addr = _addr_slot(python_function).contents.value
  if processor != "x86_64":
    raise Exception(f"processor {processor} not supported")

  # push r9
  # push r9
  # mov r9, 0x1122334455667788
  # mov [rsp+8], r9
  # pop r9
  # ret
  tramp = b"".join((b"\x41\x51\x41\x51\x49\xB9", struct.pack("Q", python_function_addr), b"\x4C\x89\x4C\x24\x08\x41\x59\xC3"))
  tramp_len = len(tramp)

  original_bc = (ctypes.c_char * 64)()

  # get real ioctl address
  ioctl_address = _addr_slot(c_function)

  # hook ioctl: make the page holding the function (and the following one) accessible with protection 7, then patch
  page_start = ioctl_address.contents.value & ~0xfff
  ret = libc.mprotect(ctypes.c_ulong(page_start), 0x2000, 7)
  assert ret == 0
  # `.contents` is handed over as a c_ulong, i.e. the address value itself is what memcpy receives
  libc.memcpy(original_bc, ioctl_address.contents, tramp_len)
  libc.memcpy(ioctl_address.contents, ctypes.create_string_buffer(tramp), tramp_len)

  # Restore correct functions to close libs after python exits
  def __restore(): libc.memcpy(ioctl_address.contents, original_bc, tramp_len)
  atexit.register(__restore)
|
||||
|
||||
# virtual descriptors currently open, keyed by descriptor number
tracked_fds = dict()
# one instance of every mock driver; their tracked_files decide which paths are virtualized
drivers = [driver_cls() for driver_cls in (AMDDriver, NVDriver)]
|
||||
|
||||
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_ulong)
def _open(name, flags, mode):
  """Replacement for libc `open`: paths tracked by a mock driver get a virtual descriptor, all others go to the real syscall."""
  # Decode once, and with the filesystem codec: a strict UTF-8 decode raises on arbitrary
  # path bytes, and an exception here surfaces inside a C callback.
  pyname = os.fsdecode(name)
  for d in drivers:
    for x in d.tracked_files:
      if pyname == x.path:
        virtfd = d.open(pyname, flags, mode, x)
        tracked_fds[virtfd.fd] = virtfd
        return virtfd.fd

  # libc.syscall is variadic and shared by every hook, so its prototype is set before each use
  libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_char_p, ctypes.c_int, ctypes.c_ulong]
  libc.syscall.restype = ctypes.c_int
  return libc.syscall(OPEN_SYSCALL, name, flags, mode)
|
||||
|
||||
@ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_char_p)
def _opendir(name):
  """Replacement for libc `opendir`, built on the `_open` hook and `fdopendir`."""
  dir_flags = os.O_RDONLY | os.O_DIRECTORY
  fd = _open(name, dir_flags, 0)
  if fd < 0x80: return libc.fdopendir(fd)

  # presumably descriptors >= 0x80 are virtual ones handed out by the mock drivers - confirm.
  # Get a real directory stream for ".", then store our descriptor in its first 8 bytes.
  fake_dirfd = _open(b".", dir_flags, 0)
  st = libc.fdopendir(fake_dirfd)
  to_mv(st, 8).cast('Q')[0] = fd
  return st
|
||||
|
||||
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int)
def _close(fd):
  """Replacement for libc `close`: virtual descriptors are closed and forgotten, others go to the real syscall."""
  if fd not in tracked_fds:
    libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int]
    libc.syscall.restype = ctypes.c_int
    return libc.syscall(CLOSE_SYSCALL, fd)

  # close first, untrack afterwards
  virt = tracked_fds[fd]
  virt.close(fd)
  del tracked_fds[fd]
  return 0
|
||||
|
||||
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p)
def _closedir(st):
  """Replacement for libc `closedir`: reads the descriptor from the first 8 bytes of `st` and closes it via `_close`."""
  dir_fd = to_mv(st, 8).cast('Q')[0]
  return _close(dir_fd)
|
||||
|
||||
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
def _ioctl(fd, request, argp):
  """Replacement for libc `ioctl`: virtual descriptors handle the request themselves, others go to the real syscall."""
  virt = tracked_fds.get(fd)
  if virt is not None:
    return virt.ioctl(fd, request, argp)

  libc.syscall.restype = ctypes.c_int
  libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p]
  call_args = (ctypes.c_int(fd), ctypes.c_ulong(request), ctypes.c_void_p(argp))
  return libc.syscall(IOCTL_SYSCALL, *call_args)
|
||||
|
||||
@ctypes.CFUNCTYPE(ctypes.c_long, ctypes.c_int, ctypes.c_void_p, ctypes.c_size_t)
def _read(fd, buf, sz):
  """Replacement for libc `read`: virtual descriptors serve the data themselves, others go to the real syscall."""
  virt = tracked_fds.get(fd)
  if virt is not None:
    return virt.read(fd, buf, sz)

  libc.syscall.restype = ctypes.c_int
  libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_void_p, ctypes.c_size_t]
  call_args = (ctypes.c_int(fd), ctypes.c_void_p(buf), ctypes.c_size_t(sz))
  return libc.syscall(READ_SYSCALL, *call_args)
|
||||
|
||||
# The offset and the result are signed 64-bit quantities: with an unsigned offset a negative seek
# arrived as a huge positive number, and an int result truncated positions beyond 2**31.
@ctypes.CFUNCTYPE(ctypes.c_long, ctypes.c_int, ctypes.c_long, ctypes.c_int)
def _lseek64(fd, off, whence):
  """Replacement for libc `lseek64`: virtual descriptors reposition themselves, others go to the real syscall."""
  if fd in tracked_fds: return tracked_fds[fd].lseek(fd, off, whence)

  libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_long, ctypes.c_int]
  libc.syscall.restype = ctypes.c_long
  return libc.syscall(LSEEK_SYSCALL, fd, off, whence)
|
||||
|
||||
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)
def _stat64(name, buf):
  """Replacement for libc `stat64`: tracked paths are answered by the virtual file's `fstat`, others by newfstatat."""
  # Decode once, and with the filesystem codec: a strict UTF-8 decode raises on arbitrary
  # path bytes, and an exception here surfaces inside a C callback.
  pyname = os.fsdecode(name)
  for d in drivers:
    for x in d.tracked_files:
      if pyname == x.path:
        virtfd = d.open(pyname, 0, 0, x)
        return virtfd.fstat(virtfd.fd, buf)

  libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p, ctypes.c_ulong]
  libc.syscall.restype = ctypes.c_int
  # -100 is AT_FDCWD: resolve `name` relative to the current working directory
  return libc.syscall(NEWFSTATAT_SYSCALL, -100, name, ctypes.c_void_p(buf), 0)
|
||||
|
||||
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_void_p)
def _fstat64(fd, buf):
  """Replacement for libc `fstat64`: virtual descriptors fill `buf` themselves, others go through newfstatat."""
  virt = tracked_fds.get(fd)
  if virt is not None:
    return virt.fstat(fd, buf)

  libc.syscall.restype = ctypes.c_int
  libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p, ctypes.c_ulong]
  # an empty path together with flag 0x1000 (AT_EMPTY_PATH) makes newfstatat act on `fd` itself
  empty_str = (ctypes.c_char*1)()
  return libc.syscall(NEWFSTATAT_SYSCALL, ctypes.c_int(fd), empty_str, ctypes.c_void_p(buf), 0x1000)
|
||||
|
||||
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_ulong)
def _getdents64(fd, buf, sz):
  """Replacement for libc `getdents64`: virtual descriptors produce their own records, others go to the real syscall."""
  virt = tracked_fds.get(fd)
  if virt is not None:
    return virt.getdents(fd, buf, sz)

  libc.syscall.restype = ctypes.c_int
  libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_void_p, ctypes.c_ulong]
  return libc.syscall(GETDENTS64_SYSCALL, fd, buf, sz)
|
||||
|
||||
def _mmap(start, sz, prot, flags, fd, offset):
  """mmap that lets a virtual descriptor provide the mapping; every other descriptor goes to libc."""
  virt = tracked_fds.get(fd)
  if virt is None:
    return libc.mmap(start, sz, prot, flags, fd, offset)
  return virt.mmap(start, sz, prot, flags, fd, offset)
|
||||
|
||||
def _munmap(buf, sz):
  """Forward to libc `munmap` unchanged and return its result."""
  status = libc.munmap(buf, sz)
  return status
|
||||
|
||||
orignal_memoryview = builtins.memoryview
|
||||
class TrackedMemoryView:
|
||||
def __init__(self, data, rcb, wcb):
|
||||
@@ -185,20 +42,60 @@ def _memoryview(cls, mem):
|
||||
for st,en,rcb,wcb in d.tracked_addresses:
|
||||
if st <= addr <= en: return TrackedMemoryView(mem, rcb, wcb)
|
||||
return orignal_memoryview(mem)
|
||||
|
||||
# patch the libc entry points one by one, in this order
for _sym, _hook in (("open", _open), ("opendir", _opendir), ("close", _close), ("closedir", _closedir), ("ioctl", _ioctl),
                    ("read", _read), ("lseek64", _lseek64), ("stat64", _stat64), ("fstat64", _fstat64), ("getdents64", _getdents64)):
  install_hook(getattr(libc, _sym), _hook)
del _sym, _hook

# replace the builtin memoryview with a class whose __new__ is _memoryview
builtins.memoryview = type("memoryview", (), dict(__new__=_memoryview)) # type: ignore

# rewrite autogen's libc mmaps functions.
import tinygrad.runtime.autogen.libc as autogen_libc
autogen_libc.mmap, autogen_libc.munmap = _mmap, _munmap # type: ignore
|
||||
def _open(path, flags):
  """Open `path` and return a descriptor number.

  A path tracked by one of the mock drivers yields a virtual descriptor, which is also recorded in
  `tracked_fds`. Any other path is opened with os.open when it exists; otherwise None is returned.
  """
  hit = next(((drv, vfile) for drv in drivers for vfile in drv.tracked_files if path == vfile.path), None)
  if hit is not None:
    drv, vfile = hit
    virtfd = drv.open(path, flags, 0o777, vfile)
    tracked_fds[virtfd.fd] = virtfd
    return virtfd.fd
  if not os.path.exists(path):
    return None
  return os.open(path, flags, 0o777)
|
||||
|
||||
class MockHWInterface(HWInterface):
  """HWInterface variant for the mock GPU.

  Descriptors found in `tracked_fds` are virtual and handled by their VirtFileDesc object; every
  other descriptor is a real OS descriptor and is handled with os / fcntl / libc.
  """
  def __init__(self, path:str="", flags:int=os.O_RDONLY, fd:int|None=None):
    self.path = path
    # `fd or ...` would throw away a valid descriptor 0
    self.fd = fd if fd is not None else _open(path, flags)

  @staticmethod
  def _release(fd):
    """Close `fd`, virtual or real; None (nothing was opened) is ignored."""
    if fd is None: return
    if fd in tracked_fds:
      tracked_fds[fd].close(fd)
      tracked_fds.pop(fd)
    else: os.close(fd)

  def __del__(self):
    # `fd` is missing when __init__ raised and None when the path did not exist
    MockHWInterface._release(getattr(self, "fd", None))

  def ioctl(self, request, arg):
    if self.fd in tracked_fds:
      # virtual descriptors receive the address of the argument structure
      return tracked_fds[self.fd].ioctl(self.fd, request, ctypes.addressof(arg))
    return fcntl.ioctl(self.fd, request, arg)

  def mmap(self, start, sz, prot, flags, offset):
    if self.fd in tracked_fds:
      return tracked_fds[self.fd].mmap(start, sz, prot, flags, self.fd, offset)
    return libc.mmap(start, sz, prot, flags, self.fd, offset)

  def read(self, size=None, binary=False):
    """Read text from the descriptor; binary reads are not implemented."""
    if binary: raise NotImplementedError()
    if self.fd in tracked_fds:
      return tracked_fds[self.fd].read_contents(size)
    with open(self.fd, "r", closefd=False) as file:
      # start over from the beginning once the position has reached the end of the file
      if file.tell() >= os.fstat(self.fd).st_size: file.seek(0)
      return file.read(size)

  def listdir(self):
    if self.fd in tracked_fds:
      return tracked_fds[self.fd].list_contents()
    return os.listdir(self.path)

  def write(self, content, binary=False): raise NotImplementedError()

  def seek(self, offset):
    """Move the position by `offset`, relative to the current one."""
    if self.fd in tracked_fds:
      tracked_fds[self.fd].seek(offset)
    else:
      os.lseek(self.fd, offset, os.SEEK_CUR)

  @staticmethod
  def exists(path):
    """Return True when `path` can be opened read-only, either as a tracked virtual file or as a real file."""
    fd = _open(path, os.O_RDONLY)
    if fd is None: return False
    # the descriptor was only needed for the probe; leaving it open leaked one per call
    MockHWInterface._release(fd)
    return True

  @staticmethod
  def readlink(path): raise NotImplementedError()

  @staticmethod
  def eventfd(initval, flags=None): raise NotImplementedError()  # the exception used to be created but never raised
|
||||
|
||||
@@ -10,10 +10,11 @@ import ctypes, os
|
||||
|
||||
|
||||
|
||||
import fcntl, functools
|
||||
import functools
|
||||
from tinygrad.runtime.support.hcq import HWInterface
|
||||
|
||||
def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:"HWInterface", **kwargs):
  """Build `__user_struct(**kwargs)`, issue the ioctl through `__fd.ioctl`, and return the structure.

  The request number is composed as direction<<30 | sizeof(struct)<<16 | base<<8 | nr.
  Raises RuntimeError when the ioctl returns a non-zero value.
  """
  # A single definition: the fcntl-based one that preceded it was shadowed and never ran.
  made = __user_struct(**kwargs)
  ret = __fd.ioctl((__idir<<30) | (ctypes.sizeof(made)<<16) | (__base<<8) | __nr, made)
  if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
  return made
|
||||
|
||||
|
||||
@@ -9,14 +9,14 @@
|
||||
import ctypes
|
||||
|
||||
|
||||
from tinygrad.runtime.support.hcq import HWInterface
|
||||
import functools
|
||||
|
||||
import fcntl, functools
|
||||
def _do_ioctl_io(__idir, __base, __nr, __fd:"HWInterface", val=0, __len=0):
  """Issue an ioctl that carries a plain value and return whatever `__fd.ioctl` returns.

  The request number is composed as direction<<30 | len<<16 | base<<8 | nr.
  """
  # A single definition: a later fcntl-based duplicate used to shadow this one, although the
  # descriptor is an interface object with its own ioctl() rather than a raw fd.
  return __fd.ioctl((__idir<<30) | (__len<<16) | (__base<<8) | __nr, val)
|
||||
|
||||
def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:"HWInterface", __val=None, **kwargs):
  """Issue a structure-carrying ioctl through `__fd.ioctl` and return the structure.

  `__val` is used as the argument when given; otherwise `__user_struct(**kwargs)` is built.
  The request number is composed as direction<<30 | sizeof(struct)<<16 | base<<8 | nr.
  Raises RuntimeError when the ioctl returns a non-zero value.
  """
  # The body used to read `__made`, a name that does not exist here (the parameter is `__val`),
  # so every call ended in NameError.
  made = __val if __val is not None else __user_struct(**kwargs)
  ret = __fd.ioctl((__idir<<30) | (ctypes.sizeof(made)<<16) | (__base<<8) | __nr, made)
  if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
  return made
|
||||
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
from __future__ import annotations
|
||||
from typing import Any
|
||||
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, array, contextlib, sys, select, struct
|
||||
from typing import Any, cast
|
||||
import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select, struct
|
||||
assert sys.platform != 'win32'
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
|
||||
from tinygrad.ops import sint
|
||||
from tinygrad.device import BufferSpec
|
||||
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address
|
||||
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG
|
||||
from tinygrad.renderer.cstyle import AMDRenderer
|
||||
from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, libpciaccess, vfio
|
||||
from tinygrad.runtime.autogen.am import am
|
||||
@@ -14,7 +14,6 @@ from tinygrad.runtime.support.compiler_hip import AMDCompiler
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
from tinygrad.runtime.support.am.amdev import AMDev
|
||||
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
|
||||
if getenv("MOCKGPU"): import test.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import
|
||||
|
||||
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
|
||||
|
||||
@@ -281,31 +280,34 @@ class AMDQueueDesc:
|
||||
put_value: int = 0
|
||||
|
||||
class KFDIface:
|
||||
kfd:int = -1
|
||||
event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
|
||||
gpus:list[pathlib.Path] = []
|
||||
kfd:HWInterface|None = None
|
||||
event_page:HCQBuffer|None = None
|
||||
gpus:list[HWInterface] = []
|
||||
|
||||
def _is_usable_gpu(self, gpu_id):
|
||||
with contextlib.suppress(OSError): return int(pathlib.Path(gpu_id).read_text()) != 0
|
||||
with contextlib.suppress(OSError): return int(gpu_id.read()) != 0
|
||||
return False
|
||||
|
||||
def __init__(self, dev, device_id):
|
||||
self.dev = dev
|
||||
|
||||
if KFDIface.kfd == -1:
|
||||
KFDIface.kfd = os.open("/dev/kfd", os.O_RDWR)
|
||||
gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if self._is_usable_gpu(g)]
|
||||
gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))
|
||||
kfd_topo_path = "/sys/devices/virtual/kfd/kfd/topology/nodes"
|
||||
|
||||
# Initialize KFD interface during first run
|
||||
if KFDIface.kfd is None:
|
||||
KFDIface.kfd = HWInterface("/dev/kfd", os.O_RDWR)
|
||||
gpus = [g for g in HWInterface(kfd_topo_path).listdir() if self._is_usable_gpu(HWInterface(f"{kfd_topo_path}/{g}/gpu_id"))]
|
||||
gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1]))
|
||||
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
|
||||
KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
|
||||
|
||||
if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")
|
||||
|
||||
with open(f"{KFDIface.gpus[device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
|
||||
with open(f"{KFDIface.gpus[device_id]}/properties", "r") as f: self.props = {line.split()[0]: int(line.split()[1]) for line in f}
|
||||
self.drm_fd = os.open(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
|
||||
self.gpu_id = int(HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
|
||||
self.props = {l.split()[0]: int(l.split()[1]) for l in HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
|
||||
self.drm_fd = HWInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
|
||||
|
||||
kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
|
||||
kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id)
|
||||
|
||||
# Set these for our device.
|
||||
if KFDIface.event_page is None:
|
||||
@@ -331,8 +333,8 @@ class KFDIface:
|
||||
|
||||
if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
|
||||
|
||||
if host: buf = addr = libc.mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, -1, 0)
|
||||
else: buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, -1, 0)
|
||||
if host: buf = addr = HWInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
|
||||
else: buf, addr = 0, HWInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0)
|
||||
assert addr != 0xffffffffffffffff
|
||||
|
||||
try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
|
||||
@@ -344,7 +346,7 @@ class KFDIface:
|
||||
raise
|
||||
|
||||
if not host:
|
||||
buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, self.drm_fd, mem.mmap_offset)
|
||||
buf = self.drm_fd.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, mem.mmap_offset)
|
||||
assert addr == buf == mem.va_addr
|
||||
|
||||
self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem))
|
||||
@@ -355,7 +357,7 @@ class KFDIface:
|
||||
c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
|
||||
stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
|
||||
assert stm.n_success == len(gpus)
|
||||
if mem.va_addr: libc.munmap(mem.va_addr, mem.size)
|
||||
if mem.va_addr: HWInterface.munmap(mem.va_addr, mem.size)
|
||||
kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)
|
||||
|
||||
def map(self, mem):
|
||||
@@ -376,7 +378,7 @@ class KFDIface:
|
||||
|
||||
if not hasattr(self, 'doorbells'):
|
||||
self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
|
||||
self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDIface.kfd, self.doorbells_base)
|
||||
self.doorbells = cast(HWInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)
|
||||
|
||||
return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"),
|
||||
read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
|
||||
@@ -398,8 +400,8 @@ class KFDIface:
|
||||
raise RuntimeError("\n".join(report))
|
||||
|
||||
class PCIIface:
|
||||
vfio:bool = getenv("VFIO", 1) and os.path.exists("/dev/vfio/vfio")
|
||||
vfio_fd:int = -1
|
||||
vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio")
|
||||
vfio_fd:HWInterface
|
||||
gpus:list[Any] = []
|
||||
|
||||
def __init__(self, dev, dev_id):
|
||||
@@ -419,43 +421,48 @@ class PCIIface:
|
||||
self.pcibus = f"{self.pcidev.domain_16:04x}:{self.pcidev.bus:02x}:{self.pcidev.dev:02x}.{self.pcidev.func:d}"
|
||||
|
||||
# Unbind the device from the kernel driver
|
||||
if os.path.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
|
||||
pathlib.Path(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind").write_text(self.pcibus)
|
||||
pathlib.Path(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize").write_text("15")
|
||||
if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
|
||||
HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)
|
||||
HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write("15")
|
||||
|
||||
# Probe device
|
||||
libpciaccess.pci_device_probe(ctypes.byref(self.pcidev))
|
||||
|
||||
# Try to init vfio. Use it if success.
|
||||
if PCIIface.vfio and PCIIface.vfio_fd == -1:
|
||||
if PCIIface.vfio:
|
||||
try:
|
||||
pathlib.Path("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode").write_text("1")
|
||||
PCIIface.vfio_fd = os.open("/dev/vfio/vfio", os.O_RDWR)
|
||||
if first_dev:
|
||||
HWInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1")
|
||||
PCIIface.vfio_fd = HWInterface("/dev/vfio/vfio", os.O_RDWR)
|
||||
vfio.VFIO_CHECK_EXTENSION(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
|
||||
except OSError: PCIIface.vfio = False
|
||||
|
||||
HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci")
|
||||
HWInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus)
|
||||
|
||||
iommu_group = HWInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
|
||||
except OSError:
|
||||
if DEBUG >= 1: print("AM: failed to init vfio-pci module (not inserted or no-iommu mode is not supported).")
|
||||
PCIIface.vfio = False
|
||||
|
||||
# Init vfio for the device
|
||||
if PCIIface.vfio:
|
||||
pathlib.Path(f"/sys/bus/pci/devices/{self.pcibus}/driver_override").write_text("vfio-pci")
|
||||
pathlib.Path("/sys/bus/pci/drivers_probe").write_text(self.pcibus)
|
||||
|
||||
iommu_group = os.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
|
||||
self.vfio_group = os.open(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
|
||||
vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd))
|
||||
self.vfio_group = HWInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
|
||||
vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd.fd))
|
||||
|
||||
if first_dev: vfio.VFIO_SET_IOMMU(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
|
||||
self.vfio_dev = vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode()))
|
||||
self.vfio_dev = HWInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode())))
|
||||
|
||||
self.irq_fd = os.eventfd(0, 0) # type: ignore[attr-defined]
|
||||
self.irq_fd = HWInterface.eventfd(0, 0)
|
||||
self.irq_poller = select.poll()
|
||||
self.irq_poller.register(self.irq_fd, select.POLLIN)
|
||||
self.irq_poller.register(self.irq_fd.fd, select.POLLIN)
|
||||
|
||||
irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
|
||||
argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd))
|
||||
argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
|
||||
vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
|
||||
else: libpciaccess.pci_device_enable(ctypes.byref(self.pcidev))
|
||||
|
||||
self.bar_fds = {bar: os.open(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC) for bar in [0, 2, 5]}
|
||||
self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY)
|
||||
self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC) for bar in [0, 2, 5]}
|
||||
|
||||
self.adev = AMDev(self.pcidev, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
|
||||
self.doorbell_cpu_addr = mv_address(dbell)
|
||||
@@ -469,19 +476,18 @@ class PCIIface:
|
||||
vfio.VFIO_DEVICE_GET_REGION_INFO(self.vfio_dev, reg:=vfio.struct_vfio_region_info(argsz=ctypes.sizeof(vfio.struct_vfio_region_info), index=bar))
|
||||
fd, sz, off = self.vfio_dev, size or reg.size, reg.offset + off
|
||||
else: fd, sz = self.bar_fds[bar], size or self.pcidev.regions[bar].size
|
||||
return to_mv(libc.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), fd, off), sz)
|
||||
return to_mv(fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz)
|
||||
|
||||
def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
|
||||
if host:
|
||||
vaddr = self.adev.mm.alloc_vaddr(size, align=mmap.PAGESIZE)
|
||||
va = libc.mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, -1, 0)
|
||||
va = HWInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0)
|
||||
|
||||
# Read pagemap to get the physical address of each page. The pages are locked.
|
||||
with open("/proc/self/pagemap", "rb") as f:
|
||||
for off in range(0, size, mmap.PAGESIZE):
|
||||
f.seek(((va + off) // mmap.PAGESIZE) * 8)
|
||||
pt_entry = struct.unpack("Q", f.read(8))[0] & ((1 << 55) - 1)
|
||||
self.adev.mm.map_range(vaddr=vaddr + off, size=mmap.PAGESIZE, paddr=pt_entry * mmap.PAGESIZE, system=True, snooped=True, uncached=True)
|
||||
for off in range(0, size, mmap.PAGESIZE):
|
||||
self.pagemap.seek(((va + off) // mmap.PAGESIZE) * 8)
|
||||
pt_entry = struct.unpack("Q", self.pagemap.read(8, binary=True))[0] & ((1 << 55) - 1)
|
||||
self.adev.mm.map_range(vaddr=vaddr + off, size=mmap.PAGESIZE, paddr=pt_entry * mmap.PAGESIZE, system=True, snooped=True, uncached=True)
|
||||
return HCQBuffer(vaddr, size, meta=(self.dev, [self.dev], None))
|
||||
|
||||
vm = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)
|
||||
@@ -510,8 +516,8 @@ class PCIIface:
|
||||
read_ptr=to_mv(gart.va_addr, 8).cast("Q"), write_ptr=to_mv(gart.va_addr+0x10, 8).cast("Q"))
|
||||
|
||||
def sleep(self, timeout):
|
||||
if PCIIface.vfio and len(self.irq_poller.poll(timeout)):
|
||||
os.read(self.irq_fd, 1024)
|
||||
if PCIIface.vfio and (events_cnt:=len(self.irq_poller.poll(timeout))):
|
||||
self.irq_fd.read(8 * events_cnt)
|
||||
self.adev.ih.interrupt_handler()
|
||||
|
||||
def on_device_hang(self):
|
||||
@@ -519,14 +525,13 @@ class PCIIface:
|
||||
raise RuntimeError("Device hang detected")
|
||||
|
||||
class AMDDevice(HCQCompiled):
|
||||
driverless:bool = not os.path.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
|
||||
driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
|
||||
signals_page:Any = None
|
||||
signals_pool:list[int] = []
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
||||
self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id)
|
||||
|
||||
self.target = int(self.dev_iface.props['gfx_target_version'])
|
||||
self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
|
||||
if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
|
||||
|
||||
@@ -1,27 +1,27 @@
|
||||
from __future__ import annotations
|
||||
import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys
|
||||
import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
|
||||
assert sys.platform != 'win32'
|
||||
from typing import Any, cast, Union, Type
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
|
||||
from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
|
||||
from tinygrad.ops import sint
|
||||
from tinygrad.device import BufferSpec
|
||||
from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
|
||||
from tinygrad.renderer.ptx import PTXRenderer
|
||||
from tinygrad.renderer.cstyle import NVRenderer
|
||||
from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
|
||||
from tinygrad.runtime.autogen import nv_gpu, libc
|
||||
from tinygrad.runtime.autogen import nv_gpu
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
|
||||
if MOCKGPU:=getenv("MOCKGPU"): import test.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import
|
||||
|
||||
def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}"
|
||||
|
||||
NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")}
|
||||
NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")}
|
||||
|
||||
def nv_iowr(fd, nr, args):
|
||||
ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
|
||||
def nv_iowr(fd:HWInterface, nr, args):
|
||||
ret = fd.ioctl((3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
|
||||
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
|
||||
|
||||
def rm_alloc(fd, clss, root, parant, params):
|
||||
@@ -46,8 +46,8 @@ def make_rmctrl_type():
|
||||
getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))})
|
||||
rmctrl = make_rmctrl_type()
|
||||
|
||||
def uvm_ioctl(cmd, sttyp, fd, **kwargs):
|
||||
ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
|
||||
def uvm_ioctl(cmd, sttyp, fd:HWInterface, **kwargs):
|
||||
ret = fd.ioctl(cmd, made:=sttyp(**kwargs))
|
||||
if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
|
||||
if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
|
||||
return made
|
||||
@@ -283,8 +283,8 @@ class GPFifo:
|
||||
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
|
||||
class NVDevice(HCQCompiled[NVSignal]):
|
||||
root = None
|
||||
fd_ctl: int = -1
|
||||
fd_uvm: int = -1
|
||||
fd_ctl: HWInterface
|
||||
fd_uvm: HWInterface
|
||||
gpus_info: Union[list, ctypes.Array] = []
|
||||
signals_page: Any = None
|
||||
signals_pool: list[int] = []
|
||||
@@ -297,19 +297,17 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
host_object_enumerator: int = 0x1000
|
||||
|
||||
def _new_gpu_fd(self):
|
||||
fd_dev = os.open(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
|
||||
nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
|
||||
fd_dev = HWInterface(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
|
||||
nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl.fd))
|
||||
return fd_dev
|
||||
|
||||
def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
|
||||
fd_dev = self._new_gpu_fd() if not system else os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
|
||||
made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
|
||||
fd_dev = self._new_gpu_fd() if not system else HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
|
||||
made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev.fd,
|
||||
params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags))
|
||||
nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
|
||||
if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
|
||||
res = libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
|
||||
os.close(fd_dev)
|
||||
return res
|
||||
return fd_dev.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), 0)
|
||||
|
||||
def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
|
||||
# Uncached memory is "system". Use huge pages only for gpu memory.
|
||||
@@ -318,7 +316,7 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)
|
||||
|
||||
if host:
|
||||
va_addr = libc.mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, -1, 0)
|
||||
va_addr = HWInterface.anon_mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
|
||||
|
||||
flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) \
|
||||
| (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30)
|
||||
@@ -357,7 +355,7 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
|
||||
self._debug_mappings.pop((cast(int, mem.va_addr), mem.size))
|
||||
uvm.free(self.fd_uvm, base=cast(int, mem.va_addr), length=mem.size)
|
||||
if mem.meta.has_cpu_mapping: libc.munmap(cast(int, mem.va_addr), mem.size)
|
||||
if mem.meta.has_cpu_mapping: HWInterface.munmap(cast(int, mem.va_addr), mem.size)
|
||||
|
||||
def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer:
|
||||
if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
|
||||
@@ -365,8 +363,9 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
|
||||
# NOTE: va_addr is set to make rawbufs compatible with HCQBuffer protocol.
|
||||
self._debug_mappings[(va_base, size)] = tag
|
||||
return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root,
|
||||
hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))
|
||||
return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl.fd,
|
||||
hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs,
|
||||
mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))
|
||||
|
||||
def _gpu_map(self, mem:HCQBuffer):
|
||||
if self.gpu_uuid in mem.meta.mapped_gpu_ids: return
|
||||
@@ -384,12 +383,12 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
if NVDevice.root is None:
|
||||
NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
|
||||
NVDevice.fd_uvm = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
|
||||
fd_uvm_2 = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
|
||||
NVDevice.fd_ctl = HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
|
||||
NVDevice.fd_uvm = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
|
||||
self.fd_uvm_2 = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
|
||||
NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
|
||||
uvm.initialize(self.fd_uvm)
|
||||
with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too
|
||||
with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too
|
||||
|
||||
nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
|
||||
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
|
||||
@@ -425,7 +424,7 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)]))
|
||||
|
||||
uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
|
||||
uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root, hVaSpace=vaspace)
|
||||
uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root, hVaSpace=vaspace)
|
||||
|
||||
for dev in cast(list[NVDevice], self.devices):
|
||||
try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
|
||||
@@ -481,7 +480,7 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
assert ws_token_params.workSubmitToken != -1
|
||||
|
||||
channel_base = self._alloc_gpu_vaddr(0x4000000, force_low=True)
|
||||
uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root,
|
||||
uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root,
|
||||
hChannel=gpfifo, base=channel_base, length=0x4000000)
|
||||
|
||||
return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
|
||||
|
||||
@@ -1,10 +1,41 @@
|
||||
from __future__ import annotations
|
||||
from typing import cast, Type, TypeVar, Generic, Any
|
||||
import contextlib, decimal, statistics, time, ctypes, array
|
||||
import contextlib, decimal, statistics, time, ctypes, array, os, fcntl
|
||||
from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator, ProfileRangeEvent, ProfileDeviceEvent
|
||||
from tinygrad.ops import sym_infer, sint, Variable
|
||||
from tinygrad.runtime.autogen import libc
|
||||
|
||||
class HWInterface:
|
||||
"""
|
||||
Hardware Abstraction Layer for HCQ devices. The class provides a unified interface for interacting with hardware devices.
|
||||
"""
|
||||
|
||||
def __init__(self, path:str="", flags:int=os.O_RDONLY, fd:int|None=None):
|
||||
self.path:str = path
|
||||
self.fd:int = fd or os.open(path, flags)
|
||||
def __del__(self): os.close(self.fd)
|
||||
def ioctl(self, request, arg): return fcntl.ioctl(self.fd, request, arg)
|
||||
def mmap(self, start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, self.fd, offset)
|
||||
def read(self, size=None, binary=False):
|
||||
with open(self.fd, "rb" if binary else "r", closefd=False) as file: return file.read(size)
|
||||
def write(self, content, binary=False):
|
||||
with open(self.fd, "wb" if binary else "w", closefd=False) as file: file.write(content)
|
||||
def listdir(self): return os.listdir(self.path)
|
||||
def seek(self, offset): os.lseek(self.fd, offset, os.SEEK_SET)
|
||||
@staticmethod
|
||||
def anon_mmap(start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, -1, offset)
|
||||
@staticmethod
|
||||
def munmap(buf, sz): return libc.munmap(buf, sz)
|
||||
@staticmethod
|
||||
def exists(path): return os.path.exists(path)
|
||||
@staticmethod
|
||||
def readlink(path): return os.readlink(path)
|
||||
@staticmethod
|
||||
def eventfd(initval, flags=None): return HWInterface(fd=os.eventfd(initval, flags))
|
||||
|
||||
if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.mockgpu import MockHWInterface as HWInterface # noqa: F401 # pylint: disable=unused-import
|
||||
|
||||
# **************** for HCQ Compatible Devices ****************
|
||||
|
||||
|
||||
Reference in New Issue
Block a user