From ab3ac2b58d4e36e33714d710f8e5faf126d9eac0 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Tue, 7 Jan 2025 18:18:28 +0300 Subject: [PATCH] hw interface abstraction (#8524) * use HWInterface in autogen * mockgpu * HWInterface * more HWInterface * fix * fix * old code * fix * implicit field definition * add offset check to mockgpu too * refactor * forgot to pass flags + read rewrite * test * play with vfio * nv: this should be kept * try this * vfio * rm overwrite=True * linetr * do not reinit kfd * minor * mypy * mock * init them once --------- Co-authored-by: patrini32 --- autogen_stubs.sh | 8 ++ test/mockgpu/driver.py | 75 ++--------- test/mockgpu/mockgpu.py | 217 ++++++++----------------------- tinygrad/runtime/autogen/kfd.py | 7 +- tinygrad/runtime/autogen/vfio.py | 12 +- tinygrad/runtime/ops_amd.py | 111 ++++++++-------- tinygrad/runtime/ops_nv.py | 51 ++++---- tinygrad/runtime/support/hcq.py | 33 ++++- 8 files changed, 198 insertions(+), 316 deletions(-) diff --git a/autogen_stubs.sh b/autogen_stubs.sh index 8c68d5bb3b..670a789753 100755 --- a/autogen_stubs.sh +++ b/autogen_stubs.sh @@ -79,6 +79,10 @@ generate_kfd() { fixup $BASE/kfd.py sed -i "s\import ctypes\import ctypes, os\g" $BASE/kfd.py + sed -i "s\import fcntl, functools\import functools" $BASE/kfd.py + sed -i "s\import ctypes,os\a from tinygrad.runtime.support import HWInterface\g" $BASE/kfd.py + sed -i "s\def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, **kwargs):\def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, **kwargs):\g" $BASE/kfd.py + sed -i "s\fcntl.ioctl(__fd, (__idir<<30)\__fd.ioctl((__idir<<30)\g" $BASE/kfd.py python3 -c "import tinygrad.runtime.autogen.kfd" } @@ -263,6 +267,10 @@ generate_vfio() { /usr/include/linux/vfio.h \ -o $BASE/vfio.py fixup $BASE/vfio.py + sed -i "s\import ctypes\import ctypes, os\g" $BASE/vfio.py + sed -i "s\import fcntl, functools\import functools" $BASE/vfio.py + sed -i "s\import ctypes,os\a from tinygrad.runtime.support import HWInterface\g" $BASE/vfio.py + sed -i "s\fcntl.ioctl(__fd, (__idir<<30)\return __fd.ioctl((__idir<<30)\g" $BASE/vfio.py } generate_am() { diff --git a/test/mockgpu/driver.py b/test/mockgpu/driver.py index 369bcc3679..7be165cb06 100644 --- a/test/mockgpu/driver.py +++ b/test/mockgpu/driver.py @@ -1,95 +1,36 @@ -import ctypes, struct, os from typing import Any from dataclasses import dataclass -from tinygrad.helpers import round_up class VirtFileDesc: def __init__(self, fd): self.fd, self.off = fd, 0 - def read(self, fd, buf, sz): raise NotImplementedError() def ioctl(self, fd, req, argp): raise NotImplementedError() def mmap(self, st, sz, prot, flags, fd, off): raise NotImplementedError() - def write(self, fd, buf, sz): raise NotImplementedError() - def lseek(self, fd, off, whence): raise NotImplementedError() - def fstat(self, fd, buf): raise NotImplementedError() - def getdents(self, fd, buf, sz): return -1 def close(self, fd): return 0 class TextFileDesc(VirtFileDesc): def __init__(self, fd, text): super().__init__(fd) - self.content = ctypes.create_string_buffer(text.encode()) - self.sz = len(self.content) - 1 + self.content = text def ioctl(self, fd, req, argp): return 0 - def write(self, fd, buf, sz): return -1 - def read(self, fd, buf, sz): - ctypes.memmove(buf, ctypes.addressof(self.content) + self.off, rdsz:=min(sz, self.sz - self.off)) - self.off += rdsz - return rdsz - def lseek(self, fd, off, whence): - if whence == os.SEEK_SET: self.off = off - elif whence == os.SEEK_CUR: self.off += off - elif whence == os.SEEK_END: self.off = self.sz + off - else: return -1 - return 0 - def fstat(self, fd, buf): - ctypes.memmove(buf, VirtFile.build_fstat(st_size=self.sz), 88) - return 0 - + def read_contents(self, size=None): + ret = self.content[self.off:self.off+(size or len(self.content))] + self.off += (size or len(self.content)) + return ret + def seek(self, offset): self.off += offset class DirFileDesc(VirtFileDesc): def __init__(self, fd, child_names): super().__init__(fd) - child_names = ['.', '..'] + child_names - - tmp = b'' - for ino, name in enumerate(child_names): - tmp += VirtFile.build_dirent(ino + 1, 0, name) - self.content = ctypes.create_string_buffer(tmp) - self.sz = len(self.content) - 1 + self.child_names = child_names def ioctl(self, fd, req, argp): return 0 - def write(self, fd, buf, sz): return -1 - def read(self, fd, buf, sz): return -1 - def lseek(self, fd, off, whence): - if whence == os.SEEK_SET: self.off = off - elif whence == os.SEEK_CUR: self.off += off - elif whence == os.SEEK_END: self.off = self.sz + off - else: return -1 - return 0 - - def getdents(self, fd, buf, sz): - if self.sz == self.off: return 0 - if sz < self.sz: return -1 - ctypes.memmove(buf, ctypes.addressof(self.content) + self.off, self.sz) - self.off = self.sz - return self.sz - - def fstat(self, fd, buf): - ctypes.memmove(buf, VirtFile.build_fstat(st_mode=0o40755), 96) - return 0 + def list_contents(self): return self.child_names @dataclass(frozen=True) class VirtFile: path: str fdcls: Any # TODO: fix this Union[VirtFileDesc, functools.partial[VirtFileDesc]] - @staticmethod - def build_fstat(st_dev=0x20, st_ino=0x100000, st_mode=0o100777, st_nlink=1, st_uid=0, st_gid=0, st_rdev=0, st_size=0, - st_blksize=4096, st_blocks=0, st_atime=0, st_mtime=0, st_ctime=0): - fmt_string = 'QQQIIIQQiQqqq' - ssz = struct.calcsize(fmt_string) - assert ssz == 96, f"{ssz} != 96" - return struct.pack(fmt_string, st_dev, st_ino, st_nlink, st_mode, st_uid, st_gid, - st_rdev, st_size, st_blksize, st_blocks, st_atime, st_mtime, st_ctime) - - @staticmethod - def build_dirent(d_ino, d_off, d_name, d_type=None): - # Start with packing inode number, offset, and record length - d_reclen = round_up(19 + len(d_name) + 1, 8) - packed_data = struct.pack('QQHc', d_ino, d_off, d_reclen, b'\x04') - d_name_bytes = d_name.encode() - return packed_data + d_name_bytes + b'\x00' + b'\x00' * (d_reclen - (19 + len(d_name) + 1)) - class VirtDriver: def __init__(self): self.tracked_files = [] diff --git a/test/mockgpu/mockgpu.py b/test/mockgpu/mockgpu.py index 9f120df0e7..5070c7bd28 100644 --- a/test/mockgpu/mockgpu.py +++ b/test/mockgpu/mockgpu.py @@ -1,160 +1,17 @@ -import ctypes, ctypes.util, struct, platform, time, os, builtins, atexit +import ctypes, ctypes.util, time, os, builtins, fcntl +from tinygrad.runtime.support.hcq import HWInterface from test.mockgpu.nv.nvdriver import NVDriver from test.mockgpu.amd.amddriver import AMDDriver -from tinygrad.helpers import to_mv start = time.perf_counter() # *** ioctl lib *** libc = ctypes.CDLL(ctypes.util.find_library("c")) libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long] libc.mmap.restype = ctypes.c_void_p -libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t] -libc.munmap.restype = ctypes.c_int -libc.fdopendir.argtypes = [ctypes.c_int] -libc.fdopendir.restype = ctypes.c_void_p - -# platform.processor calls `uname -p` which can return `unknown` on some systems -processor = os.getenv("IOCTL_PROCESSOR") or platform.processor() -OPEN_SYSCALL = {"aarch64": None, "x86_64": 2}[processor] -CLOSE_SYSCALL = {"aarch64": 57, "x86_64": 3}[processor] -READ_SYSCALL = {"aarch64": 63, "x86_64": 0}[processor] -IOCTL_SYSCALL = {"aarch64": 29, "x86_64": 16}[processor] -MMAP_SYSCALL = {"aarch64": 222, "x86_64": 9}[processor] -LSEEK_SYSCALL = {"aarch64": 62, "x86_64": 8}[processor] -NEWFSTATAT_SYSCALL = {"aarch64": 79, "x86_64": 262}[processor] -GETDENTS64_SYSCALL = {"aarch64": 61, "x86_64": 217}[processor] - -def install_hook(c_function, python_function): - python_function_addr = ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value - if processor == "x86_64": - # tramp = b"\x49\xB8" + struct.pack("Q", python_function_addr) + b"\x41\xFF\xE0" - # push r9 - # push r9 - # mov r9, 0x1122334455667788 - # mov [rsp+8], r9 - # pop r9 - # ret - tramp = b"\x41\x51\x41\x51\x49\xB9" + struct.pack("Q", python_function_addr) + b"\x4C\x89\x4C\x24\x08\x41\x59\xC3" - else: - raise Exception(f"processor {processor} not supported") - - original_bc = (ctypes.c_char * 64)() - - # get real ioctl address - ioctl_address = ctypes.cast(ctypes.byref(c_function), ctypes.POINTER(ctypes.c_ulong)) - - # hook ioctl - ret = libc.mprotect(ctypes.c_ulong((ioctl_address.contents.value//0x1000)*0x1000), 0x2000, 7) - assert ret == 0 - libc.memcpy(original_bc, ioctl_address.contents, len(tramp)) - libc.memcpy(ioctl_address.contents, ctypes.create_string_buffer(tramp), len(tramp)) - - # Restore correct functions to close libs after python exits - def __restore(): libc.memcpy(ioctl_address.contents, original_bc, len(tramp)) - atexit.register(__restore) drivers = [AMDDriver(), NVDriver()] tracked_fds = {} -@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_ulong) -def _open(name, flags, mode): - for d in drivers: - pyname = name.decode() - for x in d.tracked_files: - if pyname == x.path: - virtfd = d.open(pyname, flags, mode, x) - tracked_fds[virtfd.fd] = virtfd - return virtfd.fd - - libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_char_p, ctypes.c_int, ctypes.c_ulong] - libc.syscall.restype = ctypes.c_int - return libc.syscall(OPEN_SYSCALL, name, flags, mode) - -@ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_char_p) -def _opendir(name): - fd = _open(name, os.O_RDONLY| os.O_DIRECTORY, 0) - if fd >= 0x80: - fake_dirfd = _open(".".encode(), os.O_RDONLY| os.O_DIRECTORY, 0) - st = libc.fdopendir(fake_dirfd) - to_mv(st, 8).cast('Q')[0] = fd - return st - else: return libc.fdopendir(fd) - -@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int) -def _close(fd): - if fd in tracked_fds: - tracked_fds[fd].close(fd) - tracked_fds.pop(fd) - return 0 - - libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int] - libc.syscall.restype = ctypes.c_int - return libc.syscall(CLOSE_SYSCALL, fd) - -@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p) -def _closedir(st): return _close(to_mv(st, 8).cast('Q')[0]) - -@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p) -def _ioctl(fd, request, argp): - if fd in tracked_fds: return tracked_fds[fd].ioctl(fd, request, argp) - - libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p] - libc.syscall.restype = ctypes.c_int - return libc.syscall(IOCTL_SYSCALL, ctypes.c_int(fd), ctypes.c_ulong(request), ctypes.c_void_p(argp)) - -@ctypes.CFUNCTYPE(ctypes.c_long, ctypes.c_int, ctypes.c_void_p, ctypes.c_size_t) -def _read(fd, buf, sz): - if fd in tracked_fds: return tracked_fds[fd].read(fd, buf, sz) - - libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_void_p, ctypes.c_size_t] - libc.syscall.restype = ctypes.c_int - return libc.syscall(READ_SYSCALL, ctypes.c_int(fd), ctypes.c_void_p(buf), ctypes.c_size_t(sz)) - -@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_int) -def _lseek64(fd, off, whence): - if fd in tracked_fds: return tracked_fds[fd].lseek(fd, off, whence) - - libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_ulong, ctypes.c_int] - libc.syscall.restype = ctypes.c_int - return libc.syscall(LSEEK_SYSCALL, fd, off, whence) - -@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p) -def _stat64(name, buf): - for d in drivers: - pyname = name.decode() - for x in d.tracked_files: - if pyname == x.path: - virtfd = d.open(pyname, 0, 0, x) - return virtfd.fstat(virtfd.fd, buf) - - libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p, ctypes.c_ulong] - libc.syscall.restype = ctypes.c_int - return libc.syscall(NEWFSTATAT_SYSCALL, -100, name, ctypes.c_void_p(buf), 0) - -@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_void_p) -def _fstat64(fd, buf): - if fd in tracked_fds: return tracked_fds[fd].fstat(fd, buf) - - empty_str = (ctypes.c_char*1)() - libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p, ctypes.c_ulong] - libc.syscall.restype = ctypes.c_int - return libc.syscall(NEWFSTATAT_SYSCALL, ctypes.c_int(fd), empty_str, ctypes.c_void_p(buf), 0x1000) - -@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_ulong) -def _getdents64(fd, buf, sz): - if fd in tracked_fds: return tracked_fds[fd].getdents(fd, buf, sz) - - libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_void_p, ctypes.c_ulong] - libc.syscall.restype = ctypes.c_int - return libc.syscall(GETDENTS64_SYSCALL, fd, buf, sz) - -def _mmap(start, sz, prot, flags, fd, offset): - if fd in tracked_fds: return tracked_fds[fd].mmap(start, sz, prot, flags, fd, offset) - return libc.mmap(start, sz, prot, flags, fd, offset) - -def _munmap(buf, sz): - return libc.munmap(buf, sz) - orignal_memoryview = builtins.memoryview class TrackedMemoryView: def __init__(self, data, rcb, wcb): @@ -185,20 +42,60 @@ def _memoryview(cls, mem): for st,en,rcb,wcb in d.tracked_addresses: if st <= addr <= en: return TrackedMemoryView(mem, rcb, wcb) return orignal_memoryview(mem) - -install_hook(libc.open, _open) -install_hook(libc.opendir, _opendir) -install_hook(libc.close, _close) -install_hook(libc.closedir, _closedir) -install_hook(libc.ioctl, _ioctl) -install_hook(libc.read, _read) -install_hook(libc.lseek64, _lseek64) -install_hook(libc.stat64, _stat64) -install_hook(libc.fstat64, _fstat64) -install_hook(libc.getdents64, _getdents64) builtins.memoryview = type("memoryview", (), {'__new__': _memoryview}) # type: ignore -# rewrite autogen's libc mmaps functions. -import tinygrad.runtime.autogen.libc as autogen_libc -autogen_libc.mmap = _mmap # type: ignore -autogen_libc.munmap = _munmap # type: ignore +def _open(path, flags): + for d in drivers: + for x in d.tracked_files: + if path == x.path: + virtfd = d.open(path, flags, 0o777, x) + tracked_fds[virtfd.fd] = virtfd + return virtfd.fd + return os.open(path, flags, 0o777) if os.path.exists(path) else None + +class MockHWInterface(HWInterface): + def __init__(self, path:str="", flags:int=os.O_RDONLY, fd:int|None=None): + self.path = path + self.fd = fd or _open(path, flags) + + def __del__(self): + if self.fd in tracked_fds: + tracked_fds[self.fd].close(self.fd) + tracked_fds.pop(self.fd) + else: os.close(self.fd) + + def ioctl(self, request, arg): + if self.fd in tracked_fds: + return tracked_fds[self.fd].ioctl(self.fd, request, ctypes.addressof(arg)) + return fcntl.ioctl(self.fd, request, arg) + + def mmap(self, start, sz, prot, flags, offset): + if self.fd in tracked_fds: + return tracked_fds[self.fd].mmap(start, sz, prot, flags, self.fd, offset) + return libc.mmap(start, sz, prot, flags, self.fd, offset) + + def read(self, size=None, binary=False): + if binary: raise NotImplementedError() + if self.fd in tracked_fds: + return tracked_fds[self.fd].read_contents(size) + with open(self.fd, "rb" if binary else "r", closefd=False) as file: + if file.tell() >= os.fstat(self.fd).st_size: file.seek(0) + return file.read(size) + + def listdir(self): + if self.fd in tracked_fds: + return tracked_fds[self.fd].list_contents() + return os.listdir(self.path) + + def write(self, content, binary=False): raise NotImplementedError() + def seek(self, offset): + if self.fd in tracked_fds: + tracked_fds[self.fd].seek(offset) + else: + os.lseek(self.fd, offset, os.SEEK_CUR) + @staticmethod + def exists(path): return _open(path, os.O_RDONLY) is not None + @staticmethod + def readlink(path): raise NotImplementedError() + @staticmethod + def eventfd(initval, flags=None): NotImplementedError() diff --git a/tinygrad/runtime/autogen/kfd.py b/tinygrad/runtime/autogen/kfd.py index 4fc6368239..cdb4b3f0db 100644 --- a/tinygrad/runtime/autogen/kfd.py +++ b/tinygrad/runtime/autogen/kfd.py @@ -10,10 +10,11 @@ import ctypes, os -import fcntl, functools +import functools +from tinygrad.runtime.support.hcq import HWInterface -def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, **kwargs): - ret = fcntl.ioctl(__fd, (__idir<<30) | (ctypes.sizeof(made := __user_struct(**kwargs))<<16) | (__base<<8) | __nr, made) +def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, **kwargs): + ret = __fd.ioctl((__idir<<30) | (ctypes.sizeof(made := __user_struct(**kwargs))<<16) | (__base<<8) | __nr, made) if ret != 0: raise RuntimeError(f"ioctl returned {ret}") return made diff --git a/tinygrad/runtime/autogen/vfio.py b/tinygrad/runtime/autogen/vfio.py index ff0cda416a..86abf92497 100644 --- a/tinygrad/runtime/autogen/vfio.py +++ b/tinygrad/runtime/autogen/vfio.py @@ -9,14 +9,14 @@ import ctypes +from tinygrad.runtime.support.hcq import HWInterface +import functools -import fcntl, functools +def _do_ioctl_io(__idir, __base, __nr, __fd:HWInterface, val=0, __len=0): + return __fd.ioctl((__idir<<30) | (__len<<16) | (__base<<8) | __nr, val) -def _do_ioctl_io(__idir, __base, __nr, __fd, val=0, __len=0): - return fcntl.ioctl(__fd, (__idir<<30) | (__len<<16) | (__base<<8) | __nr, val) - -def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, __val=None, **kwargs): - ret = fcntl.ioctl(__fd, (__idir<<30) | (ctypes.sizeof(made := (__made or __user_struct(**kwargs)))<<16) | (__base<<8) | __nr, made) +def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, __val=None, **kwargs): + ret = __fd.ioctl((__idir<<30) | (ctypes.sizeof(made := (__made or __user_struct(**kwargs)))<<16) | (__base<<8) | __nr, made) if ret != 0: raise RuntimeError(f"ioctl returned {ret}") return made diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 3ac13cd1b0..02c7ca0b13 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -1,12 +1,12 @@ from __future__ import annotations -from typing import Any -import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, array, contextlib, sys, select, struct +from typing import Any, cast +import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select, struct assert sys.platform != 'win32' from dataclasses import dataclass -from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram +from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface from tinygrad.ops import sint from tinygrad.device import BufferSpec -from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address +from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG from tinygrad.renderer.cstyle import AMDRenderer from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, libpciaccess, vfio from tinygrad.runtime.autogen.am import am @@ -14,7 +14,6 @@ from tinygrad.runtime.support.compiler_hip import AMDCompiler from tinygrad.runtime.support.elf import elf_loader from tinygrad.runtime.support.am.amdev import AMDev if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import -if getenv("MOCKGPU"): import test.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107 @@ -281,31 +280,34 @@ class AMDQueueDesc: put_value: int = 0 class KFDIface: - kfd:int = -1 - event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args] - gpus:list[pathlib.Path] = [] + kfd:HWInterface|None = None + event_page:HCQBuffer|None = None + gpus:list[HWInterface] = [] def _is_usable_gpu(self, gpu_id): - with contextlib.suppress(OSError): return int(pathlib.Path(gpu_id).read_text()) != 0 + with contextlib.suppress(OSError): return int(gpu_id.read()) != 0 return False def __init__(self, dev, device_id): self.dev = dev - if KFDIface.kfd == -1: - KFDIface.kfd = os.open("/dev/kfd", os.O_RDWR) - gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if self._is_usable_gpu(g)] - gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1])) + kfd_topo_path = "/sys/devices/virtual/kfd/kfd/topology/nodes" + + # Initialize KFD interface during first run + if KFDIface.kfd is None: + KFDIface.kfd = HWInterface("/dev/kfd", os.O_RDWR) + gpus = [g for g in HWInterface(kfd_topo_path).listdir() if self._is_usable_gpu(HWInterface(f"{kfd_topo_path}/{g}/gpu_id"))] + gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1])) visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()] KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?") - with open(f"{KFDIface.gpus[device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read()) - with open(f"{KFDIface.gpus[device_id]}/properties", "r") as f: self.props = {line.split()[0]: int(line.split()[1]) for line in f} - self.drm_fd = os.open(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR) + self.gpu_id = int(HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read()) + self.props = {l.split()[0]: int(l.split()[1]) for l in HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()} + self.drm_fd = HWInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR) - kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id) + kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id) # Set these for our device. if KFDIface.event_page is None: @@ -331,8 +333,8 @@ class KFDIface: if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC - if host: buf = addr = libc.mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, -1, 0) - else: buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, -1, 0) + if host: buf = addr = HWInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0) + else: buf, addr = 0, HWInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0) assert addr != 0xffffffffffffffff try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, @@ -344,7 +346,7 @@ class KFDIface: raise if not host: - buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, self.drm_fd, mem.mmap_offset) + buf = self.drm_fd.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, mem.mmap_offset) assert addr == buf == mem.va_addr self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem)) @@ -355,7 +357,7 @@ class KFDIface: c_gpus = (ctypes.c_int32 * len(gpus))(*gpus) stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus)) assert stm.n_success == len(gpus) - if mem.va_addr: libc.munmap(mem.va_addr, mem.size) + if mem.va_addr: HWInterface.munmap(mem.va_addr, mem.size) kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle) def map(self, mem): @@ -376,7 +378,7 @@ class KFDIface: if not hasattr(self, 'doorbells'): self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages - self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDIface.kfd, self.doorbells_base) + self.doorbells = cast(HWInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base) return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"), read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"), @@ -398,8 +400,8 @@ class KFDIface: raise RuntimeError("\n".join(report)) class PCIIface: - vfio:bool = getenv("VFIO", 1) and os.path.exists("/dev/vfio/vfio") - vfio_fd:int = -1 + vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio") + vfio_fd:HWInterface gpus:list[Any] = [] def __init__(self, dev, dev_id): @@ -419,43 +421,48 @@ class PCIIface: self.pcibus = f"{self.pcidev.domain_16:04x}:{self.pcidev.bus:02x}:{self.pcidev.dev:02x}.{self.pcidev.func:d}" # Unbind the device from the kernel driver - if os.path.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"): - pathlib.Path(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind").write_text(self.pcibus) - pathlib.Path(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize").write_text("15") + if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"): + HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus) + HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write("15") # Probe device libpciaccess.pci_device_probe(ctypes.byref(self.pcidev)) # Try to init vfio. Use it if success. - if PCIIface.vfio and PCIIface.vfio_fd == -1: + if PCIIface.vfio: try: - pathlib.Path("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode").write_text("1") - PCIIface.vfio_fd = os.open("/dev/vfio/vfio", os.O_RDWR) + if first_dev: + HWInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1") + PCIIface.vfio_fd = HWInterface("/dev/vfio/vfio", os.O_RDWR) vfio.VFIO_CHECK_EXTENSION(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU) - except OSError: PCIIface.vfio = False + + HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci") + HWInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus) + + iommu_group = HWInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1] + except OSError: + if DEBUG >= 1: print("AM: failed to init vfio-pci module (not inserted or no-iommu mode is not supported).") + PCIIface.vfio = False # Init vfio for the device if PCIIface.vfio: - pathlib.Path(f"/sys/bus/pci/devices/{self.pcibus}/driver_override").write_text("vfio-pci") - pathlib.Path("/sys/bus/pci/drivers_probe").write_text(self.pcibus) - - iommu_group = os.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1] - self.vfio_group = os.open(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR) - vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd)) + self.vfio_group = HWInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR) + vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd.fd)) if first_dev: vfio.VFIO_SET_IOMMU(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU) - self.vfio_dev = vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode())) + self.vfio_dev = HWInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode()))) - self.irq_fd = os.eventfd(0, 0) # type: ignore[attr-defined] + self.irq_fd = HWInterface.eventfd(0, 0) self.irq_poller = select.poll() - self.irq_poller.register(self.irq_fd, select.POLLIN) + self.irq_poller.register(self.irq_fd.fd, select.POLLIN) irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER, - argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd)) + argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd)) vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs) else: libpciaccess.pci_device_enable(ctypes.byref(self.pcidev)) - self.bar_fds = {bar: os.open(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC) for bar in [0, 2, 5]} + self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY) + self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC) for bar in [0, 2, 5]} self.adev = AMDev(self.pcidev, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I')) self.doorbell_cpu_addr = mv_address(dbell) @@ -469,19 +476,18 @@ class PCIIface: vfio.VFIO_DEVICE_GET_REGION_INFO(self.vfio_dev, reg:=vfio.struct_vfio_region_info(argsz=ctypes.sizeof(vfio.struct_vfio_region_info), index=bar)) fd, sz, off = self.vfio_dev, size or reg.size, reg.offset + off else: fd, sz = self.bar_fds[bar], size or self.pcidev.regions[bar].size - return to_mv(libc.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), fd, off), sz) + return to_mv(fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz) def alloc(self, size:int, host=False, uncached=False, cpu_access=False): if host: vaddr = self.adev.mm.alloc_vaddr(size, align=mmap.PAGESIZE) - va = libc.mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, -1, 0) + va = HWInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0) # Read pagemap to get the physical address of each page. The pages are locked. - with open("/proc/self/pagemap", "rb") as f: - for off in range(0, size, mmap.PAGESIZE): - f.seek(((va + off) // mmap.PAGESIZE) * 8) - pt_entry = struct.unpack("Q", f.read(8))[0] & ((1 << 55) - 1) - self.adev.mm.map_range(vaddr=vaddr + off, size=mmap.PAGESIZE, paddr=pt_entry * mmap.PAGESIZE, system=True, snooped=True, uncached=True) + for off in range(0, size, mmap.PAGESIZE): + self.pagemap.seek(((va + off) // mmap.PAGESIZE) * 8) + pt_entry = struct.unpack("Q", self.pagemap.read(8, binary=True))[0] & ((1 << 55) - 1) + self.adev.mm.map_range(vaddr=vaddr + off, size=mmap.PAGESIZE, paddr=pt_entry * mmap.PAGESIZE, system=True, snooped=True, uncached=True) return HCQBuffer(vaddr, size, meta=(self.dev, [self.dev], None)) vm = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access) @@ -510,8 +516,8 @@ class PCIIface: read_ptr=to_mv(gart.va_addr, 8).cast("Q"), write_ptr=to_mv(gart.va_addr+0x10, 8).cast("Q")) def sleep(self, timeout): - if PCIIface.vfio and len(self.irq_poller.poll(timeout)): - os.read(self.irq_fd, 1024) + if PCIIface.vfio and (events_cnt:=len(self.irq_poller.poll(timeout))): + self.irq_fd.read(8 * events_cnt) self.adev.ih.interrupt_handler() def on_device_hang(self): @@ -519,14 +525,13 @@ class PCIIface: raise RuntimeError("Device hang detected") class AMDDevice(HCQCompiled): - driverless:bool = not os.path.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0)) + driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0)) signals_page:Any = None signals_pool:list[int] = [] def __init__(self, device:str=""): self.device_id = int(device.split(":")[1]) if ":" in device else 0 self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id) - self.target = int(self.dev_iface.props['gfx_target_version']) self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100) if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}") diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index ab43c40263..df42c59208 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -1,27 +1,27 @@ from __future__ import annotations -import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys +import os, ctypes, contextlib, re, functools, mmap, struct, array, sys assert sys.platform != 'win32' from typing import Any, cast, Union, Type from dataclasses import dataclass from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator +from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU from tinygrad.ops import sint from tinygrad.device import BufferSpec from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod from tinygrad.renderer.ptx import PTXRenderer from tinygrad.renderer.cstyle import NVRenderer from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler -from tinygrad.runtime.autogen import nv_gpu, libc +from tinygrad.runtime.autogen import nv_gpu from tinygrad.runtime.support.elf import elf_loader if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import -if MOCKGPU:=getenv("MOCKGPU"): import test.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}" NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")} NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")} -def nv_iowr(fd, nr, args): - ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args) +def nv_iowr(fd:HWInterface, nr, args): + ret = fd.ioctl((3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args) if ret != 0: raise RuntimeError(f"ioctl returned {ret}") def rm_alloc(fd, clss, root, parant, params): @@ -46,8 +46,8 @@ def make_rmctrl_type(): getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))}) rmctrl = make_rmctrl_type() -def uvm_ioctl(cmd, sttyp, fd, **kwargs): - ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs)) +def uvm_ioctl(cmd, sttyp, fd:HWInterface, **kwargs): + ret = fd.ioctl(cmd, made:=sttyp(**kwargs)) if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}") if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}") return made @@ -283,8 +283,8 @@ class GPFifo: MAP_FIXED, MAP_NORESERVE = 0x10, 0x400 class NVDevice(HCQCompiled[NVSignal]): root = None - fd_ctl: int = -1 - fd_uvm: int = -1 + fd_ctl: HWInterface + fd_uvm: HWInterface gpus_info: Union[list, ctypes.Array] = [] signals_page: Any = None signals_pool: list[int] = [] @@ -297,19 +297,17 @@ class NVDevice(HCQCompiled[NVSignal]): host_object_enumerator: int = 0x1000 def _new_gpu_fd(self): - fd_dev = os.open(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC) - nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl)) + fd_dev = HWInterface(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC) + nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl.fd)) return fd_dev def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False): - fd_dev = self._new_gpu_fd() if not system else os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC) - made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev, + fd_dev = self._new_gpu_fd() if not system else HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC) + made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev.fd, params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags)) nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made) if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}") - res = libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0) - os.close(fd_dev) - return res + return fd_dev.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), 0) def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer: # Uncached memory is "system". Use huge pages only for gpu memory. @@ -318,7 +316,7 @@ class NVDevice(HCQCompiled[NVSignal]): va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access) if host: - va_addr = libc.mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, -1, 0) + va_addr = HWInterface.anon_mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0) flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) \ | (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30) @@ -357,7 +355,7 @@ class NVDevice(HCQCompiled[NVSignal]): self._debug_mappings.pop((cast(int, mem.va_addr), mem.size)) uvm.free(self.fd_uvm, base=cast(int, mem.va_addr), length=mem.size) - if mem.meta.has_cpu_mapping: libc.munmap(cast(int, mem.va_addr), mem.size) + if mem.meta.has_cpu_mapping: HWInterface.munmap(cast(int, mem.va_addr), mem.size) def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer: if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size) @@ -365,8 +363,9 @@ class NVDevice(HCQCompiled[NVSignal]): # NOTE: va_addr is set to make rawbufs compatible with HCQBuffer protocol. self._debug_mappings[(va_base, size)] = tag - return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, - hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)) + return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl.fd, + hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs, + mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)) def _gpu_map(self, mem:HCQBuffer): if self.gpu_uuid in mem.meta.mapped_gpu_ids: return @@ -384,12 +383,12 @@ class NVDevice(HCQCompiled[NVSignal]): def __init__(self, device:str=""): if NVDevice.root is None: - NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC) - NVDevice.fd_uvm = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC) - fd_uvm_2 = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC) + NVDevice.fd_ctl = HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC) + NVDevice.fd_uvm = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC) + self.fd_uvm_2 = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC) NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew uvm.initialize(self.fd_uvm) - with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too + with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)()) visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()] @@ -425,7 +424,7 @@ class NVDevice(HCQCompiled[NVSignal]): self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)])) uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid) - uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root, hVaSpace=vaspace) + uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root, hVaSpace=vaspace) for dev in cast(list[NVDevice], self.devices): try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid) @@ -481,7 +480,7 @@ class NVDevice(HCQCompiled[NVSignal]): assert ws_token_params.workSubmitToken != -1 channel_base = self._alloc_gpu_vaddr(0x4000000, force_low=True) - uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root, + uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root, hChannel=gpfifo, base=channel_base, length=0x4000000) return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken, diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 962e10cea1..3b39f605b3 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -1,10 +1,41 @@ from __future__ import annotations from typing import cast, Type, TypeVar, Generic, Any -import contextlib, decimal, statistics, time, ctypes, array +import contextlib, decimal, statistics, time, ctypes, array, os, fcntl from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up from tinygrad.renderer import Renderer from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator, ProfileRangeEvent, ProfileDeviceEvent from tinygrad.ops import sym_infer, sint, Variable +from tinygrad.runtime.autogen import libc + +class HWInterface: + """ + Hardware Abstraction Layer for HCQ devices. The class provides a unified interface for interacting with hardware devices. + """ + + def __init__(self, path:str="", flags:int=os.O_RDONLY, fd:int|None=None): + self.path:str = path + self.fd:int = fd or os.open(path, flags) + def __del__(self): os.close(self.fd) + def ioctl(self, request, arg): return fcntl.ioctl(self.fd, request, arg) + def mmap(self, start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, self.fd, offset) + def read(self, size=None, binary=False): + with open(self.fd, "rb" if binary else "r", closefd=False) as file: return file.read(size) + def write(self, content, binary=False): + with open(self.fd, "wb" if binary else "w", closefd=False) as file: file.write(content) + def listdir(self): return os.listdir(self.path) + def seek(self, offset): os.lseek(self.fd, offset, os.SEEK_SET) + @staticmethod + def anon_mmap(start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, -1, offset) + @staticmethod + def munmap(buf, sz): return libc.munmap(buf, sz) + @staticmethod + def exists(path): return os.path.exists(path) + @staticmethod + def readlink(path): return os.readlink(path) + @staticmethod + def eventfd(initval, flags=None): return HWInterface(fd=os.eventfd(initval, flags)) + +if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.mockgpu import MockHWInterface as HWInterface # noqa: F401 # pylint: disable=unused-import # **************** for HCQ Compatible Devices ****************