hw interface abstraction (#8524)

* use HWInterface in autogen

* mockgpu

* HWInterface

* more HWInterface

* fix

* fix

* old code

* fix

* implicit field definition

* add offset check to mockgpu too

* refactor

* forgot to pass flags + read rewrite

* test

* play with vfio

* nv: this should be kept

* try this

* vfio

* rm overwrite=True

* linter

* do not reinit kfd

* minor

* mypy

* mock

* init them once

---------

Co-authored-by: patrini32 <patrini23@proton.me>
This commit is contained in:
nimlgen
2025-01-07 18:18:28 +03:00
committed by GitHub
parent 0e97f807e0
commit ab3ac2b58d
8 changed files with 198 additions and 316 deletions

View File

@@ -79,6 +79,10 @@ generate_kfd() {
fixup $BASE/kfd.py
sed -i "s\import ctypes\import ctypes, os\g" $BASE/kfd.py
sed -i "s\import fcntl, functools\import functools" $BASE/kfd.py
sed -i "s\import ctypes,os\a from tinygrad.runtime.support import HWInterface\g" $BASE/kfd.py
sed -i "s\def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, **kwargs):\def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, **kwargs):\g" $BASE/kfd.py
sed -i "s\fcntl.ioctl(__fd, (__idir<<30)\__fd.ioctl((__idir<<30)\g" $BASE/kfd.py
python3 -c "import tinygrad.runtime.autogen.kfd"
}
@@ -263,6 +267,10 @@ generate_vfio() {
/usr/include/linux/vfio.h \
-o $BASE/vfio.py
fixup $BASE/vfio.py
sed -i "s\import ctypes\import ctypes, os\g" $BASE/vfio.py
sed -i "s\import fcntl, functools\import functools" $BASE/vfio.py
sed -i "s\import ctypes,os\a from tinygrad.runtime.support import HWInterface\g" $BASE/vfio.py
sed -i "s\fcntl.ioctl(__fd, (__idir<<30)\return __fd.ioctl((__idir<<30)\g" $BASE/vfio.py
}
generate_am() {

View File

@@ -1,95 +1,36 @@
import ctypes, struct, os
from typing import Any
from dataclasses import dataclass
from tinygrad.helpers import round_up
class VirtFileDesc:
  """Base class for a virtual file descriptor.

  Holds the descriptor number and a current offset. Every operation takes the
  descriptor number as its first argument; subclasses override the ones they support.
  """
  def __init__(self, fd):
    self.fd = fd
    self.off = 0

  # Operations with no default behaviour: subclasses must provide them.
  def read(self, fd, buf, sz):
    raise NotImplementedError()

  def ioctl(self, fd, req, argp):
    raise NotImplementedError()

  def mmap(self, st, sz, prot, flags, fd, off):
    raise NotImplementedError()

  def write(self, fd, buf, sz):
    raise NotImplementedError()

  def lseek(self, fd, off, whence):
    raise NotImplementedError()

  def fstat(self, fd, buf):
    raise NotImplementedError()

  # Operations with a fixed default status code.
  def getdents(self, fd, buf, sz):
    return -1

  def close(self, fd):
    return 0
# Virtual file descriptor backed by in-memory text.
# NOTE(review): this span appears to interleave two variants of the class: a ctypes-buffer
# variant (read/lseek/fstat) and a str variant (read_contents/seek). `self.content` is
# assigned twice in __init__ and the later `text` assignment wins, so confirm which
# variant is live before relying on `read`, which needs a ctypes buffer.
class TextFileDesc(VirtFileDesc):
def __init__(self, fd, text):
super().__init__(fd)
# NUL-terminated byte buffer holding the encoded text
self.content = ctypes.create_string_buffer(text.encode())
# size in bytes, excluding the trailing NUL that create_string_buffer appends
self.sz = len(self.content) - 1
# rebinds content to the original str (overrides the buffer assigned above)
self.content = text
# ioctl is accepted and ignored; writes are rejected with -1
def ioctl(self, fd, req, argp): return 0
def write(self, fd, buf, sz): return -1
# Copy at most `sz` bytes starting at the current offset into `buf`, advance the offset,
# and return the number of bytes copied.
def read(self, fd, buf, sz):
ctypes.memmove(buf, ctypes.addressof(self.content) + self.off, rdsz:=min(sz, self.sz - self.off))
self.off += rdsz
return rdsz
# Reposition the offset. Returns 0 on success (not the new offset) and -1 for an unknown `whence`.
def lseek(self, fd, off, whence):
if whence == os.SEEK_SET: self.off = off
elif whence == os.SEEK_CUR: self.off += off
elif whence == os.SEEK_END: self.off = self.sz + off
else: return -1
return 0
# Write a packed stat record with st_size=self.sz into `buf`; only the first 88 bytes are copied.
def fstat(self, fd, buf):
ctypes.memmove(buf, VirtFile.build_fstat(st_size=self.sz), 88)
return 0
# Return the slice of `content` starting at the current offset; `size` of None (or 0) means
# "up to len(content) items". The offset advances by the requested amount, not by the
# length of the slice actually returned.
def read_contents(self, size=None):
ret = self.content[self.off:self.off+(size or len(self.content))]
self.off += (size or len(self.content))
return ret
# Relative seek: `offset` is added to the current offset.
def seek(self, offset): self.off += offset
class DirFileDesc(VirtFileDesc):
  """Virtual directory descriptor.

  The entries ('.', '..' and the given child names) are packed once into a
  buffer of dirent records that `getdents` hands out in a single call.
  """
  def __init__(self, fd, child_names):
    super().__init__(fd)
    child_names = ['.', '..'] + child_names
    # inode numbers are simply 1-based positions in the listing
    packed = b''.join(VirtFile.build_dirent(idx + 1, 0, entry) for idx, entry in enumerate(child_names))
    self.content = ctypes.create_string_buffer(packed)
    self.sz = len(self.content) - 1
    self.child_names = child_names

  def ioctl(self, fd, req, argp):
    return 0

  def write(self, fd, buf, sz):
    return -1

  def read(self, fd, buf, sz):
    return -1

  def lseek(self, fd, off, whence):
    """Move the offset; returns 0 on success and -1 for an unknown `whence`."""
    if whence == os.SEEK_SET: target = off
    elif whence == os.SEEK_CUR: target = self.off + off
    elif whence == os.SEEK_END: target = self.sz + off
    else: return -1
    self.off = target
    return 0

  def getdents(self, fd, buf, sz):
    """Copy the packed records into `buf`: 0 when already consumed, -1 when `sz` is too small."""
    if self.off == self.sz: return 0
    if self.sz > sz: return -1
    ctypes.memmove(buf, ctypes.addressof(self.content) + self.off, self.sz)
    self.off = self.sz
    return self.sz

  def fstat(self, fd, buf):
    """Write a 96-byte stat record with mode 0o40755 into `buf`."""
    record = VirtFile.build_fstat(st_mode=0o40755)
    ctypes.memmove(buf, record, 96)
    return 0

  def list_contents(self):
    return self.child_names
@dataclass(frozen=True)
class VirtFile:
  """A virtual file entry: the path it is served at and the object used to build its descriptor."""
  path: str
  fdcls: Any # TODO: fix this Union[VirtFileDesc, functools.partial[VirtFileDesc]]

  @staticmethod
  def build_fstat(st_dev=0x20, st_ino=0x100000, st_mode=0o100777, st_nlink=1, st_uid=0, st_gid=0, st_rdev=0, st_size=0,
                  st_blksize=4096, st_blocks=0, st_atime=0, st_mtime=0, st_ctime=0):
    """Pack the given stat fields into a 96-byte record and return it as bytes.

    Note the packing order puts st_nlink before st_mode. The size assertion guards
    against platforms where the native struct layout differs.
    """
    fmt_string = 'QQQIIIQQiQqqq'
    ssz = struct.calcsize(fmt_string)
    assert ssz == 96, f"{ssz} != 96"
    return struct.pack(fmt_string, st_dev, st_ino, st_nlink, st_mode, st_uid, st_gid,
                       st_rdev, st_size, st_blksize, st_blocks, st_atime, st_mtime, st_ctime)

  @staticmethod
  def build_dirent(d_ino, d_off, d_name, d_type=None):
    """Pack one directory entry record and return it as bytes.

    Layout: inode (8 bytes), offset (8), record length (2), type (1), then the
    NUL-terminated name, zero-padded so the record length is a multiple of 8.

    d_type: entry type byte; None keeps the historical default of 4.
    """
    header_sz = 19  # 8 + 8 + 2 + 1 bytes in front of the name
    # Size the record from the encoded name: a non-ASCII name is longer in bytes than in characters.
    d_name_bytes = d_name.encode()
    d_reclen = (header_sz + len(d_name_bytes) + 1 + 7) // 8 * 8
    type_byte = b'\x04' if d_type is None else bytes([d_type])
    header = struct.pack('QQHc', d_ino, d_off, d_reclen, type_byte)
    # d_reclen always leaves room for at least one terminating NUL after the name
    return (header + d_name_bytes).ljust(d_reclen, b'\x00')
class VirtDriver:
def __init__(self):
self.tracked_files = []

View File

@@ -1,160 +1,17 @@
import ctypes, ctypes.util, struct, platform, time, os, builtins, atexit
import ctypes, ctypes.util, time, os, builtins, fcntl
from tinygrad.runtime.support.hcq import HWInterface
from test.mockgpu.nv.nvdriver import NVDriver
from test.mockgpu.amd.amddriver import AMDDriver
from tinygrad.helpers import to_mv
start = time.perf_counter()
# *** ioctl lib ***
libc = ctypes.CDLL(ctypes.util.find_library("c"))
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p
libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
libc.munmap.restype = ctypes.c_int
libc.fdopendir.argtypes = [ctypes.c_int]
libc.fdopendir.restype = ctypes.c_void_p
# platform.processor calls `uname -p` which can return `unknown` on some systems
processor = os.getenv("IOCTL_PROCESSOR") or platform.processor()
OPEN_SYSCALL = {"aarch64": None, "x86_64": 2}[processor]
CLOSE_SYSCALL = {"aarch64": 57, "x86_64": 3}[processor]
READ_SYSCALL = {"aarch64": 63, "x86_64": 0}[processor]
IOCTL_SYSCALL = {"aarch64": 29, "x86_64": 16}[processor]
MMAP_SYSCALL = {"aarch64": 222, "x86_64": 9}[processor]
LSEEK_SYSCALL = {"aarch64": 62, "x86_64": 8}[processor]
NEWFSTATAT_SYSCALL = {"aarch64": 79, "x86_64": 262}[processor]
GETDENTS64_SYSCALL = {"aarch64": 61, "x86_64": 217}[processor]
# Redirect a C function to a Python callback by overwriting the start of the C function's
# machine code with a trampoline that jumps to `python_function`.
#   c_function:      ctypes function object whose code is patched in place
#   python_function: ctypes callback object that should run instead
# Only `processor == "x86_64"` is supported; anything else raises Exception.
# The overwritten bytes are saved and written back by an atexit handler.
def install_hook(c_function, python_function):
# read the first machine word of the ctypes callback object (presumably its native entry address)
python_function_addr = ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value
if processor == "x86_64":
# earlier variant, kept for reference:
# tramp = b"\x49\xB8" + struct.pack("Q", python_function_addr) + b"\x41\xFF\xE0"
# current trampoline: put the target on the stack and `ret` to it, leaving r9 unchanged
# push r9
# push r9
# mov r9, 0x1122334455667788
# mov [rsp+8], r9
# pop r9
# ret
tramp = b"\x41\x51\x41\x51\x49\xB9" + struct.pack("Q", python_function_addr) + b"\x4C\x89\x4C\x24\x08\x41\x59\xC3"
else:
raise Exception(f"processor {processor} not supported")
# scratch buffer for the bytes that get overwritten (only len(tramp) of the 64 are used)
original_bc = (ctypes.c_char * 64)()
# address of the C function to patch
ioctl_address = ctypes.cast(ctypes.byref(c_function), ctypes.POINTER(ctypes.c_ulong))
# make the page holding the function (and the one after it) accessible with protection 7, then patch
ret = libc.mprotect(ctypes.c_ulong((ioctl_address.contents.value//0x1000)*0x1000), 0x2000, 7)
assert ret == 0
# save the original bytes, then write the trampoline over them
libc.memcpy(original_bc, ioctl_address.contents, len(tramp))
libc.memcpy(ioctl_address.contents, ctypes.create_string_buffer(tramp), len(tramp))
# Restore correct functions to close libs after python exits
def __restore(): libc.memcpy(ioctl_address.contents, original_bc, len(tramp))
atexit.register(__restore)
drivers = [AMDDriver(), NVDriver()]
tracked_fds = {}
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_ulong)
def _open(name, flags, mode):
  """open() replacement: serve driver-tracked paths virtually, forward everything else to the open syscall."""
  for driver in drivers:
    pyname = name.decode()
    for tracked in driver.tracked_files:
      if tracked.path != pyname: continue
      # first matching tracked file wins: open it virtually and remember the descriptor
      virtfd = driver.open(pyname, flags, mode, tracked)
      tracked_fds[virtfd.fd] = virtfd
      return virtfd.fd

  # not a tracked path: issue the raw syscall
  libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_char_p, ctypes.c_int, ctypes.c_ulong]
  libc.syscall.restype = ctypes.c_int
  return libc.syscall(OPEN_SYSCALL, name, flags, mode)
@ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_char_p)
def _opendir(name):
  """opendir() replacement built on `_open` and libc's fdopendir."""
  fd = _open(name, os.O_RDONLY| os.O_DIRECTORY, 0)
  if fd < 0x80:
    return libc.fdopendir(fd)
  # Descriptors >= 0x80 take the other path: get a directory stream for "."
  # and overwrite its first 8 bytes with `fd`, which `_closedir` reads back.
  fake_dirfd = _open(".".encode(), os.O_RDONLY| os.O_DIRECTORY, 0)
  st = libc.fdopendir(fake_dirfd)
  to_mv(st, 8).cast('Q')[0] = fd
  return st
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int)
def _close(fd):
  """close() replacement: drop tracked descriptors, forward the rest to the close syscall."""
  if fd not in tracked_fds:
    libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int]
    libc.syscall.restype = ctypes.c_int
    return libc.syscall(CLOSE_SYSCALL, fd)

  # tracked descriptor: let it clean up, then forget it
  tracked_fds[fd].close(fd)
  del tracked_fds[fd]
  return 0
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p)
def _closedir(st):
  """closedir() replacement: read the descriptor from the first 8 bytes of `st` and close it."""
  dir_fd = to_mv(st, 8).cast('Q')[0]
  return _close(dir_fd)
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
def _ioctl(fd, request, argp):
  """ioctl() replacement: dispatch to the tracked descriptor if there is one, else to the syscall."""
  if fd not in tracked_fds:
    libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p]
    libc.syscall.restype = ctypes.c_int
    return libc.syscall(IOCTL_SYSCALL, ctypes.c_int(fd), ctypes.c_ulong(request), ctypes.c_void_p(argp))
  return tracked_fds[fd].ioctl(fd, request, argp)
@ctypes.CFUNCTYPE(ctypes.c_long, ctypes.c_int, ctypes.c_void_p, ctypes.c_size_t)
def _read(fd, buf, sz):
  """read() replacement: dispatch to the tracked descriptor if there is one, else to the syscall."""
  if fd not in tracked_fds:
    libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_void_p, ctypes.c_size_t]
    libc.syscall.restype = ctypes.c_int
    return libc.syscall(READ_SYSCALL, ctypes.c_int(fd), ctypes.c_void_p(buf), ctypes.c_size_t(sz))
  return tracked_fds[fd].read(fd, buf, sz)
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_int)
def _lseek64(fd, off, whence):
  """lseek64() replacement: dispatch to the tracked descriptor if there is one, else to the syscall."""
  if fd not in tracked_fds:
    libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_ulong, ctypes.c_int]
    libc.syscall.restype = ctypes.c_int
    return libc.syscall(LSEEK_SYSCALL, fd, off, whence)
  return tracked_fds[fd].lseek(fd, off, whence)
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)
def _stat64(name, buf):
  """stat64() replacement: answer for driver-tracked paths, forward other paths to newfstatat."""
  for driver in drivers:
    pyname = name.decode()
    hit = next((entry for entry in driver.tracked_files if entry.path == pyname), None)
    if hit is not None:
      # open a fresh virtual descriptor just to ask it for the stat record
      virtfd = driver.open(pyname, 0, 0, hit)
      return virtfd.fstat(virtfd.fd, buf)

  # untracked path: newfstatat with dirfd -100 and no flags
  libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p, ctypes.c_ulong]
  libc.syscall.restype = ctypes.c_int
  return libc.syscall(NEWFSTATAT_SYSCALL, -100, name, ctypes.c_void_p(buf), 0)
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_void_p)
def _fstat64(fd, buf):
  """fstat64() replacement: dispatch to the tracked descriptor if there is one, else to newfstatat."""
  if fd in tracked_fds:
    return tracked_fds[fd].fstat(fd, buf)

  # real descriptor: newfstatat on `fd` with an empty path and flag 0x1000
  empty_str = (ctypes.c_char*1)()
  libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p, ctypes.c_ulong]
  libc.syscall.restype = ctypes.c_int
  return libc.syscall(NEWFSTATAT_SYSCALL, ctypes.c_int(fd), empty_str, ctypes.c_void_p(buf), 0x1000)
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_ulong)
def _getdents64(fd, buf, sz):
  """getdents64() replacement: dispatch to the tracked descriptor if there is one, else to the syscall."""
  if fd not in tracked_fds:
    libc.syscall.argtypes = [ctypes.c_ulong, ctypes.c_int, ctypes.c_void_p, ctypes.c_ulong]
    libc.syscall.restype = ctypes.c_int
    return libc.syscall(GETDENTS64_SYSCALL, fd, buf, sz)
  return tracked_fds[fd].getdents(fd, buf, sz)
def _mmap(start, sz, prot, flags, fd, offset):
  """mmap replacement: tracked descriptors map themselves, everything else goes to libc.mmap."""
  if fd not in tracked_fds:
    return libc.mmap(start, sz, prot, flags, fd, offset)
  return tracked_fds[fd].mmap(start, sz, prot, flags, fd, offset)
def _munmap(buf, sz):
  """munmap replacement: always forwarded to libc.munmap."""
  status = libc.munmap(buf, sz)
  return status
orignal_memoryview = builtins.memoryview
class TrackedMemoryView:
def __init__(self, data, rcb, wcb):
@@ -185,20 +42,60 @@ def _memoryview(cls, mem):
for st,en,rcb,wcb in d.tracked_addresses:
if st <= addr <= en: return TrackedMemoryView(mem, rcb, wcb)
return orignal_memoryview(mem)
install_hook(libc.open, _open)
install_hook(libc.opendir, _opendir)
install_hook(libc.close, _close)
install_hook(libc.closedir, _closedir)
install_hook(libc.ioctl, _ioctl)
install_hook(libc.read, _read)
install_hook(libc.lseek64, _lseek64)
install_hook(libc.stat64, _stat64)
install_hook(libc.fstat64, _fstat64)
install_hook(libc.getdents64, _getdents64)
builtins.memoryview = type("memoryview", (), {'__new__': _memoryview}) # type: ignore
# rewrite autogen's libc mmaps functions.
import tinygrad.runtime.autogen.libc as autogen_libc
autogen_libc.mmap = _mmap # type: ignore
autogen_libc.munmap = _munmap # type: ignore
def _open(path, flags):
  """Open `path` and return a descriptor number, or None when the path does not exist.

  Paths tracked by one of the drivers are opened virtually and registered in
  `tracked_fds`; any other existing path is opened with os.open.
  """
  for driver in drivers:
    match = next((entry for entry in driver.tracked_files if entry.path == path), None)
    if match is None: continue
    virtfd = driver.open(path, flags, 0o777, match)
    tracked_fds[virtfd.fd] = virtfd
    return virtfd.fd

  if not os.path.exists(path): return None
  return os.open(path, flags, 0o777)
class MockHWInterface(HWInterface):
  """HWInterface variant that serves driver-tracked paths from virtual descriptors.

  Every operation first checks whether the descriptor is registered in `tracked_fds`;
  if so the virtual descriptor handles it, otherwise the real OS call is used.
  """
  def __init__(self, path:str="", flags:int=os.O_RDONLY, fd:int|None=None):
    self.path = path
    # `fd or ...` would treat a valid descriptor 0 as "not provided"
    self.fd = fd if fd is not None else _open(path, flags)

  def __del__(self):
    # fd is missing if __init__ raised, and None if the path did not exist: nothing to release
    fd = getattr(self, "fd", None)
    if fd is None: return
    if fd in tracked_fds:
      tracked_fds[fd].close(fd)
      tracked_fds.pop(fd)
    else: os.close(fd)

  def ioctl(self, request, arg):
    """Issue an ioctl; virtual descriptors receive the address of `arg` instead of the object."""
    if self.fd in tracked_fds:
      return tracked_fds[self.fd].ioctl(self.fd, request, ctypes.addressof(arg))
    return fcntl.ioctl(self.fd, request, arg)

  def mmap(self, start, sz, prot, flags, offset):
    if self.fd in tracked_fds:
      return tracked_fds[self.fd].mmap(start, sz, prot, flags, self.fd, offset)
    return libc.mmap(start, sz, prot, flags, self.fd, offset)

  def read(self, size=None, binary=False):
    """Read text from the file. Binary reads are not supported and raise NotImplementedError."""
    if binary: raise NotImplementedError()
    if self.fd in tracked_fds:
      return tracked_fds[self.fd].read_contents(size)
    # binary was rejected above, so the real file is always opened in text mode
    with open(self.fd, "r", closefd=False) as file:
      # rewind once the position has reached the reported file size
      if file.tell() >= os.fstat(self.fd).st_size: file.seek(0)
      return file.read(size)

  def listdir(self):
    if self.fd in tracked_fds:
      return tracked_fds[self.fd].list_contents()
    return os.listdir(self.path)

  def write(self, content, binary=False): raise NotImplementedError()

  def seek(self, offset):
    """Move the position by `offset`, relative to the current position."""
    if self.fd in tracked_fds:
      tracked_fds[self.fd].seek(offset)
    else:
      os.lseek(self.fd, offset, os.SEEK_CUR)

  @staticmethod
  def exists(path):
    """Return True if `path` is tracked by a driver or exists on disk."""
    fd = _open(path, os.O_RDONLY)
    if fd is None: return False
    # the descriptor was opened only to probe the path; release it so it does not leak
    if fd in tracked_fds:
      tracked_fds[fd].close(fd)
      tracked_fds.pop(fd)
    else: os.close(fd)
    return True

  @staticmethod
  def readlink(path): raise NotImplementedError()

  @staticmethod
  def eventfd(initval, flags=None):
    # the exception was previously constructed but never raised, so callers silently got None
    raise NotImplementedError()

View File

@@ -10,10 +10,11 @@ import ctypes, os
import fcntl, functools
import functools
from tinygrad.runtime.support.hcq import HWInterface
def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, **kwargs):
ret = fcntl.ioctl(__fd, (__idir<<30) | (ctypes.sizeof(made := __user_struct(**kwargs))<<16) | (__base<<8) | __nr, made)
def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, **kwargs):
ret = __fd.ioctl((__idir<<30) | (ctypes.sizeof(made := __user_struct(**kwargs))<<16) | (__base<<8) | __nr, made)
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
return made

View File

@@ -9,14 +9,14 @@
import ctypes
from tinygrad.runtime.support.hcq import HWInterface
import functools
import fcntl, functools
def _do_ioctl_io(__idir, __base, __nr, __fd:HWInterface, val=0, __len=0):
return __fd.ioctl((__idir<<30) | (__len<<16) | (__base<<8) | __nr, val)
def _do_ioctl_io(__idir, __base, __nr, __fd, val=0, __len=0):
return fcntl.ioctl(__fd, (__idir<<30) | (__len<<16) | (__base<<8) | __nr, val)
def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, __val=None, **kwargs):
ret = fcntl.ioctl(__fd, (__idir<<30) | (ctypes.sizeof(made := (__made or __user_struct(**kwargs)))<<16) | (__base<<8) | __nr, made)
def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, __val=None, **kwargs):
ret = __fd.ioctl((__idir<<30) | (ctypes.sizeof(made := (__made or __user_struct(**kwargs)))<<16) | (__base<<8) | __nr, made)
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
return made

View File

@@ -1,12 +1,12 @@
from __future__ import annotations
from typing import Any
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, array, contextlib, sys, select, struct
from typing import Any, cast
import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select, struct
assert sys.platform != 'win32'
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
from tinygrad.ops import sint
from tinygrad.device import BufferSpec
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG
from tinygrad.renderer.cstyle import AMDRenderer
from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, libpciaccess, vfio
from tinygrad.runtime.autogen.am import am
@@ -14,7 +14,6 @@ from tinygrad.runtime.support.compiler_hip import AMDCompiler
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.support.am.amdev import AMDev
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
if getenv("MOCKGPU"): import test.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
@@ -281,31 +280,34 @@ class AMDQueueDesc:
put_value: int = 0
class KFDIface:
kfd:int = -1
event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
gpus:list[pathlib.Path] = []
kfd:HWInterface|None = None
event_page:HCQBuffer|None = None
gpus:list[HWInterface] = []
def _is_usable_gpu(self, gpu_id):
with contextlib.suppress(OSError): return int(pathlib.Path(gpu_id).read_text()) != 0
with contextlib.suppress(OSError): return int(gpu_id.read()) != 0
return False
def __init__(self, dev, device_id):
self.dev = dev
if KFDIface.kfd == -1:
KFDIface.kfd = os.open("/dev/kfd", os.O_RDWR)
gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if self._is_usable_gpu(g)]
gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))
kfd_topo_path = "/sys/devices/virtual/kfd/kfd/topology/nodes"
# Initialize KFD interface during first run
if KFDIface.kfd is None:
KFDIface.kfd = HWInterface("/dev/kfd", os.O_RDWR)
gpus = [g for g in HWInterface(kfd_topo_path).listdir() if self._is_usable_gpu(HWInterface(f"{kfd_topo_path}/{g}/gpu_id"))]
gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1]))
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")
with open(f"{KFDIface.gpus[device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
with open(f"{KFDIface.gpus[device_id]}/properties", "r") as f: self.props = {line.split()[0]: int(line.split()[1]) for line in f}
self.drm_fd = os.open(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
self.gpu_id = int(HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
self.props = {l.split()[0]: int(l.split()[1]) for l in HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
self.drm_fd = HWInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id)
# Set these for our device.
if KFDIface.event_page is None:
@@ -331,8 +333,8 @@ class KFDIface:
if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
if host: buf = addr = libc.mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, -1, 0)
else: buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, -1, 0)
if host: buf = addr = HWInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
else: buf, addr = 0, HWInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0)
assert addr != 0xffffffffffffffff
try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
@@ -344,7 +346,7 @@ class KFDIface:
raise
if not host:
buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, self.drm_fd, mem.mmap_offset)
buf = self.drm_fd.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, mem.mmap_offset)
assert addr == buf == mem.va_addr
self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem))
@@ -355,7 +357,7 @@ class KFDIface:
c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
assert stm.n_success == len(gpus)
if mem.va_addr: libc.munmap(mem.va_addr, mem.size)
if mem.va_addr: HWInterface.munmap(mem.va_addr, mem.size)
kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)
def map(self, mem):
@@ -376,7 +378,7 @@ class KFDIface:
if not hasattr(self, 'doorbells'):
self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDIface.kfd, self.doorbells_base)
self.doorbells = cast(HWInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)
return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"),
read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
@@ -398,8 +400,8 @@ class KFDIface:
raise RuntimeError("\n".join(report))
class PCIIface:
vfio:bool = getenv("VFIO", 1) and os.path.exists("/dev/vfio/vfio")
vfio_fd:int = -1
vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio")
vfio_fd:HWInterface
gpus:list[Any] = []
def __init__(self, dev, dev_id):
@@ -419,43 +421,48 @@ class PCIIface:
self.pcibus = f"{self.pcidev.domain_16:04x}:{self.pcidev.bus:02x}:{self.pcidev.dev:02x}.{self.pcidev.func:d}"
# Unbind the device from the kernel driver
if os.path.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
pathlib.Path(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind").write_text(self.pcibus)
pathlib.Path(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize").write_text("15")
if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)
HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write("15")
# Probe device
libpciaccess.pci_device_probe(ctypes.byref(self.pcidev))
# Try to init vfio. Use it if success.
if PCIIface.vfio and PCIIface.vfio_fd == -1:
if PCIIface.vfio:
try:
pathlib.Path("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode").write_text("1")
PCIIface.vfio_fd = os.open("/dev/vfio/vfio", os.O_RDWR)
if first_dev:
HWInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1")
PCIIface.vfio_fd = HWInterface("/dev/vfio/vfio", os.O_RDWR)
vfio.VFIO_CHECK_EXTENSION(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
except OSError: PCIIface.vfio = False
HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci")
HWInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus)
iommu_group = HWInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
except OSError:
if DEBUG >= 1: print("AM: failed to init vfio-pci module (not inserted or no-iommu mode is not supported).")
PCIIface.vfio = False
# Init vfio for the device
if PCIIface.vfio:
pathlib.Path(f"/sys/bus/pci/devices/{self.pcibus}/driver_override").write_text("vfio-pci")
pathlib.Path("/sys/bus/pci/drivers_probe").write_text(self.pcibus)
iommu_group = os.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
self.vfio_group = os.open(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd))
self.vfio_group = HWInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd.fd))
if first_dev: vfio.VFIO_SET_IOMMU(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
self.vfio_dev = vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode()))
self.vfio_dev = HWInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode())))
self.irq_fd = os.eventfd(0, 0) # type: ignore[attr-defined]
self.irq_fd = HWInterface.eventfd(0, 0)
self.irq_poller = select.poll()
self.irq_poller.register(self.irq_fd, select.POLLIN)
self.irq_poller.register(self.irq_fd.fd, select.POLLIN)
irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd))
argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
else: libpciaccess.pci_device_enable(ctypes.byref(self.pcidev))
self.bar_fds = {bar: os.open(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC) for bar in [0, 2, 5]}
self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY)
self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC) for bar in [0, 2, 5]}
self.adev = AMDev(self.pcidev, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
self.doorbell_cpu_addr = mv_address(dbell)
@@ -469,19 +476,18 @@ class PCIIface:
vfio.VFIO_DEVICE_GET_REGION_INFO(self.vfio_dev, reg:=vfio.struct_vfio_region_info(argsz=ctypes.sizeof(vfio.struct_vfio_region_info), index=bar))
fd, sz, off = self.vfio_dev, size or reg.size, reg.offset + off
else: fd, sz = self.bar_fds[bar], size or self.pcidev.regions[bar].size
return to_mv(libc.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), fd, off), sz)
return to_mv(fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz)
def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
if host:
vaddr = self.adev.mm.alloc_vaddr(size, align=mmap.PAGESIZE)
va = libc.mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, -1, 0)
va = HWInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0)
# Read pagemap to get the physical address of each page. The pages are locked.
with open("/proc/self/pagemap", "rb") as f:
for off in range(0, size, mmap.PAGESIZE):
f.seek(((va + off) // mmap.PAGESIZE) * 8)
pt_entry = struct.unpack("Q", f.read(8))[0] & ((1 << 55) - 1)
self.adev.mm.map_range(vaddr=vaddr + off, size=mmap.PAGESIZE, paddr=pt_entry * mmap.PAGESIZE, system=True, snooped=True, uncached=True)
for off in range(0, size, mmap.PAGESIZE):
self.pagemap.seek(((va + off) // mmap.PAGESIZE) * 8)
pt_entry = struct.unpack("Q", self.pagemap.read(8, binary=True))[0] & ((1 << 55) - 1)
self.adev.mm.map_range(vaddr=vaddr + off, size=mmap.PAGESIZE, paddr=pt_entry * mmap.PAGESIZE, system=True, snooped=True, uncached=True)
return HCQBuffer(vaddr, size, meta=(self.dev, [self.dev], None))
vm = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)
@@ -510,8 +516,8 @@ class PCIIface:
read_ptr=to_mv(gart.va_addr, 8).cast("Q"), write_ptr=to_mv(gart.va_addr+0x10, 8).cast("Q"))
def sleep(self, timeout):
if PCIIface.vfio and len(self.irq_poller.poll(timeout)):
os.read(self.irq_fd, 1024)
if PCIIface.vfio and (events_cnt:=len(self.irq_poller.poll(timeout))):
self.irq_fd.read(8 * events_cnt)
self.adev.ih.interrupt_handler()
def on_device_hang(self):
@@ -519,14 +525,13 @@ class PCIIface:
raise RuntimeError("Device hang detected")
class AMDDevice(HCQCompiled):
driverless:bool = not os.path.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
signals_page:Any = None
signals_pool:list[int] = []
def __init__(self, device:str=""):
self.device_id = int(device.split(":")[1]) if ":" in device else 0
self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id)
self.target = int(self.dev_iface.props['gfx_target_version'])
self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")

View File

@@ -1,27 +1,27 @@
from __future__ import annotations
import os, ctypes, contextlib, re, fcntl, functools, mmap, struct, array, sys
import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
assert sys.platform != 'win32'
from typing import Any, cast, Union, Type
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
from tinygrad.ops import sint
from tinygrad.device import BufferSpec
from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
from tinygrad.renderer.ptx import PTXRenderer
from tinygrad.renderer.cstyle import NVRenderer
from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
from tinygrad.runtime.autogen import nv_gpu, libc
from tinygrad.runtime.autogen import nv_gpu
from tinygrad.runtime.support.elf import elf_loader
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
if MOCKGPU:=getenv("MOCKGPU"): import test.mockgpu.mockgpu # noqa: F401 # pylint: disable=unused-import
def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status, 'Unknown error')}"
NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")}
NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")}
def nv_iowr(fd, nr, args):
ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
def nv_iowr(fd:HWInterface, nr, args):
ret = fd.ioctl((3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
def rm_alloc(fd, clss, root, parant, params):
@@ -46,8 +46,8 @@ def make_rmctrl_type():
getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))})
rmctrl = make_rmctrl_type()
def uvm_ioctl(cmd, sttyp, fd, **kwargs):
ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
def uvm_ioctl(cmd, sttyp, fd:HWInterface, **kwargs):
ret = fd.ioctl(cmd, made:=sttyp(**kwargs))
if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
return made
@@ -283,8 +283,8 @@ class GPFifo:
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
class NVDevice(HCQCompiled[NVSignal]):
root = None
fd_ctl: int = -1
fd_uvm: int = -1
fd_ctl: HWInterface
fd_uvm: HWInterface
gpus_info: Union[list, ctypes.Array] = []
signals_page: Any = None
signals_pool: list[int] = []
@@ -297,19 +297,17 @@ class NVDevice(HCQCompiled[NVSignal]):
host_object_enumerator: int = 0x1000
def _new_gpu_fd(self):
fd_dev = os.open(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
fd_dev = HWInterface(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl.fd))
return fd_dev
def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
fd_dev = self._new_gpu_fd() if not system else os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
fd_dev = self._new_gpu_fd() if not system else HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev.fd,
params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags))
nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
res = libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
os.close(fd_dev)
return res
return fd_dev.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), 0)
def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
# Uncached memory is "system". Use huge pages only for gpu memory.
@@ -318,7 +316,7 @@ class NVDevice(HCQCompiled[NVSignal]):
va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)
if host:
va_addr = libc.mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, -1, 0)
va_addr = HWInterface.anon_mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) \
| (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30)
@@ -357,7 +355,7 @@ class NVDevice(HCQCompiled[NVSignal]):
self._debug_mappings.pop((cast(int, mem.va_addr), mem.size))
uvm.free(self.fd_uvm, base=cast(int, mem.va_addr), length=mem.size)
if mem.meta.has_cpu_mapping: libc.munmap(cast(int, mem.va_addr), mem.size)
if mem.meta.has_cpu_mapping: HWInterface.munmap(cast(int, mem.va_addr), mem.size)
def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer:
if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
@@ -365,8 +363,9 @@ class NVDevice(HCQCompiled[NVSignal]):
# NOTE: va_addr is set to make rawbufs compatible with HCQBuffer protocol.
self._debug_mappings[(va_base, size)] = tag
return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root,
hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))
return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl.fd,
hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs,
mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))
def _gpu_map(self, mem:HCQBuffer):
if self.gpu_uuid in mem.meta.mapped_gpu_ids: return
@@ -384,12 +383,12 @@ class NVDevice(HCQCompiled[NVSignal]):
def __init__(self, device:str=""):
if NVDevice.root is None:
NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
NVDevice.fd_uvm = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
fd_uvm_2 = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
NVDevice.fd_ctl = HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
NVDevice.fd_uvm = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
self.fd_uvm_2 = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
uvm.initialize(self.fd_uvm)
with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too
with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too
nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
@@ -425,7 +424,7 @@ class NVDevice(HCQCompiled[NVSignal]):
self.gpu_uuid = nv_gpu.struct_nv_uuid(uuid=(ctypes.c_ubyte*16)(*[raw_uuid.data[i] for i in range(16)]))
uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=self.gpu_uuid)
uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root, hVaSpace=vaspace)
uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root, hVaSpace=vaspace)
for dev in cast(list[NVDevice], self.devices):
try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
@@ -481,7 +480,7 @@ class NVDevice(HCQCompiled[NVSignal]):
assert ws_token_params.workSubmitToken != -1
channel_base = self._alloc_gpu_vaddr(0x4000000, force_low=True)
uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl, hClient=self.root,
uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root,
hChannel=gpfifo, base=channel_base, length=0x4000000)
return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,

View File

@@ -1,10 +1,41 @@
from __future__ import annotations
from typing import cast, Type, TypeVar, Generic, Any
import contextlib, decimal, statistics, time, ctypes, array
import contextlib, decimal, statistics, time, ctypes, array, os, fcntl
from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
from tinygrad.renderer import Renderer
from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator, ProfileRangeEvent, ProfileDeviceEvent
from tinygrad.ops import sym_infer, sint, Variable
from tinygrad.runtime.autogen import libc
class HWInterface:
"""
Hardware Abstraction Layer for HCQ devices. The class provides a unified interface for interacting with hardware devices.
"""
def __init__(self, path:str="", flags:int=os.O_RDONLY, fd:int|None=None):
self.path:str = path
self.fd:int = fd or os.open(path, flags)
def __del__(self): os.close(self.fd)
def ioctl(self, request, arg): return fcntl.ioctl(self.fd, request, arg)
def mmap(self, start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, self.fd, offset)
def read(self, size=None, binary=False):
with open(self.fd, "rb" if binary else "r", closefd=False) as file: return file.read(size)
def write(self, content, binary=False):
with open(self.fd, "wb" if binary else "w", closefd=False) as file: file.write(content)
def listdir(self): return os.listdir(self.path)
def seek(self, offset): os.lseek(self.fd, offset, os.SEEK_SET)
@staticmethod
def anon_mmap(start, sz, prot, flags, offset): return libc.mmap(start, sz, prot, flags, -1, offset)
@staticmethod
def munmap(buf, sz): return libc.munmap(buf, sz)
@staticmethod
def exists(path): return os.path.exists(path)
@staticmethod
def readlink(path): return os.readlink(path)
@staticmethod
def eventfd(initval, flags=None): return HWInterface(fd=os.eventfd(initval, flags))
# When the MOCKGPU setting read by getenv is truthy, rebind the module-level name HWInterface to
# test.mockgpu.mockgpu.MockHWInterface, so later lookups of HWInterface in this module resolve to the mock.
if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.mockgpu import MockHWInterface as HWInterface # noqa: F401 # pylint: disable=unused-import
# **************** for HCQ Compatible Devices ****************