rename HWInterface -> FileIOInterface (#9989)

* rename HWInterface -> FileIOInterface

* ugh
This commit is contained in:
nimlgen
2025-04-22 22:18:57 +03:00
committed by GitHub
parent c1539b0319
commit db51133537
8 changed files with 69 additions and 69 deletions

View File

@@ -80,8 +80,8 @@ generate_kfd() {
fixup $BASE/kfd.py fixup $BASE/kfd.py
sed -i "s/import ctypes/import ctypes, os/g" $BASE/kfd.py sed -i "s/import ctypes/import ctypes, os/g" $BASE/kfd.py
sed -i "s/import fcntl, functools/import functools/g" $BASE/kfd.py sed -i "s/import fcntl, functools/import functools/g" $BASE/kfd.py
sed -i "/import functools/a from tinygrad.runtime.support.hcq import HWInterface" $BASE/kfd.py sed -i "/import functools/a from tinygrad.runtime.support.hcq import FileIOInterface" $BASE/kfd.py
sed -i "s/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, \*\*kwargs):/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, \*\*kwargs):/g" $BASE/kfd.py sed -i "s/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, \*\*kwargs):/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:FileIOInterface, \*\*kwargs):/g" $BASE/kfd.py
sed -i "s/fcntl.ioctl(__fd, (__idir<<30)/__fd.ioctl((__idir<<30)/g" $BASE/kfd.py sed -i "s/fcntl.ioctl(__fd, (__idir<<30)/__fd.ioctl((__idir<<30)/g" $BASE/kfd.py
python3 -c "import tinygrad.runtime.autogen.kfd" python3 -c "import tinygrad.runtime.autogen.kfd"
} }
@@ -287,7 +287,7 @@ generate_vfio() {
fixup $BASE/vfio.py fixup $BASE/vfio.py
sed -i "s\import ctypes\import ctypes, os\g" $BASE/vfio.py sed -i "s\import ctypes\import ctypes, os\g" $BASE/vfio.py
sed -i "s\import fcntl, functools\import functools" $BASE/vfio.py sed -i "s\import fcntl, functools\import functools" $BASE/vfio.py
sed -i "s\import ctypes,os\a from tinygrad.runtime.support import HWInterface\g" $BASE/vfio.py sed -i "s\import ctypes,os\a from tinygrad.runtime.support import FileIOInterface\g" $BASE/vfio.py
sed -i "s\fcntl.ioctl(__fd, (__idir<<30)\return __fd.ioctl((__idir<<30)\g" $BASE/vfio.py sed -i "s\fcntl.ioctl(__fd, (__idir<<30)\return __fd.ioctl((__idir<<30)\g" $BASE/vfio.py
} }

View File

@@ -1,5 +1,5 @@
import ctypes, ctypes.util, time, os, builtins, fcntl import ctypes, ctypes.util, time, os, builtins, fcntl
from tinygrad.runtime.support.hcq import HWInterface from tinygrad.runtime.support.hcq import FileIOInterface
from test.mockgpu.nv.nvdriver import NVDriver from test.mockgpu.nv.nvdriver import NVDriver
from test.mockgpu.amd.amddriver import AMDDriver from test.mockgpu.amd.amddriver import AMDDriver
start = time.perf_counter() start = time.perf_counter()
@@ -53,7 +53,7 @@ def _open(path, flags):
return virtfd.fd return virtfd.fd
return os.open(path, flags, 0o777) if os.path.exists(path) else None return os.open(path, flags, 0o777) if os.path.exists(path) else None
class MockHWInterface(HWInterface): class MockFileIOInterface(FileIOInterface):
def __init__(self, path:str="", flags:int=os.O_RDONLY, fd:int|None=None): def __init__(self, path:str="", flags:int=os.O_RDONLY, fd:int|None=None):
self.path = path self.path = path
self.fd = fd or _open(path, flags) self.fd = fd or _open(path, flags)

View File

@@ -11,9 +11,9 @@ import ctypes, os
import functools import functools
from tinygrad.runtime.support.hcq import HWInterface from tinygrad.runtime.support.hcq import FileIOInterface
def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, **kwargs): def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:FileIOInterface, **kwargs):
ret = __fd.ioctl((__idir<<30) | (ctypes.sizeof(made := __user_struct(**kwargs))<<16) | (__base<<8) | __nr, made) ret = __fd.ioctl((__idir<<30) | (ctypes.sizeof(made := __user_struct(**kwargs))<<16) | (__base<<8) | __nr, made)
if ret != 0: raise RuntimeError(f"ioctl returned {ret}") if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
return made return made

View File

@@ -9,13 +9,13 @@
import ctypes import ctypes
from tinygrad.runtime.support.hcq import HWInterface from tinygrad.runtime.support.hcq import FileIOInterface
import functools import functools
def _do_ioctl_io(__idir, __base, __nr, __fd:HWInterface, val=0, __len=0): def _do_ioctl_io(__idir, __base, __nr, __fd:FileIOInterface, val=0, __len=0):
return __fd.ioctl((__idir<<30) | (__len<<16) | (__base<<8) | __nr, val) return __fd.ioctl((__idir<<30) | (__len<<16) | (__base<<8) | __nr, val)
def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, __val=None, **kwargs): def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:FileIOInterface, __val=None, **kwargs):
ret = __fd.ioctl((__idir<<30) | (ctypes.sizeof(made := (__made or __user_struct(**kwargs)))<<16) | (__base<<8) | __nr, made) ret = __fd.ioctl((__idir<<30) | (ctypes.sizeof(made := (__made or __user_struct(**kwargs)))<<16) | (__base<<8) | __nr, made)
if ret != 0: raise RuntimeError(f"ioctl returned {ret}") if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
return made return made

View File

@@ -3,7 +3,7 @@ from typing import Any, cast, ClassVar
import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, select import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, select
assert sys.platform != 'win32' assert sys.platform != 'win32'
from dataclasses import dataclass from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface
from tinygrad.ops import sint from tinygrad.ops import sint
from tinygrad.device import Compiled, ProfileEvent, BufferSpec, CPUProgram, PROFILE from tinygrad.device import Compiled, ProfileEvent, BufferSpec, CPUProgram, PROFILE
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, all_same, flatten, DEBUG, OSX from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, all_same, flatten, DEBUG, OSX
@@ -520,9 +520,9 @@ class AMDIP:
return getattr(self.module, name) return getattr(self.module, name)
class KFDIface: class KFDIface:
kfd:HWInterface|None = None kfd:FileIOInterface|None = None
event_page:HCQBuffer|None = None event_page:HCQBuffer|None = None
gpus:list[HWInterface] = [] gpus:list[FileIOInterface] = []
def _is_usable_gpu(self, gpu_id): def _is_usable_gpu(self, gpu_id):
with contextlib.suppress(OSError): return int(gpu_id.read()) != 0 with contextlib.suppress(OSError): return int(gpu_id.read()) != 0
@@ -535,23 +535,23 @@ class KFDIface:
# Initialize KFD interface during first run # Initialize KFD interface during first run
if KFDIface.kfd is None: if KFDIface.kfd is None:
KFDIface.kfd = HWInterface("/dev/kfd", os.O_RDWR) KFDIface.kfd = FileIOInterface("/dev/kfd", os.O_RDWR)
gpus = [g for g in HWInterface(kfd_topo_path).listdir() if self._is_usable_gpu(HWInterface(f"{kfd_topo_path}/{g}/gpu_id"))] gpus = [g for g in FileIOInterface(kfd_topo_path).listdir() if self._is_usable_gpu(FileIOInterface(f"{kfd_topo_path}/{g}/gpu_id"))]
gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1])) gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1]))
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()] visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?") if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")
self.gpu_id = int(HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read()) self.gpu_id = int(FileIOInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
self.props = {l.split()[0]: int(l.split()[1]) for l in HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()} self.props = {(p:=l.split())[0]: int(p[1]) for l in FileIOInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
ip_base = f"/sys/class/drm/renderD{self.props['drm_render_minor']}/device/ip_discovery/die/0" ip_base = f"/sys/class/drm/renderD{self.props['drm_render_minor']}/device/ip_discovery/die/0"
id2ip = {am.GC_HWID: am.GC_HWIP, am.SDMA0_HWID: am.SDMA0_HWIP, am.NBIF_HWID: am.NBIF_HWIP} id2ip = {am.GC_HWID: am.GC_HWIP, am.SDMA0_HWID: am.SDMA0_HWIP, am.NBIF_HWID: am.NBIF_HWIP}
self.ip_versions = {id2ip[int(hwid)]:tuple(int(HWInterface(f'{ip_base}/{hwid}/0/{part}').read()) for part in ['major', 'minor', 'revision']) self.ip_versions = {id2ip[int(hwid)]:tuple(int(FileIOInterface(f'{ip_base}/{hwid}/0/{part}').read()) for part in ['major', 'minor', 'revision'])
for hwid in HWInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip} for hwid in FileIOInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip}
self.ip_offsets = {id2ip[int(hwid)]:tuple(int(x, 16) for x in HWInterface(f'{ip_base}/{hwid}/0/base_addr').read().splitlines()) self.ip_offsets = {id2ip[int(hwid)]:tuple(int(x, 16) for x in FileIOInterface(f'{ip_base}/{hwid}/0/base_addr').read().splitlines())
for hwid in HWInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip} for hwid in FileIOInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip}
self.drm_fd = HWInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR) self.drm_fd = FileIOInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id) kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id)
@@ -580,8 +580,8 @@ class KFDIface:
if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR: if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
buf = addr = HWInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0) buf = addr = FileIOInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
else: buf, addr = 0, HWInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0) else: buf, addr = 0, FileIOInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0)
assert addr != 0xffffffffffffffff assert addr != 0xffffffffffffffff
try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
@@ -604,7 +604,7 @@ class KFDIface:
c_gpus = (ctypes.c_int32 * len(gpus))(*gpus) c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus)) stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
assert stm.n_success == len(gpus) assert stm.n_success == len(gpus)
if mem.va_addr: HWInterface.munmap(mem.va_addr, mem.size) if mem.va_addr: FileIOInterface.munmap(mem.va_addr, mem.size)
kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle) kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)
def map(self, mem): def map(self, mem):
@@ -624,7 +624,7 @@ class KFDIface:
if not hasattr(self, 'doorbells'): if not hasattr(self, 'doorbells'):
self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages
self.doorbells = cast(HWInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base) self.doorbells = cast(FileIOInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)
return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"), return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"),
read_ptrs=[to_mv(queue.read_pointer_address, 8).cast("Q")], write_ptrs=[to_mv(queue.write_pointer_address, 8).cast("Q")], read_ptrs=[to_mv(queue.read_pointer_address, 8).cast("Q")], write_ptrs=[to_mv(queue.write_pointer_address, 8).cast("Q")],
@@ -650,17 +650,17 @@ class AMAllocationMeta: owner:AMDDevice; mapped_devs:list[AMDDevice]; mapping:AM
class PCIIface: class PCIIface:
supported_devs:list[int] = [0x744c, 0x7480, 0x7550] supported_devs:list[int] = [0x744c, 0x7480, 0x7550]
vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio") vfio:bool = getenv("VFIO", 1) and FileIOInterface.exists("/dev/vfio/vfio")
vfio_fd:HWInterface vfio_fd:FileIOInterface
gpus:list[Any] = [] gpus:list[Any] = []
def __init__(self, dev, dev_id): def __init__(self, dev, dev_id):
self.dev = dev self.dev = dev
if first_dev:=len(PCIIface.gpus) == 0: if first_dev:=len(PCIIface.gpus) == 0:
for pcibus in HWInterface("/sys/bus/pci/devices").listdir(): for pcibus in FileIOInterface("/sys/bus/pci/devices").listdir():
vendor = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16) vendor = int(FileIOInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16)
device = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16) device = int(FileIOInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16)
if vendor == 0x1002 and device in PCIIface.supported_devs: PCIIface.gpus.append(pcibus) if vendor == 0x1002 and device in PCIIface.supported_devs: PCIIface.gpus.append(pcibus)
PCIIface.gpus = sorted(PCIIface.gpus) PCIIface.gpus = sorted(PCIIface.gpus)
@@ -671,51 +671,51 @@ class PCIIface:
self.pcibus = PCIIface.gpus[dev_id] self.pcibus = PCIIface.gpus[dev_id]
# Unbind the device from the kernel driver # Unbind the device from the kernel driver
if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"): if FileIOInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus) FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)
supported_sizes = int(HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDONLY).read(), 16) supported_sizes = int(FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDONLY).read(), 16)
try: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write(str(supported_sizes.bit_length() - 1)) try: FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write(str(supported_sizes.bit_length() - 1))
except OSError as e: raise RuntimeError(f"Cannot resize BAR: {e}. Ensure the resizable BAR option is enabled on your system.") from e except OSError as e: raise RuntimeError(f"Cannot resize BAR: {e}. Ensure the resizable BAR option is enabled on your system.") from e
# Try to init vfio. Use it if success. # Try to init vfio. Use it if success.
if PCIIface.vfio: if PCIIface.vfio:
try: try:
if first_dev: if first_dev:
HWInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1") FileIOInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1")
PCIIface.vfio_fd = HWInterface("/dev/vfio/vfio", os.O_RDWR) PCIIface.vfio_fd = FileIOInterface("/dev/vfio/vfio", os.O_RDWR)
vfio.VFIO_CHECK_EXTENSION(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU) vfio.VFIO_CHECK_EXTENSION(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci") FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci")
HWInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus) FileIOInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus)
iommu_group = HWInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1] iommu_group = FileIOInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
except OSError: except OSError:
if DEBUG >= 1: print(f"am {self.pcibus}: failed to init vfio-pci module (run `sudo modprobe vfio-pci`).") if DEBUG >= 1: print(f"am {self.pcibus}: failed to init vfio-pci module (run `sudo modprobe vfio-pci`).")
PCIIface.vfio = False PCIIface.vfio = False
# Init vfio for the device # Init vfio for the device
if PCIIface.vfio: if PCIIface.vfio:
self.vfio_group = HWInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR) self.vfio_group = FileIOInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd.fd)) vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd.fd))
if first_dev: vfio.VFIO_SET_IOMMU(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU) if first_dev: vfio.VFIO_SET_IOMMU(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
self.vfio_dev = HWInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode()))) self.vfio_dev = FileIOInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode())))
self.irq_fd = HWInterface.eventfd(0, 0) self.irq_fd = FileIOInterface.eventfd(0, 0)
self.irq_poller = select.poll() self.irq_poller = select.poll()
self.irq_poller.register(self.irq_fd.fd, select.POLLIN) self.irq_poller.register(self.irq_fd.fd, select.POLLIN)
irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER, irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd)) argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs) vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
else: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1") else: FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1")
self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY) self.pagemap = FileIOInterface("/proc/self/pagemap", os.O_RDONLY)
self.cfg_fd = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) self.cfg_fd = FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC)
self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for bar in [0, 2, 5]} self.bar_fds = {b: FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{b}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for b in [0, 2, 5]}
bar_info = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines() bar_info = FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines()
self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)} self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}
self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I')) self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
@@ -741,7 +741,7 @@ class PCIIface:
def alloc(self, size:int, host=False, uncached=False, cpu_access=False): def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory. if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory.
vaddr = self.adev.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE) vaddr = self.adev.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE)
va = HWInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0) va = FileIOInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0)
# Read pagemap to get the physical address of each page. The pages are locked. # Read pagemap to get the physical address of each page. The pages are locked.
self.pagemap.seek(va // mmap.PAGESIZE * 8) self.pagemap.seek(va // mmap.PAGESIZE * 8)
@@ -791,7 +791,7 @@ class AMDDevice(HCQCompiled):
signal_pages: ClassVar[list[Any]] = [] signal_pages: ClassVar[list[Any]] = []
signal_pool: ClassVar[list[int]] = [] signal_pool: ClassVar[list[int]] = []
driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0)) driverless:bool = not FileIOInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
def __init__(self, device:str=""): def __init__(self, device:str=""):
self.device_id = int(device.split(":")[1]) if ":" in device else 0 self.device_id = int(device.split(":")[1]) if ":" in device else 0
@@ -852,7 +852,7 @@ class AMDDevice(HCQCompiled):
self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0)) self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
if self.sqtt_enabled: if self.sqtt_enabled:
if self.arch != 'gfx1100': raise RuntimeError('SQ Thread Tracing is only supported on 7900XTX') if self.arch != 'gfx1100': raise RuntimeError('SQ Thread Tracing is only supported on 7900XTX')
if not self.driverless and (ppfeaturemask:=int(HWInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16)) & 0x8000: if not self.driverless and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000:
raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use driverless or add " raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use driverless or add "
f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n" f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n"
"For more information read https://github.com/tinygrad/tinygrad/blob/master/extra/sqtt/README.md") "For more information read https://github.com/tinygrad/tinygrad/blob/master/extra/sqtt/README.md")

View File

@@ -4,7 +4,7 @@ assert sys.platform != 'win32'
from typing import Any, cast, Union, Type, ClassVar from typing import Any, cast, Union, Type, ClassVar
from dataclasses import dataclass from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
from tinygrad.runtime.support.hcq import MMIOInterface, HWInterface, MOCKGPU from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU
from tinygrad.ops import sint from tinygrad.ops import sint
from tinygrad.device import BufferSpec, CPUProgram from tinygrad.device import BufferSpec, CPUProgram
from tinygrad.helpers import getenv, mv_address, init_c_struct_t, round_up, data64, data64_le, DEBUG, prod, OSX from tinygrad.helpers import getenv, mv_address, init_c_struct_t, round_up, data64, data64_le, DEBUG, prod, OSX
@@ -20,7 +20,7 @@ def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status
NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")} NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")}
NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")} NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")}
def nv_iowr(fd:HWInterface, nr, args): def nv_iowr(fd:FileIOInterface, nr, args):
ret = fd.ioctl((3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args) ret = fd.ioctl((3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
if ret != 0: raise RuntimeError(f"ioctl returned {ret}") if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
@@ -46,7 +46,7 @@ def make_rmctrl_type():
getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))}) getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))})
rmctrl = make_rmctrl_type() rmctrl = make_rmctrl_type()
def uvm_ioctl(cmd, sttyp, fd:HWInterface, **kwargs): def uvm_ioctl(cmd, sttyp, fd:FileIOInterface, **kwargs):
ret = fd.ioctl(cmd, made:=sttyp(**kwargs)) ret = fd.ioctl(cmd, made:=sttyp(**kwargs))
if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}") if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}")
if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}") if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}")
@@ -293,8 +293,8 @@ class NVDevice(HCQCompiled[NVSignal]):
signal_pool: ClassVar[list[int]] = [] signal_pool: ClassVar[list[int]] = []
root = None root = None
fd_ctl: HWInterface fd_ctl: FileIOInterface
fd_uvm: HWInterface fd_uvm: FileIOInterface
gpus_info: Union[list, ctypes.Array] = [] gpus_info: Union[list, ctypes.Array] = []
# TODO: Need a proper allocator for va addresses # TODO: Need a proper allocator for va addresses
@@ -305,12 +305,12 @@ class NVDevice(HCQCompiled[NVSignal]):
host_object_enumerator: int = 0x1000 host_object_enumerator: int = 0x1000
def _new_gpu_fd(self): def _new_gpu_fd(self):
fd_dev = HWInterface(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC) fd_dev = FileIOInterface(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC)
nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl.fd)) nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl.fd))
return fd_dev return fd_dev
def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False): def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
fd_dev = self._new_gpu_fd() if not system else HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC) fd_dev = self._new_gpu_fd() if not system else FileIOInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev.fd, made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev.fd,
params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags)) params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags))
nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made) nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
@@ -324,7 +324,7 @@ class NVDevice(HCQCompiled[NVSignal]):
va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access) va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)
if host: if host:
va_addr = HWInterface.anon_mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0) va_addr = FileIOInterface.anon_mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) \ flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) \
| (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30) | (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30)
@@ -363,7 +363,7 @@ class NVDevice(HCQCompiled[NVSignal]):
self._debug_mappings.pop((cast(int, mem.va_addr), mem.size)) self._debug_mappings.pop((cast(int, mem.va_addr), mem.size))
uvm.free(self.fd_uvm, base=cast(int, mem.va_addr), length=mem.size) uvm.free(self.fd_uvm, base=cast(int, mem.va_addr), length=mem.size)
if mem.meta.has_cpu_mapping: HWInterface.munmap(cast(int, mem.va_addr), mem.size) if mem.meta.has_cpu_mapping: FileIOInterface.munmap(cast(int, mem.va_addr), mem.size)
def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer: def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer:
if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size) if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
@@ -391,9 +391,9 @@ class NVDevice(HCQCompiled[NVSignal]):
def __init__(self, device:str=""): def __init__(self, device:str=""):
if NVDevice.root is None: if NVDevice.root is None:
NVDevice.fd_ctl = HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC) NVDevice.fd_ctl = FileIOInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
NVDevice.fd_uvm = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC) NVDevice.fd_uvm = FileIOInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
self.fd_uvm_2 = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC) self.fd_uvm_2 = FileIOInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
uvm.initialize(self.fd_uvm) uvm.initialize(self.fd_uvm)
with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too

View File

@@ -5,7 +5,7 @@ from types import SimpleNamespace
from typing import Any, cast, ClassVar from typing import Any, cast, ClassVar
from tinygrad.device import BufferSpec from tinygrad.device import BufferSpec
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
from tinygrad.runtime.support.hcq import HWInterface from tinygrad.runtime.support.hcq import FileIOInterface
from tinygrad.runtime.autogen import kgsl, adreno from tinygrad.runtime.autogen import kgsl, adreno
from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
from tinygrad.renderer.cstyle import QCOMRenderer from tinygrad.renderer.cstyle import QCOMRenderer
@@ -325,7 +325,7 @@ class QCOMDevice(HCQCompiled):
dummy_addr: int = 0 dummy_addr: int = 0
def __init__(self, device:str=""): def __init__(self, device:str=""):
self.fd = HWInterface('/dev/kgsl-3d0', os.O_RDWR) self.fd = FileIOInterface('/dev/kgsl-3d0', os.O_RDWR)
QCOMDevice.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr) QCOMDevice.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr)
flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \ flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \
@@ -364,7 +364,7 @@ class QCOMDevice(HCQCompiled):
def _gpu_free(self, mem:HCQBuffer): def _gpu_free(self, mem:HCQBuffer):
kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta.id) kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta.id)
HWInterface.munmap(mem.va_addr, mem.meta.mmapsize) FileIOInterface.munmap(mem.va_addr, mem.meta.mmapsize)
def _ensure_stack_size(self, sz): def _ensure_stack_size(self, sz):
if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz) if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz)

View File

@@ -13,7 +13,7 @@ class MMIOInterface:
def __getitem__(self, k) -> int|list[int]: return self.mv[k].tolist() if isinstance(k, slice) else self.mv[k] def __getitem__(self, k) -> int|list[int]: return self.mv[k].tolist() if isinstance(k, slice) else self.mv[k]
def __setitem__(self, k, v:int|array.array): self.mv[k] = v def __setitem__(self, k, v:int|array.array): self.mv[k] = v
class HWInterface: class FileIOInterface:
""" """
Hardware Abstraction Layer for HCQ devices. The class provides a unified interface for interacting with hardware devices. Hardware Abstraction Layer for HCQ devices. The class provides a unified interface for interacting with hardware devices.
""" """
@@ -42,9 +42,9 @@ class HWInterface:
@staticmethod @staticmethod
def readlink(path): return os.readlink(path) def readlink(path): return os.readlink(path)
@staticmethod @staticmethod
def eventfd(initval, flags=None): return HWInterface(fd=os.eventfd(initval, flags)) # type: ignore[attr-defined] def eventfd(initval, flags=None): return FileIOInterface(fd=os.eventfd(initval, flags)) # type: ignore[attr-defined]
if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.mockgpu import MockHWInterface as HWInterface # noqa: F401 # pylint: disable=unused-import if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.mockgpu import MockFileIOInterface as FileIOInterface # noqa: F401 # pylint: disable=unused-import
# **************** for HCQ Compatible Devices **************** # **************** for HCQ Compatible Devices ****************