diff --git a/autogen_stubs.sh b/autogen_stubs.sh index fe184bfae1..d7ccd8b0aa 100755 --- a/autogen_stubs.sh +++ b/autogen_stubs.sh @@ -80,8 +80,8 @@ generate_kfd() { fixup $BASE/kfd.py sed -i "s/import ctypes/import ctypes, os/g" $BASE/kfd.py sed -i "s/import fcntl, functools/import functools/g" $BASE/kfd.py - sed -i "/import functools/a from tinygrad.runtime.support.hcq import HWInterface" $BASE/kfd.py - sed -i "s/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, \*\*kwargs):/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, \*\*kwargs):/g" $BASE/kfd.py + sed -i "/import functools/a from tinygrad.runtime.support.hcq import FileIOInterface" $BASE/kfd.py + sed -i "s/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, \*\*kwargs):/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:FileIOInterface, \*\*kwargs):/g" $BASE/kfd.py sed -i "s/fcntl.ioctl(__fd, (__idir<<30)/__fd.ioctl((__idir<<30)/g" $BASE/kfd.py python3 -c "import tinygrad.runtime.autogen.kfd" } @@ -287,7 +287,7 @@ generate_vfio() { fixup $BASE/vfio.py sed -i "s\import ctypes\import ctypes, os\g" $BASE/vfio.py sed -i "s\import fcntl, functools\import functools" $BASE/vfio.py - sed -i "s\import ctypes,os\a from tinygrad.runtime.support import HWInterface\g" $BASE/vfio.py + sed -i "s\import ctypes,os\a from tinygrad.runtime.support.hcq import FileIOInterface\g" $BASE/vfio.py sed -i "s\fcntl.ioctl(__fd, (__idir<<30)\return __fd.ioctl((__idir<<30)\g" $BASE/vfio.py } diff --git a/test/mockgpu/mockgpu.py b/test/mockgpu/mockgpu.py index b0041f38a1..9e60fa3979 100644 --- a/test/mockgpu/mockgpu.py +++ b/test/mockgpu/mockgpu.py @@ -1,5 +1,5 @@ import ctypes, ctypes.util, time, os, builtins, fcntl -from tinygrad.runtime.support.hcq import HWInterface +from tinygrad.runtime.support.hcq import FileIOInterface from test.mockgpu.nv.nvdriver import NVDriver from test.mockgpu.amd.amddriver import AMDDriver start = time.perf_counter() @@ -53,7 +53,7 @@ def _open(path, flags): return virtfd.fd
return os.open(path, flags, 0o777) if os.path.exists(path) else None -class MockHWInterface(HWInterface): +class MockFileIOInterface(FileIOInterface): def __init__(self, path:str="", flags:int=os.O_RDONLY, fd:int|None=None): self.path = path self.fd = fd or _open(path, flags) diff --git a/tinygrad/runtime/autogen/kfd.py b/tinygrad/runtime/autogen/kfd.py index cdb4b3f0db..4b970abf9a 100644 --- a/tinygrad/runtime/autogen/kfd.py +++ b/tinygrad/runtime/autogen/kfd.py @@ -11,9 +11,9 @@ import ctypes, os import functools -from tinygrad.runtime.support.hcq import HWInterface +from tinygrad.runtime.support.hcq import FileIOInterface -def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, **kwargs): +def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:FileIOInterface, **kwargs): ret = __fd.ioctl((__idir<<30) | (ctypes.sizeof(made := __user_struct(**kwargs))<<16) | (__base<<8) | __nr, made) if ret != 0: raise RuntimeError(f"ioctl returned {ret}") return made diff --git a/tinygrad/runtime/autogen/vfio.py b/tinygrad/runtime/autogen/vfio.py index 86abf92497..2cc7377e82 100644 --- a/tinygrad/runtime/autogen/vfio.py +++ b/tinygrad/runtime/autogen/vfio.py @@ -9,13 +9,13 @@ import ctypes -from tinygrad.runtime.support.hcq import HWInterface +from tinygrad.runtime.support.hcq import FileIOInterface import functools -def _do_ioctl_io(__idir, __base, __nr, __fd:HWInterface, val=0, __len=0): +def _do_ioctl_io(__idir, __base, __nr, __fd:FileIOInterface, val=0, __len=0): return __fd.ioctl((__idir<<30) | (__len<<16) | (__base<<8) | __nr, val) -def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:HWInterface, __made=None, **kwargs): +def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:FileIOInterface, __made=None, **kwargs): ret = __fd.ioctl((__idir<<30) | (ctypes.sizeof(made := (__made or __user_struct(**kwargs)))<<16) | (__base<<8) | __nr, made) if ret != 0: raise RuntimeError(f"ioctl returned {ret}") return made diff --git a/tinygrad/runtime/ops_amd.py
b/tinygrad/runtime/ops_amd.py index 2401456e6c..0e777f1618 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -3,7 +3,7 @@ from typing import Any, cast, ClassVar import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, select assert sys.platform != 'win32' from dataclasses import dataclass -from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface +from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface from tinygrad.ops import sint from tinygrad.device import Compiled, ProfileEvent, BufferSpec, CPUProgram, PROFILE from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, all_same, flatten, DEBUG, OSX @@ -520,9 +520,9 @@ class AMDIP: return getattr(self.module, name) class KFDIface: - kfd:HWInterface|None = None + kfd:FileIOInterface|None = None event_page:HCQBuffer|None = None - gpus:list[HWInterface] = [] + gpus:list[FileIOInterface] = [] def _is_usable_gpu(self, gpu_id): with contextlib.suppress(OSError): return int(gpu_id.read()) != 0 @@ -535,23 +535,23 @@ class KFDIface: # Initialize KFD interface during first run if KFDIface.kfd is None: - KFDIface.kfd = HWInterface("/dev/kfd", os.O_RDWR) - gpus = [g for g in HWInterface(kfd_topo_path).listdir() if self._is_usable_gpu(HWInterface(f"{kfd_topo_path}/{g}/gpu_id"))] + KFDIface.kfd = FileIOInterface("/dev/kfd", os.O_RDWR) + gpus = [g for g in FileIOInterface(kfd_topo_path).listdir() if self._is_usable_gpu(FileIOInterface(f"{kfd_topo_path}/{g}/gpu_id"))] gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1])) visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()] KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus if device_id >= len(KFDIface.gpus): raise 
RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?") - self.gpu_id = int(HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read()) - self.props = {l.split()[0]: int(l.split()[1]) for l in HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()} + self.gpu_id = int(FileIOInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read()) + self.props = {(p:=l.split())[0]: int(p[1]) for l in FileIOInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()} ip_base = f"/sys/class/drm/renderD{self.props['drm_render_minor']}/device/ip_discovery/die/0" id2ip = {am.GC_HWID: am.GC_HWIP, am.SDMA0_HWID: am.SDMA0_HWIP, am.NBIF_HWID: am.NBIF_HWIP} - self.ip_versions = {id2ip[int(hwid)]:tuple(int(HWInterface(f'{ip_base}/{hwid}/0/{part}').read()) for part in ['major', 'minor', 'revision']) - for hwid in HWInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip} - self.ip_offsets = {id2ip[int(hwid)]:tuple(int(x, 16) for x in HWInterface(f'{ip_base}/{hwid}/0/base_addr').read().splitlines()) - for hwid in HWInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip} - self.drm_fd = HWInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR) + self.ip_versions = {id2ip[int(hwid)]:tuple(int(FileIOInterface(f'{ip_base}/{hwid}/0/{part}').read()) for part in ['major', 'minor', 'revision']) + for hwid in FileIOInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip} + self.ip_offsets = {id2ip[int(hwid)]:tuple(int(x, 16) for x in FileIOInterface(f'{ip_base}/{hwid}/0/base_addr').read().splitlines()) + for hwid in FileIOInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip} + self.drm_fd = FileIOInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR) kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id) @@ -580,8 +580,8 @@ class KFDIface: if 
cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR: - buf = addr = HWInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0) - else: buf, addr = 0, HWInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0) + buf = addr = FileIOInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0) + else: buf, addr = 0, FileIOInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0) assert addr != 0xffffffffffffffff try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, @@ -604,7 +604,7 @@ class KFDIface: c_gpus = (ctypes.c_int32 * len(gpus))(*gpus) stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus)) assert stm.n_success == len(gpus) - if mem.va_addr: HWInterface.munmap(mem.va_addr, mem.size) + if mem.va_addr: FileIOInterface.munmap(mem.va_addr, mem.size) kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle) def map(self, mem): @@ -624,7 +624,7 @@ class KFDIface: if not hasattr(self, 'doorbells'): self.doorbells_base = queue.doorbell_offset & (~0x1fff) # doorbell is two pages - self.doorbells = cast(HWInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base) + self.doorbells = cast(FileIOInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base) return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"), read_ptrs=[to_mv(queue.read_pointer_address, 8).cast("Q")], write_ptrs=[to_mv(queue.write_pointer_address, 8).cast("Q")], @@ -650,17 +650,17 @@ class AMAllocationMeta: owner:AMDDevice; mapped_devs:list[AMDDevice]; mapping:AM class PCIIface: supported_devs:list[int] = [0x744c, 
0x7480, 0x7550] - vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio") - vfio_fd:HWInterface + vfio:bool = getenv("VFIO", 1) and FileIOInterface.exists("/dev/vfio/vfio") + vfio_fd:FileIOInterface gpus:list[Any] = [] def __init__(self, dev, dev_id): self.dev = dev if first_dev:=len(PCIIface.gpus) == 0: - for pcibus in HWInterface("/sys/bus/pci/devices").listdir(): - vendor = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16) - device = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16) + for pcibus in FileIOInterface("/sys/bus/pci/devices").listdir(): + vendor = int(FileIOInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16) + device = int(FileIOInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16) if vendor == 0x1002 and device in PCIIface.supported_devs: PCIIface.gpus.append(pcibus) PCIIface.gpus = sorted(PCIIface.gpus) @@ -671,51 +671,51 @@ class PCIIface: self.pcibus = PCIIface.gpus[dev_id] # Unbind the device from the kernel driver - if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"): - HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus) + if FileIOInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"): + FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus) - supported_sizes = int(HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDONLY).read(), 16) - try: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write(str(supported_sizes.bit_length() - 1)) + supported_sizes = int(FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDONLY).read(), 16) + try: FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write(str(supported_sizes.bit_length() - 1)) except OSError as e: raise RuntimeError(f"Cannot resize BAR: {e}. 
Ensure the resizable BAR option is enabled on your system.") from e # Try to init vfio. Use it if success. if PCIIface.vfio: try: if first_dev: - HWInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1") - PCIIface.vfio_fd = HWInterface("/dev/vfio/vfio", os.O_RDWR) + FileIOInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1") + PCIIface.vfio_fd = FileIOInterface("/dev/vfio/vfio", os.O_RDWR) vfio.VFIO_CHECK_EXTENSION(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU) - HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci") - HWInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus) + FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci") + FileIOInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus) - iommu_group = HWInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1] + iommu_group = FileIOInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1] except OSError: if DEBUG >= 1: print(f"am {self.pcibus}: failed to init vfio-pci module (run `sudo modprobe vfio-pci`).") PCIIface.vfio = False # Init vfio for the device if PCIIface.vfio: - self.vfio_group = HWInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR) + self.vfio_group = FileIOInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR) vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd.fd)) if first_dev: vfio.VFIO_SET_IOMMU(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU) - self.vfio_dev = HWInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode()))) + self.vfio_dev = FileIOInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode()))) - self.irq_fd = HWInterface.eventfd(0, 0) + self.irq_fd = FileIOInterface.eventfd(0, 0) self.irq_poller = 
select.poll() self.irq_poller.register(self.irq_fd.fd, select.POLLIN) irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER, argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd)) vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs) - else: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1") + else: FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1") - self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY) - self.cfg_fd = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) - self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for bar in [0, 2, 5]} + self.pagemap = FileIOInterface("/proc/self/pagemap", os.O_RDONLY) + self.cfg_fd = FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) + self.bar_fds = {b: FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{b}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for b in [0, 2, 5]} - bar_info = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines() + bar_info = FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines() self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)} self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I')) @@ -741,7 +741,7 @@ class PCIIface: def alloc(self, size:int, host=False, uncached=False, cpu_access=False): if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory. 
vaddr = self.adev.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE) - va = HWInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0) + va = FileIOInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0) # Read pagemap to get the physical address of each page. The pages are locked. self.pagemap.seek(va // mmap.PAGESIZE * 8) @@ -791,7 +791,7 @@ class AMDDevice(HCQCompiled): signal_pages: ClassVar[list[Any]] = [] signal_pool: ClassVar[list[int]] = [] - driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0)) + driverless:bool = not FileIOInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0)) def __init__(self, device:str=""): self.device_id = int(device.split(":")[1]) if ":" in device else 0 @@ -852,7 +852,7 @@ class AMDDevice(HCQCompiled): self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0)) if self.sqtt_enabled: if self.arch != 'gfx1100': raise RuntimeError('SQ Thread Tracing is only supported on 7900XTX') - if not self.driverless and (ppfeaturemask:=int(HWInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16)) & 0x8000: + if not self.driverless and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000: raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use driverless or add " f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n" "For more information read https://github.com/tinygrad/tinygrad/blob/master/extra/sqtt/README.md") diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 7d8952a256..eb2ca429c9 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -4,7 +4,7 @@ assert sys.platform != 
'win32' from typing import Any, cast, Union, Type, ClassVar from dataclasses import dataclass from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator -from tinygrad.runtime.support.hcq import MMIOInterface, HWInterface, MOCKGPU +from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU from tinygrad.ops import sint from tinygrad.device import BufferSpec, CPUProgram from tinygrad.helpers import getenv, mv_address, init_c_struct_t, round_up, data64, data64_le, DEBUG, prod, OSX @@ -20,7 +20,7 @@ def get_error_str(status): return f"{status}: {nv_gpu.nv_status_codes.get(status NV_PFAULT_FAULT_TYPE = {dt:name for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_FAULT_TYPE_")} NV_PFAULT_ACCESS_TYPE = {dt:name.split("_")[-1] for name,dt in nv_gpu.__dict__.items() if name.startswith("NV_PFAULT_ACCESS_TYPE_")} -def nv_iowr(fd:HWInterface, nr, args): +def nv_iowr(fd:FileIOInterface, nr, args): ret = fd.ioctl((3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args) if ret != 0: raise RuntimeError(f"ioctl returned {ret}") @@ -46,7 +46,7 @@ def make_rmctrl_type(): getattr(nv_gpu, name+"_PARAMS", getattr(nv_gpu, name.replace("_CTRL_CMD_", "_CTRL_DEBUG_")+"_PARAMETERS", None))))}) rmctrl = make_rmctrl_type() -def uvm_ioctl(cmd, sttyp, fd:HWInterface, **kwargs): +def uvm_ioctl(cmd, sttyp, fd:FileIOInterface, **kwargs): ret = fd.ioctl(cmd, made:=sttyp(**kwargs)) if ret != 0: raise RuntimeError(f"ioctl(uvm) returned {ret}") if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl returned {get_error_str(made.rmStatus)}") @@ -293,8 +293,8 @@ class NVDevice(HCQCompiled[NVSignal]): signal_pool: ClassVar[list[int]] = [] root = None - fd_ctl: HWInterface - fd_uvm: HWInterface + fd_ctl: FileIOInterface + fd_uvm: FileIOInterface gpus_info: Union[list, ctypes.Array] = [] # TODO: Need a proper allocator for va addresses @@ -305,12 
+305,12 @@ class NVDevice(HCQCompiled[NVSignal]): host_object_enumerator: int = 0x1000 def _new_gpu_fd(self): - fd_dev = HWInterface(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC) + fd_dev = FileIOInterface(f"/dev/nvidia{NVDevice.gpus_info[self.device_id].minor_number}", os.O_RDWR | os.O_CLOEXEC) nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl.fd)) return fd_dev def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False): - fd_dev = self._new_gpu_fd() if not system else HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC) + fd_dev = self._new_gpu_fd() if not system else FileIOInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC) made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev.fd, params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.nvdevice, hMemory=memory_handle, length=size, flags=flags)) nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made) @@ -324,7 +324,7 @@ class NVDevice(HCQCompiled[NVSignal]): va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access) if host: - va_addr = HWInterface.anon_mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0) + va_addr = FileIOInterface.anon_mmap(va_addr, size, mmap.PROT_READ | mmap.PROT_WRITE, MAP_FIXED | mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0) flags = (nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) \ | (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30) @@ -363,7 +363,7 @@ class NVDevice(HCQCompiled[NVSignal]): self._debug_mappings.pop((cast(int, mem.va_addr), mem.size)) uvm.free(self.fd_uvm, base=cast(int, mem.va_addr), length=mem.size) - if mem.meta.has_cpu_mapping: HWInterface.munmap(cast(int, mem.va_addr), mem.size) + if mem.meta.has_cpu_mapping: FileIOInterface.munmap(cast(int, mem.va_addr), mem.size) def _gpu_uvm_map(self, va_base, size, mem_handle, 
create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer: if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size) @@ -391,9 +391,9 @@ class NVDevice(HCQCompiled[NVSignal]): def __init__(self, device:str=""): if NVDevice.root is None: - NVDevice.fd_ctl = HWInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC) - NVDevice.fd_uvm = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC) - self.fd_uvm_2 = HWInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC) + NVDevice.fd_ctl = FileIOInterface("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC) + NVDevice.fd_uvm = FileIOInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC) + self.fd_uvm_2 = FileIOInterface("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC) NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew uvm.initialize(self.fd_uvm) with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index 90982eb71c..a4fd068551 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -5,7 +5,7 @@ from types import SimpleNamespace from typing import Any, cast, ClassVar from tinygrad.device import BufferSpec from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator -from tinygrad.runtime.support.hcq import HWInterface +from tinygrad.runtime.support.hcq import FileIOInterface from tinygrad.runtime.autogen import kgsl, adreno from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice from tinygrad.renderer.cstyle import QCOMRenderer @@ -325,7 +325,7 @@ class QCOMDevice(HCQCompiled): dummy_addr: int = 0 def __init__(self, device:str=""): - self.fd = HWInterface('/dev/kgsl-3d0', os.O_RDWR) + self.fd = FileIOInterface('/dev/kgsl-3d0', os.O_RDWR) QCOMDevice.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr) flags = 
kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \ @@ -364,7 +364,7 @@ class QCOMDevice(HCQCompiled): def _gpu_free(self, mem:HCQBuffer): kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta.id) - HWInterface.munmap(mem.va_addr, mem.meta.mmapsize) + FileIOInterface.munmap(mem.va_addr, mem.meta.mmapsize) def _ensure_stack_size(self, sz): if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz) diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 2e9beaf369..282c105651 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -13,7 +13,7 @@ class MMIOInterface: def __getitem__(self, k) -> int|list[int]: return self.mv[k].tolist() if isinstance(k, slice) else self.mv[k] def __setitem__(self, k, v:int|array.array): self.mv[k] = v -class HWInterface: +class FileIOInterface: """ Hardware Abstraction Layer for HCQ devices. The class provides a unified interface for interacting with hardware devices. """ @@ -42,9 +42,9 @@ class HWInterface: @staticmethod def readlink(path): return os.readlink(path) @staticmethod - def eventfd(initval, flags=None): return HWInterface(fd=os.eventfd(initval, flags)) # type: ignore[attr-defined] + def eventfd(initval, flags=None): return FileIOInterface(fd=os.eventfd(initval, flags)) # type: ignore[attr-defined] -if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.mockgpu import MockHWInterface as HWInterface # noqa: F401 # pylint: disable=unused-import +if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.mockgpu import MockFileIOInterface as FileIOInterface # noqa: F401 # pylint: disable=unused-import # **************** for HCQ Compatible Devices ****************