mirror of https://github.com/tinygrad/tinygrad.git
synced 2026-01-08 22:48:25 -05:00
amd/nv respect visible devs (#5409)
* nv/amd respect visible devices
* linter
* sort amd gpus
* env docs
--- a/docs/env_vars.md
+++ b/docs/env_vars.md
@@ -48,3 +48,4 @@ IMAGE | [1-2] | enable 2d specific optimizations
 FLOAT16 | [1] | use float16 for images instead of float32
 PTX | [1] | enable the specialized [PTX](https://docs.nvidia.com/cuda/parallel-thread-execution/) assembler for Nvidia GPUs. If not set, defaults to generic CUDA codegen backend.
 PROFILE | [1] | enable output of [perfetto](https://ui.perfetto.dev/) compatible profile. This feature is supported in NV and AMD backends.
+VISIBLE_DEVICES | [list[int]] | restricts the NV/AMD devices that are available. The format is a comma-separated list of identifiers (indexing starts with 0).
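For context, both backends below parse the VISIBLE_DEVICES value the same way: split on commas, ignore blank entries, convert to ints, and fall back to the vendor variable when the generic one is unset. A minimal standalone sketch of that parsing; plain os.environ here is a stand-in for tinygrad's getenv helper:

import os

def parse_visible_devices(primary: str, fallback: str) -> list[int]:
  # e.g. VISIBLE_DEVICES="0,2" -> [0, 2]; unset or empty -> [] (all devices stay visible)
  raw = os.environ.get(primary, os.environ.get(fallback, ""))
  return [int(x) for x in raw.split(",") if x.strip()]

print(parse_visible_devices("VISIBLE_DEVICES", "HIP_VISIBLE_DEVICES"))

So VISIBLE_DEVICES=0,2 yields [0, 2], while leaving both variables unset yields [] and the backend keeps every enumerated GPU.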
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -449,8 +449,14 @@ class AMDDevice(HCQCompatCompiled):
   def __init__(self, device:str=""):
     if AMDDevice.kfd == -1:
       AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
-      AMDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
+      gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
+      gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))
+      visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
+      AMDDevice.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
+
     self.device_id = int(device.split(":")[1]) if ":" in device else 0
+    if self.device_id >= len(AMDDevice.gpus): raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
+
     with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
     with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
     self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
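The AMD change has two parts: the kfd topology nodes are sorted by numeric node name so enumeration order is stable across runs, and the sorted list is then indexed by the visible-device list. A self-contained sketch of that selection; the node paths are illustrative stand-ins, and note that Path.name never contains '/', so the split in the sort key is a no-op kept here for fidelity to the hunk:

import pathlib

# Stand-in topology nodes, deliberately out of order.
gpus = [pathlib.Path(f"/sys/devices/virtual/kfd/kfd/topology/nodes/{i}") for i in (2, 0, 1)]
gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))  # -> nodes 0, 1, 2

visible_devices = [0, 2]  # as if VISIBLE_DEVICES=0,2
print([gpus[x] for x in visible_devices] if visible_devices else gpus)  # nodes 0 and 2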
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
-from typing import Tuple, List, Any, cast
+from typing import Tuple, List, Any, cast, Union
 from dataclasses import dataclass
 from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, HWCommandQueue, HWComputeQueue, HWCopyQueue, hcq_command, \
   hcq_profile, Compiler, CompileError, BufferOptions
@@ -380,7 +380,7 @@ class NVDevice(HCQCompatCompiled):
   root = None
   fd_ctl: int = -1
   fd_uvm: int = -1
-  gpus_info = None
+  gpus_info:Union[List, ctypes.Array] = []
   signals_page:Any = None
   signals_pool: List[Any] = []
   uvm_vaddr: int = 0x1000000000
@@ -388,7 +388,7 @@ class NVDevice(HCQCompatCompiled):
   devices: List[NVDevice] = []
 
   def _new_gpu_fd(self):
-    fd_dev = os.open(f"/dev/nvidia{self.device_id}", os.O_RDWR | os.O_CLOEXEC)
+    fd_dev = os.open(f"/dev/nvidia{self.gpu_info.deviceInstance}", os.O_RDWR | os.O_CLOEXEC)
     nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
     return fd_dev
 
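Why the fd change matters: once gpus_info can be a filtered list, the tinygrad-side device index no longer matches the kernel's device node, so the node is chosen via the queried deviceInstance instead. A toy illustration of that remapping; treating the instance as a plain visible-list entry is a simplifying assumption, not how the RM control call actually reports it:

visible_devices = [1]              # as if CUDA_VISIBLE_DEVICES=1
device_id = 0                      # tinygrad-side index, i.e. device "NV:0"
device_instance = visible_devices[device_id]  # kernel-side instance
print(f"/dev/nvidia{device_instance}")        # /dev/nvidia1, not /dev/nvidia0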
@@ -488,16 +488,19 @@ class NVDevice(HCQCompatCompiled):
       uvm.initialize(self.fd_uvm)
       with contextlib.suppress(RuntimeError): uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm) # this error is okay, CUDA hits it too
 
-      NVDevice.gpus_info = (nv_gpu.nv_ioctl_card_info_t*64)()
-      nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, NVDevice.gpus_info)
+      nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
+      visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
+      NVDevice.gpus_info = [gpus_info[x] for x in visible_devices] if visible_devices else gpus_info
 
     self.device_id = int(device.split(":")[1]) if ":" in device else 0
-    self.fd_dev = self._new_gpu_fd()
 
-    assert NVDevice.gpus_info[self.device_id].valid, f"No valid device found for NV:{self.device_id}. Requesting more devices than the system has?"
-    gpu_info = rmctrl.gpu_get_id_info_v2(self.fd_ctl, self.root, self.root, gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
+    if self.device_id >= len(NVDevice.gpus_info) or not NVDevice.gpus_info[self.device_id].valid:
+      raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
 
-    device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=gpu_info.deviceInstance, hClientShare=self.root,
+    self.gpu_info = rmctrl.gpu_get_id_info_v2(self.fd_ctl, self.root, self.root, gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
+    self.fd_dev = self._new_gpu_fd()
+
+    device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.gpu_info.deviceInstance, hClientShare=self.root,
                                                    vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
     self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
     self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
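One idiom worth noting in the hunk above: the := assignment expression allocates the ctypes array and hands it to the ioctl wrapper in a single statement, while keeping a local reference for the filtering step. A minimal runnable sketch of the same pattern; fill is a stand-in for nv_iowr with NV_ESC_CARD_INFO, not the real wrapper:

import ctypes

def fill(buf):
  # stand-in for nv_iowr(fd_ctl, nv_gpu.NV_ESC_CARD_INFO, buf)
  for i in range(2): buf[i] = i + 1

fill(gpus_info := (ctypes.c_uint32 * 64)())
visible_devices = [1]  # as if CUDA_VISIBLE_DEVICES=1
print([gpus_info[x] for x in visible_devices] if visible_devices else list(gpus_info))  # [2]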