amd/nv: respect visible devices (#5409)

* nv/amd respect visible devices

* linter

* sort amd gpus

* env docs
This commit is contained in:
nimlgen
2024-07-12 20:02:12 +03:00
committed by GitHub
parent b18aa00bba
commit 6604d2b2c3
3 changed files with 20 additions and 10 deletions

View File

@@ -449,8 +449,14 @@ class AMDDevice(HCQCompatCompiled):
def __init__(self, device:str=""):
if AMDDevice.kfd == -1:
AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
AMDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
AMDDevice.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
self.device_id = int(device.split(":")[1]) if ":" in device else 0
if self.device_id >= len(AMDDevice.gpus): raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)