mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-02-15 09:05:40 -05:00
amd/nv respect visible devs (#5409)
* nv/amd respect visible devices
* linter
* sort amd gpus
* env docs
This commit is contained in:
@@ -449,8 +449,14 @@ class AMDDevice(HCQCompatCompiled):
|
||||
def __init__(self, device:str=""):
|
||||
if AMDDevice.kfd == -1:
|
||||
AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
|
||||
AMDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
|
||||
gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
|
||||
gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))
|
||||
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
|
||||
AMDDevice.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
|
||||
|
||||
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
||||
if self.device_id >= len(AMDDevice.gpus): raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
|
||||
|
||||
with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
|
||||
with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
|
||||
self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
|
||||
|
||||
Reference in New Issue
Block a user