mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-08 22:48:25 -05:00
hcq: helper for visible devices (#12950)
* hcq: helper for visible devices * fix * f
This commit is contained in:
@@ -41,7 +41,7 @@ BEAM | [#] | number of beams in kernel beam search
|
||||
DEFAULT_FLOAT | [HALF, ...]| specify the default float dtype (FLOAT32, HALF, BFLOAT16, FLOAT64, ...), default to FLOAT32
|
||||
IMAGE | [1-2] | enable 2d specific optimizations
|
||||
FLOAT16 | [1] | use float16 for images instead of float32
|
||||
VISIBLE_DEVICES | [list[int]]| restricts the NV/AMD devices that are available. The format is a comma-separated list of identifiers (indexing starts with 0).
|
||||
HCQ_VISIBLE_DEVICES | [list[int]]| restricts the HCQ devices that are available. The format is a comma-separated list of zero-based device indices (e.g. `0,2` exposes only the first and third detected devices).
|
||||
JIT | [0-2] | 0=disabled, 1=[jit enabled](quickstart.md#jit) (default), 2=jit enabled, but graphs are disabled
|
||||
VIZ | [1] | 0=disabled, 1=[viz enabled](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/viz)
|
||||
ALLOW_TF32 | [1] | enable TensorFloat-32 tensor cores on Ampere or newer GPUs.
|
||||
|
||||
@@ -4,7 +4,7 @@ import os, ctypes, struct, hashlib, functools, importlib, mmap, errno, array, co
|
||||
assert sys.platform != 'win32'
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator, hcq_filter_visible_devices
|
||||
from tinygrad.uop.ops import sint
|
||||
from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerPairT
|
||||
from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, suppress_finalizing, lo32, hi32, colored
|
||||
@@ -575,9 +575,7 @@ class KFDIface:
|
||||
if KFDIface.kfd is None:
|
||||
KFDIface.kfd = FileIOInterface("/dev/kfd", os.O_RDWR)
|
||||
gpus = [g for g in FileIOInterface(kfd_topo_path).listdir() if self._is_usable_gpu(FileIOInterface(f"{kfd_topo_path}/{g}/gpu_id"))]
|
||||
gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1]))
|
||||
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
|
||||
KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
|
||||
KFDIface.gpus = hcq_filter_visible_devices(sorted(gpus, key=lambda x: int(x.split('/')[-1])))
|
||||
|
||||
if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ assert sys.platform != 'win32'
|
||||
from typing import cast, ClassVar
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices
|
||||
from tinygrad.uop.ops import sint
|
||||
from tinygrad.device import BufferSpec, CompilerPairT
|
||||
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, suppress_finalizing
|
||||
@@ -321,8 +321,7 @@ class NVKIface:
|
||||
with contextlib.suppress(RuntimeError): uvm.mm_initialize(self.fd_uvm_2, uvmFd=self.fd_uvm.fd) # this error is okay, CUDA hits it too
|
||||
|
||||
nv_iowr(NVKIface.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, gpus_info:=(nv_gpu.nv_ioctl_card_info_t*64)())
|
||||
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('CUDA_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
|
||||
NVKIface.gpus_info = [gpus_info[x] for x in visible_devices] if visible_devices else gpus_info
|
||||
NVKIface.gpus_info = hcq_filter_visible_devices(gpus_info)
|
||||
|
||||
self.dev, self.device_id = dev, device_id
|
||||
if self.device_id >= len(NVKIface.gpus_info) or not NVKIface.gpus_info[self.device_id].valid:
|
||||
|
||||
@@ -57,6 +57,9 @@ if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.mockgpu import MockFileIOInterf
|
||||
|
||||
# **************** for HCQ Compatible Devices ****************
|
||||
|
||||
def hcq_filter_visible_devices(dev):
  """
  Restrict a sequence of discovered devices to those selected by the HCQ_VISIBLE_DEVICES environment variable.

  The variable is read as a comma-separated list of integers; blank entries are skipped. Each integer is used
  as an index into `dev`, and the selected entries are returned as a new list in the order given.
  When no indices are given, `dev` itself is returned unchanged (same object, not a copy).
  """
  # NOTE(review): assumes getenv returns a str when given a str default -- confirm against tinygrad.helpers.getenv
  raw_value = getenv('HCQ_VISIBLE_DEVICES', '')
  selected = [int(token) for token in raw_value.split(',') if token.strip()]
  if not selected: return dev
  return [dev[idx] for idx in selected]
|
||||
|
||||
SignalType = TypeVar('SignalType', bound='HCQSignal')
|
||||
HCQDeviceType = TypeVar('HCQDeviceType', bound='HCQCompiled')
|
||||
ProgramType = TypeVar('ProgramType', bound='HCQProgram')
|
||||
|
||||
@@ -2,7 +2,7 @@ import os, mmap, array, functools, ctypes, select, contextlib, dataclasses, sys,
|
||||
from typing import cast, ClassVar
|
||||
from tinygrad.helpers import round_up, getenv, OSX, temp, ceildiv
|
||||
from tinygrad.runtime.autogen import libc, vfio, pci
|
||||
from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface, HCQBuffer
|
||||
from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface, HCQBuffer, hcq_filter_visible_devices
|
||||
from tinygrad.runtime.support.memory import MemoryManager, VirtMapping
|
||||
from tinygrad.runtime.support.usb import ASM24Controller, USBMMIOInterface
|
||||
|
||||
@@ -243,9 +243,7 @@ class LNXPCIIfaceBase:
|
||||
|
||||
def __init__(self, dev, dev_id, vendor, devices, bars, vram_bar, va_start, va_size):
|
||||
if len((cls:=type(self)).gpus) == 0:
|
||||
cls.gpus = System.pci_scan_bus(vendor, devices)
|
||||
visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', '')).split(',') if x.strip()]
|
||||
cls.gpus = [cls.gpus[x] for x in visible_devices] if visible_devices else cls.gpus
|
||||
cls.gpus = hcq_filter_visible_devices(System.pci_scan_bus(vendor, devices))
|
||||
|
||||
# Acquire va range to avoid collisions.
|
||||
FileIOInterface.anon_mmap(va_start, va_size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE | MAP_FIXED, 0)
|
||||
|
||||
Reference in New Issue
Block a user