mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-02-13 16:15:19 -05:00
* kfd_ops: Fix GPU node discovery on NUMA systems

  Ignore potentially multiple CPU NUMA nodes and any GPU nodes that are not
  accessible because of device cgroups.

  Signed-off-by: Felix Kuehling <felix.kuehling@amd.com>

* kfd_ops: Format the GFX arch target name correctly

  The target version in sysfs properties is a decimal representation with two
  digits per component. The format for LLVM GFX target names is a bit quirky
  for historical reasons: it uses one digit each for the minor version and the
  stepping. When it ran out of decimal digits for the stepping on gfx90X, it
  started using hexadecimal there, while the major version is still decimal and
  went to double digits in GFX10. Make sure to parse and format it accordingly
  for all supported GPUs.

  Signed-off-by: Felix Kuehling <felix.kuehling@amd.com>

---------

Signed-off-by: Felix Kuehling <felix.kuehling@amd.com>
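For illustration, a minimal sketch of the parsing the patch describes; it mirrors the gfx_target_version handling in KFDDevice.__init__ in the file below. The helper name and the example values are assumptions for this note, not taken from the patch.

def target_to_arch(target: int) -> str:
  # major stays decimal, minor and stepping become one hex digit each
  return "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)

assert target_to_arch(110001) == "gfx1101"  # double-digit major, decimal
assert target_to_arch(90010) == "gfx90a"    # stepping 10 -> hex 'a'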
492 lines
28 KiB
Python
from __future__ import annotations
from typing import Tuple, List, Any
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct, errno
from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
from tinygrad.buffer import BufferOptions
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up
from tinygrad.renderer.cstyle import HIPRenderer
from tinygrad.runtime.driver.hip_comgr import compile_hip
import tinygrad.runtime.autogen.kfd as kfd
import tinygrad.runtime.autogen.hsa as hsa
import tinygrad.runtime.autogen.amd_gpu as amd_gpu
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401

libc = ctypes.CDLL("libc.so.6")
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p
libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
libc.munmap.restype = ctypes.c_int

def is_usable_gpu(gpu_id):
  try:
    with gpu_id.open() as f:
      return int(f.read()) != 0
  except OSError:
    return False

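# Build the raw ioctl request number the same way the kernel's _IOC() macro does:
# direction (2 bits), payload struct size (14 bits), the KFD magic 'K', and the command nr.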
def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
  made = made_struct or user_struct(**kwargs)
  ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
  if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
  return made

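# Scrape the autogenerated kfd.py for AMDKFD_IOC_* definitions and expose each one as a method
# on the returned namespace, e.g. kio.create_queue(fd, ...) issues AMDKFD_IOC_CREATE_QUEUE.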
def ioctls_from_header():
  #hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
  #pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
  #matches = re.findall(pattern, hdr, re.MULTILINE)
  # get this from python instead
  hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
  pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
  matches = re.findall(pattern, hdrpy, re.MULTILINE)
  idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
  fxns = {name.replace("AMDKFD_IOC_", "").lower():
          functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
          for name, idir, nr, sname in matches}
  return type("KIO", (object, ), fxns)
kio = ioctls_from_header()

def create_sdma_packets():
  # TODO: clean up this, if we want to keep it
  structs = {}
  for name,pkt in [(name,s) for name,s in amd_gpu.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
    names = set()
    fields = []
    for pkt_fields in pkt._fields_:
      if not pkt_fields[0].endswith("_UNION"): fields.append(pkt_fields)
      else:
        assert pkt_fields[1]._fields_[0][0] == '_0'
        for union_fields in pkt_fields[1]._fields_[0][1]._fields_:
          fname = union_fields[0]
          if fname in names: fname = pkt_fields[0]+fname
          names.add(fname)
          # merge together 64-bit fields, otherwise just append them
          if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"): fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64])
          else: fields.append(tuple([fname, *union_fields[1:]]))
    new_name = name[16:-4].lower()
    structs[new_name] = init_c_struct_t(tuple(fields))
    assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
  return type("SDMA_PKTS", (object, ), structs)
sdma_pkts = create_sdma_packets()

class KFDCompiler(Compiler):
  compiler_opts = CompilerOptions("KFD", has_tensor_cores=True, shared_max=65536)
  def __init__(self, arch:str):
    self.arch = arch
    super().__init__(f"compile_hip_{self.arch}")
  def render(self, name:str, uops) -> str: return HIPRenderer(name, uops)
  def compile(self, src:str) -> bytes: return compile_hip(src, self.arch)

AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
SDMA_MAX_COPY_SIZE = 0x400000
PAGE_SIZE = 0x1000

SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 16384

VENDOR_HEADER = hsa.HSA_PACKET_TYPE_VENDOR_SPECIFIC << hsa.HSA_PACKET_HEADER_TYPE

DISPATCH_KERNEL_SETUP = 3 << hsa.HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
DISPATCH_KERNEL_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
DISPATCH_KERNEL_HEADER |= hsa.HSA_PACKET_TYPE_KERNEL_DISPATCH << hsa.HSA_PACKET_HEADER_TYPE

BARRIER_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
BARRIER_HEADER |= hsa.HSA_PACKET_TYPE_BARRIER_AND << hsa.HSA_PACKET_HEADER_TYPE

SHT_PROGBITS = 0x1
SHF_ALLOC = 0x2

EMPTY_SIGNAL = hsa.hsa_signal_t()
SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset

class HWComputeQueue:
  def __init__(self): self.q = []

  def exec(self, prg:KFDProgram, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), completion_signal=None):
    if completion_signal is not None: completion_signal.value = 1
    self.q.append(hsa.hsa_kernel_dispatch_packet_t(
      header=DISPATCH_KERNEL_HEADER, setup=DISPATCH_KERNEL_SETUP,
      workgroup_size_x=local_size[0], workgroup_size_y=local_size[1], workgroup_size_z=local_size[2],
      grid_size_x=global_size[0]*local_size[0], grid_size_y=global_size[1]*local_size[1], grid_size_z=global_size[2]*local_size[2],
      kernel_object=prg.handle, group_segment_size=prg.group_segment_size, private_segment_size=prg.private_segment_size, kernarg_address=kernargs,
      completion_signal=hsa.hsa_signal_t(ctypes.addressof(completion_signal)) if completion_signal is not None else EMPTY_SIGNAL))
    return self

  def signal(self, signal:hsa.amd_signal_t):
    signal.value = 1
    self.q.append(hsa.hsa_barrier_and_packet_t(header=BARRIER_HEADER, completion_signal=hsa.hsa_signal_t(ctypes.addressof(signal))))
    return self

  def wait(self, signal:hsa.amd_signal_t):
    sig = hsa.hsa_barrier_and_packet_t(header=BARRIER_HEADER)
    sig.dep_signal[0] = hsa.hsa_signal_t(ctypes.addressof(signal))
    self.q.append(sig)
    return self

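  # Copy the queued packets into the AQL ring buffer, advance the user-mode write pointer,
  # then write the doorbell so the GPU's command processor starts fetching them.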
  def submit(self, device:KFDDevice):
    if not len(self.q): return
    write_ptr, read_ptr = device.amd_aql_queue.write_dispatch_id, device.amd_aql_queue.read_dispatch_id
    if (len(self.q)+write_ptr-read_ptr)*AQL_PACKET_SIZE > device.aql_ring.size: raise RuntimeError("AQL queue overrun")
    for cmd in self.q:
      ring_addr = device.aql_ring.va_addr + (write_ptr*AQL_PACKET_SIZE) % device.aql_ring.size
      ctypes.memmove(ring_addr, ctypes.addressof(cmd), AQL_PACKET_SIZE)
      write_ptr += 1
    # TODO: add CPU memory barrier here
    device.amd_aql_queue.write_dispatch_id = write_ptr
    device.aql_doorbell[0] = device.aql_doorbell_value + len(self.q) - 1
    device.aql_doorbell_value += len(self.q)
    return self

# prebuilt sdma packets
sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0)
sdma_cache_inv = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
                               GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1,
                               GCR_CONTROL_GL2_RANGE=0)
sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
                              GCR_CONTROL_GL2_RANGE=0)

class HWCopyQueue:
  def __init__(self): self.q = []

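  # SDMA packets are variable-sized; if the next packet would straddle the end of the ring,
  # zero-fill the tail and wrap to the start, then publish the new write pointer and ring the doorbell.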
  def submit(self, device:KFDDevice):
    read_ptr = device.sdma_read_pointer[0]
    if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")
    for cmd in self.q:
      if (cmdsz:=ctypes.sizeof(cmd)) > (fill:=device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size):
        ctypes.memset(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), 0, fill)
        device.sdma_doorbell_value += fill
      ctypes.memmove(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), ctypes.addressof(cmd), cmdsz)
      device.sdma_doorbell_value += cmdsz
    device.sdma_write_pointer[0] = device.sdma_doorbell_value
    device.sdma_doorbell[0] = device.sdma_doorbell_value
    return self

  def timestamp(self, addr):
    self.q.append(sdma_pkts.timestamp(op=amd_gpu.SDMA_OP_TIMESTAMP, sub_op=amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL, addr=addr))
    return self

  def copy(self, dest, src, copy_size):
    self.q.append(sdma_flush_hdp_pkt) # TODO: do I need this?
    self.q.append(sdma_cache_inv)
    copied = 0
    copies_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
    for _ in range(copies_commands):
      step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
      self.q.append(sdma_pkts.copy_linear(op=amd_gpu.SDMA_OP_COPY, sub_op=amd_gpu.SDMA_SUBOP_COPY_LINEAR,
                                          count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied))
      copied += step_copy_size
    self.q.append(sdma_cache_wb)
    return self

  def signal(self, signal:hsa.amd_signal_t):
    signal.value = 1
    self.q.append(sdma_pkts.atomic(op=amd_gpu.SDMA_OP_ATOMIC, operation=amd_gpu.SDMA_ATOMIC_ADD64,
                                   addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, src_data=(1<<64)-1))
    if signal.event_mailbox_ptr != 0:
      self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=signal.event_mailbox_ptr, data=signal.event_id))
      self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=signal.event_id))
    return self

  def wait(self, signal:hsa.amd_signal_t):
    self.q.append(sdma_pkts.poll_regmem(op=amd_gpu.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3,
                                        addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
                                        value=0, mask=0xffffffff, interval=0x04, retry_count=0xfff))
    return self

class KFDProgram:
  def __init__(self, device:KFDDevice, name:str, lib:bytes):
    # TODO; this API needs the type signature of the function and global_size/local_size
    self.device, self.name, self.lib = device, name, lib

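    # Minimal ELF64 parse: read the section header table and copy every allocatable
    # PROGBITS section to its virtual address inside a freshly allocated GPU buffer.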
    _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
    sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]

    lib_gpu_size = round_up(max(sh[5]+sh[3] for sh in sections if sh[1] == SHT_PROGBITS), 0x1000)
    self.lib_gpu = self.device._gpu_alloc(lib_gpu_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
    lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)

    for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
      if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]

    self.device._submit_cache_inv(gli=2)

    entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
    self.handle = self.lib_gpu.va_addr + entry_point
    self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
    self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
    self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
    assert self.private_segment_size <= self.device.max_private_segment_size, \
      f"{self.private_segment_size=} > {self.device.max_private_segment_size=}"

  # NOTE: no programs are ever freed
  def __del__(self):
    if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)

  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
    if self.device.kernargs_ptr >= (self.device.kernargs.va_addr + self.device.kernargs.size + self.kernargs_segment_size):
      self.device.kernargs_ptr = self.device.kernargs.va_addr
    assert self.device.kernargs_ptr < (self.device.kernargs.va_addr + self.device.kernargs.size + self.kernargs_segment_size), "kernargs overrun"
    if not hasattr(self, "args_struct_t"):
      self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
                                                 [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
      if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
        raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
    args_st = self.args_struct_t.from_address(self.device.kernargs_ptr)
    for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
    for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])

    HWComputeQueue().exec(self, self.device.kernargs_ptr, global_size, local_size,
                          self.device.completion_signal if wait else None).submit(self.device)
    self.device.kernargs_ptr += self.kernargs_segment_size

    if wait:
      self.device._wait_signal(self.device.completion_signal)
      assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
      return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e8

class KFDAllocator(LRUAllocator):
  def __init__(self, device:KFDDevice):
    self.device = device
    # NOTE: KFD_IOC_ALLOC_MEM_FLAGS_GTT doesn't work here for readinto
    self.b = [self.device._gpu_alloc(SDMA_MAX_COPY_SIZE*4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) for _ in range(2)]
    super().__init__()

  def _alloc(self, size:int, options:BufferOptions):
    try:
      if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
      else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
    except OSError as e:
      if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e
      else: raise

  def _free(self, gpumem, options:BufferOptions): self.device._gpu_free(gpumem)
  #def as_buffer(self, src:Any) -> memoryview:
  #  self.device.synchronize()
  #  return to_mv(src.va_addr, src.size)

  #def copy_from_fd(self, dest, fd, offset, size):
  #  fo = io.FileIO(fd, "a+b", closefd=False)
  #  fo.seek(offset - (minor_offset:=offset % PAGE_SIZE))
  #  copied_in, total_copy_size = 0, round_up(size+minor_offset, PAGE_SIZE)
  #  for i in range(0, size+minor_offset, self.b[0].size):
  #    local_size = min(self.b[0].size, total_copy_size-i)
  #    copy_size = min(local_size-minor_offset, size-copied_in)
  #    if copy_size == 0: break

  #    fo.readinto(to_mv(self.b[1].va_addr, local_size))
  #    if i != 0: self.device._wait_signal(self.device.signal_sdma)
  #    self.b = self.b[::-1]
  #    self.device._submit_sdma(dest.va_addr+copied_in, self.b[0].va_addr+minor_offset, copy_size, completion_signal=self.device.signal_sdma)

  #    copied_in += copy_size
  #    minor_offset = 0 # only on the first
  #  self.device._wait_signal(self.device.signal_sdma)

  def copyin(self, dest, src: memoryview):
    for i in range(0, src.nbytes, self.b[0].size):
      ctypes.memmove(self.b[1].va_addr, from_mv(src[i:]), lsize:=min(self.b[0].size, src.nbytes-i))
      if i != 0: self.device._wait_signal(self.device.signal_sdma)
      self.b = self.b[::-1]
      self.device._submit_sdma(dest.va_addr+i, self.b[0].va_addr, lsize, completion_signal=self.device.signal_sdma)
    self.device._wait_signal(self.device.signal_sdma)

  def copyout(self, dest:memoryview, src):
    self.device.synchronize()
    for i in range(0, dest.nbytes, self.b[0].size):
      self.device._submit_sdma(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i), completion_signal=self.device.signal_sdma)
      self.device._wait_signal(self.device.signal_sdma)
      ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)

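  # Cross-device copy: the source device's queue signals sig once its prior work is done, the
  # destination device's SDMA engine waits on sig before copying, and both compute queues then wait on sigc.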
  def transfer(self, dest, src, sz:int, src_dev:KFDDevice, dest_dev:KFDDevice):
    dest_dev._gpu_map(src)
    q = HWComputeQueue().signal(sig := KFDDevice._get_signal())
    HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := KFDDevice._get_signal()).submit(dest_dev)
    HWComputeQueue().wait(sigc).submit(dest_dev)
    q.wait(sigc).submit(src_dev)

MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
class KFDDevice(Compiled):
  kfd:int = -1
  event_page:Any = None # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
  signals_page:Any = None
  signal_number:int = 16
  gpus:List[pathlib.Path] = []

  def _gpu_map(self, mem):
    if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
    mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
    c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
    stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(mem.mapped_gpu_ids))
    assert stm.n_success == len(mem.mapped_gpu_ids)

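  # Reserve a virtual address range with mmap first, then hand it to the KFD allocator. For USERPTR
  # allocations the anonymous host pages back the buffer directly; otherwise the VRAM/GTT allocation
  # is mapped into the reserved range through the DRM render node at mem.mmap_offset.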
  def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
    flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
    if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
    if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
    if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
      buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
    else:
      buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
    assert addr != 0xffffffffffffffff
    mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
    if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
      buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
      assert addr == buf == mem.va_addr
    if map_to_gpu: self._gpu_map(mem)
    return mem

  def _gpu_free(self, mem):
    if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
      c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
      stm = kio.unmap_memory_from_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
      assert stm.n_success == len(gpus)
    libc.munmap(mem.va_addr, mem.size)
    kio.free_memory_of_gpu(self.kfd, handle=mem.handle)

  @classmethod
  def _get_signal(self, num=None, sync_event=None) -> hsa.amd_signal_t:
    if num is None:
      num = KFDDevice.signal_number
      KFDDevice.signal_number += 1
      if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 16
    ret = hsa.amd_signal_t.from_address(KFDDevice.signals_page.va_addr + SIGNAL_SIZE*num)
    ret.kind = hsa.AMD_SIGNAL_KIND_USER
    if sync_event is not None:
      ret.event_mailbox_ptr = KFDDevice.event_page.va_addr + sync_event.event_slot_index*8
      ret.event_id = sync_event.event_id
    return ret

  @classmethod
  def _wait_signal(self, signal:hsa.amd_signal_t, timeout=60000):
    assert signal.event_id != 0, "can't wait on this signal"
    evt_arr = (kfd.struct_kfd_event_data * 1)()
    evt_arr[0].event_id = signal.event_id
    ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
    if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!")

  def __init__(self, device:str=""):
    if KFDDevice.kfd == -1:
      KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
      KFDDevice.gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
    self.device_id = int(device.split(":")[1]) if ":" in device else 0
    with open(f"{KFDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
    with open(f"{KFDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
    self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
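    # gfx_target_version is decimal with two digits per component; LLVM target names keep the major
    # version decimal but use one hex digit each for minor and stepping (e.g. 90010 -> gfx90a).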
    target = int(self.properties['gfx_target_version'])
    self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
    kio.acquire_vm(KFDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)

    if KFDDevice.event_page is None:
      KFDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
      KFDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
      sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=KFDDevice.event_page.handle, auto_reset=1)
    else:
      self._gpu_map(KFDDevice.signals_page)
      self._gpu_map(KFDDevice.event_page)
      sync_event = kio.create_event(KFDDevice.kfd, auto_reset=1)

    self.completion_signal = KFDDevice._get_signal(self.device_id*2, sync_event=sync_event)
    self.signal_sdma = KFDDevice._get_signal(self.device_id*2+1, sync_event=kio.create_event(KFDDevice.kfd, auto_reset=1))

    self.gart_aql = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    self.eop_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
    self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
    self.kernargs_ptr = self.kernargs.va_addr
    self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)

    # AQL Queue
    self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart_aql.va_addr)
    self.amd_aql_queue.write_dispatch_id = 0
    self.amd_aql_queue.read_dispatch_id = 0
    self.amd_aql_queue.read_dispatch_id_field_base_byte_offset = getattr(hsa.amd_queue_t, 'read_dispatch_id').offset
    self.amd_aql_queue.queue_properties = hsa.AMD_QUEUE_PROPERTIES_IS_PTR64 | hsa.AMD_QUEUE_PROPERTIES_ENABLE_PROFILING

    self.amd_aql_queue.max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
    self.amd_aql_queue.max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1

    # scratch setup
    self.max_private_segment_size = 4096
    wave_scratch_len = round_up(((self.amd_aql_queue.max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
    self.scratch_len = (self.amd_aql_queue.max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
    self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
    self.amd_aql_queue.scratch_backing_memory_location = self.scratch.va_addr
    self.amd_aql_queue.scratch_backing_memory_byte_size = self.scratch_len
    self.amd_aql_queue.scratch_wave64_lane_byte_size = self.max_private_segment_size * (self.amd_aql_queue.max_wave_id + 1) // 64
    self.amd_aql_queue.scratch_resource_descriptor[0] = self.scratch.va_addr & 0xFFFFFFFF
    self.amd_aql_queue.scratch_resource_descriptor[1] = ((self.scratch.va_addr >> 32) & 0xFFFF) | (1 << 30) # va_hi | SWIZZLE_ENABLE
    self.amd_aql_queue.scratch_resource_descriptor[2] = self.scratch_len & 0xFFFFFFFF
    self.amd_aql_queue.scratch_resource_descriptor[3] = 0x20814fac # FORMAT=BUF_FORMAT_32_UINT,OOB_SELECT=2,ADD_TID_ENABLE=1,TYPE=SQ_RSRC_BUF,SQ_SELs
    engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine']
    self.amd_aql_queue.compute_tmpring_size = (wave_scratch_len // 256) << 12 | (self.scratch_len // (wave_scratch_len * engines))

    self.aql_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.aql_ring.va_addr, ring_size=self.aql_ring.size, gpu_id=self.gpu_id,
      queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
      eop_buffer_address=self.eop_buffer.va_addr, eop_buffer_size=self.eop_buffer.size,
      ctx_save_restore_address=self.ctx_save_restore_address.va_addr, ctx_save_restore_size=self.ctx_save_restore_address.size,
      ctl_stack_size = 0xa000,
      write_pointer_address=self.gart_aql.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset,
      read_pointer_address=self.gart_aql.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset)

    self.doorbells_base = self.aql_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
    self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
    self.aql_doorbell = to_mv(self.doorbells + self.aql_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
    self.aql_doorbell_value = 0

    # SDMA Queue
    self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
      queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
      write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr+8)

    self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
    self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
    self.sdma_doorbell = to_mv(self.doorbells + self.sdma_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
    self.sdma_doorbell_value = 0

    # PM4 stuff
    self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    pm4_indirect_cmd = (ctypes.c_uint32*13)(amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), self.pm4_indirect_buf.va_addr & 0xffffffff,
                                            (self.pm4_indirect_buf.va_addr>>32) & 0xffffffff, 8 | amd_gpu.INDIRECT_BUFFER_VALID, 0xa)
    ctypes.memmove(ctypes.addressof(pm4_cmds:=(ctypes.c_uint16*27)(1))+2, ctypes.addressof(pm4_indirect_cmd), ctypes.sizeof(pm4_indirect_cmd))
    self.pm4_packet = hsa.hsa_ext_amd_aql_pm4_packet_t(header=VENDOR_HEADER, pm4_command=pm4_cmds,
                                                       completion_signal=hsa.hsa_signal_t(ctypes.addressof(self.completion_signal)))

    super().__init__(device, KFDAllocator(self), KFDCompiler(self.arch), functools.partial(KFDProgram, self))

  def _submit_sdma(self, dest, src, copy_size, wait_signals=None, completion_signal=None):
    q = HWCopyQueue()
    if wait_signals is not None:
      # NOTE: we check only low 32 bits to be zeroed, we don't use higher values for signals
      for sig in wait_signals: q.wait(ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'value').offset)
    if completion_signal is not None: q.timestamp(ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'start_ts').offset)
    q.copy(dest, src, copy_size)
    if completion_signal is not None: q.timestamp(ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'end_ts').offset)
    if completion_signal is not None: q.signal(completion_signal)
    q.submit(self)

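  # Flush/invalidate the selected GPU caches by writing a PM4 ACQUIRE_MEM command into the indirect
  # buffer and submitting the prebuilt vendor-specific AQL packet that points at it.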
  def _submit_cache_inv(self, addr=0x0, sz=(1 << 64)-1, gli=0, glv=0, glk=0, gl1=0, gl2=0):
    pm4_buffer_view = to_mv(self.pm4_indirect_buf.va_addr, 0x1000).cast("I")
    pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0,
               sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
    for i, value in enumerate(pm4_cmd): pm4_buffer_view[i] = value
    q = HWComputeQueue()
    q.q.append(self.pm4_packet)
    q.submit(self)
    self._wait_signal(self.completion_signal)
    assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"

  def synchronize(self):
    HWComputeQueue().signal(self.completion_signal).submit(self)
    self._wait_signal(self.completion_signal)
    assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"

    # reset kernargs
    self.kernargs_ptr = self.kernargs.va_addr