mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-07 03:00:26 -04:00
autogen cleanup (#6064)
* start autogen cleanup * nvgpu * better? * better * amd part * gpu regen * fix mockgpu amd * nv * amd fix linter * remove import * ugh * nv on master * amd on master
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
from typing import Tuple, List, Any, cast
|
||||
import os, fcntl, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal
|
||||
import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, \
|
||||
HCQSignal, HCQProgram, BufferOptions
|
||||
@@ -16,14 +16,6 @@ def is_usable_gpu(gpu_id):
|
||||
with contextlib.suppress(OSError): return int(pathlib.Path(gpu_id).read_text()) != 0
|
||||
return False
|
||||
|
||||
def kfd_ioctl(idir, nr, user_struct, fd, **kwargs):
|
||||
ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made := user_struct(**kwargs))<<16) | (ord('K')<<8) | nr, made)
|
||||
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
|
||||
return made
|
||||
|
||||
kio:Any = type("KIO", (object,), {name[11:].lower(): functools.partial(kfd_ioctl, {"IOW": 1, "IOR": 2, "IOWR": 3}[p[0]], p[1], p[2])
|
||||
for name,p in kfd.__dict__.items() if name.startswith("AMDKFD_IOC_")})
|
||||
|
||||
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
|
||||
|
||||
# VGT_EVENT_TYPE in navi10_enum.h
|
||||
@@ -42,7 +34,7 @@ class AMDSignal(HCQSignal):
|
||||
self._signal = AMDDevice.signals_pool.pop()
|
||||
self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8
|
||||
if alloc_event:
|
||||
sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
|
||||
sync_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1)
|
||||
self._event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
|
||||
self._event_id = sync_event.event_id
|
||||
self._evt_array = (kfd.struct_kfd_event_data)(event_id=self._event_id)
|
||||
@@ -59,7 +51,7 @@ class AMDSignal(HCQSignal):
|
||||
|
||||
# Wait active for 5s, then going to sleep.
|
||||
if time_spent > 5000 and self._event_id != 0:
|
||||
kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
|
||||
kfd.AMDKFD_IOC_WAIT_EVENTS(AMDDevice.kfd, events_ptr=ctypes.addressof(self._evt_array), num_events=1, wait_for_all=1, timeout=1000)
|
||||
raise RuntimeError(f"wait_signal: not set to {value}, but {self._signal[0]}, {timeout} ms TIMEOUT!")
|
||||
|
||||
class AMDComputeQueue(HWComputeQueue):
|
||||
@@ -338,7 +330,8 @@ class AMDDevice(HCQCompiled):
|
||||
if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
|
||||
mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
|
||||
c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
|
||||
stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(mem.mapped_gpu_ids))
|
||||
stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
|
||||
n_devices=len(mem.mapped_gpu_ids))
|
||||
assert stm.n_success == len(mem.mapped_gpu_ids)
|
||||
|
||||
def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
|
||||
@@ -351,7 +344,8 @@ class AMDDevice(HCQCompiled):
|
||||
buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
|
||||
assert addr != 0xffffffffffffffff
|
||||
|
||||
try: mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
|
||||
try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
|
||||
flags=flags, mmap_offset=buf)
|
||||
except OSError as e:
|
||||
if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and public:
|
||||
raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
|
||||
@@ -367,10 +361,10 @@ class AMDDevice(HCQCompiled):
|
||||
def _gpu_free(self, mem):
|
||||
if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
|
||||
c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
|
||||
stm = kio.unmap_memory_from_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
|
||||
stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
|
||||
assert stm.n_success == len(gpus)
|
||||
libc.munmap(mem.va_addr, mem.size)
|
||||
kio.free_memory_of_gpu(self.kfd, handle=mem.handle)
|
||||
kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.handle)
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
if AMDDevice.kfd == -1:
|
||||
@@ -390,13 +384,13 @@ class AMDDevice(HCQCompiled):
|
||||
self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
|
||||
if target < 110000 or target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
|
||||
|
||||
kio.acquire_vm(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
|
||||
kfd.AMDKFD_IOC_ACQUIRE_VM(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
|
||||
|
||||
if AMDDevice.event_page is None:
|
||||
AMDDevice.signals_page = self._gpu_alloc(16 * 65536, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
AMDDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, AMDDevice.signals_page.size, 16)]
|
||||
kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
|
||||
kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
|
||||
else:
|
||||
self._gpu_map(AMDDevice.signals_page)
|
||||
self._gpu_map(AMDDevice.event_page)
|
||||
@@ -422,7 +416,7 @@ class AMDDevice(HCQCompiled):
|
||||
ring = self._gpu_alloc(ring_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
cwsr_ctx = self._gpu_alloc(ctx_save_restore_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if ctx_save_restore_size else None
|
||||
eop_buffer = self._gpu_alloc(eop_buffer_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if eop_buffer_size else None
|
||||
queue = kio.create_queue(AMDDevice.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
|
||||
queue = kfd.AMDKFD_IOC_CREATE_QUEUE(AMDDevice.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
|
||||
queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0,
|
||||
ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=cwsr_ctx.size if cwsr_ctx else 0,
|
||||
|
||||
Reference in New Issue
Block a user