mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-02-19 02:44:40 -05:00
* connect to gpu
* rlc init?
* gfx comp start init
* early init is hardcoded, some progress with fw
* gart
* progress, next mqd
* ring setup, still does not execute anything
* ugh write correct reg
* pci2: vm
* pci2: start psp
* vm seems to work
* pci2: gfx start
* pci2: fix psp ring resp
* pci2: try ring
* pci2: mes and some fixes
* pci2: some progress
* pci2: progress
* pci2: mm
* pci2: discovery
* pci2: correct apertures
* pci2: b
* pci2: i
* pci2: l
* pci2: o
* pci2: cmu
* pci2: mes_kiq works
* pci2: mes
* pci2: kcq does not work(
* pci2: unhalt gfx
* ops_am
* minor
* check if amdgpu is there, or we will crash
* bring back graph, it just works
* less prints
* do not init mes (not used)
* remove unused files
* ops_am: start move into core
* ops_am: works
* clcks, but still slower
* faster + no mes_kiq
* vm frags + remove mes
* cleanup fw
* gmc tiny cleanup
* move to ops_amd
* comment out what we dont really need
* driverless
* close in speed
* am clean most of ips
* gmc to ips
* cleaner
* new vm walker
* comment old one
* remove unused autogens
* last write ups
* remove psp hardcoded values
* more
* add logs
* ih
* p2p and sdma
* vfio hal and interrupts
* smth
* amd dev iface
* minor after rebase
* bind for sdma
* Revert "bind for sdma"
This reverts commit a90766514d.
* tmp
* debug new mm
* ugh, allreduce hangs fixed
* p1
* works
* no pci.py
* cleaner a bit
* smth
* tiny cleanups
* cleaner a bit
* pciiface
* linter
* linter 2
* linter 3
* linter
* pylint
* reverted unrelated changes
* unrelated
* cmp tool
* ugh wrong fw
* clockgating
* unrelated
* alloc smaller chunks
* this
* opt sigs
* collect stat
* ops
* upd
* proclogs
* proclogs2
* vfio
* ruff
* linter pylint
* oops
* mypy p1
* mem fix
* mypy p2
* mypy p3
* mypy p4
* correct
* minor
* more tests
* linter in tests
* pci_regs header
* minor write up
* setup
* do not require libs
---------
Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
147 lines
6.4 KiB
Python
147 lines
6.4 KiB
Python
import pathlib, re, ctypes, mmap, collections, functools, copy
|
|
import tinygrad.runtime.autogen.kfd as kfd
|
|
from tinygrad.helpers import from_mv
|
|
from test.mockgpu.driver import VirtDriver, VirtFileDesc, TextFileDesc, DirFileDesc, VirtFile
|
|
from test.mockgpu.amd.amdgpu import AMDGPU, gpu_props
|
|
|
|
# `import ctypes` alone does not load the `ctypes.util` submodule; import it explicitly so
# find_library() does not depend on some other module having imported it first.
import ctypes.util

# Handle to the C library, used to issue real mmap() calls on behalf of emulated device files.
libc = ctypes.CDLL(ctypes.util.find_library("c"))
# Declare the mmap prototype (addr, length, prot, flags, fd, offset) so ctypes converts arguments
# correctly and returns the result as a pointer-sized value.
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p
|
|
|
|
def ioctls_from_header():
  """Build the KFD ioctl lookup tables by parsing ``kfd_ioctl.h``.

  Returns a pair ``(ioctls, headers)``:
    * ``ioctls`` is a ``KFD_IOCTLS`` class whose attributes map each ``AMDKFD_IOC_*`` name to its ioctl number.
    * ``headers`` is a dict mapping each ioctl number to the ``struct_<name>`` type looked up in the ``kfd`` module.
  """
  # the header lives under extra/hip_gpu_driver, four directory levels above this file
  base_dir = pathlib.Path(__file__).parent.parent.parent.parent
  header_path = base_dir / "extra" / "hip_gpu_driver" / "kfd_ioctl.h"
  # drop backslash-newline continuations so every #define sits on a single line
  header_text = header_path.read_text().replace("\\\n", "")

  define_re = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'

  nr_by_name, struct_by_nr = {}, {}
  for ioctl_name, _direction, nr_hex, struct_name in re.findall(define_re, header_text, re.MULTILINE):
    ioctl_nr = int(nr_hex, 0x10)
    nr_by_name[ioctl_name] = ioctl_nr
    struct_by_nr[ioctl_nr] = getattr(kfd, "struct_" + struct_name)

  return type("KFD_IOCTLS", (object, ), nr_by_name), struct_by_nr
|
|
# Parsed once at import time: kfd_ioctls maps AMDKFD_IOC_* names to ioctl numbers (as class attributes),
# kfd_headers maps each ioctl number to the ctypes struct type of its argument.
kfd_ioctls, kfd_headers = ioctls_from_header()
|
|
|
|
class KFDFileDesc(VirtFileDesc):
  """Virtual file descriptor that routes ioctls to the owning driver's ``kfd_ioctl``."""

  def __init__(self, fd, driver):
    super().__init__(fd)
    # driver whose kfd_ioctl() services every ioctl issued on this descriptor
    self.driver = driver

  def ioctl(self, fd, request, argp):
    """Forward the ioctl to the driver; ``fd`` is not used."""
    return self.driver.kfd_ioctl(request, argp)

  def mmap(self, start, sz, prot, flags, fd, offset):
    """Return ``offset`` unchanged as the mapping address; no memory is mapped here."""
    return offset
|
|
|
|
class DRMFileDesc(VirtFileDesc):
  """Virtual file descriptor whose mmap is backed by anonymous memory."""

  def __init__(self, fd, driver, gpu):
    super().__init__(fd)
    self.driver = driver
    self.gpu = gpu

  def mmap(self, start, sz, prot, flags, fd, offset):
    """Map ``sz`` bytes of anonymous memory via libc, ignoring the caller's ``fd`` and ``offset``."""
    anon_flags = flags | mmap.MAP_ANONYMOUS
    return libc.mmap(start, sz, prot, anon_flags, -1, 0)
|
|
|
|
class AMDDriver(VirtDriver):
  """Mock AMD driver: registers virtual KFD/DRM files and services KFD ioctls for emulated GPUs."""

  def __init__(self, gpus=6):
    """Register the KFD device and topology directory, then set up ``gpus`` emulated GPUs."""
    super().__init__()

    self.tracked_files += [
      VirtFile('/dev/kfd', functools.partial(KFDFileDesc, driver=self)),
      VirtFile('/sys/devices/virtual/kfd/kfd/topology/nodes',
               functools.partial(DirFileDesc, child_names=[str(idx) for idx in range(gpus)])),
    ]

    # gpu_id -> AMDGPU instance
    self.gpus = {}
    # monotonically increasing counters for fds, memory handles and event slots
    self.next_fd = (1 << 30)
    self.next_handle = 1
    self.next_event = 1

    # handle -> copy of the alloc-memory ioctl struct that created it
    self.object_by_handle = {}
    # gpu_id -> doorbell backing buffer, and gpu_id -> index of the next free doorbell in it
    self.doorbells = {}
    self.next_doorbell = collections.defaultdict(int)

    for gpu_id in range(gpus):
      self._prepare_gpu(gpu_id)

  def _alloc_fd(self):
    """Return the next unused virtual fd number."""
    fd = self.next_fd
    self.next_fd += 1
    return fd

  def _alloc_handle(self):
    """Return the next unused memory handle."""
    allocated = self.next_handle
    self.next_handle = allocated + 1
    return allocated

  def _alloc_next_event_slot(self):
    """Return the next unused event slot index."""
    slot = self.next_event
    self.next_event = slot + 1
    return slot

  def _alloc_doorbell(self, gpu_id):
    """Return the address of the next free 8-byte doorbell in ``gpu_id``'s doorbell buffer."""
    base_addr = ctypes.addressof(from_mv(self.doorbells[gpu_id]))
    index = self.next_doorbell[gpu_id]
    self.next_doorbell[gpu_id] = index + 1
    return base_addr + index * 8

  def _prepare_gpu(self, gpu_id):
    """Create the emulated GPU ``gpu_id`` and register its sysfs topology and render node files."""
    self.doorbells[gpu_id] = memoryview(bytearray(0x2000))
    self.gpus[gpu_id] = AMDGPU(gpu_id)

    node_dir = f'/sys/devices/virtual/kfd/kfd/topology/nodes/{gpu_id}'
    properties_text = functools.partial(TextFileDesc, text=gpu_props.format(drm_render_minor=gpu_id))
    self.tracked_files += [
      VirtFile('/sys/module/amdgpu', functools.partial(TextFileDesc, text="1")),
      VirtFile(node_dir, functools.partial(DirFileDesc, child_names=['gpu_id', 'properties'])),
      VirtFile(f'{node_dir}/gpu_id', functools.partial(TextFileDesc, text=f"{gpu_id}")),
      VirtFile(f'{node_dir}/properties', properties_text),
      VirtFile(f'/dev/dri/renderD{gpu_id}', functools.partial(DRMFileDesc, driver=self, gpu=f"{self.gpus[gpu_id]}")),
    ]

  def open(self, name, flags, mode, virtfile):
    """Instantiate the file's descriptor class on a freshly allocated fd."""
    new_fd = self._alloc_fd()
    return virtfile.fdcls(new_fd)

  def _apply_to_devices(self, struct, method_name):
    """Call ``gpu.<method_name>(va_addr, size)`` for every device listed in a map/unmap ioctl struct.

    ``n_success`` is updated after each device so it counts the devices processed so far.
    """
    dev_ids = (ctypes.c_int32 * struct.n_devices).from_address(struct.device_ids_array_ptr)
    for processed, dev_id in enumerate(dev_ids, start=1):
      gpu = self.gpus[dev_id]
      mem_obj = self.object_by_handle[struct.handle]
      getattr(gpu, method_name)(mem_obj.va_addr, mem_obj.size)
      struct.n_success = processed

  def kfd_ioctl(self, req, argp):
    """Emulate one KFD ioctl.

    ``req`` is the ioctl request code (its low byte selects the ioctl) and ``argp`` is the address of the
    argument struct, which is read and updated in place. Returns 0 on success, or -1 when an alloc-memory
    request names an unknown gpu_id. Unhandled ioctls abort via assert / exit(1).
    """
    nr = req & 0xFF
    struct = kfd_headers[nr].from_address(argp)

    if nr == kfd_ioctls.AMDKFD_IOC_ACQUIRE_VM:
      return 0

    if nr == kfd_ioctls.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
      if struct.gpu_id not in self.gpus: return -1
      struct.handle = self._alloc_handle()
      # keep a copy of the request so later map/unmap calls know the address range behind the handle
      self.object_by_handle[struct.handle] = copy.deepcopy(struct)
      return 0

    if nr == kfd_ioctls.AMDKFD_IOC_FREE_MEMORY_OF_GPU:
      self.object_by_handle.pop(struct.handle)
      return 0

    if nr == kfd_ioctls.AMDKFD_IOC_MAP_MEMORY_TO_GPU:
      self._apply_to_devices(struct, "map_range")
      return 0

    if nr == kfd_ioctls.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU:
      self._apply_to_devices(struct, "unmap_range")
      return 0

    if nr == kfd_ioctls.AMDKFD_IOC_CREATE_EVENT:
      struct.event_slot_index = self._alloc_next_event_slot()
      struct.event_id = struct.event_slot_index
      return 0

    if nr == kfd_ioctls.AMDKFD_IOC_CREATE_QUEUE:
      gpu = self.gpus[struct.gpu_id]
      if struct.queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA: add_queue = gpu.add_sdma_queue
      elif struct.queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE: add_queue = gpu.add_pm4_queue
      else: raise RuntimeError("Unsuported, queue")
      add_queue(struct.ring_base_address, struct.ring_size, struct.read_pointer_address, struct.write_pointer_address)

      # hand out a doorbell and watch it: the second callback runs the emulated queues
      struct.doorbell_offset = self._alloc_doorbell(struct.gpu_id)
      doorbell_start = struct.doorbell_offset
      self.track_address(doorbell_start, doorbell_start + 8, lambda mv,off: None, lambda mv, off: self._emulate_execute())
      return 0

    if nr == kfd_ioctls.AMDKFD_IOC_WAIT_EVENTS:
      return 0

    # known to the header but not emulated here: report it by name (the last matching attribute wins)
    matching = [k for k, v in kfd_ioctls.__dict__.items() if nr == v]
    name = matching[-1] if matching else "unknown"
    assert False, f"unknown kfd ioctl, {nr} {name}"
    # only reached when asserts are stripped (python -O)
    exit(1)

  def _emulate_execute(self):
    """Execute queues whose read and write pointers differ until a full pass advances no read pointer."""
    while True:
      progressed = False
      for gpu in self.gpus.values():
        for queue in gpu.queues:
          rptr_before = queue.rptr[0]
          if rptr_before == queue.wptr[0]: continue
          queue.execute()
          if rptr_before != queue.rptr[0]: progressed = True
      if not progressed: break
|