Minor fixes after rebase

This commit is contained in:
nimlgen
2024-12-08 13:58:47 -08:00
parent 61127888f8
commit 68d582e88e
4 changed files with 75 additions and 47 deletions

View File

@@ -35,7 +35,9 @@ class HCQGraph(MultiGraphRunner):
for j,ji in enumerate(jit_cache):
if not isinstance(ji.prg, CompiledRunner): continue
self.ji_args[j] = ji.prg._prg.fill_kernargs(self.hcq_bufs[j], ji.prg.p.vars, kargs_alloc[ji.prg.dev].alloc(ji.prg._prg.kernargs_alloc_size, 16))
gpu_addr = kargs_alloc[ji.prg.dev].alloc(ji.prg._prg.kernargs_alloc_size, 16)
cpu_addr = gpu_addr - self.kernargs_bufs[ji.prg.dev].va_addr + self.kernargs_bufs[ji.prg.dev].cpu_addr
self.ji_args[j] = ji.prg._prg.fill_kernargs(self.hcq_bufs[j], ji.prg.p.vars, (gpu_addr, cpu_addr))
# Schedule Dependencies.
# There are two types of queues on each device: copy and compute. Both must synchronize with all external operations before launching any

View File

@@ -97,7 +97,7 @@ class AMDComputeQueue(HWQueue):
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
user_regs += [*data64_le(dp_addr)]
user_regs += [*data64_le(args_state.ptr)]
user_regs += [*data64_le(args_state.gpu_ptr)]
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
@@ -259,23 +259,23 @@ class AMDProgram(HCQProgram):
def __del__(self):
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
class AMDPciBuffer(HCQBuffer):
def __init__(self, va_addr, size, cpu_addr, owner, vm=None):
self.adev, self.va_addr, self.size, self.cpu_addr, self.vm = owner, va_addr, size, cpu_addr, vm
# class AMDPciBuffer(HCQBuffer):
# def __init__(self, va_addr, size, cpu_addr, owner, vm=None):
# self.adev, self.va_addr, self.size, self.cpu_addr, self.vm = owner, va_addr, size, cpu_addr, vm
class AMDDriverAllocator(HCQAllocator['AMDDevice']):
def __init__(self, dev:AMDDevice): super().__init__(dev, batch_size=SDMA_MAX_COPY_SIZE)
def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
if options.host: return self.dev.dev_iface.alloc(size, host=True)
if options.cpu_access and options.uncached: return self.dev.dev_iface.alloc(size, uncached=True)
if options.cpu_access and options.uncached: return self.dev.dev_iface.alloc(size, uncached=True, cpu_access=options.cpu_access)
return self.dev.dev_iface.alloc(size, cpu_access=options.cpu_access)
def _free(self, opaque, options:BufferSpec):
self.dev.synchronize()
self.dev.dev_iface.free(opaque)
def map(self, buf:HCQBuffer): self.dev.dev_iface._map(buf._base if buf._base is not None else buf)
def map(self, buf:HCQBuffer): self.dev.dev_iface.map(buf._base if buf._base is not None else buf)
MAP_FIXED, MAP_NORESERVE, MAP_LOCKED = 0x10, 0x400, 0x2000
@@ -410,27 +410,30 @@ class VFIOIface:
if os.path.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
with open(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", 'w') as f: f.write(self.pcibus)
with open(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", 'w') as f: f.write("15")
with open(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", 'w') as f: f.write("vfio-pci")
with open(f"/sys/bus/pci/drivers_probe", 'w') as f: f.write(self.pcibus)
libpciaccess.pci_device_probe(ctypes.byref(self.pcidev))
iommu_group = os.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
if getenv("VFIO", 1):
with open(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", 'w') as f: f.write("vfio-pci")
with open(f"/sys/bus/pci/drivers_probe", 'w') as f: f.write(self.pcibus)
self.vfio_group = os.open(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(VFIOIface.vfio_fd))
iommu_group = os.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
vfio.VFIO_SET_IOMMU(VFIOIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
self.vfio_dev = vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, (ctypes.c_char * (len(self.pcibus) + 1))(*bytearray(self.pcibus.encode() + b'\0')))
self.vfio_group = os.open(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(VFIOIface.vfio_fd))
self.irq_fd = os.eventfd(0, 0)
self.irq_poller = select.poll()
self.irq_poller.register(self.irq_fd, select.POLLIN)
if dev_id == 0:
vfio.VFIO_SET_IOMMU(VFIOIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
self.vfio_dev = vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, (ctypes.c_char * (len(self.pcibus) + 1))(*bytearray(self.pcibus.encode() + b'\0')))
irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd))
vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
self.irq_fd = os.eventfd(0, 0)
self.irq_poller = select.poll()
self.irq_poller.register(self.irq_fd, select.POLLIN)
# libpciaccess.pci_device_probe(ctypes.byref(self.pcidev))
# libpciaccess.pci_device_enable(ctypes.byref(self.pcidev))
# irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
# argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd))
# vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
else: libpciaccess.pci_device_enable(ctypes.byref(self.pcidev))
self.adev = AMDev(self.pcidev, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
self.doorbell_cpu_addr = mv_address(dbell)
@@ -440,6 +443,11 @@ class VFIOIface:
'array_count': 12, 'simd_arrays_per_engine': 2, 'lds_size_in_kb': 64}
def _map_pci_range(self, bar):
if not getenv("VFIO", 1):
libpciaccess.pci_device_map_range(ctypes.byref(self.pcidev), self.pcidev.regions[bar].base_addr, size:=self.pcidev.regions[bar].size,
libpciaccess.PCI_DEV_MAP_FLAG_WRITABLE, ctypes.byref(pcimem:=ctypes.c_void_p()))
return to_mv(pcimem, size)
vfio.VFIO_DEVICE_GET_REGION_INFO(self.vfio_dev, reg:=vfio.struct_vfio_region_info(argsz=ctypes.sizeof(vfio.struct_vfio_region_info), index=bar))
return to_mv(libc.mmap(0, reg.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, self.vfio_dev, reg.offset), reg.size)
@@ -454,15 +462,27 @@ class VFIOIface:
vaddr = self.adev.mm.alloc_vaddr(size)
for off in range(0, size, mmap.PAGESIZE):
self.adev.mm.map_range(vaddr=vaddr + off, paddr=read_pagemap(va + off), size=0x1000, system=True, snooped=True, uncached=True)
return AMDPciBuffer(vaddr + self.adev.gmc.vm_base, size, va, self, None)
return AMDPciBuffer((vm:=self.adev.mm.valloc(round_up(size, 0x1000), uncached=uncached)).vaddr, vm.size, vm.cpu_addr(), vm)
return HCQBuffer(vaddr + self.adev.gmc.vm_base, size, cpu_addr=va, meta=(self.dev, None))
vm = self.adev.mm.valloc(round_up(size, 0x1000), uncached=uncached)
# print(cpu_access, vm.cpu_addr())
return HCQBuffer(vm.vaddr, vm.size, cpu_addr=vm.cpu_addr() if cpu_access else None, meta=(self.dev, vm))
def free(self, mem): pass
def map(self, mem): pass
def map(self, mem):
owner, vm = mem.meta
# print(self.dev, owner, owner.dev_iface.pcidev.regions[0].base_addr, self.pcidev.regions[0].base_addr)
if owner == self.dev or self.dev in getattr(mem.meta[1], "mapped_gpu_ids", []): return
mem.meta[1].__setattr__("mapped_gpu_ids", getattr(mem.meta[1], "mapped_gpu_ids", []) + [self.dev])
peer_address = vm.paddr + owner.dev_iface.pcidev.regions[0].base_addr
self.adev.mm.map_range(vaddr=vm.ptable_vaddr, paddr=peer_address, size=vm.size, system=True, snooped=vm.snooped, uncached=vm.uncached)
def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
mqd = self.alloc(0x1000, uncached=True)
mqd = self.alloc(0x1000, uncached=True, cpu_access=True)
if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
doorbell_index = 0x100 # 0x100 is the first doorbell index for SDMA
@@ -499,8 +519,9 @@ class VFIOIface:
doorbell=to_mv(self.doorbell_cpu_addr + doorbell_index * 8, 8).cast("Q"))
def sleep(self, timeout):
self.irq_poller.poll(timeout)
os.read(self.irq_fd, 1024)
if getenv("VFIO", 1):
x = self.irq_poller.poll(timeout)
if len(x): os.read(self.irq_fd, 1024)
class AMDDevice(HCQCompiled):
driverless:bool = False
@@ -520,7 +541,7 @@ class AMDDevice(HCQCompiled):
# TODO: consider moving this out of here.
if AMDDevice.signals_page is None:
AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, uncached=True)
AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, uncached=True, cpu_access=True)
AMDDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
else: self.dev_iface.map(AMDDevice.signals_page)
@@ -555,8 +576,8 @@ class AMDDevice(HCQCompiled):
AMDSignal, AMDComputeQueue, AMDCopyQueue)
def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
ring = self.dev_iface.alloc(ring_size, uncached=True)
gart = self.dev_iface.alloc(0x1000, uncached=True)
ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
eop_buffer = self.dev_iface.alloc(eop_buffer_size) if eop_buffer_size else None
return self.dev_iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, debug_memory_size=debug_memory_size,
ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)

View File

@@ -25,8 +25,9 @@ class ScatterList(Generic[PhysicalMemoryBlockType]):
def __iter__(self): return iter(self.blocks)
class VirtualMapping(GPUPhysicalMemoryBlock):
def __init__(self, adev, ptable_vaddr, paddr, size):
def __init__(self, adev, ptable_vaddr, paddr, size, uncached=False, system=False, snooped=False):
self.vaddr, self.ptable_vaddr = ptable_vaddr + adev.gmc.vm_base, ptable_vaddr
self.uncached, self.system, self.snooped = uncached, system, snooped
super().__init__(adev, paddr, size)
# TODO: Complete + tests
@@ -65,10 +66,11 @@ class AMPageTableEntry:
def get_entry(self, entry_id): return self.view[entry_id]
class MM:
next_vaddr:int = 0
def __init__(self, adev, vram_size:int):
self.adev, self.vram_size = adev, vram_size
self.phys_allocator = PhysicalAllocator(adev, vram_size)
self.next_vaddr = 0
self.root_page_table = AMPageTableEntry(self.palloc(0x1000, zero=True), lv=am.AMDGPU_VM_PDB1)
def page_table_walker(self, page_table, vaddr, size, offset=0, free_pt=False) -> Generator[Tuple[int, int, int, int], None, None]:
@@ -129,7 +131,7 @@ class MM:
self.adev.gmc.flush_tlb(ip="GC", vmid=0)
self.adev.gmc.flush_tlb(ip="MM", vmid=0)
return VirtualMapping(self.adev, vaddr, paddr, size)
return VirtualMapping(self.adev, vaddr, paddr, size, uncached=uncached, system=system, snooped=snooped)
def unmap_range(self, vaddr:int, size:int):
for va, off, pte_st_idx, n_ptes, pte_covers, page_table in self.page_table_walker(self.root_page_table, vaddr, size, free_pt=True):
@@ -139,15 +141,15 @@ class MM:
def alloc_vaddr(self, size:int, align=0x1000) -> int:
size = round_up(size, 0x1000)
align = (1 << size.bit_length())
# TODO: is this needed here?
for i in range(31):
if (1 << i) <= size: align = (1 << i)
# for i in range(31):
# if (1 << i) <= size: align = (1 << i)
addr = round_up(self.next_vaddr, align)
self.next_vaddr = addr + size
addr = round_up(MM.next_vaddr, align)
MM.next_vaddr = addr + size
assert self.next_vaddr <= self.adev.gmc.vm_end
assert MM.next_vaddr <= self.adev.gmc.vm_end
return addr
def valloc(self, size:int, align=0x1000, uncached=False) -> VirtualMapping:

View File

@@ -236,20 +236,20 @@ def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Optional[Type[HWQueue
if enabled and PROFILE: dev.sig_prof_records.append((cast(HCQSignal, st), cast(HCQSignal, en), desc, queue_type is dev.hw_copy_queue_t))
class HCQArgsState(Generic[ProgramType]):
def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=()):
self.ptr, self.prg = ptr, prg
def __init__(self, gpu_ptr:int, cpu_ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=()):
self.gpu_ptr, self.cpu_ptr, self.prg = gpu_ptr, cpu_ptr, prg
self.bind_data:List[Tuple[Tuple[sint, ...], int, str]] = []
def bind_sints_to_ptr(self, *vals:sint, ptr:int, fmt): self.bind_data.append((vals, ptr, fmt))
class CLikeArgsState(HCQArgsState[ProgramType]):
def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=(), prefix:Optional[List[int]]=None):
super().__init__(ptr, prg, bufs, vals=vals)
def __init__(self, gpu_ptr:int, cpu_ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=(), prefix:Optional[List[int]]=None):
super().__init__(gpu_ptr, cpu_ptr, prg, bufs, vals=vals)
if prefix is not None: to_mv(self.cpu_ptr, len(prefix) * 4).cast('I')[:] = array.array('I', prefix)
self.bind_sints_to_ptr(*[b.va_addr for b in bufs], ptr=self.ptr + len(prefix or []) * 4, fmt='Q')
self.bind_sints_to_ptr(*vals, ptr=self.ptr + len(prefix or []) * 4 + len(bufs) * 8, fmt='I')
self.bind_sints_to_ptr(*[b.va_addr for b in bufs], ptr=self.cpu_ptr + len(prefix or []) * 4, fmt='Q')
self.bind_sints_to_ptr(*vals, ptr=self.cpu_ptr + len(prefix or []) * 4 + len(bufs) * 8, fmt='I')
class HCQProgram(Generic[DeviceType]):
def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int):
@@ -265,7 +265,10 @@ class HCQProgram(Generic[DeviceType]):
Returns:
Arguments state with the given buffers and values set for the program.
"""
return self.args_state_t(kernargs_ptr or self.dev.kernargs_alloctor.alloc(self.kernargs_alloc_size), self, bufs, vals=vals)
if kernargs_ptr is None:
gpu_ptr, cpu_ptr = (x:=self.dev.kernargs_alloctor.alloc(self.kernargs_alloc_size)), x - self.dev.kernargs_page.va_addr + self.dev.kernargs_page.cpu_addr
else: gpu_ptr, cpu_ptr = kernargs_ptr
return self.args_state_t(gpu_ptr, cpu_ptr, self, bufs, vals=vals)
def __call__(self, *bufs:HCQBuffer, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1),
vals:Tuple[int, ...]=(), wait:bool=False) -> Optional[float]: