mirror of https://github.com/tinygrad/tinygrad.git

Commit: minor after rebase
@@ -35,7 +35,9 @@ class HCQGraph(MultiGraphRunner):
for j,ji in enumerate(jit_cache):
if not isinstance(ji.prg, CompiledRunner): continue

- self.ji_args[j] = ji.prg._prg.fill_kernargs(self.hcq_bufs[j], ji.prg.p.vars, kargs_alloc[ji.prg.dev].alloc(ji.prg._prg.kernargs_alloc_size, 16))
+ gpu_addr = kargs_alloc[ji.prg.dev].alloc(ji.prg._prg.kernargs_alloc_size, 16)
+ cpu_addr = gpu_addr - self.kernargs_bufs[ji.prg.dev].va_addr + self.kernargs_bufs[ji.prg.dev].cpu_addr
+ self.ji_args[j] = ji.prg._prg.fill_kernargs(self.hcq_bufs[j], ji.prg.p.vars, (gpu_addr, cpu_addr))

# Schedule Dependencies.
# There are two types of queues on each device: copy and compute. Both must synchronize with all external operations before launching any
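Note: the kernargs pointer is split into a GPU address and a CPU address by rebasing the same offset onto two bases, since the kernargs buffer is mapped into both address spaces. A minimal sketch of that rebasing, with stand-in names rather than the real tinygrad classes:

from dataclasses import dataclass

@dataclass
class KernargsBuf:                  # stand-in for the per-device kernargs buffer
  va_addr: int                      # base address in the GPU's virtual address space
  cpu_addr: int                     # base of the same memory mapped into the host process
  size: int

def gpu_to_cpu(buf: KernargsBuf, gpu_addr: int) -> int:
  # both mappings cover the same bytes, so only the base differs
  assert buf.va_addr <= gpu_addr < buf.va_addr + buf.size
  return gpu_addr - buf.va_addr + buf.cpu_addr

buf = KernargsBuf(va_addr=0x700000000000, cpu_addr=0x7f1234560000, size=0x10000)
assert gpu_to_cpu(buf, buf.va_addr + 0x40) == 0x7f1234560040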
@@ -97,7 +97,7 @@ class AMDComputeQueue(HWQueue):
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
user_regs += [*data64_le(dp_addr)]

- user_regs += [*data64_le(args_state.ptr)]
+ user_regs += [*data64_le(args_state.gpu_ptr)]

self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
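Note: the user registers and SET_SH_REG packets take 64-bit addresses as two consecutive 32-bit dwords, low word first. tinygrad ships a data64_le helper for this; the exact definition below is an assumption that only illustrates the split the packets rely on:

def data64_le(v: int) -> tuple:
  # split a 64-bit value into (low dword, high dword); the packet builders above
  # expand this with * so the low word lands in the first register
  return (v & 0xFFFFFFFF, (v >> 32) & 0xFFFFFFFF)

lo, hi = data64_le(0x00007fffdeadbeef)
assert (hi << 32) | lo == 0x00007fffdeadbeef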
@@ -259,23 +259,23 @@ class AMDProgram(HCQProgram):
def __del__(self):
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))

- class AMDPciBuffer(HCQBuffer):
- def __init__(self, va_addr, size, cpu_addr, owner, vm=None):
- self.adev, self.va_addr, self.size, self.cpu_addr, self.vm = owner, va_addr, size, cpu_addr, vm
+ # class AMDPciBuffer(HCQBuffer):
+ # def __init__(self, va_addr, size, cpu_addr, owner, vm=None):
+ # self.adev, self.va_addr, self.size, self.cpu_addr, self.vm = owner, va_addr, size, cpu_addr, vm

class AMDDriverAllocator(HCQAllocator['AMDDevice']):
def __init__(self, dev:AMDDevice): super().__init__(dev, batch_size=SDMA_MAX_COPY_SIZE)

def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
if options.host: return self.dev.dev_iface.alloc(size, host=True)
- if options.cpu_access and options.uncached: return self.dev.dev_iface.alloc(size, uncached=True)
+ if options.cpu_access and options.uncached: return self.dev.dev_iface.alloc(size, uncached=True, cpu_access=options.cpu_access)
return self.dev.dev_iface.alloc(size, cpu_access=options.cpu_access)

def _free(self, opaque, options:BufferSpec):
self.dev.synchronize()
self.dev.dev_iface.free(opaque)

- def map(self, buf:HCQBuffer): self.dev.dev_iface._map(buf._base if buf._base is not None else buf)
+ def map(self, buf:HCQBuffer): self.dev.dev_iface.map(buf._base if buf._base is not None else buf)

MAP_FIXED, MAP_NORESERVE, MAP_LOCKED = 0x10, 0x400, 0x2000
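Note: _alloc now forwards cpu_access to the interface allocator, so only buffers that actually need a host mapping get one. A condensed, hypothetical view of that dispatch (iface_alloc stands in for self.dev.dev_iface.alloc, and BufferSpec is reduced to three booleans):

def _alloc_sketch(size:int, host:bool, uncached:bool, cpu_access:bool, iface_alloc):
  if host: return iface_alloc(size, host=True)
  if cpu_access and uncached: return iface_alloc(size, uncached=True, cpu_access=cpu_access)
  return iface_alloc(size, cpu_access=cpu_access)

# e.g. a signals page takes the uncached + cpu_access branch:
print(_alloc_sketch(0x1000, host=False, uncached=True, cpu_access=True,
                    iface_alloc=lambda size, **kw: (size, kw)))   # (4096, {'uncached': True, 'cpu_access': True})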
@@ -410,27 +410,30 @@ class VFIOIface:
if os.path.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
with open(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", 'w') as f: f.write(self.pcibus)
with open(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", 'w') as f: f.write("15")
- with open(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", 'w') as f: f.write("vfio-pci")
- with open(f"/sys/bus/pci/drivers_probe", 'w') as f: f.write(self.pcibus)

+ libpciaccess.pci_device_probe(ctypes.byref(self.pcidev))

- iommu_group = os.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
+ if getenv("VFIO", 1):
+ with open(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", 'w') as f: f.write("vfio-pci")
+ with open(f"/sys/bus/pci/drivers_probe", 'w') as f: f.write(self.pcibus)

- self.vfio_group = os.open(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
- vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(VFIOIface.vfio_fd))
+ iommu_group = os.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]

- vfio.VFIO_SET_IOMMU(VFIOIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
- self.vfio_dev = vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, (ctypes.c_char * (len(self.pcibus) + 1))(*bytearray(self.pcibus.encode() + b'\0')))
+ self.vfio_group = os.open(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
+ vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(VFIOIface.vfio_fd))

- self.irq_fd = os.eventfd(0, 0)
- self.irq_poller = select.poll()
- self.irq_poller.register(self.irq_fd, select.POLLIN)
if dev_id == 0:
+ vfio.VFIO_SET_IOMMU(VFIOIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
+ self.vfio_dev = vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, (ctypes.c_char * (len(self.pcibus) + 1))(*bytearray(self.pcibus.encode() + b'\0')))

- irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
- argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd))
- vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
+ self.irq_fd = os.eventfd(0, 0)
+ self.irq_poller = select.poll()
+ self.irq_poller.register(self.irq_fd, select.POLLIN)

+ # libpciaccess.pci_device_probe(ctypes.byref(self.pcidev))
+ # libpciaccess.pci_device_enable(ctypes.byref(self.pcidev))
+ # irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
+ # argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd))
+ # vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
+ else: libpciaccess.pci_device_enable(ctypes.byref(self.pcidev))

self.adev = AMDev(self.pcidev, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
self.doorbell_cpu_addr = mv_address(dbell)
@@ -440,6 +443,11 @@ class VFIOIface:
'array_count': 12, 'simd_arrays_per_engine': 2, 'lds_size_in_kb': 64}

def _map_pci_range(self, bar):
+ if not getenv("VFIO", 1):
+ libpciaccess.pci_device_map_range(ctypes.byref(self.pcidev), self.pcidev.regions[bar].base_addr, size:=self.pcidev.regions[bar].size,
+ libpciaccess.PCI_DEV_MAP_FLAG_WRITABLE, ctypes.byref(pcimem:=ctypes.c_void_p()))
+ return to_mv(pcimem, size)
+
vfio.VFIO_DEVICE_GET_REGION_INFO(self.vfio_dev, reg:=vfio.struct_vfio_region_info(argsz=ctypes.sizeof(vfio.struct_vfio_region_info), index=bar))
return to_mv(libc.mmap(0, reg.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, self.vfio_dev, reg.offset), reg.size)
@@ -454,15 +462,27 @@ class VFIOIface:
vaddr = self.adev.mm.alloc_vaddr(size)
for off in range(0, size, mmap.PAGESIZE):
self.adev.mm.map_range(vaddr=vaddr + off, paddr=read_pagemap(va + off), size=0x1000, system=True, snooped=True, uncached=True)
- return AMDPciBuffer(vaddr + self.adev.gmc.vm_base, size, va, self, None)

- return AMDPciBuffer((vm:=self.adev.mm.valloc(round_up(size, 0x1000), uncached=uncached)).vaddr, vm.size, vm.cpu_addr(), vm)
+ return HCQBuffer(vaddr + self.adev.gmc.vm_base, size, cpu_addr=va, meta=(self.dev, None))

+ vm = self.adev.mm.valloc(round_up(size, 0x1000), uncached=uncached)
+ # print(cpu_access, vm.cpu_addr())
+ return HCQBuffer(vm.vaddr, vm.size, cpu_addr=vm.cpu_addr() if cpu_access else None, meta=(self.dev, vm))

def free(self, mem): pass
- def map(self, mem): pass

+ def map(self, mem):
+ owner, vm = mem.meta

+ # print(self.dev, owner, owner.dev_iface.pcidev.regions[0].base_addr, self.pcidev.regions[0].base_addr)
+ if owner == self.dev or self.dev in getattr(mem.meta[1], "mapped_gpu_ids", []): return
+ mem.meta[1].__setattr__("mapped_gpu_ids", getattr(mem.meta[1], "mapped_gpu_ids", []) + [self.dev])

+ peer_address = vm.paddr + owner.dev_iface.pcidev.regions[0].base_addr
+ self.adev.mm.map_range(vaddr=vm.ptable_vaddr, paddr=peer_address, size=vm.size, system=True, snooped=vm.snooped, uncached=vm.uncached)

def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
- mqd = self.alloc(0x1000, uncached=True)
+ mqd = self.alloc(0x1000, uncached=True, cpu_access=True)

if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
doorbell_index = 0x100 # 0x100 is the first doorbell index for SDMA
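Note: mapping another GPU's VRAM works by pointing this GPU's page tables at the owner's BAR, so the peer address is just the block's physical offset inside the owner's VRAM plus the owner's BAR0 bus address. A tiny sketch of that arithmetic (stand-in function, not the real classes):

def peer_bus_address(vram_paddr:int, owner_bar0_base:int) -> int:
  # the owner's VRAM is exposed through its (resized) BAR0, so a block's physical
  # offset inside VRAM plus the owner's BAR0 bus address is reachable over PCIe
  return vram_paddr + owner_bar0_base

assert peer_bus_address(0x1200000, 0x38000000000) == 0x38001200000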
@@ -499,8 +519,9 @@ class VFIOIface:
doorbell=to_mv(self.doorbell_cpu_addr + doorbell_index * 8, 8).cast("Q"))

def sleep(self, timeout):
- self.irq_poller.poll(timeout)
- os.read(self.irq_fd, 1024)
+ if getenv("VFIO", 1):
+ x = self.irq_poller.poll(timeout)
+ if len(x): os.read(self.irq_fd, 1024)

class AMDDevice(HCQCompiled):
driverless:bool = False
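Note: sleep() now only touches the eventfd when the VFIO path is enabled, and only reads it when poll() actually reported readiness (a read on an empty eventfd would block). A self-contained sketch of that wait pattern on Linux with Python 3.10+, with the interrupt simulated by writing to the eventfd directly:

import os, select

irq_fd = os.eventfd(0, 0)            # in the driver this eventfd is wired to the device's MSI via VFIO
poller = select.poll()
poller.register(irq_fd, select.POLLIN)

os.eventfd_write(irq_fd, 1)          # simulate an interrupt firing

events = poller.poll(100)            # timeout in milliseconds
if len(events): os.read(irq_fd, 8)   # drain the counter only if poll reported readiness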
@@ -520,7 +541,7 @@ class AMDDevice(HCQCompiled):

# TODO: think of moving this out.
if AMDDevice.signals_page is None:
- AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, uncached=True)
+ AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, uncached=True, cpu_access=True)
AMDDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
else: self.dev_iface.map(AMDDevice.signals_page)
@@ -555,8 +576,8 @@ class AMDDevice(HCQCompiled):
AMDSignal, AMDComputeQueue, AMDCopyQueue)

def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
- ring = self.dev_iface.alloc(ring_size, uncached=True)
- gart = self.dev_iface.alloc(0x1000, uncached=True)
+ ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
+ gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
eop_buffer = self.dev_iface.alloc(eop_buffer_size) if eop_buffer_size else None
return self.dev_iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, debug_memory_size=debug_memory_size,
ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)
@@ -25,8 +25,9 @@ class ScatterList(Generic[PhysicalMemoryBlockType]):
def __iter__(self): return iter(self.blocks)

class VirtualMapping(GPUPhysicalMemoryBlock):
- def __init__(self, adev, ptable_vaddr, paddr, size):
+ def __init__(self, adev, ptable_vaddr, paddr, size, uncached=False, system=False, snooped=False):
self.vaddr, self.ptable_vaddr = ptable_vaddr + adev.gmc.vm_base, ptable_vaddr
+ self.uncached, self.system, self.snooped = uncached, system, snooped
super().__init__(adev, paddr, size)

# TODO: Complete + tests
@@ -65,10 +66,11 @@ class AMPageTableEntry:
def get_entry(self, entry_id): return self.view[entry_id]

class MM:
+ next_vaddr:int = 0
+
def __init__(self, adev, vram_size:int):
self.adev, self.vram_size = adev, vram_size
self.phys_allocator = PhysicalAllocator(adev, vram_size)
- self.next_vaddr = 0
self.root_page_table = AMPageTableEntry(self.palloc(0x1000, zero=True), lv=am.AMDGPU_VM_PDB1)

def page_table_walker(self, page_table, vaddr, size, offset=0, free_pt=False) -> Generator[Tuple[int, int, int, int], None, None]:
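Note: moving next_vaddr from the instance to the MM class makes the virtual-address bump allocator a single counter shared by every device's MM, so different GPUs hand out non-overlapping vaddrs, presumably so peer mappings can reuse the same addresses. A stripped-down illustration of the class-attribute behaviour (hypothetical class, not the real MM):

class BumpAlloc:
  next_vaddr:int = 0                      # class attribute: one counter shared by every instance

  def alloc(self, size:int, align:int=0x1000) -> int:
    addr = (BumpAlloc.next_vaddr + align - 1) // align * align   # round_up
    BumpAlloc.next_vaddr = addr + size
    return addr

a, b = BumpAlloc(), BumpAlloc()           # e.g. the MMs of two devices
r1, r2 = a.alloc(0x3000), b.alloc(0x1000)
assert r2 >= r1 + 0x3000                  # the ranges never overlap across instances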
@@ -129,7 +131,7 @@ class MM:

self.adev.gmc.flush_tlb(ip="GC", vmid=0)
self.adev.gmc.flush_tlb(ip="MM", vmid=0)
- return VirtualMapping(self.adev, vaddr, paddr, size)
+ return VirtualMapping(self.adev, vaddr, paddr, size, uncached=uncached, system=system, snooped=snooped)

def unmap_range(self, vaddr:int, size:int):
for va, off, pte_st_idx, n_ptes, pte_covers, page_table in self.page_table_walker(self.root_page_table, vaddr, size, free_pt=True):
@@ -139,15 +141,15 @@ class MM:

def alloc_vaddr(self, size:int, align=0x1000) -> int:
size = round_up(size, 0x1000)
align = (1 << size.bit_length())

# TODO: need for here?
- for i in range(31):
- if (1 << i) <= size: align = (1 << i)
+ # for i in range(31):
+ # if (1 << i) <= size: align = (1 << i)

- addr = round_up(self.next_vaddr, align)
- self.next_vaddr = addr + size
+ addr = round_up(MM.next_vaddr, align)
+ MM.next_vaddr = addr + size

- assert self.next_vaddr <= self.adev.gmc.vm_end
+ assert MM.next_vaddr <= self.adev.gmc.vm_end
return addr

def valloc(self, size:int, align=0x1000, uncached=False) -> VirtualMapping:
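Note: the commented-out loop picked the largest power of two not exceeding size, while 1 << size.bit_length() is the smallest power of two strictly greater than size, so a size that is already a power of two gets aligned to twice that. A quick comparison of the two, assuming size was already rounded up to 0x1000:

def align_loop(size:int) -> int:
  # largest power of two not exceeding size (what the commented-out loop computed)
  align = 0x1000
  for i in range(31):
    if (1 << i) <= size: align = (1 << i)
  return align

def align_bitlength(size:int) -> int:
  # the bit_length expression above: smallest power of two strictly greater than size
  return 1 << size.bit_length()

for size in (0x1000, 0x3000, 0x10000):
  print(hex(size), hex(align_loop(size)), hex(align_bitlength(size)))
# 0x1000 -> 0x1000 vs 0x2000, 0x3000 -> 0x2000 vs 0x4000, 0x10000 -> 0x10000 vs 0x20000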
@@ -236,20 +236,20 @@ def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Optional[Type[HWQueue
if enabled and PROFILE: dev.sig_prof_records.append((cast(HCQSignal, st), cast(HCQSignal, en), desc, queue_type is dev.hw_copy_queue_t))

class HCQArgsState(Generic[ProgramType]):
- def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=()):
- self.ptr, self.prg = ptr, prg
+ def __init__(self, gpu_ptr:int, cpu_ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=()):
+ self.gpu_ptr, self.cpu_ptr, self.prg = gpu_ptr, cpu_ptr, prg
self.bind_data:List[Tuple[Tuple[sint, ...], int, str]] = []

def bind_sints_to_ptr(self, *vals:sint, ptr:int, fmt): self.bind_data.append((vals, ptr, fmt))

class CLikeArgsState(HCQArgsState[ProgramType]):
- def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=(), prefix:Optional[List[int]]=None):
- super().__init__(ptr, prg, bufs, vals=vals)
+ def __init__(self, gpu_ptr:int, cpu_ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=(), prefix:Optional[List[int]]=None):
+ super().__init__(gpu_ptr, cpu_ptr, prg, bufs, vals=vals)

if prefix is not None: to_mv(self.cpu_ptr, len(prefix) * 4).cast('I')[:] = array.array('I', prefix)

- self.bind_sints_to_ptr(*[b.va_addr for b in bufs], ptr=self.ptr + len(prefix or []) * 4, fmt='Q')
- self.bind_sints_to_ptr(*vals, ptr=self.ptr + len(prefix or []) * 4 + len(bufs) * 8, fmt='I')
+ self.bind_sints_to_ptr(*[b.va_addr for b in bufs], ptr=self.cpu_ptr + len(prefix or []) * 4, fmt='Q')
+ self.bind_sints_to_ptr(*vals, ptr=self.cpu_ptr + len(prefix or []) * 4 + len(bufs) * 8, fmt='I')

class HCQProgram(Generic[DeviceType]):
def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int):
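Note: with the split, cpu_ptr is where the host writes (and later patches) the packed kernel arguments, while gpu_ptr is what ends up in dispatch packets. A rough sketch of the C-like layout CLikeArgsState binds, using ctypes in place of to_mv (hypothetical helper, not the tinygrad one):

import array, ctypes

def pack_kernargs(cpu_ptr:int, buf_addrs, vals, prefix=()):
  # layout: [optional 32-bit prefix words][8-byte buffer addresses][4-byte int values]
  mv = (ctypes.c_char * 0x1000).from_address(cpu_ptr)     # stand-in for to_mv(cpu_ptr, ...)
  off = 0
  if prefix: mv[off:off+len(prefix)*4] = array.array('I', prefix).tobytes(); off += len(prefix)*4
  mv[off:off+len(buf_addrs)*8] = array.array('Q', buf_addrs).tobytes(); off += len(buf_addrs)*8
  mv[off:off+len(vals)*4] = array.array('I', vals).tobytes()

backing = (ctypes.c_char * 0x1000)()                      # pretend this is the pinned kernargs page
pack_kernargs(ctypes.addressof(backing), [0x700000000000], [16, 3], prefix=[0xdead])
assert bytes(backing[:4]) == array.array('I', [0xdead]).tobytes()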
@@ -265,7 +265,10 @@ class HCQProgram(Generic[DeviceType]):
Returns:
Arguments state with the given buffers and values set for the program.
"""
- return self.args_state_t(kernargs_ptr or self.dev.kernargs_alloctor.alloc(self.kernargs_alloc_size), self, bufs, vals=vals)
+ if kernargs_ptr is None:
+ gpu_ptr, cpu_ptr = (x:=self.dev.kernargs_alloctor.alloc(self.kernargs_alloc_size)), x - self.dev.kernargs_page.va_addr + self.dev.kernargs_page.cpu_addr
+ else: gpu_ptr, cpu_ptr = kernargs_ptr
+ return self.args_state_t(gpu_ptr, cpu_ptr, self, bufs, vals=vals)

def __call__(self, *bufs:HCQBuffer, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1),
vals:Tuple[int, ...]=(), wait:bool=False) -> Optional[float]: