diff --git a/tinygrad/runtime/graph/hcq.py b/tinygrad/runtime/graph/hcq.py
index 7d14b0ade8..6e1263fd95 100644
--- a/tinygrad/runtime/graph/hcq.py
+++ b/tinygrad/runtime/graph/hcq.py
@@ -35,7 +35,9 @@ class HCQGraph(MultiGraphRunner):
     for j,ji in enumerate(jit_cache):
       if not isinstance(ji.prg, CompiledRunner): continue
-      self.ji_args[j] = ji.prg._prg.fill_kernargs(self.hcq_bufs[j], ji.prg.p.vars, kargs_alloc[ji.prg.dev].alloc(ji.prg._prg.kernargs_alloc_size, 16))
+      gpu_addr = kargs_alloc[ji.prg.dev].alloc(ji.prg._prg.kernargs_alloc_size, 16)
+      cpu_addr = gpu_addr - self.kernargs_bufs[ji.prg.dev].va_addr + self.kernargs_bufs[ji.prg.dev].cpu_addr
+      self.ji_args[j] = ji.prg._prg.fill_kernargs(self.hcq_bufs[j], ji.prg.p.vars, (gpu_addr, cpu_addr))

     # Schedule Dependencies.
     # There are two types of queues on each device: copy and compute. Both must synchronize with all external operations before launching any
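The graph runner now hands fill_kernargs a (gpu_addr, cpu_addr) pair instead of a single pointer: on a driverless PCIe backend the GPU virtual address of the kernargs region and the host mapping of the same memory no longer coincide, so the CPU-side pointer is derived by rebasing the offset between the two bases. A minimal sketch of that arithmetic, with a hypothetical stand-in for the self.kernargs_bufs[dev] object:

from dataclasses import dataclass

@dataclass
class KernargsPage:  # hypothetical stand-in for kernargs_bufs[dev]
  va_addr: int       # GPU virtual base of the kernargs region
  cpu_addr: int      # host mapping of the same physical memory

def gpu_to_cpu(gpu_addr: int, page: KernargsPage) -> int:
  # same offset into the page, different base address
  assert gpu_addr >= page.va_addr
  return gpu_addr - page.va_addr + page.cpu_addr

# gpu_to_cpu(0x8000_1000, KernargsPage(0x8000_0000, 0x7f50_0000_0000)) == 0x7f50_0000_1000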
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index 38c6c8e051..36ea2538c4 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -97,7 +97,7 @@ class AMDComputeQueue(HWQueue):
       dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
       user_regs += [*data64_le(dp_addr)]

-    user_regs += [*data64_le(args_state.ptr)]
+    user_regs += [*data64_le(args_state.gpu_ptr)]

     self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
     self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
@@ -259,23 +259,23 @@ class AMDProgram(HCQProgram):

   def __del__(self):
     if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))

-class AMDPciBuffer(HCQBuffer):
-  def __init__(self, va_addr, size, cpu_addr, owner, vm=None):
-    self.adev, self.va_addr, self.size, self.cpu_addr, self.vm = owner, va_addr, size, cpu_addr, vm
+# class AMDPciBuffer(HCQBuffer):
+#   def __init__(self, va_addr, size, cpu_addr, owner, vm=None):
+#     self.adev, self.va_addr, self.size, self.cpu_addr, self.vm = owner, va_addr, size, cpu_addr, vm

 class AMDDriverAllocator(HCQAllocator['AMDDevice']):
   def __init__(self, dev:AMDDevice): super().__init__(dev, batch_size=SDMA_MAX_COPY_SIZE)

   def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
     if options.host: return self.dev.dev_iface.alloc(size, host=True)
-    if options.cpu_access and options.uncached: return self.dev.dev_iface.alloc(size, uncached=True)
+    if options.cpu_access and options.uncached: return self.dev.dev_iface.alloc(size, uncached=True, cpu_access=options.cpu_access)
     return self.dev.dev_iface.alloc(size, cpu_access=options.cpu_access)

   def _free(self, opaque, options:BufferSpec):
     self.dev.synchronize()
     self.dev.dev_iface.free(opaque)

-  def map(self, buf:HCQBuffer): self.dev.dev_iface._map(buf._base if buf._base is not None else buf)
+  def map(self, buf:HCQBuffer): self.dev.dev_iface.map(buf._base if buf._base is not None else buf)

 MAP_FIXED, MAP_NORESERVE, MAP_LOCKED = 0x10, 0x400, 0x2000
@@ -410,27 +410,30 @@ class VFIOIface:
     if os.path.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
       with open(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", 'w') as f: f.write(self.pcibus)
     with open(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", 'w') as f: f.write("15")
-    with open(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", 'w') as f: f.write("vfio-pci")
-    with open(f"/sys/bus/pci/drivers_probe", 'w') as f: f.write(self.pcibus)
+
+    libpciaccess.pci_device_probe(ctypes.byref(self.pcidev))

-    iommu_group = os.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
+    if getenv("VFIO", 1):
+      with open(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", 'w') as f: f.write("vfio-pci")
+      with open(f"/sys/bus/pci/drivers_probe", 'w') as f: f.write(self.pcibus)

-    self.vfio_group = os.open(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
-    vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(VFIOIface.vfio_fd))
+      iommu_group = os.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]

-    vfio.VFIO_SET_IOMMU(VFIOIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
-    self.vfio_dev = vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, (ctypes.c_char * (len(self.pcibus) + 1))(*bytearray(self.pcibus.encode() + b'\0')))
+      self.vfio_group = os.open(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
+      vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(VFIOIface.vfio_fd))

-    self.irq_fd = os.eventfd(0, 0)
-    self.irq_poller = select.poll()
-    self.irq_poller.register(self.irq_fd, select.POLLIN)
+      if dev_id == 0:
+        vfio.VFIO_SET_IOMMU(VFIOIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
+      self.vfio_dev = vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, (ctypes.c_char * (len(self.pcibus) + 1))(*bytearray(self.pcibus.encode() + b'\0')))

-    irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
-                                    argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd))
-    vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
+      self.irq_fd = os.eventfd(0, 0)
+      self.irq_poller = select.poll()
+      self.irq_poller.register(self.irq_fd, select.POLLIN)

-    # libpciaccess.pci_device_probe(ctypes.byref(self.pcidev))
-    # libpciaccess.pci_device_enable(ctypes.byref(self.pcidev))
+      # irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
+      #                                 argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd))
+      # vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
+    else: libpciaccess.pci_device_enable(ctypes.byref(self.pcidev))

     self.adev = AMDev(self.pcidev, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
     self.doorbell_cpu_addr = mv_address(dbell)
@@ -440,6 +443,11 @@ class VFIOIface:
                                         'array_count': 12, 'simd_arrays_per_engine': 2, 'lds_size_in_kb': 64}

   def _map_pci_range(self, bar):
+    if not getenv("VFIO", 1):
+      libpciaccess.pci_device_map_range(ctypes.byref(self.pcidev), self.pcidev.regions[bar].base_addr, size:=self.pcidev.regions[bar].size,
+                                        libpciaccess.PCI_DEV_MAP_FLAG_WRITABLE, ctypes.byref(pcimem:=ctypes.c_void_p()))
+      return to_mv(pcimem, size)
+
     vfio.VFIO_DEVICE_GET_REGION_INFO(self.vfio_dev, reg:=vfio.struct_vfio_region_info(argsz=ctypes.sizeof(vfio.struct_vfio_region_info), index=bar))
     return to_mv(libc.mmap(0, reg.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, self.vfio_dev, reg.offset), reg.size)
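_map_pci_range now has two paths: with VFIO=0 the BAR is mapped through libpciaccess, otherwise through the VFIO region-info ioctl plus an mmap of the device fd. For intuition, the same thing can be done with nothing but sysfs, since Linux exposes every PCI BAR as an mmappable resource file; a sketch (the bus address in the usage comment is made up):

import mmap, os

def map_bar_sysfs(pcibus: str, bar: int) -> memoryview:
  # each PCI BAR is exposed as /sys/bus/pci/devices/<bus>/resource<N>
  fd = os.open(f"/sys/bus/pci/devices/{pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC)
  size = os.fstat(fd).st_size
  return memoryview(mmap.mmap(fd, size, mmap.MAP_SHARED, mmap.PROT_READ | mmap.PROT_WRITE))

# bar0 = map_bar_sysfs("0000:03:00.0", 0)  # BAR0 is the VRAM aperture on these GPUs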
@@ -454,15 +462,27 @@ class VFIOIface:
       vaddr = self.adev.mm.alloc_vaddr(size)
       for off in range(0, size, mmap.PAGESIZE):
         self.adev.mm.map_range(vaddr=vaddr + off, paddr=read_pagemap(va + off), size=0x1000, system=True, snooped=True, uncached=True)
-      return AMDPciBuffer(vaddr + self.adev.gmc.vm_base, size, va, self, None)
-    return AMDPciBuffer((vm:=self.adev.mm.valloc(round_up(size, 0x1000), uncached=uncached)).vaddr, vm.size, vm.cpu_addr(), vm)
+      return HCQBuffer(vaddr + self.adev.gmc.vm_base, size, cpu_addr=va, meta=(self.dev, None))
+
+    vm = self.adev.mm.valloc(round_up(size, 0x1000), uncached=uncached)
+    # print(cpu_access, vm.cpu_addr())
+    return HCQBuffer(vm.vaddr, vm.size, cpu_addr=vm.cpu_addr() if cpu_access else None, meta=(self.dev, vm))

   def free(self, mem): pass
-  def map(self, mem): pass
+
+  def map(self, mem):
+    owner, vm = mem.meta
+
+    # print(self.dev, owner, owner.dev_iface.pcidev.regions[0].base_addr, self.pcidev.regions[0].base_addr)
+    if owner == self.dev or self.dev in getattr(mem.meta[1], "mapped_gpu_ids", []): return
+    mem.meta[1].__setattr__("mapped_gpu_ids", getattr(mem.meta[1], "mapped_gpu_ids", []) + [self.dev])
+
+    peer_address = vm.paddr + owner.dev_iface.pcidev.regions[0].base_addr
+    self.adev.mm.map_range(vaddr=vm.ptable_vaddr, paddr=peer_address, size=vm.size, system=True, snooped=vm.snooped, uncached=vm.uncached)

   def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
-    mqd = self.alloc(0x1000, uncached=True)
+    mqd = self.alloc(0x1000, uncached=True, cpu_access=True)

     if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
       doorbell_index = 0x100 # 0x100 is the first doorbell index for SDMA
@@ -499,8 +519,9 @@ class VFIOIface:
                           doorbell=to_mv(self.doorbell_cpu_addr + doorbell_index * 8, 8).cast("Q"))

   def sleep(self, timeout):
-    self.irq_poller.poll(timeout)
-    os.read(self.irq_fd, 1024)
+    if getenv("VFIO", 1):
+      x = self.irq_poller.poll(timeout)
+      if len(x): os.read(self.irq_fd, 1024)

 class AMDDevice(HCQCompiled):
   driverless:bool = False
@@ -520,7 +541,7 @@ class AMDDevice(HCQCompiled):

     # TODO: think of moving this out.
     if AMDDevice.signals_page is None:
-      AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, uncached=True)
+      AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, uncached=True, cpu_access=True)
       AMDDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
     else: self.dev_iface.map(AMDDevice.signals_page)
@@ -555,8 +576,8 @@ class AMDDevice(HCQCompiled):
                      AMDSignal, AMDComputeQueue, AMDCopyQueue)

   def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
-    ring = self.dev_iface.alloc(ring_size, uncached=True)
-    gart = self.dev_iface.alloc(0x1000, uncached=True)
+    ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
+    gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
     eop_buffer = self.dev_iface.alloc(eop_buffer_size) if eop_buffer_size else None
     return self.dev_iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, debug_memory_size=debug_memory_size,
                                        ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size)
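The new VFIOIface.map implements peer-to-peer access: importing a buffer owned by another GPU points the importer's page tables at the owner's VRAM through its BAR0 aperture (resized via resource0_resize above). A small sketch of the address math, with hypothetical stand-in names:

def peer_bus_address(vram_paddr: int, owner_bar0_base: int) -> int:
  # a VRAM offset on the owning GPU is visible to PCIe peers at
  # BAR0 base + offset, so the importer maps it as "system" memory
  return owner_bar0_base + vram_paddr

# importer.mm.map_range(vaddr=vm.ptable_vaddr,
#                       paddr=peer_bus_address(vm.paddr, owner_bar0_base=0x3800_0000_0000),  # made-up base
#                       size=vm.size, system=True, snooped=vm.snooped, uncached=vm.uncached)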
diff --git a/tinygrad/runtime/support/am/mm.py b/tinygrad/runtime/support/am/mm.py
index 72d0dd3f50..a0a00b4a33 100644
--- a/tinygrad/runtime/support/am/mm.py
+++ b/tinygrad/runtime/support/am/mm.py
@@ -25,8 +25,9 @@ class ScatterList(Generic[PhysicalMemoryBlockType]):
   def __iter__(self): return iter(self.blocks)

 class VirtualMapping(GPUPhysicalMemoryBlock):
-  def __init__(self, adev, ptable_vaddr, paddr, size):
+  def __init__(self, adev, ptable_vaddr, paddr, size, uncached=False, system=False, snooped=False):
     self.vaddr, self.ptable_vaddr = ptable_vaddr + adev.gmc.vm_base, ptable_vaddr
+    self.uncached, self.system, self.snooped = uncached, system, snooped
     super().__init__(adev, paddr, size)

 # TODO: Complete + tests
@@ -65,10 +66,11 @@ class AMPageTableEntry:
   def get_entry(self, entry_id): return self.view[entry_id]

 class MM:
+  next_vaddr:int = 0
+
   def __init__(self, adev, vram_size:int):
     self.adev, self.vram_size = adev, vram_size
     self.phys_allocator = PhysicalAllocator(adev, vram_size)
-    self.next_vaddr = 0
     self.root_page_table = AMPageTableEntry(self.palloc(0x1000, zero=True), lv=am.AMDGPU_VM_PDB1)

   def page_table_walker(self, page_table, vaddr, size, offset=0, free_pt=False) -> Generator[Tuple[int, int, int, int], None, None]:
@@ -129,7 +131,7 @@ class MM:
     self.adev.gmc.flush_tlb(ip="GC", vmid=0)
     self.adev.gmc.flush_tlb(ip="MM", vmid=0)

-    return VirtualMapping(self.adev, vaddr, paddr, size)
+    return VirtualMapping(self.adev, vaddr, paddr, size, uncached=uncached, system=system, snooped=snooped)

   def unmap_range(self, vaddr:int, size:int):
     for va, off, pte_st_idx, n_ptes, pte_covers, page_table in self.page_table_walker(self.root_page_table, vaddr, size, free_pt=True):
@@ -139,15 +141,15 @@ class MM:

   def alloc_vaddr(self, size:int, align=0x1000) -> int:
     size = round_up(size, 0x1000)
+    align = (1 << size.bit_length())

-    # TODO: need for here?
-    for i in range(31):
-      if (1 << i) <= size: align = (1 << i)
+    # for i in range(31):
+    #   if (1 << i) <= size: align = (1 << i)

-    addr = round_up(self.next_vaddr, align)
-    self.next_vaddr = addr + size
+    addr = round_up(MM.next_vaddr, align)
+    MM.next_vaddr = addr + size

-    assert self.next_vaddr <= self.adev.gmc.vm_end
+    assert MM.next_vaddr <= self.adev.gmc.vm_end
     return addr

   def valloc(self, size:int, align=0x1000, uncached=False) -> VirtualMapping:
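Two mm.py changes work together here: next_vaddr moves from the instance to the MM class, so every GPU's MM carves ranges out of one shared virtual address space (which is what keeps a vaddr handed to a peer collision-free), and each allocation is now aligned to the next power of two above its size. A simplified stand-in, not the real MM:

def round_up(x: int, a: int) -> int: return (x + a - 1) // a * a

class MiniMM:
  next_vaddr: int = 0  # class attribute: shared by all instances/GPUs

  def alloc_vaddr(self, size: int) -> int:
    size = round_up(size, 0x1000)
    align = 1 << size.bit_length()  # e.g. size 0x3000 -> align 0x4000
    addr = round_up(MiniMM.next_vaddr, align)
    MiniMM.next_vaddr = addr + size
    return addr

a, b = MiniMM(), MiniMM()
assert a.alloc_vaddr(0x1000) != b.alloc_vaddr(0x1000)  # distinct even across instances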
diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py
index b68697026f..7238021b36 100644
--- a/tinygrad/runtime/support/hcq.py
+++ b/tinygrad/runtime/support/hcq.py
@@ -236,20 +236,20 @@ def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Optional[Type[HWQueue
   if enabled and PROFILE: dev.sig_prof_records.append((cast(HCQSignal, st), cast(HCQSignal, en), desc, queue_type is dev.hw_copy_queue_t))

 class HCQArgsState(Generic[ProgramType]):
-  def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=()):
-    self.ptr, self.prg = ptr, prg
+  def __init__(self, gpu_ptr:int, cpu_ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=()):
+    self.gpu_ptr, self.cpu_ptr, self.prg = gpu_ptr, cpu_ptr, prg
     self.bind_data:List[Tuple[Tuple[sint, ...], int, str]] = []

   def bind_sints_to_ptr(self, *vals:sint, ptr:int, fmt): self.bind_data.append((vals, ptr, fmt))

 class CLikeArgsState(HCQArgsState[ProgramType]):
-  def __init__(self, ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=(), prefix:Optional[List[int]]=None):
-    super().__init__(ptr, prg, bufs, vals=vals)
+  def __init__(self, gpu_ptr:int, cpu_ptr:int, prg:ProgramType, bufs:Tuple[HCQBuffer, ...], vals:Tuple[sint, ...]=(), prefix:Optional[List[int]]=None):
+    super().__init__(gpu_ptr, cpu_ptr, prg, bufs, vals=vals)

     if prefix is not None: to_mv(self.cpu_ptr, len(prefix) * 4).cast('I')[:] = array.array('I', prefix)

-    self.bind_sints_to_ptr(*[b.va_addr for b in bufs], ptr=self.ptr + len(prefix or []) * 4, fmt='Q')
-    self.bind_sints_to_ptr(*vals, ptr=self.ptr + len(prefix or []) * 4 + len(bufs) * 8, fmt='I')
+    self.bind_sints_to_ptr(*[b.va_addr for b in bufs], ptr=self.cpu_ptr + len(prefix or []) * 4, fmt='Q')
+    self.bind_sints_to_ptr(*vals, ptr=self.cpu_ptr + len(prefix or []) * 4 + len(bufs) * 8, fmt='I')

 class HCQProgram(Generic[DeviceType]):
   def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int):
@@ -265,7 +265,10 @@ class HCQProgram(Generic[DeviceType]):

     Returns:
       Arguments state with the given buffers and values set for the program.
     """
-    return self.args_state_t(kernargs_ptr or self.dev.kernargs_alloctor.alloc(self.kernargs_alloc_size), self, bufs, vals=vals)
+    if kernargs_ptr is None:
+      gpu_ptr, cpu_ptr = (x:=self.dev.kernargs_alloctor.alloc(self.kernargs_alloc_size)), x - self.dev.kernargs_page.va_addr + self.dev.kernargs_page.cpu_addr
+    else: gpu_ptr, cpu_ptr = kernargs_ptr
+    return self.args_state_t(gpu_ptr, cpu_ptr, self, bufs, vals=vals)

   def __call__(self, *bufs:HCQBuffer, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1),
                vals:Tuple[int, ...]=(), wait:bool=False) -> Optional[float]:
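End-to-end, the args-state split means kernel arguments are written through the host-visible cpu_ptr while the dispatch packet carries the gpu_ptr of the same memory. A sketch of the CLikeArgsState layout (8-byte buffer pointers first, then 4-byte values; the optional prefix is omitted) against plain host memory, with made-up addresses:

import ctypes

def write_clike_args(cpu_ptr: int, buf_addrs: list, vals: list) -> None:
  # mirror CLikeArgsState: qword buffer pointers first, then dword values
  (ctypes.c_uint64 * len(buf_addrs)).from_address(cpu_ptr)[:] = buf_addrs
  (ctypes.c_uint32 * len(vals)).from_address(cpu_ptr + 8 * len(buf_addrs))[:] = vals

page = (ctypes.c_char * 0x100)()  # stand-in for the host mapping of the kernargs page
write_clike_args(ctypes.addressof(page), [0xdead0000, 0xbeef0000], [42])
# the GPU-side packet would then carry gpu_ptr, not ctypes.addressof(page)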