diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index 4d4346a7cd..f800cbb8d9 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -276,15 +276,15 @@ class NVProgram(HCQProgram):
 
 class NVAllocator(HCQAllocator['NVDevice']):
   def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
-    return self.dev.iface._gpu_alloc(size, cpu_access=options.cpu_access, host=options.host)
+    return self.dev.iface.alloc(size, cpu_access=options.cpu_access, host=options.host)
 
   def _free(self, opaque:HCQBuffer, options:BufferSpec):
     try:
       self.dev.synchronize()
-      self.dev.iface._gpu_free(opaque)
+      self.dev.iface.free(opaque)
     except AttributeError: pass
 
-  def map(self, buf:HCQBuffer): self.dev.iface._gpu_map(buf._base if buf._base is not None else buf)
+  def map(self, buf:HCQBuffer): self.dev.iface.map(buf._base if buf._base is not None else buf)
 
 @dataclass
 class GPFifo:
@@ -385,7 +385,7 @@ class NVKIface:
     if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {get_error_str(made.params.status)}")
     return fd_dev.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), 0)
 
-  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0) -> HCQBuffer:
+  def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0) -> HCQBuffer:
     # Uncached memory is "system". Use huge pages only for gpu memory.
     page_size = (4 << (12 if OSX else 10)) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << (12 if OSX else 10)))
     size = round_up(size, page_size)
@@ -423,7 +423,7 @@ class NVKIface:
 
     return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=cpu_access or host)
 
-  def _gpu_free(self, mem:HCQBuffer):
+  def free(self, mem:HCQBuffer):
     if mem.meta.hMemory > NVKIface.host_object_enumerator: # not a host object, clear phys mem.
       made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.dev.nvdevice, hObjectOld=mem.meta.hMemory)
       nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
@@ -442,7 +442,7 @@ class NVKIface:
       mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping), view=MMIOInterface(va_base, size, fmt='B') if has_cpu_mapping else None)
 
-  def _gpu_map(self, mem:HCQBuffer):
+  def map(self, mem:HCQBuffer):
     if self.gpu_uuid in mem.meta.mapped_gpu_ids: return
     mem.meta.mapped_gpu_ids.append(self.gpu_uuid)
     self._gpu_uvm_map(mem.va_addr, mem.size, mem.meta.hMemory, create_range=False)
 
@@ -478,7 +478,7 @@ class NVDevice(HCQCompiled[NVSignal]):
     channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
     channel_group = self.iface.rm_alloc(self.nvdevice, nv_gpu.KEPLER_CHANNEL_GROUP_A, channel_params)
 
-    gpfifo_area = self.iface._gpu_alloc(0x200000, contiguous=True, cpu_access=True, map_flags=0x10d0000)
+    gpfifo_area = self.iface.alloc(0x200000, contiguous=True, cpu_access=True, map_flags=0x10d0000)
 
     ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
     ctxshare = self.iface.rm_alloc(channel_group, nv_gpu.FERMI_CONTEXT_SHARE_A, ctxshare_params)
@@ -487,7 +487,7 @@ class NVDevice(HCQCompiled[NVSignal]):
     self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000, compute=False)
     self.iface.rm_control(channel_group, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1))
 
-    self.cmdq_page:HCQBuffer = self.iface._gpu_alloc(0x200000, cpu_access=True)
+    self.cmdq_page:HCQBuffer = self.iface.alloc(0x200000, cpu_access=True)
     self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, base=cast(int, self.cmdq_page.va_addr), wrap=True)
     self.cmdq = MMIOInterface(cast(int, self.cmdq_page.va_addr), 0x200000, fmt='I')
 
@@ -505,7 +505,7 @@ class NVDevice(HCQCompiled[NVSignal]):
     self._setup_gpfifos()
 
   def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, compute=False) -> GPFifo:
-    notifier = self.iface._gpu_alloc(48 << 20, uncached=True)
+    notifier = self.iface.alloc(48 << 20, uncached=True)
     params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
       gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
       hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
diff --git a/tinygrad/runtime/support/memory.py b/tinygrad/runtime/support/memory.py
index 62e11ec9e7..721b654c07 100644
--- a/tinygrad/runtime/support/memory.py
+++ b/tinygrad/runtime/support/memory.py
@@ -105,7 +105,7 @@ class VirtMapping: va_addr:int; size:int; paddrs:list[tuple[int, int]]; uncached
 class PageTableTraverseContext:
   def __init__(self, dev, pt, vaddr, create_pts=False, free_pts=False, boot=False):
     self.dev, self.vaddr, self.create_pts, self.free_pts, self.boot = dev, vaddr - dev.mm.va_base, create_pts, free_pts, boot
-    self.pt_stack:list[tuple[Any, int, int]] = [(pt, self._pt_pte_idx(pt, vaddr), self._pt_pte_size(pt))]
+    self.pt_stack:list[tuple[Any, int, int]] = [(pt, self._pt_pte_idx(pt, self.vaddr), self._pt_pte_size(pt))]
 
   def _pt_pte_cnt(self, lv): return self.dev.mm.pte_cnt[lv]
   def _pt_pte_size(self, pt): return self.dev.mm.pte_covers[pt.lv]