diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index 017444c1fc..9fc3554cb5 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -240,8 +240,9 @@ class NVProgram:
     self.max_threads = ((65536 // round_up(self.registers_usage * 32, 256)) // 4) * 4 * 32
 
     # Load program and constant buffers (if any)
-    self.lib_sz = round_up(round_up(self.program.nbytes, 128) + round_up(0 if self.global_init is None else self.global_init.nbytes, 128) +
-                           sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]), 0x1000)
+    # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
+    self.lib_sz = round_up(round_up(self.program.nbytes, 128) + max(0x1000, sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]) +
+                           round_up(0 if self.global_init is None else self.global_init.nbytes, 128)), 0x1000)
     self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
 
     HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).submit(self.device)
@@ -257,8 +258,8 @@ class NVProgram:
                             cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
                             shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
                             max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
-                            barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=0x10, sass_version=0x89,
-                            program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32,
+                            barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.program.nbytes>>8,
+                            program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32, sass_version=0x89,
                             program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
                             constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)
 
@@ -330,7 +331,7 @@ class NVAllocator(LRUAllocator):
 
   def _alloc(self, size:int, options:BufferOptions):
     if options.host: return self.device._gpu_host_alloc(size)
-    else: return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access)
+    else: return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
 
   def _free(self, gpumem, options:BufferOptions):
     NVDevice.synchronize_system()
@@ -396,7 +397,7 @@ class NVDevice(Compiled):
     return libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
 
   def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
-    size = round_up(size, align:=((4 << 10) if huge_page else (2 << 20))) # TODO: need hugepage option, any speedup?
+    size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
     alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
       attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
             ((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contig else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27)),