nv fix rounding of allocation sizes (#4828)

* fix rounding of allocation sizes

* comment on prefetch

* use huge pages

Author: nimlgen
Date: 2024-06-06 00:21:56 +03:00
Committed by: GitHub
Parent: 09bfb8c10a
Commit: 1649c21ead

@@ -240,8 +240,9 @@ class NVProgram:
     self.max_threads = ((65536 // round_up(self.registers_usage * 32, 256)) // 4) * 4 * 32
     # Load program and constant buffers (if any)
-    self.lib_sz = round_up(round_up(self.program.nbytes, 128) + round_up(0 if self.global_init is None else self.global_init.nbytes, 128) +
-                           sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]), 0x1000)
+    # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
+    self.lib_sz = round_up(round_up(self.program.nbytes, 128) + max(0x1000, sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]) +
+                           round_up(0 if self.global_init is None else self.global_init.nbytes, 128)), 0x1000)
     self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
     HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).submit(self.device)
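
What the new rounding buys: previously a short program with few constant buffers could end less than 4KB before the end of its allocation, so instruction prefetch could fault past the mapping; the new expression guarantees at least 0x1000 bytes after the program before the final 4KB round-up. A standalone sketch of the arithmetic (round_up mirrors the helper used above; all byte counts are invented for the example):

# Sketch only: old vs. new lib_sz computation with invented sizes.
def round_up(x: int, align: int) -> int:
  return (x + align - 1) // align * align

program_nbytes, global_init_nbytes = 0x2300, 0   # hypothetical program, no global init
const_buf_nbytes = [0x40, 0x190]                 # hypothetical constant buffers

# old: pad each piece to 128B, sum everything, round the total to 4KB
old_sz = round_up(round_up(program_nbytes, 128) + round_up(global_init_nbytes, 128) +
                  sum(round_up(n, 128) for n in const_buf_nbytes), 0x1000)

# new: whatever follows the program occupies at least 0x1000 bytes before the final 4KB round-up
new_sz = round_up(round_up(program_nbytes, 128) +
                  max(0x1000, sum(round_up(n, 128) for n in const_buf_nbytes) + round_up(global_init_nbytes, 128)), 0x1000)

print(hex(old_sz), hex(new_sz))  # 0x3000 0x4000 -- the new size leaves >=4KB of slack after the program
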
@@ -257,8 +258,8 @@ class NVProgram:
       cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
       shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
       max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
-      barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=0x10, sass_version=0x89,
-      program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32,
+      barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.program.nbytes>>8,
+      program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32, sass_version=0x89,
       program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
       constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)
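
The prefetch-related QMD fields appear to be encoded in 256-byte units: the prefetch address is passed shifted right by 8 (upper half by 40), and the prefetch size moves from a fixed 0x10 (presumably 16 x 256B = 4KB) to the whole program, self.program.nbytes>>8. A small sketch of that encoding, with invented values and assuming the 256-byte-unit interpretation is right:

# Sketch only: the 256-byte-unit encoding implied by the >>8 shifts above (base and size invented).
base, program_nbytes = 0x7f_f123_4500, 0x2300

program_prefetch_addr_lower_shifted = base >> 8
program_prefetch_addr_upper_shifted = base >> 40
program_prefetch_size = program_nbytes >> 8    # was a fixed 0x10 before this change

# Reconstruct the range the hardware would prefetch (assuming the fields really are 256B units):
start = (program_prefetch_addr_upper_shifted << 40) | (program_prefetch_addr_lower_shifted << 8)
nbytes = program_prefetch_size << 8
print(hex(start), hex(nbytes))  # 0x7ff1234500 0x2300
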
@@ -330,7 +331,7 @@ class NVAllocator(LRUAllocator):
   def _alloc(self, size:int, options:BufferOptions):
     if options.host: return self.device._gpu_host_alloc(size)
-    else: return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access)
+    else: return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
   def _free(self, gpumem, options:BufferOptions):
     NVDevice.synchronize_system()
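
Huge pages are now requested only for buffers over 16 MiB, presumably so the 2 MiB rounding from the next hunk is negligible relative to the buffer; smaller buffers keep plain 4 KiB pages. A toy illustration of the policy (the stub stands in for NVDevice._gpu_alloc and exists only for this example):

# Toy sketch only: shows the size threshold used above; _gpu_alloc_stub is invented for the example.
def _gpu_alloc_stub(size: int, map_to_cpu: bool = False, huge_page: bool = False) -> str:
  return f"gpu_alloc size={size:#x} map_to_cpu={map_to_cpu} huge_page={huge_page}"

def alloc_like_nvallocator(size: int, cpu_access: bool = False) -> str:
  # mirrors the non-host branch: huge pages only when the buffer is larger than 16 MiB
  return _gpu_alloc_stub(size, map_to_cpu=cpu_access, huge_page=(size > (16 << 20)))

print(alloc_like_nvallocator(4 << 20))    # 4 MiB   -> huge_page=False
print(alloc_like_nvallocator(256 << 20))  # 256 MiB -> huge_page=True
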
@@ -396,7 +397,7 @@ class NVDevice(Compiled):
     return libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
   def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
-    size = round_up(size, align:=((4 << 10) if huge_page else (2 << 20))) # TODO: need hugepage option, any speedup?
+    size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
     alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
       attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
             ((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contig else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27)),
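
The old line had the two alignments swapped: every ordinary allocation was rounded up to 2 MiB, while huge-page allocations got only 4 KiB alignment. With the fix, 2 MiB is used exactly when huge pages are requested. A quick before/after check of the rounding (round_up as above; the sizes are arbitrary examples):

# Sketch only: compares the swapped (old) and corrected (new) alignment choice with arbitrary sizes.
def round_up(x: int, align: int) -> int:
  return (x + align - 1) // align * align

for size, huge_page in ((24 << 10, False), ((100 << 20) + 0x123, True)):
  new_align = (2 << 20) if huge_page else (4 << 10)   # corrected: 2 MiB only for huge pages
  old_align = (4 << 10) if huge_page else (2 << 20)   # old code had the branches swapped
  print(f"size={size:#x} huge_page={huge_page}: new={round_up(size, new_align):#x} old={round_up(size, old_align):#x}")
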