mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-31 09:48:25 -05:00
nv fix round of allocation sizes (#4828)
* fix round of allocation sizes
* comment on prefetch
* use huge pages
This commit is contained in:
@@ -240,8 +240,9 @@ class NVProgram:
|
||||
self.max_threads = ((65536 // round_up(self.registers_usage * 32, 256)) // 4) * 4 * 32
|
||||
|
||||
# Load program and constant buffers (if any)
|
||||
self.lib_sz = round_up(round_up(self.program.nbytes, 128) + round_up(0 if self.global_init is None else self.global_init.nbytes, 128) +
|
||||
sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]), 0x1000)
|
||||
# NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
|
||||
self.lib_sz = round_up(round_up(self.program.nbytes, 128) + max(0x1000, sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]) +
|
||||
round_up(0 if self.global_init is None else self.global_init.nbytes, 128)), 0x1000)
|
||||
self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
|
||||
|
||||
HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).submit(self.device)
|
||||
@@ -257,8 +258,8 @@ class NVProgram:
|
||||
cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
|
||||
shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
|
||||
max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
|
||||
barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=0x10, sass_version=0x89,
|
||||
program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32,
|
||||
barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.program.nbytes>>8,
|
||||
program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32, sass_version=0x89,
|
||||
program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
|
||||
constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)
|
||||
|
||||
@@ -330,7 +331,7 @@ class NVAllocator(LRUAllocator):
|
||||
|
||||
def _alloc(self, size:int, options:BufferOptions):
|
||||
if options.host: return self.device._gpu_host_alloc(size)
|
||||
else: return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access)
|
||||
else: return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
|
||||
|
||||
def _free(self, gpumem, options:BufferOptions):
|
||||
NVDevice.synchronize_system()
|
||||
@@ -396,7 +397,7 @@ class NVDevice(Compiled):
|
||||
return libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
|
||||
|
||||
def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_flags=0):
|
||||
size = round_up(size, align:=((4 << 10) if huge_page else (2 << 20))) # TODO: need hugepage option, any speedup?
|
||||
size = round_up(size, align:=((2 << 20) if huge_page else (4 << 10)))
|
||||
alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
|
||||
attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
|
||||
((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contig else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27)),
|
||||
|
||||
Reference in New Issue
Block a user