From 78c01a5c2ba1a7004b0f2aed5950ac7b250ea7b7 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Thu, 5 Dec 2024 15:50:23 +0300 Subject: [PATCH] amd general _gpu_alloc (#8056) * amd general _gpu_alloc * hmm * ops --- tinygrad/runtime/ops_amd.py | 50 +++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index ed86173ef4..a10fb38f9f 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -262,9 +262,9 @@ class AMDAllocator(HCQAllocator['AMDDevice']): def __init__(self, dev:AMDDevice): super().__init__(dev, batch_size=SDMA_MAX_COPY_SIZE) def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer: - if options.host: return self.dev._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) - if options.cpu_access and options.uncached: return self.dev._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) - return self.dev._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access) + if options.host: return self.dev._gpu_alloc(size, host=True) + if options.cpu_access and options.uncached: return self.dev._gpu_alloc(size, uncached=True) + return self.dev._gpu_alloc(size, cpu_access=options.cpu_access) def _free(self, opaque, options:BufferSpec): self.dev.synchronize() @@ -297,28 +297,31 @@ class AMDDevice(HCQCompiled): n_devices=len(mem.mapped_gpu_ids)) assert stm.n_success == len(mem.mapped_gpu_ids) - def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True): - flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE - if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED - if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC - if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR: - buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0) - else: - buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) + def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False): + flags = kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE + + if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED | kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT + else: flags |= (kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR if host else kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) + + if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC + + if host: buf = addr = libc.mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, -1, 0) + else: buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, -1, 0) assert addr != 0xffffffffffffffff try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, - flags=flags, mmap_offset=buf) + flags=flags, mmap_offset=buf, cpu_addr=addr) except OSError as e: - if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and public: + if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and cpu_access: raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory: no memory is available.") from e raise - if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR): - buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset) + if not host: + buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, self.drm_fd, mem.mmap_offset) assert addr == buf == mem.va_addr - if map_to_gpu: self._gpu_map(mem) + + self._gpu_map(mem) return mem def _gpu_free(self, mem): @@ -350,8 +353,8 @@ class AMDDevice(HCQCompiled): kfd.AMDKFD_IOC_ACQUIRE_VM(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id) if AMDDevice.event_page is None: - AMDDevice.signals_page = self._gpu_alloc(16 * 65536, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) - AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) + AMDDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True) + AMDDevice.event_page = self._gpu_alloc(0x8000, uncached=True) AMDDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)] kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle) else: @@ -365,7 +368,7 @@ class AMDDevice(HCQCompiled): # =gfx11 requires 256 wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256 if self.target >= 110000 else 1024) self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len - self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) + self.scratch = self._gpu_alloc(self.scratch_len) self.has_scratch_base_registers = self.target >= 110000 engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine'] waves = wave_scratch_len // (256 if self.target >= 110000 else 1024) @@ -396,11 +399,10 @@ class AMDDevice(HCQCompiled): AMDSignal, AMDComputeQueue, AMDCopyQueue) def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None, ctl_stack_size=0) -> AMDQueueDesc: - gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) - ring = self._gpu_alloc(ring_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) - cwsr_ctx = self._gpu_alloc(round_up(ctx_save_restore_size + self.debug_memory_size, mmap.PAGESIZE), - kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if ctx_save_restore_size else None - eop_buffer = self._gpu_alloc(eop_buffer_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if eop_buffer_size else None + gart = self._gpu_alloc(0x1000, uncached=True) + ring = self._gpu_alloc(ring_size, uncached=True) + cwsr_ctx = self._gpu_alloc(round_up(ctx_save_restore_size + self.debug_memory_size, mmap.PAGESIZE)) if ctx_save_restore_size else None + eop_buffer = self._gpu_alloc(eop_buffer_size) if eop_buffer_size else None queue = kfd.AMDKFD_IOC_CREATE_QUEUE(AMDDevice.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id, queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY, eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,