diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index f459fd7bdf..b834858ba2 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import Tuple, List, Any, cast +from typing import Tuple, List, Any import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal from dataclasses import dataclass from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, \ @@ -61,8 +61,7 @@ class AMDComputeQueue(HWComputeQueue): def __del__(self): if self.binded_device is not None: - self.binded_device.synchronize() - self.binded_device._gpu_free(self.hw_page) + self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True, uncached=True)) def _acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1): self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0, @@ -161,7 +160,7 @@ class AMDComputeQueue(HWComputeQueue): def bind(self, device): self.binded_device = device - self.hw_page = cast(AMDDevice, device)._gpu_alloc(len(self.q) * 4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) + self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True, uncached=True)) hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I") for i, value in enumerate(self.q): hw_view[i] = value @@ -303,6 +302,7 @@ class AMDAllocator(HCQAllocator): def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer: if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) + if options.cpu_access and options.uncached: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True) return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access) def _free(self, opaque, options:BufferOptions): diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index e99c81c801..ba58e26ab0 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -84,9 +84,7 @@ class NVSignal(HCQSignal): class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method def __del__(self): - if self.binded_device is not None: - self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use. - self.binded_device._gpu_free(self.hw_page) + if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True)) @hcq_command def setup(self, compute_class=None, copy_class=None, local_mem_window=None, shared_mem_window=None, local_mem=None, local_mem_tpc_bytes=None): @@ -109,7 +107,7 @@ class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method def bind(self, device): self.binded_device = device - self.hw_page = cast(NVDevice, device)._gpu_alloc(len(self.q) * 4, map_to_cpu=True) + self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True)) hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I") for i, value in enumerate(self.q): hw_view[i] = value