From 486d53d6469e3f0837613ebce8a0c549fa8151a4 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Fri, 30 Jan 2026 23:53:17 +0300 Subject: [PATCH] device: call free for external_ptr (#14448) * device: call free for external_ptr * lin --- tinygrad/device.py | 7 ++++--- tinygrad/runtime/ops_cuda.py | 8 ++++---- tinygrad/runtime/ops_metal.py | 7 ++++--- tinygrad/runtime/ops_qcom.py | 8 +++++--- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/tinygrad/device.py b/tinygrad/device.py index cd691041fe..1638d2b6e2 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -147,8 +147,9 @@ class Buffer: def deallocate(self): assert hasattr(self, '_buf'), "buffer must be allocated to deallocate" if DEBUG is not None and DEBUG >= 7: print(f"buffer: deallocate {self.nbytes} bytes on {self.device}") - if self._base is None and (self.options is None or self.options.external_ptr is None): - if GlobalCounters is not None and not self.device.startswith("DISK"): GlobalCounters.mem_used -= self.nbytes + if self._base is None: + if GlobalCounters is not None and not self.device.startswith("DISK") and (self.options is None or self.options.external_ptr is None): + GlobalCounters.mem_used -= self.nbytes if PROFILE: Buffer.profile_events.append(ProfilePointEvent(self.device, "free", self.trace_num)) self.allocator.free(self._buf, self.nbytes, self.options) elif self._base is not None: self._base.allocated_views -= 1 @@ -263,7 +264,7 @@ class LRUAllocator(Allocator, Generic[DeviceType]): for opaque in opaques: super().free(opaque, sz, options) opaques.clear() def free(self, opaque:Any, size:int, options:BufferSpec|None=None): - if LRU and (options is None or not options.nolru): self.cache[(size, options)].append(opaque) + if LRU and (options is None or (not options.nolru and options.external_ptr is None)): self.cache[(size, options)].append(opaque) else: super().free(opaque, size, options) # **************** for Compiled Devices **************** diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py index e53572a565..8b7a077e81 100644 --- a/tinygrad/runtime/ops_cuda.py +++ b/tinygrad/runtime/ops_cuda.py @@ -69,11 +69,11 @@ class CUDAAllocator(LRUAllocator['CUDADevice']): if options.external_ptr: return cuda.CUdeviceptr_v2(options.external_ptr) if options.host: return init_c_var(ctypes.c_void_p, lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01))) return init_c_var(cuda.CUdeviceptr, lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size))) + @suppress_finalizing def _free(self, opaque, options:BufferSpec): - try: - if options.host: check(cuda.cuMemFreeHost(opaque)) - else: check(cuda.cuMemFree_v2(opaque)) - except (TypeError, AttributeError): pass + if options.external_ptr: return + if options.host: check(cuda.cuMemFreeHost(opaque)) + else: check(cuda.cuMemFree_v2(opaque)) def _copyin(self, dest, src:memoryview): check(cuda.cuCtxSetCurrent(self.dev.context)) host_mem = self.alloc(len(src), BufferSpec(host=True)) diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index 9ff0bd125c..1268ae705c 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -1,5 +1,5 @@ -import subprocess, pathlib, struct, ctypes, tempfile, functools, contextlib, decimal, platform, sys -from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, PROFILE, ProfileRangeEvent, cpu_profile, unwrap +import subprocess, pathlib, struct, ctypes, tempfile, functools, contextlib, decimal, platform +from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, PROFILE, ProfileRangeEvent, cpu_profile, unwrap, suppress_finalizing import tinygrad.runtime.support.objc as objc from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, ProfileDeviceEvent, CompilerSet, CompilerPair from tinygrad.renderer.cstyle import MetalRenderer @@ -167,8 +167,9 @@ class MetalAllocator(LRUAllocator[MetalDevice]): ret.retain = False if ret.value is None: raise MemoryError(f"Metal OOM while allocating {size=}") return MetalBuffer(ret, size) + @suppress_finalizing def _free(self, opaque:MetalBuffer, options): - if not sys.is_finalizing(): opaque.buf.release + if not options.external_ptr: opaque.buf.release def _transfer(self, dest:MetalBuffer, src:MetalBuffer, sz:int, src_dev:MetalDevice, dest_dev:MetalDevice): dest_dev.synchronize() src_command_buffer = src_dev.mtl_queue.commandBuffer().retained() diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index 02ab87bc8b..648dd455a8 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -399,9 +399,11 @@ class QCOMDevice(HCQCompiled): raise RuntimeError("Failed to map external pointer to GPU memory") from e def _gpu_free(self, mem:HCQBuffer): - if mem.meta[0] is None: return - kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta[0].id) - if mem.meta[1]: FileIOInterface.munmap(mem.va_addr, mem.meta[0].mmapsize) + if mem.meta[0] is None: return # external (gpu) ptr + if not mem.meta[1]: kgsl.IOCTL_KGSL_SHAREDMEM_FREE(self.fd, gpuaddr=mem.meta[0].gpuaddr) # external (cpu) ptr + else: + kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta[0].id) + FileIOInterface.munmap(mem.va_addr, mem.meta[0].mmapsize) def _ensure_stack_size(self, sz): if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz)