device: call free for external_ptr (#14448)

* device: call free for external_ptr

* lin
nimlgen authored 2026-01-30 23:53:17 +03:00, committed by GitHub
parent e0978498dc
commit 486d53d646
4 changed files with 17 additions and 13 deletions

tinygrad/device.py

@@ -147,8 +147,9 @@ class Buffer:
   def deallocate(self):
     assert hasattr(self, '_buf'), "buffer must be allocated to deallocate"
     if DEBUG is not None and DEBUG >= 7: print(f"buffer: deallocate {self.nbytes} bytes on {self.device}")
-    if self._base is None and (self.options is None or self.options.external_ptr is None):
-      if GlobalCounters is not None and not self.device.startswith("DISK"): GlobalCounters.mem_used -= self.nbytes
+    if self._base is None:
+      if GlobalCounters is not None and not self.device.startswith("DISK") and (self.options is None or self.options.external_ptr is None):
+        GlobalCounters.mem_used -= self.nbytes
       if PROFILE: Buffer.profile_events.append(ProfilePointEvent(self.device, "free", self.trace_num))
       self.allocator.free(self._buf, self.nbytes, self.options)
     elif self._base is not None: self._base.allocated_views -= 1
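For context, a minimal sketch of the lifecycle this hunk changes, assuming tinygrad's public Buffer/BufferSpec API as used in the hunks below; raw_ptr and the "CUDA" device string are illustrative, not taken from this commit:

from tinygrad import dtypes
from tinygrad.device import Buffer, BufferSpec

raw_ptr = 0xdeadbeef  # hypothetical: a device address owned by external code
buf = Buffer("CUDA", 4, dtypes.float32, options=BufferSpec(external_ptr=raw_ptr)).allocate()
# before this commit: deallocate returned early for external_ptr, so the
# backend allocator never saw the free at all
# after: allocator.free() runs, letting the backend drop its wrapper state,
# while the GlobalCounters.mem_used skip for external pointers is kept
buf.deallocate()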
@@ -263,7 +264,7 @@ class LRUAllocator(Allocator, Generic[DeviceType]):
       for opaque in opaques: super().free(opaque, sz, options)
       opaques.clear()
   def free(self, opaque:Any, size:int, options:BufferSpec|None=None):
-    if LRU and (options is None or not options.nolru): self.cache[(size, options)].append(opaque)
+    if LRU and (options is None or (not options.nolru and options.external_ptr is None)): self.cache[(size, options)].append(opaque)
     else: super().free(opaque, size, options)

 # **************** for Compiled Devices ****************
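The extra external_ptr guard matters because a cached opaque is handed back verbatim to the next alloc with the same (size, options) key. A standalone toy (not tinygrad's allocator) of the rule it enforces:

from collections import defaultdict

class ToyLRU:
  def __init__(self): self.cache = defaultdict(list)
  def alloc(self, size):
    return self.cache[size].pop() if self.cache[size] else f"fresh-{size}"
  def free(self, opaque, size, external=False):
    # the guard from this commit: externally-owned memory never enters the cache
    if not external: self.cache[size].append(opaque)

lru = ToyLRU()
lru.free("wrapped-external-ptr", 4096, external=True)
print(lru.alloc(4096))  # "fresh-4096": the foreign pointer was not recycled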

tinygrad/runtime/ops_cuda.py

@@ -69,11 +69,11 @@ class CUDAAllocator(LRUAllocator['CUDADevice']):
     if options.external_ptr: return cuda.CUdeviceptr_v2(options.external_ptr)
     if options.host: return init_c_var(ctypes.c_void_p, lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01)))
     return init_c_var(cuda.CUdeviceptr, lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
+  @suppress_finalizing
   def _free(self, opaque, options:BufferSpec):
-    try:
-      if options.host: check(cuda.cuMemFreeHost(opaque))
-      else: check(cuda.cuMemFree_v2(opaque))
-    except (TypeError, AttributeError): pass
+    if options.external_ptr: return
+    if options.host: check(cuda.cuMemFreeHost(opaque))
+    else: check(cuda.cuMemFree_v2(opaque))
   def _copyin(self, dest, src:memoryview):
     check(cuda.cuCtxSetCurrent(self.dev.context))
     host_mem = self.alloc(len(src), BufferSpec(host=True))
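The dropped try/except existed to survive module teardown at interpreter exit; @suppress_finalizing centralizes that. A plausible sketch of tinygrad.helpers.suppress_finalizing, inferred from the Metal hunk below (which trades an inline sys.is_finalizing() check for this decorator) rather than read from helpers.py:

import functools, sys

def suppress_finalizing(fn):
  @functools.wraps(fn)
  def wrapper(*args, **kwargs):
    # assumed shape: turn frees into no-ops while the interpreter shuts down
    if not sys.is_finalizing(): return fn(*args, **kwargs)
  return wrapper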

tinygrad/runtime/ops_metal.py

@@ -1,5 +1,5 @@
-import subprocess, pathlib, struct, ctypes, tempfile, functools, contextlib, decimal, platform, sys
-from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, PROFILE, ProfileRangeEvent, cpu_profile, unwrap
+import subprocess, pathlib, struct, ctypes, tempfile, functools, contextlib, decimal, platform
+from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, PROFILE, ProfileRangeEvent, cpu_profile, unwrap, suppress_finalizing
 import tinygrad.runtime.support.objc as objc
 from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, ProfileDeviceEvent, CompilerSet, CompilerPair
 from tinygrad.renderer.cstyle import MetalRenderer
@@ -167,8 +167,9 @@ class MetalAllocator(LRUAllocator[MetalDevice]):
       ret.retain = False
     if ret.value is None: raise MemoryError(f"Metal OOM while allocating {size=}")
     return MetalBuffer(ret, size)
+  @suppress_finalizing
   def _free(self, opaque:MetalBuffer, options):
-    if not sys.is_finalizing(): opaque.buf.release
+    if not options.external_ptr: opaque.buf.release
   def _transfer(self, dest:MetalBuffer, src:MetalBuffer, sz:int, src_dev:MetalDevice, dest_dev:MetalDevice):
     dest_dev.synchronize()
     src_command_buffer = src_dev.mtl_queue.commandBuffer().retained()
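The ownership rule here: tinygrad only releases Metal buffers it took a reference to, and externally provided ones keep their caller's reference (the alloc context above shows a path that sets ret.retain = False, plausibly the external-pointer one). A toy refcount, plain Python rather than Objective-C, of why releasing would be wrong:

class Handle:
  def __init__(self): self.refs = 1   # the external caller's own reference
  def release(self): self.refs -= 1

h = Handle()  # wrapped without taking a retain
# correct (this commit): skip release for external_ptr -> refs stays 1
# wrong: h.release() would drop refs to 0, destroying the caller's buffer
print(h.refs)  # 1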

tinygrad/runtime/ops_qcom.py

@@ -399,9 +399,11 @@ class QCOMDevice(HCQCompiled):
       raise RuntimeError("Failed to map external pointer to GPU memory") from e
   def _gpu_free(self, mem:HCQBuffer):
-    if mem.meta[0] is None: return
-    kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta[0].id)
-    if mem.meta[1]: FileIOInterface.munmap(mem.va_addr, mem.meta[0].mmapsize)
+    if mem.meta[0] is None: return # external (gpu) ptr
+    if not mem.meta[1]: kgsl.IOCTL_KGSL_SHAREDMEM_FREE(self.fd, gpuaddr=mem.meta[0].gpuaddr) # external (cpu) ptr
+    else:
+      kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta[0].id)
+      FileIOInterface.munmap(mem.va_addr, mem.meta[0].mmapsize)
   def _ensure_stack_size(self, sz):
     if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz)
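Restating the new three-way dispatch as a standalone toy, under the assumption that mem.meta packs (driver alloc info, mapped-by-tinygrad flag); the names are illustrative, not the kgsl API:

def free_path(alloc_info, mapped_by_us):
  if alloc_info is None: return "external gpu ptr: driver allocated nothing, nothing to free"
  if not mapped_by_us: return "external cpu ptr: SHAREDMEM_FREE(gpuaddr) drops the GPU mapping"
  return "owned allocation: GPUOBJ_FREE(id), then munmap the CPU view"

for meta in [(None, False), (object(), False), (object(), True)]:
  print(free_path(*meta))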