_gpu_alloc -> allocator.alloc (#6189)

* _gpu_alloc -> allocator.alloc * remove unneeded import * pylint
2026-01-24 06:18:01 -05:00 · 2024-08-19 23:34:22 +03:00
parent 96d502d8b7
commit bc44e6501b
2 changed files with 6 additions and 8 deletions
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import Tuple, List, Any, cast
+from typing import Tuple, List, Any
 import os, ctypes, ctypes.util, functools, pathlib, mmap, errno, time, array, contextlib, decimal
 from dataclasses import dataclass
 from tinygrad.device import HCQCompiled, HCQAllocator, HCQBuffer, HWComputeQueue, HWCopyQueue, HCQArgsState, \
@@ -61,8 +61,7 @@ class AMDComputeQueue(HWComputeQueue):

  def __del__(self):
    if self.binded_device is not None:
-      self.binded_device.synchronize()
-      self.binded_device._gpu_free(self.hw_page)
+      self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True, uncached=True))

  def _acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
@@ -161,7 +160,7 @@ class AMDComputeQueue(HWComputeQueue):

  def bind(self, device):
    self.binded_device = device
-    self.hw_page = cast(AMDDevice, device)._gpu_alloc(len(self.q) * 4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
+    self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True, uncached=True))
    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
    for i, value in enumerate(self.q): hw_view[i] = value

@@ -303,6 +302,7 @@ class AMDAllocator(HCQAllocator):

  def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
    if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
+    if options.cpu_access and options.uncached: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)

  def _free(self, opaque, options:BufferOptions):
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -84,9 +84,7 @@ class NVSignal(HCQSignal):

 class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method
  def __del__(self):
-    if self.binded_device is not None:
-      self.binded_device.synchronize() # Synchronize to ensure the buffer is no longer in use.
-      self.binded_device._gpu_free(self.hw_page)
+    if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True))

  @hcq_command
  def setup(self, compute_class=None, copy_class=None, local_mem_window=None, shared_mem_window=None, local_mem=None, local_mem_tpc_bytes=None):
@@ -109,7 +107,7 @@ class NVCommandQueue(HWCommandQueue): # pylint: disable=abstract-method

  def bind(self, device):
    self.binded_device = device
-    self.hw_page = cast(NVDevice, device)._gpu_alloc(len(self.q) * 4, map_to_cpu=True)
+    self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True))
    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
    for i, value in enumerate(self.q): hw_view[i] = value