hcq buffer is a class (#8106)
* hcq buffer is a class
* qcom
* no from_mv in qcom
* remove qcombuffer
* useless cast
* mypy
* qcom fix
* _md -> meta
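The gist of the change: HCQBuffer stops being a typing.Protocol that raw driver structs had to conform to and becomes a small concrete class, while each backend wraps whatever its kernel driver returns behind the generic meta field (the "_md -> meta" rename above). A minimal sketch of the pattern; FakeKFDAlloc below is a made-up stand-in for a driver allocation record, not part of the diff:

    from typing import Any, Optional

    class HCQBuffer:
      def __init__(self, va_addr:int, size:int, texture_info:Any=None, meta:Any=None, _base:Optional["HCQBuffer"]=None):
        self.va_addr, self.size, self.texture_info, self.meta, self._base = va_addr, size, texture_info, meta, _base

    class FakeKFDAlloc:  # made-up stand-in for the struct a kernel driver hands back
      def __init__(self, va_addr:int, size:int, handle:int): self.va_addr, self.size, self.handle = va_addr, size, handle

    raw = FakeKFDAlloc(va_addr=0x10000, size=0x2000, handle=42)
    buf = HCQBuffer(raw.va_addr, raw.size, meta=raw)      # what a backend _gpu_alloc now returns
    assert (buf.va_addr, buf.size) == (0x10000, 0x2000)   # generic fields live on the buffer itself
    assert buf.meta.handle == 42                          # driver-specific fields live on .meta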
@@ -270,7 +270,7 @@ class AMDAllocator(HCQAllocator['AMDDevice']):
     self.dev.synchronize()
     self.dev._gpu_free(opaque)
 
-  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if hasattr(buf, '_base') else buf)
+  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if buf._base is not None else buf)
 
 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
 
@@ -289,15 +289,15 @@ class AMDDevice(HCQCompiled):
   signals_pool:List[int] = []
   gpus:List[pathlib.Path] = []
 
-  def _gpu_map(self, mem):
-    if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
-    mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
-    c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
-    stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
-                                           n_devices=len(mem.mapped_gpu_ids))
-    assert stm.n_success == len(mem.mapped_gpu_ids)
+  def _gpu_map(self, mem:HCQBuffer):
+    if self.gpu_id in getattr(mem.meta, "mapped_gpu_ids", []): return
+    mem.meta.__setattr__("mapped_gpu_ids", getattr(mem.meta, "mapped_gpu_ids", []) + [self.gpu_id])
+    c_gpus = (ctypes.c_int32 * len(mem.meta.mapped_gpu_ids))(*mem.meta.mapped_gpu_ids)
+    stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
+                                           n_devices=len(mem.meta.mapped_gpu_ids))
+    assert stm.n_success == len(mem.meta.mapped_gpu_ids)
 
-  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False):
+  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False) -> HCQBuffer:
     flags = kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
 
     if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED | kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT
@@ -321,16 +321,16 @@ class AMDDevice(HCQCompiled):
       buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, self.drm_fd, mem.mmap_offset)
       assert addr == buf == mem.va_addr
 
-    self._gpu_map(mem)
-    return mem
+    self._gpu_map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem))
+    return hcqbuf
 
-  def _gpu_free(self, mem):
-    if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
+  def _gpu_free(self, mem:HCQBuffer):
+    if len(gpus:=getattr(mem.meta, "mapped_gpu_ids", [])):
       c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
-      stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
+      stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
       assert stm.n_success == len(gpus)
     libc.munmap(mem.va_addr, mem.size)
-    kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.handle)
+    kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)
 
   def __init__(self, device:str=""):
     if AMDDevice.kfd == -1:
@@ -356,7 +356,7 @@ class AMDDevice(HCQCompiled):
       AMDDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True)
       AMDDevice.event_page = self._gpu_alloc(0x8000, uncached=True)
      AMDDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
-      kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
+      kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.meta.handle)
     else:
       self._gpu_map(AMDDevice.signals_page)
       self._gpu_map(AMDDevice.event_page)
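Across the AMD backend the pattern is the same: _gpu_alloc wraps the KFD allocation record in an HCQBuffer (meta=mem), so driver fields such as handle and the per-GPU mapped_gpu_ids bookkeeping are reached through mem.meta, while mem.va_addr and mem.size stay generic. A rough sketch of the map-once guard, with the ioctl replaced by a stub:

    from types import SimpleNamespace

    def gpu_map(gpu_id:int, mem) -> bool:
      # mirrors AMDDevice._gpu_map: skip GPUs that already have this buffer mapped
      if gpu_id in getattr(mem.meta, "mapped_gpu_ids", []): return False
      mem.meta.mapped_gpu_ids = getattr(mem.meta, "mapped_gpu_ids", []) + [gpu_id]
      return True  # the real code issues AMDKFD_IOC_MAP_MEMORY_TO_GPU here with the full id list

    mem = SimpleNamespace(va_addr=0x10000, size=0x1000, meta=SimpleNamespace(handle=7))
    assert gpu_map(0, mem) and gpu_map(1, mem) and not gpu_map(0, mem)
    assert mem.meta.mapped_gpu_ids == [0, 1]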
@@ -268,7 +268,7 @@ class NVAllocator(HCQAllocator['NVDevice']):
     self.dev.synchronize()
     self.dev._gpu_free(opaque)
 
-  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if hasattr(buf, '_base') else buf)
+  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if buf._base is not None else buf)
 
 @dataclass
 class GPFifo:
@@ -309,7 +309,7 @@ class NVDevice(HCQCompiled[NVSignal]):
     os.close(fd_dev)
     return res
 
-  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag=""):
+  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
     # Uncached memory is "system". Use huge pages only for gpu memory.
     page_size = (4 << 10) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << 10))
     size = round_up(size, page_size)
@@ -347,29 +347,29 @@ class NVDevice(HCQCompiled[NVSignal]):
 
     return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=cpu_access or host, tag=tag)
 
-  def _gpu_free(self, mem):
-    if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
-      made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.hMemory)
+  def _gpu_free(self, mem:HCQBuffer):
+    if mem.meta.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
+      made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.meta.hMemory)
       nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
       if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")
 
     self._debug_mappings.pop((mem.va_addr, mem.size))
     uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
-    if mem.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
+    if mem.meta.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
 
-  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
+  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer:
     if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
     attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))
 
     # NOTE: va_addr is set to make rawbufs compatable with HCQBuffer protocol.
     self._debug_mappings[(va_base, size)] = tag
-    return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
-      gpuAttributesCount=1, perGpuAttributes=attrs, va_addr=va_base, size=size, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)
+    return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root,
+      hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))
 
-  def _gpu_map(self, mem):
-    if self.gpu_uuid in mem.mapped_gpu_ids: return
-    mem.mapped_gpu_ids.append(self.gpu_uuid)
-    self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
+  def _gpu_map(self, mem:HCQBuffer):
+    if self.gpu_uuid in mem.meta.mapped_gpu_ids: return
+    mem.meta.mapped_gpu_ids.append(self.gpu_uuid)
+    self._gpu_uvm_map(mem.va_addr, mem.size, mem.meta.hMemory, create_range=False, tag="p2p mem")
 
   def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
     return NVDevice.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVDevice.uvm_vaddr_allocator.alloc(size, alignment)
@@ -447,7 +447,7 @@ class NVDevice(HCQCompiled[NVSignal]):
 
     rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)
 
-    self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
+    self.cmdq_page:HCQBuffer = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
     self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, start=self.cmdq_page.va_addr, wrap=True)
     self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
 
@@ -463,9 +463,9 @@ class NVDevice(HCQCompiled[NVSignal]):
 
   def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, enable_debug=False) -> GPFifo:
     notifier = self._gpu_alloc(48 << 20, uncached=True)
-    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
+    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
       gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
-      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
+      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
     gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
     comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
     rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
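In the NV backend the UVM_MAP_EXTERNAL_ALLOCATION_PARAMS struct that used to double as the buffer object now rides along as HCQBuffer.meta, so annotations like cmdq_page become plain HCQBuffer and driver handles are read as .meta.hMemory. A small illustration of the split between the generic fields and the driver payload, with SimpleNamespace standing in for both objects:

    from types import SimpleNamespace

    # stand-in for what _gpu_uvm_map now returns: HCQBuffer(va_base, size, meta=<uvm params>)
    cmdq_page = SimpleNamespace(va_addr=0x7f0000000000, size=0x200000,
                                meta=SimpleNamespace(hMemory=0xbeef, has_cpu_mapping=True))

    # callers that only care about the VA range keep working unchanged...
    cmdq_span = (cmdq_page.va_addr, cmdq_page.va_addr + cmdq_page.size)
    # ...while ioctl-level code reaches for the driver handle explicitly
    h_userd = cmdq_page.meta.hMemory
    assert cmdq_span == (0x7f0000000000, 0x7f0000200000) and h_userd == 0xbeef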
@@ -8,7 +8,7 @@ from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQComp
 from tinygrad.runtime.autogen import kgsl, adreno, libc
 from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
 from tinygrad.renderer.cstyle import QCOMRenderer
-from tinygrad.helpers import getenv, from_mv, mv_address, to_mv, round_up, data64_le, prod, fromimport
+from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport
 if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
 
 BUFTYPE_BUF, BUFTYPE_TEX, BUFTYPE_IBO = 0, 1, 2
@@ -179,9 +179,10 @@ class QCOMArgsState(HCQArgsState):
     for cnst_val, cnst_off, cnst_sz in prg.consts_info: to_mv(self.ptr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')
 
     if prg.samp_cnt > 0: to_mv(self.ptr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
-    for i, b in enumerate(cast(List[QCOMBuffer], bufs)):
-      if prg.buf_info[i].type is BUFTYPE_TEX: to_mv(self.ptr + prg.buf_info[i].offset, len(b.desc) * 4).cast('I')[:] = array.array('I', b.desc)
-      elif prg.buf_info[i].type is BUFTYPE_IBO: to_mv(self.ptr + prg.buf_info[i].offset, len(b.ibo) * 4).cast('I')[:] = array.array('I', b.ibo)
+    for i, b in enumerate(bufs):
+      if prg.buf_info[i].type in {BUFTYPE_TEX, BUFTYPE_IBO}:
+        obj = b.texture_info.desc if prg.buf_info[i].type is BUFTYPE_TEX else b.texture_info.ibo
+        to_mv(self.ptr + prg.buf_info[i].offset, len(obj) * 4).cast('I')[:] = array.array('I', obj)
       else: self.update_buffer(i, b)
     for i, v in enumerate(vals): self.update_var(i, v)
 
@@ -269,15 +270,13 @@ class QCOMProgram(HCQProgram):
   def __del__(self):
     if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferSpec(cpu_access=True, nolru=True))
 
-class QCOMBuffer(HCQBuffer):
-  def __init__(self, va_addr:int, size:int, info=None, mapped=False, desc=None, ibo=None, pitch=None, real_stride=None, **kwargs):
-    self.va_addr, self.size, self.info, self.mapped = va_addr, size, info, mapped
-
-    # Texture specific definitions
-    self.desc, self.ibo, self.pitch, self.real_stride = [0] * 16, [0] * 16, pitch, real_stride
+class QCOMTextureInfo:
+  def __init__(self, pitch:int, real_stride:int, desc:List[int], ibo:List[int]):
+    self.pitch, self.real_stride, self.desc, self.ibo = pitch, real_stride, desc, ibo
 
 class QCOMAllocator(HCQAllocatorBase):
   def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
     # Recalculate real size for texture
     if options.image is not None:
       imgw, imgh, itemsize_log = options.image.shape[1], options.image.shape[0], int(math.log2(options.image.itemsize))
       pitchalign = max(6, 11 - int(math.log2(imgh))) if imgh > 1 else 6
@@ -286,22 +285,18 @@ class QCOMAllocator(HCQAllocatorBase):
       granularity = 128 if options.image.itemsize == 4 else 256
       pitch_add = (1 << pitchalign) if min(next_power2(imgw), round_up(imgw, granularity)) - align_up + 1 <= imgw and imgw > granularity//2 else 0
       pitch = round_up((real_stride:=imgw * 4 * options.image.itemsize), 1 << pitchalign) + pitch_add
       size = pitch * imgh
 
-      if options.external_ptr: texture = QCOMBuffer(options.external_ptr, size)
-      else: texture = self.dev._gpu_alloc(pitch * imgh, kgsl.KGSL_MEMTYPE_TEXTURE)
-
-      texture.pitch, texture.real_stride = pitch, real_stride
+    buf = HCQBuffer(options.external_ptr, size) if options.external_ptr else self.dev._gpu_alloc(size)
 
+    if options.image is not None:
       tex_fmt = adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT
-      texture.desc[0] = qreg.a6xx_tex_const_0(0x8, swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt)
-      texture.desc[1] = qreg.a6xx_tex_const_1(width=imgw, height=imgh)
-      texture.desc[2] = qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=texture.pitch, pitchalign=pitchalign-6)
-      texture.desc[4:8] = [*data64_le(texture.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13)]
-      texture.ibo = [texture.desc[0] & (~0xffff), *texture.desc[1:len(texture.desc)]]
+      desc = [qreg.a6xx_tex_const_0(0x8, swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt), qreg.a6xx_tex_const_1(width=imgw, height=imgh),
+              qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=pitch, pitchalign=pitchalign-6), 0,
+              *data64_le(buf.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13)]
 
-      return texture
-
-    return QCOMBuffer(options.external_ptr, size) if options.external_ptr else self.dev._gpu_alloc(size)
+      buf.texture_info = QCOMTextureInfo(pitch, real_stride, desc, [desc[0] & (~0xffff), *desc[1:len(desc)]])
+    return buf
 
   def _do_copy(self, src_addr, dest_addr, src_size, real_size, src_stride, dest_stride, dest_off=0, src_off=0):
     while src_off < src_size:
@@ -309,13 +304,14 @@ class QCOMAllocator(HCQAllocatorBase):
       src_off, dest_off = src_off+src_stride, dest_off+dest_stride
 
   def _copyin(self, dest:HCQBuffer, src:memoryview):
-    if (qd:=cast(QCOMBuffer, dest)).pitch is not None: self._do_copy(mv_address(src), qd.va_addr, len(src), qd.real_stride, qd.real_stride, qd.pitch)
-    else: ctypes.memmove(dest.va_addr, mv_address(src), src.nbytes)
+    stride, pitch = (src.nbytes, src.nbytes) if (ti:=cast(QCOMTextureInfo, dest.texture_info)) is None else (ti.real_stride, ti.pitch)
+    self._do_copy(mv_address(src), dest.va_addr, src.nbytes, stride, stride, pitch)
 
   def _copyout(self, dest:memoryview, src:HCQBuffer):
     self.dev.synchronize()
-    if (qs:=cast(QCOMBuffer, src)).pitch is not None: self._do_copy(qs.va_addr, mv_address(dest), qs.size, qs.real_stride, qs.pitch, qs.real_stride)
-    else: ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes)
+
+    stride, pitch = (src.size, src.size) if (ti:=cast(QCOMTextureInfo, src.texture_info)) is None else (ti.real_stride, ti.pitch)
+    self._do_copy(src.va_addr, mv_address(dest), src.size, stride, pitch, stride)
 
   def _as_buffer(self, src:HCQBuffer) -> memoryview:
     self.dev.synchronize()
@@ -361,7 +357,7 @@ class QCOMDevice(HCQCompiled):
     super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self),
                      QCOMSignal, QCOMComputeQueue, None)
 
-  def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False):
+  def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False) -> HCQBuffer:
    flags |= kgsl.KGSL_MEMALIGN(alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
    if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED)
 
@@ -369,11 +365,11 @@ class QCOMDevice(HCQCompiled):
     va_addr = libc.mmap(0, bosz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, self.fd, alloc.id * 0x1000)
 
     if fill_zeroes: ctypes.memset(va_addr, 0, size)
-    return QCOMBuffer(va_addr=va_addr, size=size, info=alloc)
+    return HCQBuffer(va_addr=va_addr, size=size, meta=alloc)
 
-  def _gpu_free(self, mem):
-    kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.info.id)
-    libc.munmap(mem.va_addr, mem.info.mmapsize)
+  def _gpu_free(self, mem:HCQBuffer):
+    kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta.id)
+    libc.munmap(mem.va_addr, mem.meta.mmapsize)
 
   def _ensure_stack_size(self, sz):
     if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz)
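On QCOM the QCOMBuffer subclass disappears: texture state (pitch, real_stride, desc, ibo) moves into a QCOMTextureInfo hung off buf.texture_info, and _copyin/_copyout collapse into a single strided _do_copy call, because a linear buffer degenerates to stride == pitch == nbytes and the loop runs exactly once. A self-contained sketch of that copy path; do_copy and copyin below are local stand-ins with made-up sizes, operating on bytearrays instead of raw pointers:

    class QCOMTextureInfo:
      def __init__(self, pitch:int, real_stride:int, desc=None, ibo=None):
        self.pitch, self.real_stride, self.desc, self.ibo = pitch, real_stride, desc, ibo

    def do_copy(src:bytes, dest:bytearray, src_size:int, real_size:int, src_stride:int, dest_stride:int):
      src_off = dest_off = 0
      while src_off < src_size:                       # one row per iteration, like QCOMAllocator._do_copy
        dest[dest_off:dest_off+real_size] = src[src_off:src_off+real_size]
        src_off, dest_off = src_off+src_stride, dest_off+dest_stride

    def copyin(texture_info, dest_buf:bytearray, src:bytes):
      stride, pitch = (len(src), len(src)) if texture_info is None else (texture_info.real_stride, texture_info.pitch)
      do_copy(src, dest_buf, len(src), stride, stride, pitch)

    ti = QCOMTextureInfo(pitch=8, real_stride=4)      # rows of 4 payload bytes padded out to 8
    vram = bytearray(16)
    copyin(ti, vram, bytes(range(8)))                 # two rows land at offsets 0 and 8
    assert bytes(vram) == bytes([0,1,2,3, 0,0,0,0, 4,5,6,7, 0,0,0,0])
    plain = bytearray(8); copyin(None, plain, bytes(range(8)))
    assert bytes(plain) == bytes(range(8))            # linear buffer: a single full-size copy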
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import List, Optional, Dict, Tuple, cast, Protocol, Type, Union, TypeVar, Generic, Any
+from typing import List, Optional, Dict, Tuple, cast, Type, Union, TypeVar, Generic, Any
 import contextlib, decimal, statistics, random, json, atexit, time, ctypes, array
 from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv, round_up
 from tinygrad.renderer import Renderer
@@ -358,7 +358,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
     super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
 
     self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
-    self.kernargs_alloctor = BumpAllocator(self.kernargs_page.size, start=self.kernargs_page.va_addr, wrap=True)
+    self.kernargs_alloctor:BumpAllocator = BumpAllocator(self.kernargs_page.size, start=self.kernargs_page.va_addr, wrap=True)
     self.devices.append(self)
 
   def synchronize(self):
@@ -448,8 +448,9 @@ class HCQCompiled(Compiled, Generic[SignalType]):
     self.timeline_signal.value = 0
     cast(HCQAllocatorBase, self.allocator).b_timeline = [0] * len(cast(HCQAllocatorBase, self.allocator).b)
 
-# Protocol for hcq compatible allocators for allocated buffers to contain VA address and it's size.
-class HCQBuffer(Protocol): va_addr:int; size:int # noqa: E702
+class HCQBuffer:
+  def __init__(self, va_addr:int, size:int, texture_info:Any=None, meta:Any=None, _base:Optional[HCQBuffer]=None):
+    self.va_addr, self.size, self.texture_info, self.meta, self._base = va_addr, size, texture_info, meta, _base
 
 class HCQAllocatorBase(LRUAllocator, Generic[DeviceType]):
   """
@@ -467,8 +468,7 @@ class HCQAllocatorBase(LRUAllocator, Generic[DeviceType]):
   def map(self, buf:HCQBuffer): pass
 
   def _offset(self, buf, size:int, offset:int) -> HCQBuffer:
-    return type(buf)(va_addr=buf.va_addr + offset, size=size, **{k:v for k,v in buf.__dict__.items() if k not in ['va_addr', 'size']},
-                     **{x[0]:getattr(buf, x[0]) for x in getattr(buf, '_fields_', []) if x[0] not in ['va_addr', 'size']}, _base=buf)
+    return HCQBuffer(va_addr=buf.va_addr + offset, size=size, texture_info=buf.texture_info, meta=buf.meta, _base=buf._base or buf)
 
 class HCQAllocator(HCQAllocatorBase, Generic[DeviceType]):
   def _copyin(self, dest:HCQBuffer, src:memoryview):
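The _offset change is what makes the new map() in the AMD/NV allocators safe: a sub-buffer view produced by _offset shares texture_info and meta with its parent and records the real allocation in _base (note buf._base or buf, which keeps views of views pointing at the root), so map() can dispatch on buf._base is not None instead of hasattr. A runnable sketch; offset_view is a local stand-in mirroring HCQAllocatorBase._offset:

    from typing import Any, Optional

    class HCQBuffer:  # as defined in the hunk above
      def __init__(self, va_addr:int, size:int, texture_info:Any=None, meta:Any=None, _base:Optional["HCQBuffer"]=None):
        self.va_addr, self.size, self.texture_info, self.meta, self._base = va_addr, size, texture_info, meta, _base

    def offset_view(buf:HCQBuffer, size:int, offset:int) -> HCQBuffer:  # mirrors _offset
      return HCQBuffer(va_addr=buf.va_addr + offset, size=size, texture_info=buf.texture_info, meta=buf.meta, _base=buf._base or buf)

    base = HCQBuffer(va_addr=0x10000, size=0x1000, meta={"handle": 7})
    view = offset_view(base, size=0x100, offset=0x80)
    nested = offset_view(view, size=0x10, offset=0x8)
    assert view._base is base and nested._base is base         # views of views still point at the root allocation
    for b in (base, view, nested):
      target = b._base if b._base is not None else b            # exactly what map() now does
      assert target is base and target.meta["handle"] == 7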