diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index a10fb38f9f..bac90a100a 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -270,7 +270,7 @@ class AMDAllocator(HCQAllocator['AMDDevice']):
     self.dev.synchronize()
     self.dev._gpu_free(opaque)
 
-  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if hasattr(buf, '_base') else buf)
+  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if buf._base is not None else buf)
 
 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
 
@@ -289,15 +289,15 @@ class AMDDevice(HCQCompiled):
   signals_pool:List[int] = []
   gpus:List[pathlib.Path] = []
 
-  def _gpu_map(self, mem):
-    if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
-    mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
-    c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
-    stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
-                                           n_devices=len(mem.mapped_gpu_ids))
-    assert stm.n_success == len(mem.mapped_gpu_ids)
+  def _gpu_map(self, mem:HCQBuffer):
+    if self.gpu_id in getattr(mem.meta, "mapped_gpu_ids", []): return
+    mem.meta.__setattr__("mapped_gpu_ids", getattr(mem.meta, "mapped_gpu_ids", []) + [self.gpu_id])
+    c_gpus = (ctypes.c_int32 * len(mem.meta.mapped_gpu_ids))(*mem.meta.mapped_gpu_ids)
+    stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
+                                           n_devices=len(mem.meta.mapped_gpu_ids))
+    assert stm.n_success == len(mem.meta.mapped_gpu_ids)
 
-  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False):
+  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False) -> HCQBuffer:
     flags = kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
     if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED | kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT
@@ -321,16 +321,16 @@ class AMDDevice(HCQCompiled):
       buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, self.drm_fd, mem.mmap_offset)
       assert addr == buf == mem.va_addr
-    self._gpu_map(mem)
-    return mem
+    self._gpu_map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem))
+    return hcqbuf
 
-  def _gpu_free(self, mem):
-    if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
+  def _gpu_free(self, mem:HCQBuffer):
+    if len(gpus:=getattr(mem.meta, "mapped_gpu_ids", [])):
       c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
-      stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
+      stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
       assert stm.n_success == len(gpus)
     libc.munmap(mem.va_addr, mem.size)
-    kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.handle)
+    kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)
 
   def __init__(self, device:str=""):
     if AMDDevice.kfd == -1:
@@ -356,7 +356,7 @@ class AMDDevice(HCQCompiled):
       AMDDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True)
      AMDDevice.event_page = self._gpu_alloc(0x8000, uncached=True)
       AMDDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
-      kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
+      kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.meta.handle)
     else:
       self._gpu_map(AMDDevice.signals_page)
       self._gpu_map(AMDDevice.event_page)
diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index 9667745cab..9e6fc138c9 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -268,7 +268,7 @@ class NVAllocator(HCQAllocator['NVDevice']):
     self.dev.synchronize()
     self.dev._gpu_free(opaque)
 
-  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if hasattr(buf, '_base') else buf)
+  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if buf._base is not None else buf)
 
 @dataclass
 class GPFifo:
@@ -309,7 +309,7 @@ class NVDevice(HCQCompiled[NVSignal]):
     os.close(fd_dev)
     return res
 
-  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag=""):
+  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
     # Uncached memory is "system". Use huge pages only for gpu memory.
     page_size = (4 << 10) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << 10))
     size = round_up(size, page_size)
@@ -347,29 +347,29 @@ class NVDevice(HCQCompiled[NVSignal]):
 
     return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=cpu_access or host, tag=tag)
 
-  def _gpu_free(self, mem):
-    if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
-      made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.hMemory)
+  def _gpu_free(self, mem:HCQBuffer):
+    if mem.meta.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
+      made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.meta.hMemory)
       nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
       if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")
 
     self._debug_mappings.pop((mem.va_addr, mem.size))
     uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
-    if mem.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
+    if mem.meta.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
 
-  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
+  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer:
     if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
     attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))
 
     # NOTE: va_addr is set to make rawbufs compatable with HCQBuffer protocol.
     self._debug_mappings[(va_base, size)] = tag
-    return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
-                                       gpuAttributesCount=1, perGpuAttributes=attrs, va_addr=va_base, size=size, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)
+    return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root,
+      hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))
 
-  def _gpu_map(self, mem):
-    if self.gpu_uuid in mem.mapped_gpu_ids: return
-    mem.mapped_gpu_ids.append(self.gpu_uuid)
-    self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
+  def _gpu_map(self, mem:HCQBuffer):
+    if self.gpu_uuid in mem.meta.mapped_gpu_ids: return
+    mem.meta.mapped_gpu_ids.append(self.gpu_uuid)
+    self._gpu_uvm_map(mem.va_addr, mem.size, mem.meta.hMemory, create_range=False, tag="p2p mem")
 
   def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
     return NVDevice.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVDevice.uvm_vaddr_allocator.alloc(size, alignment)
@@ -447,7 +447,7 @@
 
     rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)
 
-    self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
+    self.cmdq_page:HCQBuffer = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
     self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, start=self.cmdq_page.va_addr, wrap=True)
     self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
 
@@ -463,9 +463,9 @@
 
   def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, enable_debug=False) -> GPFifo:
     notifier = self._gpu_alloc(48 << 20, uncached=True)
-    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
+    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
       gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
-      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
+      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
     gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
     comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
     rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py
index 68d7a257d1..26019baa5a 100644
--- a/tinygrad/runtime/ops_qcom.py
+++ b/tinygrad/runtime/ops_qcom.py
@@ -8,7 +8,7 @@ from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQComp
 from tinygrad.runtime.autogen import kgsl, adreno, libc
 from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
 from tinygrad.renderer.cstyle import QCOMRenderer
-from tinygrad.helpers import getenv, from_mv, mv_address, to_mv, round_up, data64_le, prod, fromimport
+from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport
 if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl  # noqa: F401  # pylint: disable=unused-import
 
 BUFTYPE_BUF, BUFTYPE_TEX, BUFTYPE_IBO = 0, 1, 2
@@ -179,9 +179,10 @@ class QCOMArgsState(HCQArgsState):
     for cnst_val, cnst_off, cnst_sz in prg.consts_info: to_mv(self.ptr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')
 
     if prg.samp_cnt > 0: to_mv(self.ptr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
-    for i, b in enumerate(cast(List[QCOMBuffer], bufs)):
-      if prg.buf_info[i].type is BUFTYPE_TEX: to_mv(self.ptr + prg.buf_info[i].offset, len(b.desc) * 4).cast('I')[:] = array.array('I', b.desc)
-      elif prg.buf_info[i].type is BUFTYPE_IBO: to_mv(self.ptr + prg.buf_info[i].offset, len(b.ibo) * 4).cast('I')[:] = array.array('I', b.ibo)
+    for i, b in enumerate(bufs):
+      if prg.buf_info[i].type in {BUFTYPE_TEX, BUFTYPE_IBO}:
+        obj = b.texture_info.desc if prg.buf_info[i].type is BUFTYPE_TEX else b.texture_info.ibo
+        to_mv(self.ptr + prg.buf_info[i].offset, len(obj) * 4).cast('I')[:] = array.array('I', obj)
       else: self.update_buffer(i, b)
     for i, v in enumerate(vals): self.update_var(i, v)
 
@@ -269,15 +270,13 @@
   def __del__(self):
     if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferSpec(cpu_access=True, nolru=True))
 
-class QCOMBuffer(HCQBuffer):
-  def __init__(self, va_addr:int, size:int, info=None, mapped=False, desc=None, ibo=None, pitch=None, real_stride=None, **kwargs):
-    self.va_addr, self.size, self.info, self.mapped = va_addr, size, info, mapped
-
-    # Texture specific definitions
-    self.desc, self.ibo, self.pitch, self.real_stride = [0] * 16, [0] * 16, pitch, real_stride
+class QCOMTextureInfo:
+  def __init__(self, pitch:int, real_stride:int, desc:List[int], ibo:List[int]):
+    self.pitch, self.real_stride, self.desc, self.ibo = pitch, real_stride, desc, ibo
 
 class QCOMAllocator(HCQAllocatorBase):
   def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
+    # Recalculate real size for texture
     if options.image is not None:
       imgw, imgh, itemsize_log = options.image.shape[1], options.image.shape[0], int(math.log2(options.image.itemsize))
       pitchalign = max(6, 11 - int(math.log2(imgh))) if imgh > 1 else 6
@@ -286,22 +285,18 @@
       granularity = 128 if options.image.itemsize == 4 else 256
       pitch_add = (1 << pitchalign) if min(next_power2(imgw), round_up(imgw, granularity)) - align_up + 1 <= imgw and imgw > granularity//2 else 0
       pitch = round_up((real_stride:=imgw * 4 * options.image.itemsize), 1 << pitchalign) + pitch_add
+      size = pitch * imgh
 
-      if options.external_ptr: texture = QCOMBuffer(options.external_ptr, size)
-      else: texture = self.dev._gpu_alloc(pitch * imgh, kgsl.KGSL_MEMTYPE_TEXTURE)
-
-      texture.pitch, texture.real_stride = pitch, real_stride
+    buf = HCQBuffer(options.external_ptr, size) if options.external_ptr else self.dev._gpu_alloc(size)
+
+    if options.image is not None:
       tex_fmt = adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT
-      texture.desc[0] = qreg.a6xx_tex_const_0(0x8, swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt)
-      texture.desc[1] = qreg.a6xx_tex_const_1(width=imgw, height=imgh)
-      texture.desc[2] = qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=texture.pitch, pitchalign=pitchalign-6)
-      texture.desc[4:8] = [*data64_le(texture.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13)]
-      texture.ibo = [texture.desc[0] & (~0xffff), *texture.desc[1:len(texture.desc)]]
+      desc = [qreg.a6xx_tex_const_0(0x8, swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt), qreg.a6xx_tex_const_1(width=imgw, height=imgh),
+              qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=pitch, pitchalign=pitchalign-6), 0,
+              *data64_le(buf.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13)]
 
-      return texture
-
-    return QCOMBuffer(options.external_ptr, size) if options.external_ptr else self.dev._gpu_alloc(size)
+      buf.texture_info = QCOMTextureInfo(pitch, real_stride, desc, [desc[0] & (~0xffff), *desc[1:len(desc)]])
+    return buf
 
   def _do_copy(self, src_addr, dest_addr, src_size, real_size, src_stride, dest_stride, dest_off=0, src_off=0):
     while src_off < src_size:
@@ -309,13 +304,14 @@
       src_off, dest_off = src_off+src_stride, dest_off+dest_stride
 
   def _copyin(self, dest:HCQBuffer, src:memoryview):
-    if (qd:=cast(QCOMBuffer, dest)).pitch is not None: self._do_copy(mv_address(src), qd.va_addr, len(src), qd.real_stride, qd.real_stride, qd.pitch)
-    else: ctypes.memmove(dest.va_addr, mv_address(src), src.nbytes)
+    stride, pitch = (src.nbytes, src.nbytes) if (ti:=cast(QCOMTextureInfo, dest.texture_info)) is None else (ti.real_stride, ti.pitch)
+    self._do_copy(mv_address(src), dest.va_addr, src.nbytes, stride, stride, pitch)
 
   def _copyout(self, dest:memoryview, src:HCQBuffer):
     self.dev.synchronize()
-    if (qs:=cast(QCOMBuffer, src)).pitch is not None: self._do_copy(qs.va_addr, mv_address(dest), qs.size, qs.real_stride, qs.pitch, qs.real_stride)
-    else: ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes)
+
+    stride, pitch = (src.size, src.size) if (ti:=cast(QCOMTextureInfo, src.texture_info)) is None else (ti.real_stride, ti.pitch)
+    self._do_copy(src.va_addr, mv_address(dest), src.size, stride, pitch, stride)
 
   def _as_buffer(self, src:HCQBuffer) -> memoryview:
     self.dev.synchronize()
@@ -361,7 +357,7 @@
     super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self), QCOMSignal,
                      QCOMComputeQueue, None)
 
-  def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False):
+  def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False) -> HCQBuffer:
     flags |= kgsl.KGSL_MEMALIGN(alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
     if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED)
 
@@ -369,11 +365,11 @@
     va_addr = libc.mmap(0, bosz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, self.fd, alloc.id * 0x1000)
 
     if fill_zeroes: ctypes.memset(va_addr, 0, size)
-    return QCOMBuffer(va_addr=va_addr, size=size, info=alloc)
+    return HCQBuffer(va_addr=va_addr, size=size, meta=alloc)
 
-  def _gpu_free(self, mem):
-    kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.info.id)
-    libc.munmap(mem.va_addr, mem.info.mmapsize)
+  def _gpu_free(self, mem:HCQBuffer):
+    kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta.id)
+    libc.munmap(mem.va_addr, mem.meta.mmapsize)
 
   def _ensure_stack_size(self, sz):
     if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz)
diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py
index c35fc22315..608cd6525a 100644
--- a/tinygrad/runtime/support/hcq.py
+++ b/tinygrad/runtime/support/hcq.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import List, Optional, Dict, Tuple, cast, Protocol, Type, Union, TypeVar, Generic, Any
+from typing import List, Optional, Dict, Tuple, cast, Type, Union, TypeVar, Generic, Any
 import contextlib, decimal, statistics, random, json, atexit, time, ctypes, array
 from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv, round_up
 from tinygrad.renderer import Renderer
@@ -358,7 +358,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
     super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
 
     self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
-    self.kernargs_alloctor = BumpAllocator(self.kernargs_page.size, start=self.kernargs_page.va_addr, wrap=True)
+    self.kernargs_alloctor:BumpAllocator = BumpAllocator(self.kernargs_page.size, start=self.kernargs_page.va_addr, wrap=True)
     self.devices.append(self)
 
   def synchronize(self):
@@ -448,8 +448,9 @@ class HCQCompiled(Compiled, Generic[SignalType]):
     self.timeline_signal.value = 0
     cast(HCQAllocatorBase, self.allocator).b_timeline = [0] * len(cast(HCQAllocatorBase, self.allocator).b)
 
-# Protocol for hcq compatible allocators for allocated buffers to contain VA address and it's size.
-class HCQBuffer(Protocol): va_addr:int; size:int # noqa: E702
+class HCQBuffer:
+  def __init__(self, va_addr:int, size:int, texture_info:Any=None, meta:Any=None, _base:Optional[HCQBuffer]=None):
+    self.va_addr, self.size, self.texture_info, self.meta, self._base = va_addr, size, texture_info, meta, _base
 
 class HCQAllocatorBase(LRUAllocator, Generic[DeviceType]):
   """
@@ -467,8 +468,7 @@ class HCQAllocatorBase(LRUAllocator, Generic[DeviceType]):
   def map(self, buf:HCQBuffer): pass
 
   def _offset(self, buf, size:int, offset:int) -> HCQBuffer:
-    return type(buf)(va_addr=buf.va_addr + offset, size=size, **{k:v for k,v in buf.__dict__.items() if k not in ['va_addr', 'size']},
-                     **{x[0]:getattr(buf, x[0]) for x in getattr(buf, '_fields_', []) if x[0] not in ['va_addr', 'size']}, _base=buf)
+    return HCQBuffer(va_addr=buf.va_addr + offset, size=size, texture_info=buf.texture_info, meta=buf.meta, _base=buf._base or buf)
 
 class HCQAllocator(HCQAllocatorBase, Generic[DeviceType]):
   def _copyin(self, dest:HCQBuffer, src:memoryview):
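
Note on the core change, for readers of this diff: HCQBuffer stops being a typing.Protocol that each runtime satisfied with its own duck-typed class (QCOMBuffer, kfd ioctl structs, nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) and becomes one concrete container. Driver-specific state moves into `meta`, Adreno texture descriptors into `texture_info`, and every sub-buffer carries a `_base` reference to its root allocation, which is why the allocators' map() can now test `buf._base is not None` instead of `hasattr(buf, '_base')`. Below is a minimal standalone sketch of that contract; `offset_view` is a hypothetical stand-in for HCQAllocatorBase._offset above, and the `meta` string is a placeholder for a real driver handle.

from __future__ import annotations
from typing import Any, Optional

class HCQBuffer:
  def __init__(self, va_addr:int, size:int, texture_info:Any=None, meta:Any=None, _base:Optional[HCQBuffer]=None):
    self.va_addr, self.size, self.texture_info, self.meta, self._base = va_addr, size, texture_info, meta, _base

def offset_view(buf:HCQBuffer, size:int, offset:int) -> HCQBuffer:
  # Mirrors HCQAllocatorBase._offset: a view shares texture_info/meta with its
  # parent, and `buf._base or buf` makes chained views resolve to the root
  # allocation rather than to an intermediate view.
  return HCQBuffer(va_addr=buf.va_addr + offset, size=size, texture_info=buf.texture_info, meta=buf.meta, _base=buf._base or buf)

base = HCQBuffer(va_addr=0x10000, size=0x1000, meta="driver handle placeholder")
view = offset_view(base, size=0x100, offset=0x200)
nested = offset_view(view, size=0x10, offset=0x20)

assert view.va_addr == 0x10200 and view._base is base
assert nested.va_addr == 0x10220 and nested._base is base  # root, never a view
# map() on the allocators can rely on _base always existing (default None), so
# p2p mapping is always performed on the root buffer, not on a sub-buffer view.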