hcq buffer is a class (#8106)

* hcq buffer is a class
* qcom
* no from_mv in qcom
* remove qcombuffer
* useless cast
* mypy
* qcom fix
* _md -> meta
Author: nimlgen
Date: 2024-12-08 13:29:43 +03:00
Committed by: GitHub
Parent: b9c977f1c8
Commit: d6e66095fd
4 changed files with 65 additions and 69 deletions
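In short: HCQBuffer stops being a typing.Protocol that each backend's raw allocation object had to structurally satisfy and becomes a concrete class. Backend-specific allocation records now ride along in a `meta` field, texture descriptors move to `texture_info`, and sub-buffer views keep a `_base` link to their root allocation. A minimal sketch of the new shape (the class body is copied from the hcq.py hunk below; the example values are made up):

from typing import Any, Optional

class HCQBuffer:
  def __init__(self, va_addr:int, size:int, texture_info:Any=None, meta:Any=None, _base:Optional["HCQBuffer"]=None):
    self.va_addr, self.size, self.texture_info, self.meta, self._base = va_addr, size, texture_info, meta, _base

kfd_alloc = object()  # stand-in for a driver allocation record
buf = HCQBuffer(va_addr=0x7f0000000000, size=0x1000, meta=kfd_alloc)   # hypothetical values
sub = HCQBuffer(buf.va_addr + 0x100, 0x100, meta=buf.meta, _base=buf)  # what _offset() now builds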


@@ -270,7 +270,7 @@ class AMDAllocator(HCQAllocator['AMDDevice']):
     self.dev.synchronize()
     self.dev._gpu_free(opaque)
-  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if hasattr(buf, '_base') else buf)
+  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if buf._base is not None else buf)
 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
@@ -289,15 +289,15 @@ class AMDDevice(HCQCompiled):
   signals_pool:List[int] = []
   gpus:List[pathlib.Path] = []
-  def _gpu_map(self, mem):
-    if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
-    mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
-    c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
-    stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
-                                           n_devices=len(mem.mapped_gpu_ids))
-    assert stm.n_success == len(mem.mapped_gpu_ids)
+  def _gpu_map(self, mem:HCQBuffer):
+    if self.gpu_id in getattr(mem.meta, "mapped_gpu_ids", []): return
+    mem.meta.__setattr__("mapped_gpu_ids", getattr(mem.meta, "mapped_gpu_ids", []) + [self.gpu_id])
+    c_gpus = (ctypes.c_int32 * len(mem.meta.mapped_gpu_ids))(*mem.meta.mapped_gpu_ids)
+    stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
+                                           n_devices=len(mem.meta.mapped_gpu_ids))
+    assert stm.n_success == len(mem.meta.mapped_gpu_ids)
-  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False):
+  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False) -> HCQBuffer:
     flags = kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
     if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED | kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT
@@ -321,16 +321,16 @@ class AMDDevice(HCQCompiled):
       buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, self.drm_fd, mem.mmap_offset)
       assert addr == buf == mem.va_addr
-    self._gpu_map(mem)
-    return mem
+    self._gpu_map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem))
+    return hcqbuf
-  def _gpu_free(self, mem):
-    if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
+  def _gpu_free(self, mem:HCQBuffer):
+    if len(gpus:=getattr(mem.meta, "mapped_gpu_ids", [])):
       c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
-      stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
+      stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
       assert stm.n_success == len(gpus)
     libc.munmap(mem.va_addr, mem.size)
-    kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.handle)
+    kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)
   def __init__(self, device:str=""):
     if AMDDevice.kfd == -1:
@@ -356,7 +356,7 @@ class AMDDevice(HCQCompiled):
       AMDDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True)
      AMDDevice.event_page = self._gpu_alloc(0x8000, uncached=True)
       AMDDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
-      kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle)
+      kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.meta.handle)
     else:
       self._gpu_map(AMDDevice.signals_page)
       self._gpu_map(AMDDevice.event_page)
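The new `map()` body makes sense once `_offset` sub-buffers are in the picture: a view shares its base allocation, so the P2P map ioctl must target the base, not the view. A small illustration with stand-in objects (no driver calls; HCQBuffer as sketched above):

base = HCQBuffer(0x1000, 0x100, meta="kfd allocation record")   # hypothetical
view = HCQBuffer(base.va_addr + 0x40, 0x20, meta=base.meta, _base=base)
target = view._base if view._base is not None else view
assert target is base  # _gpu_map() maps the whole underlying allocation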


@@ -268,7 +268,7 @@ class NVAllocator(HCQAllocator['NVDevice']):
     self.dev.synchronize()
     self.dev._gpu_free(opaque)
-  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if hasattr(buf, '_base') else buf)
+  def map(self, buf:HCQBuffer): self.dev._gpu_map(buf._base if buf._base is not None else buf)
 @dataclass
 class GPFifo:
@@ -309,7 +309,7 @@ class NVDevice(HCQCompiled[NVSignal]):
     os.close(fd_dev)
     return res
-  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag=""):
+  def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
     # Uncached memory is "system". Use huge pages only for gpu memory.
     page_size = (4 << 10) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << 10))
     size = round_up(size, page_size)
@@ -347,29 +347,29 @@ class NVDevice(HCQCompiled[NVSignal]):
     return self._gpu_uvm_map(va_addr, size, mem_handle, has_cpu_mapping=cpu_access or host, tag=tag)
-  def _gpu_free(self, mem):
-    if mem.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
-      made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.hMemory)
+  def _gpu_free(self, mem:HCQBuffer):
+    if mem.meta.hMemory > NVDevice.host_object_enumerator: # not a host object, clear phys mem.
+      made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.nvdevice, hObjectOld=mem.meta.hMemory)
       nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
       if made.status != 0: raise RuntimeError(f"_gpu_free returned {get_error_str(made.status)}")
     self._debug_mappings.pop((mem.va_addr, mem.size))
     uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
-    if mem.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
+    if mem.meta.has_cpu_mapping: libc.munmap(mem.va_addr, mem.size)
-  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
+  def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False, tag="") -> HCQBuffer:
     if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
     attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))
-    # NOTE: va_addr is set to make rawbufs compatable with HCQBuffer protocol.
     self._debug_mappings[(va_base, size)] = tag
-    return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
-                                       gpuAttributesCount=1, perGpuAttributes=attrs, va_addr=va_base, size=size, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping)
+    return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root,
+      hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs, mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))
-  def _gpu_map(self, mem):
-    if self.gpu_uuid in mem.mapped_gpu_ids: return
-    mem.mapped_gpu_ids.append(self.gpu_uuid)
-    self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False, tag="p2p mem")
+  def _gpu_map(self, mem:HCQBuffer):
+    if self.gpu_uuid in mem.meta.mapped_gpu_ids: return
+    mem.meta.mapped_gpu_ids.append(self.gpu_uuid)
+    self._gpu_uvm_map(mem.va_addr, mem.size, mem.meta.hMemory, create_range=False, tag="p2p mem")
   def _alloc_gpu_vaddr(self, size, alignment=(4 << 10), force_low=False):
     return NVDevice.low_uvm_vaddr_allocator.alloc(size, alignment) if force_low else NVDevice.uvm_vaddr_allocator.alloc(size, alignment)
@@ -447,7 +447,7 @@ class NVDevice(HCQCompiled[NVSignal]):
     rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)
-    self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
+    self.cmdq_page:HCQBuffer = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq")
     self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, start=self.cmdq_page.va_addr, wrap=True)
     self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
@@ -463,9 +463,9 @@ class NVDevice(HCQCompiled[NVSignal]):
   def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, enable_debug=False) -> GPFifo:
     notifier = self._gpu_alloc(48 << 20, uncached=True)
-    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
+    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
       gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
-      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
+      hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
     gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
     comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
     rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
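Before this change, `_gpu_uvm_map` stuffed `va_addr` and `size` into the UVM ioctl params struct so the struct itself could pass as an HCQBuffer (see the removed NOTE above); now that struct is ordinary metadata. Roughly, with a hypothetical stand-in for the params struct:

class FakeUvmParams:  # stand-in for nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS
  def __init__(self, hMemory:int): self.hMemory, self.mapped_gpu_ids, self.has_cpu_mapping = hMemory, [], False

params = FakeUvmParams(hMemory=0xcafe0001)  # hypothetical handle
nvbuf = HCQBuffer(va_addr=0x200000000, size=0x200000, meta=params)
assert nvbuf.meta.hMemory == 0xcafe0001     # driver code now reads handles via .meta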


@@ -8,7 +8,7 @@ from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQComp
 from tinygrad.runtime.autogen import kgsl, adreno, libc
 from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
 from tinygrad.renderer.cstyle import QCOMRenderer
-from tinygrad.helpers import getenv, from_mv, mv_address, to_mv, round_up, data64_le, prod, fromimport
+from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport
 if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
 BUFTYPE_BUF, BUFTYPE_TEX, BUFTYPE_IBO = 0, 1, 2
@@ -179,9 +179,10 @@ class QCOMArgsState(HCQArgsState):
     for cnst_val, cnst_off, cnst_sz in prg.consts_info: to_mv(self.ptr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')
     if prg.samp_cnt > 0: to_mv(self.ptr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
-    for i, b in enumerate(cast(List[QCOMBuffer], bufs)):
-      if prg.buf_info[i].type is BUFTYPE_TEX: to_mv(self.ptr + prg.buf_info[i].offset, len(b.desc) * 4).cast('I')[:] = array.array('I', b.desc)
-      elif prg.buf_info[i].type is BUFTYPE_IBO: to_mv(self.ptr + prg.buf_info[i].offset, len(b.ibo) * 4).cast('I')[:] = array.array('I', b.ibo)
+    for i, b in enumerate(bufs):
+      if prg.buf_info[i].type in {BUFTYPE_TEX, BUFTYPE_IBO}:
+        obj = b.texture_info.desc if prg.buf_info[i].type is BUFTYPE_TEX else b.texture_info.ibo
+        to_mv(self.ptr + prg.buf_info[i].offset, len(obj) * 4).cast('I')[:] = array.array('I', obj)
       else: self.update_buffer(i, b)
     for i, v in enumerate(vals): self.update_var(i, v)
@@ -269,15 +270,13 @@ class QCOMProgram(HCQProgram):
   def __del__(self):
     if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferSpec(cpu_access=True, nolru=True))
-class QCOMBuffer(HCQBuffer):
-  def __init__(self, va_addr:int, size:int, info=None, mapped=False, desc=None, ibo=None, pitch=None, real_stride=None, **kwargs):
-    self.va_addr, self.size, self.info, self.mapped = va_addr, size, info, mapped
-    # Texture specific definitions
-    self.desc, self.ibo, self.pitch, self.real_stride = [0] * 16, [0] * 16, pitch, real_stride
+class QCOMTextureInfo:
+  def __init__(self, pitch:int, real_stride:int, desc:List[int], ibo:List[int]):
+    self.pitch, self.real_stride, self.desc, self.ibo = pitch, real_stride, desc, ibo
 class QCOMAllocator(HCQAllocatorBase):
   def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
     # Recalculate real size for texture
     if options.image is not None:
       imgw, imgh, itemsize_log = options.image.shape[1], options.image.shape[0], int(math.log2(options.image.itemsize))
       pitchalign = max(6, 11 - int(math.log2(imgh))) if imgh > 1 else 6
@@ -286,22 +285,18 @@ class QCOMAllocator(HCQAllocatorBase):
       granularity = 128 if options.image.itemsize == 4 else 256
       pitch_add = (1 << pitchalign) if min(next_power2(imgw), round_up(imgw, granularity)) - align_up + 1 <= imgw and imgw > granularity//2 else 0
       pitch = round_up((real_stride:=imgw * 4 * options.image.itemsize), 1 << pitchalign) + pitch_add
+      size = pitch * imgh
-      if options.external_ptr: texture = QCOMBuffer(options.external_ptr, size)
-      else: texture = self.dev._gpu_alloc(pitch * imgh, kgsl.KGSL_MEMTYPE_TEXTURE)
-      texture.pitch, texture.real_stride = pitch, real_stride
+    buf = HCQBuffer(options.external_ptr, size) if options.external_ptr else self.dev._gpu_alloc(size)
+    if options.image is not None:
       tex_fmt = adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT
-      texture.desc[0] = qreg.a6xx_tex_const_0(0x8, swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt)
-      texture.desc[1] = qreg.a6xx_tex_const_1(width=imgw, height=imgh)
-      texture.desc[2] = qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=texture.pitch, pitchalign=pitchalign-6)
-      texture.desc[4:8] = [*data64_le(texture.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13)]
-      texture.ibo = [texture.desc[0] & (~0xffff), *texture.desc[1:len(texture.desc)]]
+      desc = [qreg.a6xx_tex_const_0(0x8, swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt), qreg.a6xx_tex_const_1(width=imgw, height=imgh),
+              qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=pitch, pitchalign=pitchalign-6), 0,
+              *data64_le(buf.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13)]
-      return texture
-    return QCOMBuffer(options.external_ptr, size) if options.external_ptr else self.dev._gpu_alloc(size)
+      buf.texture_info = QCOMTextureInfo(pitch, real_stride, desc, [desc[0] & (~0xffff), *desc[1:len(desc)]])
+    return buf
@@ -309,13 +304,14 @@ class QCOMAllocator(HCQAllocatorBase):
       src_off, dest_off = src_off+src_stride, dest_off+dest_stride
   def _copyin(self, dest:HCQBuffer, src:memoryview):
-    if (qd:=cast(QCOMBuffer, dest)).pitch is not None: self._do_copy(mv_address(src), qd.va_addr, len(src), qd.real_stride, qd.real_stride, qd.pitch)
-    else: ctypes.memmove(dest.va_addr, mv_address(src), src.nbytes)
+    stride, pitch = (src.nbytes, src.nbytes) if (ti:=cast(QCOMTextureInfo, dest.texture_info)) is None else (ti.real_stride, ti.pitch)
+    self._do_copy(mv_address(src), dest.va_addr, src.nbytes, stride, stride, pitch)
   def _copyout(self, dest:memoryview, src:HCQBuffer):
     self.dev.synchronize()
-    if (qs:=cast(QCOMBuffer, src)).pitch is not None: self._do_copy(qs.va_addr, mv_address(dest), qs.size, qs.real_stride, qs.pitch, qs.real_stride)
-    else: ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes)
+    stride, pitch = (src.size, src.size) if (ti:=cast(QCOMTextureInfo, src.texture_info)) is None else (ti.real_stride, ti.pitch)
+    self._do_copy(src.va_addr, mv_address(dest), src.size, stride, pitch, stride)
   def _as_buffer(self, src:HCQBuffer) -> memoryview:
     self.dev.synchronize()
@@ -361,7 +357,7 @@ class QCOMDevice(HCQCompiled):
     super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self),
                      QCOMSignal, QCOMComputeQueue, None)
-  def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False):
+  def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False) -> HCQBuffer:
     flags |= kgsl.KGSL_MEMALIGN(alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
     if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED)
@@ -369,11 +365,11 @@ class QCOMDevice(HCQCompiled):
     va_addr = libc.mmap(0, bosz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, self.fd, alloc.id * 0x1000)
     if fill_zeroes: ctypes.memset(va_addr, 0, size)
-    return QCOMBuffer(va_addr=va_addr, size=size, info=alloc)
+    return HCQBuffer(va_addr=va_addr, size=size, meta=alloc)
-  def _gpu_free(self, mem):
-    kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.info.id)
-    libc.munmap(mem.va_addr, mem.info.mmapsize)
+  def _gpu_free(self, mem:HCQBuffer):
+    kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta.id)
+    libc.munmap(mem.va_addr, mem.meta.mmapsize)
   def _ensure_stack_size(self, sz):
     if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz)
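With QCOMBuffer gone, `_copyin`/`_copyout` no longer branch between a texture path and a `ctypes.memmove` path: for a plain buffer, stride and pitch both collapse to the full byte count, so the row loop degenerates to a single copy. A pure-Python model of `_do_copy` showing both cases (a simplified re-implementation for illustration, not the driver code):

def do_copy(src:bytes, dest:bytearray, src_size:int, real_size:int, src_stride:int, dest_stride:int):
  # copy real_size bytes per row, advancing src by src_stride and dest by dest_stride
  src_off = dest_off = 0
  while src_off < src_size:
    dest[dest_off:dest_off+real_size] = src[src_off:src_off+real_size]
    src_off, dest_off = src_off+src_stride, dest_off+dest_stride

plain = bytearray(8)
do_copy(b"abcdefgh", plain, 8, 8, 8, 8)   # linear buffer: one pass copies everything
assert bytes(plain) == b"abcdefgh"

img = bytearray(16)
do_copy(b"ROW1ROW2", img, 8, 4, 4, 8)     # hypothetical 4-byte rows padded to an 8-byte pitch
assert bytes(img) == b"ROW1\x00\x00\x00\x00ROW2\x00\x00\x00\x00"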


@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import List, Optional, Dict, Tuple, cast, Protocol, Type, Union, TypeVar, Generic, Any
+from typing import List, Optional, Dict, Tuple, cast, Type, Union, TypeVar, Generic, Any
 import contextlib, decimal, statistics, random, json, atexit, time, ctypes, array
 from tinygrad.helpers import PROFILEPATH, PROFILE, from_mv, getenv, to_mv, round_up
 from tinygrad.renderer import Renderer
@@ -358,7 +358,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
     super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
     self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
-    self.kernargs_alloctor = BumpAllocator(self.kernargs_page.size, start=self.kernargs_page.va_addr, wrap=True)
+    self.kernargs_alloctor:BumpAllocator = BumpAllocator(self.kernargs_page.size, start=self.kernargs_page.va_addr, wrap=True)
     self.devices.append(self)
   def synchronize(self):
@@ -448,8 +448,9 @@ class HCQCompiled(Compiled, Generic[SignalType]):
     self.timeline_signal.value = 0
     cast(HCQAllocatorBase, self.allocator).b_timeline = [0] * len(cast(HCQAllocatorBase, self.allocator).b)
-# Protocol for hcq compatible allocators for allocated buffers to contain VA address and it's size.
-class HCQBuffer(Protocol): va_addr:int; size:int # noqa: E702
+class HCQBuffer:
+  def __init__(self, va_addr:int, size:int, texture_info:Any=None, meta:Any=None, _base:Optional[HCQBuffer]=None):
+    self.va_addr, self.size, self.texture_info, self.meta, self._base = va_addr, size, texture_info, meta, _base
 class HCQAllocatorBase(LRUAllocator, Generic[DeviceType]):
   """
@@ -467,8 +468,7 @@ class HCQAllocatorBase(LRUAllocator, Generic[DeviceType]):
   def map(self, buf:HCQBuffer): pass
   def _offset(self, buf, size:int, offset:int) -> HCQBuffer:
-    return type(buf)(va_addr=buf.va_addr + offset, size=size, **{k:v for k,v in buf.__dict__.items() if k not in ['va_addr', 'size']},
-                     **{x[0]:getattr(buf, x[0]) for x in getattr(buf, '_fields_', []) if x[0] not in ['va_addr', 'size']}, _base=buf)
+    return HCQBuffer(va_addr=buf.va_addr + offset, size=size, texture_info=buf.texture_info, meta=buf.meta, _base=buf._base or buf)
 class HCQAllocator(HCQAllocatorBase, Generic[DeviceType]):
   def _copyin(self, dest:HCQBuffer, src:memoryview):
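The `buf._base or buf` in the new `_offset` keeps the chain flat: offsetting an already-offset buffer still records the root allocation, which is exactly what `map()` in the AMD/NV allocators chases. A sketch with hypothetical values:

root = HCQBuffer(0x1000, 0x400, meta="allocation record")
v1 = HCQBuffer(root.va_addr+0x100, 0x200, meta=root.meta, _base=root._base or root)
v2 = HCQBuffer(v1.va_addr+0x80, 0x80, meta=v1.meta, _base=v1._base or v1)
assert v1._base is root and v2._base is root  # never a chain of views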