hcq: refactor int ptrs to hcqbuffers (#10105)

* hcq: refactor int ptrs to hcqbuffers

* more refactors

* linter

* use in allocator

* test fiz

* fx

* ops

* final?

* simpler

* keep this for now
This commit is contained in:
nimlgen
2025-04-30 00:12:18 +03:00
committed by GitHub
parent 573bbb9746
commit 5c7d004da5
6 changed files with 113 additions and 101 deletions

View File

@@ -2,7 +2,7 @@ import unittest, ctypes, struct, os
from tinygrad import Device, Tensor, dtypes
from tinygrad.helpers import getenv
from tinygrad.device import Buffer, BufferSpec
from tinygrad.runtime.support.hcq import HCQCompiled
from tinygrad.runtime.support.hcq import HCQCompiled, HCQBuffer
from tinygrad.engine.realize import get_runner, CompiledRunner
from tinygrad.codegen.kernel import Kernel, Opt, OptOps
from tinygrad import Variable
@@ -45,17 +45,17 @@ class TestHCQ(unittest.TestCase):
if queue_type is None: continue
virt_val = Variable("sig_val", 0, 0xffffffff, dtypes.uint32)
virt_signal = TestHCQ.d0.signal_t(base_addr=Variable("sig_addr", 0, 0xffffffffffffffff, dtypes.uint64))
virt_signal = TestHCQ.d0.signal_t(base_buf=HCQBuffer(Variable("sig_addr", 0, 0xffffffffffffffff, dtypes.uint64), 16))
with self.subTest(name=str(queue_type)):
q = queue_type().signal(virt_signal, virt_val)
var_vals = {virt_signal.base_addr: TestHCQ.d0.timeline_signal.base_addr, virt_val: TestHCQ.d0.timeline_value}
var_vals = {virt_signal.base_buf.va_addr: TestHCQ.d0.timeline_signal.base_buf.va_addr, virt_val: TestHCQ.d0.timeline_value}
q.submit(TestHCQ.d0, var_vals)
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1
var_vals = {virt_signal.base_addr: TestHCQ.d0.timeline_signal.base_addr, virt_val: TestHCQ.d0.timeline_value}
var_vals = {virt_signal.base_buf.va_addr: TestHCQ.d0.timeline_signal.base_buf.va_addr, virt_val: TestHCQ.d0.timeline_value}
q.submit(TestHCQ.d0, var_vals)
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1
@@ -97,14 +97,14 @@ class TestHCQ(unittest.TestCase):
with self.subTest(name=str(queue_type)):
virt_val = Variable("sig_val", 0, 0xffffffff, dtypes.uint32)
virt_signal = TestHCQ.d0.signal_t(base_addr=Variable("sig_addr", 0, 0xffffffffffffffff, dtypes.uint64))
virt_signal = TestHCQ.d0.signal_t(base_buf=HCQBuffer(Variable("sig_addr", 0, 0xffffffffffffffff, dtypes.uint64), 16))
fake_signal = TestHCQ.d0.signal_t()
q = queue_type().wait(virt_signal, virt_val).signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
fake_signal.value = 0x30
q.submit(TestHCQ.d0, {virt_signal.base_addr: fake_signal.base_addr, virt_val: fake_signal.value})
q.submit(TestHCQ.d0, {virt_signal.base_buf.va_addr: fake_signal.base_buf.va_addr, virt_val: fake_signal.value})
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1
@@ -265,7 +265,7 @@ class TestHCQ(unittest.TestCase):
if queue_type is None: continue
virt_val = Variable("sig_val", 0, 0xffffffff, dtypes.uint32)
virt_signal = TestHCQ.d0.signal_t(base_addr=Variable("sig_addr", 0, 0xffffffffffffffff, dtypes.uint64))
virt_signal = TestHCQ.d0.signal_t(base_buf=HCQBuffer(Variable("sig_addr", 0, 0xffffffffffffffff, dtypes.uint64), 16))
with self.subTest(name=str(queue_type)):
fake_signal = TestHCQ.d0.signal_t()
@@ -274,7 +274,7 @@ class TestHCQ(unittest.TestCase):
fake_signal.value = 0x30
q.submit(TestHCQ.d0, {virt_signal.base_addr: fake_signal.base_addr, virt_val: fake_signal.value})
q.submit(TestHCQ.d0, {virt_signal.base_buf.va_addr: fake_signal.base_buf.va_addr, virt_val: fake_signal.value})
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1

View File

@@ -31,11 +31,12 @@ class HCQGraph(MultiGraphRunner):
# Fill initial arguments.
self.ji_args: dict[int, HCQArgsState] = {}
kargs_alloc: dict[Compiled, BumpAllocator] = {dev:BumpAllocator(buf.size, base=cast(int, buf.va_addr)) for dev,buf in self.kernargs_bufs.items()}
kargs_alloc: dict[Compiled, BumpAllocator] = {dev:BumpAllocator(buf.size) for dev,buf in self.kernargs_bufs.items()}
for j,ji in enumerate(jit_cache):
if not isinstance(ji.prg, CompiledRunner): continue
self.ji_args[j] = ji.prg._prg.fill_kernargs(self.hcq_bufs[j], ji.prg.p.vars, kargs_alloc[ji.prg.dev].alloc(ji.prg._prg.kernargs_alloc_size, 16))
argsbuf = self.kernargs_bufs[ji.prg.dev].offset(kargs_alloc[ji.prg.dev].alloc(ji.prg._prg.kernargs_alloc_size, 16))
self.ji_args[j] = ji.prg._prg.fill_kernargs(self.hcq_bufs[j], ji.prg.p.vars, argsbuf)
# Schedule Dependencies.
# There are two types of queues on each device: copy and compute. Both must synchronize with all external operations before launching any
@@ -125,7 +126,7 @@ class HCQGraph(MultiGraphRunner):
# Create variable timeline signals for each device.
timeline_sigaddrs = {dev: UOp.variable(f"timeline_sig_{dev.device_id}", 0, 0xffffffffffffffff, dtype=dtypes.uint64) for dev in self.devices}
self.virt_timeline_vals = {dev: UOp.variable(f"timeline_var_{dev.device_id}", 0, 0xffffffff, dtype=dtypes.uint32) for dev in self.devices}
self.virt_timeline_signals = {dev: dev.signal_t(base_addr=timeline_sigaddrs[dev], timeline_for_device=dev) for dev in self.devices}
self.virt_timeline_signals = {dev: dev.signal_t(base_buf=HCQBuffer(timeline_sigaddrs[dev], 16), timeline_for_device=dev) for dev in self.devices}
for dev in self.devices:
self.comp_queues[dev].memory_barrier().wait(self.virt_timeline_signals[dev], self.virt_timeline_vals[dev]) \
@@ -175,7 +176,7 @@ class HCQGraph(MultiGraphRunner):
hcq_var_vals = {self.kickoff_var: self.kickoff_value, **var_vals,
**{var: dev.timeline_value - 1 for dev, var in self.virt_timeline_vals.items()},
**{sig.base_addr: dev.timeline_signal.base_addr for dev, sig in self.virt_timeline_signals.items()}}
**{sig.base_buf.va_addr: dev.timeline_signal.base_buf.va_addr for dev, sig in self.virt_timeline_signals.items()}}
# Update rawbuffers
for (j,i),input_idx in self.input_replace.items(): hcq_var_vals[self.input_replace_to_var.get((j,i))] = input_rawbuffers[input_idx]._buf.va_addr

View File

@@ -24,8 +24,8 @@ WAIT_REG_MEM_FUNCTION_NEQ = 4 # !=
WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
class AMDSignal(HCQSignal):
def __init__(self, base_addr:int|None=None, **kwargs):
super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice)
def __init__(self, base_buf:HCQBuffer|None=None, **kwargs):
super().__init__(base_buf, **kwargs, timestamp_divider=100, dev_t=AMDDevice)
def _sleep(self, time_spent_waiting_ms:int):
# Reasonable to sleep for long workloads (which take more than 2s) and only timeline signals.
@@ -234,14 +234,14 @@ class AMDComputeQueue(HWQueue):
user_regs = [scratch_hilo[0], scratch_hilo[1] | 1 << 31, 0xffffffff, 0x20c14000]
if prg.enable_dispatch_ptr:
dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)
dp = (dp_t:=hsa.hsa_kernel_dispatch_packet_t).from_address(cast(int, (disp_buf:=args_state.buf.offset(prg.kernargs_segment_size)).va_addr))
self.bind_sints(*local_size, struct=dp, start_field='workgroup_size_x', fmt='H')
self.bind_sints(*[g*l for g,l in zip(global_size, local_size)], struct=dp, start_field='grid_size_x', fmt='I')
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
user_regs += [*data64_le(dp_addr)]
self.bind_sints(*local_size, mem=disp_buf.cpu_view(), struct_t=dp_t, start_field='workgroup_size_x', fmt='H')
self.bind_sints(*[g*l for g,l in zip(global_size, local_size)], mem=disp_buf.cpu_view(), struct_t=dp_t, start_field='grid_size_x', fmt='I')
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.buf.va_addr
user_regs += [*data64_le(disp_buf.va_addr)]
user_regs += [*data64_le(args_state.ptr)]
user_regs += [*data64_le(args_state.buf.va_addr)]
if prg.dev.sqtt_enabled: self.sqtt_prg_marker(prg, global_size)
@@ -597,7 +597,7 @@ class KFDIface:
buf = self.drm_fd.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, mem.mmap_offset)
assert addr == buf == mem.va_addr
self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem))
self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem, view=MMIOInterface(mem.va_addr, mem.size, fmt='B') if cpu_access or host else None))
return hcqbuf
def free(self, mem):
@@ -750,11 +750,13 @@ class PCIIface:
self.pagemap.seek(va // mmap.PAGESIZE * 8)
paddrs = [((x & ((1<<55) - 1)) * mmap.PAGESIZE, mmap.PAGESIZE) for x in array.array('Q', self.pagemap.read(size//mmap.PAGESIZE*8, binary=True))]
am_mapping = self.adev.mm.map_range(vaddr, size, paddrs, system=True, snooped=True, uncached=True)
return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=cpu_access))
return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=cpu_access),
view=MMIOInterface(am_mapping.va_addr, size, fmt='B'))
am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)
if cpu_access: self._map_pci_range(bar=0, off=am_mapping.paddrs[0][0], addr=am_mapping.va_addr, size=am_mapping.size)
return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=cpu_access))
return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=cpu_access),
view=MMIOInterface(am_mapping.va_addr, size, fmt='B') if cpu_access else None)
def free(self, mem):
for dev in mem.meta.mapped_devs[1:]: dev.dev_iface.adev.mm.unmap_range(mem.va_addr, mem.size)
@@ -793,8 +795,8 @@ class PCIIface:
class AMDDevice(HCQCompiled):
devices: ClassVar[list[HCQCompiled]] = []
signal_pages: ClassVar[list[Any]] = []
signal_pool: ClassVar[list[int]] = []
signal_pages: ClassVar[list[HCQBuffer]] = []
signal_pool: ClassVar[list[HCQBuffer]] = []
driverless:bool = not FileIOInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))

View File

@@ -72,8 +72,8 @@ qmd_struct_t = make_qmd_struct_type()
assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
class NVSignal(HCQSignal):
def __init__(self, base_addr:int|None=None, **kwargs):
super().__init__(base_addr, **kwargs, timestamp_divider=1000, dev_t=NVDevice)
def __init__(self, base_buf:HCQBuffer|None=None, **kwargs):
super().__init__(base_buf, **kwargs, timestamp_divider=1000, dev_t=NVDevice)
class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
def __init__(self):
@@ -133,26 +133,27 @@ class NVComputeQueue(NVCommandQueue):
def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
self.bind_args_state(args_state)
ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
assert qmd_addr < (1 << 40), f"large qmd addr {qmd_addr:x}"
qmd_buf = args_state.buf.offset(round_up(prg.constbufs[0][1], 1 << 8))
qmd_buf.cpu_view().view(size=0x40 * 4, fmt='B')[:] = bytes(prg.qmd)
assert qmd_buf.va_addr < (1 << 40), f"large qmd addr {qmd_buf.va_addr:x}"
qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
qmd = qmd_struct_t.from_address(qmd_buf.va_addr) # Save qmd for later update
self.bind_sints_to_ptr(*global_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, fmt='I')
self.bind_sints_to_ptr(*local_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, fmt='H')
self.bind_sints_to_ptr(*local_size, *global_size, ptr=args_state.ptr, fmt='I')
qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)
self.bind_sints_to_mem(*global_size, mem=qmd_buf.cpu_view(), fmt='I', offset=nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8)
self.bind_sints_to_mem(*local_size, mem=qmd_buf.cpu_view(), fmt='H', offset=nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8)
self.bind_sints_to_mem(*local_size, *global_size, mem=args_state.buf.cpu_view(), fmt='I')
qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.buf.va_addr)
if self.active_qmd is None:
self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_addr >> 8)
self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_buf.va_addr >> 8)
self.nvm(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 9)
else:
self.active_qmd.dependent_qmd0_pointer = qmd_addr >> 8
self.active_qmd.dependent_qmd0_pointer = qmd_buf.va_addr >> 8
self.active_qmd.dependent_qmd0_action = 1
self.active_qmd.dependent_qmd0_prefetch = 1
self.active_qmd.dependent_qmd0_enable = 1
self.active_qmd = qmd
self.active_qmd, self.active_qmd_buf = qmd, qmd_buf
return self
def signal(self, signal:NVSignal, value:sint=0):
@@ -160,8 +161,9 @@ class NVComputeQueue(NVCommandQueue):
for i in range(2):
if getattr(self.active_qmd, f'release{i}_enable') == 0:
setattr(self.active_qmd, f'release{i}_enable', 1)
self.bind_sints(signal.value_addr, struct=self.active_qmd, start_field=f'release{i}_address', fmt='Q', mask=0xfffffffff)
self.bind_sints(value, struct=self.active_qmd, start_field=f'release{i}_payload', fmt='Q')
self.bind_sints(signal.value_addr, mem=self.active_qmd_buf.cpu_view(), struct_t=qmd_struct_t, start_field=f'release{i}_address',
fmt='Q', mask=0xfffffffff)
self.bind_sints(value, mem=self.active_qmd_buf.cpu_view(), struct_t=qmd_struct_t, start_field=f'release{i}_payload', fmt='Q')
return self
self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value),
@@ -187,9 +189,9 @@ class NVCopyQueue(NVCommandQueue):
def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)
class NVArgsState(CLikeArgsState):
def __init__(self, ptr:int, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
def __init__(self, buf:HCQBuffer, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
if MOCKGPU: prg.constbuffer_0[80:82] = [len(bufs), len(vals)]
super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)
super().__init__(buf, prg, bufs, vals=vals, prefix=prg.constbuffer_0)
class NVProgram(HCQProgram):
def __init__(self, dev:NVDevice, name:str, lib:bytes):
@@ -292,8 +294,8 @@ class GPFifo:
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
class NVDevice(HCQCompiled[NVSignal]):
devices: ClassVar[list[HCQCompiled]] = []
signal_pages: ClassVar[list[Any]] = []
signal_pool: ClassVar[list[int]] = []
signal_pages: ClassVar[list[HCQBuffer]] = []
signal_pool: ClassVar[list[HCQBuffer]] = []
root = None
fd_ctl: FileIOInterface
@@ -376,7 +378,8 @@ class NVDevice(HCQCompiled[NVSignal]):
self._debug_mappings[(va_base, size)] = tag
return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl.fd,
hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs,
mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))
mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping),
view=MMIOInterface(va_base, size, fmt='B') if has_cpu_mapping else None)
def _gpu_map(self, mem:HCQBuffer):
if self.gpu_uuid in mem.meta.mapped_gpu_ids: return

View File

@@ -5,7 +5,7 @@ from types import SimpleNamespace
from typing import Any, cast, ClassVar
from tinygrad.device import BufferSpec
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
from tinygrad.runtime.support.hcq import FileIOInterface
from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
from tinygrad.runtime.autogen import kgsl, adreno
from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
from tinygrad.renderer.cstyle import QCOMRenderer
@@ -37,8 +37,8 @@ class QCOMCompiler(CLCompiler):
def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disasm')(lib)
class QCOMSignal(HCQSignal):
def __init__(self, base_addr:int|None=None, **kwargs):
super().__init__(base_addr, **kwargs, timestamp_divider=19.2, dev_t=QCOMDevice)
def __init__(self, base_buf:HCQBuffer|None=None, **kwargs):
super().__init__(base_buf, **kwargs, timestamp_divider=19.2, dev_t=QCOMDevice)
def _sleep(self, time_spent_waiting_ms:int):
# Sleep only for timeline signals. Do it immediately to free cpu.
@@ -132,7 +132,7 @@ class QCOMComputeQueue(HWQueue):
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
state_block=adreno.SB6_CS_SHADER, num_unit=1024 // 4),
*data64_le(args_state.ptr))
*data64_le(args_state.buf.va_addr))
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_SHADER, state_src=adreno.SS6_INDIRECT,
state_block=adreno.SB6_CS_SHADER, num_unit=round_up(prg.image_size, 128) // 128),
*data64_le(prg.lib_gpu.va_addr))
@@ -145,21 +145,21 @@ class QCOMComputeQueue(HWQueue):
if args_state.prg.samp_cnt > 0:
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_SHADER, state_src=adreno.SS6_INDIRECT,
state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.samp_cnt),
*data64_le(args_state.ptr + args_state.prg.samp_off))
self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.ptr + args_state.prg.samp_off))
*data64_le(args_state.buf.va_addr + args_state.prg.samp_off))
self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.buf.va_addr + args_state.prg.samp_off))
self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.dev.border_color_buf.va_addr))
if args_state.prg.tex_cnt > 0:
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
state_block=adreno.SB6_CS_TEX, num_unit=min(16, args_state.prg.tex_cnt)),
*data64_le(args_state.ptr + args_state.prg.tex_off))
self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.ptr + args_state.prg.tex_off))
*data64_le(args_state.buf.va_addr + args_state.prg.tex_off))
self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.buf.va_addr + args_state.prg.tex_off))
if args_state.prg.ibo_cnt > 0:
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST6_IBO, state_src=adreno.SS6_INDIRECT,
state_block=adreno.SB6_CS_SHADER, num_unit=args_state.prg.ibo_cnt),
*data64_le(args_state.ptr + args_state.prg.ibo_off))
self.reg(adreno.REG_A6XX_SP_CS_IBO, *data64_le(args_state.ptr + args_state.prg.ibo_off))
*data64_le(args_state.buf.va_addr + args_state.prg.ibo_off))
self.reg(adreno.REG_A6XX_SP_CS_IBO, *data64_le(args_state.buf.va_addr + args_state.prg.ibo_off))
self.reg(adreno.REG_A6XX_SP_CS_CONFIG,
qreg.a6xx_sp_cs_config(enabled=True, nsamp=args_state.prg.samp_cnt, ntex=args_state.prg.tex_cnt, nibo=args_state.prg.ibo_cnt))
@@ -168,24 +168,24 @@ class QCOMComputeQueue(HWQueue):
return self
class QCOMArgsState(HCQArgsState):
def __init__(self, ptr:int, prg:QCOMProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
super().__init__(ptr, prg, bufs, vals=vals)
def __init__(self, buf:HCQBuffer, prg:QCOMProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
super().__init__(buf, prg, bufs, vals=vals)
if len(bufs) + len(vals) != len(prg.buf_info): raise RuntimeError(f'incorrect args size given={len(bufs)+len(vals)} != want={len(prg.buf_info)}')
self.buf_info, self.args_info, self.args_view = prg.buf_info[:len(bufs)], prg.buf_info[len(bufs):], to_mv(ptr, prg.kernargs_alloc_size).cast('Q')
self.buf_info, self.args_info = prg.buf_info[:len(bufs)], prg.buf_info[len(bufs):]
ctypes.memset(self.ptr, 0, prg.kernargs_alloc_size)
for cnst_val, cnst_off, cnst_sz in prg.consts_info: to_mv(self.ptr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')
ctypes.memset(cast(int, self.buf.va_addr), 0, prg.kernargs_alloc_size)
for cnst_val,cnst_off,cnst_sz in prg.consts_info: to_mv(self.buf.va_addr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')
if prg.samp_cnt > 0: to_mv(self.ptr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
if prg.samp_cnt > 0: to_mv(self.buf.va_addr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
for i, b in enumerate(bufs):
if prg.buf_info[i].type in {BUFTYPE_TEX, BUFTYPE_IBO}:
obj = b.texture_info.desc if prg.buf_info[i].type is BUFTYPE_TEX else b.texture_info.ibo
to_mv(self.ptr + prg.buf_info[i].offset, len(obj) * 4).cast('I')[:] = array.array('I', obj)
self.bind_sints_to_ptr(b.va_addr, ptr=self.ptr + self.buf_info[i].offset + (0 if self.buf_info[i].type is BUFTYPE_BUF else 16), fmt='Q')
to_mv(self.buf.va_addr + prg.buf_info[i].offset, len(obj) * 4).cast('I')[:] = array.array('I', obj)
self.bind_sints_to_buf(b.va_addr, buf=self.buf, fmt='Q', offset=self.buf_info[i].offset+(0 if self.buf_info[i].type is BUFTYPE_BUF else 16))
for i, v in enumerate(vals): self.bind_sints_to_ptr(v, ptr=self.ptr + self.args_info[i].offset, fmt='I')
for i, v in enumerate(vals): self.bind_sints_to_buf(v, buf=self.buf, fmt='I', offset=self.args_info[i].offset)
class QCOMProgram(HCQProgram):
def __init__(self, dev: QCOMDevice, name: str, lib: bytes):
@@ -318,8 +318,8 @@ class QCOMAllocator(HCQAllocatorBase):
class QCOMDevice(HCQCompiled):
devices: ClassVar[list[HCQCompiled]] = []
signal_pages: ClassVar[list[Any]] = []
signal_pool: ClassVar[list[int]] = []
signal_pages: ClassVar[list[HCQBuffer]] = []
signal_pool: ClassVar[list[HCQBuffer]] = []
gpu_id: int = 0
dummy_addr: int = 0
@@ -360,7 +360,7 @@ class QCOMDevice(HCQCompiled):
va_addr = self.fd.mmap(0, bosz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, alloc.id * 0x1000)
if fill_zeroes: ctypes.memset(va_addr, 0, size)
return HCQBuffer(va_addr=va_addr, size=size, meta=alloc)
return HCQBuffer(va_addr=va_addr, size=size, meta=alloc, view=MMIOInterface(va_addr, size, fmt='B'))
def _gpu_free(self, mem:HCQBuffer):
kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta.id)

View File

@@ -74,7 +74,7 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
self._q:Any = []
self.binded_device:DeviceType|None = None
self.q_sints:list[tuple[int, int]] = []
self.mv_sints:list[tuple[memoryview, int, int, int|None]] = []
self.mv_sints:list[tuple[MMIOInterface, int, int, int|None]] = []
self.syms:list[sint] = []
self._prev_resolved_syms:list[int|None] = []
@@ -173,13 +173,13 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
"""
def bind_args_state(self, args_state:ArgsStateType):
for vals, ptr, fmt in args_state.bind_data: self.bind_sints_to_ptr(*vals, ptr=ptr, fmt=fmt)
for vals, mem, fmt in args_state.bind_data: self.bind_sints_to_mem(*vals, mem=mem, fmt=fmt)
def bind_sints(self, *vals:sint, struct:ctypes.Structure, start_field:str, fmt, mask:int|None=None):
self.bind_sints_to_ptr(*vals, ptr=ctypes.addressof(struct) + getattr(type(struct), start_field).offset, fmt=fmt, mask=mask)
def bind_sints(self, *vals:sint, mem:MMIOInterface, struct_t:Type[ctypes.Structure], start_field:str, fmt, mask:int|None=None):
self.bind_sints_to_mem(*vals, mem=mem, fmt=fmt, mask=mask, offset=getattr(struct_t, start_field).offset)
def bind_sints_to_ptr(self, *vals:sint, ptr:int, fmt, mask:int|None=None):
mv = to_mv(ptr, 8*len(vals)).cast(fmt)
def bind_sints_to_mem(self, *vals:sint, mem:MMIOInterface, fmt, mask:int|None=None, offset:int=0):
mv = mem.view(offset=offset, size=len(vals)*8, fmt=fmt)
for i, val in enumerate(vals):
if isinstance(val, int): mv[i] = val if mask is None else ((mv[i] & ~mask) | val)
else: self.mv_sints.append((mv, i, self._new_sym(val), mask))
@@ -211,19 +211,19 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
def _submit(self, dev:DeviceType): raise NotImplementedError("need _submit")
class HCQSignal(Generic[DeviceType]):
def __init__(self, base_addr:sint|None=None, value:int=0, dev_t:Type[DeviceType]|None=None, timeline_for_device:DeviceType|None=None,
def __init__(self, base_buf:HCQBuffer|None=None, value:int=0, dev_t:Type[DeviceType]|None=None, timeline_for_device:DeviceType|None=None,
timestamp_divider=1, value_off=0, timestamp_off=8):
self.base_addr = dev_t._alloc_signal_addr() if dev_t is not None and base_addr is None else base_addr
self.value_addr, self.timestamp_addr, self.dev_t = self.base_addr+value_off, self.base_addr+timestamp_off, dev_t
self.base_buf = cast(HCQBuffer, dev_t._alloc_signal() if dev_t is not None and base_buf is None else base_buf)
self.value_addr, self.timestamp_addr, self.dev_t = self.base_buf.va_addr+value_off, self.base_buf.va_addr+timestamp_off, dev_t
self.timestamp_divider:decimal.Decimal = decimal.Decimal(timestamp_divider)
self.timeline_for_device:DeviceType|None = timeline_for_device
if isinstance(self.base_addr, int):
self.value_mv, self.timestamp_mv = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q')
if isinstance(self.base_buf.va_addr, int):
self.value_mv, self.timestamp_mv = self.base_buf.cpu_view().view(value_off, 8, 'Q'), self.base_buf.cpu_view().view(timestamp_off, 8, 'Q')
self.value_mv[0] = value
def __del__(self):
if isinstance(self.base_addr, int) and self.dev_t is not None: self.dev_t.signal_pool.append(self.base_addr)
if isinstance(self.base_buf.va_addr, int) and self.dev_t is not None: self.dev_t.signal_pool.append(self.base_buf)
@property
def value(self) -> int: return self.value_mv[0]
@@ -281,27 +281,27 @@ def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]
if enabled and PROFILE: dev.sig_prof_records.append((cast(HCQSignal, st), cast(HCQSignal, en), desc, queue_type is dev.hw_copy_queue_t))
class HCQArgsState(Generic[ProgramType]):
def __init__(self, ptr:int, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=()):
self.ptr, self.prg = ptr, prg
self.bind_data:list[tuple[tuple[sint, ...], int, str]] = []
def __init__(self, buf:HCQBuffer, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=()):
self.buf, self.prg = buf, prg
self.bind_data:list[tuple[tuple[sint, ...], MMIOInterface, str]] = []
def bind_sints_to_ptr(self, *vals:sint, ptr:int, fmt): self.bind_data.append((vals, ptr, fmt))
def bind_sints_to_buf(self, *vals:sint, buf:HCQBuffer, fmt, offset=0): self.bind_data.append((vals, buf.cpu_view().view(offset=offset), fmt))
class CLikeArgsState(HCQArgsState[ProgramType]):
def __init__(self, ptr:int, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=(), prefix:list[int]|None=None):
super().__init__(ptr, prg, bufs, vals=vals)
def __init__(self, buf:HCQBuffer, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=(), prefix:list[int]|None=None):
super().__init__(buf, prg, bufs, vals=vals)
if prefix is not None: to_mv(self.ptr, len(prefix) * 4).cast('I')[:] = array.array('I', prefix)
if prefix is not None: self.buf.cpu_view().view(size=len(prefix) * 4, fmt='I')[:] = array.array('I', prefix)
self.bind_sints_to_ptr(*[b.va_addr for b in bufs], ptr=self.ptr + len(prefix or []) * 4, fmt='Q')
self.bind_sints_to_ptr(*vals, ptr=self.ptr + len(prefix or []) * 4 + len(bufs) * 8, fmt='I')
self.bind_sints_to_buf(*[b.va_addr for b in bufs], buf=self.buf, fmt='Q', offset=len(prefix or []) * 4)
self.bind_sints_to_buf(*vals, buf=self.buf, fmt='I', offset=len(prefix or []) * 4 + len(bufs) * 8)
class HCQProgram(Generic[DeviceType]):
def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int, lib:bytes|None=None, base:int|None=None):
self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size
if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base)]
def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs_ptr:int|None=None) -> HCQArgsState:
def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs:HCQBuffer|None=None) -> HCQArgsState:
"""
Fills arguments for the kernel, optionally allocating space from the device if `kernargs_ptr` is not provided.
Args:
@@ -311,7 +311,8 @@ class HCQProgram(Generic[DeviceType]):
Returns:
Arguments state with the given buffers and values set for the program.
"""
return self.args_state_t(kernargs_ptr or self.dev.kernargs_allocator.alloc(self.kernargs_alloc_size), self, bufs, vals=vals)
argsbuf = kernargs or self.dev.kernargs_buf.offset(offset=self.dev.kernargs_offset_allocator.alloc(self.kernargs_alloc_size))
return self.args_state_t(argsbuf, self, bufs, vals=vals)
def __call__(self, *bufs:HCQBuffer, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1),
vals:tuple[int, ...]=(), wait:bool=False) -> float|None:
@@ -345,8 +346,8 @@ class HCQCompiled(Compiled, Generic[SignalType]):
A base class for devices compatible with the HCQ (Hardware Command Queue) API.
"""
devices: ClassVar[list[HCQCompiled]] = []
signal_pages: ClassVar[list[Any]] = []
signal_pool: ClassVar[list[int]] = []
signal_pages: ClassVar[list[HCQBuffer]] = []
signal_pool: ClassVar[list[HCQBuffer]] = []
def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType],
comp_queue_t:Callable[[], HWQueue], copy_queue_t:Callable[[], HWQueue]|None):
@@ -365,8 +366,8 @@ class HCQCompiled(Compiled, Generic[SignalType]):
self._shadow_timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
self.kernargs_allocator:BumpAllocator = BumpAllocator(self.kernargs_page.size, base=cast(int, self.kernargs_page.va_addr), wrap=True)
self.kernargs_buf:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
self.kernargs_offset_allocator:BumpAllocator = BumpAllocator(self.kernargs_buf.size, wrap=True)
def synchronize(self):
try: self.timeline_signal.wait(self.timeline_value - 1)
@@ -384,10 +385,10 @@ class HCQCompiled(Compiled, Generic[SignalType]):
return self.timeline_value - 1
@classmethod
def _alloc_signal_addr(cls) -> int:
def _alloc_signal(cls) -> HCQBuffer:
if not cls.signal_pool:
cls.signal_pages.append(alc:=cls.devices[0].allocator.alloc(0x1000, BufferSpec(host=True, uncached=True, cpu_access=True)))
cls.signal_pool += [alc.va_addr + off for off in range(0, alc.size, 16)]
cls.signal_pool += [alc.offset(offset=off, size=16) for off in range(0, alc.size, 16)]
for dev in cls.devices: cast(HCQAllocator, dev.allocator).map(alc)
return cls.signal_pool.pop()
@@ -416,8 +417,16 @@ class HCQCompiled(Compiled, Generic[SignalType]):
return buf, realloced
class HCQBuffer:
def __init__(self, va_addr:sint, size:int, texture_info:Any=None, meta:Any=None, _base:HCQBuffer|None=None):
self.va_addr, self.size, self.texture_info, self.meta, self._base = va_addr, size, texture_info, meta, _base
def __init__(self, va_addr:sint, size:int, texture_info:Any=None, meta:Any=None, _base:HCQBuffer|None=None, view:MMIOInterface|None=None):
self.va_addr, self.size, self.texture_info, self.meta, self._base, self.view = va_addr, size, texture_info, meta, _base, view
def offset(self, offset:int=0, size:int|None=None) -> HCQBuffer:
return HCQBuffer(self.va_addr+offset, size or (self.size - offset), texture_info=self.texture_info, meta=self.meta, _base=self._base or self,
view=(self.view.view(offset=offset, size=size) if self.view is not None else None))
def cpu_view(self) -> MMIOInterface:
assert self.view is not None, "buffer has no cpu_view"
return self.view
class HCQAllocatorBase(LRUAllocator, Generic[DeviceType]):
"""
@@ -433,9 +442,7 @@ class HCQAllocatorBase(LRUAllocator, Generic[DeviceType]):
super().__init__()
def map(self, buf:HCQBuffer): pass
def _offset(self, buf, size:int, offset:int) -> HCQBuffer:
return HCQBuffer(va_addr=buf.va_addr + offset, size=size, texture_info=buf.texture_info, meta=buf.meta, _base=buf._base or buf)
def _offset(self, buf, size:int, offset:int) -> HCQBuffer: return buf.offset(offset=offset, size=size)
class HCQAllocator(HCQAllocatorBase, Generic[DeviceType]):
def _copyin(self, dest:HCQBuffer, src:memoryview):
@@ -476,7 +483,6 @@ class HCQAllocator(HCQAllocatorBase, Generic[DeviceType]):
.copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
self.dev.timeline_signal.wait(self.dev.timeline_value - 1)
ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
def _transfer(self, dest:HCQBuffer, src:HCQBuffer, sz:int, src_dev:DeviceType, dest_dev:DeviceType):