mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-10 07:28:15 -05:00
hcq: refactor int ptrs to hcqbuffers (#10105)
* hcq: refactor int ptrs to hcqbuffers * more refactors * linter * use in allocator * test fiz * fx * ops * final? * simpler * keep this for now
This commit is contained in:
@@ -2,7 +2,7 @@ import unittest, ctypes, struct, os
|
||||
from tinygrad import Device, Tensor, dtypes
|
||||
from tinygrad.helpers import getenv
|
||||
from tinygrad.device import Buffer, BufferSpec
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQBuffer
|
||||
from tinygrad.engine.realize import get_runner, CompiledRunner
|
||||
from tinygrad.codegen.kernel import Kernel, Opt, OptOps
|
||||
from tinygrad import Variable
|
||||
@@ -45,17 +45,17 @@ class TestHCQ(unittest.TestCase):
|
||||
if queue_type is None: continue
|
||||
|
||||
virt_val = Variable("sig_val", 0, 0xffffffff, dtypes.uint32)
|
||||
virt_signal = TestHCQ.d0.signal_t(base_addr=Variable("sig_addr", 0, 0xffffffffffffffff, dtypes.uint64))
|
||||
virt_signal = TestHCQ.d0.signal_t(base_buf=HCQBuffer(Variable("sig_addr", 0, 0xffffffffffffffff, dtypes.uint64), 16))
|
||||
|
||||
with self.subTest(name=str(queue_type)):
|
||||
q = queue_type().signal(virt_signal, virt_val)
|
||||
|
||||
var_vals = {virt_signal.base_addr: TestHCQ.d0.timeline_signal.base_addr, virt_val: TestHCQ.d0.timeline_value}
|
||||
var_vals = {virt_signal.base_buf.va_addr: TestHCQ.d0.timeline_signal.base_buf.va_addr, virt_val: TestHCQ.d0.timeline_value}
|
||||
q.submit(TestHCQ.d0, var_vals)
|
||||
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
||||
var_vals = {virt_signal.base_addr: TestHCQ.d0.timeline_signal.base_addr, virt_val: TestHCQ.d0.timeline_value}
|
||||
var_vals = {virt_signal.base_buf.va_addr: TestHCQ.d0.timeline_signal.base_buf.va_addr, virt_val: TestHCQ.d0.timeline_value}
|
||||
q.submit(TestHCQ.d0, var_vals)
|
||||
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
@@ -97,14 +97,14 @@ class TestHCQ(unittest.TestCase):
|
||||
|
||||
with self.subTest(name=str(queue_type)):
|
||||
virt_val = Variable("sig_val", 0, 0xffffffff, dtypes.uint32)
|
||||
virt_signal = TestHCQ.d0.signal_t(base_addr=Variable("sig_addr", 0, 0xffffffffffffffff, dtypes.uint64))
|
||||
virt_signal = TestHCQ.d0.signal_t(base_buf=HCQBuffer(Variable("sig_addr", 0, 0xffffffffffffffff, dtypes.uint64), 16))
|
||||
|
||||
fake_signal = TestHCQ.d0.signal_t()
|
||||
q = queue_type().wait(virt_signal, virt_val).signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
|
||||
fake_signal.value = 0x30
|
||||
|
||||
q.submit(TestHCQ.d0, {virt_signal.base_addr: fake_signal.base_addr, virt_val: fake_signal.value})
|
||||
q.submit(TestHCQ.d0, {virt_signal.base_buf.va_addr: fake_signal.base_buf.va_addr, virt_val: fake_signal.value})
|
||||
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
||||
@@ -265,7 +265,7 @@ class TestHCQ(unittest.TestCase):
|
||||
if queue_type is None: continue
|
||||
|
||||
virt_val = Variable("sig_val", 0, 0xffffffff, dtypes.uint32)
|
||||
virt_signal = TestHCQ.d0.signal_t(base_addr=Variable("sig_addr", 0, 0xffffffffffffffff, dtypes.uint64))
|
||||
virt_signal = TestHCQ.d0.signal_t(base_buf=HCQBuffer(Variable("sig_addr", 0, 0xffffffffffffffff, dtypes.uint64), 16))
|
||||
|
||||
with self.subTest(name=str(queue_type)):
|
||||
fake_signal = TestHCQ.d0.signal_t()
|
||||
@@ -274,7 +274,7 @@ class TestHCQ(unittest.TestCase):
|
||||
|
||||
fake_signal.value = 0x30
|
||||
|
||||
q.submit(TestHCQ.d0, {virt_signal.base_addr: fake_signal.base_addr, virt_val: fake_signal.value})
|
||||
q.submit(TestHCQ.d0, {virt_signal.base_buf.va_addr: fake_signal.base_buf.va_addr, virt_val: fake_signal.value})
|
||||
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
||||
|
||||
@@ -31,11 +31,12 @@ class HCQGraph(MultiGraphRunner):
|
||||
# Fill initial arguments.
|
||||
self.ji_args: dict[int, HCQArgsState] = {}
|
||||
|
||||
kargs_alloc: dict[Compiled, BumpAllocator] = {dev:BumpAllocator(buf.size, base=cast(int, buf.va_addr)) for dev,buf in self.kernargs_bufs.items()}
|
||||
kargs_alloc: dict[Compiled, BumpAllocator] = {dev:BumpAllocator(buf.size) for dev,buf in self.kernargs_bufs.items()}
|
||||
for j,ji in enumerate(jit_cache):
|
||||
if not isinstance(ji.prg, CompiledRunner): continue
|
||||
|
||||
self.ji_args[j] = ji.prg._prg.fill_kernargs(self.hcq_bufs[j], ji.prg.p.vars, kargs_alloc[ji.prg.dev].alloc(ji.prg._prg.kernargs_alloc_size, 16))
|
||||
argsbuf = self.kernargs_bufs[ji.prg.dev].offset(kargs_alloc[ji.prg.dev].alloc(ji.prg._prg.kernargs_alloc_size, 16))
|
||||
self.ji_args[j] = ji.prg._prg.fill_kernargs(self.hcq_bufs[j], ji.prg.p.vars, argsbuf)
|
||||
|
||||
# Schedule Dependencies.
|
||||
# There are two types of queues on each device: copy and compute. Both must synchronize with all external operations before launching any
|
||||
@@ -125,7 +126,7 @@ class HCQGraph(MultiGraphRunner):
|
||||
# Create variable timeline signals for each device.
|
||||
timeline_sigaddrs = {dev: UOp.variable(f"timeline_sig_{dev.device_id}", 0, 0xffffffffffffffff, dtype=dtypes.uint64) for dev in self.devices}
|
||||
self.virt_timeline_vals = {dev: UOp.variable(f"timeline_var_{dev.device_id}", 0, 0xffffffff, dtype=dtypes.uint32) for dev in self.devices}
|
||||
self.virt_timeline_signals = {dev: dev.signal_t(base_addr=timeline_sigaddrs[dev], timeline_for_device=dev) for dev in self.devices}
|
||||
self.virt_timeline_signals = {dev: dev.signal_t(base_buf=HCQBuffer(timeline_sigaddrs[dev], 16), timeline_for_device=dev) for dev in self.devices}
|
||||
|
||||
for dev in self.devices:
|
||||
self.comp_queues[dev].memory_barrier().wait(self.virt_timeline_signals[dev], self.virt_timeline_vals[dev]) \
|
||||
@@ -175,7 +176,7 @@ class HCQGraph(MultiGraphRunner):
|
||||
|
||||
hcq_var_vals = {self.kickoff_var: self.kickoff_value, **var_vals,
|
||||
**{var: dev.timeline_value - 1 for dev, var in self.virt_timeline_vals.items()},
|
||||
**{sig.base_addr: dev.timeline_signal.base_addr for dev, sig in self.virt_timeline_signals.items()}}
|
||||
**{sig.base_buf.va_addr: dev.timeline_signal.base_buf.va_addr for dev, sig in self.virt_timeline_signals.items()}}
|
||||
|
||||
# Update rawbuffers
|
||||
for (j,i),input_idx in self.input_replace.items(): hcq_var_vals[self.input_replace_to_var.get((j,i))] = input_rawbuffers[input_idx]._buf.va_addr
|
||||
|
||||
@@ -24,8 +24,8 @@ WAIT_REG_MEM_FUNCTION_NEQ = 4 # !=
|
||||
WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
|
||||
|
||||
class AMDSignal(HCQSignal):
|
||||
def __init__(self, base_addr:int|None=None, **kwargs):
|
||||
super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice)
|
||||
def __init__(self, base_buf:HCQBuffer|None=None, **kwargs):
|
||||
super().__init__(base_buf, **kwargs, timestamp_divider=100, dev_t=AMDDevice)
|
||||
|
||||
def _sleep(self, time_spent_waiting_ms:int):
|
||||
# Resonable to sleep for long workloads (which take more than 2s) and only timeline signals.
|
||||
@@ -234,14 +234,14 @@ class AMDComputeQueue(HWQueue):
|
||||
user_regs = [scratch_hilo[0], scratch_hilo[1] | 1 << 31, 0xffffffff, 0x20c14000]
|
||||
|
||||
if prg.enable_dispatch_ptr:
|
||||
dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)
|
||||
dp = (dp_t:=hsa.hsa_kernel_dispatch_packet_t).from_address(cast(int, (disp_buf:=args_state.buf.offset(prg.kernargs_segment_size)).va_addr))
|
||||
|
||||
self.bind_sints(*local_size, struct=dp, start_field='workgroup_size_x', fmt='H')
|
||||
self.bind_sints(*[g*l for g,l in zip(global_size, local_size)], struct=dp, start_field='grid_size_x', fmt='I')
|
||||
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
|
||||
user_regs += [*data64_le(dp_addr)]
|
||||
self.bind_sints(*local_size, mem=disp_buf.cpu_view(), struct_t=dp_t, start_field='workgroup_size_x', fmt='H')
|
||||
self.bind_sints(*[g*l for g,l in zip(global_size, local_size)], mem=disp_buf.cpu_view(), struct_t=dp_t, start_field='grid_size_x', fmt='I')
|
||||
dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.buf.va_addr
|
||||
user_regs += [*data64_le(disp_buf.va_addr)]
|
||||
|
||||
user_regs += [*data64_le(args_state.ptr)]
|
||||
user_regs += [*data64_le(args_state.buf.va_addr)]
|
||||
|
||||
if prg.dev.sqtt_enabled: self.sqtt_prg_marker(prg, global_size)
|
||||
|
||||
@@ -597,7 +597,7 @@ class KFDIface:
|
||||
buf = self.drm_fd.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, mem.mmap_offset)
|
||||
assert addr == buf == mem.va_addr
|
||||
|
||||
self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem))
|
||||
self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem, view=MMIOInterface(mem.va_addr, mem.size, fmt='B') if cpu_access or host else None))
|
||||
return hcqbuf
|
||||
|
||||
def free(self, mem):
|
||||
@@ -750,11 +750,13 @@ class PCIIface:
|
||||
self.pagemap.seek(va // mmap.PAGESIZE * 8)
|
||||
paddrs = [((x & ((1<<55) - 1)) * mmap.PAGESIZE, mmap.PAGESIZE) for x in array.array('Q', self.pagemap.read(size//mmap.PAGESIZE*8, binary=True))]
|
||||
am_mapping = self.adev.mm.map_range(vaddr, size, paddrs, system=True, snooped=True, uncached=True)
|
||||
return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=cpu_access))
|
||||
return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=cpu_access),
|
||||
view=MMIOInterface(am_mapping.va_addr, size, fmt='B'))
|
||||
|
||||
am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)
|
||||
if cpu_access: self._map_pci_range(bar=0, off=am_mapping.paddrs[0][0], addr=am_mapping.va_addr, size=am_mapping.size)
|
||||
return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=cpu_access))
|
||||
return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=cpu_access),
|
||||
view=MMIOInterface(am_mapping.va_addr, size, fmt='B') if cpu_access else None)
|
||||
|
||||
def free(self, mem):
|
||||
for dev in mem.meta.mapped_devs[1:]: dev.dev_iface.adev.mm.unmap_range(mem.va_addr, mem.size)
|
||||
@@ -793,8 +795,8 @@ class PCIIface:
|
||||
|
||||
class AMDDevice(HCQCompiled):
|
||||
devices: ClassVar[list[HCQCompiled]] = []
|
||||
signal_pages: ClassVar[list[Any]] = []
|
||||
signal_pool: ClassVar[list[int]] = []
|
||||
signal_pages: ClassVar[list[HCQBuffer]] = []
|
||||
signal_pool: ClassVar[list[HCQBuffer]] = []
|
||||
|
||||
driverless:bool = not FileIOInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
|
||||
|
||||
|
||||
@@ -72,8 +72,8 @@ qmd_struct_t = make_qmd_struct_type()
|
||||
assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
|
||||
|
||||
class NVSignal(HCQSignal):
|
||||
def __init__(self, base_addr:int|None=None, **kwargs):
|
||||
super().__init__(base_addr, **kwargs, timestamp_divider=1000, dev_t=NVDevice)
|
||||
def __init__(self, base_buf:HCQBuffer|None=None, **kwargs):
|
||||
super().__init__(base_buf, **kwargs, timestamp_divider=1000, dev_t=NVDevice)
|
||||
|
||||
class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
|
||||
def __init__(self):
|
||||
@@ -133,26 +133,27 @@ class NVComputeQueue(NVCommandQueue):
|
||||
def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
|
||||
self.bind_args_state(args_state)
|
||||
|
||||
ctypes.memmove(qmd_addr:=(args_state.ptr + round_up(prg.constbufs[0][1], 1 << 8)), ctypes.addressof(prg.qmd), 0x40 * 4)
|
||||
assert qmd_addr < (1 << 40), f"large qmd addr {qmd_addr:x}"
|
||||
qmd_buf = args_state.buf.offset(round_up(prg.constbufs[0][1], 1 << 8))
|
||||
qmd_buf.cpu_view().view(size=0x40 * 4, fmt='B')[:] = bytes(prg.qmd)
|
||||
assert qmd_buf.va_addr < (1 << 40), f"large qmd addr {qmd_buf.va_addr:x}"
|
||||
|
||||
qmd = qmd_struct_t.from_address(qmd_addr) # Save qmd for later update
|
||||
qmd = qmd_struct_t.from_address(qmd_buf.va_addr) # Save qmd for later update
|
||||
|
||||
self.bind_sints_to_ptr(*global_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, fmt='I')
|
||||
self.bind_sints_to_ptr(*local_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, fmt='H')
|
||||
self.bind_sints_to_ptr(*local_size, *global_size, ptr=args_state.ptr, fmt='I')
|
||||
qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)
|
||||
self.bind_sints_to_mem(*global_size, mem=qmd_buf.cpu_view(), fmt='I', offset=nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8)
|
||||
self.bind_sints_to_mem(*local_size, mem=qmd_buf.cpu_view(), fmt='H', offset=nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8)
|
||||
self.bind_sints_to_mem(*local_size, *global_size, mem=args_state.buf.cpu_view(), fmt='I')
|
||||
qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.buf.va_addr)
|
||||
|
||||
if self.active_qmd is None:
|
||||
self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_addr >> 8)
|
||||
self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_buf.va_addr >> 8)
|
||||
self.nvm(1, nv_gpu.NVC6C0_SEND_SIGNALING_PCAS2_B, 9)
|
||||
else:
|
||||
self.active_qmd.dependent_qmd0_pointer = qmd_addr >> 8
|
||||
self.active_qmd.dependent_qmd0_pointer = qmd_buf.va_addr >> 8
|
||||
self.active_qmd.dependent_qmd0_action = 1
|
||||
self.active_qmd.dependent_qmd0_prefetch = 1
|
||||
self.active_qmd.dependent_qmd0_enable = 1
|
||||
|
||||
self.active_qmd = qmd
|
||||
self.active_qmd, self.active_qmd_buf = qmd, qmd_buf
|
||||
return self
|
||||
|
||||
def signal(self, signal:NVSignal, value:sint=0):
|
||||
@@ -160,8 +161,9 @@ class NVComputeQueue(NVCommandQueue):
|
||||
for i in range(2):
|
||||
if getattr(self.active_qmd, f'release{i}_enable') == 0:
|
||||
setattr(self.active_qmd, f'release{i}_enable', 1)
|
||||
self.bind_sints(signal.value_addr, struct=self.active_qmd, start_field=f'release{i}_address', fmt='Q', mask=0xfffffffff)
|
||||
self.bind_sints(value, struct=self.active_qmd, start_field=f'release{i}_payload', fmt='Q')
|
||||
self.bind_sints(signal.value_addr, mem=self.active_qmd_buf.cpu_view(), struct_t=qmd_struct_t, start_field=f'release{i}_address',
|
||||
fmt='Q', mask=0xfffffffff)
|
||||
self.bind_sints(value, mem=self.active_qmd_buf.cpu_view(), struct_t=qmd_struct_t, start_field=f'release{i}_payload', fmt='Q')
|
||||
return self
|
||||
|
||||
self.nvm(0, nv_gpu.NVC56F_SEM_ADDR_LO, *data64_le(signal.value_addr), *data64_le(value),
|
||||
@@ -187,9 +189,9 @@ class NVCopyQueue(NVCommandQueue):
|
||||
def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)
|
||||
|
||||
class NVArgsState(CLikeArgsState):
|
||||
def __init__(self, ptr:int, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
|
||||
def __init__(self, buf:HCQBuffer, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
|
||||
if MOCKGPU: prg.constbuffer_0[80:82] = [len(bufs), len(vals)]
|
||||
super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)
|
||||
super().__init__(buf, prg, bufs, vals=vals, prefix=prg.constbuffer_0)
|
||||
|
||||
class NVProgram(HCQProgram):
|
||||
def __init__(self, dev:NVDevice, name:str, lib:bytes):
|
||||
@@ -292,8 +294,8 @@ class GPFifo:
|
||||
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
|
||||
class NVDevice(HCQCompiled[NVSignal]):
|
||||
devices: ClassVar[list[HCQCompiled]] = []
|
||||
signal_pages: ClassVar[list[Any]] = []
|
||||
signal_pool: ClassVar[list[int]] = []
|
||||
signal_pages: ClassVar[list[HCQBuffer]] = []
|
||||
signal_pool: ClassVar[list[HCQBuffer]] = []
|
||||
|
||||
root = None
|
||||
fd_ctl: FileIOInterface
|
||||
@@ -376,7 +378,8 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
self._debug_mappings[(va_base, size)] = tag
|
||||
return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl.fd,
|
||||
hClient=self.root, hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attrs,
|
||||
mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping))
|
||||
mapped_gpu_ids=[self.gpu_uuid], has_cpu_mapping=has_cpu_mapping),
|
||||
view=MMIOInterface(va_base, size, fmt='B') if has_cpu_mapping else None)
|
||||
|
||||
def _gpu_map(self, mem:HCQBuffer):
|
||||
if self.gpu_uuid in mem.meta.mapped_gpu_ids: return
|
||||
|
||||
@@ -5,7 +5,7 @@ from types import SimpleNamespace
|
||||
from typing import Any, cast, ClassVar
|
||||
from tinygrad.device import BufferSpec
|
||||
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
|
||||
from tinygrad.runtime.support.hcq import FileIOInterface
|
||||
from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
|
||||
from tinygrad.runtime.autogen import kgsl, adreno
|
||||
from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
|
||||
from tinygrad.renderer.cstyle import QCOMRenderer
|
||||
@@ -37,8 +37,8 @@ class QCOMCompiler(CLCompiler):
|
||||
def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disasm')(lib)
|
||||
|
||||
class QCOMSignal(HCQSignal):
|
||||
def __init__(self, base_addr:int|None=None, **kwargs):
|
||||
super().__init__(base_addr, **kwargs, timestamp_divider=19.2, dev_t=QCOMDevice)
|
||||
def __init__(self, base_buf:HCQBuffer|None=None, **kwargs):
|
||||
super().__init__(base_buf, **kwargs, timestamp_divider=19.2, dev_t=QCOMDevice)
|
||||
|
||||
def _sleep(self, time_spent_waiting_ms:int):
|
||||
# Sleep only for only timeline signals. Do it immediately to free cpu.
|
||||
@@ -132,7 +132,7 @@ class QCOMComputeQueue(HWQueue):
|
||||
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_SHADER, num_unit=1024 // 4),
|
||||
*data64_le(args_state.ptr))
|
||||
*data64_le(args_state.buf.va_addr))
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_SHADER, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_SHADER, num_unit=round_up(prg.image_size, 128) // 128),
|
||||
*data64_le(prg.lib_gpu.va_addr))
|
||||
@@ -145,21 +145,21 @@ class QCOMComputeQueue(HWQueue):
|
||||
if args_state.prg.samp_cnt > 0:
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_SHADER, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.samp_cnt),
|
||||
*data64_le(args_state.ptr + args_state.prg.samp_off))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.ptr + args_state.prg.samp_off))
|
||||
*data64_le(args_state.buf.va_addr + args_state.prg.samp_off))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.buf.va_addr + args_state.prg.samp_off))
|
||||
self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.dev.border_color_buf.va_addr))
|
||||
|
||||
if args_state.prg.tex_cnt > 0:
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_TEX, num_unit=min(16, args_state.prg.tex_cnt)),
|
||||
*data64_le(args_state.ptr + args_state.prg.tex_off))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.ptr + args_state.prg.tex_off))
|
||||
*data64_le(args_state.buf.va_addr + args_state.prg.tex_off))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.buf.va_addr + args_state.prg.tex_off))
|
||||
|
||||
if args_state.prg.ibo_cnt > 0:
|
||||
self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST6_IBO, state_src=adreno.SS6_INDIRECT,
|
||||
state_block=adreno.SB6_CS_SHADER, num_unit=args_state.prg.ibo_cnt),
|
||||
*data64_le(args_state.ptr + args_state.prg.ibo_off))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_IBO, *data64_le(args_state.ptr + args_state.prg.ibo_off))
|
||||
*data64_le(args_state.buf.va_addr + args_state.prg.ibo_off))
|
||||
self.reg(adreno.REG_A6XX_SP_CS_IBO, *data64_le(args_state.buf.va_addr + args_state.prg.ibo_off))
|
||||
|
||||
self.reg(adreno.REG_A6XX_SP_CS_CONFIG,
|
||||
qreg.a6xx_sp_cs_config(enabled=True, nsamp=args_state.prg.samp_cnt, ntex=args_state.prg.tex_cnt, nibo=args_state.prg.ibo_cnt))
|
||||
@@ -168,24 +168,24 @@ class QCOMComputeQueue(HWQueue):
|
||||
return self
|
||||
|
||||
class QCOMArgsState(HCQArgsState):
|
||||
def __init__(self, ptr:int, prg:QCOMProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
|
||||
super().__init__(ptr, prg, bufs, vals=vals)
|
||||
def __init__(self, buf:HCQBuffer, prg:QCOMProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
|
||||
super().__init__(buf, prg, bufs, vals=vals)
|
||||
|
||||
if len(bufs) + len(vals) != len(prg.buf_info): raise RuntimeError(f'incorrect args size given={len(bufs)+len(vals)} != want={len(prg.buf_info)}')
|
||||
|
||||
self.buf_info, self.args_info, self.args_view = prg.buf_info[:len(bufs)], prg.buf_info[len(bufs):], to_mv(ptr, prg.kernargs_alloc_size).cast('Q')
|
||||
self.buf_info, self.args_info = prg.buf_info[:len(bufs)], prg.buf_info[len(bufs):]
|
||||
|
||||
ctypes.memset(self.ptr, 0, prg.kernargs_alloc_size)
|
||||
for cnst_val, cnst_off, cnst_sz in prg.consts_info: to_mv(self.ptr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')
|
||||
ctypes.memset(cast(int, self.buf.va_addr), 0, prg.kernargs_alloc_size)
|
||||
for cnst_val,cnst_off,cnst_sz in prg.consts_info: to_mv(self.buf.va_addr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')
|
||||
|
||||
if prg.samp_cnt > 0: to_mv(self.ptr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
|
||||
if prg.samp_cnt > 0: to_mv(self.buf.va_addr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
|
||||
for i, b in enumerate(bufs):
|
||||
if prg.buf_info[i].type in {BUFTYPE_TEX, BUFTYPE_IBO}:
|
||||
obj = b.texture_info.desc if prg.buf_info[i].type is BUFTYPE_TEX else b.texture_info.ibo
|
||||
to_mv(self.ptr + prg.buf_info[i].offset, len(obj) * 4).cast('I')[:] = array.array('I', obj)
|
||||
self.bind_sints_to_ptr(b.va_addr, ptr=self.ptr + self.buf_info[i].offset + (0 if self.buf_info[i].type is BUFTYPE_BUF else 16), fmt='Q')
|
||||
to_mv(self.buf.va_addr + prg.buf_info[i].offset, len(obj) * 4).cast('I')[:] = array.array('I', obj)
|
||||
self.bind_sints_to_buf(b.va_addr, buf=self.buf, fmt='Q', offset=self.buf_info[i].offset+(0 if self.buf_info[i].type is BUFTYPE_BUF else 16))
|
||||
|
||||
for i, v in enumerate(vals): self.bind_sints_to_ptr(v, ptr=self.ptr + self.args_info[i].offset, fmt='I')
|
||||
for i, v in enumerate(vals): self.bind_sints_to_buf(v, buf=self.buf, fmt='I', offset=self.args_info[i].offset)
|
||||
|
||||
class QCOMProgram(HCQProgram):
|
||||
def __init__(self, dev: QCOMDevice, name: str, lib: bytes):
|
||||
@@ -318,8 +318,8 @@ class QCOMAllocator(HCQAllocatorBase):
|
||||
|
||||
class QCOMDevice(HCQCompiled):
|
||||
devices: ClassVar[list[HCQCompiled]] = []
|
||||
signal_pages: ClassVar[list[Any]] = []
|
||||
signal_pool: ClassVar[list[int]] = []
|
||||
signal_pages: ClassVar[list[HCQBuffer]] = []
|
||||
signal_pool: ClassVar[list[HCQBuffer]] = []
|
||||
|
||||
gpu_id: int = 0
|
||||
dummy_addr: int = 0
|
||||
@@ -360,7 +360,7 @@ class QCOMDevice(HCQCompiled):
|
||||
va_addr = self.fd.mmap(0, bosz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, alloc.id * 0x1000)
|
||||
|
||||
if fill_zeroes: ctypes.memset(va_addr, 0, size)
|
||||
return HCQBuffer(va_addr=va_addr, size=size, meta=alloc)
|
||||
return HCQBuffer(va_addr=va_addr, size=size, meta=alloc, view=MMIOInterface(va_addr, size, fmt='B'))
|
||||
|
||||
def _gpu_free(self, mem:HCQBuffer):
|
||||
kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta.id)
|
||||
|
||||
@@ -74,7 +74,7 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
|
||||
self._q:Any = []
|
||||
self.binded_device:DeviceType|None = None
|
||||
self.q_sints:list[tuple[int, int]] = []
|
||||
self.mv_sints:list[tuple[memoryview, int, int, int|None]] = []
|
||||
self.mv_sints:list[tuple[MMIOInterface, int, int, int|None]] = []
|
||||
self.syms:list[sint] = []
|
||||
self._prev_resolved_syms:list[int|None] = []
|
||||
|
||||
@@ -173,13 +173,13 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
|
||||
"""
|
||||
|
||||
def bind_args_state(self, args_state:ArgsStateType):
|
||||
for vals, ptr, fmt in args_state.bind_data: self.bind_sints_to_ptr(*vals, ptr=ptr, fmt=fmt)
|
||||
for vals, mem, fmt in args_state.bind_data: self.bind_sints_to_mem(*vals, mem=mem, fmt=fmt)
|
||||
|
||||
def bind_sints(self, *vals:sint, struct:ctypes.Structure, start_field:str, fmt, mask:int|None=None):
|
||||
self.bind_sints_to_ptr(*vals, ptr=ctypes.addressof(struct) + getattr(type(struct), start_field).offset, fmt=fmt, mask=mask)
|
||||
def bind_sints(self, *vals:sint, mem:MMIOInterface, struct_t:Type[ctypes.Structure], start_field:str, fmt, mask:int|None=None):
|
||||
self.bind_sints_to_mem(*vals, mem=mem, fmt=fmt, mask=mask, offset=getattr(struct_t, start_field).offset)
|
||||
|
||||
def bind_sints_to_ptr(self, *vals:sint, ptr:int, fmt, mask:int|None=None):
|
||||
mv = to_mv(ptr, 8*len(vals)).cast(fmt)
|
||||
def bind_sints_to_mem(self, *vals:sint, mem:MMIOInterface, fmt, mask:int|None=None, offset:int=0):
|
||||
mv = mem.view(offset=offset, size=len(vals)*8, fmt=fmt)
|
||||
for i, val in enumerate(vals):
|
||||
if isinstance(val, int): mv[i] = val if mask is None else ((mv[i] & ~mask) | val)
|
||||
else: self.mv_sints.append((mv, i, self._new_sym(val), mask))
|
||||
@@ -211,19 +211,19 @@ class HWQueue(Generic[SignalType, DeviceType, ProgramType, ArgsStateType]):
|
||||
def _submit(self, dev:DeviceType): raise NotImplementedError("need _submit")
|
||||
|
||||
class HCQSignal(Generic[DeviceType]):
|
||||
def __init__(self, base_addr:sint|None=None, value:int=0, dev_t:Type[DeviceType]|None=None, timeline_for_device:DeviceType|None=None,
|
||||
def __init__(self, base_buf:HCQBuffer|None=None, value:int=0, dev_t:Type[DeviceType]|None=None, timeline_for_device:DeviceType|None=None,
|
||||
timestamp_divider=1, value_off=0, timestamp_off=8):
|
||||
self.base_addr = dev_t._alloc_signal_addr() if dev_t is not None and base_addr is None else base_addr
|
||||
self.value_addr, self.timestamp_addr, self.dev_t = self.base_addr+value_off, self.base_addr+timestamp_off, dev_t
|
||||
self.base_buf = cast(HCQBuffer, dev_t._alloc_signal() if dev_t is not None and base_buf is None else base_buf)
|
||||
self.value_addr, self.timestamp_addr, self.dev_t = self.base_buf.va_addr+value_off, self.base_buf.va_addr+timestamp_off, dev_t
|
||||
self.timestamp_divider:decimal.Decimal = decimal.Decimal(timestamp_divider)
|
||||
self.timeline_for_device:DeviceType|None = timeline_for_device
|
||||
|
||||
if isinstance(self.base_addr, int):
|
||||
self.value_mv, self.timestamp_mv = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q')
|
||||
if isinstance(self.base_buf.va_addr, int):
|
||||
self.value_mv, self.timestamp_mv = self.base_buf.cpu_view().view(value_off, 8, 'Q'), self.base_buf.cpu_view().view(timestamp_off, 8, 'Q')
|
||||
self.value_mv[0] = value
|
||||
|
||||
def __del__(self):
|
||||
if isinstance(self.base_addr, int) and self.dev_t is not None: self.dev_t.signal_pool.append(self.base_addr)
|
||||
if isinstance(self.base_buf.va_addr, int) and self.dev_t is not None: self.dev_t.signal_pool.append(self.base_buf)
|
||||
|
||||
@property
|
||||
def value(self) -> int: return self.value_mv[0]
|
||||
@@ -281,27 +281,27 @@ def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]
|
||||
if enabled and PROFILE: dev.sig_prof_records.append((cast(HCQSignal, st), cast(HCQSignal, en), desc, queue_type is dev.hw_copy_queue_t))
|
||||
|
||||
class HCQArgsState(Generic[ProgramType]):
|
||||
def __init__(self, ptr:int, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=()):
|
||||
self.ptr, self.prg = ptr, prg
|
||||
self.bind_data:list[tuple[tuple[sint, ...], int, str]] = []
|
||||
def __init__(self, buf:HCQBuffer, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=()):
|
||||
self.buf, self.prg = buf, prg
|
||||
self.bind_data:list[tuple[tuple[sint, ...], MMIOInterface, str]] = []
|
||||
|
||||
def bind_sints_to_ptr(self, *vals:sint, ptr:int, fmt): self.bind_data.append((vals, ptr, fmt))
|
||||
def bind_sints_to_buf(self, *vals:sint, buf:HCQBuffer, fmt, offset=0): self.bind_data.append((vals, buf.cpu_view().view(offset=offset), fmt))
|
||||
|
||||
class CLikeArgsState(HCQArgsState[ProgramType]):
|
||||
def __init__(self, ptr:int, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=(), prefix:list[int]|None=None):
|
||||
super().__init__(ptr, prg, bufs, vals=vals)
|
||||
def __init__(self, buf:HCQBuffer, prg:ProgramType, bufs:tuple[HCQBuffer, ...], vals:tuple[sint, ...]=(), prefix:list[int]|None=None):
|
||||
super().__init__(buf, prg, bufs, vals=vals)
|
||||
|
||||
if prefix is not None: to_mv(self.ptr, len(prefix) * 4).cast('I')[:] = array.array('I', prefix)
|
||||
if prefix is not None: self.buf.cpu_view().view(size=len(prefix) * 4, fmt='I')[:] = array.array('I', prefix)
|
||||
|
||||
self.bind_sints_to_ptr(*[b.va_addr for b in bufs], ptr=self.ptr + len(prefix or []) * 4, fmt='Q')
|
||||
self.bind_sints_to_ptr(*vals, ptr=self.ptr + len(prefix or []) * 4 + len(bufs) * 8, fmt='I')
|
||||
self.bind_sints_to_buf(*[b.va_addr for b in bufs], buf=self.buf, fmt='Q', offset=len(prefix or []) * 4)
|
||||
self.bind_sints_to_buf(*vals, buf=self.buf, fmt='I', offset=len(prefix or []) * 4 + len(bufs) * 8)
|
||||
|
||||
class HCQProgram(Generic[DeviceType]):
|
||||
def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int, lib:bytes|None=None, base:int|None=None):
|
||||
self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size
|
||||
if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base)]
|
||||
|
||||
def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs_ptr:int|None=None) -> HCQArgsState:
|
||||
def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs:HCQBuffer|None=None) -> HCQArgsState:
|
||||
"""
|
||||
Fills arguments for the kernel, optionally allocating space from the device if `kernargs_ptr` is not provided.
|
||||
Args:
|
||||
@@ -311,7 +311,8 @@ class HCQProgram(Generic[DeviceType]):
|
||||
Returns:
|
||||
Arguments state with the given buffers and values set for the program.
|
||||
"""
|
||||
return self.args_state_t(kernargs_ptr or self.dev.kernargs_allocator.alloc(self.kernargs_alloc_size), self, bufs, vals=vals)
|
||||
argsbuf = kernargs or self.dev.kernargs_buf.offset(offset=self.dev.kernargs_offset_allocator.alloc(self.kernargs_alloc_size))
|
||||
return self.args_state_t(argsbuf, self, bufs, vals=vals)
|
||||
|
||||
def __call__(self, *bufs:HCQBuffer, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1),
|
||||
vals:tuple[int, ...]=(), wait:bool=False) -> float|None:
|
||||
@@ -345,8 +346,8 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
A base class for devices compatible with the HCQ (Hardware Command Queue) API.
|
||||
"""
|
||||
devices: ClassVar[list[HCQCompiled]] = []
|
||||
signal_pages: ClassVar[list[Any]] = []
|
||||
signal_pool: ClassVar[list[int]] = []
|
||||
signal_pages: ClassVar[list[HCQBuffer]] = []
|
||||
signal_pool: ClassVar[list[HCQBuffer]] = []
|
||||
|
||||
def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType],
|
||||
comp_queue_t:Callable[[], HWQueue], copy_queue_t:Callable[[], HWQueue]|None):
|
||||
@@ -365,8 +366,8 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
self._shadow_timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
|
||||
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
|
||||
|
||||
self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
|
||||
self.kernargs_allocator:BumpAllocator = BumpAllocator(self.kernargs_page.size, base=cast(int, self.kernargs_page.va_addr), wrap=True)
|
||||
self.kernargs_buf:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
|
||||
self.kernargs_offset_allocator:BumpAllocator = BumpAllocator(self.kernargs_buf.size, wrap=True)
|
||||
|
||||
def synchronize(self):
|
||||
try: self.timeline_signal.wait(self.timeline_value - 1)
|
||||
@@ -384,10 +385,10 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
return self.timeline_value - 1
|
||||
|
||||
@classmethod
|
||||
def _alloc_signal_addr(cls) -> int:
|
||||
def _alloc_signal(cls) -> HCQBuffer:
|
||||
if not cls.signal_pool:
|
||||
cls.signal_pages.append(alc:=cls.devices[0].allocator.alloc(0x1000, BufferSpec(host=True, uncached=True, cpu_access=True)))
|
||||
cls.signal_pool += [alc.va_addr + off for off in range(0, alc.size, 16)]
|
||||
cls.signal_pool += [alc.offset(offset=off, size=16) for off in range(0, alc.size, 16)]
|
||||
for dev in cls.devices: cast(HCQAllocator, dev.allocator).map(alc)
|
||||
return cls.signal_pool.pop()
|
||||
|
||||
@@ -416,8 +417,16 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
||||
return buf, realloced
|
||||
|
||||
class HCQBuffer:
|
||||
def __init__(self, va_addr:sint, size:int, texture_info:Any=None, meta:Any=None, _base:HCQBuffer|None=None):
|
||||
self.va_addr, self.size, self.texture_info, self.meta, self._base = va_addr, size, texture_info, meta, _base
|
||||
def __init__(self, va_addr:sint, size:int, texture_info:Any=None, meta:Any=None, _base:HCQBuffer|None=None, view:MMIOInterface|None=None):
|
||||
self.va_addr, self.size, self.texture_info, self.meta, self._base, self.view = va_addr, size, texture_info, meta, _base, view
|
||||
|
||||
def offset(self, offset:int=0, size:int|None=None) -> HCQBuffer:
|
||||
return HCQBuffer(self.va_addr+offset, size or (self.size - offset), texture_info=self.texture_info, meta=self.meta, _base=self._base or self,
|
||||
view=(self.view.view(offset=offset, size=size) if self.view is not None else None))
|
||||
|
||||
def cpu_view(self) -> MMIOInterface:
|
||||
assert self.view is not None, "buffer has no cpu_view"
|
||||
return self.view
|
||||
|
||||
class HCQAllocatorBase(LRUAllocator, Generic[DeviceType]):
|
||||
"""
|
||||
@@ -433,9 +442,7 @@ class HCQAllocatorBase(LRUAllocator, Generic[DeviceType]):
|
||||
super().__init__()
|
||||
|
||||
def map(self, buf:HCQBuffer): pass
|
||||
|
||||
def _offset(self, buf, size:int, offset:int) -> HCQBuffer:
|
||||
return HCQBuffer(va_addr=buf.va_addr + offset, size=size, texture_info=buf.texture_info, meta=buf.meta, _base=buf._base or buf)
|
||||
def _offset(self, buf, size:int, offset:int) -> HCQBuffer: return buf.offset(offset=offset, size=size)
|
||||
|
||||
class HCQAllocator(HCQAllocatorBase, Generic[DeviceType]):
|
||||
def _copyin(self, dest:HCQBuffer, src:memoryview):
|
||||
@@ -476,7 +483,6 @@ class HCQAllocator(HCQAllocatorBase, Generic[DeviceType]):
|
||||
.copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
|
||||
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
|
||||
self.dev.timeline_signal.wait(self.dev.timeline_value - 1)
|
||||
|
||||
ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
|
||||
|
||||
def _transfer(self, dest:HCQBuffer, src:HCQBuffer, sz:int, src_dev:DeviceType, dest_dev:DeviceType):
|
||||
|
||||
Reference in New Issue
Block a user