mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-05 05:04:27 -05:00
416 lines
25 KiB
Python
416 lines
25 KiB
Python
from __future__ import annotations
|
|
import os, ctypes, functools, mmap, struct, array, math, sys, weakref, contextlib
|
|
assert sys.platform != 'win32'
|
|
from typing import Any, cast
|
|
from tinygrad.device import BufferSpec, CompilerSet, CompilerPair
|
|
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
|
|
from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
|
|
from tinygrad.runtime.autogen import kgsl, mesa
|
|
from tinygrad.runtime.ops_cl import CLCompiler, CLDevice
|
|
from tinygrad.renderer.cstyle import QCOMRenderer
|
|
from tinygrad.renderer.nir import IR3Renderer
|
|
from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport, cpu_profile, lo32, PROFILE, suppress_finalizing
|
|
from tinygrad.helpers import next_power2, flatten, QCOM_IR3, QCOM_CC
|
|
from tinygrad.dtype import ImageDType
|
|
from tinygrad.runtime.support.system import System
|
|
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
|
|
|
|
# Type tags of kernel-argument descriptors. QCOMProgram._parse_lib compares the `typ` field it reads
# from the compiled library against BUFTYPE_TEX / BUFTYPE_IBO to separate images from plain arguments.
BUFTYPE_BUF = 0
BUFTYPE_TEX = 1
BUFTYPE_IBO = 2
|
|
|
|
# Parse C-style defines: <regname>_<field_x>__SHIFT and <regname>_<field_y>__MASK from the mesa module into the following format:
|
|
# qreg.<regname>(<field_x>=..., <field_y>=..., ..., <field_n>=...)
|
|
def _qreg_exec(__reg, __val=0, **kwargs):
|
|
for k, v in kwargs.items():
|
|
reg_name = f"{__reg[4:]}_{k.removeprefix('_').upper()}"
|
|
__val |= (getattr(mesa, reg_name) if v else 0) if type(v) is bool else (v << getattr(mesa, f'{reg_name}__SHIFT'))
|
|
return __val
|
|
# qreg exposes one packer per mesa `REG_*` define, keyed by the lower-cased name without the "REG_" prefix,
# e.g. qreg.<regname>(<field_x>=..., <field_y>=...). Each attribute is _qreg_exec pre-bound to the full define name.
qreg: Any = type("QREG", (object,), {
  define[4:].lower(): functools.partial(_qreg_exec, define)
  for define in mesa.__dict__ if define.startswith('REG_')
})
|
|
|
|
def ctz(v):
  """Return the index of the lowest set bit of `v` (count of trailing zeros); yields -1 for 0."""
  lowest_set_bit = v & -v  # two's-complement trick: isolates the least significant set bit
  return lowest_set_bit.bit_length() - 1
|
|
|
|
def parity(val: int):
  """Return 1 when the low 32 bits of `val` contain an even number of set bits, otherwise 0."""
  # Fold 32 bits down onto the low nibble; XOR keeps the parity of the number of set bits.
  for shift in (16, 8, 4): val ^= val >> shift
  # Bit n of 0x6996 is set when the nibble n has an odd number of set bits; the inversion flips that.
  return (~0x6996 >> (val & 0xf)) & 1
|
|
|
|
def pkt7_hdr(opcode: int, cnt: int):
  """Build the header word of a type-7 packet.

  Layout, as assembled here: `cnt` in bits 0-13 with its parity bit at 15, `opcode` in bits 16-22 with its
  parity bit at 23, all OR-ed with mesa.CP_TYPE7_PKT. Callers in this file pass the number of payload words as `cnt`.
  """
  count_bits = (cnt & 0x3FFF) | (parity(cnt) << 15)
  opcode_bits = ((opcode & 0x7F) << 16) | (parity(opcode) << 23)
  return mesa.CP_TYPE7_PKT | count_bits | opcode_bits
|
|
|
|
def pkt4_hdr(reg: int, cnt: int):
  """Build the header word of a type-4 (register write) packet.

  Layout, as assembled here: `cnt` in bits 0-6 with its parity bit at 7, `reg` in bits 8-25 with its
  parity bit at 27, all OR-ed with mesa.CP_TYPE4_PKT. Callers in this file pass the number of payload words as `cnt`.
  """
  count_bits = (cnt & 0x7F) | (parity(cnt) << 7)
  reg_bits = ((reg & 0x3FFFF) << 8) | (parity(reg) << 27)
  return mesa.CP_TYPE4_PKT | count_bits | reg_bits
|
|
|
|
def _read_lib(lib, off) -> int: return struct.unpack("I", lib[off:off+4])[0]
|
|
class QCOMCompiler(CLCompiler):
  """CLCompiler specialisation that is constructed on a CLDevice with the 'compile_qcom' tag."""
  def __init__(self, device:str=""):
    super().__init__(CLDevice(device), 'compile_qcom')

  def disassemble(self, lib:bytes):
    """Hand the shader image embedded in `lib` to the adreno disassembler."""
    disas_adreno = fromimport('tinygrad.runtime.support.compiler_mesa', 'disas_adreno')
    # The library header stores the image offset at 0xc0 and the image size at 0x100.
    image_start = _read_lib(lib, 0xc0)
    image_len = _read_lib(lib, 0x100)
    disas_adreno(lib[image_start:image_start+image_len])
|
|
|
|
class QCOMSignal(HCQSignal):
  """HCQSignal whose timestamp divider is always 19.2 and which can wait in the kernel driver."""
  def __init__(self, *args, **kwargs):
    # The divider is forced: a caller-supplied 'timestamp_divider' is overwritten.
    kwargs['timestamp_divider'] = 19.2
    super().__init__(*args, **kwargs)

  def _sleep(self, time_spent_waiting_ms:int) -> bool:
    """Block on the owner's last submitted command when this is a timeline signal; always returns False."""
    # Sleep only for timeline signals. Do it immediately to free cpu.
    if not self.is_timeline or self.owner is None: return False
    kgsl.IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(self.owner.fd, context_id=self.owner.ctx, timestamp=self.owner.last_cmd, timeout=0xffffffff)
    return False
|
|
|
|
class QCOMComputeQueue(HWQueue):
  """Command queue that records packets for a QCOMDevice and submits them through kgsl."""
  def __init__(self, dev:QCOMDevice):
    self.dev = dev
    super().__init__()

  @suppress_finalizing
  def __del__(self):
    # Only a bound queue owns a dedicated command page (allocated in bind); release it with the same spec.
    if self.binded_device is None: return
    self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True))

  def cmd(self, opcode: int, *vals: int):
    """Queue a type-7 packet: header for `opcode` followed by the payload words `vals`."""
    self.q(pkt7_hdr(opcode, len(vals)), *vals)

  def reg(self, reg: int, *vals: int):
    """Queue a type-4 packet: header for register `reg` followed by the words `vals`."""
    self.q(pkt4_hdr(reg, len(vals)), *vals)

  def _cache_flush(self, write_back=True, invalidate=False, sync=True, memsync=False):
    """Queue the selected cache-maintenance packets, always in the order write-back, invalidate, memsync, sync."""
    # TODO: 7xx support.
    if write_back:
      # dirty cache write-back.
      self.cmd(mesa.CP_EVENT_WRITE, mesa.CACHE_FLUSH_TS, *data64_le(self.dev.dummy_addr), 0)
    if invalidate:
      # invalidate cache lines (following reads from RAM).
      self.cmd(mesa.CP_EVENT_WRITE, mesa.CACHE_INVALIDATE)
    if memsync:
      self.cmd(mesa.CP_WAIT_MEM_WRITES)
    if sync:
      self.cmd(mesa.CP_WAIT_FOR_IDLE)

  def memory_barrier(self):
    """Queue a full flush (write-back, invalidate, memsync and idle wait); returns self for chaining."""
    self._cache_flush(write_back=True, invalidate=True, sync=True, memsync=True)
    return self

  def signal(self, signal:QCOMSignal, value=0):
    """Queue a write of `value` (low 32 bits) to `signal` after the GPU is idle; returns self.

    Raises RuntimeError when the device's gpu_id is (7, 3) or newer.
    """
    self.cmd(mesa.CP_WAIT_FOR_IDLE)
    if not self.dev.gpu_id[:2] < (7, 3):
      # TODO: support devices starting with 8 Gen 1. Also, 700th series have convenient CP_GLOBAL_TIMESTAMP and CP_LOCAL_TIMESTAMP
      raise RuntimeError('CP_EVENT_WRITE7 is not supported')
    self.cmd(mesa.CP_EVENT_WRITE, qreg.cp_event_write_0(event=mesa.CACHE_FLUSH_TS), *data64_le(signal.value_addr), lo32(value))
    self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False)
    return self

  def timestamp(self, signal:QCOMSignal):
    """Queue a copy of the always-on counter register into `signal.timestamp_addr`; returns self."""
    self.cmd(mesa.CP_WAIT_FOR_IDLE)
    counter_src = qreg.cp_reg_to_mem_0(reg=mesa.REG_A6XX_CP_ALWAYS_ON_COUNTER, cnt=2, _64b=True)
    self.cmd(mesa.CP_REG_TO_MEM, counter_src, *data64_le(signal.timestamp_addr))
    return self

  def wait(self, signal:QCOMSignal, value=0):
    """Queue a memory poll on `signal.value_addr` using the WRITE_GE function against `value`; returns self."""
    self.cmd(mesa.CP_WAIT_REG_MEM,
             qreg.cp_wait_reg_mem_0(function=mesa.WRITE_GE, poll=mesa.POLL_MEMORY),
             *data64_le(signal.value_addr),
             qreg.cp_wait_reg_mem_3(ref=value&0xFFFFFFFF),
             qreg.cp_wait_reg_mem_4(mask=0xFFFFFFFF),
             qreg.cp_wait_reg_mem_5(delay_loop_cycles=32))
    return self

  def _build_gpu_command(self, dev:QCOMDevice, hw_addr=None):
    """Copy the queued words to GPU-visible memory and describe them for the kgsl submit ioctl.

    The words go to `hw_addr` when given, otherwise to a fresh slice of the device's command buffer.
    Returns `(submit_req, obj)`; `submit_req` stores only the address of `obj`, so the caller must keep
    `obj` alive for as long as `submit_req` is used.
    """
    words = array.array('I', self._q)
    nbytes = len(self._q) * 4
    hw_page_addr = hw_addr or dev.cmd_buf_allocator.alloc(nbytes)
    to_mv(hw_page_addr, nbytes).cast('I')[:] = words

    obj = kgsl.struct_kgsl_command_object(gpuaddr=hw_page_addr, size=nbytes, flags=kgsl.KGSL_CMDLIST_IB)
    submit_req = kgsl.struct_kgsl_gpu_command(cmdlist=ctypes.addressof(obj), numcmds=1, context_id=dev.ctx,
                                              cmdsize=ctypes.sizeof(kgsl.struct_kgsl_command_object))
    return submit_req, obj

  def bind(self, dev:QCOMDevice):
    """Pin the queue to `dev`: allocate a dedicated page for it and prebuild the submit request."""
    self.binded_device = dev
    self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True))
    self.submit_req, self.obj = self._build_gpu_command(self.binded_device, self.hw_page.va_addr)
    # From now on, the queue is on the device for faster submission.
    self._q = to_mv(self.obj.gpuaddr, len(self._q) * 4).cast("I")

  def _submit(self, dev:QCOMDevice):
    """Submit the queue to `dev` and record the returned timestamp in `dev.last_cmd`."""
    if self.binded_device == dev:
      submit_req = self.submit_req
    else:
      # cmd_obj is referenced by address from submit_req, so it has to stay alive until the ioctl returns.
      submit_req, cmd_obj = self._build_gpu_command(dev)
    dev.last_cmd = kgsl.IOCTL_KGSL_GPU_COMMAND(dev.fd, __payload=submit_req).timestamp

  def exec(self, prg:QCOMProgram, args_state:QCOMArgsState, global_size, local_size):
    """Queue everything needed to launch `prg` with `args_state` over the given grid; returns self.

    The register and packet order below is kept exactly as the hardware programming sequence requires it.
    """
    self.bind_args_state(args_state)

    def cast_int(x, ceil=False):
      # Only floats are converted; every other value passes through untouched.
      if not isinstance(x, float): return x
      return math.ceil(x) if ceil else int(x)

    global_size_mp = [cast_int(g*l) for g,l in zip(global_size, local_size)]

    self.cmd(mesa.CP_SET_MARKER, qreg.a6xx_cp_set_marker_0(mode=mesa.RM6_COMPUTE))
    self.reg(mesa.REG_A6XX_SP_UPDATE_CNTL, qreg.a6xx_sp_update_cntl(cs_state=True, cs_uav=True))
    self.reg(mesa.REG_A6XX_SP_UPDATE_CNTL, 0x0)
    self.reg(mesa.REG_A6XX_SP_CS_TSIZE, qreg.a6xx_sp_cs_tsize(0x80)) # is this right? mesa uses 1
    self.reg(mesa.REG_A6XX_SP_CS_USIZE, qreg.a6xx_sp_cs_usize(0x40)) # mesa also uses 1
    # The same ISAM mode is programmed into both SP and TPL1: GL for NIR-built programs, CL otherwise.
    isammode = mesa.ISAMMODE_GL if prg.NIR else mesa.ISAMMODE_CL
    self.reg(mesa.REG_A6XX_SP_MODE_CNTL, qreg.a6xx_sp_mode_cntl(isammode=isammode))
    self.reg(mesa.REG_A6XX_SP_PERFCTR_SHADER_MASK, qreg.a6xx_sp_perfctr_shader_mask(cs=True))
    self.reg(mesa.REG_A6XX_TPL1_MODE_CNTL, qreg.a6xx_tpl1_mode_cntl(isammode=isammode))
    self.reg(mesa.REG_A6XX_TPL1_DBG_ECO_CNTL, 0)
    self.cmd(mesa.CP_WAIT_FOR_IDLE)

    # NDRANGE block: local sizes are written minus one, global sizes are global*local per dimension.
    self.reg(mesa.REG_A6XX_SP_CS_NDRANGE_0,
             qreg.a6xx_sp_cs_ndrange_0(kerneldim=3, localsizex=local_size[0] - 1, localsizey=local_size[1] - 1, localsizez=local_size[2] - 1),
             global_size_mp[0], 0, global_size_mp[1], 0, global_size_mp[2], 0, 0xccc0cf, 0xfc | qreg.a6xx_sp_cs_wge_cntl(threadsize=mesa.THREAD64),
             cast_int(global_size[0], ceil=True), cast_int(global_size[1], ceil=True), cast_int(global_size[2], ceil=True))

    # Shader control block: register footprints, program location, private memory parameters.
    self.reg(mesa.REG_A6XX_SP_CS_CNTL_0,
             qreg.a6xx_sp_cs_cntl_0(threadsize=mesa.THREAD64, halfregfootprint=prg.hregs, fullregfootprint=prg.fregs, branchstack=prg.brnchstck),
             qreg.a6xx_sp_cs_cntl_1(constantrammode=mesa.CONSTLEN_256, shared_size=prg.shared_size), # should this be CONSTLEN_512?
             0, prg.prg_offset, *data64_le(prg.lib_gpu.va_addr),
             qreg.a6xx_sp_cs_pvt_mem_param(memsizeperitem=prg.pvtmem_size_per_item), *data64_le(prg.dev._stack.va_addr),
             qreg.a6xx_sp_cs_pvt_mem_size(totalpvtmemsize=prg.pvtmem_size_total))

    # For NIR programs that have a workgroup-size constant slot, store the local size into the kernargs buffer.
    if prg.NIR and prg.wgsz != 0xfc:
      to_mv(args_state.buf.va_addr + prg.wgsz * 4, 12)[:] = struct.pack("III", *local_size)

    # Constants are loaded indirectly from the kernargs buffer, the shader image from the program's GPU buffer.
    self.cmd(mesa.CP_LOAD_STATE6_FRAG,
             qreg.cp_load_state6_0(state_type=mesa.ST_CONSTANTS, state_src=mesa.SS6_INDIRECT, state_block=mesa.SB6_CS_SHADER, num_unit=1024 // 4),
             *data64_le(args_state.buf.va_addr))
    self.cmd(mesa.CP_LOAD_STATE6_FRAG,
             qreg.cp_load_state6_0(state_type=mesa.ST_SHADER, state_src=mesa.SS6_INDIRECT, state_block=mesa.SB6_CS_SHADER,
                                   num_unit=round_up(prg.image_size, 128) // 128),
             *data64_le(prg.lib_gpu.va_addr))

    self.reg(mesa.REG_A6XX_SP_REG_PROG_ID_0, 0xfcfcfcfc, 0xfcfcfcfc, 0xfcfcfcfc, 0xfc, qreg.a6xx_sp_cs_const_config(constlen=1024 // 4, enabled=True))

    self.reg(mesa.REG_A6XX_SP_CS_PVT_MEM_STACK_OFFSET, qreg.a6xx_sp_cs_pvt_mem_stack_offset(prg.hw_stack_offset))
    self.reg(mesa.REG_A6XX_SP_CS_INSTR_SIZE, qreg.a6xx_sp_cs_instr_size(prg.image_size // 4))

    # Samplers, textures and IBOs live in the kernargs buffer at the offsets recorded on the program.
    if prg.samp_cnt > 0:
      self.cmd(mesa.CP_LOAD_STATE6_FRAG,
               qreg.cp_load_state6_0(state_type=mesa.ST_SHADER, state_src=mesa.SS6_INDIRECT, state_block=mesa.SB6_CS_TEX,
                                     num_unit=args_state.prg.samp_cnt),
               *data64_le(args_state.buf.va_addr + args_state.prg.samp_off))
      self.reg(mesa.REG_A6XX_SP_CS_SAMPLER_BASE, *data64_le(args_state.buf.va_addr + args_state.prg.samp_off))
      self.reg(mesa.REG_A6XX_TPL1_CS_BORDER_COLOR_BASE, *data64_le(prg.dev.border_color_buf.va_addr))

    if prg.tex_cnt > 0:
      self.cmd(mesa.CP_LOAD_STATE6_FRAG,
               qreg.cp_load_state6_0(state_type=mesa.ST_CONSTANTS, state_src=mesa.SS6_INDIRECT, state_block=mesa.SB6_CS_TEX,
                                     num_unit=min(16, args_state.prg.tex_cnt)),
               *data64_le(args_state.buf.va_addr + args_state.prg.tex_off))
      self.reg(mesa.REG_A6XX_SP_CS_TEXMEMOBJ_BASE, *data64_le(args_state.buf.va_addr + args_state.prg.tex_off))

    if prg.ibo_cnt > 0:
      self.cmd(mesa.CP_LOAD_STATE6_FRAG,
               qreg.cp_load_state6_0(state_type=mesa.ST6_UAV, state_src=mesa.SS6_INDIRECT, state_block=mesa.SB6_CS_SHADER,
                                     num_unit=args_state.prg.ibo_cnt),
               *data64_le(args_state.buf.va_addr + args_state.prg.ibo_off))
      self.reg(mesa.REG_A6XX_SP_CS_UAV_BASE, *data64_le(args_state.buf.va_addr + args_state.prg.ibo_off))

    self.reg(mesa.REG_A6XX_SP_CS_CONFIG,
             qreg.a6xx_sp_cs_config(enabled=True, nsamp=args_state.prg.samp_cnt, ntex=args_state.prg.tex_cnt, nuav=args_state.prg.ibo_cnt))

    # Launch: NIR programs go through CP_EXEC_CS with explicit group counts, the others through CP_RUN_OPENCL.
    if prg.NIR:
      self.reg(mesa.REG_A6XX_SP_CS_CONST_CONFIG_0,
               qreg.a6xx_sp_cs_const_config_0(wgidconstid=prg.wgid, wgsizeconstid=prg.wgsz, wgoffsetconstid=0xfc, localidregid=prg.lid),
               qreg.a6xx_sp_cs_wge_cntl(linearlocalidregid=0xfc, threadsize=mesa.THREAD64))
      self.cmd(mesa.CP_EXEC_CS, 0,
               qreg.cp_exec_cs_1(ngroups_x=global_size[0]), qreg.cp_exec_cs_2(ngroups_y=global_size[1]), qreg.cp_exec_cs_3(_ngroups_z=global_size[2]))
    else:
      self.cmd(mesa.CP_RUN_OPENCL, 0)

    self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False)
    return self
|
|
|
|
class QCOMArgsState(HCQArgsState):
  """Kernel-argument state for a QCOMProgram: fills the kernargs buffer with constants, samplers,
  buffer addresses, scalar values and texture/IBO descriptors at the offsets recorded on the program."""
  def __init__(self, buf:HCQBuffer, prg:QCOMProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
    super().__init__(buf, prg, bufs, vals=vals)
    # Start from a zeroed argument area.
    ctypes.memset(cast(int, self.buf.va_addr), 0, prg.kernargs_alloc_size)

    # Split the arguments by declared dtype: image arguments keep their index, the rest are plain buffers.
    ubos, uavs = [], []
    for idx, hcq_buf in enumerate(bufs):
      if isinstance(prg.buf_dtypes[idx], ImageDType): uavs.append((idx, hcq_buf))
      else: ubos.append(hcq_buf)
    # The first ibo_cnt image arguments are IBOs, the remaining ones are textures.
    ibos, texs = uavs[:prg.ibo_cnt], uavs[prg.ibo_cnt:]

    for cnst_val, cnst_off, cnst_sz in prg.consts_info:
      to_mv(self.buf.va_addr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')

    if prg.samp_cnt > 0:
      to_mv(self.buf.va_addr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)

    if prg.NIR:
      # NIR layout: all buffer addresses packed as 64-bit values from buf_off, 32-bit values right after them.
      self.bind_sints_to_buf(*[b.va_addr for b in ubos], buf=self.buf, fmt='Q', offset=prg.buf_off)
      self.bind_sints_to_buf(*vals, buf=self.buf, fmt='I', offset=prg.buf_off + len(ubos) * 8)
    else:
      # Otherwise every argument has its own offset in prg.buf_offs: buffers first, then values.
      for slot, b in enumerate(ubos):
        self.bind_sints_to_buf(b.va_addr, buf=self.buf, fmt='Q', offset=prg.buf_offs[slot])
      for slot, v in enumerate(vals):
        self.bind_sints_to_buf(v, buf=self.buf, fmt='I', offset=prg.buf_offs[slot+len(ubos)])

    def _tex(b, ibo=False):
      # b is an (argument index, buffer) pair; returns the 16-word descriptor for it.
      img = b[1].image or prg.buf_dtypes[b[0]]
      fmt = mesa.FMT6_32_32_32_32_FLOAT if img.itemsize == 4 else mesa.FMT6_16_16_16_16_FLOAT
      if ibo: const_0 = qreg.a6xx_tex_const_0(fmt=fmt)
      else: const_0 = qreg.a6xx_tex_const_0(0x8, swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=fmt)
      return [const_0,
              qreg.a6xx_tex_const_1(width=img.shape[1], height=img.shape[0]),
              qreg.a6xx_tex_const_2(type=mesa.A6XX_TEX_2D, pitch=img.pitch, pitchalign=ctz(img.pitch)-6), 0, *data64_le(b[1].va_addr),
              qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13), 0, 0, 0, 0, 0, 0, 0, 0]

    self.bind_sints_to_buf(*flatten([_tex(t) for t in texs]), buf=self.buf, fmt='I', offset=prg.tex_off)
    self.bind_sints_to_buf(*flatten([_tex(i, ibo=True) for i in ibos]), buf=self.buf, fmt='I', offset=prg.ibo_off)
|
|
|
|
class QCOMProgram(HCQProgram):
  """A compiled kernel for a QCOMDevice.

  Construction extracts the shader image and launch metadata from `lib` (via IR3Compiler.unpack_lib when the
  device renderer is an IR3Renderer, otherwise by parsing the binary in `_parse_lib`), uploads the image to a
  GPU buffer and derives the private-memory, shared-memory and thread limits used at launch.
  """
  def __init__(self, dev: QCOMDevice, name: str, lib: bytes, buf_dtypes=None):
    """
    dev: owning device. name: kernel name. lib: compiled library bytes.
    buf_dtypes: per-argument dtypes, indexed by argument position; defaults to a fresh empty list.
    """
    self.dev: QCOMDevice = dev
    # A None default replaces the former shared `[]` default so instances never alias one module-level list.
    self.buf_dtypes = [] if buf_dtypes is None else buf_dtypes
    self.name, self.lib, self.NIR = name, lib, isinstance(dev.renderer, IR3Renderer)

    if self.NIR:
      from tinygrad.runtime.support.compiler_mesa import IR3Compiler
      v, cs, self.imm_vals, self.image = IR3Compiler.unpack_lib(lib)
      self.prg_offset, self.brnchstck, self.image_size, self.pvtmem, self.shmem = 0, v.branchstack, v.info.size, v.pvtmem_size, v.shared_size
      # 0xfc marks "no workgroup-size constant": exec() only writes the local size when wgsz differs from it.
      self.wgsz = alloc.offset_vec4 * 4 + 8 if (alloc:=cs.allocs.consts[mesa.IR3_CONST_ALLOC_DRIVER_PARAMS]).size_vec4 else 0xfc

      self.wgid, self.lid = v.cs.work_group_id, v.cs.local_invocation_id # register ids
      self.buf_off, self.imm_off = cs.ubo_state.range[0].offset, cs.allocs.max_const_offset_vec4 * 16

      # see https://elixir.bootlin.com/mesa/mesa-25.3.0/source/src/freedreno/ir3/ir3_shader.h#L525
      # and https://elixir.bootlin.com/mesa/mesa-25.3.0/source/src/freedreno/ir3/ir3_compiler_nir.c#L5389
      self.samp_cnt, self.tex_cnt, self.ibo_cnt = (nt:=v.image_mapping.num_tex), nt, v.num_uavs - nt
      # IR3 outputs a sampler for every texture (https://elixir.bootlin.com/mesa/mesa-25.3.0/source/src/freedreno/ir3/ir3_compiler_nir.c#L1714)
      self.samplers = [qreg.a6xx_tex_samp_0(wrap_s=(clamp_mode:=mesa.A6XX_TEX_CLAMP_TO_BORDER), wrap_t=clamp_mode, wrap_r=clamp_mode),
                       qreg.a6xx_tex_samp_1(unnorm_coords=True, cubemapseamlessfiltoff=True), 0, 0] * self.samp_cnt

      # Descriptor areas start at byte 2048 of the kernargs buffer, 0x40 bytes each: textures, then IBOs, then samplers.
      self.tex_off, self.ibo_off, self.samp_off = 2048, 2048 + 0x40 * self.tex_cnt, 2048 + 0x40 * (self.tex_cnt + self.ibo_cnt)
      self.fregs, self.hregs = v.info.max_reg + 1, v.info.max_half_reg + 1
      self.consts_info:list[tuple] = []
    else: self._parse_lib()

    # Upload the shader image to a CPU-accessible GPU buffer.
    self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True))
    to_mv(self.lib_gpu.va_addr, self.image_size)[:] = self.image

    self.pvtmem_size_per_item: int = round_up(self.pvtmem, 512) >> 9
    self.pvtmem_size_total: int = self.pvtmem_size_per_item * 128 * 2
    self.hw_stack_offset: int = round_up(next_power2(round_up(self.pvtmem, 512)) * 128 * 16, 0x1000)
    self.shared_size: int = max(1, (self.shmem - 1) // 1024)
    # Thread limit derived from register usage (two half registers count as one full register), capped at 1024.
    self.max_threads = min(1024, ((384 * 32) // (max(1, (self.fregs + round_up(self.hregs, 2) // 2)) * 128)) * 128)
    dev._ensure_stack_size(self.hw_stack_offset * 4)

    kernargs_alloc_size = round_up(2048 + (self.tex_cnt + self.ibo_cnt) * 0x40 + len(self.samplers) * 4, 0x100)
    super().__init__(QCOMArgsState, self.dev, self.name, kernargs_alloc_size=kernargs_alloc_size)
    # Release the uploaded image when the program object is collected.
    weakref.finalize(self, self._fini, self.dev, self.lib_gpu, buf_spec)

  def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
    """Validate the launch dimensions and delegate to HCQProgram.__call__.

    Raises RuntimeError when the local size exceeds `max_threads` or the dimension check below fails.
    """
    if self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch")
    # NOTE(review): both conditions must hold for this to raise; since max_threads <= 1024 the second one looks
    # unreachable after the check above, so `or` may have been intended. Left as-is — confirm against hardware limits.
    if any(g*l>mx for g,l,mx in zip(global_size, local_size, [65536, 65536, 65536])) and any(l>mx for l,mx in zip(local_size, [1024, 1024, 1024])):
      raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
    return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)

  def _parse_lib(self):
    """Populate image, memory, sampler, argument-offset, constant and register fields from `self.lib`."""
    # Extract image binary
    self.image_size = _read_lib(self.lib, 0x100)
    self.image = bytearray(self.lib[(image_offset:=_read_lib(self.lib, 0xc0)):image_offset+self.image_size])

    # Parse image descriptors
    image_desc_off = _read_lib(self.lib, 0x110)
    self.prg_offset, self.brnchstck = _read_lib(self.lib, image_desc_off+0xc4), _read_lib(self.lib, image_desc_off+0x108) // 2
    self.pvtmem, self.shmem = _read_lib(self.lib, image_desc_off+0xc8), _read_lib(self.lib, image_desc_off+0xd8)

    # Fill up constants and buffers info
    self.consts_info = []

    # Collect sampler info.
    self.samp_cnt = samp_cnt_in_file = _read_lib(self.lib, image_desc_off + 0xdc)
    assert self.samp_cnt <= 1, "Up to one sampler supported"
    if self.samp_cnt:
      # One sampler in the file becomes two 4-word sampler slots; the second slot is all zeros.
      self.samp_cnt += 1
      self.samplers = [qreg.a6xx_tex_samp_0(wrap_s=(clamp_mode:=mesa.A6XX_TEX_CLAMP_TO_BORDER), wrap_t=clamp_mode, wrap_r=clamp_mode),
                       qreg.a6xx_tex_samp_1(unnorm_coords=True, cubemapseamlessfiltoff=True), 0, 0, 0, 0, 0, 0]
    else: self.samplers = []

    # Collect kernel arguments (buffers) info.
    bdoff, binfos = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * samp_cnt_in_file, []
    while bdoff + 32 <= len(self.lib):
      # Each record starts with its own length; a zero length terminates the list.
      length, _, _, offset_words, _, _, _, typ = struct.unpack("8I", self.lib[bdoff:bdoff+32])
      if length == 0: break
      binfos.append((offset_words * 4, typ))
      bdoff += length
    self.buf_offs = [off for off,typ in binfos if typ not in {BUFTYPE_TEX, BUFTYPE_IBO}]

    # Setting correct offsets to textures/ibos.
    # Compare by value: `is` on ints only works through CPython's small-integer cache.
    self.tex_cnt, self.ibo_cnt = sum(typ == BUFTYPE_TEX for _,typ in binfos), sum(typ == BUFTYPE_IBO for _,typ in binfos)
    self.ibo_off, self.tex_off, self.samp_off = 2048, 2048 + 0x40 * self.ibo_cnt, 2048 + 0x40 * self.tex_cnt + 0x40 * self.ibo_cnt

    if _read_lib(self.lib, 0xb0) != 0: # check if we have constants.
      cdoff = _read_lib(self.lib, 0xac)
      # Constant records are 40 bytes each and end where the image begins.
      while cdoff + 40 <= image_offset:
        cnst, offset_words, _, is32 = struct.unpack("I", self.lib[cdoff:cdoff+4])[0], *struct.unpack("III", self.lib[cdoff+16:cdoff+28])
        self.consts_info.append((cnst, offset_words * (sz_bytes:=(2 << is32)), sz_bytes))
        cdoff += 40

    # Registers info
    reg_desc_off = _read_lib(self.lib, 0x34)
    self.fregs, self.hregs = _read_lib(self.lib, reg_desc_off + 0x14), _read_lib(self.lib, reg_desc_off + 0x18)
|
|
|
|
class QCOMTextureInfo:
  """Plain record holding a texture's `pitch`, `real_stride` and its `desc` / `ibo` word lists."""
  def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]):
    self.pitch = pitch
    self.real_stride = real_stride
    self.desc = desc
    self.ibo = ibo
|
|
|
|
class QCOMAllocator(HCQAllocatorBase):
  """Allocator for QCOMDevice buffers; host<->device copies are done with CPU memmove, row by row for images."""
  def _alloc(self, size:int, opts:BufferSpec) -> HCQBuffer:
    """Allocate device memory, or map `opts.external_ptr` when one is given."""
    image = opts.image
    # Recalculate real size for texture
    if image is not None: size = image.pitch * image.shape[0]
    if opts.external_ptr: return self.dev._gpu_map(opts.external_ptr, size, image=image)
    return self.dev._gpu_alloc(size, image=image)

  def _do_copy(self, src_addr, dest_addr, src_size, real_size, src_stride, dest_stride, prof_text, dest_off=0, src_off=0):
    """Copy `real_size` bytes per step, advancing source and destination by their own strides until
    the source offset reaches `src_size`. The whole copy is recorded as one profiled copy event."""
    with cpu_profile(prof_text, self.dev.device, is_copy=True):
      while src_off < src_size:
        ctypes.memmove(dest_addr+dest_off, src_addr+src_off, real_size)
        src_off += src_stride
        dest_off += dest_stride

  def _copyin(self, dest:HCQBuffer, src:memoryview):
    """Copy host memory `src` into `dest`; image rows are spread out to the image pitch."""
    if dest.image:
      stride, pitch = dest.image.shape[1] * 4 * dest.image.itemsize, dest.image.pitch
    else:
      # Plain buffer: a single chunk covering the whole source.
      stride = pitch = src.nbytes
    self._do_copy(mv_address(src), dest.cpu_view().addr, src.nbytes, stride, stride, pitch, f"TINY -> {self.dev.device}")

  def _copyout(self, dest:memoryview, src:HCQBuffer):
    """Copy `src` into host memory `dest` after synchronizing the device; image rows are packed tightly."""
    self.dev.synchronize()

    if src.image:
      stride, pitch = src.image.shape[1] * 4 * src.image.itemsize, src.image.pitch
    else:
      # Plain buffer: a single chunk covering the whole source.
      stride = pitch = src.size
    self._do_copy(src.cpu_view().addr, mv_address(dest), src.size, stride, pitch, stride, f"{self.dev.device} -> TINY")

  def _as_buffer(self, src:HCQBuffer) -> memoryview:
    """Return a memoryview over the buffer's CPU mapping after synchronizing the device."""
    self.dev.synchronize()
    return to_mv(src.cpu_view().addr, src.size)

  def _do_free(self, opaque, options:BufferSpec):
    self.dev._gpu_free(opaque)
|
|
|
|
def flag(nm, val):
  """Place `val` into the kgsl bitfield `nm`: shift by `<nm>_SHIFT`, then clip with `<nm>_MASK`."""
  shifted = val << getattr(kgsl, f"{nm}_SHIFT")
  return shifted & getattr(kgsl, f"{nm}_MASK")
|
|
|
|
class QCOMDevice(HCQCompiled):
  """HCQ device driven through the kgsl driver node /dev/kgsl-3d0."""
  def __init__(self, device:str=""):
    """Open the driver node, create a draw context, allocate the shared buffers and register compilers.

    Raises RuntimeError when the reported chip has gpu_id (7, 3) or newer.
    """
    self.fd = FileIOInterface('/dev/kgsl-3d0', os.O_RDWR)
    # Scratch address used as the target of cache-flush event writes.
    self.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr)

    flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC
    flags |= flag("KGSL_CONTEXT_PRIORITY", getenv("QCOM_PRIORITY", 8))
    flags |= flag("KGSL_CONTEXT_PREEMPT_STYLE", kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)
    self.ctx = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=flags).drawctxt_id

    # 16 MiB command buffer handed out in wrapping bump-allocated slices.
    self.cmd_buf = self._gpu_alloc(16 << 20)
    self.cmd_buf_allocator = BumpAllocator(size=self.cmd_buf.size, base=cast(int, self.cmd_buf.va_addr), wrap=True)

    self.border_color_buf = self._gpu_alloc(0x1000, fill_zeroes=True)

    self.last_cmd:int = 0

    # Set max power
    # pwr_data must stay referenced while the ioctl runs: the packed struct only carries its address.
    pwr_data = memoryview(array.array('I', [1]))
    pwr = memoryview(bytearray(0x18))
    struct.pack_into('IIQQ', pwr, 0, 1, self.ctx, mv_address(pwr_data), 4)
    kgsl.IOCTL_KGSL_SETPROPERTY(self.fd, type=kgsl.KGSL_PROP_PWR_CONSTRAINT, value=mv_address(pwr), sizebytes=pwr.nbytes)

    # Load info about qcom device
    info = kgsl.struct_kgsl_devinfo()
    kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY(self.fd, type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info))
    chip_id = info.chip_id
    self.gpu_id = (chip_id >> 24, (chip_id >> 16) & 0xFF, (chip_id >> 8) & 0xFF)

    # a7xx start with 730x or 'Cxxx', a8xx starts 'Exxx'
    if self.gpu_id[:2] >= (7, 3): raise RuntimeError(f"Unsupported GPU: chip_id={info.chip_id:#x}")

    if PROFILE and self.gpu_id[:2] < (7, 3):
      System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", value="4000000000", msg="Failed to disable suspend mode", expected="4294967276")

    compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[
      CompilerPair(QCOMRenderer, functools.partial(QCOMCompiler, device)),
      CompilerPair(functools.partial(IR3Renderer, info.chip_id), None, QCOM_IR3),
    ])
    super().__init__(device, QCOMAllocator(self), compilers, functools.partial(QCOMProgram, self), QCOMSignal,
                     functools.partial(QCOMComputeQueue, self), None)

  def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False, **kwargs) -> HCQBuffer:
    """Allocate a GPU object of `size` bytes (rounded up to 4 KiB for the driver) and mmap it into the process.

    Returns an HCQBuffer reporting the requested `size`; its meta is `(alloc, True)`. Extra kwargs go to HCQBuffer.
    """
    alignment_hint = 12
    flags |= flag("KGSL_MEMALIGN", alignment_hint) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
    if uncached: flags |= flag("KGSL_CACHEMODE", kgsl.KGSL_CACHEMODE_UNCACHED)

    bosz = round_up(size, 1<<alignment_hint)
    alloc = kgsl.IOCTL_KGSL_GPUOBJ_ALLOC(self.fd, size=bosz, flags=flags, mmapsize=bosz)
    va_addr = self.fd.mmap(0, bosz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, alloc.id * 0x1000)

    if fill_zeroes: ctypes.memset(va_addr, 0, size)
    return HCQBuffer(va_addr=va_addr, size=size, meta=(alloc, True), view=MMIOInterface(va_addr, size, fmt='B'), owner=self, **kwargs)

  def _gpu_map(self, ptr:int, size:int, **kwargs) -> HCQBuffer:
    """Map the host range starting at `ptr` into the GPU address space.

    The range is widened to 4 KiB page boundaries for the ioctl. When the ioctl fails with errno 14
    (EFAULT on Linux) an unmapped buffer pointing at `ptr` is returned instead; any other OSError is
    re-raised as RuntimeError.
    """
    ptr_aligned = ptr & ~0xfff
    size_aligned = round_up(size + (ptr & 0xfff), 0x1000)
    try:
      mi = kgsl.IOCTL_KGSL_MAP_USER_MEM(self.fd, hostptr=ptr_aligned, len=size_aligned, memtype=kgsl.KGSL_USER_MEM_TYPE_ADDR)
      return HCQBuffer(mi.gpuaddr + (ptr - ptr_aligned), size=size, meta=(mi, False), view=MMIOInterface(ptr, size, fmt='B'), owner=self, **kwargs)
    except OSError as e:
      if e.errno == 14:
        return HCQBuffer(va_addr=ptr, size=size, meta=(None, False), view=MMIOInterface(ptr, size, fmt='B'), owner=self, **kwargs)
      raise RuntimeError("Failed to map external pointer to GPU memory") from e

  def _gpu_free(self, mem:HCQBuffer):
    """Free the GPU object behind `mem`; buffers created by `_gpu_alloc` are also unmapped."""
    handle, was_mmapped = mem.meta[0], mem.meta[1]
    # Buffers from the unmapped fallback in _gpu_map carry no driver object.
    if handle is None: return
    kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=handle.id)
    if was_mmapped: FileIOInterface.munmap(mem.va_addr, handle.mmapsize)

  def _ensure_stack_size(self, sz):
    """Make sure `self._stack` exists and is at least `sz` bytes, reallocating (after a sync) when it is too small."""
    if hasattr(self, '_stack'):
      if self._stack.size >= sz: return
      # The old stack may still be in use by queued work: wait before freeing it.
      self.synchronize()
      self._gpu_free(self._stack)
    self._stack = self._gpu_alloc(sz)

  def _at_profile_finalize(self):
    super()._at_profile_finalize()
    # Best effort: a failure to restore the idle timer is ignored.
    with contextlib.suppress(RuntimeError):
      System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", "10", "Failed to reenable suspend mode")
|