QCOM cleanups (#15435)

This commit is contained in:
Christopher Milan
2026-03-23 19:18:38 -07:00
committed by GitHub
parent 85dee83f5d
commit d5320a9ddf
7 changed files with 239 additions and 48 deletions

View File

@@ -56,6 +56,7 @@ jobs:
python3 -c "from tinygrad.runtime.autogen import libusb"
python3 -c "from tinygrad.runtime.autogen import mesa"
python3 -c "from tinygrad.runtime.autogen import avcodec"
python3 -c "from tinygrad.runtime.autogen import llvm_qcom"
REGEN=1 python3 -c "from tinygrad.runtime.autogen import libclang"
- name: Check for differences
run: |

113
extra/tinydreno.h Normal file
View File

@@ -0,0 +1,113 @@
#ifndef _TINYDRENO_H
#define _TINYDRENO_H
#include <stdint.h>
#include <stddef.h>
// Opaque compiler-instance handle. It is only ever obtained from
// cl_compiler_create_llvm_instance and passed back to the other cl_compiler_* calls.
typedef void * cl_llvm_instance;
// Takes no arguments and returns an instance handle.
cl_llvm_instance cl_compiler_create_llvm_instance(void);
// Counterpart of cl_compiler_create_llvm_instance; presumably releases `inst` — TODO confirm.
void cl_compiler_destroy_llvm_instance(cl_llvm_instance inst);
// Tag stored in cl_handle.type. Per the struct comments in this header, COMPILED and
// LIBRARY handles carry a cl_compiled_data payload and LINKED handles carry a
// cl_executable_data payload.
enum cl_handle_type {
// numbering starts at 1, not 0
CL_HANDLE_COMPILED = 1,
// implicit value 2
CL_HANDLE_LIBRARY,
// implicit value 3
CL_HANDLE_LINKED
};
// Payload for handles of type CL_HANDLE_COMPILED and CL_HANDLE_LIBRARY
// (reached through the `compiled` member of the cl_handle union).
struct cl_compiled_data {
uint64_t chip_id;
// presumably one of CL_MODE_32BIT / CL_MODE_64BIT — TODO confirm
uint32_t mode;
// on LP64 targets 4 bytes of alignment padding sit between `mode` and this pointer
void *llvm_bitcode;
// length in bytes of the buffer behind llvm_bitcode
uint64_t llvm_bitcode_size;
// NOTE(review): the Python caller reads this as a NUL-terminated string — confirm it is never NULL on error
char *build_log;
uint32_t build_log_len;
// the Python caller treats any non-zero value as a compilation failure
uint32_t error_code;
};
// Payload for handles of type CL_HANDLE_LINKED
// (reached through the `executable` member of the cl_handle union).
struct cl_executable_data {
int32_t num_kernels;
// layout of the pointed-to data is not described in this header
void *kernel_props;
// the Python caller treats any non-zero value as a link failure
uint32_t error_code;
// NOTE(review): the Python caller reads this as a NUL-terminated string — confirm it is never NULL on error
char *build_log;
// 0x20 bytes of unidentified content, kept only so the following fields land at the right offsets
char _unk0[0x20];
uint64_t chip_id;
// presumably one of CL_MODE_32BIT / CL_MODE_64BIT — TODO confirm
uint32_t mode;
};
// Result object returned by cl_compiler_compile_source and cl_compiler_link_program.
// `type` says which union member to read: `executable` for CL_HANDLE_LINKED,
// `compiled` for CL_HANDLE_COMPILED and CL_HANDLE_LIBRARY.
typedef struct {
enum cl_handle_type type;
// anonymous union: both members alias the same pointer slot
union {
struct cl_compiled_data *compiled;
struct cl_executable_data *executable;
};
} cl_handle;
// Values for the `mode` argument of the compile/link calls below.
#define CL_MODE_32BIT 0
#define CL_MODE_64BIT 1
// Values for the `source_type` argument of cl_compiler_compile_source.
#define CL_SRC_STR 0
#define CL_SRC_BLOB 1
// Compiles `source` and returns a handle describing the result.
// p5, p6, p7 and p11 have no known meaning; the in-tree Python caller passes 0, 0, 0 and NULL.
// That caller also passes source_len = 0 together with CL_SRC_STR and a NUL-terminated
// string — presumably the length is ignored for string sources; TODO confirm.
cl_handle *cl_compiler_compile_source(cl_llvm_instance inst, uint64_t chip_id, int mode, const char *options, int p5, uint64_t p6, uint64_t p7,
const char *source, uint64_t source_len, uint64_t source_type, void *p11);
// Links `num_handles` handles taken from the `input_handles` array and returns a new handle.
// The in-tree Python caller passes NULL for `options`.
cl_handle *cl_compiler_link_program(cl_llvm_instance inst, uint64_t chip_id, int mode, const char *options, int num_handles,
cl_handle **input_handles);
// Writes a buffer pointer to *out_ptr and its size in bytes to *out_size.
// The in-tree Python caller copies the buffer and then hands the pointer to cl_compiler_free_assembly.
void cl_compiler_handle_create_binary(cl_handle *handle, void **out_ptr, size_t *out_size);
// lib binary format: the buffer produced by cl_compiler_handle_create_binary for a
// handle of type CL_HANDLE_LINKED (value 3)
// layout: cl_lib_header, then cl_lib_section[num_sections], then data
// Section identifiers, matched against cl_lib_section.id.
#define CL_LIB_PROGRAM 0
#define CL_LIB_CONSTS 6
#define CL_LIB_IMAGE 7
#define CL_LIB_CODE 10
#define CL_LIB_IMAGE_DESC 11
// One section-table entry: five 32-bit fields, 20 bytes in total.
typedef struct {
// one of the CL_LIB_* identifiers above
uint32_t id;
// NOTE(review): presumably a byte offset from the start of the lib buffer — confirm
uint32_t offset;
uint32_t size;
uint32_t count;
uint32_t entry_size;
} cl_lib_section;
// Fixed 48-byte header; the section table starts immediately after it.
typedef struct {
// unidentified
uint32_t _unk0[6];
// number of entries in `sections`
uint32_t num_sections;
// unidentified
uint32_t _unk1[5];
// flexible array member: contributes nothing to sizeof(cl_lib_header)
cl_lib_section sections[];
} cl_lib_header;
// at sections[CL_LIB_PROGRAM].offset
// 28-byte record.
typedef struct {
char name[8];
// unidentified
uint32_t _unk0[3];
// NOTE(review): presumably full- and half-precision register counts — confirm
uint32_t fregs;
uint32_t hregs;
} cl_lib_prog;
// at sections[CL_LIB_IMAGE_DESC].offset
// The fixed part is 344 bytes (0xc4 + 4 + 4 + 0x0c + 4 + 4 + 0x28 + 4 + 0x4c); kernel_name follows it.
// The _unk* arrays are unidentified filler that keeps the named fields at their offsets.
typedef struct {
char _unk0[0xc4];
uint32_t prg_offset;
// NOTE(review): presumably private-memory size — confirm
uint32_t pvtmem;
char _unk1[0x0c];
// NOTE(review): presumably shared-memory size — confirm
uint32_t shmem;
// NOTE(review): presumably sampler count — confirm
uint32_t samp_cnt;
char _unk2[0x28];
// NOTE(review): presumably branch-stack size — confirm
uint32_t brnchstck;
// there is no _unk3; the name is kept as-is because the generated bindings expose it
char _unk4[0x4c];
// flexible array member; NOTE(review): presumably NUL-terminated — confirm
char kernel_name[];
} cl_lib_img_desc;
// Called by the in-tree Python caller on handles returned by cl_compiler_compile_source
// and cl_compiler_link_program once it is done with them.
void cl_compiler_free_handle(cl_handle *handle);
// Called by the in-tree Python caller on the buffer pointer written to *out_ptr by
// cl_compiler_handle_create_binary, after the contents have been copied out.
void cl_compiler_free_assembly(void *ptr);
#endif

View File

@@ -2,7 +2,7 @@ from typing import Callable, cast, Any
from tinygrad.dtype import AddrSpace, DType, PtrDType, ImageDType, dtypes, truncate
from tinygrad.helpers import DEBUG, OSX, unwrap, fromimport
from tinygrad.renderer import Renderer
from tinygrad.renderer.cstyle import CUDARenderer
from tinygrad.renderer.cstyle import CUDARenderer, OpenCLRenderer
from tinygrad.uop.ops import GroupOp, Ops, UOp, PatternMatcher, UPat, range_str
from tinygrad.runtime.autogen import mesa
from tinygrad.runtime.support.c import POINTER
@@ -264,7 +264,7 @@ _nload_img = nir_instr(intrins=lambda dtype:{'IMAGE_DIM':mesa.GLSL_SAMPLER_DIM_2
nc=4, bs=32, num_components=4, srcs=lambda b,img,coord:[nsrc(x) for x in [img, tovec(b, coord), nundef(b, dtypes.int), nimm(b, 0, dtypes.int)]])(
lambda b,img,coord,dtype: mesa.nir_intrinsic_instr_create(b.shader, g("nir_intrinsic_image_load")))
class IR3Renderer(NIRRenderer):
class IR3Renderer(NIRRenderer, OpenCLRenderer):
device = "QCOM"
has_aux = True
@@ -299,5 +299,3 @@ class IR3Renderer(NIRRenderer):
self.b.shader.contents.info.num_ubos = len([u for u in bufs if not isinstance(u.dtype, ImageDType)])
self.b.shader.contents.info.num_images = texs() + imgs()
def aux(self, uops:list[UOp]): return (tuple(u.dtype for u in uops if u.op == Ops.PARAM),)

View File

@@ -160,4 +160,5 @@ def __getattr__(nm):
case "corefoundation": return load("corefoundation", "'CoreFoundation'",
[f"{macossdk}/System/Library/Frameworks/CoreFoundation.framework/Headers/CF{s}.h" for s in ["String", "Data"]],
args=["-isysroot", macossdk])
case "llvm_qcom": return load("llvm_qcom", "'llvm-qcom'", [root/"extra/tinydreno.h"])
case _: raise AttributeError(f"no such autogen: {nm}")

View File

@@ -0,0 +1,102 @@
# mypy: disable-error-code="empty-body"
from __future__ import annotations
import ctypes
from typing import Annotated, Literal, TypeAlias
from tinygrad.runtime.support.c import _IO, _IOW, _IOR, _IOWR
from tinygrad.runtime.support import c
dll = c.DLL('llvm_qcom', 'llvm-qcom')
cl_llvm_instance: TypeAlias = ctypes.c_void_p
@dll.bind
def cl_compiler_create_llvm_instance() -> cl_llvm_instance: ...
@dll.bind
def cl_compiler_destroy_llvm_instance(inst:cl_llvm_instance) -> None: ...
class enum_cl_handle_type(Annotated[int, ctypes.c_uint32], c.Enum): pass
CL_HANDLE_COMPILED = enum_cl_handle_type.define('CL_HANDLE_COMPILED', 1)
CL_HANDLE_LIBRARY = enum_cl_handle_type.define('CL_HANDLE_LIBRARY', 2)
CL_HANDLE_LINKED = enum_cl_handle_type.define('CL_HANDLE_LINKED', 3)
@c.record
class struct_cl_compiled_data(c.Struct):
SIZE = 48
chip_id: Annotated[uint64_t, 0]
mode: Annotated[uint32_t, 8]
llvm_bitcode: Annotated[ctypes.c_void_p, 16]
llvm_bitcode_size: Annotated[uint64_t, 24]
build_log: Annotated[c.POINTER[Annotated[bytes, ctypes.c_char]], 32]
build_log_len: Annotated[uint32_t, 40]
error_code: Annotated[uint32_t, 44]
uint64_t: TypeAlias = Annotated[int, ctypes.c_uint64]
uint32_t: TypeAlias = Annotated[int, ctypes.c_uint32]
@c.record
class struct_cl_executable_data(c.Struct):
SIZE = 80
num_kernels: Annotated[int32_t, 0]
kernel_props: Annotated[ctypes.c_void_p, 8]
error_code: Annotated[uint32_t, 16]
build_log: Annotated[c.POINTER[Annotated[bytes, ctypes.c_char]], 24]
_unk0: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[32]], 32]
chip_id: Annotated[uint64_t, 64]
mode: Annotated[uint32_t, 72]
int32_t: TypeAlias = Annotated[int, ctypes.c_int32]
@c.record
class cl_handle(c.Struct):
SIZE = 16
type: Annotated[enum_cl_handle_type, 0]
compiled: Annotated[c.POINTER[struct_cl_compiled_data], 8]
executable: Annotated[c.POINTER[struct_cl_executable_data], 8]
@dll.bind
def cl_compiler_compile_source(inst:cl_llvm_instance, chip_id:uint64_t, mode:Annotated[int, ctypes.c_int32], options:c.POINTER[Annotated[bytes, ctypes.c_char]], p5:Annotated[int, ctypes.c_int32], p6:uint64_t, p7:uint64_t, source:c.POINTER[Annotated[bytes, ctypes.c_char]], source_len:uint64_t, source_type:uint64_t, p11:ctypes.c_void_p) -> c.POINTER[cl_handle]: ...
@dll.bind
def cl_compiler_link_program(inst:cl_llvm_instance, chip_id:uint64_t, mode:Annotated[int, ctypes.c_int32], options:c.POINTER[Annotated[bytes, ctypes.c_char]], num_handles:Annotated[int, ctypes.c_int32], input_handles:c.POINTER[c.POINTER[cl_handle]]) -> c.POINTER[cl_handle]: ...
size_t: TypeAlias = Annotated[int, ctypes.c_uint64]
@dll.bind
def cl_compiler_handle_create_binary(handle:c.POINTER[cl_handle], out_ptr:c.POINTER[ctypes.c_void_p], out_size:c.POINTER[size_t]) -> None: ...
@c.record
class cl_lib_section(c.Struct):
SIZE = 20
id: Annotated[uint32_t, 0]
offset: Annotated[uint32_t, 4]
size: Annotated[uint32_t, 8]
count: Annotated[uint32_t, 12]
entry_size: Annotated[uint32_t, 16]
@c.record
class cl_lib_header(c.Struct):
SIZE = 48
_unk0: Annotated[c.Array[uint32_t, Literal[6]], 0]
num_sections: Annotated[uint32_t, 24]
_unk1: Annotated[c.Array[uint32_t, Literal[5]], 28]
sections: Annotated[c.Array[cl_lib_section, Literal[0]], 48]
@c.record
class cl_lib_prog(c.Struct):
SIZE = 28
name: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[8]], 0]
_unk0: Annotated[c.Array[uint32_t, Literal[3]], 8]
fregs: Annotated[uint32_t, 20]
hregs: Annotated[uint32_t, 24]
@c.record
class cl_lib_img_desc(c.Struct):
SIZE = 344
_unk0: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[196]], 0]
prg_offset: Annotated[uint32_t, 196]
pvtmem: Annotated[uint32_t, 200]
_unk1: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[12]], 204]
shmem: Annotated[uint32_t, 216]
samp_cnt: Annotated[uint32_t, 220]
_unk2: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[40]], 224]
brnchstck: Annotated[uint32_t, 264]
_unk4: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[76]], 268]
kernel_name: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[0]], 344]
@dll.bind
def cl_compiler_free_handle(handle:c.POINTER[cl_handle]) -> None: ...
@dll.bind
def cl_compiler_free_assembly(ptr:ctypes.c_void_p) -> None: ...
c.init_records()
CL_MODE_32BIT = 0 # type: ignore
CL_MODE_64BIT = 1 # type: ignore
CL_SRC_STR = 0 # type: ignore
CL_SRC_BLOB = 1 # type: ignore
CL_LIB_PROGRAM = 0 # type: ignore
CL_LIB_CONSTS = 6 # type: ignore
CL_LIB_IMAGE = 7 # type: ignore
CL_LIB_CODE = 10 # type: ignore
CL_LIB_IMAGE_DESC = 11 # type: ignore

View File

@@ -1,7 +1,7 @@
from __future__ import annotations
import os, ctypes, functools, mmap, struct, array, math, sys, weakref, contextlib
assert sys.platform != 'win32'
from typing import Any
from typing import Any, cast
from tinygrad.device import BufferSpec, CompilerSet, Device
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
@@ -203,7 +203,8 @@ class QCOMArgsState(HCQArgsState):
ubos = [b for i,b in enumerate(bufs) for _,dt in prg.buf_dtypes[i] if not isinstance(dt, ImageDType)]
uavs = [(dt,b) for i,b in enumerate(bufs) for _,dt in prg.buf_dtypes[i] if isinstance(dt, ImageDType)]
ibos, texs = uavs[:prg.ibo_cnt], uavs[prg.ibo_cnt:]
for cnst_val,cnst_off,cnst_sz in prg.consts_info: to_mv(self.buf.va_addr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')
for cnst_val,cnst_off,cnst_sz in prg.consts_info:
to_mv(cast(int, self.buf.va_addr) + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')
if prg.samp_cnt > 0: to_mv(int(self.buf.va_addr) + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
if prg.NIR:
@@ -231,12 +232,13 @@ class QCOMProgram(HCQProgram):
if self.NIR:
from tinygrad.runtime.support.compiler_mesa import IR3Compiler
v, cs, self.imm_vals, self.image = IR3Compiler.unpack_lib(lib)
v, cs, imm_vals, self.image = IR3Compiler.unpack_lib(lib)
self.prg_offset, self.brnchstck, self.image_size, self.pvtmem, self.shmem = 0, v.branchstack, v.info.size, v.pvtmem_size, v.shared_size
self.wgsz = alloc.offset_vec4 * 4 + 8 if (alloc:=cs.allocs.consts[mesa.IR3_CONST_ALLOC_DRIVER_PARAMS]).size_vec4 else 0xfc
self.wgid, self.lid = v.cs.work_group_id, v.cs.local_invocation_id # register ids
self.buf_off, self.imm_off = cs.ubo_state.range[0].offset, cs.allocs.max_const_offset_vec4 * 16
self.buf_off, imm_off = cs.ubo_state.range[0].offset, cs.allocs.max_const_offset_vec4 * 16
self.consts_info = [(struct.unpack_from("<I", imm_vals, i)[0], imm_off + i, 4) for i in range(0, len(imm_vals), 4)]
# see https://elixir.bootlin.com/mesa/mesa-25.3.0/source/src/freedreno/ir3/ir3_shader.h#L525
# and https://elixir.bootlin.com/mesa/mesa-25.3.0/source/src/freedreno/ir3/ir3_compiler_nir.c#L5389
@@ -247,7 +249,6 @@ class QCOMProgram(HCQProgram):
self.tex_off, self.ibo_off, self.samp_off = 2048, 2048 + 0x40 * self.tex_cnt, 2048 + 0x40 * (self.tex_cnt + self.ibo_cnt)
self.fregs, self.hregs = v.info.max_reg + 1, v.info.max_half_reg + 1
self.consts_info:list[tuple] = []
else: self._parse_lib(lib)
self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True))

View File

@@ -1,62 +1,37 @@
import ctypes, struct
from tinygrad.device import Compiler
from tinygrad.helpers import DEBUG, system
from tinygrad.runtime.support.c import DLL
from tinygrad.runtime.support.compiler_mesa import disas_adreno
# see https://github.com/sirhcm/tinydreno
dll = DLL("llvm-qcom", ["llvm-qcom"])
(create_llvm_instance:=dll.cl_compiler_create_llvm_instance).restype, create_llvm_instance.argtypes = ctypes.c_void_p, []
(compile_source:=dll.cl_compiler_compile_source).restype = ctypes.c_void_p
compile_source.argtypes = [ctypes.c_void_p, ctypes.c_uint64, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_uint64, ctypes.c_uint64,
ctypes.c_char_p, ctypes.c_uint64, ctypes.c_uint64, ctypes.c_void_p]
(link_program:=dll.cl_compiler_link_program).restype = ctypes.c_void_p
link_program.argtypes = [ctypes.c_void_p, ctypes.c_uint64, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_void_p]
(get_error_code:=dll.cl_compiler_get_error_code).restype, get_error_code.argtypes = ctypes.c_int, [ctypes.c_void_p]
(get_build_log:=dll.cl_compiler_get_build_log).restype, get_build_log.argtypes = ctypes.c_char_p, [ctypes.c_void_p]
(handle_create_binary:=dll.cl_compiler_handle_create_binary).restype = None
handle_create_binary.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_size_t)]
(free_handle:=dll.cl_compiler_free_handle).restype, free_handle.argtypes = None, [ctypes.c_void_p]
(free_assembly:=dll.cl_compiler_free_assembly).restype, free_assembly.argtypes = None, [ctypes.c_void_p]
(destroy_llvm_instance:=dll.cl_compiler_destroy_llvm_instance).restype, destroy_llvm_instance.argtypes = None, [ctypes.c_void_p]
MODE_32BIT, MODE_64BIT, SRC_STR, SRC_BLOB = 0, 1, 0, 1
from tinygrad.runtime.autogen import llvm_qcom
def _read_lib(lib, off) -> int:
  """Return the native-endian unsigned 32-bit integer stored in `lib` at byte offset `off`."""
  word = lib[off:off+4]
  (value,) = struct.unpack("I", word)
  return value
class QCOMCompiler(Compiler):
def __init__(self, chip_id):
self.chip_id, self.llvm_inst = chip_id, create_llvm_instance()
self.chip_id, self.llvm_inst = chip_id, llvm_qcom.cl_compiler_create_llvm_instance()
super().__init__(f"compile_qcomcl_{chip_id}")
def __del__(self): destroy_llvm_instance(self.llvm_inst)
def __del__(self): llvm_qcom.cl_compiler_destroy_llvm_instance(self.llvm_inst)
def __reduce__(self): return QCOMCompiler, (self.chip_id,)
def checked(self, handle):
if handle is None or get_error_code(handle) != 0:
destroy_llvm_instance(self.llvm_inst)
self.llvm_inst = create_llvm_instance()
raise RuntimeError("QCOM Compilation Error" + ("" if handle is None else f": {get_build_log(handle)}"))
if not handle or (data:=(hc.executable if (hc:=handle.contents).type == llvm_qcom.CL_HANDLE_LINKED else hc.compiled).contents).error_code != 0:
llvm_qcom.cl_compiler_destroy_llvm_instance(self.llvm_inst)
self.llvm_inst = llvm_qcom.cl_compiler_create_llvm_instance()
raise RuntimeError("QCOM Compilation Error" + ("" if not handle else f": {ctypes.string_at(data.build_log).decode()}"))
return handle
def compile(self, src) -> bytes:
ch = self.checked(compile_source(self.llvm_inst, self.chip_id, MODE_64BIT, b"", 0, 0, 0, src.encode(), 0, SRC_STR, None))
if DEBUG >= 8:
handle_create_binary(ch, ctypes.byref(ptr:=ctypes.c_void_p()), ctypes.byref(sz:=ctypes.c_size_t()))
print(system("llvm-dis", input=ctypes.string_at(ptr, sz.value)[16:]))
free_assembly(ptr)
lh = self.checked(link_program(self.llvm_inst, self.chip_id, MODE_64BIT, None, 1, ctypes.pointer(ctypes.c_void_p(ch))))
handle_create_binary(lh, ctypes.byref(ptr:=ctypes.c_void_p()), ctypes.byref(sz:=ctypes.c_size_t()))
for h in [ch, lh]: free_handle(h)
ch = self.checked(llvm_qcom.cl_compiler_compile_source(self.llvm_inst, self.chip_id, llvm_qcom.CL_MODE_64BIT, b"", 0, 0, 0, src.encode(), 0,
llvm_qcom.CL_SRC_STR, None))
if DEBUG >= 8: print(system("llvm-dis", input=ctypes.string_at((comp:=ch.contents.compiled.contents).llvm_bitcode, comp.llvm_bitcode_size)))
lh = self.checked(llvm_qcom.cl_compiler_link_program(self.llvm_inst, self.chip_id, llvm_qcom.CL_MODE_64BIT, None, 1, ch))
llvm_qcom.cl_compiler_handle_create_binary(lh, ctypes.byref(ptr:=ctypes.c_void_p()), ctypes.byref(sz:=ctypes.c_size_t()))
for h in [ch, lh]: llvm_qcom.cl_compiler_free_handle(h)
ret = ctypes.string_at(ptr, sz.value)
free_assembly(ptr)
llvm_qcom.cl_compiler_free_assembly(ptr)
return ret
def disassemble(self, lib: bytes): disas_adreno(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)], self.chip_id)