# Patch summary (leading text before the first "diff --git" header; not part of any hunk).
#
# Files touched, as shown by the diff headers below:
#   .github/workflows/autogen.yml        - adds an import check for tinygrad.runtime.autogen.llvm_qcom.
#   extra/tinydreno.h                    - new header declaring the cl_compiler_* entry points, the
#                                          cl_handle / cl_compiled_data / cl_executable_data structs and
#                                          the cl_lib_* binary layout structs.
#   tinygrad/renderer/nir.py             - IR3Renderer additionally inherits from OpenCLRenderer and its
#                                          own aux() method is removed.
#   tinygrad/runtime/autogen/__init__.py - adds the "llvm_qcom" case, generated from extra/tinydreno.h.
#   tinygrad/runtime/autogen/llvm_qcom.py - new generated bindings for the header above.
#   tinygrad/runtime/ops_qcom.py         - wraps self.buf.va_addr in cast(int, ...) when writing constants;
#                                          imm_vals / imm_off become locals feeding self.consts_info.
#   (QCOMCompiler hunk)                  - calls go through llvm_qcom.* bindings; error code and build log
#                                          are read from the handle's struct fields; the DEBUG >= 8 dump reads
#                                          llvm_bitcode / llvm_bitcode_size directly from the compiled data.
#
# NOTE(review): this copy of the patch has its newlines collapsed and appears to have lost text:
#   the two "#include" lines in tinydreno.h carry no header names, and the ops_qcom.py hunk jumps from
#   'struct.unpack_from("' straight to ' int: return struct.unpack(...)', so the file header for the
#   QCOMCompiler changes is not visible. Presumably angle-bracketed text was stripped - recover the
#   original commit before applying.
# NOTE(review): cl_compiler_link_program is bound with input_handles as POINTER[POINTER[cl_handle]], but the
#   new call passes `ch` (the POINTER[cl_handle] returned by cl_compiler_compile_source) where the removed
#   code passed ctypes.pointer(ctypes.c_void_p(ch)). Confirm the binding layer accepts this - TODO verify.
# NOTE(review): checked() raises without a visible cl_compiler_free_handle call, and compile() frees `ch`
#   only after the link step has passed checked(). Confirm whether handles can leak on the error paths.
diff --git a/.github/workflows/autogen.yml b/.github/workflows/autogen.yml index 8b3a3e759f..c7c4842638 100644 --- a/.github/workflows/autogen.yml +++ b/.github/workflows/autogen.yml @@ -56,6 +56,7 @@ jobs: python3 -c "from tinygrad.runtime.autogen import libusb" python3 -c "from tinygrad.runtime.autogen import mesa" python3 -c "from tinygrad.runtime.autogen import avcodec" + python3 -c "from tinygrad.runtime.autogen import llvm_qcom" REGEN=1 python3 -c "from tinygrad.runtime.autogen import libclang" - name: Check for differences run: | diff --git a/extra/tinydreno.h b/extra/tinydreno.h new file mode 100644 index 0000000000..61236026f0 --- /dev/null +++ b/extra/tinydreno.h @@ -0,0 +1,113 @@ +#ifndef _TINYDRENO_H +#define _TINYDRENO_H + +#include +#include + +typedef void * cl_llvm_instance; + +cl_llvm_instance cl_compiler_create_llvm_instance(void); +void cl_compiler_destroy_llvm_instance(cl_llvm_instance inst); + +enum cl_handle_type { + CL_HANDLE_COMPILED = 1, + CL_HANDLE_LIBRARY, + CL_HANDLE_LINKED +}; + +// handle->data for CL_HANDLE_COMPILED and CL_HANDLE_LIBRARY +struct cl_compiled_data { + uint64_t chip_id; + uint32_t mode; + void *llvm_bitcode; + uint64_t llvm_bitcode_size; + char *build_log; + uint32_t build_log_len; + uint32_t error_code; +}; + +// handle->data for CL_HANDLE_LINKED +struct cl_executable_data { + int32_t num_kernels; + void *kernel_props; + uint32_t error_code; + char *build_log; + char _unk0[0x20]; + uint64_t chip_id; + uint32_t mode; +}; + +typedef struct { + enum cl_handle_type type; + union { + struct cl_compiled_data *compiled; + struct cl_executable_data *executable; + }; +} cl_handle; + + +#define CL_MODE_32BIT 0 +#define CL_MODE_64BIT 1 + +#define CL_SRC_STR 0 +#define CL_SRC_BLOB 1 + +cl_handle *cl_compiler_compile_source(cl_llvm_instance inst, uint64_t chip_id, int mode, const char *options, int p5, uint64_t p6, uint64_t p7, + const char *source, uint64_t source_len, uint64_t source_type, void *p11); + +cl_handle 
*cl_compiler_link_program(cl_llvm_instance inst, uint64_t chip_id, int mode, const char *options, int num_handles, + cl_handle **input_handles); + + +void cl_compiler_handle_create_binary(cl_handle *handle, void **out_ptr, size_t *out_size); + +// lib binary format (output of handle_create_binary for type 3) +// layout: cl_lib_header, then cl_lib_section[num_sections], then data + +#define CL_LIB_PROGRAM 0 +#define CL_LIB_CONSTS 6 +#define CL_LIB_IMAGE 7 +#define CL_LIB_CODE 10 +#define CL_LIB_IMAGE_DESC 11 + +typedef struct { + uint32_t id; + uint32_t offset; + uint32_t size; + uint32_t count; + uint32_t entry_size; +} cl_lib_section; + +typedef struct { + uint32_t _unk0[6]; + uint32_t num_sections; + uint32_t _unk1[5]; + cl_lib_section sections[]; +} cl_lib_header; + +// at sections[CL_LIB_PROGRAM].offset +typedef struct { + char name[8]; + uint32_t _unk0[3]; + uint32_t fregs; + uint32_t hregs; +} cl_lib_prog; + +// at sections[CL_LIB_IMAGE_DESC].offset +typedef struct { + char _unk0[0xc4]; + uint32_t prg_offset; + uint32_t pvtmem; + char _unk1[0x0c]; + uint32_t shmem; + uint32_t samp_cnt; + char _unk2[0x28]; + uint32_t brnchstck; + char _unk4[0x4c]; + char kernel_name[]; +} cl_lib_img_desc; + +void cl_compiler_free_handle(cl_handle *handle); +void cl_compiler_free_assembly(void *ptr); + +#endif diff --git a/tinygrad/renderer/nir.py b/tinygrad/renderer/nir.py index 89a611afad..45a4d43c18 100644 --- a/tinygrad/renderer/nir.py +++ b/tinygrad/renderer/nir.py @@ -2,7 +2,7 @@ from typing import Callable, cast, Any from tinygrad.dtype import AddrSpace, DType, PtrDType, ImageDType, dtypes, truncate from tinygrad.helpers import DEBUG, OSX, unwrap, fromimport from tinygrad.renderer import Renderer -from tinygrad.renderer.cstyle import CUDARenderer +from tinygrad.renderer.cstyle import CUDARenderer, OpenCLRenderer from tinygrad.uop.ops import GroupOp, Ops, UOp, PatternMatcher, UPat, range_str from tinygrad.runtime.autogen import mesa from tinygrad.runtime.support.c import 
POINTER @@ -264,7 +264,7 @@ _nload_img = nir_instr(intrins=lambda dtype:{'IMAGE_DIM':mesa.GLSL_SAMPLER_DIM_2 nc=4, bs=32, num_components=4, srcs=lambda b,img,coord:[nsrc(x) for x in [img, tovec(b, coord), nundef(b, dtypes.int), nimm(b, 0, dtypes.int)]])( lambda b,img,coord,dtype: mesa.nir_intrinsic_instr_create(b.shader, g("nir_intrinsic_image_load"))) -class IR3Renderer(NIRRenderer): +class IR3Renderer(NIRRenderer, OpenCLRenderer): device = "QCOM" has_aux = True @@ -299,5 +299,3 @@ class IR3Renderer(NIRRenderer): self.b.shader.contents.info.num_ubos = len([u for u in bufs if not isinstance(u.dtype, ImageDType)]) self.b.shader.contents.info.num_images = texs() + imgs() - - def aux(self, uops:list[UOp]): return (tuple(u.dtype for u in uops if u.op == Ops.PARAM),) diff --git a/tinygrad/runtime/autogen/__init__.py b/tinygrad/runtime/autogen/__init__.py index 67b961c00d..e693209258 100644 --- a/tinygrad/runtime/autogen/__init__.py +++ b/tinygrad/runtime/autogen/__init__.py @@ -160,4 +160,5 @@ def __getattr__(nm): case "corefoundation": return load("corefoundation", "'CoreFoundation'", [f"{macossdk}/System/Library/Frameworks/CoreFoundation.framework/Headers/CF{s}.h" for s in ["String", "Data"]], args=["-isysroot", macossdk]) + case "llvm_qcom": return load("llvm_qcom", "'llvm-qcom'", [root/"extra/tinydreno.h"]) case _: raise AttributeError(f"no such autogen: {nm}") diff --git a/tinygrad/runtime/autogen/llvm_qcom.py b/tinygrad/runtime/autogen/llvm_qcom.py new file mode 100644 index 0000000000..5c82521cfb --- /dev/null +++ b/tinygrad/runtime/autogen/llvm_qcom.py @@ -0,0 +1,102 @@ +# mypy: disable-error-code="empty-body" +from __future__ import annotations +import ctypes +from typing import Annotated, Literal, TypeAlias +from tinygrad.runtime.support.c import _IO, _IOW, _IOR, _IOWR +from tinygrad.runtime.support import c +dll = c.DLL('llvm_qcom', 'llvm-qcom') +cl_llvm_instance: TypeAlias = ctypes.c_void_p +@dll.bind +def cl_compiler_create_llvm_instance() -> 
cl_llvm_instance: ... +@dll.bind +def cl_compiler_destroy_llvm_instance(inst:cl_llvm_instance) -> None: ... +class enum_cl_handle_type(Annotated[int, ctypes.c_uint32], c.Enum): pass +CL_HANDLE_COMPILED = enum_cl_handle_type.define('CL_HANDLE_COMPILED', 1) +CL_HANDLE_LIBRARY = enum_cl_handle_type.define('CL_HANDLE_LIBRARY', 2) +CL_HANDLE_LINKED = enum_cl_handle_type.define('CL_HANDLE_LINKED', 3) + +@c.record +class struct_cl_compiled_data(c.Struct): + SIZE = 48 + chip_id: Annotated[uint64_t, 0] + mode: Annotated[uint32_t, 8] + llvm_bitcode: Annotated[ctypes.c_void_p, 16] + llvm_bitcode_size: Annotated[uint64_t, 24] + build_log: Annotated[c.POINTER[Annotated[bytes, ctypes.c_char]], 32] + build_log_len: Annotated[uint32_t, 40] + error_code: Annotated[uint32_t, 44] +uint64_t: TypeAlias = Annotated[int, ctypes.c_uint64] +uint32_t: TypeAlias = Annotated[int, ctypes.c_uint32] +@c.record +class struct_cl_executable_data(c.Struct): + SIZE = 80 + num_kernels: Annotated[int32_t, 0] + kernel_props: Annotated[ctypes.c_void_p, 8] + error_code: Annotated[uint32_t, 16] + build_log: Annotated[c.POINTER[Annotated[bytes, ctypes.c_char]], 24] + _unk0: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[32]], 32] + chip_id: Annotated[uint64_t, 64] + mode: Annotated[uint32_t, 72] +int32_t: TypeAlias = Annotated[int, ctypes.c_int32] +@c.record +class cl_handle(c.Struct): + SIZE = 16 + type: Annotated[enum_cl_handle_type, 0] + compiled: Annotated[c.POINTER[struct_cl_compiled_data], 8] + executable: Annotated[c.POINTER[struct_cl_executable_data], 8] +@dll.bind +def cl_compiler_compile_source(inst:cl_llvm_instance, chip_id:uint64_t, mode:Annotated[int, ctypes.c_int32], options:c.POINTER[Annotated[bytes, ctypes.c_char]], p5:Annotated[int, ctypes.c_int32], p6:uint64_t, p7:uint64_t, source:c.POINTER[Annotated[bytes, ctypes.c_char]], source_len:uint64_t, source_type:uint64_t, p11:ctypes.c_void_p) -> c.POINTER[cl_handle]: ... 
+@dll.bind +def cl_compiler_link_program(inst:cl_llvm_instance, chip_id:uint64_t, mode:Annotated[int, ctypes.c_int32], options:c.POINTER[Annotated[bytes, ctypes.c_char]], num_handles:Annotated[int, ctypes.c_int32], input_handles:c.POINTER[c.POINTER[cl_handle]]) -> c.POINTER[cl_handle]: ... +size_t: TypeAlias = Annotated[int, ctypes.c_uint64] +@dll.bind +def cl_compiler_handle_create_binary(handle:c.POINTER[cl_handle], out_ptr:c.POINTER[ctypes.c_void_p], out_size:c.POINTER[size_t]) -> None: ... +@c.record +class cl_lib_section(c.Struct): + SIZE = 20 + id: Annotated[uint32_t, 0] + offset: Annotated[uint32_t, 4] + size: Annotated[uint32_t, 8] + count: Annotated[uint32_t, 12] + entry_size: Annotated[uint32_t, 16] +@c.record +class cl_lib_header(c.Struct): + SIZE = 48 + _unk0: Annotated[c.Array[uint32_t, Literal[6]], 0] + num_sections: Annotated[uint32_t, 24] + _unk1: Annotated[c.Array[uint32_t, Literal[5]], 28] + sections: Annotated[c.Array[cl_lib_section, Literal[0]], 48] +@c.record +class cl_lib_prog(c.Struct): + SIZE = 28 + name: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[8]], 0] + _unk0: Annotated[c.Array[uint32_t, Literal[3]], 8] + fregs: Annotated[uint32_t, 20] + hregs: Annotated[uint32_t, 24] +@c.record +class cl_lib_img_desc(c.Struct): + SIZE = 344 + _unk0: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[196]], 0] + prg_offset: Annotated[uint32_t, 196] + pvtmem: Annotated[uint32_t, 200] + _unk1: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[12]], 204] + shmem: Annotated[uint32_t, 216] + samp_cnt: Annotated[uint32_t, 220] + _unk2: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[40]], 224] + brnchstck: Annotated[uint32_t, 264] + _unk4: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[76]], 268] + kernel_name: Annotated[c.Array[Annotated[bytes, ctypes.c_char], Literal[0]], 344] +@dll.bind +def cl_compiler_free_handle(handle:c.POINTER[cl_handle]) -> None: ... 
+@dll.bind +def cl_compiler_free_assembly(ptr:ctypes.c_void_p) -> None: ... +c.init_records() +CL_MODE_32BIT = 0 # type: ignore +CL_MODE_64BIT = 1 # type: ignore +CL_SRC_STR = 0 # type: ignore +CL_SRC_BLOB = 1 # type: ignore +CL_LIB_PROGRAM = 0 # type: ignore +CL_LIB_CONSTS = 6 # type: ignore +CL_LIB_IMAGE = 7 # type: ignore +CL_LIB_CODE = 10 # type: ignore +CL_LIB_IMAGE_DESC = 11 # type: ignore \ No newline at end of file diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index a3420dbd7a..c97d63a9d9 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -1,7 +1,7 @@ from __future__ import annotations import os, ctypes, functools, mmap, struct, array, math, sys, weakref, contextlib assert sys.platform != 'win32' -from typing import Any +from typing import Any, cast from tinygrad.device import BufferSpec, CompilerSet, Device from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface @@ -203,7 +203,8 @@ class QCOMArgsState(HCQArgsState): ubos = [b for i,b in enumerate(bufs) for _,dt in prg.buf_dtypes[i] if not isinstance(dt, ImageDType)] uavs = [(dt,b) for i,b in enumerate(bufs) for _,dt in prg.buf_dtypes[i] if isinstance(dt, ImageDType)] ibos, texs = uavs[:prg.ibo_cnt], uavs[prg.ibo_cnt:] - for cnst_val,cnst_off,cnst_sz in prg.consts_info: to_mv(self.buf.va_addr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little') + for cnst_val,cnst_off,cnst_sz in prg.consts_info: + to_mv(cast(int, self.buf.va_addr) + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little') if prg.samp_cnt > 0: to_mv(int(self.buf.va_addr) + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers) if prg.NIR: @@ -231,12 +232,13 @@ class QCOMProgram(HCQProgram): if self.NIR: from tinygrad.runtime.support.compiler_mesa import IR3Compiler - v, 
cs, self.imm_vals, self.image = IR3Compiler.unpack_lib(lib) + v, cs, imm_vals, self.image = IR3Compiler.unpack_lib(lib) self.prg_offset, self.brnchstck, self.image_size, self.pvtmem, self.shmem = 0, v.branchstack, v.info.size, v.pvtmem_size, v.shared_size self.wgsz = alloc.offset_vec4 * 4 + 8 if (alloc:=cs.allocs.consts[mesa.IR3_CONST_ALLOC_DRIVER_PARAMS]).size_vec4 else 0xfc self.wgid, self.lid = v.cs.work_group_id, v.cs.local_invocation_id # register ids - self.buf_off, self.imm_off = cs.ubo_state.range[0].offset, cs.allocs.max_const_offset_vec4 * 16 + self.buf_off, imm_off = cs.ubo_state.range[0].offset, cs.allocs.max_const_offset_vec4 * 16 + self.consts_info = [(struct.unpack_from(" int: return struct.unpack("I", lib[off:off+4])[0] class QCOMCompiler(Compiler): def __init__(self, chip_id): - self.chip_id, self.llvm_inst = chip_id, create_llvm_instance() + self.chip_id, self.llvm_inst = chip_id, llvm_qcom.cl_compiler_create_llvm_instance() super().__init__(f"compile_qcomcl_{chip_id}") - def __del__(self): destroy_llvm_instance(self.llvm_inst) + def __del__(self): llvm_qcom.cl_compiler_destroy_llvm_instance(self.llvm_inst) def __reduce__(self): return QCOMCompiler, (self.chip_id,) def checked(self, handle): - if handle is None or get_error_code(handle) != 0: - destroy_llvm_instance(self.llvm_inst) - self.llvm_inst = create_llvm_instance() - raise RuntimeError("QCOM Compilation Error" + ("" if handle is None else f": {get_build_log(handle)}")) + if not handle or (data:=(hc.executable if (hc:=handle.contents).type == llvm_qcom.CL_HANDLE_LINKED else hc.compiled).contents).error_code != 0: + llvm_qcom.cl_compiler_destroy_llvm_instance(self.llvm_inst) + self.llvm_inst = llvm_qcom.cl_compiler_create_llvm_instance() + raise RuntimeError("QCOM Compilation Error" + ("" if not handle else f": {ctypes.string_at(data.build_log).decode()}")) return handle def compile(self, src) -> bytes: - ch = self.checked(compile_source(self.llvm_inst, self.chip_id, MODE_64BIT, b"", 0, 0, 
0, src.encode(), 0, SRC_STR, None)) - if DEBUG >= 8: - handle_create_binary(ch, ctypes.byref(ptr:=ctypes.c_void_p()), ctypes.byref(sz:=ctypes.c_size_t())) - print(system("llvm-dis", input=ctypes.string_at(ptr, sz.value)[16:])) - free_assembly(ptr) - lh = self.checked(link_program(self.llvm_inst, self.chip_id, MODE_64BIT, None, 1, ctypes.pointer(ctypes.c_void_p(ch)))) - handle_create_binary(lh, ctypes.byref(ptr:=ctypes.c_void_p()), ctypes.byref(sz:=ctypes.c_size_t())) - for h in [ch, lh]: free_handle(h) + ch = self.checked(llvm_qcom.cl_compiler_compile_source(self.llvm_inst, self.chip_id, llvm_qcom.CL_MODE_64BIT, b"", 0, 0, 0, src.encode(), 0, + llvm_qcom.CL_SRC_STR, None)) + if DEBUG >= 8: print(system("llvm-dis", input=ctypes.string_at((comp:=ch.contents.compiled.contents).llvm_bitcode, comp.llvm_bitcode_size))) + lh = self.checked(llvm_qcom.cl_compiler_link_program(self.llvm_inst, self.chip_id, llvm_qcom.CL_MODE_64BIT, None, 1, ch)) + llvm_qcom.cl_compiler_handle_create_binary(lh, ctypes.byref(ptr:=ctypes.c_void_p()), ctypes.byref(sz:=ctypes.c_size_t())) + for h in [ch, lh]: llvm_qcom.cl_compiler_free_handle(h) ret = ctypes.string_at(ptr, sz.value) - free_assembly(ptr) + llvm_qcom.cl_compiler_free_assembly(ptr) return ret def disassemble(self, lib: bytes): disas_adreno(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)], self.chip_id)