From 7810be8d3c9d0d399dd2babcaad13fbce2d975bf Mon Sep 17 00:00:00 2001
From: Christopher Milan <chrismilan@ucla.edu>
Date: Fri, 6 Mar 2026 03:24:27 -0800
Subject: [PATCH] compile QCOM without opening device (#15165)

Co-authored-by: Comma Device <device@comma.ai>
---
 .github/actions/setup-tinygrad/action.yml | 10 ++++
 .github/workflows/test.yml                | 23 +++++++++
 test/backend/test_ops.py                  | 17 +++++--
 tinygrad/device.py                        |  4 +-
 tinygrad/helpers.py                       |  3 +-
 tinygrad/renderer/cstyle.py               |  7 ++-
 tinygrad/runtime/ops_null.py              |  7 +--
 tinygrad/runtime/ops_qcom.py              | 15 +++---
 tinygrad/runtime/support/compiler_qcom.py | 57 +++++++++++++++++++++++
 9 files changed, 122 insertions(+), 21 deletions(-)
 create mode 100644 tinygrad/runtime/support/compiler_qcom.py

diff --git a/.github/actions/setup-tinygrad/action.yml b/.github/actions/setup-tinygrad/action.yml
index b75bdc0f6c..dff7f3ebe2 100644
--- a/.github/actions/setup-tinygrad/action.yml
+++ b/.github/actions/setup-tinygrad/action.yml
@@ -45,6 +45,10 @@ inputs:
     description: "Install mesa"
     required: false
     default: 'false'
+  tinydreno:  # only acted on for Linux runners, see the "Install tinydreno (linux)" step
+    description: "Install tinydreno"
+    required: false
+    default: 'false'
 runs:
   using: "composite"
   steps:
@@ -326,3 +330,9 @@ runs:
       if: inputs.mesa == 'true' && runner.os == 'macOS'
       shell: bash
       run: brew install sirhcm/tinymesa/tinymesa_cpu
+
+    # *** tinydreno *** (libllvm-qcom.so, opened as "llvm-qcom" by tinygrad/runtime/support/compiler_qcom.py)
+    - name: Install tinydreno (linux)
+      if: inputs.tinydreno == 'true' && runner.os == 'Linux'
+      shell: bash  # NOTE(review): the URL below tracks refs/heads/master, so the installed binary is not pinned to a revision
+      run: sudo curl -fL https://github.com/sirhcm/tinydreno/raw/refs/heads/master/libllvm-qcom.so -o /usr/lib/libllvm-qcom.so
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index dc2023ca43..afd546b2d0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1011,3 +1011,26 @@ jobs:
           python -c "from tinygrad import Device; assert Device.DEFAULT == 'NULL'"
           DEBUG=4 python3 test/backend/test_ops.py TestOps.test_add
           python -m pytest -n=auto test/backend/test_ops.py --durations=20
+  qcomclcompiletests:  # runs test_ops on the NULL device with the QCOM CL compiler selected via NULL_QCOMCL=1
+    name: Compile-only (QCOM CL)
+    runs-on: ubuntu-24.04-arm  # presumably because the downloaded libllvm-qcom.so is an aarch64 build - TODO confirm
+    timeout-minutes: 15
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+      - name: Setup Environment
+        uses: ./.github/actions/setup-tinygrad
+        with:
+          key: compile-qcomcl
+          deps: testing_unit
+          tinydreno: 'true'  # downloads libllvm-qcom.so to /usr/lib
+          python-version: '3.12'
+      - name: Set env
+        shell: bash
+        run: printf "NULL=1\nNULL_ALLOW_COPYOUT=1\nNULL_QCOMCL=1" >> $GITHUB_ENV
+      - name: Run test_ops
+        shell: bash
+        run: |
+          python -c "from tinygrad import Device; assert Device.DEFAULT == 'NULL'"
+          DEBUG=4 python3 test/backend/test_ops.py TestOps.test_add
+          python -m pytest -n=auto test/backend/test_ops.py --durations=20
diff --git a/test/backend/test_ops.py b/test/backend/test_ops.py
index e4e910b246..a507a0830f 100644
--- a/test/backend/test_ops.py
+++ b/test/backend/test_ops.py
@@ -6,6 +6,7 @@ from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, CPU_LLVM, AMD_LL
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.device import is_dtype_supported
+from tinygrad.renderer.cstyle import QCOMCLRenderer
 from tinygrad.renderer.nir import NIRRenderer
 
 TINY_BACKEND = getenv("TINY_BACKEND")
@@ -436,7 +437,7 @@ class TestOps(unittest.TestCase):
     helper_test_op([(45,35), (45,35), (45,35)], lambda x,y,z: x.lerp(y,z))
     helper_test_op(None, lambda x,y,z: x.lerp(y,z), vals=[[1.,2.,3.], [4.,5.,6.], 0.5])
 
-  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
   def test_tril(self):
     helper_test_op([(3,3)], lambda x: x.tril())
     helper_test_op([(3,3)], lambda x: x.tril(1))
@@ -454,7 +455,7 @@ class TestOps(unittest.TestCase):
     helper_test_op([(5,3,3)], lambda x: x.tril(1))
     helper_test_op(None, lambda x: x.tril(), vals=[[[True] * 3] * 3], forward_only=True)
 
-  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
   def test_triu(self):
     helper_test_op([(3,3)], lambda x: x.triu())
     helper_test_op([(3,3)], lambda x: x.triu(1))
@@ -765,6 +766,7 @@ class TestOps(unittest.TestCase):
 
     self.helper_test_exception([(4), (4)], lambda x,y: x.bitwise_xor(y), expected=RuntimeError)
 
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
   def test_and(self):
     data = [[1,-8,1],[32,1,6]]
     tor = torch.tensor(data, dtype=torch.int)
@@ -782,6 +784,7 @@ class TestOps(unittest.TestCase):
 
     self.helper_test_exception([(4), (4)], lambda x,y: x.bitwise_and(y), expected=RuntimeError)
 
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
   def test_or(self):
     data = [[1,-8,1],[32,1,6]]
     tor = torch.tensor(data, dtype=torch.int)
@@ -1170,6 +1173,7 @@ class TestOps(unittest.TestCase):
     helper_test_op(None, lambda x: x.type(torch.int32).argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True, vals=[[False, True]])
     helper_test_op(None, lambda x: x.type(torch.int32).argmax().type(torch.int32), lambda x: x.argmax(), forward_only=True, vals=[[True, False]])
 
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
   def test_argmin(self):
     # check if it returns the first index for multiple occurrences
     helper_test_op(None, lambda x: x.argmin().type(torch.int32), lambda x: x.argmin(), forward_only=True, vals=[[2, 2]])
@@ -1475,6 +1479,7 @@ class TestOps(unittest.TestCase):
   def test_prod_dtype_arg(self):
     with self.assertRaises(AttributeError): Tensor([1.0, 2.0]).prod(dtype="")
 
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
   def test_min(self):
     helper_test_op([(3,3)], lambda x: x.min())
     helper_test_op([(45,3)], lambda x: x.min())
@@ -1503,7 +1508,6 @@ class TestOps(unittest.TestCase):
     helper_test_op([(3,3)], lambda x: torch.full_like(x, 2).prod(), lambda x: (x.full_like(2)).prod(), forward_only=True)
     helper_test_op([(3,3)], lambda x: torch.full_like(x, 2).max(), lambda x: (x.full_like(2)).max(), forward_only=True)
 
-  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
   def test_any(self):
     helper_test_op([(3,4,5,6)], lambda x: x.any(), forward_only=True)
     helper_test_op(None, lambda x: x.any(), vals=[[True, True]], forward_only=True)
@@ -1515,7 +1519,7 @@ class TestOps(unittest.TestCase):
   def test_any_zero_axis(self):
     helper_test_op([(1,0,3,0,5)], lambda x: x.any(axis=(1,3)), forward_only=True)
 
-  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
   def test_all(self):
     helper_test_op([(3,4,5,6)], lambda x: x.all(), forward_only=True)
     helper_test_op(None, lambda x: x.all(), vals=[[True, True]], forward_only=True)
@@ -2889,6 +2893,7 @@ class TestOps(unittest.TestCase):
     helper_test_op([(2,5,6,5,3,4)], lambda x: x[...,c,:,e], lambda x: x[...,k,:,p])
 
   @slow_test
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
   def test_slice_fancy_indexing_dim_collapse_int(self):
     a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     # dim collapse from int
@@ -2899,6 +2904,7 @@ class TestOps(unittest.TestCase):
     helper_test_op([(2,5,6,5,3,4)], lambda x: x[1,:,3:11:2,d,0:2], lambda x: x[1,:,3:11:2,o,0:2])
 
   @slow_test
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
   def test_slice_fancy_indexing_dim_inject_none(self):
     a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     # dim injection from None
@@ -2933,6 +2939,7 @@ class TestOps(unittest.TestCase):
                             lambda x: x[Tensor([[0,1,-1],[-1,-2,0]]), Tensor([2,1,-1])])
 
   @slow_test
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
   def test_slice_fancy_indexing_list_indices(self):
     a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     helper_test_op([(2,5,6,5,3,4)], lambda x: x[((0,),)])
@@ -2944,6 +2951,7 @@ class TestOps(unittest.TestCase):
     helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,(2,1,0),c,(-2,1,0),e], lambda x: x[i,(2,1,0),k,(-2,1,0),p])
 
   @slow_test
+  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, QCOMCLRenderer), "QCOM CL vectorized bool bug")
   def test_slice_fancy_indexing_tuple_indices(self):
     a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     helper_test_op([(2,5,6,5,3,4)], lambda x: x[(((0,),),)], lambda x: x[(((0,),),)])
@@ -3285,7 +3293,6 @@ class TestOps(unittest.TestCase):
     helper_test_op([(20,)], lambda x: (x>0.5).nonzero().int(), lambda x: (x>0.5).nonzero(), forward_only=True)
     helper_test_op([(10, 5, 3)], lambda x: (x>0.5).nonzero().int(), lambda x: (x>0.5).nonzero(), forward_only=True)
 
-  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
   def test_cast(self):
     helper_test_op([(3, 3)], lambda x: x.float())
     helper_test_op(None, lambda x: x.float(), vals=[[0, 1, 2, 3]], forward_only=True)
diff --git a/tinygrad/device.py b/tinygrad/device.py
index aaf83131a2..3e1d5f4ac3 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -6,7 +6,7 @@ import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re
 from tinygrad.helpers import CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
 from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, dedup, ContextVar
 from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
-from tinygrad.helpers import EMULATED_DTYPES, TracingKey
+from tinygrad.helpers import EMULATED_DTYPES, NULL_IR3, NULL_QCOMCL, TracingKey
 from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes, _to_np_dtype
 if TYPE_CHECKING: from tinygrad.renderer import Renderer
 
@@ -371,7 +371,7 @@ def is_dtype_supported(dtype:DType, device:str|None=None) -> bool:
     if device in ["CUDA", "NV"]: return not CI
     if device == "CPU" and CPU_LLVM: return OSX
     if device == "PYTHON": return sys.version_info >= (3, 12)
-  if dtype == dtypes.float64: return (device not in {"METAL", "QCOM"} and not (OSX and device == "CL") and not getenv("NULL_IR3")
+  if dtype == dtypes.float64: return (device not in {"METAL", "QCOM"} and not (OSX and device == "CL") and not NULL_IR3 and not NULL_QCOMCL
                                       and dtypes.long not in EMULATED_DTYPES.tolist(dtypes))
   return True
 
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 4f38b13094..748844ab6d 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -195,7 +195,8 @@ CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasat
 CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0)
 NV_CC, NV_PTX, NV_NAK, NV_NVCC = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("NV_NVCC", 0)
 CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0)
-NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT = ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0), ContextVar("NULL_ALLOW_COPYOUT", 0)
+NULL_QCOMCL, NULL_IR3, NULL_NAK = ContextVar("NULL_QCOMCL", 0), ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0)
+NULL_ALLOW_COPYOUT = ContextVar("NULL_ALLOW_COPYOUT", 0)
 AMD_CC, AMD_LLVM, AMD_HIPCC  = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0)
 QCOM_CC, QCOM_IR3 = ContextVar("QCOM_CC", ""), ContextVar("QCOM_IR3", 0)
 # VIZ implies PROFILE, but you can run PROFILE without VIZ
diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index 624305aec1..9e2a40d950 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -566,4 +566,9 @@ class AMDHIPCCRenderer(AMDHIPRenderer):
     super().__init__(arch)
     self.compiler = HIPCCCompiler(arch)
 
-class QCOMRenderer(OpenCLRenderer): device = "QCOM"
+class QCOMCLRenderer(OpenCLRenderer):
+  device = "QCOM"
+
+  def __init__(self, chip_id):
+    from tinygrad.runtime.support.compiler_qcom import QCOMCompiler  # imported here: compiler_qcom binds llvm-qcom symbols on import
+    self.compiler = QCOMCompiler(chip_id)
diff --git a/tinygrad/runtime/ops_null.py b/tinygrad/runtime/ops_null.py
index 529204562c..8436d75f23 100644
--- a/tinygrad/runtime/ops_null.py
+++ b/tinygrad/runtime/ops_null.py
@@ -1,9 +1,9 @@
 import functools
 from tinygrad.device import Compiled, Allocator, CompilerSet
 from tinygrad.engine.jit import MultiGraphRunner
-from tinygrad.renderer.cstyle import Renderer, CStyleLanguage, AMDHIPRenderer
+from tinygrad.renderer.cstyle import Renderer, CStyleLanguage, AMDHIPRenderer, QCOMCLRenderer
 from tinygrad.uop.ops import Ops
-from tinygrad.helpers import cpu_profile, EMULATE, NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT
+from tinygrad.helpers import cpu_profile, EMULATE, NULL_QCOMCL, NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT
 from tinygrad.renderer.nir import IR3Renderer, NAKRenderer
 
 class NullRenderer(CStyleLanguage):
@@ -39,6 +39,7 @@ class NullDevice(Compiled):
       case "AMD_CDNA4": renderer = functools.partial(AMDHIPRenderer, "gfx950")
       case "": renderer = NullRenderer
       case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}")
-    compilers = CompilerSet([(renderer, None), (functools.partial(IR3Renderer, 0x6030001), NULL_IR3), # adreno 630
+    compilers = CompilerSet([(renderer, None), (functools.partial(QCOMCLRenderer, 0x6030001), NULL_QCOMCL), # adreno 630
+                             (functools.partial(IR3Renderer, 0x6030001), NULL_IR3), # adreno 630
                              (functools.partial(NAKRenderer, "sm_120", 48), NULL_NAK)]) # 5090
     super().__init__(device, NullAllocator(self), compilers, functools.partial(NullProgram, device), NullGraph)
diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py
index 632d1d79fd..8a75a2099d 100644
--- a/tinygrad/runtime/ops_qcom.py
+++ b/tinygrad/runtime/ops_qcom.py
@@ -6,11 +6,10 @@ from tinygrad.device import BufferSpec, CompilerSet, Device
 from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
 from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
 from tinygrad.runtime.autogen import kgsl, mesa
-from tinygrad.runtime.ops_cl import CLDevice
-from tinygrad.renderer.cstyle import QCOMRenderer
+from tinygrad.renderer.cstyle import QCOMCLRenderer
 from tinygrad.renderer.nir import IR3Renderer
-from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, ceildiv, prod, fromimport, cpu_profile, lo32, suppress_finalizing
-from tinygrad.helpers import next_power2, flatten, QCOM_IR3, QCOM_CC, PROFILE, DEBUG
+from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, ceildiv, prod, cpu_profile, lo32, suppress_finalizing
+from tinygrad.helpers import next_power2, flatten, QCOM_IR3, QCOM_CC, PROFILE
 from tinygrad.dtype import ImageDType, dtypes
 from tinygrad.runtime.support.system import System
 if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl  # noqa: F401  # pylint: disable=unused-import
@@ -248,9 +247,7 @@ class QCOMProgram(HCQProgram):
       self.tex_off, self.ibo_off, self.samp_off = 2048, 2048 + 0x40 * self.tex_cnt, 2048 + 0x40 * (self.tex_cnt + self.ibo_cnt)
       self.fregs, self.hregs = v.info.max_reg + 1, v.info.max_half_reg + 1
       self.consts_info:list[tuple] = []
-    else:
-      self._parse_lib(lib:=self.dev.cl_dev.cl_compiler.compile_cached(lib.decode()))
-      if DEBUG >= 7: fromimport('tinygrad.runtime.support.compiler_mesa', 'disas_adreno')(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)])
+    else: self._parse_lib(lib)
 
     self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True))
     to_mv(self.lib_gpu.va_addr, self.image_size)[:] = self.image
@@ -384,8 +381,8 @@ class QCOMDevice(HCQCompiled):
     if PROFILE and self.gpu_id[:2] < (7, 3):
       System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", value="4000000000", msg="Failed to disable suspend mode", expected="4294967276")
 
-    self.cl_dev = CLDevice(device)
-    compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[(QCOMRenderer, None), (functools.partial(IR3Renderer, info.chip_id), QCOM_IR3)])
+    compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[(functools.partial(QCOMCLRenderer, info.chip_id), None),
+                                                    (functools.partial(IR3Renderer, info.chip_id), QCOM_IR3)])
     super().__init__(device, QCOMAllocator(self), compilers, functools.partial(QCOMProgram, self), QCOMSignal,
                      functools.partial(QCOMComputeQueue, self), None)
 
diff --git a/tinygrad/runtime/support/compiler_qcom.py b/tinygrad/runtime/support/compiler_qcom.py
new file mode 100644
index 0000000000..a2878315e6
--- /dev/null
+++ b/tinygrad/runtime/support/compiler_qcom.py
@@ -0,0 +1,57 @@
+import ctypes, struct
+from tinygrad.device import Compiler
+from tinygrad.runtime.support.c import DLL
+from tinygrad.runtime.support.compiler_mesa import disas_adreno
+
+# ctypes bindings for the cl_compiler_* functions of the llvm-qcom library, see https://github.com/sirhcm/tinydreno
+dll = DLL("llvm-qcom", ["llvm-qcom"])
+
+(create_llvm_instance:=dll.cl_compiler_create_llvm_instance).restype, create_llvm_instance.argtypes = ctypes.c_void_p, []
+
+(compile_source:=dll.cl_compiler_compile_source).restype = ctypes.c_void_p
+compile_source.argtypes = [ctypes.c_void_p, ctypes.c_uint64, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_uint64, ctypes.c_uint64,
+                           ctypes.c_char_p, ctypes.c_uint64, ctypes.c_uint64, ctypes.c_void_p]
+
+(link_program:=dll.cl_compiler_link_program).restype = ctypes.c_void_p
+link_program.argtypes = [ctypes.c_void_p, ctypes.c_uint64, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_void_p]
+
+(get_error_code:=dll.cl_compiler_get_error_code).restype, get_error_code.argtypes = ctypes.c_int, [ctypes.c_void_p]
+(get_build_log:=dll.cl_compiler_get_build_log).restype, get_build_log.argtypes = ctypes.c_char_p, [ctypes.c_void_p]
+
+(handle_create_binary:=dll.cl_compiler_handle_create_binary).restype = None  # results come back through the two pointer arguments
+handle_create_binary.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_size_t)]
+
+(free_handle:=dll.cl_compiler_free_handle).restype, free_handle.argtypes = None, [ctypes.c_void_p]
+(free_assembly:=dll.cl_compiler_free_assembly).restype, free_assembly.argtypes = None, [ctypes.c_void_p]
+(destroy_llvm_instance:=dll.cl_compiler_destroy_llvm_instance).restype, destroy_llvm_instance.argtypes = None, [ctypes.c_void_p]
+
+MODE_32BIT, MODE_64BIT, SRC_STR, SRC_BLOB = 0, 1, 0, 1  # MODE_*: 3rd arg of compile_source/link_program; SRC_*: 10th arg of compile_source
+
+def _read_lib(lib, off) -> int: return struct.Struct("I").unpack(lib[off:off+4])[0]  # native-endian uint32 at byte offset off
+
+class QCOMCompiler(Compiler):  # OpenCL source -> Adreno binary, using only the cl_compiler_* functions of the llvm-qcom library
+  def __init__(self, chip_id):
+    self.chip_id, self.llvm_inst = chip_id, create_llvm_instance()  # each QCOMCompiler holds its own native compiler instance
+    super().__init__(f"compile_qcomcl_{chip_id}")
+
+  def __del__(self): destroy_llvm_instance(self.llvm_inst)
+
+  def __reduce__(self): return QCOMCompiler, (self.chip_id,)  # pickled as chip_id only; __init__ creates a fresh native instance
+
+  def checked(self, handle):  # returns handle when it is non-NULL and error-free, else resets the instance and raises RuntimeError
+    if handle is not None and get_error_code(handle) == 0: return handle
+    msg = "QCOM Compilation Error" + ("" if handle is None else f": {(get_build_log(handle) or b'').decode(errors='replace')}")
+    destroy_llvm_instance(self.llvm_inst)  # only after the build log has been read (and decoded from bytes) above
+    self.llvm_inst = create_llvm_instance()
+    raise RuntimeError(msg)
+
+  def compile(self, src) -> bytes:  # src is OpenCL source text; returns a Python copy of the binary built from the linked program
+    ch = self.checked(compile_source(self.llvm_inst, self.chip_id, MODE_64BIT, b"", 0, 0, 0, src.encode(), 0, SRC_STR, None))
+    lh = self.checked(link_program(self.llvm_inst, self.chip_id, MODE_64BIT, None, 1, ctypes.pointer(ctypes.c_void_p(ch))))
+    handle_create_binary(lh, ctypes.byref(ptr:=ctypes.c_void_p()), ctypes.byref(sz:=ctypes.c_size_t()))
+    for h in [ch, lh]: free_handle(h)  # NOTE(review): not reached when checked() raises, so ch is not freed here if linking fails
+    ret = ctypes.string_at(ptr, sz.value)  # copy out before the native buffer is released
+    free_assembly(ptr)
+    return ret
+  # the slice handed to disas_adreno starts at the offset stored at lib[0xc0] and has the length stored at lib[0x100]
+  def disassemble(self, lib: bytes): disas_adreno(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)], self.chip_id)