diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index fcddcd9f86..074c4469ea 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -27,6 +27,8 @@ jobs:
         python-version: 3.12
     - name: Install docs dependencies (no cache)
       run: pip install -e '.[docs]'
+    - name: Install capstone for CLANG disassembly
+      run: pip install capstone
     - name: Use as an external package
       run: |
         mkdir $HOME/test_external_dir
diff --git a/docs/abstractions2.py b/docs/abstractions2.py
index a7cf946e5a..067e0ac328 100644
--- a/docs/abstractions2.py
+++ b/docs/abstractions2.py
@@ -7,7 +7,7 @@
 
 print("******** first, the runtime ***********")
 
-from tinygrad.runtime.ops_clang import ClangProgram, ClangCompiler, MallocAllocator
+from tinygrad.runtime.ops_clang import ClangJITCompiler, MallocAllocator, CPUProgram
 
 # allocate some buffers
 out = MallocAllocator.alloc(4)
@@ -19,10 +19,10 @@ MallocAllocator._copyin(a, memoryview(bytearray([2,0,0,0])))
 MallocAllocator._copyin(b, memoryview(bytearray([3,0,0,0])))
 
 # compile a program to a binary
-lib = ClangCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }")
+lib = ClangJITCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }")
 
-# create a runtime for the program (ctypes.CDLL)
-fxn = ClangProgram("add", lib)
+# create a runtime for the program
+fxn = CPUProgram("add", lib)
 
 # run the program
 fxn(out, a, b)
@@ -65,7 +65,7 @@ kernel = get_kernel(Device[DEVICE].renderer, s).linearize()
 # compile a program (and print the source)
 fxn = CompiledRunner(kernel.to_program())
 print(fxn.p.src)
-# NOTE: fxn.clprg is the ClangProgram
+# NOTE: fxn.clprg is the CPUProgram
 
 # run the program
 fxn.exec([out, a, b])
diff --git a/docs/developer/runtime.md b/docs/developer/runtime.md
index 2367d47b1e..0c8a9d5ed9 100644
--- a/docs/developer/runtime.md
+++ b/docs/developer/runtime.md
@@ -36,9 +36,9 @@ The `Allocator` class is responsible for managing memory on the device. There is
 
 ### Program
 
-The `Program` class is created for each loaded program. It is responsible for compiling and executing the program on the device. As an example, here is a `ClangProgram` implementation which loads program and runs it.
+The `Program` class is created for each loaded program. It is responsible for executing the program on the device. As an example, here is a `CPUProgram` implementation which loads a program and runs it.
 
-::: tinygrad.runtime.ops_clang.ClangProgram
+::: tinygrad.runtime.ops_clang.CPUProgram
     options:
         members: true
 
diff --git a/setup.py b/setup.py
index d4048c756b..0e779b125a 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,8 @@ setup(name='tinygrad',
             "hypothesis",
             "nibabel",
             "bottle",
-            "ggml-python"
+            "ggml-python",
+            "capstone"
         ],
         'webgpu': ["wgpu"],
         'docs': [
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 58dc61756a..0163c2b51a 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -45,6 +45,8 @@ def ceildiv(num, amt): return int(ret) if isinstance((ret:=-(num//-amt)), float)
 def round_up(num:int, amt:int) -> int: return (num+amt-1)//amt * amt
 def data64(data:Any) -> tuple[Any, Any]: return (data >> 32, data & 0xFFFFFFFF) # Any is sint
 def data64_le(data:Any) -> tuple[Any, Any]: return (data & 0xFFFFFFFF, data >> 32) # Any is sint
+def getbits(value: int, start: int, end: int): return (value >> start) & ((1 << end-start+1) - 1)
+def i2u(bits: int, value: int): return value if value >= 0 else (1<<bits)+value
 def merge_dicts(ds:Iterable[dict[T,U]]) -> dict[T,U]:
   kvs = set([(k,v) for d in ds for k,v in d.items()])
   assert len(kvs) == len(set(kv[0] for kv in kvs)), f"cannot merge, {kvs} contains different values for the same key"
@@ -265,7 +267,7 @@ def cpu_objdump(lib, objdump_tool='objdump'):
 def from_mv(mv:memoryview, to_type=ctypes.c_char):
   return ctypes.cast(ctypes.addressof(to_type.from_buffer(mv)), ctypes.POINTER(to_type * len(mv))).contents
 def to_mv(ptr:int, sz:int) -> memoryview: return memoryview(ctypes.cast(ptr, ctypes.POINTER(ctypes.c_uint8 * sz)).contents).cast("B")
-def mv_address(mv:memoryview): return ctypes.addressof(ctypes.c_char.from_buffer(mv))
+def mv_address(mv): return ctypes.addressof(ctypes.c_char.from_buffer(mv))
 def to_char_p_p(options: list[bytes], to_type=ctypes.c_char):
   return (ctypes.POINTER(to_type) * len(options))(*[ctypes.cast(ctypes.create_string_buffer(o), ctypes.POINTER(to_type)) for o in options])
 @functools.lru_cache(maxsize=None)
diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py
index aef1e195ab..90fa45cf4a 100644
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -1,32 +1,66 @@
-import ctypes, subprocess, pathlib, tempfile
+import ctypes, ctypes.util, struct, platform, subprocess
+from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
+from tinygrad.helpers import OSX, mv_address, cpu_time_execution
 from tinygrad.device import Compiled, Compiler, MallocAllocator
-from tinygrad.helpers import cpu_time_execution, cpu_objdump
+from tinygrad.runtime.support.elf import elf_loader, relocate
 from tinygrad.renderer.cstyle import ClangRenderer
 
-class ClangCompiler(Compiler):
-  def __init__(self, cachekey="compile_clang", args:list[str]|None=None, objdump_tool='objdump'):
-    self.args = ['-march=native'] if args is None else args
-    self.objdump_tool = objdump_tool
-    super().__init__(cachekey)
+# NOTE: MAP_JIT was only added to the mmap module in Python 3.13, so it is defined here for older versions
+MAP_JIT = 0x0800
+
+class ClangJITCompiler(Compiler):
+  def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey)
 
   def compile(self, src:str) -> bytes:
-    # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
-    with tempfile.NamedTemporaryFile(delete=True) as output_file:
-      subprocess.check_output(['clang', '-shared', *self.args, '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
-                               '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
-      return pathlib.Path(output_file.name).read_bytes()
+    # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
+    # x18 is a reserved platform register: macOS clobbers it on context switch and Windows on ARM keeps the TEB pointer in it, so never use it
+    args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
+    arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else []
+    obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
+    image, _, relocs = elf_loader(obj)
+    # This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved.
+    for ploc,tgt,r_type,r_addend in relocs:
+      image[ploc:ploc+4] = struct.pack("<I", relocate(struct.unpack("<I", image[ploc:ploc+4])[0], ploc, tgt+r_addend, r_type))
+    return bytes(image)
 
-  def disassemble(self, lib:bytes): return cpu_objdump(lib, self.objdump_tool)
+  def disassemble(self, lib):
+    import capstone
+    if platform.machine() == 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
+    elif platform.machine() in {'aarch64', 'arm64'}: cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
+    else: raise NotImplementedError(f"Capstone disassembly isn't supported for {platform.machine()}")
+    for instr in cs.disasm(lib, 0):
+      print(f"{instr.address:#08x}: {instr.mnemonic}\t{instr.op_str}")
+
+# CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
+class CPUProgram:
+  helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'gcc_s'))
 
-class ClangProgram:
   def __init__(self, name:str, lib:bytes):
-    self.name, self.lib = name, lib
-    # write to disk so we can load it
-    with tempfile.NamedTemporaryFile(delete=True) as cached_file_path:
-      pathlib.Path(cached_file_path.name).write_bytes(lib)
-      self.fxn = ctypes.CDLL(str(cached_file_path.name))[name]
+    # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
+    # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
+    self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
 
-  def __call__(self, *bufs, vals=(), wait=False): return cpu_time_execution(lambda: self.fxn(*bufs, *vals), enable=wait)
+    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
+    self.mem.write(lib)
+    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
+
+    # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
+    # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
+    # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
+    # ["name"] not .name: inside a class body .__clear_cache is name-mangled, see https://docs.python.org/3.12/reference/expressions.html#index-5
+    CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
+
+    self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
+
+  def __call__(self, *bufs, vals=(), wait=False):
+    args = list(bufs) + list(vals)
+    # NOTE: replace this by --target={host's triple}-elf in clang args once we only support macos sequoia and later.
+    # apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64
+    # https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
+    # this hack is required because a clang/llvm bug doesn't allow us to just use {host's triple}+'-elf' (relocation failures)
+    # the bug was fixed in https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a but was only backported to xcode 16+
+    if platform.machine() == "arm64" and OSX: args = args[:8] + [ctypes.c_int64(a) if isinstance(a, int) else a for a in args[8:]]
+    return cpu_time_execution(lambda: self.fxn(*args), enable=wait)
 
 class ClangDevice(Compiled):
-  def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangCompiler(), ClangProgram)
+  def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram)
diff --git a/tinygrad/runtime/support/elf.py b/tinygrad/runtime/support/elf.py
index 54f1cd844c..172fa9c596 100644
--- a/tinygrad/runtime/support/elf.py
+++ b/tinygrad/runtime/support/elf.py
@@ -1,5 +1,6 @@
-from dataclasses import dataclass
 import tinygrad.runtime.autogen.libc as libc
+from dataclasses import dataclass
+from tinygrad.helpers import getbits, i2u
 
 @dataclass(frozen=True)
 class ElfSection: name:str; header:libc.Elf64_Shdr; content:bytes # noqa: E702
@@ -34,3 +35,18 @@ def elf_loader(blob:bytes, force_section_align:int=1) -> tuple[memoryview, list[
     relocs += [(target_image_off + roff, sections[sym.st_shndx].header.sh_addr + sym.st_value, rtype, raddend) for roff, sym, rtype, raddend in rels]
 
   return memoryview(image), sections, relocs
+
+def relocate(instr: int, ploc: int, tgt: int, r_type: int):
+  # https://refspecs.linuxfoundation.org/elf/x86_64-abi-0.95.pdf
+  if r_type == libc.R_X86_64_PC32: return i2u(32, tgt-ploc)
+  # https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst for definitions of relocations
+  # https://www.scs.stanford.edu/~zyedidia/arm64/index.html for instruction encodings
+  if r_type == libc.R_AARCH64_ADR_PREL_PG_HI21:
+    rel_pg = i2u(33, (tgt & ~0xFFF) - (ploc & ~0xFFF))
+    return instr | (getbits(rel_pg, 12, 13) << 29) | (getbits(rel_pg, 14, 32) << 5)
+  if r_type == libc.R_AARCH64_ADD_ABS_LO12_NC: return instr | (getbits(tgt, 0, 11) << 10)
+  if r_type == libc.R_AARCH64_LDST16_ABS_LO12_NC: return instr | (getbits(tgt, 1, 11) << 10)
+  if r_type == libc.R_AARCH64_LDST32_ABS_LO12_NC: return instr | (getbits(tgt, 2, 11) << 10)
+  if r_type == libc.R_AARCH64_LDST64_ABS_LO12_NC: return instr | (getbits(tgt, 3, 11) << 10)
+  if r_type == libc.R_AARCH64_LDST128_ABS_LO12_NC: return instr | (getbits(tgt, 4, 11) << 10)
+  raise NotImplementedError(f"Encountered unknown relocation type {r_type:#x}")