diff --git a/docs/abstractions.py b/docs/abstractions.py
index 44b0ae3613..4b97372ca6 100644
--- a/docs/abstractions.py
+++ b/docs/abstractions.py
@@ -198,7 +198,7 @@ from tinygrad.device import MallocAllocator
 # ClangProgram is the simplest runtime (in tinygrad/runtime/ops_clang.py, code 7/10)
 # __init__ calls clang, and __call__ calls the function in the *.so outputted by clang
 # in CLANG, global_size and local_size are ignored
-from tinygrad.runtime.ops_clang import ClangProgram, compile_clang
+from tinygrad.runtime.ops_clang import ClangProgram, ClangCompiler
 
 # a concrete example looks like this, this adds two size 1 RawBuffer
 # first we create two numpy buffers containing 2 and 3
@@ -213,7 +213,7 @@ MallocAllocator.copyin(input_a, numpy_a.data.cast("B"))
 MallocAllocator.copyin(input_b, numpy_b.data.cast("B"))
 
 # compile the program, run it, and 2+3 does indeed equal 5
-program = ClangProgram("add", compile_clang(f"void add(float *a, float *b, float *c) {{ *a = *b + *c; }}"))
+program = ClangProgram("add", ClangCompiler().compile(f"void add(float *a, float *b, float *c) {{ *a = *b + *c; }}"))
 program(output, input_a, input_b)
 numpy_out = np.empty(1, dtype=np.float32)
 MallocAllocator.copyout(numpy_out.data.cast("B"), output)
@@ -251,7 +251,7 @@ result = Tensor(2.0).realize() + Tensor(3.0).realize()
 # use the real Linearizer to linearize 2+3
 from tinygrad.codegen.linearizer import Linearizer
 sched = result.lazydata.schedule()
-linearizer = Linearizer(sched[-1].ast)
+linearizer = Linearizer(sched[-1].ast, ClangCompiler.linearizer_opts)
 linearizer.linearize()
 
 # print the uops
diff --git a/docs/abstractions2.py b/docs/abstractions2.py
index 5160f165ce..9ce16d256a 100644
--- a/docs/abstractions2.py
+++ b/docs/abstractions2.py
@@ -7,7 +7,7 @@
 
 print("******** first, the runtime ***********")
 
-from tinygrad.runtime.ops_clang import ClangProgram, compile_clang, MallocAllocator
+from tinygrad.runtime.ops_clang import ClangProgram, ClangCompiler, MallocAllocator
 
 # allocate some buffers
 out = MallocAllocator.alloc(4)
@@ -19,7 +19,7 @@ MallocAllocator.copyin(a, bytearray([2,0,0,0]))
 MallocAllocator.copyin(b, bytearray([3,0,0,0]))
 
 # compile a program to a binary
-lib = compile_clang("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }")
+lib = ClangCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }")
 
 # create a runtime for the program (ctypes.CDLL)
 fxn = ClangProgram("add", lib)
diff --git a/test/test_device_speed.py b/test/test_device_speed.py
index 5084e1c708..337220d083 100644
--- a/test/test_device_speed.py
+++ b/test/test_device_speed.py
@@ -8,19 +8,19 @@ class TestDeviceSpeed(unittest.TestCase):
   @classmethod
   def setUpClass(cls):
     cls.dev = Device[Device.DEFAULT]
-    cls.empty = Device[Device.DEFAULT].renderer("test", [])
+    cls.empty = Device[Device.DEFAULT].compiler.render("test", [])
 
   def test_empty_compile(self):
     with Timing("compiler "):
-      self.dev.compiler(self.empty)
+      self.dev.compiler.compile(self.empty)
 
   def test_empty_compile_twice(self):
-    self.dev.compiler(self.empty)
+    self.dev.compiler.compile(self.empty)
     with Timing("compiler "):
-      self.dev.compiler(self.empty)
+      self.dev.compiler.compile(self.empty)
 
   def test_launch_speed(self):
-    prg_bin = self.dev.compiler(self.empty)
+    prg_bin = self.dev.compiler.compile(self.empty)
     prg = self.dev.runtime("test", prg_bin)
     for _ in range(10): prg() # ignore first launches
     with Timing("launch 1000x "):
@@ -29,7 +29,7 @@ class TestDeviceSpeed(unittest.TestCase):
       for _ in range(1000): prg(wait=True)
 
   def test_profile_launch_speed(self):
-    prg_bin = self.dev.compiler(self.empty)
+    prg_bin = self.dev.compiler.compile(self.empty)
     prg = self.dev.runtime("test", prg_bin)
     for _ in range(10): prg() # ignore first launches
     with Profiling():
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index d710691ece..d625bc387b 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -160,7 +160,7 @@ def helper_realized_ast(r:Tensor):
   output_buffer = Buffer(s[-1].out.device, prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype)
   return s[-1].ast, [output_buffer] + [l.realized for l in s[-1].inputs]
 
-@unittest.skipUnless(isinstance(Device[Device.DEFAULT], Compiled) and Device[Device.DEFAULT].linearizer_opts.supports_float4,
+@unittest.skipUnless(isinstance(Device[Device.DEFAULT], Compiled) and Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4,
                      "need Compiled backends that support float4")
 class TestFloat4(unittest.TestCase):
   @staticmethod
@@ -367,7 +367,7 @@ class TestHandCodedOpts(unittest.TestCase):
     assert prod(k.full_shape[k.shape_len-k.upcasted:k.shape_len]) <= 49
 
   def test_matvec(self):
-    if not Device[Device.DEFAULT].linearizer_opts.has_local:
+    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local:
       self.skipTest("Only devices with locals")
     N = 128
     a = Tensor.rand(1, N).realize()
@@ -417,7 +417,7 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
 @unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "linearizer is only for compiled backends")
 class TestLinearizerOpts(unittest.TestCase):
   def test_local_and_grouped_reduce(self):
-    if not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
 
     N = 128
@@ -463,7 +463,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])
 
   def test_matmul(self):
-    if not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
 
     N = 128
@@ -493,7 +493,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])
 
   def test_double_reduce(self):
-    if not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
 
     N = 128
@@ -520,7 +520,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])
 
   def test_tensor_core_opts(self):
-    if not Device[Device.DEFAULT].linearizer_opts.has_local:
+    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local:
       self.skipTest("Only Compiled uses linearizer with locals")
 
     if Device.DEFAULT not in tensor_cores: self.skipTest("No tensor cores for device")
diff --git a/test/test_uops.py b/test/test_uops.py
index 2bae41c9f4..2910b5a664 100644
--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -10,9 +10,9 @@ from tinygrad.codegen.linearizer import UOps, UOp
 from test.test_dtype import is_dtype_supported
 
 def _uops_to_prg(uops):
-  src = Device[Device.DEFAULT].renderer("test", uops)
-  return CompiledASTRunner(None, "test", src, Device[Device.DEFAULT], [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None,
"test", src, Device[Device.DEFAULT], [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None, - [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None) + src = Device[Device.DEFAULT].compiler.render("test", uops) + has_local = Device[Device.DEFAULT].compiler.linearizer_opts.has_local + return CompiledASTRunner(None, "test", src, Device[Device.DEFAULT], [1] if has_local else None, [1] if has_local else None) def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp: uops.append(UOp(uop, dtype, tuple(vin), arg)) diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 1072df3c13..1259e67f23 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -67,7 +67,8 @@ class LinearizerOptions(NamedTuple): class Kernel: def __init__(self, ast:LazyOp, opts:Optional[LinearizerOptions]=None): - self.opts = opts or (device.linearizer_opts if isinstance(device:=Device[Device.DEFAULT], Compiled) else LinearizerOptions(Device.DEFAULT)) + self.opts = opts or (device.compiler.linearizer_opts if isinstance(device:=Device[Device.DEFAULT], Compiled) else + LinearizerOptions(Device.DEFAULT)) self.ast = ast assert ast.op == BufferOps.STORE, f"kernels must have a store as the output, got {ast.op}" diff --git a/tinygrad/device.py b/tinygrad/device.py index ebfd44e682..5b13b3ab2d 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -1,6 +1,6 @@ from __future__ import annotations from collections import defaultdict -from typing import TYPE_CHECKING, Union, Any, List, Optional, Dict, Callable, Tuple, cast +from typing import TYPE_CHECKING, Union, Any, List, Optional, Dict, Callable, Tuple, cast, ClassVar import importlib, inspect, functools, pathlib, time, re, ctypes from tinygrad.dtype import DType, ImageDType from tinygrad.helpers import ansilen, DEBUG, getenv, colored, BEAM, NOOPT, all_int, to_function_name, from_mv, flat_mv, diskcache_get, diskcache_put @@ -262,6 +262,18 @@ def _get_interpreted_fxn(fxn_for_op:Dict[Op, Callable], ast:LazyOp) -> Interpret # **************** for Compiled Devices **************** +class Compiler: + linearizer_opts: ClassVar[LinearizerOptions] + def __init__(self, cachekey=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey + def render(self, name:str, uops) -> str: raise NotImplementedError("need a render function") + def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function") + def compile_cached(self, src:str) -> bytes: + if self.cachekey is not None: lib = diskcache_get(self.cachekey, src) + if lib is None: + lib = self.compile(src) + if self.cachekey is not None: diskcache_put(self.cachekey, src, lib) + return lib + class CompiledASTRunner(JITRunner): def __init__(self, ast:Optional[LazyOp], name:str, prg:str, device:Compiled, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, precompiled:Optional[bytes]=None): # noqa: E501 super().__init__() @@ -270,12 +282,7 @@ class CompiledASTRunner(JITRunner): if local_size is not None: local_size = local_size + [1]*(3-len(local_size)) self.name, self.display_name, self.prg, self.device, self.global_size, self.local_size, self.first_run = \ to_function_name(name), name, prg, device, global_size, local_size, True - lib: Optional[bytes] = precompiled - if lib is None: - if self.device.compiler_cachekey is not None: lib = diskcache_get(self.device.compiler_cachekey, prg) - if lib is None: - lib = self.device.compiler(prg) - if self.device.compiler_cachekey is not 
+    lib:bytes = precompiled if precompiled is not None else self.device.compiler.compile_cached(prg)
     self.lib, self.clprg = lib, self.device.runtime(self.name, lib)
     self.vars: List[Variable] = []
     if ast:
@@ -306,31 +313,29 @@ class CompiledASTRunner(JITRunner):
     return et
 
 class Compiled:
-  def __init__(self, device:str, allocator:Allocator, linearizer_opts:LinearizerOptions, renderer, compiler, compiler_cachekey, runtime, graph=None):
-    self.dname, self.allocator, self.linearizer_opts, self.renderer, self.compiler, self.runtime, self.graph, self.compiler_cachekey = \
-      device, allocator, linearizer_opts, renderer, compiler, runtime, graph, None if getenv("DISABLE_COMPILER_CACHE") else compiler_cachekey
+  def __init__(self, device:str, allocator:Allocator, compiler:Compiler, runtime, graph=None):
+    self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler, runtime, graph
   def synchronize(self): pass # override this in your device
 
   def to_program(self, k:Linearizer) -> CompiledASTRunner:
-    assert self.compiler is not None, f"compiler is None, can't build {k.ast}"
     k.linearize()
-    return CompiledASTRunner(k.ast, k.name, self.renderer(to_function_name(k.name), k.uops), self, k.global_size, k.local_size)
+    return CompiledASTRunner(k.ast, k.name, self.compiler.render(to_function_name(k.name), k.uops), self, k.global_size, k.local_size)
 
   def get_linearizer(self, ast:LazyOp) -> Linearizer:
     if DEBUG >= 3:
       from tinygrad.graph import print_tree
       print_tree(ast)
     from tinygrad.codegen.linearizer import Linearizer
-    k = Linearizer(ast, self.linearizer_opts)
+    k = Linearizer(ast, self.compiler.linearizer_opts)
     k.required_optimizations()
     if not NOOPT:
       if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
       if BEAM >= 1:
         lins = [(("tc" if used_tensor_cores else "hc"), k)]
         if used_tensor_cores:
-          lins.append(("hc", Linearizer(ast, self.linearizer_opts)))
+          lins.append(("hc", Linearizer(ast, self.compiler.linearizer_opts)))
           lins[-1][1].hand_coded_optimizations()
-        kb = Linearizer(ast, self.linearizer_opts)
+        kb = Linearizer(ast, self.compiler.linearizer_opts)
         kb.required_optimizations()
         from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
         test_rawbuffers = bufs_from_lin(kb) # allocate scratch buffers for optimization
diff --git a/tinygrad/features/search.py b/tinygrad/features/search.py
index 3680514198..91b615eaef 100644
--- a/tinygrad/features/search.py
+++ b/tinygrad/features/search.py
@@ -1,6 +1,6 @@
 from typing import Dict, List, cast, DefaultDict, Optional, Tuple, Callable
 import itertools, functools, random, math, time, multiprocessing, traceback, signal
-from tinygrad.device import Device, Compiled, Buffer, CompiledASTRunner
+from tinygrad.device import Device, Compiled, Buffer, CompiledASTRunner, Compiler
 from tinygrad.ops import MemBuffer, LazyOp
 from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, to_function_name
 from tinygrad.dtype import ImageDType
@@ -43,13 +43,13 @@ def _time_program(ast:LazyOp, rdev:Compiled, lib:bytes, global_size, local_size,
     if early_stop is not None and early_stop < tms[-1]: break
   return tms
 
-def _compile_linearizer(rdev:Compiled, lin:Linearizer, name:Optional[str]=None) -> Tuple[bytes, Optional[List[int]], Optional[List[int]]]:
+def _compile_linearizer(compiler:Compiler, lin:Linearizer, name:Optional[str]=None) -> Tuple[bytes, Optional[List[int]], Optional[List[int]]]:
   lin.linearize()
-  src = rdev.renderer(name if name is not None else to_function_name(lin.name), lin.uops) # NOTE: these all have the same name for deduping
-  return rdev.compiler(src), lin.global_size, lin.local_size
+  src = compiler.render(name if name is not None else to_function_name(lin.name), lin.uops) # NOTE: these all have the same name for deduping
+  return compiler.compile(src), lin.global_size, lin.local_size
 
-def _try_compile_linearized_w_idx(x, device:str):
-  try: return (x[0], _compile_linearizer(cast(Compiled, Device[device]), x[1], "test"))
+def _try_compile_linearized_w_idx(x, compiler:Compiler):
+  try: return (x[0], _compile_linearizer(compiler, x[1], "test"))
   except Exception:
     if DEBUG >= 4: traceback.print_exc()
     return (x[0], None)
@@ -110,7 +110,7 @@ def beam_search(lin:Linearizer, rawbufs, amt:int, allow_test_size=True) -> Linea
   while not exiting:
     acted_lins = flatten([get_linearizer_actions(lin, include_0=False).values() for lin,_ in beam]) if len(beam) else [lin]
     timed_lins: List[Tuple[Linearizer, float]] = []
-    _compile_fn = functools.partial(_try_compile_linearized_w_idx, device=lin.opts.device)
+    _compile_fn = functools.partial(_try_compile_linearized_w_idx, compiler=cast(Compiled, Device[lin.opts.device]).compiler)
     for i,proc in (pool.imap_unordered(_compile_fn, enumerate(acted_lins)) if pool is not None else map(_compile_fn, enumerate(acted_lins))):
       if proc is None: continue
       lib, global_size, local_size = proc
@@ -155,7 +155,7 @@ def time_linearizer(lin:Linearizer, rawbufs:List[Buffer], allow_test_size=True,
   assert isinstance(dev, Compiled)
 
   var_vals = {k:(k.max+k.min)//2 for k in lin.ast.vars()}
-  lib, global_size, local_size = _compile_linearizer(dev, lin)
+  lib, global_size, local_size = _compile_linearizer(dev.compiler, lin)
   tms = _time_program(lin.ast, dev, lib, global_size, local_size, var_vals, rawbufs, max_global_size=max_global_size if allow_test_size else None, clear_l2=clear_l2, cnt=cnt, name=to_function_name(lin.name)) # noqa: E501
 
   if CACHELEVEL >= 2: diskcache_put("time_linearizer", key, tms)
diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py
index 832dbf21af..b7eb5b3a83 100644
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -1,16 +1,20 @@
-import ctypes, subprocess, functools, pathlib, tempfile
-from tinygrad.device import Compiled, MallocAllocator
+import ctypes, subprocess, pathlib, tempfile
+from tinygrad.device import Compiled, MallocAllocator, Compiler
 from tinygrad.helpers import cpu_time_execution
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
 
 CLANG_PROGRAM_HEADER = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define int64 long\n#define half __fp16\n#define uchar unsigned char\n#include <stdbool.h>\n' # noqa: E501
 
-def compile_clang(prg:str, header:str=CLANG_PROGRAM_HEADER) -> bytes:
-  # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
-  with tempfile.NamedTemporaryFile(delete=True) as output_file:
-    subprocess.check_output(args=('clang -shared -march=native -O2 -Wall -Werror -x c -fPIC - -o '+str(output_file.name)).split(), input=(header+prg).encode('utf-8')) # noqa: E501
-    return pathlib.Path(output_file.name).read_bytes()
+class ClangCompiler(Compiler):
+  linearizer_opts = LinearizerOptions("CLANG", supports_float4=False, has_local=False)
+  def render(self, name:str, uops) -> str: return uops_to_cstyle(CStyleLanguage(buffer_suffix=" restrict"), name, uops)
+  def compile(self, src:str) -> bytes:
+    # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
+    with tempfile.NamedTemporaryFile(delete=True) as output_file:
+      subprocess.check_output(args=('clang -shared -march=native -O2 -Wall -Werror -x c -fPIC - -o '+ \
+        str(output_file.name)).split(), input=(CLANG_PROGRAM_HEADER+src).encode('utf-8'))
+      return pathlib.Path(output_file.name).read_bytes()
 
 class ClangProgram:
   def __init__(self, name:str, lib:bytes):
@@ -23,6 +27,4 @@ class ClangProgram:
   def __call__(self, *bufs, vals=(), wait=False): return cpu_time_execution(lambda: self.fxn(*bufs, *vals), enable=wait)
 
 class ClangDevice(Compiled):
-  def __init__(self, device:str):
-    super().__init__(device, MallocAllocator, LinearizerOptions("CLANG", supports_float4=False, has_local=False),
-                     functools.partial(uops_to_cstyle, CStyleLanguage(buffer_suffix=" restrict")), compile_clang, "compile_clang", ClangProgram)
+  def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangCompiler("compile_clang"), ClangProgram)
diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index f4451508b1..1cb72ccff9 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -4,7 +4,7 @@ from pathlib import Path
 from typing import Tuple, Optional
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, colored, cpu_time_execution, compile_cuda_style, encode_args_cuda_style, time_execution_cuda_style # noqa: E501
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import CUDARenderer
 
@@ -29,7 +29,16 @@ def check(status):
 
 def cu_time_execution(cb, enable=False) -> Optional[float]: return time_execution_cuda_style(cb, cuda.CUevent, cuda.cuEventCreate, cuda.cuEventRecord, cuda.cuEventSynchronize, cuda.cuEventDestroy_v2, cuda.cuEventElapsedTime, enable=enable) if not CUDACPU else cpu_time_execution(cb, enable=enable) # noqa: E501
 
-def compile_cuda(prg:str, arch="sm_35") -> bytes: return compile_cuda_style(prg, [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"], cuda.nvrtcProgram, cuda.nvrtcCreateProgram, cuda.nvrtcCompileProgram, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check) # noqa: E501
+class CUDACompiler(Compiler):
+  linearizer_opts = LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024])
+  def __init__(self, arch:str):
+    self.arch = arch
+    super().__init__(f"compile_cuda_{self.arch}")
+  def render(self, name:str, uops) -> str: return CUDARenderer(name, uops)
+  def compile(self, src:str) -> bytes:
+    return compile_cuda_style(src, [f'--gpu-architecture={self.arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"],
"-I/usr/include", "-I/opt/cuda/include/"], + cuda.nvrtcProgram, cuda.nvrtcCreateProgram, cuda.nvrtcCompileProgram, cuda.nvrtcGetPTX, + cuda.nvrtcGetPTXSize, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check) class CUDAProgram: def __init__(self, device:CUDADevice, name:str, lib:bytes): @@ -83,10 +92,8 @@ class CUDADevice(Compiled): self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35" from tinygrad.runtime.graph.cuda import CUDAGraph - super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator, - LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024]), - CUDARenderer, functools.partial(compile_cuda,arch=self.arch), f"compile_cuda_{self.arch}", functools.partial(CUDAProgram, self), - graph=CUDAGraph if not CUDACPU else None) + super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator, CUDACompiler(self.arch), + functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None) def synchronize(self): if not CUDACPU: check(cuda.cuCtxSetCurrent(self.context)) diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py index 1da6b397ed..9f03265f54 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -5,7 +5,7 @@ import tinygrad.runtime.autogen.opencl as cl from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG from tinygrad.codegen.kernel import LinearizerOptions from tinygrad.renderer.cstyle import OpenCLRenderer -from tinygrad.device import Compiled, LRUAllocator, BufferOptions +from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something OSX_TIMING_RATIO = (125/3) if OSX else 1.0 @@ -14,18 +14,24 @@ def check(status): if status != 0: raise RuntimeError(f"OpenCL Error {status}") def checked(ret, status): return (check(status.value), ret)[1] -def compile_cl(device:CLDevice, prg:str) -> bytes: - program = checked(cl.clCreateProgramWithSource(device.context, 1, to_char_p_p([prg_bytes := prg.encode()]), - ctypes.byref(ctypes.c_size_t(len(prg_bytes))), ctypes.byref(status := ctypes.c_int32())), status) - status = cl.clBuildProgram(program, 1, ctypes.byref(device.device_id), None, cl.clBuildProgram.argtypes[4](), None) - if status != 0: - cl.clGetProgramBuildInfo(program, device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size := ctypes.c_size_t())) - cl.clGetProgramBuildInfo(program, device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501 - raise RuntimeError(f"OpenCL Compile Error\n\n{ctypes.string_at(mstr, size=log_size.value).decode()}") - binary_sizes = init_c_var((ctypes.c_size_t * 1)(), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(x), ctypes.byref(x), None))) # noqa: E501 - binary = init_c_var(ctypes.create_string_buffer(binary_sizes[0]), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), ctypes.byref((ctypes.c_void_p * 1)(ctypes.addressof(x))), None))) # noqa: E501 - check(cl.clReleaseProgram(program)) - return bytes(binary) +class CLCompiler(Compiler): + linearizer_opts = LinearizerOptions("GPU") + def __init__(self, device:CLDevice, compile_key:str): + self.device = device + super().__init__(f"compile_cl_{compile_key}") + def render(self, name:str, uops) -> str: return OpenCLRenderer(name, uops) + def 
+    program = checked(cl.clCreateProgramWithSource(self.device.context, 1, to_char_p_p([prg_bytes := src.encode()]),
+                                                   ctypes.byref(ctypes.c_size_t(len(prg_bytes))), ctypes.byref(status := ctypes.c_int32())), status)
+    status = cl.clBuildProgram(program, 1, ctypes.byref(self.device.device_id), None, cl.clBuildProgram.argtypes[4](), None)
+    if status != 0:
+      cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size := ctypes.c_size_t()))
+      cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501
+      raise RuntimeError(f"OpenCL Compile Error\n\n{ctypes.string_at(mstr, size=log_size.value).decode()}")
+    binary_sizes = init_c_var((ctypes.c_size_t * 1)(), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(x), ctypes.byref(x), None))) # noqa: E501
+    binary = init_c_var(ctypes.create_string_buffer(binary_sizes[0]), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), ctypes.byref((ctypes.c_void_p * 1)(ctypes.addressof(x))), None))) # noqa: E501
+    check(cl.clReleaseProgram(program))
+    return bytes(binary)
 
 class CLProgram:
   def __init__(self, device:CLDevice, name:str, lib:bytes):
@@ -96,8 +102,7 @@ class CLDevice(Compiled):
     self.pending_copyin: List[memoryview] = []
 
     compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
-    super().__init__(device, CLAllocator(self), LinearizerOptions("GPU"), OpenCLRenderer,
-                     functools.partial(compile_cl, self), f"compile_cl_{compile_key}", functools.partial(CLProgram, self))
+    super().__init__(device, CLAllocator(self), CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
   def synchronize(self):
     check(cl.clFinish(self.queue))
     self.pending_copyin.clear()
diff --git a/tinygrad/runtime/ops_hip.py b/tinygrad/runtime/ops_hip.py
index ba12dcc6d8..7225da2ffa 100644
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -4,7 +4,7 @@ from typing import Tuple, TypeVar, List, Any, cast, Set
 import tinygrad.runtime.autogen.hip as hip
 from tinygrad.helpers import DEBUG, getenv, init_c_var
 from tinygrad.helpers import from_mv, round_up, to_mv, colored, init_c_struct_t
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats, Compiler
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.runtime.compiler.hip_comgr import compile_hip
@@ -12,6 +12,14 @@ from tinygrad.runtime.compiler.hip_comgr import compile_hip
 # The default HIP stream is used for everything.
 MOCKHIP = getenv("MOCKHIP") # for CI. don't run kernels, only check if they compile
 
+class HIPCompiler(Compiler):
+  linearizer_opts = LinearizerOptions("HIP")
+  def __init__(self, arch:str):
+    self.arch = arch
+    super().__init__(f"compile_hip_{self.arch}")
+  def render(self, name:str, uops) -> str: return HIPRenderer(name, uops)
+  def compile(self, src:str) -> bytes: return compile_hip(src, self.arch)
+
 hip_current_device = None
 def hip_set_device(d:int):
   global hip_current_device
@@ -132,8 +140,8 @@ class HIPDevice(Compiled):
     self.peers: Set[int] = set()
 
     from tinygrad.runtime.graph.hip import HIPGraph
-    super().__init__(device, MallocAllocator if MOCKHIP else HIPAllocator(self), LinearizerOptions("HIP"), HIPRenderer,
-                     functools.partial(compile_hip,arch=self.arch), f"compile_hip_{self.arch}", functools.partial(HIPProgram, self.device), HIPGraph)
+    super().__init__(device, MallocAllocator if MOCKHIP else HIPAllocator(self), HIPCompiler(self.arch),
+                     functools.partial(HIPProgram, self.device), HIPGraph)
   def synchronize(self):
     hip_set_device(self.device)
     check(hip.hipDeviceSynchronize())
diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py
index a5ec322dea..21ef7a6dd9 100644
--- a/tinygrad/runtime/ops_llvm.py
+++ b/tinygrad/runtime/ops_llvm.py
@@ -1,18 +1,24 @@
 from __future__ import annotations
 import ctypes, functools
 from typing import Tuple
-from tinygrad.device import Compiled, MallocAllocator
+from tinygrad.device import Compiled, MallocAllocator, Compiler
 from tinygrad.helpers import DEBUG, cpu_time_execution
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.llvmir import uops_to_llvm_ir
 import llvmlite.binding as llvm
 
-def compile_llvm(device, prg) -> bytes:
-  mod = llvm.parse_assembly(prg)
-  mod.verify()
-  device.optimizer.run(mod)
-  if DEBUG >= 5: print(device.target_machine.emit_assembly(mod))
-  return device.target_machine.emit_object(mod)
+class LLVMCompiler(Compiler):
+  linearizer_opts = LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)
+  def __init__(self, device:LLVMDevice):
+    self.device = device
+    super().__init__("compile_llvm")
+  def render(self, name:str, uops) -> str: return uops_to_llvm_ir(name, uops)
+  def compile(self, src:str) -> bytes:
+    mod = llvm.parse_assembly(src)
+    mod.verify()
+    self.device.optimizer.run(mod)
+    if DEBUG >= 5: print(self.device.target_machine.emit_assembly(mod))
+    return self.device.target_machine.emit_object(mod)
 
 class LLVMProgram:
   def __init__(self, device:LLVMDevice, name:str, lib:bytes):
@@ -38,5 +44,4 @@ class LLVMDevice(Compiled):
     backing_mod = llvm.parse_assembly(str())
     backing_mod.triple = llvm.get_process_triple()
    self.engine: llvm.executionengine.ExecutionEngine = llvm.create_mcjit_compiler(backing_mod, self.target_machine)
-    super().__init__(device, MallocAllocator, LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False),
-                     uops_to_llvm_ir, functools.partial(compile_llvm, self), "compile_llvm", functools.partial(LLVMProgram, self))
+    super().__init__(device, MallocAllocator, LLVMCompiler(self), functools.partial(LLVMProgram, self))
diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py
index 39b6a98730..ae6a186f95 100644
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@@ -4,18 +4,24 @@ import Metal, libdispatch
 from typing import List, Any, Tuple, Optional
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
-from tinygrad.device import Compiled, LRUAllocator
+from tinygrad.device import Compiled, LRUAllocator, Compiler
 from tinygrad.renderer.cstyle import MetalRenderer
 
-def compile_metal_xcode(prg:str) -> bytes:
-  # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
-  air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8'))
-  return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
-
-def compile_metal(device, prg:str) -> bytes:
-  options = Metal.MTLCompileOptions.new()
-  library = unwrap2(device.newLibraryWithSource_options_error_(prg, options, None))
-  return library.libraryDataContents().bytes().tobytes()
+class MetalCompiler(Compiler):
+  linearizer_opts = LinearizerOptions("METAL")
+  def __init__(self, device:Optional[MetalDevice]):
+    self.device = device
+    super().__init__("compile_metal")
+  def render(self, name:str, uops) -> str: return MetalRenderer(name, uops)
+  def compile(self, src:str) -> bytes:
+    if self.device is None:
+      # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
+      air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=src.encode('utf-8'))
+      return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
+    else:
+      options = Metal.MTLCompileOptions.new()
+      library = unwrap2(self.device.device.newLibraryWithSource_options_error_(src, options, None))
+      return library.libraryDataContents().bytes().tobytes()
 
 class MetalProgram:
   def __init__(self, device:MetalDevice, name:str, lib:bytes):
@@ -78,8 +84,7 @@ class MetalDevice(Compiled):
     self.mtl_buffers_in_flight: List[Any] = []
     self.mv_in_metal: List[memoryview] = []
     from tinygrad.runtime.graph.metal import MetalGraph
-    super().__init__(device, MetalAllocator(self), LinearizerOptions("METAL"), MetalRenderer,
-                     compile_metal_xcode if getenv("METAL_XCODE") else functools.partial(compile_metal, self.device), "compile_metal",
+    super().__init__(device, MetalAllocator(self), MetalCompiler(None if getenv("METAL_XCODE") else self),
                      functools.partial(MetalProgram, self), functools.partial(MetalGraph, self))
   def synchronize(self):
     for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
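
Every backend hunk above follows the same pattern: the old per-device (linearizer_opts, renderer, compiler, compiler_cachekey) arguments collapse into a single Compiler object, and Compiled.__init__ now takes that object plus the runtime. A minimal sketch of a hypothetical backend against this interface; the Echo* names are illustrative only, while the Compiler/Compiled signatures come from the tinygrad/device.py hunk above:

from typing import ClassVar
from tinygrad.device import Compiled, Compiler, MallocAllocator
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage

class EchoCompiler(Compiler):
  # class-level options, read through device.compiler.linearizer_opts by Kernel and the beam search
  linearizer_opts: ClassVar[LinearizerOptions] = LinearizerOptions("ECHO", supports_float4=False, has_local=False)
  def render(self, name:str, uops) -> str: return uops_to_cstyle(CStyleLanguage(), name, uops)
  def compile(self, src:str) -> bytes: return src.encode()  # a real backend would invoke its toolchain here

class EchoProgram:
  # runtime stub: a real backend would load lib and launch the kernel
  def __init__(self, name:str, lib:bytes): self.name, self.lib = name, lib
  def __call__(self, *bufs, vals=(), wait=False): print(f"would run {self.name}")

class EchoDevice(Compiled):
  # the cachekey handed to Compiler.__init__ enables the diskcache path in compile_cached
  def __init__(self, device:str): super().__init__(device, MallocAllocator, EchoCompiler("compile_echo"), EchoProgram)

The diskcache handling that CompiledASTRunner previously open-coded now lives in Compiler.compile_cached, so precompiled binaries skip it entirely and DISABLE_COMPILER_CACHE is checked in a single place.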