diff --git a/test/external/speed_compare_cuda_nv.py b/test/external/speed_compare_cuda_nv.py
index 48b7578864..fc8eb167ab 100644
--- a/test/external/speed_compare_cuda_nv.py
+++ b/test/external/speed_compare_cuda_nv.py
@@ -25,12 +25,12 @@ if __name__ == "__main__":
     # cuda compile
     culin = ast_str_to_lin(ast, opts=cudev.compiler.compiler_opts)
     culin.hand_coded_optimizations()
-    cuda_prg = cudev.to_program(culin)
+    cuda_prg = cudev.to_runner(culin)
     cubufs = bufs_from_lin(culin)
 
     nvlin = ast_str_to_lin(ast, opts=nvdev.compiler.compiler_opts)
     nvlin.hand_coded_optimizations()
-    nv_prg = nvdev.to_program(nvlin)
+    nv_prg = nvdev.to_runner(nvlin)
     nvbufs = bufs_from_lin(nvlin)
 
     # warmup
diff --git a/test/test_fusion_op.py b/test/test_fusion_op.py
index d1b09c1e1c..faab57cf8c 100644
--- a/test/test_fusion_op.py
+++ b/test/test_fusion_op.py
@@ -1,9 +1,9 @@
 import unittest
 import time
 import numpy as np
-from tinygrad import Tensor, dtypes, Device
+from tinygrad import Tensor, dtypes
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.engine.realize import run_schedule
+from tinygrad.engine.realize import lower_schedule_item, run_schedule
 
 class TestFusionOp(unittest.TestCase):
   def test_contiguous_add(self):
@@ -27,9 +27,9 @@ class TestFusionOp(unittest.TestCase):
     a = Tensor([1,2,3,4])
     for _ in range(24): a = a + a
     sched = create_schedule([a.lazydata], None)
-    ji = Device[Device.DEFAULT].get_runner(*sched[-1].ast)
+    ei = lower_schedule_item(sched[-1])
     self.assertLess(time.perf_counter()-st, 1.0)
-    assert len(ji.p.src.splitlines()) < 250
+    assert len(ei.prg.p.src.splitlines()) < 250
 
   def test_recursive_add_cmp(self):
     st = time.perf_counter()
diff --git a/test/test_uops_stats.py b/test/test_uops_stats.py
index 2dcc21df22..29b968e90e 100644
--- a/test/test_uops_stats.py
+++ b/test/test_uops_stats.py
@@ -1,6 +1,7 @@
 import unittest
-from tinygrad import Tensor, Device
+from tinygrad import Tensor
 from tinygrad.engine.schedule import create_schedule
+from tinygrad.engine.realize import lower_schedule_item
 
 # TODO: can copy this in here when we remove it
 #from tinygrad.ops import get_lazyop_info
@@ -12,8 +13,8 @@ from tinygrad.engine.schedule import create_schedule
 
 def get_stats(x:Tensor):
   si = create_schedule([x.lazydata])[-1]
-  runner = Device[Device.DEFAULT].get_runner(*si.ast)
-  return runner.op_estimate, runner.mem_estimate
+  ei = lower_schedule_item(si)
+  return ei.prg.p.op_estimate, ei.prg.p.mem_estimate
 
 class TestUOpsStats(unittest.TestCase):
   def test_simple_add(self):
diff --git a/tinygrad/device.py b/tinygrad/device.py
index d59765e61e..25e9f3b10a 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -181,43 +181,6 @@ class Runner:
 
 # **************** for Compiled Devices ****************
 
-def fake_renderer(name, uops): raise NotImplementedError("needs a renderer")
-
-@dataclass(frozen=True)
-class CompilerOptions:
-  device: str = ""
-  suffix: str = ""
-  # TODO: make this generic with a list of supported types
-  supports_float4: bool = True
-  has_local: bool = True
-  has_shared: bool = True
-  has_tensor_cores: bool = False
-  # NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
-  global_max: Optional[List[int]] = None
-  local_max: Optional[List[int]] = None
-  shared_max: int = 32768
-  renderer: Callable = fake_renderer
-
-class Compiler:
-  compiler_opts: ClassVar[CompilerOptions]
-  def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
-  def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
-  def compile_cached(self, src:str) -> bytes:
-    if self.cachekey is None or (lib := diskcache_get(self.cachekey, src)) is None:
-      lib = self.compile(src)
-      if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
-    return lib
-
-  def to_program(self, k:Linearizer, override_device:Optional[str]=None) -> Program:
-    k.linearize()
-    info = get_lazyop_info(k.ast[0])
-    ops, mem = k.uops.flops_mem()
-    run_count = prod((k.global_size if k.global_size else []) + (k.local_size if k.local_size else []))
-    # NOTE: we use min here to ignore the indexing FLOPS
-    return Program(k.name, self.compiler_opts.renderer(to_function_name(k.name), k.uops),
-                   override_device if override_device else self.compiler_opts.device,
-                   k.global_size, k.local_size, k.uops, min(info.flops, ops * run_count), min(info.mem_estimate, mem * run_count))
-
 @dataclass(frozen=True)
 class Program:
   name:str
@@ -246,6 +209,43 @@ class Program:
     local_size = [sym_infer(sz, var_vals) for sz in self.local_size] if self.local_size is not None else None
     return global_size, local_size
 
+def fake_renderer(name, uops): raise NotImplementedError("needs a renderer")
+
+@dataclass(frozen=True)
+class CompilerOptions:
+  device: str = ""
+  suffix: str = ""
+  # TODO: make this generic with a list of supported types
+  supports_float4: bool = True
+  has_local: bool = True
+  has_shared: bool = True
+  has_tensor_cores: bool = False
+  # NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
+  global_max: Optional[List[int]] = None
+  local_max: Optional[List[int]] = None
+  shared_max: int = 32768
+  renderer: Callable = fake_renderer
+
+  def to_program(self, k:Linearizer, override_device:Optional[str]=None) -> Program:
+    k.linearize()
+    info = get_lazyop_info(k.ast[0])
+    ops, mem = k.uops.flops_mem()
+    run_count = prod((k.global_size if k.global_size else []) + (k.local_size if k.local_size else []))
+    # NOTE: we use min here to ignore the indexing FLOPS
+    return Program(k.name, self.renderer(to_function_name(k.name), k.uops),
+                   override_device if override_device else self.device,
+                   k.global_size, k.local_size, k.uops, min(info.flops, ops * run_count), min(info.mem_estimate, mem * run_count))
+
+class Compiler:
+  compiler_opts: ClassVar[CompilerOptions]
+  def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
+  def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
+  def compile_cached(self, src:str) -> bytes:
+    if self.cachekey is None or (lib := diskcache_get(self.cachekey, src)) is None:
+      lib = self.compile(src)
+      if self.cachekey is not None: diskcache_put(self.cachekey, src, lib)
+    return lib
+
 class CompiledRunner(Runner):
   def __init__(self, p:Program, precompiled:Optional[bytes]=None):
     if DEBUG >= 4: print(p.src)
@@ -280,7 +280,7 @@ class Compiled:
     self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler if compiler else Compiler(), runtime, graph
   def synchronize(self): pass  # override this in your device
 
-  def to_runner(self, k:Linearizer) -> CompiledRunner: return CompiledRunner(self.compiler.to_program(k, override_device=self.dname))
+  def to_runner(self, k:Linearizer) -> CompiledRunner: return CompiledRunner(self.compiler.compiler_opts.to_program(k, override_device=self.dname))
 
   def get_linearizer(self, *ast:LazyOp) -> Linearizer:
     if DEBUG >= 3:
diff --git a/tinygrad/features/search.py b/tinygrad/features/search.py
index 628004dced..a01fab9472 100644
--- a/tinygrad/features/search.py
+++ b/tinygrad/features/search.py
@@ -51,7 +51,7 @@ def _try_compile_linearized_w_idx(x:Tuple[int,Linearizer], compiler:Compiler) ->
   try:
     x[1].linearize()
     if len(x[1].uops.uops) >= getenv("BEAM_UOPS_MAX", 3000) > 0: raise RuntimeError("too many uops")
-    p = compiler.to_program(x[1])
+    p = compiler.compiler_opts.to_program(x[1])
     st = time.perf_counter()
     prog = compiler.compile(p.src)
     et = time.perf_counter() - st
@@ -174,7 +174,7 @@ def time_linearizer(lin:Linearizer, rawbufs:List[Buffer], allow_test_size=True,
 
   rawbufs = _ensure_buffer_alloc(rawbufs)
   var_vals = {k:(k.max+k.min)//2 for k in lin.ast[0].vars()}
-  p = dev.compiler.to_program(lin)
+  p = dev.compiler.compiler_opts.to_program(lin)
   tms = _time_program(p, dev.compiler.compile(p.src), var_vals, rawbufs, max_global_size=max_global_size if allow_test_size else None,
                       clear_l2=clear_l2, cnt=cnt, name=to_function_name(lin.name))
 
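
A minimal sketch of the call path the updated tests exercise after this change, assuming only the post-change API shown in the hunks above (the tensor values and the printout are illustrative, not part of the diff): a ScheduleItem is lowered with lower_schedule_item, and the resulting ExecItem exposes the rendered Program via ei.prg.p, which is where src, op_estimate, and mem_estimate are now read from.

# sketch: uses the API as it appears in the test hunks above; inputs are illustrative
from tinygrad import Tensor
from tinygrad.engine.schedule import create_schedule
from tinygrad.engine.realize import lower_schedule_item

a = Tensor([1, 2, 3, 4]) + Tensor([5, 6, 7, 8])     # any small elementwise kernel
si = create_schedule([a.lazydata])[-1]              # last ScheduleItem is the kernel of interest
ei = lower_schedule_item(si)                        # ExecItem wrapping a CompiledRunner
print(ei.prg.p.src)                                 # rendered source of the Program
print(ei.prg.p.op_estimate, ei.prg.p.mem_estimate)  # stats read the same way as get_stats above

The lower-level paths in device.py and features/search.py reach the same Program directly via compiler.compiler_opts.to_program(lin), since to_program moves from Compiler onto CompilerOptions in this diff.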