diff --git a/examples/llm.c/export.py b/examples/llm.c/export.py
index 06e82e263b..4312997d57 100755
--- a/examples/llm.c/export.py
+++ b/examples/llm.c/export.py
@@ -6,7 +6,8 @@ Device.DEFAULT = "CLANG"
 from train_gpt2 import GPT, GPTConfig
 from tinygrad.helpers import dedup, to_function_name, flatten, getenv, GRAPH, GlobalCounters, ansilen, to_function_name
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.engine.realize import memory_planner, run_schedule
+from tinygrad.engine.realize import run_schedule
+from tinygrad.engine.memory import memory_planner
 from tinygrad.ops import BufferOps, LoadOps

 TIMING = getenv("TIMING")
diff --git a/openpilot/compile2.py b/openpilot/compile2.py
index c650f293c2..835b6f5e8a 100644
--- a/openpilot/compile2.py
+++ b/openpilot/compile2.py
@@ -18,7 +18,8 @@ from tinygrad.buffer import Buffer
 from tinygrad.dtype import ImageDType
 from tinygrad.device import CompiledRunner
 from tinygrad.helpers import partition, Context, fetch, getenv, DEBUG
-from tinygrad.engine.realize import run_schedule, memory_planner, lower_schedule, ExecItem
+from tinygrad.engine.realize import run_schedule, lower_schedule, ExecItem
+from tinygrad.engine.memory import memory_planner
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.ops import LoadOps, ScheduleItem
 Device.DEFAULT = "GPU"
diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index 317a5f5685..3a60f0f30a 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -1,12 +1,12 @@
 import unittest, functools, random
 from typing import List
 from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes
-from tinygrad.device import BufferCopy, CompiledRunner
+from tinygrad.device import CompiledRunner
 from tinygrad.ops import LoadOps, ReduceOps
 from tinygrad.helpers import CI, prod, Context
 from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.engine.realize import lower_schedule
+from tinygrad.engine.realize import lower_schedule, BufferCopy
 from tinygrad.features.multi import all_reduce, MultiLazyBuffer
 from random import randint
 import numpy as np
diff --git a/tinygrad/buffer.py b/tinygrad/buffer.py
index b8b06c3ab0..802c9b6974 100644
--- a/tinygrad/buffer.py
+++ b/tinygrad/buffer.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
-from typing import Any, Optional
+from typing import Any, Optional, Dict, Tuple
+import ctypes
+from collections import defaultdict
 from dataclasses import dataclass
-from tinygrad.helpers import GlobalCounters, flat_mv
+from tinygrad.helpers import GlobalCounters, flat_mv, from_mv, getenv
 from tinygrad.dtype import DType, ImageDType

 @dataclass(frozen=True, eq=True)
@@ -90,3 +92,40 @@ class Buffer:
     assert offset < self.nbytes, "offset must be less than nbytes"
     if self._base is not None: return Buffer(self.device, size, dtype, base=self._base, offset=self.offset+offset)
     return Buffer(self.device, size, dtype, base=self, offset=offset)
+
+# TODO: size, dest, src are the same type. can we enforce this?
+class Allocator:
+  def alloc(self, size:int, options:Optional[BufferOptions]=None):
+    assert not isinstance(size, int) or size > 0, f"alloc size must be positve, getting {size}"
+    return self._alloc(size, options if options is not None else BufferOptions())
+  def _alloc(self, size:int, options:BufferOptions): raise NotImplementedError("need alloc")
+  def free(self, opaque, size:int, options:Optional[BufferOptions]=None):
+    self._free(opaque, options if options is not None else BufferOptions())
+  def _free(self, opaque, options:BufferOptions): pass # if opaque is a Python object, you don't need a free
+  def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin")
+  def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout")
+
+class LRUAllocator(Allocator): # pylint: disable=abstract-method
+  def __init__(self): self.cache: Dict[Tuple[int, Optional[BufferOptions]], Any] = defaultdict(list)
+  def alloc(self, size:int, options:Optional[BufferOptions]=None):
+    if len(c := self.cache[(size, options)]): return c.pop()
+    try: return super().alloc(size, options)
+    except (RuntimeError, MemoryError):
+      self.free_cache()
+      return super().alloc(size, options)
+  def free_cache(self):
+    for (sz,options),opaques in self.cache.items():
+      for opaque in opaques: super().free(opaque, sz, options)
+      opaques.clear()
+  def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None):
+    if getenv("LRU", 1) and (options is None or not options.nolru): self.cache[(size, options)].append(opaque)
+    else: super().free(opaque, size, options)
+
+class _MallocAllocator(LRUAllocator):
+  def _alloc(self, size:int, options:BufferOptions): return (ctypes.c_uint8 * size)()
+  def as_buffer(self, src) -> memoryview: return flat_mv(memoryview(src))
+  def copyin(self, dest, src:memoryview): ctypes.memmove(dest, from_mv(src), len(src))
+  def copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src, len(dest))
+  def offset(self, buf, size:int, offset:int): return from_mv(self.as_buffer(buf)[offset:offset+size])
+
+MallocAllocator = _MallocAllocator()
diff --git a/tinygrad/device.py b/tinygrad/device.py
index 139ba37bb5..b879017a66 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -1,13 +1,12 @@
 from __future__ import annotations
 import multiprocessing
-from collections import defaultdict
 from dataclasses import dataclass, replace
-from typing import TYPE_CHECKING, Any, List, Optional, Dict, Tuple, ClassVar
-import importlib, inspect, functools, pathlib, time, ctypes, os
-from tinygrad.helpers import prod, getenv, colored, all_int, to_function_name, from_mv, flat_mv, diskcache_get, diskcache_put, DEBUG, BEAM, NOOPT
+from typing import TYPE_CHECKING, List, Optional, Dict, Tuple, ClassVar
+import importlib, inspect, functools, pathlib, os
+from tinygrad.helpers import prod, getenv, all_int, to_function_name, diskcache_get, diskcache_put, DEBUG, BEAM, NOOPT
 from tinygrad.shape.symbolic import Variable, sym_infer, sint
 from tinygrad.ops import LazyOp, get_lazyop_info
-from tinygrad.buffer import Buffer, BufferOptions
+from tinygrad.buffer import Buffer, Allocator
 from tinygrad.codegen.uops import UOpGraph

 if TYPE_CHECKING:
@@ -53,74 +52,6 @@ class Runner:
   def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
     raise NotImplementedError("override this")

-# **************** Buffer / Allocator ****************
-
-class BufferCopy(Runner):
-  def __init__(self, total_sz, dest_device, src_device):
-    if total_sz >= 1e6: name = f"{type(self).__name__[6:].lower()} {total_sz/1e6:7.2f}M, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
-    else: name = f"{type(self).__name__[6:].lower()} {total_sz:8d}, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
-    super().__init__(colored(name, "yellow"), dest_device, 0, total_sz)
-  def copy(self, dest, src):
-    if src.device.startswith("DISK") and hasattr(dest.allocator, 'copy_from_fd') and src.nbytes >= 4096 and hasattr(src.allocator.device, 'fd'):
-      dest.allocator.copy_from_fd(dest._buf, src.allocator.device.fd, src._buf.offset, src.nbytes)
-    elif src.device.startswith("DISK") and hasattr(dest.allocator, 'as_buffer'):
-      # fast(ish) path, uses readinto in diskbuffers
-      src.allocator.copyout(dest.allocator.as_buffer(dest._buf), src._buf)
-    else:
-      dest.copyin(src.as_buffer(allow_zero_copy=True)) # may allocate a CPU buffer depending on allow_zero_copy
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
-    dest, src = rawbufs[0:2]
-    assert dest.size == src.size and dest.dtype == src.dtype, f"buffer copy mismatch, {dest.size} != {src.size}, {dest.dtype} != {src.dtype}"
-    st = time.perf_counter()
-    self.copy(dest, src)
-    if wait:
-      Device[dest.device].synchronize()
-      return time.perf_counter() - st
-
-class BufferXfer(BufferCopy):
-  def copy(self, dest, src):
-    if hasattr(dest.allocator.device, "track_cross_buffer") and hasattr(src.allocator, "track_cross_device"):
-      dest.allocator.device.track_cross_buffer.append(src)
-      src.allocator.track_cross_device.add(dest.allocator.device)
-    dest.allocator.transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.device, dest_dev=dest.allocator.device)
-
-# TODO: size, dest, src are the same type. can we enforce this?
-class Allocator:
-  def alloc(self, size:int, options:Optional[BufferOptions]=None):
-    assert not isinstance(size, int) or size > 0, f"alloc size must be positve, getting {size}"
-    return self._alloc(size, options if options is not None else BufferOptions())
-  def _alloc(self, size:int, options:BufferOptions): raise NotImplementedError("need alloc")
-  def free(self, opaque, size:int, options:Optional[BufferOptions]=None):
-    self._free(opaque, options if options is not None else BufferOptions())
-  def _free(self, opaque, options:BufferOptions): pass # if opaque is a Python object, you don't need a free
-  def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin")
-  def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout")
-
-class LRUAllocator(Allocator): # pylint: disable=abstract-method
-  def __init__(self): self.cache: Dict[Tuple[int, Optional[BufferOptions]], Any] = defaultdict(list)
-  def alloc(self, size:int, options:Optional[BufferOptions]=None):
-    if len(c := self.cache[(size, options)]): return c.pop()
-    try: return super().alloc(size, options)
-    except (RuntimeError, MemoryError):
-      self.free_cache()
-      return super().alloc(size, options)
-  def free_cache(self):
-    for (sz,options),opaques in self.cache.items():
-      for opaque in opaques: super().free(opaque, sz, options)
-      opaques.clear()
-  def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None):
-    if getenv("LRU", 1) and (options is None or not options.nolru): self.cache[(size, options)].append(opaque)
-    else: super().free(opaque, size, options)
-
-class _MallocAllocator(LRUAllocator):
-  def _alloc(self, size:int, options:BufferOptions): return (ctypes.c_uint8 * size)()
-  def as_buffer(self, src) -> memoryview: return flat_mv(memoryview(src))
-  def copyin(self, dest, src:memoryview): ctypes.memmove(dest, from_mv(src), len(src))
-  def copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src, len(dest))
-  def offset(self, buf, size:int, offset:int): return from_mv(self.as_buffer(buf)[offset:offset+size])
-
-MallocAllocator = _MallocAllocator()
-
 # **************** for Compiled Devices ****************

 @dataclass(frozen=True)
@@ -259,4 +190,3 @@ class Compiled:
     else: method_cache[ckey] = method_cache[bkey] = ret = self.to_runner(self.get_linearizer(*ast))
     return ret

-
diff --git a/tinygrad/engine/jit.py b/tinygrad/engine/jit.py
index 48c0a10cae..c4c7eac2d0 100644
--- a/tinygrad/engine/jit.py
+++ b/tinygrad/engine/jit.py
@@ -4,11 +4,12 @@ import functools, itertools, collections
 from tinygrad.tensor import Tensor
 from tinygrad.lazy import LazyBuffer
 from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, GRAPH, BEAM, getenv, all_int, GraphException, colored, JIT
-from tinygrad.device import Buffer, CompiledRunner, BufferXfer, Compiled, Device, Runner
+from tinygrad.device import Buffer, CompiledRunner, Compiled, Device, Runner
 from tinygrad.dtype import DType
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.symbolic import Variable, sint
-from tinygrad.engine.realize import ExecItem, capturing, _internal_memory_planner, EmptyOp, ViewOp
+from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer
+from tinygrad.engine.memory import _internal_memory_planner
 from tinygrad.nn.state import get_parameters
 from weakref import WeakKeyDictionary

diff --git a/tinygrad/engine/memory.py b/tinygrad/engine/memory.py
new file mode 100644
index 0000000000..9ebcf0961e
--- /dev/null
+++ b/tinygrad/engine/memory.py
@@ -0,0 +1,43 @@
+from typing import List, Dict, DefaultDict, Tuple, Union
+from collections import defaultdict
+from tinygrad.dtype import DType
+from tinygrad.buffer import Buffer
+from tinygrad.helpers import getenv, DEBUG, dedup
+from tinygrad.ops import ScheduleItem
+
+def _internal_memory_planner(buffers:List[Union[List[Buffer], Tuple[Buffer, ...]]], debug_prefix="") -> Dict[Buffer, Buffer]:
+  if getenv("NO_MEMORY_PLANNER"): return {}
+  last_appearance = {}
+  for i,u in enumerate(buffers):
+    for buf in u: last_appearance[buf] = i
+
+  # LRU algorithm
+  assigned: Dict[Buffer, Buffer] = {}
+  local_cache: DefaultDict[Tuple[str, int, DType], List[Buffer]] = defaultdict(list)
+
+  def handle_buffer(buf):
+    key = (buf.device, buf.size, buf.dtype)
+    if buf not in assigned:
+      if len(ll:=local_cache[key]): assigned[buf] = ll.pop()
+      else: assigned[buf] = Buffer(*key)
+    if i == last_appearance[buf]:
+      if assigned[buf] not in local_cache[key]: local_cache[key].append(assigned[buf])
+
+  for i,u in enumerate(buffers):
+    for buf in u:
+      # all unallocated unparented buffers are fair game to replace
+      if buf.is_allocated() or buf.lb_refcount > 0: continue
+      # handle view buffers
+      if buf._base is not None:
+        assigned[buf] = Buffer(buf.device, buf.size, buf.dtype, base=assigned.get(buf._base, buf._base), offset=buf.offset)
+      else:
+        handle_buffer(buf)
+
+  if DEBUG >= 1 and len(ak:=dedup(assigned.keys())) != len(av:=dedup(assigned.values())):
+    print(debug_prefix+f"memory reduced from {sum([x.nbytes for x in ak])/1e6:.2f} MB -> {sum([x.nbytes for x in av])/1e6:.2f} MB,",
+          f"{len(ak)} -> {len(av)} bufs")
+  return assigned
+
+def memory_planner(schedule:List[ScheduleItem]) -> List[ScheduleItem]:
+  assigned = _internal_memory_planner([si.bufs for si in schedule])
+  return [ScheduleItem(si.ast, tuple(assigned.get(x, x) for x in si.bufs)) for si in schedule]
diff --git a/tinygrad/engine/realize.py b/tinygrad/engine/realize.py
index 2659c3d736..5688891c48 100644
--- a/tinygrad/engine/realize.py
+++ b/tinygrad/engine/realize.py
@@ -1,13 +1,60 @@
-from typing import List, Dict, Optional, cast, Generator, DefaultDict, Tuple, Union
-from collections import defaultdict
+from typing import List, Dict, Optional, cast, Generator, Tuple
+import time
 from dataclasses import dataclass
-from tinygrad.dtype import DType
-from tinygrad.helpers import colored, getenv, dedup, DEBUG, GlobalCounters, ansilen
-from tinygrad.ops import ScheduleItem, BufferOps, LoadOps, copy_ast
-from tinygrad.device import Runner, Device, BufferCopy, BufferXfer
+from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen
+from tinygrad.ops import ScheduleItem, BufferOps, LoadOps, copy_ast, LazyOp
+from tinygrad.device import Runner, Device
 from tinygrad.buffer import Buffer
 from tinygrad.shape.symbolic import Variable, sym_infer

+# **************** Runners ****************
+
+class CustomOp(Runner):
+  def __init__(self, fxn):
+    self.fxn = fxn
+    super().__init__(self.fxn.__name__, "CUSTOM", 0, 0)
+  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): self.fxn(*rawbufs)
+
+class EmptyOp(Runner):
+  def __init__(self, buf:Buffer): super().__init__(colored(f"empty {buf.size:10d} {buf.dtype}", "yellow"), buf.device)
+  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): pass
+
+class ViewOp(Runner):
+  def __init__(self, buf:Buffer): super().__init__(colored(f"view {buf.nbytes:8d} @ {buf.offset:<10d}", "yellow"), buf.device)
+  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
+    assert rawbufs[0]._base is not None and rawbufs[0]._base == rawbufs[1].base, f"must be base {rawbufs}"
+
+class BufferCopy(Runner):
+  def __init__(self, total_sz, dest_device, src_device):
+    if total_sz >= 1e6: name = f"{type(self).__name__[6:].lower()} {total_sz/1e6:7.2f}M, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
+    else: name = f"{type(self).__name__[6:].lower()} {total_sz:8d}, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
+    super().__init__(colored(name, "yellow"), dest_device, 0, total_sz)
+  def copy(self, dest, src):
+    if src.device.startswith("DISK") and hasattr(dest.allocator, 'copy_from_fd') and src.nbytes >= 4096 and hasattr(src.allocator.device, 'fd'):
+      dest.allocator.copy_from_fd(dest._buf, src.allocator.device.fd, src._buf.offset, src.nbytes)
+    elif src.device.startswith("DISK") and hasattr(dest.allocator, 'as_buffer'):
+      # fast(ish) path, uses readinto in diskbuffers
+      src.allocator.copyout(dest.allocator.as_buffer(dest._buf), src._buf)
+    else:
+      dest.copyin(src.as_buffer(allow_zero_copy=True)) # may allocate a CPU buffer depending on allow_zero_copy
+  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
+    dest, src = rawbufs[0:2]
+    assert dest.size == src.size and dest.dtype == src.dtype, f"buffer copy mismatch, {dest.size} != {src.size}, {dest.dtype} != {src.dtype}"
+    st = time.perf_counter()
+    self.copy(dest, src)
+    if wait:
+      Device[dest.device].synchronize()
+      return time.perf_counter() - st
+
+class BufferXfer(BufferCopy):
+  def copy(self, dest, src):
+    if hasattr(dest.allocator.device, "track_cross_buffer") and hasattr(src.allocator, "track_cross_device"):
+      dest.allocator.device.track_cross_buffer.append(src)
+      src.allocator.track_cross_device.add(dest.allocator.device)
+    dest.allocator.transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.device, dest_dev=dest.allocator.device)
+
+# **************** lowering functions ****************
+
 @dataclass(frozen=True)
 class ExecItem:
   prg: Runner
@@ -27,34 +74,20 @@ class ExecItem:
       self.prg.first_run = False
     return et

-class CustomOp(Runner):
-  def __init__(self, fxn):
-    self.fxn = fxn
-    super().__init__(self.fxn.__name__, "CUSTOM", 0, 0)
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): self.fxn(*rawbufs)
-
-class EmptyOp(Runner):
-  def __init__(self, buf:Buffer): super().__init__(colored(f"empty {buf.size:10d} {buf.dtype}", "yellow"), buf.device)
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): pass
-
-class ViewOp(Runner):
-  def __init__(self, buf:Buffer): super().__init__(colored(f"view {buf.nbytes:8d} @ {buf.offset:<10d}", "yellow"), buf.device)
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
-    assert rawbufs[0]._base is not None and rawbufs[0]._base == rawbufs[1].base, f"must be base {rawbufs}"
-
-def lower_runner(runner:Runner, bufs) -> ExecItem:
+def lower_runner(dname:str, ast:Tuple[LazyOp, ...], bufs) -> ExecItem:
+  runner = Device[dname].get_runner(*ast)
   # TODO: globals isn't on the stupid diskrunner, remove the need for it
   return ExecItem(runner, [bufs[x[0]] for x in runner.p.globals] if hasattr(runner, 'p') else bufs)

 def lower_schedule_item(si:ScheduleItem) -> ExecItem:
   assert len(set(x.device for x in si.bufs)) == 1 or si.ast[0].op is LoadOps.COPY
-  if si.ast[0].op is BufferOps.STORE: return lower_runner(Device[si.outputs[0].device].get_runner(*si.ast), si.bufs)
+  if si.ast[0].op is BufferOps.STORE: return lower_runner(si.outputs[0].device, si.ast, si.bufs)
   assert len(si.ast) == 1 and len(si.outputs) == 1, "only ASTRunner supports multioutput"
   out, ast = si.outputs[0], si.ast[0]
   if ast.op is LoadOps.COPY:
     kernel_type = BufferCopy
     if hasattr(Device[out.device].allocator, 'transfer') and out.device.split(":")[0] == si.inputs[0].device.split(":")[0]:
-      if getenv("USE_COPY_KERNEL"): return lower_runner(Device[out.device].get_runner(copy_ast(ast.arg)), si.bufs)
+      if getenv("USE_COPY_KERNEL"): return lower_runner(out.device, (copy_ast(ast.arg),), si.bufs)
       kernel_type = BufferXfer
     return ExecItem(kernel_type(ast.arg, out.device, si.inputs[0].device), list(si.bufs))
   if ast.op is LoadOps.CUSTOM: return ExecItem(CustomOp(ast.arg), list(si.bufs))
@@ -65,45 +98,10 @@ def lower_schedule_item(si:ScheduleItem) -> ExecItem:
 def lower_schedule(schedule:List[ScheduleItem]) -> Generator[ExecItem, None, None]:
   while len(schedule): yield lower_schedule_item(schedule.pop(0))

+# **************** main run function ****************
+
 capturing: List = [] # put classes with an add method in here

-def _internal_memory_planner(buffers:List[Union[List[Buffer], Tuple[Buffer, ...]]], debug_prefix="") -> Dict[Buffer, Buffer]:
-  if getenv("NO_MEMORY_PLANNER"): return {}
-  last_appearance = {}
-  for i,u in enumerate(buffers):
-    for buf in u: last_appearance[buf] = i
-
-  # LRU algorithm
-  assigned: Dict[Buffer, Buffer] = {}
-  local_cache: DefaultDict[Tuple[str, int, DType], List[Buffer]] = defaultdict(list)
-
-  def handle_buffer(buf):
-    key = (buf.device, buf.size, buf.dtype)
-    if buf not in assigned:
-      if len(ll:=local_cache[key]): assigned[buf] = ll.pop()
-      else: assigned[buf] = Buffer(*key)
-    if i == last_appearance[buf]:
-      if assigned[buf] not in local_cache[key]: local_cache[key].append(assigned[buf])
-
-  for i,u in enumerate(buffers):
-    for buf in u:
-      # all unallocated unparented buffers are fair game to replace
-      if buf.is_allocated() or buf.lb_refcount > 0: continue
-      # handle view buffers
-      if buf._base is not None:
-        assigned[buf] = Buffer(buf.device, buf.size, buf.dtype, base=assigned.get(buf._base, buf._base), offset=buf.offset)
-      else:
-        handle_buffer(buf)
-
-  if DEBUG >= 1 and len(ak:=dedup(assigned.keys())) != len(av:=dedup(assigned.values())):
-    print(debug_prefix+f"memory reduced from {sum([x.nbytes for x in ak])/1e6:.2f} MB -> {sum([x.nbytes for x in av])/1e6:.2f} MB,",
-          f"{len(ak)} -> {len(av)} bufs")
-  return assigned
-
-def memory_planner(schedule:List[ScheduleItem]) -> List[ScheduleItem]:
-  assigned = _internal_memory_planner([si.bufs for si in schedule])
-  return [ScheduleItem(si.ast, tuple(assigned.get(x, x) for x in si.bufs)) for si in schedule]
-
 def run_schedule(schedule:List[ScheduleItem], var_vals:Optional[Dict[Variable, int]]=None):
   for ei in lower_schedule(schedule):
     if len(capturing): capturing[0].add(ei)
diff --git a/tinygrad/runtime/graph/cuda.py b/tinygrad/runtime/graph/cuda.py
index 3ca47f8322..3f2ad0f4cc 100644
--- a/tinygrad/runtime/graph/cuda.py
+++ b/tinygrad/runtime/graph/cuda.py
@@ -2,10 +2,10 @@ import ctypes
 from typing import Any, Optional, Tuple, Dict, List, cast
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import init_c_var, GraphException
-from tinygrad.device import CompiledRunner, Buffer, BufferXfer, Device
+from tinygrad.device import CompiledRunner, Buffer, Device
 from tinygrad.runtime.ops_cuda import CUDADevice, check, encode_args, cu_time_execution
 from tinygrad.shape.symbolic import Variable
-from tinygrad.engine.realize import ExecItem
+from tinygrad.engine.realize import ExecItem, BufferXfer
 from tinygrad.engine.jit import MultiGraphRunner

 class CUDAGraph(MultiGraphRunner):
diff --git a/tinygrad/runtime/graph/hsa.py b/tinygrad/runtime/graph/hsa.py
index ba345fede4..5da2992ba4 100644
--- a/tinygrad/runtime/graph/hsa.py
+++ b/tinygrad/runtime/graph/hsa.py
@@ -2,10 +2,10 @@ import ctypes, collections, time, itertools
 from typing import List, Any, Dict, cast, Optional, Tuple
 from tinygrad.helpers import GraphException, init_c_var, round_up
 from tinygrad.buffer import Buffer, BufferOptions
-from tinygrad.device import Compiled, CompiledRunner, BufferXfer, Device
+from tinygrad.device import Compiled, CompiledRunner, Device
 from tinygrad.shape.symbolic import Variable
 from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler
-from tinygrad.engine.realize import ExecItem
+from tinygrad.engine.realize import ExecItem, BufferXfer
 from tinygrad.engine.jit import MultiGraphRunner
 import tinygrad.runtime.autogen.hsa as hsa
 from tinygrad.runtime.driver.hsa import check, AQLQueue, AQL_PACKET_SIZE, EMPTY_SIGNAL
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index 6bd2db48f0..77949e93b3 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 from typing import Tuple, List, Any, cast
 import os, fcntl, ctypes, functools, re, pathlib, mmap, struct, errno, subprocess, time
-from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
-from tinygrad.buffer import BufferOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import BufferOptions, LRUAllocator
 from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, DEBUG
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.runtime.driver.hip_comgr import compile_hip
diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py
index 2434f7576c..1f6b3f88f6 100644
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -1,5 +1,6 @@
 import ctypes, subprocess, pathlib, tempfile
-from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import MallocAllocator
 from tinygrad.helpers import cpu_time_execution
 from tinygrad.renderer.cstyle import ClangRenderer

diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index 56c15ff938..5a4bf6bc6d 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -5,8 +5,8 @@ from dataclasses import replace
 from typing import Tuple, Optional, List
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, CompilerOptions
-from tinygrad.buffer import BufferOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import BufferOptions, LRUAllocator, MallocAllocator
 from tinygrad.renderer.cstyle import CUDARenderer
 from tinygrad.renderer.assembly import PTXRenderer
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
diff --git a/tinygrad/runtime/ops_disk.py b/tinygrad/runtime/ops_disk.py
index 99899e8eea..7493fd4996 100644
--- a/tinygrad/runtime/ops_disk.py
+++ b/tinygrad/runtime/ops_disk.py
@@ -2,7 +2,8 @@ from __future__ import annotations
 import os, mmap, _posixshmem, io
 from typing import Optional
 from tinygrad.helpers import OSX
-from tinygrad.device import Compiled, Allocator
+from tinygrad.device import Compiled
+from tinygrad.buffer import Allocator

 class DiskBuffer:
   def __init__(self, device:DiskDevice, size:int, offset=0):
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index 3fc78121bd..8de6c98f58 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -4,8 +4,8 @@ import ctypes, functools, hashlib
 import tinygrad.runtime.autogen.opencl as cl
 from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
 from tinygrad.renderer.cstyle import OpenCLRenderer
-from tinygrad.buffer import BufferOptions
-from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
+from tinygrad.buffer import BufferOptions, LRUAllocator
+from tinygrad.device import Compiled, Compiler, CompilerOptions

 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0
diff --git a/tinygrad/runtime/ops_hsa.py b/tinygrad/runtime/ops_hsa.py
index 60e97c6e0a..ed22db4129 100644
--- a/tinygrad/runtime/ops_hsa.py
+++ b/tinygrad/runtime/ops_hsa.py
@@ -3,8 +3,8 @@ import ctypes, functools, subprocess, io, atexit, collections, json
 from typing import Tuple, TypeVar, List, Dict, Any
 import tinygrad.runtime.autogen.hsa as hsa
 from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t, getenv
-from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
-from tinygrad.buffer import BufferOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import BufferOptions, LRUAllocator
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.runtime.driver.hsa import check, scan_agents, find_memory_pool, AQLQueue
 from tinygrad.runtime.driver.hip_comgr import compile_hip
diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py
index 32d5db0d41..f52cf05287 100644
--- a/tinygrad/runtime/ops_llvm.py
+++ b/tinygrad/runtime/ops_llvm.py
@@ -1,7 +1,8 @@
 from __future__ import annotations
 import ctypes, functools
 from typing import Tuple
-from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import MallocAllocator
 from tinygrad.helpers import DEBUG, cpu_time_execution
 from tinygrad.renderer.llvmir import uops_to_llvm_ir
 import llvmlite.binding as llvm
diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py
index 13b79cbcd8..c35c40ccd0 100644
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@@ -3,7 +3,8 @@ import os, subprocess, pathlib, ctypes, tempfile, functools
 import Metal, libdispatch
 from typing import List, Set, Any, Tuple, Optional
 from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
-from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import LRUAllocator
 from tinygrad.renderer.cstyle import MetalRenderer

 def wait_check(cbuf: Any):
diff --git a/tinygrad/runtime/ops_npy.py b/tinygrad/runtime/ops_npy.py
index c8121b9a09..3470d0edd2 100644
--- a/tinygrad/runtime/ops_npy.py
+++ b/tinygrad/runtime/ops_npy.py
@@ -1,6 +1,7 @@
 import numpy as np
 from tinygrad.helpers import flat_mv
-from tinygrad.device import Compiled, Allocator
+from tinygrad.device import Compiled
+from tinygrad.buffer import Allocator

 class NpyAllocator(Allocator):
   def copyout(self, dest:memoryview, src:np.ndarray): dest[:] = flat_mv(np.require(src, requirements='C').data)
diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index 71894bb1d0..0954d2f336 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -2,7 +2,8 @@ from __future__ import annotations
 import os, ctypes, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time
 from typing import Tuple, List, Any, cast
 from dataclasses import replace
-from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import LRUAllocator, BufferOptions
 from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod
 from tinygrad.renderer.cstyle import CUDARenderer
 from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes
diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py
index 2d7fe1b22d..393cb8e528 100644
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -5,7 +5,8 @@ from typing import Tuple, List, Optional, Any, Dict
 import pickle, base64, itertools, time, struct
 from tinygrad.dtype import DType, dtypes, ImageDType
 from tinygrad.helpers import all_same, getenv, flatten
-from tinygrad.device import Compiled, Allocator, Compiler, CompilerOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import Allocator
 from tinygrad.codegen.uops import UOpGraph, UOps
 from tinygrad.ops import BinaryOps, TernaryOps, exec_alu

diff --git a/tinygrad/runtime/ops_rhip.py b/tinygrad/runtime/ops_rhip.py
index 7bf252ba59..074509344f 100644
--- a/tinygrad/runtime/ops_rhip.py
+++ b/tinygrad/runtime/ops_rhip.py
@@ -1,5 +1,6 @@
 import ctypes
-from tinygrad.device import Compiled, MallocAllocator
+from tinygrad.device import Compiled
+from tinygrad.buffer import MallocAllocator
 from tinygrad.runtime.ops_hsa import HSACompiler

 rhip = ctypes.CDLL("/usr/local/lib/libremu.so")
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 3a062e3942..b64792e918 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -15,7 +15,8 @@ from tinygrad.ops import LoadOps, ScheduleItem
 from tinygrad.buffer import Buffer, BufferOptions
 from tinygrad.device import Device
 from tinygrad.shape.symbolic import sint, Variable, MulNode, Node
-from tinygrad.engine.realize import run_schedule, memory_planner
+from tinygrad.engine.realize import run_schedule
+from tinygrad.engine.memory import memory_planner
 from tinygrad.engine.schedule import create_schedule_with_vars

 # **** start with two base classes, Tensor and Function ****
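
Usage sketch (appended for illustration, not part of the patch): with the new module layout, the schedule -> memory_planner -> run_schedule flow used by examples/llm.c/export.py and tinygrad/tensor.py looks roughly like the Python below. The tensor expression and the exact create_schedule call are assumptions for the example, not code taken from this change.

# memory_planner now comes from tinygrad.engine.memory instead of tinygrad.engine.realize
from tinygrad import Tensor
from tinygrad.engine.schedule import create_schedule
from tinygrad.engine.memory import memory_planner
from tinygrad.engine.realize import run_schedule

out = Tensor.ones(16) + Tensor.ones(16)
schedule = create_schedule([out.lazydata])  # build the ScheduleItems for the pending ops (call form assumed)
schedule = memory_planner(schedule)         # rewrite si.bufs so dead, unparented intermediates can be reused
run_schedule(schedule)                      # lower each ScheduleItem to an ExecItem and execute it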