diff --git a/docs/abstractions2.py b/docs/abstractions2.py
index 5f3bc75499..c778b79c5b 100644
--- a/docs/abstractions2.py
+++ b/docs/abstractions2.py
@@ -37,9 +37,10 @@ print("******** second, the Device ***********")
 DEVICE = "CLANG"   # NOTE: you can change this!
 
 import struct
-from tinygrad.dtype import dtypes
+from tinygrad.dtype import PtrDType, dtypes
 from tinygrad.device import Buffer, Device
-from tinygrad.ops import LazyOp, BufferOps, MemBuffer, BinaryOps, MetaOps
+from tinygrad.ops import BinaryOps, MetaOps
+from tinygrad.codegen.uops import UOp, UOps
 from tinygrad.shape.shapetracker import ShapeTracker
 
 # allocate some buffers + load in values
@@ -49,15 +50,19 @@ b = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struc
 # NOTE: a._buf is the same as the return from MallocAllocator.alloc
 
 # describe the computation
-ld_1 = LazyOp(BufferOps.LOAD, (), MemBuffer(1, dtypes.int32, ShapeTracker.from_shape((1,))))
-ld_2 = LazyOp(BufferOps.LOAD, (), MemBuffer(2, dtypes.int32, ShapeTracker.from_shape((1,))))
-alu = LazyOp(BinaryOps.ADD, (ld_1, ld_2))
-st_0 = LazyOp(BufferOps.STORE, (alu,), MemBuffer(0, dtypes.int32, ShapeTracker.from_shape((1,))))
-k = LazyOp(MetaOps.KERNEL, (st_0,))
+buf_1 = UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.int32), (), 1)  # global buffer #1, typed as a pointer to int32
+buf_2 = UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.int32), (), 2)  # global buffer #2, typed as a pointer to int32
+ld_1 = UOp(UOps.LOAD, dtypes.int32, (buf_1, *UOp.from_st(ShapeTracker.from_shape((1,)))))  # load one int32 from buf_1
+ld_2 = UOp(UOps.LOAD, dtypes.int32, (buf_2, *UOp.from_st(ShapeTracker.from_shape((1,)))))  # load one int32 from buf_2
+alu = ld_1 + ld_2  # add the two loaded values
+output_buf = UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.int32), (), 0)  # global buffer #0 receives the result
+idx, valid = UOp.from_st(ShapeTracker.from_shape((1,)))  # from_st yields the two UOps unpacked here as idx and valid
+st_0 = UOp(UOps.STORE, None, (output_buf, idx, alu, valid))  # store alu into output_buf; STORE is built with dtype None
+s = UOp(UOps.SINK, None, (st_0,))  # SINK wraps the store; this is the AST handed to get_kernel below
 
 # convert the computation to a "linearized" format (print the format)
 from tinygrad.engine.realize import get_kernel, CompiledRunner
-kernel = get_kernel(Device[DEVICE].renderer, k).linearize()
+kernel = get_kernel(Device[DEVICE].renderer, s).linearize()
 kernel.uops.print()
 
 # compile a program (and print the source)
diff --git a/examples/handcode_opt.py b/examples/handcode_opt.py
index b368a3305b..ebd7182a19 100644
--- a/examples/handcode_opt.py
+++ b/examples/handcode_opt.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Tuple
 from extra.models.resnet import ResNet50
 from extra.mcts_search import mcts_search
 from examples.mlperf.helpers import get_mlperf_bert_model
@@ -83,7 +83,7 @@ if __name__ == "__main__":
     rawbufs = bufs_from_lin(Kernel(si.ast))
 
     # "linearize" the op into uops in different ways
-    lins:List[Kernel] = []
+    lins: List[Tuple[Kernel, str]] = []  # (kernel, name) pairs, unpacked as (lin, nm) when benchmarking below
 
     # always try hand coded opt
     lin = Kernel(si.ast, opts=device.renderer)
@@ -109,10 +109,10 @@ if __name__ == "__main__":
 
     # benchmark the programs
     choices = []
-    for (lin, nm) in lins:
+    for lin, nm in lins:
       tm = time_linearizer(lin, rawbufs, allow_test_size=False, cnt=10, disable_cache=True)
       ops = (prg:=lin.to_program()).op_estimate
-      gflops = sym_infer(ops, {k:k.min for k in lin.ast.vars()})*1e-9/tm
+      gflops = sym_infer(ops, {k:k.min for k in lin.ast.variables()})*1e-9/tm
       choices.append((tm, gflops, lin, prg, nm))
 
     sorted_choices = sorted(choices, key=lambda x: x[0])
diff --git a/extra/mcts_search.py b/extra/mcts_search.py
index f92d802714..8ad6d86d29 100644
--- a/extra/mcts_search.py
+++ b/extra/mcts_search.py
@@ -7,7 +7,6 @@ from tinygrad.helpers import DEBUG, getenv, CACHELEVEL, diskcache_get, diskcache
 from tinygrad.codegen.kernel import Kernel
 from tinygrad.device import Buffer, Device, CompileError
 from tinygrad.engine.search import _ensure_buffer_alloc, get_kernel_actions, _time_program
-from tinygrad.ops import LazyOp
 
 class MCTSNode:
   def __init__(self, kernel:Kernel, parent=None):
@@ -87,14 +86,14 @@ def mcts_search(lin:Kernel, rawbufs:List[Buffer], amt:int) -> Kernel:
     return ret
 
   rawbufs = _ensure_buffer_alloc(rawbufs)
-  var_vals = {k:(k.max+k.min)//2 for k in lin.ast.vars()}
+  var_vals = {k:(k.max+k.min)//2 for k in lin.ast.variables()}
   dev = Device[lin.opts.device]
   root = MCTSNode(lin)
 
   st = time.perf_counter()
   best, best_idx, best_tm = lin, 0, math.inf
   seen_libs: Dict[bytes, MCTSNode] = {}
-  seen_asts: Dict[LazyOp, MCTSNode] = {}
+  seen_asts: Dict[bytes, MCTSNode] = {}  # keyed by the optimized AST's .key, used to detect duplicate optimized ASTs
   compile_time, runtime_time = 0.0, 0.0
   for i in range(amt):
     node = sample_tree(root, best_tm)  # sample and expand
@@ -102,12 +101,12 @@ def mcts_search(lin:Kernel, rawbufs:List[Buffer], amt:int) -> Kernel:
     node.i = i  # when was node explored
 
     opt_ast = node.kernel.get_optimized_ast()
-    if (sibling_node:=seen_asts.get(opt_ast, None)) is not None:
+    if (sibling_node:=seen_asts.get(opt_ast.key, None)) is not None:
       # early check for same optimized AST hit
       remove_node(node)
       tm = sibling_node.t
     else:
-      seen_asts[opt_ast] = node
+      seen_asts[opt_ast.key] = node
 
       # lowering (50% of the time)
       p = node.kernel.to_program(name_override="test")
diff --git a/extra/to_movement_ops.py b/extra/to_movement_ops.py
index 623c4e9f07..f69cb8d585 100644
--- a/extra/to_movement_ops.py
+++ b/extra/to_movement_ops.py
@@ -3,7 +3,7 @@ from enum import Enum, auto
 from collections import defaultdict
 from typing import List, Tuple, DefaultDict
 from extra.optimization.helpers import load_worlds, ast_str_to_ast
-from tinygrad.ops import BufferOps, LazyOp
+from extra.ops import BufferOps, LazyOp
 from tinygrad.helpers import prod, tqdm
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.symbolic import sym_infer, Node
@@ -145,4 +145,4 @@ if __name__ == "__main__":
   for ast_str in tqdm(ast_strs):
     test_rebuild_bufferop_st(ast_str_to_ast(ast_str))
 
-  print(f"avg length of mop = {sum(k*v for k,v in c.items()) / sum(c.values()):.2f}")
\ No newline at end of file
+  print(f"avg length of mop = {sum(k*v for k,v in c.items()) / sum(c.values()):.2f}")
diff --git a/test/external/fuzz_linearizer.py b/test/external/fuzz_linearizer.py
index 2426319463..2a869598b1 100644
--- a/test/external/fuzz_linearizer.py
+++ b/test/external/fuzz_linearizer.py
@@ -86,7 +86,7 @@ def compare_linearizer(lin: Kernel, rawbufs=None, var_vals=None, ground_truth=No
 
   if var_vals is None:
     # TODO: handle symbolic max case
-    var_vals = {v: random.randint(v.min, v.max if isinstance(v.max, int) else v.min) for v in lin.ast.vars()}
+    var_vals = {v: random.randint(v.min, v.max if isinstance(v.max, int) else v.min) for v in lin.ast.variables()}
 
   if ground_truth is None and not has_bf16:
     unoptimized = Kernel(lin.ast)
diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py
index 685091fcd3..fa6bc92bc6 100644
--- a/tinygrad/codegen/kernel.py
+++ b/tinygrad/codegen/kernel.py
@@ -71,7 +71,7 @@ class Kernel:
     def ordered_parents(op:UOp) -> List[UOp]: return dedup([item for x in op.src for item in ordered_parents(x)] + [op])
     self.reduceops = dedup([x for x in ordered_parents(self.ast) if x.op is UOps.REDUCE_AXIS])
 
-    self.vars: List[Variable] = dedup([x.arg for x in self.ast.vars()])
+    self.vars: List[Variable] = self.ast.variables()
     self.bufs: List[UOp] = [x for x in self.ast.parents if x.op in BUFFER_UOPS]
 
     # get earlybufs, before any reduceops
@@ -481,7 +481,7 @@ class Kernel:
       self.tensor_core_opts.fix_axes(axis) # fix up axes in TC opts if required after simplify_ones()
 
   def required_optimizations(self) -> Kernel:
-    if self.bufs[0].dtype.__class__ is ImageDType:
+    if isinstance(self.membufs[0].dtype, ImageDType):
       unit_stride_axes_mul_4 = [i for i in self.sts[0].unit_stride_axes(ignore_valid=True) if self.sts[0].shape[i]%4 == 0]
       assert len(unit_stride_axes_mul_4) >= 1, f"needs a unit stride axis in {self.bufs[0]}"
       if len(unit_stride_axes_mul_4) and all(x < self.first_upcast for x in unit_stride_axes_mul_4) and unit_stride_axes_mul_4[0] not in self.upcast_in_mid_reduce_axes:  # noqa: E501
@@ -692,7 +692,7 @@ class Kernel:
                               for i,s in enumerate(self.full_shape))
               srcs = []
               for i,(src,fix_st_fxn) in enumerate(zip(rsrc.src, [fix_st1, fix_st2])):
-                st_load = [self.sts[self.bufs.index(op)].real_strides() for op in src.parents if op.op is UOps.LOAD]
+                st_load = [self.sts[self.bufs.index(op)].real_strides() for op in rsrc.parents if op.op is UOps.LOAD]
                 local_shape = tuple(s if max(cast(int, x[i]) for x in st_load) != 0 else 1 for i,s in enumerate(ex_shape))
                 idx, valid = UOp.from_st(ShapeTracker.from_shape(local_shape).expand(ex_shape))
                 membuf = UOp(UOps.DEFINE_LOCAL, PtrDType(tc.dtype_in), (), (f"temp{-(-1-i)}", idx.arg.real_size()))
diff --git a/tinygrad/codegen/lowerer.py b/tinygrad/codegen/lowerer.py
index 45855668e0..eeba83aa6f 100644
--- a/tinygrad/codegen/lowerer.py
+++ b/tinygrad/codegen/lowerer.py
@@ -161,21 +161,21 @@ class IndependentLowerer:
 
   def _to_uop(self, x:UOp) -> UOp:
     if x.op in BUFFER_UOPS:
-      idx, valid = st_to_uops(x.src[-1].arg, self.ridxs if x.op is UOps.LOAD and x.src[0].op is UOps.DEFINE_LOCAL else self.idxs, cast(DType,x.dtype))
+      idx, valid = st_to_uops(x.src[-1].arg, self.ridxs if x.op is UOps.LOAD and x.src[0].op is UOps.DEFINE_LOCAL else self.idxs,
+        cast(DType, x.dtype if x.op is UOps.CONST else x.src[0].dtype))  # CONST passes its own dtype; other buffer ops pass x.src[0]'s dtype
       # TODO: check has_valid in UPat, not here
       has_valid = valid.op is not UOps.CONST or valid.arg is not True
       if x.op is UOps.CONST: return valid.where(UOp.const(x.dtype, x.arg), UOp.const(x.dtype, 0))
       buf = x.src[0]
       if x.op is UOps.LOAD:
         barrier = (UOp(UOps.BARRIER, None, (self.to_uop(x.src[1]),)),) if x.src[0].op is UOps.DEFINE_LOCAL else ()
-        load_dtype = cast(DType,x.dtype).scalar()
         if idx.dtype == dtypes.int.vec(3):
           # this should all simplify if there's consts for id4. if not, w/e
           idx, id4 = UOp(UOps.VECTORIZE, dtypes.int.vec(2), (idx.src[0], idx.src[1])), idx.src[2]
-          vec_load = UOp(UOps.LOAD, load_dtype.vec(4), (buf, idx) + ((UOp.const(load_dtype.vec(4), 0), valid) if has_valid else ()) + barrier)
-          return functools.reduce(lambda ret, i: id4.ne(i).where(ret, UOp(UOps.GEP, load_dtype, (vec_load,), i)),
-                                  range(4), UOp.const(load_dtype, float('nan')))
-        return UOp(UOps.LOAD, load_dtype, (buf, idx) + ((UOp.const(load_dtype, 0), valid) if has_valid else ()) + barrier)
+          vec_load = UOp(UOps.LOAD, dt:=cast(DType, x.dtype).vec(4), (buf, idx) + ((UOp.const(dt, 0), valid) if has_valid else ()) + barrier)
+          return functools.reduce(lambda ret, i: id4.ne(i).where(ret, UOp(UOps.GEP, x.dtype, (vec_load,), i)),
+                                  range(4), UOp.const(x.dtype, float('nan')))
+        return UOp(UOps.LOAD, x.dtype, (buf, idx) + ((UOp.const(x.dtype, 0), valid) if has_valid else ()) + barrier)
       # NOTE: only store the local reduceop in the first thread (this is wrong for non group for reduces!)
       if x.src[0].op is UOps.DEFINE_GLOBAL:
         for oidx, ridx in zip(self.idxs, self.ridxs):
diff --git a/tinygrad/codegen/uops.py b/tinygrad/codegen/uops.py
index 99169c2a6f..9806a63111 100644
--- a/tinygrad/codegen/uops.py
+++ b/tinygrad/codegen/uops.py
@@ -105,6 +105,9 @@ class UOp:
     # NOTE: UOps.DEFINE_GLOBAL and UOps.DEFINE_LOCAL don't have shape
     return tuple(max(x) for x in zip(*[x.full_shape for x in self.src if x.op not in {UOps.DEFINE_GLOBAL, UOps.DEFINE_LOCAL}]))
   def vars(self) -> Set[UOp]: return set([x for x in self.sparents if x.op is UOps.DEFINE_VAR])
+  def variables(self) -> List[Variable]:  # Variables from buffer ops' src[-1].arg.vars() and DEFINE_VAR args, deduped, sorted by .expr
+    st_vars: List[Set[Variable]] = [x.src[-1].arg.vars() for x in self.sparents if x.op in BUFFER_UOPS]
+    return sorted(set.union(*st_vars, set([x.arg for x in self.sparents if x.op is UOps.DEFINE_VAR])), key=lambda v: v.expr)
   def const_factor(self) -> int:
     """largest known int that divides self"""
     if self.op is UOps.CONST: return self.arg
diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 2ce9e8c119..e3df45db87 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -96,8 +96,8 @@ def _recursive_uop(buf:LazyBuffer, st:ShapeTracker, outputs:Tuple[LazyBuffer, ..
   if buf.op in {MetaOps.CONTIGUOUS, MetaOps.ASSIGN}:
     assert buf in outputs, f"{buf.op} must be writable"
     return in_ops[0]
-  if buf.op is UnaryOps.CAST: return cache.setdefault((buf, st), UOp(UOps.CAST, buf.arg.scalar(), in_ops))
-  if buf.op is UnaryOps.BITCAST: return cache.setdefault((buf, st), UOp(UOps.BITCAST, buf.arg.scalar(), in_ops))
+  if buf.op is UnaryOps.CAST: return cache.setdefault((buf, st), UOp(UOps.CAST, dtype, in_ops))
+  if buf.op is UnaryOps.BITCAST: return cache.setdefault((buf, st), UOp(UOps.BITCAST, dtype, in_ops))
   return cache.setdefault((buf, st), UOp(UOps.ALU, dtype, in_ops, buf.op))
 
 def _permute_reduce(input_st:ShapeTracker, axis:Tuple[int, ...]) -> Tuple[ShapeTracker, Tuple[sint, ...]]:
diff --git a/tinygrad/engine/search.py b/tinygrad/engine/search.py
index 5ef01becee..1aff8a2c86 100644
--- a/tinygrad/engine/search.py
+++ b/tinygrad/engine/search.py
@@ -93,7 +93,7 @@ def bufs_from_lin(lin:Kernel, allocate:bool=True) -> List[Buffer]:
   bufsts: DefaultDict[int, List[UOp]] = defaultdict(list)
   for x in lin.bufs:
     if x.src[0].op is UOps.DEFINE_GLOBAL: bufsts[x.src[0].arg].append(x)
-  rawbufs:List[Optional[Buffer]] = [None]*len(bufsts)
+  rawbufs: List[Optional[Buffer]] = [None]*len(bufsts)
   for k,lx in bufsts.items():
     buf_size = prod(dtype.shape) if isinstance(dtype:=cast(DType,lx[0].src[0].dtype), ImageDType) else max(y.src[-1].arg.real_size() for y in lx)
     if buf_size == 0: buf_size = 1  # create a size 1 buffer if no cell is accessed in kernel. # TODO: remove from kernel input in this case.
@@ -141,7 +141,7 @@ def beam_search(lin:Kernel, rawbufs:List[Buffer], amt:int, allow_test_size=True,
 
   try:
     rawbufs = _ensure_buffer_alloc(rawbufs)
-    var_vals: Dict[Variable, int] = {k.arg:(k.arg.max+k.arg.min)//2 for k in lin.ast.vars()}
+    var_vals: Dict[Variable, int] = {k:(k.max+k.min)//2 for k in lin.ast.variables()}
     exiting, st = False, time.perf_counter()
     dev = Device[lin.opts.device]
     while not exiting:
@@ -199,7 +199,7 @@ def time_linearizer(lin:Kernel, rawbufs:List[Buffer], allow_test_size=True, max_
   assert dev.compiler is not None
 
   rawbufs = _ensure_buffer_alloc(rawbufs)
-  var_vals: Dict[Variable, int] = {k.arg:(k.arg.max+k.arg.min)//2 for k in lin.ast.vars()}
+  var_vals: Dict[Variable, int] = {k:(k.max+k.min)//2 for k in lin.ast.variables()}
   p = lin.to_program()
   tms = _time_program(p, dev.compiler.compile(p.src), var_vals, rawbufs,
                       max_global_size=max_global_size if allow_test_size else None, clear_l2=clear_l2, cnt=cnt, name=to_function_name(lin.name))