diff --git a/extra/optimization/get_action_space.py b/extra/optimization/get_action_space.py
index ca3eaeb992..46c34cc1b1 100644
--- a/extra/optimization/get_action_space.py
+++ b/extra/optimization/get_action_space.py
@@ -1,15 +1,27 @@
 from tqdm import tqdm
 from extra.optimization.helpers import load_worlds, ast_str_to_lin
 from tinygrad.codegen.search import actions
+from tinygrad.codegen.linearizer import Linearizer
 
 if __name__ == "__main__":
   ast_strs = load_worlds(False, False, False)
   tactions = set()
   for ast_str in tqdm(ast_strs):
     lin = ast_str_to_lin(ast_str)
-    lin.hand_coded_optimizations()
+    if True or not lin.apply_tensor_cores(): lin.hand_coded_optimizations()
+    linr = Linearizer(lin.ast)
     for o in lin.applied_opts:
       assert o in actions
       tactions.add(o)
-  print(len(tactions))
+      linr.apply_opt(o)
+
+    assert len(lin.sts) == len(linr.sts)
+    for st1,st2 in zip(lin.sts, linr.sts):
+      assert st1 == st2
+
+    #lin.linearize()
+    #linr.linearize()
+    +assert lin.uops == linr.uops
+
+  print(len(tactions), len(actions))
   print(sorted(list(tactions)))
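The interesting change here is the round-trip check: after `hand_coded_optimizations()` records its transformations in `applied_opts`, the script replays them through `apply_opt` on a fresh `Linearizer` and asserts the shapetrackers match. A minimal sketch of that pattern, assuming tinygrad at this revision (`ast_str_to_lin`, `applied_opts`, and `sts` as used in the hunk above):

```python
from extra.optimization.helpers import load_worlds, ast_str_to_lin
from tinygrad.codegen.linearizer import Linearizer

ast_strs = load_worlds(False, False, False)
lin = ast_str_to_lin(ast_strs[0])
lin.hand_coded_optimizations()                 # records every Opt in lin.applied_opts

linr = Linearizer(lin.ast)                     # fresh, unoptimized kernel on the same AST
for o in lin.applied_opts: linr.apply_opt(o)   # replay the recorded history

# if apply_opt is a faithful replay, the shapetrackers must agree
assert len(lin.sts) == len(linr.sts)
assert all(st1 == st2 for st1, st2 in zip(lin.sts, linr.sts))
```

The commented-out `uops` comparison suggests a stricter check (identical generated code, not just identical shapetrackers) was left for later.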
diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py
index 34da256635..28cac58fb6 100644
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -19,7 +19,7 @@ def kopt_search_hook(k, create_k, to_prg, baseline, bufs, var_vals):
   def check_opt(x):
     try:
       k = create_k()
-      k.apply_auto_opt(x)
+      for o in x: k.apply_opt(o)
       prg = to_prg(k)
       first_tm = prg.exec(bufs, var_vals, force_wait=True, optimizing=True)
       np.testing.assert_allclose(wanna_output, bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index fabe4be95e..f492dda9db 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -109,111 +109,6 @@ def helper_realized_ast(r:Tensor):
   output_buffer = Device[s[-1].out.device].buffer(prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args()) # allocate an output buffer
   return s[-1].ast, [output_buffer] + [l.realized for l in s[-1].inputs]
 
-def helper_linearizer_opt(r:Tensor, opts=[]):
-  wanna_output = None
-  realized_ast, real_bufs = helper_realized_ast(r)
-
-  def check_opt(x, create_k, to_prg):
-    k = create_k()
-    k.apply_auto_opt(x)
-    prg = to_prg(k)
-    real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
-    prg.exec(real_bufs, force_wait=True)
-    np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
-
-  # Get baseline, which is not optimized at all.
-  k = Linearizer(realized_ast)
-  prg = Device[Device.DEFAULT].to_program(k)
-  prg.exec(real_bufs, force_wait=True)
-  wanna_output = real_bufs[0].toCPU().copy()
-
-  # Check correctness of handcoded optimiztions.
-  k = Linearizer(realized_ast)
-  k.hand_coded_optimizations()
-  prg = Device[Device.DEFAULT].to_program(k)
-  real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
-  prg.exec(real_bufs, force_wait=True)
-  np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
-  for x in opts: # Check custom transformations if any.
-    check_opt(x, lambda: Linearizer(realized_ast), Device[Device.DEFAULT].to_program)
-
-class TestLinearizerOpts(unittest.TestCase):
-  def test_local_and_grouped_reduce(self):
-    if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local:
-      self.skipTest("Only Compiled uses linearizer with locals")
-
-    N = 128
-    Tensor.manual_seed(1882)
-    a = Tensor.rand(4, 4, N, N)
-    b = Tensor.rand(4, 4, N)
-    r = (b.sqrt() + ((a+1).sum(axis=3).exp()))
-    helper_linearizer_opt(r, [
-      [(0, 2, 'L')], [(0, 8, 'L')], [(0, 16, 'L')], # Checking how it works with locals
-      [(0, 2, 'G')], [(0, 32, 'G')], [(0, 64, 'G')], # Checking how it works with grouped reduce
-      [(0, 2, 'L'), (0, 2, 'G')], [(0, 16, 'L'), (0, 16, 'G')], [(0, 32, 'L'), (0, 2, 'G')], [(0, 2, 'L'), (0, 64, 'G')], # Checking how it works with locals + grouped reduce
-      [(0, 2, 'L'), (0, 2, 'G'), (0, 8, 'U'), (0, 4, 'R')], # Checking how it works with locals + grouped reduce + upcasts
-    ])
-
-  def test_upcasts(self):
-    if not isinstance(Device[Device.DEFAULT], Compiled):
-      self.skipTest("Only Compiled uses linearizer")
-
-    N = 16
-    Tensor.manual_seed(1772)
-    a = Tensor.rand(N, N)
-    b = Tensor.rand(N, N)
-    r = (a+b).sqrt() * ((a+1).exp())
-    helper_linearizer_opt(r, [
-      [(0, 2, 'U')], [(0, 4, 'U')], [(0, 8, 'U')], # Checking how it works with upcasts
-    ])
-
-  def test_full_upcast(self):
-    if not isinstance(Device[Device.DEFAULT], Compiled):
-      self.skipTest("Only Compiled uses linearizer")
-
-    Tensor.manual_seed(1772)
-    a = Tensor.rand(4)
-    b = Tensor.rand(4)
-    r = (a+b).sqrt() * ((a+1).exp())
-    helper_linearizer_opt(r, [
-      [(0, 4, 'U')], # Checking how it works with upcasts
-    ])
-
-  def test_matmul(self):
-    if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local:
-      self.skipTest("Only Compiled uses linearizer with locals")
-
-    N = 128
-    Tensor.manual_seed(1552)
-    a = Tensor.rand(N, N)
-    b = Tensor.rand(N, N)
-    r = a@b
-    helper_linearizer_opt(r, [
-      [(0, 2, 'U')], [(0, 4, 'U'), (1, 4, 'U')], # Checking how it works with upcasts
-      [(0, 2, 'L')], [(1, 32, 'L')], [(0, 4, 'L'), (1, 4, 'L')], [(0, 4, 'L'), (1, 32, 'L')], [(0, 16, 'L'), (1, 8, 'L')], # Checking how it works with locals
-      [(0, 2, 'G')], [(0, 32, 'G')], [(0, 32, 'G'), (0, 4, 'R')], # Checking how it works with grouped_reduce
-      [(0, 2, 'L'), (1, 2, 'L'), (0, 32, 'G')], [(0, 16, 'L'), (0, 32, 'G')], [(0, 16, 'L'), (0, 8, 'L'), (0, 4, 'G')], # Checking how it works with local+grouped_reduce
-      [(0, 4, 'L'), (0, 4, 'L'), (0, 16, 'G'), (0, 4, 'R'), (0, 4, 'U'), (1, 2, 'U')], # Checking all together
-      [(0, 4, 'L'), (0, 4, 'L'), (0, 16, 'G'), (0, 4, 'R'), (0, 8, 'U')], # Full global upcast + local
-    ])
-
-  def test_double_reduce(self):
-    if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local:
-      self.skipTest("Only Compiled uses linearizer with locals")
-
-    N = 128
-    Tensor.manual_seed(1552)
-    a = Tensor.rand(8, N, 8, N)
-    r = a.sum(axis=(1,3))
-    helper_linearizer_opt(r, [
-      [(0, 2, 'G')], [(0, 32, 'G')], [(1, 2, 'G')], [(1, 32, 'G')], # Checking how it works with 1 grouped_reduce.
-      [(0, 2, 'G'), (1, 2, 'G')], [(0, 16, 'G'), (1, 2, 'G')], [(0, 4, 'G'), (1, 64, 'G')], # Checking how it works with 2 grouped_reduces.
-      [(0, 16, 'G'), (1, 2, 'G'), (1, 4, 'R')], [(0, 2, 'G'), (1, 32, 'G'), (1, 4, 'R')], # Checking how it works with 2 grouped_reduces + upcasts.
-      [(0, 4, 'L'), (1, 4, 'L'), (0, 8, 'G'), (1, 4, 'G')], [(0, 4, 'L'), (1, 4, 'L'), (0, 2, 'G'), (1, 32, 'G'), (1, 4, 'R')], # Checking how it works with 2 grouped_reduces + upcasts + locals.
-      [(0, 2, 'L'), (1, 2, 'L'), (0, 8, 'G'), (1, 4, 'G'), (0, 2, 'U')], [(0, 2, 'L'), (1, 2, 'L'), (0, 8, 'G'), (1, 4, 'G'), (0, 2, 'U'), (0, 4, 'R'), (1, 4, 'R')], # Checking how it works with 2 grouped_reduces + upcasts + locals.
-      [(0, 4, 'L'), (1, 4, 'L'), (0, 8, 'G'), (1, 4, 'G'), (0, 2, 'U'), (1, 2, 'U')], # No globals
-    ])
-
 class TestFloat4(unittest.TestCase):
   def setUp(self):
     if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.supports_float4:
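Both here and in `kopt.py` below, the removed `apply_auto_opt(x)` is replaced by applying a sequence of `Opt` objects one at a time, and the `TestLinearizerOpts` suite is deleted wholesale because its cases are written in the old `(axis, amt, typ)` tuple format. A sketch of the replacement pattern, with the tuple-to-`Opt` mapping inferred from the `kopt.py` hunk below (`apply_choice` is a hypothetical helper; `k` is any kernel exposing `apply_opt`):

```python
from tinygrad.codegen.optimizer import Opt, OptOps

def apply_choice(k, choice):
  # drop-in for the removed k.apply_auto_opt(choice): a candidate is now a
  # sequence of Opt objects, applied (and recorded in k.applied_opts) one at a time
  for o in choice: k.apply_opt(o)

# e.g. the old [(0, 16, 'L'), (0, 4, 'R')] becomes:
# apply_choice(k, [Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.UNROLL, 0, 4)])
```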
diff --git a/tinygrad/codegen/optimizer.py b/tinygrad/codegen/optimizer.py
index 7afe173a84..ec18bcd1ba 100644
--- a/tinygrad/codegen/optimizer.py
+++ b/tinygrad/codegen/optimizer.py
@@ -300,26 +300,6 @@ class OptimizedKernel(Kernel):
         return True
     return False
 
-  def apply_auto_opt(self, x):
-    for axis, amt, typ in x:
-      if axis is None or amt == 1: continue
-      if typ == "G" and self.opts.has_shared:
-        assert self.full_shape[self.first_reduce+axis+len(self.group_for_reduce)] % amt == 0, "no longer valid shift"
-        self.shift_to(self.first_reduce+axis+len(self.group_for_reduce), amt, top=True, insert_before=self.first_reduce+len(self.group_for_reduce))
-        self.group_for_reduce.append(amt)
-      if typ == "R":
-        typ = "U"
-        axis += self.first_reduce + len(self.group_for_reduce)
-      if typ == "U":
-        assert self.full_shape[axis] % amt == 0, "no longer valid shift"
-        self.shift_to(axis, amt)
-        self.upcast()
-      elif typ == "L":
-        assert self.full_shape[axis] % amt == 0, "no longer valid shift"
-        self.shift_to(axis, amt, insert_before=self.first_reduce)
-        self.local_dims += 1
-    self.simplify_ones()
-
   def apply_opt(self, opt:Opt):
     self.applied_opts.append(opt)
     axis = opt.axis + (self.first_reduce if opt.op == OptOps.UNROLL else 0)
@@ -350,8 +330,10 @@ class OptimizedKernel(Kernel):
       if (not early_only or buf in self.earlybufs) and self.bufs[buf_index].dtype.__class__ is ImageDType:
         assert len(unit_stride_axes_mul_4) >= 1, f"needs a unit stride axis in {self.bufs[buf_index]}"
         if all(x < (self.shape_len-self.upcasted) for x in unit_stride_axes_mul_4) and unit_stride_axes_mul_4[0] not in self.upcast_in_mid_reduce_axes:
-          if unit_stride_axes_mul_4[0] < self.first_reduce: self.apply_opt(Opt(OptOps.UPCAST, unit_stride_axes_mul_4[0], 4))
-          else: self.apply_opt(Opt(OptOps.UNROLL, unit_stride_axes_mul_4[0]-self.first_reduce, 4))
+          if unit_stride_axes_mul_4[0] < self.first_reduce:
+            self.apply_opt(Opt(OptOps.UPCAST, unit_stride_axes_mul_4[0], 4))
+          else:
+            self.apply_opt(Opt(OptOps.UNROLL, unit_stride_axes_mul_4[0]-self.first_reduce, 4))
 
   def hand_coded_optimizations(self):
     # if there's images in the earlybufs, we have to make an axis the 4 loading one
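With `apply_auto_opt` gone, `apply_opt` is the single entry point for kernel transformations, and since it appends to `applied_opts`, every optimized kernel carries a replayable history (which `get_action_space.py` above now exercises). The correspondence between the removed tuple codes and `Opt`, inferred from the removed body and the `kopt.py` hunk below:

```python
from tinygrad.codegen.optimizer import Opt, OptOps

# inferred mapping; the 'G' (grouped reduce) code has no Opt equivalent in this diff:
#   (axis, amt, 'U')  ->  Opt(OptOps.UPCAST, axis, amt)   # upcast a global axis
#   (axis, amt, 'L')  ->  Opt(OptOps.LOCAL,  axis, amt)   # split off a local axis
#   (axis, amt, 'R')  ->  Opt(OptOps.UNROLL, axis, amt)   # unroll a reduce axis
# apply_opt offsets opt.axis by first_reduce for UNROLL, so UNROLL axes are
# numbered relative to the first reduce axis, much as 'R' axes were
example = [Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.UNROLL, 0, 4)]
```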
diff --git a/tinygrad/features/kopt.py b/tinygrad/features/kopt.py
index e6eb4f3ebd..b63bacc08a 100644
--- a/tinygrad/features/kopt.py
+++ b/tinygrad/features/kopt.py
@@ -1,6 +1,7 @@
 from typing import Callable
 import time
 from tinygrad.codegen.linearizer import Linearizer
+from tinygrad.codegen.optimizer import Opt, OptOps
 from tinygrad.helpers import DEBUG, prod, getenv
 from tinygrad.lazy import vars_from_ast
 
@@ -15,11 +16,10 @@ def kernel_optimize_opts(k:Linearizer):
   for i in range(k.first_reduce):
     # TODO: the upcast always happen first, you might want to reverse this?
     # TODO: the order of the locals might improve things too
-    opts.append(ng.p.TransitionChoice([(i,s,"U") for s in get_divisors(k.full_shape[i], max_div=8)]))
-    opts.append(ng.p.TransitionChoice([(i,s,"L") for s in get_divisors(k.full_shape[i], min_div=4)]))
+    opts.append(ng.p.TransitionChoice([Opt(OptOps.UPCAST,i,s) for s in get_divisors(k.full_shape[i], max_div=8)]))
+    opts.append(ng.p.TransitionChoice([Opt(OptOps.LOCAL,i,s) for s in get_divisors(k.full_shape[i], min_div=4)]))
   for i in range(k.shape_len-k.first_reduce):
-    opts.append(ng.p.TransitionChoice([(i,s,"R") for s in get_divisors(k.full_shape[k.first_reduce+i], max_div=8)]))
-    opts.append(ng.p.TransitionChoice([(i,s,"G") for s in get_divisors(k.full_shape[k.first_reduce+i], min_div=4) if all(st.shape[k.first_reduce+i] % s == 0 or st.shape[k.first_reduce+i] == 1 for st in k.sts)]))
+    opts.append(ng.p.TransitionChoice([Opt(OptOps.UNROLL,i,s) for s in get_divisors(k.full_shape[k.first_reduce+i], max_div=8)]))
   return opts
 
 def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_prg, baseline, bufs, var_vals):
@@ -27,7 +27,7 @@ def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_p
   def opt(x):
     try:
       k = create_k()
-      k.apply_auto_opt(x)
+      for o in x: k.apply_opt(o)
       prg = to_prg(k)
       first_tm = prg.exec(bufs, var_vals, force_wait=True, optimizing=True)
       if baseline*5 < first_tm*1000: return first_tm*1000 # very slow
@@ -78,5 +78,7 @@ def kernel_optimize(k:Linearizer, create_k:Callable[[], Linearizer], to_prg, buf
         global_db[skey] = choice
         global_db.sync()
 
-  if choice == "BASELINE": k.hand_coded_optimizations()
-  else: k.apply_auto_opt(choice)
\ No newline at end of file
+  if choice == "BASELINE":
+    k.hand_coded_optimizations()
+  else:
+    for o in choice: k.apply_opt(o)
\ No newline at end of file
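The nevergrad search space now samples `Opt` objects directly, so a candidate comes back in applyable form. Note that the grouped-reduce (`'G'`) choices are dropped from the reduce-axis loop, narrowing the search to `UPCAST`/`LOCAL` on global axes and `UNROLL` on reduce axes. A minimal sketch of how a candidate is consumed, mirroring the `opt(x)` closure above (`try_candidate` is a hypothetical name):

```python
from typing import Callable, Sequence
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.codegen.optimizer import Opt

def try_candidate(create_k: Callable[[], Linearizer], to_prg, x: Sequence[Opt]):
  # build a fresh kernel, replay the candidate's Opts, and compile; apply_opt
  # asserts on an invalid shift, which kernel_optimize_search's try/except absorbs
  k = create_k()
  for o in x: k.apply_opt(o)
  return to_prg(k)
```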