From b47f6cebb232ebf31ba7e4c9472cf93d9ab743ea Mon Sep 17 00:00:00 2001
From: chenyu
Date: Thu, 28 Mar 2024 17:50:23 -0400
Subject: [PATCH] LinearizerOptions -> CompilerOptions (#3978)

---
 examples/handcode_resnet50_opt.py       |  6 ++--
 extra/backends/ops_hip.py               |  5 ++-
 extra/backends/ops_webgl.py             |  6 ++--
 extra/backends/ops_webgpu.py            |  5 ++-
 extra/gemm/tvm_gemm.py                  |  4 +--
 extra/optimization/search.py            |  2 +-
 test/external/speed_beam_v_hcopt.py     |  2 +-
 test/external/speed_compare_cuda_ptx.py |  4 +--
 test/test_linearizer.py                 | 48 ++++++++++++------------
 test/test_uops.py                       |  2 +-
 tinygrad/codegen/kernel.py              | 21 +++--------
 tinygrad/device.py                      | 24 +++++++++----
 tinygrad/runtime/ops_clang.py           |  5 ++-
 tinygrad/runtime/ops_cuda.py            | 11 +++---
 tinygrad/runtime/ops_gpu.py             |  5 ++-
 tinygrad/runtime/ops_hsa.py             |  5 ++-
 tinygrad/runtime/ops_llvm.py            |  5 ++-
 tinygrad/runtime/ops_metal.py           |  5 ++-
 tinygrad/runtime/ops_python.py          |  9 +++--
 19 files changed, 82 insertions(+), 92 deletions(-)

diff --git a/examples/handcode_resnet50_opt.py b/examples/handcode_resnet50_opt.py
index 212d3f02f1..bc3aac7b34 100644
--- a/examples/handcode_resnet50_opt.py
+++ b/examples/handcode_resnet50_opt.py
@@ -43,18 +43,18 @@ if __name__ == "__main__":
     lins:List[Linearizer] = []

     # always try hand coded opt
-    lin = Linearizer(*si.ast, opts=device.compiler.linearizer_opts)
+    lin = Linearizer(*si.ast, opts=device.compiler.compiler_opts)
     lin.hand_coded_optimizations()
     lins.append(lin)

     # maybe try tensor cores
-    lin = Linearizer(*si.ast, opts=device.compiler.linearizer_opts)
+    lin = Linearizer(*si.ast, opts=device.compiler.compiler_opts)
     if lin.apply_tensor_cores():
       lins.append(lin)

     # try a beam search
     if beam:=getenv("BEAM"):
-      lin = Linearizer(*si.ast, opts=device.compiler.linearizer_opts)
+      lin = Linearizer(*si.ast, opts=device.compiler.compiler_opts)
       lin = beam_search(lin, rawbufs, beam, bool(getenv("BEAM_ESTIMATE", 1)))
       lins.append(lin)
diff --git a/extra/backends/ops_hip.py b/extra/backends/ops_hip.py
index 774ed2a829..4c6a834211 100644
--- a/extra/backends/ops_hip.py
+++ b/extra/backends/ops_hip.py
@@ -4,13 +4,12 @@ from typing import Tuple, TypeVar, List, Any, cast, Set
 import tinygrad.runtime.autogen.hip as hip
 from tinygrad.helpers import DEBUG, getenv, init_c_var
 from tinygrad.helpers import from_mv, round_up, to_mv, colored, init_c_struct_t
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions, JITRunner, Device, Buffer, MallocAllocator, update_stats, Compiler
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, JITRunner, Device, Buffer, MallocAllocator, update_stats, Compiler, CompilerOptions
 from tinygrad.renderer.cstyle import HIPRenderer
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.runtime.driver.hip_comgr import compile_hip

 class HIPCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("HIP", has_tensor_cores=True, shared_max=65536)
+  compiler_opts = CompilerOptions("HIP", has_tensor_cores=True, shared_max=65536)
   def __init__(self, arch:str):
     self.arch = arch
     super().__init__(f"compile_hip_{self.arch}")
diff --git a/extra/backends/ops_webgl.py b/extra/backends/ops_webgl.py
index eef43d108b..93d42971a5 100644
--- a/extra/backends/ops_webgl.py
+++ b/extra/backends/ops_webgl.py
@@ -1,8 +1,8 @@
 import numpy as np
 import functools
 from tinygrad.dtype import dtypes, ImageDType
-from tinygrad.device import Compiled, Allocator
-from tinygrad.codegen.kernel import LinearizerOptions, OptOps
+from tinygrad.device import Compiled, Allocator, CompilerOptions
+from tinygrad.codegen.kernel import OptOps
 from tinygrad.renderer.cstyle import uops_to_cstyle
 from tinygrad.renderer.cstyle import GLSLLanguage
 import moderngl
@@ -47,6 +47,6 @@ class RawWebGLAllocator(Allocator):
 class WebGlDevice(Compiled):
   def __init__(self, device:str):
     super().__init__(RawWebGLAllocator(),
-      LinearizerOptions(device="WEBGL", global_max=[4096*4096,1,1], unsupported_opts=[OptOps.UPCAST, OptOps.UPCASTMID],
+      CompilerOptions(device="WEBGL", global_max=[4096*4096,1,1], unsupported_opts=[OptOps.UPCAST, OptOps.UPCASTMID],
       supports_float4=False, supports_float4_alu=False, has_local=False, has_shared=False, dont_use_locals=True),
       functools.partial(uops_to_cstyle, GLSLLanguage()), lambda x: x, WebGLProgram)
diff --git a/extra/backends/ops_webgpu.py b/extra/backends/ops_webgpu.py
index 70543f97bd..c88c8307dc 100644
--- a/extra/backends/ops_webgpu.py
+++ b/extra/backends/ops_webgpu.py
@@ -1,6 +1,5 @@
 from wgpu.utils.device import get_default_device
-from tinygrad.device import Compiled, Allocator
-from tinygrad.codegen.kernel import LinearizerOptions
+from tinygrad.device import Compiled, Allocator, CompilerOptions
 from tinygrad.renderer.cstyle import WGSLRenderer
 import wgpu
@@ -37,5 +36,5 @@ class WebGpuAllocator(Allocator):

 class WebGpuDevice(Compiled):
   def __init__(self, device:str):
-    super().__init__(WebGpuAllocator(), LinearizerOptions(device="WEBGPU", supports_float4=False, local_max=[256, 256, 64],
+    super().__init__(WebGpuAllocator(), CompilerOptions(device="WEBGPU", supports_float4=False, local_max=[256, 256, 64],
       global_max=[65535, 65535, 65535]), WGSLRenderer, lambda x: x, WebGPUProgram)
diff --git a/extra/gemm/tvm_gemm.py b/extra/gemm/tvm_gemm.py
index 5961354088..3b04c1dc58 100644
--- a/extra/gemm/tvm_gemm.py
+++ b/extra/gemm/tvm_gemm.py
@@ -39,8 +39,8 @@ C = (A.reshape(M, 1, K) * B.permute(1,0).reshape(1, N, K)).sum(axis=2)
 sched = create_schedule([C.lazydata])

 from tinygrad.codegen.linearizer import Linearizer
-from tinygrad.codegen.kernel import LinearizerOptions
-lin = Linearizer(sched[-1].ast, LinearizerOptions(has_local=False, supports_float4=False))
+from tinygrad.device import CompilerOptions
+lin = Linearizer(sched[-1].ast, CompilerOptions(has_local=False, supports_float4=False))
 #lin.hand_coded_optimizations()
 lin.linearize()

 from tinygrad.runtime.ops_clang import renderer
diff --git a/extra/optimization/search.py b/extra/optimization/search.py
index a6486c73f1..37f54f9883 100644
--- a/extra/optimization/search.py
+++ b/extra/optimization/search.py
@@ -25,7 +25,7 @@ if __name__ == '__main__':
   for ast_str in ast_strs:
     print(f"optimizing ast={ast_str}")
-    lin = ast_str_to_lin(ast_str, opts=device.compiler.linearizer_opts)
+    lin = ast_str_to_lin(ast_str, opts=device.compiler.compiler_opts)
     rawbufs = bufs_from_lin(lin)
     lin = beam_search(lin, rawbufs, getenv("BEAM", 8), bool(getenv("BEAM_ESTIMATE", 1)))
diff --git a/test/external/speed_beam_v_hcopt.py b/test/external/speed_beam_v_hcopt.py
index b34d5ed628..4676fbd1cf 100644
--- a/test/external/speed_beam_v_hcopt.py
+++ b/test/external/speed_beam_v_hcopt.py
@@ -15,7 +15,7 @@ if __name__ == "__main__":
   beam_won, tested = 0, 0
   for num, ast in enumerate(ast_strs[:test_n]):
-    def new_lin(): return ast_str_to_lin(ast, opts=dev.compiler.linearizer_opts)
+    def new_lin(): return ast_str_to_lin(ast, opts=dev.compiler.compiler_opts)
     k = new_lin()
     # k.required_optimizations()
diff --git a/test/external/speed_compare_cuda_ptx.py b/test/external/speed_compare_cuda_ptx.py
index 137072def1..a977f5b2a0 100644
--- a/test/external/speed_compare_cuda_ptx.py
+++ b/test/external/speed_compare_cuda_ptx.py
@@ -26,14 +26,14 @@ if __name__ == "__main__":
   average_tm_cuda, average_tm_ptx = 0, 0
   for num,ast in enumerate(ast_strs):
     # cuda compile
-    lin = ast_str_to_lin(ast, opts=dev.compiler.linearizer_opts)
+    lin = ast_str_to_lin(ast, opts=dev.compiler.compiler_opts)
     lin.hand_coded_optimizations()
     cuda_prg = dev.to_program(lin)
     bufs = bufs_from_lin(lin)

     # ptx compile
-    lin = ast_str_to_lin(ast, opts=ptx.linearizer_opts)
+    lin = ast_str_to_lin(ast, opts=ptx.compiler_opts)
     lin.hand_coded_optimizations()
     lin.linearize()
     ptx_src = ptx.render(to_function_name(lin.name), lin.uops)
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index 7e31bdb94d..cd9dc70019 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -97,7 +97,7 @@ class TestLinearizer(unittest.TestCase):
     assert num_ops <= 1, "more alu uops than needed"

   def test_reduce_upcast(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.supports_float4:
       self.skipTest("device does not support upcast")
     x, w = Tensor.randn((1,1,3)).realize(), Tensor.randn((1,1,2)).realize()
     r = Tensor.conv2d(x,w,padding=1).relu()
@@ -113,7 +113,7 @@ class TestLinearizer(unittest.TestCase):
     assert stores[0].vin[-1].dtype == accs[0].dtype == dtypes.float.vec(4)

   def test_upcast_with_locals(self):
-    if not (opts:=Device[Device.DEFAULT].compiler.linearizer_opts).has_local or not opts.has_shared or not opts.supports_float4:
+    if not (opts:=Device[Device.DEFAULT].compiler.compiler_opts).has_local or not opts.has_shared or not opts.supports_float4:
       self.skipTest("device does not support upcasted reduce with locals")
     x, y = Tensor.rand(1,128), Tensor.rand(128, 128)
@@ -183,9 +183,9 @@ class TestLinearizer(unittest.TestCase):
       helper_arg_acc_dtype(d.conv2d(w, acc_dtype=acc_dtype), expected_dtype)

   def test_tensor_cores(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_tensor_cores:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_tensor_cores:
       self.skipTest("device doesn't have tensor cores")
-    for tc in tensor_cores[Device[Device.DEFAULT].compiler.linearizer_opts.device]:
+    for tc in tensor_cores[Device[Device.DEFAULT].compiler.compiler_opts.device]:
       if getenv("EMULATE_CUDA") and (tc.dtype_in == dtypes.bfloat16 or tc.dtype_out == dtypes.bfloat16): continue
       a, b = Tensor.rand(tc.dims[1], tc.dims[2], dtype=tc.dtype_in), Tensor.rand(tc.dims[2], tc.dims[0], dtype=tc.dtype_in)
       np_a, np_b = a.numpy(), b.numpy()
@@ -281,7 +281,7 @@ def helper_realized_ast(r:Tensor):
   output_buffer = Buffer((out:=s[-1].outputs[0]).device, prod((s if isinstance(s, int) else s.max for s in out.shape)), out.dtype).allocate()
   return s[-1].ast[0], [output_buffer] + [l.realized for l in s[-1].inputs]

-@unittest.skipUnless(Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4, "need backends that support float4")
+@unittest.skipUnless(Device[Device.DEFAULT].compiler.compiler_opts.supports_float4, "need backends that support float4")
 class TestFloat4(unittest.TestCase):
   @staticmethod
   def count_float4(k):
@@ -490,7 +490,7 @@ class TestHandCodedOpts(unittest.TestCase):
     assert prod(k.full_shape[k.shape_len-k.upcasted:k.shape_len]) <= 49

   def test_matvec(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local:
       self.skipTest("Only devices with locals")
     N = 128
     a = Tensor.rand(1, N).realize()
@@ -539,9 +539,9 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False, atol=1e-4, rtol=1e-
   for i, x in enumerate(opts): # Check custom transformations if any.
     check_opt(x, lambda: Linearizer(realized_ast), Device[Device.DEFAULT].to_program, color_sizes[i] if i < len(color_sizes) else None)

-class TestLinearizerOpts(unittest.TestCase):
+class TestKernelOpts(unittest.TestCase):
   def test_local_and_grouped_reduce(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
     N = 128
@@ -587,7 +587,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])

   def test_matmul(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
     N = 128
@@ -617,7 +617,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])

   def test_double_reduce(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
     N = 128
@@ -644,7 +644,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])

   def test_invalid_tensor_core_extra_opts(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_tensor_cores:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_tensor_cores:
       self.skipTest("device doesn't have tensor cores")
     if Device.DEFAULT not in tensor_cores: self.skipTest("No tensor cores for device")
@@ -665,25 +665,25 @@ class TestLinearizerOpts(unittest.TestCase):
       assert k.apply_tensor_cores(use_tensor_cores=1, extra_opts=x), "no valid tensor core" # for METAL in runners

   def test_buf_index_not_found_tensor_core(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_tensor_cores:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_tensor_cores:
       self.skipTest("device doesn't have tensor cores")
     if Device.DEFAULT not in tensor_cores: self.skipTest("No tensor cores for device")
     ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=BinaryOps.MUL, src=(LazyOp(op=UnaryOps.CAST, src=(LazyOp(op=BinaryOps.CMPEQ, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(1243, 256), strides=(0, 1), offset=0, mask=None, contiguous=False),)))), LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=2, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(1243, 256), strides=(1, 0), offset=0, mask=None, contiguous=False),))))), arg=None),), arg=(dtypes.float, False)), LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=3, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(1243, 256), strides=(1, 0), offset=0, mask=None, contiguous=False),))))), arg=None),), arg=(0,)),), arg=MemBuffer(idx=0, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(1, 256), strides=(0, 1), offset=0, mask=None, contiguous=True),)))) # noqa: E501
-    k = Linearizer(ast, opts=Device[Device.DEFAULT].compiler.linearizer_opts)
+    k = Linearizer(ast, opts=Device[Device.DEFAULT].compiler.compiler_opts)
     with self.assertRaises(KernelOptError):
       k.apply_opt(Opt(OptOps.TC, 0, 1))

   def test_tensor_core_opts(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_tensor_cores:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_tensor_cores:
       self.skipTest("device doesn't have tensor cores")
     if Device.DEFAULT not in tensor_cores: self.skipTest("No tensor cores for device")
     N = 128
     Tensor.manual_seed(1552)
-    for tc in tensor_cores[Device[Device.DEFAULT].compiler.linearizer_opts.device]:
+    for tc in tensor_cores[Device[Device.DEFAULT].compiler.compiler_opts.device]:
       a, b = Tensor.rand(N, N, dtype=tc.dtype_in), Tensor.rand(N, N, dtype=tc.dtype_in)
       r = a.matmul(b, acc_dtype=tc.dtype_out)
       (atol, rtol) = ((0.25, 0.01) if tc.dtype_out == dtypes.half else (3e-2, 1e-3)) if tc.dtype_in == dtypes.half else (1e-4, 1e-4)
@@ -749,7 +749,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])

   def test_color_shapes_with_local(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
     N = 32
@@ -819,7 +819,7 @@ class TestLinearizerHelper(unittest.TestCase):
     assert expand_idxs(idxs) == (uidx0, NumNode(0), uidx1)

 class TestLinearizerUOptimize(unittest.TestCase):
-  @unittest.skipUnless(Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4, "device doesn't support float4")
+  @unittest.skipUnless(Device[Device.DEFAULT].compiler.compiler_opts.supports_float4, "device doesn't support float4")
   def test_grouped_store_phis(self):
     x, y = Tensor.randn(64,64), Tensor.randn(64,64)
     out = x.matmul(y)
@@ -833,7 +833,7 @@ class TestLinearizerUOptimize(unittest.TestCase):
     for val in store_vals:
       assert val.dtype == dtypes.float.vec(4) and val.uop != UOps.CAST

-  @unittest.skipUnless(Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4, "device doesn't support float4")
+  @unittest.skipUnless(Device[Device.DEFAULT].compiler.compiler_opts.supports_float4, "device doesn't support float4")
   def test_grouped_store_values(self):
     x = Tensor.randn((4,3,6,6)).realize()
     out = x.flip((0,1)).contiguous()
@@ -846,8 +846,8 @@ class TestLinearizerUOptimize(unittest.TestCase):
     assert store_val.dtype == dtypes.float.vec(4) and store_val.uop != UOps.CAST

   def test_grouped_store_locals_and_globals(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared or \
-      not Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.has_shared or \
+      not Device[Device.DEFAULT].compiler.compiler_opts.supports_float4:
       self.skipTest("Only Compiled uses linearizer with locals, shared, and float4")
     x, y = Tensor.rand(128, 128), Tensor.rand(128, 128)
@@ -871,8 +871,8 @@ class TestLinearizerUOptimize(unittest.TestCase):
     assert len([u for u in k.uops if u.uop is UOps.IF and u.vin[-1] == barrier]) == 1

   def test_grouped_store_local_only(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared or \
-      not Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.has_shared or \
+      not Device[Device.DEFAULT].compiler.compiler_opts.supports_float4:
       self.skipTest("Only Compiled uses linearizer with locals, shared, and float4")
     x, y = Tensor.rand(1,128), Tensor.rand(128, 128)
@@ -891,7 +891,7 @@ class TestLinearizerUOptimize(unittest.TestCase):
     assert stores[1].vin[-1].dtype == dtypes.float

   def test_skip_unmatching_upcasts(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.supports_float4:
       self.skipTest("Needs locals and float4")
     ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(1, 240, 0, 0), offset=0, mask=None, contiguous=False),)))),), arg=MemBuffer(idx=0, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(240, 40, 1, 1), strides=(40, 1, 0, 0), offset=0, mask=None, contiguous=True),)))) # noqa: E501
     opts = [
@@ -907,7 +907,7 @@ class TestLinearizerUOptimize(unittest.TestCase):
     assert out.vin[-1].uop is UOps.CAST and out.vin[-1].dtype == dtypes.float.vec(4)

   def test_skip_unmatching_upcasts_with_gep(self):
-    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4:
+    if not Device[Device.DEFAULT].compiler.compiler_opts.has_local or not Device[Device.DEFAULT].compiler.compiler_opts.supports_float4:
       self.skipTest("Needs locals and float4")
     ast = LazyOp(op=BufferOps.STORE, src=(LazyOp(op=BufferOps.LOAD, src=(), arg=MemBuffer(idx=1, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(1, 8, 0, 0), offset=0, mask=None, contiguous=False),)))),), arg=MemBuffer(idx=0, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(8, 32, 1, 1), strides=(32, 1, 0, 0), offset=0, mask=None, contiguous=True),)))) # noqa: E501
     opts = [Opt(op=OptOps.LOCAL, axis=1, amt=4), Opt(op=OptOps.UPCAST, axis=2, amt=2), Opt(op=OptOps.LOCAL, axis=1, amt=8),
diff --git a/test/test_uops.py b/test/test_uops.py
index 2dc3c51dae..d0dc014532 100644
--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -12,7 +12,7 @@ from test.helpers import is_dtype_supported

 def _uops_to_prg(uops):
   src = Device[Device.DEFAULT].compiler.render("test", uops)
-  has_local = Device[Device.DEFAULT].compiler.linearizer_opts.has_local
+  has_local = Device[Device.DEFAULT].compiler.compiler_opts.has_local
   return CompiledASTRunner("test", src, Device.DEFAULT, [1] if has_local else None, [1] if has_local else None)

 def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp:
diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py
index d6711223e1..4e58536889 100644
--- a/tinygrad/codegen/kernel.py
+++ b/tinygrad/codegen/kernel.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 import math, itertools
 from typing import NamedTuple, Optional, List, Tuple, cast, Dict, Union
 from tinygrad.ops import LazyOp, FlopCounter, get_lazyop_info, UnaryOps, BinaryOps, ReduceOps, MemBuffer, ConstBuffer, BufferOps
-from tinygrad.device import Device
+from tinygrad.device import Device, CompilerOptions
 from tinygrad.dtype import dtypes, ImageDType, DType
 from tinygrad.helpers import colored, ansilen, dedup, flatten, getenv, prod, DEBUG, round_up, all_int, get_contraction
 from tinygrad.shape.shapetracker import ShapeTracker
@@ -61,23 +61,10 @@ class LocalBuffer(NamedTuple):
   realized: None = None
   def __str__(self): return f"localbuffer<{self.name}[{self.size}]>"

-class LinearizerOptions(NamedTuple):
-  device: str = ""
-  suffix: str = ""
-  # TODO: make this generic with a list of supported types
-  supports_float4: bool = True
-  has_local: bool = True
-  has_shared: bool = True
-  has_tensor_cores: bool = False
-  # NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
-  global_max: Optional[List[int]] = None
-  local_max: Optional[List[int]] = None
-  shared_max: int = 32768
-
 class Kernel:
-  def __init__(self, *ast:LazyOp, opts:Optional[LinearizerOptions]=None):
-    self.opts = opts if opts is not None else (device.compiler.linearizer_opts if (device:=Device[Device.DEFAULT]).compiler is not None else
-                                               LinearizerOptions(Device.DEFAULT))
+  def __init__(self, *ast:LazyOp, opts:Optional[CompilerOptions]=None):
+    self.opts = opts if opts is not None else (device.compiler.compiler_opts if (device:=Device[Device.DEFAULT]).compiler is not None else
+                                               CompilerOptions(Device.DEFAULT))
     assert all(op.op is BufferOps.STORE for op in ast), f"kernels must have stores as the output, got {ast}"
     assert len(set(op.arg.st.size for op in ast)) == 1, f"all outbufs should have the same size, got {[op.arg.st for op in ast]}"
     self.ast = ast
diff --git a/tinygrad/device.py b/tinygrad/device.py
index df07b248c2..b42ab551d0 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from collections import defaultdict
-from typing import TYPE_CHECKING, Any, List, Optional, Dict, Tuple, ClassVar
+from typing import TYPE_CHECKING, Any, List, Optional, Dict, Tuple, ClassVar, NamedTuple
 import importlib, inspect, functools, pathlib, time, ctypes
 from tinygrad.helpers import ansilen, DEBUG, getenv, colored, BEAM, NOOPT, all_int, to_function_name, from_mv, flat_mv, diskcache_get, diskcache_put
 from tinygrad.helpers import prod, CACHECOLLECTING
@@ -11,7 +11,6 @@ from tinygrad.codegen.uops import UOpGraph

 if TYPE_CHECKING:
   from tinygrad.codegen.linearizer import Linearizer
-  from tinygrad.codegen.kernel import LinearizerOptions

 # **************** Device ****************
@@ -134,8 +133,21 @@ MallocAllocator = _MallocAllocator()

 # **************** for Compiled Devices ****************

+class CompilerOptions(NamedTuple):
+  device: str = ""
+  suffix: str = ""
+  # TODO: make this generic with a list of supported types
+  supports_float4: bool = True
+  has_local: bool = True
+  has_shared: bool = True
+  has_tensor_cores: bool = False
+  # NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
+  global_max: Optional[List[int]] = None
+  local_max: Optional[List[int]] = None
+  shared_max: int = 32768
+
 class Compiler:
-  linearizer_opts: ClassVar[LinearizerOptions]
+  compiler_opts: ClassVar[CompilerOptions]
   def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
   def render(self, name:str, uops:UOpGraph) -> str: raise NotImplementedError("need a render function")
   def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
@@ -215,16 +227,16 @@ class Compiled:
       from tinygrad.features.graph import print_tree
       for op in ast: print_tree(op)
     from tinygrad.codegen.linearizer import Linearizer
-    k = Linearizer(*ast, opts=self.compiler.linearizer_opts)
+    k = Linearizer(*ast, opts=self.compiler.compiler_opts)
     k.required_optimizations()
     if not NOOPT:
       if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
       if BEAM >= 1:
         lins = [(("tc" if used_tensor_cores else "hc"), k)]
         if used_tensor_cores:
-          lins.append(("hc", Linearizer(*ast, opts=self.compiler.linearizer_opts)))
+          lins.append(("hc", Linearizer(*ast, opts=self.compiler.compiler_opts)))
           lins[-1][1].hand_coded_optimizations()
-        kb = Linearizer(*ast, opts=self.compiler.linearizer_opts)
+        kb = Linearizer(*ast, opts=self.compiler.compiler_opts)
         kb.required_optimizations()
         from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
         test_rawbuffers = bufs_from_lin(kb) # allocate scratch buffers for optimization
diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py
index 13607453c4..44d5fd3979 100644
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -1,13 +1,12 @@
 import ctypes, subprocess, pathlib, tempfile
-from tinygrad.device import Compiled, MallocAllocator, Compiler
+from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
 from tinygrad.helpers import cpu_time_execution
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage

 CLANG_PROGRAM_HEADER = '#include \n#include \n#define max(x,y) ((x>y)?x:y)\n#define half __fp16\n'

 class ClangCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CLANG", supports_float4=False, has_local=False)
+  compiler_opts = CompilerOptions("CLANG", supports_float4=False, has_local=False)
   def render(self, name:str, uops) -> str: return uops_to_cstyle(CStyleLanguage(buffer_suffix=" restrict"), name, uops)
   def compile(self, src:str) -> bytes:
     # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index 367b3327ac..ed6eb3d388 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -4,8 +4,7 @@ from pathlib import Path
 from typing import Tuple, Optional, List
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, BufferOptions
-from tinygrad.codegen.kernel import LinearizerOptions
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, BufferOptions, CompilerOptions
 from tinygrad.renderer.cstyle import CUDARenderer
 from tinygrad.renderer.assembly import PTXRenderer
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
@@ -53,20 +52,20 @@ def _get_bytes(arg, get_str, get_sz, check) -> bytes:
   return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)

 class PTXCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CUDA", suffix="PTX", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
+  compiler_opts = CompilerOptions("CUDA", suffix="PTX", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
     self.version = "7.8" if arch >= "sm_89" else "7.5"
-    PTXCompiler.linearizer_opts = PTXCompiler.linearizer_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    PTXCompiler.compiler_opts = PTXCompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
     super().__init__(f"compile_ptx_{self.arch}")
   def render(self, name:str, uops) -> str: return PTXRenderer(name, uops).replace("TARGET", self.arch).replace("VERSION", self.version)
   def compile(self, src:str) -> bytes: return src.encode()

 class CUDACompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
+  compiler_opts = CompilerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
-    CUDACompiler.linearizer_opts = CUDACompiler.linearizer_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    CUDACompiler.compiler_opts = CUDACompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
     check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index 4ce0aa2c53..f0449990db 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -3,9 +3,8 @@ from typing import Tuple, Optional, List, cast
 import ctypes, functools, hashlib
 import tinygrad.runtime.autogen.opencl as cl
 from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import OpenCLRenderer
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler, CompilerOptions

 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0
@@ -15,7 +14,7 @@ def check(status):
 def checked(ret, status): return (check(status.value), ret)[1]

 class CLCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("GPU")
+  compiler_opts = CompilerOptions("GPU")
   def __init__(self, device:CLDevice, compile_key:str):
     self.device = device
     super().__init__(f"compile_cl_{compile_key}")
diff --git a/tinygrad/runtime/ops_hsa.py b/tinygrad/runtime/ops_hsa.py
index ee21a025fc..c0cc4ec81c 100644
--- a/tinygrad/runtime/ops_hsa.py
+++ b/tinygrad/runtime/ops_hsa.py
@@ -3,8 +3,7 @@ import ctypes, functools, subprocess, io, atexit, collections, json
 from typing import Tuple, TypeVar, List, Dict, Any
 import tinygrad.runtime.autogen.hsa as hsa
 from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t, getenv
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
-from tinygrad.codegen.kernel import LinearizerOptions
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler, CompilerOptions
 from tinygrad.runtime.driver.hsa import check, scan_agents, find_memory_pool, AQLQueue
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.runtime.driver.hip_comgr import compile_hip
@@ -43,7 +42,7 @@ class HSAProfiler:
 Profiler = HSAProfiler()

 class HSACompiler(Compiler):
-  linearizer_opts = LinearizerOptions("HSA", has_tensor_cores=True, shared_max=65536)
+  compiler_opts = CompilerOptions("HSA", has_tensor_cores=True, shared_max=65536)
   def __init__(self, arch:str):
     self.arch = arch
     super().__init__(f"compile_hip_{self.arch}")
diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py
index 869940652d..32d5db0d41 100644
--- a/tinygrad/runtime/ops_llvm.py
+++ b/tinygrad/runtime/ops_llvm.py
@@ -1,14 +1,13 @@
 from __future__ import annotations
 import ctypes, functools
 from typing import Tuple
-from tinygrad.device import Compiled, MallocAllocator, Compiler
+from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
 from tinygrad.helpers import DEBUG, cpu_time_execution
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.llvmir import uops_to_llvm_ir
 import llvmlite.binding as llvm

 class LLVMCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)
+  compiler_opts = CompilerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)
   def __init__(self, device:LLVMDevice):
     self.device = device
     super().__init__("compile_llvm")
diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py
index f209240798..b6050b6cc5 100644
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@@ -2,9 +2,8 @@ from __future__ import annotations
 import os, subprocess, pathlib, ctypes, tempfile, functools
 import Metal, libdispatch
 from typing import List, Set, Any, Tuple, Optional
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
-from tinygrad.device import Compiled, LRUAllocator, Compiler
+from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
 from tinygrad.renderer.cstyle import MetalRenderer

 def wait_check(cbuf: Any):
@@ -13,7 +12,7 @@ def wait_check(cbuf: Any):
     raise RuntimeError(error)

 class MetalCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("METAL", has_tensor_cores=os.uname().machine == "arm64", shared_max=32768)
+  compiler_opts = CompilerOptions("METAL", has_tensor_cores=os.uname().machine == "arm64", shared_max=32768)
   def __init__(self, device:Optional[MetalDevice]):
     self.device = device
     super().__init__("compile_metal")
diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py
index 0cad1b91c7..1febe89117 100644
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -5,10 +5,9 @@ from typing import Tuple, List, Optional, Any, Dict
 import pickle, base64, itertools, time, struct
 from tinygrad.dtype import DType, dtypes, ImageDType
 from tinygrad.helpers import all_same, getenv, flatten
-from tinygrad.device import Compiled, Allocator, Compiler
+from tinygrad.device import Compiled, Allocator, Compiler, CompilerOptions
 from tinygrad.codegen.uops import UOpGraph, UOps, exec_alu
 from tinygrad.ops import BinaryOps, TernaryOps
-from tinygrad.codegen.kernel import LinearizerOptions

 def _load(m, i):
   if i < 0 or i >= len(m): raise IndexError(f"load out of bounds, size is {len(m)} and access is {i}")
@@ -179,9 +178,9 @@ class PythonProgram:
     return time.perf_counter() - st

 class PythonCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("METAL", has_tensor_cores=True) if getenv("EMULATE_METAL") else \
-    (LinearizerOptions("HSA", has_tensor_cores=True) if getenv("EMULATE_HSA") else \
-    (LinearizerOptions("CUDA", has_tensor_cores=True) if getenv("EMULATE_CUDA") else LinearizerOptions("PYTHON")))
+  compiler_opts = CompilerOptions("METAL", has_tensor_cores=True) if getenv("EMULATE_METAL") else \
+    (CompilerOptions("HSA", has_tensor_cores=True) if getenv("EMULATE_HSA") else \
+    (CompilerOptions("CUDA", has_tensor_cores=True) if getenv("EMULATE_CUDA") else CompilerOptions("PYTHON")))
   def render(self, name:str, uops:UOpGraph) -> str:
     lops = [(u.uop, u.dtype, [uops.uops.index(v) for v in u.vin], u.arg) for u in uops]
     return base64.b64encode(pickle.dumps(lops)).decode()