LinearizerOptions -> CompilerOptions (#3978)

chenyu authored 2024-03-28 17:50:23 -04:00, committed by GitHub
parent 2bfb1d3e39
commit b47f6cebb2
19 changed files with 82 additions and 92 deletions
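Every backend in this diff constructs the renamed options object the same way: a positional device name plus keyword feature flags, sometimes adjusted later with ._replace(). A minimal sketch of what such a class looks like, assuming a typing.NamedTuple (which is what ._replace() implies); the field names come from the call sites below, but the defaults are illustrative assumptions, not tinygrad's exact values:

# Sketch of the renamed class, inferred from call sites in this diff.
# ._replace() implies typing.NamedTuple; defaults are illustrative.
from typing import List, NamedTuple, Optional

class CompilerOptions(NamedTuple):
  device: str                            # "CLANG", "CUDA", "GPU", "HSA", ...
  suffix: str = ""                       # e.g. "PTX" for the PTX path
  supports_float4: bool = True           # CLANG and LLVM pass False
  has_local: bool = True                 # CLANG and LLVM pass False
  has_shared: bool = True                # LLVM passes False
  has_tensor_cores: bool = False         # toggled per-arch via _replace
  global_max: Optional[List[int]] = None
  local_max: Optional[List[int]] = None
  shared_max: int = 32768                # Metal 32768, CUDA 49152, HSA 65536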

View File

@@ -1,13 +1,12 @@
 import ctypes, subprocess, pathlib, tempfile
-from tinygrad.device import Compiled, MallocAllocator, Compiler
+from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
 from tinygrad.helpers import cpu_time_execution
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
 
 CLANG_PROGRAM_HEADER = '#include <stdbool.h>\n#include <tgmath.h>\n#define max(x,y) ((x>y)?x:y)\n#define half __fp16\n'
 
 class ClangCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CLANG", supports_float4=False, has_local=False)
+  compiler_opts = CompilerOptions("CLANG", supports_float4=False, has_local=False)
   def render(self, name:str, uops) -> str: return uops_to_cstyle(CStyleLanguage(buffer_suffix=" restrict"), name, uops)
   def compile(self, src:str) -> bytes:
     # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here

View File

@@ -4,8 +4,7 @@ from pathlib import Path
 from typing import Tuple, Optional, List
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, BufferOptions
-from tinygrad.codegen.kernel import LinearizerOptions
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, BufferOptions, CompilerOptions
 from tinygrad.renderer.cstyle import CUDARenderer
 from tinygrad.renderer.assembly import PTXRenderer
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
@@ -53,20 +52,20 @@ def _get_bytes(arg, get_str, get_sz, check) -> bytes:
   return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)
 
 class PTXCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CUDA", suffix="PTX", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
+  compiler_opts = CompilerOptions("CUDA", suffix="PTX", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
     self.version = "7.8" if arch >= "sm_89" else "7.5"
-    PTXCompiler.linearizer_opts = PTXCompiler.linearizer_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    PTXCompiler.compiler_opts = PTXCompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
     super().__init__(f"compile_ptx_{self.arch}")
   def render(self, name:str, uops) -> str: return PTXRenderer(name, uops).replace("TARGET", self.arch).replace("VERSION", self.version)
   def compile(self, src:str) -> bytes: return src.encode()
 
 class CUDACompiler(Compiler):
-  linearizer_opts = LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
+  compiler_opts = CompilerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
   def __init__(self, arch:str):
     self.arch = arch
-    CUDACompiler.linearizer_opts = CUDACompiler.linearizer_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
+    CUDACompiler.compiler_opts = CUDACompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
     check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
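The reassignment pattern in both __init__ methods above works because a NamedTuple is immutable: ._replace() returns a fresh tuple, so the adjusted copy has to be assigned back, and assigning to the class attribute changes the options for every instance of that compiler. A small self-contained illustration (Opts and Demo are hypothetical stand-ins, not tinygrad code):

# NamedTuples are immutable; _replace returns a new tuple that must be
# assigned back. Assigning to the class attribute affects all instances.
from typing import NamedTuple

class Opts(NamedTuple):  # hypothetical stand-in for CompilerOptions
  device: str
  has_tensor_cores: bool = False

class Demo:
  opts = Opts("CUDA")
  def __init__(self, arch: str):
    # sm_80 and newer have tensor cores; mirrors int(arch[3:]) >= 80 above
    Demo.opts = Demo.opts._replace(has_tensor_cores=int(arch[3:]) >= 80)

Demo("sm_89")
assert Demo.opts.has_tensor_cores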

View File

@@ -3,9 +3,8 @@ from typing import Tuple, Optional, List, cast
 import ctypes, functools, hashlib
 import tinygrad.runtime.autogen.opencl as cl
 from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import OpenCLRenderer
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler, CompilerOptions
 
 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0
@@ -15,7 +14,7 @@ def check(status):
 def checked(ret, status): return (check(status.value), ret)[1]
 
 class CLCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("GPU")
+  compiler_opts = CompilerOptions("GPU")
   def __init__(self, device:CLDevice, compile_key:str):
     self.device = device
     super().__init__(f"compile_cl_{compile_key}")

View File

@@ -3,8 +3,7 @@ import ctypes, functools, subprocess, io, atexit, collections, json
 from typing import Tuple, TypeVar, List, Dict, Any
 import tinygrad.runtime.autogen.hsa as hsa
 from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t, getenv
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
-from tinygrad.codegen.kernel import LinearizerOptions
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler, CompilerOptions
 from tinygrad.runtime.driver.hsa import check, scan_agents, find_memory_pool, AQLQueue
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.runtime.driver.hip_comgr import compile_hip
@@ -43,7 +42,7 @@ class HSAProfiler:
 Profiler = HSAProfiler()
 
 class HSACompiler(Compiler):
-  linearizer_opts = LinearizerOptions("HSA", has_tensor_cores=True, shared_max=65536)
+  compiler_opts = CompilerOptions("HSA", has_tensor_cores=True, shared_max=65536)
   def __init__(self, arch:str):
     self.arch = arch
     super().__init__(f"compile_hip_{self.arch}")

View File

@@ -1,14 +1,13 @@
 from __future__ import annotations
 import ctypes, functools
 from typing import Tuple
-from tinygrad.device import Compiled, MallocAllocator, Compiler
+from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
 from tinygrad.helpers import DEBUG, cpu_time_execution
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.llvmir import uops_to_llvm_ir
 import llvmlite.binding as llvm
 
 class LLVMCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)
+  compiler_opts = CompilerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)
   def __init__(self, device:LLVMDevice):
     self.device = device
     super().__init__("compile_llvm")

View File

@@ -2,9 +2,8 @@ from __future__ import annotations
 import os, subprocess, pathlib, ctypes, tempfile, functools
 import Metal, libdispatch
 from typing import List, Set, Any, Tuple, Optional
-from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
-from tinygrad.device import Compiled, LRUAllocator, Compiler
+from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
 from tinygrad.renderer.cstyle import MetalRenderer
 
 def wait_check(cbuf: Any):
def wait_check(cbuf: Any):
@@ -13,7 +12,7 @@ def wait_check(cbuf: Any):
     raise RuntimeError(error)
 
 class MetalCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("METAL", has_tensor_cores=os.uname().machine == "arm64", shared_max=32768)
+  compiler_opts = CompilerOptions("METAL", has_tensor_cores=os.uname().machine == "arm64", shared_max=32768)
   def __init__(self, device:Optional[MetalDevice]):
     self.device = device
     super().__init__("compile_metal")

View File

@@ -5,10 +5,9 @@ from typing import Tuple, List, Optional, Any, Dict
 import pickle, base64, itertools, time, struct
 from tinygrad.dtype import DType, dtypes, ImageDType
 from tinygrad.helpers import all_same, getenv, flatten
-from tinygrad.device import Compiled, Allocator, Compiler
+from tinygrad.device import Compiled, Allocator, Compiler, CompilerOptions
 from tinygrad.codegen.uops import UOpGraph, UOps, exec_alu
 from tinygrad.ops import BinaryOps, TernaryOps
-from tinygrad.codegen.kernel import LinearizerOptions
 
 def _load(m, i):
   if i < 0 or i >= len(m): raise IndexError(f"load out of bounds, size is {len(m)} and access is {i}")
@@ -179,9 +178,9 @@ class PythonProgram:
     return time.perf_counter() - st
 
 class PythonCompiler(Compiler):
-  linearizer_opts = LinearizerOptions("METAL", has_tensor_cores=True) if getenv("EMULATE_METAL") else \
-    (LinearizerOptions("HSA", has_tensor_cores=True) if getenv("EMULATE_HSA") else \
-    (LinearizerOptions("CUDA", has_tensor_cores=True) if getenv("EMULATE_CUDA") else LinearizerOptions("PYTHON")))
+  compiler_opts = CompilerOptions("METAL", has_tensor_cores=True) if getenv("EMULATE_METAL") else \
+    (CompilerOptions("HSA", has_tensor_cores=True) if getenv("EMULATE_HSA") else \
+    (CompilerOptions("CUDA", has_tensor_cores=True) if getenv("EMULATE_CUDA") else CompilerOptions("PYTHON")))
   def render(self, name:str, uops:UOpGraph) -> str:
     lops = [(u.uop, u.dtype, [uops.uops.index(v) for v in u.vin], u.arg) for u in uops]
     return base64.b64encode(pickle.dumps(lops)).decode()
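This render method serializes the uop list instead of emitting source code: the pickled, base64-encoded string travels through the compile cache like any other rendered "source". A tiny stdlib round-trip of that encoding, with a dummy payload standing in for the real (uop, dtype, vin, arg) tuples:

# Round-trip of the encoding PythonCompiler.render uses above.
# The payload is a dummy stand-in for the real lops tuples.
import base64, pickle

lops = [("ALU", "float32", [0, 1], "ADD")]
src = base64.b64encode(pickle.dumps(lops)).decode()
assert pickle.loads(base64.b64decode(src)) == lops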