update llvm api + add cache key (#3140)

* update llvm api + add cache key
* use_xcode is a different function
* types
2026-01-26 23:38:58 -05:00 · 2024-01-15 17:25:32 -08:00
parent cec0a7bc37
commit 120c8b1841
7 changed files with 56 additions and 64 deletions
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -282,22 +282,22 @@ class CompiledASTRunner(JITRunner):
    return et

 class Compiled:
-  def __init__(self, allocator:Allocator, linearizer_opts:LinearizerOptions, renderer, compiler, runtime, graph=None):
-    self.allocator, self.linearizer_opts, self.renderer, self.compiler, self.runtime, self.graph = \
-      allocator, linearizer_opts, renderer, compiler, runtime, graph
+  def __init__(self, allocator:Allocator, linearizer_opts:LinearizerOptions, renderer, compiler, compiler_cachekey, runtime, graph=None):
+    self.allocator, self.linearizer_opts, self.renderer, self.compiler, self.runtime, self.graph, self.compiler_cachekey = \
+      allocator, linearizer_opts, renderer, compiler, runtime, graph, compiler_cachekey
  def synchronize(self): pass  # override this in your device

  def to_program(self, k:Linearizer) -> CompiledASTRunner:
    assert self.compiler is not None, f"compiler is None, can't build {k.ast}"
    k.linearize()
    src = self.renderer(to_function_name(k.name), k.uops)
-    if getenv("DISABLE_COMPILER_CACHE") or '<' in self.compiler.__name__:
+    if getenv("DISABLE_COMPILER_CACHE") or self.compiler_cachekey is None:
      lib = self.compiler(src)
    else:
-      lib = diskcache_get(self.compiler.__name__, src)
+      lib = diskcache_get(self.compiler_cachekey, src)
      if lib is None:
        lib = self.compiler(src)
-        diskcache_put(self.compiler.__name__, src, lib)
+        diskcache_put(self.compiler_cachekey, src, lib)
    return CompiledASTRunner(k.ast, k.name, src, lib, k.global_size, k.local_size).build(self.runtime)

  def get_linearizer(self, ast:LazyOp) -> Linearizer:
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -23,4 +23,5 @@ class ClangProgram:
  def __call__(self, *bufs, vals=(), wait=False): return cpu_time_execution(lambda: self.fxn(*bufs, *vals), enable=wait)

 renderer = functools.partial(uops_to_cstyle, CStyleLanguage(buffer_suffix=" restrict"))
-ClangDevice = Compiled(MallocAllocator, LinearizerOptions("CLANG", supports_float4=False, has_local=False), renderer, compile_clang, ClangProgram)
+ClangDevice = Compiled(MallocAllocator, LinearizerOptions("CLANG", supports_float4=False, has_local=False), renderer,
+                       compile_clang, "compile_clang", ClangProgram)
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -86,7 +86,7 @@ class CUDADevice(Compiled):
    from tinygrad.runtime.graph.cuda import CUDAGraph
    super().__init__(CUDAAllocator(self) if not CUDACPU else MallocAllocator,
                     LinearizerOptions("CUDA", supports_float4_alu=False, global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024]),
-                     CUDARenderer, compile_cuda, functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
+                     CUDARenderer, compile_cuda, "compile_cuda", functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
  def synchronize(self):
    if not CUDACPU:
      check(cuda.cuCtxSetCurrent(self.context))
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -15,14 +15,13 @@ def check(status):
  if status != 0: raise RuntimeError(f"OpenCL Error {status}")
 def checked(ret, status): return (check(status.value), ret)[1]

-def compile_cl(prg:str) -> bytes:
-  assert CLDevice.compiler_context is not None, 'OpenCL requires a "compiler_context" to compile, init a device before you call this'
-  program = checked(cl.clCreateProgramWithSource(CLDevice.compiler_context.context, 1, to_char_p_p([prg_bytes := prg.encode()]),
+def compile_cl(device:CLDevice, prg:str) -> bytes:
+  program = checked(cl.clCreateProgramWithSource(device.context, 1, to_char_p_p([prg_bytes := prg.encode()]),
                                                 ctypes.byref(ctypes.c_size_t(len(prg_bytes))), ctypes.byref(status := ctypes.c_int32())), status)
-  status = cl.clBuildProgram(program, 1, ctypes.byref(CLDevice.compiler_context.device_id), None, cl.clBuildProgram.argtypes[4](), None)
+  status = cl.clBuildProgram(program, 1, ctypes.byref(device.device_id), None, cl.clBuildProgram.argtypes[4](), None)
  if status != 0:
-    cl.clGetProgramBuildInfo(program, CLDevice.compiler_context.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size := ctypes.c_size_t()))  # noqa: E501
-    cl.clGetProgramBuildInfo(program, CLDevice.compiler_context.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None)  # noqa: E501
+    cl.clGetProgramBuildInfo(program, device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size := ctypes.c_size_t()))
+    cl.clGetProgramBuildInfo(program, device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None)  # noqa: E501
    raise RuntimeError(f"OpenCL Compile Error\n\n{ctypes.string_at(mstr, size=log_size.value).decode()}")
  binary_sizes = init_c_var((ctypes.c_size_t * 1)(), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(x), ctypes.byref(x), None)))  # noqa: E501
  binary = init_c_var(ctypes.create_string_buffer(binary_sizes[0]), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), ctypes.byref((ctypes.c_void_p * 1)(ctypes.addressof(x))), None)))  # noqa: E501
@@ -76,7 +75,6 @@ class CLAllocator(LRUAllocator):

 class CLDevice(Compiled):
  device_ids = None                 # this is global and only initted once
-  compiler_context = None           # this is the first created context. we make an assumption they are all the same for the compiler
  def __init__(self, device:str=""):
    if CLDevice.device_ids is None:
      num_platforms = init_c_var(ctypes.c_uint32(), lambda x: check(cl.clGetPlatformIDs(0, None, ctypes.byref(x))))
@@ -90,10 +88,11 @@ class CLDevice(Compiled):

    self.device_id = CLDevice.device_ids[0 if ":" not in device else int(device.split(":")[1])]
    self.context = checked(cl.clCreateContext(None, 1, ctypes.byref(self.device_id), cl.clCreateContext.argtypes[3](), None, ctypes.byref(status := ctypes.c_int32())), status)  # noqa: E501
-    if CLDevice.compiler_context is None: CLDevice.compiler_context = self
    self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, ctypes.byref(status)), status)
    self.pending_copyin: List[memoryview] = []
-    super().__init__(CLAllocator(self), LinearizerOptions("GPU"), OpenCLRenderer, compile_cl, functools.partial(CLProgram, self))
+    # TODO: vary the cache key based on device name
+    super().__init__(CLAllocator(self), LinearizerOptions("GPU"), OpenCLRenderer,
+                     functools.partial(compile_cl, self), "compile_cl", functools.partial(CLProgram, self))
  def synchronize(self):
    check(cl.clFinish(self.queue))
    self.pending_copyin.clear()
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -89,7 +89,7 @@ class HIPDevice(Compiled):

    from tinygrad.runtime.graph.hip import HIPGraph
    super().__init__(MallocAllocator if MOCKHIP else HIPAllocator(self), LinearizerOptions("HIP"), HIPRenderer,
-                     compile_hip, functools.partial(HIPProgram, self.device), HIPGraph)
+                     compile_hip, "compile_hip", functools.partial(HIPProgram, self.device), HIPGraph)
  def synchronize(self):
    check(hip.hipSetDevice(self.device))
    check(hip.hipDeviceSynchronize())
--- a/tinygrad/runtime/ops_llvm.py
+++ b/tinygrad/runtime/ops_llvm.py
@@ -1,49 +1,42 @@
-import ctypes
-from typing import ClassVar, Tuple
+from __future__ import annotations
+import ctypes, functools
+from typing import Tuple
 from tinygrad.device import Compiled, MallocAllocator
 from tinygrad.helpers import DEBUG, cpu_time_execution
-from ctypes import CFUNCTYPE
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.llvmir import uops_to_llvm_ir
-
 import llvmlite.binding as llvm

-class LLVM:
-  target_machine: ClassVar[llvm.targets.TargetMachine] = None
-  engine: ClassVar[llvm.executionengine.ExecutionEngine] = None
-  optimizer: ClassVar[llvm.passmanagers.ModulePassManager] = None
+def compile_llvm(device, prg) -> bytes:
+  mod = llvm.parse_assembly(prg)
+  mod.verify()
+  device.optimizer.run(mod)
+  if DEBUG >= 5: print(device.target_machine.emit_assembly(mod))
+  return device.target_machine.emit_object(mod)

-  def __init__(self):
-    if LLVM.engine is not None: return
+class LLVMProgram:
+  def __init__(self, device:LLVMDevice, name:str, lib:bytes):
+    self.name, self.lib = name, lib
+    device.engine.add_object_file(llvm.object_file.ObjectFileRef.from_data(lib))
+    self.fxn = device.engine.get_function_address(name)
+
+  def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
+    self.cfunc = ctypes.CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*len(bufs)), *([ctypes.c_int32]*len(vals)))(self.fxn)
+    return cpu_time_execution(lambda: self.cfunc(*bufs, *vals), enable=wait)
+
+class LLVMDevice(Compiled):
+  def __init__(self, device:str):
    llvm.initialize()
    llvm.initialize_native_target()
    llvm.initialize_native_asmprinter()
    llvm.initialize_native_asmparser()
-    target = llvm.Target.from_triple(llvm.get_process_triple())
-    LLVM.optimizer = llvm.create_module_pass_manager()
-    LLVM.target_machine = target.create_target_machine(opt=2)  # this opt actually can change things. ex: opt=3 means no FMA, opt=2 means FMA
-    LLVM.target_machine.add_analysis_passes(LLVM.optimizer)
-    LLVM.target_machine.set_asm_verbosity(True)
+    self.optimizer: llvm.passmanagers.ModulePassManager = llvm.create_module_pass_manager()
+    # NOTE: the opt level changes the generated code, not just compile time. ex: opt=3 means no FMA, opt=2 means FMA
+    self.target_machine: llvm.targets.TargetMachine = llvm.Target.from_triple(llvm.get_process_triple()).create_target_machine(opt=2)
+    self.target_machine.add_analysis_passes(self.optimizer)
+    self.target_machine.set_asm_verbosity(True)
    backing_mod = llvm.parse_assembly(str())
    backing_mod.triple = llvm.get_process_triple()
-    LLVM.engine = llvm.create_mcjit_compiler(backing_mod, LLVM.target_machine)
-
-def compile_llvm(prg) -> bytes:
-  mod = llvm.parse_assembly(prg)
-  mod.verify()
-  LLVM().optimizer.run(mod)
-  if DEBUG >= 5: print(LLVM.target_machine.emit_assembly(mod))
-  return LLVM.target_machine.emit_object(mod)
-
-class LLVMProgram:
-  def __init__(self, name:str, lib:bytes):
-    self.name, self.lib = name, lib
-    LLVM().engine.add_object_file(llvm.object_file.ObjectFileRef.from_data(lib))
-    self.fxn = LLVM.engine.get_function_address(name)
-
-  def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
-    self.cfunc = CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*len(bufs)), *([ctypes.c_int32]*len(vals)))(self.fxn)
-    return cpu_time_execution(lambda: self.cfunc(*bufs, *vals), enable=wait)
-
-LLVMDevice = Compiled(MallocAllocator, LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False),
-                      uops_to_llvm_ir, compile_llvm, LLVMProgram)
+    self.engine: llvm.executionengine.ExecutionEngine = llvm.create_mcjit_compiler(backing_mod, self.target_machine)
+    super().__init__(MallocAllocator, LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False),
+                     uops_to_llvm_ir, functools.partial(compile_llvm, self), "compile_llvm", functools.partial(LLVMProgram, self))
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@@ -7,14 +7,14 @@ from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
 from tinygrad.device import Compiled, LRUAllocator
 from tinygrad.renderer.cstyle import MetalRenderer

-def compile_metal(prg, use_xcode=bool(getenv("METAL_XCODE"))) -> bytes:
-  assert MetalDevice.compiler_device, "metal device creation is required for metal compile"
-  if use_xcode:
-    # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
-    air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8'))
-    return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
+def compile_metal_xcode(prg:str) -> bytes:
+  # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
+  air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8'))
+  return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
+
+def compile_metal(device, prg:str) -> bytes:
  options = Metal.MTLCompileOptions.new()
-  library = unwrap2(MetalDevice.compiler_device.newLibraryWithSource_options_error_(prg, options, None))
+  library = unwrap2(device.newLibraryWithSource_options_error_(prg, options, None))
  return library.libraryDataContents().bytes().tobytes()

 class MetalProgram:
@@ -72,16 +72,15 @@ class MetalAllocator(LRUAllocator):
  def copyout(self, dest:memoryview, src:Any): dest[:] = self.as_buffer(src)

 class MetalDevice(Compiled):
-  compiler_device = None
  def __init__(self, device:str):
    self.device = Metal.MTLCreateSystemDefaultDevice()
-    if MetalDevice.compiler_device is None: MetalDevice.compiler_device = self.device
    self.mtl_queue = self.device.newCommandQueueWithMaxCommandBufferCount_(1024)
    self.mtl_buffers_in_flight: List[Any] = []
    self.mv_in_metal: List[memoryview] = []
    from tinygrad.runtime.graph.metal import MetalGraph
    super().__init__(MetalAllocator(self), LinearizerOptions("METAL"), MetalRenderer,
-                     compile_metal, functools.partial(MetalProgram, self), functools.partial(MetalGraph, self))
+                     compile_metal_xcode if getenv("METAL_XCODE") else functools.partial(compile_metal, self.device), "compile_metal",
+                     functools.partial(MetalProgram, self), functools.partial(MetalGraph, self))
  def synchronize(self):
    for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
    self.mv_in_metal.clear()