From 120c8b1841977eb3aa047d282b26d3a157d66b6f Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Mon, 15 Jan 2024 17:25:32 -0800 Subject: [PATCH] update llvm api + add cache key (#3140) * update llvm api + add cache key * use_xcode is a different function * types --- tinygrad/device.py | 12 +++---- tinygrad/runtime/ops_clang.py | 3 +- tinygrad/runtime/ops_cuda.py | 2 +- tinygrad/runtime/ops_gpu.py | 17 +++++---- tinygrad/runtime/ops_hip.py | 2 +- tinygrad/runtime/ops_llvm.py | 65 ++++++++++++++++------------------- tinygrad/runtime/ops_metal.py | 19 +++++----- 7 files changed, 56 insertions(+), 64 deletions(-) diff --git a/tinygrad/device.py b/tinygrad/device.py index 8352110d06..1db30975d4 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -282,22 +282,22 @@ class CompiledASTRunner(JITRunner): return et class Compiled: - def __init__(self, allocator:Allocator, linearizer_opts:LinearizerOptions, renderer, compiler, runtime, graph=None): - self.allocator, self.linearizer_opts, self.renderer, self.compiler, self.runtime, self.graph = \ - allocator, linearizer_opts, renderer, compiler, runtime, graph + def __init__(self, allocator:Allocator, linearizer_opts:LinearizerOptions, renderer, compiler, compiler_cachekey, runtime, graph=None): + self.allocator, self.linearizer_opts, self.renderer, self.compiler, self.runtime, self.graph, self.compiler_cachekey = \ + allocator, linearizer_opts, renderer, compiler, runtime, graph, compiler_cachekey def synchronize(self): pass # override this in your device def to_program(self, k:Linearizer) -> CompiledASTRunner: assert self.compiler is not None, f"compiler is None, can't build {k.ast}" k.linearize() src = self.renderer(to_function_name(k.name), k.uops) - if getenv("DISABLE_COMPILER_CACHE") or '<' in self.compiler.__name__: + if getenv("DISABLE_COMPILER_CACHE") or self.compiler_cachekey is None: lib = self.compiler(src) else: - lib = diskcache_get(self.compiler.__name__, src) + lib = diskcache_get(self.compiler_cachekey, src) if lib is None: lib = self.compiler(src) - diskcache_put(self.compiler.__name__, src, lib) + diskcache_put(self.compiler_cachekey, src, lib) return CompiledASTRunner(k.ast, k.name, src, lib, k.global_size, k.local_size).build(self.runtime) def get_linearizer(self, ast:LazyOp) -> Linearizer: diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py index 64c7e8981d..4f74376474 100644 --- a/tinygrad/runtime/ops_clang.py +++ b/tinygrad/runtime/ops_clang.py @@ -23,4 +23,5 @@ class ClangProgram: def __call__(self, *bufs, vals=(), wait=False): return cpu_time_execution(lambda: self.fxn(*bufs, *vals), enable=wait) renderer = functools.partial(uops_to_cstyle, CStyleLanguage(buffer_suffix=" restrict")) -ClangDevice = Compiled(MallocAllocator, LinearizerOptions("CLANG", supports_float4=False, has_local=False), renderer, compile_clang, ClangProgram) +ClangDevice = Compiled(MallocAllocator, LinearizerOptions("CLANG", supports_float4=False, has_local=False), renderer, + compile_clang, "compile_clang", ClangProgram) diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py index 12f19ee403..7244354d75 100644 --- a/tinygrad/runtime/ops_cuda.py +++ b/tinygrad/runtime/ops_cuda.py @@ -86,7 +86,7 @@ class CUDADevice(Compiled): from tinygrad.runtime.graph.cuda import CUDAGraph super().__init__(CUDAAllocator(self) if not CUDACPU else MallocAllocator, LinearizerOptions("CUDA", supports_float4_alu=False, global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024]), - CUDARenderer, compile_cuda, functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None) + CUDARenderer, compile_cuda, "compile_cuda", functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None) def synchronize(self): if not CUDACPU: check(cuda.cuCtxSetCurrent(self.context)) diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py index db19e5ba35..2c0093b5e4 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -15,14 +15,13 @@ def check(status): if status != 0: raise RuntimeError(f"OpenCL Error {status}") def checked(ret, status): return (check(status.value), ret)[1] -def compile_cl(prg:str) -> bytes: - assert CLDevice.compiler_context is not None, 'OpenCL requires a "compiler_context" to compile, init a device before you call this' - program = checked(cl.clCreateProgramWithSource(CLDevice.compiler_context.context, 1, to_char_p_p([prg_bytes := prg.encode()]), +def compile_cl(device:CLDevice, prg:str) -> bytes: + program = checked(cl.clCreateProgramWithSource(device.context, 1, to_char_p_p([prg_bytes := prg.encode()]), ctypes.byref(ctypes.c_size_t(len(prg_bytes))), ctypes.byref(status := ctypes.c_int32())), status) - status = cl.clBuildProgram(program, 1, ctypes.byref(CLDevice.compiler_context.device_id), None, cl.clBuildProgram.argtypes[4](), None) + status = cl.clBuildProgram(program, 1, ctypes.byref(device.device_id), None, cl.clBuildProgram.argtypes[4](), None) if status != 0: - cl.clGetProgramBuildInfo(program, CLDevice.compiler_context.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size := ctypes.c_size_t())) # noqa: E501 - cl.clGetProgramBuildInfo(program, CLDevice.compiler_context.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501 + cl.clGetProgramBuildInfo(program, device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size := ctypes.c_size_t())) + cl.clGetProgramBuildInfo(program, device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501 raise RuntimeError(f"OpenCL Compile Error\n\n{ctypes.string_at(mstr, size=log_size.value).decode()}") binary_sizes = init_c_var((ctypes.c_size_t * 1)(), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(x), ctypes.byref(x), None))) # noqa: E501 binary = init_c_var(ctypes.create_string_buffer(binary_sizes[0]), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), ctypes.byref((ctypes.c_void_p * 1)(ctypes.addressof(x))), None))) # noqa: E501 @@ -76,7 +75,6 @@ class CLAllocator(LRUAllocator): class CLDevice(Compiled): device_ids = None # this is global and only initted once - compiler_context = None # this is the first created context. we make an assumption they are all the same for the compiler def __init__(self, device:str=""): if CLDevice.device_ids is None: num_platforms = init_c_var(ctypes.c_uint32(), lambda x: check(cl.clGetPlatformIDs(0, None, ctypes.byref(x)))) @@ -90,10 +88,11 @@ class CLDevice(Compiled): self.device_id = CLDevice.device_ids[0 if ":" not in device else int(device.split(":")[1])] self.context = checked(cl.clCreateContext(None, 1, ctypes.byref(self.device_id), cl.clCreateContext.argtypes[3](), None, ctypes.byref(status := ctypes.c_int32())), status) # noqa: E501 - if CLDevice.compiler_context is None: CLDevice.compiler_context = self self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, ctypes.byref(status)), status) self.pending_copyin: List[memoryview] = [] - super().__init__(CLAllocator(self), LinearizerOptions("GPU"), OpenCLRenderer, compile_cl, functools.partial(CLProgram, self)) + # TODO: vary the cache key based on device name + super().__init__(CLAllocator(self), LinearizerOptions("GPU"), OpenCLRenderer, + functools.partial(compile_cl, self), "compile_cl", functools.partial(CLProgram, self)) def synchronize(self): check(cl.clFinish(self.queue)) self.pending_copyin.clear() diff --git a/tinygrad/runtime/ops_hip.py b/tinygrad/runtime/ops_hip.py index 75b8034325..21add0f7c4 100644 --- a/tinygrad/runtime/ops_hip.py +++ b/tinygrad/runtime/ops_hip.py @@ -89,7 +89,7 @@ class HIPDevice(Compiled): from tinygrad.runtime.graph.hip import HIPGraph super().__init__(MallocAllocator if MOCKHIP else HIPAllocator(self), LinearizerOptions("HIP"), HIPRenderer, - compile_hip, functools.partial(HIPProgram, self.device), HIPGraph) + compile_hip, "compile_hip", functools.partial(HIPProgram, self.device), HIPGraph) def synchronize(self): check(hip.hipSetDevice(self.device)) check(hip.hipDeviceSynchronize()) diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py index 80ecee163d..2a78becd93 100644 --- a/tinygrad/runtime/ops_llvm.py +++ b/tinygrad/runtime/ops_llvm.py @@ -1,49 +1,42 @@ -import ctypes -from typing import ClassVar, Tuple +from __future__ import annotations +import ctypes, functools +from typing import Tuple from tinygrad.device import Compiled, MallocAllocator from tinygrad.helpers import DEBUG, cpu_time_execution -from ctypes import CFUNCTYPE from tinygrad.codegen.kernel import LinearizerOptions from tinygrad.renderer.llvmir import uops_to_llvm_ir - import llvmlite.binding as llvm -class LLVM: - target_machine: ClassVar[llvm.targets.TargetMachine] = None - engine: ClassVar[llvm.executionengine.ExecutionEngine] = None - optimizer: ClassVar[llvm.passmanagers.ModulePassManager] = None +def compile_llvm(device, prg) -> bytes: + mod = llvm.parse_assembly(prg) + mod.verify() + device.optimizer.run(mod) + if DEBUG >= 5: print(device.target_machine.emit_assembly(mod)) + return device.target_machine.emit_object(mod) - def __init__(self): - if LLVM.engine is not None: return +class LLVMProgram: + def __init__(self, device:LLVMDevice, name:str, lib:bytes): + self.name, self.lib = name, lib + device.engine.add_object_file(llvm.object_file.ObjectFileRef.from_data(lib)) + self.fxn = device.engine.get_function_address(name) + + def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False): + self.cfunc = ctypes.CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*len(bufs)), *([ctypes.c_int32]*len(vals)))(self.fxn) + return cpu_time_execution(lambda: self.cfunc(*bufs, *vals), enable=wait) + +class LLVMDevice(Compiled): + def __init__(self, device:str): llvm.initialize() llvm.initialize_native_target() llvm.initialize_native_asmprinter() llvm.initialize_native_asmparser() - target = llvm.Target.from_triple(llvm.get_process_triple()) - LLVM.optimizer = llvm.create_module_pass_manager() - LLVM.target_machine = target.create_target_machine(opt=2) # this opt actually can change things. ex: opt=3 means no FMA, opt=2 means FMA - LLVM.target_machine.add_analysis_passes(LLVM.optimizer) - LLVM.target_machine.set_asm_verbosity(True) + self.optimizer: llvm.passmanagers.ModulePassManager = llvm.create_module_pass_manager() + # this opt actually can change things. ex: opt=3 means no FMA, opt=2 means FMA + self.target_machine: llvm.targets.TargetMachine = llvm.Target.from_triple(llvm.get_process_triple()).create_target_machine(opt=2) + self.target_machine.add_analysis_passes(self.optimizer) + self.target_machine.set_asm_verbosity(True) backing_mod = llvm.parse_assembly(str()) backing_mod.triple = llvm.get_process_triple() - LLVM.engine = llvm.create_mcjit_compiler(backing_mod, LLVM.target_machine) - -def compile_llvm(prg) -> bytes: - mod = llvm.parse_assembly(prg) - mod.verify() - LLVM().optimizer.run(mod) - if DEBUG >= 5: print(LLVM.target_machine.emit_assembly(mod)) - return LLVM.target_machine.emit_object(mod) - -class LLVMProgram: - def __init__(self, name:str, lib:bytes): - self.name, self.lib = name, lib - LLVM().engine.add_object_file(llvm.object_file.ObjectFileRef.from_data(lib)) - self.fxn = LLVM.engine.get_function_address(name) - - def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False): - self.cfunc = CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*len(bufs)), *([ctypes.c_int32]*len(vals)))(self.fxn) - return cpu_time_execution(lambda: self.cfunc(*bufs, *vals), enable=wait) - -LLVMDevice = Compiled(MallocAllocator, LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False), - uops_to_llvm_ir, compile_llvm, LLVMProgram) + self.engine: llvm.executionengine.ExecutionEngine = llvm.create_mcjit_compiler(backing_mod, self.target_machine) + super().__init__(MallocAllocator, LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False), + uops_to_llvm_ir, functools.partial(compile_llvm, self), "compile_llvm", functools.partial(LLVMProgram, self)) diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index abf66f9a7f..21e385435f 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -7,14 +7,14 @@ from tinygrad.helpers import prod, getenv, DEBUG, unwrap2 from tinygrad.device import Compiled, LRUAllocator from tinygrad.renderer.cstyle import MetalRenderer -def compile_metal(prg, use_xcode=bool(getenv("METAL_XCODE"))) -> bytes: - assert MetalDevice.compiler_device, "metal device creation is required for metal compile" - if use_xcode: - # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode - air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8')) - return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air) +def compile_metal_xcode(prg:str) -> bytes: + # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode + air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8')) + return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air) + +def compile_metal(device, prg:str) -> bytes: options = Metal.MTLCompileOptions.new() - library = unwrap2(MetalDevice.compiler_device.newLibraryWithSource_options_error_(prg, options, None)) + library = unwrap2(device.newLibraryWithSource_options_error_(prg, options, None)) return library.libraryDataContents().bytes().tobytes() class MetalProgram: @@ -72,16 +72,15 @@ class MetalAllocator(LRUAllocator): def copyout(self, dest:memoryview, src:Any): dest[:] = self.as_buffer(src) class MetalDevice(Compiled): - compiler_device = None def __init__(self, device:str): self.device = Metal.MTLCreateSystemDefaultDevice() - if MetalDevice.compiler_device is None: MetalDevice.compiler_device = self.device self.mtl_queue = self.device.newCommandQueueWithMaxCommandBufferCount_(1024) self.mtl_buffers_in_flight: List[Any] = [] self.mv_in_metal: List[memoryview] = [] from tinygrad.runtime.graph.metal import MetalGraph super().__init__(MetalAllocator(self), LinearizerOptions("METAL"), MetalRenderer, - compile_metal, functools.partial(MetalProgram, self), functools.partial(MetalGraph, self)) + compile_metal_xcode if getenv("METAL_XCODE") else functools.partial(compile_metal, self.device), "compile_metal", + functools.partial(MetalProgram, self), functools.partial(MetalGraph, self)) def synchronize(self): for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted() self.mv_in_metal.clear()