diff --git a/tinygrad/llops/ops_llvm.py b/tinygrad/llops/ops_llvm.py index f322bbb696..ced1ba7007 100644 --- a/tinygrad/llops/ops_llvm.py +++ b/tinygrad/llops/ops_llvm.py @@ -1,19 +1,14 @@ from __future__ import annotations -import hashlib import math -import time from typing import Tuple, Union, Dict, Any, List, ClassVar -from tinygrad.helpers import prod, getenv +from tinygrad.helpers import prod from tinygrad.shape import ShapeTracker, ZeroView from tinygrad.ops import LazyOp from tinygrad.ast import ASTKernel import ctypes import numpy as np -from ctypes import CFUNCTYPE -from tinygrad.ops import DEBUG, UnaryOps, BinaryOps, ReduceOps, ExplicitExecAST, GlobalCounters - -from llvmlite import ir # type: ignore -import llvmlite.binding as llvm # type: ignore +from tinygrad.ops import DEBUG, UnaryOps, BinaryOps, ReduceOps, ExplicitExecAST +from tinygrad.runtime.llvm import LLVM, ir def int_const(x): return ir.Constant(ir.IntType(64), x) @@ -67,82 +62,6 @@ def idx_deref(builder, buf, ptr, idx): else: return builder.load(builder.gep(ptr, [idx], inbounds=True)) -class LLVM: - target_machine : ClassVar[llvm.targets.TargetMachine] = None - engine : ClassVar[llvm.executionengine.ExecutionEngine] = None - optimizer : ClassVar[llvm.passmanagers.ModulePassManager] = None - - def __init__(self): - if LLVM.engine is not None: - return - llvm.initialize() - llvm.initialize_native_target() - llvm.initialize_native_asmprinter() - llvm.initialize_native_asmparser() - target = llvm.Target.from_triple(llvm.get_process_triple()) - LLVM.optimizer = llvm.create_module_pass_manager() - LLVM.target_machine = target.create_target_machine(opt=2) # this opt actually can change things. 
ex: opt=3 means no FMA, opt=2 means FMA - LLVM.target_machine.add_analysis_passes(LLVM.optimizer) - - # TODO: this makes compile times so much faster - if getenv("LLVMOPT"): - llvm.set_option(str(), '-force-vector-interleave=4') # this makes sum the same speed as torch, it also doubles the (slow) conv speed - if DEBUG >= 4: - llvm.set_option(str(), '--debug-only=loop-vectorize') - #llvm.set_option(str(), '--debug') - - # does this do anything? - builder = llvm.create_pass_manager_builder() - builder.opt_level = 3 - builder.size_level = 0 - builder.loop_vectorize = True - builder.slp_vectorize = True - builder.populate(LLVM.optimizer) - - LLVM.target_machine.set_asm_verbosity(True) - backing_mod = llvm.parse_assembly(str()) - backing_mod.triple = llvm.get_process_triple() - LLVM.engine = llvm.create_mcjit_compiler(backing_mod, LLVM.target_machine) - - def exec(self, module:ir.Module, bufs, op_estimate=0, mem_estimate=0): - module.triple = llvm.get_process_triple() - module.data_layout = self.engine.target_data - llvm_ir = str(module) - - if DEBUG >= 2: - print(llvm_ir) - - mod = llvm.parse_assembly(llvm_ir) - mod.verify() - LLVM.optimizer.run(mod) - if DEBUG >= 4: - print("Optimized IR:") - print(str(mod)) - mod.name = hashlib.sha1(llvm_ir.encode('utf-8')).hexdigest() - if DEBUG >= 3: - print(LLVM.target_machine.emit_assembly(mod)) - LLVM.engine.add_module(mod) - LLVM.engine.finalize_object() - - # call function (NOTE: if the types don't match, there's likely something wrong with the cache) - #cfunc = CFUNCTYPE(ctypes.c_int, *[type(x._buf) for x in bufs])(LLVM.engine.get_function_address('exec')) - - # why is this needed without the types. 
class LLVM:
  """Process-wide LLVM JIT state used to compile and run generated kernels.

  The target machine, module pass manager and MCJIT execution engine are
  stored as class attributes, so they are created by the first ``LLVM()``
  construction and reused by every later instance.
  """
  target_machine : ClassVar[llvm.targets.TargetMachine] = None
  engine : ClassVar[llvm.executionengine.ExecutionEngine] = None
  optimizer : ClassVar[llvm.passmanagers.ModulePassManager] = None

  def __init__(self):
    """Initialise the shared LLVM objects; a no-op once the engine exists."""
    if LLVM.engine is not None:
      return
    llvm.initialize()
    llvm.initialize_native_target()
    llvm.initialize_native_asmprinter()
    llvm.initialize_native_asmparser()
    target = llvm.Target.from_triple(llvm.get_process_triple())
    LLVM.optimizer = llvm.create_module_pass_manager()
    # this opt actually can change things. ex: opt=3 means no FMA, opt=2 means FMA
    LLVM.target_machine = target.create_target_machine(opt=2)
    LLVM.target_machine.add_analysis_passes(LLVM.optimizer)

    # TODO: this makes compile times so much faster
    # NOTE(review): the nesting below was reconstructed from a diff whose
    # indentation was lost -- confirm the pass-manager-builder section really
    # belongs under the LLVMOPT switch.
    if getenv("LLVMOPT"):
      # this makes sum the same speed as torch, it also doubles the (slow) conv speed
      llvm.set_option('', '-force-vector-interleave=4')
      if DEBUG >= 4:
        llvm.set_option('', '--debug-only=loop-vectorize')

      # does this do anything?
      builder = llvm.create_pass_manager_builder()
      builder.opt_level = 3
      builder.size_level = 0
      builder.loop_vectorize = True
      builder.slp_vectorize = True
      builder.populate(LLVM.optimizer)

    LLVM.target_machine.set_asm_verbosity(True)
    # The engine is created around an empty module; kernels are added and
    # removed per call in exec().
    backing_mod = llvm.parse_assembly('')
    backing_mod.triple = llvm.get_process_triple()
    LLVM.engine = llvm.create_mcjit_compiler(backing_mod, LLVM.target_machine)

  # TODO: LLVMProgram
  def exec(self, module:ir.Module, bufs, op_estimate=0, mem_estimate=0):
    """Compile ``module``, call its ``exec`` function once on ``bufs``, and return the ctypes wrapper.

    Args:
      module: llvmlite IR module that defines a function named ``exec``.
      bufs: buffers whose ``_buf`` attribute is passed to the function, one
        argument per buffer; each argument is declared as a float pointer.
      op_estimate: added to ``GlobalCounters.global_ops`` after the call.
      mem_estimate: added to ``GlobalCounters.global_mem`` after the call.

    Returns:
      The ctypes function object that was invoked.
    """
    module.triple = llvm.get_process_triple()
    module.data_layout = LLVM.engine.target_data
    llvm_ir = str(module)

    if DEBUG >= 2:
      print(llvm_ir)

    mod = llvm.parse_assembly(llvm_ir)
    mod.verify()
    LLVM.optimizer.run(mod)
    if DEBUG >= 4:
      print("Optimized IR:")
      print(str(mod))
    # The name is derived from the unoptimised IR text.
    mod.name = hashlib.sha1(llvm_ir.encode('utf-8')).hexdigest()
    if DEBUG >= 3:
      print(LLVM.target_machine.emit_assembly(mod))

    LLVM.engine.add_module(mod)
    try:
      LLVM.engine.finalize_object()

      # call function (NOTE: if the types don't match, there's likely something wrong with the cache)
      # Every argument is declared as float* instead of using type(x._buf);
      # the original author notes this fixed
      #   LLVM=1 OPT=2 python3 test/test_ops.py TestOps.test_cat TestOps.test_multicat
      arg_types = [ctypes.POINTER(ctypes.c_float) for _ in bufs]
      cfunc = CFUNCTYPE(ctypes.c_int, *arg_types)(LLVM.engine.get_function_address('exec'))

      st = time.monotonic()
      cfunc(*[x._buf for x in bufs])
      et = time.monotonic() - st
      if DEBUG >= 1:
        # Clamp the divisor so a run shorter than the clock resolution
        # (et == 0) cannot raise ZeroDivisionError in the throughput figures.
        rate_et = max(et, 1e-9)
        print(f"**LLVM** time {et*1000:7.2f} ms OPs {op_estimate/1e6:7.2f}M -- {(op_estimate/1e9)/rate_et:5.2f} GFLOPS -- {mem_estimate:10d} reads -- {(mem_estimate*4/1e9)/rate_et:5.2f} GB/s")
      GlobalCounters.global_ops += op_estimate
      GlobalCounters.global_mem += mem_estimate
    finally:
      # we are done -- always detach the module, even when compilation or the
      # call failed, so a stale module defining 'exec' is not left in the
      # shared engine for the next kernel.
      LLVM.engine.remove_module(mod)
    # NOTE(review): the wrapper's address was looked up in a module that has
    # now been removed from the engine -- confirm callers never call it again.
    return cfunc