diff --git a/docs/abstractions.py b/docs/abstractions.py
index 44b0ae3613..4b97372ca6 100644
--- a/docs/abstractions.py
+++ b/docs/abstractions.py
@@ -198,7 +198,7 @@ from tinygrad.device import MallocAllocator
 # ClangProgram is the simplest runtime (in tinygrad/runtime/ops_clang.py, code 7/10)
 # __init__ calls clang, and __call__ calls the function in the *.so outputted by clang
 # in CLANG, global_size and local_size are ignored
-from tinygrad.runtime.ops_clang import ClangProgram, compile_clang
+from tinygrad.runtime.ops_clang import ClangProgram, ClangCompiler
 
 # a concrete example looks like this, this adds two size 1 RawBuffer
 # first we create two numpy buffers containing 2 and 3
@@ -213,7 +213,7 @@ MallocAllocator.copyin(input_a, numpy_a.data.cast("B"))
 MallocAllocator.copyin(input_b, numpy_b.data.cast("B"))
 
 # compile the program, run it, and 2+3 does indeed equal 5
-program = ClangProgram("add", compile_clang(f"void add(float *a, float *b, float *c) {{ *a = *b + *c; }}"))
+program = ClangProgram("add", ClangCompiler().compile(f"void add(float *a, float *b, float *c) {{ *a = *b + *c; }}"))
 program(output, input_a, input_b)
 numpy_out = np.empty(1, dtype=np.float32)
 MallocAllocator.copyout(numpy_out.data.cast("B"), output)
@@ -251,7 +251,7 @@ result = Tensor(2.0).realize() + Tensor(3.0).realize()
 # use the real Linearizer to linearize 2+3
 from tinygrad.codegen.linearizer import Linearizer
 sched = result.lazydata.schedule()
-linearizer = Linearizer(sched[-1].ast)
+linearizer = Linearizer(sched[-1].ast, ClangCompiler.linearizer_opts)
 linearizer.linearize()
 
 # print the uops
diff --git a/docs/abstractions2.py b/docs/abstractions2.py
index 5160f165ce..9ce16d256a 100644
--- a/docs/abstractions2.py
+++ b/docs/abstractions2.py
@@ -7,7 +7,7 @@
 
 print("******** first, the runtime ***********")
 
-from tinygrad.runtime.ops_clang import ClangProgram, compile_clang, MallocAllocator
+from tinygrad.runtime.ops_clang import ClangProgram, ClangCompiler, MallocAllocator
 
 # allocate some buffers
 out = MallocAllocator.alloc(4)
@@ -19,7 +19,7 @@ MallocAllocator.copyin(a, bytearray([2,0,0,0]))
 MallocAllocator.copyin(b, bytearray([3,0,0,0]))
 
 # compile a program to a binary
-lib = compile_clang("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }")
+lib = ClangCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }")
 
 # create a runtime for the program (ctypes.CDLL)
 fxn = ClangProgram("add", lib)
diff --git a/test/test_device_speed.py b/test/test_device_speed.py
index 5084e1c708..337220d083 100644
--- a/test/test_device_speed.py
+++ b/test/test_device_speed.py
@@ -8,19 +8,19 @@ class TestDeviceSpeed(unittest.TestCase):
   @classmethod
   def setUpClass(cls):
     cls.dev = Device[Device.DEFAULT]
-    cls.empty = Device[Device.DEFAULT].renderer("test", [])
+    cls.empty = Device[Device.DEFAULT].compiler.render("test", [])
 
   def test_empty_compile(self):
     with Timing("compiler "):
-      self.dev.compiler(self.empty)
+      self.dev.compiler.compile(self.empty)
 
   def test_empty_compile_twice(self):
-    self.dev.compiler(self.empty)
+    self.dev.compiler.compile(self.empty)
     with Timing("compiler "):
-      self.dev.compiler(self.empty)
+      self.dev.compiler.compile(self.empty)
 
   def test_launch_speed(self):
-    prg_bin = self.dev.compiler(self.empty)
+    prg_bin = self.dev.compiler.compile(self.empty)
     prg = self.dev.runtime("test", prg_bin)
     for _ in range(10): prg() # ignore first launches
     with Timing("launch 1000x "):
@@ -29,7 +29,7 @@ class TestDeviceSpeed(unittest.TestCase):
       for _ in range(1000): prg(wait=True)
 
   def test_profile_launch_speed(self):
-    prg_bin = self.dev.compiler(self.empty)
+    prg_bin = self.dev.compiler.compile(self.empty)
     prg = self.dev.runtime("test", prg_bin)
     for _ in range(10): prg() # ignore first launches
     with Profiling():
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index d710691ece..d625bc387b 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -160,7 +160,7 @@ def helper_realized_ast(r:Tensor):
   output_buffer = Buffer(s[-1].out.device, prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype)
   return s[-1].ast, [output_buffer] + [l.realized for l in s[-1].inputs]
 
-@unittest.skipUnless(isinstance(Device[Device.DEFAULT], Compiled) and Device[Device.DEFAULT].linearizer_opts.supports_float4,
+@unittest.skipUnless(isinstance(Device[Device.DEFAULT], Compiled) and Device[Device.DEFAULT].compiler.linearizer_opts.supports_float4,
                      "need Compiled backends that support float4")
 class TestFloat4(unittest.TestCase):
   @staticmethod
@@ -367,7 +367,7 @@ class TestHandCodedOpts(unittest.TestCase):
     assert prod(k.full_shape[k.shape_len-k.upcasted:k.shape_len]) <= 49
 
   def test_matvec(self):
-    if not Device[Device.DEFAULT].linearizer_opts.has_local:
+    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local:
       self.skipTest("Only devices with locals")
     N = 128
     a = Tensor.rand(1, N).realize()
@@ -417,7 +417,7 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
 @unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "linearizer is only for compiled backends")
 class TestLinearizerOpts(unittest.TestCase):
   def test_local_and_grouped_reduce(self):
-    if not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
 
     N = 128
@@ -463,7 +463,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])
 
   def test_matmul(self):
-    if not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
 
     N = 128
@@ -493,7 +493,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])
 
   def test_double_reduce(self):
-    if not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
+    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local or not Device[Device.DEFAULT].compiler.linearizer_opts.has_shared:
       self.skipTest("Only Compiled uses linearizer with locals and shared")
 
     N = 128
@@ -520,7 +520,7 @@ class TestLinearizerOpts(unittest.TestCase):
     ])
 
   def test_tensor_core_opts(self):
-    if not Device[Device.DEFAULT].linearizer_opts.has_local:
+    if not Device[Device.DEFAULT].compiler.linearizer_opts.has_local:
       self.skipTest("Only Compiled uses linearizer with locals")
 
     if Device.DEFAULT not in tensor_cores: self.skipTest("No tensor cores for device")
diff --git a/test/test_uops.py b/test/test_uops.py
index 2bae41c9f4..2910b5a664 100644
--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -10,9 +10,9 @@ from tinygrad.codegen.linearizer import UOps, UOp
 from test.test_dtype import is_dtype_supported
 
 def _uops_to_prg(uops):
-  src = Device[Device.DEFAULT].renderer("test", uops)
-  return CompiledASTRunner(None, "test", src, Device[Device.DEFAULT], [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None,
"test", src, Device[Device.DEFAULT], [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None, - [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None) + src = Device[Device.DEFAULT].compiler.render("test", uops) + has_local = Device[Device.DEFAULT].compiler.linearizer_opts.has_local + return CompiledASTRunner(None, "test", src, Device[Device.DEFAULT], [1] if has_local else None, [1] if has_local else None) def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp: uops.append(UOp(uop, dtype, tuple(vin), arg)) diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 1072df3c13..1259e67f23 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -67,7 +67,8 @@ class LinearizerOptions(NamedTuple): class Kernel: def __init__(self, ast:LazyOp, opts:Optional[LinearizerOptions]=None): - self.opts = opts or (device.linearizer_opts if isinstance(device:=Device[Device.DEFAULT], Compiled) else LinearizerOptions(Device.DEFAULT)) + self.opts = opts or (device.compiler.linearizer_opts if isinstance(device:=Device[Device.DEFAULT], Compiled) else + LinearizerOptions(Device.DEFAULT)) self.ast = ast assert ast.op == BufferOps.STORE, f"kernels must have a store as the output, got {ast.op}" diff --git a/tinygrad/device.py b/tinygrad/device.py index ebfd44e682..5b13b3ab2d 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -1,6 +1,6 @@ from __future__ import annotations from collections import defaultdict -from typing import TYPE_CHECKING, Union, Any, List, Optional, Dict, Callable, Tuple, cast +from typing import TYPE_CHECKING, Union, Any, List, Optional, Dict, Callable, Tuple, cast, ClassVar import importlib, inspect, functools, pathlib, time, re, ctypes from tinygrad.dtype import DType, ImageDType from tinygrad.helpers import ansilen, DEBUG, getenv, colored, BEAM, NOOPT, all_int, to_function_name, from_mv, flat_mv, diskcache_get, diskcache_put @@ -262,6 +262,18 @@ def _get_interpreted_fxn(fxn_for_op:Dict[Op, Callable], ast:LazyOp) -> Interpret # **************** for Compiled Devices **************** +class Compiler: + linearizer_opts: ClassVar[LinearizerOptions] + def __init__(self, cachekey=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey + def render(self, name:str, uops) -> str: raise NotImplementedError("need a render function") + def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function") + def compile_cached(self, src:str) -> bytes: + if self.cachekey is not None: lib = diskcache_get(self.cachekey, src) + if lib is None: + lib = self.compile(src) + if self.cachekey is not None: diskcache_put(self.cachekey, src, lib) + return lib + class CompiledASTRunner(JITRunner): def __init__(self, ast:Optional[LazyOp], name:str, prg:str, device:Compiled, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, precompiled:Optional[bytes]=None): # noqa: E501 super().__init__() @@ -270,12 +282,7 @@ class CompiledASTRunner(JITRunner): if local_size is not None: local_size = local_size + [1]*(3-len(local_size)) self.name, self.display_name, self.prg, self.device, self.global_size, self.local_size, self.first_run = \ to_function_name(name), name, prg, device, global_size, local_size, True - lib: Optional[bytes] = precompiled - if lib is None: - if self.device.compiler_cachekey is not None: lib = diskcache_get(self.device.compiler_cachekey, prg) - if lib is None: - lib = self.device.compiler(prg) - if self.device.compiler_cachekey is not 
+    lib:bytes = precompiled if precompiled is not None else self.device.compiler.compile_cached(prg)
     self.lib, self.clprg = lib, self.device.runtime(self.name, lib)
     self.vars: List[Variable] = []
     if ast:
@@ -306,31 +313,29 @@ class CompiledASTRunner(JITRunner):
     return et
 
 class Compiled:
-  def __init__(self, device:str, allocator:Allocator, linearizer_opts:LinearizerOptions, renderer, compiler, compiler_cachekey, runtime, graph=None):
-    self.dname, self.allocator, self.linearizer_opts, self.renderer, self.compiler, self.runtime, self.graph, self.compiler_cachekey = \
-      device, allocator, linearizer_opts, renderer, compiler, runtime, graph, None if getenv("DISABLE_COMPILER_CACHE") else compiler_cachekey
+  def __init__(self, device:str, allocator:Allocator, compiler:Compiler, runtime, graph=None):
+    self.dname, self.allocator, self.compiler, self.runtime, self.graph = device, allocator, compiler, runtime, graph
   def synchronize(self): pass # override this in your device
 
   def to_program(self, k:Linearizer) -> CompiledASTRunner:
-    assert self.compiler is not None, f"compiler is None, can't build {k.ast}"
     k.linearize()
-    return CompiledASTRunner(k.ast, k.name, self.renderer(to_function_name(k.name), k.uops), self, k.global_size, k.local_size)
+    return CompiledASTRunner(k.ast, k.name, self.compiler.render(to_function_name(k.name), k.uops), self, k.global_size, k.local_size)
 
   def get_linearizer(self, ast:LazyOp) -> Linearizer:
     if DEBUG >= 3:
       from tinygrad.graph import print_tree
       print_tree(ast)
     from tinygrad.codegen.linearizer import Linearizer
-    k = Linearizer(ast, self.linearizer_opts)
+    k = Linearizer(ast, self.compiler.linearizer_opts)
     k.required_optimizations()
     if not NOOPT:
       if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
       if BEAM >= 1:
         lins = [(("tc" if used_tensor_cores else "hc"), k)]
         if used_tensor_cores:
-          lins.append(("hc", Linearizer(ast, self.linearizer_opts)))
+          lins.append(("hc", Linearizer(ast, self.compiler.linearizer_opts)))
           lins[-1][1].hand_coded_optimizations()
-        kb = Linearizer(ast, self.linearizer_opts)
+        kb = Linearizer(ast, self.compiler.linearizer_opts)
         kb.required_optimizations()
         from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
         test_rawbuffers = bufs_from_lin(kb) # allocate scratch buffers for optimization
diff --git a/tinygrad/features/search.py b/tinygrad/features/search.py
index 3680514198..91b615eaef 100644
--- a/tinygrad/features/search.py
+++ b/tinygrad/features/search.py
@@ -1,6 +1,6 @@
 from typing import Dict, List, cast, DefaultDict, Optional, Tuple, Callable
 import itertools, functools, random, math, time, multiprocessing, traceback, signal
-from tinygrad.device import Device, Compiled, Buffer, CompiledASTRunner
+from tinygrad.device import Device, Compiled, Buffer, CompiledASTRunner, Compiler
 from tinygrad.ops import MemBuffer, LazyOp
 from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, to_function_name
 from tinygrad.dtype import ImageDType
@@ -43,13 +43,13 @@ def _time_program(ast:LazyOp, rdev:Compiled, lib:bytes, global_size, local_size,
     if early_stop is not None and early_stop < tms[-1]: break
   return tms
 
-def _compile_linearizer(rdev:Compiled, lin:Linearizer, name:Optional[str]=None) -> Tuple[bytes, Optional[List[int]], Optional[List[int]]]:
+def _compile_linearizer(compiler:Compiler, lin:Linearizer, name:Optional[str]=None) -> Tuple[bytes, Optional[List[int]], Optional[List[int]]]:
   lin.linearize()
-  src = rdev.renderer(name if name is not None else to_function_name(lin.name), lin.uops) # NOTE: these all have the same name for deduping
-  return rdev.compiler(src), lin.global_size, lin.local_size
+  src = compiler.render(name if name is not None else to_function_name(lin.name), lin.uops) # NOTE: these all have the same name for deduping
+  return compiler.compile(src), lin.global_size, lin.local_size
 
-def _try_compile_linearized_w_idx(x, device:str):
-  try: return (x[0], _compile_linearizer(cast(Compiled, Device[device]), x[1], "test"))
+def _try_compile_linearized_w_idx(x, compiler:Compiler):
+  try: return (x[0], _compile_linearizer(compiler, x[1], "test"))
   except Exception:
     if DEBUG >= 4: traceback.print_exc()
     return (x[0], None)
@@ -110,7 +110,7 @@ def beam_search(lin:Linearizer, rawbufs, amt:int, allow_test_size=True) -> Linea
   while not exiting:
     acted_lins = flatten([get_linearizer_actions(lin, include_0=False).values() for lin,_ in beam]) if len(beam) else [lin]
     timed_lins: List[Tuple[Linearizer, float]] = []
-    _compile_fn = functools.partial(_try_compile_linearized_w_idx, device=lin.opts.device)
+    _compile_fn = functools.partial(_try_compile_linearized_w_idx, compiler=cast(Compiled, Device[lin.opts.device]).compiler)
     for i,proc in (pool.imap_unordered(_compile_fn, enumerate(acted_lins)) if pool is not None else map(_compile_fn, enumerate(acted_lins))):
       if proc is None: continue
       lib, global_size, local_size = proc
@@ -155,7 +155,7 @@ def time_linearizer(lin:Linearizer, rawbufs:List[Buffer], allow_test_size=True,
   assert isinstance(dev, Compiled)
 
   var_vals = {k:(k.max+k.min)//2 for k in lin.ast.vars()}
-  lib, global_size, local_size = _compile_linearizer(dev, lin)
+  lib, global_size, local_size = _compile_linearizer(dev.compiler, lin)
   tms = _time_program(lin.ast, dev, lib, global_size, local_size, var_vals, rawbufs, max_global_size=max_global_size if allow_test_size else None, clear_l2=clear_l2, cnt=cnt, name=to_function_name(lin.name)) # noqa: E501
 
   if CACHELEVEL >= 2: diskcache_put("time_linearizer", key, tms)
diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py
index 832dbf21af..b7eb5b3a83 100644
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -1,16 +1,20 @@
-import ctypes, subprocess, functools, pathlib, tempfile
-from tinygrad.device import Compiled, MallocAllocator
+import ctypes, subprocess, pathlib, tempfile
+from tinygrad.device import Compiled, MallocAllocator, Compiler
 from tinygrad.helpers import cpu_time_execution
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
 
 CLANG_PROGRAM_HEADER = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define int64 long\n#define half __fp16\n#define uchar unsigned char\n#include <stdbool.h>\n' # noqa: E501
 
-def compile_clang(prg:str, header:str=CLANG_PROGRAM_HEADER) -> bytes:
-  # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
-  with tempfile.NamedTemporaryFile(delete=True) as output_file:
-    subprocess.check_output(args=('clang -shared -march=native -O2 -Wall -Werror -x c -fPIC - -o '+str(output_file.name)).split(), input=(header+prg).encode('utf-8')) # noqa: E501
-    return pathlib.Path(output_file.name).read_bytes()
+class ClangCompiler(Compiler):
+  linearizer_opts = LinearizerOptions("CLANG", supports_float4=False, has_local=False)
+  def render(self, name:str, uops) -> str: return uops_to_cstyle(CStyleLanguage(buffer_suffix=" restrict"), name, uops)
+  def compile(self, src:str) -> bytes:
+    # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
+    with tempfile.NamedTemporaryFile(delete=True) as output_file:
+      subprocess.check_output(args=('clang -shared -march=native -O2 -Wall -Werror -x c -fPIC - -o '+ \
+        str(output_file.name)).split(), input=(CLANG_PROGRAM_HEADER+src).encode('utf-8'))
+      return pathlib.Path(output_file.name).read_bytes()
 
 class ClangProgram:
   def __init__(self, name:str, lib:bytes):
@@ -23,6 +27,4 @@ class ClangProgram:
   def __call__(self, *bufs, vals=(), wait=False): return cpu_time_execution(lambda: self.fxn(*bufs, *vals), enable=wait)
 
 class ClangDevice(Compiled):
-  def __init__(self, device:str):
-    super().__init__(device, MallocAllocator, LinearizerOptions("CLANG", supports_float4=False, has_local=False),
-                     functools.partial(uops_to_cstyle, CStyleLanguage(buffer_suffix=" restrict")), compile_clang, "compile_clang", ClangProgram)
+  def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangCompiler("compile_clang"), ClangProgram)
diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index f4451508b1..1cb72ccff9 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -4,7 +4,7 @@ from pathlib import Path
 from typing import Tuple, Optional
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, colored, cpu_time_execution, compile_cuda_style, encode_args_cuda_style, time_execution_cuda_style # noqa: E501
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import CUDARenderer
 
@@ -29,7 +29,16 @@ def check(status):
 
 def cu_time_execution(cb, enable=False) -> Optional[float]: return time_execution_cuda_style(cb, cuda.CUevent, cuda.cuEventCreate, cuda.cuEventRecord, cuda.cuEventSynchronize, cuda.cuEventDestroy_v2, cuda.cuEventElapsedTime, enable=enable) if not CUDACPU else cpu_time_execution(cb, enable=enable) # noqa: E501
 
-def compile_cuda(prg:str, arch="sm_35") -> bytes: return compile_cuda_style(prg, [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"], cuda.nvrtcProgram, cuda.nvrtcCreateProgram, cuda.nvrtcCompileProgram, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check) # noqa: E501
+class CUDACompiler(Compiler):
+  linearizer_opts = LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024])
+  def __init__(self, arch:str):
+    self.arch = arch
+    super().__init__(f"compile_cuda_{self.arch}")
+  def render(self, name:str, uops) -> str: return CUDARenderer(name, uops)
+  def compile(self, src:str) -> bytes:
+    return compile_cuda_style(src, [f'--gpu-architecture={self.arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"],
"-I/usr/include", "-I/opt/cuda/include/"], + cuda.nvrtcProgram, cuda.nvrtcCreateProgram, cuda.nvrtcCompileProgram, cuda.nvrtcGetPTX, + cuda.nvrtcGetPTXSize, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check) class CUDAProgram: def __init__(self, device:CUDADevice, name:str, lib:bytes): @@ -83,10 +92,8 @@ class CUDADevice(Compiled): self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35" from tinygrad.runtime.graph.cuda import CUDAGraph - super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator, - LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024]), - CUDARenderer, functools.partial(compile_cuda,arch=self.arch), f"compile_cuda_{self.arch}", functools.partial(CUDAProgram, self), - graph=CUDAGraph if not CUDACPU else None) + super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator, CUDACompiler(self.arch), + functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None) def synchronize(self): if not CUDACPU: check(cuda.cuCtxSetCurrent(self.context)) diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py index 1da6b397ed..9f03265f54 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -5,7 +5,7 @@ import tinygrad.runtime.autogen.opencl as cl from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG from tinygrad.codegen.kernel import LinearizerOptions from tinygrad.renderer.cstyle import OpenCLRenderer -from tinygrad.device import Compiled, LRUAllocator, BufferOptions +from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something OSX_TIMING_RATIO = (125/3) if OSX else 1.0 @@ -14,18 +14,24 @@ def check(status): if status != 0: raise RuntimeError(f"OpenCL Error {status}") def checked(ret, status): return (check(status.value), ret)[1] -def compile_cl(device:CLDevice, prg:str) -> bytes: - program = checked(cl.clCreateProgramWithSource(device.context, 1, to_char_p_p([prg_bytes := prg.encode()]), - ctypes.byref(ctypes.c_size_t(len(prg_bytes))), ctypes.byref(status := ctypes.c_int32())), status) - status = cl.clBuildProgram(program, 1, ctypes.byref(device.device_id), None, cl.clBuildProgram.argtypes[4](), None) - if status != 0: - cl.clGetProgramBuildInfo(program, device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size := ctypes.c_size_t())) - cl.clGetProgramBuildInfo(program, device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501 - raise RuntimeError(f"OpenCL Compile Error\n\n{ctypes.string_at(mstr, size=log_size.value).decode()}") - binary_sizes = init_c_var((ctypes.c_size_t * 1)(), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(x), ctypes.byref(x), None))) # noqa: E501 - binary = init_c_var(ctypes.create_string_buffer(binary_sizes[0]), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), ctypes.byref((ctypes.c_void_p * 1)(ctypes.addressof(x))), None))) # noqa: E501 - check(cl.clReleaseProgram(program)) - return bytes(binary) +class CLCompiler(Compiler): + linearizer_opts = LinearizerOptions("GPU") + def __init__(self, device:CLDevice, compile_key:str): + self.device = device + super().__init__(f"compile_cl_{compile_key}") + def render(self, name:str, uops) -> str: return OpenCLRenderer(name, uops) + def 
+    program = checked(cl.clCreateProgramWithSource(self.device.context, 1, to_char_p_p([prg_bytes := src.encode()]),
+                                                   ctypes.byref(ctypes.c_size_t(len(prg_bytes))), ctypes.byref(status := ctypes.c_int32())), status)
+    status = cl.clBuildProgram(program, 1, ctypes.byref(self.device.device_id), None, cl.clBuildProgram.argtypes[4](), None)
+    if status != 0:
+      cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size := ctypes.c_size_t()))
+      cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501
+      raise RuntimeError(f"OpenCL Compile Error\n\n{ctypes.string_at(mstr, size=log_size.value).decode()}")
+    binary_sizes = init_c_var((ctypes.c_size_t * 1)(), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(x), ctypes.byref(x), None))) # noqa: E501
+    binary = init_c_var(ctypes.create_string_buffer(binary_sizes[0]), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), ctypes.byref((ctypes.c_void_p * 1)(ctypes.addressof(x))), None))) # noqa: E501
+    check(cl.clReleaseProgram(program))
+    return bytes(binary)
 
 class CLProgram:
   def __init__(self, device:CLDevice, name:str, lib:bytes):
@@ -96,8 +102,7 @@ class CLDevice(Compiled):
     self.pending_copyin: List[memoryview] = []
 
     compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
-    super().__init__(device, CLAllocator(self), LinearizerOptions("GPU"), OpenCLRenderer,
-                     functools.partial(compile_cl, self), f"compile_cl_{compile_key}", functools.partial(CLProgram, self))
+    super().__init__(device, CLAllocator(self), CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
   def synchronize(self):
     check(cl.clFinish(self.queue))
     self.pending_copyin.clear()
diff --git a/tinygrad/runtime/ops_hip.py b/tinygrad/runtime/ops_hip.py
index ba12dcc6d8..7225da2ffa 100644
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -4,7 +4,7 @@ from typing import Tuple, TypeVar, List, Any, cast, Set
 import tinygrad.runtime.autogen.hip as hip
 from tinygrad.helpers import DEBUG, getenv, init_c_var
 from tinygrad.helpers import from_mv, round_up, to_mv, colored, init_c_struct_t
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats
+from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats, Compiler
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.runtime.compiler.hip_comgr import compile_hip
@@ -12,6 +12,14 @@ from tinygrad.runtime.compiler.hip_comgr import compile_hip
 # The default HIP stream is used for everything.
 MOCKHIP = getenv("MOCKHIP") # for CI. don't run kernels, only check if they compile
 
+class HIPCompiler(Compiler):
+  linearizer_opts = LinearizerOptions("HIP")
+  def __init__(self, arch:str):
+    self.arch = arch
+    super().__init__(f"compile_hip_{self.arch}")
+  def render(self, name:str, uops) -> str: return HIPRenderer(name, uops)
+  def compile(self, src:str) -> bytes: return compile_hip(src, self.arch)
+
 hip_current_device = None
 def hip_set_device(d:int):
   global hip_current_device
@@ -132,8 +140,8 @@ class HIPDevice(Compiled):
     self.peers: Set[int] = set()
 
     from tinygrad.runtime.graph.hip import HIPGraph
-    super().__init__(device, MallocAllocator if MOCKHIP else HIPAllocator(self), LinearizerOptions("HIP"), HIPRenderer,
-                     functools.partial(compile_hip,arch=self.arch), f"compile_hip_{self.arch}", functools.partial(HIPProgram, self.device), HIPGraph)
+    super().__init__(device, MallocAllocator if MOCKHIP else HIPAllocator(self), HIPCompiler(self.arch),
+                     functools.partial(HIPProgram, self.device), HIPGraph)
   def synchronize(self):
     hip_set_device(self.device)
     check(hip.hipDeviceSynchronize())
diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py
index a5ec322dea..21ef7a6dd9 100644
--- a/tinygrad/runtime/ops_llvm.py
+++ b/tinygrad/runtime/ops_llvm.py
@@ -1,18 +1,24 @@
 from __future__ import annotations
 import ctypes, functools
 from typing import Tuple
-from tinygrad.device import Compiled, MallocAllocator
+from tinygrad.device import Compiled, MallocAllocator, Compiler
 from tinygrad.helpers import DEBUG, cpu_time_execution
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.llvmir import uops_to_llvm_ir
 import llvmlite.binding as llvm
 
-def compile_llvm(device, prg) -> bytes:
-  mod = llvm.parse_assembly(prg)
-  mod.verify()
-  device.optimizer.run(mod)
-  if DEBUG >= 5: print(device.target_machine.emit_assembly(mod))
-  return device.target_machine.emit_object(mod)
+class LLVMCompiler(Compiler):
+  linearizer_opts = LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)
+  def __init__(self, device:LLVMDevice):
+    self.device = device
+    super().__init__("compile_llvm")
+  def render(self, name:str, uops) -> str: return uops_to_llvm_ir(name, uops)
+  def compile(self, src:str) -> bytes:
+    mod = llvm.parse_assembly(src)
+    mod.verify()
+    self.device.optimizer.run(mod)
+    if DEBUG >= 5: print(self.device.target_machine.emit_assembly(mod))
+    return self.device.target_machine.emit_object(mod)
 
 class LLVMProgram:
   def __init__(self, device:LLVMDevice, name:str, lib:bytes):
@@ -38,5 +44,4 @@ class LLVMDevice(Compiled):
     backing_mod = llvm.parse_assembly(str())
     backing_mod.triple = llvm.get_process_triple()
    self.engine: llvm.executionengine.ExecutionEngine = llvm.create_mcjit_compiler(backing_mod, self.target_machine)
-    super().__init__(device, MallocAllocator, LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False),
-                     uops_to_llvm_ir, functools.partial(compile_llvm, self), "compile_llvm", functools.partial(LLVMProgram, self))
+    super().__init__(device, MallocAllocator, LLVMCompiler(self), functools.partial(LLVMProgram, self))
diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py
index 39b6a98730..ae6a186f95 100644
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@@ -4,18 +4,24 @@ import Metal, libdispatch
 from typing import List, Any, Tuple, Optional
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
-from tinygrad.device import Compiled, LRUAllocator
+from tinygrad.device import Compiled, LRUAllocator, Compiler
 from tinygrad.renderer.cstyle import MetalRenderer
 
-def compile_metal_xcode(prg:str) -> bytes:
-  # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
-  air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8'))
-  return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
-
-def compile_metal(device, prg:str) -> bytes:
-  options = Metal.MTLCompileOptions.new()
-  library = unwrap2(device.newLibraryWithSource_options_error_(prg, options, None))
-  return library.libraryDataContents().bytes().tobytes()
+class MetalCompiler(Compiler):
+  linearizer_opts = LinearizerOptions("METAL")
+  def __init__(self, device:Optional[MetalDevice]):
+    self.device = device
+    super().__init__("compile_metal")
+  def render(self, name:str, uops) -> str: return MetalRenderer(name, uops)
+  def compile(self, src:str) -> bytes:
+    if self.device is None:
+      # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
+      air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=src.encode('utf-8'))
+      return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
+    else:
+      options = Metal.MTLCompileOptions.new()
+      library = unwrap2(self.device.device.newLibraryWithSource_options_error_(src, options, None))
+      return library.libraryDataContents().bytes().tobytes()
 
 class MetalProgram:
   def __init__(self, device:MetalDevice, name:str, lib:bytes):
@@ -78,8 +84,7 @@ class MetalDevice(Compiled):
     self.mtl_buffers_in_flight: List[Any] = []
     self.mv_in_metal: List[memoryview] = []
     from tinygrad.runtime.graph.metal import MetalGraph
-    super().__init__(device, MetalAllocator(self), LinearizerOptions("METAL"), MetalRenderer,
-                     compile_metal_xcode if getenv("METAL_XCODE") else functools.partial(compile_metal, self.device), "compile_metal",
+    super().__init__(device, MetalAllocator(self), MetalCompiler(None if getenv("METAL_XCODE") else self),
                      functools.partial(MetalProgram, self), functools.partial(MetalGraph, self))
   def synchronize(self):
     for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
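
Every backend hunk above follows the same pattern: the old per-device (linearizer_opts, renderer, compiler, compiler_cachekey) arguments collapse into a single Compiler object, and Compiled.__init__ now takes that object plus the runtime. A minimal sketch of a hypothetical backend against this interface; the Echo* names are illustrative only, while the Compiler/Compiled signatures come from the tinygrad/device.py hunk above:

from typing import ClassVar
from tinygrad.device import Compiled, Compiler, MallocAllocator
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage

class EchoCompiler(Compiler):
  # class-level options, read through device.compiler.linearizer_opts by Kernel and the beam search
  linearizer_opts: ClassVar[LinearizerOptions] = LinearizerOptions("ECHO", supports_float4=False, has_local=False)
  def render(self, name:str, uops) -> str: return uops_to_cstyle(CStyleLanguage(), name, uops)
  def compile(self, src:str) -> bytes: return src.encode()  # a real backend would invoke its toolchain here

class EchoProgram:
  # runtime stub: a real backend would load lib and launch the kernel
  def __init__(self, name:str, lib:bytes): self.name, self.lib = name, lib
  def __call__(self, *bufs, vals=(), wait=False): print(f"would run {self.name}")

class EchoDevice(Compiled):
  # the cachekey handed to Compiler.__init__ enables the diskcache path in compile_cached
  def __init__(self, device:str): super().__init__(device, MallocAllocator, EchoCompiler("compile_echo"), EchoProgram)

The diskcache handling that CompiledASTRunner previously open-coded now lives in Compiler.compile_cached, so precompiled binaries skip it entirely and DISABLE_COMPILER_CACHE is checked in a single place.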