compiler support (#3260)

* compiler support

* revert that

* fix tests
This commit is contained in:
George Hotz
2024-01-26 23:36:40 -08:00
committed by GitHub
parent 4273aabe31
commit 3c728d1082
14 changed files with 137 additions and 99 deletions

View File

@@ -1,16 +1,20 @@
import ctypes, subprocess, functools, pathlib, tempfile
from tinygrad.device import Compiled, MallocAllocator
import ctypes, subprocess, pathlib, tempfile
from tinygrad.device import Compiled, MallocAllocator, Compiler
from tinygrad.helpers import cpu_time_execution
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
# C preamble that is concatenated in front of the program source before it is piped to clang:
# it includes math.h and stdbool.h and defines the max, int64, half and uchar macros.
CLANG_PROGRAM_HEADER = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define int64 long\n#define half __fp16\n#define uchar unsigned char\n#include <stdbool.h>\n' # noqa: E501
def compile_clang(prg:str, header:str=CLANG_PROGRAM_HEADER) -> bytes:
  """Compile C source into a shared library with clang and return the library's bytes.

  Args:
    prg: C source of the program, fed to clang on stdin.
    header: text placed in front of `prg` before compilation.

  Returns:
    The contents of the file clang wrote with `-o`.

  Raises:
    subprocess.CalledProcessError: if clang exits with a non-zero status.
  """
  # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
  with tempfile.NamedTemporaryFile(delete=True) as output_file:
    # build argv as a list: joining the path into one string and calling .split() would
    # break the output path into several arguments when the temp directory contains whitespace
    args = ['clang', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-', '-o', str(output_file.name)]
    subprocess.check_output(args=args, input=(header+prg).encode('utf-8'))
    # read the result back while the temporary file still exists
    return pathlib.Path(output_file.name).read_bytes()
class ClangCompiler(Compiler):
  """Compiler for the CLANG backend: renders uops to C and builds the C into a shared library with clang."""
  linearizer_opts = LinearizerOptions("CLANG", supports_float4=False, has_local=False)

  def render(self, name:str, uops) -> str:
    """Return the C source produced by uops_to_cstyle for kernel `name`."""
    return uops_to_cstyle(CStyleLanguage(buffer_suffix=" restrict"), name, uops)

  def compile(self, src:str) -> bytes:
    """Compile `src` (with CLANG_PROGRAM_HEADER in front of it) and return the shared library's bytes.

    Raises:
      subprocess.CalledProcessError: if clang exits with a non-zero status.
    """
    # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
    with tempfile.NamedTemporaryFile(delete=True) as output_file:
      # build argv as a list: joining the path into one string and calling .split() would
      # break the output path into several arguments when the temp directory contains whitespace
      args = ['clang', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-', '-o', str(output_file.name)]
      subprocess.check_output(args=args, input=(CLANG_PROGRAM_HEADER+src).encode('utf-8'))
      # read the result back while the temporary file still exists
      return pathlib.Path(output_file.name).read_bytes()
class ClangProgram:
def __init__(self, name:str, lib:bytes):
@@ -23,6 +27,4 @@ class ClangProgram:
def __call__(self, *bufs, vals=(), wait=False):
  """Call the loaded function with the buffers followed by the extra values.

  The call goes through cpu_time_execution, with `wait` forwarded as its `enable` argument;
  whatever cpu_time_execution returns is returned unchanged.
  """
  def launch(): return self.fxn(*bufs, *vals)
  return cpu_time_execution(launch, enable=wait)
# Device class for the CLANG backend, built on Compiled with MallocAllocator and ClangProgram.
# NOTE(review): this listing looks like a rendered diff, so two __init__ definitions appear back to back;
# presumably the first is the pre-change version and the second the post-change one — confirm against the repository.
class ClangDevice(Compiled):
# variant that passes LinearizerOptions, a uops_to_cstyle partial, compile_clang and the "compile_clang" string separately
def __init__(self, device:str):
super().__init__(device, MallocAllocator, LinearizerOptions("CLANG", supports_float4=False, has_local=False),
functools.partial(uops_to_cstyle, CStyleLanguage(buffer_suffix=" restrict")), compile_clang, "compile_clang", ClangProgram)
# variant that passes a single ClangCompiler("compile_clang") object in place of those four arguments
def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangCompiler("compile_clang"), ClangProgram)

View File

@@ -4,7 +4,7 @@ from pathlib import Path
from typing import Tuple, Optional
import tinygrad.runtime.autogen.cuda as cuda
from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, colored, cpu_time_execution, compile_cuda_style, encode_args_cuda_style, time_execution_cuda_style # noqa: E501
from tinygrad.device import Compiled, LRUAllocator, MallocAllocator
from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.cstyle import CUDARenderer
@@ -29,7 +29,16 @@ def check(status):
def cu_time_execution(cb, enable=False) -> Optional[float]:
  """Time `cb`, choosing the timer by the CUDACPU flag.

  With CUDACPU set the call is delegated to cpu_time_execution; otherwise it goes to
  time_execution_cuda_style together with the CUDA event type and event functions.
  `enable` is forwarded unchanged in both cases.
  """
  if CUDACPU:
    return cpu_time_execution(cb, enable=enable)
  return time_execution_cuda_style(cb, cuda.CUevent, cuda.cuEventCreate, cuda.cuEventRecord, cuda.cuEventSynchronize,
                                   cuda.cuEventDestroy_v2, cuda.cuEventElapsedTime, enable=enable)
def compile_cuda(prg:str, arch="sm_35") -> bytes:
  """Compile CUDA source `prg` for architecture `arch` through compile_cuda_style and the NVRTC bindings.

  Args:
    prg: source text of the program.
    arch: value placed in the `--gpu-architecture=` option.

  Returns:
    Whatever compile_cuda_style returns for this program (annotated as bytes).
  """
  # architecture flag first, then the fixed include search paths
  nvrtc_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
  return compile_cuda_style(prg, nvrtc_options, cuda.nvrtcProgram, cuda.nvrtcCreateProgram, cuda.nvrtcCompileProgram,
                            cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check)
class CUDACompiler(Compiler):
  """Compiler for the CUDA backend, bound to one architecture string."""
  linearizer_opts = LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024])

  def __init__(self, arch:str):
    """Store `arch` and pass an arch-specific name ("compile_cuda_<arch>") to the base class."""
    self.arch = arch
    super().__init__(f"compile_cuda_{arch}")

  def render(self, name:str, uops) -> str:
    """Return the source CUDARenderer produces for kernel `name`."""
    return CUDARenderer(name, uops)

  def compile(self, src:str) -> bytes:
    """Compile `src` for self.arch through compile_cuda_style and the NVRTC bindings."""
    # architecture flag first, then the fixed include search paths
    nvrtc_options = [f'--gpu-architecture={self.arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
    return compile_cuda_style(src, nvrtc_options, cuda.nvrtcProgram, cuda.nvrtcCreateProgram, cuda.nvrtcCompileProgram,
                              cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check)
class CUDAProgram:
def __init__(self, device:CUDADevice, name:str, lib:bytes):
@@ -83,10 +92,8 @@ class CUDADevice(Compiled):
self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35"
from tinygrad.runtime.graph.cuda import CUDAGraph
super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator,
LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024]),
CUDARenderer, functools.partial(compile_cuda,arch=self.arch), f"compile_cuda_{self.arch}", functools.partial(CUDAProgram, self),
graph=CUDAGraph if not CUDACPU else None)
super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator, CUDACompiler(self.arch),
functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
def synchronize(self):
if not CUDACPU:
check(cuda.cuCtxSetCurrent(self.context))

View File

@@ -5,7 +5,7 @@ import tinygrad.runtime.autogen.opencl as cl
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.cstyle import OpenCLRenderer
from tinygrad.device import Compiled, LRUAllocator, BufferOptions
from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
# see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
OSX_TIMING_RATIO = (125/3) if OSX else 1.0
@@ -14,18 +14,24 @@ def check(status):
if status != 0: raise RuntimeError(f"OpenCL Error {status}")
def checked(ret, status):
  """Run check() on `status.value`, then return `ret` unchanged."""
  check(status.value)
  return ret
def compile_cl(device:CLDevice, prg:str) -> bytes:
  """Build OpenCL source for `device` and return the resulting program binary.

  Args:
    device: object providing the `context` and `device_id` used for the OpenCL calls.
    prg: OpenCL source text.

  Returns:
    The bytes of the buffer filled by the CL_PROGRAM_BINARIES query.

  Raises:
    RuntimeError: carrying the build log when clBuildProgram returns a non-zero status, or raised
      by check()/checked() when another OpenCL call reports a non-zero status.
  """
  program = checked(cl.clCreateProgramWithSource(device.context, 1, to_char_p_p([prg_bytes := prg.encode()]),
                                                 ctypes.byref(ctypes.c_size_t(len(prg_bytes))), ctypes.byref(status := ctypes.c_int32())), status)
  try:
    status = cl.clBuildProgram(program, 1, ctypes.byref(device.device_id), None, cl.clBuildProgram.argtypes[4](), None)
    if status != 0:
      # the first query only asks for the log length, the second fills a buffer of that length
      cl.clGetProgramBuildInfo(program, device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size := ctypes.c_size_t()))
      cl.clGetProgramBuildInfo(program, device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value,
                               mstr := ctypes.create_string_buffer(log_size.value), None)
      raise RuntimeError(f"OpenCL Compile Error\n\n{ctypes.string_at(mstr, size=log_size.value).decode()}")
    binary_sizes = init_c_var((ctypes.c_size_t * 1)(),
                              lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(x), ctypes.byref(x), None)))
    binary = init_c_var(ctypes.create_string_buffer(binary_sizes[0]),
                        lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p),
                                                            ctypes.byref((ctypes.c_void_p * 1)(ctypes.addressof(x))), None)))
  finally:
    # release the program handle on every path; a failed build or query used to leave it unreleased
    release_status = cl.clReleaseProgram(program)
  # only reached on success, so a release error never masks the original exception
  check(release_status)
  return bytes(binary)
class CLCompiler(Compiler):
  """Compiler for the OpenCL ("GPU") backend, bound to one CLDevice."""
  linearizer_opts = LinearizerOptions("GPU")

  def __init__(self, device:CLDevice, compile_key:str):
    """Keep a reference to `device` and pass "compile_cl_<compile_key>" to the base class.

    NOTE(review): CLDevice currently passes an already-prefixed f"compile_cl_{compile_key}" here, so the
    name handed to the base class comes out as "compile_cl_compile_cl_<hash>" — confirm whether that is intended.
    """
    self.device = device
    super().__init__(f"compile_cl_{compile_key}")

  def render(self, name:str, uops) -> str:
    """Return the source OpenCLRenderer produces for kernel `name`."""
    return OpenCLRenderer(name, uops)

  def compile(self, src:str) -> bytes:
    """Build OpenCL source on self.device and return the resulting program binary.

    Raises:
      RuntimeError: carrying the build log when clBuildProgram returns a non-zero status, or raised
        by check()/checked() when another OpenCL call reports a non-zero status.
    """
    program = checked(cl.clCreateProgramWithSource(self.device.context, 1, to_char_p_p([prg_bytes := src.encode()]),
                                                   ctypes.byref(ctypes.c_size_t(len(prg_bytes))), ctypes.byref(status := ctypes.c_int32())), status)
    try:
      status = cl.clBuildProgram(program, 1, ctypes.byref(self.device.device_id), None, cl.clBuildProgram.argtypes[4](), None)
      if status != 0:
        # the first query only asks for the log length, the second fills a buffer of that length
        cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size := ctypes.c_size_t()))
        cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value,
                                 mstr := ctypes.create_string_buffer(log_size.value), None)
        raise RuntimeError(f"OpenCL Compile Error\n\n{ctypes.string_at(mstr, size=log_size.value).decode()}")
      binary_sizes = init_c_var((ctypes.c_size_t * 1)(),
                                lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(x), ctypes.byref(x), None)))
      binary = init_c_var(ctypes.create_string_buffer(binary_sizes[0]),
                          lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p),
                                                              ctypes.byref((ctypes.c_void_p * 1)(ctypes.addressof(x))), None)))
    finally:
      # release the program handle on every path; a failed build or query used to leave it unreleased
      release_status = cl.clReleaseProgram(program)
    # only reached on success, so a release error never masks the original exception
    check(release_status)
    return bytes(binary)
class CLProgram:
def __init__(self, device:CLDevice, name:str, lib:bytes):
@@ -96,8 +102,7 @@ class CLDevice(Compiled):
self.pending_copyin: List[memoryview] = []
compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
super().__init__(device, CLAllocator(self), LinearizerOptions("GPU"), OpenCLRenderer,
functools.partial(compile_cl, self), f"compile_cl_{compile_key}", functools.partial(CLProgram, self))
super().__init__(device, CLAllocator(self), CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
def synchronize(self):
check(cl.clFinish(self.queue))
self.pending_copyin.clear()

View File

@@ -4,7 +4,7 @@ from typing import Tuple, TypeVar, List, Any, cast, Set
import tinygrad.runtime.autogen.hip as hip
from tinygrad.helpers import DEBUG, getenv, init_c_var
from tinygrad.helpers import from_mv, round_up, to_mv, colored, init_c_struct_t
from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats
from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats, Compiler
from tinygrad.renderer.cstyle import HIPRenderer
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.runtime.compiler.hip_comgr import compile_hip
@@ -12,6 +12,14 @@ from tinygrad.runtime.compiler.hip_comgr import compile_hip
# The default HIP stream is used for everything.
MOCKHIP = getenv("MOCKHIP") # for CI. don't run kernels, only check if they compile
class HIPCompiler(Compiler):
  """Compiler for the HIP backend, bound to one architecture string."""
  linearizer_opts = LinearizerOptions("HIP")

  def __init__(self, arch:str):
    """Store `arch` and pass an arch-specific name ("compile_hip_<arch>") to the base class."""
    self.arch = arch
    super().__init__(f"compile_hip_{arch}")

  def render(self, name:str, uops) -> str:
    """Return the source HIPRenderer produces for kernel `name`."""
    return HIPRenderer(name, uops)

  def compile(self, src:str) -> bytes:
    """Compile `src` for self.arch by delegating to compile_hip."""
    return compile_hip(src, self.arch)
hip_current_device = None
def hip_set_device(d:int):
global hip_current_device
@@ -132,8 +140,8 @@ class HIPDevice(Compiled):
self.peers: Set[int] = set()
from tinygrad.runtime.graph.hip import HIPGraph
super().__init__(device, MallocAllocator if MOCKHIP else HIPAllocator(self), LinearizerOptions("HIP"), HIPRenderer,
functools.partial(compile_hip,arch=self.arch), f"compile_hip_{self.arch}", functools.partial(HIPProgram, self.device), HIPGraph)
super().__init__(device, MallocAllocator if MOCKHIP else HIPAllocator(self), HIPCompiler(self.arch),
functools.partial(HIPProgram, self.device), HIPGraph)
def synchronize(self):
hip_set_device(self.device)
check(hip.hipDeviceSynchronize())

View File

@@ -1,18 +1,24 @@
from __future__ import annotations
import ctypes, functools
from typing import Tuple
from tinygrad.device import Compiled, MallocAllocator
from tinygrad.device import Compiled, MallocAllocator, Compiler
from tinygrad.helpers import DEBUG, cpu_time_execution
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.llvmir import uops_to_llvm_ir
import llvmlite.binding as llvm
def compile_llvm(device, prg) -> bytes:
  """Parse LLVM assembly text, verify it, run the device's optimizer on it and emit an object.

  Args:
    device: object providing `optimizer` and `target_machine`.
    prg: LLVM assembly text accepted by llvm.parse_assembly.

  Returns:
    The result of device.target_machine.emit_object for the optimized module.
  """
  module = llvm.parse_assembly(prg)
  module.verify()
  device.optimizer.run(module)
  # at DEBUG >= 5 the target assembly is printed before the object is emitted
  if DEBUG >= 5:
    print(device.target_machine.emit_assembly(module))
  object_code = device.target_machine.emit_object(module)
  return object_code
class LLVMCompiler(Compiler):
  """Compiler for the LLVM backend; uses the optimizer and target machine of the LLVMDevice it is given."""
  linearizer_opts = LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)

  def __init__(self, device:LLVMDevice):
    """Keep a reference to `device` and pass the name "compile_llvm" to the base class."""
    self.device = device
    super().__init__("compile_llvm")

  def render(self, name:str, uops) -> str:
    """Return the LLVM IR text uops_to_llvm_ir produces for kernel `name`."""
    return uops_to_llvm_ir(name, uops)

  def compile(self, src:str) -> bytes:
    """Parse `src`, verify it, run the device's optimizer on it and emit an object."""
    module = llvm.parse_assembly(src)
    module.verify()
    self.device.optimizer.run(module)
    machine = self.device.target_machine
    # at DEBUG >= 5 the target assembly is printed before the object is emitted
    if DEBUG >= 5:
      print(machine.emit_assembly(module))
    return machine.emit_object(module)
class LLVMProgram:
def __init__(self, device:LLVMDevice, name:str, lib:bytes):
@@ -38,5 +44,4 @@ class LLVMDevice(Compiled):
backing_mod = llvm.parse_assembly(str())
backing_mod.triple = llvm.get_process_triple()
self.engine: llvm.executionengine.ExecutionEngine = llvm.create_mcjit_compiler(backing_mod, self.target_machine)
super().__init__(device, MallocAllocator, LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False),
uops_to_llvm_ir, functools.partial(compile_llvm, self), "compile_llvm", functools.partial(LLVMProgram, self))
super().__init__(device, MallocAllocator, LLVMCompiler(self), functools.partial(LLVMProgram, self))

View File

@@ -4,18 +4,24 @@ import Metal, libdispatch
from typing import List, Any, Tuple, Optional
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
from tinygrad.device import Compiled, LRUAllocator
from tinygrad.device import Compiled, LRUAllocator, Compiler
from tinygrad.renderer.cstyle import MetalRenderer
def compile_metal_xcode(prg:str) -> bytes:
  """Compile Metal source with the xcrun command-line tools and return the metallib output.

  Two processes run in sequence: `metal` reads the source on stdin and writes "air" to stdout,
  then `metallib` reads that on stdin and writes the library to stdout.

  Raises:
    subprocess.CalledProcessError: if either tool exits with a non-zero status.
  """
  xcrun_prefix = ['xcrun', '-sdk', 'macosx']
  # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
  air = subprocess.check_output(xcrun_prefix + ['metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8'))
  metallib = subprocess.check_output(xcrun_prefix + ['metallib', '-', '-o', '-'], input=air)
  return metallib
def compile_metal(device, prg:str) -> bytes:
  """Compile Metal source in-process via `device` and return the library data as bytes.

  Args:
    device: object exposing newLibraryWithSource_options_error_.
    prg: Metal source text.
  """
  compile_options = Metal.MTLCompileOptions.new()
  # unwrap2 receives the raw result of the library creation call and yields the library
  library = unwrap2(device.newLibraryWithSource_options_error_(prg, compile_options, None))
  contents = library.libraryDataContents()
  return contents.bytes().tobytes()
class MetalCompiler(Compiler):
  """Compiler for the METAL backend.

  Built with a MetalDevice it compiles in-process through that device; built with None it
  shells out to the xcrun `metal` and `metallib` tools instead.
  """
  linearizer_opts = LinearizerOptions("METAL")

  def __init__(self, device:Optional[MetalDevice]):
    """Keep `device` (possibly None) and pass the name "compile_metal" to the base class."""
    self.device = device
    super().__init__("compile_metal")

  def render(self, name:str, uops) -> str:
    """Return the source MetalRenderer produces for kernel `name`."""
    return MetalRenderer(name, uops)

  def compile(self, src:str) -> bytes:
    """Compile Metal source `src` and return the library bytes."""
    if self.device is not None:
      # in-process path: ask the wrapped Metal device object to build the library
      compile_options = Metal.MTLCompileOptions.new()
      library = unwrap2(self.device.device.newLibraryWithSource_options_error_(src, compile_options, None))
      return library.libraryDataContents().bytes().tobytes()
    # command-line path, taken when no device was supplied
    # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
    air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=src.encode('utf-8'))
    return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
class MetalProgram:
def __init__(self, device:MetalDevice, name:str, lib:bytes):
@@ -78,8 +84,7 @@ class MetalDevice(Compiled):
self.mtl_buffers_in_flight: List[Any] = []
self.mv_in_metal: List[memoryview] = []
from tinygrad.runtime.graph.metal import MetalGraph
super().__init__(device, MetalAllocator(self), LinearizerOptions("METAL"), MetalRenderer,
compile_metal_xcode if getenv("METAL_XCODE") else functools.partial(compile_metal, self.device), "compile_metal",
super().__init__(device, MetalAllocator(self), MetalCompiler(None if getenv("METAL_XCODE") else self),
functools.partial(MetalProgram, self), functools.partial(MetalGraph, self))
def synchronize(self):
for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted()