mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-02-15 00:55:11 -05:00
@@ -1,16 +1,20 @@
|
||||
import ctypes, subprocess, functools, pathlib, tempfile
|
||||
from tinygrad.device import Compiled, MallocAllocator
|
||||
import ctypes, subprocess, pathlib, tempfile
|
||||
from tinygrad.device import Compiled, MallocAllocator, Compiler
|
||||
from tinygrad.helpers import cpu_time_execution
|
||||
from tinygrad.codegen.kernel import LinearizerOptions
|
||||
from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
|
||||
|
||||
# C preamble that is concatenated in front of the program source handed to clang: it includes math.h and
# stdbool.h and #defines `max`, `int64`, `half` and `uchar`.
# NOTE(review): the `max` macro does not parenthesize its arguments -- presumably the renderer only emits
# arguments that are safe to paste as-is; confirm against the code generator.
CLANG_PROGRAM_HEADER = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define int64 long\n#define half __fp16\n#define uchar unsigned char\n#include <stdbool.h>\n' # noqa: E501
|
||||
|
||||
def compile_clang(prg:str, header:str=CLANG_PROGRAM_HEADER) -> bytes:
  """Build `header + prg` into a shared library with clang and return the library's bytes.

  The source is fed to clang on stdin; the output goes to a temporary file that is read back
  before the file is removed. Raises subprocess.CalledProcessError when clang exits non-zero.
  """
  # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
  with tempfile.NamedTemporaryFile(delete=True) as output_file:
    # argv is an explicit list: building one string and calling .split() would tear the output
    # path apart whenever the temp directory contains whitespace
    args = ['clang', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-', '-o', str(output_file.name)]
    subprocess.check_output(args=args, input=(header+prg).encode('utf-8'))
    # read while still inside the `with`: the temp file is deleted on exit
    return pathlib.Path(output_file.name).read_bytes()
|
||||
class ClangCompiler(Compiler):
  """Compiler backend that renders uops as C source and builds it into a shared library with clang."""
  linearizer_opts = LinearizerOptions("CLANG", supports_float4=False, has_local=False)

  def render(self, name:str, uops) -> str:
    """Render `uops` as C source for kernel `name`, using " restrict" as the buffer suffix."""
    return uops_to_cstyle(CStyleLanguage(buffer_suffix=" restrict"), name, uops)

  def compile(self, src:str) -> bytes:
    """Compile `CLANG_PROGRAM_HEADER + src` with clang and return the shared library's bytes.

    Raises subprocess.CalledProcessError when clang exits non-zero.
    """
    # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
    with tempfile.NamedTemporaryFile(delete=True) as output_file:
      # argv is an explicit list: building one string and calling .split() would tear the output
      # path apart whenever the temp directory contains whitespace
      args = ['clang', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-', '-o', str(output_file.name)]
      subprocess.check_output(args=args, input=(CLANG_PROGRAM_HEADER+src).encode('utf-8'))
      # read while still inside the `with`: the temp file is deleted on exit
      return pathlib.Path(output_file.name).read_bytes()
|
||||
|
||||
class ClangProgram:
|
||||
def __init__(self, name:str, lib:bytes):
|
||||
@@ -23,6 +27,4 @@ class ClangProgram:
|
||||
def __call__(self, *bufs, vals=(), wait=False):
  # run the loaded function on the buffers followed by the extra values; cpu_time_execution receives
  # the launch callable together with `wait` as its `enable` flag
  def launch(): return self.fxn(*bufs, *vals)
  return cpu_time_execution(launch, enable=wait)
|
||||
|
||||
class ClangDevice(Compiled):
  """Compiled device wired up with MallocAllocator, a ClangCompiler and ClangProgram."""
  # only one constructor is kept: an earlier `__init__` in the class body would be silently
  # shadowed by this one and would reference `functools`, which is no longer imported
  def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangCompiler("compile_clang"), ClangProgram)
|
||||
|
||||
@@ -4,7 +4,7 @@ from pathlib import Path
|
||||
from typing import Tuple, Optional
|
||||
import tinygrad.runtime.autogen.cuda as cuda
|
||||
from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, colored, cpu_time_execution, compile_cuda_style, encode_args_cuda_style, time_execution_cuda_style # noqa: E501
|
||||
from tinygrad.device import Compiled, LRUAllocator, MallocAllocator
|
||||
from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler
|
||||
from tinygrad.codegen.kernel import LinearizerOptions
|
||||
from tinygrad.renderer.cstyle import CUDARenderer
|
||||
|
||||
@@ -29,7 +29,16 @@ def check(status):
|
||||
|
||||
def cu_time_execution(cb, enable=False) -> Optional[float]:
  """Time `cb` via the CUDA event entry points, or via cpu_time_execution when CUDACPU is set."""
  if CUDACPU: return cpu_time_execution(cb, enable=enable)
  return time_execution_cuda_style(cb, cuda.CUevent, cuda.cuEventCreate, cuda.cuEventRecord, cuda.cuEventSynchronize,
                                   cuda.cuEventDestroy_v2, cuda.cuEventElapsedTime, enable=enable)
|
||||
|
||||
def compile_cuda(prg:str, arch="sm_35") -> bytes:
  """Compile `prg` for GPU architecture `arch` by passing the NVRTC entry points to compile_cuda_style."""
  compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
  return compile_cuda_style(prg, compile_options, cuda.nvrtcProgram, cuda.nvrtcCreateProgram, cuda.nvrtcCompileProgram,
                            cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check)
|
||||
class CUDACompiler(Compiler):
  """Compiler backend for CUDA: renders with CUDARenderer and compiles through the NVRTC entry points."""
  linearizer_opts = LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024])

  def __init__(self, arch:str):
    # the architecture string is stored for compile() and also becomes part of the key given to the base class
    self.arch = arch
    key = f"compile_cuda_{self.arch}"
    super().__init__(key)

  def render(self, name:str, uops) -> str:
    return CUDARenderer(name, uops)

  def compile(self, src:str) -> bytes:
    """Compile `src` for `self.arch` via compile_cuda_style."""
    compile_options = [f'--gpu-architecture={self.arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
    return compile_cuda_style(src, compile_options, cuda.nvrtcProgram, cuda.nvrtcCreateProgram, cuda.nvrtcCompileProgram,
                              cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check)
|
||||
|
||||
class CUDAProgram:
|
||||
def __init__(self, device:CUDADevice, name:str, lib:bytes):
|
||||
@@ -83,10 +92,8 @@ class CUDADevice(Compiled):
|
||||
self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35"
|
||||
|
||||
from tinygrad.runtime.graph.cuda import CUDAGraph
|
||||
super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator,
|
||||
LinearizerOptions("CUDA", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024]),
|
||||
CUDARenderer, functools.partial(compile_cuda,arch=self.arch), f"compile_cuda_{self.arch}", functools.partial(CUDAProgram, self),
|
||||
graph=CUDAGraph if not CUDACPU else None)
|
||||
super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator, CUDACompiler(self.arch),
|
||||
functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
|
||||
def synchronize(self):
|
||||
if not CUDACPU:
|
||||
check(cuda.cuCtxSetCurrent(self.context))
|
||||
|
||||
@@ -5,7 +5,7 @@ import tinygrad.runtime.autogen.opencl as cl
|
||||
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
|
||||
from tinygrad.codegen.kernel import LinearizerOptions
|
||||
from tinygrad.renderer.cstyle import OpenCLRenderer
|
||||
from tinygrad.device import Compiled, LRUAllocator, BufferOptions
|
||||
from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
|
||||
|
||||
# see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
|
||||
OSX_TIMING_RATIO = (125/3) if OSX else 1.0
|
||||
@@ -14,18 +14,24 @@ def check(status):
|
||||
if status != 0: raise RuntimeError(f"OpenCL Error {status}")
|
||||
def checked(ret, status):
  """Run check() on `status.value`, then hand back `ret` unchanged."""
  check(status.value)
  return ret
|
||||
|
||||
def compile_cl(device:CLDevice, prg:str) -> bytes:
  """Build OpenCL source `prg` for `device` and return the program binary.

  Raises RuntimeError containing the build log when clBuildProgram reports a non-zero status.
  """
  src_bytes = prg.encode()
  create_status = ctypes.c_int32()
  program = checked(cl.clCreateProgramWithSource(device.context, 1, to_char_p_p([src_bytes]), ctypes.byref(ctypes.c_size_t(len(src_bytes))),
                                                 ctypes.byref(create_status)), create_status)
  build_status = cl.clBuildProgram(program, 1, ctypes.byref(device.device_id), None, cl.clBuildProgram.argtypes[4](), None)
  if build_status != 0:
    # first call only asks for the log length, second call fills a buffer of that length
    log_size = ctypes.c_size_t()
    cl.clGetProgramBuildInfo(program, device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size))
    log_buf = ctypes.create_string_buffer(log_size.value)
    cl.clGetProgramBuildInfo(program, device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, log_buf, None)
    raise RuntimeError(f"OpenCL Compile Error\n\n{ctypes.string_at(log_buf, size=log_size.value).decode()}")

  def query_binary_size(sizes):
    return check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(sizes), ctypes.byref(sizes), None))

  def query_binary(buf):
    # CL_PROGRAM_BINARIES is given a one-element array holding the address of the destination buffer
    dest_ptrs = (ctypes.c_void_p * 1)(ctypes.addressof(buf))
    return check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), ctypes.byref(dest_ptrs), None))

  binary_sizes = init_c_var((ctypes.c_size_t * 1)(), query_binary_size)
  binary = init_c_var(ctypes.create_string_buffer(binary_sizes[0]), query_binary)
  check(cl.clReleaseProgram(program))
  return bytes(binary)
|
||||
class CLCompiler(Compiler):
  """Compiler backend for OpenCL: renders with OpenCLRenderer and builds against one CLDevice."""
  linearizer_opts = LinearizerOptions("GPU")

  def __init__(self, device:CLDevice, compile_key:str):
    self.device = device
    # NOTE(review): the CLDevice call site visible in this diff already passes a string starting with
    # "compile_cl_", so the key handed to the base class carries the prefix twice -- confirm whether intended
    key = f"compile_cl_{compile_key}"
    super().__init__(key)

  def render(self, name:str, uops) -> str:
    return OpenCLRenderer(name, uops)

  def compile(self, src:str) -> bytes:
    """Build OpenCL source `src` for `self.device` and return the program binary.

    Raises RuntimeError containing the build log when clBuildProgram reports a non-zero status.
    """
    src_bytes = src.encode()
    create_status = ctypes.c_int32()
    program = checked(cl.clCreateProgramWithSource(self.device.context, 1, to_char_p_p([src_bytes]),
                                                   ctypes.byref(ctypes.c_size_t(len(src_bytes))), ctypes.byref(create_status)), create_status)
    build_status = cl.clBuildProgram(program, 1, ctypes.byref(self.device.device_id), None, cl.clBuildProgram.argtypes[4](), None)
    if build_status != 0:
      # first call only asks for the log length, second call fills a buffer of that length
      log_size = ctypes.c_size_t()
      cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size))
      log_buf = ctypes.create_string_buffer(log_size.value)
      cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, log_buf, None)
      raise RuntimeError(f"OpenCL Compile Error\n\n{ctypes.string_at(log_buf, size=log_size.value).decode()}")

    def query_binary_size(sizes):
      return check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(sizes), ctypes.byref(sizes), None))

    def query_binary(buf):
      # CL_PROGRAM_BINARIES is given a one-element array holding the address of the destination buffer
      dest_ptrs = (ctypes.c_void_p * 1)(ctypes.addressof(buf))
      return check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), ctypes.byref(dest_ptrs), None))

    binary_sizes = init_c_var((ctypes.c_size_t * 1)(), query_binary_size)
    binary = init_c_var(ctypes.create_string_buffer(binary_sizes[0]), query_binary)
    check(cl.clReleaseProgram(program))
    return bytes(binary)
|
||||
|
||||
class CLProgram:
|
||||
def __init__(self, device:CLDevice, name:str, lib:bytes):
|
||||
@@ -96,8 +102,7 @@ class CLDevice(Compiled):
|
||||
self.pending_copyin: List[memoryview] = []
|
||||
|
||||
compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
|
||||
super().__init__(device, CLAllocator(self), LinearizerOptions("GPU"), OpenCLRenderer,
|
||||
functools.partial(compile_cl, self), f"compile_cl_{compile_key}", functools.partial(CLProgram, self))
|
||||
super().__init__(device, CLAllocator(self), CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
|
||||
def synchronize(self):
|
||||
check(cl.clFinish(self.queue))
|
||||
self.pending_copyin.clear()
|
||||
|
||||
@@ -4,7 +4,7 @@ from typing import Tuple, TypeVar, List, Any, cast, Set
|
||||
import tinygrad.runtime.autogen.hip as hip
|
||||
from tinygrad.helpers import DEBUG, getenv, init_c_var
|
||||
from tinygrad.helpers import from_mv, round_up, to_mv, colored, init_c_struct_t
|
||||
from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats
|
||||
from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats, Compiler
|
||||
from tinygrad.renderer.cstyle import HIPRenderer
|
||||
from tinygrad.codegen.kernel import LinearizerOptions
|
||||
from tinygrad.runtime.compiler.hip_comgr import compile_hip
|
||||
@@ -12,6 +12,14 @@ from tinygrad.runtime.compiler.hip_comgr import compile_hip
|
||||
# The default HIP stream is used for everything.
|
||||
MOCKHIP = getenv("MOCKHIP") # for CI. don't run kernels, only check if they compile
|
||||
|
||||
class HIPCompiler(Compiler):
  """Compiler backend for HIP: renders with HIPRenderer and compiles with compile_hip for one architecture."""
  linearizer_opts = LinearizerOptions("HIP")

  def __init__(self, arch:str):
    # the architecture string is stored for compile() and also becomes part of the key given to the base class
    self.arch = arch
    key = f"compile_hip_{self.arch}"
    super().__init__(key)

  def render(self, name:str, uops) -> str:
    return HIPRenderer(name, uops)

  def compile(self, src:str) -> bytes:
    return compile_hip(src, self.arch)
|
||||
|
||||
hip_current_device = None
|
||||
def hip_set_device(d:int):
|
||||
global hip_current_device
|
||||
@@ -132,8 +140,8 @@ class HIPDevice(Compiled):
|
||||
self.peers: Set[int] = set()
|
||||
|
||||
from tinygrad.runtime.graph.hip import HIPGraph
|
||||
super().__init__(device, MallocAllocator if MOCKHIP else HIPAllocator(self), LinearizerOptions("HIP"), HIPRenderer,
|
||||
functools.partial(compile_hip,arch=self.arch), f"compile_hip_{self.arch}", functools.partial(HIPProgram, self.device), HIPGraph)
|
||||
super().__init__(device, MallocAllocator if MOCKHIP else HIPAllocator(self), HIPCompiler(self.arch),
|
||||
functools.partial(HIPProgram, self.device), HIPGraph)
|
||||
def synchronize(self):
|
||||
hip_set_device(self.device)
|
||||
check(hip.hipDeviceSynchronize())
|
||||
|
||||
@@ -1,18 +1,24 @@
|
||||
from __future__ import annotations
|
||||
import ctypes, functools
|
||||
from typing import Tuple
|
||||
from tinygrad.device import Compiled, MallocAllocator
|
||||
from tinygrad.device import Compiled, MallocAllocator, Compiler
|
||||
from tinygrad.helpers import DEBUG, cpu_time_execution
|
||||
from tinygrad.codegen.kernel import LinearizerOptions
|
||||
from tinygrad.renderer.llvmir import uops_to_llvm_ir
|
||||
import llvmlite.binding as llvm
|
||||
|
||||
def compile_llvm(device, prg) -> bytes:
  """Parse LLVM IR text `prg`, verify it, run the device's optimizer on it and emit object code."""
  module = llvm.parse_assembly(prg)
  module.verify()
  device.optimizer.run(module)
  # at DEBUG >= 5 the generated assembly is printed before the object code is produced
  if DEBUG >= 5:
    print(device.target_machine.emit_assembly(module))
  return device.target_machine.emit_object(module)
|
||||
class LLVMCompiler(Compiler):
  """Compiler backend for LLVM: renders uops to LLVM IR and emits object code via the device's target machine."""
  linearizer_opts = LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False)

  def __init__(self, device:LLVMDevice):
    # the device supplies the optimizer and target machine used by compile()
    self.device = device
    super().__init__("compile_llvm")

  def render(self, name:str, uops) -> str:
    return uops_to_llvm_ir(name, uops)

  def compile(self, src:str) -> bytes:
    """Parse LLVM IR text `src`, verify it, optimize it and return the emitted object code."""
    module = llvm.parse_assembly(src)
    module.verify()
    self.device.optimizer.run(module)
    # at DEBUG >= 5 the generated assembly is printed before the object code is produced
    if DEBUG >= 5:
      print(self.device.target_machine.emit_assembly(module))
    return self.device.target_machine.emit_object(module)
|
||||
|
||||
class LLVMProgram:
|
||||
def __init__(self, device:LLVMDevice, name:str, lib:bytes):
|
||||
@@ -38,5 +44,4 @@ class LLVMDevice(Compiled):
|
||||
backing_mod = llvm.parse_assembly(str())
|
||||
backing_mod.triple = llvm.get_process_triple()
|
||||
self.engine: llvm.executionengine.ExecutionEngine = llvm.create_mcjit_compiler(backing_mod, self.target_machine)
|
||||
super().__init__(device, MallocAllocator, LinearizerOptions("LLVM", supports_float4=False, has_local=False, has_shared=False),
|
||||
uops_to_llvm_ir, functools.partial(compile_llvm, self), "compile_llvm", functools.partial(LLVMProgram, self))
|
||||
super().__init__(device, MallocAllocator, LLVMCompiler(self), functools.partial(LLVMProgram, self))
|
||||
|
||||
@@ -4,18 +4,24 @@ import Metal, libdispatch
|
||||
from typing import List, Any, Tuple, Optional
|
||||
from tinygrad.codegen.kernel import LinearizerOptions
|
||||
from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
|
||||
from tinygrad.device import Compiled, LRUAllocator
|
||||
from tinygrad.device import Compiled, LRUAllocator, Compiler
|
||||
from tinygrad.renderer.cstyle import MetalRenderer
|
||||
|
||||
def compile_metal_xcode(prg:str) -> bytes:
  """Compile Metal source with the xcrun command-line tools: `metal` first, then `metallib` on its output."""
  xcrun_prefix = ['xcrun', '-sdk', 'macosx']
  # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
  air = subprocess.check_output(xcrun_prefix + ['metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8'))
  return subprocess.check_output(xcrun_prefix + ['metallib', '-', '-o', '-'], input=air)
|
||||
|
||||
def compile_metal(device, prg:str) -> bytes:
  """Compile Metal source `prg` in-process through `device` and return the library contents as bytes."""
  compile_opts = Metal.MTLCompileOptions.new()
  # unwrap2 receives the raw result of the library-creation call made with a None error argument
  library = unwrap2(device.newLibraryWithSource_options_error_(prg, compile_opts, None))
  contents = library.libraryDataContents()
  return contents.bytes().tobytes()
|
||||
class MetalCompiler(Compiler):
  """Compiler backend for Metal: compiles in-process when given a device, otherwise through the xcrun tools."""
  linearizer_opts = LinearizerOptions("METAL")

  def __init__(self, device:Optional[MetalDevice]):
    # device=None selects the xcrun command-line path in compile()
    self.device = device
    super().__init__("compile_metal")

  def render(self, name:str, uops) -> str:
    return MetalRenderer(name, uops)

  def compile(self, src:str) -> bytes:
    """Compile Metal source `src` and return the resulting library bytes."""
    if self.device is not None:
      compile_opts = Metal.MTLCompileOptions.new()
      library = unwrap2(self.device.device.newLibraryWithSource_options_error_(src, compile_opts, None))
      return library.libraryDataContents().bytes().tobytes()
    xcrun_prefix = ['xcrun', '-sdk', 'macosx']
    # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
    air = subprocess.check_output(xcrun_prefix + ['metal', '-x', 'metal', '-c', '-', '-o', '-'], input=src.encode('utf-8'))
    return subprocess.check_output(xcrun_prefix + ['metallib', '-', '-o', '-'], input=air)
|
||||
|
||||
class MetalProgram:
|
||||
def __init__(self, device:MetalDevice, name:str, lib:bytes):
|
||||
@@ -78,8 +84,7 @@ class MetalDevice(Compiled):
|
||||
self.mtl_buffers_in_flight: List[Any] = []
|
||||
self.mv_in_metal: List[memoryview] = []
|
||||
from tinygrad.runtime.graph.metal import MetalGraph
|
||||
super().__init__(device, MetalAllocator(self), LinearizerOptions("METAL"), MetalRenderer,
|
||||
compile_metal_xcode if getenv("METAL_XCODE") else functools.partial(compile_metal, self.device), "compile_metal",
|
||||
super().__init__(device, MetalAllocator(self), MetalCompiler(None if getenv("METAL_XCODE") else self),
|
||||
functools.partial(MetalProgram, self), functools.partial(MetalGraph, self))
|
||||
def synchronize(self):
|
||||
for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
|
||||
|
||||
Reference in New Issue
Block a user