diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fcddcd9f86..074c4469ea 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,6 +27,8 @@ jobs: python-version: 3.12 - name: Install docs dependencies (no cache) run: pip install -e '.[docs]' + - name: Install capstone for CLANG disassembly + run: pip install capstone - name: Use as an external package run: | mkdir $HOME/test_external_dir diff --git a/docs/abstractions2.py b/docs/abstractions2.py index a7cf946e5a..067e0ac328 100644 --- a/docs/abstractions2.py +++ b/docs/abstractions2.py @@ -7,7 +7,7 @@ print("******** first, the runtime ***********") -from tinygrad.runtime.ops_clang import ClangProgram, ClangCompiler, MallocAllocator +from tinygrad.runtime.ops_clang import ClangJITCompiler, MallocAllocator, CPUProgram # allocate some buffers out = MallocAllocator.alloc(4) @@ -19,10 +19,10 @@ MallocAllocator._copyin(a, memoryview(bytearray([2,0,0,0]))) MallocAllocator._copyin(b, memoryview(bytearray([3,0,0,0]))) # compile a program to a binary -lib = ClangCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }") +lib = ClangJITCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }") -# create a runtime for the program (ctypes.CDLL) -fxn = ClangProgram("add", lib) +# create a runtime for the program +fxn = CPUProgram("add", lib) # run the program fxn(out, a, b) @@ -65,7 +65,7 @@ kernel = get_kernel(Device[DEVICE].renderer, s).linearize() # compile a program (and print the source) fxn = CompiledRunner(kernel.to_program()) print(fxn.p.src) -# NOTE: fxn.clprg is the ClangProgram +# NOTE: fxn.clprg is the CPUProgram # run the program fxn.exec([out, a, b]) diff --git a/docs/developer/runtime.md b/docs/developer/runtime.md index 2367d47b1e..0c8a9d5ed9 100644 --- a/docs/developer/runtime.md +++ b/docs/developer/runtime.md @@ -36,9 +36,9 @@ The `Allocator` class is responsible for managing memory on the device. 
There is ### Program -The `Program` class is created for each loaded program. It is responsible for compiling and executing the program on the device. As an example, here is a `ClangProgram` implementation which loads program and runs it. +The `Program` class is created for each loaded program. It is responsible for executing the program on the device. As an example, here is a `CPUProgram` implementation which loads program and runs it. -::: tinygrad.runtime.ops_clang.ClangProgram +::: tinygrad.runtime.ops_clang.CPUProgram options: members: true diff --git a/setup.py b/setup.py index d4048c756b..0e779b125a 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,8 @@ setup(name='tinygrad', "hypothesis", "nibabel", "bottle", - "ggml-python" + "ggml-python", + "capstone" ], 'webgpu': ["wgpu"], 'docs': [ diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 58dc61756a..0163c2b51a 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -45,6 +45,8 @@ def ceildiv(num, amt): return int(ret) if isinstance((ret:=-(num//-amt)), float) def round_up(num:int, amt:int) -> int: return (num+amt-1)//amt * amt def data64(data:Any) -> tuple[Any, Any]: return (data >> 32, data & 0xFFFFFFFF) # Any is sint def data64_le(data:Any) -> tuple[Any, Any]: return (data & 0xFFFFFFFF, data >> 32) # Any is sint +def getbits(value: int, start: int, end: int): return (value >> start) & ((1 << end-start+1) - 1) +def i2u(bits: int, value: int): return value if value >= 0 else (1< dict[T,U]: kvs = set([(k,v) for d in ds for k,v in d.items()]) assert len(kvs) == len(set(kv[0] for kv in kvs)), f"cannot merge, {kvs} contains different values for the same key" @@ -265,7 +267,7 @@ def cpu_objdump(lib, objdump_tool='objdump'): def from_mv(mv:memoryview, to_type=ctypes.c_char): return ctypes.cast(ctypes.addressof(to_type.from_buffer(mv)), ctypes.POINTER(to_type * len(mv))).contents def to_mv(ptr:int, sz:int) -> memoryview: return memoryview(ctypes.cast(ptr, ctypes.POINTER(ctypes.c_uint8 * 
sz)).contents).cast("B") -def mv_address(mv:memoryview): return ctypes.addressof(ctypes.c_char.from_buffer(mv)) +def mv_address(mv): return ctypes.addressof(ctypes.c_char.from_buffer(mv)) def to_char_p_p(options: list[bytes], to_type=ctypes.c_char): return (ctypes.POINTER(to_type) * len(options))(*[ctypes.cast(ctypes.create_string_buffer(o), ctypes.POINTER(to_type)) for o in options]) @functools.lru_cache(maxsize=None) diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py index aef1e195ab..90fa45cf4a 100644 --- a/tinygrad/runtime/ops_clang.py +++ b/tinygrad/runtime/ops_clang.py @@ -1,32 +1,66 @@ -import ctypes, subprocess, pathlib, tempfile +import ctypes, ctypes.util, struct, platform, subprocess +from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE +from tinygrad.helpers import OSX, mv_address, cpu_time_execution from tinygrad.device import Compiled, Compiler, MallocAllocator -from tinygrad.helpers import cpu_time_execution, cpu_objdump +from tinygrad.runtime.support.elf import elf_loader, relocate from tinygrad.renderer.cstyle import ClangRenderer -class ClangCompiler(Compiler): - def __init__(self, cachekey="compile_clang", args:list[str]|None=None, objdump_tool='objdump'): - self.args = ['-march=native'] if args is None else args - self.objdump_tool = objdump_tool - super().__init__(cachekey) +# NOTE: MAP_JIT is added to mmap module in python 3.13 +MAP_JIT = 0x0800 + +class ClangJITCompiler(Compiler): + def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey) def compile(self, src:str) -> bytes: - # TODO: remove file write. 
sadly clang doesn't like the use of /dev/stdout here - with tempfile.NamedTemporaryFile(delete=True) as output_file: - subprocess.check_output(['clang', '-shared', *self.args, '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib', - '-', '-o', str(output_file.name)], input=src.encode('utf-8')) - return pathlib.Path(output_file.name).read_bytes() + # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call + # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it + args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib'] + arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else [] + obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8')) + image, _, relocs = elf_loader(obj) + # This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved. 
def relocate(instr: int, ploc: int, tgt: int, r_type: int):
  """Compute the patched value for a single relocation.

  instr:  value currently stored at the relocation site; the AArch64 cases OR their immediate into it
  ploc:   presumably the address of the relocation site (only read by the pc/page-relative types)
  tgt:    presumably the address the relocation points at
  r_type: relocation type, compared against the libc.R_* constants

  Returns the value to use in place of `instr`.
  Raises NotImplementedError when `r_type` is none of the types handled here.
  """
  # https://refspecs.linuxfoundation.org/elf/x86_64-abi-0.95.pdf
  if r_type == libc.R_X86_64_PC32:
    # result is the tgt-ploc distance passed through i2u(32, ...); instr is not consulted
    return i2u(32, tgt - ploc)

  # https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst for definitions of relocations
  # https://www.scs.stanford.edu/~zyedidia/arm64/index.html for instruction encodings
  if r_type == libc.R_AARCH64_ADR_PREL_PG_HI21:
    # distance between tgt and ploc after clearing the low 12 bits of each
    page_delta = i2u(33, (tgt & ~0xFFF) - (ploc & ~0xFFF))
    low_part = getbits(page_delta, 12, 13) << 29   # delta bits 12..13 -> instruction bits 29..30
    high_part = getbits(page_delta, 14, 32) << 5   # delta bits 14..32 -> instruction bits 5..23
    return instr | low_part | high_part

  # every remaining type takes bits first_bit..11 of tgt and places them starting at instruction bit 10;
  # the pairs are checked in the same order as the relocation types are listed
  lo12_first_bit = (
    (libc.R_AARCH64_ADD_ABS_LO12_NC, 0),
    (libc.R_AARCH64_LDST16_ABS_LO12_NC, 1),
    (libc.R_AARCH64_LDST32_ABS_LO12_NC, 2),
    (libc.R_AARCH64_LDST64_ABS_LO12_NC, 3),
    (libc.R_AARCH64_LDST128_ABS_LO12_NC, 4),
  )
  for known_type, first_bit in lo12_first_bit:
    if r_type == known_type:
      return instr | (getbits(tgt, first_bit, 11) << 10)

  raise NotImplementedError(f"Encountered unknown relocation type {r_type:#x}")