diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 537cc04e6e..9e61940345 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -622,10 +622,7 @@ jobs: wintests: strategy: fail-fast: false - matrix: - backend: [llvm] - - name: Tests on Windows (${{ matrix.backend }}) + name: Tests on Windows (llvm+clang) runs-on: windows-latest timeout-minutes: 45 steps: @@ -642,20 +639,34 @@ jobs: with: path: ${{ env.Python3_ROOT_DIR }}\Lib\site-packages key: windows-${{ matrix.backend }}-packages-${{ hashFiles('**/setup.py') }} + - name: Set env + shell: bash + run: | + if [ "${{ matrix.backend }}" = "clang" ]; then + echo "CLANG=1" >> $GITHUB_ENV + elif [ "${{ matrix.backend }}" = "llvm" ]; then + echo "LLVM=1" >> $GITHUB_ENV + fi - name: Install dependencies run: pip install --user -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu - - name: Check Device.DEFAULT and print some source - env: - DEBUG: 5 - LLVM: 1 - PYTHONPATH: ${{ github.workspace }} + - name: Check Device.DEFAULT and print some source (llvm) + shell: bash run: | - python3 test/test_ops.py TestOps.test_add - - name: Run pytest - env: - DEBUG: 5 - LLVM: 1 - run: python -m pytest -n=auto test/test_tiny.py --durations=20 + PYTHONPATH=${{ github.workspace }} LLVM=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'LLVM', Device.DEFAULT" + DEBUG=5 PYTHONPATH=${{ github.workspace }} LLVM=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add + - name: Check Device.DEFAULT and print some source (clang) + shell: bash + run: | + PYTHONPATH=${{ github.workspace }} CLANG=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'CLANG', Device.DEFAULT" + DEBUG=5 PYTHONPATH=${{ github.workspace }} CLANG=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add + - name: Run pytest (llvm) + shell: bash + run: | + DEBUG=5 LLVM=1 python -m pytest -n=auto test/test_tiny.py --durations=20 + - name: Run pytest (clang) + shell: bash + run: | + DEBUG=5 CLANG=1 python -m pytest -n=auto test/test_tiny.py --durations=20 #testunicorn: # name: ARM64 unicorn Test diff --git a/tinygrad/device.py b/tinygrad/device.py index 04182a3c33..e7b502432b 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -222,23 +222,31 @@ MAP_JIT = 0x0800 class CPUProgram: helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32' if sys.platform == "win32" else 'gcc_s')) def __init__(self, name:str, lib:bytes): - assert sys.platform != "win32", "clang is not supported for windows yet" - from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE - # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/ - # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np) - self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC) + if sys.platform == "win32": + PAGE_EXECUTE_READWRITE = 0x40 + MEM_COMMIT = 0x1000 + MEM_RESERVE = 0x2000 + ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_uint64 + ptr = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_int(0), ctypes.c_int(len(lib)), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE) + ctypes.memmove(ptr, lib, len(lib)) + self.fxn = ctypes.CFUNCTYPE(None)(ptr) + else: + from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE + # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/ + # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np) + self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC) - if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False) - self.mem.write(lib) - if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True) + if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False) + self.mem.write(lib) + if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True) - # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang. - # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately - # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux - # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5 - CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib))) + # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang. + # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately + # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux + # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5 + CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib))) - self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem)) + self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem)) def __call__(self, *bufs, vals=(), wait=False): args = list(bufs) + list(vals) diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 5895b0cc68..9c055acd87 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -272,7 +272,7 @@ def cpu_objdump(lib, objdump_tool='objdump'): def capstone_flatdump(lib: bytes): import capstone match platform.machine(): - case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64) + case 'x86_64' | 'AMD64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64) case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM) case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}") for instr in cs.disasm(lib, 0): diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index 4f2e604f05..6b9a9f266d 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -1,5 +1,5 @@ from typing import Optional, Union, Literal, Callable, cast -import os, math +import os, math, sys from collections import defaultdict, Counter from tinygrad.ops import GroupOp, Ops, UOp, PatternMatcher, UPat from tinygrad.helpers import strip_parens, getenv, prod, dedup, AMX @@ -178,7 +178,8 @@ class ClangRenderer(CStyleLanguage): tensor_cores = [TensorCore(dims=(sz,sz,1), threads=1, elements_per_thread=(sz,sz,sz*sz), dtype_in=dt, dtype_out=dt, swizzle=(None, ((),(4,5,6,7,0,1,2,3))), opts=("u0","u0","u0","u0","u1","u1","u1","u1")) for dt,sz in [(dt, 64 // dt.itemsize) for dt in [dtypes.float]]] - + if sys.platform == 'win32': + kernel_prefix = "__attribute__((ms_abi)) " def render_vector_prefix(self, dt:DType) -> str: return f"typedef {self.render_dtype(dt.scalar())} {self.render_dtype(dt)} __attribute__((aligned({(sz:=dt.itemsize)}),vector_size({sz})));" diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py index 689c04c460..2baf572382 100644 --- a/tinygrad/runtime/ops_clang.py +++ b/tinygrad/runtime/ops_clang.py @@ -1,4 +1,4 @@ -import platform, tempfile, pathlib, subprocess +import platform, tempfile, pathlib, subprocess, sys from tinygrad.helpers import cpu_objdump, capstone_flatdump from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram from tinygrad.runtime.support.elf import jit_loader @@ -26,7 +26,8 @@ class ClangJITCompiler(Compiler): def compile(self, src:str) -> bytes: # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it - args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib'] + target = 'x86_64' if sys.platform == 'win32' else platform.machine() + args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib'] arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else [] obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8')) return jit_loader(obj)