From f8e485ee9e6fc42d375c8d3adb6bbde55e1ab9d2 Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Tue, 17 Feb 2026 19:07:05 +0800 Subject: [PATCH] nvcc/nvdisasm macos shim (#14822) * move to backend * and arch * setup_nvcc_osx * blackwell * min test * now getting dumb assert is_ptx * support cubin. * work * remove that * simpler --- extra/setup_nvcc_osx.sh | 24 +++++++++++++++++++++++ tinygrad/helpers.py | 2 +- tinygrad/runtime/ops_nv.py | 5 +++-- tinygrad/runtime/support/compiler_cuda.py | 12 ++++++------ 4 files changed, 34 insertions(+), 9 deletions(-) create mode 100755 extra/setup_nvcc_osx.sh diff --git a/extra/setup_nvcc_osx.sh b/extra/setup_nvcc_osx.sh new file mode 100755 index 0000000000..d95348a342 --- /dev/null +++ b/extra/setup_nvcc_osx.sh @@ -0,0 +1,24 @@ +#!/bin/sh +install_loc="$HOME/.local/bin" +docker build --platform=linux/amd64 -t cuda-nvcc:12.8 - <<'EOF' +FROM ubuntu:22.04 +RUN apt-get update && apt-get install -y --no-install-recommends wget ca-certificates && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i cuda-keyring_1.1-1_all.deb && \ + apt-get update && apt-get install -y --no-install-recommends cuda-nvcc-12-8 cuda-nvdisasm-12-8 cuda-cuobjdump-12-8 && rm -rf /var/lib/apt/lists/* +ENV PATH=/usr/local/cuda/bin:$PATH +EOF + +mkdir -p "$install_loc" +tee "$install_loc/nvccshim" >/dev/null <<'EOF' +#!/bin/bash +set -eu +# assume the final arg is the input path +# mount it so that container can read it +dir=$(dirname "${@: -1}") +exec docker run --rm --platform=linux/amd64 -v "$dir":"$dir" cuda-nvcc:12.8 "$(basename "$0")" "$@" +EOF +chmod +x "$install_loc/nvccshim" +for t in nvcc nvdisasm; do + ln -sf "$install_loc/nvccshim" "$install_loc/$t" +done diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index b22c8169d9..e7d5673613 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -188,7 +188,7 @@ CAPTURE_PROCESS_REPLAY = 
ContextVar("CAPTURE_PROCESS_REPLAY", 0) CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else (os.cpu_count() or 1))) # Compilers CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0) -NV_CC, NV_PTX, NV_NAK = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0) +NV_CC, NV_PTX, NV_NAK, NV_NVCC = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("NV_NVCC", 0) CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0) NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT = ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0), ContextVar("NULL_ALLOW_COPYOUT", 0) AMD_CC, AMD_LLVM, AMD_HIPCC = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0) diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 13baff4de3..2ca17163f5 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -7,7 +7,7 @@ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, H from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile from tinygrad.uop.ops import sint from tinygrad.device import Compiled, BufferSpec, CompilerSet -from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, PROFILE +from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, NV_NVCC, PROFILE from tinygrad.helpers import ContextVar, VIZ, ProfileEvent from tinygrad.renderer.ptx import PTXRenderer from tinygrad.renderer.cstyle import CUDARenderer @@ -621,7 +621,8 @@ class NVDevice(HCQCompiled[NVSignal]): compilers = CompilerSet(ctrl_var=NV_CC, cset=[(functools.partial(CUDARenderer, self.arch), None), (functools.partial(PTXRenderer, 
self.arch, device="NV"), NV_PTX), - (functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK)]) + (functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK), + (functools.partial(CUDARenderer, self.arch, use_nvcc=True), NV_NVCC)]) super().__init__(device, NVAllocator(self), compilers, functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue) self.pma_enabled = PMA.value > 0 and PROFILE >= 1 diff --git a/tinygrad/runtime/support/compiler_cuda.py b/tinygrad/runtime/support/compiler_cuda.py index 8c4aa4234a..8d545bbb3b 100644 --- a/tinygrad/runtime/support/compiler_cuda.py +++ b/tinygrad/runtime/support/compiler_cuda.py @@ -59,16 +59,16 @@ class NVRTCCompiler(Compiler): class NVCCCompiler(Compiler): def __init__(self, arch:str, ptx:bool=True, cache_key:str="cuda", extra_options:list[str]=[]): - assert ptx, "NVCCCompiler cubin support unimplemented" - self.arch, self.extra_options = arch, extra_options - super().__init__(f"compile_nvcc_{cache_key}_{self.arch}_{hashlib.sha256(' '.join(extra_options).encode()).hexdigest()[:8]}") + self.ptx, self.arch, self.extra_options = ptx, arch, extra_options + super().__init__(f"compile_nvcc_{cache_key+('ptx' if ptx else '')}_{self.arch}_{hashlib.sha256(' '.join(extra_options).encode()).hexdigest()[:8]}") def compile(self, src:str) -> bytes: - with tempfile.NamedTemporaryFile(suffix=".cu") as srcf, tempfile.NamedTemporaryFile(suffix=".ptx") as libf: + mode, suffix = ("-ptx", ".ptx") if self.ptx else ("-cubin", ".cubin") + with tempfile.NamedTemporaryFile(suffix=".cu") as srcf, tempfile.NamedTemporaryFile(suffix=suffix) as libf: srcf.write(src.encode()) srcf.flush() - system(f"nvcc -arch={self.arch} -ptx -o {libf.name} {srcf.name}" + ' '.join(self.extra_options)) + system(f"nvcc -arch={self.arch} {mode} -o {libf.name} {srcf.name} {' '.join(self.extra_options)}") return libf.read() - def disassemble(self, lib:bytes): cuda_disassemble(lib, self.arch, ptx=True) + def disassemble(self, 
lib:bytes): cuda_disassemble(lib, self.arch, ptx=self.ptx) class PTXCompiler(Compiler): def __init__(self, arch:str, cache_key="ptx"):