nvcc/nvdisasm macos shim (#14822)

* move to backend

* and arch

* setup_nvcc_osx

* blackwell

* min test

* now getting dumb assert is_ptx

* support cubin.

* work

* remove that

* simpler
This commit is contained in:
qazal
2026-02-17 19:07:05 +08:00
committed by GitHub
parent d24781f45f
commit f8e485ee9e
4 changed files with 34 additions and 9 deletions

24
extra/setup_nvcc_osx.sh Executable file
View File

@@ -0,0 +1,24 @@
#!/bin/sh
# Install Docker-backed `nvcc` and `nvdisasm` shims into ~/.local/bin.
#
# 1. Builds a linux/amd64 image (cuda-nvcc:12.8) with the CUDA 12.8 nvcc, nvdisasm and cuobjdump packages.
# 2. Writes a wrapper script, nvccshim, that forwards its arguments to the same-named tool inside that image.
# 3. Symlinks nvcc and nvdisasm to the wrapper; the wrapper picks the tool from the name it was invoked as.

# stop at the first failure so a broken image build never leaves a shim pointing at a missing image
set -eu

install_loc="$HOME/.local/bin"
docker build --platform=linux/amd64 -t cuda-nvcc:12.8 - <<'EOF'
FROM ubuntu:22.04
RUN apt-get update && apt-get install -y --no-install-recommends wget ca-certificates && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
dpkg -i cuda-keyring_1.1-1_all.deb && \
apt-get update && apt-get install -y --no-install-recommends cuda-nvcc-12-8 cuda-nvdisasm-12-8 cuda-cuobjdump-12-8 && rm -rf /var/lib/apt/lists/*
ENV PATH=/usr/local/cuda/bin:$PATH
EOF
mkdir -p "$install_loc"
# the heredoc delimiter is quoted, so the wrapper below is written out verbatim (no expansion happens here)
tee "$install_loc/nvccshim" >/dev/null <<'EOF'
#!/bin/sh
set -eu
# assume the final arg is the input path
# mount it so that container can read it
# (POSIX sh has no ${@: -1}; walking the arguments leaves the last one in $last)
last=""
for last in "$@"; do :; done
dir=$(dirname "$last")
exec docker run --rm --platform=linux/amd64 -v "$dir":"$dir" cuda-nvcc:12.8 "$(basename "$0")" "$@"
EOF
chmod +x "$install_loc/nvccshim"
for t in nvcc nvdisasm; do
ln -sf "$install_loc/nvccshim" "$install_loc/$t"
done

View File

@@ -188,7 +188,7 @@ CAPTURE_PROCESS_REPLAY = ContextVar("CAPTURE_PROCESS_REPLAY", 0)
# CPU_COUNT's second argument is the size of the process's scheduler affinity set when os.sched_getaffinity exists,
# otherwise os.cpu_count() (or 1 if that returns None); max() clamps the result to at least 1.
CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else (os.cpu_count() or 1)))
# Compilers
# One group of variables per backend: a *_CC variable created with "" plus one variable per alternative compiler
# created with 0. NOTE(review): the second ContextVar argument is presumably the default value - confirm against ContextVar.
CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0)
NV_CC, NV_PTX, NV_NAK = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0)
NV_CC, NV_PTX, NV_NAK, NV_NVCC = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("NV_NVCC", 0)
CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0)
NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT = ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0), ContextVar("NULL_ALLOW_COPYOUT", 0)
AMD_CC, AMD_LLVM, AMD_HIPCC = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0)

View File

@@ -7,7 +7,7 @@ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, H
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
from tinygrad.uop.ops import sint
from tinygrad.device import Compiled, BufferSpec, CompilerSet
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, PROFILE
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, NV_NVCC, PROFILE
from tinygrad.helpers import ContextVar, VIZ, ProfileEvent
from tinygrad.renderer.ptx import PTXRenderer
from tinygrad.renderer.cstyle import CUDARenderer
@@ -621,7 +621,8 @@ class NVDevice(HCQCompiled[NVSignal]):
compilers = CompilerSet(ctrl_var=NV_CC, cset=[(functools.partial(CUDARenderer, self.arch), None),
(functools.partial(PTXRenderer, self.arch, device="NV"), NV_PTX),
(functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK)])
(functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK),
(functools.partial(CUDARenderer, self.arch, use_nvcc=True), NV_NVCC)])
super().__init__(device, NVAllocator(self), compilers, functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)
self.pma_enabled = PMA.value > 0 and PROFILE >= 1

View File

@@ -59,16 +59,16 @@ class NVRTCCompiler(Compiler):
class NVCCCompiler(Compiler):
  """Compile CUDA source by running the `nvcc` command through the `system` helper.

  With `ptx=True` (the default) nvcc is invoked with `-ptx`; with `ptx=False` it is invoked with `-cubin`.
  In both cases the bytes nvcc wrote to its output file are returned by `compile`.
  """
  def __init__(self, arch:str, ptx:bool=True, cache_key:str="cuda", extra_options:list[str]=[]):
    """Store the compile settings and derive the cache key passed to the base `Compiler`.

    arch: value passed to nvcc as `-arch=`.
    ptx: emit PTX when true, a cubin when false.
    cache_key: prefix that namespaces the cache key.
    extra_options: additional nvcc command-line options, appended after the source path.
    """
    # copy the list so the shared default (or the caller's list) is never aliased by this instance
    self.ptx, self.arch, self.extra_options = ptx, arch, list(extra_options)
    opts_hash = hashlib.sha256(' '.join(self.extra_options).encode()).hexdigest()[:8]
    # the parentheses matter: `cache_key+'ptx' if ptx else ''` evaluates to '' when ptx is false, which drops
    # cache_key from the cubin key entirely. The PTX key is unchanged by this.
    super().__init__(f"compile_nvcc_{cache_key + ('ptx' if ptx else '')}_{self.arch}_{opts_hash}")
  def compile(self, src:str) -> bytes:
    """Write `src` to a temporary .cu file, run nvcc on it, and return the contents of the output file."""
    mode, suffix = ("-ptx", ".ptx") if self.ptx else ("-cubin", ".cubin")
    with tempfile.NamedTemporaryFile(suffix=".cu") as srcf, tempfile.NamedTemporaryFile(suffix=suffix) as libf:
      srcf.write(src.encode())
      srcf.flush()  # nvcc opens the file by name, so the source must be on disk before it runs
      # join with spaces so the first extra option is not glued onto the source path;
      # with no extra options the command is identical to the plain nvcc invocation
      system(' '.join([f"nvcc -arch={self.arch} {mode} -o {libf.name} {srcf.name}", *self.extra_options]))
      return libf.read()
  def disassemble(self, lib:bytes): cuda_disassemble(lib, self.arch, ptx=self.ptx)
class PTXCompiler(Compiler):
def __init__(self, arch:str, cache_key="ptx"):