nvcc/nvdisasm macos shim (#14822)

* move to backend * and arch * setup_nvcc_osx * blackwell * min test * now getting dumb assert is_ptx * support cubin. * work * remove that * simpler
2026-04-29 03:00:14 -04:00 · 2026-02-17 19:07:05 +08:00
parent d24781f45f
commit f8e485ee9e
4 changed files with 34 additions and 9 deletions
--- a/extra/setup_nvcc_osx.sh
+++ b/extra/setup_nvcc_osx.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+install_loc="$HOME/.local/bin"  # shim + symlinks land here; needs to be on PATH for `nvcc`/`nvdisasm` to resolve
+docker build --platform=linux/amd64 -t cuda-nvcc:12.8 - <<'EOF'
+FROM ubuntu:22.04
+RUN apt-get update && apt-get install -y --no-install-recommends wget ca-certificates && \
+  wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
+  dpkg -i cuda-keyring_1.1-1_all.deb && \
+  apt-get update && apt-get install -y --no-install-recommends cuda-nvcc-12-8 cuda-nvdisasm-12-8 cuda-cuobjdump-12-8 && rm -rf /var/lib/apt/lists/*
+ENV PATH=/usr/local/cuda/bin:$PATH
+EOF
+
+mkdir -p "$install_loc"
+tee "$install_loc/nvccshim" >/dev/null <<'EOF'
+#!/bin/sh
+set -eu
+# assume the final arg is the input path; mount its directory so that the container can read it
+for last in "$@"; do :; done  # POSIX way to grab the last arg (${@: -1} is a bashism and breaks under a strict /bin/sh)
+dir=$(dirname "${last:-$0}")  # fall back to $0 when called with no args so `set -u` does not abort
+exec docker run --rm --platform=linux/amd64 -v "$dir":"$dir" cuda-nvcc:12.8 "$(basename "$0")" "$@"
+EOF
+chmod +x "$install_loc/nvccshim"
+for t in nvcc nvdisasm; do
+  ln -sf "$install_loc/nvccshim" "$install_loc/$t"
+done
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -188,7 +188,7 @@ CAPTURE_PROCESS_REPLAY = ContextVar("CAPTURE_PROCESS_REPLAY", 0)
 CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else (os.cpu_count() or 1)))
 # Compilers
 CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0)
-NV_CC, NV_PTX, NV_NAK = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0)
+NV_CC, NV_PTX, NV_NAK, NV_NVCC = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("NV_NVCC", 0)
 CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0)
 NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT = ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0), ContextVar("NULL_ALLOW_COPYOUT", 0)
 AMD_CC, AMD_LLVM, AMD_HIPCC  = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0)
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -7,7 +7,7 @@ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, H
 from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
 from tinygrad.uop.ops import sint
 from tinygrad.device import Compiled, BufferSpec, CompilerSet
-from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, PROFILE
+from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, NV_NVCC, PROFILE
 from tinygrad.helpers import ContextVar, VIZ, ProfileEvent
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.cstyle import CUDARenderer
@@ -621,7 +621,8 @@ class NVDevice(HCQCompiled[NVSignal]):

    compilers = CompilerSet(ctrl_var=NV_CC, cset=[(functools.partial(CUDARenderer, self.arch), None),
       (functools.partial(PTXRenderer, self.arch, device="NV"), NV_PTX),
-       (functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK)])
+       (functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK),
+       (functools.partial(CUDARenderer, self.arch, use_nvcc=True), NV_NVCC)])
    super().__init__(device, NVAllocator(self), compilers, functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)

    self.pma_enabled = PMA.value > 0 and PROFILE >= 1
--- a/tinygrad/runtime/support/compiler_cuda.py
+++ b/tinygrad/runtime/support/compiler_cuda.py
@@ -59,16 +59,16 @@ class NVRTCCompiler(Compiler):

 class NVCCCompiler(Compiler):
   def __init__(self, arch:str, ptx:bool=True, cache_key:str="cuda", extra_options:list[str]=[]):
-    assert ptx, "NVCCCompiler cubin support unimplemented"
-    self.arch, self.extra_options = arch, extra_options
-    super().__init__(f"compile_nvcc_{cache_key}_{self.arch}_{hashlib.sha256(' '.join(extra_options).encode()).hexdigest()[:8]}")
+    self.ptx, self.arch, self.extra_options = ptx, arch, extra_options  # ptx=False makes nvcc emit a cubin; cache name keeps cache_key in both modes
+    super().__init__(f"compile_nvcc_{cache_key}{'ptx' if ptx else 'cubin'}_{arch}_{hashlib.sha256(' '.join(extra_options).encode()).hexdigest()[:8]}")
   def compile(self, src:str) -> bytes:
-    with tempfile.NamedTemporaryFile(suffix=".cu") as srcf, tempfile.NamedTemporaryFile(suffix=".ptx") as libf:
+    mode, suffix = ("-ptx", ".ptx") if self.ptx else ("-cubin", ".cubin")  # nvcc output flag and matching temp-file suffix
+    with tempfile.NamedTemporaryFile(suffix=".cu") as srcf, tempfile.NamedTemporaryFile(suffix=suffix) as libf:
       srcf.write(src.encode())
       srcf.flush()
-      system(f"nvcc -arch={self.arch} -ptx -o {libf.name} {srcf.name}" + ' '.join(self.extra_options))
+      system(f"nvcc -arch={self.arch} {mode} -o {libf.name} {srcf.name} " + ' '.join(self.extra_options))  # space keeps options off the src path
       return libf.read()
-  def disassemble(self, lib:bytes): cuda_disassemble(lib, self.arch, ptx=True)
+  def disassemble(self, lib:bytes): cuda_disassemble(lib, self.arch, ptx=self.ptx)

 class PTXCompiler(Compiler):
  def __init__(self, arch:str, cache_key="ptx"):