From f8e485ee9e6fc42d375c8d3adb6bbde55e1ab9d2 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Tue, 17 Feb 2026 19:07:05 +0800
Subject: [PATCH] nvcc/nvdisasm macos shim (#14822)

* move to backend

* and arch

* setup_nvcc_osx

* blackwell

* min test

* now getting dumb assert is_ptx

* support cubin.

* work

* remove that

* simpler
---
 extra/setup_nvcc_osx.sh                   | 24 +++++++++++++++++++++++
 tinygrad/helpers.py                       |  2 +-
 tinygrad/runtime/ops_nv.py                |  5 +++--
 tinygrad/runtime/support/compiler_cuda.py | 12 ++++++------
 4 files changed, 34 insertions(+), 9 deletions(-)
 create mode 100755 extra/setup_nvcc_osx.sh

diff --git a/extra/setup_nvcc_osx.sh b/extra/setup_nvcc_osx.sh
new file mode 100755
index 0000000000..d95348a342
--- /dev/null
+++ b/extra/setup_nvcc_osx.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+install_loc="$HOME/.local/bin"
+docker build --platform=linux/amd64 -t cuda-nvcc:12.8 - <<'EOF'
+FROM ubuntu:22.04
+RUN apt-get update && apt-get install -y --no-install-recommends wget ca-certificates && \
+  wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
+  dpkg -i cuda-keyring_1.1-1_all.deb && \
+  apt-get update && apt-get install -y --no-install-recommends cuda-nvcc-12-8 cuda-nvdisasm-12-8 cuda-cuobjdump-12-8 && rm -rf /var/lib/apt/lists/*
+ENV PATH=/usr/local/cuda/bin:$PATH
+EOF
+
+mkdir -p "$install_loc"
+tee "$install_loc/nvccshim" >/dev/null <<'EOF'
+#!/bin/bash
+set -eu
+# assume the final arg is the input path ("${@: -1}" is a bash extension, hence the bash shebang)
+# bind-mount its directory at the same path so the container can read it
+dir=$(dirname "${@: -1}")
+exec docker run --rm --platform=linux/amd64 -v "$dir":"$dir" cuda-nvcc:12.8 "$(basename "$0")" "$@"
+EOF
+chmod +x "$install_loc/nvccshim"
+for t in nvcc nvdisasm; do
+  ln -sf "$install_loc/nvccshim" "$install_loc/$t"
+done
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index b22c8169d9..e7d5673613 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -188,7 +188,7 @@ CAPTURE_PROCESS_REPLAY = ContextVar("CAPTURE_PROCESS_REPLAY", 0)
 CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else (os.cpu_count() or 1)))
 # Compilers
 CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0)
-NV_CC, NV_PTX, NV_NAK = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0)
+NV_CC, NV_PTX, NV_NAK, NV_NVCC = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("NV_NVCC", 0)
 CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0)
 NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT = ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0), ContextVar("NULL_ALLOW_COPYOUT", 0)
 AMD_CC, AMD_LLVM, AMD_HIPCC  = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0)
diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index 13baff4de3..2ca17163f5 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -7,7 +7,7 @@ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, H
 from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
 from tinygrad.uop.ops import sint
 from tinygrad.device import Compiled, BufferSpec, CompilerSet
-from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, PROFILE
+from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, NV_NVCC, PROFILE
 from tinygrad.helpers import ContextVar, VIZ, ProfileEvent
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.cstyle import CUDARenderer
@@ -621,7 +621,8 @@ class NVDevice(HCQCompiled[NVSignal]):
 
     compilers = CompilerSet(ctrl_var=NV_CC, cset=[(functools.partial(CUDARenderer, self.arch), None),
        (functools.partial(PTXRenderer, self.arch, device="NV"), NV_PTX),
-       (functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK)])
+       (functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK),
+       (functools.partial(CUDARenderer, self.arch, use_nvcc=True), NV_NVCC)])
     super().__init__(device, NVAllocator(self), compilers, functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)
 
     self.pma_enabled = PMA.value > 0 and PROFILE >= 1
diff --git a/tinygrad/runtime/support/compiler_cuda.py b/tinygrad/runtime/support/compiler_cuda.py
index 8c4aa4234a..8d545bbb3b 100644
--- a/tinygrad/runtime/support/compiler_cuda.py
+++ b/tinygrad/runtime/support/compiler_cuda.py
@@ -59,16 +59,16 @@ class NVRTCCompiler(Compiler):
 
 class NVCCCompiler(Compiler):
   def __init__(self, arch:str, ptx:bool=True, cache_key:str="cuda", extra_options:list[str]=[]):
-    assert ptx, "NVCCCompiler cubin support unimplemented"
-    self.arch, self.extra_options = arch, extra_options
-    super().__init__(f"compile_nvcc_{cache_key}_{self.arch}_{hashlib.sha256(' '.join(extra_options).encode()).hexdigest()[:8]}")
+    self.ptx, self.arch, self.extra_options = ptx, arch, extra_options
+    super().__init__(f"compile_nvcc_{cache_key}{'ptx' if ptx else ''}_{arch}_{hashlib.sha256(' '.join(extra_options).encode()).hexdigest()[:8]}")
   def compile(self, src:str) -> bytes:
-    with tempfile.NamedTemporaryFile(suffix=".cu") as srcf, tempfile.NamedTemporaryFile(suffix=".ptx") as libf:
+    mode, suffix = ("-ptx", ".ptx") if self.ptx else ("-cubin", ".cubin")  # nvcc output flag and the matching file suffix
+    with tempfile.NamedTemporaryFile(suffix=".cu") as srcf, tempfile.NamedTemporaryFile(suffix=suffix) as libf:
       srcf.write(src.encode())
       srcf.flush()
-      system(f"nvcc -arch={self.arch} -ptx -o {libf.name} {srcf.name}" + ' '.join(self.extra_options))
+      system(' '.join(["nvcc", f"-arch={self.arch}", mode, *self.extra_options, "-o", libf.name, srcf.name]))  # source path stays last
       return libf.read()
-  def disassemble(self, lib:bytes): cuda_disassemble(lib, self.arch, ptx=True)
+  def disassemble(self, lib:bytes): cuda_disassemble(lib, self.arch, ptx=self.ptx)
 
 class PTXCompiler(Compiler):
   def __init__(self, arch:str, cache_key="ptx"):