mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
nvcc/nvdisasm macos shim (#14822)
* move to backend
* and arch
* setup_nvcc_osx
* blackwell
* min test
* now getting dumb assert is_ptx
* support cubin
* work
* remove that
* simpler
This commit is contained in:
24
extra/setup_nvcc_osx.sh
Executable file
24
extra/setup_nvcc_osx.sh
Executable file
@@ -0,0 +1,24 @@
|
||||
#!/bin/sh
# Install nvcc/nvdisasm shims on macOS that proxy each invocation into a
# linux/amd64 docker image carrying the CUDA 12.8 toolchain.
set -eu

install_loc="$HOME/.local/bin"

# Build an Ubuntu 22.04 image with nvcc, nvdisasm and cuobjdump from NVIDIA's apt repo.
docker build --platform=linux/amd64 -t cuda-nvcc:12.8 - <<'EOF'
FROM ubuntu:22.04
RUN apt-get update && apt-get install -y --no-install-recommends wget ca-certificates && \
    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
    dpkg -i cuda-keyring_1.1-1_all.deb && \
    apt-get update && apt-get install -y --no-install-recommends cuda-nvcc-12-8 cuda-nvdisasm-12-8 cuda-cuobjdump-12-8 && rm -rf /var/lib/apt/lists/*
ENV PATH=/usr/local/cuda/bin:$PATH
EOF

mkdir -p "$install_loc"
tee "$install_loc/nvccshim" >/dev/null <<'EOF'
#!/bin/sh
set -eu
# assume the final arg is the input path; mount its directory so the container can read it.
# BUGFIX: ${@: -1} is a bash-ism and a syntax error under POSIX sh (dash);
# fetch the last positional parameter portably via eval instead.
eval "last=\${$#}"
dir=$(dirname "$last")
exec docker run --rm --platform=linux/amd64 -v "$dir":"$dir" cuda-nvcc:12.8 "$(basename "$0")" "$@"
EOF
chmod +x "$install_loc/nvccshim"

# The shim dispatches on $0, so one script serves every tool via symlinks.
for t in nvcc nvdisasm; do
  ln -sf "$install_loc/nvccshim" "$install_loc/$t"
done
|
||||
@@ -188,7 +188,7 @@ CAPTURE_PROCESS_REPLAY = ContextVar("CAPTURE_PROCESS_REPLAY", 0)
|
||||
# Number of worker CPUs: honor the process affinity mask when the OS exposes it
# (Linux), otherwise fall back to the machine's CPU count, never below 1.
CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else (os.cpu_count() or 1)))
# Compilers
CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0)
# NOTE(review): resolved diff residue — the pre-image NV_CC line (without NV_NVCC) duplicated
# its post-image replacement; only the post-image definition is kept.
NV_CC, NV_PTX, NV_NAK, NV_NVCC = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("NV_NVCC", 0)
CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0)
NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT = ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0), ContextVar("NULL_ALLOW_COPYOUT", 0)
AMD_CC, AMD_LLVM, AMD_HIPCC = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0)
|
||||
|
||||
@@ -7,7 +7,7 @@ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, H
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
|
||||
from tinygrad.uop.ops import sint
|
||||
from tinygrad.device import Compiled, BufferSpec, CompilerSet
|
||||
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, PROFILE
|
||||
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, NV_NVCC, PROFILE
|
||||
from tinygrad.helpers import ContextVar, VIZ, ProfileEvent
|
||||
from tinygrad.renderer.ptx import PTXRenderer
|
||||
from tinygrad.renderer.cstyle import CUDARenderer
|
||||
@@ -621,7 +621,8 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
|
||||
compilers = CompilerSet(ctrl_var=NV_CC, cset=[(functools.partial(CUDARenderer, self.arch), None),
|
||||
(functools.partial(PTXRenderer, self.arch, device="NV"), NV_PTX),
|
||||
(functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK)])
|
||||
(functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK),
|
||||
(functools.partial(CUDARenderer, self.arch, use_nvcc=True), NV_NVCC)])
|
||||
super().__init__(device, NVAllocator(self), compilers, functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)
|
||||
|
||||
self.pma_enabled = PMA.value > 0 and PROFILE >= 1
|
||||
|
||||
@@ -59,16 +59,16 @@ class NVRTCCompiler(Compiler):
|
||||
|
||||
class NVCCCompiler(Compiler):
  """Compile CUDA C source by shelling out to `nvcc`, producing PTX (default) or a cubin.

  arch: target GPU arch passed to `nvcc -arch=` (e.g. "sm_86").
  ptx: when True emit PTX text, when False emit a cubin binary.
  cache_key: namespace for the compile cache entry.
  extra_options: additional nvcc command-line flags.
  """
  def __init__(self, arch:str, ptx:bool=True, cache_key:str="cuda", extra_options:list[str]|None=None):
    # BUGFIX: avoid a shared mutable default argument for extra_options.
    self.ptx, self.arch, self.extra_options = ptx, arch, list(extra_options or [])
    # cache key folds in arch, output kind, and a short hash of the extra flags so different invocations never collide
    super().__init__(f"compile_nvcc_{cache_key+'ptx' if ptx else ''}_{self.arch}_{hashlib.sha256(' '.join(self.extra_options).encode()).hexdigest()[:8]}")
  def compile(self, src:str) -> bytes:
    mode, suffix = ("-ptx", ".ptx") if self.ptx else ("-cubin", ".cubin")
    # nvcc reads/writes real files, so round-trip the source and output through temp files
    with tempfile.NamedTemporaryFile(suffix=".cu") as srcf, tempfile.NamedTemporaryFile(suffix=suffix) as libf:
      srcf.write(src.encode())
      srcf.flush()
      # BUGFIX: separate the extra options from the source path with a leading space;
      # the original concatenated the first flag directly onto srcf.name.
      system(f"nvcc -arch={self.arch} {mode} -o {libf.name} {srcf.name}" + ''.join(f' {opt}' for opt in self.extra_options))
      return libf.read()
  def disassemble(self, lib:bytes): cuda_disassemble(lib, self.arch, ptx=self.ptx)
|
||||
|
||||
class PTXCompiler(Compiler):
|
||||
def __init__(self, arch:str, cache_key="ptx"):
|
||||
|
||||
Reference in New Issue
Block a user