mirror of
https://github.com/eth-act/ere.git
synced 2026-04-03 03:00:17 -04:00
fix: detect all compute cap; require single cuda arch for zisk
This commit is contained in:
17
.github/scripts/build-image.sh
vendored
17
.github/scripts/build-image.sh
vendored
@@ -139,7 +139,10 @@ fi
|
||||
|
||||
# Default CUDA_ARCHS when --cuda is set but --cuda-archs not specified
|
||||
if [ "$CUDA" = true ] && [ -z "$CUDA_ARCHS" ]; then
|
||||
CUDA_ARCHS="89,120"
|
||||
case "$ZKVM" in
|
||||
zisk) CUDA_ARCHS="120" ;; # Default to RTX 50 series (ZisK only support setting single CUDA arch)
|
||||
*) CUDA_ARCHS="89,120" ;; # Default to RTX 40 and 50 series
|
||||
esac
|
||||
fi
|
||||
|
||||
# Per-zkVM CUDA architecture translation
|
||||
@@ -165,10 +168,14 @@ if [ "$CUDA" = true ] && [ -n "$CUDA_ARCHS" ]; then
|
||||
SERVER_ZKVM_BUILD_ARGS+=(--build-arg "NVCC_APPEND_FLAGS=$NVCC_APPEND_FLAGS")
|
||||
;;
|
||||
zisk)
|
||||
MAX_CUDA_ARCH=$(echo "$CUDA_ARCHS" | tr ',' '\n' | sort -n | tail -1)
|
||||
BASE_ZKVM_BUILD_ARGS+=(--build-arg "CUDA_ARCH=sm_${MAX_CUDA_ARCH}")
|
||||
SERVER_ZKVM_BUILD_ARGS+=(--build-arg "CUDA_ARCH=sm_${MAX_CUDA_ARCH}")
|
||||
CLUSTER_ZKVM_BUILD_ARGS+=(--build-arg "CUDA_ARCH=sm_${MAX_CUDA_ARCH}")
|
||||
IFS=',' read -ra ARCH_ARRAY <<< "$CUDA_ARCHS"
|
||||
if [ "${#ARCH_ARRAY[@]}" -ne 1 ]; then
|
||||
echo "Error: Multiple CUDA architectures are not supported for zisk: $CUDA_ARCHS"
|
||||
exit 1
|
||||
fi
|
||||
BASE_ZKVM_BUILD_ARGS+=(--build-arg "CUDA_ARCH=sm_${ARCH_ARRAY[0]}")
|
||||
SERVER_ZKVM_BUILD_ARGS+=(--build-arg "CUDA_ARCH=sm_${ARCH_ARRAY[0]}")
|
||||
CLUSTER_ZKVM_BUILD_ARGS+=(--build-arg "CUDA_ARCH=sm_${ARCH_ARRAY[0]}")
|
||||
;;
|
||||
*)
|
||||
;;
|
||||
|
||||
12
.github/workflows/build-and-push-images.yml
vendored
12
.github/workflows/build-and-push-images.yml
vendored
@@ -100,6 +100,9 @@ jobs:
|
||||
- risc0
|
||||
- sp1
|
||||
- zisk
|
||||
include:
|
||||
- zkvm: zisk
|
||||
cuda_archs: '120'
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
@@ -121,7 +124,7 @@ jobs:
|
||||
--registry ${{ needs.image_meta.outputs.registry }} \
|
||||
--tag ${{ needs.image_meta.outputs.sha_tag }}-cuda \
|
||||
--base \
|
||||
--cuda-archs '${{ env.CUDA_ARCHS }}'
|
||||
--cuda-archs '${{ matrix.cuda_archs || env.CUDA_ARCHS }}'
|
||||
|
||||
- name: Push ere-base and ere-base-${{ matrix.zkvm }} images with CUDA enabled
|
||||
run: |
|
||||
@@ -134,7 +137,7 @@ jobs:
|
||||
--registry ${{ needs.image_meta.outputs.registry }} \
|
||||
--tag ${{ needs.image_meta.outputs.sha_tag }}-cuda \
|
||||
--server \
|
||||
--cuda-archs '${{ env.CUDA_ARCHS }}'
|
||||
--cuda-archs '${{ matrix.cuda_archs || env.CUDA_ARCHS }}'
|
||||
|
||||
- name: Push ere-server-${{ matrix.zkvm }} image with CUDA enabled
|
||||
run: |
|
||||
@@ -152,6 +155,9 @@ jobs:
|
||||
matrix:
|
||||
zkvm:
|
||||
- zisk
|
||||
include:
|
||||
- zkvm: zisk
|
||||
cuda_archs: '120'
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
@@ -173,7 +179,7 @@ jobs:
|
||||
--registry ${{ needs.image_meta.outputs.registry }} \
|
||||
--tag ${{ needs.image_meta.outputs.sha_tag }}-cuda \
|
||||
--cluster \
|
||||
--cuda-archs '${{ env.CUDA_ARCHS }}'
|
||||
--cuda-archs '${{ matrix.cuda_archs || env.CUDA_ARCHS }}'
|
||||
|
||||
- name: Push ere-cluster-${{ matrix.zkvm }} image with CUDA enabled
|
||||
run: |
|
||||
|
||||
1
.github/workflows/test-zkvm-zisk.yml
vendored
1
.github/workflows/test-zkvm-zisk.yml
vendored
@@ -17,5 +17,6 @@ jobs:
|
||||
with:
|
||||
zkvm: zisk
|
||||
cuda: true
|
||||
cuda_archs: '120'
|
||||
cluster: true
|
||||
skip_prove_test: true
|
||||
|
||||
12
.github/workflows/test-zkvm.yml
vendored
12
.github/workflows/test-zkvm.yml
vendored
@@ -12,6 +12,11 @@ on:
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
cuda_archs:
|
||||
description: 'Comma-separated CUDA archs to gencode'
|
||||
required: false
|
||||
type: string
|
||||
default: '89,120'
|
||||
cluster:
|
||||
description: 'Whether to build cluster image'
|
||||
required: false
|
||||
@@ -26,7 +31,6 @@ on:
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CUDA_ARCHS: '89,120'
|
||||
|
||||
jobs:
|
||||
image_meta:
|
||||
@@ -99,7 +103,7 @@ jobs:
|
||||
--registry ${{ needs.image_meta.outputs.image_registry }} \
|
||||
--tag ${{ needs.image_meta.outputs.image_tag }}-cuda \
|
||||
--cached-tag "$CACHED_TAG" \
|
||||
--cuda-archs '${{ env.CUDA_ARCHS }}'
|
||||
--cuda-archs '${{ inputs.cuda_archs }}'
|
||||
|
||||
- name: Build ere-server-${{ inputs.zkvm }} image with CUDA enabled
|
||||
run: |
|
||||
@@ -108,7 +112,7 @@ jobs:
|
||||
--registry ${{ needs.image_meta.outputs.image_registry }} \
|
||||
--tag ${{ needs.image_meta.outputs.image_tag }}-cuda \
|
||||
--server \
|
||||
--cuda-archs '${{ env.CUDA_ARCHS }}'
|
||||
--cuda-archs '${{ inputs.cuda_archs }}'
|
||||
|
||||
- name: Build ere-cluster-${{ inputs.zkvm }} image with CUDA enabled
|
||||
if: ${{ inputs.cluster && needs.image_meta.outputs.dockerfile_changed == 'true' }}
|
||||
@@ -118,7 +122,7 @@ jobs:
|
||||
--registry ${{ needs.image_meta.outputs.image_registry }} \
|
||||
--tag ${{ needs.image_meta.outputs.image_tag }}-cuda \
|
||||
--cluster \
|
||||
--cuda-archs '${{ env.CUDA_ARCHS }}'
|
||||
--cuda-archs '${{ inputs.cuda_archs }}'
|
||||
|
||||
clippy_via_docker:
|
||||
name: Clippy via Docker
|
||||
|
||||
@@ -1,65 +1,60 @@
|
||||
use std::{env, process::Command};
|
||||
use tracing::{info, warn};
|
||||
|
||||
/// Returns Cuda GPU compute capability, for example
|
||||
/// - RTX 50 series - returns `12.0`
|
||||
/// - RTX 40 series - returns `8.9`
|
||||
/// Detects CUDA compute capabilities of all visible GPUs.
|
||||
///
|
||||
/// If there are multiple GPUs available, the first result will be returned.
|
||||
pub fn cuda_compute_cap() -> Option<String> {
|
||||
let output = Command::new("nvidia-smi")
|
||||
/// Returns a sorted, deduplicated list of numeric compute capabilities
|
||||
/// (e.g. `[89, 120]` for a mix of RTX 40 and RTX 50 series GPUs).
|
||||
///
|
||||
/// Returns an empty vec if `nvidia-smi` is not available or fails.
|
||||
pub fn detect_compute_caps() -> Vec<u32> {
|
||||
let Ok(output) = Command::new("nvidia-smi")
|
||||
.args(["--query-gpu=compute_cap", "--format=csv,noheader"])
|
||||
.output()
|
||||
.ok()?;
|
||||
else {
|
||||
return vec![];
|
||||
};
|
||||
|
||||
if !output.status.success() {
|
||||
return None;
|
||||
return vec![];
|
||||
}
|
||||
|
||||
Some(
|
||||
String::from_utf8_lossy(&output.stdout)
|
||||
.lines()
|
||||
.next()?
|
||||
.trim()
|
||||
.to_string(),
|
||||
)
|
||||
let mut caps: Vec<u32> = String::from_utf8_lossy(&output.stdout)
|
||||
.lines()
|
||||
.filter_map(|line| line.trim().replace('.', "").parse::<u32>().ok())
|
||||
.collect();
|
||||
caps.sort_unstable();
|
||||
caps.dedup();
|
||||
caps
|
||||
}
|
||||
|
||||
/// Returns CUDA architecture(s) as comma-separated numeric strings
|
||||
/// (e.g. "120", "89,120").
|
||||
/// Returns CUDA architectures as a list of numeric values (e.g. `[89, 120]`).
|
||||
///
|
||||
/// It does the following checks and returns the first valid value:
|
||||
/// 1. Read env variable `CUDA_ARCHS` and validate format (comma-separated numbers).
|
||||
/// 2. Detect compute capability of the first visible GPU and convert to numeric format.
|
||||
/// 2. Detect compute capabilities of all visible GPUs.
|
||||
///
|
||||
/// Otherwise it returns `None`.
|
||||
pub fn cuda_archs() -> Option<String> {
|
||||
/// Returns an empty vec if neither source provides valid architectures.
|
||||
pub fn cuda_archs() -> Vec<u32> {
|
||||
if let Ok(val) = env::var("CUDA_ARCHS") {
|
||||
let valid = !val.is_empty()
|
||||
&& val
|
||||
.split(',')
|
||||
.all(|s| !s.is_empty() && s.parse::<u32>().is_ok());
|
||||
if valid {
|
||||
info!("Using CUDA_ARCHS {val} from env variable");
|
||||
return Some(val);
|
||||
let archs: Option<Vec<u32>> = val.split(',').map(|s| s.parse::<u32>().ok()).collect();
|
||||
match archs {
|
||||
Some(archs) if !archs.is_empty() => {
|
||||
info!("Using CUDA_ARCHS {val} from env variable");
|
||||
return archs;
|
||||
}
|
||||
_ => warn!(
|
||||
"Skipping CUDA_ARCHS {val} from env variable \
|
||||
(expected comma-separated numbers, e.g. \"89,120\")"
|
||||
),
|
||||
}
|
||||
warn!(
|
||||
"Skipping CUDA_ARCHS {val} from env variable \
|
||||
(expected comma-separated numbers, e.g. \"89,120\")"
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(cap) = cuda_compute_cap() {
|
||||
let numeric = cap.replace('.', "");
|
||||
if numeric.parse::<u32>().is_ok() {
|
||||
info!("Using CUDA compute capability {cap} detected (CUDA_ARCHS={numeric})");
|
||||
return Some(numeric);
|
||||
}
|
||||
warn!(
|
||||
"Skipping CUDA compute capability {cap} detected \
|
||||
(expected a version number, e.g. 12.0)"
|
||||
);
|
||||
let caps = detect_compute_caps();
|
||||
if !caps.is_empty() {
|
||||
info!("Detected CUDA compute capabilities (CUDA_ARCHS={caps:?})");
|
||||
return caps;
|
||||
}
|
||||
|
||||
None
|
||||
vec![]
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ use std::{
|
||||
};
|
||||
use tempfile::TempDir;
|
||||
use tokio::{sync::RwLock, time::sleep};
|
||||
use tracing::{error, info};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
mod error;
|
||||
|
||||
@@ -47,32 +47,54 @@ pub use error::Error;
|
||||
/// - Airbender: `CUDAARCHS` (semicolon-separated, e.g. "89;120")
|
||||
/// - OpenVM: `CUDA_ARCH` (comma-separated, e.g. "89,120")
|
||||
/// - Risc0: `NVCC_APPEND_FLAGS` (nvcc --generate-code flags)
|
||||
/// - Zisk: `CUDA_ARCH` (single largest arch, e.g. "sm_120")
|
||||
/// - Zisk: `CUDA_ARCH` (support only one CUDA architecture, e.g. "sm_120")
|
||||
fn apply_cuda_build_args(
|
||||
cmd: DockerBuildCmd,
|
||||
zkvm_kind: zkVMKind,
|
||||
cuda_archs: &str,
|
||||
) -> DockerBuildCmd {
|
||||
match zkvm_kind {
|
||||
zkVMKind::Airbender => cmd.build_arg("CUDAARCHS", cuda_archs.replace(',', ";")),
|
||||
zkVMKind::OpenVM => cmd.build_arg("CUDA_ARCH", cuda_archs),
|
||||
cuda_archs: &[u32],
|
||||
) -> Result<DockerBuildCmd, Error> {
|
||||
if cuda_archs.is_empty() {
|
||||
warn!("No CUDA_ARCHS set or detected, use default value in Dockerfile");
|
||||
return Ok(cmd);
|
||||
}
|
||||
|
||||
Ok(match zkvm_kind {
|
||||
zkVMKind::Airbender => {
|
||||
let value = cuda_archs
|
||||
.iter()
|
||||
.map(|arch| arch.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(";");
|
||||
cmd.build_arg("CUDAARCHS", value)
|
||||
}
|
||||
zkVMKind::OpenVM => {
|
||||
let value = cuda_archs
|
||||
.iter()
|
||||
.map(|arch| arch.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
cmd.build_arg("CUDA_ARCH", value)
|
||||
}
|
||||
zkVMKind::Risc0 => {
|
||||
let flags = cuda_archs
|
||||
.split(',')
|
||||
.map(|arch| format!("--generate-code arch=compute_{arch},code=sm_{arch} "))
|
||||
.collect::<String>();
|
||||
cmd.build_arg("NVCC_APPEND_FLAGS", flags.trim_end())
|
||||
let value = cuda_archs
|
||||
.iter()
|
||||
.map(|arch| format!("--generate-code arch=compute_{arch},code=sm_{arch}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ");
|
||||
cmd.build_arg("NVCC_APPEND_FLAGS", value)
|
||||
}
|
||||
zkVMKind::Zisk => {
|
||||
let max_cuda_arch = cuda_archs
|
||||
.split(',')
|
||||
.filter_map(|s| s.parse::<u32>().ok())
|
||||
.max()
|
||||
.unwrap_or(120);
|
||||
cmd.build_arg("CUDA_ARCH", format!("sm_{max_cuda_arch}"))
|
||||
if cuda_archs.len() != 1 {
|
||||
return Err(Error::UnsupportedMultiCudaArchs(
|
||||
zkVMKind::Zisk,
|
||||
cuda_archs.to_vec(),
|
||||
));
|
||||
}
|
||||
let value = format!("sm_{}", cuda_archs[0]);
|
||||
cmd.build_arg("CUDA_ARCH", value)
|
||||
}
|
||||
_ => cmd,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// This method builds 3 Docker images in sequence:
|
||||
@@ -112,7 +134,7 @@ fn build_server_image(zkvm_kind: zkVMKind, gpu: bool) -> Result<(), Error> {
|
||||
let docker_zkvm_dir = docker_dir.join(zkvm_kind.as_str());
|
||||
|
||||
// Resolve CUDA architectures once for both base-zkvm and server builds.
|
||||
let cuda_archs = if gpu { cuda_archs() } else { None };
|
||||
let cuda_archs = if gpu { cuda_archs() } else { vec![] };
|
||||
|
||||
// Build `ere-base`
|
||||
if force_rebuild || !docker_image_exists(&base_image)? {
|
||||
@@ -141,10 +163,7 @@ fn build_server_image(zkvm_kind: zkVMKind, gpu: bool) -> Result<(), Error> {
|
||||
|
||||
if gpu {
|
||||
cmd = cmd.build_arg("CUDA", "1");
|
||||
|
||||
if let Some(ref cuda_archs) = cuda_archs {
|
||||
cmd = apply_cuda_build_args(cmd, zkvm_kind, cuda_archs);
|
||||
}
|
||||
cmd = apply_cuda_build_args(cmd, zkvm_kind, &cuda_archs)?;
|
||||
}
|
||||
|
||||
cmd.exec(&workspace_dir)?;
|
||||
@@ -161,10 +180,7 @@ fn build_server_image(zkvm_kind: zkVMKind, gpu: bool) -> Result<(), Error> {
|
||||
|
||||
if gpu {
|
||||
cmd = cmd.build_arg("CUDA", "1");
|
||||
|
||||
if let Some(ref cuda_archs) = cuda_archs {
|
||||
cmd = apply_cuda_build_args(cmd, zkvm_kind, cuda_archs);
|
||||
}
|
||||
cmd = apply_cuda_build_args(cmd, zkvm_kind, &cuda_archs)?;
|
||||
}
|
||||
|
||||
cmd.exec(&workspace_dir)?;
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use ere_common::zkVMKind;
|
||||
use ere_server::client::{self, ParseError, TwirpErrorResponse};
|
||||
use ere_zkvm_interface::CommonError;
|
||||
use thiserror::Error;
|
||||
@@ -19,6 +20,10 @@ pub enum Error {
|
||||
CommonError(#[from] CommonError),
|
||||
#[error(transparent)]
|
||||
ParseUrl(#[from] ParseError),
|
||||
#[error(
|
||||
"Multiple CUDA architectures are not supported for {0:?}, CUDA_ARCHS set or detected: {1:?}"
|
||||
)]
|
||||
UnsupportedMultiCudaArchs(zkVMKind, Vec<u32>),
|
||||
#[error("zkVM method error: {0}")]
|
||||
zkVM(String),
|
||||
#[error("Connection to zkVM server timeout after 5 minutes")]
|
||||
|
||||
Reference in New Issue
Block a user