From d15c6b495d0bc42bf160ecfbb11fa7600047c821 Mon Sep 17 00:00:00 2001 From: Han Date: Fri, 12 Dec 2025 00:03:57 +0900 Subject: [PATCH] Add 1 hour timeout for zisk proving (#245) --- Cargo.lock | 1 + Cargo.toml | 1 + crates/dockerized/src/zkvm.rs | 3 +- crates/zkvm/zisk/Cargo.toml | 1 + crates/zkvm/zisk/src/zkvm.rs | 4 +- crates/zkvm/zisk/src/zkvm/error.rs | 3 ++ crates/zkvm/zisk/src/zkvm/sdk.rs | 80 +++++++++++++++++++++++------- 7 files changed, 73 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6294bcc..c042db1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4170,6 +4170,7 @@ dependencies = [ "tempfile", "thiserror 2.0.12", "tracing", + "wait-timeout", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 4b442d2..34b5ed4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,6 +79,7 @@ tracing = "0.1.41" tracing-subscriber = "0.3.19" twirp = "0.9.1" twirp-build = "0.9.0" +wait-timeout = "0.2.1" # Airbender dependencies airbender_execution_utils = { git = "https://github.com/matter-labs/zksync-airbender", package = "execution_utils", tag = "v0.5.1" } diff --git a/crates/dockerized/src/zkvm.rs b/crates/dockerized/src/zkvm.rs index 736ae16..cfe5347 100644 --- a/crates/dockerized/src/zkvm.rs +++ b/crates/dockerized/src/zkvm.rs @@ -164,7 +164,8 @@ impl ServerContainer { .inherit_env("ZISK_SHARED_TABLES") .inherit_env("ZISK_MAX_STREAMS") .inherit_env("ZISK_NUMBER_THREADS_WITNESS") - .inherit_env("ZISK_MAX_WITNESS_STORED"), + .inherit_env("ZISK_MAX_WITNESS_STORED") + .inherit_env("ZISK_PROVE_TIMEOUT_SEC"), _ => cmd, }; diff --git a/crates/zkvm/zisk/Cargo.toml b/crates/zkvm/zisk/Cargo.toml index 099f621..581a6ff 100644 --- a/crates/zkvm/zisk/Cargo.toml +++ b/crates/zkvm/zisk/Cargo.toml @@ -14,6 +14,7 @@ strum = { workspace = true, features = ["derive"] } tempfile.workspace = true thiserror.workspace = true tracing.workspace = true +wait-timeout.workspace = true # Local dependencies ere-compile-utils = { workspace = true, optional = true } diff --git a/crates/zkvm/zisk/src/zkvm.rs b/crates/zkvm/zisk/src/zkvm.rs index 85668ac..d7029c2 100644 --- a/crates/zkvm/zisk/src/zkvm.rs +++ b/crates/zkvm/zisk/src/zkvm.rs @@ -1,6 +1,6 @@ use crate::{ program::ZiskProgram, - zkvm::sdk::{RomDigest, ZiskOptions, ZiskSdk, ZiskServer}, + zkvm::sdk::{RomDigest, START_SERVER_TIMEOUT, ZiskOptions, ZiskSdk, ZiskServer}, }; use anyhow::bail; use ere_zkvm_interface::zkvm::{ @@ -46,7 +46,7 @@ impl EreZisk { if server .as_ref() - .is_none_or(|server| server.status().is_err()) + .is_none_or(|server| server.status(START_SERVER_TIMEOUT).is_err()) { const MAX_RETRY: usize = 3; let mut retry = 0; diff --git a/crates/zkvm/zisk/src/zkvm/error.rs b/crates/zkvm/zisk/src/zkvm/error.rs index cb616be..394fc2b 100644 --- a/crates/zkvm/zisk/src/zkvm/error.rs +++ b/crates/zkvm/zisk/src/zkvm/error.rs @@ -26,6 +26,9 @@ pub enum Error { #[error("Server crashed")] ServerCrashed, + #[error("Timeout waiting for server proving")] + TimeoutWaitingServerProving, + #[error("Timeout waiting for server ready")] TimeoutWaitingServerReady, diff --git a/crates/zkvm/zisk/src/zkvm/sdk.rs b/crates/zkvm/zisk/src/zkvm/sdk.rs index 7e8cf57..f6854ee 100644 --- a/crates/zkvm/zisk/src/zkvm/sdk.rs +++ b/crates/zkvm/zisk/src/zkvm/sdk.rs @@ -7,7 +7,7 @@ use std::{ iter, net::{Ipv4Addr, TcpStream}, path::{Path, PathBuf}, - process::{Child, Command}, + process::{Child, Command, Stdio}, sync::OnceLock, thread, time::{Duration, Instant}, @@ -15,6 +15,11 @@ use std::{ use strum::{EnumIter, IntoEnumIterator}; use tempfile::tempdir; use tracing::{error, info}; +use wait_timeout::ChildExt; + +pub const START_SERVER_TIMEOUT: Duration = Duration::from_secs(120); // 2 mins +pub const SHUTDOWN_SERVER_TIMEOUT: Duration = Duration::from_secs(30); // 30 secs +pub const DEFAULT_PROVE_TIMEOUT: Duration = Duration::from_secs(3600); // 1 hour /// Merkle root of ROM trace generated by `cargo-zisk rom-setup`. pub type RomDigest = [u64; 4]; @@ -334,16 +339,32 @@ pub struct ZiskServer { impl Drop for ZiskServer { fn drop(&mut self) { info!("Shutting down ZisK server"); - let result = Command::new("cargo-zisk") + + let mut cmd = Command::new("cargo-zisk"); + let result = cmd .args(["prove-client", "shutdown"]) .args(self.options.prove_client_args()) - .output(); - if result.is_err() || result.as_ref().is_ok_and(|output| !output.status.success()) { + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .and_then( + |mut child| match child.wait_timeout(SHUTDOWN_SERVER_TIMEOUT)? { + Some(_) => child.wait_with_output(), + None => { + child.kill().ok(); + Err(std::io::Error::other("shutdown command timed out")) + } + }, + ); + + if result.as_ref().is_ok_and(|output| output.status.success()) { + info!("Shutdown ZisK server"); + } else { error!( - "Failed to shutdown ZisK server{}", + "Failed to shutdown ZisK server: {}", result - .map(|output| format!(": {}", String::from_utf8_lossy(&output.stderr))) - .unwrap_or_default() + .map(|output| String::from_utf8_lossy(&output.stderr).to_string()) + .unwrap_or_else(|err| err.to_string()) ); error!("Shutdown server child process and asm services manually..."); let _ = self.child.kill(); @@ -351,20 +372,34 @@ impl Drop for ZiskServer { shutdown_asm_service(23116); shutdown_asm_service(23117); remove_shm_files(); - } else { - info!("Shutdown ZisK server"); } } } impl ZiskServer { /// Get status of server. - pub fn status(&self) -> Result { + pub fn status(&self, timeout: Duration) -> Result { let mut cmd = Command::new("cargo-zisk"); - let output = cmd + let mut child = cmd .args(["prove-client", "status"]) .args(self.options.prove_client_args()) - .output() + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|err| CommonError::command(&cmd, err))?; + + if child + .wait_timeout(timeout) + .map_err(|err| CommonError::command(&cmd, err))? + .is_none() + { + // Timeout reached, kill the process + child.kill().ok(); + return Err(Error::TimeoutWaitingServerReady); + } + + let output = child + .wait_with_output() .map_err(|err| CommonError::command(&cmd, err))?; if !output.status.success() { @@ -425,11 +460,20 @@ impl ZiskServer { ))?; } + // By default set 1 hour timeout for prove. + let prove_timeout = env::var("ZISK_PROVE_TIMEOUT_SEC") + .ok() + .and_then(|timeout| timeout.parse::().ok()) + .map(Duration::from_secs) + .unwrap_or(DEFAULT_PROVE_TIMEOUT); + // ZisK server will finish the `prove` requested above then respond the // following `status`. So if the following `status` succeeds, the proof // should also be ready. - self.status().map_err(|err| { - if err.to_string().contains("EOF") { + self.status(prove_timeout).map_err(|err| { + if matches!(err, Error::TimeoutWaitingServerReady) { + Error::TimeoutWaitingServerProving + } else if err.to_string().contains("EOF") { Error::ServerCrashed } else { err @@ -455,14 +499,16 @@ impl ZiskServer { /// Wait until the server status to be idle. fn wait_until_ready(&self) -> Result<(), Error> { - const TIMEOUT: Duration = Duration::from_secs(120); // 2mins const INTERVAL: Duration = Duration::from_secs(1); info!("Waiting until server is ready..."); let start = Instant::now(); - while !matches!(self.status(), Ok(ZiskServerStatus::Idle)) { - if start.elapsed() > TIMEOUT { + while !matches!( + self.status(START_SERVER_TIMEOUT), + Ok(ZiskServerStatus::Idle) + ) { + if start.elapsed() > START_SERVER_TIMEOUT { return Err(Error::TimeoutWaitingServerReady); } thread::sleep(INTERVAL);