Add 1 hour timeout for zisk proving (#245)

This commit is contained in:
Han
2025-12-12 00:03:57 +09:00
committed by GitHub
parent ab50b417d9
commit d15c6b495d
7 changed files with 73 additions and 20 deletions

1
Cargo.lock generated
View File

@@ -4170,6 +4170,7 @@ dependencies = [
"tempfile",
"thiserror 2.0.12",
"tracing",
"wait-timeout",
]
[[package]]

View File

@@ -79,6 +79,7 @@ tracing = "0.1.41"
tracing-subscriber = "0.3.19"
twirp = "0.9.1"
twirp-build = "0.9.0"
wait-timeout = "0.2.1"
# Airbender dependencies
airbender_execution_utils = { git = "https://github.com/matter-labs/zksync-airbender", package = "execution_utils", tag = "v0.5.1" }

View File

@@ -164,7 +164,8 @@ impl ServerContainer {
.inherit_env("ZISK_SHARED_TABLES")
.inherit_env("ZISK_MAX_STREAMS")
.inherit_env("ZISK_NUMBER_THREADS_WITNESS")
.inherit_env("ZISK_MAX_WITNESS_STORED"),
.inherit_env("ZISK_MAX_WITNESS_STORED")
.inherit_env("ZISK_PROVE_TIMEOUT_SEC"),
_ => cmd,
};

View File

@@ -14,6 +14,7 @@ strum = { workspace = true, features = ["derive"] }
tempfile.workspace = true
thiserror.workspace = true
tracing.workspace = true
wait-timeout.workspace = true
# Local dependencies
ere-compile-utils = { workspace = true, optional = true }

View File

@@ -1,6 +1,6 @@
use crate::{
program::ZiskProgram,
zkvm::sdk::{RomDigest, ZiskOptions, ZiskSdk, ZiskServer},
zkvm::sdk::{RomDigest, START_SERVER_TIMEOUT, ZiskOptions, ZiskSdk, ZiskServer},
};
use anyhow::bail;
use ere_zkvm_interface::zkvm::{
@@ -46,7 +46,7 @@ impl EreZisk {
if server
.as_ref()
.is_none_or(|server| server.status().is_err())
.is_none_or(|server| server.status(START_SERVER_TIMEOUT).is_err())
{
const MAX_RETRY: usize = 3;
let mut retry = 0;

View File

@@ -26,6 +26,9 @@ pub enum Error {
#[error("Server crashed")]
ServerCrashed,
#[error("Timeout waiting for server proving")]
TimeoutWaitingServerProving,
#[error("Timeout waiting for server ready")]
TimeoutWaitingServerReady,

View File

@@ -7,7 +7,7 @@ use std::{
iter,
net::{Ipv4Addr, TcpStream},
path::{Path, PathBuf},
process::{Child, Command},
process::{Child, Command, Stdio},
sync::OnceLock,
thread,
time::{Duration, Instant},
@@ -15,6 +15,11 @@ use std::{
use strum::{EnumIter, IntoEnumIterator};
use tempfile::tempdir;
use tracing::{error, info};
use wait_timeout::ChildExt;
pub const START_SERVER_TIMEOUT: Duration = Duration::from_secs(120); // 2 mins
pub const SHUTDOWN_SERVER_TIMEOUT: Duration = Duration::from_secs(30); // 30 secs
pub const DEFAULT_PROVE_TIMEOUT: Duration = Duration::from_secs(3600); // 1 hour
/// Merkle root of ROM trace generated by `cargo-zisk rom-setup`.
pub type RomDigest = [u64; 4];
@@ -334,16 +339,32 @@ pub struct ZiskServer {
impl Drop for ZiskServer {
fn drop(&mut self) {
info!("Shutting down ZisK server");
let result = Command::new("cargo-zisk")
let mut cmd = Command::new("cargo-zisk");
let result = cmd
.args(["prove-client", "shutdown"])
.args(self.options.prove_client_args())
.output();
if result.is_err() || result.as_ref().is_ok_and(|output| !output.status.success()) {
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.and_then(
|mut child| match child.wait_timeout(SHUTDOWN_SERVER_TIMEOUT)? {
Some(_) => child.wait_with_output(),
None => {
child.kill().ok();
Err(std::io::Error::other("shutdown command timed out"))
}
},
);
if result.as_ref().is_ok_and(|output| output.status.success()) {
info!("Shutdown ZisK server");
} else {
error!(
"Failed to shutdown ZisK server{}",
"Failed to shutdown ZisK server: {}",
result
.map(|output| format!(": {}", String::from_utf8_lossy(&output.stderr)))
.unwrap_or_default()
.map(|output| String::from_utf8_lossy(&output.stderr).to_string())
.unwrap_or_else(|err| err.to_string())
);
error!("Shutdown server child process and asm services manually...");
let _ = self.child.kill();
@@ -351,20 +372,34 @@ impl Drop for ZiskServer {
shutdown_asm_service(23116);
shutdown_asm_service(23117);
remove_shm_files();
} else {
info!("Shutdown ZisK server");
}
}
}
impl ZiskServer {
/// Get status of server.
pub fn status(&self) -> Result<ZiskServerStatus, Error> {
pub fn status(&self, timeout: Duration) -> Result<ZiskServerStatus, Error> {
let mut cmd = Command::new("cargo-zisk");
let output = cmd
let mut child = cmd
.args(["prove-client", "status"])
.args(self.options.prove_client_args())
.output()
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.map_err(|err| CommonError::command(&cmd, err))?;
if child
.wait_timeout(timeout)
.map_err(|err| CommonError::command(&cmd, err))?
.is_none()
{
// Timeout reached, kill the process
child.kill().ok();
return Err(Error::TimeoutWaitingServerReady);
}
let output = child
.wait_with_output()
.map_err(|err| CommonError::command(&cmd, err))?;
if !output.status.success() {
@@ -425,11 +460,20 @@ impl ZiskServer {
))?;
}
// By default set 1 hour timeout for prove.
let prove_timeout = env::var("ZISK_PROVE_TIMEOUT_SEC")
.ok()
.and_then(|timeout| timeout.parse::<u64>().ok())
.map(Duration::from_secs)
.unwrap_or(DEFAULT_PROVE_TIMEOUT);
// ZisK server will finish the `prove` requested above then respond the
// following `status`. So if the following `status` succeeds, the proof
// should also be ready.
self.status().map_err(|err| {
if err.to_string().contains("EOF") {
self.status(prove_timeout).map_err(|err| {
if matches!(err, Error::TimeoutWaitingServerReady) {
Error::TimeoutWaitingServerProving
} else if err.to_string().contains("EOF") {
Error::ServerCrashed
} else {
err
@@ -455,14 +499,16 @@ impl ZiskServer {
/// Wait until the server status becomes idle.
fn wait_until_ready(&self) -> Result<(), Error> {
const TIMEOUT: Duration = Duration::from_secs(120); // 2mins
const INTERVAL: Duration = Duration::from_secs(1);
info!("Waiting until server is ready...");
let start = Instant::now();
while !matches!(self.status(), Ok(ZiskServerStatus::Idle)) {
if start.elapsed() > TIMEOUT {
while !matches!(
self.status(START_SERVER_TIMEOUT),
Ok(ZiskServerStatus::Idle)
) {
if start.elapsed() > START_SERVER_TIMEOUT {
return Err(Error::TimeoutWaitingServerReady);
}
thread::sleep(INTERVAL);