mirror of
https://github.com/eth-act/ere.git
synced 2026-04-03 03:00:17 -04:00
Retry RPC and restart server container when necessary (#225)
This commit is contained in:
40
Cargo.lock
generated
40
Cargo.lock
generated
@@ -2283,7 +2283,7 @@ dependencies = [
|
||||
"iana-time-zone",
|
||||
"num-traits",
|
||||
"serde",
|
||||
"windows-link",
|
||||
"windows-link 0.1.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3686,6 +3686,7 @@ dependencies = [
|
||||
"ere-server",
|
||||
"ere-test-utils",
|
||||
"ere-zkvm-interface",
|
||||
"parking_lot",
|
||||
"paste",
|
||||
"serde",
|
||||
"tempfile",
|
||||
@@ -6346,11 +6347,10 @@ checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.12"
|
||||
version = "0.4.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
|
||||
checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
@@ -9918,9 +9918,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.3"
|
||||
version = "0.12.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
|
||||
checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
"parking_lot_core",
|
||||
@@ -9928,15 +9928,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot_core"
|
||||
version = "0.9.10"
|
||||
version = "0.9.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
|
||||
checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"smallvec",
|
||||
"windows-targets 0.52.6",
|
||||
"windows-link 0.2.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -15116,7 +15116,7 @@ dependencies = [
|
||||
"windows-collections",
|
||||
"windows-core 0.61.0",
|
||||
"windows-future",
|
||||
"windows-link",
|
||||
"windows-link 0.1.3",
|
||||
"windows-numerics",
|
||||
]
|
||||
|
||||
@@ -15146,7 +15146,7 @@ checksum = "4763c1de310c86d75a878046489e2e5ba02c649d185f21c67d4cf8a56d098980"
|
||||
dependencies = [
|
||||
"windows-implement",
|
||||
"windows-interface",
|
||||
"windows-link",
|
||||
"windows-link 0.1.3",
|
||||
"windows-result",
|
||||
"windows-strings 0.4.0",
|
||||
]
|
||||
@@ -15158,7 +15158,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a1d6bbefcb7b60acd19828e1bc965da6fcf18a7e39490c5f8be71e54a19ba32"
|
||||
dependencies = [
|
||||
"windows-core 0.61.0",
|
||||
"windows-link",
|
||||
"windows-link 0.1.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -15189,6 +15189,12 @@ version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a"
|
||||
|
||||
[[package]]
|
||||
name = "windows-link"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
||||
|
||||
[[package]]
|
||||
name = "windows-numerics"
|
||||
version = "0.2.0"
|
||||
@@ -15196,7 +15202,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9150af68066c4c5c07ddc0ce30421554771e528bde427614c61038bc2c92c2b1"
|
||||
dependencies = [
|
||||
"windows-core 0.61.0",
|
||||
"windows-link",
|
||||
"windows-link 0.1.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -15216,7 +15222,7 @@ version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c64fd11a4fd95df68efcfee5f44a294fe71b8bc6a91993e2791938abcc712252"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
"windows-link 0.1.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -15225,7 +15231,7 @@ version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "87fa48cc5d406560701792be122a10132491cff9d0aeb23583cc2dcafc847319"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
"windows-link 0.1.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -15234,7 +15240,7 @@ version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a2ba9642430ee452d5a7aa78d72907ebe8cfda358e8cb7918a2050581322f97"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
"windows-link 0.1.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -15301,7 +15307,7 @@ version = "0.53.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
"windows-link 0.1.3",
|
||||
"windows_aarch64_gnullvm 0.53.0",
|
||||
"windows_aarch64_msvc 0.53.0",
|
||||
"windows_i686_gnu 0.53.0",
|
||||
|
||||
@@ -56,6 +56,7 @@ dashmap = "6.1.0"
|
||||
digest = { version = "0.10.7", default-features = false }
|
||||
eyre = "0.6.12"
|
||||
indexmap = "2.10.0"
|
||||
parking_lot = "0.12.5"
|
||||
paste = "1.0.15"
|
||||
postcard = { version = "1.0.8", default-features = false }
|
||||
prost = "0.13"
|
||||
|
||||
@@ -7,6 +7,7 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
parking_lot.workspace = true
|
||||
serde = { workspace = true, features = ["derive"] }
|
||||
tempfile.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -254,6 +254,32 @@ pub fn stop_docker_container(container_name: impl AsRef<str>) -> Result<(), Comm
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn docker_container_exists(container_name: impl AsRef<str>) -> Result<bool, CommonError> {
|
||||
let mut cmd = Command::new("docker");
|
||||
let output = cmd
|
||||
.args([
|
||||
"ps",
|
||||
"--filter",
|
||||
&format!("name={}", container_name.as_ref()),
|
||||
"--format",
|
||||
"{{.Names}}",
|
||||
])
|
||||
.output()
|
||||
.map_err(|err| CommonError::command(&cmd, err))?;
|
||||
|
||||
if !output.status.success() {
|
||||
Err(CommonError::command_exit_non_zero(
|
||||
&cmd,
|
||||
output.status,
|
||||
Some(&output),
|
||||
))?
|
||||
}
|
||||
|
||||
// If container exists and is running, its name will be printed
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
Ok(stdout.trim() == container_name.as_ref())
|
||||
}
|
||||
|
||||
pub fn docker_image_exists(image: impl AsRef<str>) -> Result<bool, CommonError> {
|
||||
let mut cmd = Command::new("docker");
|
||||
let output = cmd
|
||||
|
||||
@@ -5,7 +5,8 @@ use crate::{
|
||||
util::{
|
||||
cuda::cuda_arch,
|
||||
docker::{
|
||||
DockerBuildCmd, DockerRunCmd, docker_image_exists, force_rebuild, stop_docker_container,
|
||||
DockerBuildCmd, DockerRunCmd, docker_container_exists, docker_image_exists,
|
||||
force_rebuild, stop_docker_container,
|
||||
},
|
||||
home_dir, workspace_dir,
|
||||
},
|
||||
@@ -19,7 +20,8 @@ use ere_zkvm_interface::{
|
||||
PublicValues, zkVM,
|
||||
},
|
||||
};
|
||||
use std::iter;
|
||||
use parking_lot::RwLock;
|
||||
use std::{future::Future, iter};
|
||||
use tempfile::TempDir;
|
||||
use tracing::{error, info};
|
||||
|
||||
@@ -111,7 +113,7 @@ fn build_server_image(zkvm_kind: zkVMKind, gpu: bool) -> Result<(), Error> {
|
||||
|
||||
/// State kept for one server container: its name, its port, the RPC client
/// connected to it, and a temporary directory.
struct ServerContainer {
|
||||
// Container name; `with_retry` passes it to `docker_container_exists`.
name: String,
|
||||
// Port used to build the client endpoint `http://127.0.0.1:{port}`.
port: u16,
|
||||
// RPC client created by the constructor for that endpoint.
client: zkVMClient,
|
||||
// Never read (hence `dead_code`).
// NOTE(review): presumably held so the directory lives as long as the container - confirm.
#[allow(dead_code)]
|
||||
tempdir: TempDir,
|
||||
}
|
||||
@@ -215,25 +217,22 @@ impl ServerContainer {
|
||||
&program.0,
|
||||
)?;
|
||||
|
||||
let endpoint = Url::parse(&format!("http://127.0.0.1:{port}")).unwrap();
|
||||
let client = block_on(zkVMClient::new(endpoint))?;
|
||||
|
||||
Ok(ServerContainer {
|
||||
name,
|
||||
port,
|
||||
tempdir,
|
||||
client,
|
||||
})
|
||||
}
|
||||
|
||||
fn endpoint(&self) -> Url {
|
||||
Url::parse(&format!("http://127.0.0.1:{}", self.port)).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DockerizedzkVM {
|
||||
zkvm_kind: zkVMKind,
|
||||
program: SerializedProgram,
|
||||
resource: ProverResourceType,
|
||||
#[allow(dead_code)]
|
||||
server_container: ServerContainer,
|
||||
client: zkVMClient,
|
||||
container: RwLock<Option<ServerContainer>>,
|
||||
}
|
||||
|
||||
impl DockerizedzkVM {
|
||||
@@ -244,15 +243,13 @@ impl DockerizedzkVM {
|
||||
) -> Result<Self, Error> {
|
||||
build_server_image(zkvm_kind, matches!(resource, ProverResourceType::Gpu))?;
|
||||
|
||||
let server_container = ServerContainer::new(zkvm_kind, &program, &resource)?;
|
||||
let client = block_on(zkVMClient::new(server_container.endpoint()))?;
|
||||
let container = ServerContainer::new(zkvm_kind, &program, &resource)?;
|
||||
|
||||
Ok(Self {
|
||||
zkvm_kind,
|
||||
program,
|
||||
resource,
|
||||
server_container,
|
||||
client,
|
||||
container: RwLock::new(Some(container)),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -267,14 +264,53 @@ impl DockerizedzkVM {
|
||||
/// Returns a reference to the `ProverResourceType` stored in `self.resource`.
pub fn resource(&self) -> &ProverResourceType {
|
||||
&self.resource
|
||||
}
|
||||
|
||||
/// Calls `f` with the current container's client and retries after failures
/// that look like connection problems.
///
/// Returns the value of `f` as soon as it succeeds. A failure is returned to the
/// caller without retrying when it is an `Error::zkVM`, when it is an
/// `Error::Rpc` whose `rust_error()` is absent or does not contain "connect"
/// (compared in lowercase), or when `attempt` already exceeds `MAX_RETRY`.
/// Any other failure triggers a container check: if `docker_container_exists`
/// does not report the container (a failed check counts as "not found"), the
/// container is replaced by a fresh `ServerContainer::new`; then `f` is called
/// again. `attempt` starts at 1 and the limit test is `attempt > MAX_RETRY`,
/// so `f` can be called up to `MAX_RETRY + 1` times.
///
/// # Errors
///
/// Returns the last error of `f`, or the error of `ServerContainer::new` if
/// recreating the container fails.
///
/// # Panics
///
/// Panics on `unwrap` if the container slot is `None`.
/// NOTE(review): when `ServerContainer::new` fails below, the slot has already
/// been emptied by `take()` and stays `None`, so a later call would hit that
/// `unwrap` - confirm whether this state needs handling.
fn with_retry<T, F>(&self, mut f: F) -> anyhow::Result<T>
|
||||
where
|
||||
F: FnMut(&zkVMClient) -> Result<T, ere_server::client::Error>,
|
||||
{
|
||||
const MAX_RETRY: usize = 3;
|
||||
|
||||
let mut attempt = 1;
|
||||
loop {
|
||||
// The read guard is a temporary of this statement, so it is released
// before the write lock is requested further down.
let err = match f(&self.container.read().as_ref().unwrap().client) {
|
||||
Ok(ok) => return Ok(ok),
|
||||
Err(err) => Error::from(err),
|
||||
};
|
||||
|
||||
// Give up on errors that are not retryable or once the attempts are used up.
if matches!(&err, Error::zkVM(_))
|
||||
// Rpc error but not connection one
|
||||
|| matches!(&err, Error::Rpc(err) if err.rust_error().is_none_or(|err| !err.to_lowercase().contains("connect")))
|
||||
|| attempt > MAX_RETRY
|
||||
{
|
||||
return Err(err.into());
|
||||
}
|
||||
|
||||
error!("Rpc failed (attempt {attempt}/{MAX_RETRY}): {err}, checking container...");
|
||||
|
||||
// Hold the write lock while checking and possibly replacing the container.
let mut container = self.container.write();
|
||||
// An `Err` from the check is treated the same as "container not found".
if docker_container_exists(&container.as_ref().unwrap().name).is_ok_and(|exists| exists)
|
||||
{
|
||||
info!("Container is still running, retrying...");
|
||||
} else {
|
||||
info!("Container not found, recreating...");
|
||||
|
||||
// Drop the old `ServerContainer` before building its replacement.
// NOTE(review): presumably so its cleanup runs first - confirm.
drop(container.take());
|
||||
*container = Some(ServerContainer::new(
|
||||
self.zkvm_kind,
|
||||
|
||||
&self.program,
|
||||
&self.resource,
|
||||
)?);
|
||||
}
|
||||
attempt += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl zkVM for DockerizedzkVM {
|
||||
fn execute(&self, input: &[u8]) -> anyhow::Result<(PublicValues, ProgramExecutionReport)> {
|
||||
let (public_values, report) =
|
||||
block_on(self.client.execute(input.to_vec())).map_err(Error::from)?;
|
||||
|
||||
Ok((public_values, report))
|
||||
let input = input.to_vec();
|
||||
self.with_retry(|client| block_on(client.execute(input.clone())))
|
||||
}
|
||||
|
||||
fn prove(
|
||||
@@ -282,16 +318,13 @@ impl zkVM for DockerizedzkVM {
|
||||
input: &[u8],
|
||||
proof_kind: ProofKind,
|
||||
) -> anyhow::Result<(PublicValues, Proof, ProgramProvingReport)> {
|
||||
let (public_values, proof, report) =
|
||||
block_on(self.client.prove(input.to_vec(), proof_kind)).map_err(Error::from)?;
|
||||
|
||||
Ok((public_values, proof, report))
|
||||
let input = input.to_vec();
|
||||
self.with_retry(|client| block_on(client.prove(input.clone(), proof_kind)))
|
||||
}
|
||||
|
||||
fn verify(&self, proof: &Proof) -> anyhow::Result<PublicValues> {
|
||||
let public_values = block_on(self.client.verify(proof)).map_err(Error::from)?;
|
||||
|
||||
Ok(public_values)
|
||||
let proof = proof.clone();
|
||||
self.with_retry(|client| block_on(client.verify(&proof)))
|
||||
}
|
||||
|
||||
fn name(&self) -> &'static str {
|
||||
|
||||
Reference in New Issue
Block a user