Add sysbox support to remote runtime for eval; add memory monitor and stress tests to help debug memory issue (#6684)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: Graham Neubig <neubig@gmail.com>
2026-01-08 22:38:05 -05:00 · 2025-02-18 15:02:28 -05:00
parent 8d097efb4f
commit 1a7003a705
35 changed files with 687 additions and 419 deletions
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    codeact_user_response,
    compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -22,7 +23,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
-    SandboxConfig,
    get_llm_config_arg,
    parse_arguments,
 )
@@ -57,18 +57,15 @@ def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
    BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE

    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,