Add sysbox support to remote runtime for eval; Add memory monitor, stress tests to help debug memory issue (#6684)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: Graham Neubig <neubig@gmail.com>
Author: Xingyao Wang
Date: 2025-02-18 15:02:28 -05:00
Committed by: GitHub
Parent: 8d097efb4f
Commit: 1a7003a705
35 changed files with 687 additions and 419 deletions
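
Most of the changed files apply one repeated refactor: each eval benchmark's get_config() used to build its own inline SandboxConfig, and now it starts from the shared get_default_sandbox_config_for_eval() helper (added to evaluation/utils/shared.py in this commit) and only overrides per-benchmark fields. A condensed sketch of the pattern, with the container image as a placeholder rather than any specific benchmark's value:

    from evaluation.utils.shared import get_default_sandbox_config_for_eval
    from openhands.core.config import AppConfig


    def get_config(metadata) -> AppConfig:
        # Shared eval defaults: remote-runtime settings, timeouts, sysbox runtime class.
        sandbox_config = get_default_sandbox_config_for_eval()
        # Per-benchmark overrides stay local to each run_infer.py.
        sandbox_config.base_container_image = 'python:3.12-bookworm'  # placeholder image
        return AppConfig(
            default_agent=metadata.agent_class,
            run_as_openhands=False,
            max_iterations=metadata.max_iterations,
            sandbox=sandbox_config,
            # do not mount workspace
            workspace_base=None,
            workspace_mount_path=None,
        )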

View File

@@ -9,6 +9,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -17,7 +18,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -60,17 +60,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=False,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -25,7 +26,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -40,21 +40,15 @@ from openhands.utils.async_utils import call_async_from_sync
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-slim'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime=os.environ.get('RUNTIME', 'docker'),
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-slim',
-            enable_auto_lint=True,
-            use_host_network=False,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -24,7 +25,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     load_from_toml,
     parse_arguments,
@@ -47,22 +47,14 @@ SKIP_NUM = (
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.11-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime=os.environ.get('RUNTIME', 'docker'),
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            timeout=100,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=1800,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -22,7 +23,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -57,18 +57,15 @@ def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
     BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -25,7 +26,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -71,17 +71,15 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -10,6 +10,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -18,7 +19,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -36,17 +36,14 @@ def get_config(
     assert (
         metadata.max_iterations == 1
     ), 'max_iterations must be 1 for browsing delegation evaluation.'
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=False,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         workspace_base=None,
         workspace_mount_path=None,
     )

View File

@@ -15,6 +15,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     assert_and_raise,
     codeact_user_response,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -25,7 +26,6 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
     AgentConfig,
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -105,9 +105,7 @@ def get_config(
     instance: pd.Series,
     metadata: EvalMetadata,
 ) -> AppConfig:
-    # COMMIT0_CONTAINER_IMAGE = 'wentingzhao/'
     assert USE_INSTANCE_IMAGE
-    # We use a different instance image for the each instance of commit0 eval
     repo_name = instance['repo'].split('/')[1]
     base_container_image = get_instance_docker_image(repo_name)
     logger.info(
@@ -115,28 +113,16 @@ def get_config(
         f'Please make sure this image exists. '
         f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
     )
-    # else:
-    #     raise
-    #     base_container_image = SWE_BENCH_CONTAINER_IMAGE
-    #     logger.info(f'Using swe-bench container image: {base_container_image}')
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         max_iterations=metadata.max_iterations,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -25,7 +26,6 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
     AgentConfig,
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -62,17 +62,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -21,7 +22,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -48,17 +48,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -19,7 +20,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -40,17 +40,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -29,6 +29,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -37,7 +38,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -61,17 +61,14 @@ ACTION_FORMAT = """
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -22,6 +22,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -30,7 +31,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -82,17 +82,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -9,6 +9,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -17,7 +18,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -45,18 +45,18 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'xingyaoww/od-eval-logic-reasoning:v1.0'
+    sandbox_config.runtime_extra_deps = (
+        '$OH_INTERPRETER_PATH -m pip install scitools-pyke'
+    )
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='xingyaoww/od-eval-logic-reasoning:v1.0',
-            enable_auto_lint=True,
-            use_host_network=False,
-            runtime_extra_deps='$OH_INTERPRETER_PATH -m pip install scitools-pyke',
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -12,6 +12,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -21,7 +22,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -55,23 +55,14 @@ def get_config(
     metadata: EvalMetadata,
     env_id: str,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'xingyaoww/od-eval-miniwob:v1.0'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime=os.environ.get('RUNTIME', 'docker'),
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='xingyaoww/od-eval-miniwob:v1.0',
-            enable_auto_lint=True,
-            use_host_network=False,
-            browsergym_eval_env=env_id,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            remote_runtime_init_timeout=1800,
-            keep_runtime_alive=False,
-            timeout=120,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -22,7 +23,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -103,18 +103,18 @@ def load_incontext_example(task_name: str, with_tool: bool = True):
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'xingyaoww/od-eval-mint:v1.0'
+    sandbox_config.runtime_extra_deps = (
+        f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}'
+    )
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='xingyaoww/od-eval-mint:v1.0',
-            enable_auto_lint=True,
-            use_host_network=False,
-            runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -25,6 +25,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -33,7 +34,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
     load_app_config,
@@ -77,16 +77,14 @@ ID2CONDA = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'public.ecr.aws/i5g0m1f6/ml-bench'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='public.ecr.aws/i5g0m1f6/ml-bench',
-            enable_auto_lint=True,
-            use_host_network=False,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -20,7 +21,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -59,22 +59,17 @@ def get_config(
     metadata: EvalMetadata,
     instance_id: str,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = (
+        'docker.io/xingyaoww/openhands-eval-scienceagentbench'
+    )
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime=os.environ.get('RUNTIME', 'docker'),
         max_budget_per_task=4,
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='docker.io/xingyaoww/openhands-eval-scienceagentbench',
-            enable_auto_lint=True,
-            use_host_network=False,
-            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -1,5 +1,6 @@
 import json
 import os
+import subprocess
 import tempfile
 import time
 from functools import partial
@@ -21,13 +22,14 @@ from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    get_default_sandbox_config_for_eval,
     prepare_dataset,
     reset_logger_for_multiprocessing,
     run_evaluation,
 )
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
+    LLMConfig,
     get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
@@ -79,22 +81,16 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> AppConfig:
         f'Please make sure this image exists. '
         f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
     )
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
+    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
+        dataset_name=metadata.dataset,
+        instance_id=instance['instance_id'],
+    )
     config = AppConfig(
         run_as_openhands=False,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=600,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            remote_runtime_init_timeout=3600,
-            remote_runtime_resource_factor=get_instance_resource_factor(
-                dataset_name=metadata.dataset,
-                instance_id=instance['instance_id'],
-            ),
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
@@ -415,13 +411,17 @@ if __name__ == '__main__':
     else:
         # Initialize with a dummy metadata when file doesn't exist
         metadata = EvalMetadata(
-            agent_class="dummy_agent",  # Placeholder agent class
-            llm_config=LLMConfig(model="dummy_model"),  # Minimal LLM config
+            agent_class='dummy_agent',  # Placeholder agent class
+            llm_config=LLMConfig(model='dummy_model'),  # Minimal LLM config
             max_iterations=1,  # Minimal iterations
-            eval_output_dir=os.path.dirname(args.input_file),  # Use input file dir as output dir
+            eval_output_dir=os.path.dirname(
+                args.input_file
+            ),  # Use input file dir as output dir
             start_time=time.strftime('%Y-%m-%d %H:%M:%S'),  # Current time
-            git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('utf-8').strip(),  # Current commit
-            dataset=args.dataset  # Dataset name from args
+            git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+            .decode('utf-8')
+            .strip(),  # Current commit
+            dataset=args.dataset,  # Dataset name from args
         )
     # The evaluation harness constrains the signature of `process_instance_func` but we need to

View File

@@ -1 +0,0 @@
{"pydata__xarray-6721": 8, "pytest-dev__pytest-7236": 8, "matplotlib__matplotlib-24627": 4, "django__django-15561": 4, "django__django-15098": 4, "django__django-14771": 4, "sympy__sympy-21612": 4, "sympy__sympy-15345": 4, "psf__requests-5414": 4, "astropy__astropy-14508": 2, "django__django-11451": 2, "django__django-11477": 2, "django__django-10880": 2, "django__django-11163": 2, "django__django-11815": 2, "astropy__astropy-14369": 2, "django__django-10097": 2, "django__django-10554": 2, "django__django-12304": 2, "django__django-12325": 2, "django__django-11551": 2, "django__django-11734": 2, "django__django-13109": 2, "django__django-13089": 2, "django__django-13343": 2, "django__django-13363": 2, "django__django-13809": 2, "django__django-13810": 2, "django__django-13786": 2, "django__django-13807": 2, "django__django-14493": 2, "django__django-11820": 2, "django__django-11951": 2, "django__django-11964": 2, "astropy__astropy-14309": 2, "astropy__astropy-14365": 2, "astropy__astropy-12907": 2, "astropy__astropy-14182": 2, "django__django-15161": 2, "django__django-15128": 2, "django__django-14999": 2, "django__django-14915": 2, "django__django-14752": 2, "django__django-14765": 2, "django__django-14089": 2, "django__django-15252": 2, "django__django-15380": 2, "django__django-15382": 2, "django__django-15499": 2, "django__django-15467": 2, "django__django-15280": 2, "django__django-15315": 2, "django__django-15277": 2, "django__django-15268": 2, "django__django-15629": 2, "django__django-15695": 2, "django__django-15732": 2, "django__django-15863": 2, "django__django-16082": 2, "django__django-16145": 2, "django__django-16256": 2, "django__django-16429": 2, "django__django-16454": 2, "django__django-16493": 2, "matplotlib__matplotlib-13989": 2, "matplotlib__matplotlib-20488": 2, "django__django-15503": 2, "django__django-15525": 2, "django__django-15375": 2, "django__django-15278": 2, "matplotlib__matplotlib-21568": 2, "matplotlib__matplotlib-20859": 2, "matplotlib__matplotlib-20826": 2, "matplotlib__matplotlib-20676": 2, "matplotlib__matplotlib-23412": 2, "matplotlib__matplotlib-22719": 2, "matplotlib__matplotlib-23299": 2, "matplotlib__matplotlib-22865": 2, "matplotlib__matplotlib-24149": 2, "matplotlib__matplotlib-24177": 2, "matplotlib__matplotlib-24570": 2, "matplotlib__matplotlib-24637": 2, "matplotlib__matplotlib-24970": 2, "matplotlib__matplotlib-23476": 2, "matplotlib__matplotlib-24026": 2, "matplotlib__matplotlib-23314": 2, "matplotlib__matplotlib-25332": 2, "matplotlib__matplotlib-25311": 2, "matplotlib__matplotlib-25122": 2, "matplotlib__matplotlib-25479": 2, "matplotlib__matplotlib-26342": 2, "psf__requests-2317": 2, "matplotlib__matplotlib-25960": 2, "matplotlib__matplotlib-25775": 2, "pydata__xarray-4356": 2, "pydata__xarray-4075": 2, "pydata__xarray-6461": 2, "pydata__xarray-4687": 2, "pydata__xarray-6599": 2, "pylint-dev__pylint-4661": 2, "django__django-15554": 2, "django__django-15563": 2, "pytest-dev__pytest-5262": 2, "pytest-dev__pytest-10081": 2, "scikit-learn__scikit-learn-12973": 2, "scikit-learn__scikit-learn-13124": 2, "scikit-learn__scikit-learn-13779": 2, "scikit-learn__scikit-learn-14141": 2, "scikit-learn__scikit-learn-13439": 2, "scikit-learn__scikit-learn-13496": 2, "scikit-learn__scikit-learn-15100": 2, "scikit-learn__scikit-learn-25102": 2, "scikit-learn__scikit-learn-25232": 2, "scikit-learn__scikit-learn-25747": 2, "scikit-learn__scikit-learn-26323": 2, "scikit-learn__scikit-learn-9288": 2, "scikit-learn__scikit-learn-14496": 2, 
"scikit-learn__scikit-learn-14629": 2, "sphinx-doc__sphinx-8265": 2, "sphinx-doc__sphinx-8548": 2, "sphinx-doc__sphinx-8593": 2, "sphinx-doc__sphinx-8595": 2, "sphinx-doc__sphinx-8621": 2, "sphinx-doc__sphinx-8638": 2, "sphinx-doc__sphinx-9229": 2, "sphinx-doc__sphinx-9281": 2, "sphinx-doc__sphinx-9461": 2, "sphinx-doc__sphinx-9591": 2, "sphinx-doc__sphinx-9658": 2, "sphinx-doc__sphinx-9673": 2, "sympy__sympy-12096": 2, "sympy__sympy-12481": 2, "sphinx-doc__sphinx-10323": 2, "sphinx-doc__sphinx-7590": 2, "sympy__sympy-13877": 2, "sympy__sympy-12489": 2, "sympy__sympy-15809": 2, "sympy__sympy-14711": 2, "sympy__sympy-16597": 2, "sympy__sympy-16766": 2, "sympy__sympy-16792": 2, "sympy__sympy-15875": 2, "sympy__sympy-17655": 2, "sympy__sympy-18189": 2, "sympy__sympy-18763": 2, "sympy__sympy-19040": 2, "sympy__sympy-19495": 2, "sympy__sympy-19637": 2, "sympy__sympy-19783": 2, "sympy__sympy-17630": 2, "sympy__sympy-20428": 2, "sympy__sympy-20590": 2, "sympy__sympy-20801": 2, "sympy__sympy-21379": 2, "sympy__sympy-21847": 2, "sympy__sympy-22456": 2, "sympy__sympy-22714": 2, "sympy__sympy-22914": 2, "sympy__sympy-23262": 2, "sympy__sympy-23413": 2, "sympy__sympy-23534": 2, "sympy__sympy-24066": 2, "sympy__sympy-24213": 2, "sympy__sympy-24443": 2, "sympy__sympy-24562": 2, "sympy__sympy-24661": 2}

View File

@@ -18,6 +18,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     assert_and_raise,
     codeact_user_response,
+    get_default_sandbox_config_for_eval,
     get_metrics,
     is_fatal_evaluation_error,
     make_metadata,
@@ -30,7 +31,6 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
     AgentConfig,
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -122,30 +122,23 @@ def get_config(
         base_container_image = SWE_BENCH_CONTAINER_IMAGE
         logger.info(f'Using swe-bench container image: {base_container_image}')
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
+    sandbox_config.enable_auto_lint = True
+    sandbox_config.use_host_network = False
+    # Add platform to the sandbox config to solve issue 4401
+    sandbox_config.platform = 'linux/amd64'
+    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
+        dataset_name=metadata.dataset,
+        instance_id=instance['instance_id'],
+    )
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         max_iterations=metadata.max_iterations,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            # Add platform to the sandbox config to solve issue 4401
-            platform='linux/amd64',
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-            remote_runtime_api_timeout=120,
-            remote_runtime_resource_factor=get_instance_resource_factor(
-                dataset_name=metadata.dataset,
-                instance_id=instance['instance_id'],
-            ),
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
@@ -331,6 +324,22 @@ def complete_runtime(
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    if obs.exit_code == -1:
+        # The previous command is still running
+        # We need to kill previous command
+        logger.info('The previous command is still running, trying to kill it...')
+        action = CmdRunAction(command='C-c')
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        # Then run the command again
+        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
+        action.set_hard_timeout(600)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
     assert_and_raise(
         isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
         f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',

View File

@@ -13,11 +13,11 @@ from typing import List
 import yaml
 from browsing import pre_login
+from evaluation.utils.shared import get_default_sandbox_config_for_eval
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
     LLMConfig,
-    SandboxConfig,
     get_agent_config_arg,
     get_llm_config_arg,
     get_parser,
@@ -38,6 +38,8 @@ def get_config(
     llm_config: LLMConfig,
     agent_config: AgentConfig | None,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
     config = AppConfig(
         run_as_openhands=False,
         max_budget_per_task=4,
@@ -45,16 +47,7 @@ def get_config(
         save_trajectory_path=os.path.join(
             mount_path_on_host, f'traj_{task_short_name}.json'
         ),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            # using host network to access the host machine from the container
-            use_host_network=True,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # we mount trajectories path so that trajectories, generated by OpenHands
         # controller, can be accessible to the evaluator file in the runtime container
         workspace_mount_path=mount_path_on_host,

View File

@@ -10,6 +10,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -18,7 +19,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -41,17 +41,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -20,7 +21,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -55,32 +55,29 @@ def get_config(
     assert base_url is not None, 'VISUALWEBARENA_BASE_URL must be set'
     assert openai_api_key is not None, 'OPENAI_API_KEY must be set'
     assert openai_base_url is not None, 'OPENAI_BASE_URL must be set'
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
+    sandbox_config.browsergym_eval_env = env_id
+    sandbox_config.runtime_startup_env_vars = {
+        'BASE_URL': base_url,
+        'OPENAI_API_KEY': openai_api_key,
+        'OPENAI_BASE_URL': openai_base_url,
+        'VWA_CLASSIFIEDS': f'{base_url}:9980',
+        'VWA_CLASSIFIEDS_RESET_TOKEN': '4b61655535e7ed388f0d40a93600254c',
+        'VWA_SHOPPING': f'{base_url}:7770',
+        'VWA_SHOPPING_ADMIN': f'{base_url}:7780/admin',
+        'VWA_REDDIT': f'{base_url}:9999',
+        'VWA_GITLAB': f'{base_url}:8023',
+        'VWA_WIKIPEDIA': f'{base_url}:8888',
+        'VWA_HOMEPAGE': f'{base_url}:4399',
+    }
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            browsergym_eval_env=env_id,
-            runtime_startup_env_vars={
-                'BASE_URL': base_url,
-                'OPENAI_API_KEY': openai_api_key,
-                'OPENAI_BASE_URL': openai_base_url,
-                'VWA_CLASSIFIEDS': f'{base_url}:9980',
-                'VWA_CLASSIFIEDS_RESET_TOKEN': '4b61655535e7ed388f0d40a93600254c',
-                'VWA_SHOPPING': f'{base_url}:7770',
-                'VWA_SHOPPING_ADMIN': f'{base_url}:7780/admin',
-                'VWA_REDDIT': f'{base_url}:9999',
-                'VWA_GITLAB': f'{base_url}:8023',
-                'VWA_WIKIPEDIA': f'{base_url}:8888',
-                'VWA_HOMEPAGE': f'{base_url}:4399',
-            },
-            timeout=300,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -19,7 +20,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -50,29 +50,26 @@ def get_config(
     assert base_url is not None, 'WEBARENA_BASE_URL must be set'
     assert openai_api_key is not None, 'OPENAI_API_KEY must be set'
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
+    sandbox_config.browsergym_eval_env = env_id
+    sandbox_config.runtime_startup_env_vars = {
+        'BASE_URL': base_url,
+        'OPENAI_API_KEY': openai_api_key,
+        'SHOPPING': f'{base_url}:7770/',
+        'SHOPPING_ADMIN': f'{base_url}:7780/admin',
+        'REDDIT': f'{base_url}:9999',
+        'GITLAB': f'{base_url}:8023',
+        'WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
+        'MAP': f'{base_url}:3000',
+        'HOMEPAGE': f'{base_url}:4399',
+    }
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            browsergym_eval_env=env_id,
-            runtime_startup_env_vars={
-                'BASE_URL': base_url,
-                'OPENAI_API_KEY': openai_api_key,
-                'SHOPPING': f'{base_url}:7770/',
-                'SHOPPING_ADMIN': f'{base_url}:7780/admin',
-                'REDDIT': f'{base_url}:9999',
-                'GITLAB': f'{base_url}:8023',
-                'WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
-                'MAP': f'{base_url}:3000',
-                'HOMEPAGE': f'{base_url}:4399',
-            },
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -8,6 +8,7 @@ from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestRes
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -21,7 +22,6 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
     AgentConfig,
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -43,23 +43,14 @@ def get_config(
     metadata: EvalMetadata,
     instance_id: str,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.platform = 'linux/amd64'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime=os.environ.get('RUNTIME', 'docker'),
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            # use default base_container_image
-            enable_auto_lint=True,
-            use_host_network=False,
-            timeout=300,
-            # Add platform to the sandbox config to solve issue 4401
-            platform='linux/amd64',
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -16,7 +16,7 @@ from pydantic import BaseModel
 from tqdm import tqdm
 from openhands.controller.state.state import State
-from openhands.core.config import LLMConfig
+from openhands.core.config import LLMConfig, SandboxConfig
 from openhands.core.config.agent_config import AgentConfig
 from openhands.core.config.condenser_config import (
     CondenserConfig,
@@ -555,3 +555,18 @@ def get_metrics(state: State) -> dict[str, Any]:
     metrics = state.metrics.get() if state.metrics else {}
     metrics['condenser'] = get_condensation_metadata(state)
     return metrics
+
+
+def get_default_sandbox_config_for_eval() -> SandboxConfig:
+    return SandboxConfig(
+        use_host_network=False,
+        # large enough timeout, since some testcases take very long to run
+        timeout=300,
+        api_key=os.environ.get('ALLHANDS_API_KEY', None),
+        remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+        keep_runtime_alive=False,
+        remote_runtime_init_timeout=3600,
+        remote_runtime_api_timeout=120,
+        remote_runtime_enable_retries=True,
+        remote_runtime_class='sysbox',
+    )
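
This helper centralizes the remote-runtime defaults that each benchmark previously copied by hand; note that it reads ALLHANDS_API_KEY and SANDBOX_REMOTE_RUNTIME_API_URL from the environment and now defaults remote_runtime_class to 'sysbox'. Benchmarks with extra needs layer overrides on top, as in the swe_bench and integration_tests hunks above; a small sketch, where the resource factor value is illustrative only:

    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = base_container_image  # per-instance image
    sandbox_config.platform = 'linux/amd64'  # workaround for issue 4401
    sandbox_config.remote_runtime_resource_factor = 2  # illustrative value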

View File

@@ -52,6 +52,9 @@ class SandboxConfig(BaseModel):
     remote_runtime_init_timeout: int = Field(default=180)
     remote_runtime_api_timeout: int = Field(default=10)
     remote_runtime_enable_retries: bool = Field(default=False)
+    remote_runtime_class: str | None = Field(
+        default='sysbox'
+    )  # can be "None" (default to gvisor) or "sysbox" (support docker inside runtime + more stable)
     enable_auto_lint: bool = Field(
         default=False  # once enabled, OpenHands would lint files after editing
     )
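
The new remote_runtime_class field is what the eval defaults above opt into; the RemoteRuntime change further down accepts only None, 'sysbox', or 'gvisor', with None falling back to the gvisor behavior on the remote runtime side. A minimal sketch of toggling it, assuming only the values listed in that assert:

    from openhands.core.config import SandboxConfig

    sandbox = SandboxConfig()            # defaults to 'sysbox' after this commit
    sandbox.remote_runtime_class = None  # fall back to the gvisor default
    assert sandbox.remote_runtime_class in (None, 'sysbox', 'gvisor')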

View File

@@ -57,6 +57,7 @@ from openhands.runtime.browser.browser_env import BrowserEnv
 from openhands.runtime.plugins import ALL_PLUGINS, JupyterPlugin, Plugin, VSCodePlugin
 from openhands.runtime.utils.bash import BashSession
 from openhands.runtime.utils.files import insert_lines, read_lines
+from openhands.runtime.utils.memory_monitor import MemoryMonitor
 from openhands.runtime.utils.runtime_init import init_user_and_working_directory
 from openhands.runtime.utils.system_stats import get_system_stats
 from openhands.utils.async_utils import call_sync_from_async, wait_all
@@ -171,12 +172,19 @@ class ActionExecutor:
         else:
             logger.info('No max memory limit set, using all available system memory')
+        self.memory_monitor = MemoryMonitor(
+            enable=os.environ.get('RUNTIME_MEMORY_MONITOR', 'False').lower()
+            in ['true', '1', 'yes']
+        )
+        self.memory_monitor.start_monitoring()
     @property
     def initial_cwd(self):
         return self._initial_cwd
     async def ainit(self):
         # bash needs to be initialized first
+        logger.debug('Initializing bash session')
         self.bash_session = BashSession(
             work_dir=self._initial_cwd,
             username=self.username,
@@ -186,15 +194,18 @@ class ActionExecutor:
             max_memory_mb=self.max_memory_gb * 1024 if self.max_memory_gb else None,
         )
         self.bash_session.initialize()
+        logger.debug('Bash session initialized')
         await wait_all(
             (self._init_plugin(plugin) for plugin in self.plugins_to_load),
             timeout=30,
         )
+        logger.debug('All plugins initialized')
         # This is a temporary workaround
         # TODO: refactor AgentSkills to be part of JupyterPlugin
         # AFTER ServerRuntime is deprecated
+        logger.debug('Initializing AgentSkills')
         if 'agent_skills' in self.plugins and 'jupyter' in self.plugins:
             obs = await self.run_ipython(
                 IPythonRunCellAction(
@@ -203,6 +214,7 @@ class ActionExecutor:
             )
             logger.debug(f'AgentSkills initialized: {obs}')
+        logger.debug('Initializing bash commands')
         await self._init_bash_commands()
         logger.debug('Runtime client initialized.')
         self._initialized = True
@@ -447,6 +459,7 @@ class ActionExecutor:
             return await browse(action, self.browser)
     def close(self):
+        self.memory_monitor.stop_monitoring()
         if self.bash_session is not None:
             self.bash_session.close()
         self.browser.close()

View File

@@ -255,7 +255,6 @@ class DockerRuntime(ActionExecutionClient):
server_port=self._container_port, server_port=self._container_port,
plugins=self.plugins, plugins=self.plugins,
app_config=self.config, app_config=self.config,
use_nice_for_root=False,
) )
try: try:

View File

@@ -75,6 +75,8 @@ class RemoteRuntime(ActionExecutionClient):
'remote_runtime_api_url is required in the remote runtime.' 'remote_runtime_api_url is required in the remote runtime.'
) )
assert self.config.sandbox.remote_runtime_class in (None, 'sysbox', 'gvisor')
self.runtime_builder = RemoteRuntimeBuilder( self.runtime_builder = RemoteRuntimeBuilder(
self.config.sandbox.remote_runtime_api_url, self.config.sandbox.remote_runtime_api_url,
self.config.sandbox.api_key, self.config.sandbox.api_key,
@@ -225,6 +227,9 @@ class RemoteRuntime(ActionExecutionClient):
'session_id': self.sid, 'session_id': self.sid,
'resource_factor': self.config.sandbox.remote_runtime_resource_factor, 'resource_factor': self.config.sandbox.remote_runtime_resource_factor,
} }
if self.config.sandbox.remote_runtime_class == 'sysbox':
start_request['runtime_class'] = 'sysbox-runc'
# We ignore other runtime classes for now, because both None and 'gvisor' map to 'gvisor'
# Start the sandbox using the /start endpoint # Start the sandbox using the /start endpoint
try: try:
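Condensed into one place, the selection logic above amounts to the sketch below; the standalone helper name is for illustration only.

```python
# Sketch: how the start request maps remote_runtime_class to a runtime class.
def build_start_request(sid: str, sandbox) -> dict:
    start_request = {
        'session_id': sid,
        'resource_factor': sandbox.remote_runtime_resource_factor,
    }
    if sandbox.remote_runtime_class == 'sysbox':
        # sysbox-runc supports Docker inside the runtime container.
        start_request['runtime_class'] = 'sysbox-runc'
    # None and 'gvisor' both fall through to the gvisor default.
    return start_request
```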

View File

@@ -16,7 +16,6 @@ def get_action_execution_server_startup_command(
plugins: list[PluginRequirement], plugins: list[PluginRequirement],
app_config: AppConfig, app_config: AppConfig,
python_prefix: list[str] = DEFAULT_PYTHON_PREFIX, python_prefix: list[str] = DEFAULT_PYTHON_PREFIX,
use_nice_for_root: bool = True,
override_user_id: int | None = None, override_user_id: int | None = None,
override_username: str | None = None, override_username: str | None = None,
): ):
@@ -40,7 +39,6 @@ def get_action_execution_server_startup_command(
user_id = override_user_id or ( user_id = override_user_id or (
sandbox_config.user_id if app_config.run_as_openhands else 0 sandbox_config.user_id if app_config.run_as_openhands else 0
) )
is_root = bool(username == 'root')
base_cmd = [ base_cmd = [
*python_prefix, *python_prefix,
@@ -59,17 +57,4 @@ def get_action_execution_server_startup_command(
*browsergym_args, *browsergym_args,
] ]
if is_root and use_nice_for_root: return base_cmd
# If running as root, set highest priority and lowest OOM score
cmd_str = ' '.join(base_cmd)
return [
'nice',
'-n',
'-20', # Highest priority
'sh',
'-c',
f'echo -1000 > /proc/self/oom_score_adj && exec {cmd_str}',
]
else:
# If not root OR not using nice for root, run with normal priority
return base_cmd

View File

@@ -0,0 +1,66 @@
"""Memory monitoring utilities for the runtime."""
import threading
from memory_profiler import memory_usage
from openhands.core.logger import openhands_logger as logger
class LogStream:
"""Stream-like object that redirects writes to a logger."""
def write(self, message):
if message and not message.isspace():
logger.info(f'[Memory usage] {message.strip()}')
def flush(self):
pass
class MemoryMonitor:
def __init__(self, enable: bool = False):
"""Memory monitor for the runtime."""
self._monitoring_thread: threading.Thread | None = None
self._stop_monitoring = threading.Event()
self.log_stream = LogStream()
self.enable = enable
def start_monitoring(self):
"""Start monitoring memory usage."""
if not self.enable:
return
if self._monitoring_thread is not None:
return
def monitor_process():
try:
# Use memory_usage's built-in monitoring loop
mem_usage = memory_usage(
-1, # Monitor current process
interval=0.1,  # Sample every 0.1 seconds
timeout=3600,  # Run for up to one hour
max_usage=False, # Get continuous readings
include_children=True, # Include child processes
multiprocess=True, # Monitor all processes
stream=self.log_stream, # Redirect output to logger
backend='psutil_pss',
)
logger.info(f'Memory usage across time: {mem_usage}')
except Exception as e:
logger.error(f'Memory monitoring failed: {e}')
self._monitoring_thread = threading.Thread(target=monitor_process, daemon=True)
self._monitoring_thread.start()
logger.info('Memory monitoring started')
def stop_monitoring(self):
"""Stop monitoring memory usage."""
if not self.enable:
return
if self._monitoring_thread is not None:
self._stop_monitoring.set()
self._monitoring_thread = None
logger.info('Memory monitoring stopped')
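A usage sketch for the monitor on its own; RUNTIME_MEMORY_MONITOR is the same switch the action execution server checks, and the workload placeholder is illustrative.

```python
# Sketch: enabling the memory monitor manually around an arbitrary workload.
import os

from openhands.runtime.utils.memory_monitor import MemoryMonitor

monitor = MemoryMonitor(
    enable=os.environ.get('RUNTIME_MEMORY_MONITOR', 'False').lower()
    in ['true', '1', 'yes']
)
monitor.start_monitoring()  # no-op unless enabled
try:
    pass  # run the workload whose memory profile should be logged
finally:
    monitor.stop_monitoring()
```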

poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. # This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand.
[[package]] [[package]]
name = "aiohappyeyeballs" name = "aiohappyeyeballs"
@@ -4909,6 +4909,21 @@ files = [
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
] ]
[[package]]
name = "memory-profiler"
version = "0.61.0"
description = "A module for monitoring memory usage of a python program"
optional = false
python-versions = ">=3.5"
groups = ["main"]
files = [
{file = "memory_profiler-0.61.0-py3-none-any.whl", hash = "sha256:400348e61031e3942ad4d4109d18753b2fb08c2f6fb8290671c5513a34182d84"},
{file = "memory_profiler-0.61.0.tar.gz", hash = "sha256:4e5b73d7864a1d1292fb76a03e82a3e78ef934d06828a698d9dada76da2067b0"},
]
[package.dependencies]
psutil = "*"
[[package]] [[package]]
name = "minio" name = "minio"
version = "7.2.15" version = "7.2.15"
@@ -10787,4 +10802,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
[metadata] [metadata]
lock-version = "2.1" lock-version = "2.1"
python-versions = "^3.12" python-versions = "^3.12"
content-hash = "63c0a6d2f0c382f9e8010ab167df76d3275945acf4fba3da7611d68be8241429" content-hash = "a663ed31b71b4307c9f9665a8af4d5fbb8e1a4f0a5a562055df5ec981e5bdc16"

View File

@@ -71,9 +71,11 @@ openhands-aci = "^0.2.3"
python-socketio = "^5.11.4" python-socketio = "^5.11.4"
redis = "^5.2.0" redis = "^5.2.0"
sse-starlette = "^2.1.3" sse-starlette = "^2.1.3"
psutil = "*"
stripe = "^11.5.0" stripe = "^11.5.0"
ipywidgets = "^8.1.5" ipywidgets = "^8.1.5"
qtconsole = "^5.6.1" qtconsole = "^5.6.1"
memory-profiler = "^0.61.0"
[tool.poetry.group.llama-index.dependencies] [tool.poetry.group.llama-index.dependencies]
llama-index = "*" llama-index = "*"

View File

@@ -1,8 +1,21 @@
"""Bash-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.""" """Bash-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.
Example usage:
```bash
export ALLHANDS_API_KEY="YOUR_API_KEY"
export RUNTIME=remote
export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.staging.all-hands.dev"
poetry run pytest -vvxss tests/runtime/test_stress_remote_runtime.py
```
"""
import asyncio import asyncio
import os import os
import tempfile import tempfile
import time
from datetime import datetime
from unittest.mock import MagicMock from unittest.mock import MagicMock
import pandas as pd import pandas as pd
@@ -30,7 +43,12 @@ from openhands.core.config import (
) )
from openhands.core.logger import openhands_logger as logger from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller from openhands.core.main import create_runtime, run_controller
from openhands.events.action import CmdRunAction, MessageAction from openhands.events.action import (
CmdRunAction,
FileEditAction,
FileWriteAction,
MessageAction,
)
from openhands.events.observation import CmdOutputObservation from openhands.events.observation import CmdOutputObservation
from openhands.events.serialization.event import event_to_dict from openhands.events.serialization.event import event_to_dict
from openhands.llm import LLM from openhands.llm import LLM
@@ -42,20 +60,10 @@ AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
} }
def get_config( def get_config() -> AppConfig:
metadata: EvalMetadata,
) -> AppConfig:
assert (
os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL') is not None
), 'SANDBOX_REMOTE_RUNTIME_API_URL must be set.'
assert (
os.environ.get('ALLHANDS_API_KEY') is not None
), 'ALLHANDS_API_KEY must be set.'
config = AppConfig( config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False, run_as_openhands=False,
max_iterations=metadata.max_iterations, runtime=os.environ.get('RUNTIME', 'remote'),
runtime='remote',
sandbox=SandboxConfig( sandbox=SandboxConfig(
base_container_image='python:3.11-bookworm', base_container_image='python:3.11-bookworm',
enable_auto_lint=True, enable_auto_lint=True,
@@ -63,8 +71,11 @@ def get_config(
# large enough timeout, since some testcases take very long to run # large enough timeout, since some testcases take very long to run
timeout=300, timeout=300,
api_key=os.environ.get('ALLHANDS_API_KEY', None), api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), remote_runtime_api_url=os.environ.get(
'SANDBOX_REMOTE_RUNTIME_API_URL', None
),
keep_runtime_alive=False, keep_runtime_alive=False,
remote_runtime_resource_factor=1,
), ),
# do not mount workspace # do not mount workspace
workspace_base=None, workspace_base=None,
@@ -79,132 +90,130 @@ def get_config(
return config return config
def initialize_runtime(
runtime: Runtime,
):
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
"""
logger.info('-' * 30)
logger.info('BEGIN Runtime Initialization Fn')
logger.info('-' * 30)
obs: CmdOutputObservation
action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
action = CmdRunAction(command='mkdir -p /dummy_dir')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to create /dummy_dir: {str(obs)}',
)
with tempfile.TemporaryDirectory() as temp_dir:
# Construct the full path for the desired file name within the temporary directory
temp_file_path = os.path.join(temp_dir, 'dummy_file')
# Write to the file with the desired name within the temporary directory
with open(temp_file_path, 'w') as f:
f.write('dummy content')
# Copy the file to the desired location
runtime.copy_to(temp_file_path, '/dummy_dir/')
logger.info('-' * 30)
logger.info('END Runtime Initialization Fn')
logger.info('-' * 30)
def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
) -> EvalOutput:
config = get_config(metadata)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
runtime = create_runtime(config, headless_mode=False)
call_async_from_sync(runtime.connect)
try:
initialize_runtime(runtime)
instruction = 'dummy instruction'
agent = Agent.get_cls(metadata.agent_class)(
llm=LLM(config=metadata.llm_config),
config=config.get_agent_config(metadata.agent_class),
)
def next_command(*args, **kwargs):
return CmdRunAction(command='ls -lah')
agent.step = MagicMock(side_effect=next_command)
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=MessageAction(content=instruction),
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
metadata.agent_class
],
agent=agent,
)
)
# if fatal error, throw EvalError to trigger re-run
if (
state.last_error
and 'fatal error during agent execution' in state.last_error
and 'stuck in a loop' not in state.last_error
):
raise EvalException('Fatal error detected: ' + state.last_error)
finally:
runtime.close()
test_result = {}
if state is None:
raise ValueError('State should not be None.')
histories = [event_to_dict(event) for event in state.history]
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = EvalOutput(
instance_id=instance.instance_id,
instruction=instruction,
instance=instance.to_dict(), # SWE Bench specific
test_result=test_result,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
)
return output
@pytest.mark.skipif( @pytest.mark.skipif(
TEST_IN_CI, TEST_IN_CI,
reason='This test should only be run locally, not in CI.', reason='This test should only be run locally, not in CI.',
) )
def test_stress_remote_runtime(n_eval_workers: int = 64): def test_stress_remote_runtime_eval(n_eval_workers: int = 64):
"""Mimic evaluation setting to test remote runtime in a multi-processing setting.""" """Mimic evaluation setting to test remote runtime in a multi-processing setting."""
def _initialize_runtime(
runtime: Runtime,
):
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
"""
logger.info('-' * 30)
logger.info('BEGIN Runtime Initialization Fn')
logger.info('-' * 30)
obs: CmdOutputObservation
action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
action = CmdRunAction(command='mkdir -p /dummy_dir')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to create /dummy_dir: {str(obs)}',
)
with tempfile.TemporaryDirectory() as temp_dir:
# Construct the full path for the desired file name within the temporary directory
temp_file_path = os.path.join(temp_dir, 'dummy_file')
# Write to the file with the desired name within the temporary directory
with open(temp_file_path, 'w') as f:
f.write('dummy content')
# Copy the file to the desired location
runtime.copy_to(temp_file_path, '/dummy_dir/')
logger.info('-' * 30)
logger.info('END Runtime Initialization Fn')
logger.info('-' * 30)
def _process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
) -> EvalOutput:
config = get_config()
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
runtime = create_runtime(config, headless_mode=True)
call_async_from_sync(runtime.connect)
try:
_initialize_runtime(runtime)
instruction = 'dummy instruction'
agent = Agent.get_cls(metadata.agent_class)(
llm=LLM(config=metadata.llm_config),
config=config.get_agent_config(metadata.agent_class),
)
def next_command(*args, **kwargs):
return CmdRunAction(command='ls -lah')
agent.step = MagicMock(side_effect=next_command)
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=MessageAction(content=instruction),
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
metadata.agent_class
],
agent=agent,
)
)
# if fatal error, throw EvalError to trigger re-run
if (
state.last_error
and 'fatal error during agent execution' in state.last_error
and 'stuck in a loop' not in state.last_error
):
raise EvalException('Fatal error detected: ' + state.last_error)
finally:
runtime.close()
test_result = {}
if state is None:
raise ValueError('State should not be None.')
histories = [event_to_dict(event) for event in state.history]
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = EvalOutput(
instance_id=instance.instance_id,
instruction=instruction,
instance=instance.to_dict(), # SWE Bench specific
test_result=test_result,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
)
return output
llm_config = LLMConfig() llm_config = LLMConfig()
metadata = make_metadata( metadata = make_metadata(
llm_config, llm_config,
@@ -228,4 +237,247 @@ def test_stress_remote_runtime(n_eval_workers: int = 64):
dummy_instance, output_file, eval_n_limit=len(dummy_instance) dummy_instance, output_file, eval_n_limit=len(dummy_instance)
) )
run_evaluation(instances, metadata, output_file, n_eval_workers, process_instance) run_evaluation(instances, metadata, output_file, n_eval_workers, _process_instance)
@pytest.mark.skipif(
TEST_IN_CI,
reason='This test should only be run locally, not in CI.',
)
def test_stress_remote_runtime_long_output_with_soft_and_hard_timeout():
"""Stress test for the remote runtime."""
config = get_config()
try:
runtime = create_runtime(config, headless_mode=True)
call_async_from_sync(runtime.connect)
_time_for_test = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# Run a command that generates long output multiple times
for i in range(10):
start_time = time.time()
iteration_stats = {
'iteration': i,
'timestamp': time.time(),
}
# Check overall system memory usage
mem_action = CmdRunAction(
'free -k | grep "Mem:" | awk \'{printf "Total: %8.1f MB, Used: %8.1f MB, Free: %8.1f MB, Available: %8.1f MB\\n", $2/1024, $3/1024, $4/1024, $7/1024}\''
)
mem_obs = runtime.run_action(mem_action)
assert mem_obs.exit_code == 0
logger.info(
f'System memory usage (iteration {i}): {mem_obs.content.strip()}'
)
# Parse memory values from output
mem_parts = mem_obs.content.strip().split(',')
for part in mem_parts:
key, value = part.strip().split(':')
iteration_stats[f'memory_{key.lower()}'] = float(
value.replace('MB', '').strip()
)
# Check top memory-consuming processes
mem_action = CmdRunAction(
'ps aux | awk \'{printf "%8.1f MB %s\\n", $6/1024, $0}\' | sort -nr | head -n 5'
)
mem_obs = runtime.run_action(mem_action)
assert mem_obs.exit_code == 0
_top_processes = [i.strip() for i in mem_obs.content.strip().split('\n')]
logger.info(
f'Top 5 memory-consuming processes (iteration {i}):\n{"- " + "\n- ".join(_top_processes)}'
)
iteration_stats['top_processes'] = _top_processes
# Check tmux memory usage (in MB)
mem_action = CmdRunAction(
'ps aux | awk \'{printf "%8.1f MB %s\\n", $6/1024, $0}\' | sort -nr | grep "/usr/bin/tmux" | grep -v grep | awk \'{print $1}\''
)
mem_obs = runtime.run_action(mem_action)
assert mem_obs.exit_code == 0
logger.info(
f'Tmux memory usage (iteration {i}): {mem_obs.content.strip()} MB'
)
try:
iteration_stats['tmux_memory_mb'] = float(mem_obs.content.strip())
except (ValueError, AttributeError):
iteration_stats['tmux_memory_mb'] = None
# Check action_execution_server mem
mem_action = CmdRunAction(
'ps aux | awk \'{printf "%8.1f MB %s\\n", $6/1024, $0}\' | sort -nr | grep "action_execution_server" | grep "/openhands/poetry" | grep -v grep | awk \'{print $1}\''
)
mem_obs = runtime.run_action(mem_action)
assert mem_obs.exit_code == 0
logger.info(
f'Action execution server memory usage (iteration {i}): {mem_obs.content.strip()} MB'
)
try:
iteration_stats['action_server_memory_mb'] = float(
mem_obs.content.strip()
)
except (ValueError, AttributeError):
iteration_stats['action_server_memory_mb'] = None
# Test soft timeout
action = CmdRunAction(
'read -p "Do you want to continue? [Y/n] " answer; if [[ $answer == "Y" ]]; then echo "Proceeding with operation..."; echo "Operation completed successfully!"; else echo "Operation cancelled."; exit 1; fi'
)
obs = runtime.run_action(action)
assert 'Do you want to continue?' in obs.content
assert obs.exit_code == -1 # Command is still running, waiting for input
# Send the confirmation
action = CmdRunAction('Y', is_input=True)
obs = runtime.run_action(action)
assert 'Proceeding with operation...' in obs.content
assert 'Operation completed successfully!' in obs.content
assert obs.exit_code == 0
assert '[The command completed with exit code 0.]' in obs.metadata.suffix
# Test hard timeout w/ long output
# Generate long output with 1000 asterisks per line
action = CmdRunAction(
f'export i={i}; for j in $(seq 1 100); do echo "Line $j - Iteration $i - $(printf \'%1000s\' | tr " " "*")"; sleep 1; done'
)
action.set_hard_timeout(2)
obs = runtime.run_action(action)
# Verify the output
assert obs.exit_code == -1
assert f'Line 1 - Iteration {i}' in obs.content
# Because the hard timeout was triggered, the terminal will be in a weird state
# where it will not accept any new commands.
obs = runtime.run_action(CmdRunAction('ls'))
assert obs.exit_code == -1
assert 'The previous command is still running' in obs.metadata.suffix
# We need to send a Ctrl+C to reset the terminal.
obs = runtime.run_action(CmdRunAction('C-c', is_input=True))
assert obs.exit_code == 130
# Now make sure the terminal is in a good state
obs = runtime.run_action(CmdRunAction('ls'))
assert obs.exit_code == 0
duration = time.time() - start_time
iteration_stats['duration'] = duration
logger.info(f'Completed iteration {i} in {duration:.2f} seconds')
finally:
runtime.close()
@pytest.mark.skipif(
TEST_IN_CI,
reason='This test should only be run locally, not in CI.',
)
def test_stress_runtime_memory_limits():
"""Test runtime behavior under resource constraints."""
config = get_config()
# For Docker runtime, add resource constraints
if config.runtime == 'docker':
config.sandbox.docker_runtime_kwargs = {
'cpu_period': 100000, # 100ms
'cpu_quota': 100000, # Can use 100ms out of each 100ms period (1 CPU)
'mem_limit': '4G', # 4 GB of memory
'memswap_limit': '0', # No swap
'mem_swappiness': 0, # Disable swapping
'oom_kill_disable': False, # Enable OOM killer
}
config.sandbox.runtime_startup_env_vars = {
'RUNTIME_MAX_MEMORY_GB': '3',
'RUNTIME_MEMORY_MONITOR': 'true',
}
try:
runtime = create_runtime(config, headless_mode=True)
call_async_from_sync(runtime.connect)
# Install stress-ng
action = CmdRunAction(
command='sudo apt-get update && sudo apt-get install -y stress-ng'
)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0
action = CmdRunAction(
command='stress-ng --vm 1 --vm-bytes 6G --timeout 1m --metrics'
)
action.set_hard_timeout(120)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert 'aborted early, out of system resources' in obs.content
assert obs.exit_code == 3 # OOM killed!
finally:
runtime.close()
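A note on the two environment variables used above: inside the runtime image, RUNTIME_MAX_MEMORY_GB becomes a per-session memory cap and RUNTIME_MEMORY_MONITOR toggles the memory monitor. A condensed, illustrative sketch of that plumbing follows; the real logic lives in the action execution server.

```python
# Sketch: how the runtime_startup_env_vars above are consumed in the runtime image.
import os

_raw = os.environ.get('RUNTIME_MAX_MEMORY_GB')
max_memory_gb = int(_raw) if _raw else None
max_memory_mb = max_memory_gb * 1024 if max_memory_gb else None  # handed to BashSession
memory_monitor_enabled = os.environ.get('RUNTIME_MEMORY_MONITOR', 'False').lower() in ['true', '1', 'yes']
```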
@pytest.mark.skipif(
TEST_IN_CI,
reason='This test should only be run locally, not in CI.',
)
def test_stress_runtime_memory_limits_with_repeated_file_edit():
"""Test runtime behavior under resource constraints with repeated file edits."""
config = get_config()
# For Docker runtime, add resource constraints
if config.runtime == 'docker':
config.sandbox.docker_runtime_kwargs = {
'cpu_period': 100000, # 100ms
'cpu_quota': 100000, # Can use 100ms out of each 100ms period (1 CPU)
'mem_limit': '4G', # 4 GB of memory
'memswap_limit': '0', # No swap
'mem_swappiness': 0, # Disable swapping
'oom_kill_disable': False, # Enable OOM killer
}
config.sandbox.runtime_startup_env_vars = {
'RUNTIME_MAX_MEMORY_GB': '3',
'RUNTIME_MEMORY_MONITOR': 'true',
}
try:
runtime = create_runtime(config, headless_mode=True)
call_async_from_sync(runtime.connect)
# Create initial test file with base content
test_file = '/tmp/test_file.txt'
# base_content = 'content_1\n' * 1000 # Create a reasonably sized file
base_content = ''
for i in range(1000):
base_content += f'content_{i:03d}\n'
# Use FileWriteAction to create initial file
write_action = FileWriteAction(path=test_file, content=base_content)
obs = runtime.run_action(write_action)
# Perform repeated file edits
for i in range(1000):
# Use FileEditAction with str_replace instead of IPythonRunCellAction
edit_action = FileEditAction(
command='str_replace',
path=test_file,
old_str=f'content_{i:03d}',
new_str=f'-content_{i:03d}',
)
obs = runtime.run_action(edit_action)
assert (
f'The file {test_file} has been edited' in obs.content
), f'Edit failed at iteration {i}'
logger.info(f'finished iteration {i}')
# Verify final file state using FileEditAction view command
action = FileEditAction(command='view', path=test_file)
obs = runtime.run_action(action)
assert '-content_999' in obs.content, 'Final content verification failed'
logger.info('Final file content verified successfully')
finally:
runtime.close()