Add sysbox support to remote runtime for eval; Add memory monitor, stress tests to help debug memory issue (#6684)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: Graham Neubig <neubig@gmail.com>
Author: Xingyao Wang
Date: 2025-02-18 15:02:28 -05:00
Committed by: GitHub
Parent: 8d097efb4f
Commit: 1a7003a705
35 changed files with 687 additions and 419 deletions
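
Most of the changed files apply one repeated refactor: each eval benchmark's get_config() used to build its own inline SandboxConfig, and now it starts from the shared get_default_sandbox_config_for_eval() helper (added to evaluation/utils/shared.py in this commit) and only overrides per-benchmark fields. A condensed sketch of the pattern, with the container image as a placeholder rather than any specific benchmark's value:

    from evaluation.utils.shared import get_default_sandbox_config_for_eval
    from openhands.core.config import AppConfig


    def get_config(metadata) -> AppConfig:
        # Shared eval defaults: remote-runtime settings, timeouts, sysbox runtime class.
        sandbox_config = get_default_sandbox_config_for_eval()
        # Per-benchmark overrides stay local to each run_infer.py.
        sandbox_config.base_container_image = 'python:3.12-bookworm'  # placeholder image
        return AppConfig(
            default_agent=metadata.agent_class,
            run_as_openhands=False,
            max_iterations=metadata.max_iterations,
            sandbox=sandbox_config,
            # do not mount workspace
            workspace_base=None,
            workspace_mount_path=None,
        )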

View File

@@ -9,6 +9,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -17,7 +18,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -60,17 +60,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=False,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -25,7 +26,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -40,21 +40,15 @@ from openhands.utils.async_utils import call_async_from_sync
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-slim'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime=os.environ.get('RUNTIME', 'docker'),
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-slim',
-            enable_auto_lint=True,
-            use_host_network=False,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -24,7 +25,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     load_from_toml,
     parse_arguments,
@@ -47,22 +47,14 @@ SKIP_NUM = (
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.11-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime=os.environ.get('RUNTIME', 'docker'),
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            timeout=100,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=1800,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -22,7 +23,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -57,18 +57,15 @@ def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
     BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -25,7 +26,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -71,17 +71,15 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -10,6 +10,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -18,7 +19,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -36,17 +36,14 @@ def get_config(
     assert (
         metadata.max_iterations == 1
     ), 'max_iterations must be 1 for browsing delegation evaluation.'
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=False,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         workspace_base=None,
         workspace_mount_path=None,
     )

View File

@@ -15,6 +15,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     assert_and_raise,
     codeact_user_response,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -25,7 +26,6 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
     AgentConfig,
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -105,9 +105,7 @@ def get_config(
     instance: pd.Series,
     metadata: EvalMetadata,
 ) -> AppConfig:
-    # COMMIT0_CONTAINER_IMAGE = 'wentingzhao/'
     assert USE_INSTANCE_IMAGE
-    # We use a different instance image for the each instance of commit0 eval
     repo_name = instance['repo'].split('/')[1]
     base_container_image = get_instance_docker_image(repo_name)
     logger.info(
@@ -115,28 +113,16 @@ def get_config(
         f'Please make sure this image exists. '
         f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
     )
-    # else:
-    #     raise
-    #     base_container_image = SWE_BENCH_CONTAINER_IMAGE
-    #     logger.info(f'Using swe-bench container image: {base_container_image}')
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         max_iterations=metadata.max_iterations,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -25,7 +26,6 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
     AgentConfig,
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -62,17 +62,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -21,7 +22,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -48,17 +48,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -19,7 +20,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -40,17 +40,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -29,6 +29,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -37,7 +38,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -61,17 +61,14 @@ ACTION_FORMAT = """
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -22,6 +22,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -30,7 +31,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -82,17 +82,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -9,6 +9,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -17,7 +18,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -45,18 +45,18 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'xingyaoww/od-eval-logic-reasoning:v1.0'
+    sandbox_config.runtime_extra_deps = (
+        '$OH_INTERPRETER_PATH -m pip install scitools-pyke'
+    )
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='xingyaoww/od-eval-logic-reasoning:v1.0',
-            enable_auto_lint=True,
-            use_host_network=False,
-            runtime_extra_deps='$OH_INTERPRETER_PATH -m pip install scitools-pyke',
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -12,6 +12,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -21,7 +22,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -55,23 +55,14 @@ def get_config(
     metadata: EvalMetadata,
     env_id: str,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'xingyaoww/od-eval-miniwob:v1.0'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime=os.environ.get('RUNTIME', 'docker'),
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='xingyaoww/od-eval-miniwob:v1.0',
-            enable_auto_lint=True,
-            use_host_network=False,
-            browsergym_eval_env=env_id,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            remote_runtime_init_timeout=1800,
-            keep_runtime_alive=False,
-            timeout=120,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -22,7 +23,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -103,18 +103,18 @@ def load_incontext_example(task_name: str, with_tool: bool = True):
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'xingyaoww/od-eval-mint:v1.0'
+    sandbox_config.runtime_extra_deps = (
+        f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}'
+    )
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='xingyaoww/od-eval-mint:v1.0',
-            enable_auto_lint=True,
-            use_host_network=False,
-            runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -25,6 +25,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -33,7 +34,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
     load_app_config,
@@ -77,16 +77,14 @@ ID2CONDA = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'public.ecr.aws/i5g0m1f6/ml-bench'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='public.ecr.aws/i5g0m1f6/ml-bench',
-            enable_auto_lint=True,
-            use_host_network=False,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -20,7 +21,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -59,22 +59,17 @@ def get_config(
     metadata: EvalMetadata,
     instance_id: str,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = (
+        'docker.io/xingyaoww/openhands-eval-scienceagentbench'
+    )
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime=os.environ.get('RUNTIME', 'docker'),
         max_budget_per_task=4,
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='docker.io/xingyaoww/openhands-eval-scienceagentbench',
-            enable_auto_lint=True,
-            use_host_network=False,
-            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -1,5 +1,6 @@
 import json
 import os
+import subprocess
 import tempfile
 import time
 from functools import partial
@@ -21,13 +22,14 @@ from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    get_default_sandbox_config_for_eval,
     prepare_dataset,
     reset_logger_for_multiprocessing,
     run_evaluation,
 )
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
+    LLMConfig,
     get_parser,
 )
 from openhands.core.logger import openhands_logger as logger
@@ -79,22 +81,16 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> AppConfig:
         f'Please make sure this image exists. '
         f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
     )
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
+    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
+        dataset_name=metadata.dataset,
+        instance_id=instance['instance_id'],
+    )
     config = AppConfig(
         run_as_openhands=False,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=600,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            remote_runtime_init_timeout=3600,
-            remote_runtime_resource_factor=get_instance_resource_factor(
-                dataset_name=metadata.dataset,
-                instance_id=instance['instance_id'],
-            ),
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
@@ -415,13 +411,17 @@ if __name__ == '__main__':
     else:
         # Initialize with a dummy metadata when file doesn't exist
         metadata = EvalMetadata(
-            agent_class="dummy_agent",  # Placeholder agent class
-            llm_config=LLMConfig(model="dummy_model"),  # Minimal LLM config
+            agent_class='dummy_agent',  # Placeholder agent class
+            llm_config=LLMConfig(model='dummy_model'),  # Minimal LLM config
             max_iterations=1,  # Minimal iterations
-            eval_output_dir=os.path.dirname(args.input_file),  # Use input file dir as output dir
+            eval_output_dir=os.path.dirname(
+                args.input_file
+            ),  # Use input file dir as output dir
             start_time=time.strftime('%Y-%m-%d %H:%M:%S'),  # Current time
-            git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('utf-8').strip(),  # Current commit
-            dataset=args.dataset  # Dataset name from args
+            git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+            .decode('utf-8')
+            .strip(),  # Current commit
+            dataset=args.dataset,  # Dataset name from args
         )
     # The evaluation harness constrains the signature of `process_instance_func` but we need to

View File

@@ -1 +0,0 @@
{"pydata__xarray-6721": 8, "pytest-dev__pytest-7236": 8, "matplotlib__matplotlib-24627": 4, "django__django-15561": 4, "django__django-15098": 4, "django__django-14771": 4, "sympy__sympy-21612": 4, "sympy__sympy-15345": 4, "psf__requests-5414": 4, "astropy__astropy-14508": 2, "django__django-11451": 2, "django__django-11477": 2, "django__django-10880": 2, "django__django-11163": 2, "django__django-11815": 2, "astropy__astropy-14369": 2, "django__django-10097": 2, "django__django-10554": 2, "django__django-12304": 2, "django__django-12325": 2, "django__django-11551": 2, "django__django-11734": 2, "django__django-13109": 2, "django__django-13089": 2, "django__django-13343": 2, "django__django-13363": 2, "django__django-13809": 2, "django__django-13810": 2, "django__django-13786": 2, "django__django-13807": 2, "django__django-14493": 2, "django__django-11820": 2, "django__django-11951": 2, "django__django-11964": 2, "astropy__astropy-14309": 2, "astropy__astropy-14365": 2, "astropy__astropy-12907": 2, "astropy__astropy-14182": 2, "django__django-15161": 2, "django__django-15128": 2, "django__django-14999": 2, "django__django-14915": 2, "django__django-14752": 2, "django__django-14765": 2, "django__django-14089": 2, "django__django-15252": 2, "django__django-15380": 2, "django__django-15382": 2, "django__django-15499": 2, "django__django-15467": 2, "django__django-15280": 2, "django__django-15315": 2, "django__django-15277": 2, "django__django-15268": 2, "django__django-15629": 2, "django__django-15695": 2, "django__django-15732": 2, "django__django-15863": 2, "django__django-16082": 2, "django__django-16145": 2, "django__django-16256": 2, "django__django-16429": 2, "django__django-16454": 2, "django__django-16493": 2, "matplotlib__matplotlib-13989": 2, "matplotlib__matplotlib-20488": 2, "django__django-15503": 2, "django__django-15525": 2, "django__django-15375": 2, "django__django-15278": 2, "matplotlib__matplotlib-21568": 2, "matplotlib__matplotlib-20859": 2, "matplotlib__matplotlib-20826": 2, "matplotlib__matplotlib-20676": 2, "matplotlib__matplotlib-23412": 2, "matplotlib__matplotlib-22719": 2, "matplotlib__matplotlib-23299": 2, "matplotlib__matplotlib-22865": 2, "matplotlib__matplotlib-24149": 2, "matplotlib__matplotlib-24177": 2, "matplotlib__matplotlib-24570": 2, "matplotlib__matplotlib-24637": 2, "matplotlib__matplotlib-24970": 2, "matplotlib__matplotlib-23476": 2, "matplotlib__matplotlib-24026": 2, "matplotlib__matplotlib-23314": 2, "matplotlib__matplotlib-25332": 2, "matplotlib__matplotlib-25311": 2, "matplotlib__matplotlib-25122": 2, "matplotlib__matplotlib-25479": 2, "matplotlib__matplotlib-26342": 2, "psf__requests-2317": 2, "matplotlib__matplotlib-25960": 2, "matplotlib__matplotlib-25775": 2, "pydata__xarray-4356": 2, "pydata__xarray-4075": 2, "pydata__xarray-6461": 2, "pydata__xarray-4687": 2, "pydata__xarray-6599": 2, "pylint-dev__pylint-4661": 2, "django__django-15554": 2, "django__django-15563": 2, "pytest-dev__pytest-5262": 2, "pytest-dev__pytest-10081": 2, "scikit-learn__scikit-learn-12973": 2, "scikit-learn__scikit-learn-13124": 2, "scikit-learn__scikit-learn-13779": 2, "scikit-learn__scikit-learn-14141": 2, "scikit-learn__scikit-learn-13439": 2, "scikit-learn__scikit-learn-13496": 2, "scikit-learn__scikit-learn-15100": 2, "scikit-learn__scikit-learn-25102": 2, "scikit-learn__scikit-learn-25232": 2, "scikit-learn__scikit-learn-25747": 2, "scikit-learn__scikit-learn-26323": 2, "scikit-learn__scikit-learn-9288": 2, "scikit-learn__scikit-learn-14496": 2, 
"scikit-learn__scikit-learn-14629": 2, "sphinx-doc__sphinx-8265": 2, "sphinx-doc__sphinx-8548": 2, "sphinx-doc__sphinx-8593": 2, "sphinx-doc__sphinx-8595": 2, "sphinx-doc__sphinx-8621": 2, "sphinx-doc__sphinx-8638": 2, "sphinx-doc__sphinx-9229": 2, "sphinx-doc__sphinx-9281": 2, "sphinx-doc__sphinx-9461": 2, "sphinx-doc__sphinx-9591": 2, "sphinx-doc__sphinx-9658": 2, "sphinx-doc__sphinx-9673": 2, "sympy__sympy-12096": 2, "sympy__sympy-12481": 2, "sphinx-doc__sphinx-10323": 2, "sphinx-doc__sphinx-7590": 2, "sympy__sympy-13877": 2, "sympy__sympy-12489": 2, "sympy__sympy-15809": 2, "sympy__sympy-14711": 2, "sympy__sympy-16597": 2, "sympy__sympy-16766": 2, "sympy__sympy-16792": 2, "sympy__sympy-15875": 2, "sympy__sympy-17655": 2, "sympy__sympy-18189": 2, "sympy__sympy-18763": 2, "sympy__sympy-19040": 2, "sympy__sympy-19495": 2, "sympy__sympy-19637": 2, "sympy__sympy-19783": 2, "sympy__sympy-17630": 2, "sympy__sympy-20428": 2, "sympy__sympy-20590": 2, "sympy__sympy-20801": 2, "sympy__sympy-21379": 2, "sympy__sympy-21847": 2, "sympy__sympy-22456": 2, "sympy__sympy-22714": 2, "sympy__sympy-22914": 2, "sympy__sympy-23262": 2, "sympy__sympy-23413": 2, "sympy__sympy-23534": 2, "sympy__sympy-24066": 2, "sympy__sympy-24213": 2, "sympy__sympy-24443": 2, "sympy__sympy-24562": 2, "sympy__sympy-24661": 2}

View File

@@ -18,6 +18,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     assert_and_raise,
     codeact_user_response,
+    get_default_sandbox_config_for_eval,
     get_metrics,
     is_fatal_evaluation_error,
     make_metadata,
@@ -30,7 +31,6 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
     AgentConfig,
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -122,30 +122,23 @@ def get_config(
         base_container_image = SWE_BENCH_CONTAINER_IMAGE
         logger.info(f'Using swe-bench container image: {base_container_image}')
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
+    sandbox_config.enable_auto_lint = True
+    sandbox_config.use_host_network = False
+    # Add platform to the sandbox config to solve issue 4401
+    sandbox_config.platform = 'linux/amd64'
+    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
+        dataset_name=metadata.dataset,
+        instance_id=instance['instance_id'],
+    )
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         max_iterations=metadata.max_iterations,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            # Add platform to the sandbox config to solve issue 4401
-            platform='linux/amd64',
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-            remote_runtime_api_timeout=120,
-            remote_runtime_resource_factor=get_instance_resource_factor(
-                dataset_name=metadata.dataset,
-                instance_id=instance['instance_id'],
-            ),
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
@@ -331,6 +324,22 @@ def complete_runtime(
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    if obs.exit_code == -1:
+        # The previous command is still running
+        # We need to kill previous command
+        logger.info('The previous command is still running, trying to kill it...')
+        action = CmdRunAction(command='C-c')
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        # Then run the command again
+        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
+        action.set_hard_timeout(600)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
     assert_and_raise(
         isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
         f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',

View File

@@ -13,11 +13,11 @@ from typing import List
 import yaml
 from browsing import pre_login
+from evaluation.utils.shared import get_default_sandbox_config_for_eval
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
     LLMConfig,
-    SandboxConfig,
     get_agent_config_arg,
     get_llm_config_arg,
     get_parser,
@@ -38,6 +38,8 @@ def get_config(
     llm_config: LLMConfig,
     agent_config: AgentConfig | None,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
     config = AppConfig(
         run_as_openhands=False,
         max_budget_per_task=4,
@@ -45,16 +47,7 @@ def get_config(
         save_trajectory_path=os.path.join(
             mount_path_on_host, f'traj_{task_short_name}.json'
         ),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            # using host network to access the host machine from the container
-            use_host_network=True,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # we mount trajectories path so that trajectories, generated by OpenHands
         # controller, can be accessible to the evaluator file in the runtime container
         workspace_mount_path=mount_path_on_host,

View File

@@ -10,6 +10,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     codeact_user_response,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -18,7 +19,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     get_parser,
 )
@@ -41,17 +41,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -20,7 +21,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -55,32 +55,29 @@ def get_config(
     assert base_url is not None, 'VISUALWEBARENA_BASE_URL must be set'
     assert openai_api_key is not None, 'OPENAI_API_KEY must be set'
     assert openai_base_url is not None, 'OPENAI_BASE_URL must be set'
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
+    sandbox_config.browsergym_eval_env = env_id
+    sandbox_config.runtime_startup_env_vars = {
+        'BASE_URL': base_url,
+        'OPENAI_API_KEY': openai_api_key,
+        'OPENAI_BASE_URL': openai_base_url,
+        'VWA_CLASSIFIEDS': f'{base_url}:9980',
+        'VWA_CLASSIFIEDS_RESET_TOKEN': '4b61655535e7ed388f0d40a93600254c',
+        'VWA_SHOPPING': f'{base_url}:7770',
+        'VWA_SHOPPING_ADMIN': f'{base_url}:7780/admin',
+        'VWA_REDDIT': f'{base_url}:9999',
+        'VWA_GITLAB': f'{base_url}:8023',
+        'VWA_WIKIPEDIA': f'{base_url}:8888',
+        'VWA_HOMEPAGE': f'{base_url}:4399',
+    }
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            browsergym_eval_env=env_id,
-            runtime_startup_env_vars={
-                'BASE_URL': base_url,
-                'OPENAI_API_KEY': openai_api_key,
-                'OPENAI_BASE_URL': openai_base_url,
-                'VWA_CLASSIFIEDS': f'{base_url}:9980',
-                'VWA_CLASSIFIEDS_RESET_TOKEN': '4b61655535e7ed388f0d40a93600254c',
-                'VWA_SHOPPING': f'{base_url}:7770',
-                'VWA_SHOPPING_ADMIN': f'{base_url}:7780/admin',
-                'VWA_REDDIT': f'{base_url}:9999',
-                'VWA_GITLAB': f'{base_url}:8023',
-                'VWA_WIKIPEDIA': f'{base_url}:8888',
-                'VWA_HOMEPAGE': f'{base_url}:4399',
-            },
-            timeout=300,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -19,7 +20,6 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -50,29 +50,26 @@ def get_config(
     assert base_url is not None, 'WEBARENA_BASE_URL must be set'
     assert openai_api_key is not None, 'OPENAI_API_KEY must be set'
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'python:3.12-bookworm'
+    sandbox_config.browsergym_eval_env = env_id
+    sandbox_config.runtime_startup_env_vars = {
+        'BASE_URL': base_url,
+        'OPENAI_API_KEY': openai_api_key,
+        'SHOPPING': f'{base_url}:7770/',
+        'SHOPPING_ADMIN': f'{base_url}:7780/admin',
+        'REDDIT': f'{base_url}:9999',
+        'GITLAB': f'{base_url}:8023',
+        'WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
+        'MAP': f'{base_url}:3000',
+        'HOMEPAGE': f'{base_url}:4399',
+    }
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            browsergym_eval_env=env_id,
-            runtime_startup_env_vars={
-                'BASE_URL': base_url,
-                'OPENAI_API_KEY': openai_api_key,
-                'SHOPPING': f'{base_url}:7770/',
-                'SHOPPING_ADMIN': f'{base_url}:7780/admin',
-                'REDDIT': f'{base_url}:9999',
-                'GITLAB': f'{base_url}:8023',
-                'WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
-                'MAP': f'{base_url}:3000',
-                'HOMEPAGE': f'{base_url}:4399',
-            },
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -8,6 +8,7 @@ from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestRes
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    get_default_sandbox_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -21,7 +22,6 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
     AgentConfig,
     AppConfig,
-    SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
 )
@@ -43,23 +43,14 @@ def get_config(
     metadata: EvalMetadata,
     instance_id: str,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.platform = 'linux/amd64'
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime=os.environ.get('RUNTIME', 'docker'),
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            # use default base_container_image
-            enable_auto_lint=True,
-            use_host_network=False,
-            timeout=300,
-            # Add platform to the sandbox config to solve issue 4401
-            platform='linux/amd64',
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

View File

@@ -16,7 +16,7 @@ from pydantic import BaseModel
 from tqdm import tqdm
 from openhands.controller.state.state import State
-from openhands.core.config import LLMConfig
+from openhands.core.config import LLMConfig, SandboxConfig
 from openhands.core.config.agent_config import AgentConfig
 from openhands.core.config.condenser_config import (
     CondenserConfig,
@@ -555,3 +555,18 @@ def get_metrics(state: State) -> dict[str, Any]:
     metrics = state.metrics.get() if state.metrics else {}
     metrics['condenser'] = get_condensation_metadata(state)
     return metrics
+
+
+def get_default_sandbox_config_for_eval() -> SandboxConfig:
+    return SandboxConfig(
+        use_host_network=False,
+        # large enough timeout, since some testcases take very long to run
+        timeout=300,
+        api_key=os.environ.get('ALLHANDS_API_KEY', None),
+        remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+        keep_runtime_alive=False,
+        remote_runtime_init_timeout=3600,
+        remote_runtime_api_timeout=120,
+        remote_runtime_enable_retries=True,
+        remote_runtime_class='sysbox',
+    )
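
This helper centralizes the remote-runtime defaults that each benchmark previously copied by hand; note that it reads ALLHANDS_API_KEY and SANDBOX_REMOTE_RUNTIME_API_URL from the environment and now defaults remote_runtime_class to 'sysbox'. Benchmarks with extra needs layer overrides on top, as in the swe_bench and integration_tests hunks above; a small sketch, where the resource factor value is illustrative only:

    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = base_container_image  # per-instance image
    sandbox_config.platform = 'linux/amd64'  # workaround for issue 4401
    sandbox_config.remote_runtime_resource_factor = 2  # illustrative value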

View File

@@ -52,6 +52,9 @@ class SandboxConfig(BaseModel):
     remote_runtime_init_timeout: int = Field(default=180)
     remote_runtime_api_timeout: int = Field(default=10)
     remote_runtime_enable_retries: bool = Field(default=False)
+    remote_runtime_class: str | None = Field(
+        default='sysbox'
+    )  # can be "None" (default to gvisor) or "sysbox" (support docker inside runtime + more stable)
     enable_auto_lint: bool = Field(
         default=False  # once enabled, OpenHands would lint files after editing
     )
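
The new remote_runtime_class field is what the eval defaults above opt into; the RemoteRuntime change further down accepts only None, 'sysbox', or 'gvisor', with None falling back to the gvisor behavior on the remote runtime side. A minimal sketch of toggling it, assuming only the values listed in that assert:

    from openhands.core.config import SandboxConfig

    sandbox = SandboxConfig()            # defaults to 'sysbox' after this commit
    sandbox.remote_runtime_class = None  # fall back to the gvisor default
    assert sandbox.remote_runtime_class in (None, 'sysbox', 'gvisor')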

View File

@@ -57,6 +57,7 @@ from openhands.runtime.browser.browser_env import BrowserEnv
 from openhands.runtime.plugins import ALL_PLUGINS, JupyterPlugin, Plugin, VSCodePlugin
 from openhands.runtime.utils.bash import BashSession
 from openhands.runtime.utils.files import insert_lines, read_lines
+from openhands.runtime.utils.memory_monitor import MemoryMonitor
 from openhands.runtime.utils.runtime_init import init_user_and_working_directory
 from openhands.runtime.utils.system_stats import get_system_stats
 from openhands.utils.async_utils import call_sync_from_async, wait_all
@@ -171,12 +172,19 @@ class ActionExecutor:
         else:
             logger.info('No max memory limit set, using all available system memory')
+        self.memory_monitor = MemoryMonitor(
+            enable=os.environ.get('RUNTIME_MEMORY_MONITOR', 'False').lower()
+            in ['true', '1', 'yes']
+        )
+        self.memory_monitor.start_monitoring()
     @property
     def initial_cwd(self):
         return self._initial_cwd
     async def ainit(self):
         # bash needs to be initialized first
+        logger.debug('Initializing bash session')
         self.bash_session = BashSession(
             work_dir=self._initial_cwd,
             username=self.username,
@@ -186,15 +194,18 @@ class ActionExecutor:
             max_memory_mb=self.max_memory_gb * 1024 if self.max_memory_gb else None,
         )
         self.bash_session.initialize()
+        logger.debug('Bash session initialized')
         await wait_all(
             (self._init_plugin(plugin) for plugin in self.plugins_to_load),
             timeout=30,
         )
+        logger.debug('All plugins initialized')
         # This is a temporary workaround
         # TODO: refactor AgentSkills to be part of JupyterPlugin
         # AFTER ServerRuntime is deprecated
+        logger.debug('Initializing AgentSkills')
         if 'agent_skills' in self.plugins and 'jupyter' in self.plugins:
             obs = await self.run_ipython(
                 IPythonRunCellAction(
@@ -203,6 +214,7 @@ class ActionExecutor:
             )
             logger.debug(f'AgentSkills initialized: {obs}')
+        logger.debug('Initializing bash commands')
         await self._init_bash_commands()
         logger.debug('Runtime client initialized.')
         self._initialized = True
@@ -447,6 +459,7 @@ class ActionExecutor:
             return await browse(action, self.browser)
     def close(self):
+        self.memory_monitor.stop_monitoring()
         if self.bash_session is not None:
             self.bash_session.close()
         self.browser.close()

View File

@@ -255,7 +255,6 @@ class DockerRuntime(ActionExecutionClient):
server_port=self._container_port, server_port=self._container_port,
plugins=self.plugins, plugins=self.plugins,
app_config=self.config, app_config=self.config,
use_nice_for_root=False,
) )
try: try:

View File

@@ -75,6 +75,8 @@ class RemoteRuntime(ActionExecutionClient):
'remote_runtime_api_url is required in the remote runtime.' 'remote_runtime_api_url is required in the remote runtime.'
) )
assert self.config.sandbox.remote_runtime_class in (None, 'sysbox', 'gvisor')
self.runtime_builder = RemoteRuntimeBuilder( self.runtime_builder = RemoteRuntimeBuilder(
self.config.sandbox.remote_runtime_api_url, self.config.sandbox.remote_runtime_api_url,
self.config.sandbox.api_key, self.config.sandbox.api_key,
@@ -225,6 +227,9 @@ class RemoteRuntime(ActionExecutionClient):
'session_id': self.sid, 'session_id': self.sid,
'resource_factor': self.config.sandbox.remote_runtime_resource_factor, 'resource_factor': self.config.sandbox.remote_runtime_resource_factor,
} }
if self.config.sandbox.remote_runtime_class == 'sysbox':
start_request['runtime_class'] = 'sysbox-runc'
# We ignore other runtime classes for now, because both None and 'gvisor' map to 'gvisor'
# Start the sandbox using the /start endpoint # Start the sandbox using the /start endpoint
try: try:
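Condensed into one place, the selection logic above amounts to the sketch below; the standalone helper name is for illustration only.

```python
# Sketch: how the start request maps remote_runtime_class to a runtime class.
def build_start_request(sid: str, sandbox) -> dict:
    start_request = {
        'session_id': sid,
        'resource_factor': sandbox.remote_runtime_resource_factor,
    }
    if sandbox.remote_runtime_class == 'sysbox':
        # sysbox-runc supports Docker inside the runtime container.
        start_request['runtime_class'] = 'sysbox-runc'
    # None and 'gvisor' both fall through to the gvisor default.
    return start_request
```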

View File

@@ -16,7 +16,6 @@ def get_action_execution_server_startup_command(
plugins: list[PluginRequirement], plugins: list[PluginRequirement],
app_config: AppConfig, app_config: AppConfig,
python_prefix: list[str] = DEFAULT_PYTHON_PREFIX, python_prefix: list[str] = DEFAULT_PYTHON_PREFIX,
use_nice_for_root: bool = True,
override_user_id: int | None = None, override_user_id: int | None = None,
override_username: str | None = None, override_username: str | None = None,
): ):
@@ -40,7 +39,6 @@ def get_action_execution_server_startup_command(
user_id = override_user_id or ( user_id = override_user_id or (
sandbox_config.user_id if app_config.run_as_openhands else 0 sandbox_config.user_id if app_config.run_as_openhands else 0
) )
is_root = bool(username == 'root')
base_cmd = [ base_cmd = [
*python_prefix, *python_prefix,
@@ -59,17 +57,4 @@ def get_action_execution_server_startup_command(
*browsergym_args, *browsergym_args,
] ]
if is_root and use_nice_for_root: return base_cmd
# If running as root, set highest priority and lowest OOM score
cmd_str = ' '.join(base_cmd)
return [
'nice',
'-n',
'-20', # Highest priority
'sh',
'-c',
f'echo -1000 > /proc/self/oom_score_adj && exec {cmd_str}',
]
else:
# If not root OR not using nice for root, run with normal priority
return base_cmd

View File

@@ -0,0 +1,66 @@
"""Memory monitoring utilities for the runtime."""
import threading
from memory_profiler import memory_usage
from openhands.core.logger import openhands_logger as logger
class LogStream:
"""Stream-like object that redirects writes to a logger."""
def write(self, message):
if message and not message.isspace():
logger.info(f'[Memory usage] {message.strip()}')
def flush(self):
pass
class MemoryMonitor:
def __init__(self, enable: bool = False):
"""Memory monitor for the runtime."""
self._monitoring_thread: threading.Thread | None = None
self._stop_monitoring = threading.Event()
self.log_stream = LogStream()
self.enable = enable
def start_monitoring(self):
"""Start monitoring memory usage."""
if not self.enable:
return
if self._monitoring_thread is not None:
return
def monitor_process():
try:
# Use memory_usage's built-in monitoring loop
mem_usage = memory_usage(
-1, # Monitor current process
interval=0.1,  # Sample every 0.1 seconds
timeout=3600,  # Run for up to one hour
max_usage=False, # Get continuous readings
include_children=True, # Include child processes
multiprocess=True, # Monitor all processes
stream=self.log_stream, # Redirect output to logger
backend='psutil_pss',
)
logger.info(f'Memory usage across time: {mem_usage}')
except Exception as e:
logger.error(f'Memory monitoring failed: {e}')
self._monitoring_thread = threading.Thread(target=monitor_process, daemon=True)
self._monitoring_thread.start()
logger.info('Memory monitoring started')
def stop_monitoring(self):
"""Stop monitoring memory usage."""
if not self.enable:
return
if self._monitoring_thread is not None:
self._stop_monitoring.set()
self._monitoring_thread = None
logger.info('Memory monitoring stopped')
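A usage sketch for the monitor on its own; RUNTIME_MEMORY_MONITOR is the same switch the action execution server checks, and the workload placeholder is illustrative.

```python
# Sketch: enabling the memory monitor manually around an arbitrary workload.
import os

from openhands.runtime.utils.memory_monitor import MemoryMonitor

monitor = MemoryMonitor(
    enable=os.environ.get('RUNTIME_MEMORY_MONITOR', 'False').lower()
    in ['true', '1', 'yes']
)
monitor.start_monitoring()  # no-op unless enabled
try:
    pass  # run the workload whose memory profile should be logged
finally:
    monitor.stop_monitoring()
```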

poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. # This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand.
[[package]] [[package]]
name = "aiohappyeyeballs" name = "aiohappyeyeballs"
@@ -4909,6 +4909,21 @@ files = [
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
] ]
[[package]]
name = "memory-profiler"
version = "0.61.0"
description = "A module for monitoring memory usage of a python program"
optional = false
python-versions = ">=3.5"
groups = ["main"]
files = [
{file = "memory_profiler-0.61.0-py3-none-any.whl", hash = "sha256:400348e61031e3942ad4d4109d18753b2fb08c2f6fb8290671c5513a34182d84"},
{file = "memory_profiler-0.61.0.tar.gz", hash = "sha256:4e5b73d7864a1d1292fb76a03e82a3e78ef934d06828a698d9dada76da2067b0"},
]
[package.dependencies]
psutil = "*"
[[package]] [[package]]
name = "minio" name = "minio"
version = "7.2.15" version = "7.2.15"
@@ -10787,4 +10802,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
[metadata] [metadata]
lock-version = "2.1" lock-version = "2.1"
python-versions = "^3.12" python-versions = "^3.12"
content-hash = "63c0a6d2f0c382f9e8010ab167df76d3275945acf4fba3da7611d68be8241429" content-hash = "a663ed31b71b4307c9f9665a8af4d5fbb8e1a4f0a5a562055df5ec981e5bdc16"

View File

@@ -71,9 +71,11 @@ openhands-aci = "^0.2.3"
python-socketio = "^5.11.4" python-socketio = "^5.11.4"
redis = "^5.2.0" redis = "^5.2.0"
sse-starlette = "^2.1.3" sse-starlette = "^2.1.3"
psutil = "*"
stripe = "^11.5.0" stripe = "^11.5.0"
ipywidgets = "^8.1.5" ipywidgets = "^8.1.5"
qtconsole = "^5.6.1" qtconsole = "^5.6.1"
memory-profiler = "^0.61.0"
[tool.poetry.group.llama-index.dependencies] [tool.poetry.group.llama-index.dependencies]
llama-index = "*" llama-index = "*"

View File

@@ -1,8 +1,21 @@
"""Bash-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.""" """Bash-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.
Example usage:
```bash
export ALLHANDS_API_KEY="YOUR_API_KEY"
export RUNTIME=remote
export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.staging.all-hands.dev"
poetry run pytest -vvxss tests/runtime/test_stress_remote_runtime.py
```
"""
import asyncio import asyncio
import os import os
import tempfile import tempfile
import time
from datetime import datetime
from unittest.mock import MagicMock from unittest.mock import MagicMock
import pandas as pd import pandas as pd
@@ -30,7 +43,12 @@ from openhands.core.config import (
) )
from openhands.core.logger import openhands_logger as logger from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller from openhands.core.main import create_runtime, run_controller
from openhands.events.action import CmdRunAction, MessageAction from openhands.events.action import (
CmdRunAction,
FileEditAction,
FileWriteAction,
MessageAction,
)
from openhands.events.observation import CmdOutputObservation from openhands.events.observation import CmdOutputObservation
from openhands.events.serialization.event import event_to_dict from openhands.events.serialization.event import event_to_dict
from openhands.llm import LLM from openhands.llm import LLM
@@ -42,20 +60,10 @@ AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
} }
def get_config( def get_config() -> AppConfig:
metadata: EvalMetadata,
) -> AppConfig:
assert (
os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL') is not None
), 'SANDBOX_REMOTE_RUNTIME_API_URL must be set.'
assert (
os.environ.get('ALLHANDS_API_KEY') is not None
), 'ALLHANDS_API_KEY must be set.'
config = AppConfig( config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False, run_as_openhands=False,
max_iterations=metadata.max_iterations, runtime=os.environ.get('RUNTIME', 'remote'),
runtime='remote',
sandbox=SandboxConfig( sandbox=SandboxConfig(
base_container_image='python:3.11-bookworm', base_container_image='python:3.11-bookworm',
enable_auto_lint=True, enable_auto_lint=True,
@@ -63,8 +71,11 @@ def get_config(
# large enough timeout, since some testcases take very long to run # large enough timeout, since some testcases take very long to run
timeout=300, timeout=300,
api_key=os.environ.get('ALLHANDS_API_KEY', None), api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), remote_runtime_api_url=os.environ.get(
'SANDBOX_REMOTE_RUNTIME_API_URL', None
),
keep_runtime_alive=False, keep_runtime_alive=False,
remote_runtime_resource_factor=1,
), ),
# do not mount workspace # do not mount workspace
workspace_base=None, workspace_base=None,
@@ -79,132 +90,130 @@ def get_config(
return config return config
def initialize_runtime(
runtime: Runtime,
):
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
"""
logger.info('-' * 30)
logger.info('BEGIN Runtime Initialization Fn')
logger.info('-' * 30)
obs: CmdOutputObservation
action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
action = CmdRunAction(command='mkdir -p /dummy_dir')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to create /dummy_dir: {str(obs)}',
)
with tempfile.TemporaryDirectory() as temp_dir:
# Construct the full path for the desired file name within the temporary directory
temp_file_path = os.path.join(temp_dir, 'dummy_file')
# Write to the file with the desired name within the temporary directory
with open(temp_file_path, 'w') as f:
f.write('dummy content')
# Copy the file to the desired location
runtime.copy_to(temp_file_path, '/dummy_dir/')
logger.info('-' * 30)
logger.info('END Runtime Initialization Fn')
logger.info('-' * 30)
def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
) -> EvalOutput:
config = get_config(metadata)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
runtime = create_runtime(config, headless_mode=False)
call_async_from_sync(runtime.connect)
try:
initialize_runtime(runtime)
instruction = 'dummy instruction'
agent = Agent.get_cls(metadata.agent_class)(
llm=LLM(config=metadata.llm_config),
config=config.get_agent_config(metadata.agent_class),
)
def next_command(*args, **kwargs):
return CmdRunAction(command='ls -lah')
agent.step = MagicMock(side_effect=next_command)
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=MessageAction(content=instruction),
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
metadata.agent_class
],
agent=agent,
)
)
# if fatal error, throw EvalError to trigger re-run
if (
state.last_error
and 'fatal error during agent execution' in state.last_error
and 'stuck in a loop' not in state.last_error
):
raise EvalException('Fatal error detected: ' + state.last_error)
finally:
runtime.close()
test_result = {}
if state is None:
raise ValueError('State should not be None.')
histories = [event_to_dict(event) for event in state.history]
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = EvalOutput(
instance_id=instance.instance_id,
instruction=instruction,
instance=instance.to_dict(), # SWE Bench specific
test_result=test_result,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
)
return output
@pytest.mark.skipif( @pytest.mark.skipif(
TEST_IN_CI, TEST_IN_CI,
reason='This test should only be run locally, not in CI.', reason='This test should only be run locally, not in CI.',
) )
def test_stress_remote_runtime(n_eval_workers: int = 64): def test_stress_remote_runtime_eval(n_eval_workers: int = 64):
"""Mimic evaluation setting to test remote runtime in a multi-processing setting.""" """Mimic evaluation setting to test remote runtime in a multi-processing setting."""
def _initialize_runtime(
runtime: Runtime,
):
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
"""
logger.info('-' * 30)
logger.info('BEGIN Runtime Initialization Fn')
logger.info('-' * 30)
obs: CmdOutputObservation
action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
action = CmdRunAction(command='mkdir -p /dummy_dir')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to create /dummy_dir: {str(obs)}',
)
with tempfile.TemporaryDirectory() as temp_dir:
# Construct the full path for the desired file name within the temporary directory
temp_file_path = os.path.join(temp_dir, 'dummy_file')
# Write to the file with the desired name within the temporary directory
with open(temp_file_path, 'w') as f:
f.write('dummy content')
# Copy the file to the desired location
runtime.copy_to(temp_file_path, '/dummy_dir/')
logger.info('-' * 30)
logger.info('END Runtime Initialization Fn')
logger.info('-' * 30)
def _process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
) -> EvalOutput:
config = get_config()
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
runtime = create_runtime(config, headless_mode=True)
call_async_from_sync(runtime.connect)
try:
_initialize_runtime(runtime)
instruction = 'dummy instruction'
agent = Agent.get_cls(metadata.agent_class)(
llm=LLM(config=metadata.llm_config),
config=config.get_agent_config(metadata.agent_class),
)
def next_command(*args, **kwargs):
return CmdRunAction(command='ls -lah')
agent.step = MagicMock(side_effect=next_command)
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=MessageAction(content=instruction),
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
metadata.agent_class
],
agent=agent,
)
)
# if fatal error, throw EvalError to trigger re-run
if (
state.last_error
and 'fatal error during agent execution' in state.last_error
and 'stuck in a loop' not in state.last_error
):
raise EvalException('Fatal error detected: ' + state.last_error)
finally:
runtime.close()
test_result = {}
if state is None:
raise ValueError('State should not be None.')
histories = [event_to_dict(event) for event in state.history]
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = EvalOutput(
instance_id=instance.instance_id,
instruction=instruction,
instance=instance.to_dict(), # SWE Bench specific
test_result=test_result,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
)
return output
llm_config = LLMConfig() llm_config = LLMConfig()
metadata = make_metadata( metadata = make_metadata(
llm_config, llm_config,
@@ -228,4 +237,247 @@ def test_stress_remote_runtime(n_eval_workers: int = 64):
dummy_instance, output_file, eval_n_limit=len(dummy_instance) dummy_instance, output_file, eval_n_limit=len(dummy_instance)
) )
run_evaluation(instances, metadata, output_file, n_eval_workers, process_instance) run_evaluation(instances, metadata, output_file, n_eval_workers, _process_instance)
@pytest.mark.skipif(
TEST_IN_CI,
reason='This test should only be run locally, not in CI.',
)
def test_stress_remote_runtime_long_output_with_soft_and_hard_timeout():
"""Stress test for the remote runtime."""
config = get_config()
try:
runtime = create_runtime(config, headless_mode=True)
call_async_from_sync(runtime.connect)
_time_for_test = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# Run a command that generates long output multiple times
for i in range(10):
start_time = time.time()
iteration_stats = {
'iteration': i,
'timestamp': time.time(),
}
# Check overall system memory usage
mem_action = CmdRunAction(
'free -k | grep "Mem:" | awk \'{printf "Total: %8.1f MB, Used: %8.1f MB, Free: %8.1f MB, Available: %8.1f MB\\n", $2/1024, $3/1024, $4/1024, $7/1024}\''
)
mem_obs = runtime.run_action(mem_action)
assert mem_obs.exit_code == 0
logger.info(
f'System memory usage (iteration {i}): {mem_obs.content.strip()}'
)
# Parse memory values from output
mem_parts = mem_obs.content.strip().split(',')
for part in mem_parts:
key, value = part.strip().split(':')
iteration_stats[f'memory_{key.lower()}'] = float(
value.replace('MB', '').strip()
)
# Check top memory-consuming processes
mem_action = CmdRunAction(
'ps aux | awk \'{printf "%8.1f MB %s\\n", $6/1024, $0}\' | sort -nr | head -n 5'
)
mem_obs = runtime.run_action(mem_action)
assert mem_obs.exit_code == 0
_top_processes = [i.strip() for i in mem_obs.content.strip().split('\n')]
logger.info(
f'Top 5 memory-consuming processes (iteration {i}):\n{"- " + "\n- ".join(_top_processes)}'
)
iteration_stats['top_processes'] = _top_processes
# Check tmux memory usage (in MB)
mem_action = CmdRunAction(
'ps aux | awk \'{printf "%8.1f MB %s\\n", $6/1024, $0}\' | sort -nr | grep "/usr/bin/tmux" | grep -v grep | awk \'{print $1}\''
)
mem_obs = runtime.run_action(mem_action)
assert mem_obs.exit_code == 0
logger.info(
f'Tmux memory usage (iteration {i}): {mem_obs.content.strip()} MB'
)
try:
iteration_stats['tmux_memory_mb'] = float(mem_obs.content.strip())
except (ValueError, AttributeError):
iteration_stats['tmux_memory_mb'] = None
# Check action_execution_server mem
mem_action = CmdRunAction(
'ps aux | awk \'{printf "%8.1f MB %s\\n", $6/1024, $0}\' | sort -nr | grep "action_execution_server" | grep "/openhands/poetry" | grep -v grep | awk \'{print $1}\''
)
mem_obs = runtime.run_action(mem_action)
assert mem_obs.exit_code == 0
logger.info(
f'Action execution server memory usage (iteration {i}): {mem_obs.content.strip()} MB'
)
try:
iteration_stats['action_server_memory_mb'] = float(
mem_obs.content.strip()
)
except (ValueError, AttributeError):
iteration_stats['action_server_memory_mb'] = None
# Test soft timeout
action = CmdRunAction(
'read -p "Do you want to continue? [Y/n] " answer; if [[ $answer == "Y" ]]; then echo "Proceeding with operation..."; echo "Operation completed successfully!"; else echo "Operation cancelled."; exit 1; fi'
)
obs = runtime.run_action(action)
assert 'Do you want to continue?' in obs.content
assert obs.exit_code == -1 # Command is still running, waiting for input
# Send the confirmation
action = CmdRunAction('Y', is_input=True)
obs = runtime.run_action(action)
assert 'Proceeding with operation...' in obs.content
assert 'Operation completed successfully!' in obs.content
assert obs.exit_code == 0
assert '[The command completed with exit code 0.]' in obs.metadata.suffix
# Test hard timeout w/ long output
# Generate long output with 1000 asterisks per line
action = CmdRunAction(
f'export i={i}; for j in $(seq 1 100); do echo "Line $j - Iteration $i - $(printf \'%1000s\' | tr " " "*")"; sleep 1; done'
)
action.set_hard_timeout(2)
obs = runtime.run_action(action)
# Verify the output
assert obs.exit_code == -1
assert f'Line 1 - Iteration {i}' in obs.content
# Because the hard timeout was triggered, the terminal will be in a weird state
# where it will not accept any new commands.
obs = runtime.run_action(CmdRunAction('ls'))
assert obs.exit_code == -1
assert 'The previous command is still running' in obs.metadata.suffix
# We need to send a Ctrl+C to reset the terminal.
obs = runtime.run_action(CmdRunAction('C-c', is_input=True))
assert obs.exit_code == 130
# Now make sure the terminal is in a good state
obs = runtime.run_action(CmdRunAction('ls'))
assert obs.exit_code == 0
duration = time.time() - start_time
iteration_stats['duration'] = duration
logger.info(f'Completed iteration {i} in {duration:.2f} seconds')
finally:
runtime.close()
@pytest.mark.skipif(
TEST_IN_CI,
reason='This test should only be run locally, not in CI.',
)
def test_stress_runtime_memory_limits():
"""Test runtime behavior under resource constraints."""
config = get_config()
# For Docker runtime, add resource constraints
if config.runtime == 'docker':
config.sandbox.docker_runtime_kwargs = {
'cpu_period': 100000, # 100ms
'cpu_quota': 100000, # Can use 100ms out of each 100ms period (1 CPU)
'mem_limit': '4G', # 4 GB of memory
'memswap_limit': '0', # No swap
'mem_swappiness': 0, # Disable swapping
'oom_kill_disable': False, # Enable OOM killer
}
config.sandbox.runtime_startup_env_vars = {
'RUNTIME_MAX_MEMORY_GB': '3',
'RUNTIME_MEMORY_MONITOR': 'true',
}
try:
runtime = create_runtime(config, headless_mode=True)
call_async_from_sync(runtime.connect)
# Install stress-ng
action = CmdRunAction(
command='sudo apt-get update && sudo apt-get install -y stress-ng'
)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0
action = CmdRunAction(
command='stress-ng --vm 1 --vm-bytes 6G --timeout 1m --metrics'
)
action.set_hard_timeout(120)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert 'aborted early, out of system resources' in obs.content
assert obs.exit_code == 3 # OOM killed!
finally:
runtime.close()
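A note on the two environment variables used above: inside the runtime image, RUNTIME_MAX_MEMORY_GB becomes a per-session memory cap and RUNTIME_MEMORY_MONITOR toggles the memory monitor. A condensed, illustrative sketch of that plumbing follows; the real logic lives in the action execution server.

```python
# Sketch: how the runtime_startup_env_vars above are consumed in the runtime image.
import os

_raw = os.environ.get('RUNTIME_MAX_MEMORY_GB')
max_memory_gb = int(_raw) if _raw else None
max_memory_mb = max_memory_gb * 1024 if max_memory_gb else None  # handed to BashSession
memory_monitor_enabled = os.environ.get('RUNTIME_MEMORY_MONITOR', 'False').lower() in ['true', '1', 'yes']
```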
@pytest.mark.skipif(
TEST_IN_CI,
reason='This test should only be run locally, not in CI.',
)
def test_stress_runtime_memory_limits_with_repeated_file_edit():
"""Test runtime behavior under resource constraints with repeated file edits."""
config = get_config()
# For Docker runtime, add resource constraints
if config.runtime == 'docker':
config.sandbox.docker_runtime_kwargs = {
'cpu_period': 100000, # 100ms
'cpu_quota': 100000, # Can use 100ms out of each 100ms period (1 CPU)
'mem_limit': '4G', # 4 GB of memory
'memswap_limit': '0', # No swap
'mem_swappiness': 0, # Disable swapping
'oom_kill_disable': False, # Enable OOM killer
}
config.sandbox.runtime_startup_env_vars = {
'RUNTIME_MAX_MEMORY_GB': '3',
'RUNTIME_MEMORY_MONITOR': 'true',
}
try:
runtime = create_runtime(config, headless_mode=True)
call_async_from_sync(runtime.connect)
# Create initial test file with base content
test_file = '/tmp/test_file.txt'
# base_content = 'content_1\n' * 1000 # Create a reasonably sized file
base_content = ''
for i in range(1000):
base_content += f'content_{i:03d}\n'
# Use FileWriteAction to create initial file
write_action = FileWriteAction(path=test_file, content=base_content)
obs = runtime.run_action(write_action)
# Perform repeated file edits
for i in range(1000):
# Use FileEditAction with str_replace instead of IPythonRunCellAction
edit_action = FileEditAction(
command='str_replace',
path=test_file,
old_str=f'content_{i:03d}',
new_str=f'-content_{i:03d}',
)
obs = runtime.run_action(edit_action)
assert (
f'The file {test_file} has been edited' in obs.content
), f'Edit failed at iteration {i}'
logger.info(f'finished iteration {i}')
# Verify final file state using FileEditAction view command
action = FileEditAction(command='view', path=test_file)
obs = runtime.run_action(action)
assert '-content_999' in obs.content, 'Final content verification failed'
logger.info('Final file content verified successfully')
finally:
runtime.close()