Evaluation: redirect sessions to repo-local .eval_sessions via helper; apply across entrypoints; add tests (#10540)

Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
Xingyao Wang
2025-08-22 09:34:02 -04:00
committed by GitHub
parent d9cf5b7302
commit 4507a25b85
36 changed files with 274 additions and 293 deletions

View File

@@ -17,8 +17,8 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -41,19 +41,12 @@ from openhands.utils.async_utils import call_async_from_sync
def get_config(
metadata: EvalMetadata,
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-slim'
# Create config with agent_bench-specific container image
config = get_openhands_config_for_eval(metadata=metadata)
# Override the container image for agent_bench
config.sandbox.base_container_image = 'python:3.12-slim'
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
agent_config.enable_prompt_extensions = False