Evaluation: redirect sessions to repo-local .eval_sessions via helper; apply across entrypoints; add tests (#10540)

Co-authored-by: openhands <openhands@all-hands.dev>
2026-01-09 14:57:59 -05:00 · 2025-08-22 09:34:02 -04:00
parent d9cf5b7302
commit 4507a25b85
36 changed files with 274 additions and 293 deletions
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -17,8 +17,8 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
-    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -41,19 +41,12 @@ from openhands.utils.async_utils import call_async_from_sync
 def get_config(
    metadata: EvalMetadata,
 ) -> OpenHandsConfig:
-    sandbox_config = get_default_sandbox_config_for_eval()
-    sandbox_config.base_container_image = 'python:3.12-slim'
+    # Build the shared evaluation config from the run metadata
+    config = get_openhands_config_for_eval(metadata=metadata)
+
+    # Override the container image for agent_bench
+    config.sandbox.base_container_image = 'python:3.12-slim'

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        runtime=os.environ.get('RUNTIME', 'docker'),
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
-    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
    agent_config.enable_prompt_extensions = False