Add update_agent_config_for_eval hook to evaluation benchmarks without disabling repository memory

2026-04-29 03:00:45 -04:00 · 2025-04-15 15:06:58 +00:00
21 changed files with 81 additions and 1 deletion
--- a/evaluation/benchmarks/EDA/run_infer.py
+++ b/evaluation/benchmarks/EDA/run_infer.py
@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -74,6 +75,7 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -22,6 +22,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -55,6 +56,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -21,6 +21,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -61,6 +62,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False

    # copy 'draft_editor' config if exists
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@@ -19,6 +19,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -72,6 +73,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/bird/run_infer.py
+++ b/evaluation/benchmarks/bird/run_infer.py
@@ -22,6 +22,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -86,6 +87,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/browsing_delegation/run_infer.py
+++ b/evaluation/benchmarks/browsing_delegation/run_infer.py
@@ -15,6 +15,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -50,6 +51,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/commit0/run_infer.py
+++ b/evaluation/benchmarks/commit0/run_infer.py
@@ -20,6 +20,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
@@ -135,6 +136,7 @@ def get_config(
        enable_browsing=RUN_WITH_BROWSING,
        enable_llm_editor=False,
    )
+    agent_config = update_agent_config_for_eval(agent_config)
    config.set_agent_config(agent_config)
    return config

--- a/evaluation/benchmarks/discoverybench/run_infer.py
+++ b/evaluation/benchmarks/discoverybench/run_infer.py
@@ -21,6 +21,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -76,6 +77,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    agent_config = AgentConfig(
        function_calling=False,
--- a/evaluation/benchmarks/gaia/run_infer.py
+++ b/evaluation/benchmarks/gaia/run_infer.py
@@ -18,6 +18,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -66,6 +67,8 @@ def get_config(
    else:
        logger.info('Agent config not provided, using default settings')
        agent_config = config.get_agent_config(metadata.agent_class)
+
+        agent_config = update_agent_config_for_eval(agent_config)
        agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/gorilla/run_infer.py
+++ b/evaluation/benchmarks/gorilla/run_infer.py
@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -54,6 +55,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/gpqa/run_infer.py
+++ b/evaluation/benchmarks/gpqa/run_infer.py
@@ -34,6 +34,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -75,6 +76,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/humanevalfix/run_infer.py
+++ b/evaluation/benchmarks/humanevalfix/run_infer.py
@@ -27,6 +27,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -96,6 +97,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/logic_reasoning/run_infer.py
+++ b/evaluation/benchmarks/logic_reasoning/run_infer.py
@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -63,6 +64,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/mint/run_infer.py
+++ b/evaluation/benchmarks/mint/run_infer.py
@@ -19,6 +19,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -121,6 +122,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/ml_bench/run_infer.py
+++ b/evaluation/benchmarks/ml_bench/run_infer.py
@@ -30,6 +30,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -91,6 +92,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -30,6 +30,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
@@ -231,6 +232,7 @@ def get_config(
        condenser=metadata.condenser_config,
        enable_prompt_extensions=False,
    )
+    agent_config = update_agent_config_for_eval(agent_config)
    config.set_agent_config(agent_config)
    return config

--- a/evaluation/benchmarks/testgeneval/run_infer.py
+++ b/evaluation/benchmarks/testgeneval/run_infer.py
@@ -30,6 +30,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
@@ -164,6 +165,7 @@ def get_config(
        condenser=metadata.condenser_config,
        enable_prompt_extensions=False,
    )
+    agent_config = update_agent_config_for_eval(agent_config)
    config.set_agent_config(agent_config)
    return config

--- a/evaluation/benchmarks/the_agent_company/run_infer.py
+++ b/evaluation/benchmarks/the_agent_company/run_infer.py
@@ -13,7 +13,10 @@ from typing import List
 import yaml
 from browsing import pre_login

-from evaluation.utils.shared import get_default_sandbox_config_for_eval
+from evaluation.utils.shared import (
+    get_default_sandbox_config_for_eval,
+    update_agent_config_for_eval,
+)
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
@@ -58,12 +61,14 @@ def get_config(
    )
    config.set_llm_config(llm_config)
    if agent_config:
+        agent_config = update_agent_config_for_eval(agent_config)
        config.set_agent_config(agent_config)
    else:
        logger.info('Agent config not provided, using default settings')
        agent_config = AgentConfig(
            enable_prompt_extensions=False,
        )
+        agent_config = update_agent_config_for_eval(agent_config)
        config.set_agent_config(agent_config)
    return config

--- a/evaluation/benchmarks/toolqa/run_infer.py
+++ b/evaluation/benchmarks/toolqa/run_infer.py
@@ -15,6 +15,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -55,6 +56,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/benchmarks/webarena/run_infer.py
+++ b/evaluation/benchmarks/webarena/run_infer.py
@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_agent_config_for_eval,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -76,6 +77,8 @@ def get_config(
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
+
+    agent_config = update_agent_config_for_eval(agent_config)
    agent_config.enable_prompt_extensions = False
    return config

--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -160,6 +160,26 @@ def cleanup():
        process.join()


+def update_agent_config_for_eval(
+    agent_config: AgentConfig | None = None,
+) -> AgentConfig:
+    """Return the agent config to use for an evaluation run.
+
+    Args:
+        agent_config: The agent config to update. If None, a default AgentConfig() is created.
+
+    Returns:
+        The object that was passed in (currently unmodified), or the new default config.
+    """
+    if agent_config is None:
+        agent_config = AgentConfig()
+
+    # No settings are overridden yet; in particular, repository memory is not disabled here.
+    # Keep this as the single place to add future evaluation-specific overrides.
+
+    return agent_config
+
+
 def make_metadata(
    llm_config: LLMConfig,
    dataset_name: str,
@@ -172,6 +192,8 @@ def make_metadata(
    agent_config: AgentConfig | None = None,
    condenser_config: CondenserConfig | None = None,
 ) -> EvalMetadata:
+    # Update agent config with evaluation-specific settings
+    agent_config = update_agent_config_for_eval(agent_config)
    model_name = llm_config.model.split('/')[-1]
    model_path = model_name.replace(':', '_').replace('@', '-')
    eval_note = f'_N_{eval_note}' if eval_note else ''