specify condenser config for evals (#8177)

Co-authored-by: openhands <openhands@all-hands.dev>
2026-01-10 07:18:10 -05:00 · 2025-05-21 22:08:57 +02:00
parent 2bd10de636
commit 637cb0726a
5 changed files with 189 additions and 4 deletions
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -42,6 +42,37 @@ api_key = "XXX"
 temperature = 0.0
 ```

+### Configuring Condensers for Evaluation
+
+For benchmarks that support condenser configuration (like SWE-Bench), you can define multiple condenser configurations in your `config.toml` file. A condenser is responsible for managing conversation history to maintain context while staying within token limits - you can learn more about how it works [here](https://www.all-hands.dev/blog/openhands-context-condensensation-for-more-efficient-ai-agents):
+
+```toml
+# LLM-based summarizing condenser for evaluation
+[condenser.summarizer_for_eval]
+type = "llm"
+llm_config = "haiku"  # Reference to an LLM config to use for summarization
+keep_first = 2        # Number of initial events to always keep
+max_size = 100        # Maximum size of history before triggering summarization
+
+# Recent events condenser for evaluation
+[condenser.recent_for_eval]
+type = "recent"
+keep_first = 2        # Number of initial events to always keep
+max_events = 50       # Maximum number of events to keep in history
+```
+
+You can then specify which condenser configuration to use when running evaluation scripts, for example:
+
+```bash
+EVAL_CONDENSER=summarizer_for_eval \
+./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 500 100 1 princeton-nlp/SWE-bench_Verified test
+```
+
+The name is up to you, but should match a name defined in your `config.toml` file. The `EVAL_CONDENSER` environment variable set in front of the command specifies the condenser configuration to use. In this case, `summarizer_for_eval` is used, which refers to the LLM-based summarizing condenser as defined above.
+
+If no condenser configuration is specified, the 'noop' condenser will be used by default, which keeps the full conversation history.
+
+
 For other configurations specific to evaluation, such as `save_trajectory_path`, these are typically set in the `get_config` function of the respective `run_infer.py` file for each benchmark.

 ## Supported Benchmarks
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@@ -45,7 +45,7 @@ For example, for instance ID `django_django-11011`, it will try to pull our pre-
 This image will be used create an OpenHands runtime image where the agent will operate on.

 ```bash
-./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
+./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] [n_runs] [mode]

 # Example
 ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 500 100 1 princeton-nlp/SWE-bench_Verified test
@@ -69,13 +69,20 @@ default, it is set to 1.
 - `dataset`, a huggingface dataset name. e.g. `princeton-nlp/SWE-bench`, `princeton-nlp/SWE-bench_Lite`, `princeton-nlp/SWE-bench_Verified`, or `princeton-nlp/SWE-bench_Multimodal`, specifies which dataset to evaluate on.
 - `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`.

+- `n_runs`, e.g. `3`, is the number of times to run the evaluation. Default is 1.
+- `mode`, e.g. `swt`, `swt-ci`, or `swe`, specifies the evaluation mode. Default is `swe`.
+
 > [!CAUTION]
 > Setting `num_workers` larger than 1 is not officially tested, YMMV.

-There is also one optional environment variable you can set.
+There are also optional environment variables you can set:

 ```bash
-export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Default to false. Ignore this if you are not sure.
+# Use hint text in the evaluation (default: false)
+export USE_HINT_TEXT=true # Ignore this if you are not sure.
+
+# Specify a condenser configuration for memory management (default: NoOpCondenser)
+export EVAL_CONDENSER=summarizer_for_eval # Name of the condenser config group in config.toml
 ```

 Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and CodeActAgent,
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -44,6 +44,8 @@ from openhands.core.config import (
    get_llm_config_arg,
    get_parser,
 )
+from openhands.core.config.utils import get_condenser_config_arg
+from openhands.core.config.condenser_config import NoOpCondenserConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.critic import AgentFinishedCritic
@@ -756,6 +758,7 @@ if __name__ == '__main__':
        choices=['swe', 'swt', 'swt-ci'],
        help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
    )
+
    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
@@ -792,6 +795,19 @@ if __name__ == '__main__':
    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

+    # Get condenser config from environment variable
+    condenser_name = os.environ.get('EVAL_CONDENSER')
+    if condenser_name:
+        condenser_config = get_condenser_config_arg(condenser_name)
+        if condenser_config is None:
+            raise ValueError(
+                f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}'
+            )
+    else:
+        # If no specific condenser config is provided via env var, default to NoOpCondenser
+        condenser_config = NoOpCondenserConfig()
+        logger.debug('No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.')
+
    details = {'mode': args.mode}
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

@@ -806,6 +822,7 @@ if __name__ == '__main__':
        args.eval_note,
        args.eval_output_dir,
        details=details,
+        condenser_config=condenser_config,
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
--- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
@@ -14,6 +14,7 @@ SPLIT=$8
 N_RUNS=$9
 MODE=${10}

+
 if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
@@ -51,6 +52,12 @@ if [ -z "$MODE" ]; then
  echo "MODE not specified, use default $MODE"
 fi

+if [ -n "$EVAL_CONDENSER" ]; then
+  echo "Using Condenser Config: $EVAL_CONDENSER"
+else
+  echo "No Condenser Config provided via EVAL_CONDENSER, use default (NoOpCondenser)."
+fi
+
 export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
 echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

@@ -65,6 +72,7 @@ echo "MAX_ITER: $MAX_ITER"
 echo "NUM_WORKERS: $NUM_WORKERS"
 echo "COMMIT_HASH: $COMMIT_HASH"
 echo "MODE: $MODE"
+echo "EVAL_CONDENSER: $EVAL_CONDENSER"

 # Default to NOT use Hint
 if [ -z "$USE_HINT_TEXT" ]; then
@@ -88,6 +96,10 @@ fi
 if [ "$MODE" != "swe" ]; then
  EVAL_NOTE="${EVAL_NOTE}-${MODE}"
 fi
+# Add condenser config to eval note if provided
+if [ -n "$EVAL_CONDENSER" ]; then
+  EVAL_NOTE="${EVAL_NOTE}-${EVAL_CONDENSER}"
+fi

 function run_eval() {
  local eval_note="${1}"
@@ -101,6 +113,8 @@ function run_eval() {
    --split $SPLIT \
    --mode $MODE"

+
+
  if [ -n "$EVAL_LIMIT" ]; then
    echo "EVAL_LIMIT: $EVAL_LIMIT"
    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
--- a/openhands/core/config/utils.py
+++ b/openhands/core/config/utils.py
@@ -16,7 +16,11 @@ from openhands import __version__
 from openhands.core import logger
 from openhands.core.config.agent_config import AgentConfig
 from openhands.core.config.app_config import AppConfig
-from openhands.core.config.condenser_config import condenser_config_from_toml_section
+from openhands.core.config.condenser_config import (
+    CondenserConfig,
+    condenser_config_from_toml_section,
+    create_condenser_config,
+)
 from openhands.core.config.config_utils import (
    OH_DEFAULT_AGENT,
    OH_MAX_ITERATIONS,
@@ -491,6 +495,118 @@ def get_llm_config_arg(
    return None


+def get_condenser_config_arg(
+    condenser_config_arg: str, toml_file: str = 'config.toml'
+) -> CondenserConfig | None:
+    """Get a group of condenser settings from the config file by name.
+
+    A group in config.toml can look like this:
+
+    ```
+    [condenser.my_summarizer]
+    type = 'llm'
+    llm_config = 'gpt-4o' # References [llm.gpt-4o]
+    max_size = 50
+    ...
+    ```
+
+    The user-defined group name, like "my_summarizer", is the argument to this function.
+    The function will load the CondenserConfig object with the settings of this group,
+    from the config file.
+
+    Surrounding brackets and a leading "condenser." prefix in the argument are ignored,
+    so "my_summarizer" and "[condenser.my_summarizer]" select the same group.
+
+    Args:
+        condenser_config_arg: The group of condenser settings to get from the config.toml file.
+        toml_file: Path to the configuration file to read from. Defaults to 'config.toml'.
+
+    Returns:
+        CondenserConfig: The CondenserConfig object with the settings from the config file, or None if not found/error.
+    """
+    # keep only the name, just in case
+    condenser_config_arg = condenser_config_arg.strip('[]')
+
+    # truncate the prefix, just in case
+    if condenser_config_arg.startswith('condenser.'):
+        condenser_config_arg = condenser_config_arg[10:]
+
+    logger.openhands_logger.debug(
+        f'Loading condenser config [{condenser_config_arg}] from {toml_file}'
+    )
+
+    # load the toml file
+    try:
+        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
+            toml_config = toml.load(toml_contents)
+    except FileNotFoundError as e:
+        logger.openhands_logger.error(f'Config file not found: {toml_file}. Error: {e}')
+        return None
+    except toml.TomlDecodeError as e:
+        logger.openhands_logger.error(
+            f'Cannot parse condenser group [{condenser_config_arg}] from {toml_file}. Exception: {e}'
+        )
+        return None
+
+    # Look up the named sub-table; a missing or non-table entry (e.g. the scalar
+    # `type` key of a flat [condenser] section) is reported as not found.
+    condenser_section = toml_config.get('condenser')
+    if not isinstance(condenser_section, dict) or not isinstance(
+        condenser_section.get(condenser_config_arg), dict
+    ):
+        logger.openhands_logger.error(
+            f'Condenser config section [condenser.{condenser_config_arg}] not found in {toml_file}'
+        )
+        return None
+
+    # Copy so that swapping in the LLMConfig below does not touch toml_config
+    condenser_data = condenser_section[condenser_config_arg].copy()
+
+    # Determine the type and handle potential LLM dependency
+    condenser_type = condenser_data.get('type')
+    if not condenser_type:
+        logger.openhands_logger.error(
+            f'Missing "type" field in [condenser.{condenser_config_arg}] section of {toml_file}'
+        )
+        return None
+
+    # Handle LLM config reference if needed, using get_llm_config_arg
+    if (
+        condenser_type in ('llm', 'llm_attention', 'structured')
+        and 'llm_config' in condenser_data
+        and isinstance(condenser_data['llm_config'], str)
+    ):
+        llm_config_name = condenser_data['llm_config']
+        logger.openhands_logger.debug(
+            f'Condenser [{condenser_config_arg}] requires LLM config [{llm_config_name}]. Loading it...'
+        )
+        # Use the existing function to load the specific LLM config
+        referenced_llm_config = get_llm_config_arg(llm_config_name, toml_file=toml_file)
+
+        if referenced_llm_config:
+            # Replace the string reference with the actual LLMConfig object
+            condenser_data['llm_config'] = referenced_llm_config
+        else:
+            # get_llm_config_arg already logs the error if not found
+            logger.openhands_logger.error(
+                f"Failed to load required LLM config '{llm_config_name}' for condenser '{condenser_config_arg}'."
+            )
+            return None
+
+    # Create the condenser config instance
+    try:
+        config = create_condenser_config(condenser_type, condenser_data)
+        logger.openhands_logger.info(
+            f'Successfully loaded condenser config [{condenser_config_arg}] from {toml_file}'
+        )
+        return config
+    except (ValidationError, ValueError) as e:
+        logger.openhands_logger.error(
+            f'Invalid condenser configuration for [{condenser_config_arg}]: {e}.'
+        )
+        return None
+
+
 # Command line arguments
 def get_parser() -> argparse.ArgumentParser:
    """Get the argument parser."""