specify condenser config for evals (#8177)

Co-authored-by: openhands <openhands@all-hands.dev>
2026-01-09 14:57:59 -05:00 · 2025-05-21 22:08:57 +02:00
parent 2bd10de636
commit 637cb0726a
5 changed files with 189 additions and 4 deletions
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@@ -45,7 +45,7 @@ For example, for instance ID `django_django-11011`, it will try to pull our pre-
 This image will be used to create an OpenHands runtime image in which the agent will operate.

 ```bash
-./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
+./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] [n_runs] [mode]

 # Example
 ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 500 100 1 princeton-nlp/SWE-bench_Verified test
@@ -69,13 +69,20 @@ default, it is set to 1.
 - `dataset`, a huggingface dataset name. e.g. `princeton-nlp/SWE-bench`, `princeton-nlp/SWE-bench_Lite`, `princeton-nlp/SWE-bench_Verified`, or `princeton-nlp/SWE-bench_Multimodal`, specifies which dataset to evaluate on.
 - `dataset_split`, the split of the huggingface dataset to use, e.g. `test` or `dev`. Defaults to `test`.

+- `n_runs`, e.g. `3`, is the number of times to run the evaluation. Default is 1.
+- `mode`, one of `swe`, `swt`, or `swt-ci`, specifies the evaluation mode. Default is `swe`.
+
 > [!CAUTION]
 > Setting `num_workers` larger than 1 is not officially tested, YMMV.

-There is also one optional environment variable you can set.
+There are also optional environment variables you can set:

 ```bash
-export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Default to false. Ignore this if you are not sure.
+# Use hint text in the evaluation (default: false). Leave this unset if you are not sure.
+export USE_HINT_TEXT=true
+
+# Specify a condenser configuration for memory management (default: NoOpCondenser)
+export EVAL_CONDENSER=summarizer_for_eval # Name of the condenser config group in config.toml
 ```

 Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and CodeActAgent,
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -44,6 +44,8 @@ from openhands.core.config import (
    get_llm_config_arg,
    get_parser,
 )
+from openhands.core.config.utils import get_condenser_config_arg
+from openhands.core.config.condenser_config import NoOpCondenserConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.critic import AgentFinishedCritic
@@ -756,6 +758,7 @@ if __name__ == '__main__':
        choices=['swe', 'swt', 'swt-ci'],
        help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
    )
+
    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
@@ -792,6 +795,19 @@ if __name__ == '__main__':
    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

+    # Get condenser config from environment variable
+    condenser_name = os.environ.get('EVAL_CONDENSER')
+    if condenser_name:
+        condenser_config = get_condenser_config_arg(condenser_name)
+        if condenser_config is None:
+            raise ValueError(
+                f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}'
+            )
+    else:
+        # If no specific condenser config is provided via env var, default to NoOpCondenser
+        condenser_config = NoOpCondenserConfig()
+        logger.debug('No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.')
+
    details = {'mode': args.mode}
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

@@ -806,6 +822,7 @@ if __name__ == '__main__':
        args.eval_note,
        args.eval_output_dir,
        details=details,
+        condenser_config=condenser_config,
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
--- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
@@ -14,6 +14,7 @@ SPLIT=$8
 N_RUNS=$9
 MODE=${10}

+
 if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
@@ -51,6 +52,12 @@ if [ -z "$MODE" ]; then
  echo "MODE not specified, use default $MODE"
 fi

+if [ -n "$EVAL_CONDENSER" ]; then
+  echo "Using Condenser Config: $EVAL_CONDENSER"
+else
+  echo "No Condenser Config provided via EVAL_CONDENSER, use default (NoOpCondenser)."
+fi
+
 export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
 echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

@@ -65,6 +72,7 @@ echo "MAX_ITER: $MAX_ITER"
 echo "NUM_WORKERS: $NUM_WORKERS"
 echo "COMMIT_HASH: $COMMIT_HASH"
 echo "MODE: $MODE"
+echo "EVAL_CONDENSER: $EVAL_CONDENSER"

 # Default to NOT use Hint
 if [ -z "$USE_HINT_TEXT" ]; then
@@ -88,6 +96,10 @@ fi
 if [ "$MODE" != "swe" ]; then
  EVAL_NOTE="${EVAL_NOTE}-${MODE}"
 fi
+# Add condenser config to eval note if provided
+if [ -n "$EVAL_CONDENSER" ]; then
+  EVAL_NOTE="${EVAL_NOTE}-${EVAL_CONDENSER}"
+fi

 function run_eval() {
  local eval_note="${1}"
@@ -101,6 +113,8 @@ function run_eval() {
    --split $SPLIT \
    --mode $MODE"

+
+
  if [ -n "$EVAL_LIMIT" ]; then
    echo "EVAL_LIMIT: $EVAL_LIMIT"
    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"