specify condenser config for evals (#8177)

Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
Engel Nyst
2025-05-21 22:08:57 +02:00
committed by GitHub
parent 2bd10de636
commit 637cb0726a
5 changed files with 189 additions and 4 deletions

View File

@@ -42,6 +42,37 @@ api_key = "XXX"
temperature = 0.0
```
### Configuring Condensers for Evaluation
For benchmarks that support condenser configuration (like SWE-Bench), you can define multiple condenser configurations in your `config.toml` file. A condenser is responsible for managing conversation history to maintain context while staying within token limits - you can learn more about how it works [here](https://www.all-hands.dev/blog/openhands-context-condensensation-for-more-efficient-ai-agents):
```toml
# LLM-based summarizing condenser for evaluation
[condenser.summarizer_for_eval]
type = "llm"
llm_config = "haiku" # Reference to an LLM config to use for summarization
keep_first = 2 # Number of initial events to always keep
max_size = 100 # Maximum size of history before triggering summarization
# Recent events condenser for evaluation
[condenser.recent_for_eval]
type = "recent"
keep_first = 2 # Number of initial events to always keep
max_events = 50 # Maximum number of events to keep in history
```
You can then specify which condenser configuration to use when running evaluation scripts, for example:
```bash
EVAL_CONDENSER=summarizer_for_eval \
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 500 100 1 princeton-nlp/SWE-bench_Verified test
```
The name is up to you, but it should match a name defined in your `config.toml` file. The `EVAL_CONDENSER` environment variable, set in front of the command, specifies the condenser configuration to use. In this case, `summarizer_for_eval` is used, which refers to the LLM-based summarizing condenser as defined above.
If no condenser configuration is specified, the 'noop' condenser will be used by default, which keeps the full conversation history.
```
For other configurations specific to evaluation, such as `save_trajectory_path`, these are typically set in the `get_config` function of the respective `run_infer.py` file for each benchmark.
## Supported Benchmarks

View File

@@ -45,7 +45,7 @@ For example, for instance ID `django_django-11011`, it will try to pull our pre-
This image will be used to create an OpenHands runtime image in which the agent will operate.
```bash
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] [n_runs] [mode]
# Example
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 500 100 1 princeton-nlp/SWE-bench_Verified test
@@ -69,13 +69,20 @@ default, it is set to 1.
- `dataset`, a huggingface dataset name. e.g. `princeton-nlp/SWE-bench`, `princeton-nlp/SWE-bench_Lite`, `princeton-nlp/SWE-bench_Verified`, or `princeton-nlp/SWE-bench_Multimodal`, specifies which dataset to evaluate on.
- `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`.
- `n_runs`, e.g. `3`, is the number of times to run the evaluation. Default is 1.
- `mode`, e.g. `swt`, `swt-ci`, or `swe`, specifies the evaluation mode. Default is `swe`.
> [!CAUTION]
> Setting `num_workers` larger than 1 is not officially tested, YMMV.
There is also one optional environment variable you can set.
There are also optional environment variables you can set:
```bash
export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Default to false. Ignore this if you are not sure.
# Use hint text in the evaluation (default: false)
export USE_HINT_TEXT=true # Ignore this if you are not sure.
# Specify a condenser configuration for memory management (default: NoOpCondenser)
export EVAL_CONDENSER=summarizer_for_eval # Name of the condenser config group in config.toml
```
Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and CodeActAgent,

View File

@@ -44,6 +44,8 @@ from openhands.core.config import (
get_llm_config_arg,
get_parser,
)
from openhands.core.config.utils import get_condenser_config_arg
from openhands.core.config.condenser_config import NoOpCondenserConfig
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.critic import AgentFinishedCritic
@@ -756,6 +758,7 @@ if __name__ == '__main__':
choices=['swe', 'swt', 'swt-ci'],
help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
)
args, _ = parser.parse_known_args()
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
@@ -792,6 +795,19 @@ if __name__ == '__main__':
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
# Get condenser config from environment variable
condenser_name = os.environ.get('EVAL_CONDENSER')
if condenser_name:
condenser_config = get_condenser_config_arg(condenser_name)
if condenser_config is None:
raise ValueError(
f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}'
)
else:
# If no specific condenser config is provided via env var, default to NoOpCondenser
condenser_config = NoOpCondenserConfig()
logger.debug('No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.')
details = {'mode': args.mode}
_agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
@@ -806,6 +822,7 @@ if __name__ == '__main__':
args.eval_note,
args.eval_output_dir,
details=details,
condenser_config=condenser_config,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')

View File

@@ -14,6 +14,7 @@ SPLIT=$8
N_RUNS=$9
MODE=${10}
if [ -z "$NUM_WORKERS" ]; then
NUM_WORKERS=1
echo "Number of workers not specified, use default $NUM_WORKERS"
@@ -51,6 +52,12 @@ if [ -z "$MODE" ]; then
echo "MODE not specified, use default $MODE"
fi
if [ -n "$EVAL_CONDENSER" ]; then
echo "Using Condenser Config: $EVAL_CONDENSER"
else
echo "No Condenser Config provided via EVAL_CONDENSER, use default (NoOpCondenser)."
fi
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
@@ -65,6 +72,7 @@ echo "MAX_ITER: $MAX_ITER"
echo "NUM_WORKERS: $NUM_WORKERS"
echo "COMMIT_HASH: $COMMIT_HASH"
echo "MODE: $MODE"
echo "EVAL_CONDENSER: $EVAL_CONDENSER"
# Default to NOT use Hint
if [ -z "$USE_HINT_TEXT" ]; then
@@ -88,6 +96,10 @@ fi
if [ "$MODE" != "swe" ]; then
EVAL_NOTE="${EVAL_NOTE}-${MODE}"
fi
# Add condenser config to eval note if provided
if [ -n "$EVAL_CONDENSER" ]; then
EVAL_NOTE="${EVAL_NOTE}-${EVAL_CONDENSER}"
fi
function run_eval() {
local eval_note="${1}"
@@ -101,6 +113,8 @@ function run_eval() {
--split $SPLIT \
--mode $MODE"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"

View File

@@ -16,7 +16,11 @@ from openhands import __version__
from openhands.core import logger
from openhands.core.config.agent_config import AgentConfig
from openhands.core.config.app_config import AppConfig
from openhands.core.config.condenser_config import condenser_config_from_toml_section
from openhands.core.config.condenser_config import (
CondenserConfig,
condenser_config_from_toml_section,
create_condenser_config,
)
from openhands.core.config.config_utils import (
OH_DEFAULT_AGENT,
OH_MAX_ITERATIONS,
@@ -491,6 +495,118 @@ def get_llm_config_arg(
return None
def get_condenser_config_arg(
    condenser_config_arg: str, toml_file: str = 'config.toml'
) -> CondenserConfig | None:
    """Get a group of condenser settings from the config file by name.

    A group in config.toml can look like this:

    ```
    [condenser.my_summarizer]
    type = 'llm'
    llm_config = 'gpt-4o'  # References [llm.gpt-4o]
    max_size = 50
    ...
    ```

    The user-defined group name, like "my_summarizer", is the argument to this
    function. The group must be defined under the "condenser" table of the
    file, i.e. as `[condenser.<name>]`. The argument itself may be passed as
    "my_summarizer", "condenser.my_summarizer" or "[condenser.my_summarizer]";
    the brackets and the "condenser." prefix are removed before the lookup.

    When the group's type is 'llm', 'llm_attention' or 'structured' and its
    `llm_config` value is a string, that string is resolved through
    `get_llm_config_arg` against the same toml file and replaced with the
    loaded LLM config before the condenser config is built.

    Args:
        condenser_config_arg: The group of condenser settings to get from the config.toml file.
        toml_file: Path to the configuration file to read from. Defaults to 'config.toml'.

    Returns:
        The CondenserConfig built from the settings of the group, or None if
        the file cannot be read or parsed, the group is missing or is not a
        table, the "type" field is missing, a referenced LLM config cannot be
        loaded, or the settings fail validation. Every failure is logged.
    """
    # Keep only the bare group name: drop surrounding brackets and the
    # optional "condenser." prefix.
    condenser_config_arg = condenser_config_arg.strip('[]').removeprefix('condenser.')
    logger.openhands_logger.debug(
        f'Loading condenser config [{condenser_config_arg}] from {toml_file}'
    )

    # Load the toml file.
    try:
        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
            toml_config = toml.load(toml_contents)
    except FileNotFoundError as e:
        logger.openhands_logger.error(f'Config file not found: {toml_file}. Error: {e}')
        return None
    except toml.TomlDecodeError as e:
        logger.openhands_logger.error(
            f'Cannot parse condenser group [{condenser_config_arg}] from {toml_file}. Exception: {e}'
        )
        return None

    # Check that the condenser table and the requested group exist. The
    # isinstance guard keeps a scalar `condenser = ...` entry from being
    # treated as a container of groups.
    condenser_section = toml_config.get('condenser')
    if (
        not isinstance(condenser_section, dict)
        or condenser_config_arg not in condenser_section
    ):
        logger.openhands_logger.error(
            f'Condenser config section [condenser.{condenser_config_arg}] not found in {toml_file}'
        )
        return None

    # With a flat `[condenser]` table (e.g. `type = "noop"`), a name such as
    # "type" resolves to a scalar rather than a settings table; report that
    # instead of crashing on `.copy()`.
    group_settings = condenser_section[condenser_config_arg]
    if not isinstance(group_settings, dict):
        logger.openhands_logger.error(
            f'Condenser config section [condenser.{condenser_config_arg}] in {toml_file} is not a table of settings'
        )
        return None

    # Work on a copy so the LLM reference can be replaced without touching
    # the parsed toml data.
    condenser_data = group_settings.copy()

    # Determine the type and handle potential LLM dependency.
    condenser_type = condenser_data.get('type')
    if not condenser_type:
        logger.openhands_logger.error(
            f'Missing "type" field in [condenser.{condenser_config_arg}] section of {toml_file}'
        )
        return None

    # Handle LLM config reference if needed, using get_llm_config_arg.
    if (
        condenser_type in ('llm', 'llm_attention', 'structured')
        and 'llm_config' in condenser_data
        and isinstance(condenser_data['llm_config'], str)
    ):
        llm_config_name = condenser_data['llm_config']
        logger.openhands_logger.debug(
            f'Condenser [{condenser_config_arg}] requires LLM config [{llm_config_name}]. Loading it...'
        )
        # Use the existing function to load the specific LLM config.
        referenced_llm_config = get_llm_config_arg(llm_config_name, toml_file=toml_file)
        if not referenced_llm_config:
            logger.openhands_logger.error(
                f"Failed to load required LLM config '{llm_config_name}' for condenser '{condenser_config_arg}'."
            )
            return None
        # Replace the string reference with the actual LLM config object.
        condenser_data['llm_config'] = referenced_llm_config

    # Create the condenser config instance; only the construction itself is
    # expected to raise validation errors.
    try:
        config = create_condenser_config(condenser_type, condenser_data)
    except (ValidationError, ValueError) as e:
        logger.openhands_logger.error(
            f'Invalid condenser configuration for [{condenser_config_arg}]: {e}.'
        )
        return None

    logger.openhands_logger.info(
        f'Successfully loaded condenser config [{condenser_config_arg}] from {toml_file}'
    )
    return config
# Command line arguments
def get_parser() -> argparse.ArgumentParser:
"""Get the argument parser."""