mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-01-10 07:18:10 -05:00
specify condenser config for evals (#8177)
Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
@@ -42,6 +42,37 @@ api_key = "XXX"
|
||||
temperature = 0.0
|
||||
```
|
||||
|
||||
### Configuring Condensers for Evaluation
|
||||
|
||||
For benchmarks that support condenser configuration (like SWE-Bench), you can define multiple condenser configurations in your `config.toml` file. A condenser is responsible for managing conversation history to maintain context while staying within token limits - you can learn more about how it works [here](https://www.all-hands.dev/blog/openhands-context-condensensation-for-more-efficient-ai-agents):
|
||||
|
||||
```toml
|
||||
# LLM-based summarizing condenser for evaluation
|
||||
[condenser.summarizer_for_eval]
|
||||
type = "llm"
|
||||
llm_config = "haiku" # Reference to an LLM config to use for summarization
|
||||
keep_first = 2 # Number of initial events to always keep
|
||||
max_size = 100 # Maximum size of history before triggering summarization
|
||||
|
||||
# Recent events condenser for evaluation
|
||||
[condenser.recent_for_eval]
|
||||
type = "recent"
|
||||
keep_first = 2 # Number of initial events to always keep
|
||||
max_events = 50 # Maximum number of events to keep in history
|
||||
```
|
||||
|
||||
You can then specify which condenser configuration to use when running evaluation scripts, for example:
|
||||
|
||||
```bash
|
||||
EVAL_CONDENSER=summarizer_for_eval \
|
||||
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 500 100 1 princeton-nlp/SWE-bench_Verified test
|
||||
```
|
||||
|
||||
The name is up to you, but should match a name defined in your `config.toml` file. The last argument in the command specifies the condenser configuration to use. In this case, `summarizer_for_eval` is used, which refers to the LLM-based summarizing condenser as defined above.
|
||||
|
||||
If no condenser configuration is specified, the 'noop' condenser will be used by default, which keeps the full conversation history.
|
||||
```
|
||||
|
||||
For other configurations specific to evaluation, such as `save_trajectory_path`, these are typically set in the `get_config` function of the respective `run_infer.py` file for each benchmark.
|
||||
|
||||
## Supported Benchmarks
|
||||
|
||||
@@ -45,7 +45,7 @@ For example, for instance ID `django_django-11011`, it will try to pull our pre-
|
||||
This image will be used create an OpenHands runtime image where the agent will operate on.
|
||||
|
||||
```bash
|
||||
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
|
||||
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] [n_runs] [mode]
|
||||
|
||||
# Example
|
||||
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 500 100 1 princeton-nlp/SWE-bench_Verified test
|
||||
@@ -69,13 +69,20 @@ default, it is set to 1.
|
||||
- `dataset`, a huggingface dataset name. e.g. `princeton-nlp/SWE-bench`, `princeton-nlp/SWE-bench_Lite`, `princeton-nlp/SWE-bench_Verified`, or `princeton-nlp/SWE-bench_Multimodal`, specifies which dataset to evaluate on.
|
||||
- `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`.
|
||||
|
||||
- `n_runs`, e.g. `3`, is the number of times to run the evaluation. Default is 1.
|
||||
- `mode`, e.g. `swt`, `swt-ci`, or `swe`, specifies the evaluation mode. Default is `swe`.
|
||||
|
||||
> [!CAUTION]
|
||||
> Setting `num_workers` larger than 1 is not officially tested, YMMV.
|
||||
|
||||
There is also one optional environment variable you can set.
|
||||
There are also optional environment variables you can set:
|
||||
|
||||
```bash
|
||||
export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Default to false. Ignore this if you are not sure.
|
||||
# Use hint text in the evaluation (default: false)
|
||||
export USE_HINT_TEXT=true # Ignore this if you are not sure.
|
||||
|
||||
# Specify a condenser configuration for memory management (default: NoOpCondenser)
|
||||
export EVAL_CONDENSER=summarizer_for_eval # Name of the condenser config group in config.toml
|
||||
```
|
||||
|
||||
Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and CodeActAgent,
|
||||
|
||||
@@ -44,6 +44,8 @@ from openhands.core.config import (
|
||||
get_llm_config_arg,
|
||||
get_parser,
|
||||
)
|
||||
from openhands.core.config.utils import get_condenser_config_arg
|
||||
from openhands.core.config.condenser_config import NoOpCondenserConfig
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.critic import AgentFinishedCritic
|
||||
@@ -756,6 +758,7 @@ if __name__ == '__main__':
|
||||
choices=['swe', 'swt', 'swt-ci'],
|
||||
help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
|
||||
)
|
||||
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
@@ -792,6 +795,19 @@ if __name__ == '__main__':
|
||||
if llm_config is None:
|
||||
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
|
||||
|
||||
# Get condenser config from environment variable
|
||||
condenser_name = os.environ.get('EVAL_CONDENSER')
|
||||
if condenser_name:
|
||||
condenser_config = get_condenser_config_arg(condenser_name)
|
||||
if condenser_config is None:
|
||||
raise ValueError(
|
||||
f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}'
|
||||
)
|
||||
else:
|
||||
# If no specific condenser config is provided via env var, default to NoOpCondenser
|
||||
condenser_config = NoOpCondenserConfig()
|
||||
logger.debug('No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.')
|
||||
|
||||
details = {'mode': args.mode}
|
||||
_agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
|
||||
|
||||
@@ -806,6 +822,7 @@ if __name__ == '__main__':
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
details=details,
|
||||
condenser_config=condenser_config,
|
||||
)
|
||||
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
|
||||
@@ -14,6 +14,7 @@ SPLIT=$8
|
||||
N_RUNS=$9
|
||||
MODE=${10}
|
||||
|
||||
|
||||
if [ -z "$NUM_WORKERS" ]; then
|
||||
NUM_WORKERS=1
|
||||
echo "Number of workers not specified, use default $NUM_WORKERS"
|
||||
@@ -51,6 +52,12 @@ if [ -z "$MODE" ]; then
|
||||
echo "MODE not specified, use default $MODE"
|
||||
fi
|
||||
|
||||
if [ -n "$EVAL_CONDENSER" ]; then
|
||||
echo "Using Condenser Config: $EVAL_CONDENSER"
|
||||
else
|
||||
echo "No Condenser Config provided via EVAL_CONDENSER, use default (NoOpCondenser)."
|
||||
fi
|
||||
|
||||
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
|
||||
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
|
||||
|
||||
@@ -65,6 +72,7 @@ echo "MAX_ITER: $MAX_ITER"
|
||||
echo "NUM_WORKERS: $NUM_WORKERS"
|
||||
echo "COMMIT_HASH: $COMMIT_HASH"
|
||||
echo "MODE: $MODE"
|
||||
echo "EVAL_CONDENSER: $EVAL_CONDENSER"
|
||||
|
||||
# Default to NOT use Hint
|
||||
if [ -z "$USE_HINT_TEXT" ]; then
|
||||
@@ -88,6 +96,10 @@ fi
|
||||
if [ "$MODE" != "swe" ]; then
|
||||
EVAL_NOTE="${EVAL_NOTE}-${MODE}"
|
||||
fi
|
||||
# Add condenser config to eval note if provided
|
||||
if [ -n "$EVAL_CONDENSER" ]; then
|
||||
EVAL_NOTE="${EVAL_NOTE}-${EVAL_CONDENSER}"
|
||||
fi
|
||||
|
||||
function run_eval() {
|
||||
local eval_note="${1}"
|
||||
@@ -101,6 +113,8 @@ function run_eval() {
|
||||
--split $SPLIT \
|
||||
--mode $MODE"
|
||||
|
||||
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
|
||||
|
||||
@@ -16,7 +16,11 @@ from openhands import __version__
|
||||
from openhands.core import logger
|
||||
from openhands.core.config.agent_config import AgentConfig
|
||||
from openhands.core.config.app_config import AppConfig
|
||||
from openhands.core.config.condenser_config import condenser_config_from_toml_section
|
||||
from openhands.core.config.condenser_config import (
|
||||
CondenserConfig,
|
||||
condenser_config_from_toml_section,
|
||||
create_condenser_config,
|
||||
)
|
||||
from openhands.core.config.config_utils import (
|
||||
OH_DEFAULT_AGENT,
|
||||
OH_MAX_ITERATIONS,
|
||||
@@ -491,6 +495,118 @@ def get_llm_config_arg(
|
||||
return None
|
||||
|
||||
|
||||
def get_condenser_config_arg(
|
||||
condenser_config_arg: str, toml_file: str = 'config.toml'
|
||||
) -> CondenserConfig | None:
|
||||
"""Get a group of condenser settings from the config file by name.
|
||||
|
||||
A group in config.toml can look like this:
|
||||
|
||||
```
|
||||
[condenser.my_summarizer]
|
||||
type = 'llm'
|
||||
llm_config = 'gpt-4o' # References [llm.gpt-4o]
|
||||
max_size = 50
|
||||
...
|
||||
```
|
||||
|
||||
The user-defined group name, like "my_summarizer", is the argument to this function.
|
||||
The function will load the CondenserConfig object with the settings of this group,
|
||||
from the config file.
|
||||
|
||||
Note that the group must be under the "condenser" group, or in other words,
|
||||
the group name must start with "condenser.".
|
||||
|
||||
Args:
|
||||
condenser_config_arg: The group of condenser settings to get from the config.toml file.
|
||||
toml_file: Path to the configuration file to read from. Defaults to 'config.toml'.
|
||||
|
||||
Returns:
|
||||
CondenserConfig: The CondenserConfig object with the settings from the config file, or None if not found/error.
|
||||
"""
|
||||
# keep only the name, just in case
|
||||
condenser_config_arg = condenser_config_arg.strip('[]')
|
||||
|
||||
# truncate the prefix, just in case
|
||||
if condenser_config_arg.startswith('condenser.'):
|
||||
condenser_config_arg = condenser_config_arg[10:]
|
||||
|
||||
logger.openhands_logger.debug(
|
||||
f'Loading condenser config [{condenser_config_arg}] from {toml_file}'
|
||||
)
|
||||
|
||||
# load the toml file
|
||||
try:
|
||||
with open(toml_file, 'r', encoding='utf-8') as toml_contents:
|
||||
toml_config = toml.load(toml_contents)
|
||||
except FileNotFoundError as e:
|
||||
logger.openhands_logger.error(f'Config file not found: {toml_file}. Error: {e}')
|
||||
return None
|
||||
except toml.TomlDecodeError as e:
|
||||
logger.openhands_logger.error(
|
||||
f'Cannot parse condenser group [{condenser_config_arg}] from {toml_file}. Exception: {e}'
|
||||
)
|
||||
return None
|
||||
|
||||
# Check if the condenser section and the specific config exist
|
||||
if (
|
||||
'condenser' not in toml_config
|
||||
or condenser_config_arg not in toml_config['condenser']
|
||||
):
|
||||
logger.openhands_logger.error(
|
||||
f'Condenser config section [condenser.{condenser_config_arg}] not found in {toml_file}'
|
||||
)
|
||||
return None
|
||||
|
||||
condenser_data = toml_config['condenser'][
|
||||
condenser_config_arg
|
||||
].copy() # Use copy to modify
|
||||
|
||||
# Determine the type and handle potential LLM dependency
|
||||
condenser_type = condenser_data.get('type')
|
||||
if not condenser_type:
|
||||
logger.openhands_logger.error(
|
||||
f'Missing "type" field in [condenser.{condenser_config_arg}] section of {toml_file}'
|
||||
)
|
||||
return None
|
||||
|
||||
# Handle LLM config reference if needed, using get_llm_config_arg
|
||||
if (
|
||||
condenser_type in ('llm', 'llm_attention', 'structured')
|
||||
and 'llm_config' in condenser_data
|
||||
and isinstance(condenser_data['llm_config'], str)
|
||||
):
|
||||
llm_config_name = condenser_data['llm_config']
|
||||
logger.openhands_logger.debug(
|
||||
f'Condenser [{condenser_config_arg}] requires LLM config [{llm_config_name}]. Loading it...'
|
||||
)
|
||||
# Use the existing function to load the specific LLM config
|
||||
referenced_llm_config = get_llm_config_arg(llm_config_name, toml_file=toml_file)
|
||||
|
||||
if referenced_llm_config:
|
||||
# Replace the string reference with the actual LLMConfig object
|
||||
condenser_data['llm_config'] = referenced_llm_config
|
||||
else:
|
||||
# get_llm_config_arg already logs the error if not found
|
||||
logger.openhands_logger.error(
|
||||
f"Failed to load required LLM config '{llm_config_name}' for condenser '{condenser_config_arg}'."
|
||||
)
|
||||
return None
|
||||
|
||||
# Create the condenser config instance
|
||||
try:
|
||||
config = create_condenser_config(condenser_type, condenser_data)
|
||||
logger.openhands_logger.info(
|
||||
f'Successfully loaded condenser config [{condenser_config_arg}] from {toml_file}'
|
||||
)
|
||||
return config
|
||||
except (ValidationError, ValueError) as e:
|
||||
logger.openhands_logger.error(
|
||||
f'Invalid condenser configuration for [{condenser_config_arg}]: {e}.'
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
# Command line arguments
|
||||
def get_parser() -> argparse.ArgumentParser:
|
||||
"""Get the argument parser."""
|
||||
|
||||
Reference in New Issue
Block a user