From 8fcf0817d47b2d7fe2ee071bbef00def06dd5de5 Mon Sep 17 00:00:00 2001 From: tobitege <10787084+tobitege@users.noreply.github.com> Date: Mon, 26 Aug 2024 18:49:26 +0200 Subject: [PATCH] (eval) Aider_bench: add eval_ids arg to run specific instance id's (#3592) * add eval_ids arg to run specific instance id's; fix/extend README * fix description in parser for --eval-ids * fix test_arg_parser.py to account for added arg * fix typo in README to say "summarize" instead of "summarise" for script --- evaluation/aider_bench/README.md | 43 ++++++++++++--------- evaluation/aider_bench/run_infer.py | 11 +++++- evaluation/aider_bench/scripts/run_infer.sh | 6 +++ evaluation/utils/shared.py | 13 ++++++- openhands/core/config.py | 6 +++ poetry.lock | 2 +- pyproject.toml | 3 +- tests/unit/test_arg_parser.py | 5 ++- 8 files changed, 63 insertions(+), 26 deletions(-) diff --git a/evaluation/aider_bench/README.md b/evaluation/aider_bench/README.md index 6947133603..3aafc410a1 100644 --- a/evaluation/aider_bench/README.md +++ b/evaluation/aider_bench/README.md @@ -16,42 +16,49 @@ development environment and LLM. ## Start the evaluation ```bash -./evaluation/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +./evaluation/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] ``` -- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for +- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`. -- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version - you would like to evaluate. It could also be a release tag like `0.6.2`. -- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, +- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version + you would like to evaluate. It could also be a release tag like `0.9.0`. +- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting to `CodeActAgent`. -- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` +- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default, the script evaluates the entire Exercism test set (133 issues). Note: in order to use `eval_limit`, you must also set `agent`. +- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`. +- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the + given IDs (comma separated). Following is the basic command to start the evaluation. You can update the arguments in the script -`evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, -`--eval-num-workers` and so on. +`evaluation/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`, +`--eval-num-workers` and so on: -- `--agent-cls`, the agent to use. For example, `CodeActAgent`. -- `--llm-config`: the LLM configuration to use. For example, - `eval_gpt4_1106_preview`. -- `--max-iterations`: the number of iterations to run the evaluation. For - example, `30`. -- `--eval-num-workers`: the number of workers to use for evaluation. For - example, `5`. -- `--eval-n-limit`: the number of examples to evaluate. For example, `100`. +- `--agent-cls`, the agent to use. For example, `CodeActAgent`. +- `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`. +- `--max-iterations`: the max allowed number of iterations to run the evaluation. Default: `30`. 
+- `--eval-num-workers`: the number of workers to use for evaluation. Default: `1`. +- `--eval-n-limit`: the number of examples to evaluate. For example, `100`. +- `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`. ```bash -./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1 +./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" ``` ## Summarize Results ```bash -poetry run python ./evaluation/agent_bench/scripts/summarise_results.py [path_to_output_jsonl_file] +poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] +``` + +Full example: + +```bash +poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl ``` This will list the instances that passed and the instances that failed. For each diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py index 6bfc9a6360..729ff99ac6 100644 --- a/evaluation/aider_bench/run_infer.py +++ b/evaluation/aider_bench/run_infer.py @@ -245,7 +245,16 @@ if __name__ == '__main__': args.eval_output_dir, ) output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') - instances = prepare_dataset(aider_bench_tests, output_file, args.eval_n_limit) + + # Parse dataset IDs if provided + eval_ids = None + if args.eval_ids: + eval_ids = str(args.eval_ids).split(',') + logger.info(f'Using specific dataset IDs: {eval_ids}') + + instances = prepare_dataset( + aider_bench_tests, output_file, args.eval_n_limit, eval_ids=eval_ids + ) asyncio.run( run_evaluation( diff --git a/evaluation/aider_bench/scripts/run_infer.sh b/evaluation/aider_bench/scripts/run_infer.sh index b43607b3c7..1982a579aa 100755 --- a/evaluation/aider_bench/scripts/run_infer.sh +++ b/evaluation/aider_bench/scripts/run_infer.sh @@ -8,6 +8,7 @@ COMMIT_HASH=$2 AGENT=$3 EVAL_LIMIT=$4 NUM_WORKERS=$5 +EVAL_IDS=$6 if [ -z "$NUM_WORKERS" ]; then NUM_WORKERS=1 @@ -39,5 +40,10 @@ if [ -n "$EVAL_LIMIT" ]; then COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" fi +if [ -n "$EVAL_IDS" ]; then + echo "EVAL_IDS: $EVAL_IDS" + COMMAND="$COMMAND --eval-ids $EVAL_IDS" +fi + # Run the command eval $COMMAND diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index a27ff5e3c1..d841a7b29a 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -164,7 +164,12 @@ def make_metadata( return metadata -def prepare_dataset(dataset: pd.DataFrame, output_file: str, eval_n_limit: int): +def prepare_dataset( + dataset: pd.DataFrame, + output_file: str, + eval_n_limit: int, + eval_ids: list[str] | None = None, +): assert ( 'instance_id' in dataset.columns ), "Expected 'instance_id' column in the dataset. You should define your own unique identifier for each instance and use it as the 'instance_id' column." @@ -180,7 +185,11 @@ def prepare_dataset(dataset: pd.DataFrame, output_file: str, eval_n_limit: int): f'Output file {output_file} already exists. Loaded {len(finished_ids)} finished instances.' 
) - if eval_n_limit: + if eval_ids: + eval_ids_converted = [dataset[id_column].dtype.type(id) for id in eval_ids] + dataset = dataset[dataset[id_column].isin(eval_ids_converted)] + logger.info(f'Limiting evaluation to {len(eval_ids)} specific instances.') + elif eval_n_limit: dataset = dataset.head(eval_n_limit) logger.info(f'Limiting evaluation to first {eval_n_limit} instances.') diff --git a/openhands/core/config.py b/openhands/core/config.py index 97343df1a1..bd9b70db5e 100644 --- a/openhands/core/config.py +++ b/openhands/core/config.py @@ -740,6 +740,12 @@ def get_parser() -> argparse.ArgumentParser: type=str, help='Name for the session', ) + parser.add_argument( + '--eval-ids', + default=None, + type=str, + help='The comma-separated list (in quotes) of IDs of the instances to evaluate', + ) return parser diff --git a/poetry.lock b/poetry.lock index ea246ce2ae..f1c98a7634 100644 --- a/poetry.lock +++ b/poetry.lock @@ -9457,4 +9457,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "ea650e78171ccd3088112c232ca9b09b180db502bc45a22feaf313acfeaf83b6" +content-hash = "f6abf770480dfd3a739d3d0b4499b601df44f130b27684f34b5f6791950e99d8" diff --git a/pyproject.toml b/pyproject.toml index 1615bf5fa0..98d66ca0e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ PyPDF2 = "*" python-pptx = "*" pylatexenc = "*" tornado = "*" +python-dotenv = "*" [tool.poetry.group.llama-index.dependencies] llama-index = "*" @@ -82,7 +83,6 @@ reportlab = "*" [tool.coverage.run] concurrency = ["gevent"] - [tool.poetry.group.runtime.dependencies] jupyterlab = "*" notebook = "*" @@ -113,7 +113,6 @@ ignore = ["D1"] [tool.ruff.lint.pydocstyle] convention = "google" - [tool.poetry.group.evaluation.dependencies] streamlit = "*" whatthepatch = "*" diff --git a/tests/unit/test_arg_parser.py b/tests/unit/test_arg_parser.py index 19fb3f0e9f..8f7def8dec 100644 --- a/tests/unit/test_arg_parser.py +++ b/tests/unit/test_arg_parser.py @@ -104,7 +104,7 @@ def test_help_message(capsys): parser.parse_args(['--help']) captured = capsys.readouterr() help_output = captured.out - + print(help_output) expected_elements = [ 'usage:', 'Run an agent with a specific task', @@ -120,6 +120,7 @@ def test_help_message(capsys): '--eval-n-limit EVAL_N_LIMIT', '--eval-num-workers EVAL_NUM_WORKERS', '--eval-note EVAL_NOTE', + '--eval-ids EVAL_IDS', '-l LLM_CONFIG, --llm-config LLM_CONFIG', '-n NAME, --name NAME', ] @@ -128,4 +129,4 @@ def test_help_message(capsys): assert element in help_output, f"Expected '{element}' to be in the help message" option_count = help_output.count(' -') - assert option_count == 13, f'Expected 13 options, found {option_count}' + assert option_count == 14, f'Expected 14 options, found {option_count}'
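
---

For reference, the `--eval-ids` flow introduced by this patch can be exercised in isolation. The sketch below (not part of the patch) mirrors the argparse option added to `openhands/core/config.py` and the comma-splitting done in `evaluation/aider_bench/run_infer.py`; the hard-coded `['--eval-ids', '1,3,10']` argv is only an illustration of what `run_infer.sh` would pass through.

```python
import argparse

# Re-creation of the --eval-ids option added in openhands/core/config.py,
# plus the split into a list as done in evaluation/aider_bench/run_infer.py.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--eval-ids',
    default=None,
    type=str,
    help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
)

# Illustrative argv; in practice run_infer.sh appends "--eval-ids $EVAL_IDS".
args = parser.parse_args(['--eval-ids', '1,3,10'])

eval_ids = None
if args.eval_ids:
    eval_ids = str(args.eval_ids).split(',')

print(eval_ids)  # ['1', '3', '10']
```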
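The filtering itself happens in `prepare_dataset` (`evaluation/utils/shared.py`), which converts each string ID from the CLI to the dtype of the dataset's ID column before applying `isin`. A minimal standalone sketch of that behavior, assuming a toy DataFrame with an `instance_id` column (the real AiderBench dataset is loaded in `run_infer.py`):

```python
import pandas as pd

def filter_by_eval_ids(dataset: pd.DataFrame, eval_ids: list[str] | None,
                       id_column: str = 'instance_id') -> pd.DataFrame:
    """Keep only rows whose ID is in eval_ids, converting the string IDs
    from the command line to the dtype of the ID column first."""
    if not eval_ids:
        return dataset
    converted = [dataset[id_column].dtype.type(i) for i in eval_ids]
    return dataset[dataset[id_column].isin(converted)]

# Toy data for illustration only.
tests = pd.DataFrame({'instance_id': [1, 2, 3, 10], 'name': ['a', 'b', 'c', 'd']})
print(filter_by_eval_ids(tests, ['1', '3', '10']))  # keeps instance_id 1, 3 and 10
```

Going through `dataset[id_column].dtype.type` is what lets a numeric `instance_id` column match the string IDs coming from `--eval-ids`.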