From 8fcf0817d47b2d7fe2ee071bbef00def06dd5de5 Mon Sep 17 00:00:00 2001 From: tobitege <10787084+tobitege@users.noreply.github.com> Date: Mon, 26 Aug 2024 18:49:26 +0200 Subject: [PATCH] (eval) Aider_bench: add eval_ids arg to run specific instance id's (#3592) * add eval_ids arg to run specific instance id's; fix/extend README * fix description in parser for --eval-ids * fix test_arg_parser.py to account for added arg * fix typo in README to say "summarize" instead of "summarise" for script --- evaluation/aider_bench/README.md | 43 ++++++++++++--------- evaluation/aider_bench/run_infer.py | 11 +++++- evaluation/aider_bench/scripts/run_infer.sh | 6 +++ evaluation/utils/shared.py | 13 ++++++- openhands/core/config.py | 6 +++ poetry.lock | 2 +- pyproject.toml | 3 +- tests/unit/test_arg_parser.py | 5 ++- 8 files changed, 63 insertions(+), 26 deletions(-) diff --git a/evaluation/aider_bench/README.md b/evaluation/aider_bench/README.md index 6947133603..3aafc410a1 100644 --- a/evaluation/aider_bench/README.md +++ b/evaluation/aider_bench/README.md @@ -16,42 +16,49 @@ development environment and LLM. ## Start the evaluation ```bash -./evaluation/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] +./evaluation/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] ``` -- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for +- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`. -- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version - you would like to evaluate. It could also be a release tag like `0.6.2`. -- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, +- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version + you would like to evaluate. It could also be a release tag like `0.9.0`. +- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting to `CodeActAgent`. -- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` +- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default, the script evaluates the entire Exercism test set (133 issues). Note: in order to use `eval_limit`, you must also set `agent`. +- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`. +- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the + given IDs (comma separated). Following is the basic command to start the evaluation. You can update the arguments in the script -`evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, -`--eval-num-workers` and so on. +`evaluation/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`, +`--eval-num-workers` and so on: -- `--agent-cls`, the agent to use. For example, `CodeActAgent`. -- `--llm-config`: the LLM configuration to use. For example, - `eval_gpt4_1106_preview`. -- `--max-iterations`: the number of iterations to run the evaluation. For - example, `30`. -- `--eval-num-workers`: the number of workers to use for evaluation. For - example, `5`. -- `--eval-n-limit`: the number of examples to evaluate. For example, `100`. +- `--agent-cls`, the agent to use. For example, `CodeActAgent`. +- `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`. +- `--max-iterations`: the max allowed number of iterations to run the evaluation. Default: `30`. 
+- `--eval-num-workers`: the number of workers to use for evaluation. Default: `1`. +- `--eval-n-limit`: the number of examples to evaluate. For example, `100`. +- `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`. ```bash -./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1 +./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10" ``` ## Summarize Results ```bash -poetry run python ./evaluation/agent_bench/scripts/summarise_results.py [path_to_output_jsonl_file] +poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] +``` + +Full example: + +```bash +poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl ``` This will list the instances that passed and the instances that failed. For each diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py index 6bfc9a6360..729ff99ac6 100644 --- a/evaluation/aider_bench/run_infer.py +++ b/evaluation/aider_bench/run_infer.py @@ -245,7 +245,16 @@ if __name__ == '__main__': args.eval_output_dir, ) output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') - instances = prepare_dataset(aider_bench_tests, output_file, args.eval_n_limit) + + # Parse dataset IDs if provided + eval_ids = None + if args.eval_ids: + eval_ids = str(args.eval_ids).split(',') + logger.info(f'Using specific dataset IDs: {eval_ids}') + + instances = prepare_dataset( + aider_bench_tests, output_file, args.eval_n_limit, eval_ids=eval_ids + ) asyncio.run( run_evaluation( diff --git a/evaluation/aider_bench/scripts/run_infer.sh b/evaluation/aider_bench/scripts/run_infer.sh index b43607b3c7..1982a579aa 100755 --- a/evaluation/aider_bench/scripts/run_infer.sh +++ b/evaluation/aider_bench/scripts/run_infer.sh @@ -8,6 +8,7 @@ COMMIT_HASH=$2 AGENT=$3 EVAL_LIMIT=$4 NUM_WORKERS=$5 +EVAL_IDS=$6 if [ -z "$NUM_WORKERS" ]; then NUM_WORKERS=1 @@ -39,5 +40,10 @@ if [ -n "$EVAL_LIMIT" ]; then COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" fi +if [ -n "$EVAL_IDS" ]; then + echo "EVAL_IDS: $EVAL_IDS" + COMMAND="$COMMAND --eval-ids $EVAL_IDS" +fi + # Run the command eval $COMMAND diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index a27ff5e3c1..d841a7b29a 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -164,7 +164,12 @@ def make_metadata( return metadata -def prepare_dataset(dataset: pd.DataFrame, output_file: str, eval_n_limit: int): +def prepare_dataset( + dataset: pd.DataFrame, + output_file: str, + eval_n_limit: int, + eval_ids: list[str] | None = None, +): assert ( 'instance_id' in dataset.columns ), "Expected 'instance_id' column in the dataset. You should define your own unique identifier for each instance and use it as the 'instance_id' column." @@ -180,7 +185,11 @@ def prepare_dataset(dataset: pd.DataFrame, output_file: str, eval_n_limit: int): f'Output file {output_file} already exists. Loaded {len(finished_ids)} finished instances.' 
) - if eval_n_limit: + if eval_ids: + eval_ids_converted = [dataset[id_column].dtype.type(id) for id in eval_ids] + dataset = dataset[dataset[id_column].isin(eval_ids_converted)] + logger.info(f'Limiting evaluation to {len(eval_ids)} specific instances.') + elif eval_n_limit: dataset = dataset.head(eval_n_limit) logger.info(f'Limiting evaluation to first {eval_n_limit} instances.') diff --git a/openhands/core/config.py b/openhands/core/config.py index 97343df1a1..bd9b70db5e 100644 --- a/openhands/core/config.py +++ b/openhands/core/config.py @@ -740,6 +740,12 @@ def get_parser() -> argparse.ArgumentParser: type=str, help='Name for the session', ) + parser.add_argument( + '--eval-ids', + default=None, + type=str, + help='The comma-separated list (in quotes) of IDs of the instances to evaluate', + ) return parser diff --git a/poetry.lock b/poetry.lock index ea246ce2ae..f1c98a7634 100644 --- a/poetry.lock +++ b/poetry.lock @@ -9457,4 +9457,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "ea650e78171ccd3088112c232ca9b09b180db502bc45a22feaf313acfeaf83b6" +content-hash = "f6abf770480dfd3a739d3d0b4499b601df44f130b27684f34b5f6791950e99d8" diff --git a/pyproject.toml b/pyproject.toml index 1615bf5fa0..98d66ca0e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ PyPDF2 = "*" python-pptx = "*" pylatexenc = "*" tornado = "*" +python-dotenv = "*" [tool.poetry.group.llama-index.dependencies] llama-index = "*" @@ -82,7 +83,6 @@ reportlab = "*" [tool.coverage.run] concurrency = ["gevent"] - [tool.poetry.group.runtime.dependencies] jupyterlab = "*" notebook = "*" @@ -113,7 +113,6 @@ ignore = ["D1"] [tool.ruff.lint.pydocstyle] convention = "google" - [tool.poetry.group.evaluation.dependencies] streamlit = "*" whatthepatch = "*" diff --git a/tests/unit/test_arg_parser.py b/tests/unit/test_arg_parser.py index 19fb3f0e9f..8f7def8dec 100644 --- a/tests/unit/test_arg_parser.py +++ b/tests/unit/test_arg_parser.py @@ -104,7 +104,7 @@ def test_help_message(capsys): parser.parse_args(['--help']) captured = capsys.readouterr() help_output = captured.out - + print(help_output) expected_elements = [ 'usage:', 'Run an agent with a specific task', @@ -120,6 +120,7 @@ def test_help_message(capsys): '--eval-n-limit EVAL_N_LIMIT', '--eval-num-workers EVAL_NUM_WORKERS', '--eval-note EVAL_NOTE', + '--eval-ids EVAL_IDS', '-l LLM_CONFIG, --llm-config LLM_CONFIG', '-n NAME, --name NAME', ] @@ -128,4 +129,4 @@ def test_help_message(capsys): assert element in help_output, f"Expected '{element}' to be in the help message" option_count = help_output.count(' -') - assert option_count == 13, f'Expected 13 options, found {option_count}' + assert option_count == 14, f'Expected 14 options, found {option_count}'
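
---

For reference, the `--eval-ids` flow introduced by this patch can be exercised in isolation. The sketch below (not part of the patch) mirrors the argparse option added to `openhands/core/config.py` and the comma-splitting done in `evaluation/aider_bench/run_infer.py`; the hard-coded `['--eval-ids', '1,3,10']` argv is only an illustration of what `run_infer.sh` would pass through.

```python
import argparse

# Re-creation of the --eval-ids option added in openhands/core/config.py,
# plus the split into a list as done in evaluation/aider_bench/run_infer.py.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--eval-ids',
    default=None,
    type=str,
    help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
)

# Illustrative argv; in practice run_infer.sh appends "--eval-ids $EVAL_IDS".
args = parser.parse_args(['--eval-ids', '1,3,10'])

eval_ids = None
if args.eval_ids:
    eval_ids = str(args.eval_ids).split(',')

print(eval_ids)  # ['1', '3', '10']
```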
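The filtering itself happens in `prepare_dataset` (`evaluation/utils/shared.py`), which converts each string ID from the CLI to the dtype of the dataset's ID column before applying `isin`. A minimal standalone sketch of that behavior, assuming a toy DataFrame with an `instance_id` column (the real AiderBench dataset is loaded in `run_infer.py`):

```python
import pandas as pd

def filter_by_eval_ids(dataset: pd.DataFrame, eval_ids: list[str] | None,
                       id_column: str = 'instance_id') -> pd.DataFrame:
    """Keep only rows whose ID is in eval_ids, converting the string IDs
    from the command line to the dtype of the ID column first."""
    if not eval_ids:
        return dataset
    converted = [dataset[id_column].dtype.type(i) for i in eval_ids]
    return dataset[dataset[id_column].isin(converted)]

# Toy data for illustration only.
tests = pd.DataFrame({'instance_id': [1, 2, 3, 10], 'name': ['a', 'b', 'c', 'd']})
print(filter_by_eval_ids(tests, ['1', '3', '10']))  # keeps instance_id 1, 3 and 10
```

Going through `dataset[id_column].dtype.type` is what lets a numeric `instance_id` column match the string IDs coming from `--eval-ids`.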