Mirror of https://github.com/All-Hands-AI/OpenHands.git, synced 2026-01-10 07:18:10 -05:00
feat: add SWE-bench fullset support (#3477)
* feat: add SWE-bench fullset support
* fix instance image list
* update eval script and documentation
* add push script
* handle the case when ret_push is a generator
* update pbar
@@ -19,27 +19,16 @@ Please follow instructions [here](../README.md#setup) to set up your local develop
OpenHands now supports using the [official evaluation docker](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md) for both **[inference](#run-inference-on-swe-bench-instances) and [evaluation](#evaluate-generated-patches)**.

This is now the default behavior.

### Download Docker Images

**(Recommended for reproducibility)** If you have extra local space (e.g., 100GB), you can try pulling the [instance-level docker images](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level) we've prepared by running:

```bash
evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh instance
```

If you want to save some disk space (e.g., with ~50GB free) while speeding up the image pre-build process, you can pull the environment-level docker images:

```bash
evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh env
```

## Run Inference on SWE-Bench Instances

Make sure your Docker daemon is running, and you have pulled the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support).
Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depending on the SWE-Bench set you are running) for the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support).

When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Bench images. For example, for instance ID `django__django-11011`, it will try to pull our pre-built docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used to create an OpenHands runtime image in which the agent will operate.
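
For reference, pulling one such image by hand might look like the sketch below (the `xingyaoww` DockerHub namespace is the default used by the pull scripts in this repo; the exact tag is illustrative):

```bash
# Sketch: manually pull a single pre-built instance image. The double
# underscore in the instance ID (django__django-11011) is encoded as `_s_`
# in the image name.
docker pull xingyaoww/sweb.eval.x86_64.django_s_django-11011
```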

```bash
./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers]
# e.g., ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300
./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
# e.g., ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test
```

where `model_config` is mandatory, and the rest are optional.

@@ -57,6 +46,8 @@ in order to use `eval_limit`, you must also set `agent`.
default, it is set to 30.
- `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
default, it is set to 1.
- `dataset`, a huggingface dataset name, e.g. `princeton-nlp/SWE-bench` or `princeton-nlp/SWE-bench_Lite`, specifies which dataset to evaluate on.
- `dataset_split`, the split of the huggingface dataset, e.g. `test` or `dev`. Defaults to `test`.

There are also two optional environment variables you can set.
```
@@ -95,6 +86,28 @@ After running the inference, you will obtain an `output.jsonl` (by default it wil

## Evaluate Generated Patches

### Download Docker Images

**(Recommended for reproducibility)** If you have extra local space (e.g., 200GB), you can try pulling the [instance-level docker images](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level) we've prepared by running:

```bash
evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh instance
```

If you want to save some disk space (e.g., with ~50GB free) while speeding up the image pre-build process, you can pull the environment-level docker images:

```bash
evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh env
```

If you want to evaluate on the full SWE-Bench test set:

```bash
evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh instance full
```

### Run Evaluation

With the `output.jsonl` file, you can run `eval_infer.sh` to evaluate the generated patches and produce a fine-grained report.

**This evaluation is performed using the official dockerized evaluation announced [here](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md).**
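
As a rough sketch, the evaluation script takes the inference output file as its first argument (the path below is illustrative and depends on your model, agent, and eval-note settings):

```bash
# Sketch: evaluate the patches produced by a previous inference run.
./evaluation/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/output.jsonl
```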
@@ -25,8 +25,8 @@ from openhands.core.config import (
    AppConfig,
    SandboxConfig,
    get_llm_config_arg,
    get_parser,
    load_from_env,
    parse_arguments,
)
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
@@ -109,6 +109,11 @@ def get_config(
    if USE_INSTANCE_IMAGE:
        # We use a different instance image for each instance of the SWE-Bench eval
        base_container_image = get_instance_docker_image(instance['instance_id'])
        logger.info(
            f'Using instance container image: {base_container_image}. '
            f'Please make sure this image exists. '
            f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
        )
    else:
        base_container_image = SWE_BENCH_CONTAINER_IMAGE
        logger.info(f'Using swe-bench container image: {base_container_image}')
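
To toggle this behavior from the command line, `run_infer.sh` (shown further below) reads a `USE_INSTANCE_IMAGE` environment variable and exports it for `run_infer.py`; a hypothetical invocation that falls back to the single shared SWE-Bench image might look like:

```bash
# Sketch: disable per-instance images for one run. The variable name comes
# from the run_infer.sh diff below; the other arguments are as documented above.
USE_INSTANCE_IMAGE=false ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300
```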
@@ -411,12 +416,26 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:

if __name__ == '__main__':
    args = parse_arguments()
    parser = get_parser()
    parser.add_argument(
        '--dataset',
        type=str,
        default='princeton-nlp/SWE-bench',
        help='data set to evaluate on, either full-test or lite-test',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='test',
        help='split to evaluate on',
    )
    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenHands's repo
    dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
    swe_bench_tests = filter_dataset(dataset['test'].to_pandas(), 'instance_id')
    dataset = load_dataset(args.dataset, split=args.split)
    logger.info(f'Loaded dataset {args.dataset} with split {args.split}')
    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')

    llm_config = None
    if args.llm_config:
File diff suppressed because it is too large
@@ -6,19 +6,33 @@ LEVEL=$1
# - base, keyword "sweb.base"
# - env, keyword "sweb.env"
# - instance, keyword "sweb.eval"
SET=$2

if [ -z "$LEVEL" ]; then
  echo "Usage: $0 <cache_level>"
  echo "Usage: $0 <cache_level> <set>"
  echo "cache_level: base, env, or instance"
  echo "set: lite, full"
  exit 1
fi

if [ -z "$SET" ]; then
  echo "Usage: $0 <cache_level> <set>"
  echo "cache_level: base, env, or instance"
  echo "set: lite, full, default is lite"
  SET="lite"
fi

NAMESPACE=$2 # xingyaoww
if [ -z "$NAMESPACE" ]; then
  echo "Default to namespace: xingyaoww"
  NAMESPACE="xingyaoww"
fi
IMAGE_FILE="$(dirname "$0")/all-swebench-lite-instance-images.txt"

if [ "$SET" == "lite" ]; then
  IMAGE_FILE="$(dirname "$0")/all-swebench-lite-instance-images.txt"
else
  IMAGE_FILE="$(dirname "$0")/all-swebench-full-instance-images.txt"
fi

# Define a pattern based on the level
case $LEVEL in

@@ -0,0 +1,79 @@
"""You should first perform the following steps:

1. Build the docker images. Install SWE-Bench first (https://github.com/princeton-nlp/SWE-bench). Then run:
```bash
export DATASET_NAME=princeton-nlp/SWE-bench_Lite
export SPLIT=test
export MAX_WORKERS=4
export RUN_ID=some-random-ID
python -m swebench.harness.run_evaluation \
    --dataset_name $DATASET_NAME \
    --split $SPLIT \
    --predictions_path gold \
    --max_workers $MAX_WORKERS \
    --run_id $RUN_ID \
    --cache_level instance
```

2. Then run this script to push the docker images to the docker hub. Some of the docker images might fail to build in the previous step - open an issue in the SWE-Bench repo for possible fixes.

To push the docker images for the "princeton-nlp/SWE-bench_Lite" test set to the docker hub (e.g., under `docker.io/xingyaoww/`), run:
```bash
EVAL_DOCKER_IMAGE_PREFIX='docker.io/xingyaoww/' python3 evaluation/swe_bench/scripts/docker/push_docker_instance_images.py --dataset princeton-nlp/SWE-bench_Lite --split test
```
"""

import argparse

import docker
from datasets import load_dataset
from tqdm import tqdm

from openhands.core.logger import openhands_logger as logger

logger.setLevel('ERROR')
from evaluation.swe_bench.run_infer import get_instance_docker_image  # noqa

parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='princeton-nlp/SWE-bench_Lite')
parser.add_argument('--split', type=str, default='test')
args = parser.parse_args()

dataset = load_dataset(args.dataset, split=args.split)
client = docker.from_env()

pbar = tqdm(total=len(dataset))
counter = {'success': 0, 'failed': 0}

failed_instances = []
for instance in dataset:
    instance_id = instance['instance_id']
    image_name = f'sweb.eval.x86_64.{instance_id}'
    target_image_name = get_instance_docker_image(instance_id)

    print('-' * 100)
    # check if image exists
    try:
        image: docker.models.images.Image = client.images.get(image_name)
        image.tag(target_image_name)
        print(f'Image {image_name} -- tagging to --> {target_image_name}')
        ret_push = client.images.push(target_image_name)
        if isinstance(ret_push, str):
            print(ret_push)
        else:
            for line in ret_push:
                print(line)
        print(f'Image {image_name} -- pushed to --> {target_image_name}')
        counter['success'] += 1
    except docker.errors.ImageNotFound:
        print(f'ERROR: Image {image_name} does not exist')
        counter['failed'] += 1
        failed_instances.append(instance_id)
    finally:
        pbar.update(1)
        pbar.set_postfix(counter)

print(f'Success: {counter["success"]}, Failed: {counter["failed"]}')
print('Failed instances IDs:')
for failed_instance in failed_instances:
    print(failed_instance)

@@ -9,6 +9,8 @@ AGENT=$3
EVAL_LIMIT=$4
MAX_ITER=$5
NUM_WORKERS=$6
DATASET=$7
SPLIT=$8

if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1

@@ -31,6 +33,11 @@ if [ -z "$USE_INSTANCE_IMAGE" ]; then
  USE_INSTANCE_IMAGE=true
fi

if [ -z "$SUBSET" ]; then
  echo "SUBSET not specified, use default lite-test"
  SUBSET="lite-test"
fi

export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"

@@ -59,7 +66,9 @@ COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
  --max-iterations $MAX_ITER \
  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $EVAL_NOTE"
  --eval-note $EVAL_NOTE \
  --dataset $DATASET \
  --split $SPLIT"

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"