diff --git a/evaluation/benchmarks/commit0_bench/README.md b/evaluation/benchmarks/commit0_bench/README.md
index 5e533ec120..50feef85cd 100644
--- a/evaluation/benchmarks/commit0_bench/README.md
+++ b/evaluation/benchmarks/commit0_bench/README.md
@@ -48,8 +48,6 @@ default, it is set to 1.
 - `dataset`, a huggingface dataset name. e.g. `wentingzhao/commit0_combined`, specifies which dataset to evaluate on.
 - `dataset_split`, split for the huggingface dataset. Notice only `test` is supported for Commit0.
 
-Note that the `USE_INSTANCE_IMAGE` environment variable is always set to `true` for Commit0.
-
 Let's say you'd like to run 10 instances using `llm.eval_sonnet` and CodeActAgent,
 then your command would be:
 
diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py
index 63d394a029..8bfdbab261 100644
--- a/evaluation/benchmarks/commit0_bench/run_infer.py
+++ b/evaluation/benchmarks/commit0_bench/run_infer.py
@@ -39,7 +39,6 @@ from openhands.utils.async_utils import call_async_from_sync
 from openhands.utils.shutdown_listener import sleep_if_should_continue
 
 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
-USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
 RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
@@ -105,7 +104,6 @@ def get_config(
     instance: pd.Series,
     metadata: EvalMetadata,
 ) -> AppConfig:
-    assert USE_INSTANCE_IMAGE
     repo_name = instance['repo'].split('/')[1]
     base_container_image = get_instance_docker_image(repo_name)
     logger.info(
diff --git a/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh
index 601499b90e..a80d8a07f4 100755
--- a/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh
@@ -30,11 +30,6 @@ if [ -z "$MAX_ITER" ]; then
   MAX_ITER=100
 fi
 
-if [ -z "$USE_INSTANCE_IMAGE" ]; then
-  echo "USE_INSTANCE_IMAGE not specified, use default true"
-  USE_INSTANCE_IMAGE=true
-fi
-
 if [ -z "$RUN_WITH_BROWSING" ]; then
   echo "RUN_WITH_BROWSING not specified, use default false"
   RUN_WITH_BROWSING=false
@@ -56,8 +51,6 @@ if [ -z "$SPLIT" ]; then
   SPLIT="test"
 fi
 
-export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
-echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
 export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
 echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
diff --git a/evaluation/benchmarks/swe_bench/README.md b/evaluation/benchmarks/swe_bench/README.md
index 4746024882..774b1d7b09 100644
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@@ -6,7 +6,7 @@ This folder contains the evaluation harness that we built on top of the original
 
 The evaluation consists of three steps:
 
-1. Environment setup: [install python environment](../../README.md#development-environment), [configure LLM config](../../README.md#configure-openhands-and-your-llm), and [pull docker](#openhands-swe-bench-instance-level-docker-support).
+1. Environment setup: [install python environment](../../README.md#development-environment) and [configure LLM config](../../README.md#configure-openhands-and-your-llm).
 2. [Run inference](#run-inference-on-swe-bench-instances): Generate a edit patch for each Github issue
 3. [Evaluate patches using SWE-Bench docker](#evaluate-generated-patches)
@@ -14,22 +14,21 @@ The evaluation consists of three steps:
 
 Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
 
-## OpenHands SWE-Bench Instance-level Docker Support
+## Run Inference (Rollout) on SWE-Bench Instances: Generate Patch from Problem Statement
 
-OpenHands now support using the [official evaluation docker](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md) for both **[inference](#run-inference-on-swe-bench-instances) and [evaluation](#evaluate-generated-patches)**.
-This is now the default behavior.
+### Running Locally with Docker
 
-## Run Inference on SWE-Bench Instances
+Make sure your Docker daemon is running, and that you have ample disk space (at least 200-500GB, depending on the SWE-Bench set you are running on) for the instance-level Docker images.
 
-Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the SWE-Bench set you are running on) for the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support).
-
-When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Bench images. For example, for instance ID `django_django-11011`, it will try to pull our pre-build docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on.
+When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Bench images.
+For example, for instance ID `django__django-11011`, it will try to pull our pre-built Docker image `sweb.eval.x86_64.django_s_django-11011` from Docker Hub.
+This image will be used to create an OpenHands runtime image in which the agent will operate.
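+
+For the curious, the image name is derived mechanically from the instance ID. Below is a minimal sketch of the convention implied by the example above; the `instance_image_name` helper is hypothetical, and the authoritative logic lives in `get_instance_docker_image` in `run_infer.py`:
+
+```python
+# Sketch only: the '__' in an instance ID shows up as '_s_' in the image name.
+def instance_image_name(instance_id: str) -> str:
+    return 'sweb.eval.x86_64.' + instance_id.replace('__', '_s_')
+
+# instance_image_name('django__django-11011')
+# -> 'sweb.eval.x86_64.django_s_django-11011'
+```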
 
 ```bash
 ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
 
 # Example
-./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test
+./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 500 100 1 princeton-nlp/SWE-bench_Verified test
 ```
 
 where `model_config` is mandatory, and the rest are optional.
@@ -47,14 +46,16 @@ in order to use `eval_limit`, you must also set `agent`.
 default, it is set to 30.
 - `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
 default, it is set to 1.
-- `dataset`, a huggingface dataset name. e.g. `princeton-nlp/SWE-bench` or `princeton-nlp/SWE-bench_Lite`, specifies which dataset to evaluate on.
+- `dataset`, a huggingface dataset name, e.g. `princeton-nlp/SWE-bench`, `princeton-nlp/SWE-bench_Lite`, or `princeton-nlp/SWE-bench_Verified`; specifies which dataset to evaluate on.
 - `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`.
 
-There are also two optional environment variables you can set.
+> [!CAUTION]
+> Setting `num_workers` larger than 1 is not officially tested; YMMV.
+
+There is also one optional environment variable you can set.
 
 ```bash
 export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Default to false. Ignore this if you are not sure.
-export USE_INSTANCE_IMAGE=true # if you want to use instance-level docker images. Default to true
 ```
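+
+Instead of exporting it, you can also set the variable inline for a single run. A hypothetical one-off invocation, whose base command matches the example just below:
+
+```bash
+USE_HINT_TEXT=true ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
+```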
 
 Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and CodeActAgent,
@@ -65,9 +66,11 @@ then your command would be:
 
 ```bash
 ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
 ```
 
-### Run Inference on `RemoteRuntime`
+### Running in Parallel with RemoteRuntime
 
-This is in beta. Fill out [this form](https://docs.google.com/forms/d/e/1FAIpQLSckVz_JFwg2_mOxNZjCtr7aoBFI2Mwdan3f75J_TrdMS1JV2g/viewform) to apply if you want to try this out!
+OpenHands Remote Runtime is currently in beta (read [here](https://runtime.all-hands.dev/) for more details). It allows you to run rollouts in parallel in the cloud, so you don't need a powerful machine to run the evaluation.
+
+Fill out [this form](https://docs.google.com/forms/d/e/1FAIpQLSckVz_JFwg2_mOxNZjCtr7aoBFI2Mwdan3f75J_TrdMS1JV2g/viewform) to apply if you want to try this out!
 
 ```bash
 ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
@@ -100,41 +103,14 @@ After running the inference, you will obtain a `output.jsonl` (by default it wil
 
 ## Evaluate Generated Patches
 
-### Download Docker Images
-
-**(Recommended for reproducibility)** If you have extra local space (e.g., 200GB), you can try pull the [instance-level docker images](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level) we've prepared by running:
-
-```bash
-evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance
-```
-
-If you want to save disk space a bit (e.g., with ~50GB free disk space), while speeding up the image pre-build process, you can pull the environment-level docker images:
-
-```bash
-evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh env
-```
-
-If you want to evaluate on the full SWE-Bench test set:
-
-```bash
-evaluation/benchmarks/swe_bench/scripts/docker/pull_all_eval_docker.sh instance full
-```
-
-### Run evaluation
+### Run evaluation with the official SWE-Bench harness (Recommended if you have local disk space)
 
 With `output.jsonl` file, you can run `eval_infer.sh` to evaluate generated patches, and produce a fine-grained report.
 
-**This evaluation is performed using the official dockerized evaluation announced [here](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md).**
-
-> If you want to evaluate existing results, you should first run this to clone existing outputs
->
->```bash
->git clone https://huggingface.co/spaces/OpenHands/evaluation evaluation/evaluation_outputs
->```
+> [!NOTE]
+> This process will automatically download Docker images from the official SWE-Bench Docker Hub; please make sure you have enough disk space!
 
-NOTE, you should have already pulled the instance-level OR env-level docker images following [this section](#openhands-swe-bench-instance-level-docker-support).
-
-Then you can run the following:
 
 ```bash
 ./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split]
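+
+# Example with illustrative values (hypothetical paths; point the first
+# argument at the output.jsonl produced by your own run_infer.sh run):
+# ./evaluation/benchmarks/swe_bench/scripts/eval_infer.sh \
+#   evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/output.jsonl \
+#   django__django-11011 princeton-nlp/SWE-bench_Verified test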
@@ -165,7 +141,8 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 
 ### Run evaluation with `RemoteRuntime`
 
-This is in beta. Fill out [this form](https://docs.google.com/forms/d/e/1FAIpQLSckVz_JFwg2_mOxNZjCtr7aoBFI2Mwdan3f75J_TrdMS1JV2g/viewform) to apply if you want to try this out!
+
+OpenHands Remote Runtime is currently in beta (read [here](https://runtime.all-hands.dev/) for more details). It allows you to run rollouts in parallel in the cloud, so you don't need a powerful machine to run the evaluation.
+Fill out [this form](https://docs.google.com/forms/d/e/1FAIpQLSckVz_JFwg2_mOxNZjCtr7aoBFI2Mwdan3f75J_TrdMS1JV2g/viewform) to apply if you want to try this out!
 
 ```bash
 ./evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
@@ -180,35 +157,3 @@ To clean-up all existing runtimes that you've already started, run:
 
 ```bash
 ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/utils/scripts/cleanup_remote_runtime.sh
 ```
-
-## Visualize Results
-
-First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
-
-```bash
-git clone https://huggingface.co/spaces/OpenHands/evaluation
-```
-
-**(optional) setup streamlit environment with conda**:
-
-```bash
-cd evaluation
-conda create -n streamlit python=3.10
-conda activate streamlit
-pip install -r requirements.txt
-```
-
-**run the visualizer**:
-Then, in a separate Python environment with `streamlit` library, you can run the following:
-
-```bash
-# Make sure you are inside the cloned `evaluation` repo
-conda activate streamlit # if you follow the optional conda env setup above
-streamlit run app.py --server.port 8501 --server.address 0.0.0.0
-```
-
-Then you can access the SWE-Bench trajectory visualizer at `localhost:8501`.
-
-## Submit your evaluation results
-
-You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index df9042969e..d8120b1b6f 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -44,7 +44,6 @@ from openhands.utils.async_utils import call_async_from_sync
 from openhands.utils.shutdown_listener import sleep_if_should_continue
 
 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
-USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'true').lower() == 'true'
 RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
 
 
@@ -121,23 +120,18 @@ def get_config(
     instance: pd.Series,
     metadata: EvalMetadata,
 ) -> AppConfig:
-    SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'
-    if USE_INSTANCE_IMAGE:
-        # We use a different instance image for the each instance of swe-bench eval
-        use_official_image = bool(
-            'verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower()
-        )
-        base_container_image = get_instance_docker_image(
-            instance['instance_id'], use_official_image
-        )
-        logger.info(
-            f'Using instance container image: {base_container_image}. '
-            f'Please make sure this image exists. '
-            f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
-        )
-    else:
-        base_container_image = SWE_BENCH_CONTAINER_IMAGE
-        logger.info(f'Using swe-bench container image: {base_container_image}')
+    # We use a different instance image for each instance of the swe-bench eval
+    use_official_image = bool(
+        'verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower()
+    )
+    base_container_image = get_instance_docker_image(
+        instance['instance_id'], use_official_image
+    )
+    logger.info(
+        f'Using instance container image: {base_container_image}. '
+        f'Please make sure this image exists. '
+        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
+    )
 
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = base_container_image
@@ -209,75 +203,65 @@ def initialize_runtime(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
 
-    if USE_INSTANCE_IMAGE:
-        # inject the init script
-        script_dir = os.path.dirname(__file__)
+    # inject the init script
+    script_dir = os.path.dirname(__file__)
 
-        # inject the instance info
-        action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
-        action.set_hard_timeout(600)
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert_and_raise(
-            obs.exit_code == 0,
-            f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
-        )
+    # inject the instance info
+    action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        obs.exit_code == 0,
+        f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
+    )
 
-        swe_instance_json_name = 'swe-bench-instance.json'
-        with tempfile.TemporaryDirectory() as temp_dir:
-            # Construct the full path for the desired file name within the temporary directory
-            temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
-            # Write to the file with the desired name within the temporary directory
-            with open(temp_file_path, 'w') as f:
-                if not isinstance(instance, dict):
-                    json.dump([instance.to_dict()], f)
-                else:
-                    json.dump([instance], f)
+    swe_instance_json_name = 'swe-bench-instance.json'
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Construct the full path for the desired file name within the temporary directory
+        temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
+        # Write to the file with the desired name within the temporary directory
+        with open(temp_file_path, 'w') as f:
+            if not isinstance(instance, dict):
+                json.dump([instance.to_dict()], f)
+            else:
+                json.dump([instance], f)
 
-            # Copy the file to the desired location
-            runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
+        # Copy the file to the desired location
+        runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
 
     # inject the instance swe entry
     runtime.copy_to(
         str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
         '/swe_util/',
     )
-        action = CmdRunAction(command='cat ~/.bashrc')
-        action.set_hard_timeout(600)
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')
 
-        action = CmdRunAction(command='source ~/.bashrc')
-        action.set_hard_timeout(600)
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        if isinstance(obs, ErrorObservation):
-            logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
-        assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')
+    action = CmdRunAction(command='cat ~/.bashrc')
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')
 
-        action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
-        action.set_hard_timeout(600)
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert_and_raise(
-            obs.exit_code == 0,
-            f'Failed to source /swe_util/instance_swe_entry.sh: {str(obs)}',
-        )
-    else:
-        action = CmdRunAction(command='source /swe_util/swe_entry.sh')
-        action.set_hard_timeout(1800)
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert_and_raise(
-            obs.exit_code == 0,
-            f'Failed to source /swe_util/swe_entry.sh: {str(obs)}',
-        )
+    action = CmdRunAction(command='source ~/.bashrc')
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    if isinstance(obs, ErrorObservation):
+        logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
+    assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')
+
+    action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
+    action.set_hard_timeout(600)
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        obs.exit_code == 0,
+        f'Failed to source /swe_util/instance_swe_entry.sh: {str(obs)}',
+    )
 
     action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
     action.set_hard_timeout(600)
diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
index d0bed01792..64e3cde056 100755
--- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
@@ -29,11 +29,6 @@ if [ -z "$MAX_ITER" ]; then
   MAX_ITER=100
 fi
 
-if [ -z "$USE_INSTANCE_IMAGE" ]; then
-  echo "USE_INSTANCE_IMAGE not specified, use default true"
-  USE_INSTANCE_IMAGE=true
-fi
-
 if [ -z "$RUN_WITH_BROWSING" ]; then
   echo "RUN_WITH_BROWSING not specified, use default false"
   RUN_WITH_BROWSING=false
@@ -50,8 +45,6 @@ if [ -z "$SPLIT" ]; then
   SPLIT="test"
 fi
 
-export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
-echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
 export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
 echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"