support eval patches with all tests

Support running all tests in SWE-Bench
2026-04-29 03:00:45 -04:00 · 2025-03-20 16:14:23 +00:00 · 2025-03-20 16:12:08 +00:00
3 changed files with 154 additions and 7 deletions
@@ -0,0 +1,141 @@
+#!/usr/bin/env bash
+
+# Positional arguments:
+#   $1  PROCESS_FILEPATH  (required) path to the output file to evaluate
+#   $2  INSTANCE_ID       (optional) evaluate only this instance; when empty,
+#                         the whole PROCESS_FILEPATH is evaluated
+#   $3  DATASET_NAME      (optional) default: princeton-nlp/SWE-bench_Verified
+#   $4  SPLIT             (optional) default: test
+PROCESS_FILEPATH=$1
+if [ -z "$PROCESS_FILEPATH" ]; then
+    echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]" >&2
+    exit 1
+fi
+
+# The path is quoted everywhere below so that a name containing spaces or glob
+# characters is handled as a single word instead of being split by the shell.
+if [ ! -f "$PROCESS_FILEPATH" ]; then
+    echo "Error: $PROCESS_FILEPATH is not a file" >&2
+    exit 1
+fi
+
+# If instance_id is empty, it means we want to eval on the whole $PROCESS_FILEPATH
+# otherwise, we want to eval on the instance_id
+INSTANCE_ID=$2
+DATASET_NAME=${3:-"princeton-nlp/SWE-bench_Verified"}
+SPLIT=${4:-"test"}
+
+echo "INSTANCE_ID: $INSTANCE_ID"
+echo "DATASET_NAME: $DATASET_NAME"
+echo "SPLIT: $SPLIT"
+
+# Canonicalize the path, then split it into its directory and file name.
+PROCESS_FILEPATH=$(realpath "$PROCESS_FILEPATH")
+FILE_DIR=$(dirname "$PROCESS_FILEPATH")
+FILE_NAME=$(basename "$PROCESS_FILEPATH")
+
+echo "Evaluating $FILE_NAME @ $FILE_DIR"
+
+# ================================================
+# detect whether PROCESS_FILEPATH is in OH format or in SWE-bench format
+echo "=============================================================="
+echo "Detecting whether PROCESS_FILEPATH is in OH format or in SWE-bench format"
+echo "=============================================================="
+# SWE-bench format is a JSONL where every line has three fields: model_name_or_path, instance_id, and model_patch
+#
+# is_swebench_format [FILE]
+#   Checks whether the first line of FILE is a JSON object that has all three
+#   fields. Only the first line is inspected.
+#   Arguments: $1 - file to inspect; falls back to $PROCESS_FILEPATH when omitted
+#   Returns:   0 if the first line has all three fields, 1 otherwise (including
+#              when jq cannot parse the line)
+function is_swebench_format() {
+    local jsonl_file=${1:-$PROCESS_FILEPATH}
+    local first_line
+
+    # read's own status is deliberately not checked: it is non-zero when the
+    # line has no trailing newline, yet first_line is still filled in that case.
+    IFS= read -r first_line < "$jsonl_file"
+
+    # jq -e turns the boolean result of the filter into the exit status; any
+    # non-zero status is collapsed to 1.
+    if printf '%s\n' "$first_line" \
+        | jq -e '. | has("model_name_or_path") and has("instance_id") and has("model_patch")' > /dev/null; then
+        return 0
+    fi
+
+    return 1
+}
+# Run the check on the input file and keep its exit status (0 means the file
+# is already in SWE-bench format).
+is_swebench_format "$PROCESS_FILEPATH"
+IS_SWEBENCH_FORMAT=$?
+if [ "$IS_SWEBENCH_FORMAT" -eq 0 ]; then
+    echo "The file IS in SWE-bench format."
+    SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH
+else
+    echo "The file IS NOT in SWE-bench format."
+
+    # ==== Convert OH format to SWE-bench format ====
+    echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
+    # Stop on a failed conversion: otherwise a stale .swebench.jsonl left behind
+    # by an earlier run would pass the existence check below and be evaluated.
+    if ! poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py "$PROCESS_FILEPATH"; then
+        echo "Error: failed to convert $PROCESS_FILEPATH to SWE-bench format." >&2
+        exit 1
+    fi
+    # replace the first ".jsonl" in the path with ".swebench.jsonl"
+    # NOTE(review): presumably mirrors the output name chosen by the conversion
+    # script -- confirm against that script.
+    SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
+    echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
+    # assert that the file exists
+    if [ ! -f "$SWEBENCH_FORMAT_JSONL" ]; then
+        echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process." >&2
+        exit 1
+    fi
+    SWEBENCH_FORMAT_JSONL=$(realpath "$SWEBENCH_FORMAT_JSONL")
+fi
+# ================================================
+
+echo "=============================================================="
+echo "Running SWE-bench evaluation"
+echo "=============================================================="
+
+RUN_ID=$(date +"%Y%m%d_%H%M%S")
+N_PROCESS=4
+
+# Harness options shared by the whole-file run and the single-instance run.
+# Kept in an array so every value reaches the harness as exactly one argument.
+EVAL_ARGS=(
+    --dataset_name "$DATASET_NAME"
+    --split "$SPLIT"
+    --predictions_path "$SWEBENCH_FORMAT_JSONL"
+    --timeout 3600
+    --cache_level instance
+    --max_workers "$N_PROCESS"
+    --run_id "$RUN_ID"
+    --run_all_tests true
+)
+
+if [ -z "$INSTANCE_ID" ]; then
+    echo "Running SWE-bench evaluation WITH ALL TESTS on the whole input file..."
+    # The dataset defaults to princeton-nlp/SWE-bench_Verified (split "test");
+    # pass dataset_name and split as the 3rd and 4th arguments to change them.
+
+    poetry run python -m swebench.harness.run_evaluation "${EVAL_ARGS[@]}"
+
+    # get the "model_name_or_path" from the first line of the SWEBENCH_FORMAT_JSONL
+    MODEL_NAME_OR_PATH=$(jq -r '.model_name_or_path' "$SWEBENCH_FORMAT_JSONL" | head -n 1)
+    echo "MODEL_NAME_OR_PATH: $MODEL_NAME_OR_PATH"
+
+    RESULT_OUTPUT_DIR=$(dirname "$SWEBENCH_FORMAT_JSONL")
+    echo "RESULT_OUTPUT_DIR: $RESULT_OUTPUT_DIR"
+
+    # move the eval results to the target directory
+    mkdir -p "$RESULT_OUTPUT_DIR"
+
+    # NOTE(review): assumes the harness names its log directory exactly after
+    # model_name_or_path -- confirm for model names that contain "/".
+    EVAL_LOG_DIR="logs/run_evaluation/$RUN_ID/$MODEL_NAME_OR_PATH"
+    if [ -d "$EVAL_LOG_DIR" ]; then
+        # Clear the previous results at the destination this branch writes
+        # (eval_outputs_all_tests); otherwise mv would nest the new directory
+        # inside the old one. Only done once new logs exist, so a failed run
+        # does not wipe the results of an earlier one.
+        rm -rf "${RESULT_OUTPUT_DIR:?}/eval_outputs_all_tests"
+        mv "$EVAL_LOG_DIR" "$RESULT_OUTPUT_DIR/eval_outputs_all_tests"
+    else
+        echo "Warning: $EVAL_LOG_DIR not found; no evaluation logs were moved." >&2
+    fi
+    echo "RUN_ID: $RUN_ID" > "$RESULT_OUTPUT_DIR/run_id_all_tests.txt"
+
+    # move report file
+    REPORT_PATH="$MODEL_NAME_OR_PATH.$RUN_ID.json"
+    if [ -f "$REPORT_PATH" ]; then
+        # keep one backup of an existing report_all_tests.json before replacing it
+        if [ -f "$RESULT_OUTPUT_DIR/report_all_tests.json" ]; then
+            echo "Report file $RESULT_OUTPUT_DIR/report_all_tests.json already exists. Overwriting..."
+            # mv -f replaces any older .bak, so no separate rm is needed
+            mv -f "$RESULT_OUTPUT_DIR/report_all_tests.json" "$RESULT_OUTPUT_DIR/report_all_tests.json.bak"
+        fi
+
+        mv "$REPORT_PATH" "$RESULT_OUTPUT_DIR/report_all_tests.json"
+    fi
+
+else
+    echo "Running SWE-bench evaluation WITH ALL TESTS on the instance_id: $INSTANCE_ID"
+    # INSTANCE_ID is left unquoted on purpose to preserve the original word
+    # splitting: a space-separated list expands to several arguments.
+    # shellcheck disable=SC2086
+    poetry run python -m swebench.harness.run_evaluation "${EVAL_ARGS[@]}" \
+        --instance_ids $INSTANCE_ID
+fi
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -7789,10 +7789,8 @@ description = "The official SWE-bench package - a benchmark for evaluating LMs o
 optional = false
 python-versions = ">=3.8"
 groups = ["evaluation"]
-files = [
-    {file = "swebench-3.0.15-py3-none-any.whl", hash = "sha256:dd694356f9c155a55d3d2e113fe58446f7385eea0574230af5e2504426f8b85b"},
-    {file = "swebench-3.0.15.tar.gz", hash = "sha256:24e734fbcce34082665a25719075e6899382b7135103dd8c6cc09a6e23789101"},
-]
+files = []
+develop = false

 [package.dependencies]
 beautifulsoup4 = "*"
@@ -7814,6 +7812,12 @@ unidiff = "*"
 inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tiktoken", "torch", "transformers", "triton"]
 test = ["pytest", "pytest-cov"]

+[package.source]
+type = "git"
+url = "https://github.com/All-Hands-AI/SWE-Bench.git"
+reference = "xw/run-all-test"
+resolved_reference = "7b33e5c07fb9b238c6365f46fb9d7a7f3c11365e"
+
 [[package]]
 name = "swegym"
 version = "2.0.13"
@@ -9316,4 +9320,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.12"
-content-hash = "d3ec6b8a6c7e48420d76b7e17d5f1a3f253fa603205f90d4a8e4a614ab5e2c67"
+content-hash = "4e157bc86980cecbccd26b3698a3747aa25bd900b3c3619edb2cb47aa841fb7d"
@@ -99,6 +99,7 @@ reportlab = "*"
 [tool.coverage.run]
 concurrency = ["gevent"]

+
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@@ -127,12 +128,13 @@ ignore = ["D1"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"

+
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"
 retry = "*"
 evaluate = "*"
-swebench = "^3.0.8"
+swebench = { git = "https://github.com/All-Hands-AI/SWE-Bench.git", branch = "xw/run-all-test" }
 swegym = { git = "https://github.com/SWE-Gym/SWE-Bench-Package.git" }
 commit0 = "*"
 func_timeout = "*"
Author	SHA1	Message	Date
Xingyao Wang	52edadeab5	support eval patches with all tests	2025-03-20 16:14:23 +00:00
Xingyao Wang	12b169958e	Support running all tests in SWE-Bench	2025-03-20 16:12:08 +00:00