Compare commits

...

2 Commits

Author SHA1 Message Date
Xingyao Wang 52edadeab5 support eval patches with all tests 2025-03-20 16:14:23 +00:00
Xingyao Wang 12b169958e Support running all tests in SWE-Bench 2025-03-20 16:12:08 +00:00
3 changed files with 154 additions and 7 deletions
@@ -0,0 +1,141 @@
#!/usr/bin/env bash
# Evaluate a predictions file with the SWE-bench harness, running all tests.
#
# Positional arguments:
#   $1  output_file   JSONL file to evaluate (required, must be a regular file)
#   $2  instance_id   optional; when empty the whole file is evaluated
#   $3  dataset_name  optional; defaults to princeton-nlp/SWE-bench_Verified
#   $4  split         optional; defaults to test
PROCESS_FILEPATH=$1
if [[ -z "$PROCESS_FILEPATH" ]]; then
  # Diagnostics go to stderr so they are not mixed into captured stdout.
  echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]" >&2
  exit 1
fi
# Quoted so that paths containing spaces or glob characters are tested as-is.
if [[ ! -f "$PROCESS_FILEPATH" ]]; then
  echo "Error: $PROCESS_FILEPATH is not a file" >&2
  exit 1
fi
# If instance_id is empty, it means we want to eval on the whole $PROCESS_FILEPATH
# otherwise, we want to eval on the instance_id
INSTANCE_ID=$2
DATASET_NAME=${3:-"princeton-nlp/SWE-bench_Verified"}
SPLIT=${4:-"test"}
echo "INSTANCE_ID: $INSTANCE_ID"
echo "DATASET_NAME: $DATASET_NAME"
echo "SPLIT: $SPLIT"
# Resolve to an absolute path once; everything below works from it.
PROCESS_FILEPATH=$(realpath "$PROCESS_FILEPATH")
FILE_DIR=$(dirname "$PROCESS_FILEPATH")
FILE_NAME=$(basename "$PROCESS_FILEPATH")
echo "Evaluating $FILE_NAME @ $FILE_DIR"
# ================================================
# detect whether PROCESS_FILEPATH is in OH format or in SWE-bench format
echo "=============================================================="
echo "Detecting whether PROCESS_FILEPATH is in OH format or in SWE-bench format"
echo "=============================================================="
# SWE-bench format is a JSONL where every line has three fields:
# model_name_or_path, instance_id, and model_patch.
#######################################
# Check whether a JSONL file is in SWE-bench format by inspecting its first
# line only.
# Globals:   PROCESS_FILEPATH (read; used only when no argument is given)
# Arguments: $1 - path of the file to inspect (optional)
# Outputs:   any jq error text on stderr; nothing on stdout
# Returns:   0 if the first line has all three fields, 1 otherwise
#######################################
function is_swebench_format() {
  # Honor the path the caller passes; fall back to the global for callers
  # that invoke the function without arguments.
  local filepath=${1:-$PROCESS_FILEPATH}
  local first_line
  # The status of `read` is deliberately not checked: it is non-zero for a
  # final line without a trailing newline even though the line was read.
  read -r first_line < "$filepath"
  # jq -e makes the exit status reflect the boolean result of the filter.
  if printf '%s\n' "$first_line" \
    | jq -e '. | has("model_name_or_path") and has("instance_id") and has("model_patch")' > /dev/null; then
    return 0 # the first line has the required fields
  fi
  return 1 # missing fields, unparsable line, or jq unavailable
}
# Decide which file is handed to the harness: the input itself when it is
# already in SWE-bench format, otherwise the converted copy.
is_swebench_format "$PROCESS_FILEPATH"
IS_SWEBENCH_FORMAT=$?
if [[ "$IS_SWEBENCH_FORMAT" -eq 0 ]]; then
  echo "The file IS in SWE-bench format."
  SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH
else
  echo "The file IS NOT in SWE-bench format."
  # ==== Convert OH format to SWE-bench format ====
  echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
  # Quoted so that a path with spaces reaches the converter as one argument.
  poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py "$PROCESS_FILEPATH"
  # replace .jsonl with .swebench.jsonl in filename
  # (this is where the script expects the converter's output; verified below)
  SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
  echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
  # assert that the file exists
  if [[ ! -f "$SWEBENCH_FORMAT_JSONL" ]]; then
    echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process." >&2
    exit 1
  fi
  SWEBENCH_FORMAT_JSONL=$(realpath "$SWEBENCH_FORMAT_JSONL")
fi
# ================================================
echo "=============================================================="
echo "Running SWE-bench evaluation"
echo "=============================================================="
RUN_ID=$(date +"%Y%m%d_%H%M%S")
N_PROCESS=4
# Harness arguments shared by both modes. The dataset and split come from
# DATASET_NAME / SPLIT; change those to evaluate against another dataset.
eval_args=(
  --dataset_name "$DATASET_NAME"
  --split "$SPLIT"
  --predictions_path "$SWEBENCH_FORMAT_JSONL"
  --timeout 3600
  --cache_level instance
  --max_workers "$N_PROCESS"
  --run_id "$RUN_ID"
  --run_all_tests true
)
if [[ -z "$INSTANCE_ID" ]]; then
  echo "Running SWE-bench evaluation WITH ALL TESTS on the whole input file..."
  poetry run python -m swebench.harness.run_evaluation "${eval_args[@]}"
  # get the "model_name_or_path" from the first line of the SWEBENCH_FORMAT_JSONL
  MODEL_NAME_OR_PATH=$(jq -r '.model_name_or_path' "$SWEBENCH_FORMAT_JSONL" | head -n 1)
  echo "MODEL_NAME_OR_PATH: $MODEL_NAME_OR_PATH"
  RESULT_OUTPUT_DIR=$(dirname "$SWEBENCH_FORMAT_JSONL")
  echo "RESULT_OUTPUT_DIR: $RESULT_OUTPUT_DIR"
  # move the eval results to the target directory
  mkdir -p "$RESULT_OUTPUT_DIR"
  # Clear the previous all-tests output so the move below replaces it instead
  # of nesting inside it. Only the directory this script writes is removed;
  # a sibling eval_outputs directory is left untouched.
  if [[ -d "$RESULT_OUTPUT_DIR/eval_outputs_all_tests" ]]; then
    # ${VAR:?} aborts rather than expanding to "/eval_outputs_all_tests".
    rm -rf -- "${RESULT_OUTPUT_DIR:?}/eval_outputs_all_tests"
  fi
  mv "logs/run_evaluation/$RUN_ID/$MODEL_NAME_OR_PATH" "$RESULT_OUTPUT_DIR/eval_outputs_all_tests"
  echo "RUN_ID: $RUN_ID" > "$RESULT_OUTPUT_DIR/run_id_all_tests.txt"
  # move report file (looked up relative to the current working directory)
  REPORT_PATH=$MODEL_NAME_OR_PATH.$RUN_ID.json
  if [[ -f "$REPORT_PATH" ]]; then
    # keep exactly one backup of a previous report before replacing it
    if [[ -f "$RESULT_OUTPUT_DIR/report_all_tests.json" ]]; then
      echo "Report file $RESULT_OUTPUT_DIR/report_all_tests.json already exists. Overwriting..."
      mv -f "$RESULT_OUTPUT_DIR/report_all_tests.json" "$RESULT_OUTPUT_DIR/report_all_tests.json.bak"
    fi
    mv "$REPORT_PATH" "$RESULT_OUTPUT_DIR/report_all_tests.json"
  fi
else
  echo "Running SWE-bench evaluation WITH ALL TESTS on the instance_id: $INSTANCE_ID"
  # INSTANCE_ID is left unquoted to keep the existing word-splitting, so a
  # value holding several space-separated ids still expands to separate
  # arguments. NOTE(review): confirm multi-id input is intended.
  # shellcheck disable=SC2086
  poetry run python -m swebench.harness.run_evaluation "${eval_args[@]}" \
    --instance_ids $INSTANCE_ID
fi
Generated
+10 -6
View File
@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
@@ -7789,10 +7789,8 @@ description = "The official SWE-bench package - a benchmark for evaluating LMs o
optional = false
python-versions = ">=3.8"
groups = ["evaluation"]
files = [
{file = "swebench-3.0.15-py3-none-any.whl", hash = "sha256:dd694356f9c155a55d3d2e113fe58446f7385eea0574230af5e2504426f8b85b"},
{file = "swebench-3.0.15.tar.gz", hash = "sha256:24e734fbcce34082665a25719075e6899382b7135103dd8c6cc09a6e23789101"},
]
files = []
develop = false
[package.dependencies]
beautifulsoup4 = "*"
@@ -7814,6 +7812,12 @@ unidiff = "*"
inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tiktoken", "torch", "transformers", "triton"]
test = ["pytest", "pytest-cov"]
[package.source]
type = "git"
url = "https://github.com/All-Hands-AI/SWE-Bench.git"
reference = "xw/run-all-test"
resolved_reference = "7b33e5c07fb9b238c6365f46fb9d7a7f3c11365e"
[[package]]
name = "swegym"
version = "2.0.13"
@@ -9316,4 +9320,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.1"
python-versions = "^3.12"
content-hash = "d3ec6b8a6c7e48420d76b7e17d5f1a3f253fa603205f90d4a8e4a614ab5e2c67"
content-hash = "4e157bc86980cecbccd26b3698a3747aa25bd900b3c3619edb2cb47aa841fb7d"
+3 -1
View File
@@ -99,6 +99,7 @@ reportlab = "*"
[tool.coverage.run]
concurrency = ["gevent"]
[tool.poetry.group.runtime.dependencies]
jupyterlab = "*"
notebook = "*"
@@ -127,12 +128,13 @@ ignore = ["D1"]
[tool.ruff.lint.pydocstyle]
convention = "google"
[tool.poetry.group.evaluation.dependencies]
streamlit = "*"
whatthepatch = "*"
retry = "*"
evaluate = "*"
swebench = "^3.0.8"
swebench = { git = "https://github.com/All-Hands-AI/SWE-Bench.git", branch = "xw/run-all-test" }
swegym = { git = "https://github.com/SWE-Gym/SWE-Bench-Package.git" }
commit0 = "*"
func_timeout = "*"