feat(evaluation): Add multi-swe-bench dependency and fix rollout script (#11326)

Co-authored-by: Graham Neubig <neubig@gmail.com>
This commit is contained in:
Kevin Musgrave
2025-10-16 07:35:19 -07:00
committed by GitHub
parent f42a4f75cb
commit a237b578c0
3 changed files with 37 additions and 16 deletions

View File

@@ -111,15 +111,10 @@ for run_idx in $(seq 1 $N_RUNS); do
echo "### Evaluating on $OUTPUT_FILE ... ###"
OUTPUT_CONFIG_FILE="${OUTPUT_FILE%.jsonl}_config.json"
export EVAL_SKIP_BUILD_ERRORS=true
pip install multi-swe-bench --quiet --disable-pip-version-check > /dev/null 2>&1
COMMAND="poetry run python ./evaluation/benchmarks/multi_swe_bench/scripts/eval/update_multi_swe_bench_config.py --input $OUTPUT_FILE --output $OUTPUT_CONFIG_FILE --dataset $EVAL_DATASET;
python -m multi_swe_bench.harness.run_evaluation --config $OUTPUT_CONFIG_FILE
poetry run python -m multi_swe_bench.harness.run_evaluation --config $OUTPUT_CONFIG_FILE
"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
echo "Running command: $COMMAND"
# Run the command
eval $COMMAND