diff --git a/evaluation/EDA/scripts/run_infer.sh b/evaluation/EDA/scripts/run_infer.sh index 06107a7b2c..0e0d58ebdb 100755 --- a/evaluation/EDA/scripts/run_infer.sh +++ b/evaluation/EDA/scripts/run_infer.sh @@ -57,5 +57,3 @@ fi # Run the command echo $COMMAND eval $COMMAND - -checkout_original_branch diff --git a/evaluation/agent_bench/scripts/run_infer.sh b/evaluation/agent_bench/scripts/run_infer.sh index 11c0105d26..64023973af 100755 --- a/evaluation/agent_bench/scripts/run_infer.sh +++ b/evaluation/agent_bench/scripts/run_infer.sh @@ -36,5 +36,3 @@ fi # Run the command eval $COMMAND - -checkout_original_branch diff --git a/evaluation/biocoder/scripts/run_infer.sh b/evaluation/biocoder/scripts/run_infer.sh index 584529d51e..85c57e88b8 100755 --- a/evaluation/biocoder/scripts/run_infer.sh +++ b/evaluation/biocoder/scripts/run_infer.sh @@ -39,5 +39,3 @@ fi # Run the command echo $COMMAND eval $COMMAND - -checkout_original_branch diff --git a/evaluation/bird/scripts/run_infer.sh b/evaluation/bird/scripts/run_infer.sh index b042c715ee..7776d1f782 100755 --- a/evaluation/bird/scripts/run_infer.sh +++ b/evaluation/bird/scripts/run_infer.sh @@ -36,5 +36,3 @@ fi # Run the command eval $COMMAND - -checkout_original_branch diff --git a/evaluation/gaia/scripts/run_infer.sh b/evaluation/gaia/scripts/run_infer.sh index 2dcba3694e..9f50f9f444 100755 --- a/evaluation/gaia/scripts/run_infer.sh +++ b/evaluation/gaia/scripts/run_infer.sh @@ -47,5 +47,3 @@ fi # Run the command eval $COMMAND - -checkout_original_branch diff --git a/evaluation/gorilla/scripts/run_infer.sh b/evaluation/gorilla/scripts/run_infer.sh index 38b37b0b51..468b89c220 100644 --- a/evaluation/gorilla/scripts/run_infer.sh +++ b/evaluation/gorilla/scripts/run_infer.sh @@ -45,5 +45,3 @@ fi # Run the command eval $COMMAND - -checkout_original_branch diff --git a/evaluation/gpqa/scripts/run_infer.sh b/evaluation/gpqa/scripts/run_infer.sh index 2b004ff577..43326a30cd 100755 --- a/evaluation/gpqa/scripts/run_infer.sh +++ b/evaluation/gpqa/scripts/run_infer.sh @@ -44,5 +44,3 @@ fi # Run the command eval $COMMAND - -checkout_original_branch diff --git a/evaluation/humanevalfix/scripts/run_infer.sh b/evaluation/humanevalfix/scripts/run_infer.sh index ca5abaeb05..c61b1be309 100755 --- a/evaluation/humanevalfix/scripts/run_infer.sh +++ b/evaluation/humanevalfix/scripts/run_infer.sh @@ -74,5 +74,3 @@ fi # Run the command eval $COMMAND - -checkout_original_branch diff --git a/evaluation/logic_reasoning/scripts/run_infer.sh b/evaluation/logic_reasoning/scripts/run_infer.sh index c32e4c42c8..5c7748b3c9 100755 --- a/evaluation/logic_reasoning/scripts/run_infer.sh +++ b/evaluation/logic_reasoning/scripts/run_infer.sh @@ -40,5 +40,3 @@ fi # Run the command eval $COMMAND - -checkout_original_branch diff --git a/evaluation/miniwob/scripts/run_infer.sh b/evaluation/miniwob/scripts/run_infer.sh index 36278b8969..b1dc24a147 100755 --- a/evaluation/miniwob/scripts/run_infer.sh +++ b/evaluation/miniwob/scripts/run_infer.sh @@ -46,5 +46,3 @@ fi # Run the command eval $COMMAND - -checkout_original_branch diff --git a/evaluation/mint/scripts/run_infer.sh b/evaluation/mint/scripts/run_infer.sh index 5cac4beac8..6d6a6510b0 100644 --- a/evaluation/mint/scripts/run_infer.sh +++ b/evaluation/mint/scripts/run_infer.sh @@ -42,5 +42,3 @@ fi # Run the command eval $COMMAND - -checkout_original_branch diff --git a/evaluation/ml_bench/scripts/run_infer.sh b/evaluation/ml_bench/scripts/run_infer.sh index 9909276e7d..98bc528c63 100755 --- a/evaluation/ml_bench/scripts/run_infer.sh +++ b/evaluation/ml_bench/scripts/run_infer.sh @@ -46,5 +46,3 @@ fi # Run the command eval $COMMAND - -checkout_original_branch diff --git a/evaluation/toolqa/scripts/run_infer.sh b/evaluation/toolqa/scripts/run_infer.sh index 6035c71bd9..00d015c338 100644 --- a/evaluation/toolqa/scripts/run_infer.sh +++ b/evaluation/toolqa/scripts/run_infer.sh @@ -61,5 +61,3 @@ fi # Run the command eval $COMMAND - -checkout_original_branch diff --git a/evaluation/utils/version_control.sh b/evaluation/utils/version_control.sh index 38c2dd370f..f58f9bdce9 100644 --- a/evaluation/utils/version_control.sh +++ b/evaluation/utils/version_control.sh @@ -1,8 +1,16 @@ checkout_eval_branch() { if [ -z "$COMMIT_HASH" ]; then echo "Commit hash not specified, use current git commit" + build_sandbox return 0 fi + + if git diff --quiet $COMMIT_HASH HEAD; then + echo "The given hash is equivalent to the current HEAD" + build_sandbox + return 0 + fi + echo "Start to checkout opendevin version to $COMMIT_HASH, but keep current evaluation harness" if ! git diff-index --quiet HEAD --; then echo "There are uncommitted changes, please stash or commit them first" @@ -15,8 +23,20 @@ checkout_eval_branch() { echo "Failed to check out to $COMMIT_HASH" exit 1 fi + echo "Revert changes in evaluation folder" git checkout $current_branch -- evaluation + + # Trap the EXIT signal to checkout original branch + trap checkout_original_branch EXIT + + build_sandbox +} + +build_sandbox() { + echo "Build sandbox locally" + docker build -t eval-sandbox -f containers/sandbox/Dockerfile /tmp + export SANDBOX_CONTAINER_IMAGE="eval-sandbox" } checkout_original_branch() { diff --git a/evaluation/webarena/scripts/run_infer.sh b/evaluation/webarena/scripts/run_infer.sh index 1fa9ba13b2..ec6981c9e7 100755 --- a/evaluation/webarena/scripts/run_infer.sh +++ b/evaluation/webarena/scripts/run_infer.sh @@ -44,5 +44,3 @@ fi # Run the command eval $COMMAND - -checkout_original_branch