mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-01-08 22:38:05 -05:00
[Evaluation]: Log openhands version in eval output folder, instead of agent version (#5394)
This commit is contained in:
@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
if [ -z "$DATASET" ]; then
|
||||
echo "Dataset not specified, use default 'things'"
|
||||
@@ -34,12 +34,9 @@ if [ -z "$OPENAI_API_KEY" ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
|
||||
# We need to track the version of Agent in the evaluation to make sure results are comparable
|
||||
AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
|
||||
@@ -51,7 +48,7 @@ COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
|
||||
--max-iterations 20 \
|
||||
--OPENAI_API_KEY $OPENAI_API_KEY \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${DATASET}"
|
||||
--eval-note ${OPENHANDS_VERSION}_${DATASET}"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
|
||||
@@ -31,7 +31,7 @@ COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poe
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 30 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
--eval-note $OPENHANDS_VERSION"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE=$AGENT_VERSION
|
||||
EVAL_NOTE=$OPENHANDS_VERSION
|
||||
|
||||
# Default to NOT use unit tests.
|
||||
if [ -z "$USE_UNIT_TESTS" ]; then
|
||||
|
||||
@@ -21,10 +21,10 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
|
||||
@@ -33,7 +33,7 @@ COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${DATASET}"
|
||||
--eval-note ${OPENHANDS_VERSION}_${DATASET}"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
|
||||
@@ -31,7 +31,7 @@ COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 5 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION" \
|
||||
--eval-note $OPENHANDS_VERSION" \
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -20,13 +20,13 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE="$AGENT_VERSION"
|
||||
EVAL_NOTE="$OPENHANDS_VERSION"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
|
||||
@@ -61,10 +61,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
|
||||
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
|
||||
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
echo "HF SPLIT: $SPLIT"
|
||||
@@ -75,7 +75,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
|
||||
export USE_HINT_TEXT=false
|
||||
fi
|
||||
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
|
||||
EVAL_NOTE="$AGENT_VERSION"
|
||||
EVAL_NOTE="$OPENHANDS_VERSION"
|
||||
# if not using Hint, add -no-hint to the eval note
|
||||
if [ "$USE_HINT_TEXT" = false ]; then
|
||||
EVAL_NOTE="$EVAL_NOTE-no-hint"
|
||||
|
||||
@@ -23,10 +23,10 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
|
||||
@@ -35,7 +35,7 @@ COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
|
||||
--max-iterations 10 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
--eval-note $OPENHANDS_VERSION"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -21,17 +21,17 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
if [ -z "$LEVELS" ]; then
|
||||
LEVELS="2023_level1"
|
||||
echo "Levels not specified, use default $LEVELS"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "LEVELS: $LEVELS"
|
||||
|
||||
@@ -42,7 +42,7 @@ COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
|
||||
--level $LEVELS \
|
||||
--data-split validation \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${LEVELS}"
|
||||
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
if [ -z "$HUBS" ]; then
|
||||
HUBS="hf,torch,tf"
|
||||
@@ -29,7 +29,7 @@ if [ -z "$HUBS" ]; then
|
||||
fi
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "HUBS: $HUBS"
|
||||
|
||||
@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \
|
||||
--hubs $HUBS \
|
||||
--data-split validation \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${LEVELS}"
|
||||
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -27,10 +27,10 @@ if [ -z "$DATA_SPLIT" ]; then
|
||||
DATA_SPLIT="gpqa_diamond"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
|
||||
@@ -39,7 +39,7 @@ COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
|
||||
--max-iterations 10 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--data-split $DATA_SPLIT \
|
||||
--eval-note $AGENT_VERSION"
|
||||
--eval-note $OPENHANDS_VERSION"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -58,10 +58,10 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
|
||||
@@ -69,7 +69,7 @@ COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
--eval-note $OPENHANDS_VERSION"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -28,10 +28,10 @@ if [ -z "$DATASET" ]; then
|
||||
DATASET="ProofWriter"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
|
||||
@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
|
||||
--dataset $DATASET \
|
||||
--max-iterations 10 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
--eval-note $OPENHANDS_VERSION"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -25,13 +25,13 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="BrowsingAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
|
||||
EVAL_NOTE="${OPENHANDS_VERSION}_${NOTE}"
|
||||
|
||||
COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
|
||||
@@ -18,10 +18,10 @@ checkout_eval_branch
|
||||
# Only 'CodeActAgent' is supported for MINT now
|
||||
AGENT="CodeActAgent"
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
|
||||
export PYTHONPATH=$(pwd)
|
||||
|
||||
|
||||
@@ -26,10 +26,10 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
|
||||
@@ -37,7 +37,7 @@ COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
--eval-note $OPENHANDS_VERSION"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -26,10 +26,10 @@ if [ -z "$USE_KNOWLEDGE" ]; then
|
||||
USE_KNOWLEDGE=false
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \
|
||||
@@ -38,7 +38,7 @@ COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py
|
||||
--use_knowledge $USE_KNOWLEDGE \
|
||||
--max-iterations 30 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION" \
|
||||
--eval-note $OPENHANDS_VERSION" \
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -55,10 +55,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
|
||||
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
|
||||
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
echo "SPLIT: $SPLIT"
|
||||
@@ -68,7 +68,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
|
||||
export USE_HINT_TEXT=false
|
||||
fi
|
||||
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
|
||||
EVAL_NOTE="$AGENT_VERSION"
|
||||
EVAL_NOTE="$OPENHANDS_VERSION"
|
||||
# if not using Hint, add -no-hint to the eval note
|
||||
if [ "$USE_HINT_TEXT" = false ]; then
|
||||
EVAL_NOTE="$EVAL_NOTE-no-hint"
|
||||
|
||||
@@ -38,10 +38,10 @@ if [ -z "$WOLFRAM_APPID" ]; then
|
||||
echo "WOLFRAM_APPID not specified"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
echo "HARDNESS: $HARDNESS"
|
||||
@@ -56,7 +56,7 @@ COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \
|
||||
--wolfram_alpha_appid $WOLFRAM_APPID\
|
||||
--data-split validation \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${LEVELS}"
|
||||
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -27,13 +27,13 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="BrowsingAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE="$AGENT_VERSION"
|
||||
EVAL_NOTE="$OPENHANDS_VERSION"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
|
||||
@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE=$AGENT_VERSION
|
||||
EVAL_NOTE=$OPENHANDS_VERSION
|
||||
|
||||
# Default to NOT use unit tests.
|
||||
if [ -z "$USE_UNIT_TESTS" ]; then
|
||||
|
||||
@@ -39,8 +39,8 @@ checkout_original_branch() {
|
||||
git checkout $current_branch
|
||||
}
|
||||
|
||||
get_agent_version() {
|
||||
get_openhands_version() {
|
||||
# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
|
||||
# We need to track the version of Agent in the evaluation to make sure results are comparable
|
||||
AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
|
||||
OPENHANDS_VERSION=v$(poetry run python -c "from openhands import get_version; print(get_version())")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user