mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-04-29 03:00:45 -04:00
* prepare execution and inference * Create README.md * Update README.md * Update evaluation/biocoder/README.md * Update evaluation/swe_bench/swe_env_box.py * switch to biocoder docker container and test-specific code * code for copying and running test files into container * add metrics * add readme * Biocoder evaluation code finished (rewrite testing infrastructure, prompt tuning, and bug fixes) * Update README.md --------- Co-authored-by: lilbillybiscuit <qianbill2014@outlook.com> Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com> Co-authored-by: yufansong <yufan@risingwave-labs.com>
38 lines
1.0 KiB
Bash
Executable File
38 lines
1.0 KiB
Bash
Executable File
#!/bin/bash
#
# Launch the BioCoder inference run (evaluation/biocoder/run_infer.py) via poetry.
#
# Usage: <this script> MODEL_CONFIG [AGENT] [EVAL_LIMIT]
#   MODEL_CONFIG  required; forwarded to run_infer.py as --llm-config
#   AGENT         optional; forwarded as --agent-cls (default: CodeActAgent)
#   EVAL_LIMIT    optional; when non-empty, forwarded as --eval-n-limit
#
# The exit status is that of the run_infer.py invocation, or non-zero if the
# arguments are invalid or the agent version cannot be determined.
set -euo pipefail

# ${N:-} keeps omitted optional arguments from tripping `set -u`.
MODEL_CONFIG=${1:-}
AGENT=${2:-}
EVAL_LIMIT=${3:-}
DATASET="biocoder"

# Without a value, --llm-config would be left with no argument; fail early.
if [[ -z "$MODEL_CONFIG" ]]; then
  echo "Usage: $0 MODEL_CONFIG [AGENT] [EVAL_LIMIT]" >&2
  exit 2
fi

if [[ -z "$AGENT" ]]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
# We need to track the version of Agent in the evaluation to make sure results are comparable
#
# The agent name is passed to Python through sys.argv instead of being pasted
# into the source text, so unusual characters in it cannot alter the code.
if ! AGENT_VERSION=$(poetry run python -c \
  'import sys; import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls(sys.argv[1]).VERSION)' \
  "$AGENT"); then
  echo "Failed to determine VERSION for agent: $AGENT" >&2
  exit 1
fi
AGENT_VERSION="v${AGENT_VERSION}"

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"

# Build the command as an argv array: each element reaches the program
# verbatim, with no second round of shell parsing (which `eval` on a
# concatenated string would perform).
COMMAND=(
  poetry run python evaluation/biocoder/run_infer.py
  --agent-cls "$AGENT"
  --llm-config "$MODEL_CONFIG"
  --max-iterations 10
  --max-chars 10000000
  --eval-num-workers 1
  --eval-note "${AGENT_VERSION}_${DATASET}"
)

if [[ -n "$EVAL_LIMIT" ]]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
fi

# Run the command (log it first, space-joined on a single line)
printf '%s\n' "${COMMAND[*]}"
"${COMMAND[@]}"