mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-04-29 03:00:45 -04:00
Compare commits
2 Commits
uv-migrati
...
uv-migrati
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6766948fb1 | ||
|
|
8a5d8f7006 |
@@ -14,7 +14,7 @@ you can clone the OpenHands project directly.
|
||||
- [Docker](https://docs.docker.com/engine/install/) (For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
|
||||
- [Python](https://www.python.org/downloads/) = 3.12
|
||||
- [NodeJS](https://nodejs.org/en/download/package-manager) >= 22.x
|
||||
- [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8
|
||||
- [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8 or [UV](https://docs.astral.sh/uv/getting-started/installation/) >= 0.4
|
||||
- OS-specific dependencies:
|
||||
- Ubuntu: build-essential => `sudo apt-get install build-essential python3.12-dev`
|
||||
- WSL: netcat => `sudo apt-get install netcat`
|
||||
@@ -42,10 +42,12 @@ If you want to develop without system admin/sudo access to upgrade/install `Pyth
|
||||
curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
|
||||
bash Miniforge3-$(uname)-$(uname -m).sh
|
||||
|
||||
# Install Python 3.12, nodejs, and poetry
|
||||
# Install Python 3.12, nodejs, and poetry (or uv)
|
||||
mamba install python=3.12
|
||||
mamba install conda-forge::nodejs
|
||||
mamba install conda-forge::poetry
|
||||
# Or install UV instead of Poetry:
|
||||
# curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
```
|
||||
|
||||
### 2. Build and Setup The Environment
|
||||
@@ -148,14 +150,25 @@ To run tests, refer to the following:
|
||||
#### Unit tests
|
||||
|
||||
```bash
|
||||
# Using Poetry (default)
|
||||
poetry run pytest ./tests/unit/test_*.py
|
||||
|
||||
# Using UV
|
||||
uv run pytest ./tests/unit/test_*.py
|
||||
```
|
||||
|
||||
### 9. Add or update dependency
|
||||
|
||||
#### Using Poetry (default)
|
||||
1. Add your dependency in `pyproject.toml` or use `poetry add xxx`.
|
||||
2. Update the poetry.lock file via `poetry lock --no-update`.
|
||||
|
||||
#### Using UV
|
||||
1. Add your dependency in `pyproject.toml` or use `uv add xxx`.
|
||||
2. Update the uv.lock file via `uv lock`.
|
||||
|
||||
**Note:** The project supports both Poetry and UV for dependency management. To use UV instead of Poetry, set `USE_UV=1` when running make commands (e.g., `USE_UV=1 make build`).
|
||||
|
||||
### 10. Use existing Docker image
|
||||
|
||||
To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -40,7 +43,7 @@ echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/EDA/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--dataset $DATASET \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -26,7 +29,7 @@ echo "AGENT: $AGENT"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
|
||||
COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && $PKG_RUN python evaluation/benchmarks/agent_bench/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 30 \
|
||||
|
||||
@@ -74,13 +74,13 @@ export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
|
||||
## Summarize Results
|
||||
|
||||
```bash
|
||||
poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
|
||||
# If you use UV instead of Poetry, replace `poetry run` with `uv run`.
poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
|
||||
```
|
||||
|
||||
Full example:
|
||||
|
||||
```bash
|
||||
poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl
|
||||
# If you use UV instead of Poetry, replace `poetry run` with `uv run`.
poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl
|
||||
```
|
||||
|
||||
This will list the instances that passed and the instances that failed. For each
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -39,7 +42,7 @@ if [ "$USE_UNIT_TESTS" = true ]; then
|
||||
EVAL_NOTE=$EVAL_NOTE-w-test
|
||||
fi
|
||||
|
||||
COMMAND="export PYTHONPATH=evaluation/benchmarks/aider_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/aider_bench/run_infer.py \
|
||||
COMMAND="export PYTHONPATH=evaluation/benchmarks/aider_bench:\$PYTHONPATH && $PKG_RUN python evaluation/benchmarks/aider_bench/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 30 \
|
||||
|
||||
@@ -31,9 +31,9 @@ Each task directory (`tasks/algotune-*`) contains:
|
||||
Use the main evaluation script:
|
||||
|
||||
```bash
|
||||
poetry run python evaluation/benchmarks/algotune/adapter/run_adapter.py --output-path evaluation/benchmarks/algotune/tasks
|
||||
# If you use UV instead of Poetry, replace `poetry run` with `uv run`.
poetry run python evaluation/benchmarks/algotune/adapter/run_adapter.py --output-path evaluation/benchmarks/algotune/tasks
|
||||
|
||||
poetry run python evaluation/benchmarks/algotune/run_infer.py \
|
||||
# If you use UV instead of Poetry, replace `poetry run` with `uv run`.
poetry run python evaluation/benchmarks/algotune/run_infer.py \
|
||||
--agent-cls CodeActAgent \
|
||||
--llm-config llm.gpt-5 \
|
||||
--optim_task all \
|
||||
|
||||
@@ -2,10 +2,13 @@
|
||||
set -eo pipefail
|
||||
|
||||
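# Generate the tasks
# NOTE(review): PKG_RUN is only assigned further down, after version_control.sh is sourced.
# Unless the caller exported PKG_RUN, it expands to nothing here and a bare `python` is run — confirm and reorder.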
|
||||
poetry run python evaluation/benchmarks/algotune/adapter/run_adapter.py --output-path evaluation/benchmarks/algotune/tasks
|
||||
$PKG_RUN python evaluation/benchmarks/algotune/adapter/run_adapter.py --output-path evaluation/benchmarks/algotune/tasks
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -59,7 +62,7 @@ fi
|
||||
echo "ENABLE_VOLUMES: $ENABLE_VOLUMES"
|
||||
|
||||
# Construct the command
|
||||
COMMAND="poetry run python evaluation/benchmarks/algotune/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/algotune/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--optim_task $OPTIM_TASK \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -28,7 +31,7 @@ echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/biocoder/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -26,7 +29,7 @@ echo "AGENT: $AGENT"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/bird/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 5 \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -28,7 +31,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE="$OPENHANDS_VERSION"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/browsing_delegation/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 1 \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
REPO_SPLIT=$1
|
||||
MODEL_CONFIG=$2
|
||||
COMMIT_HASH=$3
|
||||
@@ -84,7 +87,7 @@ fi
|
||||
|
||||
function run_eval() {
|
||||
local eval_note=$1
|
||||
COMMAND="poetry run python evaluation/benchmarks/commit0/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/commit0/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -29,7 +32,7 @@ echo "AGENT: $AGENT"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/discoverybench/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -36,7 +39,7 @@ echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "LEVELS: $LEVELS"
|
||||
|
||||
COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
|
||||
COMMAND="$PKG_RUN python ./evaluation/benchmarks/gaia/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 60 \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -33,7 +36,7 @@ echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "HUBS: $HUBS"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/gorilla/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 30 \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
EVAL_LIMIT=$3
|
||||
@@ -33,7 +36,7 @@ echo "AGENT: $AGENT"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/gpqa/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -64,7 +67,7 @@ echo "AGENT: $AGENT"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/humanevalfix/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
PROCESS_FILEPATH=$1
|
||||
if [ -z "$PROCESS_FILEPATH" ]; then
|
||||
echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]"
|
||||
@@ -21,7 +24,7 @@ if [ -n "$EXP_NAME" ]; then
|
||||
fi
|
||||
|
||||
function run_eval() {
|
||||
COMMAND="poetry run python ./evaluation/benchmarks/lca_ci_build_repair/eval_infer.py \
|
||||
COMMAND="$PKG_RUN python ./evaluation/benchmarks/lca_ci_build_repair/eval_infer.py \
|
||||
--predictions-path $PROCESS_FILEPATH "
|
||||
|
||||
echo "RUNNING: $COMMAND"
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
|
||||
get_openhands_version
|
||||
@@ -16,7 +19,7 @@ if [ -n "$EXP_NAME" ]; then
|
||||
fi
|
||||
|
||||
function run_eval() {
|
||||
COMMAND="poetry run python ./evaluation/benchmarks/lca_ci_build_repair/run_infer.py \
|
||||
COMMAND="$PKG_RUN python ./evaluation/benchmarks/lca_ci_build_repair/run_infer.py \
|
||||
--llm-config $MODEL_CONFIG "
|
||||
|
||||
# Run the command
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
DATASET=$2
|
||||
COMMIT_HASH=$3
|
||||
@@ -34,7 +37,7 @@ echo "AGENT: $AGENT"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/logic_reasoning/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--dataset $DATASET \
|
||||
|
||||
@@ -39,7 +39,7 @@ Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`
|
||||
To calculate the average reward, run:
|
||||
|
||||
```sh
|
||||
poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl
|
||||
# If you use UV instead of Poetry, replace `poetry run` with `uv run`.
poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl
|
||||
```
|
||||
|
||||
## Submit your evaluation results
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
# configure browsing agent
|
||||
export USE_NAV="false"
|
||||
export USE_CONCISE_ANSWER="true"
|
||||
@@ -33,7 +36,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE="${OPENHANDS_VERSION}_${NOTE}"
|
||||
|
||||
COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \
|
||||
COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && $PKG_RUN python evaluation/benchmarks/miniwob/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
SUBSET=$3
|
||||
@@ -25,7 +28,7 @@ echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
|
||||
export PYTHONPATH=$(pwd)
|
||||
|
||||
COMMAND="poetry run python ./evaluation/mint/run_infer.py \
|
||||
COMMAND="$PKG_RUN python ./evaluation/mint/run_infer.py \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 5 \
|
||||
--max-propose-solution 2 \
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
|
||||
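# Package manager runner - defaults to `poetry run`; override by exporting PKG_RUN (e.g. PKG_RUN="uv run").
# Note: this assignment reads only PKG_RUN; USE_UV is not consulted here.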
|
||||
PKG_RUN=${PKG_RUN:-poetry run}
|
||||
RESULT_FILE=$1
|
||||
MODEL_CONFIG=$2
|
||||
|
||||
@@ -17,7 +20,7 @@ fi
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "RESULT_FILE: $RESULT_FILE"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_analysis.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/ml_bench/run_analysis.py \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--json_file_path $RESULT_FILE"
|
||||
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
SPLIT=$3
|
||||
@@ -32,7 +35,7 @@ echo "AGENT: $AGENT"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/ml_bench/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
|
||||
@@ -29,7 +29,7 @@ DATASET="${EVAL_DATASET%.jsonl}_with_runtime_.jsonl" # path to converted datase
|
||||
|
||||
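# Create the converted dataset file
# NOTE(review): in the visible hunks PKG_RUN is first assigned later, under "Run inference".
# Confirm it is set before this point; otherwise $PKG_RUN expands to nothing and a bare `python` is run.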
|
||||
echo "Creating converted dataset at: $DATASET"
|
||||
poetry run python ./evaluation/benchmarks/multi_swe_bench/scripts/data/data_change.py --input "$EVAL_DATASET" --output "$DATASET"
|
||||
$PKG_RUN python ./evaluation/benchmarks/multi_swe_bench/scripts/data/data_change.py --input "$EVAL_DATASET" --output "$DATASET"
|
||||
|
||||
SPLIT="train"
|
||||
export LANGUAGE=java
|
||||
@@ -45,6 +45,9 @@ fi
|
||||
|
||||
# ===== Run inference =====
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
get_openhands_version
|
||||
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
@@ -64,7 +67,7 @@ function run_eval() {
|
||||
export LANGUAGE=java
|
||||
echo "About to run command"
|
||||
COMMAND="EVAL_DOCKER_IMAGE_PREFIX=$EVAL_DOCKER_IMAGE_PREFIX; LANGUAGE=java;
|
||||
poetry run python evaluation/benchmarks/multi_swe_bench/run_infer.py \
|
||||
$PKG_RUN python evaluation/benchmarks/multi_swe_bench/run_infer.py \
|
||||
--agent-cls CodeActAgent \
|
||||
--llm-config $MODEL \
|
||||
--max-iterations $MAX_ITER \
|
||||
@@ -90,7 +93,7 @@ function run_eval() {
|
||||
for run_idx in $(seq 1 $N_RUNS); do
|
||||
if [ -n "$SKIP_IDS_THRESHOLD" ]; then
|
||||
echo "Computing SKIP_IDS for run $run_idx..."
|
||||
SKIP_CMD="poetry run python evaluation/benchmarks/multi_swe_bench/compute_skip_ids.py $SKIP_IDS_THRESHOLD"
|
||||
SKIP_CMD="$PKG_RUN python evaluation/benchmarks/multi_swe_bench/compute_skip_ids.py $SKIP_IDS_THRESHOLD"
|
||||
if [ -n "$SKIP_IDS_PATTERN" ]; then
|
||||
SKIP_CMD="$SKIP_CMD --pattern \"$SKIP_IDS_PATTERN\""
|
||||
fi
|
||||
@@ -150,8 +153,8 @@ for run_idx in $(seq 1 $N_RUNS); do
|
||||
echo "### Evaluating on $OUTPUT_FILE ... ###"
|
||||
OUTPUT_CONFIG_FILE="${OUTPUT_FILE%.jsonl}_config.json"
|
||||
export EVAL_SKIP_BUILD_ERRORS=true
|
||||
COMMAND="poetry run python ./evaluation/benchmarks/multi_swe_bench/scripts/eval/update_multi_swe_bench_config.py --input $OUTPUT_FILE --output $OUTPUT_CONFIG_FILE --dataset $EVAL_DATASET;
|
||||
poetry run python -m multi_swe_bench.harness.run_evaluation --config $OUTPUT_CONFIG_FILE
|
||||
COMMAND="$PKG_RUN python ./evaluation/benchmarks/multi_swe_bench/scripts/eval/update_multi_swe_bench_config.py --input $OUTPUT_FILE --output $OUTPUT_CONFIG_FILE --dataset $EVAL_DATASET;
|
||||
$PKG_RUN python -m multi_swe_bench.harness.run_evaluation --config $OUTPUT_CONFIG_FILE
|
||||
"
|
||||
|
||||
echo "Running command: $COMMAND"
|
||||
@@ -170,10 +173,10 @@ for run_idx in $(seq 1 $N_RUNS); do
|
||||
|
||||
# update the output with evaluation results
|
||||
echo "### Updating the output with evaluation results... ###"
|
||||
poetry run python evaluation/benchmarks/multi_swe_bench/scripts/eval/update_output_with_eval.py $OUTPUT_FILE
|
||||
$PKG_RUN python evaluation/benchmarks/multi_swe_bench/scripts/eval/update_output_with_eval.py $OUTPUT_FILE
|
||||
|
||||
echo "### Combining the final completions... ###"
|
||||
poetry run python evaluation/benchmarks/multi_swe_bench/scripts/eval/combine_final_completions.py $OUTPUT_FILE
|
||||
$PKG_RUN python evaluation/benchmarks/multi_swe_bench/scripts/eval/combine_final_completions.py $OUTPUT_FILE
|
||||
|
||||
echo "### DONE for run $run_idx! ###"
|
||||
echo "You can find the final output at $(dirname $OUTPUT_FILE)/$FINAL_OUTPUT_FILE"
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -115,7 +118,7 @@ fi
|
||||
|
||||
function run_eval() {
|
||||
local eval_note=$1
|
||||
COMMAND="poetry run python evaluation/benchmarks/multi_swe_bench/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/multi_swe_bench/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -103,7 +106,7 @@ fi
|
||||
|
||||
function run_eval() {
|
||||
local eval_note="${1}"
|
||||
COMMAND="poetry run python evaluation/benchmarks/nocode_bench/run_infer_nc.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/nocode_bench/run_infer_nc.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
USE_KNOWLEDGE=$3
|
||||
@@ -32,7 +35,7 @@ echo "AGENT: $AGENT"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/scienceagentbench/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--use-knowledge $USE_KNOWLEDGE \
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
|
||||
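# Package manager runner - defaults to `poetry run`; override by exporting PKG_RUN (e.g. PKG_RUN="uv run").
# Note: this assignment reads only PKG_RUN; USE_UV is not consulted here.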
|
||||
PKG_RUN=${PKG_RUN:-poetry run}
|
||||
FOLDER_PATH=$1
|
||||
NEW_FOLDER_PATH=${FOLDER_PATH}.swebench_submission
|
||||
mkdir -p $NEW_FOLDER_PATH
|
||||
|
||||
# Build all_preds.jsonl
|
||||
poetry run python evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
|
||||
$PKG_RUN python evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
|
||||
mv $FOLDER_PATH/output.swebench.jsonl $NEW_FOLDER_PATH/all_preds.jsonl
|
||||
|
||||
# Build trajs/
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
|
||||
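# Package manager runner - defaults to `poetry run`; override by exporting PKG_RUN (e.g. PKG_RUN="uv run").
# Note: this assignment reads only PKG_RUN; USE_UV is not consulted here.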
|
||||
PKG_RUN=${PKG_RUN:-poetry run}
|
||||
PROCESS_FILEPATH=$1
|
||||
if [ -z "$PROCESS_FILEPATH" ]; then
|
||||
echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]"
|
||||
@@ -66,7 +69,7 @@ else
|
||||
|
||||
# ==== Convert OH format to SWE-bench format ====
|
||||
echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
|
||||
poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
|
||||
$PKG_RUN python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
|
||||
# replace .jsonl with .swebench.jsonl in filename
|
||||
SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
|
||||
echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
|
||||
@@ -97,7 +100,7 @@ if [ -z "$INSTANCE_ID" ]; then
|
||||
# Default to SWE-Bench-lite
|
||||
# change `--dataset_name` and `--split` to alter dataset
|
||||
|
||||
poetry run python -m swebench.harness.run_evaluation \
|
||||
$PKG_RUN python -m swebench.harness.run_evaluation \
|
||||
--dataset_name "$DATASET_NAME" \
|
||||
--split "$SPLIT" \
|
||||
--predictions_path $SWEBENCH_FORMAT_JSONL \
|
||||
@@ -140,11 +143,11 @@ if [ -z "$INSTANCE_ID" ]; then
|
||||
mv $REPORT_PATH $RESULT_OUTPUT_DIR/report.json
|
||||
fi
|
||||
|
||||
poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH
|
||||
$PKG_RUN python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH
|
||||
|
||||
else
|
||||
echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
|
||||
poetry run python -m swebench.harness.run_evaluation \
|
||||
$PKG_RUN python -m swebench.harness.run_evaluation \
|
||||
--dataset_name "$DATASET_NAME" \
|
||||
--split "$SPLIT" \
|
||||
--predictions_path $SWEBENCH_FORMAT_JSONL \
|
||||
|
||||
@@ -35,6 +35,9 @@ MAX_ITER=100
|
||||
|
||||
# ===== Run inference =====
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
get_openhands_version
|
||||
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
@@ -51,7 +54,7 @@ EVAL_NOTE="$OPENHANDS_VERSION-no-hint-$EXP_NAME"
|
||||
|
||||
function run_eval() {
|
||||
local eval_note=$1
|
||||
COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/swe_bench/run_infer.py \
|
||||
--agent-cls CodeActAgent \
|
||||
--llm-config $MODEL \
|
||||
--max-iterations $MAX_ITER \
|
||||
@@ -97,7 +100,7 @@ for run_idx in $(seq 1 $N_RUNS); do
|
||||
|
||||
while true; do
|
||||
echo "### Evaluating on $OUTPUT_FILE ... ###"
|
||||
COMMAND="poetry run python evaluation/benchmarks/swe_bench/eval_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/swe_bench/eval_infer.py \
|
||||
--eval-num-workers $((N_WORKERS * 2)) \
|
||||
--input-file $OUTPUT_FILE \
|
||||
--dataset $DATASET \
|
||||
@@ -123,10 +126,10 @@ for run_idx in $(seq 1 $N_RUNS); do
|
||||
|
||||
# update the output with evaluation results
|
||||
echo "### Updating the output with evaluation results... ###"
|
||||
poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $OUTPUT_FILE
|
||||
$PKG_RUN python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $OUTPUT_FILE
|
||||
|
||||
echo "### Combining the final completions... ###"
|
||||
poetry run python evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py $OUTPUT_FILE
|
||||
$PKG_RUN python evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py $OUTPUT_FILE
|
||||
|
||||
echo "### DONE for run $run_idx! ###"
|
||||
echo "You can find the final output at $(dirname $OUTPUT_FILE)/$FINAL_OUTPUT_FILE"
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -103,7 +106,7 @@ fi
|
||||
|
||||
function run_eval() {
|
||||
local eval_note="${1}"
|
||||
COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/swe_bench/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -91,7 +94,7 @@ fi
|
||||
|
||||
function run_eval() {
|
||||
local eval_note="${1}"
|
||||
COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_infer_interact.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/swe_bench/run_infer_interact.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -77,7 +80,7 @@ fi
|
||||
|
||||
function run_eval() {
|
||||
local eval_note=$1
|
||||
COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_localize.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/swe_bench/run_localize.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -103,7 +106,7 @@ fi
|
||||
|
||||
function run_eval() {
|
||||
local eval_note="${1}"
|
||||
COMMAND="poetry run python evaluation/benchmarks/swe_perf/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/swe_perf/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -107,7 +110,7 @@ export NO_CHANGE_TIMEOUT_SECONDS=900 # 15 minutes
|
||||
|
||||
function run_eval() {
|
||||
local eval_note="${1}"
|
||||
COMMAND="poetry run python evaluation/benchmarks/swefficiency/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/swefficiency/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
#!/bin/bash
|
||||
|
||||
|
||||
# Package manager runner used to launch Python in this script.
# Precedence:
#   1. PKG_RUN already set (non-empty) in the environment - used verbatim
#   2. USE_UV=1                                           - "uv run"
#   3. otherwise                                          - "poetry run"
# The value is expanded unquoted later on purpose so that it splits into
# the runner and its sub-command (e.g. `uv` + `run`).
if [ -z "${PKG_RUN:-}" ]; then
  if [ "${USE_UV:-0}" = "1" ]; then
    PKG_RUN="uv run"
  else
    PKG_RUN="poetry run"
  fi
fi
|
||||
FOLDER_PATH=$1
|
||||
NEW_FOLDER_PATH=${FOLDER_PATH}.swebench_submission
|
||||
mkdir -p $NEW_FOLDER_PATH
|
||||
|
||||
# Build all_preds.jsonl
|
||||
poetry run python evaluation/testgeneval/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
|
||||
$PKG_RUN python evaluation/testgeneval/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
|
||||
mv $FOLDER_PATH/output.swebench.jsonl $NEW_FOLDER_PATH/all_preds.jsonl
|
||||
|
||||
# Build trajs/
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
#!/bin/bash
|
||||
set -eo pipefail
|
||||
|
||||
# Package manager runner used to launch Python in this script.
# Precedence:
#   1. PKG_RUN already set (non-empty) in the environment - used verbatim
#   2. USE_UV=1                                           - "uv run"
#   3. otherwise                                          - "poetry run"
# The value is interpolated into COMMAND below and run through `eval`.
if [ -z "${PKG_RUN:-}" ]; then
  if [ "${USE_UV:-0}" = "1" ]; then
    PKG_RUN="uv run"
  else
    PKG_RUN="poetry run"
  fi
fi
|
||||
|
||||
INPUT_FILE=$1
|
||||
NUM_WORKERS=$2
|
||||
DATASET=$3
|
||||
@@ -29,7 +32,7 @@ fi
|
||||
|
||||
echo "... Evaluating on $INPUT_FILE ..."
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/testgeneval/eval_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/testgeneval/eval_infer.py \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--input-file $INPUT_FILE \
|
||||
--dataset $DATASET \
|
||||
@@ -50,4 +53,4 @@ echo $COMMAND
|
||||
eval $COMMAND
|
||||
|
||||
# update the output with evaluation results
|
||||
# poetry run python evaluation/benchmarks/testgeneval/scripts/eval/update_output_with_eval.py $INPUT_FILE
|
||||
# $PKG_RUN python evaluation/benchmarks/testgeneval/scripts/eval/update_output_with_eval.py $INPUT_FILE
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -85,7 +88,7 @@ fi
|
||||
|
||||
function run_eval() {
|
||||
local eval_note=$1
|
||||
COMMAND="poetry run python evaluation/benchmarks/testgeneval/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/testgeneval/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
|
||||
# Package manager runner used to launch Python in this script.
# Precedence:
#   1. PKG_RUN already set (non-empty) in the environment - used verbatim
#   2. USE_UV=1                                           - "uv run"
#   3. otherwise                                          - "poetry run"
# The value is interpolated into the per-task COMMAND string further down.
if [ -z "${PKG_RUN:-}" ]; then
  if [ "${USE_UV:-0}" = "1" ]; then
    PKG_RUN="uv run"
  else
    PKG_RUN="poetry run"
  fi
fi
|
||||
##################################################################################################
|
||||
# Adapted from https://github.com/TheAgentCompany/TheAgentCompany/blob/main/evaluation/run_eval.sh
|
||||
##################################################################################################
|
||||
@@ -145,7 +148,7 @@ while IFS= read -r task_image; do
|
||||
docker pull $task_image
|
||||
|
||||
# Build the Python command
|
||||
COMMAND="poetry run python -m evaluation.benchmarks.the_agent_company.run_infer \
|
||||
COMMAND="$PKG_RUN python -m evaluation.benchmarks.the_agent_company.run_infer \
|
||||
--agent-llm-config \"$AGENT_LLM_CONFIG\" \
|
||||
--env-llm-config \"$ENV_LLM_CONFIG\" \
|
||||
--outputs-path \"$OUTPUTS_PATH\" \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -47,7 +50,7 @@ echo "DATASET: $DATASET"
|
||||
echo "HARDNESS: $HARDNESS"
|
||||
echo "WOLFRAM_APPID: $WOLFRAM_APPID"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/toolqa/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 30 \
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
#!/bin/bash
|
||||
|
||||
|
||||
# Package manager runner used to launch Python in this script.
# Precedence:
#   1. PKG_RUN already set (non-empty) in the environment - used verbatim
#   2. USE_UV=1                                           - "uv run"
#   3. otherwise                                          - "poetry run"
# The value is expanded unquoted later on purpose so that it splits into
# the runner and its sub-command (e.g. `uv` + `run`).
if [ -z "${PKG_RUN:-}" ]; then
  if [ "${USE_UV:-0}" = "1" ]; then
    PKG_RUN="uv run"
  else
    PKG_RUN="poetry run"
  fi
fi
|
||||
PROCESS_FILEPATH=$1
|
||||
if [ -z "$PROCESS_FILEPATH" ]; then
|
||||
echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]"
|
||||
@@ -58,7 +61,7 @@ else
|
||||
|
||||
# ==== Convert OH format to SWE-bench format ====
|
||||
echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
|
||||
poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
|
||||
$PKG_RUN python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
|
||||
# replace .jsonl with .swebench.jsonl in filename
|
||||
SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
|
||||
echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
|
||||
@@ -83,7 +86,7 @@ if [ -z "$INSTANCE_ID" ]; then
|
||||
# Default to SWE-Bench-lite
|
||||
# change `--dataset_name` and `--split` to alter dataset
|
||||
|
||||
poetry run python -m visualswebench.harness.run_evaluation \
|
||||
$PKG_RUN python -m visualswebench.harness.run_evaluation \
|
||||
--dataset_name "$DATASET_NAME" \
|
||||
--split "$SPLIT" \
|
||||
--predictions_path $SWEBENCH_FORMAT_JSONL \
|
||||
@@ -125,11 +128,11 @@ if [ -z "$INSTANCE_ID" ]; then
|
||||
mv $REPORT_PATH $RESULT_OUTPUT_DIR/report.json
|
||||
fi
|
||||
|
||||
poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH
|
||||
$PKG_RUN python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH
|
||||
|
||||
else
|
||||
echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
|
||||
poetry run python -m visualswebench.harness.run_evaluation \
|
||||
$PKG_RUN python -m visualswebench.harness.run_evaluation \
|
||||
--dataset_name "$DATASET_NAME" \
|
||||
--split "$SPLIT" \
|
||||
--predictions_path $SWEBENCH_FORMAT_JSONL \
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
@@ -84,7 +87,7 @@ fi
|
||||
|
||||
function run_eval() {
|
||||
local eval_note=$1
|
||||
COMMAND="poetry run python evaluation/benchmarks/visual_swe_bench/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/visual_swe_bench/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
|
||||
@@ -33,7 +33,7 @@ Results will be in `evaluation/evaluation_outputs/outputs/visualwebarena/`
|
||||
To calculate the success rate, run:
|
||||
|
||||
```sh
|
||||
poetry run python evaluation/benchmarks/visualwebarena/get_success_rate.py evaluation/evaluation_outputs/outputs/visualwebarena/SOME_AGENT/EXP_NAME/output.jsonl
|
||||
# If you are using UV, replace `poetry run` with `uv run`
poetry run python evaluation/benchmarks/visualwebarena/get_success_rate.py evaluation/evaluation_outputs/outputs/visualwebarena/SOME_AGENT/EXP_NAME/output.jsonl
|
||||
```
|
||||
|
||||
## Submit your evaluation results
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
# configure browsing agent
|
||||
export USE_NAV="true"
|
||||
export USE_CONCISE_ANSWER="true"
|
||||
@@ -32,7 +35,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE="${OPENHANDS_VERSION}"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/visualwebarena/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/visualwebarena/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 15 \
|
||||
|
||||
@@ -32,7 +32,7 @@ Results will be in `evaluation/evaluation_outputs/outputs/webarena/`
|
||||
To calculate the success rate, run:
|
||||
|
||||
```sh
|
||||
poetry run python evaluation/benchmarks/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
|
||||
# If you are using UV, replace `poetry run` with `uv run`
poetry run python evaluation/benchmarks/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
|
||||
```
|
||||
|
||||
## Submit your evaluation results
|
||||
|
||||
@@ -3,6 +3,9 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# Get package runner (poetry run or uv run based on USE_UV env var)
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
|
||||
# configure webarena websites and environment
|
||||
source evaluation/benchmarks/webarena/scripts/webarena_env.sh
|
||||
|
||||
@@ -35,7 +38,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE="$OPENHANDS_VERSION"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
|
||||
COMMAND="$PKG_RUN python evaluation/benchmarks/webarena/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 15 \
|
||||
|
||||
@@ -1,3 +1,13 @@
|
||||
# Package manager runner - uses UV if USE_UV=1, otherwise Poetry
|
||||
# This allows gradual migration from Poetry to UV
# Globals:   USE_UV (read) - only the exact value "1" selects UV; unset,
#            empty, or any other value falls back to Poetry
# Arguments: none
# Outputs:   prints "uv run" or "poetry run" to stdout; callers capture it
#            with command substitution, e.g. PKG_RUN=$(get_pkg_run)
|
||||
get_pkg_run() {
|
||||
if [ "${USE_UV:-0}" = "1" ]; then
|
||||
echo "uv run"
|
||||
else
|
||||
echo "poetry run"
|
||||
fi
|
||||
}
|
||||
|
||||
checkout_eval_branch() {
|
||||
if [ -z "$COMMIT_HASH" ]; then
|
||||
echo "Commit hash not specified, use current git commit"
|
||||
@@ -42,5 +52,6 @@ checkout_original_branch() {
|
||||
get_openhands_version() {
|
||||
# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
|
||||
# We need to track the version of Agent in the evaluation to make sure results are comparable
|
||||
OPENHANDS_VERSION=v$(poetry run python -c "from openhands import get_version; print(get_version())")
|
||||
PKG_RUN=$(get_pkg_run)
|
||||
OPENHANDS_VERSION=v$($PKG_RUN python -c "from openhands import get_version; print(get_version())")
|
||||
}
|
||||
|
||||
@@ -42,7 +42,7 @@ Backend:
|
||||
- Located in the `openhands` directory
|
||||
- Testing:
|
||||
- All tests are in `tests/unit/test_*.py`
|
||||
- To test new code, run `poetry run pytest tests/unit/test_xxx.py` where `xxx` is the appropriate file for the current functionality
|
||||
- To test new code, run `poetry run pytest tests/unit/test_xxx.py` (or `uv run pytest tests/unit/test_xxx.py` if using UV) where `xxx` is the appropriate file for the current functionality
|
||||
- Write all tests with pytest
|
||||
|
||||
Frontend:
|
||||
|
||||
Reference in New Issue
Block a user