Update documentation to mention UV as alternative to Poetry

This PR updates documentation to reflect that both Poetry and UV are supported for dependency management. Changes: - Development.md: - Add UV as alternative to Poetry in requirements - Add UV installation instructions - Update testing section with UV examples - Add UV dependency management instructions - Document USE_UV=1 environment variable - skills/add_repo_inst.md: - Update example to mention UV alternative for running tests - Evaluation benchmark READMEs: - Update 'poetry run' references to mention 'uv run' alternative This is a non-breaking change - Poetry remains the default. Closes #12421 (partial) Co-authored-by: openhands <openhands@all-hands.dev>
Update evaluation scripts to support both Poetry and UV
2026-04-29 03:00:45 -04:00 · 2026-01-14 23:52:14 +00:00 · 2026-01-14 23:50:52 +00:00
49 changed files with 216 additions and 69 deletions
--- a/Development.md
+++ b/Development.md
@@ -14,7 +14,7 @@ you can clone the OpenHands project directly.
 - [Docker](https://docs.docker.com/engine/install/) (For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
 - [Python](https://www.python.org/downloads/) = 3.12
 - [NodeJS](https://nodejs.org/en/download/package-manager) >= 22.x
- [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8
+- [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8 or [UV](https://docs.astral.sh/uv/getting-started/installation/) >= 0.4
 - OS-specific dependencies:
  - Ubuntu: build-essential => `sudo apt-get install build-essential python3.12-dev`
  - WSL: netcat => `sudo apt-get install netcat`
@@ -42,10 +42,12 @@ If you want to develop without system admin/sudo access to upgrade/install `Pyth
 curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
 bash Miniforge3-$(uname)-$(uname -m).sh

-# Install Python 3.12, nodejs, and poetry
+# Install Python 3.12, nodejs, and poetry (or uv)
 mamba install python=3.12
 mamba install conda-forge::nodejs
 mamba install conda-forge::poetry
+# Or install UV instead of Poetry:
+# curl -LsSf https://astral.sh/uv/install.sh | sh
 ```

 ### 2. Build and Setup The Environment
@@ -148,14 +150,25 @@ To run tests, refer to the following:
 #### Unit tests

 ```bash
+# Using Poetry (default)
 poetry run pytest ./tests/unit/test_*.py
+
+# Using UV
+uv run pytest ./tests/unit/test_*.py
 ```

 ### 9. Add or update dependency

+#### Using Poetry (default)
 1. Add your dependency in `pyproject.toml` or use `poetry add xxx`.
 2. Update the poetry.lock file via `poetry lock --no-update`.

+#### Using UV
+1. Add your dependency in `pyproject.toml` or use `uv add xxx`.
+2. Update the uv.lock file via `uv lock`.
+
+**Note:** The project supports both Poetry and UV for dependency management. To use UV instead of Poetry, set `USE_UV=1` when running make commands (e.g., `USE_UV=1 make build`).
+
 ### 10. Use existing Docker image

 To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker
--- a/evaluation/benchmarks/EDA/scripts/run_infer.sh
+++ b/evaluation/benchmarks/EDA/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -40,7 +43,7 @@ echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"

-COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/EDA/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --dataset $DATASET \
--- a/evaluation/benchmarks/agent_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/agent_bench/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -26,7 +29,7 @@ echo "AGENT: $AGENT"
 echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
+COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && $PKG_RUN python evaluation/benchmarks/agent_bench/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
--- a/evaluation/benchmarks/aider_bench/README.md
+++ b/evaluation/benchmarks/aider_bench/README.md
@@ -74,13 +74,13 @@ export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
 ## Summarize Results

 ```bash
-poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
+poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]  # with UV: use "uv run" in place of "poetry run"
 ```

 Full example:

 ```bash
-poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl
+poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl  # with UV: use "uv run" in place of "poetry run"
 ```

 This will list the instances that passed and the instances that failed. For each
--- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -39,7 +42,7 @@ if [ "$USE_UNIT_TESTS" = true ]; then
  EVAL_NOTE=$EVAL_NOTE-w-test
 fi

-COMMAND="export PYTHONPATH=evaluation/benchmarks/aider_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/aider_bench/run_infer.py \
+COMMAND="export PYTHONPATH=evaluation/benchmarks/aider_bench:\$PYTHONPATH && $PKG_RUN python evaluation/benchmarks/aider_bench/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
--- a/evaluation/benchmarks/algotune/README.md
+++ b/evaluation/benchmarks/algotune/README.md
@@ -31,9 +31,9 @@ Each task directory (`tasks/algotune-*`) contains:
 Use the main evaluation script:

 ```bash
-poetry run python evaluation/benchmarks/algotune/adapter/run_adapter.py --output-path evaluation/benchmarks/algotune/tasks
+poetry run python evaluation/benchmarks/algotune/adapter/run_adapter.py --output-path evaluation/benchmarks/algotune/tasks  # with UV: use "uv run" in place of "poetry run" (here and in the command below)

 poetry run python evaluation/benchmarks/algotune/run_infer.py \
  --agent-cls CodeActAgent \
  --llm-config llm.gpt-5 \
  --optim_task all \
--- a/evaluation/benchmarks/algotune/scripts/run_infer.sh
+++ b/evaluation/benchmarks/algotune/scripts/run_infer.sh
@@ -2,10 +2,13 @@
 set -eo pipefail

-# Generate the tasks
-poetry run python evaluation/benchmarks/algotune/adapter/run_adapter.py --output-path evaluation/benchmarks/algotune/tasks
-
 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var); must be resolved before its first use
+PKG_RUN=$(get_pkg_run)
+
+# Generate the tasks (runs after PKG_RUN is set; an unset PKG_RUN would expand to nothing and use the bare system python)
+$PKG_RUN python evaluation/benchmarks/algotune/adapter/run_adapter.py --output-path evaluation/benchmarks/algotune/tasks
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -59,7 +62,7 @@ fi
 echo "ENABLE_VOLUMES: $ENABLE_VOLUMES"

 # Construct the command
-COMMAND="poetry run python evaluation/benchmarks/algotune/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/algotune/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --optim_task $OPTIM_TASK \
--- a/evaluation/benchmarks/biocoder/scripts/run_infer.sh
+++ b/evaluation/benchmarks/biocoder/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -28,7 +31,7 @@ echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"

-COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/biocoder/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
--- a/evaluation/benchmarks/bird/scripts/run_infer.sh
+++ b/evaluation/benchmarks/bird/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -26,7 +29,7 @@ echo "AGENT: $AGENT"
 echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/bird/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 5 \
--- a/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh
+++ b/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -28,7 +31,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"

 EVAL_NOTE="$OPENHANDS_VERSION"

-COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/browsing_delegation/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 1 \
--- a/evaluation/benchmarks/commit0/scripts/run_infer.sh
+++ b/evaluation/benchmarks/commit0/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 REPO_SPLIT=$1
 MODEL_CONFIG=$2
 COMMIT_HASH=$3
@@ -84,7 +87,7 @@ fi

 function run_eval() {
  local eval_note=$1
-  COMMAND="poetry run python evaluation/benchmarks/commit0/run_infer.py \
+  COMMAND="$PKG_RUN python evaluation/benchmarks/commit0/run_infer.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --max-iterations $MAX_ITER \
--- a/evaluation/benchmarks/discoverybench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/discoverybench/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -29,7 +32,7 @@ echo "AGENT: $AGENT"
 echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/discoverybench/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
--- a/evaluation/benchmarks/gaia/scripts/run_infer.sh
+++ b/evaluation/benchmarks/gaia/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -36,7 +39,7 @@ echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "LEVELS: $LEVELS"

-COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
+COMMAND="$PKG_RUN python ./evaluation/benchmarks/gaia/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 60 \
--- a/evaluation/benchmarks/gorilla/scripts/run_infer.sh
+++ b/evaluation/benchmarks/gorilla/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -33,7 +36,7 @@ echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "HUBS: $HUBS"

-COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/gorilla/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
--- a/evaluation/benchmarks/gpqa/scripts/run_infer.sh
+++ b/evaluation/benchmarks/gpqa/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 EVAL_LIMIT=$3
@@ -33,7 +36,7 @@ echo "AGENT: $AGENT"
 echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/gpqa/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
--- a/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh
+++ b/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -64,7 +67,7 @@ echo "AGENT: $AGENT"
 echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/humanevalfix/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
--- a/evaluation/benchmarks/lca_ci_build_repair/scripts/eval_infer.sh
+++ b/evaluation/benchmarks/lca_ci_build_repair/scripts/eval_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 PROCESS_FILEPATH=$1
 if [ -z "$PROCESS_FILEPATH" ]; then
    echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]"
@@ -21,7 +24,7 @@ if [ -n "$EXP_NAME" ]; then
 fi

 function run_eval() {
-  COMMAND="poetry run python ./evaluation/benchmarks/lca_ci_build_repair/eval_infer.py \
+  COMMAND="$PKG_RUN python ./evaluation/benchmarks/lca_ci_build_repair/eval_infer.py \
    --predictions-path $PROCESS_FILEPATH "

  echo "RUNNING: $COMMAND"
--- a/evaluation/benchmarks/lca_ci_build_repair/scripts/run_infer.sh
+++ b/evaluation/benchmarks/lca_ci_build_repair/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1

 get_openhands_version
@@ -16,7 +19,7 @@ if [ -n "$EXP_NAME" ]; then
 fi

 function run_eval() {
-  COMMAND="poetry run python ./evaluation/benchmarks/lca_ci_build_repair/run_infer.py \
+  COMMAND="$PKG_RUN python ./evaluation/benchmarks/lca_ci_build_repair/run_infer.py \
    --llm-config $MODEL_CONFIG "

  # Run the command
--- a/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh
+++ b/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 DATASET=$2
 COMMIT_HASH=$3
@@ -34,7 +37,7 @@ echo "AGENT: $AGENT"
 echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/logic_reasoning/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --dataset $DATASET \
--- a/evaluation/benchmarks/miniwob/README.md
+++ b/evaluation/benchmarks/miniwob/README.md
@@ -39,7 +39,7 @@ Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`
 To calculate the average reward, run:

 ```sh
-poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl
+poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl  # with UV: use "uv run" in place of "poetry run"
 ```

 ## Submit your evaluation results
--- a/evaluation/benchmarks/miniwob/scripts/run_infer.sh
+++ b/evaluation/benchmarks/miniwob/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 # configure browsing agent
 export USE_NAV="false"
 export USE_CONCISE_ANSWER="true"
@@ -33,7 +36,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"

 EVAL_NOTE="${OPENHANDS_VERSION}_${NOTE}"

-COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \
+COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && $PKG_RUN python evaluation/benchmarks/miniwob/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
--- a/evaluation/benchmarks/mint/scripts/run_infer.sh
+++ b/evaluation/benchmarks/mint/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 SUBSET=$3
@@ -25,7 +28,7 @@ echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"

 export PYTHONPATH=$(pwd)

-COMMAND="poetry run python ./evaluation/mint/run_infer.py \
+COMMAND="$PKG_RUN python ./evaluation/mint/run_infer.py \
    --llm-config $MODEL_CONFIG \
    --max-iterations 5 \
    --max-propose-solution 2 \
--- a/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh
+++ b/evaluation/benchmarks/ml_bench/scripts/run_analysis.sh
@@ -1,5 +1,8 @@
 #!/usr/bin/env bash

+# Package manager runner: a preset PKG_RUN wins; otherwise "uv run" if USE_UV=1, else "poetry run"
+if [ "${USE_UV:-}" = "1" ]; then DEFAULT_PKG_RUN="uv run"; else DEFAULT_PKG_RUN="poetry run"; fi
+PKG_RUN=${PKG_RUN:-$DEFAULT_PKG_RUN}
 RESULT_FILE=$1
 MODEL_CONFIG=$2

@@ -17,7 +20,7 @@ fi
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "RESULT_FILE: $RESULT_FILE"

-COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_analysis.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/ml_bench/run_analysis.py \
  --llm-config $MODEL_CONFIG \
  --json_file_path $RESULT_FILE"

--- a/evaluation/benchmarks/ml_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/ml_bench/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 SPLIT=$3
@@ -32,7 +35,7 @@ echo "AGENT: $AGENT"
 echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/ml_bench/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
--- a/evaluation/benchmarks/multi_swe_bench/scripts/rollout_multi_swegym.sh
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/rollout_multi_swegym.sh
@@ -29,7 +29,10 @@ DATASET="${EVAL_DATASET%.jsonl}_with_runtime_.jsonl"  # path to converted datase

 # Create the converted dataset file
 echo "Creating converted dataset at: $DATASET"
-poetry run python ./evaluation/benchmarks/multi_swe_bench/scripts/data/data_change.py --input "$EVAL_DATASET" --output "$DATASET"
+# Resolve the package runner here: this is its first use, ahead of the "Run inference" section that also sets it.
+source "evaluation/utils/version_control.sh"
+PKG_RUN=$(get_pkg_run)
+$PKG_RUN python ./evaluation/benchmarks/multi_swe_bench/scripts/data/data_change.py --input "$EVAL_DATASET" --output "$DATASET"

 SPLIT="train"
 export LANGUAGE=java
@@ -45,6 +45,9 @@ fi

 # ===== Run inference =====
 source "evaluation/utils/version_control.sh"
+
+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
 get_openhands_version

 echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
@@ -64,7 +67,7 @@ function run_eval() {
  export LANGUAGE=java
  echo "About to run command"
  COMMAND="EVAL_DOCKER_IMAGE_PREFIX=$EVAL_DOCKER_IMAGE_PREFIX; LANGUAGE=java;
-    poetry run python evaluation/benchmarks/multi_swe_bench/run_infer.py \
+    $PKG_RUN python evaluation/benchmarks/multi_swe_bench/run_infer.py \
    --agent-cls CodeActAgent \
    --llm-config $MODEL \
    --max-iterations $MAX_ITER \
@@ -90,7 +93,7 @@ function run_eval() {
 for run_idx in $(seq 1 $N_RUNS); do
    if [ -n "$SKIP_IDS_THRESHOLD" ]; then
        echo "Computing SKIP_IDS for run $run_idx..."
-        SKIP_CMD="poetry run python evaluation/benchmarks/multi_swe_bench/compute_skip_ids.py $SKIP_IDS_THRESHOLD"
+        SKIP_CMD="$PKG_RUN python evaluation/benchmarks/multi_swe_bench/compute_skip_ids.py $SKIP_IDS_THRESHOLD"
        if [ -n "$SKIP_IDS_PATTERN" ]; then
            SKIP_CMD="$SKIP_CMD --pattern \"$SKIP_IDS_PATTERN\""
        fi
@@ -150,8 +153,8 @@ for run_idx in $(seq 1 $N_RUNS); do
        echo "### Evaluating on $OUTPUT_FILE ... ###"
        OUTPUT_CONFIG_FILE="${OUTPUT_FILE%.jsonl}_config.json"
        export EVAL_SKIP_BUILD_ERRORS=true
-        COMMAND="poetry run python ./evaluation/benchmarks/multi_swe_bench/scripts/eval/update_multi_swe_bench_config.py --input $OUTPUT_FILE --output $OUTPUT_CONFIG_FILE --dataset $EVAL_DATASET;
-        poetry run python -m multi_swe_bench.harness.run_evaluation --config $OUTPUT_CONFIG_FILE
+        COMMAND="$PKG_RUN python ./evaluation/benchmarks/multi_swe_bench/scripts/eval/update_multi_swe_bench_config.py --input $OUTPUT_FILE --output $OUTPUT_CONFIG_FILE --dataset $EVAL_DATASET;
+        $PKG_RUN python -m multi_swe_bench.harness.run_evaluation --config $OUTPUT_CONFIG_FILE
        "

        echo "Running command: $COMMAND"
@@ -170,10 +173,10 @@ for run_idx in $(seq 1 $N_RUNS); do

    # update the output with evaluation results
    echo "### Updating the output with evaluation results... ###"
-    poetry run python evaluation/benchmarks/multi_swe_bench/scripts/eval/update_output_with_eval.py $OUTPUT_FILE
+    $PKG_RUN python evaluation/benchmarks/multi_swe_bench/scripts/eval/update_output_with_eval.py $OUTPUT_FILE

    echo "### Combining the final completions... ###"
-    poetry run python evaluation/benchmarks/multi_swe_bench/scripts/eval/combine_final_completions.py $OUTPUT_FILE
+    $PKG_RUN python evaluation/benchmarks/multi_swe_bench/scripts/eval/combine_final_completions.py $OUTPUT_FILE

    echo "### DONE for run $run_idx! ###"
    echo "You can find the final output at $(dirname $OUTPUT_FILE)/$FINAL_OUTPUT_FILE"
--- a/evaluation/benchmarks/multi_swe_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/multi_swe_bench/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -115,7 +118,7 @@ fi

 function run_eval() {
  local eval_note=$1
-  COMMAND="poetry run python evaluation/benchmarks/multi_swe_bench/run_infer.py \
+  COMMAND="$PKG_RUN python evaluation/benchmarks/multi_swe_bench/run_infer.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --max-iterations $MAX_ITER \
--- a/evaluation/benchmarks/nocode_bench/scripts/run_infer_nc.sh
+++ b/evaluation/benchmarks/nocode_bench/scripts/run_infer_nc.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -103,7 +106,7 @@ fi

 function run_eval() {
  local eval_note="${1}"
-  COMMAND="poetry run python evaluation/benchmarks/nocode_bench/run_infer_nc.py \
+  COMMAND="$PKG_RUN python evaluation/benchmarks/nocode_bench/run_infer_nc.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --max-iterations $MAX_ITER \
--- a/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 USE_KNOWLEDGE=$3
@@ -32,7 +35,7 @@ echo "AGENT: $AGENT"
 echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/scienceagentbench/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --use-knowledge $USE_KNOWLEDGE \
--- a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_folder_to_swebench_submission.sh
@@ -1,11 +1,14 @@
 #!/usr/bin/env bash

+# Package manager runner: a preset PKG_RUN wins; otherwise "uv run" if USE_UV=1, else "poetry run"
+if [ "${USE_UV:-}" = "1" ]; then DEFAULT_PKG_RUN="uv run"; else DEFAULT_PKG_RUN="poetry run"; fi
+PKG_RUN=${PKG_RUN:-$DEFAULT_PKG_RUN}
 FOLDER_PATH=$1
 NEW_FOLDER_PATH=${FOLDER_PATH}.swebench_submission
 mkdir -p $NEW_FOLDER_PATH

 # Build all_preds.jsonl
-poetry run python evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
+$PKG_RUN python evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
 mv $FOLDER_PATH/output.swebench.jsonl $NEW_FOLDER_PATH/all_preds.jsonl

 # Build trajs/
--- a/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
@@ -1,5 +1,8 @@
 #!/usr/bin/env bash

+# Package manager runner: a preset PKG_RUN wins; otherwise "uv run" if USE_UV=1, else "poetry run"
+if [ "${USE_UV:-}" = "1" ]; then DEFAULT_PKG_RUN="uv run"; else DEFAULT_PKG_RUN="poetry run"; fi
+PKG_RUN=${PKG_RUN:-$DEFAULT_PKG_RUN}
 PROCESS_FILEPATH=$1
 if [ -z "$PROCESS_FILEPATH" ]; then
    echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]"
@@ -66,7 +69,7 @@ else

    # ==== Convert OH format to SWE-bench format ====
    echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
-    poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
+    $PKG_RUN python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
    # replace .jsonl with .swebench.jsonl in filename
    SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
    echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
@@ -97,7 +100,7 @@ if [ -z "$INSTANCE_ID" ]; then
    # Default to SWE-Bench-lite
    # change `--dataset_name` and `--split` to alter dataset

-    poetry run python -m swebench.harness.run_evaluation \
+    $PKG_RUN python -m swebench.harness.run_evaluation \
        --dataset_name "$DATASET_NAME" \
        --split "$SPLIT" \
        --predictions_path $SWEBENCH_FORMAT_JSONL \
@@ -140,11 +143,11 @@ if [ -z "$INSTANCE_ID" ]; then
        mv $REPORT_PATH $RESULT_OUTPUT_DIR/report.json
    fi

-    poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH
+    $PKG_RUN python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH

 else
    echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
-    poetry run python -m swebench.harness.run_evaluation \
+    $PKG_RUN python -m swebench.harness.run_evaluation \
        --dataset_name "$DATASET_NAME" \
        --split "$SPLIT" \
        --predictions_path $SWEBENCH_FORMAT_JSONL \
--- a/evaluation/benchmarks/swe_bench/scripts/rollout_swegym.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/rollout_swegym.sh
@@ -35,6 +35,9 @@ MAX_ITER=100

 # ===== Run inference =====
 source "evaluation/utils/version_control.sh"
+
+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
 get_openhands_version

 echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
@@ -51,7 +54,7 @@ EVAL_NOTE="$OPENHANDS_VERSION-no-hint-$EXP_NAME"

 function run_eval() {
  local eval_note=$1
-  COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_infer.py \
+  COMMAND="$PKG_RUN python evaluation/benchmarks/swe_bench/run_infer.py \
    --agent-cls CodeActAgent \
    --llm-config $MODEL \
    --max-iterations $MAX_ITER \
@@ -97,7 +100,7 @@ for run_idx in $(seq 1 $N_RUNS); do

    while true; do
        echo "### Evaluating on $OUTPUT_FILE ... ###"
-        COMMAND="poetry run python evaluation/benchmarks/swe_bench/eval_infer.py \
+        COMMAND="$PKG_RUN python evaluation/benchmarks/swe_bench/eval_infer.py \
        --eval-num-workers $((N_WORKERS * 2)) \
        --input-file $OUTPUT_FILE \
        --dataset $DATASET \
@@ -123,10 +126,10 @@ for run_idx in $(seq 1 $N_RUNS); do

    # update the output with evaluation results
    echo "### Updating the output with evaluation results... ###"
-    poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $OUTPUT_FILE
+    $PKG_RUN python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $OUTPUT_FILE

    echo "### Combining the final completions... ###"
-    poetry run python evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py $OUTPUT_FILE
+    $PKG_RUN python evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py $OUTPUT_FILE

    echo "### DONE for run $run_idx! ###"
    echo "You can find the final output at $(dirname $OUTPUT_FILE)/$FINAL_OUTPUT_FILE"
--- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -103,7 +106,7 @@ fi

 function run_eval() {
  local eval_note="${1}"
-  COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_infer.py \
+  COMMAND="$PKG_RUN python evaluation/benchmarks/swe_bench/run_infer.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --max-iterations $MAX_ITER \
--- a/evaluation/benchmarks/swe_bench/scripts/run_infer_interact.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/run_infer_interact.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -91,7 +94,7 @@ fi

 function run_eval() {
  local eval_note="${1}"
-  COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_infer_interact.py \
+  COMMAND="$PKG_RUN python evaluation/benchmarks/swe_bench/run_infer_interact.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --max-iterations $MAX_ITER \
--- a/evaluation/benchmarks/swe_bench/scripts/run_localize.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/run_localize.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -77,7 +80,7 @@ fi

 function run_eval() {
  local eval_note=$1
-  COMMAND="poetry run python evaluation/benchmarks/swe_bench/run_localize.py \
+  COMMAND="$PKG_RUN python evaluation/benchmarks/swe_bench/run_localize.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --max-iterations $MAX_ITER \
--- a/evaluation/benchmarks/swe_perf/scripts/run_infer.sh
+++ b/evaluation/benchmarks/swe_perf/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -103,7 +106,7 @@ fi

 function run_eval() {
  local eval_note="${1}"
-  COMMAND="poetry run python evaluation/benchmarks/swe_perf/run_infer.py \
+  COMMAND="$PKG_RUN python evaluation/benchmarks/swe_perf/run_infer.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --max-iterations $MAX_ITER \
--- a/evaluation/benchmarks/swefficiency/scripts/run_infer.sh
+++ b/evaluation/benchmarks/swefficiency/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -107,7 +110,7 @@ export NO_CHANGE_TIMEOUT_SECONDS=900 # 15 minutes

 function run_eval() {
  local eval_note="${1}"
-  COMMAND="poetry run python evaluation/benchmarks/swefficiency/run_infer.py \
+  COMMAND="$PKG_RUN python evaluation/benchmarks/swefficiency/run_infer.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --max-iterations $MAX_ITER \
--- a/evaluation/benchmarks/testgeneval/scripts/eval/convert_oh_folder_to_swebench_submission.sh
+++ b/evaluation/benchmarks/testgeneval/scripts/eval/convert_oh_folder_to_swebench_submission.sh
@@ -1,11 +1,14 @@
 #!/bin/bash

+
+# Package manager runner: a preset PKG_RUN wins; else "uv run" if USE_UV=1, otherwise "poetry run"
+PKG_RUN=${PKG_RUN:-$([ "${USE_UV:-0}" = "1" ] && echo "uv run" || echo "poetry run")}
 FOLDER_PATH=$1
 NEW_FOLDER_PATH=${FOLDER_PATH}.swebench_submission
 mkdir -p $NEW_FOLDER_PATH

 # Build all_preds.jsonl
-poetry run python evaluation/testgeneval/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
+$PKG_RUN python evaluation/testgeneval/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
 mv $FOLDER_PATH/output.swebench.jsonl $NEW_FOLDER_PATH/all_preds.jsonl

 # Build trajs/
--- a/evaluation/benchmarks/testgeneval/scripts/eval_infer.sh
+++ b/evaluation/benchmarks/testgeneval/scripts/eval_infer.sh
@@ -1,6 +1,9 @@
 #!/bin/bash
 set -eo pipefail

+# Package manager runner: a preset PKG_RUN wins; else "uv run" if USE_UV=1, otherwise "poetry run"
+PKG_RUN=${PKG_RUN:-$([ "${USE_UV:-0}" = "1" ] && echo "uv run" || echo "poetry run")}
+
 INPUT_FILE=$1
 NUM_WORKERS=$2
 DATASET=$3
@@ -29,7 +32,7 @@ fi

 echo "... Evaluating on $INPUT_FILE ..."

-COMMAND="poetry run python evaluation/benchmarks/testgeneval/eval_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/testgeneval/eval_infer.py \
  --eval-num-workers $NUM_WORKERS \
  --input-file $INPUT_FILE \
  --dataset $DATASET \
@@ -50,4 +53,4 @@ echo $COMMAND
 eval $COMMAND

 # update the output with evaluation results
-# poetry run python evaluation/benchmarks/testgeneval/scripts/eval/update_output_with_eval.py $INPUT_FILE
+# $PKG_RUN python evaluation/benchmarks/testgeneval/scripts/eval/update_output_with_eval.py $INPUT_FILE
--- a/evaluation/benchmarks/testgeneval/scripts/run_infer.sh
+++ b/evaluation/benchmarks/testgeneval/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -85,7 +88,7 @@ fi

 function run_eval() {
  local eval_note=$1
-  COMMAND="poetry run python evaluation/benchmarks/testgeneval/run_infer.py \
+  COMMAND="$PKG_RUN python evaluation/benchmarks/testgeneval/run_infer.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --max-iterations $MAX_ITER \
--- a/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh
+++ b/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh
@@ -1,5 +1,8 @@
 #!/usr/bin/env bash

+
+# Package manager runner: a preset PKG_RUN wins; else "uv run" if USE_UV=1, otherwise "poetry run"
+PKG_RUN=${PKG_RUN:-$([ "${USE_UV:-0}" = "1" ] && echo "uv run" || echo "poetry run")}
 ##################################################################################################
 # Adapted from https://github.com/TheAgentCompany/TheAgentCompany/blob/main/evaluation/run_eval.sh
 ##################################################################################################
@@ -145,7 +148,7 @@ while IFS= read -r task_image; do
    docker pull $task_image

    # Build the Python command
-    COMMAND="poetry run python -m evaluation.benchmarks.the_agent_company.run_infer \
+    COMMAND="$PKG_RUN python -m evaluation.benchmarks.the_agent_company.run_infer \
            --agent-llm-config \"$AGENT_LLM_CONFIG\" \
            --env-llm-config \"$ENV_LLM_CONFIG\" \
            --outputs-path \"$OUTPUTS_PATH\" \
--- a/evaluation/benchmarks/toolqa/scripts/run_infer.sh
+++ b/evaluation/benchmarks/toolqa/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -47,7 +50,7 @@ echo "DATASET: $DATASET"
 echo "HARDNESS: $HARDNESS"
 echo "WOLFRAM_APPID: $WOLFRAM_APPID"

-COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/toolqa/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
--- a/evaluation/benchmarks/visual_swe_bench/scripts/eval_infer.sh
+++ b/evaluation/benchmarks/visual_swe_bench/scripts/eval_infer.sh
@@ -1,5 +1,8 @@
 #!/bin/bash

+
+# Package manager runner: a preset PKG_RUN wins; else "uv run" if USE_UV=1, otherwise "poetry run"
+PKG_RUN=${PKG_RUN:-$([ "${USE_UV:-0}" = "1" ] && echo "uv run" || echo "poetry run")}
 PROCESS_FILEPATH=$1
 if [ -z "$PROCESS_FILEPATH" ]; then
    echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]"
@@ -58,7 +61,7 @@ else

    # ==== Convert OH format to SWE-bench format ====
    echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
-    poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
+    $PKG_RUN python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
    # replace .jsonl with .swebench.jsonl in filename
    SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
    echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
@@ -83,7 +86,7 @@ if [ -z "$INSTANCE_ID" ]; then
    # Default to SWE-Bench-lite
    # change `--dataset_name` and `--split` to alter dataset

-    poetry run python -m visualswebench.harness.run_evaluation \
+    $PKG_RUN python -m visualswebench.harness.run_evaluation \
        --dataset_name "$DATASET_NAME" \
        --split "$SPLIT" \
        --predictions_path $SWEBENCH_FORMAT_JSONL \
@@ -125,11 +128,11 @@ if [ -z "$INSTANCE_ID" ]; then
        mv $REPORT_PATH $RESULT_OUTPUT_DIR/report.json
    fi

-    poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH
+    $PKG_RUN python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH

 else
    echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
-    poetry run python -m visualswebench.harness.run_evaluation \
+    $PKG_RUN python -m visualswebench.harness.run_evaluation \
        --dataset_name "$DATASET_NAME" \
        --split "$SPLIT" \
        --predictions_path $SWEBENCH_FORMAT_JSONL \
--- a/evaluation/benchmarks/visual_swe_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/visual_swe_bench/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT=$3
@@ -84,7 +87,7 @@ fi

 function run_eval() {
  local eval_note=$1
-  COMMAND="poetry run python evaluation/benchmarks/visual_swe_bench/run_infer.py \
+  COMMAND="$PKG_RUN python evaluation/benchmarks/visual_swe_bench/run_infer.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --max-iterations $MAX_ITER \
--- a/evaluation/benchmarks/visualwebarena/README.md
+++ b/evaluation/benchmarks/visualwebarena/README.md
@@ -33,7 +33,7 @@ Results will be in `evaluation/evaluation_outputs/outputs/visualwebarena/`
 To calculate the success rate, run:

 ```sh
-poetry run python evaluation/benchmarks/visualwebarena/get_success_rate.py evaluation/evaluation_outputs/outputs/visualwebarena/SOME_AGENT/EXP_NAME/output.jsonl
+poetry run python evaluation/benchmarks/visualwebarena/get_success_rate.py evaluation/evaluation_outputs/outputs/visualwebarena/SOME_AGENT/EXP_NAME/output.jsonl  # with UV, use "uv run" in place of "poetry run"
 ```

 ## Submit your evaluation results
--- a/evaluation/benchmarks/visualwebarena/scripts/run_infer.sh
+++ b/evaluation/benchmarks/visualwebarena/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 # configure browsing agent
 export USE_NAV="true"
 export USE_CONCISE_ANSWER="true"
@@ -32,7 +35,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"

 EVAL_NOTE="${OPENHANDS_VERSION}"

-COMMAND="poetry run python evaluation/benchmarks/visualwebarena/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/visualwebarena/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 15 \
--- a/evaluation/benchmarks/webarena/README.md
+++ b/evaluation/benchmarks/webarena/README.md
@@ -32,7 +32,7 @@ Results will be in `evaluation/evaluation_outputs/outputs/webarena/`
 To calculate the success rate, run:

 ```sh
-poetry run python evaluation/benchmarks/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
+poetry run python evaluation/benchmarks/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl  # with UV, use "uv run" in place of "poetry run"
 ```

 ## Submit your evaluation results
--- a/evaluation/benchmarks/webarena/scripts/run_infer.sh
+++ b/evaluation/benchmarks/webarena/scripts/run_infer.sh
@@ -3,6 +3,9 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

+# Get package runner (poetry run or uv run based on USE_UV env var)
+PKG_RUN=$(get_pkg_run)
+
 # configure webarena websites and environment
 source evaluation/benchmarks/webarena/scripts/webarena_env.sh

@@ -35,7 +38,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"

 EVAL_NOTE="$OPENHANDS_VERSION"

-COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
+COMMAND="$PKG_RUN python evaluation/benchmarks/webarena/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 15 \
--- a/evaluation/utils/version_control.sh
+++ b/evaluation/utils/version_control.sh
@@ -1,3 +1,13 @@
+# Print the command prefix used to run Python inside the project environment.
+# Outputs "uv run" when USE_UV is exactly "1"; any other value (or unset) yields
+# "poetry run", so Poetry stays the default during the gradual migration to UV.
+get_pkg_run() {
+    case "${USE_UV:-0}" in
+        1) echo "uv run" ;;
+        *) echo "poetry run" ;;
+    esac
+}
+
 checkout_eval_branch() {
    if [ -z "$COMMIT_HASH" ]; then
        echo "Commit hash not specified, use current git commit"
@@ -42,5 +52,6 @@ checkout_original_branch() {
 get_openhands_version() {
    # IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
    # We need to track the version of Agent in the evaluation to make sure results are comparable
-    OPENHANDS_VERSION=v$(poetry run python -c "from openhands import get_version; print(get_version())")
+    PKG_RUN=$(get_pkg_run)
+    OPENHANDS_VERSION=v$($PKG_RUN python -c "from openhands import get_version; print(get_version())")
 }
--- a/skills/add_repo_inst.md
+++ b/skills/add_repo_inst.md
@@ -42,7 +42,7 @@ Backend:
 - Located in the `openhands` directory
 - Testing:
  - All tests are in `tests/unit/test_*.py`
-  - To test new code, run `poetry run pytest tests/unit/test_xxx.py` where `xxx` is the appropriate file for the current functionality
+  - To test new code, run `poetry run pytest tests/unit/test_xxx.py` (or `uv run pytest tests/unit/test_xxx.py` if using UV) where `xxx` is the appropriate file for the current functionality
  - Write all tests with pytest

 Frontend: