Compare commits

..

7 Commits

Author SHA1 Message Date
rohitvinodmalhotra@gmail.com
532a284d5c migrate to use OH version 2025-01-26 15:24:35 -05:00
rohitvinodmalhotra@gmail.com
43f6104967 Merge branch 'main' into eval/visualcodebench 2025-01-26 15:14:28 -05:00
openhands
e249b920ff feat: adapt Design2Code block detection for in-memory evaluation 2024-11-30 19:28:22 +00:00
rohitvinodmalhotra@gmail.com
d920a69f69 adding back server code 2024-11-30 14:00:25 -05:00
openhands
a8ce888981 refactor: adapt Design2Code evaluation metrics 2024-11-30 17:17:05 +00:00
rohitvinodmalhotra@gmail.com
e22ddc0dd6 uncomment agent run 2024-11-26 17:00:07 -05:00
rohitvinodmalhotra@gmail.com
c370912f12 adding eval scripts 2024-11-26 16:57:19 -05:00
464 changed files with 13684 additions and 28738 deletions

View File

@@ -30,7 +30,6 @@ body:
description: How are you running OpenHands?
options:
- Docker command in README
- GitHub resolver
- Development workflow
- app.all-hands.dev
- Other

View File

@@ -19,6 +19,20 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: false
swap-storage: true
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3

View File

@@ -41,8 +41,22 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: false
swap-storage: true
- name: Set up QEMU
uses: docker/setup-qemu-action@v3.4.0
uses: docker/setup-qemu-action@v3.3.0
with:
image: tonistiigi/binfmt:latest
- name: Login to GHCR
@@ -90,8 +104,22 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: false
swap-storage: true
- name: Set up QEMU
uses: docker/setup-qemu-action@v3.4.0
uses: docker/setup-qemu-action@v3.3.0
with:
image: tonistiigi/binfmt:latest
- name: Login to GHCR
@@ -191,7 +219,7 @@ jobs:
exit 1
fi
# Run unit tests with the Docker runtime Docker images as root
# Run unit tests with the EventStream runtime Docker images as root
test_runtime_root:
name: RT Unit Tests (Root)
needs: [ghcr_build_runtime]
@@ -202,69 +230,20 @@ jobs:
base_image: ['nikolaik']
steps:
- uses: actions/checkout@v4
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
# Forked repos can't push to GHCR, so we need to download the image as an artifact
- name: Download runtime image for fork
if: github.event.pull_request.head.repo.fork
uses: actions/download-artifact@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
name: runtime-${{ matrix.base_image }}
path: /tmp
- name: Load runtime image for fork
if: github.event.pull_request.head.repo.fork
run: |
docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
- name: Cache Poetry dependencies
uses: actions/cache@v4
with:
path: |
~/.cache/pypoetry
~/.virtualenvs
key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: |
${{ runner.os }}-poetry-
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install poetry via pipx
run: pipx install poetry
- name: Install Python dependencies using Poetry
run: make install-python-dependencies
- name: Run docker runtime tests
run: |
# We install pytest-xdist in order to run tests across CPUs
poetry run pip install pytest-xdist
# Install to be able to retry on failures for flaky tests
poetry run pip install pytest-rerunfailures
image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
TEST_RUNTIME=docker \
SANDBOX_USER_ID=$(id -u) \
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \
RUN_AS_OPENHANDS=false \
poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# Run unit tests with the Docker runtime Docker images as openhands user
test_runtime_oh:
name: RT Unit Tests (openhands)
runs-on: ubuntu-latest
needs: [ghcr_build_runtime]
strategy:
matrix:
base_image: ['nikolaik']
steps:
- uses: actions/checkout@v4
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: false
swap-storage: true
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
@@ -307,7 +286,84 @@ jobs:
image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
TEST_RUNTIME=docker \
TEST_RUNTIME=eventstream \
SANDBOX_USER_ID=$(id -u) \
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \
RUN_AS_OPENHANDS=false \
poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# Run unit tests with the EventStream runtime Docker images as openhands user
test_runtime_oh:
name: RT Unit Tests (openhands)
runs-on: ubuntu-latest
needs: [ghcr_build_runtime]
strategy:
matrix:
base_image: ['nikolaik']
steps:
- uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: false
swap-storage: true
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
# Forked repos can't push to GHCR, so we need to download the image as an artifact
- name: Download runtime image for fork
if: github.event.pull_request.head.repo.fork
uses: actions/download-artifact@v4
with:
name: runtime-${{ matrix.base_image }}
path: /tmp
- name: Load runtime image for fork
if: github.event.pull_request.head.repo.fork
run: |
docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
- name: Cache Poetry dependencies
uses: actions/cache@v4
with:
path: |
~/.cache/pypoetry
~/.virtualenvs
key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: |
${{ runner.os }}-poetry-
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install poetry via pipx
run: pipx install poetry
- name: Install Python dependencies using Poetry
run: make install-python-dependencies
- name: Run runtime tests
run: |
# We install pytest-xdist in order to run tests across CPUs
poetry run pip install pytest-xdist
# Install to be able to retry on failures for flaky tests
poetry run pip install pytest-rerunfailures
image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
TEST_RUNTIME=eventstream \
SANDBOX_USER_ID=$(id -u) \
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \

View File

@@ -20,10 +20,6 @@ on:
required: false
type: string
default: "anthropic/claude-3-5-sonnet-20241022"
LLM_API_VERSION:
required: false
type: string
default: ""
base_container_image:
required: false
type: string
@@ -88,10 +84,12 @@ jobs:
run: |
python -m pip index versions openhands-ai > openhands_versions.txt
OPENHANDS_VERSION=$(head -n 1 openhands_versions.txt | awk '{print $2}' | tr -d '()')
# Create a new requirements.txt locally within the workflow, ensuring no reference to the repo's file
echo "openhands-ai==${OPENHANDS_VERSION}" > /tmp/requirements.txt
cat /tmp/requirements.txt
# Ensure requirements.txt ends with newline before appending
if [ -f requirements.txt ] && [ -s requirements.txt ]; then
sed -i -e '$a\' requirements.txt
fi
echo "openhands-ai==${OPENHANDS_VERSION}" >> requirements.txt
cat requirements.txt
- name: Cache pip dependencies
if: |
@@ -109,16 +107,15 @@ jobs:
uses: actions/cache@v4
with:
path: ${{ env.pythonLocation }}/lib/python3.12/site-packages/*
key: ${{ runner.os }}-pip-openhands-resolver-${{ hashFiles('/tmp/requirements.txt') }}
key: ${{ runner.os }}-pip-openhands-resolver-${{ hashFiles('requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-openhands-resolver-${{ hashFiles('/tmp/requirements.txt') }}
${{ runner.os }}-pip-openhands-resolver-${{ hashFiles('requirements.txt') }}
- name: Check required environment variables
env:
LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
LLM_API_VERSION: ${{ inputs.LLM_API_VERSION }}
PAT_TOKEN: ${{ secrets.PAT_TOKEN }}
PAT_USERNAME: ${{ secrets.PAT_USERNAME }}
GITHUB_TOKEN: ${{ github.token }}
@@ -175,7 +172,7 @@ jobs:
echo "SANDBOX_ENV_BASE_CONTAINER_IMAGE=${{ inputs.base_container_image }}" >> $GITHUB_ENV
# Set branch variables
echo "TARGET_BRANCH=${{ inputs.target_branch || 'main' }}" >> $GITHUB_ENV
echo "TARGET_BRANCH=${{ inputs.target_branch }}" >> $GITHUB_ENV
- name: Comment on issue with start message
uses: actions/github-script@v7
@@ -223,18 +220,16 @@ jobs:
} else {
console.log("Installing from requirements.txt...");
await exec.exec("python -m pip install --upgrade pip");
await exec.exec("pip install -r /tmp/requirements.txt");
await exec.exec("pip install -r requirements.txt");
}
- name: Attempt to resolve issue
env:
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN || github.token }}
GITHUB_USERNAME: ${{ secrets.PAT_USERNAME || 'openhands-agent' }}
GIT_USERNAME: ${{ secrets.PAT_USERNAME || 'openhands-agent' }}
LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
LLM_API_VERSION: ${{ inputs.LLM_API_VERSION }}
PYTHONPATH: ""
run: |
cd /tmp && python -m openhands.resolver.resolve_issue \
@@ -267,17 +262,14 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN || github.token }}
GITHUB_USERNAME: ${{ secrets.PAT_USERNAME || 'openhands-agent' }}
GIT_USERNAME: ${{ secrets.PAT_USERNAME || 'openhands-agent' }}
LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
LLM_API_VERSION: ${{ inputs.LLM_API_VERSION }}
PYTHONPATH: ""
run: |
if [ "${{ steps.check_result.outputs.RESOLUTION_SUCCESS }}" == "true" ]; then
cd /tmp && python -m openhands.resolver.send_pull_request \
--issue-number ${{ env.ISSUE_NUMBER }} \
--target-branch ${{ env.TARGET_BRANCH }} \
--pr-type draft \
--reviewer ${{ github.actor }} | tee pr_result.txt && \
grep "draft created" pr_result.txt | sed 's/.*\///g' > pr_number.txt

98
.github/workflows/py-unit-tests-mac.yml vendored Normal file
View File

@@ -0,0 +1,98 @@
# Workflow that runs python unit tests on mac
name: Run Python Unit Tests Mac
# This job is flaky so only run it nightly
on:
schedule:
- cron: '0 0 * * *'
jobs:
# Run python unit tests on macOS
test-on-macos:
name: Python Unit Tests on macOS
runs-on: macos-14
env:
INSTALL_DOCKER: '1' # Set to '0' to skip Docker installation
strategy:
matrix:
python-version: ['3.12']
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Cache Poetry dependencies
uses: actions/cache@v4
with:
path: |
~/.cache/pypoetry
~/.virtualenvs
key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: |
${{ runner.os }}-poetry-
- name: Install tmux
run: brew install tmux
- name: Install poetry via pipx
run: pipx install poetry
- name: Install Python dependencies using Poetry
run: poetry install --without evaluation,llama-index
- name: Install & Start Docker
if: env.INSTALL_DOCKER == '1'
run: |
INSTANCE_NAME="colima-${GITHUB_RUN_ID}"
# Uninstall colima to upgrade to the latest version
if brew list colima &>/dev/null; then
brew uninstall colima
# unlinking colima dependency: go
brew uninstall go@1.21
fi
rm -rf ~/.colima ~/.lima
brew install --HEAD colima
brew install docker
start_colima() {
# Find a free port in the range 10000-20000
RANDOM_PORT=$((RANDOM % 10001 + 10000))
# Original line:
if ! colima start --network-address --arch x86_64 --cpu=1 --memory=1 --verbose --ssh-port $RANDOM_PORT; then
echo "Failed to start Colima."
return 1
fi
return 0
}
# Attempt to start Colima for 5 total attempts:
ATTEMPT_LIMIT=5
for ((i=1; i<=ATTEMPT_LIMIT; i++)); do
if start_colima; then
echo "Colima started successfully."
break
else
colima stop -f
sleep 10
colima delete -f
if [ $i -eq $ATTEMPT_LIMIT ]; then
exit 1
fi
sleep 10
fi
done
# For testcontainers to find the Colima socket
# https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
- name: Build Environment
run: make build
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
- name: Run Tests
run: poetry run pytest --forked --cov=openhands --cov-report=xml ./tests/unit --ignore=tests/unit/test_memory.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

View File

@@ -19,4 +19,3 @@ jobs:
close-issue-message: 'This issue was closed because it has been stalled for over 30 days with no activity.'
close-pr-message: 'This PR was closed because it has been stalled for over 30 days with no activity.'
days-before-close: 7
operations-per-run: 150

1
.gitignore vendored
View File

@@ -176,6 +176,7 @@ evaluation/gorilla/data
evaluation/toolqa/data
evaluation/scienceagentbench/benchmark
evaluation/commit0_bench/repos
evaluation/visualcodebench/
# openhands resolver
output/

View File

@@ -1,172 +0,0 @@
# OpenHands Glossary
### Agent
The core AI entity in OpenHands that can perform software development tasks by interacting with tools, browsing the web, and modifying code.
#### Agent Controller
A component that manages the agent's lifecycle, handles its state, and coordinates interactions between the agent and various tools.
#### Agent Delegation
The ability of an agent to hand off specific tasks to other specialized agents for better task completion.
#### Agent Hub
A central registry of different agent types and their capabilities, allowing for easy agent selection and instantiation.
#### Agent Skill
A specific capability or function that an agent can perform, such as file manipulation, web browsing, or code editing.
#### Agent State
The current context and status of an agent, including its memory, active tools, and ongoing tasks.
#### CodeAct Agent
[A generalist agent in OpenHands](https://arxiv.org/abs/2407.16741) designed to perform tasks by editing and executing code.
### Browser
A system for web-based interactions and tasks.
#### Browser Gym
A testing and evaluation environment for browser-based agent interactions and tasks.
#### Web Browser Tool
A tool that enables agents to interact with web pages and perform web-based tasks.
### Commands
Terminal and execution related functionality.
#### Bash Session
A persistent terminal session that maintains state and history for bash command execution.
This uses tmux under the hood.
### Configuration
System-wide settings and options.
#### Agent Configuration
Settings that define an agent's behavior, capabilities, and limitations, including available tools and runtime settings.
#### Configuration Options
Settings that control various aspects of OpenHands behavior, including runtime, security, and agent settings.
#### LLM Config
Configuration settings for language models used by agents, including model selection and parameters.
#### LLM Draft Config
Settings for draft mode operations with language models, typically used for faster, lower-quality responses.
#### Runtime Configuration
Settings that define how the runtime environment should be set up and operated.
#### Security Options
Configuration settings that control security features and restrictions.
### Conversation
A sequence of interactions between a user and an agent, including messages, actions, and their results.
#### Conversation Info
Metadata about a conversation, including its status, participants, and timeline.
#### Conversation Manager
A component that handles the creation, storage, and retrieval of conversations.
#### Conversation Metadata
Additional information about conversations, such as tags, timestamps, and related resources.
#### Conversation Status
The current state of a conversation, including whether it's active, completed, or failed.
#### Conversation Store
A storage system for maintaining conversation history and related data.
### Events
#### Event
Every Conversation comprises a series of Events. Each Event is either an Action or an Observation.
#### Event Stream
A continuous flow of events that represents the ongoing activities and interactions in the system.
#### Action
A specific operation or command that an agent executes through available tools, such as running a command or editing a file.
#### Observation
The response or result returned by a tool after an agent's action, providing feedback about the action's outcome.
### Interface
Different ways to interact with OpenHands.
#### CLI Mode
A command-line interface mode for interacting with OpenHands agents without a graphical interface.
#### GUI Mode
A graphical user interface mode for interacting with OpenHands agents through a web interface.
#### Headless Mode
A mode of operation where OpenHands runs without a user interface, suitable for automation and scripting.
### Agent Memory
The system that decides which parts of the Event Stream (i.e. the conversation history) should be passed into each LLM prompt.
#### Memory Store
A storage system for maintaining agent memory and context across sessions.
#### Condenser
A component that processes and summarizes conversation history to maintain context while staying within token limits.
#### Truncation
A very simple Condenser strategy. Reduces conversation history or content to stay within token limits.
### Microagent
A specialized prompt that enhances OpenHands with domain-specific knowledge, repository-specific context, and task-specific workflows.
#### Microagent Registry
A central repository of available microagents and their configurations.
#### Public Microagent
A general-purpose microagent available to all OpenHands users, triggered by specific keywords.
#### Repository Microagent
A type of microagent that provides repository-specific context and guidelines, stored in the `.openhands/microagents/` directory.
### Prompt
Components for managing and processing prompts.
#### Prompt Caching
A system for caching and reusing common prompts to improve performance.
#### Prompt Manager
A component that handles the loading, processing, and management of prompts used by agents, including microagents.
#### Response Parsing
The process of interpreting and structuring responses from language models and tools.
### Runtime
The execution environment where agents perform their tasks, which can be local, remote, or containerized.
#### Action Execution Server
A REST API that receives agent actions (e.g. bash commands, python code, browsing actions), executes them in the runtime environment, and returns the results.
#### Action Execution Client
A component that handles the execution of actions in the runtime environment, managing the communication between the agent and the runtime.
#### Docker Runtime
A containerized runtime environment that provides isolation and reproducibility for agent operations.
#### E2B Runtime
A specialized runtime environment built on E2B for secure and isolated code execution.
#### Local Runtime
A runtime environment that executes on the local machine, suitable for development and testing.
#### Modal Runtime
A runtime environment built on Modal for scalable and distributed agent operations.
#### Remote Runtime
A sandboxed environment that executes code and commands remotely, providing isolation and security for agent operations.
#### Runtime Builder
A component that builds a Docker image for the Action Execution Server based on a user-specified base image.
### Security
Security-related components and features.
#### Security Analyzer
A component that checks agent actions for potential security risks.

View File

@@ -100,7 +100,7 @@ poetry run pytest ./tests/unit/test_*.py
To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by
setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.25-nikolaik`
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.21-nikolaik`
## Develop inside Docker container

View File

@@ -12,7 +12,7 @@
<a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue"></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License"></a>
<br/>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2ypg5jweb-d~6hObZDbXi_HEL8PDrbHg"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
<a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits"></a>
<br/>
@@ -43,17 +43,17 @@ See the [Running OpenHands](https://docs.all-hands.dev/modules/usage/installatio
system requirements and more information.
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ~/.openhands-state:/.openhands-state \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.25
docker.all-hands.dev/all-hands-ai/openhands:0.21
```
You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
@@ -96,7 +96,7 @@ troubleshooting resources, and advanced configuration options.
OpenHands is a community-driven project, and we welcome contributions from everyone. We do most of our communication
through Slack, so this is the best place to start, but we also are happy to have you contact us on Discord or Github:
- [Join our Slack workspace](https://join.slack.com/t/openhands-ai/shared_invite/zt-2ypg5jweb-d~6hObZDbXi_HEL8PDrbHg) - Here we talk about research, architecture, and future development.
- [Join our Slack workspace](https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw) - Here we talk about research, architecture, and future development.
- [Join our Discord server](https://discord.gg/ESHStjSjD4) - This is a community-run server for general discussion, questions, and feedback.
- [Read or post Github Issues](https://github.com/All-Hands-AI/OpenHands/issues) - Check out the issues we're working on, or add your own ideas.

View File

@@ -1,4 +1,5 @@
#!/bin/bash
set -e
cp pyproject.toml poetry.lock openhands
poetry build -v

View File

@@ -104,7 +104,7 @@ workspace_base = "./workspace"
#aws_secret_access_key = ""
# API key to use (For Headless / CLI only - In Web this is overridden by Session Init)
api_key = ""
api_key = "your-api-key"
# API base URL (For Headless / CLI only - In Web this is overridden by Session Init)
#base_url = ""
@@ -195,7 +195,7 @@ model = "gpt-4o"
#native_tool_calling = None
[llm.gpt4o-mini]
api_key = ""
api_key = "your-api-key"
model = "gpt-4o"

View File

@@ -11,7 +11,7 @@ services:
- BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
- SANDBOX_API_HOSTNAME=host.docker.internal
#
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.25-nikolaik}
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.21-nikolaik}
- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
ports:

View File

@@ -24,6 +24,3 @@ inline-quotes = "single"
[format]
quote-style = "single"
[lint.flake8-bugbear]
extend-immutable-calls = ["Depends", "fastapi.Depends", "fastapi.params.Depends"]

View File

@@ -7,7 +7,7 @@ services:
image: openhands:latest
container_name: openhands-app-${DATE:-}
environment:
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik}
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik}
#- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} # enable this only if you want a specific non-root sandbox user but you will have to manually adjust permissions of openhands-state for this user
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
ports:

View File

@@ -46,11 +46,3 @@ docker run -it \
-e THAT=that
...
```
### Referring to UI Elements
When referencing UI elements, use ``.
Example:
1. Toggle the `Advanced` option
2. Enter your model in the `Custom Model` textbox.

View File

@@ -1,8 +1,8 @@
# 📦 Runtime Docker
# 📦 Runtime EventStream
Le Runtime Docker d'OpenHands est le composant principal qui permet l'exécution sécurisée et flexible des actions des agents d'IA.
Le Runtime EventStream d'OpenHands est le composant principal qui permet l'exécution sécurisée et flexible des actions des agents d'IA.
Il crée un environnement en bac à sable (sandbox) en utilisant Docker, où du code arbitraire peut être exécuté en toute sécurité sans risquer le système hôte.
## Pourquoi avons-nous besoin d'un runtime en bac à sable ?

View File

@@ -163,7 +163,7 @@ Les options de configuration de base sont définies dans la section `[core]` du
- `runtime`
- Type : `str`
- Valeur par défaut : `"docker"`
- Valeur par défaut : `"eventstream"`
- Description : Environnement d'exécution
- `default_agent`

View File

@@ -95,3 +95,7 @@ sandbox_user_id="1001"
### Erreurs de port d'utilisation
Si vous voyez un message d'erreur indiquant que le port est utilisé ou indisponible, essayez de supprimer toutes les containers docker en cours d'exécution (exécutez `docker ps` et `docker rm` des containers concernés) puis ré-exécutez ```make run```
## Discuter
Pour d'autres problèmes ou questions rejoignez le [Slack](https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw) ou le [Discord](https://discord.gg/ESHStjSjD4) et demandez!

View File

@@ -52,7 +52,7 @@ LLM_API_KEY="sk_test_12345"
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -61,7 +61,7 @@ docker run -it \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.25 \
docker.all-hands.dev/all-hands-ai/openhands:0.21 \
python -m openhands.core.cli
```

View File

@@ -114,7 +114,7 @@ Pour créer un workflow d'évaluation pour votre benchmark, suivez ces étapes :
def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
runtime='docker',
runtime='eventstream',
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
base_container_image='your_container_image',

View File

@@ -46,7 +46,7 @@ LLM_API_KEY="sk_test_12345"
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -56,6 +56,6 @@ docker run -it \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.25 \
docker.all-hands.dev/all-hands-ai/openhands:0.21 \
python -m openhands.core.main -t "write a bash script that prints hi" --no-auto-continue
```

View File

@@ -13,16 +13,16 @@
La façon la plus simple d'exécuter OpenHands est avec Docker.
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.25
docker.all-hands.dev/all-hands-ai/openhands:0.21
```
Vous pouvez également exécuter OpenHands en mode [headless scriptable](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), en tant que [CLI interactive](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), ou en utilisant l'[Action GitHub OpenHands](https://docs.all-hands.dev/modules/usage/how-to/github-action).

View File

@@ -42,7 +42,7 @@ Explorez le code source d'OpenHands sur [GitHub](https://github.com/All-Hands-AI
/>
</a>
<br></br>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2ypg5jweb-d~6hObZDbXi_HEL8PDrbHg">
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw">
<img
src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
alt="Join our Slack community"

View File

@@ -13,7 +13,7 @@ C'est le Runtime par défaut qui est utilisé lorsque vous démarrez OpenHands.
```
docker run # ...
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-v /var/run/docker.sock:/var/run/docker.sock \
# ...
```

View File

@@ -1,8 +1,8 @@
以下是翻译后的内容:
# 📦 Docker 运行时
# 📦 EventStream 运行时
OpenHands Docker 运行时是实现 AI 代理操作安全灵活执行的核心组件。
OpenHands EventStream 运行时是实现 AI 代理操作安全灵活执行的核心组件。
它使用 Docker 创建一个沙盒环境,可以安全地运行任意代码而不会危及主机系统。
## 为什么我们需要沙盒运行时?

View File

@@ -162,7 +162,7 @@
- `runtime`
- 类型: `str`
- 默认值: `"docker"`
- 默认值: `"eventstream"`
- 描述: 运行时环境
- `default_agent`

View File

@@ -96,3 +96,7 @@ sandbox_user_id="1001"
### 端口使用错误
如果您遇到端口被占用或不可用的错误提示,可以尝试先用`docker ps`命令列出所有运行中的 Docker 容器,然后使用`docker rm`命令删除相关容器,最后再重新执行```make run```命令。
## 讨论
对于其他问题或疑问,请加入 [Slack](https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw) 或 [Discord](https://discord.gg/ESHStjSjD4) 提问!

View File

@@ -50,7 +50,7 @@ LLM_API_KEY="sk_test_12345"
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -59,7 +59,7 @@ docker run -it \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.25 \
docker.all-hands.dev/all-hands-ai/openhands:0.21 \
python -m openhands.core.cli
```

View File

@@ -112,7 +112,7 @@ OpenHands 的主要入口点在 `openhands/core/main.py` 中。以下是它的
def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
runtime='docker',
runtime='eventstream',
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
base_container_image='your_container_image',

View File

@@ -47,7 +47,7 @@ LLM_API_KEY="sk_test_12345"
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -57,6 +57,6 @@ docker run -it \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.25 \
docker.all-hands.dev/all-hands-ai/openhands:0.21 \
python -m openhands.core.main -t "write a bash script that prints hi" --no-auto-continue
```

View File

@@ -11,16 +11,16 @@
在 Docker 中运行 OpenHands 是最简单的方式。
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.25
docker.all-hands.dev/all-hands-ai/openhands:0.21
```
你也可以在可脚本化的[无头模式](https://docs.all-hands.dev/modules/usage/how-to/headless-mode)下运行 OpenHands作为[交互式 CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode),或使用 [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action)。

View File

@@ -42,7 +42,7 @@ OpenHands 是一个**自主 AI 软件工程师**,能够执行复杂的工程
/>
</a>
<br></br>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2ypg5jweb-d~6hObZDbXi_HEL8PDrbHg">
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw">
<img
src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
alt="Join our Slack community"

View File

@@ -11,7 +11,7 @@
```
docker run # ...
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-v /var/run/docker.sock:/var/run/docker.sock \
# ...
```

View File

@@ -1,6 +1,6 @@
# 📦 Docker Runtime
# 📦 EventStream Runtime
The OpenHands Docker Runtime is the core component that enables secure and flexible execution of AI agent's action.
The OpenHands EventStream Runtime is the core component that enables secure and flexible execution of AI agent's action.
It creates a sandboxed environment using Docker, where arbitrary code can be run safely without risking the host system.
## Why do we need a sandboxed runtime?
@@ -54,13 +54,14 @@ graph TD
6. Action Execution: The runtime client receives actions from the backend, executes them in the sandboxed environment, and sends back observations
7. Observation Return: The action execution server sends execution results back to the OpenHands backend as observations
The role of the client:
The role of the client:
- It acts as an intermediary between the OpenHands backend and the sandboxed environment
- It executes various types of actions (shell commands, file operations, Python code, etc.) safely within the container
- It manages the state of the sandboxed environment, including the current working directory and loaded plugins
- It formats and returns observations to the backend, ensuring a consistent interface for processing results
## How OpenHands builds and maintains OH Runtime images
OpenHands' approach to building and managing runtime images ensures efficiency, consistency, and flexibility in creating and maintaining Docker images for both production and development environments.
@@ -77,15 +78,16 @@ Tags may be in one of 2 formats:
- **Source Tag**: `oh_v{openhands_version}_{16_digit_lock_hash}_{16_digit_source_hash}`
(e.g.: `oh_v0.9.9_1234567890abcdef_1234567890abcdef`)
#### Source Tag - Most Specific
This is the first 16 digits of the MD5 of the directory hash for the source directory. This gives a hash
for only the openhands source
#### Lock Tag
This hash is built from the first 16 digits of the MD5 of:
- The name of the base image upon which the image was built (e.g.: `nikolaik/python-nodejs:python3.12-nodejs22`)
- The content of the `pyproject.toml` included in the image.
- The content of the `poetry.lock` included in the image.

View File

@@ -1,21 +0,0 @@
# Cloud GitHub Resolver
The GitHub Resolver automates code fixes and provides intelligent assistance for your repositories.
## Setup
The Cloud Github Resolver is available automatically when you
[grant OpenHands Cloud repository access](./openhands-cloud.md#adding-repositories).
## Usage
### Issues
On your repository, label an issue with `openhands`. OpenHands will attempt to fix the issue.
### Pull Requests
In order to get OpenHands to work on pull requests, use `@openhands` in top level or single inline comments to:
- Ask questions
- Request updates
- Get code explanations

View File

@@ -1,39 +0,0 @@
# Openhands Cloud
OpenHands Cloud is the cloud hosted version of OpenHands by All Hands AI.
## Accessing OpenHands Cloud
Currently, users are being admitted to access OpenHands Cloud in waves. To sign up,
[join the waitlist](https://www.all-hands.dev/join-waitlist). Once you are approved, you will get an email with
instructions on how to access it.
## Getting Started
After visiting OpenHands Cloud, you will be asked to connect with your GitHub account:
1. After reading and accepting the terms of service, click `Connect to GitHub`.
2. Review the permissions requested by OpenHands and then click `Authorize OpenHands by All Hands AI`.
- OpenHands will require some permissions from your GitHub account. To read more about these permissions,
you can click the `Learn more` link on the GitHub authorize page.
## Adding Repositories
You can grant OpenHands specific repository access:
1. Under the `Select a GitHub project` dropdown, select `Add more repositories...`.
2. Select the organization, then choose the specific repositories to grant OpenHands access to.
- Openhands requests short-lived tokens (8-hour expiry) with these permissions:
- Actions: Read and write
- Administration: Read-only
- Commit statuses: Read and write
- Contents: Read and write
- Issues: Read and write
- Metadata: Read-only
- Pull requests: Read and write
- Webhooks: Read and write
- Workflows: Read and write
- Repository access for a user is granted based on:
- Granted permission for the repository.
- User's GitHub permissions (owner/collaborator).
You can manage repository access any time by following the above workflow or visiting the Settings page and selecting
`Configure GitHub Repositories` under the `GitHub Settings` section.

View File

@@ -126,7 +126,7 @@ The core configuration options are defined in the `[core]` section of the `confi
- `runtime`
- Type: `str`
- Default: `"docker"`
- Default: `"eventstream"`
- Description: Runtime environment
- `default_agent`

View File

@@ -35,7 +35,7 @@ To run OpenHands in CLI mode with Docker:
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -45,7 +45,7 @@ docker run -it \
-v ~/.openhands-state:/.openhands-state \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.25 \
docker.all-hands.dev/all-hands-ai/openhands:0.21 \
python -m openhands.core.cli
```

View File

@@ -41,16 +41,8 @@ docker build -t custom-image .
This will produce a new image called `custom-image`, which will be available in Docker.
## Using the Docker Command
When running OpenHands using [the docker command](/modules/usage/installation#start-the-app), replace
`-e SANDBOX_RUNTIME_CONTAINER_IMAGE=...` with `-e SANDBOX_BASE_CONTAINER_IMAGE=<custom image name>`:
```commandline
docker run -it --rm --pull=always \
-e SANDBOX_BASE_CONTAINER_IMAGE=custom-image \
...
```
> Note that in the configuration described in this document, OpenHands will run as user "openhands" inside the
> sandbox and thus all packages installed via the docker file should be available to all users on the system, not just root.
## Using the Development Workflow

View File

@@ -112,7 +112,7 @@ To create an evaluation workflow for your benchmark, follow these steps:
def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
runtime='docker',
runtime='eventstream',
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
base_container_image='your_container_image',

View File

@@ -42,10 +42,9 @@ You can provide custom directions for OpenHands by following the [README for the
Github resolver will automatically check for valid [repository secrets](https://docs.github.com/en/actions/security-for-github-actions/security-guides/using-secrets-in-github-actions?tool=webui#creating-secrets-for-a-repository) or [repository variables](https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#creating-configuration-variables-for-a-repository) to customize its behavior.
The customization options you can set are:
| **Attribute name** | **Type** | **Purpose** | **Example** |
| -------------------------------- | -------- | --------------------------------------------------------------------------------------------------- | -------------------------------------------------- |
| `LLM_MODEL` | Variable | Set the LLM to use with OpenHands | `LLM_MODEL="anthropic/claude-3-5-sonnet-20241022"` |
| `OPENHANDS_MAX_ITER` | Variable | Set max limit for agent iterations | `OPENHANDS_MAX_ITER=10` |
| `OPENHANDS_MACRO` | Variable | Customize default macro for invoking the resolver | `OPENHANDS_MACRO=@resolveit` |
| `OPENHANDS_BASE_CONTAINER_IMAGE` | Variable | Custom Sandbox ([learn more](https://docs.all-hands.dev/modules/usage/how-to/custom-sandbox-guide)) | `OPENHANDS_BASE_CONTAINER_IMAGE="custom_image"` |
| `TARGET_BRANCH` | Variable | Merge to branch other than `main` | `TARGET_BRANCH="dev"` |
| **Attribute name** | **Type** | **Purpose** | **Example** |
|----------------------------------| -------- |-------------------------------------------------------------------------------------------------------------|------------------------------------------------------|
| `LLM_MODEL` | Variable | Set the LLM to use with OpenHands | `LLM_MODEL="anthropic/claude-3-5-sonnet-20241022"` |
| `OPENHANDS_MAX_ITER` | Variable | Set max limit for agent iterations | `OPENHANDS_MAX_ITER=10` |
| `OPENHANDS_MACRO` | Variable | Customize default macro for invoking the resolver | `OPENHANDS_MACRO=@resolveit` |
| `OPENHANDS_BASE_CONTAINER_IMAGE` | Variable | Custom Sandbox ([learn more](https://docs.all-hands.dev/modules/usage/how-to/custom-sandbox-guide)) | `OPENHANDS_BASE_CONTAINER_IMAGE="custom_image"` |

View File

@@ -1,6 +1,9 @@
# GUI Mode
OpenHands provides a Graphical User Interface (GUI) mode for interacting with the AI assistant.
## Introduction
OpenHands provides a user-friendly Graphical User Interface (GUI) mode for interacting with the AI assistant.
This mode offers an intuitive way to set up the environment, manage settings, and communicate with the AI.
## Installation and Setup
@@ -11,95 +14,104 @@ OpenHands provides a Graphical User Interface (GUI) mode for interacting with th
### Initial Setup
1. Upon first launch, you'll see a settings page.
2. Select an `LLM Provider` and `LLM Model` from the dropdown menus. If the required model does not exist in the list,
toggle `Advanced` options and enter it with the correct prefix in the `Custom Model` text box.
1. Upon first launch, you'll see a settings modal.
2. Select an `LLM Provider` and `LLM Model` from the dropdown menus.
3. Enter the corresponding `API Key` for your chosen provider.
4. Click `Save Changes` to apply the settings.
4. Click "Save" to apply the settings.
### GitHub Token Setup
OpenHands automatically exports a `GITHUB_TOKEN` to the shell environment if it is available. This can happen in two ways:
- **Local Installation**: The user directly inputs their GitHub token.
<details>
<summary>Setting Up a GitHub Token</summary>
1. **Generate a Personal Access Token (PAT)**:
- On GitHub, go to Settings > Developer Settings > Personal Access Tokens > Tokens (classic).
- Click `Generate new token (classic)`.
- **Locally (OSS)**: The user directly inputs their GitHub token.
- **Online (SaaS)**: The token is obtained through GitHub OAuth authentication.
#### Setting Up a Local GitHub Token
1. **Generate a Personal Access Token (PAT)**:
- Go to GitHub Settings > Developer Settings > Personal Access Tokens > Tokens (classic).
- Click "Generate new token (classic)".
- Required scopes:
- `repo` (Full control of private repositories)
2. **Enter Token in OpenHands**:
- Click the Settings button (gear icon).
- Navigate to the `GitHub Settings` section.
- Paste your token in the `GitHub Token` field.
- Click `Save Changes` to apply the changes.
</details>
- `workflow` (Update GitHub Action workflows)
- `read:org` (Read organization data)
<details>
<summary>Organizational Token Policies</summary>
2. **Enter Token in OpenHands**:
- Click the Settings button (gear icon) in the top right.
- Navigate to the "GitHub" section.
- Paste your token in the "GitHub Token" field.
- Click "Save" to apply the changes.
If you're working with organizational repositories, additional setup may be required:
#### Organizational Token Policies
1. **Check Organization Requirements**:
If you're working with organizational repositories, additional setup may be required:
1. **Check Organization Requirements**:
- Organization admins may enforce specific token policies.
- Some organizations require tokens to be created with SSO enabled.
- Review your organization's [token policy settings](https://docs.github.com/en/organizations/managing-programmatic-access-to-your-organization/setting-a-personal-access-token-policy-for-your-organization).
2. **Verify Organization Access**:
2. **Verify Organization Access**:
- Go to your token settings on GitHub.
- Look for the organization under `Organization access`.
- If required, click `Enable SSO` next to your organization.
- Look for the organization under "Organization access".
- If required, click "Enable SSO" next to your organization.
- Complete the SSO authorization process.
</details>
<details>
<summary>Troubleshooting</summary>
#### OAuth Authentication (Online Mode)
Common issues and solutions:
When using OpenHands in online mode, the GitHub OAuth flow:
- **Token Not Recognized**:
- Ensure the token is properly saved in settings.
- Check that the token hasn't expired.
- Verify the token has the required scopes.
- Try regenerating the token.
- **Organization Access Denied**:
- Check if SSO is required but not enabled.
- Verify organization membership.
- Contact organization admin if token policies are blocking access.
- **Verifying Token Works**:
- The app will show a green checkmark if the token is valid.
- Try accessing a repository to confirm permissions.
- Check the browser console for any error messages.
</details>
- **OpenHands Cloud**: The token is obtained through GitHub OAuth authentication.
<details>
<summary>OAuth Authentication</summary>
When using OpenHands Cloud, the GitHub OAuth flow requests the following permissions:
1. Requests the following permissions:
- Repository access (read/write)
- Workflow management
- Organization read access
To authenticate OpenHands:
- Click `Sign in with GitHub` when prompted.
2. Authentication steps:
- Click "Sign in with GitHub" when prompted.
- Review the requested permissions.
- Authorize OpenHands to access your GitHub account.
- If using an organization, authorize organization access if prompted.
</details>
#### Troubleshooting
Common issues and solutions:
- **Token Not Recognized**:
- Ensure the token is properly saved in settings.
- Check that the token hasn't expired.
- Verify the token has the required scopes.
- Try regenerating the token.
- **Organization Access Denied**:
- Check if SSO is required but not enabled.
- Verify organization membership.
- Contact organization admin if token policies are blocking access.
- **Verifying Token Works**:
- The app will show a green checkmark if the token is valid.
- Try accessing a repository to confirm permissions.
- Check the browser console for any error messages.
- Use the "Test Connection" button in settings if available.
### Advanced Settings
1. Inside the Settings page, toggle `Advanced` options to access additional settings.
1. Toggle `Advanced Options` to access additional settings.
2. Use the `Custom Model` text box to manually enter a model if it's not in the list.
3. Specify a `Base URL` if required by your LLM provider.
### Main Interface
The main interface consists of several key components:
- **Chat Window**: The central area where you can view the conversation history with the AI assistant.
- **Input Box**: Located at the bottom of the screen, use this to type your messages or commands to the AI.
- **Send Button**: Click this to send your message to the AI.
- **Settings Button**: A gear icon that opens the settings modal, allowing you to adjust your configuration at any time.
- **Workspace Panel**: Displays the files and folders in your workspace, allowing you to navigate and view files, or the agent's past commands or web browsing history.
### Interacting with the AI
1. Type your prompt in the input box.
1. Type your question, request, or task description in the input box.
2. Click the send button or press Enter to submit your message.
3. The AI will process your input and provide a response in the chat window.
4. You can continue the conversation by asking follow-up questions or providing additional information.

View File

@@ -32,7 +32,7 @@ To run OpenHands in Headless mode with Docker:
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -43,7 +43,7 @@ docker run -it \
-v ~/.openhands-state:/.openhands-state \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.25 \
docker.all-hands.dev/all-hands-ai/openhands:0.21 \
python -m openhands.core.main -t "write a bash script that prints hi"
```

View File

@@ -6,14 +6,11 @@
- Linux
- Windows with [WSL](https://learn.microsoft.com/en-us/windows/wsl/install) and [Docker Desktop support](https://docs.docker.com/desktop/setup/install/windows-install/#system-requirements)
A system with a modern processor and a minimum of **4GB RAM** is recommended to run OpenHands.
## Prerequisites
<details>
<summary>MacOS</summary>
**Docker Desktop**
### Docker Desktop
1. [Install Docker Desktop on Mac](https://docs.docker.com/desktop/setup/install/mac-install).
2. Open Docker Desktop, go to `Settings > Advanced` and ensure `Allow the default Docker socket to be used` is enabled.
@@ -26,7 +23,7 @@ A system with a modern processor and a minimum of **4GB RAM** is recommended to
Tested with Ubuntu 22.04.
:::
**Docker Desktop**
### Docker Desktop
1. [Install Docker Desktop on Linux](https://docs.docker.com/desktop/setup/install/linux/).
@@ -34,23 +31,18 @@ A system with a modern processor and a minimum of **4GB RAM** is recommended to
<details>
<summary>Windows</summary>
**WSL**
### WSL
1. [Install WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
2. Run `wsl --version` in powershell and confirm `Default Version: 2`.
**Docker Desktop**
### Docker Desktop
1. [Install Docker Desktop on Windows](https://docs.docker.com/desktop/setup/install/windows-install).
2. Open Docker Desktop, go to `Settings` and confirm the following:
- General: `Use the WSL 2 based engine` is enabled.
- Resources > WSL Integration: `Enable integration with my default WSL distro` is enabled.
:::note
The docker command below to start the app must be run inside the WSL terminal.
:::
</details>
## Start the App
@@ -58,17 +50,17 @@ A system with a modern processor and a minimum of **4GB RAM** is recommended to
The easiest way to run OpenHands is in Docker.
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ~/.openhands-state:/.openhands-state \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.25
docker.all-hands.dev/all-hands-ai/openhands:0.21
```
You'll find OpenHands running at http://localhost:3000!
@@ -80,22 +72,24 @@ or run it on tagged issues with [a github action](https://docs.all-hands.dev/mod
## Setup
Upon launching OpenHands, you'll see a Settings page. You **must** select an `LLM Provider` and `LLM Model` and enter a corresponding `API Key`.
Upon launching OpenHands, you'll see a settings modal. You **must** select an `LLM Provider` and `LLM Model` and enter a corresponding `API Key`.
These can be changed at any time by selecting the `Settings` button (gear icon) in the UI.
If the required model does not exist in the list, you can toggle `Advanced` options and manually enter it with the correct prefix
If the required `LLM Model` does not exist in the list, you can toggle `Advanced Options` and manually enter it with the correct prefix
in the `Custom Model` text box.
The `Advanced` options also allow you to specify a `Base URL` if required.
The `Advanced Options` also allow you to specify a `Base URL` if required.
Now you're ready to [get started with OpenHands](./getting-started).
<div style={{ display: 'flex', justifyContent: 'center', gap: '20px' }}>
<img src="/img/settings-screenshot.png" alt="settings-modal" width="340" />
<img src="/img/settings-advanced.png" alt="settings-modal" width="335" />
</div>
## Versions
The [docker command above](./installation#start-the-app) pulls the most recent stable release of OpenHands. You have other options as well:
- For a specific release, replace $VERSION in `openhands:$VERSION` and `runtime:$VERSION`, with the version number.
We use SemVer so `0.9` will automatically point to the latest `0.9.x` release, and `0` will point to the latest `0.x.x` release.
- For the most up-to-date development version, replace $VERSION in `openhands:$VERSION` and `runtime:$VERSION`, with `main`.
This version is unstable and is recommended for testing or development purposes only.
The command above pulls the most recent stable release of OpenHands. You have other options as well:
- For a specific release, use `docker.all-hands.dev/all-hands-ai/openhands:$VERSION`, replacing $VERSION with the version number.
- We use semver, and release major, minor, and patch tags. So `0.9` will automatically point to the latest `0.9.x` release, and `0` will point to the latest `0.x.x` release.
- For the most up-to-date development version, you can use `docker.all-hands.dev/all-hands-ai/openhands:main`. This version is unstable and is recommended for testing or development purposes only.
You can choose the tag that best suits your needs based on stability requirements and desired features.

View File

@@ -25,7 +25,7 @@ You will need your ChatGPT deployment name which can be found on the deployments
&lt;deployment-name&gt; below.
:::
1. Enable `Advanced` options
1. Enable `Advanced Options`
2. Set the following:
- `Custom Model` to azure/&lt;deployment-name&gt;
- `Base URL` to your Azure API Base URL (e.g. `https://example-endpoint.openai.azure.com`)

View File

@@ -10,7 +10,7 @@ OpenHands uses LiteLLM to make calls to Google's chat models. You can find their
When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
- `LLM Provider` to `Gemini`
- `LLM Model` to the model you will be using.
If the model is not in the list, toggle `Advanced` options, and enter it in `Custom Model` (e.g. gemini/&lt;model-name&gt; like `gemini/gemini-1.5-pro`).
If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. gemini/&lt;model-name&gt; like `gemini/gemini-1.5-pro`).
- `API Key` to your Gemini API key
## VertexAI - Google Cloud Platform Configs
@@ -27,4 +27,4 @@ VERTEXAI_LOCATION="<your-gcp-location>"
Then set the following in the OpenHands UI through the Settings:
- `LLM Provider` to `VertexAI`
- `LLM Model` to the model you will be using.
If the model is not in the list, toggle `Advanced` options, and enter it in `Custom Model` (e.g. vertex_ai/&lt;model-name&gt;).
If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. vertex_ai/&lt;model-name&gt;).

View File

@@ -8,7 +8,7 @@ When running OpenHands, you'll need to set the following in the OpenHands UI thr
- `LLM Provider` to `Groq`
- `LLM Model` to the model you will be using. [Visit here to see the list of
models that Groq hosts](https://console.groq.com/docs/models). If the model is not in the list, toggle
`Advanced` options, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`).
`Advanced Options`, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`).
- `API key` to your Groq API key. To find or create your Groq API Key, [see here](https://console.groq.com/keys).
@@ -17,7 +17,7 @@ models that Groq hosts](https://console.groq.com/docs/models). If the model is n
The Groq endpoint for chat completion is [mostly OpenAI-compatible](https://console.groq.com/docs/openai). Therefore, you can access Groq models as you
would access any OpenAI-compatible endpoint. In the OpenHands UI through the Settings:
1. Enable `Advanced` options
1. Enable `Advanced Options`
2. Set the following:
- `Custom Model` to the prefix `openai/` + the model you will be using (e.g. `openai/llama3-70b-8192`)
- `Base URL` to `https://api.groq.com/openai/v1`

View File

@@ -8,7 +8,7 @@ To use LiteLLM proxy with OpenHands, you need to:
1. Set up a LiteLLM proxy server (see [LiteLLM documentation](https://docs.litellm.ai/docs/proxy/quick_start))
2. When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
* Enable `Advanced` options
* Enable `Advanced Options`
* `Custom Model` to the prefix `litellm_proxy/` + the model you will be using (e.g. `litellm_proxy/anthropic.claude-3-5-sonnet-20241022-v2:0`)
* `Base URL` to your LiteLLM proxy URL (e.g. `https://your-litellm-proxy.com`)
* `API Key` to your LiteLLM proxy API key

View File

@@ -38,7 +38,7 @@ The following can be set in the OpenHands UI through the Settings:
- `LLM Provider`
- `LLM Model`
- `API Key`
- `Base URL` (through `Advanced` settings)
- `Base URL` (through `Advanced Settings`)
There are some settings that may be necessary for some LLMs/providers that cannot be set through the UI. Instead, these
can be set through environment variables passed to the [docker run command](/modules/usage/installation#start-the-app)
@@ -63,22 +63,22 @@ We have a few guides for running OpenHands with specific model providers:
### API retries and rate limits
LLM providers typically have rate limits, sometimes very low, and may require retries. OpenHands will automatically
retry requests if it receives a Rate Limit Error (429 error code).
retry requests if it receives a Rate Limit Error (429 error code), API connection error, or other transient errors.
You can customize these options as you need for the provider you're using. Check their documentation, and set the
following environment variables to control the number of retries and the time between retries:
- `LLM_NUM_RETRIES` (Default of 4 times)
- `LLM_RETRY_MIN_WAIT` (Default of 5 seconds)
- `LLM_RETRY_MAX_WAIT` (Default of 30 seconds)
- `LLM_NUM_RETRIES` (Default of 8)
- `LLM_RETRY_MIN_WAIT` (Default of 15 seconds)
- `LLM_RETRY_MAX_WAIT` (Default of 120 seconds)
- `LLM_RETRY_MULTIPLIER` (Default of 2)
If you are running OpenHands in development mode, you can also set these options in the `config.toml` file:
```toml
[llm]
num_retries = 4
retry_min_wait = 5
retry_max_wait = 30
num_retries = 8
retry_min_wait = 15
retry_max_wait = 120
retry_multiplier = 2
```

View File

@@ -8,7 +8,7 @@ When running OpenHands, you'll need to set the following in the OpenHands UI thr
* `LLM Provider` to `OpenAI`
* `LLM Model` to the model you will be using.
[Visit here to see a full list of OpenAI models that LiteLLM supports.](https://docs.litellm.ai/docs/providers/openai#openai-chat-completion-models)
If the model is not in the list, toggle `Advanced` options, and enter it in `Custom Model` (e.g. openai/&lt;model-name&gt; like `openai/gpt-4o`).
If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. openai/&lt;model-name&gt; like `openai/gpt-4o`).
* `API Key` to your OpenAI API key. To find or create your OpenAI Project API Key, [see here](https://platform.openai.com/api-keys).
## Using OpenAI-Compatible Endpoints
@@ -18,7 +18,7 @@ Just as for OpenAI Chat completions, we use LiteLLM for OpenAI-compatible endpoi
## Using an OpenAI Proxy
If you're using an OpenAI proxy, in the OpenHands UI through the Settings:
1. Enable `Advanced` options
1. Enable `Advanced Options`
2. Set the following:
- `Custom Model` to openai/&lt;model-name&gt; (e.g. `openai/gpt-4o` or openai/&lt;proxy-prefix&gt;/&lt;model-name&gt;)
- `Base URL` to the URL of your OpenAI proxy

View File

@@ -8,5 +8,5 @@ When running OpenHands, you'll need to set the following in the OpenHands UI thr
* `LLM Provider` to `OpenRouter`
* `LLM Model` to the model you will be using.
[Visit here to see a full list of OpenRouter models](https://openrouter.ai/models).
If the model is not in the list, toggle `Advanced` options, and enter it in `Custom Model` (e.g. openrouter/&lt;model-name&gt; like `openrouter/anthropic/claude-3.5-sonnet`).
If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. openrouter/&lt;model-name&gt; like `openrouter/anthropic/claude-3.5-sonnet`).
* `API Key` to your OpenRouter API key.

View File

@@ -16,7 +16,7 @@ some flags being passed to `docker run` that make this possible:
```
docker run # ...
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.25-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-v /var/run/docker.sock:/var/run/docker.sock \
# ...
```

2077
docs/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -22,8 +22,8 @@
"@mdx-js/react": "^3.1.0",
"clsx": "^2.0.0",
"prism-react-renderer": "^2.4.1",
"react": "^19.0.0",
"react-dom": "^19.0.0",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-icons": "^5.4.0",
"react-use": "^17.6.0"
},
@@ -31,7 +31,7 @@
"@docusaurus/module-type-aliases": "^3.5.1",
"@docusaurus/tsconfig": "^3.7.0",
"@docusaurus/types": "^3.5.1",
"typescript": "~5.7.3"
"typescript": "~5.7.2"
},
"browserslist": {
"production": [

View File

@@ -42,7 +42,7 @@ const sidebars: SidebarsConfig = {
id: 'usage/prompting/microagents-public',
},
],
},
}
],
},
{
@@ -66,26 +66,9 @@ const sidebars: SidebarsConfig = {
},
{
type: 'doc',
label: 'Github Action',
label: 'Github Actions',
id: 'usage/how-to/github-action',
},
{
type: 'category',
label: 'Cloud',
items: [
{
type: 'doc',
label: 'Openhands Cloud',
id: 'usage/cloud/openhands-cloud',
},
{
type: 'doc',
label: 'Cloud GitHub Resolver',
id: 'usage/cloud/cloud-github-resolver',
},
],
},
],
},
{
@@ -202,7 +185,7 @@ const sidebars: SidebarsConfig = {
type: 'doc',
label: 'About',
id: 'usage/about',
},
}
],
};

View File

@@ -8,7 +8,7 @@ function CustomFooter() {
<footer className="custom-footer">
<div className="footer-content">
<div className="footer-icons">
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2ypg5jweb-d~6hObZDbXi_HEL8PDrbHg" target="_blank" rel="noopener noreferrer">
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw" target="_blank" rel="noopener noreferrer">
<FaSlack />
</a>
<a href="https://discord.gg/ESHStjSjD4" target="_blank" rel="noopener noreferrer">

View File

@@ -23,7 +23,7 @@ export function HomepageHeader() {
<a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" /></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License" /></a>
<br/>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2ypg5jweb-d~6hObZDbXi_HEL8PDrbHg"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
<a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community" /></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits" /></a>
<br/>

View File

@@ -23,17 +23,6 @@ export default function Home(): JSX.Element {
})}
>
<HomepageHeader />
<div style={{ textAlign: 'center', padding: '2rem' }}>
<br />
<h2>Most Popular Links</h2>
<ul style={{ listStyleType: 'none'}}>
<li><a href="/modules/usage/Installation">How to Run OpenHands</a></li>
<li><a href="/modules/usage/prompting/microagents-repo">Customizing OpenHands to a repository</a></li>
<li><a href="/modules/usage/how-to/github-action">Integrating OpenHands with Github</a></li>
<li><a href="/modules/usage/llms#model-recommendations">Recommended models to use</a></li>
<li><a href="/modules/usage/runtimes#connecting-to-your-filesystem">Connecting OpenHands to your filesystem</a></li>
</ul>
</div>
</Layout>
);
}

BIN
docs/static/img/settings-advanced.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

BIN
docs/static/img/settings-screenshot.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

View File

@@ -9,7 +9,6 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -18,6 +17,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
@@ -60,14 +60,16 @@ AGENT_CLS_TO_INST_SUFFIX = {
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=False,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -17,7 +17,6 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -26,6 +25,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
@@ -40,15 +40,20 @@ from openhands.utils.async_utils import call_async_from_sync
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-slim'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.12-slim',
enable_auto_lint=True,
use_host_network=False,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_init_timeout=3600,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -16,7 +16,6 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -25,6 +24,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
load_from_toml,
parse_arguments,
@@ -47,14 +47,21 @@ SKIP_NUM = (
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.11-bookworm'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.11-bookworm',
enable_auto_lint=True,
use_host_network=False,
timeout=100,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_init_timeout=1800,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -14,7 +14,6 @@ from evaluation.utils.shared import (
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -23,6 +22,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
@@ -57,15 +57,17 @@ def get_config(
metadata: EvalMetadata,
) -> AppConfig:
BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
enable_auto_lint=True,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -17,7 +17,6 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -26,6 +25,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
@@ -71,15 +71,16 @@ AGENT_CLS_TO_INST_SUFFIX = {
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -10,7 +10,6 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -19,6 +18,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
@@ -36,14 +36,16 @@ def get_config(
assert (
metadata.max_iterations == 1
), 'max_iterations must be 1 for browsing delegation evaluation.'
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=False,
use_host_network=False,
),
workspace_base=None,
workspace_mount_path=None,
)

View File

@@ -15,7 +15,6 @@ from evaluation.utils.shared import (
EvalOutput,
assert_and_raise,
codeact_user_response,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -26,6 +25,7 @@ from openhands.controller.state.state import State
from openhands.core.config import (
AgentConfig,
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
@@ -105,7 +105,9 @@ def get_config(
instance: pd.Series,
metadata: EvalMetadata,
) -> AppConfig:
# COMMIT0_CONTAINER_IMAGE = 'wentingzhao/'
assert USE_INSTANCE_IMAGE
# We use a different instance image for the each instance of commit0 eval
repo_name = instance['repo'].split('/')[1]
base_container_image = get_instance_docker_image(repo_name)
logger.info(
@@ -113,16 +115,27 @@ def get_config(
f'Please make sure this image exists. '
f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
)
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = base_container_image
# else:
# raise
# base_container_image = SWE_BENCH_CONTAINER_IMAGE
# logger.info(f'Using swe-bench container image: {base_container_image}')
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
max_iterations=metadata.max_iterations,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image=base_container_image,
enable_auto_lint=True,
use_host_network=False,
# large enough timeout, since some testcases take very long to run
timeout=300,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_init_timeout=3600,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -16,7 +16,6 @@ from evaluation.utils.shared import (
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -26,6 +25,7 @@ from openhands.controller.state.state import State
from openhands.core.config import (
AgentConfig,
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
@@ -62,14 +62,16 @@ AGENT_CLS_TO_INST_SUFFIX = {
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -13,7 +13,6 @@ from evaluation.utils.shared import (
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -22,10 +21,10 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
from openhands.core.config.utils import get_agent_config_arg
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
@@ -48,25 +47,23 @@ AGENT_CLS_TO_INST_SUFFIX = {
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
if metadata.agent_config:
config.set_agent_config(metadata.agent_config, metadata.agent_class)
else:
logger.info('Agent config not provided, using default settings')
agent_config = config.get_agent_config(metadata.agent_class)
agent_config.enable_prompt_extensions = False
agent_config = config.get_agent_config(metadata.agent_class)
agent_config.enable_prompt_extensions = False
return config
@@ -240,10 +237,6 @@ if __name__ == '__main__':
)
args, _ = parser.parse_known_args()
agent_config = None
if args.agent_config:
agent_config = get_agent_config_arg(args.agent_config)
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
@@ -262,7 +255,6 @@ if __name__ == '__main__':
eval_output_dir=args.eval_output_dir,
data_split=args.data_split,
details={'gaia-level': args.level},
agent_config=agent_config,
)
dataset = load_dataset('gaia-benchmark/GAIA', args.level)

View File

@@ -9,7 +9,6 @@ AGENT=$3
EVAL_LIMIT=$4
LEVELS=$5
NUM_WORKERS=$6
AGENT_CONFIG=$7
if [ -z "$NUM_WORKERS" ]; then
NUM_WORKERS=1
@@ -50,9 +49,5 @@ if [ -n "$EVAL_LIMIT" ]; then
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
if [ -n "$AGENT_CONFIG" ]; then
echo "AGENT_CONFIG: $AGENT_CONFIG"
COMMAND="$COMMAND --agent-config $AGENT_CONFIG"
# Run the command
eval $COMMAND

View File

@@ -11,7 +11,6 @@ from evaluation.utils.shared import (
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -20,6 +19,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
@@ -40,14 +40,16 @@ AGENT_CLS_TO_INST_SUFFIX = {
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -29,7 +29,6 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -38,6 +37,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
@@ -61,14 +61,16 @@ ACTION_FORMAT = """
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -22,7 +22,6 @@ from evaluation.utils.shared import (
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -31,6 +30,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
@@ -82,14 +82,16 @@ AGENT_CLS_TO_INST_SUFFIX = {
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -9,7 +9,6 @@ from evaluation.utils.shared import (
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -18,6 +17,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
@@ -45,18 +45,17 @@ AGENT_CLS_TO_INST_SUFFIX = {
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'xingyaoww/od-eval-logic-reasoning:v1.0'
sandbox_config.runtime_extra_deps = (
'$OH_INTERPRETER_PATH -m pip install scitools-pyke'
)
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='xingyaoww/od-eval-logic-reasoning:v1.0',
enable_auto_lint=True,
use_host_network=False,
runtime_extra_deps='$OH_INTERPRETER_PATH -m pip install scitools-pyke',
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -12,7 +12,6 @@ from evaluation.utils.shared import (
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -22,6 +21,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
@@ -55,14 +55,22 @@ def get_config(
metadata: EvalMetadata,
env_id: str,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'xingyaoww/od-eval-miniwob:v1.0'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='xingyaoww/od-eval-miniwob:v1.0',
enable_auto_lint=True,
use_host_network=False,
browsergym_eval_env=env_id,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
remote_runtime_init_timeout=1800,
keep_runtime_alive=False,
timeout=120,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -14,7 +14,6 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -23,6 +22,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
@@ -103,18 +103,17 @@ def load_incontext_example(task_name: str, with_tool: bool = True):
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'xingyaoww/od-eval-mint:v1.0'
sandbox_config.runtime_extra_deps = (
f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}'
)
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='xingyaoww/od-eval-mint:v1.0',
enable_auto_lint=True,
use_host_network=False,
runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -25,7 +25,6 @@ from evaluation.utils.shared import (
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -34,6 +33,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
load_app_config,
@@ -77,14 +77,16 @@ ID2CONDA = {
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'public.ecr.aws/i5g0m1f6/ml-bench'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='public.ecr.aws/i5g0m1f6/ml-bench',
enable_auto_lint=True,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -11,7 +11,6 @@ from evaluation.utils.shared import (
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -21,6 +20,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
@@ -59,17 +59,21 @@ def get_config(
metadata: EvalMetadata,
instance_id: str,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = (
'docker.io/xingyaoww/openhands-eval-scienceagentbench'
)
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
max_budget_per_task=4,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='docker.io/xingyaoww/openhands-eval-scienceagentbench',
enable_auto_lint=True,
use_host_network=False,
timeout=300,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -1,6 +1,5 @@
import json
import os
import subprocess
import tempfile
import time
from functools import partial
@@ -22,14 +21,13 @@ from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
get_default_sandbox_config_for_eval,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from openhands.core.config import (
AppConfig,
LLMConfig,
SandboxConfig,
get_parser,
)
from openhands.core.logger import openhands_logger as logger
@@ -81,16 +79,22 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> AppConfig:
f'Please make sure this image exists. '
f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
)
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = base_container_image
sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
dataset_name=metadata.dataset,
instance_id=instance['instance_id'],
)
config = AppConfig(
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image=base_container_image,
use_host_network=False,
# large enough timeout, since some testcases take very long to run
timeout=600,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
remote_runtime_init_timeout=3600,
remote_runtime_resource_factor=get_instance_resource_factor(
dataset_name=metadata.dataset,
instance_id=instance['instance_id'],
),
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
@@ -408,21 +412,6 @@ if __name__ == '__main__':
with open(metadata_filepath, 'r') as metadata_file:
data = metadata_file.read()
metadata = EvalMetadata.model_validate_json(data)
else:
# Initialize with a dummy metadata when file doesn't exist
metadata = EvalMetadata(
agent_class='dummy_agent', # Placeholder agent class
llm_config=LLMConfig(model='dummy_model'), # Minimal LLM config
max_iterations=1, # Minimal iterations
eval_output_dir=os.path.dirname(
args.input_file
), # Use input file dir as output dir
start_time=time.strftime('%Y-%m-%d %H:%M:%S'), # Current time
git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD'])
.decode('utf-8')
.strip(), # Current commit
dataset=args.dataset, # Dataset name from args
)
# The evaluation harness constrains the signature of `process_instance_func` but we need to
# pass extra information. Build a new function object to avoid issues with multiprocessing.

View File

@@ -7,7 +7,6 @@ This file tracks the resource requirements of different instances.
import json
import os
from openhands.core.logger import openhands_logger as logger
CUR_DIR = os.path.dirname(os.path.abspath(__file__))

View File

@@ -0,0 +1 @@
{"pydata__xarray-6721": 8, "pytest-dev__pytest-7236": 8, "matplotlib__matplotlib-24627": 4, "django__django-15561": 4, "django__django-15098": 4, "django__django-14771": 4, "sympy__sympy-21612": 4, "sympy__sympy-15345": 4, "psf__requests-5414": 4, "astropy__astropy-14508": 2, "django__django-11451": 2, "django__django-11477": 2, "django__django-10880": 2, "django__django-11163": 2, "django__django-11815": 2, "astropy__astropy-14369": 2, "django__django-10097": 2, "django__django-10554": 2, "django__django-12304": 2, "django__django-12325": 2, "django__django-11551": 2, "django__django-11734": 2, "django__django-13109": 2, "django__django-13089": 2, "django__django-13343": 2, "django__django-13363": 2, "django__django-13809": 2, "django__django-13810": 2, "django__django-13786": 2, "django__django-13807": 2, "django__django-14493": 2, "django__django-11820": 2, "django__django-11951": 2, "django__django-11964": 2, "astropy__astropy-14309": 2, "astropy__astropy-14365": 2, "astropy__astropy-12907": 2, "astropy__astropy-14182": 2, "django__django-15161": 2, "django__django-15128": 2, "django__django-14999": 2, "django__django-14915": 2, "django__django-14752": 2, "django__django-14765": 2, "django__django-14089": 2, "django__django-15252": 2, "django__django-15380": 2, "django__django-15382": 2, "django__django-15499": 2, "django__django-15467": 2, "django__django-15280": 2, "django__django-15315": 2, "django__django-15277": 2, "django__django-15268": 2, "django__django-15629": 2, "django__django-15695": 2, "django__django-15732": 2, "django__django-15863": 2, "django__django-16082": 2, "django__django-16145": 2, "django__django-16256": 2, "django__django-16429": 2, "django__django-16454": 2, "django__django-16493": 2, "matplotlib__matplotlib-13989": 2, "matplotlib__matplotlib-20488": 2, "django__django-15503": 2, "django__django-15525": 2, "django__django-15375": 2, "django__django-15278": 2, "matplotlib__matplotlib-21568": 2, "matplotlib__matplotlib-20859": 2, "matplotlib__matplotlib-20826": 2, "matplotlib__matplotlib-20676": 2, "matplotlib__matplotlib-23412": 2, "matplotlib__matplotlib-22719": 2, "matplotlib__matplotlib-23299": 2, "matplotlib__matplotlib-22865": 2, "matplotlib__matplotlib-24149": 2, "matplotlib__matplotlib-24177": 2, "matplotlib__matplotlib-24570": 2, "matplotlib__matplotlib-24637": 2, "matplotlib__matplotlib-24970": 2, "matplotlib__matplotlib-23476": 2, "matplotlib__matplotlib-24026": 2, "matplotlib__matplotlib-23314": 2, "matplotlib__matplotlib-25332": 2, "matplotlib__matplotlib-25311": 2, "matplotlib__matplotlib-25122": 2, "matplotlib__matplotlib-25479": 2, "matplotlib__matplotlib-26342": 2, "psf__requests-2317": 2, "matplotlib__matplotlib-25960": 2, "matplotlib__matplotlib-25775": 2, "pydata__xarray-4356": 2, "pydata__xarray-4075": 2, "pydata__xarray-6461": 2, "pydata__xarray-4687": 2, "pydata__xarray-6599": 2, "pylint-dev__pylint-4661": 2, "django__django-15554": 2, "django__django-15563": 2, "pytest-dev__pytest-5262": 2, "pytest-dev__pytest-10081": 2, "scikit-learn__scikit-learn-12973": 2, "scikit-learn__scikit-learn-13124": 2, "scikit-learn__scikit-learn-13779": 2, "scikit-learn__scikit-learn-14141": 2, "scikit-learn__scikit-learn-13439": 2, "scikit-learn__scikit-learn-13496": 2, "scikit-learn__scikit-learn-15100": 2, "scikit-learn__scikit-learn-25102": 2, "scikit-learn__scikit-learn-25232": 2, "scikit-learn__scikit-learn-25747": 2, "scikit-learn__scikit-learn-26323": 2, "scikit-learn__scikit-learn-9288": 2, "scikit-learn__scikit-learn-14496": 2, "scikit-learn__scikit-learn-14629": 2, "sphinx-doc__sphinx-8265": 2, "sphinx-doc__sphinx-8548": 2, "sphinx-doc__sphinx-8593": 2, "sphinx-doc__sphinx-8595": 2, "sphinx-doc__sphinx-8621": 2, "sphinx-doc__sphinx-8638": 2, "sphinx-doc__sphinx-9229": 2, "sphinx-doc__sphinx-9281": 2, "sphinx-doc__sphinx-9461": 2, "sphinx-doc__sphinx-9591": 2, "sphinx-doc__sphinx-9658": 2, "sphinx-doc__sphinx-9673": 2, "sympy__sympy-12096": 2, "sympy__sympy-12481": 2, "sphinx-doc__sphinx-10323": 2, "sphinx-doc__sphinx-7590": 2, "sympy__sympy-13877": 2, "sympy__sympy-12489": 2, "sympy__sympy-15809": 2, "sympy__sympy-14711": 2, "sympy__sympy-16597": 2, "sympy__sympy-16766": 2, "sympy__sympy-16792": 2, "sympy__sympy-15875": 2, "sympy__sympy-17655": 2, "sympy__sympy-18189": 2, "sympy__sympy-18763": 2, "sympy__sympy-19040": 2, "sympy__sympy-19495": 2, "sympy__sympy-19637": 2, "sympy__sympy-19783": 2, "sympy__sympy-17630": 2, "sympy__sympy-20428": 2, "sympy__sympy-20590": 2, "sympy__sympy-20801": 2, "sympy__sympy-21379": 2, "sympy__sympy-21847": 2, "sympy__sympy-22456": 2, "sympy__sympy-22714": 2, "sympy__sympy-22914": 2, "sympy__sympy-23262": 2, "sympy__sympy-23413": 2, "sympy__sympy-23534": 2, "sympy__sympy-24066": 2, "sympy__sympy-24213": 2, "sympy__sympy-24443": 2, "sympy__sympy-24562": 2, "sympy__sympy-24661": 2}

View File

@@ -18,7 +18,6 @@ from evaluation.utils.shared import (
EvalOutput,
assert_and_raise,
codeact_user_response,
get_default_sandbox_config_for_eval,
get_metrics,
is_fatal_evaluation_error,
make_metadata,
@@ -31,6 +30,7 @@ from openhands.controller.state.state import State
from openhands.core.config import (
AgentConfig,
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
@@ -67,11 +67,11 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
'<uploaded_files>\n'
f'/workspace/{workspace_dir_name}\n'
'</uploaded_files>\n'
f"I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
f'<issue_description>\n'
f"I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following PR description:\n\n"
f'<pr_description>\n'
f'{instance.problem_statement}\n'
'</issue_description>\n\n'
'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
'</pr_description>\n\n'
'Can you help me implement the necessary changes to the repository so that the requirements specified in the <pr_description> are met?\n'
"I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
'Your task is to make the minimal changes to non-tests files in the /workspace directory to ensure the <pr_description> is satisfied.\n'
'Follow these steps to resolve the issue:\n'
@@ -122,23 +122,28 @@ def get_config(
base_container_image = SWE_BENCH_CONTAINER_IMAGE
logger.info(f'Using swe-bench container image: {base_container_image}')
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = base_container_image
sandbox_config.enable_auto_lint = True
sandbox_config.use_host_network = False
# Add platform to the sandbox config to solve issue 4401
sandbox_config.platform = 'linux/amd64'
sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
dataset_name=metadata.dataset,
instance_id=instance['instance_id'],
)
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
max_iterations=metadata.max_iterations,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image=base_container_image,
enable_auto_lint=True,
use_host_network=False,
# large enough timeout, since some testcases take very long to run
timeout=300,
# Add platform to the sandbox config to solve issue 4401
platform='linux/amd64',
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_init_timeout=3600,
remote_runtime_resource_factor=get_instance_resource_factor(
dataset_name=metadata.dataset,
instance_id=instance['instance_id'],
),
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
@@ -324,22 +329,6 @@ def complete_runtime(
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
if obs.exit_code == -1:
# The previous command is still running
# We need to kill previous command
logger.info('The previous command is still running, trying to kill it...')
action = CmdRunAction(command='C-c')
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
# Then run the command again
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',

View File

@@ -15,25 +15,11 @@ parser.add_argument(
action='store_true',
help='Show visualization paths for failed instances',
)
parser.add_argument(
'--only-x-instances',
action='store_true',
help='Only show instances that are ran by X',
)
args = parser.parse_args()
df1 = pd.read_json(args.input_file_1, orient='records', lines=True)
df2 = pd.read_json(args.input_file_2, orient='records', lines=True)
if args.only_x_instances:
instance_ids_1 = set(df1['instance_id'].tolist())
print(
f'Before removing instances not in X={args.input_file_1}: Y={df2.shape[0]} instances'
)
df2 = df2[df2['instance_id'].isin(instance_ids_1)]
print(
f'After removing instances not in X={args.input_file_1}: Y={df2.shape[0]} instances'
)
# Get the intersection of the instance_ids
df = pd.merge(df1, df2, on='instance_id', how='inner')
@@ -100,7 +86,7 @@ repo_diffs = []
for repo in all_repos:
x_count = len(x_only_by_repo.get(repo, []))
y_count = len(y_only_by_repo.get(repo, []))
diff = y_count - x_count
diff = abs(x_count - y_count)
repo_diffs.append((repo, diff))
# Sort by diff (descending) and then by repo name
@@ -120,13 +106,7 @@ for repo, diff in repo_diffs:
repo_color = 'red' if is_significant else 'yellow'
print(f"\n{colored(repo, repo_color, attrs=['bold'])}:")
print(
colored(
f'Difference: {diff} instances! (Larger diff = Y better)',
repo_color,
attrs=['bold'],
)
)
print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))
print(colored(f'X resolved but Y failed: ({len(x_instances)} instances)', 'green'))
if x_instances:
print(' ' + str(x_instances))

View File

@@ -4,7 +4,6 @@
import argparse
import json
import os
from glob import glob
import pandas as pd
from tqdm import tqdm
@@ -21,7 +20,6 @@ output_md_folder = args.oh_output_file.replace('.jsonl', '.viz')
print(f'Converting {args.oh_output_file} to markdown files in {output_md_folder}')
oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
output_dir = os.path.dirname(args.oh_output_file)
swebench_eval_file = args.oh_output_file.replace('.jsonl', '.swebench_eval.jsonl')
if os.path.exists(swebench_eval_file):
@@ -59,172 +57,22 @@ def convert_history_to_str(history):
return ret
# Load trajectories for resolved instances
def load_completions(instance_id: str):
global output_dir
glob_path = os.path.join(output_dir, 'llm_completions', instance_id, '*.json')
files = sorted(glob(glob_path)) # this is ascending order
# pick the last file (last turn)
try:
file_path = files[-1]
except IndexError:
# print(f'No files found for instance {instance_id}: files={files}')
return None
with open(file_path, 'r') as f:
result = json.load(f)
# create messages
messages = result['messages']
messages.append(result['response']['choices'][0]['message'])
tools = result['kwargs']['tools']
return {
'messages': messages,
'tools': tools,
}
def _convert_content(content) -> str:
ret = ''
if isinstance(content, list):
for item in content:
assert item['type'] == 'text', 'Only text is supported for now'
ret += f'{item["text"]}\n'
else:
assert isinstance(content, str), 'Only str is supported for now'
ret = content
return ret
def convert_tool_call_to_string(tool_call: dict) -> str:
"""Convert tool call to content in string format."""
if 'function' not in tool_call:
raise ValueError("Tool call must contain 'function' key.")
if 'id' not in tool_call:
raise ValueError("Tool call must contain 'id' key.")
if 'type' not in tool_call:
raise ValueError("Tool call must contain 'type' key.")
if tool_call['type'] != 'function':
raise ValueError("Tool call type must be 'function'.")
ret = f"<function={tool_call['function']['name']}>\n"
try:
args = json.loads(tool_call['function']['arguments'])
except json.JSONDecodeError as e:
raise ValueError(
f"Failed to parse arguments as JSON. Arguments: {tool_call['function']['arguments']}"
) from e
for param_name, param_value in args.items():
is_multiline = isinstance(param_value, str) and '\n' in param_value
ret += f'<parameter={param_name}>'
if is_multiline:
ret += '\n'
ret += f'{param_value}'
if is_multiline:
ret += '\n'
ret += '</parameter>\n'
ret += '</function>'
return ret
def format_traj(traj, first_n_turns=None, last_n_turns=None) -> str:
output = ''
system_message = None
# Handle system message if present
if traj[0]['role'] == 'system':
system_message = traj[0]
traj = traj[1:]
content = _convert_content(system_message['content'])
output += "*** System Message that describes the assistant's behavior ***\n"
output += f'{content}\n'
# Merge consecutive user messages first
merged_traj = []
current_messages = []
n_turns = len(traj)
for i, message in enumerate(traj):
# Skip this message if...
if (
# Case 1: first_n_turns specified and we're past it
(first_n_turns is not None and i >= first_n_turns and last_n_turns is None)
or
# Case 2: last_n_turns specified and we're before it
(
last_n_turns is not None
and i < n_turns - last_n_turns
and first_n_turns is None
)
or
# Case 3: both specified and we're in the middle section
(
first_n_turns is not None
and last_n_turns is not None
and i >= first_n_turns
and i < n_turns - last_n_turns
)
):
continue
if message['role'] == 'user':
current_messages.append(message)
else:
if current_messages:
# Merge all accumulated user messages into one
merged_content = '\n'.join(
_convert_content(msg['content']) for msg in current_messages
)
merged_traj.append({'role': 'user', 'content': merged_content})
current_messages = []
merged_traj.append(message)
# Don't forget to handle any remaining user messages
if current_messages:
merged_content = '\n'.join(
_convert_content(msg['content']) for msg in current_messages
)
merged_traj.append({'role': 'user', 'content': merged_content})
# Now process the merged trajectory
for i, message in enumerate(merged_traj):
role, content = message['role'], message['content']
content = _convert_content(content) if isinstance(content, list) else content
turn_id = i // 2 + 1
output += '-' * 100 + '\n'
output += f'*** Turn {turn_id} - {role.upper() if role != "tool" else "TOOL EXECUTION RESULT"} ***\n'
if role == 'user':
output += f'{content}\n'
elif role == 'tool':
output += f'{content}\n'
elif role == 'assistant':
output += f'{content}\n'
if (
'tool_calls' in message
and message['tool_calls'] is not None
and len(message['tool_calls']) > 0
):
for toolcall_id, tool_call in enumerate(message['tool_calls']):
output += f'### Tool Call {toolcall_id}\n'
output += f'{convert_tool_call_to_string(tool_call)}\n'
else:
raise ValueError(f'Unexpected role: {role}')
output += '-' * 100 + '\n'
return output
def write_row_to_md_file(row, instance_id_to_test_result):
if 'git_patch' in row:
model_patch = row['git_patch']
elif 'test_result' in row and 'git_patch' in row['test_result']:
model_patch = row['test_result']['git_patch']
else:
print(f'Row {row} does not have a git_patch')
return
raise ValueError(f'Row {row} does not have a git_patch')
test_output = None
# Use result from output.jsonl FIRST if available.
if 'report' in row and row['report'] is not None:
if row['instance_id'] in instance_id_to_test_result:
report = instance_id_to_test_result[row['instance_id']].get('report', {})
resolved = report.get('resolved', False)
test_output = instance_id_to_test_result[row['instance_id']].get(
'test_output', None
)
elif 'report' in row and row['report'] is not None:
if not isinstance(row['report'], dict):
resolved = None
print(
@@ -232,12 +80,6 @@ def write_row_to_md_file(row, instance_id_to_test_result):
)
else:
resolved = row['report'].get('resolved', False)
elif row['instance_id'] in instance_id_to_test_result:
report = instance_id_to_test_result[row['instance_id']].get('report', {})
resolved = report.get('resolved', False)
test_output = instance_id_to_test_result[row['instance_id']].get(
'test_output', None
)
else:
resolved = None
@@ -246,8 +88,6 @@ def write_row_to_md_file(row, instance_id_to_test_result):
os.makedirs(output_md_folder, exist_ok=True)
filepath = os.path.join(output_md_folder, filename)
completions = load_completions(instance_id)
with open(filepath, 'w') as f:
f.write(f'# {instance_id} (resolved: {resolved})\n')
@@ -257,12 +97,7 @@ def write_row_to_md_file(row, instance_id_to_test_result):
f.write(json.dumps(row['metadata'], indent=2))
f.write('\n```\n')
# Completion
if completions is not None:
f.write('## Completion\n')
traj = completions['messages']
f.write(format_traj(traj))
# Trajectory
f.write('## History\n')
f.write(convert_history_to_str(row['history']))

View File

@@ -207,13 +207,12 @@ with open(args.input_file, 'r') as infile:
for line in tqdm(infile, desc='Checking for changes'):
data = json.loads(line)
instance_id = data['instance_id']
current_report = data.get('report', {})
new_report = instance_id_to_status[
instance_id
] # if no report, it's not resolved
if current_report != new_report:
needs_update = True
break
if instance_id in instance_id_to_status:
current_report = data.get('report', {})
new_report = instance_id_to_status[instance_id]
if current_report != new_report:
needs_update = True
break
if not needs_update:
print('No updates detected. Skipping file update.')
@@ -235,5 +234,6 @@ with open(args.input_file + '.bak', 'r') as infile, open(
for line in tqdm(infile, desc='Updating output file'):
data = json.loads(line)
instance_id = data['instance_id']
data['report'] = instance_id_to_status[instance_id]
if instance_id in instance_id_to_status:
data['report'] = instance_id_to_status[instance_id]
outfile.write(json.dumps(data) + '\n')

View File

@@ -76,7 +76,7 @@ echo "Running SWE-bench evaluation"
echo "=============================================================="
RUN_ID=$(date +"%Y%m%d_%H%M%S")
N_PROCESS=4
N_PROCESS=16
if [ -z "$INSTANCE_ID" ]; then
echo "Running SWE-bench evaluation on the whole input file..."
@@ -87,7 +87,7 @@ if [ -z "$INSTANCE_ID" ]; then
--dataset_name "$DATASET_NAME" \
--split "$SPLIT" \
--predictions_path $SWEBENCH_FORMAT_JSONL \
--timeout 3600 \
--timeout 1800 \
--cache_level instance \
--max_workers $N_PROCESS \
--run_id $RUN_ID
@@ -133,7 +133,7 @@ else
--dataset_name "$DATASET_NAME" \
--split "$SPLIT" \
--predictions_path $SWEBENCH_FORMAT_JSONL \
--timeout 3600 \
--timeout 1800 \
--instance_ids $INSTANCE_ID \
--cache_level instance \
--max_workers $N_PROCESS \

View File

@@ -17,14 +17,11 @@ When the `run_infer.sh` script is started, it will automatically pull all task i
```bash
./evaluation/benchmarks/the_agent_company/scripts/run_infer.sh \
--agent-llm-config <agent-llm-config, default to 'agent'> \
--env-llm-config <env-llm-config, default to 'env'> \
--outputs-path <outputs-path, default to outputs> \
--server-hostname <server-hostname, default to localhost> \
--version <version, default to 1.0.0> \
--start-percentile <integer from 0 to 99, default to 0> \
--end-percentile <integer from 1 to 100, default to 100>
--agent-llm-config <agent-llm-config> \
--env-llm-config <env-llm-config> \
--outputs-path <outputs-path> \
--server-hostname <server-hostname> \
--version <version>
# Example
./evaluation/benchmarks/the_agent_company/scripts/run_infer.sh \
@@ -32,9 +29,7 @@ When the `run_infer.sh` script is started, it will automatically pull all task i
--env-llm-config claude-3-5-sonnet-20240620 \
--outputs-path outputs \
--server-hostname localhost \
--version 1.0.0 \
--start-percentile 10 \
--end-percentile 20
--version 1.0.0
```
- `agent-llm-config`: the config name for the agent LLM. This should match the config name in config.toml. This is the LLM used by the agent (e.g. CodeActAgent).
@@ -42,11 +37,7 @@ When the `run_infer.sh` script is started, it will automatically pull all task i
- `outputs-path`: the path to save trajectories and evaluation results.
- `server-hostname`: the hostname of the server that hosts all the web services. It could be localhost if you are running the evaluation and services on the same machine. If the services are hosted on a remote machine, you must use the hostname of the remote machine rather than IP address.
- `version`: the version of the task images to use. Currently, the only supported version is 1.0.0.
- `start-percentile`: the start percentile of the task split, must be an integer between 0 to 99.
- `end-percentile`: the end percentile of the task split, must be an integer between 1 to 100 and larger than start-percentile.
The script is idempotent. If you run it again, it will resume from the last checkpoint. It would usually take 2 days to finish evaluation if you run the whole task set.
To speed up evaluation, you can use `start-percentile` and `end-percentile` to split the tasks for higher parallelism,
provided concurrent runs are **targeting different servers**.
The script is idempotent. If you run it again, it will resume from the last checkpoint. It would usually take a few days to finish evaluation.
Note: the script will automatically skip a task if it encounters an error. This usually happens when the OpenHands runtime dies due to some unexpected errors. This means even if the script finishes, it might not have evaluated all tasks. You can manually resume the evaluation by running the script again.

View File

@@ -267,9 +267,7 @@ def pre_login(
obs: BrowserOutputObservation = runtime.run_action(browser_action)
logger.debug(obs, extra={'msg_type': 'OBSERVATION'})
if save_screenshots:
image_data = base64.b64decode(
obs.screenshot.replace('data:image/png;base64,', '')
)
image_data = base64.b64decode(obs.screenshot)
with open(os.path.join(directory, f'{image_id}.png'), 'wb') as file:
file.write(image_data)
image_id += 1

View File

@@ -13,16 +13,14 @@ from typing import List
import yaml
from browsing import pre_login
from evaluation.utils.shared import get_default_sandbox_config_for_eval
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
LLMConfig,
get_agent_config_arg,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
from openhands.core.config.agent_config import AgentConfig
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import CmdRunAction, MessageAction
@@ -36,10 +34,7 @@ def get_config(
task_short_name: str,
mount_path_on_host: str,
llm_config: LLMConfig,
agent_config: AgentConfig | None,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = base_container_image
config = AppConfig(
run_as_openhands=False,
max_budget_per_task=4,
@@ -47,21 +42,21 @@ def get_config(
save_trajectory_path=os.path.join(
mount_path_on_host, f'traj_{task_short_name}.json'
),
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image=base_container_image,
enable_auto_lint=True,
# using host network to access the host machine from the container
use_host_network=True,
# large enough timeout, since some testcases take very long to run
timeout=300,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
),
# we mount trajectories path so that trajectories, generated by OpenHands
# controller, can be accessible to the evaluator file in the runtime container
workspace_mount_path=mount_path_on_host,
workspace_mount_path_in_sandbox='/outputs',
)
config.set_llm_config(llm_config)
if agent_config:
config.set_agent_config(agent_config)
else:
logger.info('Agent config not provided, using default settings')
agent_config = AgentConfig(
enable_prompt_extensions=False,
)
config.set_agent_config(agent_config)
return config
@@ -152,21 +147,11 @@ def run_solver(
os.makedirs(screenshots_dir, exist_ok=True)
for image_id, obs in enumerate(state.history):
if isinstance(obs, BrowserOutputObservation):
image_data = base64.b64decode(
obs.screenshot.replace('data:image/png;base64,', '')
)
image_data = base64.b64decode(obs.screenshot)
with open(
os.path.join(screenshots_dir, f'{image_id}.png'), 'wb'
) as file:
file.write(image_data)
if obs.set_of_marks:
som_image_data = base64.b64decode(
obs.set_of_marks.replace('data:image/png;base64,', '')
)
with open(
os.path.join(screenshots_dir, f'{image_id}_som.png'), 'wb'
) as file:
file.write(som_image_data)
if save_final_state:
os.makedirs(state_dir, exist_ok=True)
@@ -229,10 +214,6 @@ if __name__ == '__main__':
)
args, _ = parser.parse_known_args()
agent_config: AgentConfig | None = None
if args.agent_config:
agent_config = get_agent_config_arg(args.agent_config)
agent_llm_config: LLMConfig | None = None
if args.agent_llm_config:
agent_llm_config = get_llm_config_arg(args.agent_llm_config)
@@ -273,7 +254,7 @@ if __name__ == '__main__':
else:
temp_dir = tempfile.mkdtemp()
config: AppConfig = get_config(
args.task_image_name, task_short_name, temp_dir, agent_llm_config, agent_config
args.task_image_name, task_short_name, temp_dir, agent_llm_config
)
runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

View File

@@ -44,10 +44,6 @@ while [[ $# -gt 0 ]]; do
ENV_LLM_CONFIG="$2"
shift 2
;;
--agent-config)
AGENT_CONFIG="$2"
shift 2
;;
--outputs-path)
OUTPUTS_PATH="$2"
shift 2
@@ -60,14 +56,6 @@ while [[ $# -gt 0 ]]; do
VERSION="$2"
shift 2
;;
--start-percentile)
START_PERCENTILE="$2"
shift 2
;;
--end-percentile)
END_PERCENTILE="$2"
shift 2
;;
*)
echo "Unknown argument: $1"
exit 1
@@ -81,54 +69,19 @@ if [[ ! "$OUTPUTS_PATH" = /* ]]; then
OUTPUTS_PATH="$(cd "$(dirname "$OUTPUTS_PATH")" 2>/dev/null && pwd)/$(basename "$OUTPUTS_PATH")"
fi
: "${START_PERCENTILE:=0}" # Default to 0 percentile (first line)
: "${END_PERCENTILE:=100}" # Default to 100 percentile (last line)
# Validate percentile ranges if provided
if ! [[ "$START_PERCENTILE" =~ ^[0-9]+$ ]] || ! [[ "$END_PERCENTILE" =~ ^[0-9]+$ ]]; then
echo "Error: Percentiles must be integers"
exit 1
fi
if [ "$START_PERCENTILE" -ge "$END_PERCENTILE" ]; then
echo "Error: Start percentile must be less than end percentile"
exit 1
fi
if [ "$START_PERCENTILE" -lt 0 ] || [ "$END_PERCENTILE" -gt 100 ]; then
echo "Error: Percentiles must be between 0 and 100"
exit 1
fi
echo "Using agent LLM config: $AGENT_LLM_CONFIG"
echo "Using environment LLM config: $ENV_LLM_CONFIG"
echo "Outputs path: $OUTPUTS_PATH"
echo "Server hostname: $SERVER_HOSTNAME"
echo "Version: $VERSION"
echo "Start Percentile: $START_PERCENTILE"
echo "End Percentile: $END_PERCENTILE"
echo "Downloading tasks.md..."
rm -f tasks.md
wget https://github.com/TheAgentCompany/TheAgentCompany/releases/download/${VERSION}/tasks.md
total_lines=$(cat tasks.md | grep "ghcr.io/theagentcompany" | wc -l)
if [ "$total_lines" -ne 175 ]; then
echo "Error: Expected 175 tasks in tasks.md but found $total_lines lines"
exit 1
fi
# Calculate line numbers based on percentiles
start_line=$(echo "scale=0; ($total_lines * $START_PERCENTILE / 100) + 1" | bc)
end_line=$(echo "scale=0; $total_lines * $END_PERCENTILE / 100" | bc)
echo "Using tasks No. $start_line to $end_line (inclusive) out of 1-175 tasks"
# Create a temporary file with just the desired range
temp_file="tasks_${START_PERCENTILE}_${END_PERCENTILE}.md"
sed -n "${start_line},${end_line}p" tasks.md > "$temp_file"
while IFS= read -r task_image; do
docker pull $task_image
# Remove prefix using ## to remove longest matching pattern from start
task_name=${task_image##ghcr.io/theagentcompany/}
@@ -142,31 +95,21 @@ while IFS= read -r task_image; do
continue
fi
docker pull $task_image
# Build the Python command
COMMAND="poetry run python run_infer.py \
--agent-llm-config \"$AGENT_LLM_CONFIG\" \
--env-llm-config \"$ENV_LLM_CONFIG\" \
--outputs-path \"$OUTPUTS_PATH\" \
--server-hostname \"$SERVER_HOSTNAME\" \
--task-image-name \"$task_image\""
# Add agent-config if it's defined
if [ -n "$AGENT_CONFIG" ]; then
COMMAND="$COMMAND --agent-config $AGENT_CONFIG"
fi
export PYTHONPATH=evaluation/benchmarks/the_agent_company:$PYTHONPATH && \
eval "$COMMAND"
export PYTHONPATH=evaluation/benchmarks/the_agent_company:\$PYTHONPATH && \
poetry run python run_infer.py \
--agent-llm-config "$AGENT_LLM_CONFIG" \
--env-llm-config "$ENV_LLM_CONFIG" \
--outputs-path "$OUTPUTS_PATH" \
--server-hostname "$SERVER_HOSTNAME" \
--task-image-name "$task_image"
# Prune unused images and volumes
docker image rm "$task_image"
docker images "ghcr.io/all-hands-ai/runtime" -q | xargs -r docker rmi -f
docker volume prune -f
docker system prune -f
done < "$temp_file"
done < tasks.md
rm tasks.md "$temp_file"
rm tasks.md
echo "All evaluation completed successfully!"

View File

@@ -10,7 +10,6 @@ from evaluation.utils.shared import (
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -19,6 +18,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
get_parser,
)
@@ -41,14 +41,16 @@ AGENT_CLS_TO_INST_SUFFIX = {
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -0,0 +1,674 @@
from collections import Counter
from copy import deepcopy
from difflib import SequenceMatcher
from io import BytesIO
from bs4 import BeautifulSoup, Comment, NavigableString, Tag
import cv2
import numpy as np
import torch
from colormath.color_conversions import convert_color
from colormath.color_diff import delta_e_cie2000
from colormath.color_objects import LabColor, sRGBColor
from PIL import Image, ImageChops, ImageColor
from scipy.optimize import linear_sum_assignment
from transformers import CLIPModel, CLIPProcessor
from openhands.core.logger import openhands_logger as logger
def calculate_similarity(block1, block2):
"""Calculate text similarity between two blocks using SequenceMatcher."""
text_similarity = SequenceMatcher(None, block1['text'], block2['text']).ratio()
return text_similarity
def adjust_cost_for_context(cost_matrix, consecutive_bonus=1.0, window_size=20):
"""Adjust cost matrix by considering context similarity."""
if window_size <= 0:
return cost_matrix
n, m = cost_matrix.shape
adjusted_cost_matrix = np.copy(cost_matrix)
for i in range(n):
for j in range(m):
if adjusted_cost_matrix[i][j] >= -0.5:
continue
nearby_matrix = cost_matrix[
max(0, i - window_size) : min(n, i + window_size + 1),
max(0, j - window_size) : min(m, j + window_size + 1),
]
flattened_array = nearby_matrix.flatten()
sorted_array = np.sort(flattened_array)[::-1]
sorted_array = np.delete(
sorted_array, np.where(sorted_array == cost_matrix[i, j])[0][0]
)
top_k_elements = sorted_array[-window_size * 2 :]
bonus = consecutive_bonus * np.sum(top_k_elements)
adjusted_cost_matrix[i][j] += bonus
return adjusted_cost_matrix
def create_cost_matrix(A, B):
"""Create cost matrix for block matching."""
n = len(A)
m = len(B)
cost_matrix = np.zeros((n, m))
for i in range(n):
for j in range(m):
cost_matrix[i, j] = -calculate_similarity(A[i], B[j])
return cost_matrix
def calculate_distance_max_1d(x1, y1, x2, y2):
"""Calculate maximum 1D distance between points."""
return max(abs(x2 - x1), abs(y2 - y1))
def calculate_ratio(h1, h2):
"""Calculate ratio between two heights."""
return max(h1, h2) / min(h1, h2)
def rgb_to_lab(rgb):
"""Convert RGB color to Lab color space."""
rgb_color = sRGBColor(rgb[0], rgb[1], rgb[2], is_upscaled=True)
lab_color = convert_color(rgb_color, LabColor)
return lab_color
def color_similarity_ciede2000(rgb1, rgb2):
"""Calculate color similarity using CIEDE2000 formula."""
lab1 = rgb_to_lab(rgb1)
lab2 = rgb_to_lab(rgb2)
delta_e = delta_e_cie2000(lab1, lab2)
similarity = max(0, 1 - (delta_e / 100))
return similarity
def merge_blocks_wo_check(block1, block2):
"""Merge two blocks without additional checks."""
merged_text = block1['text'] + ' ' + block2['text']
x_min = min(block1['bbox'][0], block2['bbox'][0])
y_min = min(block1['bbox'][1], block2['bbox'][1])
x_max = max(
block1['bbox'][0] + block1['bbox'][2], block2['bbox'][0] + block2['bbox'][2]
)
y_max = max(
block1['bbox'][1] + block1['bbox'][3], block2['bbox'][1] + block2['bbox'][3]
)
merged_bbox = (x_min, y_min, x_max - x_min, y_max - y_min)
merged_color = tuple(
(color1 + color2) // 2
for color1, color2 in zip(block1['color'], block2['color'])
)
return {'text': merged_text, 'bbox': merged_bbox, 'color': merged_color}
def find_maximum_matching(A, B, consecutive_bonus, window_size):
"""Find maximum matching between two sets of blocks."""
cost_matrix = create_cost_matrix(A, B)
cost_matrix = adjust_cost_for_context(cost_matrix, consecutive_bonus, window_size)
row_ind, col_ind = linear_sum_assignment(cost_matrix)
current_cost = cost_matrix[row_ind, col_ind].tolist()
return list(zip(row_ind, col_ind)), current_cost, cost_matrix
def remove_indices(lst, indices):
"""Remove indices from list in reverse order."""
for index in sorted(indices, reverse=True):
if index < len(lst):
lst.pop(index)
return lst
def merge_blocks_by_list(blocks, merge_list):
"""Merge blocks according to merge list."""
pop_list = []
while merge_list:
i = merge_list[0][0]
j = merge_list[0][1]
blocks[i] = merge_blocks_wo_check(blocks[i], blocks[j])
pop_list.append(j)
merge_list.pop(0)
if merge_list:
new_merge_list = []
for k in range(len(merge_list)):
if (
merge_list[k][0] != i
and merge_list[k][1] != i
and merge_list[k][0] != j
and merge_list[k][1] != j
):
new_merge_list.append(merge_list[k])
merge_list = new_merge_list
remove_indices(blocks, pop_list)
return blocks
def difference_of_means(list1, list2):
"""Calculate difference of means between two lists."""
counter1 = Counter(list1)
counter2 = Counter(list2)
for element in set(list1) & set(list2):
common_count = min(counter1[element], counter2[element])
counter1[element] -= common_count
counter2[element] -= common_count
unique_list1 = [item for item in counter1.elements()]
unique_list2 = [item for item in counter2.elements()]
mean_list1 = sum(unique_list1) / len(unique_list1) if unique_list1 else 0
mean_list2 = sum(unique_list2) / len(unique_list2) if unique_list2 else 0
if mean_list1 - mean_list2 > 0:
if min(unique_list1) > min(unique_list2):
return mean_list1 - mean_list2
return 0.0
return mean_list1 - mean_list2
def find_possible_merge(A, B, consecutive_bonus, window_size, debug=False):
"""Find possible merges between blocks."""
merge_bonus = 0.0
merge_windows = 1
def sortFn(value):
return value[2]
while True:
A_changed = False
B_changed = False
matching, current_cost, cost_matrix = find_maximum_matching(
A, B, merge_bonus, merge_windows
)
if len(A) >= 2:
merge_list = []
for i in range(len(A) - 1):
new_A = deepcopy(A)
new_A[i] = merge_blocks_wo_check(new_A[i], new_A[i + 1])
new_A.pop(i + 1)
updated_matching, updated_cost, _ = find_maximum_matching(
new_A, B, merge_bonus, merge_windows
)
diff = difference_of_means(current_cost, updated_cost)
if diff > 0.05:
merge_list.append([i, i + 1, diff])
merge_list.sort(key=sortFn, reverse=True)
if merge_list:
A_changed = True
A = merge_blocks_by_list(A, merge_list)
matching, current_cost, cost_matrix = find_maximum_matching(
A, B, merge_bonus, merge_windows
)
if len(B) >= 2:
merge_list = []
for i in range(len(B) - 1):
new_B = deepcopy(B)
new_B[i] = merge_blocks_wo_check(new_B[i], new_B[i + 1])
new_B.pop(i + 1)
updated_matching, updated_cost, _ = find_maximum_matching(
A, new_B, merge_bonus, merge_windows
)
diff = difference_of_means(current_cost, updated_cost)
if diff > 0.05:
merge_list.append([i, i + 1, diff])
merge_list.sort(key=sortFn, reverse=True)
if merge_list:
B_changed = True
B = merge_blocks_by_list(B, merge_list)
matching, current_cost, cost_matrix = find_maximum_matching(
A, B, merge_bonus, merge_windows
)
if not A_changed and not B_changed:
break
matching, _, _ = find_maximum_matching(A, B, consecutive_bonus, window_size)
return A, B, matching
def merge_blocks_by_bbox(blocks):
"""Merge blocks with same bounding box."""
merged_blocks = {}
for block in blocks:
bbox = tuple(block['bbox'])
if bbox in merged_blocks:
existing_block = merged_blocks[bbox]
existing_block['text'] += ' ' + block['text']
existing_block['color'] = [
(ec + c) / 2 for ec, c in zip(existing_block['color'], block['color'])
]
else:
merged_blocks[bbox] = block
return list(merged_blocks.values())
def mask_bounding_boxes_with_inpainting(image, bounding_boxes):
"""Mask bounding boxes in image using inpainting."""
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
mask = np.zeros(image_cv.shape[:2], dtype=np.uint8)
height, width = image_cv.shape[:2]
for bbox in bounding_boxes:
x_ratio, y_ratio, w_ratio, h_ratio = bbox
x = int(x_ratio * width)
y = int(y_ratio * height)
w = int(w_ratio * width)
h = int(h_ratio * height)
mask[y : y + h, x : x + w] = 255
inpainted_image = cv2.inpaint(image_cv, mask, 3, cv2.INPAINT_TELEA)
return Image.fromarray(cv2.cvtColor(inpainted_image, cv2.COLOR_BGR2RGB))
def rescale_and_mask(image, blocks):
"""Rescale image and mask blocks."""
if blocks:
image = mask_bounding_boxes_with_inpainting(image, blocks)
width, height = image.size
if width < height:
new_size = (width, width)
else:
new_size = (height, height)
return image.resize(new_size, Image.LANCZOS)
def calculate_clip_similarity(image1, image2, blocks1, blocks2):
"""Calculate CLIP similarity between two images."""
model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
# Mask and preprocess images
image1_masked = rescale_and_mask(image1, [block['bbox'] for block in blocks1])
image2_masked = rescale_and_mask(image2, [block['bbox'] for block in blocks2])
inputs = processor(
images=[image1_masked, image2_masked], return_tensors='pt', padding=True
)
inputs = {k: v.to(device) for k, v in inputs.items()}
# Calculate features and similarity
with torch.no_grad():
image_features = model.get_image_features(**inputs)
image_features1 = image_features[0].unsqueeze(0)
image_features2 = image_features[1].unsqueeze(0)
image_features1 /= image_features1.norm(dim=-1, keepdim=True)
image_features2 /= image_features2.norm(dim=-1, keepdim=True)
similarity = (image_features1 @ image_features2.T).item()
return similarity
def rgb_to_hex(rgb):
"""Convert an RGB tuple to hexadecimal format."""
return '{:02X}{:02X}{:02X}'.format(*rgb)
class ColorPool:
def __init__(self, offset=0):
color_values = list(range(10, 251, 16))
color_list = [((r + offset) % 256, (g + offset) % 256, (b + offset) % 256)
for r in color_values for g in color_values for b in color_values]
self.color_pool = [rgb_to_hex(color) for color in color_list]
def pop_color(self):
if self.color_pool:
return self.color_pool.pop()
else:
raise NotImplementedError
def process_html_str(html_str, offset=0):
"""Process HTML string to assign unique colors to text elements."""
soup = BeautifulSoup(html_str, 'html.parser')
def update_style(element, property_name, value):
important_value = f"{value} !important"
styles = element.attrs.get('style', '').split(';')
updated_styles = [s for s in styles if not s.strip().startswith(property_name) and len(s.strip()) > 0]
updated_styles.append(f"{property_name}: {important_value}")
element['style'] = '; '.join(updated_styles).strip()
# Set background color of all elements to transparent white
for element in soup.find_all(True):
update_style(element, 'background-color', 'rgba(255, 255, 255, 0.0)')
color_pool = ColorPool(offset)
text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span', 'a', 'b', 'li',
'table', 'td', 'th', 'button', 'footer', 'header', 'figcaption']
for tag in soup.find_all(text_tags):
color = f"#{color_pool.pop_color()}"
update_style(tag, 'color', color)
update_style(tag, 'opacity', '1.0')
return str(soup)
def similar(n1, n2):
"""Check if two numbers are similar within a threshold."""
return abs(n1 - n2) <= 8
def find_different_pixels(image1, image2):
"""Find pixels that differ between two images."""
if image1.size != image2.size:
logger.warning("Images are not the same size")
return None
image1 = image1.convert('RGB')
image2 = image2.convert('RGB')
pixels1 = image1.load()
pixels2 = image2.load()
different_pixels = []
for x in range(image1.size[0]):
for y in range(image1.size[1]):
r1, g1, b1 = pixels1[x, y]
r2, g2, b2 = pixels2[x, y]
if similar((r1 + 50) % 256, r2) and similar((g1 + 50) % 256, g2) and similar((b1 + 50) % 256, b2):
different_pixels.append((y, x))
return np.stack(different_pixels) if different_pixels else None
def extract_text_with_color(html_str):
"""Extract text and color information from HTML string."""
def get_color(tag):
if 'style' in tag.attrs:
styles = tag['style'].split(';')
color_style = [s for s in styles if 'color' in s and 'background-color' not in s]
if color_style:
color = color_style[-1].split(':')[1].strip().replace(" !important", "")
if color[0] == "#":
return color
else:
try:
if color.startswith('rgb'):
color = tuple(map(int, color[4:-1].split(',')))
else:
color = ImageColor.getrgb(color)
return '#{:02x}{:02x}{:02x}'.format(*color)
except ValueError:
logger.warning(f"Unable to identify or convert color: {color}")
return None
return None
def extract_text_recursive(element, parent_color='#000000'):
if isinstance(element, Comment):
return None
elif isinstance(element, NavigableString):
text = element.strip()
return (text, parent_color) if text else None
elif isinstance(element, Tag):
current_color = get_color(element) or parent_color
children_texts = filter(None, [extract_text_recursive(child, current_color)
for child in element.children])
return list(children_texts)
soup = BeautifulSoup(html_str, 'html.parser')
body = soup.body
return extract_text_recursive(body) if body else []
def flatten_tree(tree):
"""Flatten a nested tree structure into a list."""
flat_list = []
def flatten(node):
if isinstance(node, list):
for item in node:
flatten(item)
else:
flat_list.append(node)
flatten(tree)
return flat_list
def get_blocks_from_image_diff_pixels(image, html_text_color_tree, different_pixels):
"""Extract text blocks from image using color differences."""
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
x_w = image_cv.shape[0]
y_w = image_cv.shape[1]
def hex_to_bgr(hex_color):
hex_color = hex_color.lstrip('#')
rgb = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
return rgb[::-1]
def get_intersect(arr1, arr2):
arr1_reshaped = arr1.view([('', arr1.dtype)] * arr1.shape[1])
arr2_reshaped = arr2.view([('', arr2.dtype)] * arr2.shape[1])
common_rows = np.intersect1d(arr1_reshaped, arr2_reshaped)
return common_rows.view(arr1.dtype).reshape(-1, arr1.shape[1])
blocks = []
for item in html_text_color_tree:
try:
color = np.array(hex_to_bgr(item[1]), dtype="uint8")
except:
continue
lower = color - 4
upper = color + 4
mask = cv2.inRange(image_cv, lower, upper)
coords = np.column_stack(np.where(mask > 0))
coords = get_intersect(coords, different_pixels)
if coords.size == 0:
continue
x_min, y_min = np.min(coords, axis=0)
x_max, y_max = np.max(coords, axis=0)
# Get average color from original image
color_coords = coords.copy()
color_coords = color_coords[color_coords[:, 0] <= x_max]
color_coords = color_coords[color_coords[:, 1] <= y_max]
colors = [image_cv[x, y] for x, y in color_coords]
avg_color = tuple(map(int, np.mean(colors, axis=0)))[::-1] # Convert BGR to RGB
blocks.append({
'text': item[0].lower(),
'bbox': (y_min / y_w, x_min / x_w, (y_max - y_min + 1) / y_w, (x_max - x_min + 1) / x_w),
'color': avg_color
})
return blocks
def get_blocks_from_html(html_str, image1):
"""Extract text blocks from HTML and image."""
# Process HTML with two different color offsets
html_str_1 = process_html_str(html_str, offset=0)
html_str_2 = process_html_str(html_str, offset=50)
# Render both HTML versions to images
# TODO: Screenshot html_str_2
filter_color = (255, 0, 0)
image2 = Image.new("RGB", image1.size, filter_color)
# Find pixels that differ between the two rendered images
different_pixels = find_different_pixels(image1, image2)
if different_pixels is None:
logger.warning("Unable to get pixels with different colors")
return []
# Extract text and color information from HTML
html_text_color_tree = flatten_tree(extract_text_with_color(html_str_1))
try:
blocks = get_blocks_from_image_diff_pixels(image1, html_text_color_tree, different_pixels)
except Exception as e:
logger.warning(f"Unable to get blocks: {e}")
return []
return blocks
def evaluate(task, generated_img):
"""Evaluate generated image against reference image using multiple metrics."""
# Load reference image
post_image = task['post_image']
# Extract blocks from HTML and images
post_blocks = get_blocks_from_html(task['post_html'], post_image)
gen_blocks = get_blocks_from_html(task['gen_html'], generated_img)
print("block details", post_blocks, gen_blocks)
if not post_blocks or not gen_blocks:
# Fallback to basic CLIP and pixel comparison if no blocks available
clip_score = calculate_clip_similarity(post_image, generated_img, [], [])
logger.info(f'CLIP similarity score: {clip_score}')
# Pixel comparison
diff = ImageChops.difference(generated_img, post_image)
pixel_match = not diff.getbbox()
logger.info(
f"Pixel difference analysis: {'No difference' if pixel_match else 'Differences found'}"
)
return clip_score > 0.95 or pixel_match
# Merge blocks with same bounding boxes
post_blocks = merge_blocks_by_bbox(post_blocks)
gen_blocks = merge_blocks_by_bbox(gen_blocks)
# Find optimal block matching
consecutive_bonus, window_size = 0.1, 1
gen_blocks_m, post_blocks_m, matching = find_possible_merge(
gen_blocks, deepcopy(post_blocks), consecutive_bonus, window_size
)
# Filter matches with low similarity
filtered_matching = []
for i, j in matching:
text_similarity = calculate_similarity(gen_blocks_m[i], post_blocks_m[j])
if text_similarity >= 0.5:
filtered_matching.append([i, j, text_similarity])
matching = filtered_matching
if not matching:
logger.warning('No matching blocks found')
clip_score = calculate_clip_similarity(
post_image, generated_img, gen_blocks, post_blocks
)
return clip_score > 0.95
# Calculate metrics for matched blocks
indices1 = [item[0] for item in matching]
indices2 = [item[1] for item in matching]
# Calculate unmatched areas
unmatched_area_1 = sum(
block['bbox'][2] * block['bbox'][3]
for i, block in enumerate(gen_blocks_m)
if i not in indices1
)
unmatched_area_2 = sum(
block['bbox'][2] * block['bbox'][3]
for j, block in enumerate(post_blocks_m)
if j not in indices2
)
total_unmatched_area = unmatched_area_1 + unmatched_area_2
# Calculate metrics for matched blocks
matched_areas = []
text_scores = []
position_scores = []
color_scores = []
for i, j, text_similarity in matching:
# Area
block_area = (
gen_blocks_m[i]['bbox'][2] * gen_blocks_m[i]['bbox'][3]
+ post_blocks_m[j]['bbox'][2] * post_blocks_m[j]['bbox'][3]
)
matched_areas.append(block_area)
# Position similarity
position_similarity = 1 - calculate_distance_max_1d(
gen_blocks_m[i]['bbox'][0] + gen_blocks_m[i]['bbox'][2] / 2,
gen_blocks_m[i]['bbox'][1] + gen_blocks_m[i]['bbox'][3] / 2,
post_blocks_m[j]['bbox'][0] + post_blocks_m[j]['bbox'][2] / 2,
post_blocks_m[j]['bbox'][1] + post_blocks_m[j]['bbox'][3] / 2,
)
# Color similarity
color_similarity = color_similarity_ciede2000(
gen_blocks_m[i]['color'], post_blocks_m[j]['color']
)
text_scores.append(text_similarity)
position_scores.append(position_similarity)
color_scores.append(color_similarity)
# Calculate final scores
total_area = sum(matched_areas) + total_unmatched_area
size_score = sum(matched_areas) / total_area if total_area > 0 else 0
text_score = np.mean(text_scores) if text_scores else 0
position_score = np.mean(position_scores) if position_scores else 0
color_score = np.mean(color_scores) if color_scores else 0
clip_score = calculate_clip_similarity(
post_image, generated_img, gen_blocks, post_blocks
)
# Combine scores with equal weights
final_score = 0.2 * (
size_score + text_score + position_score + color_score + clip_score
)
logger.info('Evaluation scores:')
logger.info(f'- Size score: {size_score:.3f}')
logger.info(f'- Text score: {text_score:.3f}')
logger.info(f'- Position score: {position_score:.3f}')
logger.info(f'- Color score: {color_score:.3f}')
logger.info(f'- CLIP score: {clip_score:.3f}')
logger.info(f'- Final score: {final_score:.3f}')
return final_score > 0.8 # Consider it a match if final score > 80%
def png_to_bytes(png):
buffer = BytesIO()
png.save(buffer, format='PNG')
image_bytes = buffer.getvalue()
return image_bytes
def bytes_to_image(image_bytes):
"""Convert bytes to a Pillow Image object."""
return Image.open(BytesIO(image_bytes))
if __name__ == '__main__':
first_image = Image.open('./evaluation/visualcodebench/data/1/post.png')
image = Image.open('./evaluation/visualcodebench/data/1/prev.png')
html_file = open('./evaluation/visualcodebench/data/1/post/index.html', 'r')
first_html = html_file.read()
html_file.close()
html_file = open('./evaluation/visualcodebench/data/1/prev/index.html', 'r')
gen_html = html_file.read()
html_file.close()
sample = {'post_image': first_image, "post_html": first_html, "gen_html": gen_html}
evaluate(sample, image)

View File

@@ -0,0 +1,97 @@
import base64
import os
from io import BytesIO
import pandas as pd
from huggingface_hub import snapshot_download
from PIL import PngImagePlugin
from tqdm import tqdm
from openhands.core.logger import openhands_logger as logger
REPO_DOWNLOAD_DIR = (
'./evaluation/visualcodebench/' # Directory to store the downloaded repository
)
def download_repository():
"""
Download the entire repository from Hugging Face Hub.
This function clones the repository into REPO_DOWNLOAD_DIR.
"""
repo_id = 'rvmalhot/VisualCodeBench'
try:
logger.info(f"Downloading repository '{repo_id}'...")
snapshot_download(
repo_id=repo_id,
local_dir=REPO_DOWNLOAD_DIR,
repo_type='dataset',
ignore_patterns=None, # Download all files
)
logger.info(f"Repository downloaded to '{REPO_DOWNLOAD_DIR}'.")
except Exception as e:
logger.error(f"Error downloading repository '{repo_id}': {e}")
raise e
def format_task_dict(example):
instance_id = example['id']
prev_remote_path = os.path.join(REPO_DOWNLOAD_DIR, f'data/{instance_id}/prev')
post_remote_path = os.path.join(REPO_DOWNLOAD_DIR, f'data/{instance_id}/post')
# Check if 'prev' and 'post' directories exist
prev_exists = os.path.exists(prev_remote_path)
post_exists = os.path.exists(post_remote_path)
if prev_exists and post_exists:
skip = False
else:
skip = True
task = {
'instance_id': instance_id,
'prev_image': example['prev_image'],
'post_image': example['post_image'],
'changes': example['changes'],
'prev_code_files': example['prev_code_files'],
'post_code_files': example['post_code_files'],
'skip': skip,
}
return task
def prepare_visualcodebench(dataset):
logger.info('Processing dataset')
dataset_processed = []
for example in tqdm(dataset['train']):
formatted_example = format_task_dict(example)
if formatted_example['skip']:
continue
del formatted_example['skip']
dataset_processed.append(formatted_example)
return pd.DataFrame(dataset_processed)
def pil_image_to_base64(image: PngImagePlugin.PngImageFile) -> str:
"""
Converts a PIL image to a Base64-encoded string.
Parameters:
- image (PngImagePlugin.PngImageFile): The PIL image to convert.
Returns:
- str: The Base64-encoded string of the image.
"""
if not isinstance(image, PngImagePlugin.PngImageFile):
raise ValueError(
'The provided image is not a PIL.PngImagePlugin.PngImageFile instance.'
)
buffered = BytesIO()
image.save(buffered, format='PNG')
img_bytes = buffered.getvalue()
img_base64 = base64.b64encode(img_bytes).decode('utf-8')
base64_with_prefix = f'data:image/png;base64,{img_base64}'
return [base64_with_prefix]

View File

@@ -0,0 +1,247 @@
# FILE: run_infer.py
import asyncio
import os
import shutil
import tempfile
from functools import partial
import pandas as pd
from datasets import load_dataset
# from evaluation.benchmarks.visualcodebench.eval import capture_screenshot
from evaluation.benchmarks.visualcodebench.prepare import (
REPO_DOWNLOAD_DIR,
download_repository,
pil_image_to_base64,
prepare_visualcodebench,
)
from evaluation.utils.shared import (
EvalMetadata,
assert_and_raise,
codeact_user_response,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
)
from openhands.core.config.utils import parse_arguments
from openhands.core.logger import openhands_logger as logger # Import OpenHands logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action.commands import CmdRunAction
from openhands.events.action.message import MessageAction
from openhands.events.observation.commands import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync
# Define workspace and output directories
WORKSPACE_DIR = './workspace'
FAKE_RESPONSES = {
'CodeActAgent': partial(codeact_user_response, encapsulate_solution=True),
}
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='eventstream',
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
return config
def initialize_runtime(
runtime: Runtime,
instance: pd.Series, # this argument is not required
):
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
"""
logger.info('-' * 30)
logger.info('BEGIN Runtime Initialization Fn')
logger.info('-' * 30)
workspace_dir_name = instance['instance_id']
obs: CmdOutputObservation
action = CmdRunAction(command='mkdir -p /workspace/{workspace_dir_name}')
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to create /workspace/{workspace_dir_name}: {str(obs)}',
)
file_path = REPO_DOWNLOAD_DIR + f'data/{workspace_dir_name}/prev/index.html'
runtime.copy_to(file_path, f'/workspace/{workspace_dir_name}')
logger.info(f'Copied code file for instance {workspace_dir_name}')
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
)
logger.info('-' * 30)
logger.info('END Runtime Initialization Fn')
logger.info('-' * 30)
def complete_runtime(
runtime: Runtime,
instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name
) -> str:
# TODO: extract edited HTML file from agent workspace
# temp_zip = runtime.copy_from(f'/workspace/{instance.instance_id}')
# file_name = f'/workspace/{instance.instance_id}/index.html'
# with zipfile.ZipFile(temp_zip, 'r') as zip_ref:
# if file_name in zip_ref.namelist():
# with zip_ref.open(file_name) as file:
# file_content = file.read().decode('utf-8') # Decode bytes to string
# else:
# raise FileNotFoundError(f"'{file_name}' not found in the ZIP archive.")
with tempfile.TemporaryDirectory() as tmpdir:
src_folder = REPO_DOWNLOAD_DIR + f'data/{instance.instance_id}/post/'
shutil.copytree(src_folder, tmpdir, dirs_exist_ok=True)
# image = capture_screenshot(tmpdir)
# if image is not None:
# shutil.copy(os.path.join(tmpdir, 'final_screenshot.png'), REPO_DOWNLOAD_DIR)
def process_instance(
instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True
):
config = get_config(metadata)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
# =============================================
# build instruction
# =============================================
# Prepare instruction
instruction = (
f"Modify the HTML/CSS according to the following instruction:\n\n"
f"{instance['changes']}\n\n"
)
instruction += (
'IMPORTANT: You should ONLY interact with the environment provided '
'to you AND NEVER ASK FOR HUMAN HELP.\n'
)
# =============================================
# create sandbox and run the agent
# =============================================
runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
try:
initialize_runtime(runtime, instance=instance)
image_urls = pil_image_to_base64(instance['prev_image'])
action = MessageAction(content=instruction, image_urls=image_urls)
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=action,
runtime=runtime,
fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
)
)
if state is None:
raise ValueError('State should not be None.')
# =============================================
# result evaluation
# =============================================
return_val = complete_runtime(runtime, instance)
logger.info(f'Return value {return_val}')
finally:
runtime.close()
# TODO: return EVAL output
def main():
"""Main function to run the evaluation."""
# args = parse_args()
args = parse_arguments()
logger.info(f"\n{'='*80}\nStarting VisualCodeBench Evaluation\n{'='*80}")
logger.info(f'Agent: {args.agent_cls}')
logger.info(f'Model: {args.llm_config}')
logger.info(f'Max iterations: {args.max_iterations}')
logger.info(f'Eval limit: {args.eval_n_limit}')
logger.info(f'Num workers: {args.eval_num_workers}\n')
logger.info(f'Eval output: {args.eval_output_dir}\n')
# Step 1: Download the entire repository once
logger.info('Downloading repository...')
download_repository()
# Step 2: Load Dataset
logger.info('Loading dataset...')
dataset = load_dataset(REPO_DOWNLOAD_DIR)
# Step 3: Prepare dataset
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
logger.error(f'Could not find LLM config: {args.llm_config}')
raise ValueError(f'Could not find LLM config: {args.llm_config}')
metadata = make_metadata(
llm_config,
'VisualCodeBench',
args.agent_cls,
args.max_iterations,
args.eval_note,
'evaluation/output/',
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
dataset = prepare_visualcodebench(dataset)
instances = prepare_dataset(dataset, output_file, eval_n_limit=args.eval_n_limit)
# Step 4: Run eval
run_evaluation(
instances, metadata, output_file, args.eval_num_workers, process_instance
)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,46 @@
#!/bin/bash
set -eo pipefail
source "evaluation/utils/version_control.sh"
# Check if required arguments are provided
if [ "$#" -lt 4 ]; then
echo "Usage: $0 [model_config] [commit_hash] [agent_cls] [eval_limit] [num_workers]"
echo "Example: $0 llm.eval_gpt_4o_mini HEAD CodeActAgent 5 1"
exit 1
fi
MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT_CLS=$3
EVAL_LIMIT=$4
NUM_WORKERS=${5:-1} # Default to 1 worker if not specified
# Checkout the specified commit
checkout_eval_branch
if [ -z "$AGENT" ]; then
echo "Agent not specified, use default CodeActAgent"
AGENT="CodeActAgent"
fi
get_openhands_version
echo "AGENT: $AGENT"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="export PYTHONPATH=evaluation/benchmarks/visualcodebench:\$PYTHONPATH && poetry run python evaluation/benchmarks/visualcodebench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 5 \
--eval-num-workers $NUM_WORKERS \
--eval-note $OPENHANDS_VERSION" \
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
# Run the command
eval $COMMAND

View File

@@ -0,0 +1,167 @@
import http
import os
import socket
import socketserver
import threading
import time
from io import BytesIO
import requests
from PIL import Image, ImageChops
from playwright.sync_api import sync_playwright
from openhands.core.logger import openhands_logger as logger
def get_free_port():
"""Find a free port to run the HTTP server."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0))
return s.getsockname()[1]
def start_http_server(tmpdir):
port = get_free_port()
class CustomHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
def translate_path(self, path):
# Serve files from the specified directory instead of the current working directory
path = super().translate_path(path)
relative_path = os.path.relpath(path, os.getcwd())
return os.path.join(tmpdir, relative_path)
handler = CustomHTTPRequestHandler
server = socketserver.TCPServer(('', port), handler)
return server, port
def capture_screenshot(tmpdir):
server, port = start_http_server(tmpdir)
server_thread = threading.Thread(target=server.serve_forever)
server_thread.daemon = True
server_thread.start()
time.sleep(10)
image = None
try:
server_url = f'http://localhost:{port}/'
if not is_server_reachable(server_url):
raise RuntimeError(f'Server not reachable at {server_url}')
screenshot_path = os.path.join(tmpdir, 'final_screenshot.png')
capture_screenshot_playwright(server_url, screenshot_path)
image = Image.open(screenshot_path)
image.load()
finally:
# Shut down the server and clean up
server.shutdown()
server.server_close()
return image
def is_server_reachable(url):
"""
Check if the local server is reachable.
"""
try:
response = requests.get(url, timeout=5) # Set a 5-second timeout
if response.status_code == 200:
logger.info(f'Server is reachable at {url}')
return True
else:
logger.warning(
f'Server responded with status code {response.status_code} at {url}'
)
return False
except requests.ConnectionError as e:
logger.error(f'Failed to connect to server at {url}: {e}')
return False
def capture_screenshot_playwright(url, screenshot_path):
"""Capture a screenshot of the given URL using Playwright."""
try:
with sync_playwright() as p:
logger.info('Launching browser...')
browser = p.chromium.launch(timeout=10000) # 10 seconds for browser launch
logger.info('Creating a new page...')
page = browser.new_page()
logger.info(f'Navigating to URL: {url}')
try:
page.goto(url, timeout=60 * 1000) # Set timeout to 5 seconds
logger.info('Page navigation completed.')
except Exception as e:
logger.warning(f'Page navigation timed out. {e}. Continuing...')
logger.info('Waiting for network to be idle...')
try:
page.wait_for_load_state(
'networkidle', timeout=60 * 1000
) # Set timeout to 5 seconds
logger.info('Page load state reached.')
except Exception as e:
logger.warning(f'Page load state timed out. {e}. Continuing...')
logger.info('Capturing screenshot...')
page.screenshot(
path=screenshot_path, full_page=True
) # Capture full page screenshot
logger.info(f'Screenshot saved to {screenshot_path}')
browser.close()
return True
except Exception as e:
logger.error(f'Error capturing screenshot with Playwright: {e}')
return False
def evaluate(task, screenshot_path):
"""Compare generated screenshot with post_image using CLIP score."""
try:
import torch
from transformers import CLIPModel, CLIPProcessor
# Load CLIP model and processor
model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
# Load images
post_image = Image.open(BytesIO(task['post_image']))
generated_img = Image.open(screenshot_path)
# Process images
inputs = processor(
images=[post_image, generated_img], return_tensors='pt', padding=True
)
# Get image features
image_features = model.get_image_features(**inputs)
# Calculate cosine similarity
similarity = torch.nn.functional.cosine_similarity(
image_features[0].unsqueeze(0), image_features[1].unsqueeze(0)
).item()
logger.info(f'CLIP similarity score: {similarity}')
return similarity > 0.95 # Consider it a match if similarity > 95%
except Exception as e:
logger.error(f'Error in CLIP evaluation: {e}')
# Fallback to pixel comparison if CLIP fails
try:
post_image = Image.open(BytesIO(task['post_image']))
generated_img = Image.open(screenshot_path)
# Compare images directly without converting to bytes
diff = ImageChops.difference(generated_img, post_image)
logger.info(
f"Pixel difference analysis: {'No difference' if not diff.getbbox() else 'Differences found'}"
)
return not diff.getbbox()
except Exception as ex:
logger.error(f'Error in fallback evaluation: {ex}')
return False

View File

@@ -11,7 +11,6 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -21,6 +20,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
@@ -55,29 +55,31 @@ def get_config(
assert base_url is not None, 'VISUALWEBARENA_BASE_URL must be set'
assert openai_api_key is not None, 'OPENAI_API_KEY must be set'
assert openai_base_url is not None, 'OPENAI_BASE_URL must be set'
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
sandbox_config.browsergym_eval_env = env_id
sandbox_config.runtime_startup_env_vars = {
'BASE_URL': base_url,
'OPENAI_API_KEY': openai_api_key,
'OPENAI_BASE_URL': openai_base_url,
'VWA_CLASSIFIEDS': f'{base_url}:9980',
'VWA_CLASSIFIEDS_RESET_TOKEN': '4b61655535e7ed388f0d40a93600254c',
'VWA_SHOPPING': f'{base_url}:7770',
'VWA_SHOPPING_ADMIN': f'{base_url}:7780/admin',
'VWA_REDDIT': f'{base_url}:9999',
'VWA_GITLAB': f'{base_url}:8023',
'VWA_WIKIPEDIA': f'{base_url}:8888',
'VWA_HOMEPAGE': f'{base_url}:4399',
}
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
browsergym_eval_env=env_id,
runtime_startup_env_vars={
'BASE_URL': base_url,
'OPENAI_API_KEY': openai_api_key,
'OPENAI_BASE_URL': openai_base_url,
'VWA_CLASSIFIEDS': f'{base_url}:9980',
'VWA_CLASSIFIEDS_RESET_TOKEN': '4b61655535e7ed388f0d40a93600254c',
'VWA_SHOPPING': f'{base_url}:7770',
'VWA_SHOPPING_ADMIN': f'{base_url}:7780/admin',
'VWA_REDDIT': f'{base_url}:9999',
'VWA_GITLAB': f'{base_url}:8023',
'VWA_WIKIPEDIA': f'{base_url}:8888',
'VWA_HOMEPAGE': f'{base_url}:4399',
},
timeout=300,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -11,7 +11,6 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -20,6 +19,7 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
@@ -50,26 +50,28 @@ def get_config(
assert base_url is not None, 'WEBARENA_BASE_URL must be set'
assert openai_api_key is not None, 'OPENAI_API_KEY must be set'
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
sandbox_config.browsergym_eval_env = env_id
sandbox_config.runtime_startup_env_vars = {
'BASE_URL': base_url,
'OPENAI_API_KEY': openai_api_key,
'SHOPPING': f'{base_url}:7770/',
'SHOPPING_ADMIN': f'{base_url}:7780/admin',
'REDDIT': f'{base_url}:9999',
'GITLAB': f'{base_url}:8023',
'WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
'MAP': f'{base_url}:3000',
'HOMEPAGE': f'{base_url}:4399',
}
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
browsergym_eval_env=env_id,
runtime_startup_env_vars={
'BASE_URL': base_url,
'OPENAI_API_KEY': openai_api_key,
'SHOPPING': f'{base_url}:7770/',
'SHOPPING_ADMIN': f'{base_url}:7780/admin',
'REDDIT': f'{base_url}:9999',
'GITLAB': f'{base_url}:8023',
'WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
'MAP': f'{base_url}:3000',
'HOMEPAGE': f'{base_url}:4399',
},
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

View File

@@ -8,7 +8,6 @@ from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestRes
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -22,6 +21,7 @@ from openhands.controller.state.state import State
from openhands.core.config import (
AgentConfig,
AppConfig,
SandboxConfig,
get_llm_config_arg,
parse_arguments,
)
@@ -43,14 +43,23 @@ def get_config(
metadata: EvalMetadata,
instance_id: str,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.platform = 'linux/amd64'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
sandbox=SandboxConfig(
# use default base_container_image
enable_auto_lint=True,
use_host_network=False,
timeout=300,
# Add platform to the sandbox config to solve issue 4401
platform='linux/amd64',
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_init_timeout=3600,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,

Some files were not shown because too many files have changed in this diff Show More