Compare commits

..

7 Commits

Author SHA1 Message Date
rohitvinodmalhotra@gmail.com
532a284d5c migrate to use OH version 2025-01-26 15:24:35 -05:00
rohitvinodmalhotra@gmail.com
43f6104967 Merge branch 'main' into eval/visualcodebench 2025-01-26 15:14:28 -05:00
openhands
e249b920ff feat: adapt Design2Code block detection for in-memory evaluation 2024-11-30 19:28:22 +00:00
rohitvinodmalhotra@gmail.com
d920a69f69 adding back server code 2024-11-30 14:00:25 -05:00
openhands
a8ce888981 refactor: adapt Design2Code evaluation metrics 2024-11-30 17:17:05 +00:00
rohitvinodmalhotra@gmail.com
e22ddc0dd6 uncomment agent run 2024-11-26 17:00:07 -05:00
rohitvinodmalhotra@gmail.com
c370912f12 adding eval scripts 2024-11-26 16:57:19 -05:00
311 changed files with 8303 additions and 14490 deletions

View File

@@ -30,7 +30,6 @@ body:
description: How are you running OpenHands?
options:
- Docker command in README
- GitHub resolver
- Development workflow
- app.all-hands.dev
- Other

View File

@@ -19,6 +19,20 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: false
swap-storage: true
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3

View File

@@ -41,8 +41,22 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: false
swap-storage: true
- name: Set up QEMU
uses: docker/setup-qemu-action@v3.4.0
uses: docker/setup-qemu-action@v3.3.0
with:
image: tonistiigi/binfmt:latest
- name: Login to GHCR
@@ -90,8 +104,22 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: false
swap-storage: true
- name: Set up QEMU
uses: docker/setup-qemu-action@v3.4.0
uses: docker/setup-qemu-action@v3.3.0
with:
image: tonistiigi/binfmt:latest
- name: Login to GHCR
@@ -191,7 +219,7 @@ jobs:
exit 1
fi
# Run unit tests with the Docker runtime Docker images as root
# Run unit tests with the EventStream runtime Docker images as root
test_runtime_root:
name: RT Unit Tests (Root)
needs: [ghcr_build_runtime]
@@ -202,69 +230,20 @@ jobs:
base_image: ['nikolaik']
steps:
- uses: actions/checkout@v4
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
# Forked repos can't push to GHCR, so we need to download the image as an artifact
- name: Download runtime image for fork
if: github.event.pull_request.head.repo.fork
uses: actions/download-artifact@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
name: runtime-${{ matrix.base_image }}
path: /tmp
- name: Load runtime image for fork
if: github.event.pull_request.head.repo.fork
run: |
docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
- name: Cache Poetry dependencies
uses: actions/cache@v4
with:
path: |
~/.cache/pypoetry
~/.virtualenvs
key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: |
${{ runner.os }}-poetry-
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install poetry via pipx
run: pipx install poetry
- name: Install Python dependencies using Poetry
run: make install-python-dependencies
- name: Run docker runtime tests
run: |
# We install pytest-xdist in order to run tests across CPUs
poetry run pip install pytest-xdist
# Install to be able to retry on failures for flaky tests
poetry run pip install pytest-rerunfailures
image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
TEST_RUNTIME=docker \
SANDBOX_USER_ID=$(id -u) \
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \
RUN_AS_OPENHANDS=false \
poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# Run unit tests with the Docker runtime Docker images as openhands user
test_runtime_oh:
name: RT Unit Tests (openhands)
runs-on: ubuntu-latest
needs: [ghcr_build_runtime]
strategy:
matrix:
base_image: ['nikolaik']
steps:
- uses: actions/checkout@v4
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: false
swap-storage: true
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
@@ -307,7 +286,84 @@ jobs:
image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
TEST_RUNTIME=docker \
TEST_RUNTIME=eventstream \
SANDBOX_USER_ID=$(id -u) \
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \
RUN_AS_OPENHANDS=false \
poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# Run unit tests with the EventStream runtime Docker images as openhands user
test_runtime_oh:
name: RT Unit Tests (openhands)
runs-on: ubuntu-latest
needs: [ghcr_build_runtime]
strategy:
matrix:
base_image: ['nikolaik']
steps:
- uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: false
swap-storage: true
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
# Forked repos can't push to GHCR, so we need to download the image as an artifact
- name: Download runtime image for fork
if: github.event.pull_request.head.repo.fork
uses: actions/download-artifact@v4
with:
name: runtime-${{ matrix.base_image }}
path: /tmp
- name: Load runtime image for fork
if: github.event.pull_request.head.repo.fork
run: |
docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
- name: Cache Poetry dependencies
uses: actions/cache@v4
with:
path: |
~/.cache/pypoetry
~/.virtualenvs
key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: |
${{ runner.os }}-poetry-
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install poetry via pipx
run: pipx install poetry
- name: Install Python dependencies using Poetry
run: make install-python-dependencies
- name: Run runtime tests
run: |
# We install pytest-xdist in order to run tests across CPUs
poetry run pip install pytest-xdist
# Install to be able to retry on failures for flaky tests
poetry run pip install pytest-rerunfailures
image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
TEST_RUNTIME=eventstream \
SANDBOX_USER_ID=$(id -u) \
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \

View File

@@ -20,10 +20,6 @@ on:
required: false
type: string
default: "anthropic/claude-3-5-sonnet-20241022"
LLM_API_VERSION:
required: false
type: string
default: ""
base_container_image:
required: false
type: string
@@ -120,7 +116,6 @@ jobs:
LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
LLM_API_VERSION: ${{ inputs.LLM_API_VERSION }}
PAT_TOKEN: ${{ secrets.PAT_TOKEN }}
PAT_USERNAME: ${{ secrets.PAT_USERNAME }}
GITHUB_TOKEN: ${{ github.token }}
@@ -177,7 +172,7 @@ jobs:
echo "SANDBOX_ENV_BASE_CONTAINER_IMAGE=${{ inputs.base_container_image }}" >> $GITHUB_ENV
# Set branch variables
echo "TARGET_BRANCH=${{ inputs.target_branch || 'main' }}" >> $GITHUB_ENV
echo "TARGET_BRANCH=${{ inputs.target_branch }}" >> $GITHUB_ENV
- name: Comment on issue with start message
uses: actions/github-script@v7
@@ -235,7 +230,6 @@ jobs:
LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
LLM_API_VERSION: ${{ inputs.LLM_API_VERSION }}
PYTHONPATH: ""
run: |
cd /tmp && python -m openhands.resolver.resolve_issue \
@@ -271,13 +265,11 @@ jobs:
LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
LLM_API_VERSION: ${{ inputs.LLM_API_VERSION }}
PYTHONPATH: ""
run: |
if [ "${{ steps.check_result.outputs.RESOLUTION_SUCCESS }}" == "true" ]; then
cd /tmp && python -m openhands.resolver.send_pull_request \
--issue-number ${{ env.ISSUE_NUMBER }} \
--target-branch ${{ env.TARGET_BRANCH }} \
--pr-type draft \
--reviewer ${{ github.actor }} | tee pr_result.txt && \
grep "draft created" pr_result.txt | sed 's/.*\///g' > pr_number.txt

98
.github/workflows/py-unit-tests-mac.yml vendored Normal file
View File

@@ -0,0 +1,98 @@
# Workflow that runs python unit tests on mac
name: Run Python Unit Tests Mac
# This job is flaky so only run it nightly
on:
schedule:
- cron: '0 0 * * *'
jobs:
# Run python unit tests on macOS
test-on-macos:
name: Python Unit Tests on macOS
runs-on: macos-14
env:
INSTALL_DOCKER: '1' # Set to '0' to skip Docker installation
strategy:
matrix:
python-version: ['3.12']
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Cache Poetry dependencies
uses: actions/cache@v4
with:
path: |
~/.cache/pypoetry
~/.virtualenvs
key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: |
${{ runner.os }}-poetry-
- name: Install tmux
run: brew install tmux
- name: Install poetry via pipx
run: pipx install poetry
- name: Install Python dependencies using Poetry
run: poetry install --without evaluation,llama-index
- name: Install & Start Docker
if: env.INSTALL_DOCKER == '1'
run: |
INSTANCE_NAME="colima-${GITHUB_RUN_ID}"
# Uninstall colima to upgrade to the latest version
if brew list colima &>/dev/null; then
brew uninstall colima
# unlinking colima dependency: go
brew uninstall go@1.21
fi
rm -rf ~/.colima ~/.lima
brew install --HEAD colima
brew install docker
start_colima() {
# Find a free port in the range 10000-20000
RANDOM_PORT=$((RANDOM % 10001 + 10000))
# Original line:
if ! colima start --network-address --arch x86_64 --cpu=1 --memory=1 --verbose --ssh-port $RANDOM_PORT; then
echo "Failed to start Colima."
return 1
fi
return 0
}
# Attempt to start Colima for 5 total attempts:
ATTEMPT_LIMIT=5
for ((i=1; i<=ATTEMPT_LIMIT; i++)); do
if start_colima; then
echo "Colima started successfully."
break
else
colima stop -f
sleep 10
colima delete -f
if [ $i -eq $ATTEMPT_LIMIT ]; then
exit 1
fi
sleep 10
fi
done
# For testcontainers to find the Colima socket
# https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
- name: Build Environment
run: make build
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
- name: Run Tests
run: poetry run pytest --forked --cov=openhands --cov-report=xml ./tests/unit --ignore=tests/unit/test_memory.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

View File

@@ -19,4 +19,3 @@ jobs:
close-issue-message: 'This issue was closed because it has been stalled for over 30 days with no activity.'
close-pr-message: 'This PR was closed because it has been stalled for over 30 days with no activity.'
days-before-close: 7
operations-per-run: 150

1
.gitignore vendored
View File

@@ -176,6 +176,7 @@ evaluation/gorilla/data
evaluation/toolqa/data
evaluation/scienceagentbench/benchmark
evaluation/commit0_bench/repos
evaluation/visualcodebench/
# openhands resolver
output/

View File

@@ -1,172 +0,0 @@
# OpenHands Glossary
### Agent
The core AI entity in OpenHands that can perform software development tasks by interacting with tools, browsing the web, and modifying code.
#### Agent Controller
A component that manages the agent's lifecycle, handles its state, and coordinates interactions between the agent and various tools.
#### Agent Delegation
The ability of an agent to hand off specific tasks to other specialized agents for better task completion.
#### Agent Hub
A central registry of different agent types and their capabilities, allowing for easy agent selection and instantiation.
#### Agent Skill
A specific capability or function that an agent can perform, such as file manipulation, web browsing, or code editing.
#### Agent State
The current context and status of an agent, including its memory, active tools, and ongoing tasks.
#### CodeAct Agent
[A generalist agent in OpenHands](https://arxiv.org/abs/2407.16741) designed to perform tasks by editing and executing code.
### Browser
A system for web-based interactions and tasks.
#### Browser Gym
A testing and evaluation environment for browser-based agent interactions and tasks.
#### Web Browser Tool
A tool that enables agents to interact with web pages and perform web-based tasks.
### Commands
Terminal and execution related functionality.
#### Bash Session
A persistent terminal session that maintains state and history for bash command execution.
This uses tmux under the hood.
### Configuration
System-wide settings and options.
#### Agent Configuration
Settings that define an agent's behavior, capabilities, and limitations, including available tools and runtime settings.
#### Configuration Options
Settings that control various aspects of OpenHands behavior, including runtime, security, and agent settings.
#### LLM Config
Configuration settings for language models used by agents, including model selection and parameters.
#### LLM Draft Config
Settings for draft mode operations with language models, typically used for faster, lower-quality responses.
#### Runtime Configuration
Settings that define how the runtime environment should be set up and operated.
#### Security Options
Configuration settings that control security features and restrictions.
### Conversation
A sequence of interactions between a user and an agent, including messages, actions, and their results.
#### Conversation Info
Metadata about a conversation, including its status, participants, and timeline.
#### Conversation Manager
A component that handles the creation, storage, and retrieval of conversations.
#### Conversation Metadata
Additional information about conversations, such as tags, timestamps, and related resources.
#### Conversation Status
The current state of a conversation, including whether it's active, completed, or failed.
#### Conversation Store
A storage system for maintaining conversation history and related data.
### Events
#### Event
Every Conversation comprises a series of Events. Each Event is either an Action or an Observation.
#### Event Stream
A continuous flow of events that represents the ongoing activities and interactions in the system.
#### Action
A specific operation or command that an agent executes through available tools, such as running a command or editing a file.
#### Observation
The response or result returned by a tool after an agent's action, providing feedback about the action's outcome.
### Interface
Different ways to interact with OpenHands.
#### CLI Mode
A command-line interface mode for interacting with OpenHands agents without a graphical interface.
#### GUI Mode
A graphical user interface mode for interacting with OpenHands agents through a web interface.
#### Headless Mode
A mode of operation where OpenHands runs without a user interface, suitable for automation and scripting.
### Agent Memory
The system that decides which parts of the Event Stream (i.e. the conversation history) should be passed into each LLM prompt.
#### Memory Store
A storage system for maintaining agent memory and context across sessions.
#### Condenser
A component that processes and summarizes conversation history to maintain context while staying within token limits.
#### Truncation
A very simple Condenser strategy. Reduces conversation history or content to stay within token limits.
### Microagent
A specialized prompt that enhances OpenHands with domain-specific knowledge, repository-specific context, and task-specific workflows.
#### Microagent Registry
A central repository of available microagents and their configurations.
#### Public Microagent
A general-purpose microagent available to all OpenHands users, triggered by specific keywords.
#### Repository Microagent
A type of microagent that provides repository-specific context and guidelines, stored in the `.openhands/microagents/` directory.
### Prompt
Components for managing and processing prompts.
#### Prompt Caching
A system for caching and reusing common prompts to improve performance.
#### Prompt Manager
A component that handles the loading, processing, and management of prompts used by agents, including microagents.
#### Response Parsing
The process of interpreting and structuring responses from language models and tools.
### Runtime
The execution environment where agents perform their tasks, which can be local, remote, or containerized.
#### Action Execution Server
A REST API that receives agent actions (e.g. bash commands, python code, browsing actions), executes them in the runtime environment, and returns the results.
#### Action Execution Client
A component that handles the execution of actions in the runtime environment, managing the communication between the agent and the runtime.
#### Docker Runtime
A containerized runtime environment that provides isolation and reproducibility for agent operations.
#### E2B Runtime
A specialized runtime environment built on E2B for secure and isolated code execution.
#### Local Runtime
A runtime environment that executes on the local machine, suitable for development and testing.
#### Modal Runtime
A runtime environment built on Modal for scalable and distributed agent operations.
#### Remote Runtime
A sandboxed environment that executes code and commands remotely, providing isolation and security for agent operations.
#### Runtime Builder
A component that builds a Docker image for the Action Execution Server based on a user-specified base image.
### Security
Security-related components and features.
#### Security Analyzer
A component that checks agent actions for potential security risks.

View File

@@ -100,7 +100,7 @@ poetry run pytest ./tests/unit/test_*.py
To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by
setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.24-nikolaik`
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.21-nikolaik`
## Develop inside Docker container

View File

@@ -12,7 +12,7 @@
<a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue"></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License"></a>
<br/>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2ypg5jweb-d~6hObZDbXi_HEL8PDrbHg"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
<a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits"></a>
<br/>
@@ -43,17 +43,17 @@ See the [Running OpenHands](https://docs.all-hands.dev/modules/usage/installatio
system requirements and more information.
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ~/.openhands-state:/.openhands-state \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.24
docker.all-hands.dev/all-hands-ai/openhands:0.21
```
You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
@@ -96,7 +96,7 @@ troubleshooting resources, and advanced configuration options.
OpenHands is a community-driven project, and we welcome contributions from everyone. We do most of our communication
through Slack, so this is the best place to start, but we also are happy to have you contact us on Discord or Github:
- [Join our Slack workspace](https://join.slack.com/t/openhands-ai/shared_invite/zt-2ypg5jweb-d~6hObZDbXi_HEL8PDrbHg) - Here we talk about research, architecture, and future development.
- [Join our Slack workspace](https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw) - Here we talk about research, architecture, and future development.
- [Join our Discord server](https://discord.gg/ESHStjSjD4) - This is a community-run server for general discussion, questions, and feedback.
- [Read or post Github Issues](https://github.com/All-Hands-AI/OpenHands/issues) - Check out the issues we're working on, or add your own ideas.

View File

@@ -1,4 +1,5 @@
#!/bin/bash
set -e
cp pyproject.toml poetry.lock openhands
poetry build -v

View File

@@ -104,7 +104,7 @@ workspace_base = "./workspace"
#aws_secret_access_key = ""
# API key to use (For Headless / CLI only - In Web this is overridden by Session Init)
api_key = ""
api_key = "your-api-key"
# API base URL (For Headless / CLI only - In Web this is overridden by Session Init)
#base_url = ""
@@ -195,7 +195,7 @@ model = "gpt-4o"
#native_tool_calling = None
[llm.gpt4o-mini]
api_key = ""
api_key = "your-api-key"
model = "gpt-4o"

View File

@@ -11,7 +11,7 @@ services:
- BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
- SANDBOX_API_HOSTNAME=host.docker.internal
#
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.24-nikolaik}
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.21-nikolaik}
- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
ports:

View File

@@ -24,6 +24,3 @@ inline-quotes = "single"
[format]
quote-style = "single"
[lint.flake8-bugbear]
extend-immutable-calls = ["Depends", "fastapi.Depends", "fastapi.params.Depends"]

View File

@@ -7,7 +7,7 @@ services:
image: openhands:latest
container_name: openhands-app-${DATE:-}
environment:
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik}
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik}
#- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} # enable this only if you want a specific non-root sandbox user but you will have to manually adjust permissions of openhands-state for this user
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
ports:

View File

@@ -1,8 +1,8 @@
# 📦 Runtime Docker
# 📦 Runtime EventStream
Le Runtime Docker d'OpenHands est le composant principal qui permet l'exécution sécurisée et flexible des actions des agents d'IA.
Le Runtime EventStream d'OpenHands est le composant principal qui permet l'exécution sécurisée et flexible des actions des agents d'IA.
Il crée un environnement en bac à sable (sandbox) en utilisant Docker, où du code arbitraire peut être exécuté en toute sécurité sans risquer le système hôte.
## Pourquoi avons-nous besoin d'un runtime en bac à sable ?

View File

@@ -163,7 +163,7 @@ Les options de configuration de base sont définies dans la section `[core]` du
- `runtime`
- Type : `str`
- Valeur par défaut : `"docker"`
- Valeur par défaut : `"eventstream"`
- Description : Environnement d'exécution
- `default_agent`

View File

@@ -95,3 +95,7 @@ sandbox_user_id="1001"
### Erreurs de port d'utilisation
Si vous voyez un message d'erreur indiquant que le port est utilisé ou indisponible, essayez de supprimer toutes les containers docker en cours d'exécution (exécutez `docker ps` et `docker rm` des containers concernés) puis ré-exécutez ```make run```
## Discuter
Pour d'autres problèmes ou questions rejoignez le [Slack](https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw) ou le [Discord](https://discord.gg/ESHStjSjD4) et demandez!

View File

@@ -52,7 +52,7 @@ LLM_API_KEY="sk_test_12345"
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -61,7 +61,7 @@ docker run -it \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.24 \
docker.all-hands.dev/all-hands-ai/openhands:0.21 \
python -m openhands.core.cli
```

View File

@@ -114,7 +114,7 @@ Pour créer un workflow d'évaluation pour votre benchmark, suivez ces étapes :
def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
runtime='docker',
runtime='eventstream',
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
base_container_image='your_container_image',

View File

@@ -46,7 +46,7 @@ LLM_API_KEY="sk_test_12345"
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -56,6 +56,6 @@ docker run -it \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.24 \
docker.all-hands.dev/all-hands-ai/openhands:0.21 \
python -m openhands.core.main -t "write a bash script that prints hi" --no-auto-continue
```

View File

@@ -13,16 +13,16 @@
La façon la plus simple d'exécuter OpenHands est avec Docker.
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.24
docker.all-hands.dev/all-hands-ai/openhands:0.21
```
Vous pouvez également exécuter OpenHands en mode [headless scriptable](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), en tant que [CLI interactive](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), ou en utilisant l'[Action GitHub OpenHands](https://docs.all-hands.dev/modules/usage/how-to/github-action).

View File

@@ -42,7 +42,7 @@ Explorez le code source d'OpenHands sur [GitHub](https://github.com/All-Hands-AI
/>
</a>
<br></br>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2ypg5jweb-d~6hObZDbXi_HEL8PDrbHg">
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw">
<img
src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
alt="Join our Slack community"

View File

@@ -13,7 +13,7 @@ C'est le Runtime par défaut qui est utilisé lorsque vous démarrez OpenHands.
```
docker run # ...
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-v /var/run/docker.sock:/var/run/docker.sock \
# ...
```

View File

@@ -1,8 +1,8 @@
以下是翻译后的内容:
# 📦 Docker 运行时
# 📦 EventStream 运行时
OpenHands Docker 运行时是实现 AI 代理操作安全灵活执行的核心组件。
OpenHands EventStream 运行时是实现 AI 代理操作安全灵活执行的核心组件。
它使用 Docker 创建一个沙盒环境,可以安全地运行任意代码而不会危及主机系统。
## 为什么我们需要沙盒运行时?

View File

@@ -162,7 +162,7 @@
- `runtime`
- 类型: `str`
- 默认值: `"docker"`
- 默认值: `"eventstream"`
- 描述: 运行时环境
- `default_agent`

View File

@@ -96,3 +96,7 @@ sandbox_user_id="1001"
### 端口使用错误
如果您遇到端口被占用或不可用的错误提示,可以尝试先用`docker ps`命令列出所有运行中的 Docker 容器,然后使用`docker rm`命令删除相关容器,最后再重新执行```make run```命令。
## 讨论
对于其他问题或疑问,请加入 [Slack](https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw) 或 [Discord](https://discord.gg/ESHStjSjD4) 提问!

View File

@@ -50,7 +50,7 @@ LLM_API_KEY="sk_test_12345"
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -59,7 +59,7 @@ docker run -it \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.24 \
docker.all-hands.dev/all-hands-ai/openhands:0.21 \
python -m openhands.core.cli
```

View File

@@ -112,7 +112,7 @@ OpenHands 的主要入口点在 `openhands/core/main.py` 中。以下是它的
def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
runtime='docker',
runtime='eventstream',
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
base_container_image='your_container_image',

View File

@@ -47,7 +47,7 @@ LLM_API_KEY="sk_test_12345"
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -57,6 +57,6 @@ docker run -it \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.24 \
docker.all-hands.dev/all-hands-ai/openhands:0.21 \
python -m openhands.core.main -t "write a bash script that prints hi" --no-auto-continue
```

View File

@@ -11,16 +11,16 @@
在 Docker 中运行 OpenHands 是最简单的方式。
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.24
docker.all-hands.dev/all-hands-ai/openhands:0.21
```
你也可以在可脚本化的[无头模式](https://docs.all-hands.dev/modules/usage/how-to/headless-mode)下运行 OpenHands作为[交互式 CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode),或使用 [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action)。

View File

@@ -42,7 +42,7 @@ OpenHands 是一个**自主 AI 软件工程师**,能够执行复杂的工程
/>
</a>
<br></br>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2ypg5jweb-d~6hObZDbXi_HEL8PDrbHg">
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw">
<img
src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
alt="Join our Slack community"

View File

@@ -11,7 +11,7 @@
```
docker run # ...
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-v /var/run/docker.sock:/var/run/docker.sock \
# ...
```

View File

@@ -1,6 +1,6 @@
# 📦 Docker Runtime
# 📦 EventStream Runtime
The OpenHands Docker Runtime is the core component that enables secure and flexible execution of AI agent's action.
The OpenHands EventStream Runtime is the core component that enables secure and flexible execution of AI agent's action.
It creates a sandboxed environment using Docker, where arbitrary code can be run safely without risking the host system.
## Why do we need a sandboxed runtime?

View File

@@ -126,7 +126,7 @@ The core configuration options are defined in the `[core]` section of the `confi
- `runtime`
- Type: `str`
- Default: `"docker"`
- Default: `"eventstream"`
- Description: Runtime environment
- `default_agent`

View File

@@ -35,7 +35,7 @@ To run OpenHands in CLI mode with Docker:
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -45,7 +45,7 @@ docker run -it \
-v ~/.openhands-state:/.openhands-state \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.24 \
docker.all-hands.dev/all-hands-ai/openhands:0.21 \
python -m openhands.core.cli
```

View File

@@ -41,16 +41,8 @@ docker build -t custom-image .
This will produce a new image called `custom-image`, which will be available in Docker.
## Using the Docker Command
When running OpenHands using [the docker command](/modules/usage/installation#start-the-app), replace
`-e SANDBOX_RUNTIME_CONTAINER_IMAGE=...` with `-e SANDBOX_BASE_CONTAINER_IMAGE=<custom image name>`:
```commandline
docker run -it --rm --pull=always \
-e SANDBOX_BASE_CONTAINER_IMAGE=custom-image \
...
```
> Note that in the configuration described in this document, OpenHands will run as user "openhands" inside the
> sandbox and thus all packages installed via the docker file should be available to all users on the system, not just root.
## Using the Development Workflow

View File

@@ -112,7 +112,7 @@ To create an evaluation workflow for your benchmark, follow these steps:
def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
runtime='docker',
runtime='eventstream',
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
base_container_image='your_container_image',

View File

@@ -42,10 +42,9 @@ You can provide custom directions for OpenHands by following the [README for the
Github resolver will automatically check for valid [repository secrets](https://docs.github.com/en/actions/security-for-github-actions/security-guides/using-secrets-in-github-actions?tool=webui#creating-secrets-for-a-repository) or [repository variables](https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#creating-configuration-variables-for-a-repository) to customize its behavior.
The customization options you can set are:
| **Attribute name** | **Type** | **Purpose** | **Example** |
| -------------------------------- | -------- | --------------------------------------------------------------------------------------------------- | -------------------------------------------------- |
| `LLM_MODEL` | Variable | Set the LLM to use with OpenHands | `LLM_MODEL="anthropic/claude-3-5-sonnet-20241022"` |
| `OPENHANDS_MAX_ITER` | Variable | Set max limit for agent iterations | `OPENHANDS_MAX_ITER=10` |
| `OPENHANDS_MACRO` | Variable | Customize default macro for invoking the resolver | `OPENHANDS_MACRO=@resolveit` |
| `OPENHANDS_BASE_CONTAINER_IMAGE` | Variable | Custom Sandbox ([learn more](https://docs.all-hands.dev/modules/usage/how-to/custom-sandbox-guide)) | `OPENHANDS_BASE_CONTAINER_IMAGE="custom_image"` |
| `TARGET_BRANCH` | Variable | Merge to branch other than `main` | `TARGET_BRANCH="dev"` |
| **Attribute name** | **Type** | **Purpose** | **Example** |
|----------------------------------| -------- |-------------------------------------------------------------------------------------------------------------|------------------------------------------------------|
| `LLM_MODEL` | Variable | Set the LLM to use with OpenHands | `LLM_MODEL="anthropic/claude-3-5-sonnet-20241022"` |
| `OPENHANDS_MAX_ITER` | Variable | Set max limit for agent iterations | `OPENHANDS_MAX_ITER=10` |
| `OPENHANDS_MACRO` | Variable | Customize default macro for invoking the resolver | `OPENHANDS_MACRO=@resolveit` |
| `OPENHANDS_BASE_CONTAINER_IMAGE` | Variable | Custom Sandbox ([learn more](https://docs.all-hands.dev/modules/usage/how-to/custom-sandbox-guide)) | `OPENHANDS_BASE_CONTAINER_IMAGE="custom_image"` |

View File

@@ -32,7 +32,7 @@ To run OpenHands in Headless mode with Docker:
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -43,7 +43,7 @@ docker run -it \
-v ~/.openhands-state:/.openhands-state \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.24 \
docker.all-hands.dev/all-hands-ai/openhands:0.21 \
python -m openhands.core.main -t "write a bash script that prints hi"
```

View File

@@ -43,10 +43,6 @@
- General: `Use the WSL 2 based engine` is enabled.
- Resources > WSL Integration: `Enable integration with my default WSL distro` is enabled.
:::note
The docker command below to start the app must be run inside the WSL terminal.
:::
</details>
## Start the App
@@ -54,17 +50,17 @@
The easiest way to run OpenHands is in Docker.
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ~/.openhands-state:/.openhands-state \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.24
docker.all-hands.dev/all-hands-ai/openhands:0.21
```
You'll find OpenHands running at http://localhost:3000!

View File

@@ -16,7 +16,7 @@ some flags being passed to `docker run` that make this possible:
```
docker run # ...
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.24-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
-v /var/run/docker.sock:/var/run/docker.sock \
# ...
```

2077
docs/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -22,8 +22,8 @@
"@mdx-js/react": "^3.1.0",
"clsx": "^2.0.0",
"prism-react-renderer": "^2.4.1",
"react": "^19.0.0",
"react-dom": "^19.0.0",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-icons": "^5.4.0",
"react-use": "^17.6.0"
},
@@ -31,7 +31,7 @@
"@docusaurus/module-type-aliases": "^3.5.1",
"@docusaurus/tsconfig": "^3.7.0",
"@docusaurus/types": "^3.5.1",
"typescript": "~5.7.3"
"typescript": "~5.7.2"
},
"browserslist": {
"production": [

View File

@@ -8,7 +8,7 @@ function CustomFooter() {
<footer className="custom-footer">
<div className="footer-content">
<div className="footer-icons">
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2ypg5jweb-d~6hObZDbXi_HEL8PDrbHg" target="_blank" rel="noopener noreferrer">
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw" target="_blank" rel="noopener noreferrer">
<FaSlack />
</a>
<a href="https://discord.gg/ESHStjSjD4" target="_blank" rel="noopener noreferrer">

View File

@@ -23,7 +23,7 @@ export function HomepageHeader() {
<a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" /></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License" /></a>
<br/>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2ypg5jweb-d~6hObZDbXi_HEL8PDrbHg"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2wkh4pklz-w~h_DVDtEe9H5kyQlcNxVw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
<a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community" /></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits" /></a>
<br/>

View File

@@ -69,7 +69,6 @@ def get_config(
base_container_image='python:3.12-bookworm',
enable_auto_lint=False,
use_host_network=False,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -53,7 +53,6 @@ def get_config(
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_init_timeout=3600,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -61,7 +61,6 @@ def get_config(
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_init_timeout=1800,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -67,7 +67,6 @@ def get_config(
base_container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
enable_auto_lint=True,
use_host_network=False,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -80,7 +80,6 @@ def get_config(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -45,7 +45,6 @@ def get_config(
base_container_image='python:3.12-bookworm',
enable_auto_lint=False,
use_host_network=False,
remote_runtime_enable_retries=True,
),
workspace_base=None,
workspace_mount_path=None,

View File

@@ -135,7 +135,6 @@ def get_config(
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_init_timeout=3600,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -71,7 +71,6 @@ def get_config(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -56,7 +56,6 @@ def get_config(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -49,7 +49,6 @@ def get_config(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -70,7 +70,6 @@ def get_config(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -91,7 +91,6 @@ def get_config(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -55,7 +55,6 @@ def get_config(
enable_auto_lint=True,
use_host_network=False,
runtime_extra_deps='$OH_INTERPRETER_PATH -m pip install scitools-pyke',
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -70,7 +70,6 @@ def get_config(
remote_runtime_init_timeout=1800,
keep_runtime_alive=False,
timeout=120,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -113,7 +113,6 @@ def get_config(
enable_auto_lint=True,
use_host_network=False,
runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -73,7 +73,6 @@ def get_config(
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -412,17 +412,6 @@ if __name__ == '__main__':
with open(metadata_filepath, 'r') as metadata_file:
data = metadata_file.read()
metadata = EvalMetadata.model_validate_json(data)
else:
# Initialize with a dummy metadata when file doesn't exist
metadata = EvalMetadata(
agent_class="dummy_agent", # Placeholder agent class
llm_config=LLMConfig(model="dummy_model"), # Minimal LLM config
max_iterations=1, # Minimal iterations
eval_output_dir=os.path.dirname(args.input_file), # Use input file dir as output dir
start_time=time.strftime('%Y-%m-%d %H:%M:%S'), # Current time
git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('utf-8').strip(), # Current commit
dataset=args.dataset # Dataset name from args
)
# The evaluation harness constrains the signature of `process_instance_func` but we need to
# pass extra information. Build a new function object to avoid issues with multiprocessing.

View File

@@ -7,7 +7,6 @@ This file tracks the resource requirements of different instances.
import json
import os
from openhands.core.logger import openhands_logger as logger
CUR_DIR = os.path.dirname(os.path.abspath(__file__))

View File

@@ -67,11 +67,11 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
'<uploaded_files>\n'
f'/workspace/{workspace_dir_name}\n'
'</uploaded_files>\n'
f"I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
f'<issue_description>\n'
f"I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following PR description:\n\n"
f'<pr_description>\n'
f'{instance.problem_statement}\n'
'</issue_description>\n\n'
'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
'</pr_description>\n\n'
'Can you help me implement the necessary changes to the repository so that the requirements specified in the <pr_description> are met?\n'
"I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
'Your task is to make the minimal changes to non-tests files in the /workspace directory to ensure the <pr_description> is satisfied.\n'
'Follow these steps to resolve the issue:\n'
@@ -139,12 +139,10 @@ def get_config(
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_init_timeout=3600,
remote_runtime_api_timeout=120,
remote_runtime_resource_factor=get_instance_resource_factor(
dataset_name=metadata.dataset,
instance_id=instance['instance_id'],
),
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -15,25 +15,11 @@ parser.add_argument(
action='store_true',
help='Show visualization paths for failed instances',
)
parser.add_argument(
'--only-x-instances',
action='store_true',
help='Only show instances that are ran by X',
)
args = parser.parse_args()
df1 = pd.read_json(args.input_file_1, orient='records', lines=True)
df2 = pd.read_json(args.input_file_2, orient='records', lines=True)
if args.only_x_instances:
instance_ids_1 = set(df1['instance_id'].tolist())
print(
f'Before removing instances not in X={args.input_file_1}: Y={df2.shape[0]} instances'
)
df2 = df2[df2['instance_id'].isin(instance_ids_1)]
print(
f'After removing instances not in X={args.input_file_1}: Y={df2.shape[0]} instances'
)
# Get the intersection of the instance_ids
df = pd.merge(df1, df2, on='instance_id', how='inner')
@@ -100,7 +86,7 @@ repo_diffs = []
for repo in all_repos:
x_count = len(x_only_by_repo.get(repo, []))
y_count = len(y_only_by_repo.get(repo, []))
diff = y_count - x_count
diff = abs(x_count - y_count)
repo_diffs.append((repo, diff))
# Sort by diff (descending) and then by repo name
@@ -120,13 +106,7 @@ for repo, diff in repo_diffs:
repo_color = 'red' if is_significant else 'yellow'
print(f"\n{colored(repo, repo_color, attrs=['bold'])}:")
print(
colored(
f'Difference: {diff} instances! (Larger diff = Y better)',
repo_color,
attrs=['bold'],
)
)
print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))
print(colored(f'X resolved but Y failed: ({len(x_instances)} instances)', 'green'))
if x_instances:
print(' ' + str(x_instances))

View File

@@ -4,7 +4,6 @@
import argparse
import json
import os
from glob import glob
import pandas as pd
from tqdm import tqdm
@@ -21,7 +20,6 @@ output_md_folder = args.oh_output_file.replace('.jsonl', '.viz')
print(f'Converting {args.oh_output_file} to markdown files in {output_md_folder}')
oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
output_dir = os.path.dirname(args.oh_output_file)
swebench_eval_file = args.oh_output_file.replace('.jsonl', '.swebench_eval.jsonl')
if os.path.exists(swebench_eval_file):
@@ -59,172 +57,22 @@ def convert_history_to_str(history):
return ret
# Load trajectories for resolved instances
def load_completions(instance_id: str):
global output_dir
glob_path = os.path.join(output_dir, 'llm_completions', instance_id, '*.json')
files = sorted(glob(glob_path)) # this is ascending order
# pick the last file (last turn)
try:
file_path = files[-1]
except IndexError:
# print(f'No files found for instance {instance_id}: files={files}')
return None
with open(file_path, 'r') as f:
result = json.load(f)
# create messages
messages = result['messages']
messages.append(result['response']['choices'][0]['message'])
tools = result['kwargs']['tools']
return {
'messages': messages,
'tools': tools,
}
def _convert_content(content) -> str:
ret = ''
if isinstance(content, list):
for item in content:
assert item['type'] == 'text', 'Only text is supported for now'
ret += f'{item["text"]}\n'
else:
assert isinstance(content, str), 'Only str is supported for now'
ret = content
return ret
def convert_tool_call_to_string(tool_call: dict) -> str:
"""Convert tool call to content in string format."""
if 'function' not in tool_call:
raise ValueError("Tool call must contain 'function' key.")
if 'id' not in tool_call:
raise ValueError("Tool call must contain 'id' key.")
if 'type' not in tool_call:
raise ValueError("Tool call must contain 'type' key.")
if tool_call['type'] != 'function':
raise ValueError("Tool call type must be 'function'.")
ret = f"<function={tool_call['function']['name']}>\n"
try:
args = json.loads(tool_call['function']['arguments'])
except json.JSONDecodeError as e:
raise ValueError(
f"Failed to parse arguments as JSON. Arguments: {tool_call['function']['arguments']}"
) from e
for param_name, param_value in args.items():
is_multiline = isinstance(param_value, str) and '\n' in param_value
ret += f'<parameter={param_name}>'
if is_multiline:
ret += '\n'
ret += f'{param_value}'
if is_multiline:
ret += '\n'
ret += '</parameter>\n'
ret += '</function>'
return ret
def format_traj(traj, first_n_turns=None, last_n_turns=None) -> str:
output = ''
system_message = None
# Handle system message if present
if traj[0]['role'] == 'system':
system_message = traj[0]
traj = traj[1:]
content = _convert_content(system_message['content'])
output += "*** System Message that describes the assistant's behavior ***\n"
output += f'{content}\n'
# Merge consecutive user messages first
merged_traj = []
current_messages = []
n_turns = len(traj)
for i, message in enumerate(traj):
# Skip this message if...
if (
# Case 1: first_n_turns specified and we're past it
(first_n_turns is not None and i >= first_n_turns and last_n_turns is None)
or
# Case 2: last_n_turns specified and we're before it
(
last_n_turns is not None
and i < n_turns - last_n_turns
and first_n_turns is None
)
or
# Case 3: both specified and we're in the middle section
(
first_n_turns is not None
and last_n_turns is not None
and i >= first_n_turns
and i < n_turns - last_n_turns
)
):
continue
if message['role'] == 'user':
current_messages.append(message)
else:
if current_messages:
# Merge all accumulated user messages into one
merged_content = '\n'.join(
_convert_content(msg['content']) for msg in current_messages
)
merged_traj.append({'role': 'user', 'content': merged_content})
current_messages = []
merged_traj.append(message)
# Don't forget to handle any remaining user messages
if current_messages:
merged_content = '\n'.join(
_convert_content(msg['content']) for msg in current_messages
)
merged_traj.append({'role': 'user', 'content': merged_content})
# Now process the merged trajectory
for i, message in enumerate(merged_traj):
role, content = message['role'], message['content']
content = _convert_content(content) if isinstance(content, list) else content
turn_id = i // 2 + 1
output += '-' * 100 + '\n'
output += f'*** Turn {turn_id} - {role.upper() if role != "tool" else "TOOL EXECUTION RESULT"} ***\n'
if role == 'user':
output += f'{content}\n'
elif role == 'tool':
output += f'{content}\n'
elif role == 'assistant':
output += f'{content}\n'
if (
'tool_calls' in message
and message['tool_calls'] is not None
and len(message['tool_calls']) > 0
):
for toolcall_id, tool_call in enumerate(message['tool_calls']):
output += f'### Tool Call {toolcall_id}\n'
output += f'{convert_tool_call_to_string(tool_call)}\n'
else:
raise ValueError(f'Unexpected role: {role}')
output += '-' * 100 + '\n'
return output
def write_row_to_md_file(row, instance_id_to_test_result):
if 'git_patch' in row:
model_patch = row['git_patch']
elif 'test_result' in row and 'git_patch' in row['test_result']:
model_patch = row['test_result']['git_patch']
else:
print(f'Row {row} does not have a git_patch')
return
raise ValueError(f'Row {row} does not have a git_patch')
test_output = None
# Use result from output.jsonl FIRST if available.
if 'report' in row and row['report'] is not None:
if row['instance_id'] in instance_id_to_test_result:
report = instance_id_to_test_result[row['instance_id']].get('report', {})
resolved = report.get('resolved', False)
test_output = instance_id_to_test_result[row['instance_id']].get(
'test_output', None
)
elif 'report' in row and row['report'] is not None:
if not isinstance(row['report'], dict):
resolved = None
print(
@@ -232,12 +80,6 @@ def write_row_to_md_file(row, instance_id_to_test_result):
)
else:
resolved = row['report'].get('resolved', False)
elif row['instance_id'] in instance_id_to_test_result:
report = instance_id_to_test_result[row['instance_id']].get('report', {})
resolved = report.get('resolved', False)
test_output = instance_id_to_test_result[row['instance_id']].get(
'test_output', None
)
else:
resolved = None
@@ -246,8 +88,6 @@ def write_row_to_md_file(row, instance_id_to_test_result):
os.makedirs(output_md_folder, exist_ok=True)
filepath = os.path.join(output_md_folder, filename)
completions = load_completions(instance_id)
with open(filepath, 'w') as f:
f.write(f'# {instance_id} (resolved: {resolved})\n')
@@ -257,12 +97,7 @@ def write_row_to_md_file(row, instance_id_to_test_result):
f.write(json.dumps(row['metadata'], indent=2))
f.write('\n```\n')
# Completion
if completions is not None:
f.write('## Completion\n')
traj = completions['messages']
f.write(format_traj(traj))
# Trajectory
f.write('## History\n')
f.write(convert_history_to_str(row['history']))

View File

@@ -207,13 +207,12 @@ with open(args.input_file, 'r') as infile:
for line in tqdm(infile, desc='Checking for changes'):
data = json.loads(line)
instance_id = data['instance_id']
current_report = data.get('report', {})
new_report = instance_id_to_status[
instance_id
] # if no report, it's not resolved
if current_report != new_report:
needs_update = True
break
if instance_id in instance_id_to_status:
current_report = data.get('report', {})
new_report = instance_id_to_status[instance_id]
if current_report != new_report:
needs_update = True
break
if not needs_update:
print('No updates detected. Skipping file update.')
@@ -235,5 +234,6 @@ with open(args.input_file + '.bak', 'r') as infile, open(
for line in tqdm(infile, desc='Updating output file'):
data = json.loads(line)
instance_id = data['instance_id']
data['report'] = instance_id_to_status[instance_id]
if instance_id in instance_id_to_status:
data['report'] = instance_id_to_status[instance_id]
outfile.write(json.dumps(data) + '\n')

View File

@@ -76,7 +76,7 @@ echo "Running SWE-bench evaluation"
echo "=============================================================="
RUN_ID=$(date +"%Y%m%d_%H%M%S")
N_PROCESS=4
N_PROCESS=16
if [ -z "$INSTANCE_ID" ]; then
echo "Running SWE-bench evaluation on the whole input file..."
@@ -87,7 +87,7 @@ if [ -z "$INSTANCE_ID" ]; then
--dataset_name "$DATASET_NAME" \
--split "$SPLIT" \
--predictions_path $SWEBENCH_FORMAT_JSONL \
--timeout 3600 \
--timeout 1800 \
--cache_level instance \
--max_workers $N_PROCESS \
--run_id $RUN_ID
@@ -133,7 +133,7 @@ else
--dataset_name "$DATASET_NAME" \
--split "$SPLIT" \
--predictions_path $SWEBENCH_FORMAT_JSONL \
--timeout 3600 \
--timeout 1800 \
--instance_ids $INSTANCE_ID \
--cache_level instance \
--max_workers $N_PROCESS \

View File

@@ -17,14 +17,11 @@ When the `run_infer.sh` script is started, it will automatically pull all task i
```bash
./evaluation/benchmarks/the_agent_company/scripts/run_infer.sh \
--agent-llm-config <agent-llm-config, default to 'agent'> \
--env-llm-config <env-llm-config, default to 'env'> \
--outputs-path <outputs-path, default to outputs> \
--server-hostname <server-hostname, default to localhost> \
--version <version, default to 1.0.0> \
--start-percentile <integer from 0 to 99, default to 0> \
--end-percentile <integer from 1 to 100, default to 100>
--agent-llm-config <agent-llm-config> \
--env-llm-config <env-llm-config> \
--outputs-path <outputs-path> \
--server-hostname <server-hostname> \
--version <version>
# Example
./evaluation/benchmarks/the_agent_company/scripts/run_infer.sh \
@@ -32,9 +29,7 @@ When the `run_infer.sh` script is started, it will automatically pull all task i
--env-llm-config claude-3-5-sonnet-20240620 \
--outputs-path outputs \
--server-hostname localhost \
--version 1.0.0 \
--start-percentile 10 \
--end-percentile 20
--version 1.0.0
```
- `agent-llm-config`: the config name for the agent LLM. This should match the config name in config.toml. This is the LLM used by the agent (e.g. CodeActAgent).
@@ -42,11 +37,7 @@ When the `run_infer.sh` script is started, it will automatically pull all task i
- `outputs-path`: the path to save trajectories and evaluation results.
- `server-hostname`: the hostname of the server that hosts all the web services. It could be localhost if you are running the evaluation and services on the same machine. If the services are hosted on a remote machine, you must use the hostname of the remote machine rather than IP address.
- `version`: the version of the task images to use. Currently, the only supported version is 1.0.0.
- `start-percentile`: the start percentile of the task split, must be an integer between 0 to 99.
- `end-percentile`: the end percentile of the task split, must be an integer between 1 to 100 and larger than start-percentile.
The script is idempotent. If you run it again, it will resume from the last checkpoint. It would usually take 2 days to finish evaluation if you run the whole task set.
To speed up evaluation, you can use `start-percentile` and `end-percentile` to split the tasks for higher parallelism,
provided concurrent runs are **targeting different servers**.
The script is idempotent. If you run it again, it will resume from the last checkpoint. It would usually take a few days to finish evaluation.
Note: the script will automatically skip a task if it encounters an error. This usually happens when the OpenHands runtime dies due to some unexpected errors. This means even if the script finishes, it might not have evaluated all tasks. You can manually resume the evaluation by running the script again.

View File

@@ -50,7 +50,6 @@ def get_config(
# large enough timeout, since some testcases take very long to run
timeout=300,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_enable_retries=True,
),
# we mount trajectories path so that trajectories, generated by OpenHands
# controller, can be accessible to the evaluator file in the runtime container

View File

@@ -56,14 +56,6 @@ while [[ $# -gt 0 ]]; do
VERSION="$2"
shift 2
;;
--start-percentile)
START_PERCENTILE="$2"
shift 2
;;
--end-percentile)
END_PERCENTILE="$2"
shift 2
;;
*)
echo "Unknown argument: $1"
exit 1
@@ -77,53 +69,16 @@ if [[ ! "$OUTPUTS_PATH" = /* ]]; then
OUTPUTS_PATH="$(cd "$(dirname "$OUTPUTS_PATH")" 2>/dev/null && pwd)/$(basename "$OUTPUTS_PATH")"
fi
: "${START_PERCENTILE:=0}" # Default to 0 percentile (first line)
: "${END_PERCENTILE:=100}" # Default to 100 percentile (last line)
# Validate percentile ranges if provided
if ! [[ "$START_PERCENTILE" =~ ^[0-9]+$ ]] || ! [[ "$END_PERCENTILE" =~ ^[0-9]+$ ]]; then
echo "Error: Percentiles must be integers"
exit 1
fi
if [ "$START_PERCENTILE" -ge "$END_PERCENTILE" ]; then
echo "Error: Start percentile must be less than end percentile"
exit 1
fi
if [ "$START_PERCENTILE" -lt 0 ] || [ "$END_PERCENTILE" -gt 100 ]; then
echo "Error: Percentiles must be between 0 and 100"
exit 1
fi
echo "Using agent LLM config: $AGENT_LLM_CONFIG"
echo "Using environment LLM config: $ENV_LLM_CONFIG"
echo "Outputs path: $OUTPUTS_PATH"
echo "Server hostname: $SERVER_HOSTNAME"
echo "Version: $VERSION"
echo "Start Percentile: $START_PERCENTILE"
echo "End Percentile: $END_PERCENTILE"
echo "Downloading tasks.md..."
rm -f tasks.md
wget https://github.com/TheAgentCompany/TheAgentCompany/releases/download/${VERSION}/tasks.md
total_lines=$(cat tasks.md | grep "ghcr.io/theagentcompany" | wc -l)
if [ "$total_lines" -ne 175 ]; then
echo "Error: Expected 175 tasks in tasks.md but found $total_lines lines"
exit 1
fi
# Calculate line numbers based on percentiles
start_line=$(echo "scale=0; ($total_lines * $START_PERCENTILE / 100) + 1" | bc)
end_line=$(echo "scale=0; $total_lines * $END_PERCENTILE / 100" | bc)
echo "Using tasks No. $start_line to $end_line (inclusive) out of 1-175 tasks"
# Create a temporary file with just the desired range
temp_file="tasks_${START_PERCENTILE}_${END_PERCENTILE}.md"
sed -n "${start_line},${end_line}p" tasks.md > "$temp_file"
while IFS= read -r task_image; do
docker pull $task_image
@@ -153,8 +108,8 @@ while IFS= read -r task_image; do
docker images "ghcr.io/all-hands-ai/runtime" -q | xargs -r docker rmi -f
docker volume prune -f
docker system prune -f
done < "$temp_file"
done < tasks.md
rm tasks.md "$temp_file"
rm tasks.md
echo "All evaluation completed successfully!"

View File

@@ -50,7 +50,6 @@ def get_config(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -0,0 +1,674 @@
from collections import Counter
from copy import deepcopy
from difflib import SequenceMatcher
from io import BytesIO
from bs4 import BeautifulSoup, Comment, NavigableString, Tag
import cv2
import numpy as np
import torch
from colormath.color_conversions import convert_color
from colormath.color_diff import delta_e_cie2000
from colormath.color_objects import LabColor, sRGBColor
from PIL import Image, ImageChops, ImageColor
from scipy.optimize import linear_sum_assignment
from transformers import CLIPModel, CLIPProcessor
from openhands.core.logger import openhands_logger as logger
def calculate_similarity(block1, block2):
"""Calculate text similarity between two blocks using SequenceMatcher."""
text_similarity = SequenceMatcher(None, block1['text'], block2['text']).ratio()
return text_similarity
def adjust_cost_for_context(cost_matrix, consecutive_bonus=1.0, window_size=20):
"""Adjust cost matrix by considering context similarity."""
if window_size <= 0:
return cost_matrix
n, m = cost_matrix.shape
adjusted_cost_matrix = np.copy(cost_matrix)
for i in range(n):
for j in range(m):
if adjusted_cost_matrix[i][j] >= -0.5:
continue
nearby_matrix = cost_matrix[
max(0, i - window_size) : min(n, i + window_size + 1),
max(0, j - window_size) : min(m, j + window_size + 1),
]
flattened_array = nearby_matrix.flatten()
sorted_array = np.sort(flattened_array)[::-1]
sorted_array = np.delete(
sorted_array, np.where(sorted_array == cost_matrix[i, j])[0][0]
)
top_k_elements = sorted_array[-window_size * 2 :]
bonus = consecutive_bonus * np.sum(top_k_elements)
adjusted_cost_matrix[i][j] += bonus
return adjusted_cost_matrix
def create_cost_matrix(A, B):
"""Create cost matrix for block matching."""
n = len(A)
m = len(B)
cost_matrix = np.zeros((n, m))
for i in range(n):
for j in range(m):
cost_matrix[i, j] = -calculate_similarity(A[i], B[j])
return cost_matrix
def calculate_distance_max_1d(x1, y1, x2, y2):
"""Calculate maximum 1D distance between points."""
return max(abs(x2 - x1), abs(y2 - y1))
def calculate_ratio(h1, h2):
"""Calculate ratio between two heights."""
return max(h1, h2) / min(h1, h2)
def rgb_to_lab(rgb):
"""Convert RGB color to Lab color space."""
rgb_color = sRGBColor(rgb[0], rgb[1], rgb[2], is_upscaled=True)
lab_color = convert_color(rgb_color, LabColor)
return lab_color
def color_similarity_ciede2000(rgb1, rgb2):
"""Calculate color similarity using CIEDE2000 formula."""
lab1 = rgb_to_lab(rgb1)
lab2 = rgb_to_lab(rgb2)
delta_e = delta_e_cie2000(lab1, lab2)
similarity = max(0, 1 - (delta_e / 100))
return similarity
def merge_blocks_wo_check(block1, block2):
"""Merge two blocks without additional checks."""
merged_text = block1['text'] + ' ' + block2['text']
x_min = min(block1['bbox'][0], block2['bbox'][0])
y_min = min(block1['bbox'][1], block2['bbox'][1])
x_max = max(
block1['bbox'][0] + block1['bbox'][2], block2['bbox'][0] + block2['bbox'][2]
)
y_max = max(
block1['bbox'][1] + block1['bbox'][3], block2['bbox'][1] + block2['bbox'][3]
)
merged_bbox = (x_min, y_min, x_max - x_min, y_max - y_min)
merged_color = tuple(
(color1 + color2) // 2
for color1, color2 in zip(block1['color'], block2['color'])
)
return {'text': merged_text, 'bbox': merged_bbox, 'color': merged_color}
def find_maximum_matching(A, B, consecutive_bonus, window_size):
"""Find maximum matching between two sets of blocks."""
cost_matrix = create_cost_matrix(A, B)
cost_matrix = adjust_cost_for_context(cost_matrix, consecutive_bonus, window_size)
row_ind, col_ind = linear_sum_assignment(cost_matrix)
current_cost = cost_matrix[row_ind, col_ind].tolist()
return list(zip(row_ind, col_ind)), current_cost, cost_matrix
def remove_indices(lst, indices):
"""Remove indices from list in reverse order."""
for index in sorted(indices, reverse=True):
if index < len(lst):
lst.pop(index)
return lst
def merge_blocks_by_list(blocks, merge_list):
"""Merge blocks according to merge list."""
pop_list = []
while merge_list:
i = merge_list[0][0]
j = merge_list[0][1]
blocks[i] = merge_blocks_wo_check(blocks[i], blocks[j])
pop_list.append(j)
merge_list.pop(0)
if merge_list:
new_merge_list = []
for k in range(len(merge_list)):
if (
merge_list[k][0] != i
and merge_list[k][1] != i
and merge_list[k][0] != j
and merge_list[k][1] != j
):
new_merge_list.append(merge_list[k])
merge_list = new_merge_list
remove_indices(blocks, pop_list)
return blocks
def difference_of_means(list1, list2):
"""Calculate difference of means between two lists."""
counter1 = Counter(list1)
counter2 = Counter(list2)
for element in set(list1) & set(list2):
common_count = min(counter1[element], counter2[element])
counter1[element] -= common_count
counter2[element] -= common_count
unique_list1 = [item for item in counter1.elements()]
unique_list2 = [item for item in counter2.elements()]
mean_list1 = sum(unique_list1) / len(unique_list1) if unique_list1 else 0
mean_list2 = sum(unique_list2) / len(unique_list2) if unique_list2 else 0
if mean_list1 - mean_list2 > 0:
if min(unique_list1) > min(unique_list2):
return mean_list1 - mean_list2
return 0.0
return mean_list1 - mean_list2
def find_possible_merge(A, B, consecutive_bonus, window_size, debug=False):
"""Find possible merges between blocks."""
merge_bonus = 0.0
merge_windows = 1
def sortFn(value):
return value[2]
while True:
A_changed = False
B_changed = False
matching, current_cost, cost_matrix = find_maximum_matching(
A, B, merge_bonus, merge_windows
)
if len(A) >= 2:
merge_list = []
for i in range(len(A) - 1):
new_A = deepcopy(A)
new_A[i] = merge_blocks_wo_check(new_A[i], new_A[i + 1])
new_A.pop(i + 1)
updated_matching, updated_cost, _ = find_maximum_matching(
new_A, B, merge_bonus, merge_windows
)
diff = difference_of_means(current_cost, updated_cost)
if diff > 0.05:
merge_list.append([i, i + 1, diff])
merge_list.sort(key=sortFn, reverse=True)
if merge_list:
A_changed = True
A = merge_blocks_by_list(A, merge_list)
matching, current_cost, cost_matrix = find_maximum_matching(
A, B, merge_bonus, merge_windows
)
if len(B) >= 2:
merge_list = []
for i in range(len(B) - 1):
new_B = deepcopy(B)
new_B[i] = merge_blocks_wo_check(new_B[i], new_B[i + 1])
new_B.pop(i + 1)
updated_matching, updated_cost, _ = find_maximum_matching(
A, new_B, merge_bonus, merge_windows
)
diff = difference_of_means(current_cost, updated_cost)
if diff > 0.05:
merge_list.append([i, i + 1, diff])
merge_list.sort(key=sortFn, reverse=True)
if merge_list:
B_changed = True
B = merge_blocks_by_list(B, merge_list)
matching, current_cost, cost_matrix = find_maximum_matching(
A, B, merge_bonus, merge_windows
)
if not A_changed and not B_changed:
break
matching, _, _ = find_maximum_matching(A, B, consecutive_bonus, window_size)
return A, B, matching
def merge_blocks_by_bbox(blocks):
"""Merge blocks with same bounding box."""
merged_blocks = {}
for block in blocks:
bbox = tuple(block['bbox'])
if bbox in merged_blocks:
existing_block = merged_blocks[bbox]
existing_block['text'] += ' ' + block['text']
existing_block['color'] = [
(ec + c) / 2 for ec, c in zip(existing_block['color'], block['color'])
]
else:
merged_blocks[bbox] = block
return list(merged_blocks.values())
def mask_bounding_boxes_with_inpainting(image, bounding_boxes):
"""Mask bounding boxes in image using inpainting."""
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
mask = np.zeros(image_cv.shape[:2], dtype=np.uint8)
height, width = image_cv.shape[:2]
for bbox in bounding_boxes:
x_ratio, y_ratio, w_ratio, h_ratio = bbox
x = int(x_ratio * width)
y = int(y_ratio * height)
w = int(w_ratio * width)
h = int(h_ratio * height)
mask[y : y + h, x : x + w] = 255
inpainted_image = cv2.inpaint(image_cv, mask, 3, cv2.INPAINT_TELEA)
return Image.fromarray(cv2.cvtColor(inpainted_image, cv2.COLOR_BGR2RGB))
def rescale_and_mask(image, blocks):
"""Rescale image and mask blocks."""
if blocks:
image = mask_bounding_boxes_with_inpainting(image, blocks)
width, height = image.size
if width < height:
new_size = (width, width)
else:
new_size = (height, height)
return image.resize(new_size, Image.LANCZOS)
def calculate_clip_similarity(image1, image2, blocks1, blocks2):
"""Calculate CLIP similarity between two images."""
model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
# Mask and preprocess images
image1_masked = rescale_and_mask(image1, [block['bbox'] for block in blocks1])
image2_masked = rescale_and_mask(image2, [block['bbox'] for block in blocks2])
inputs = processor(
images=[image1_masked, image2_masked], return_tensors='pt', padding=True
)
inputs = {k: v.to(device) for k, v in inputs.items()}
# Calculate features and similarity
with torch.no_grad():
image_features = model.get_image_features(**inputs)
image_features1 = image_features[0].unsqueeze(0)
image_features2 = image_features[1].unsqueeze(0)
image_features1 /= image_features1.norm(dim=-1, keepdim=True)
image_features2 /= image_features2.norm(dim=-1, keepdim=True)
similarity = (image_features1 @ image_features2.T).item()
return similarity
def rgb_to_hex(rgb):
"""Convert an RGB tuple to hexadecimal format."""
return '{:02X}{:02X}{:02X}'.format(*rgb)
class ColorPool:
def __init__(self, offset=0):
color_values = list(range(10, 251, 16))
color_list = [((r + offset) % 256, (g + offset) % 256, (b + offset) % 256)
for r in color_values for g in color_values for b in color_values]
self.color_pool = [rgb_to_hex(color) for color in color_list]
def pop_color(self):
if self.color_pool:
return self.color_pool.pop()
else:
raise NotImplementedError
def process_html_str(html_str, offset=0):
"""Process HTML string to assign unique colors to text elements."""
soup = BeautifulSoup(html_str, 'html.parser')
def update_style(element, property_name, value):
important_value = f"{value} !important"
styles = element.attrs.get('style', '').split(';')
updated_styles = [s for s in styles if not s.strip().startswith(property_name) and len(s.strip()) > 0]
updated_styles.append(f"{property_name}: {important_value}")
element['style'] = '; '.join(updated_styles).strip()
# Set background color of all elements to transparent white
for element in soup.find_all(True):
update_style(element, 'background-color', 'rgba(255, 255, 255, 0.0)')
color_pool = ColorPool(offset)
text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span', 'a', 'b', 'li',
'table', 'td', 'th', 'button', 'footer', 'header', 'figcaption']
for tag in soup.find_all(text_tags):
color = f"#{color_pool.pop_color()}"
update_style(tag, 'color', color)
update_style(tag, 'opacity', '1.0')
return str(soup)
def similar(n1, n2):
"""Check if two numbers are similar within a threshold."""
return abs(n1 - n2) <= 8
def find_different_pixels(image1, image2):
"""Find pixels that differ between two images."""
if image1.size != image2.size:
logger.warning("Images are not the same size")
return None
image1 = image1.convert('RGB')
image2 = image2.convert('RGB')
pixels1 = image1.load()
pixels2 = image2.load()
different_pixels = []
for x in range(image1.size[0]):
for y in range(image1.size[1]):
r1, g1, b1 = pixels1[x, y]
r2, g2, b2 = pixels2[x, y]
if similar((r1 + 50) % 256, r2) and similar((g1 + 50) % 256, g2) and similar((b1 + 50) % 256, b2):
different_pixels.append((y, x))
return np.stack(different_pixels) if different_pixels else None
def extract_text_with_color(html_str):
"""Extract text and color information from HTML string."""
def get_color(tag):
if 'style' in tag.attrs:
styles = tag['style'].split(';')
color_style = [s for s in styles if 'color' in s and 'background-color' not in s]
if color_style:
color = color_style[-1].split(':')[1].strip().replace(" !important", "")
if color[0] == "#":
return color
else:
try:
if color.startswith('rgb'):
color = tuple(map(int, color[4:-1].split(',')))
else:
color = ImageColor.getrgb(color)
return '#{:02x}{:02x}{:02x}'.format(*color)
except ValueError:
logger.warning(f"Unable to identify or convert color: {color}")
return None
return None
def extract_text_recursive(element, parent_color='#000000'):
if isinstance(element, Comment):
return None
elif isinstance(element, NavigableString):
text = element.strip()
return (text, parent_color) if text else None
elif isinstance(element, Tag):
current_color = get_color(element) or parent_color
children_texts = filter(None, [extract_text_recursive(child, current_color)
for child in element.children])
return list(children_texts)
soup = BeautifulSoup(html_str, 'html.parser')
body = soup.body
return extract_text_recursive(body) if body else []
def flatten_tree(tree):
"""Flatten a nested tree structure into a list."""
flat_list = []
def flatten(node):
if isinstance(node, list):
for item in node:
flatten(item)
else:
flat_list.append(node)
flatten(tree)
return flat_list
def get_blocks_from_image_diff_pixels(image, html_text_color_tree, different_pixels):
"""Extract text blocks from image using color differences."""
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
x_w = image_cv.shape[0]
y_w = image_cv.shape[1]
def hex_to_bgr(hex_color):
hex_color = hex_color.lstrip('#')
rgb = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
return rgb[::-1]
def get_intersect(arr1, arr2):
arr1_reshaped = arr1.view([('', arr1.dtype)] * arr1.shape[1])
arr2_reshaped = arr2.view([('', arr2.dtype)] * arr2.shape[1])
common_rows = np.intersect1d(arr1_reshaped, arr2_reshaped)
return common_rows.view(arr1.dtype).reshape(-1, arr1.shape[1])
blocks = []
for item in html_text_color_tree:
try:
color = np.array(hex_to_bgr(item[1]), dtype="uint8")
except:
continue
lower = color - 4
upper = color + 4
mask = cv2.inRange(image_cv, lower, upper)
coords = np.column_stack(np.where(mask > 0))
coords = get_intersect(coords, different_pixels)
if coords.size == 0:
continue
x_min, y_min = np.min(coords, axis=0)
x_max, y_max = np.max(coords, axis=0)
# Get average color from original image
color_coords = coords.copy()
color_coords = color_coords[color_coords[:, 0] <= x_max]
color_coords = color_coords[color_coords[:, 1] <= y_max]
colors = [image_cv[x, y] for x, y in color_coords]
avg_color = tuple(map(int, np.mean(colors, axis=0)))[::-1] # Convert BGR to RGB
blocks.append({
'text': item[0].lower(),
'bbox': (y_min / y_w, x_min / x_w, (y_max - y_min + 1) / y_w, (x_max - x_min + 1) / x_w),
'color': avg_color
})
return blocks
def get_blocks_from_html(html_str, image1):
"""Extract text blocks from HTML and image."""
# Process HTML with two different color offsets
html_str_1 = process_html_str(html_str, offset=0)
html_str_2 = process_html_str(html_str, offset=50)
# Render both HTML versions to images
# TODO: Screenshot html_str_2
filter_color = (255, 0, 0)
image2 = Image.new("RGB", image1.size, filter_color)
# Find pixels that differ between the two rendered images
different_pixels = find_different_pixels(image1, image2)
if different_pixels is None:
logger.warning("Unable to get pixels with different colors")
return []
# Extract text and color information from HTML
html_text_color_tree = flatten_tree(extract_text_with_color(html_str_1))
try:
blocks = get_blocks_from_image_diff_pixels(image1, html_text_color_tree, different_pixels)
except Exception as e:
logger.warning(f"Unable to get blocks: {e}")
return []
return blocks
def evaluate(task, generated_img):
"""Evaluate generated image against reference image using multiple metrics."""
# Load reference image
post_image = task['post_image']
# Extract blocks from HTML and images
post_blocks = get_blocks_from_html(task['post_html'], post_image)
gen_blocks = get_blocks_from_html(task['gen_html'], generated_img)
print("block details", post_blocks, gen_blocks)
if not post_blocks or not gen_blocks:
# Fallback to basic CLIP and pixel comparison if no blocks available
clip_score = calculate_clip_similarity(post_image, generated_img, [], [])
logger.info(f'CLIP similarity score: {clip_score}')
# Pixel comparison
diff = ImageChops.difference(generated_img, post_image)
pixel_match = not diff.getbbox()
logger.info(
f"Pixel difference analysis: {'No difference' if pixel_match else 'Differences found'}"
)
return clip_score > 0.95 or pixel_match
# Merge blocks with same bounding boxes
post_blocks = merge_blocks_by_bbox(post_blocks)
gen_blocks = merge_blocks_by_bbox(gen_blocks)
# Find optimal block matching
consecutive_bonus, window_size = 0.1, 1
gen_blocks_m, post_blocks_m, matching = find_possible_merge(
gen_blocks, deepcopy(post_blocks), consecutive_bonus, window_size
)
# Filter matches with low similarity
filtered_matching = []
for i, j in matching:
text_similarity = calculate_similarity(gen_blocks_m[i], post_blocks_m[j])
if text_similarity >= 0.5:
filtered_matching.append([i, j, text_similarity])
matching = filtered_matching
if not matching:
logger.warning('No matching blocks found')
clip_score = calculate_clip_similarity(
post_image, generated_img, gen_blocks, post_blocks
)
return clip_score > 0.95
# Calculate metrics for matched blocks
indices1 = [item[0] for item in matching]
indices2 = [item[1] for item in matching]
# Calculate unmatched areas
unmatched_area_1 = sum(
block['bbox'][2] * block['bbox'][3]
for i, block in enumerate(gen_blocks_m)
if i not in indices1
)
unmatched_area_2 = sum(
block['bbox'][2] * block['bbox'][3]
for j, block in enumerate(post_blocks_m)
if j not in indices2
)
total_unmatched_area = unmatched_area_1 + unmatched_area_2
# Calculate metrics for matched blocks
matched_areas = []
text_scores = []
position_scores = []
color_scores = []
for i, j, text_similarity in matching:
# Area
block_area = (
gen_blocks_m[i]['bbox'][2] * gen_blocks_m[i]['bbox'][3]
+ post_blocks_m[j]['bbox'][2] * post_blocks_m[j]['bbox'][3]
)
matched_areas.append(block_area)
# Position similarity
position_similarity = 1 - calculate_distance_max_1d(
gen_blocks_m[i]['bbox'][0] + gen_blocks_m[i]['bbox'][2] / 2,
gen_blocks_m[i]['bbox'][1] + gen_blocks_m[i]['bbox'][3] / 2,
post_blocks_m[j]['bbox'][0] + post_blocks_m[j]['bbox'][2] / 2,
post_blocks_m[j]['bbox'][1] + post_blocks_m[j]['bbox'][3] / 2,
)
# Color similarity
color_similarity = color_similarity_ciede2000(
gen_blocks_m[i]['color'], post_blocks_m[j]['color']
)
text_scores.append(text_similarity)
position_scores.append(position_similarity)
color_scores.append(color_similarity)
# Calculate final scores
total_area = sum(matched_areas) + total_unmatched_area
size_score = sum(matched_areas) / total_area if total_area > 0 else 0
text_score = np.mean(text_scores) if text_scores else 0
position_score = np.mean(position_scores) if position_scores else 0
color_score = np.mean(color_scores) if color_scores else 0
clip_score = calculate_clip_similarity(
post_image, generated_img, gen_blocks, post_blocks
)
# Combine scores with equal weights
final_score = 0.2 * (
size_score + text_score + position_score + color_score + clip_score
)
logger.info('Evaluation scores:')
logger.info(f'- Size score: {size_score:.3f}')
logger.info(f'- Text score: {text_score:.3f}')
logger.info(f'- Position score: {position_score:.3f}')
logger.info(f'- Color score: {color_score:.3f}')
logger.info(f'- CLIP score: {clip_score:.3f}')
logger.info(f'- Final score: {final_score:.3f}')
return final_score > 0.8 # Consider it a match if final score > 80%
def png_to_bytes(png):
buffer = BytesIO()
png.save(buffer, format='PNG')
image_bytes = buffer.getvalue()
return image_bytes
def bytes_to_image(image_bytes):
"""Convert bytes to a Pillow Image object."""
return Image.open(BytesIO(image_bytes))
if __name__ == '__main__':
first_image = Image.open('./evaluation/visualcodebench/data/1/post.png')
image = Image.open('./evaluation/visualcodebench/data/1/prev.png')
html_file = open('./evaluation/visualcodebench/data/1/post/index.html', 'r')
first_html = html_file.read()
html_file.close()
html_file = open('./evaluation/visualcodebench/data/1/prev/index.html', 'r')
gen_html = html_file.read()
html_file.close()
sample = {'post_image': first_image, "post_html": first_html, "gen_html": gen_html}
evaluate(sample, image)

View File

@@ -0,0 +1,97 @@
import base64
import os
from io import BytesIO
import pandas as pd
from huggingface_hub import snapshot_download
from PIL import PngImagePlugin
from tqdm import tqdm
from openhands.core.logger import openhands_logger as logger
REPO_DOWNLOAD_DIR = (
'./evaluation/visualcodebench/' # Directory to store the downloaded repository
)
def download_repository():
"""
Download the entire repository from Hugging Face Hub.
This function clones the repository into REPO_DOWNLOAD_DIR.
"""
repo_id = 'rvmalhot/VisualCodeBench'
try:
logger.info(f"Downloading repository '{repo_id}'...")
snapshot_download(
repo_id=repo_id,
local_dir=REPO_DOWNLOAD_DIR,
repo_type='dataset',
ignore_patterns=None, # Download all files
)
logger.info(f"Repository downloaded to '{REPO_DOWNLOAD_DIR}'.")
except Exception as e:
logger.error(f"Error downloading repository '{repo_id}': {e}")
raise e
def format_task_dict(example):
instance_id = example['id']
prev_remote_path = os.path.join(REPO_DOWNLOAD_DIR, f'data/{instance_id}/prev')
post_remote_path = os.path.join(REPO_DOWNLOAD_DIR, f'data/{instance_id}/post')
# Check if 'prev' and 'post' directories exist
prev_exists = os.path.exists(prev_remote_path)
post_exists = os.path.exists(post_remote_path)
if prev_exists and post_exists:
skip = False
else:
skip = True
task = {
'instance_id': instance_id,
'prev_image': example['prev_image'],
'post_image': example['post_image'],
'changes': example['changes'],
'prev_code_files': example['prev_code_files'],
'post_code_files': example['post_code_files'],
'skip': skip,
}
return task
def prepare_visualcodebench(dataset):
logger.info('Processing dataset')
dataset_processed = []
for example in tqdm(dataset['train']):
formatted_example = format_task_dict(example)
if formatted_example['skip']:
continue
del formatted_example['skip']
dataset_processed.append(formatted_example)
return pd.DataFrame(dataset_processed)
def pil_image_to_base64(image: PngImagePlugin.PngImageFile) -> str:
"""
Converts a PIL image to a Base64-encoded string.
Parameters:
- image (PngImagePlugin.PngImageFile): The PIL image to convert.
Returns:
- str: The Base64-encoded string of the image.
"""
if not isinstance(image, PngImagePlugin.PngImageFile):
raise ValueError(
'The provided image is not a PIL.PngImagePlugin.PngImageFile instance.'
)
buffered = BytesIO()
image.save(buffered, format='PNG')
img_bytes = buffered.getvalue()
img_base64 = base64.b64encode(img_bytes).decode('utf-8')
base64_with_prefix = f'data:image/png;base64,{img_base64}'
return [base64_with_prefix]

View File

@@ -0,0 +1,247 @@
# FILE: run_infer.py
import asyncio
import os
import shutil
import tempfile
from functools import partial
import pandas as pd
from datasets import load_dataset
# from evaluation.benchmarks.visualcodebench.eval import capture_screenshot
from evaluation.benchmarks.visualcodebench.prepare import (
REPO_DOWNLOAD_DIR,
download_repository,
pil_image_to_base64,
prepare_visualcodebench,
)
from evaluation.utils.shared import (
EvalMetadata,
assert_and_raise,
codeact_user_response,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
)
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
SandboxConfig,
get_llm_config_arg,
)
from openhands.core.config.utils import parse_arguments
from openhands.core.logger import openhands_logger as logger # Import OpenHands logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action.commands import CmdRunAction
from openhands.events.action.message import MessageAction
from openhands.events.observation.commands import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync
# Define workspace and output directories
WORKSPACE_DIR = './workspace'
FAKE_RESPONSES = {
'CodeActAgent': partial(codeact_user_response, encapsulate_solution=True),
}
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='eventstream',
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
enable_auto_lint=True,
use_host_network=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
return config
def initialize_runtime(
runtime: Runtime,
instance: pd.Series, # this argument is not required
):
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
"""
logger.info('-' * 30)
logger.info('BEGIN Runtime Initialization Fn')
logger.info('-' * 30)
workspace_dir_name = instance['instance_id']
obs: CmdOutputObservation
action = CmdRunAction(command='mkdir -p /workspace/{workspace_dir_name}')
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to create /workspace/{workspace_dir_name}: {str(obs)}',
)
file_path = REPO_DOWNLOAD_DIR + f'data/{workspace_dir_name}/prev/index.html'
runtime.copy_to(file_path, f'/workspace/{workspace_dir_name}')
logger.info(f'Copied code file for instance {workspace_dir_name}')
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
)
logger.info('-' * 30)
logger.info('END Runtime Initialization Fn')
logger.info('-' * 30)
def complete_runtime(
runtime: Runtime,
instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name
) -> str:
# TODO: extract edited HTML file from agent workspace
# temp_zip = runtime.copy_from(f'/workspace/{instance.instance_id}')
# file_name = f'/workspace/{instance.instance_id}/index.html'
# with zipfile.ZipFile(temp_zip, 'r') as zip_ref:
# if file_name in zip_ref.namelist():
# with zip_ref.open(file_name) as file:
# file_content = file.read().decode('utf-8') # Decode bytes to string
# else:
# raise FileNotFoundError(f"'{file_name}' not found in the ZIP archive.")
with tempfile.TemporaryDirectory() as tmpdir:
src_folder = REPO_DOWNLOAD_DIR + f'data/{instance.instance_id}/post/'
shutil.copytree(src_folder, tmpdir, dirs_exist_ok=True)
# image = capture_screenshot(tmpdir)
# if image is not None:
# shutil.copy(os.path.join(tmpdir, 'final_screenshot.png'), REPO_DOWNLOAD_DIR)
def process_instance(
instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True
):
config = get_config(metadata)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
# =============================================
# build instruction
# =============================================
# Prepare instruction
instruction = (
f"Modify the HTML/CSS according to the following instruction:\n\n"
f"{instance['changes']}\n\n"
)
instruction += (
'IMPORTANT: You should ONLY interact with the environment provided '
'to you AND NEVER ASK FOR HUMAN HELP.\n'
)
# =============================================
# create sandbox and run the agent
# =============================================
runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
try:
initialize_runtime(runtime, instance=instance)
image_urls = pil_image_to_base64(instance['prev_image'])
action = MessageAction(content=instruction, image_urls=image_urls)
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=action,
runtime=runtime,
fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
)
)
if state is None:
raise ValueError('State should not be None.')
# =============================================
# result evaluation
# =============================================
return_val = complete_runtime(runtime, instance)
logger.info(f'Return value {return_val}')
finally:
runtime.close()
# TODO: return EVAL output
def main():
"""Main function to run the evaluation."""
# args = parse_args()
args = parse_arguments()
logger.info(f"\n{'='*80}\nStarting VisualCodeBench Evaluation\n{'='*80}")
logger.info(f'Agent: {args.agent_cls}')
logger.info(f'Model: {args.llm_config}')
logger.info(f'Max iterations: {args.max_iterations}')
logger.info(f'Eval limit: {args.eval_n_limit}')
logger.info(f'Num workers: {args.eval_num_workers}\n')
logger.info(f'Eval output: {args.eval_output_dir}\n')
# Step 1: Download the entire repository once
logger.info('Downloading repository...')
download_repository()
# Step 2: Load Dataset
logger.info('Loading dataset...')
dataset = load_dataset(REPO_DOWNLOAD_DIR)
# Step 3: Prepare dataset
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
logger.error(f'Could not find LLM config: {args.llm_config}')
raise ValueError(f'Could not find LLM config: {args.llm_config}')
metadata = make_metadata(
llm_config,
'VisualCodeBench',
args.agent_cls,
args.max_iterations,
args.eval_note,
'evaluation/output/',
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
dataset = prepare_visualcodebench(dataset)
instances = prepare_dataset(dataset, output_file, eval_n_limit=args.eval_n_limit)
# Step 4: Run eval
run_evaluation(
instances, metadata, output_file, args.eval_num_workers, process_instance
)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,46 @@
#!/bin/bash
set -eo pipefail
source "evaluation/utils/version_control.sh"
# Check if required arguments are provided
if [ "$#" -lt 4 ]; then
echo "Usage: $0 [model_config] [commit_hash] [agent_cls] [eval_limit] [num_workers]"
echo "Example: $0 llm.eval_gpt_4o_mini HEAD CodeActAgent 5 1"
exit 1
fi
MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT_CLS=$3
EVAL_LIMIT=$4
NUM_WORKERS=${5:-1} # Default to 1 worker if not specified
# Checkout the specified commit
checkout_eval_branch
if [ -z "$AGENT" ]; then
echo "Agent not specified, use default CodeActAgent"
AGENT="CodeActAgent"
fi
get_openhands_version
echo "AGENT: $AGENT"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="export PYTHONPATH=evaluation/benchmarks/visualcodebench:\$PYTHONPATH && poetry run python evaluation/benchmarks/visualcodebench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 5 \
--eval-num-workers $NUM_WORKERS \
--eval-note $OPENHANDS_VERSION" \
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
# Run the command
eval $COMMAND

View File

@@ -0,0 +1,167 @@
import http
import os
import socket
import socketserver
import threading
import time
from io import BytesIO
import requests
from PIL import Image, ImageChops
from playwright.sync_api import sync_playwright
from openhands.core.logger import openhands_logger as logger
def get_free_port():
"""Find a free port to run the HTTP server."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0))
return s.getsockname()[1]
def start_http_server(tmpdir):
port = get_free_port()
class CustomHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
def translate_path(self, path):
# Serve files from the specified directory instead of the current working directory
path = super().translate_path(path)
relative_path = os.path.relpath(path, os.getcwd())
return os.path.join(tmpdir, relative_path)
handler = CustomHTTPRequestHandler
server = socketserver.TCPServer(('', port), handler)
return server, port
def capture_screenshot(tmpdir):
server, port = start_http_server(tmpdir)
server_thread = threading.Thread(target=server.serve_forever)
server_thread.daemon = True
server_thread.start()
time.sleep(10)
image = None
try:
server_url = f'http://localhost:{port}/'
if not is_server_reachable(server_url):
raise RuntimeError(f'Server not reachable at {server_url}')
screenshot_path = os.path.join(tmpdir, 'final_screenshot.png')
capture_screenshot_playwright(server_url, screenshot_path)
image = Image.open(screenshot_path)
image.load()
finally:
# Shut down the server and clean up
server.shutdown()
server.server_close()
return image
def is_server_reachable(url):
"""
Check if the local server is reachable.
"""
try:
response = requests.get(url, timeout=5) # Set a 5-second timeout
if response.status_code == 200:
logger.info(f'Server is reachable at {url}')
return True
else:
logger.warning(
f'Server responded with status code {response.status_code} at {url}'
)
return False
except requests.ConnectionError as e:
logger.error(f'Failed to connect to server at {url}: {e}')
return False
def capture_screenshot_playwright(url, screenshot_path):
"""Capture a screenshot of the given URL using Playwright."""
try:
with sync_playwright() as p:
logger.info('Launching browser...')
browser = p.chromium.launch(timeout=10000) # 10 seconds for browser launch
logger.info('Creating a new page...')
page = browser.new_page()
logger.info(f'Navigating to URL: {url}')
try:
page.goto(url, timeout=60 * 1000) # Set timeout to 5 seconds
logger.info('Page navigation completed.')
except Exception as e:
logger.warning(f'Page navigation timed out. {e}. Continuing...')
logger.info('Waiting for network to be idle...')
try:
page.wait_for_load_state(
'networkidle', timeout=60 * 1000
) # Set timeout to 5 seconds
logger.info('Page load state reached.')
except Exception as e:
logger.warning(f'Page load state timed out. {e}. Continuing...')
logger.info('Capturing screenshot...')
page.screenshot(
path=screenshot_path, full_page=True
) # Capture full page screenshot
logger.info(f'Screenshot saved to {screenshot_path}')
browser.close()
return True
except Exception as e:
logger.error(f'Error capturing screenshot with Playwright: {e}')
return False
def evaluate(task, screenshot_path):
"""Compare generated screenshot with post_image using CLIP score."""
try:
import torch
from transformers import CLIPModel, CLIPProcessor
# Load CLIP model and processor
model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
# Load images
post_image = Image.open(BytesIO(task['post_image']))
generated_img = Image.open(screenshot_path)
# Process images
inputs = processor(
images=[post_image, generated_img], return_tensors='pt', padding=True
)
# Get image features
image_features = model.get_image_features(**inputs)
# Calculate cosine similarity
similarity = torch.nn.functional.cosine_similarity(
image_features[0].unsqueeze(0), image_features[1].unsqueeze(0)
).item()
logger.info(f'CLIP similarity score: {similarity}')
return similarity > 0.95 # Consider it a match if similarity > 95%
except Exception as e:
logger.error(f'Error in CLIP evaluation: {e}')
# Fallback to pixel comparison if CLIP fails
try:
post_image = Image.open(BytesIO(task['post_image']))
generated_img = Image.open(screenshot_path)
# Compare images directly without converting to bytes
diff = ImageChops.difference(generated_img, post_image)
logger.info(
f"Pixel difference analysis: {'No difference' if not diff.getbbox() else 'Differences found'}"
)
return not diff.getbbox()
except Exception as ex:
logger.error(f'Error in fallback evaluation: {ex}')
return False

View File

@@ -79,7 +79,6 @@ def get_config(
'VWA_HOMEPAGE': f'{base_url}:4399',
},
timeout=300,
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -71,7 +71,6 @@ def get_config(
'MAP': f'{base_url}:3000',
'HOMEPAGE': f'{base_url}:4399',
},
remote_runtime_enable_retries=True,
),
# do not mount workspace
workspace_base=None,

View File

@@ -1,6 +1,6 @@
import { render, screen, waitFor } from "@testing-library/react";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { describe, it, expect } from "vitest";
import { describe, it, expect, test } from "vitest";
import { ChatMessage } from "#/components/features/chat/chat-message";
describe("ChatMessage", () => {
@@ -45,9 +45,7 @@ describe("ChatMessage", () => {
await user.click(copyToClipboardButton);
await waitFor(() =>
expect(navigator.clipboard.readText()).resolves.toBe("Hello, World!"),
);
expect(navigator.clipboard.readText()).resolves.toBe("Hello, World!");
});
it("should display an error toast if copying content to clipboard fails", async () => {});

View File

@@ -1,5 +1,4 @@
import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
import type { Message } from "#/message";
import { act, screen, waitFor, within } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { renderWithProviders } from "test-utils";

View File

@@ -2,7 +2,7 @@ import { describe, expect, it } from "vitest";
import { screen } from "@testing-library/react";
import { renderWithProviders } from "test-utils";
import { ExpandableMessage } from "#/components/features/chat/expandable-message";
import { vi } from "vitest"
import { vi } from "vitest";
vi.mock("react-i18next", async () => {
const actual = await vi.importActual("react-i18next");

View File

@@ -1,45 +0,0 @@
import userEvent from "@testing-library/user-event";
import { describe, expect, it, vi } from "vitest";
import { render, screen } from "@testing-library/react";
import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
import { AnalyticsConsentFormModal } from "#/components/features/analytics/analytics-consent-form-modal";
import OpenHands from "#/api/open-hands";
import { SettingsProvider } from "#/context/settings-context";
import { AuthProvider } from "#/context/auth-context";
describe("AnalyticsConsentFormModal", () => {
it("should call saveUserSettings with default settings on confirm reset settings", async () => {
const user = userEvent.setup();
const onCloseMock = vi.fn();
const saveUserSettingsSpy = vi.spyOn(OpenHands, "saveSettings");
render(<AnalyticsConsentFormModal onClose={onCloseMock} />, {
wrapper: ({ children }) => (
<AuthProvider>
<QueryClientProvider client={new QueryClient()}>
<SettingsProvider>{children}</SettingsProvider>
</QueryClientProvider>
</AuthProvider>
),
});
const confirmButton = screen.getByTestId("confirm-preferences");
await user.click(confirmButton);
expect(saveUserSettingsSpy).toHaveBeenCalledWith({
user_consents_to_analytics: true,
agent: "CodeActAgent",
confirmation_mode: false,
enable_default_condenser: false,
github_token: undefined,
language: "en",
llm_api_key: undefined,
llm_base_url: "",
llm_model: "anthropic/claude-3-5-sonnet-20241022",
remote_runtime_resource_factor: 1,
security_analyzer: "",
unset_github_token: undefined,
});
expect(onCloseMock).toHaveBeenCalled();
});
});

View File

@@ -1,14 +1,5 @@
import { render, screen, within } from "@testing-library/react";
import {
afterAll,
afterEach,
beforeAll,
describe,
expect,
it,
test,
vi,
} from "vitest";
import { afterEach, describe, expect, it, test, vi } from "vitest";
import userEvent from "@testing-library/user-event";
import { formatTimeDelta } from "#/utils/format-time-delta";
import { ConversationCard } from "#/components/features/conversation-panel/conversation-card";
@@ -20,18 +11,10 @@ describe("ConversationCard", () => {
const onChangeTitle = vi.fn();
const onDownloadWorkspace = vi.fn();
beforeAll(() => {
vi.stubGlobal("window", { open: vi.fn() });
});
afterEach(() => {
vi.clearAllMocks();
});
afterAll(() => {
vi.unstubAllGlobals();
});
it("should render the conversation card", () => {
render(
<ConversationCard
@@ -46,8 +29,9 @@ describe("ConversationCard", () => {
const expectedDate = `${formatTimeDelta(new Date("2021-10-01T12:00:00Z"))} ago`;
const card = screen.getByTestId("conversation-card");
const title = within(card).getByTestId("conversation-card-title");
within(card).getByText("Conversation 1");
expect(title).toHaveValue("Conversation 1");
within(card).getByText(expectedDate);
});
@@ -164,8 +148,10 @@ describe("ConversationCard", () => {
/>,
);
await clickOnEditButton(user);
const title = screen.getByTestId("conversation-card-title");
expect(title).toBeDisabled();
await clickOnEditButton(user);
expect(title).toBeEnabled();
expect(screen.queryByTestId("context-menu")).not.toBeInTheDocument();
@@ -178,6 +164,7 @@ describe("ConversationCard", () => {
expect(onChangeTitle).toHaveBeenCalledWith("New Conversation Name");
expect(title).toHaveValue("New Conversation Name");
expect(title).toBeDisabled();
});
it("should reset title and not call onChangeTitle when the title is empty", async () => {
@@ -204,27 +191,7 @@ describe("ConversationCard", () => {
expect(title).toHaveValue("Conversation 1");
});
test("clicking the title should trigger the onClick handler", async () => {
const user = userEvent.setup();
render(
<ConversationCard
onClick={onClick}
onDelete={onDelete}
isActive
onChangeTitle={onChangeTitle}
title="Conversation 1"
selectedRepository={null}
lastUpdatedAt="2021-10-01T12:00:00Z"
/>,
);
const title = screen.getByTestId("conversation-card-title");
await user.click(title);
expect(onClick).toHaveBeenCalled();
});
test("clicking the title should not trigger the onClick handler if edit mode", async () => {
test("clicking the title should not trigger the onClick handler", async () => {
const user = userEvent.setup();
render(
<ConversationCard
@@ -237,8 +204,6 @@ describe("ConversationCard", () => {
/>,
);
await clickOnEditButton(user);
const title = screen.getByTestId("conversation-card-title");
await user.click(title);

View File

@@ -179,10 +179,9 @@ describe("ConversationPanel", () => {
const user = userEvent.setup();
renderConversationPanel();
const cards = await screen.findAllByTestId("conversation-card");
const title = within(cards[0]).getByTestId("conversation-card-title");
const card = cards[0];
await clickOnEditButton(user, card);
const title = within(card).getByTestId("conversation-card-title");
await clickOnEditButton(user);
await user.clear(title);
await user.type(title, "Conversation 1 Renamed");
@@ -203,10 +202,7 @@ describe("ConversationPanel", () => {
const user = userEvent.setup();
renderConversationPanel();
const cards = await screen.findAllByTestId("conversation-card");
const card = cards[0];
await clickOnEditButton(user, card);
const title = within(card).getByTestId("conversation-card-title");
const title = within(cards[0]).getByTestId("conversation-card-title");
await user.click(title);
await user.tab();
@@ -231,7 +227,7 @@ describe("ConversationPanel", () => {
it("should call onClose after clicking a card", async () => {
renderConversationPanel();
const cards = await screen.findAllByTestId("conversation-card");
const firstCard = cards[1];
const firstCard = cards[0];
await userEvent.click(firstCard);

View File

@@ -1,16 +1,11 @@
import { screen, within } from "@testing-library/react";
import { UserEvent } from "@testing-library/user-event";
export const clickOnEditButton = async (
user: UserEvent,
container?: HTMLElement,
) => {
const wrapper = container ? within(container) : screen;
const ellipsisButton = wrapper.getByTestId("ellipsis-button");
export const clickOnEditButton = async (user: UserEvent) => {
const ellipsisButton = screen.getByTestId("ellipsis-button");
await user.click(ellipsisButton);
const menu = wrapper.getByTestId("context-menu");
const menu = screen.getByTestId("context-menu");
const editButton = within(menu).getByTestId("edit-button");
await user.click(editButton);

View File

@@ -3,24 +3,52 @@ import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { renderWithProviders } from "test-utils";
import { createRoutesStub } from "react-router";
import { AxiosError } from "axios";
import { Sidebar } from "#/components/features/sidebar/sidebar";
import { MULTI_CONVERSATION_UI } from "#/utils/feature-flags";
import OpenHands from "#/api/open-hands";
import { MOCK_USER_PREFERENCES } from "#/mocks/handlers";
// These tests will now fail because the conversation panel is rendered through a portal
// and technically not a child of the Sidebar component.
const RouterStub = createRoutesStub([
{
path: "/conversation/:conversationId",
Component: () => <Sidebar />,
},
]);
const renderSidebar = () => {
const RouterStub = createRoutesStub([
{
path: "/conversation/:conversationId",
Component: Sidebar,
},
]);
const renderSidebar = () =>
renderWithProviders(<RouterStub initialEntries={["/conversation/123"]} />);
};
describe("Sidebar", () => {
it.skipIf(!MULTI_CONVERSATION_UI)(
"should have the conversation panel open by default",
() => {
renderSidebar();
expect(screen.getByTestId("conversation-panel")).toBeInTheDocument();
},
);
it.skipIf(!MULTI_CONVERSATION_UI)(
"should toggle the conversation panel",
async () => {
const user = userEvent.setup();
renderSidebar();
const projectPanelButton = screen.getByTestId(
"toggle-conversation-panel",
);
await user.click(projectPanelButton);
expect(
screen.queryByTestId("conversation-panel"),
).not.toBeInTheDocument();
},
);
describe("Settings", () => {
const getSettingsSpy = vi.spyOn(OpenHands, "getSettings");
const saveSettingsSpy = vi.spyOn(OpenHands, "saveSettings");
@@ -48,12 +76,35 @@ describe("Sidebar", () => {
await user.click(saveButton);
expect(saveSettingsSpy).toHaveBeenCalledWith({
agent: "CodeActAgent",
confirmation_mode: false,
enable_default_condenser: false,
language: "en",
llm_model: "anthropic/claude-3-5-sonnet-20241022",
remote_runtime_resource_factor: 1,
...MOCK_USER_PREFERENCES.settings,
// the actual values are falsey (null or "") but we're checking for undefined
llm_api_key: undefined,
llm_base_url: undefined,
security_analyzer: undefined,
});
});
it("should send all settings data when saving account settings", async () => {
const user = userEvent.setup();
renderSidebar();
const userAvatar = screen.getByTestId("user-avatar");
await user.click(userAvatar);
const menu = screen.getByTestId("account-settings-context-menu");
const accountSettingsButton = within(menu).getByTestId(
"account-settings-button",
);
await user.click(accountSettingsButton);
const accountSettingsModal = screen.getByTestId("account-settings-form");
const saveButton =
within(accountSettingsModal).getByTestId("save-settings");
await user.click(saveButton);
expect(saveSettingsSpy).toHaveBeenCalledWith({
...MOCK_USER_PREFERENCES.settings,
llm_api_key: undefined, // null or undefined
});
});
@@ -83,25 +134,14 @@ describe("Sidebar", () => {
within(accountSettingsModal).getByLabelText(/GITHUB\$TOKEN_LABEL/i);
await user.type(tokenInput, "new-token");
const analyticsConsentInput =
within(accountSettingsModal).getByTestId("analytics-consent");
await user.click(analyticsConsentInput);
const saveButton =
within(accountSettingsModal).getByTestId("save-settings");
await user.click(saveButton);
expect(saveSettingsSpy).toHaveBeenCalledWith({
agent: "CodeActAgent",
confirmation_mode: false,
enable_default_condenser: false,
github_token: "new-token",
...MOCK_USER_PREFERENCES.settings,
language: "no",
llm_base_url: "",
llm_model: "anthropic/claude-3-5-sonnet-20241022",
remote_runtime_resource_factor: 1,
security_analyzer: "",
user_consents_to_analytics: true,
llm_api_key: undefined, // null or undefined
});
});
@@ -129,53 +169,11 @@ describe("Sidebar", () => {
await user.click(saveButton);
expect(saveSettingsSpy).toHaveBeenCalledWith({
agent: "CodeActAgent",
confirmation_mode: false,
enable_default_condenser: false,
language: "en",
...MOCK_USER_PREFERENCES.settings,
llm_api_key: undefined,
llm_base_url: "",
llm_model: "anthropic/claude-3-5-sonnet-20241022",
remote_runtime_resource_factor: 1,
security_analyzer: undefined,
});
});
});
describe("Settings Modal", () => {
it("should open the settings modal if the user clicks the settings button", async () => {
const user = userEvent.setup();
renderSidebar();
expect(screen.queryByTestId("ai-config-modal")).not.toBeInTheDocument();
const settingsButton = screen.getByTestId("settings-button");
await user.click(settingsButton);
const settingsModal = screen.getByTestId("ai-config-modal");
expect(settingsModal).toBeInTheDocument();
});
it("should open the settings modal if GET /settings fails with a 404", async () => {
const error = new AxiosError(
"Request failed with status code 404",
"ERR_BAD_REQUEST",
undefined,
undefined,
{
status: 404,
statusText: "Not Found",
data: { message: "Settings not found" },
headers: {},
// @ts-expect-error - we only need the response object for this test
config: {},
},
);
vi.spyOn(OpenHands, "getSettings").mockRejectedValue(error);
renderSidebar();
const settingsModal = await screen.findByTestId("ai-config-modal");
expect(settingsModal).toBeInTheDocument();
});
});
});

View File

@@ -14,7 +14,7 @@ describe("WaitlistModal", () => {
});
it("should render a tos checkbox that is unchecked by default", () => {
render(<WaitlistModal ghTokenIsSet={false} githubAuthUrl={null} />);
render(<WaitlistModal ghToken={null} githubAuthUrl={null} />);
const checkbox = screen.getByRole("checkbox");
expect(checkbox).not.toBeChecked();
@@ -22,7 +22,7 @@ describe("WaitlistModal", () => {
it("should only enable the GitHub button if the tos checkbox is checked", async () => {
const user = userEvent.setup();
render(<WaitlistModal ghTokenIsSet={false} githubAuthUrl={null} />);
render(<WaitlistModal ghToken={null} githubAuthUrl={null} />);
const checkbox = screen.getByRole("checkbox");
const button = screen.getByRole("button", { name: "Connect to GitHub" });
@@ -40,7 +40,7 @@ describe("WaitlistModal", () => {
);
const user = userEvent.setup();
render(<WaitlistModal ghTokenIsSet={false} githubAuthUrl="mock-url" />);
render(<WaitlistModal ghToken={null} githubAuthUrl="mock-url" />);
const checkbox = screen.getByRole("checkbox");
await user.click(checkbox);

View File

@@ -32,7 +32,7 @@ describe("FeedbackForm", () => {
screen.getByLabelText(I18nKey.FEEDBACK$PRIVATE_LABEL);
screen.getByLabelText(I18nKey.FEEDBACK$PUBLIC_LABEL);
screen.getByRole("button", { name: I18nKey.FEEDBACK$SHARE_LABEL });
screen.getByRole("button", { name: I18nKey.FEEDBACK$CONTRIBUTE_LABEL });
screen.getByRole("button", { name: I18nKey.FEEDBACK$CANCEL_LABEL });
});

View File

@@ -1,82 +0,0 @@
import { render, screen } from "@testing-library/react";
import { describe, it, expect } from "vitest";
import { Messages } from "#/components/features/chat/messages";
import type { Message } from "#/message";
describe("File Operations Messages", () => {
it("should show success indicator for successful file read operation", () => {
const messages: Message[] = [
{
type: "action",
translationID: "read_file_contents",
content: "Successfully read file contents",
success: true,
sender: "assistant",
timestamp: new Date().toISOString(),
},
];
render(<Messages messages={messages} isAwaitingUserConfirmation={false} />);
const statusIcon = screen.getByTestId("status-icon");
expect(statusIcon).toBeInTheDocument();
expect(statusIcon.closest("svg")).toHaveClass("fill-success");
});
it("should show failure indicator for failed file read operation", () => {
const messages: Message[] = [
{
type: "action",
translationID: "read_file_contents",
content: "Failed to read file contents",
success: false,
sender: "assistant",
timestamp: new Date().toISOString(),
},
];
render(<Messages messages={messages} isAwaitingUserConfirmation={false} />);
const statusIcon = screen.getByTestId("status-icon");
expect(statusIcon).toBeInTheDocument();
expect(statusIcon.closest("svg")).toHaveClass("fill-danger");
});
it("should show success indicator for successful file edit operation", () => {
const messages: Message[] = [
{
type: "action",
translationID: "edit_file_contents",
content: "Successfully edited file contents",
success: true,
sender: "assistant",
timestamp: new Date().toISOString(),
},
];
render(<Messages messages={messages} isAwaitingUserConfirmation={false} />);
const statusIcon = screen.getByTestId("status-icon");
expect(statusIcon).toBeInTheDocument();
expect(statusIcon.closest("svg")).toHaveClass("fill-success");
});
it("should show failure indicator for failed file edit operation", () => {
const messages: Message[] = [
{
type: "action",
translationID: "edit_file_contents",
content: "Failed to edit file contents",
success: false,
sender: "assistant",
timestamp: new Date().toISOString(),
},
];
render(<Messages messages={messages} isAwaitingUserConfirmation={false} />);
const statusIcon = screen.getByTestId("status-icon");
expect(statusIcon).toBeInTheDocument();
expect(statusIcon.closest("svg")).toHaveClass("fill-danger");
});
});

View File

@@ -4,7 +4,7 @@ import { useTranslation } from "react-i18next";
import translations from "../../src/i18n/translation.json";
import { UserAvatar } from "../../src/components/features/sidebar/user-avatar";
vi.mock("@heroui/react", () => ({
vi.mock("@nextui-org/react", () => ({
Tooltip: ({ content, children }: { content: string; children: React.ReactNode }) => (
<div>
{children}

View File

@@ -1,174 +0,0 @@
import { screen, waitFor } from "@testing-library/react";
import { afterEach, describe, expect, it, vi } from "vitest";
import userEvent from "@testing-library/user-event";
import { renderWithProviders } from "test-utils";
import { AccountSettingsModal } from "#/components/shared/modals/account-settings/account-settings-modal";
import { MOCK_DEFAULT_USER_SETTINGS } from "#/mocks/handlers";
import OpenHands from "#/api/open-hands";
import * as ConsentHandlers from "#/utils/handle-capture-consent";
describe("AccountSettingsModal", () => {
const getSettingsSpy = vi.spyOn(OpenHands, "getSettings");
const saveSettingsSpy = vi.spyOn(OpenHands, "saveSettings");
afterEach(() => {
vi.clearAllMocks();
});
it.skip("should set the appropriate user analytics consent default", async () => {
getSettingsSpy.mockResolvedValue({
...MOCK_DEFAULT_USER_SETTINGS,
user_consents_to_analytics: true,
});
renderWithProviders(<AccountSettingsModal onClose={() => {}} />);
const analyticsConsentInput = screen.getByTestId("analytics-consent");
await waitFor(() => expect(analyticsConsentInput).toBeChecked());
});
it("should save the users consent to analytics when saving account settings", async () => {
const user = userEvent.setup();
renderWithProviders(<AccountSettingsModal onClose={() => {}} />);
const analyticsConsentInput = screen.getByTestId("analytics-consent");
await user.click(analyticsConsentInput);
const saveButton = screen.getByTestId("save-settings");
await user.click(saveButton);
expect(saveSettingsSpy).toHaveBeenCalledWith({
agent: "CodeActAgent",
confirmation_mode: false,
enable_default_condenser: false,
language: "en",
llm_base_url: "",
llm_model: "anthropic/claude-3-5-sonnet-20241022",
remote_runtime_resource_factor: 1,
security_analyzer: "",
user_consents_to_analytics: true,
});
});
it("should call handleCaptureConsent with the analytics consent value if the save is successful", async () => {
const user = userEvent.setup();
const handleCaptureConsentSpy = vi.spyOn(
ConsentHandlers,
"handleCaptureConsent",
);
renderWithProviders(<AccountSettingsModal onClose={() => {}} />);
const analyticsConsentInput = screen.getByTestId("analytics-consent");
await user.click(analyticsConsentInput);
const saveButton = screen.getByTestId("save-settings");
await user.click(saveButton);
expect(handleCaptureConsentSpy).toHaveBeenCalledWith(true);
await user.click(analyticsConsentInput);
await user.click(saveButton);
expect(handleCaptureConsentSpy).toHaveBeenCalledWith(false);
});
it("should send all settings data when saving account settings", async () => {
const user = userEvent.setup();
renderWithProviders(<AccountSettingsModal onClose={() => {}} />);
const languageInput = screen.getByLabelText(/language/i);
await user.click(languageInput);
const norskOption = screen.getByText(/norsk/i);
await user.click(norskOption);
const tokenInput = screen.getByTestId("github-token-input");
await user.type(tokenInput, "new-token");
const saveButton = screen.getByTestId("save-settings");
await user.click(saveButton);
expect(saveSettingsSpy).toHaveBeenCalledWith({
agent: "CodeActAgent",
confirmation_mode: false,
enable_default_condenser: false,
language: "no",
github_token: "new-token",
llm_base_url: "",
llm_model: "anthropic/claude-3-5-sonnet-20241022",
remote_runtime_resource_factor: 1,
security_analyzer: "",
user_consents_to_analytics: false,
});
});
it("should render a checkmark and not the input if the github token is set", async () => {
getSettingsSpy.mockResolvedValue({
...MOCK_DEFAULT_USER_SETTINGS,
github_token_is_set: true,
});
renderWithProviders(<AccountSettingsModal onClose={() => {}} />);
await waitFor(() => {
const checkmark = screen.queryByTestId("github-token-set-checkmark");
const input = screen.queryByTestId("github-token-input");
expect(checkmark).toBeInTheDocument();
expect(input).not.toBeInTheDocument();
});
});
it("should send an unset github token property when pressing disconnect", async () => {
const user = userEvent.setup();
getSettingsSpy.mockResolvedValue({
...MOCK_DEFAULT_USER_SETTINGS,
github_token_is_set: true,
});
renderWithProviders(<AccountSettingsModal onClose={() => {}} />);
const disconnectButton = await screen.findByTestId("disconnect-github");
await user.click(disconnectButton);
expect(saveSettingsSpy).toHaveBeenCalledWith({
agent: "CodeActAgent",
confirmation_mode: false,
enable_default_condenser: false,
github_token: undefined,
language: "en",
llm_base_url: "",
llm_model: "anthropic/claude-3-5-sonnet-20241022",
remote_runtime_resource_factor: 1,
security_analyzer: "",
unset_github_token: true,
});
});
it("should not unset the github token when changing the language", async () => {
const user = userEvent.setup();
getSettingsSpy.mockResolvedValue({
...MOCK_DEFAULT_USER_SETTINGS,
github_token_is_set: true,
});
renderWithProviders(<AccountSettingsModal onClose={() => {}} />);
const languageInput = screen.getByLabelText(/language/i);
await user.click(languageInput);
const norskOption = screen.getByText(/norsk/i);
await user.click(norskOption);
const saveButton = screen.getByTestId("save-settings");
await user.click(saveButton);
expect(saveSettingsSpy).toHaveBeenCalledWith({
agent: "CodeActAgent",
confirmation_mode: false,
enable_default_condenser: false,
language: "no",
llm_base_url: "",
llm_model: "anthropic/claude-3-5-sonnet-20241022",
remote_runtime_resource_factor: 1,
security_analyzer: "",
user_consents_to_analytics: false,
});
});
});

View File

@@ -8,10 +8,9 @@ vi.mock("react-i18next", () => ({
useTranslation: () => ({
t: (key: string) => {
const translations: Record<string, string> = {
SUGGESTIONS$TODO_APP: "ToDoリストアプリを開発する",
LANDING$BUILD_APP_BUTTON: "プルリクエストを表示するアプリを開発する",
SUGGESTIONS$HACKER_NEWS:
"Hacker Newsのトップ記事を表示するbashスクリプトを作成する",
"SUGGESTIONS$TODO_APP": "ToDoリストアプリを開発する",
"LANDING$BUILD_APP_BUTTON": "プルリクエストを表示するアプリを開発する",
"SUGGESTIONS$HACKER_NEWS": "Hacker Newsのトップ記事を表示するbashスクリプトを作成する",
};
return translations[key] || key;
},
@@ -39,9 +38,9 @@ describe("SuggestionItem", () => {
value: "todo app value",
};
render(
<SuggestionItem suggestion={translatedSuggestion} onClick={onClick} />,
);
const { container } = render(<SuggestionItem suggestion={translatedSuggestion} onClick={onClick} />);
console.log('Rendered HTML:', container.innerHTML);
expect(screen.getByText("ToDoリストアプリを開発する")).toBeInTheDocument();
});

View File

@@ -58,7 +58,6 @@ describe("frontend/routes/_oh", () => {
it("should render and capture the user's consent if oss mode", async () => {
const user = userEvent.setup();
const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
const getSettingsSpy = vi.spyOn(OpenHands, "getSettings");
const handleCaptureConsentSpy = vi.spyOn(
CaptureConsent,
"handleCaptureConsent",
@@ -70,16 +69,12 @@ describe("frontend/routes/_oh", () => {
POSTHOG_CLIENT_KEY: "test-key",
});
// @ts-expect-error - We only care about the user_consents_to_analytics field
getSettingsSpy.mockResolvedValue({
user_consents_to_analytics: null,
});
renderWithProviders(<RouteStub />);
// The user has not consented to tracking
const consentForm = await screen.findByTestId("user-capture-consent-form");
expect(handleCaptureConsentSpy).not.toHaveBeenCalled();
expect(localStorage.getItem("analytics-consent")).toBeNull();
const submitButton = within(consentForm).getByRole("button", {
name: /confirm preferences/i,
@@ -88,6 +83,7 @@ describe("frontend/routes/_oh", () => {
// The user has now consented to tracking
expect(handleCaptureConsentSpy).toHaveBeenCalledWith(true);
expect(localStorage.getItem("analytics-consent")).toBe("true");
expect(
screen.queryByTestId("user-capture-consent-form"),
).not.toBeInTheDocument();
@@ -110,6 +106,17 @@ describe("frontend/routes/_oh", () => {
});
});
it("should not render the user consent form if the user has already made a decision", async () => {
localStorage.setItem("analytics-consent", "true");
renderWithProviders(<RouteStub />);
await waitFor(() => {
expect(
screen.queryByTestId("user-capture-consent-form"),
).not.toBeInTheDocument();
});
});
// TODO: Likely failing due to how tokens are now handled in context. Move to e2e tests
it.skip("should render a new project button if a token is set", async () => {
localStorage.setItem("token", "test-token");

View File

@@ -1,59 +0,0 @@
import { describe, it, expect, vi, beforeEach } from "vitest";
import { handleStatusMessage } from "#/services/actions";
import store from "#/store";
import { trackError } from "#/utils/error-handler";
// Mock dependencies
vi.mock("#/utils/error-handler", () => ({
trackError: vi.fn(),
}));
vi.mock("#/store", () => ({
default: {
dispatch: vi.fn(),
},
}));
describe("Actions Service", () => {
beforeEach(() => {
vi.clearAllMocks();
});
describe("handleStatusMessage", () => {
it("should dispatch info messages to status state", () => {
const message = {
type: "info",
message: "Runtime is not available",
id: "runtime.unavailable",
status_update: true as const,
};
handleStatusMessage(message);
expect(store.dispatch).toHaveBeenCalledWith(expect.objectContaining({
payload: message,
}));
});
it("should log error messages and display them in chat", () => {
const message = {
type: "error",
message: "Runtime connection failed",
id: "runtime.connection.failed",
status_update: true as const,
};
handleStatusMessage(message);
expect(trackError).toHaveBeenCalledWith({
message: "Runtime connection failed",
source: "chat",
metadata: { msgId: "runtime.connection.failed" },
});
expect(store.dispatch).toHaveBeenCalledWith(expect.objectContaining({
payload: message,
}));
});
});
});

View File

@@ -1,165 +0,0 @@
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import { trackError, showErrorToast, showChatError } from "#/utils/error-handler";
import posthog from "posthog-js";
import toast from "react-hot-toast";
import * as Actions from "#/services/actions";
vi.mock("posthog-js", () => ({
default: {
captureException: vi.fn(),
},
}));
vi.mock("react-hot-toast", () => ({
default: {
error: vi.fn(),
},
}));
vi.mock("#/services/actions", () => ({
handleStatusMessage: vi.fn(),
}));
describe("Error Handler", () => {
beforeEach(() => {
vi.clearAllMocks();
});
afterEach(() => {
vi.clearAllMocks();
});
describe("trackError", () => {
it("should send error to PostHog with basic info", () => {
const error = {
message: "Test error",
source: "test",
};
trackError(error);
expect(posthog.captureException).toHaveBeenCalledWith(new Error("Test error"), {
error_source: "test",
});
});
it("should include additional metadata in PostHog event", () => {
const error = {
message: "Test error",
source: "test",
metadata: {
extra: "info",
details: { foo: "bar" },
},
};
trackError(error);
expect(posthog.captureException).toHaveBeenCalledWith(new Error("Test error"), {
error_source: "test",
extra: "info",
details: { foo: "bar" },
});
});
});
describe("showErrorToast", () => {
it("should log error and show toast", () => {
const error = {
message: "Toast error",
source: "toast-test",
};
showErrorToast(error);
// Verify PostHog logging
expect(posthog.captureException).toHaveBeenCalledWith(new Error("Toast error"), {
error_source: "toast-test",
});
// Verify toast was shown
expect(toast.error).toHaveBeenCalled();
});
it("should include metadata in PostHog event when showing toast", () => {
const error = {
message: "Toast error",
source: "toast-test",
metadata: { context: "testing" },
};
showErrorToast(error);
expect(posthog.captureException).toHaveBeenCalledWith(new Error("Toast error"), {
error_source: "toast-test",
context: "testing",
});
});
it("should log errors from different sources with appropriate metadata", () => {
// Test agent status error
showErrorToast({
message: "Agent error",
source: "agent-status",
metadata: { id: "error.agent" },
});
expect(posthog.captureException).toHaveBeenCalledWith(new Error("Agent error"), {
error_source: "agent-status",
id: "error.agent",
});
showErrorToast({
message: "Server error",
source: "server",
metadata: { error_code: 500, details: "Internal error" },
});
expect(posthog.captureException).toHaveBeenCalledWith(new Error("Server error"), {
error_source: "server",
error_code: 500,
details: "Internal error",
});
});
it("should log feedback submission errors with conversation context", () => {
const error = new Error("Feedback submission failed");
showErrorToast({
message: error.message,
source: "feedback",
metadata: { conversationId: "123", error },
});
expect(posthog.captureException).toHaveBeenCalledWith(new Error("Feedback submission failed"), {
error_source: "feedback",
conversationId: "123",
error,
});
});
});
describe("showChatError", () => {
it("should log error and show chat error message", () => {
const error = {
message: "Chat error",
source: "chat-test",
msgId: "123",
};
showChatError(error);
// Verify PostHog logging
expect(posthog.captureException).toHaveBeenCalledWith(new Error("Chat error"), {
error_source: "chat-test",
});
// Verify error message was shown in chat
expect(Actions.handleStatusMessage).toHaveBeenCalledWith({
type: "error",
message: "Chat error",
id: "123",
status_update: true,
});
});
});
});

View File

@@ -0,0 +1,28 @@
import { beforeEach, describe, expect, it, vi, type Mock } from "vitest";
import { getCachedConfig } from "../../src/utils/storage";
describe("getCachedConfig", () => {
beforeEach(() => {
// Clear all instances and calls to constructor and all methods
Storage.prototype.getItem = vi.fn();
});
it("should return an empty object when local storage is null or undefined", () => {
(Storage.prototype.getItem as Mock).mockReturnValue(null);
expect(getCachedConfig()).toEqual({});
(Storage.prototype.getItem as Mock).mockReturnValue(undefined);
expect(getCachedConfig()).toEqual({});
});
it("should return an empty object when local storage has invalid JSON", () => {
(Storage.prototype.getItem as Mock).mockReturnValue("invalid JSON");
expect(getCachedConfig()).toEqual({});
});
it("should return parsed object when local storage has valid JSON", () => {
const validJSON = '{"key":"value"}';
(Storage.prototype.getItem as Mock).mockReturnValue(validJSON);
expect(getCachedConfig()).toEqual({ key: "value" });
});
});

File diff suppressed because it is too large Load Diff

View File

@@ -1,33 +1,33 @@
{
"name": "openhands-frontend",
"version": "0.24.0",
"version": "0.21.0",
"private": true,
"type": "module",
"engines": {
"node": ">=20.0.0"
},
"dependencies": {
"@heroui/react": "2.6.14",
"@monaco-editor/react": "^4.7.0-rc.0",
"@react-router/node": "^7.1.5",
"@react-router/serve": "^7.1.5",
"@nextui-org/react": "^2.6.11",
"@react-router/node": "^7.1.2",
"@react-router/serve": "^7.1.2",
"@react-types/shared": "^3.27.0",
"@reduxjs/toolkit": "^2.5.1",
"@tanstack/react-query": "^5.66.0",
"@reduxjs/toolkit": "^2.5.0",
"@tanstack/react-query": "^5.64.1",
"@vitejs/plugin-react": "^4.3.2",
"@xterm/addon-fit": "^0.10.0",
"@xterm/xterm": "^5.4.0",
"axios": "^1.7.9",
"clsx": "^2.1.1",
"eslint-config-airbnb-typescript": "^18.0.0",
"framer-motion": "^12.4.2",
"i18next": "^24.2.2",
"framer-motion": "^12.0.1",
"i18next": "^24.2.1",
"i18next-browser-languagedetector": "^8.0.2",
"i18next-http-backend": "^3.0.2",
"isbot": "^5.1.22",
"i18next-http-backend": "^3.0.1",
"isbot": "^5.1.21",
"jose": "^5.9.4",
"monaco-editor": "^0.52.2",
"posthog-js": "^1.216.0",
"posthog-js": "^1.207.0",
"react": "^19.0.0",
"react-dom": "^19.0.0",
"react-highlight": "^0.15.0",
@@ -36,21 +36,21 @@
"react-icons": "^5.4.0",
"react-markdown": "^9.0.3",
"react-redux": "^9.2.0",
"react-router": "^7.1.5",
"react-router": "^7.1.2",
"react-syntax-highlighter": "^15.6.1",
"react-textarea-autosize": "^8.5.7",
"remark-gfm": "^4.0.1",
"remark-gfm": "^4.0.0",
"sirv-cli": "^3.0.0",
"socket.io-client": "^4.8.1",
"tailwind-merge": "^3.0.1",
"vite": "^6.1.0",
"tailwind-merge": "^2.6.0",
"vite": "^5.4.11",
"web-vitals": "^3.5.2",
"ws": "^8.18.0"
},
"scripts": {
"dev": "npm run make-i18n && cross-env VITE_MOCK_API=false react-router dev",
"dev:mock": "npm run make-i18n && cross-env VITE_MOCK_API=true react-router dev",
"build": "npm run make-i18n && npm run typecheck && react-router build",
"build": "npm run make-i18n && tsc && react-router build",
"start": "npx sirv-cli build/ --single",
"test": "vitest run",
"test:e2e": "playwright test",
@@ -77,23 +77,23 @@
},
"devDependencies": {
"@mswjs/socket.io-binding": "^0.1.1",
"@playwright/test": "^1.50.1",
"@react-router/dev": "^7.1.5",
"@playwright/test": "^1.49.1",
"@react-router/dev": "^7.1.2",
"@tailwindcss/typography": "^0.5.16",
"@tanstack/eslint-plugin-query": "^5.66.1",
"@tanstack/eslint-plugin-query": "^5.64.2",
"@testing-library/dom": "^10.4.0",
"@testing-library/jest-dom": "^6.6.1",
"@testing-library/react": "^16.2.0",
"@testing-library/user-event": "^14.6.1",
"@types/node": "^22.13.1",
"@types/react": "^19.0.8",
"@testing-library/user-event": "^14.6.0",
"@types/node": "^22.10.7",
"@types/react": "^19.0.7",
"@types/react-dom": "^19.0.3",
"@types/react-highlight": "^0.12.8",
"@types/react-syntax-highlighter": "^15.5.13",
"@types/ws": "^8.5.14",
"@types/ws": "^8.5.12",
"@typescript-eslint/eslint-plugin": "^7.18.0",
"@typescript-eslint/parser": "^7.18.0",
"@vitest/coverage-v8": "^3.0.5",
"@vitest/coverage-v8": "^3.0.2",
"autoprefixer": "^10.4.20",
"cross-env": "^7.0.3",
"eslint": "^8.57.0",
@@ -107,7 +107,7 @@
"eslint-plugin-react-hooks": "^4.6.2",
"husky": "^9.1.6",
"jsdom": "^26.0.0",
"lint-staged": "^15.4.3",
"lint-staged": "^15.4.1",
"msw": "^2.6.6",
"postcss": "^8.5.1",
"prettier": "^3.4.2",

Some files were not shown because too many files have changed in this diff Show More