Compare commits

..

2 Commits

Author SHA1 Message Date
Xingyao Wang
312ace79fa Merge branch 'main' into openhands-fix-issue-8682 2025-06-03 11:41:53 -04:00
openhands
f52a653eb4 Fix issue #8682: [Bug]: Terminal tab not showing content until page refresh 2025-05-24 14:50:01 +00:00
430 changed files with 7808 additions and 18970 deletions

View File

@@ -12,8 +12,4 @@
"ghcr.io/devcontainers/features/node:1": {},
},
"postCreateCommand": ".devcontainer/setup.sh",
"runArgs": ["--add-host=host.docker.internal:host-gateway"],
"containerEnv": {
"DOCKER_HOST_ADDR": "host.docker.internal"
},
}

4
.devcontainer/setup.sh Executable file → Normal file
View File

@@ -1,9 +1,5 @@
#!/bin/bash
# Mark the current repository as safe for Git to prevent "dubious ownership" errors,
# which can occur in containerized environments when directory ownership doesn't match the current user.
git config --global --add safe.directory "$(realpath .)"
# Install `nc`
sudo apt update && sudo apt install netcat -y

View File

@@ -1,23 +1,5 @@
# NodeJS
frontend/node_modules
# Configuration (except pyproject.toml)
*.ini
*.toml
!pyproject.toml
*.yml
# Documentation (except README.md)
*.md
!README.md
# Hidden files and directories
.*
__pycache__
# Unneeded files and directories
/dev_config/
/docs/
/evaluation/
/tests/
CITATION.cff
config.toml
.envrc
.env
.git

2
.github/CODEOWNERS vendored
View File

@@ -5,7 +5,7 @@
/frontend/ @rbren @amanape
# Evaluation code owners
/evaluation/ @xingyaoww @neubig
/evaluation/ @xingyaoww @neubig
# Documentation code owners
/docs/ @mamoodi

View File

@@ -33,7 +33,6 @@ body:
- Docker command in README
- GitHub resolver
- Development workflow
- CLI
- app.all-hands.dev
- Other
default: 0

View File

@@ -16,6 +16,7 @@ updates:
mcp-packages:
patterns:
- "mcp"
- "mcpm"
security-all:
applies-to: "security-updates"
patterns:
@@ -72,9 +73,3 @@ updates:
directory: "/"
schedule:
interval: "weekly"
- package-ecosystem: "docker"
directories:
- "containers/*"
schedule:
interval: "weekly"

View File

@@ -293,7 +293,7 @@ jobs:
- name: Install poetry via pipx
run: pipx install poetry
- name: Install Python dependencies using Poetry
run: make install-python-dependencies INSTALL_PLAYWRIGHT=0
run: make install-python-dependencies POETRY_GROUP=main,test,runtime INSTALL_PLAYWRIGHT=0
- name: Run docker runtime tests
run: |
# We install pytest-xdist in order to run tests across CPUs
@@ -313,8 +313,6 @@ jobs:
TEST_IN_CI=true \
RUN_AS_OPENHANDS=false \
poetry run pytest -n 7 -raRs --reruns 2 --reruns-delay 5 -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py --durations=10
env:
DEBUG: "1"
# Run unit tests with the Docker runtime Docker images as openhands user
test_runtime_oh:
@@ -380,8 +378,6 @@ jobs:
TEST_IN_CI=true \
RUN_AS_OPENHANDS=true \
poetry run pytest -n 7 -raRs --reruns 2 --reruns-delay 5 -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py --durations=10
env:
DEBUG: "1"
# The two following jobs (named identically) check whether all the runtime tests have passed, because
# "All Runtime Tests Passed" is a required job for PRs to merge

View File

@@ -54,7 +54,7 @@ jobs:
Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
- name: Install Python dependencies using Poetry
run: poetry install --with dev,test,runtime
run: poetry install --without evaluation
- name: Configure config.toml for testing with Haiku
env:

View File

@@ -74,7 +74,7 @@ jobs:
- name: Fix python lint issues
run: |
# Run all pre-commit hooks and continue even if they modify files (exit code 1)
pre-commit run --config ./dev_config/python/.pre-commit-config.yaml --all-files || true
pre-commit run --config ./dev_config/python/.pre-commit-config.yaml --files openhands/**/* evaluation/**/* tests/**/* || true
# Commit and push changes if any
- name: Check for changes

View File

@@ -53,7 +53,7 @@ jobs:
- name: Install pre-commit
run: pip install pre-commit==3.7.0
- name: Run pre-commit hooks
run: pre-commit run --all-files --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
run: pre-commit run --files openhands/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
# Check version consistency across documentation
check-version-consistency:

View File

@@ -44,7 +44,7 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: 'poetry'
- name: Install Python dependencies using Poetry
run: poetry install --with dev,test,runtime
run: poetry install --without evaluation
- name: Build Environment
run: make build
- name: Run Unit Tests
@@ -71,13 +71,8 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: 'poetry'
- name: Install Python dependencies using Poetry
run: poetry install --with dev,test,runtime
run: poetry install --without evaluation
- name: Run Windows unit tests
run: poetry run pytest -svv tests/unit/test_windows_bash.py
env:
DEBUG: "1"
- name: Run Windows runtime tests with LocalRuntime
run: $env:TEST_RUNTIME="local"; poetry run pytest -svv tests/runtime/test_bash.py
env:
TEST_RUNTIME: local
DEBUG: "1"

1
.gitignore vendored
View File

@@ -166,6 +166,7 @@ cython_debug/
# https://stackoverflow.com/questions/32964920/should-i-commit-the-vscode-folder-to-source-control
.vscode/**/*
!.vscode/extensions.json
!.vscode/launch.json
!.vscode/settings.json
!.vscode/tasks.json

View File

@@ -5,14 +5,6 @@ This repository contains the code for OpenHands, an automated AI software engine
To set up the entire repo, including frontend and backend, run `make build`.
You don't need to do this unless the user asks you to, or if you're trying to run the entire application.
## Running OpenHands with OpenHands:
To run the full application to debug issues:
```bash
export INSTALL_DOCKER=0
export RUNTIME=local
make build && make run FRONTEND_PORT=12000 FRONTEND_HOST=0.0.0.0 BACKEND_HOST=0.0.0.0 &> /tmp/openhands-log.txt &
```
IMPORTANT: Before making any changes to the codebase, ALWAYS run `make install-pre-commit-hooks` to ensure pre-commit hooks are properly installed.
Before pushing any changes, you MUST ensure that any lint errors or simple test errors have been fixed.
@@ -52,13 +44,7 @@ Frontend:
- Available variables: VITE_BACKEND_HOST, VITE_USE_TLS, VITE_INSECURE_SKIP_VERIFY, VITE_FRONTEND_PORT
- Internationalization:
- Generate i18n declaration file: `npm run make-i18n`
- Data Fetching & Cache Management:
- We use TanStack Query (fka React Query) for data fetching and cache management
- Data Access Layer: API client methods are located in `frontend/src/api` and should never be called directly from UI components - they must always be wrapped with TanStack Query
- Custom hooks are located in `frontend/src/hooks/query/` and `frontend/src/hooks/mutation/`
- Query hooks should follow the pattern use[Resource] (e.g., `useConversationMicroagents`)
- Mutation hooks should follow the pattern use[Action] (e.g., `useDeleteConversation`)
- Architecture rule: UI components → TanStack Query hooks → Data Access Layer (`frontend/src/api`) → API endpoints
## Template for Github Pull Request

View File

@@ -103,29 +103,6 @@ components or interface enhancements.
make start-frontend
```
### 5. Running OpenHands with OpenHands
You can use OpenHands to develop and improve OpenHands itself! This is a powerful way to leverage AI assistance for contributing to the project.
#### Quick Start
1. **Build and run OpenHands:**
```bash
export INSTALL_DOCKER=0
export RUNTIME=local
make build && make run
```
2. **Access the interface:**
- Local development: http://localhost:3001
- Remote/cloud environments: Use the appropriate external URL
3. **Configure for external access (if needed):**
```bash
# For external access (e.g., cloud environments)
make run FRONTEND_PORT=12000 FRONTEND_HOST=0.0.0.0 BACKEND_HOST=0.0.0.0
```
### 6. LLM Debugging
If you encounter any issues with the Language Model (LM) or you're simply curious, export DEBUG=1 in the environment and restart the backend.
@@ -159,7 +136,7 @@ poetry run pytest ./tests/unit/test_*.py
To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker
container image by setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.44-nikolaik`
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.40-nikolaik`
## Develop inside Docker container

View File

@@ -1,91 +0,0 @@
# MCP CLI Runtime Implementation Summary
## What Was Implemented
**Phase 1: HTTP/SSE Support** - Successfully implemented MCP action support in CLI Runtime with maximum code reuse from existing infrastructure.
### Key Features Implemented
1. **MCP Action Execution**: `call_tool_mcp()` method that handles MCP actions
2. **Configuration Management**: `get_mcp_config()` method that loads MCP config from multiple sources
3. **Error Handling**: Proper Windows platform checks and error reporting
4. **Code Reuse**: ~80% code reuse from `action_execution_client.py` patterns
### Configuration Sources (in order of precedence)
1. **OpenHands Config**: If your OpenHands config already has MCP settings
2. **Environment Variables**: For programmatic configuration
3. **User Config File**: `~/.openhands/config.toml` (completely optional)
4. **Default Empty Config**: If no configuration is found
### Technical Implementation
- **Reused Infrastructure**: Uses existing `MCPClient`, `create_mcp_clients`, `call_tool_mcp` from utils
- **Consistent Patterns**: Same error handling, logging, and platform checks as other runtimes
- **TOML Loading**: Uses OpenHands standard `toml` library and `MCPConfig.from_toml_section()`
- **No Dependencies**: No new dependencies added
## Configuration Examples
### User Config File (`~/.openhands/config.toml`)
```toml
[mcp]
# SSE Servers - External servers that communicate via Server-Sent Events
sse_servers = [
# Basic SSE server with just a URL
"http://localhost:3000/mcp",
# SSE server with API key authentication
{url="https://secure-example.com/mcp", api_key="your-api-key"}
]
# Note: stdio_servers are not yet supported in CLI Runtime (Phase 2)
```
### Environment Variables
```bash
export OPENHANDS_MCP_SSE_SERVERS='[{"url":"http://localhost:3000/mcp"}]'
```
## Usage
```python
from openhands.runtime.impl.cli import CLIRuntime
from openhands.events.action import MCPAction
# Create runtime
runtime = CLIRuntime(config=your_config)
# Execute MCP action
action = MCPAction(server_name="your-server", tool_name="your-tool", arguments={})
result = await runtime.call_tool_mcp(action)
```
## What's Next (Phase 2)
- **Stdio MCP Client Implementation**: Support for local process-based MCP servers
- **Process Management**: Handle stdio server lifecycle
- **Enhanced Configuration**: Auto-discovery of localhost MCP servers
## Compatibility
- **Backward Compatible**: Existing CLI runtime functionality unchanged
- **Cross-Platform**: Works on Windows, macOS, Linux (Windows has MCP disabled)
- **Optional Config**: Works without any configuration files
- **Docker Alternative**: Provides MCP support without Docker requirements
## Code Quality
- **High Code Reuse**: ~80% reuse from existing action_execution_client.py
- **Consistent Error Handling**: Same patterns as other runtimes
- **Proper Validation**: Uses existing MCPConfig validation
- **Clean Implementation**: Minimal changes, focused functionality
## Testing
The implementation has been validated for:
- ✅ Proper import structure
- ✅ Code reuse patterns
- ✅ Error handling
- ✅ Configuration loading
- ✅ Phase 1 requirements compliance

View File

@@ -151,7 +151,7 @@ install-python-dependencies:
echo "Installing only POETRY_GROUP=${POETRY_GROUP}"; \
poetry install --only $${POETRY_GROUP}; \
else \
poetry install --with dev,test,runtime; \
poetry install; \
fi
@if [ "${INSTALL_PLAYWRIGHT}" != "false" ] && [ "${INSTALL_PLAYWRIGHT}" != "0" ]; then \
if [ -f "/etc/manjaro-release" ]; then \
@@ -189,7 +189,7 @@ install-pre-commit-hooks:
lint-backend:
@echo "$(YELLOW)Running linters...$(RESET)"
@poetry run pre-commit run --all-files --show-diff-on-failure --config $(PRE_COMMIT_CONFIG_PATH)
@poetry run pre-commit run --files openhands/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config $(PRE_COMMIT_CONFIG_PATH)
lint-frontend:
@echo "$(YELLOW)Running linters for frontend...$(RESET)"

View File

@@ -18,17 +18,6 @@
<a href="https://docs.all-hands.dev/usage/getting-started"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation"></a>
<a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper%20on%20Arxiv-000?logoColor=FFE165&logo=arxiv&style=for-the-badge" alt="Paper on Arxiv"></a>
<a href="https://docs.google.com/spreadsheets/d/1wOUdFCMyY6Nt0AIqF705KN4JKOWgeI4wUGUP60krXXs/edit?gid=0#gid=0"><img src="https://img.shields.io/badge/Benchmark%20score-000?logoColor=FFE165&logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark Score"></a>
<!-- Keep these links. Translations will automatically update with the README. -->
<a href="https://www.readme-i18n.com/All-Hands-AI/OpenHands?lang=de">Deutsch</a> |
<a href="https://www.readme-i18n.com/All-Hands-AI/OpenHands?lang=es">Español</a> |
<a href="https://www.readme-i18n.com/All-Hands-AI/OpenHands?lang=fr">français</a> |
<a href="https://www.readme-i18n.com/All-Hands-AI/OpenHands?lang=ja">日本語</a> |
<a href="https://www.readme-i18n.com/All-Hands-AI/OpenHands?lang=ko">한국어</a> |
<a href="https://www.readme-i18n.com/All-Hands-AI/OpenHands?lang=pt">Português</a> |
<a href="https://www.readme-i18n.com/All-Hands-AI/OpenHands?lang=ru">Русский</a> |
<a href="https://www.readme-i18n.com/All-Hands-AI/OpenHands?lang=zh">中文</a>
<hr>
</div>
@@ -62,21 +51,19 @@ system requirements and more information.
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.44-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.40-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.44-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.40-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ~/.openhands:/.openhands \
-v ~/.openhands-state:/.openhands-state \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.44
docker.all-hands.dev/all-hands-ai/openhands:0.40
```
> **Note**: If you used OpenHands before version 0.44, you may want to run `mv ~/.openhands-state ~/.openhands` to migrate your conversation history to the new location.
You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
When you open the application, you'll be asked to choose an LLM provider and add an API key.

View File

@@ -51,21 +51,19 @@ OpenHands也可以使用Docker在本地系统上运行。
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.44-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.40-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.44-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.40-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ~/.openhands:/.openhands \
-v ~/.openhands-state:/.openhands-state \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.44
docker.all-hands.dev/all-hands-ai/openhands:0.40
```
> **注意**: 如果您在0.44版本之前使用过OpenHands，您可能需要运行 `mv ~/.openhands-state ~/.openhands` 来将对话历史迁移到新位置。
您将在[http://localhost:3000](http://localhost:3000)找到运行中的OpenHands！
打开应用程序时，您将被要求选择一个LLM提供商并添加API密钥。

View File

@@ -1,16 +1,16 @@
ARG OPENHANDS_BUILD_VERSION=dev
FROM node:22.16.0-bookworm-slim AS frontend-builder
FROM node:21.7.2-bookworm-slim AS frontend-builder
WORKDIR /app
COPY frontend/package.json frontend/package-lock.json ./
COPY ./frontend/package.json frontend/package-lock.json ./
RUN npm install -g npm@10.5.1
RUN npm ci
COPY frontend ./
COPY ./frontend ./
RUN npm run build
FROM python:3.12.10-slim AS base
FROM base AS backend-builder
FROM python:3.12.3-slim AS backend-builder
WORKDIR /app
ENV PYTHONPATH='/app'
@@ -22,18 +22,17 @@ ENV POETRY_NO_INTERACTION=1 \
RUN apt-get update -y \
&& apt-get install -y curl make git build-essential \
&& python3 -m pip install poetry --break-system-packages
&& python3 -m pip install poetry==1.8.2 --break-system-packages
COPY pyproject.toml poetry.lock ./
COPY ./pyproject.toml ./poetry.lock ./
RUN touch README.md
RUN export POETRY_CACHE_DIR && poetry install --no-root && rm -rf $POETRY_CACHE_DIR
RUN export POETRY_CACHE_DIR && poetry install --without evaluation --no-root && rm -rf $POETRY_CACHE_DIR
FROM base AS openhands-app
FROM python:3.12.3-slim AS openhands-app
WORKDIR /app
# re-declare for this section
ARG OPENHANDS_BUILD_VERSION
ARG OPENHANDS_BUILD_VERSION #re-declare for this section
ENV RUN_AS_OPENHANDS=true
# A random number--we need this to be different from the user's UID on the host machine
@@ -44,7 +43,7 @@ ENV WORKSPACE_BASE=/opt/workspace_base
ENV OPENHANDS_BUILD_VERSION=$OPENHANDS_BUILD_VERSION
ENV SANDBOX_USER_ID=0
ENV FILE_STORE=local
ENV FILE_STORE_PATH=/.openhands
ENV FILE_STORE_PATH=/.openhands-state
RUN mkdir -p $FILE_STORE_PATH
RUN mkdir -p $WORKSPACE_BASE
@@ -75,7 +74,12 @@ COPY --chown=openhands:app --chmod=770 --from=backend-builder ${VIRTUAL_ENV} ${V
COPY --chown=openhands:app --chmod=770 ./microagents ./microagents
COPY --chown=openhands:app --chmod=770 ./openhands ./openhands
COPY --chown=openhands:app --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
COPY --chown=openhands:app pyproject.toml poetry.lock README.md MANIFEST.in LICENSE ./
COPY --chown=openhands:app --chmod=770 ./openhands/agenthub ./openhands/agenthub
COPY --chown=openhands:app ./pyproject.toml ./pyproject.toml
COPY --chown=openhands:app ./poetry.lock ./poetry.lock
COPY --chown=openhands:app ./README.md ./README.md
COPY --chown=openhands:app ./MANIFEST.in ./MANIFEST.in
COPY --chown=openhands:app ./LICENSE ./LICENSE
# This is run as "openhands" user, and will create __pycache__ with openhands:openhands ownership
RUN python openhands/core/download.py # No-op to download assets

View File

@@ -10,9 +10,8 @@ services:
environment:
- BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
- SANDBOX_API_HOSTNAME=host.docker.internal
- DOCKER_HOST_ADDR=host.docker.internal
#
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.44-nikolaik}
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.40-nikolaik}
- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
ports:

View File

@@ -37,8 +37,7 @@ repos:
hooks:
- id: mypy
additional_dependencies:
[types-requests, types-setuptools, types-pyyaml, types-toml, types-docker, lxml]
# To see gaps add `--html-report mypy-report/`
[types-requests, types-setuptools, types-pyyaml, types-toml]
entry: mypy --config-file dev_config/python/mypy.ini openhands/
always_run: true
pass_filenames: false

View File

@@ -7,8 +7,8 @@ services:
image: openhands:latest
container_name: openhands-app-${DATE:-}
environment:
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.44-nikolaik}
#- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} # enable this only if you want a specific non-root sandbox user but you will have to manually adjust permissions of ~/.openhands for this user
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.40-nikolaik}
#- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} # enable this only if you want a specific non-root sandbox user but you will have to manually adjust permissions of openhands-state for this user
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
ports:
- "3000:3000"
@@ -16,7 +16,7 @@ services:
- "host.docker.internal:host-gateway"
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- ~/.openhands:/.openhands
- ~/.openhands-state:/.openhands-state
- ${WORKSPACE_BASE:-$PWD/workspace}:/opt/workspace_base
pull_policy: build
stdin_open: true

View File

@@ -1,17 +0,0 @@
# Setup
```
npm install -g mint
```
or
```
yarn global add mint
```
# Preview
```
mint dev
```

View File

@@ -20,7 +20,7 @@
"navigation": {
"tabs": [
{
"tab": "Docs",
"tab": "Getting started",
"pages": [
"index",
"usage/installation",
@@ -31,124 +31,118 @@
"pages": [
"usage/cloud/openhands-cloud",
{
"group": "Integrations",
"group": "Installation",
"pages": [
"usage/cloud/github-installation",
"usage/cloud/gitlab-installation",
"usage/cloud/slack-installation"
"usage/cloud/gitlab-installation"
]
},
"usage/cloud/cloud-ui",
"usage/cloud/cloud-issue-resolver",
"usage/cloud/cloud-api"
]
},
{
"group": "Running OpenHands on Your Own",
"group": "Usage Methods",
"pages": [
"usage/local-setup",
"usage/how-to/gui-mode",
"usage/how-to/cli-mode",
"usage/how-to/headless-mode",
"usage/how-to/github-action",
{
"group": "Advanced Configuration",
"pages": [
{
"group": "LLM Configuration",
"pages": [
"usage/llms/llms",
{
"group": "Providers",
"pages": [
"usage/llms/azure-llms",
"usage/llms/google-llms",
"usage/llms/groq",
"usage/llms/local-llms",
"usage/llms/litellm-proxy",
"usage/llms/openai-llms",
"usage/llms/openrouter"
]
}
]
},
{
"group": "Runtime Configuration",
"pages": [
"usage/runtimes/overview",
{
"group": "Providers",
"pages": [
"usage/runtimes/docker",
"usage/runtimes/remote",
"usage/runtimes/local",
{
"group": "Third-Party Providers",
"pages": [
"usage/runtimes/modal",
"usage/runtimes/daytona",
"usage/runtimes/runloop",
"usage/runtimes/e2b"
]
}
]
}
]
},
"usage/configuration-options",
"usage/how-to/custom-sandbox-guide",
"usage/search-engine-setup",
"usage/mcp"
]
}
]
},
{
"group": "Customization",
"pages": [
"usage/prompting/repository",
{
"group": "Microagents",
"pages": [
"usage/prompting/microagents-overview",
"usage/prompting/microagents-repo",
"usage/prompting/microagents-keyword",
"usage/prompting/microagents-org",
"usage/prompting/microagents-public"
]
}
]
},
{
"group": "Tips and Tricks",
"pages": [
"usage/prompting/prompting-best-practices"
]
},
{
"group": "Troubleshooting & Feedback",
"pages": [
"usage/troubleshooting/troubleshooting",
"usage/feedback"
]
},
{
"group": "OpenHands Developers",
"pages": [
"usage/how-to/development-overview",
{
"group": "Architecture",
"pages": [
"usage/architecture/backend",
"usage/architecture/runtime"
]
},
"usage/how-to/debugging",
"usage/how-to/evaluation-harness",
"usage/how-to/websocket-connection"
"usage/how-to/github-action"
]
}
]
},
{
"tab": "Prompting and Customization",
"pages": [
"usage/prompting/prompting-best-practices",
"usage/prompting/repository",
{
"group": "Microagents",
"pages": [
"usage/prompting/microagents-overview",
"usage/prompting/microagents-repo",
"usage/prompting/microagents-keyword",
"usage/prompting/microagents-org",
"usage/prompting/microagents-public"
]
}
]
},
{
"tab": "Advanced Configuration",
"pages": [
{
"group": "LLM Configuration",
"pages": [
"usage/llms/llms",
{
"group": "Providers",
"pages": [
"usage/llms/azure-llms",
"usage/llms/google-llms",
"usage/llms/groq",
"usage/llms/local-llms",
"usage/llms/litellm-proxy",
"usage/llms/openai-llms",
"usage/llms/openrouter"
]
}
]
},
{
"group": "Runtime Configuration",
"pages": [
"usage/runtimes/overview",
{
"group": "Providers",
"pages": [
"usage/runtimes/docker",
"usage/runtimes/remote",
"usage/runtimes/local",
{
"group": "Third-Party Providers",
"pages": [
"usage/runtimes/modal",
"usage/runtimes/daytona",
"usage/runtimes/runloop",
"usage/runtimes/e2b"
]
}
]
}
]
},
"usage/configuration-options",
"usage/how-to/custom-sandbox-guide",
"usage/search-engine-setup",
"usage/mcp"
]
},
{
"tab": "Troubleshooting & Feedback",
"pages": [
"usage/troubleshooting/troubleshooting",
"usage/feedback"
]
},
{
"tab": "For OpenHands Developers",
"pages": [
"usage/how-to/development-overview",
{
"group": "Architecture",
"pages": [
"usage/architecture/backend",
"usage/architecture/runtime"
]
},
"usage/how-to/debugging",
"usage/how-to/evaluation-harness",
"usage/how-to/websocket-connection"
]
},
{
"tab": "API Reference",
"openapi": "/openapi.json"
@@ -200,11 +194,5 @@
"chatgpt",
"claude"
]
},
"redirects": [
{
"source": "/modules/:slug*",
"destination": "/:slug*"
}
]
}
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 113 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 118 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 542 KiB

View File

@@ -1,11 +1,9 @@
---
title: Cloud API
description: OpenHands Cloud provides a REST API that allows you to programmatically interact with OpenHands.
This guide explains how to obtain an API key and use the API to start conversations and retrieve their status.
description: OpenHands Cloud provides a REST API that allows you to programmatically interact with the service. This guide explains how to obtain an API key and use the API to start conversations.
---
For the available API endpoints, refer to the
[OpenHands API Reference](https://docs.all-hands.dev/api-reference).
For more detailed information about the API, refer to the [OpenHands API Reference](https://docs.all-hands.dev/swagger-ui/).
## Obtaining an API Key
@@ -18,7 +16,7 @@ To use the OpenHands Cloud API, you'll need to generate an API key:
5. Give your key a descriptive name (Example: "Development" or "Production") and select `Create`.
6. Copy the generated API key and store it securely. It will only be shown once.
![API Key Generation](/static/img/api-key-generation.png)
![API Key Generation](/static/img/docs/api-key-generation.png)
## API Usage
@@ -35,81 +33,87 @@ To start a new conversation with OpenHands to perform a task, you'll need to mak
#### Examples
<details>
<summary>cURL</summary>
<Accordion title="cURL">
```bash
curl -X POST "https://app.all-hands.dev/api/conversations" \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"initial_user_msg": "Check whether there is any incorrect information in the README.md file and send a PR to fix it if so.",
"repository": "yourusername/your-repo"
}'
```
</Accordion>
```bash
curl -X POST "https://app.all-hands.dev/api/conversations" \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"initial_user_msg": "Check whether there is any incorrect information in the README.md file and send a PR to fix it if so.",
"repository": "yourusername/your-repo"
}'
```
</details>
<Accordion title="Python (with requests)">
```python
import requests
<details>
<summary>Python (with requests)</summary>
api_key = "YOUR_API_KEY"
url = "https://app.all-hands.dev/api/conversations"
```python
import requests
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
}
api_key = "YOUR_API_KEY"
url = "https://app.all-hands.dev/api/conversations"
data = {
"initial_user_msg": "Check whether there is any incorrect information in the README.md file and send a PR to fix it if so.",
"repository": "yourusername/your-repo"
}
response = requests.post(url, headers=headers, json=data)
conversation = response.json()
print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['conversation_id']}")
print(f"Status: {conversation['status']}")
```
</Accordion>
<Accordion title="TypeScript/JavaScript (with fetch)">
```typescript
const apiKey = "YOUR_API_KEY";
const url = "https://app.all-hands.dev/api/conversations";
const headers = {
"Authorization": `Bearer ${apiKey}`,
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
};
}
const data = {
initial_user_msg: "Check whether there is any incorrect information in the README.md file and send a PR to fix it if so.",
repository: "yourusername/your-repo"
};
data = {
"initial_user_msg": "Check whether there is any incorrect information in the README.md file and send a PR to fix it if so.",
"repository": "yourusername/your-repo"
}
async function startConversation() {
try {
const response = await fetch(url, {
method: "POST",
headers: headers,
body: JSON.stringify(data)
});
response = requests.post(url, headers=headers, json=data)
conversation = response.json()
const conversation = await response.json();
print(f"Conversation Link: https://app.all-hands.dev/conversations/{conversation['conversation_id']}")
print(f"Status: {conversation['status']}")
```
</details>
console.log(`Conversation Link: https://app.all-hands.dev/conversations/${conversation.conversation_id}`);
console.log(`Status: ${conversation.status}`);
<details>
<summary>TypeScript/JavaScript (with fetch)</summary>
return conversation;
} catch (error) {
console.error("Error starting conversation:", error);
}
```typescript
const apiKey = "YOUR_API_KEY";
const url = "https://app.all-hands.dev/api/conversations";
const headers = {
"Authorization": `Bearer ${apiKey}`,
"Content-Type": "application/json"
};
const data = {
initial_user_msg: "Check whether there is any incorrect information in the README.md file and send a PR to fix it if so.",
repository: "yourusername/your-repo"
};
async function startConversation() {
try {
const response = await fetch(url, {
method: "POST",
headers: headers,
body: JSON.stringify(data)
});
const conversation = await response.json();
console.log(`Conversation Link: https://app.all-hands.dev/conversations/${conversation.conversation_id}`);
console.log(`Status: ${conversation.status}`);
return conversation;
} catch (error) {
console.error("Error starting conversation:", error);
}
}
startConversation();
```
</Accordion>
startConversation();
```
</details>
#### Response
@@ -141,12 +145,14 @@ GET https://app.all-hands.dev/api/conversations/{conversation_id}
#### Example
<Accordion title="cURL">
```bash
curl -X GET "https://app.all-hands.dev/api/conversations/{conversation_id}" \
-H "Authorization: Bearer YOUR_API_KEY"
```
</Accordion>
<details>
<summary>cURL</summary>
```bash
curl -X GET "https://app.all-hands.dev/api/conversations/{conversation_id}" \
-H "Authorization: Bearer YOUR_API_KEY"
```
</details>
#### Response

View File

@@ -0,0 +1,33 @@
---
title: Cloud Issue Resolver
description: The Cloud Issue Resolver automates code fixes and provides intelligent assistance for your repositories on GitHub.
---
## Setup
The Cloud Issue Resolver is available automatically when you grant OpenHands Cloud repository access:
- [GitHub repository access](./github-installation#adding-repository-access)
## Usage
After granting OpenHands Cloud repository access, you can use the Cloud Issue Resolver on issues and pull requests in your repositories.
### Working with Issues
On your repository, label an issue with `openhands` or add a message starting with
`@openhands`. OpenHands will:
1. Comment on the issue to let you know it is working on it
- You can click on the link to track the progress on OpenHands Cloud
2. Open a pull request if it determines that the issue has been successfully resolved
3. Comment on the issue with a summary of the performed tasks and a link to the PR
### Working with Pull Requests
To get OpenHands to work on pull requests, mention `@openhands` in comments to:
- Ask questions
- Request updates
- Get code explanations
OpenHands will:
1. Comment to let you know it is working on it
2. Perform the requested task

View File

@@ -1,36 +1,28 @@
---
title: Cloud UI
description: The Cloud UI provides a web interface for interacting with OpenHands. This page explains how to use the
OpenHands Cloud UI.
description: The Cloud UI provides a web interface for interacting with OpenHands AI. This page explains how to access and use the OpenHands Cloud UI.
---
## Landing Page
The landing page is where you can:
## Accessing the UI
- [Add GitHub repository access](/usage/cloud/github-installation#adding-github-repository-access) to OpenHands.
- [Select a GitHub repo](/usage/cloud/github-installation#working-with-github-repos-in-openhands-cloud) or
[a GitLab repo](/usage/cloud/gitlab-installation#working-with-gitlab-repos-in-openhands-cloud) to start working on.
- See `Suggested Tasks` for repositories that OpenHands has access to.
- Launch an empty conversation using `Launch from Scratch`.
The OpenHands Cloud UI can be accessed at [app.all-hands.dev](https://app.all-hands.dev). You'll need to sign in with your GitHub or GitLab account to access the interface.
## Settings
The Settings page allows you to:
- [Configure GitHub repository access](/usage/cloud/github-installation#modifying-repository-access) for OpenHands.
- Set application settings like your preferred language, notifications and other preferences.
- Add credits to your account.
- Generate custom secrets.
- Create API keys to work with OpenHands programmatically.
## Key Features
For an overview of the key features available inside a conversation, please refer to the [Key Features](/usage/key-features)
section of the documentation.
For detailed information about the features available in the OpenHands Cloud UI, please refer to the [Key Features](../key-features.md) section of the documentation.
## Settings
The settings page allows you to:
- Configure your account preferences.
- Manage repository access.
- Generate API keys for programmatic access.
- Generate custom secrets for the agent.
## Next Steps
- [Install GitHub Integration](/usage/cloud/github-installation) to use OpenHands with your GitHub repositories.
- [Install GitLab Integration](/usage/cloud/gitlab-installation) to use OpenHands with your GitLab repositories.
- [Use the Cloud API](/usage/cloud/cloud-api) to programmatically interact with OpenHands.
- [Use the Cloud Issue Resolver](./cloud-issue-resolver.md) to automate code fixes and get assistance.
- [Learn about the Cloud API](./cloud-api.md) for programmatic access.

View File

@@ -1,22 +1,30 @@
---
title: GitHub Integration
description: This guide walks you through the process of installing OpenHands Cloud for your GitHub repositories. Once
set up, it will allow OpenHands to work with your GitHub repository through the Cloud UI or straight from GitHub!
title: GitHub Installation
description: This guide walks you through the process of installing and configuring OpenHands Cloud for your GitHub repositories.
---
## Prerequisites
- Signed in to [OpenHands Cloud](https://app.all-hands.dev) with [a GitHub account](/usage/cloud/openhands-cloud).
- A GitHub account
- Access to OpenHands Cloud
## Adding GitHub Repository Access
## Installation Steps
You can grant OpenHands access to specific GitHub repositories:
1. Log in to [OpenHands Cloud](https://app.all-hands.dev)
2. If you haven't connected your GitHub account yet:
- Click on `Connect to GitHub`
- Review and accept the terms of service
- Authorize the OpenHands AI application
1. Click on `Add GitHub repos` on the landing page.
## Adding Repository Access
You can grant OpenHands access to specific repositories:
1. Click on `Add GitHub repos`
2. Select your organization and choose the specific repositories to grant OpenHands access to.
<Accordion title="OpenHands permissions">
- OpenHands requests short-lived tokens (8-hour expiration) with these permissions:
- OpenHands requests short-lived tokens (8-hour expiration) with these permissions:
- Actions: Read and write
- Administration: Read-only
- Commit statuses: Read and write
- Contents: Read and write
- Issues: Read and write
@@ -27,45 +35,20 @@ You can grant OpenHands access to specific GitHub repositories:
- Repository access for a user is granted based on:
- Permission granted for the repository
- User's GitHub permissions (owner/collaborator)
</Accordion>
3. Click `Install & Authorize`.
3. Click `Install & Authorize`
## Modifying Repository Access
You can modify GitHub repository access at any time by:
- Selecting `Add GitHub repos` on the landing page or
- Visiting the Settings page and selecting `Configure GitHub Repositories` under the `Integrations` tab
You can modify repository access at any time by visiting the Settings page and selecting `Configure GitHub Repositories` under the `Git` tab.
## Working With GitHub Repos in OpenHands Cloud
## Using OpenHands with GitHub
Once you've granted GitHub repository access, you can start working with your GitHub repository. Use the `select a repo`
and `select a branch` dropdowns to select the appropriate repository and branch you'd like OpenHands to work on. Then
click on `Launch` to start the conversation!
Once you've granted repository access, you can use OpenHands with your GitHub repositories.
![Connect Repo](/static/img/connect-repo.png)
## Working on GitHub Issues and Pull Requests Using OpenHands
Giving GitHub repository access to OpenHands also allows you to work on GitHub issues and pull requests directly.
### Working with Issues
On your repository, label an issue with `openhands` or add a message starting with
`@openhands`. OpenHands will:
1. Comment on the issue to let you know it is working on it.
- You can click on the link to track the progress on OpenHands Cloud.
2. Open a pull request if it determines that the issue has been successfully resolved.
3. Comment on the issue with a summary of the performed tasks and a link to the PR.
### Working with Pull Requests
To get OpenHands to work on pull requests, mention `@openhands` in the comments to:
- Ask questions
- Request updates
- Get code explanations
For details on how to use OpenHands with GitHub issues and pull requests, see the [Cloud Issue Resolver](./cloud-issue-resolver.md) documentation.
## Next Steps
- [Learn about the Cloud UI](/usage/cloud/cloud-ui).
- [Use the Cloud API](/usage/cloud/cloud-api) to programmatically interact with OpenHands.
- [Access the Cloud UI](./cloud-ui.md) to interact with the web interface
- [Use the Cloud Issue Resolver](./cloud-issue-resolver.md) to automate code fixes and get assistance
- [Use the Cloud API](./cloud-api.md) to programmatically interact with OpenHands

View File

@@ -1,31 +1,23 @@
---
title: GitLab Integration
description: This guide walks you through the process of installing OpenHands Cloud for your GitLab repositories. Once
set up, it will allow OpenHands to work with your GitLab repository.
title: GitLab Installation
description: This guide walks you through the process of installing and configuring OpenHands Cloud for your GitLab repositories.
---
## Prerequisites
- Signed in to [OpenHands Cloud](https://app.all-hands.dev) with [a GitLab account](/usage/cloud/openhands-cloud).
- A GitLab account
- Access to OpenHands Cloud
## Adding GitLab Repository Access
## Installation Steps
Upon signing into OpenHands Cloud with a GitLab account, OpenHands will have access to your repositories.
1. Log in to [OpenHands Cloud](https://app.all-hands.dev)
2. If you haven't connected your GitLab account yet:
- Click on `Log in with GitLab`
- Authorize the OpenHands application
## Working With GitLab Repos in OpenHands Cloud
After signing in with a GitLab account, use the `select a repo` and `select a branch` dropdowns to select the
appropriate repository and branch you'd like OpenHands to work on. Then click on `Launch` to start the conversation!
![Connect Repo](/static/img/connect-repo.png)
## Using Tokens with Reduced Scopes
OpenHands requests an API-scoped token during OAuth authentication. By default, this token is provided to the agent.
To restrict the agent's permissions, you can define a custom secret `GITLAB_TOKEN`, which will override the default token assigned to the agent.
While the high-permission API token is still requested and used for other components of the application (e.g. opening merge requests), the agent will not have access to it.
## Next Steps
- [Learn about the Cloud UI](/usage/cloud/cloud-ui).
- [Use the Cloud API](/usage/cloud/cloud-api) to programmatically interact with OpenHands.
- [Access the Cloud UI](./cloud-ui.md) to interact with the web interface
- [Use the Cloud API](./cloud-api.md) to programmatically interact with OpenHands

View File

@@ -1,12 +1,13 @@
---
title: Getting Started
description: Getting started with OpenHands Cloud.
description: Getting started with OpenHands Cloud
---
OpenHands Cloud is the hosted cloud version of All Hands AI's OpenHands.
## Accessing OpenHands Cloud
OpenHands Cloud is the hosted cloud version of All Hands AI's OpenHands. To get started with OpenHands Cloud,
visit [app.all-hands.dev](https://app.all-hands.dev).
To get started with OpenHands Cloud, visit [app.all-hands.dev](https://app.all-hands.dev).
You'll be prompted to connect with your GitHub or GitLab account:
@@ -14,13 +15,13 @@ You'll be prompted to connect with your GitHub or GitLab account:
2. Review the permissions requested by OpenHands and authorize the application.
- OpenHands will require certain permissions from your account. To read more about these permissions,
you can click the `Learn more` link on the authorization page.
3. Review and accept the `terms of service` and select `Continue`.
## Next Steps
Once you've connected your account, you can:
- [Install GitHub Integration](/usage/cloud/github-installation) to use OpenHands with your GitHub repositories.
- [Install GitLab Integration](/usage/cloud/gitlab-installation) to use OpenHands with your GitLab repositories.
- [Learn about the Cloud UI](/usage/cloud/cloud-ui).
- [Use the Cloud API](/usage/cloud/cloud-api) to programmatically interact with OpenHands.
- [Install GitHub Integration](./github-installation.md) to use OpenHands with your GitHub repositories
- [Install GitLab Integration](./gitlab-installation.md) to use OpenHands with your GitLab repositories
- [Access the Cloud UI](./cloud-ui.md) to interact with the web interface
- [Use the Cloud API](./cloud-api.md) to programmatically interact with OpenHands
- [Set up the Cloud Issue Resolver](./cloud-issue-resolver.md) to automate code fixes and provide intelligent assistance

View File

@@ -1,73 +0,0 @@
---
title: Slack Integration (Beta)
description: This guide walks you through installing the OpenHands Slack app.
---
## Prerequisites
- Access to OpenHands Cloud
## Installation Steps
<AccordionGroup>
<Accordion title="Install Slack App (only for Slack admins/owners)">
**This step is for Slack admins/owners**
1. Make sure you have permissions to install Apps to your workspace.
2. Click the button below to install OpenHands Slack App <a target="_blank" href="https://slack.com/oauth/v2/authorize?client_id=7477886716822.8729519890534&scope=app_mentions:read,chat:write,users:read,channels:history,groups:history,mpim:history,im:history&user_scope=channels:history,groups:history,im:history,mpim:history"><img alt="Add to Slack" height="40" width="139" src="https://platform.slack-edge.com/img/add_to_slack.png" srcSet="https://platform.slack-edge.com/img/add_to_slack.png 1x, https://platform.slack-edge.com/img/add_to_slack@2x.png 2x" /></a>
3. In the top right corner, select the workspace to install the OpenHands Slack app.
4. Review permissions and click allow.
</Accordion>
<Accordion title="Authorize Slack App (for all Slack workspace members)">
**Make sure your Slack workspace admin/owner has installed OpenHands Slack App first**
Every user in the Slack workspace (including admins/owners) must link their OpenHands Cloud account to the OpenHands Slack App. To do this:
1. Visit [integrations settings](https://app.all-hands.dev/settings/integrations) in OpenHands Cloud.
2. Click the button "Install Slack App".
3. In the top right corner, select the workspace to install the OpenHands Slack app.
4. Review permissions and click allow.
Depending on the workspace settings, you may need approval from your Slack admin to authorize the Slack App.
</Accordion>
</AccordionGroup>
## Working With the Slack App
To start a new conversation, you can mention `@openhands` in a new message or a thread inside any Slack channel.
Once a conversation is started, all thread messages underneath it will be follow-up messages to OpenHands.
To send follow-up messages for the same conversation, mention `@openhands` in a thread reply to the original message. You must be the user who started the conversation.
## Example conversation
### Start a new conversation, and select repo
Conversation is started by mentioning `@openhands`.
![slack-create-convo.png](/static/img/slack-create-convo.png)
### See agent response and send follow up messages
Initial request is followed up by mentioning `@openhands` in a thread reply.
![slack-results-and-follow-up.png](/static/img/slack-results-and-follow-up.png)
## Pro tip
You can mention a repo name when starting a new conversation in the following formats:
1. "My-Repo" repo (e.g., `@openhands in the openhands repo ...`)
2. "All-Hands-AI/OpenHands" (e.g., `@openhands in All-Hands-AI/OpenHands ...`)
The repo match is case insensitive. If a repo name match is made, it will kick off the conversation.
If the repo name partially matches against multiple repos, you'll be asked to select a repo from the filtered list.
![slack-pro-tip.png](/static/img/slack-pro-tip.png)

View File

@@ -1,48 +1,24 @@
---
title: CLI
description: The Command-Line Interface (CLI) provides a powerful interface that lets you engage with OpenHands
directly from your terminal.
title: CLI Mode
description: CLI mode provides a powerful interactive Command-Line Interface (CLI) that lets you engage with OpenHands directly from your terminal.
---
This mode is different from the [headless mode](/usage/how-to/headless-mode), which is non-interactive and better
for scripting.
This mode is different from the [headless mode](./headless-mode), which is non-interactive and better for scripting.
## Getting Started
### Running with Python
**Note** - OpenHands requires Python version 3.12 or higher (Python 3.14 is not currently supported)
1. Install OpenHands using pip:
```bash
pip install openhands-ai
```
Or if you prefer not to manage your own Python environment, you can use `uvx`:
```bash
uvx --python 3.12 --from openhands-ai openhands
```
2. Launch an interactive OpenHands conversation from the command line:
```bash
openhands
```
3. Set your model, API key, and other preferences using the UI (or alternatively environment variables, below).
This command opens an interactive prompt where you can type tasks or commands and get responses from OpenHands.
#### For Developers
If you have cloned the repository, you can run the CLI directly using Poetry:
1. Ensure you have followed the [Development setup instructions](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
2. Set your model, API key, and other preferences using environment variables or with the [`config.toml`](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml) file.
3. Launch an interactive OpenHands conversation from the command line:
```bash
poetry run python -m openhands.cli.main
```
This command opens an interactive prompt where you can type tasks or commands and get responses from OpenHands.
### Running with Docker
1. Set the following environment variables in your terminal:
@@ -55,21 +31,19 @@ poetry run python -m openhands.cli.main
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.44-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
-e LLM_API_KEY=$LLM_API_KEY \
-e LLM_MODEL=$LLM_MODEL \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ~/.openhands:/.openhands \
-v ~/.openhands-state:/.openhands-state \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.44 \
docker.all-hands.dev/all-hands-ai/openhands:0.39 \
python -m openhands.cli.main --override-cli-mode true
```
> **Note**: If you used OpenHands before version 0.44, you may want to run `mv ~/.openhands-state ~/.openhands` to migrate your conversation history to the new location.
This launches the CLI in Docker, allowing you to interact with OpenHands as described above.
The `-e SANDBOX_USER_ID=$(id -u)` ensures files created by the agent in your workspace have the correct permissions.

View File

@@ -46,7 +46,7 @@ This will produce a new image called `custom-image`, which will be available in
## Using the Docker Command
When running OpenHands using [the docker command](/usage/local-setup#start-the-app), replace
When running OpenHands using [the docker command](/usage/installation#start-the-app), replace
`-e SANDBOX_RUNTIME_CONTAINER_IMAGE=...` with `-e SANDBOX_BASE_CONTAINER_IMAGE=<custom image name>`:
```commandline

View File

@@ -48,6 +48,6 @@ The customization options you can set are:
| `LLM_MODEL` | Variable | Set the LLM to use with OpenHands | `LLM_MODEL="anthropic/claude-3-5-sonnet-20241022"` |
| `OPENHANDS_MAX_ITER` | Variable | Set max limit for agent iterations | `OPENHANDS_MAX_ITER=10` |
| `OPENHANDS_MACRO` | Variable | Customize default macro for invoking the resolver | `OPENHANDS_MACRO=@resolveit` |
| `OPENHANDS_BASE_CONTAINER_IMAGE` | Variable | Custom Sandbox ([learn more](/usage/how-to/custom-sandbox-guide)) | `OPENHANDS_BASE_CONTAINER_IMAGE="custom_image"` |
| `OPENHANDS_BASE_CONTAINER_IMAGE` | Variable | Custom Sandbox ([learn more](https://docs.all-hands.dev/modules/usage/how-to/custom-sandbox-guide)) | `OPENHANDS_BASE_CONTAINER_IMAGE="custom_image"` |
| `TARGET_BRANCH` | Variable | Merge to branch other than `main` | `TARGET_BRANCH="dev"` |
| `TARGET_RUNNER` | Variable | Target runner to execute the agent workflow (default ubuntu-latest) | `TARGET_RUNNER="custom-runner"` |

View File

@@ -1,13 +1,14 @@
---
title: GUI
description: High level overview of the Graphical User Interface (GUI) in OpenHands.
title: GUI Mode
description: OpenHands provides a Graphical User Interface (GUI) mode for interacting with the AI assistant.
---
## Prerequisites
## Installation and Setup
- [OpenHands is running](/usage/local-setup)
1. Follow the installation instructions to install OpenHands.
2. After running the command, access OpenHands at [http://localhost:3000](http://localhost:3000).
## Overview
## Interacting with the GUI
### Initial Setup
@@ -18,23 +19,16 @@ description: High level overview of the Graphical User Interface (GUI) in OpenHa
3. Enter the corresponding `API Key` for your chosen provider.
4. Click `Save Changes` to apply the settings.
### Settings
### Version Control Tokens
You can use the Settings page at any time to:
OpenHands supports multiple version control providers. You can configure tokens for multiple providers simultaneously.
- Setup the LLM provider and model for OpenHands.
- [Setup the search engine](/usage/search-engine-setup).
- [Configure MCP servers](/usage/mcp).
- [Connect to GitHub](/usage/how-to/gui-mode#github-setup) and [connect to GitLab](/usage/how-to/gui-mode#gitlab-setup)
- Set application settings like your preferred language, notifications and other preferences.
- [Manage custom secrets](/usage/how-to/gui-mode#secrets-management).
#### GitHub Setup
#### GitHub Token Setup
OpenHands automatically exports a `GITHUB_TOKEN` to the shell environment if provided:
<AccordionGroup>
<Accordion title="Setting Up a GitHub Token">
<details>
<summary>Setting Up a GitHub Token</summary>
1. **Generate a Personal Access Token (PAT)**:
- On GitHub, go to Settings > Developer Settings > Personal Access Tokens > Tokens (classic).
@@ -43,11 +37,16 @@ OpenHands automatically exports a `GITHUB_TOKEN` to the shell environment if pro
- `repo` (Full control of private repositories)
- **Fine-Grained Tokens**
- All Repositories (You can select specific repositories, but this will impact what returns in repo search)
- Minimal Permissions (Select `Meta Data = Read-only` for search, `Pull Requests = Read and Write` and `Content = Read and Write` for branch creation)
- Minimal Permissions (Select `Meta Data = Read-only` for search, `Pull Requests = Read and Write` and `Content = Read and Write` for branch creation)
2. **Enter Token in OpenHands**:
- In the Settings page, navigate to the `Integrations` tab.
- Click the Settings button (gear icon).
- Navigate to the `Git` tab.
- Paste your token in the `GitHub Token` field.
- Click `Save Changes` to apply the changes.
</details>
<details>
<summary>Organizational Token Policies</summary>
If you're working with organizational repositories, additional setup may be required:
@@ -60,12 +59,15 @@ OpenHands automatically exports a `GITHUB_TOKEN` to the shell environment if pro
- Look for the organization under `Organization access`.
- If required, click `Enable SSO` next to your organization.
- Complete the SSO authorization process.
</Accordion>
</details>
<details>
<summary>Troubleshooting</summary>
<Accordion title="Troubleshooting">
Common issues and solutions:
- **Token Not Recognized**:
- Ensure the token is properly saved in settings.
- Check that the token hasn't expired.
- Verify the token has the required scopes.
- Try regenerating the token.
@@ -79,15 +81,15 @@ OpenHands automatically exports a `GITHUB_TOKEN` to the shell environment if pro
- The app will show a green checkmark if the token is valid.
- Try accessing a repository to confirm permissions.
- Check the browser console for any error messages.
</Accordion>
</AccordionGroup>
</details>
#### GitLab Setup
#### GitLab Token Setup
OpenHands automatically exports a `GITLAB_TOKEN` to the shell environment if provided:
<AccordionGroup>
<Accordion title="Setting Up a GitLab Token">
<details>
<summary>Setting Up a GitLab Token</summary>
1. **Generate a Personal Access Token (PAT)**:
- On GitLab, go to User Settings > Access Tokens.
- Create a new token with the following scopes:
@@ -97,17 +99,15 @@ OpenHands automatically exports a `GITLAB_TOKEN` to the shell environment if pro
- `write_repository` (Write repository)
- Set an expiration date or leave it blank for a non-expiring token.
2. **Enter Token in OpenHands**:
- In the Settings page, navigate to the `Integrations` tab.
- Click the Settings button (gear icon).
- Navigate to the `Git` tab.
- Paste your token in the `GitLab Token` field.
- Click `Save Changes` to apply the changes.
</details>
3. **(Optional): Restrict agent permissions**
- Create another PAT using Step 1 and exclude the `api` scope.
- In the Settings page, in the `Secrets` tab, create a new secret `GITLAB_TOKEN` and paste your lower scope token.
- OpenHands will use the higher scope token, and the agent will use the lower scope token.
</Accordion>
<details>
<summary>Troubleshooting</summary>
<Accordion title="Troubleshooting">
Common issues and solutions:
- **Token Not Recognized**:
@@ -119,110 +119,25 @@ OpenHands automatically exports a `GITLAB_TOKEN` to the shell environment if pro
- Verify project access permissions.
- Check if the token has the necessary scopes.
- For group/organization repositories, ensure you have proper access.
</Accordion>
</AccordionGroup>
</details>
#### BitBucket Setup (Coming soon ...)
<AccordionGroup>
<Accordion title="Setting Up a BitBucket Password">
1. **Generate an App Password**:
- On BitBucket, go to Personal Settings > App Password.
- Create a new password with the following scopes:
- `repository: read`
- `repository: write`
- `pull requests: read`
- `pull requests: write`
- `issues: read`
- `issues: write`
- App passwords are non-expiring tokens. OpenHands will migrate to using API tokens in the future.
2. **Enter Token in OpenHands**:
- In the Settings page, navigate to the `Integrations` tab.
- Paste your token in the `BitBucket Token` field.
- Click `Save Changes` to apply the changes.
</Accordion>
### Advanced Settings
<Accordion title="Troubleshooting">
Common issues and solutions:
1. Inside the Settings page, under the `LLM` tab, toggle `Advanced` options to access additional settings.
2. Use the `Custom Model` text box to manually enter a model if it's not in the list.
3. Specify a `Base URL` if required by your LLM provider.
- **Token Not Recognized**:
- Ensure the token is properly saved in settings.
- Check that the token hasn't expired.
- Verify the token has the required scopes.
### Interacting with the AI
- **Verifying Token Works**:
- The app will show a green checkmark if the token is valid.
- Try accessing a repository to confirm permissions.
- Check the browser console for any error messages.
</Accordion>
</AccordionGroup>
#### Secrets Management
OpenHands provides a secrets manager that allows you to securely store and manage sensitive information that can be accessed by the agent during runtime, such as API keys. These secrets are automatically exported as environment variables in the agent's runtime environment.
1. **Accessing the Secrets Manager**:
- In the Settings page, navigate to the `Secrets` tab.
- You'll see a list of all your existing custom secrets (if any).
2. **Adding a New Secret**:
- Click the `Add New Secret` button.
- Fill in the following fields:
- **Name**: A unique identifier for your secret (e.g., `AWS_ACCESS_KEY`). This will be the environment variable name.
- **Value**: The sensitive information you want to store.
- **Description** (optional): A brief description of what the secret is used for, which is also provided to the agent.
- Click `Add Secret` to save.
3. **Editing a Secret**:
- Click the `Edit` button next to the secret you want to modify.
- You can update the name and description of the secret.
- Note: For security reasons, you cannot view or edit the value of an existing secret. If you need to change the value, delete the secret and create a new one.
4. **Deleting a Secret**:
- Click the `Delete` button next to the secret you want to remove.
- Confirm the deletion when prompted.
5. **Using Secrets in the Agent**:
- All custom secrets are automatically exported as environment variables in the agent's runtime environment.
- You can access them in your code using standard environment variable access methods (e.g., `os.environ['SECRET_NAME']` in Python).
- Example: If you create a secret named `OPENAI_API_KEY`, you can access it in your code as `process.env.OPENAI_API_KEY` in JavaScript or `os.environ['OPENAI_API_KEY']` in Python.
#### Advanced Settings
The `Advanced` settings allow configuration of additional LLM settings. Inside the Settings page, under the `LLM` tab,
toggle `Advanced` options to access additional settings.
- Custom Model: Use the `Custom Model` text box to manually enter a model. Make sure to use the correct prefix based on the LiteLLM docs.
- Base URL: Specify a `Base URL` if required by your LLM provider.
- Memory Condensation: The memory condenser manages the LLM's context by ensuring only the most important and relevant information is presented.
- Confirmation Mode: Enabling this mode will cause OpenHands to confirm an action with the user before performing it.
### Key Features
For an overview of the key features available inside a conversation, please refer to the [Key Features](/usage/key-features)
section of the documentation.
### Status Indicator
The status indicator located in the bottom left of the screen will cycle through a number of states as a new conversation
is loaded. Typically these include:
* `Disconnected` : The frontend is not connected to any conversation
* `Connecting` : The frontend is connecting a websocket to a conversation.
* `Building Runtime...` : The server is building a runtime. This is typically in development mode only while building a docker image.
* `Starting Runtime...` : The server is starting a new runtime instance - probably a new docker container or remote runtime.
* `Initializing Agent...` : The server is starting the agent loop. (This step does not appear at present with Nested runtimes)
* `Setting up workspace...` : Usually this means a `git clone ...` operation.
* `Setting up git hooks` : Setting up the git pre-commit hooks for the workspace.
* `Agent is awaiting user input...` : Ready to go!
1. Type your prompt in the input box.
2. Click the send button or press Enter to submit your message.
3. The AI will process your input and provide a response in the chat window.
4. You can continue the conversation by asking follow-up questions or providing additional information.
## Tips for Effective Use
- Be specific in your requests to get the most accurate and helpful responses, as described in the [prompting best practices](../prompting/prompting-best-practices).
- Use one of the recommended models, as described in the [LLMs section](/usage/llms/llms).
- Use one of the recommended models, as described in the [LLMs section](../llms/llms.md).
## Other Ways to Run OpenHands
- [Run OpenHands in a scriptable headless mode.](/usage/how-to/headless-mode)
- [Run OpenHands with a friendly CLI.](/usage/how-to/cli-mode)
- [Run OpenHands on GitHub issues with a GitHub action.](/usage/how-to/github-action)
Remember, the GUI mode of OpenHands is designed to make your interaction with the AI assistant as smooth and intuitive
as possible. Don't hesitate to explore its features to maximize your productivity.

View File

@@ -1,10 +1,9 @@
---
title: Headless
description: You can run OpenHands with a single command, without starting the web application. This makes it easy to
write scripts and automate tasks with OpenHands.
title: Headless Mode
description: You can run OpenHands with a single command, without starting the web application. This makes it easy to write scripts and automate tasks with OpenHands.
---
This is different from [the CLI](./cli-mode), which is interactive, and better for active development.
This is different from [CLI Mode](./cli-mode), which is interactive, and better for active development.
## With Python
@@ -32,20 +31,19 @@ To run OpenHands in Headless mode with Docker:
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.44-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
-e LLM_API_KEY=$LLM_API_KEY \
-e LLM_MODEL=$LLM_MODEL \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ~/.openhands:/.openhands \
-v ~/.openhands-state:/.openhands-state \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.44 \
docker.all-hands.dev/all-hands-ai/openhands:0.39 \
python -m openhands.core.main -t "write a bash script that prints hi"
```
> **Note**: If you used OpenHands before version 0.44, you may want to run `mv ~/.openhands-state ~/.openhands` to migrate your conversation history to the new location.
The `-e SANDBOX_USER_ID=$(id -u)` is passed to the Docker command to ensure the sandbox user matches the host user's
permissions. This prevents the agent from creating root-owned files in the mounted workspace.

View File

@@ -1,6 +1,6 @@
---
title: Quick Start
description: Running OpenHands Cloud or running on your local system.
description: Running OpenHands on the cloud or your local desktop
icon: rocket
---
@@ -10,10 +10,164 @@ The easiest way to get started with OpenHands is on OpenHands Cloud, which comes
To get started with OpenHands Cloud, visit [app.all-hands.dev](https://app.all-hands.dev).
For more information see [getting started with OpenHands Cloud.](/usage/cloud/openhands-cloud)
You'll be prompted to connect with your GitHub or GitLab account:
## Running OpenHands on Your Own
1. Click `Log in with GitHub` or `Log in with GitLab`.
2. Review the permissions requested by OpenHands and authorize the application.
- OpenHands will require certain permissions from your account. To read more about these permissions,
you can click the `Learn more` link on the authorization page.
Run OpenHands on your local system and bring your own LLM and API key.
For more information see [running OpenHands on your own.](/usage/local-setup)
Once you've connected your account, you can:
- [Install GitHub Integration](/usage/cloud/github-installation) to use OpenHands with your GitHub repositories
- [Install GitLab Integration](/usage/cloud/gitlab-installation) to use OpenHands with your GitLab repositories
- [Access the Cloud UI](/usage/cloud/cloud-ui) to interact with the web interface
- [Use the Cloud API](/usage/cloud/cloud-api) to programmatically interact with OpenHands
- [Set up the Cloud Issue Resolver](/usage/cloud/cloud-issue-resolver) to automate code fixes and provide intelligent assistance
## Running OpenHands on your local desktop
### System Requirements
- MacOS with [Docker Desktop support](https://docs.docker.com/desktop/setup/install/mac-install/#system-requirements)
- Linux
- Windows with [WSL](https://learn.microsoft.com/en-us/windows/wsl/install) and [Docker Desktop support](https://docs.docker.com/desktop/setup/install/windows-install/#system-requirements)
A system with a modern processor and a minimum of **4GB RAM** is recommended to run OpenHands.
### Prerequisites
<AccordionGroup>
<Accordion title="MacOS">
**Docker Desktop**
1. [Install Docker Desktop on Mac](https://docs.docker.com/desktop/setup/install/mac-install).
2. Open Docker Desktop, go to `Settings > Advanced` and ensure `Allow the default Docker socket to be used` is enabled.
</Accordion>
<Accordion title="Linux">
<Note>
Tested with Ubuntu 22.04.
</Note>
**Docker Desktop**
1. [Install Docker Desktop on Linux](https://docs.docker.com/desktop/setup/install/linux/).
</Accordion>
<Accordion title="Windows">
**WSL**
1. [Install WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
2. Run `wsl --version` in PowerShell and confirm `Default Version: 2`.
**Docker Desktop**
1. [Install Docker Desktop on Windows](https://docs.docker.com/desktop/setup/install/windows-install).
2. Open Docker Desktop, go to `Settings` and confirm the following:
- General: `Use the WSL 2 based engine` is enabled.
- Resources > WSL Integration: `Enable integration with my default WSL distro` is enabled.
<Note>
The docker command below to start the app must be run inside the WSL terminal.
</Note>
</Accordion>
</AccordionGroup>
### Start the App
The easiest way to run OpenHands is in Docker.
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ~/.openhands-state:/.openhands-state \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.39
```
You'll find OpenHands running at http://localhost:3000!
You can also [connect OpenHands to your local filesystem](https://docs.all-hands.dev/modules/usage/runtimes/docker#connecting-to-your-filesystem),
run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
interact with it via a [friendly CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode),
or run it on tagged issues with [a GitHub action](https://docs.all-hands.dev/modules/usage/how-to/github-action).
### Setup
After launching OpenHands, you **must** select an `LLM Provider` and `LLM Model` and enter a corresponding `API Key`.
This can be done during the initial settings popup or by selecting the `Settings`
button (gear icon) in the UI.
If the required model does not exist in the list, in `Settings` under the `LLM` tab, you can toggle `Advanced` options
and manually enter it with the correct prefix in the `Custom Model` text box.
The `Advanced` options also allow you to specify a `Base URL` if required.
#### Getting an API Key
OpenHands requires an API key to access most language models. Here's how to get an API key from the recommended providers:
<AccordionGroup>
<Accordion title="Anthropic (Claude)">
1. [Create an Anthropic account](https://console.anthropic.com/).
2. [Generate an API key](https://console.anthropic.com/settings/keys).
3. [Set up billing](https://console.anthropic.com/settings/billing).
Consider setting usage limits to control costs.
</Accordion>
<Accordion title="OpenAI">
1. [Create an OpenAI account](https://platform.openai.com/).
2. [Generate an API key](https://platform.openai.com/api-keys).
3. [Set up billing](https://platform.openai.com/account/billing/overview).
</Accordion>
</AccordionGroup>
#### Setting Up Search Engine
OpenHands can be configured to use a search engine to allow the agent to search the web for information when needed.
Search functionality is enabled by default in OpenHands Cloud. No additional setup is required.
To enable search functionality in self-hosted OpenHands:
1. Get a Tavily API key from [tavily.com](https://tavily.com/)
2. Enter the API key in the Settings page under `LLM` tab, `Search API Key (Tavily)`
For more details, see the [Search Engine Setup](/usage/search-engine-setup) guide.
Now you're ready to [get started with OpenHands](./getting-started).
#### Versions
The [docker command above](./installation#start-the-app) pulls the most recent stable release of OpenHands. You have other options as well:
- For a specific release, replace `$VERSION` in `openhands:$VERSION` and `runtime:$VERSION`, with the version number.
We use SemVer so `0.9` will automatically point to the latest `0.9.x` release, and `0` will point to the latest `0.x.x` release.
- For the most up-to-date development version, replace `$VERSION` in `openhands:$VERSION` and `runtime:$VERSION`, with `main`.
This version is unstable and is recommended for testing or development purposes only.
For the development workflow, see [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
Are you having trouble? Check out our [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting).

View File

@@ -8,7 +8,7 @@ description: OpenHands uses LiteLLM to make calls to Google's chat models. You c
When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings under the `LLM` tab:
- `LLM Provider` to `Gemini`
- `LLM Model` to the model you will be using.
If the model is not in the list, enable `Advanced` options, and enter it in `Custom Model`
If the model is not in the list, enable `Advanced` options, and enter it in `Custom Model`
(e.g. gemini/&lt;model-name&gt; like `gemini/gemini-2.0-flash`).
- `API Key` to your Gemini API key
@@ -26,5 +26,5 @@ VERTEXAI_LOCATION="<your-gcp-location>"
Then set the following in the OpenHands UI through the Settings under the `LLM` tab:
- `LLM Provider` to `VertexAI`
- `LLM Model` to the model you will be using.
If the model is not in the list, enable `Advanced` options, and enter it in `Custom Model`
If the model is not in the list, enable `Advanced` options, and enter it in `Custom Model`
(e.g. vertex_ai/&lt;model-name&gt;).

View File

@@ -8,7 +8,7 @@ description: OpenHands uses LiteLLM to make calls to chat models on Groq. You ca
When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings under the `LLM` tab:
- `LLM Provider` to `Groq`
- `LLM Model` to the model you will be using. [Visit here to see the list of
models that Groq hosts](https://console.groq.com/docs/models). If the model is not in the list,
models that Groq hosts](https://console.groq.com/docs/models). If the model is not in the list,
enable `Advanced` options, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`).
- `API key` to your Groq API key. To find or create your Groq API Key, [see here](https://console.groq.com/keys).

View File

@@ -16,7 +16,7 @@ To use LiteLLM proxy with OpenHands, you need to:
## Supported Models
The supported models depend on your LiteLLM proxy configuration. OpenHands supports any model that your LiteLLM proxy
The supported models depend on your LiteLLM proxy configuration. OpenHands supports any model that your LiteLLM proxy
is configured to handle.
Refer to your LiteLLM proxy configuration for the list of available models and their names.

View File

@@ -14,28 +14,23 @@ recommendations for model selection. Our latest benchmarking results can be foun
Based on these findings and community feedback, these are the latest models that have been verified to work reasonably well with OpenHands:
### Cloud / API-Based Models
- [anthropic/claude-sonnet-4-20250514](https://www.anthropic.com/api) (recommended)
- [openai/o4-mini](https://openai.com/index/introducing-o3-and-o4-mini/)
- [gemini/gemini-2.5-pro](https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/)
- [deepseek/deepseek-chat](https://api-docs.deepseek.com/)
- [all-hands/openhands-lm-32b-v0.1](https://www.all-hands.dev/blog/introducing-openhands-lm-32b----a-strong-open-coding-agent-model) -- available through [OpenRouter](https://openrouter.ai/all-hands/openhands-lm-32b-v0.1)
If you have successfully run OpenHands with specific providers, we encourage you to open a PR to share your setup process
to help others using the same provider!
For a full list of the providers and models available, please consult the
[litellm documentation](https://docs.litellm.ai/docs/providers).
<Warning>
OpenHands will issue many prompts to the LLM you configure. Most of these LLMs cost money, so be sure to set spending
limits and monitor usage.
</Warning>
### Local / Self-Hosted Models
If you have successfully run OpenHands with specific providers, we encourage you to open a PR to share your setup process
to help others using the same provider!
- [mistralai/devstral-small](https://www.all-hands.dev/blog/devstral-a-new-state-of-the-art-open-model-for-coding-agents) (20 May 2025) -- also available through [OpenRouter](https://openrouter.ai/mistralai/devstral-small:free)
- [all-hands/openhands-lm-32b-v0.1](https://www.all-hands.dev/blog/introducing-openhands-lm-32b----a-strong-open-coding-agent-model) (31 March 2025) -- also available through [OpenRouter](https://openrouter.ai/all-hands/openhands-lm-32b-v0.1)
For a full list of the providers and models available, please consult the
[litellm documentation](https://docs.litellm.ai/docs/providers).
<Note>
Most current local and open source models are not as powerful. When using such models, you may see long

View File

@@ -23,7 +23,7 @@ We recommend using [LMStudio](https://lmstudio.ai/) for serving these models loc
- Option 2: Download a LLM in GGUF format. For example, to download [Devstral Small 2505 GGUF](https://huggingface.co/mistralai/Devstral-Small-2505_gguf), using `huggingface-cli download mistralai/Devstral-Small-2505_gguf --local-dir mistralai/Devstral-Small-2505_gguf`. Then in bash terminal, run `lms import {model_name}` in the directory where you've downloaded the model checkpoint (e.g. run `lms import devstralQ4_K_M.gguf` in `mistralai/Devstral-Small-2505_gguf`)
3. Open the LM Studio application, switch to `power user` mode, and then open the developer tab:
![image](./screenshots/1_select_power_user.png)
4. Then click `Select a model to load` on top of the application:
@@ -48,33 +48,31 @@ We recommend using [LMStudio](https://lmstudio.ai/) for serving these models loc
### Start OpenHands with locally served model
Check [the installation guide](/usage/local-setup) to make sure you have all the prerequisites for running OpenHands.
Check [the installation guide](https://docs.all-hands.dev/modules/usage/installation) to make sure you have all the prerequisites for running OpenHands.
```bash
export LMSTUDIO_MODEL_NAME="imported-models/uncategorized/devstralq4_k_m.gguf" # <- Replace this with the model name you copied from LMStudio
export LMSTUDIO_URL="http://host.docker.internal:1234" # <- Replace this with the port from LMStudio
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.44-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik
mkdir -p ~/.openhands && echo '{"language":"en","agent":"CodeActAgent","max_iterations":null,"security_analyzer":null,"confirmation_mode":false,"llm_model":"lm_studio/'$LMSTUDIO_MODEL_NAME'","llm_api_key":"dummy","llm_base_url":"'$LMSTUDIO_URL/v1'","remote_runtime_resource_factor":null,"github_token":null,"enable_default_condenser":true,"user_consents_to_analytics":true}' > ~/.openhands/settings.json
mkdir -p ~/.openhands-state && echo '{"language":"en","agent":"CodeActAgent","max_iterations":null,"security_analyzer":null,"confirmation_mode":false,"llm_model":"lm_studio/'$LMSTUDIO_MODEL_NAME'","llm_api_key":"dummy","llm_base_url":"'$LMSTUDIO_URL/v1'","remote_runtime_resource_factor":null,"github_token":null,"enable_default_condenser":true,"user_consents_to_analytics":true}' > ~/.openhands-state/settings.json
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.44-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.39-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ~/.openhands:/.openhands \
-v ~/.openhands-state:/.openhands-state \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.44
docker.all-hands.dev/all-hands-ai/openhands:0.39
```
> **Note**: If you used OpenHands before version 0.44, you may want to run `mv ~/.openhands-state ~/.openhands` to migrate your conversation history to the new location.
Once your server is running, you can visit `http://localhost:3000` in your browser to use OpenHands with the local Devstral model:
```
Digest: sha256:e72f9baecb458aedb9afc2cd5bc935118d1868719e55d50da73190d3a85c674f
Status: Image is up to date for docker.all-hands.dev/all-hands-ai/openhands:0.44
Status: Image is up to date for docker.all-hands.dev/all-hands-ai/openhands:0.39
Starting OpenHands...
Running OpenHands as root
14:22:13 - openhands:INFO: server_config.py:50 - Using config class None
@@ -128,18 +126,6 @@ vllm serve all-hands/openhands-lm-32b-v0.1 \
--enable-prefix-caching
```
### Create an OpenAI-Compatible Endpoint with Ollama
- Install Ollama following [the official documentation](https://ollama.com/download).
- For Ollama configuration, use `ollama/<modelname>` as the custom model in the web UI. The API key can also be set to `ollama`.
- Example launch command for Devstral LM 24B:
```bash
OLLAMA_CONTEXT_LENGTH=32768 OLLAMA_HOST=0.0.0.0:11434 OLLAMA_KEEP_ALIVE=-1 nohup ollama serve&
#The minimum context size is ~8196, even the system prompt won't fit smaller
ollama pull devstral:latest
```
## Advanced: Run and Configure OpenHands
### Run OpenHands
@@ -166,7 +152,7 @@ Start OpenHands using `make run`.
### Configure OpenHands
Once OpenHands is running, you'll need to set the following in the OpenHands UI through the Settings under the `LLM` tab:
Once OpenHands is running, you'll need to set the following in the OpenHands UI through the Settings under the `LLM` tab:
1. Enable `Advanced` options.
2. Set the following:
- `Custom Model` to `openai/<served-model-name>` (e.g. `openai/openhands-lm-32b-v0.1`)

View File

@@ -9,6 +9,6 @@ When running OpenHands, you'll need to set the following in the OpenHands UI thr
* `LLM Provider` to `OpenRouter`
* `LLM Model` to the model you will be using.
[Visit here to see a full list of OpenRouter models](https://openrouter.ai/models).
If the model is not in the list, enable `Advanced` options, and enter it in
If the model is not in the list, enable `Advanced` options, and enter it in
`Custom Model` (e.g. openrouter/&lt;model-name&gt; like `openrouter/anthropic/claude-3.5-sonnet`).
* `API Key` to your OpenRouter API key.

View File

@@ -1,171 +0,0 @@
---
title: Getting Started
description: Getting started with running OpenHands on your own.
---
## Recommended Methods for Running OpenHands on Your Local System
### System Requirements
- MacOS with [Docker Desktop support](https://docs.docker.com/desktop/setup/install/mac-install/#system-requirements)
- Linux
- Windows with [WSL](https://learn.microsoft.com/en-us/windows/wsl/install) and [Docker Desktop support](https://docs.docker.com/desktop/setup/install/windows-install/#system-requirements)
- Windows without WSL (see [Windows Without WSL Guide](/usage/windows-without-wsl))
A system with a modern processor and a minimum of **4GB RAM** is recommended to run OpenHands.
### Prerequisites
<AccordionGroup>
<Accordion title="MacOS">
**Docker Desktop**
1. [Install Docker Desktop on Mac](https://docs.docker.com/desktop/setup/install/mac-install).
2. Open Docker Desktop, go to `Settings > Advanced` and ensure `Allow the default Docker socket to be used` is enabled.
</Accordion>
<Accordion title="Linux">
<Note>
Tested with Ubuntu 22.04.
</Note>
**Docker Desktop**
1. [Install Docker Desktop on Linux](https://docs.docker.com/desktop/setup/install/linux/).
</Accordion>
<Accordion title="Windows">
**WSL**
1. [Install WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
2. Run `wsl --version` in PowerShell and confirm `Default Version: 2`.
**Docker Desktop**
1. [Install Docker Desktop on Windows](https://docs.docker.com/desktop/setup/install/windows-install).
2. Open Docker Desktop, go to `Settings` and confirm the following:
- General: `Use the WSL 2 based engine` is enabled.
- Resources > WSL Integration: `Enable integration with my default WSL distro` is enabled.
<Note>
The docker command below to start the app must be run inside the WSL terminal.
</Note>
**Alternative: Windows without WSL**
If you prefer to run OpenHands on Windows without WSL or Docker, see our [Windows Without WSL Guide](/usage/windows-without-wsl).
</Accordion>
</AccordionGroup>
### Start the App
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.44-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.44-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ~/.openhands:/.openhands \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.44
```
> **Note**: If you used OpenHands before version 0.44, you may want to run `mv ~/.openhands-state ~/.openhands` to migrate your conversation history to the new location.
You'll find OpenHands running at http://localhost:3000!
### Setup
After launching OpenHands, you **must** select an `LLM Provider` and `LLM Model` and enter a corresponding `API Key`.
This can be done during the initial settings popup or by selecting the `Settings`
button (gear icon) in the UI.
If the required model does not exist in the list, in `Settings` under the `LLM` tab, you can toggle `Advanced` options
and manually enter it with the correct prefix in the `Custom Model` text box.
The `Advanced` options also allow you to specify a `Base URL` if required.
#### Getting an API Key
OpenHands requires an API key to access most language models. Here's how to get an API key from the recommended providers:
<AccordionGroup>
<Accordion title="Anthropic (Claude)">
1. [Create an Anthropic account](https://console.anthropic.com/).
2. [Generate an API key](https://console.anthropic.com/settings/keys).
3. [Set up billing](https://console.anthropic.com/settings/billing).
</Accordion>
<Accordion title="OpenAI">
1. [Create an OpenAI account](https://platform.openai.com/).
2. [Generate an API key](https://platform.openai.com/api-keys).
3. [Set up billing](https://platform.openai.com/account/billing/overview).
</Accordion>
<Accordion title="Google (Gemini)">
1. Create a Google account if you don't already have one.
2. [Generate an API key](https://aistudio.google.com/apikey).
3. [Set up billing](https://aistudio.google.com/usage?tab=billing).
</Accordion>
<Accordion title="Local LLM (e.g. LM Studio, llama.cpp, Ollama)">
If your local LLM server isn't behind an authentication proxy, you can enter any value as the API key (e.g. `local-key`, `test123`) — it won't be used.
</Accordion>
</AccordionGroup>
Consider setting usage limits to control costs.
#### Using a Local LLM
<Note>
Effective use of local models for agent tasks requires capable hardware, along with models specifically tuned for instruction-following and agent-style behavior.
</Note>
To run OpenHands with a locally hosted language model instead of a cloud provider, see the [Local LLMs guide](/usage/llms/local-llms) for setup instructions.
#### Setting Up Search Engine
OpenHands can be configured to use a search engine to allow the agent to search the web for information when needed.
To enable search functionality in OpenHands:
1. Get a Tavily API key from [tavily.com](https://tavily.com/).
2. Enter the Tavily API key in the Settings page under `LLM` tab > `Search API Key (Tavily)`
For more details, see the [Search Engine Setup](/usage/search-engine-setup) guide.
Now you're ready to [get started with OpenHands](/usage/getting-started).
### Versions
The [docker command above](/usage/local-setup#start-the-app) pulls the most recent stable release of OpenHands. You have other options as well:
- For a specific release, replace `$VERSION` in `openhands:$VERSION` and `runtime:$VERSION`, with the version number.
For example, `0.9` will automatically point to the latest `0.9.x` release, and `0` will point to the latest `0.x.x` release.
- For the most up-to-date development version, replace `$VERSION` in `openhands:$VERSION` and `runtime:$VERSION`, with `main`.
This version is unstable and is recommended for testing or development purposes only.
## Next Steps
- [Connect OpenHands to your local filesystem](/usage/runtimes/docker#connecting-to-your-filesystem).
- [Run OpenHands in a scriptable headless mode.](/usage/how-to/headless-mode)
- [Run OpenHands with a friendly CLI.](/usage/how-to/cli-mode)
- [Run OpenHands on tagged issues with a GitHub action.](/usage/how-to/github-action)

View File

@@ -5,7 +5,7 @@ description: Organizations and users can define microagents that apply to all re
## Usage
These microagents can be [any type of microagent](./microagents-overview#microagent-types) and will be loaded
These microagents can be [any type of microagent](./microagents-overview#microagent-types) and will be loaded
accordingly. However, they are applied to all repositories belonging to the organization or user.
Add a `.openhands` repository under the organization or user and create a `microagents` directory and place the

View File

@@ -11,7 +11,7 @@ Currently OpenHands supports the following types of microagents:
- [Keyword-Triggered Microagents](./microagents-keyword): Guidelines activated by specific keywords in prompts.
To customize OpenHands' behavior, create a .openhands/microagents/ directory in the root of your repository and
add `<microagent_name>.md` files inside. For repository-specific guidelines, you can ask OpenHands to analyze your repository and create a comprehensive `repo.md` file (see [General Microagents](./microagents-repo) for details).
add `<microagent_name>.md` files inside.
<Note>
Loaded microagents take up space in the context window.

View File

@@ -17,45 +17,13 @@ Frontmatter should be enclosed in triple dashes (---) and may include the follow
|-----------|-----------------------------------------|----------|----------------|
| `agent` | The agent this microagent applies to | No | 'CodeActAgent' |
## Creating a Comprehensive Repository Agent
To create an effective repository agent, you can ask OpenHands to analyze your repository with a prompt like:
## Example
General microagent file example located at `.openhands/microagents/repo.md`:
```
Please browse the repository, look at the documentation and relevant code, and understand the purpose of this repository.
Specifically, I want you to create a `.openhands/microagents/repo.md` file. This file should contain succinct information that summarizes:
1. The purpose of this repository
2. The general setup of this repo
3. A brief description of the structure of this repo
Read all the GitHub workflows under .github/ of the repository (if this folder exists) to understand the CI checks (e.g., linter, pre-commit), and include those in the repo.md file.
```
This approach helps OpenHands capture repository context efficiently, reducing the need for repeated searches during conversations and ensuring more accurate solutions.
## Example Content
A comprehensive repository agent file (`.openhands/microagents/repo.md`) should include:
```
# Repository Purpose
This project is a TODO application that allows users to track TODO items.
# Setup Instructions
To set it up, you can run `npm run build`.
# Repository Structure
- `/src`: Core application code
- `/tests`: Test suite
- `/docs`: Documentation
- `/.github`: CI/CD workflows
# CI/CD Workflows
- `lint.yml`: Runs ESLint on all JavaScript files
- `test.yml`: Runs the test suite on pull requests
# Development Guidelines
Always make sure the tests are passing before committing changes. You can run the tests by running `npm run test`.
```

View File

@@ -15,7 +15,7 @@ Before using the Local Runtime, ensure that:
1. You can run OpenHands using the [Development workflow](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
2. For Linux and Mac, tmux is available on your system.
3. For Windows, PowerShell is available on your system.
- Only [CLI mode](../how-to/cli-mode) and [headless mode](../how-to/headless-mode) are supported in Windows with Local Runtime.
- Only [CLI mode](../how-to/cli-mode) and [headless mode](../how-to/headless-mode) are supported in Windows with Local Runtime.
## Configuration

View File

@@ -31,9 +31,9 @@ On initial prompt, an error is seen with `Permission Denied` or `PermissionError
**Resolution**
* Check if the `~/.openhands` is owned by `root`. If so, you can:
* Change the directory's ownership: `sudo chown <user>:<user> ~/.openhands`.
* or update permissions on the directory: `sudo chmod 777 ~/.openhands`
* Check if the `~/.openhands-state` is owned by `root`. If so, you can:
* Change the directory's ownership: `sudo chown <user>:<user> ~/.openhands-state`.
* or update permissions on the directory: `sudo chmod 777 ~/.openhands-state`
* or delete it if you don't need previous data. OpenHands will recreate it. You'll need to re-enter LLM settings.
* If mounting a local directory, ensure your `WORKSPACE_BASE` has the necessary permissions for the user running
OpenHands.
@@ -56,16 +56,13 @@ To fix this:
-e SANDBOX_VSCODE_PORT=41234 \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:latest \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ~/.openhands:/.openhands \
-v ~/.openhands-state:/.openhands-state \
-p 3000:3000 \
-p 41234:41234 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:latest
```
> **Note**: If you used OpenHands before version 0.44, you may want to run `mv ~/.openhands-state ~/.openhands` to migrate your conversation history to the new location.
2. Make sure to expose the same port with `-p 41234:41234` in your Docker command.
3. If running with the development workflow, you can set this in your `config.toml` file:
```toml

View File

@@ -1,200 +0,0 @@
---
title: Windows Without WSL
description: Running OpenHands GUI on Windows without using WSL or Docker
---
# Running OpenHands GUI on Windows Without WSL
This guide provides step-by-step instructions for running OpenHands on a Windows machine without using WSL or Docker.
## Prerequisites
1. **Windows 10/11** - A modern Windows operating system
2. **PowerShell 7+** - While Windows PowerShell comes pre-installed on Windows 10/11, PowerShell 7+ is strongly recommended to avoid compatibility issues (see Troubleshooting section for "System.Management.Automation" errors)
3. **.NET Core Runtime** - Required for the PowerShell integration via pythonnet
4. **Python 3.12 or 3.13** - Python 3.12 or 3.13 is required (Python 3.14 is not supported due to pythonnet compatibility)
5. **Git** - For cloning the repository and version control
6. **Node.js and npm** - For running the frontend
## Step 1: Install Required Software
1. **Install Python 3.12 or 3.13**
- Download Python 3.12.x or 3.13.x from [python.org](https://www.python.org/downloads/)
- During installation, check "Add Python to PATH"
- Verify installation by opening PowerShell and running:
```powershell
python --version
```
2. **Install PowerShell 7**
- Download and install PowerShell 7 from the [official PowerShell GitHub repository](https://github.com/PowerShell/PowerShell/releases)
- Choose the MSI installer appropriate for your system (x64 for most modern computers)
- Run the installer with default options
- Verify installation by opening a new terminal and running:
```powershell
pwsh --version
```
- Using PowerShell 7 (pwsh) instead of Windows PowerShell will help avoid "System.Management.Automation" errors
3. **Install .NET Core Runtime**
- Download and install the .NET Core Runtime from [Microsoft's .NET download page](https://dotnet.microsoft.com/download)
- Choose the latest .NET Core Runtime (not SDK)
- Verify installation by opening PowerShell and running:
```powershell
dotnet --info
```
- This step is required for the PowerShell integration via pythonnet. Without it, OpenHands will fall back to a more limited PowerShell implementation.
4. **Install Git**
- Download Git from [git-scm.com](https://git-scm.com/download/win)
- Use default installation options
- Verify installation:
```powershell
git --version
```
5. **Install Node.js and npm**
- Download Node.js from [nodejs.org](https://nodejs.org/) (LTS version recommended)
- During installation, accept the default options which will install npm as well
- Verify installation:
```powershell
node --version
npm --version
```
6. **Install Poetry**
- Open PowerShell as Administrator and run:
```powershell
(Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -
```
- Add Poetry to your PATH:
```powershell
$env:Path += ";$env:APPDATA\Python\Scripts"
```
- Verify installation:
```powershell
poetry --version
```
## Step 2: Clone and Set Up OpenHands
1. **Clone the Repository**
```powershell
git clone https://github.com/All-Hands-AI/OpenHands.git
cd OpenHands
```
2. **Install Dependencies**
```powershell
poetry install
```
This will install all required dependencies, including:
- pythonnet - Required for Windows PowerShell integration
- All other OpenHands dependencies
## Step 3: Run OpenHands
1. **Build the Frontend**
```powershell
cd frontend
npm install
npm run build
cd ..
```
This will build the frontend files that the backend will serve.
2. **Start the Backend**
```powershell
# Make sure to use PowerShell 7 (pwsh) instead of Windows PowerShell
pwsh
$env:RUNTIME="local"; poetry run uvicorn openhands.server.listen:app --host 0.0.0.0 --port 3000 --reload --reload-exclude "./workspace"
```
This will start the OpenHands app using the local runtime with PowerShell integration, available at `localhost:3000`.
> **Note**: If you encounter a `RuntimeError: Directory './frontend/build' does not exist` error, make sure you've built the frontend first using the command above.
> **Important**: Using PowerShell 7 (pwsh) instead of Windows PowerShell is recommended to avoid "System.Management.Automation" errors. If you encounter this error, see the Troubleshooting section below.
3. **Alternatively, Run the Frontend in Development Mode (in a separate PowerShell window)**
```powershell
cd frontend
npm run dev
```
4. **Access the OpenHands GUI**
Open your browser and navigate to:
```
http://localhost:3000
```
> **Note**: If you're running the frontend in development mode (using `npm run dev`), use port 3001 instead: `http://localhost:3001`
## Limitations on Windows
When running OpenHands on Windows without WSL or Docker, be aware of the following limitations:
1. **Browser Tool Not Supported**: The browser tool is not currently supported on Windows.
2. **.NET Core Requirement**: The PowerShell integration requires .NET Core Runtime to be installed. If .NET Core is not available, OpenHands will automatically fall back to a more limited PowerShell implementation with reduced functionality.
3. **Interactive Shell Commands**: Some interactive shell commands may not work as expected. The PowerShell session implementation has limitations compared to the bash session used on Linux/macOS.
4. **Path Handling**: Windows uses backslashes (`\`) in paths, which may require adjustments when working with code examples designed for Unix-like systems.
## Troubleshooting
### "System.Management.Automation" Not Found Error
If you encounter an error message stating that "System.Management.Automation" was not found, this typically indicates that you have a minimal version of PowerShell installed or that the .NET components required for PowerShell integration are missing.
> **IMPORTANT**: This error is most commonly caused by using the built-in Windows PowerShell (powershell.exe) instead of PowerShell 7 (pwsh.exe). Even if you installed PowerShell 7 during the prerequisites, you may still be using the older Windows PowerShell by default.
To resolve this issue:
1. **Install the latest version of PowerShell 7** from the official Microsoft repository:
- Visit [https://github.com/PowerShell/PowerShell/releases](https://github.com/PowerShell/PowerShell/releases)
- Download and install the latest MSI package for your system architecture (x64 for most systems)
- During installation, ensure you select the following options:
- "Add PowerShell to PATH environment variable"
- "Register Windows PowerShell 7 as the default shell"
- "Enable PowerShell remoting"
- The installer will place PowerShell 7 in `C:\Program Files\PowerShell\7` by default
2. **Restart your terminal or command prompt** to ensure the new PowerShell is available
3. **Verify the installation** by running:
```powershell
pwsh --version
```
You should see output indicating PowerShell 7.x.x
4. **Run OpenHands using PowerShell 7** instead of Windows PowerShell:
```powershell
pwsh
cd path\to\openhands
$env:RUNTIME="local"; poetry run uvicorn openhands.server.listen:app --host 0.0.0.0 --port 3000 --reload --reload-exclude "./workspace"
```
> **Note**: Make sure you're explicitly using `pwsh` (PowerShell 7) and not `powershell` (Windows PowerShell). The command prompt or terminal title should say "PowerShell 7" rather than just "Windows PowerShell".
5. **If the issue persists**, ensure that you have the .NET Runtime installed:
- Download and install the latest .NET Runtime from [Microsoft's .NET download page](https://dotnet.microsoft.com/download)
- Choose ".NET Runtime" (not SDK) version 6.0 or later
- After installation, verify it's properly installed by running:
```powershell
dotnet --info
```
- Restart your computer after installation
- Try running OpenHands again
6. **Ensure that the .NET Framework is properly installed** on your system:
- Go to Control Panel > Programs > Programs and Features > Turn Windows features on or off
- Make sure ".NET Framework 4.8 Advanced Services" is enabled
- Click OK and restart if prompted
This error occurs because OpenHands uses the pythonnet package to interact with PowerShell, which requires the System.Management.Automation assembly from the .NET framework. A minimal PowerShell installation or older Windows PowerShell (rather than PowerShell 7+) might not include all the necessary components for this integration.

View File

@@ -71,27 +71,10 @@ EVAL_CONDENSER=summarizer_for_eval \
The name is up to you, but should match a name defined in your `config.toml` file. The last argument in the command specifies the condenser configuration to use. In this case, `summarizer_for_eval` is used, which refers to the LLM-based summarizing condenser as defined above.
If no condenser configuration is specified, the 'noop' condenser will be used by default, which keeps the full conversation history.
```
For other configurations specific to evaluation, such as `save_trajectory_path`, these are typically set in the `get_config` function of the respective `run_infer.py` file for each benchmark.
### Enabling LLM-Based Editor Tools
The LLM-Based Editor tool (currently supported only for SWE-Bench) can be enabled by setting:
```bash
export ENABLE_LLM_EDITOR=true
```
You can set the config for the Editor LLM as:
```toml
[llm.draft_editor]
base_url = "http://localhost:9002/v1"
model = "hosted_vllm/lite_coder_qwen_editor_3B"
api_key = ""
temperature = 0.7
max_input_tokens = 10500
max_output_tokens = 10500
```
## Supported Benchmarks
The OpenHands evaluation harness supports a wide variety of benchmarks across [software engineering](#software-engineering), [web browsing](#web-browsing), [miscellaneous assistance](#misc-assistance), and [real-world](#real-world) tasks.

View File

@@ -144,7 +144,7 @@ if __name__ == '__main__':
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
# modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
# modify_params must be False for evaluation purpose, for reproducibility and accurancy of results
llm_config.modify_params = False
if llm_config is None:

View File

@@ -1 +0,0 @@
data/

View File

@@ -6,13 +6,6 @@ This folder contains evaluation harness for evaluating agents on the [GAIA bench
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.
To enable the Tavily MCP Server, you can add the Tavily API key under the `core` section of your `config.toml` file, like below:
```toml
[core]
search_api_key = "tvly-******"
```
## Run the evaluation
We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA).

View File

@@ -1,5 +1,4 @@
import asyncio
import copy
import functools
import os
import re
@@ -7,7 +6,6 @@ import re
import huggingface_hub
import pandas as pd
from datasets import load_dataset
from pydantic import SecretStr
from evaluation.benchmarks.gaia.scorer import question_scorer
from evaluation.utils.shared import (
@@ -26,7 +24,6 @@ from openhands.core.config import (
OpenHandsConfig,
get_llm_config_arg,
get_parser,
load_from_toml,
)
from openhands.core.config.utils import get_agent_config_arg
from openhands.core.logger import openhands_logger as logger
@@ -44,7 +41,7 @@ AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
}
AGENT_CLS_TO_INST_SUFFIX = {
'CodeActAgent': 'When you think you have solved the question, please use the finish tool and include your final answer in the message parameter of the finish tool. Your final answer MUST be encapsulated within <solution> and </solution>.\n'
'CodeActAgent': 'When you think you have solved the question, please first send your answer to user through message and then exit.\n'
}
@@ -52,7 +49,7 @@ def get_config(
metadata: EvalMetadata,
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'nikolaik/python-nodejs:python3.12-nodejs22'
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
@@ -70,11 +67,6 @@ def get_config(
logger.info('Agent config not provided, using default settings')
agent_config = config.get_agent_config(metadata.agent_class)
agent_config.enable_prompt_extensions = False
config_copy = copy.deepcopy(config)
load_from_toml(config_copy)
if config_copy.search_api_key:
config.search_api_key = SecretStr(config_copy.search_api_key)
return config
@@ -142,26 +134,16 @@ def process_instance(
dest_file = None
# Prepare instruction
instruction = """You have one question to answer. It is paramount that you provide a correct answer.
Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find the correct answer (the answer does exist). Failure or 'I cannot answer' or 'None found' will not be tolerated, success will be rewarded.
You must make sure you find the correct answer! You MUST strictly follow the task-specific formatting instructions for your final answer.
Here is the task:
{task_question}
""".format(
task_question=instance['Question'],
)
instruction = f'{instance["Question"]}\n'
logger.info(f'Instruction: {instruction}')
if dest_file:
instruction += f'\n\nThe mentioned file is provided in the workspace at: {dest_file.split("/")[-1]}'
instruction += """IMPORTANT: When seeking information from a website, REFRAIN from arbitrary URL navigation. You should utilize the designated search engine tool with precise keywords to obtain relevant URLs or use the specific website's search interface. DO NOT navigate directly to specific URLs as they may not exist.\n\nFor example: if you want to search for a research paper on Arxiv, either use the search engine tool with specific keywords or navigate to arxiv.org and then use its interface.\n"""
instruction += 'IMPORTANT: You should NEVER ask for Human Help.\n'
instruction += 'IMPORTANT: Please encapsulate your final answer (answer ONLY) within <solution> and </solution>. Your answer will be evaluated using string matching approaches so it important that you STRICTLY adhere to the output formatting instructions specified in the task (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)\n'
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
instruction += (
'For example: The answer to the question is <solution> 42 </solution>.\n'
)
instruction += "IMPORTANT: Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, express it numerically (i.e., with digits rather than words), do not use commas, and do not include units such as $ or percent signs unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities). If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.\n"
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(metadata.agent_class, '')
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
@@ -193,7 +175,7 @@ Here is the task:
for event in reversed(state.history):
if event.source == 'agent':
if isinstance(event, AgentFinishAction):
model_answer_raw = event.final_thought
model_answer_raw = event.thought
break
elif isinstance(event, CmdRunAction):
model_answer_raw = event.thought
@@ -240,7 +222,6 @@ Here is the task:
error=state.last_error if state and state.last_error else None,
test_result=test_result,
)
runtime.close()
return output
@@ -272,8 +253,6 @@ if __name__ == '__main__':
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
toml_config = OpenHandsConfig()
load_from_toml(toml_config)
metadata = make_metadata(
llm_config=llm_config,
dataset_name='gaia',
@@ -282,10 +261,7 @@ if __name__ == '__main__':
eval_note=args.eval_note,
eval_output_dir=args.eval_output_dir,
data_split=args.data_split,
details={
'gaia-level': args.level,
'mcp-servers': ['tavily'] if toml_config.search_api_key else [],
},
details={'gaia-level': args.level},
agent_config=agent_config,
)

View File

@@ -39,7 +39,7 @@ echo "LEVELS: $LEVELS"
COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 60 \
--max-iterations 30 \
--level $LEVELS \
--data-split validation \
--eval-num-workers $NUM_WORKERS \

View File

@@ -223,7 +223,7 @@ if __name__ == '__main__':
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
# modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
# modify_params must be False for evaluation purpose, for reproducibility and accurancy of results
llm_config.modify_params = False
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

View File

@@ -2,8 +2,6 @@
This folder contains the evaluation harness that we built on top of the original [SWE-Bench benchmark](https://www.swebench.com/) ([paper](https://arxiv.org/abs/2310.06770)).
**UPDATE (6/15/2025): We now support running SWE-bench-Live evaluation (see the paper [here](https://arxiv.org/abs/2505.23419))! For how to run it, check out [this README](./SWE-bench-Live.md).**
**UPDATE (5/26/2025): We now support running interactive SWE-Bench evaluation (see the paper [here](https://arxiv.org/abs/2502.13069))! For how to run it, check out [this README](./SWE-Interact.md).**
**UPDATE (4/8/2025): We now support running SWT-Bench evaluation! For more details, check out [the corresponding section](#SWT-Bench-Evaluation).**

View File

@@ -1,65 +0,0 @@
# SWE-bench-Live
<p align="center">
<a href="https://arxiv.org/abs/2505.23419">📃 Paper</a>
<a href="https://huggingface.co/SWE-bench-Live" >🤗 HuggingFace</a>
<a href="https://SWE-bench-Live.github.io" >📊 Leaderboard</a>
</p>
SWE-bench-Live is a live benchmark for issue resolving, providing a dataset that contains the latest issue tasks. This document explains how to run the evaluation of OpenHands on SWE-bench-Live.
Since SWE-bench-Live has an almost identical setting to SWE-bench, you only need to change the dataset name to `SWE-bench-Live/SWE-bench-Live`; everything else is essentially the same as running on SWE-bench.
## Setting Up
Set up the development environment and configure your LLM provider by following the [README](README.md).
## Running Inference
Use the same script, but change the dataset name to `SWE-bench-Live/SWE-bench-Live` and select the split (either `lite` or `full`). The lite split contains 300 instances from the past six months, while the full split includes 1,319 instances created after 2024.
```shell
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
```
In the original SWE-bench-Live paper, max_iterations is set to 100.
```shell
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.your_llm HEAD CodeActAgent 300 100 3 SWE-bench-Live/SWE-bench-Live lite
```
## Evaluating Results
After OpenHands generates patch results for each issue, we evaluate the results using the [SWE-bench-Live evaluation harness](https://github.com/microsoft/SWE-bench-Live).
Convert to the format of predictions for SWE benchmarks:
```shell
# You can find output.jsonl in evaluation/evaluation_outputs
python evaluation/benchmarks/swe_bench/scripts/live/convert.py --output_jsonl [path/to/evaluation/output.jsonl] > preds.jsonl
```
Please refer to the original [SWE-bench-Live repository](https://github.com/microsoft/SWE-bench-Live) to set up the evaluation harness and use the provided scripts to generate the evaluation report:
```shell
python -m swebench.harness.run_evaluation \
--dataset_name SWE-bench-Live/SWE-bench-Live \
--split lite \
--namespace starryzhang \
--predictions_path preds.jsonl \
--max_workers 10 \
--run_id openhands
```
## Citation
```bibtex
@article{zhang2025swebenchgoeslive,
title={SWE-bench Goes Live!},
author={Linghao Zhang and Shilin He and Chaoyun Zhang and Yu Kang and Bowen Li and Chengxing Xie and Junhao Wang and Maoquan Wang and Yufan Huang and Shengyu Fu and Elsie Nallipogu and Qingwei Lin and Yingnong Dang and Saravan Rajmohan and Dongmei Zhang},
journal={arXiv preprint arXiv:2505.23419},
year={2025}
}
```

View File

@@ -1,80 +0,0 @@
from typing import Any
import pandas as pd
from evaluation.utils.shared import assert_and_raise
from openhands.core.logger import openhands_logger as logger
from openhands.events.action import CmdRunAction
from openhands.events.observation import (
CmdOutputObservation,
ErrorObservation,
)
from openhands.runtime.base import Runtime
from openhands.utils.shutdown_listener import sleep_if_should_continue
def _run_checked(runtime: Runtime, command: str, failure_label: str) -> None:
    """Run one shell command in the runtime and fail loudly if it does not succeed.

    Args:
        runtime: Runtime in which the command is executed.
        command: Shell command line to run.
        failure_label: Text inserted after "Failed to " in the error message.

    The command is given a hard timeout of 600 and both the action and the
    resulting observation are logged. `assert_and_raise` is invoked with a
    false condition when the observation is not a `CmdOutputObservation` with
    exit code 0 (presumably raising -- confirm against evaluation.utils.shared).
    """
    action = CmdRunAction(command=command)
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to {failure_label}: {str(obs)}',
    )


def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,
) -> dict[str, Any]:
    """Complete the runtime and export the git patch for SWE-bench-Live.

    Args:
        runtime: Runtime holding the checked-out repository under
            `/workspace/<instance_id>`.
        instance: Benchmark row; `instance.instance_id` and
            `instance["base_commit"]` are read.

    Returns:
        A dict with a single key, `'git_patch'`, holding the stripped output of
        `git diff --no-color --cached <base_commit>`.

    The function changes into the workspace directory, disables the git pager,
    stages every change, and then tries up to five times to read the staged
    diff against the base commit.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Completion Fn')
    logger.info('-' * 30)

    workspace_dir_name = instance.instance_id

    # Preparation steps: each one must succeed before the diff is taken.
    _run_checked(
        runtime,
        f'cd /workspace/{workspace_dir_name}',
        f'cd to /workspace/{workspace_dir_name}',
    )
    # An empty pager keeps git from waiting on interactive paging.
    _run_checked(
        runtime,
        'git config --global core.pager ""',
        'git config --global core.pager ""',
    )
    # Stage everything so that new and deleted files show up in `--cached`.
    _run_checked(runtime, 'git add -A', 'git add -A')

    n_retries = 0
    git_patch = None
    while n_retries < 5:
        action = CmdRunAction(
            command=f'git diff --no-color --cached {instance["base_commit"]}',
        )
        # Each attempt gets a slightly longer timeout than the previous one.
        action.set_hard_timeout(100 + 10 * n_retries)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        n_retries += 1
        if isinstance(obs, CmdOutputObservation):
            if obs.exit_code == 0:
                git_patch = obs.content.strip()
                break
            else:
                logger.info('Failed to get git diff, retrying...')
                # NOTE(review): presumably pauses ~10s unless shutdown was requested -- confirm.
                sleep_if_should_continue(10)
        elif isinstance(obs, ErrorObservation):
            logger.error(f'Error occurred: {obs.content}. Retrying...')
            sleep_if_should_continue(10)
        else:
            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')

    # All attempts exhausted without a successful diff.
    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')

    logger.info('-' * 30)
    logger.info('END Runtime Completion Fn')
    logger.info('-' * 30)
    return {'git_patch': git_patch}

View File

@@ -1,4 +1,4 @@
TASK_INSTRUECTION = """
TASK_INSTRUECTION="""
Given the following GitHub problem description, your objective is to localize the specific files, classes or functions, and lines of code that need modification or contain key information to resolve the issue.
Follow these steps to localize the issue:
@@ -66,4 +66,4 @@ FAKE_USER_MSG_FOR_LOC = (
'Verify that you have carefully analyzed the impact of the found locations on the repository, especially their dependencies. '
'If you think you have solved the task, please send your final answer (including the former answer and reranking) to user through message and then call `finish` to finish.\n'
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
)
)

View File

@@ -1,65 +0,0 @@
<uploaded_files>
/workspace/{{ workspace_dir_name }}
</uploaded_files>
I've uploaded a python code repository in the directory {{ workspace_dir_name }}. Consider the following issue description:
<issue_description>
{{ instance.problem_statement }}
</issue_description>
Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?
I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.
Your task is to make the minimal changes to non-test files in the /workspace/{{ workspace_dir_name }} directory to ensure the <issue_description> is satisfied.
Follow these phases to resolve the issue:
Phase 1. READING: read the problem and reword it in clearer terms
1.1 If there are code or config snippets, express in words any best practices or conventions in them.
1.2 Highlight message errors, method names, variables, file names, stack traces, and technical details.
1.3 Explain the problem in clear terms.
1.4 Enumerate the steps to reproduce the problem.
1.5 Highlight any best practices to take into account when testing and fixing the issue
Phase 2. RUNNING: install and run the tests on the repository
2.1 Follow the readme
2.2 Install the environment and anything needed
2.3 Iterate and figure out how to run the tests
Phase 3. EXPLORATION: find the files that are related to the problem and possible solutions
3.1 Use `grep` to search for relevant methods, classes, keywords and error messages.
3.2 Identify all files related to the problem statement.
3.3 Propose the methods and files to fix the issue and explain why.
3.4 From the possible file locations, select the most likely location to fix the issue.
Phase 4. TEST CREATION: before implementing any fix, create a script to reproduce and verify the issue.
4.1 Look at existing test files in the repository to understand the test format/structure.
4.2 Create a minimal reproduction script that reproduces the located issue.
4.3 Run the reproduction script to confirm you are reproducing the issue.
4.4 Adjust the reproduction script as necessary.
Phase 5. FIX ANALYSIS: state clearly the problem and how to fix it
5.1 State clearly what the problem is.
5.2 State clearly where the problem is located.
5.3 State clearly how the test reproduces the issue.
5.4 State clearly the best practices to take into account in the fix.
5.5 State clearly how to fix the problem.
Phase 6. FIX IMPLEMENTATION: Edit the source code to implement your chosen solution.
6.1 Make minimal, focused changes to fix the issue.
Phase 7. VERIFICATION: Test your implementation thoroughly.
7.1 Run your reproduction script to verify the fix works.
7.2 Add edge cases to your test script to ensure comprehensive coverage.
7.3 Run existing tests related to the modified code to ensure you haven't broken anything.
Phase 8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {{ instance.base_commit }}.
8.1 Ensure you've fully addressed all requirements.
8.2 Run any tests in the repository related to:
8.2.1 The issue you are fixing
8.2.2 The files you modified
8.2.3 The functions you changed
8.3 If any tests fail, revise your implementation until all tests pass
Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.

View File

@@ -1,65 +0,0 @@
<uploaded_files>
/workspace/{{ workspace_dir_name }}
</uploaded_files>
I've uploaded a python code repository in the directory {{ workspace_dir_name }}. Consider the following issue description:
<issue_description>
{{ instance.problem_statement }}
</issue_description>
Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?
I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.
Your task is to make the minimal changes to non-test files in the /workspace/{{ workspace_dir_name }} directory to ensure the <issue_description> is satisfied.
Follow these phases to resolve the issue:
Phase 1. READING: read the problem and reword it in clearer terms
1.1 If there are code or config snippets, express in words any best practices or conventions in them.
1.2 Highlight message errors, method names, variables, file names, stack traces, and technical details.
1.3 Explain the problem in clear terms.
1.4 Enumerate the steps to reproduce the problem.
1.5 Highlight any best practices to take into account when testing and fixing the issue
Phase 2. RUNNING: install and run the tests on the repository
2.1 Follow the readme
2.2 Install the environment and anything needed
2.3 Iterate and figure out how to run the tests
Phase 3. EXPLORATION: find the files that are related to the problem and possible solutions
3.1 Use `grep` to search for relevant methods, classes, keywords and error messages.
3.2 Identify all files related to the problem statement.
3.3 Propose the methods and files to fix the issue and explain why.
3.4 From the possible file locations, select the most likely location to fix the issue.
Phase 4. TEST CREATION: before implementing any fix, create a script to reproduce and verify the issue.
4.1 Look at existing test files in the repository to understand the test format/structure.
4.2 Create a minimal reproduction script that reproduces the located issue.
4.3 Run the reproduction script to confirm you are reproducing the issue.
4.4 Adjust the reproduction script as necessary.
Phase 5. FIX ANALYSIS: state clearly the problem and how to fix it
5.1 State clearly what the problem is.
5.2 State clearly where the problem is located.
5.3 State clearly how the test reproduces the issue.
5.4 State clearly the best practices to take into account in the fix.
5.5 State clearly how to fix the problem.
Phase 6. FIX IMPLEMENTATION: Edit the source code to implement your chosen solution.
6.1 Make minimal, focused changes to fix the issue.
Phase 7. VERIFICATION: Test your implementation thoroughly.
7.1 Run your reproduction script to verify the fix works.
7.2 Add edge cases to your test script to ensure comprehensive coverage.
7.3 Run existing tests related to the modified code to ensure you haven't broken anything.
Phase 8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {{ instance.base_commit }}.
8.1 Ensure you've fully addressed all requirements.
8.2 Run any tests in the repository related to:
8.2.1 The issue you are fixing
8.2.2 The files you modified
8.2.3 The functions you changed
8.3 If any tests fail, revise your implementation until all tests pass
Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.

View File

@@ -1,45 +0,0 @@
# Task: Fix Issue in Python Repository
## Repository Context
You are provided with a Python code repository that contains an issue requiring your attention. The repository is located in a sandboxed environment, and you have access to the codebase to implement the necessary changes.
The code repository is located at: `/workspace/{{ workspace_dir_name }}`
(This path is provided for context; use file system tools to confirm paths before access).
## Goal
Your goal is to fix the issue described in the **Issue Description** section below. Implement the necessary changes to **non-test files only** within the repository, ensuring that **all relevant tests pass** after your changes.
## Key Requirements & Constraints
1. **Understand the problem** very well: it is a bug report, and you know humans don't always write good descriptions. Explore the codebase to understand the related code and the problem in depth. It is possible that the solution needs to be a bit more extensive than just the stated text. Don't exaggerate though: don't do unrelated refactoring, but also don't interpret the description too strictly.
2. **Focus on the issues:** Implement the fix focusing on non-test files related to the issue.
2. **Environment Ready:** The Python environment is pre-configured with all dependencies. Do not install packages.
3. **Mandatory Testing Procedure:**
* **Create Test to Reproduce the Issue:** *Before* implementing any fix, you MUST create a *new test* (separate from existing tests) that specifically reproduces the issue.
* Take existing tests as example to understand the testing format/structure.
* Enhance this test with edge cases.
* Run this test to confirm reproduction.
* **Verify Fix:** After implementing the fix, run your test again to verify the issue is resolved.
* **Identify ALL Relevant Tests:** You MUST perform a **dedicated search and analysis** to identify **all** existing unit tests potentially affected by your changes. This includes:
* Tests in the same module/directory as the changed files (e.g., `tests/` subdirectories).
* Tests explicitly importing or using the modified code/classes/functions.
* Tests mentioned in the issue description or related documentation.
* Tests covering functionalities that *depend on* the modified code (analyze callers/dependencies if necessary).
**If you cannot confidently identify a specific subset, you MUST identify and plan to run the entire test suite for the modified application or module(s). State your identified test scope clearly.**
* **Run Identified Relevant Tests:** You MUST execute the **complete set** of relevant existing unit tests you identified in the previous step. Ensure you are running the *correct and comprehensive set* of tests. You MUST NOT modify these existing tests.
* **Final Check & Verification:** Before finishing, ensure **all** identified relevant existing tests pass. **Explicitly confirm that you have considered potential omissions in your test selection and believe the executed tests comprehensively cover the impact of your changes.** Failing to identify and run the *complete* relevant set constitutes a failure. If any identified tests fail, revise your fix. Passing all relevant tests is the primary measure of success.
4. **Defensive Programming:** Actively practice defensive programming: anticipate and handle potential edge cases, unexpected inputs, and different ways the affected code might be called **to ensure the fix works reliably and allows relevant tests to pass.** Analyze the potential impact on other parts of the codebase.
5. **Final Review:** Compare your solution against the original issue and the base commit ({{ instance.base_commit }}) to ensure completeness and test passage.
## General Workflow Guidance
* Prioritize understanding the problem, exploring the code, planning your fix, implementing it carefully using the required diff format, and **thoroughly testing** according to the **Mandatory Testing Procedure**.
* Consider trade-offs between different solutions. The goal is a **robust change that makes the relevant tests pass.** Quality, correctness, and reliability are key.
* Actively practice defensive programming: anticipate and handle potential edge cases, unexpected inputs, and different ways the affected code might be called **to ensure the fix works reliably and allows relevant tests to pass.** Analyze the potential impact on other parts of the codebase.
* IMPORTANT: Your solution will be tested by additional hidden tests, so do not assume the task is complete just because visible tests pass! Refine the solution until you are confident that it is robust and comprehensive according to the **Defensive Programming** requirement.
## Final Note
Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.
## Issue Description
{{ instance.problem_statement }}

View File

@@ -1,80 +0,0 @@
You will be tasked to fix an issue from an open-source repository.
Your thinking should be thorough and so it's fine if it's very long. You can think step by step before and after each action you decide to take.
You MUST iterate and keep going until the problem is solved.
You already have everything you need to solve this problem in the /workspace/{{ workspace_dir_name }} folder, even without internet connection. I want you to fully solve this autonomously before coming back to me.
Only terminate your turn when you are sure that the problem is solved. Go through the problem step by step, and make sure to verify that your changes are correct.
NEVER end your turn without having solved the problem, and when you say you are going to make a tool call, make sure you ACTUALLY make the tool call, instead of ending your turn.
THE PROBLEM CAN DEFINITELY BE SOLVED WITHOUT THE INTERNET.
Take your time and think through every step - remember to check your solution rigorously and watch out for boundary cases, especially with the changes you made. Your solution must be perfect. If not, continue working on it.
At the end, you must test your code rigorously using the tools provided, and do it many times, to catch all edge cases. If it is not robust, iterate more and make it perfect. Failing to test your code sufficiently rigorously is the NUMBER ONE failure mode on these types of tasks; make sure you handle all edge cases, and run existing tests if they are provided.
You MUST plan extensively before each function call, and reflect extensively on the outcomes of the previous function calls. DO NOT do this entire process by making function calls only, as this can impair your ability to solve the problem and think insightfully.
# Workflow
## High-Level Problem Solving Strategy
1. Understand the problem deeply. Carefully read the issue and think critically about what is required.
2. Investigate the codebase. Explore relevant files, search for key functions, and gather context.
3. Develop a clear, step-by-step plan. Break down the fix into manageable, incremental steps.
4. Implement the fix incrementally. Make small, testable code changes.
5. Debug as needed. Use debugging techniques to isolate and resolve issues.
6. Test frequently. Run tests after each change to verify correctness.
7. Iterate until the root cause is fixed and all tests pass.
8. Reflect and validate comprehensively. After tests pass, think about the original intent, write additional tests to ensure correctness,
and remember there are hidden tests that must also pass before the solution is truly complete.
Refer to the detailed sections below for more information on each step.
## 1. Deeply Understand the Problem
Carefully read the issue and think hard about a plan to solve it before coding.
## 2. Codebase Investigation
- Explore relevant files and directories.
- Search for key functions, classes, or variables related to the issue.
- Read and understand relevant code snippets.
- Identify the root cause of the problem.
- Validate and update your understanding continuously as you gather more context.
## 3. Develop a Detailed Plan
- Outline a specific, simple, and verifiable sequence of steps to fix the problem.
- Break down the fix into small, incremental changes.
## 4. Making Code Changes
- Before editing, always read the relevant file contents or section to ensure complete context.
- If a patch is not applied correctly, attempt to reapply it.
- Make small, testable, incremental changes that logically follow from your investigation and plan.
## 5. Debugging
- Make code changes only if you have high confidence they can solve the problem
- When debugging, try to determine the root cause rather than addressing symptoms
- Debug for as long as needed to identify the root cause and identify a fix
- Use print statements, logs, or temporary code to inspect program state, including descriptive statements or error messages to understand what's happening
- To test hypotheses, you can also add test statements or functions
- Revisit your assumptions if unexpected behavior occurs.
## 6. Testing
- Run tests frequently using `python3 run_tests.py` (or equivalent).
- After each change, verify correctness by running relevant tests.
- If tests fail, analyze failures and revise your patch.
- Write additional tests if needed to capture important behaviors or edge cases.
- Ensure all tests pass before finalizing.
## 7. Final Verification
- Confirm the root cause is fixed.
- Review your solution for logic correctness and robustness.
- Iterate until you are extremely confident the fix is complete and all tests pass.
## 8. Final Reflection and Additional Testing
- Reflect carefully on the original intent of the user and the problem statement.
- Think about potential edge cases or scenarios that may not be covered by existing tests.
- Write additional tests that would need to pass to fully validate the correctness of your solution.
- Run these new tests and ensure they all pass.
- Be aware that there are additional hidden tests that must also pass for the solution to be successful.
- Do not assume the task is complete just because the visible tests pass; continue refining until you are confident the fix is robust and comprehensive.

View File

@@ -1,19 +0,0 @@
<uploaded_files>
/workspace/{{ workspace_dir_name }}
</uploaded_files>
I've uploaded a python code repository in the directory {{ workspace_dir_name }}. Consider the following issue description:
<issue_description>
{{ instance.problem_statement }}
</issue_description>
Can you help me implement the necessary changes to the repository to test whether the issue in <issue_description> was resolved?
I will take care of all changes to any of the non-test files. This means you DON'T have to modify the actual logic and ONLY have to update test logic and tests!
Your task is to make the minimal changes to tests files in the /workspace directory to reproduce the issue in the <issue_description>, i.e., such that the generated tests fail in the current state (where the issue is unresolved) and pass when the issue will be resolved.
Follow these steps to reproduce the issue:
1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.
2. Create a script `reproduction.py` to reproduce the error and execute it with `python reproduction.py` using the BashTool, to confirm the error
3. Edit the sourcecode of the repo to integrate your reproduction script into the test framework
4. Run the test framework and make sure your tests fail! Only submit FAILING tests! Never submit passing tests.
{{ test_instructions }}Your thinking should be thorough and so it's fine if it's very long.

View File

@@ -8,7 +8,6 @@ from typing import Any, Literal
import pandas as pd
import toml
from datasets import load_dataset
from jinja2 import Environment, FileSystemLoader
import openhands.agenthub
from evaluation.benchmarks.swe_bench.binary_patch_utils import (
@@ -63,29 +62,8 @@ from openhands.utils.shutdown_listener import sleep_if_should_continue
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
ENABLE_LLM_EDITOR = os.environ.get('ENABLE_LLM_EDITOR', 'false').lower() == 'true'
BenchMode = Literal['swe', 'swt', 'swt-ci']
# Global variable to track dataset type
DATASET_TYPE = 'SWE-bench'
def set_dataset_type(dataset_name: str) -> str:
"""Set dataset type based on dataset name."""
global DATASET_TYPE
name_lower = dataset_name.lower()
if 'swe-gym' in name_lower:
DATASET_TYPE = 'SWE-Gym'
elif 'swe-bench-live' in name_lower:
DATASET_TYPE = 'SWE-bench-Live'
elif 'multimodal' in name_lower:
DATASET_TYPE = 'Multimodal'
else:
DATASET_TYPE = 'SWE-bench'
logger.info(f'Dataset type set to: {DATASET_TYPE}')
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
@@ -93,59 +71,107 @@ AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
if DATASET_TYPE == 'SWE-bench-Live':
return instance.instance_id
else:
return f'{instance.repo}__{instance.version}'.replace('/', '__')
return f'{instance.repo}__{instance.version}'.replace('/', '__')
def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
mode = metadata.details['mode']
llm_model = metadata.llm_config.model
# Determine the template file based on mode and LLM
if mode.startswith('swt'):
template_name = 'swt.j2'
elif mode == 'swe':
if 'claude' in llm_model:
template_name = 'swe_claude.j2'
elif 'gemini' in llm_model:
template_name = 'swe_gemini.j2'
elif 'gpt-4.1' in llm_model:
template_name = 'swe_gpt4.j2'
else:
template_name = (
'swe_default.j2' # Default for 'swe' mode (regular swe-bench)
)
else:
# Fallback or error handling if mode is unexpected
logger.error(f'Unexpected evaluation mode: {mode}. Falling back to default.')
template_name = 'swe_default.j2'
# Set up Jinja2 environment
# Assuming templates are in 'evaluation/benchmarks/swe_bench/prompts' relative to this script
prompts_dir = os.path.join(os.path.dirname(__file__), 'prompts')
env = Environment(loader=FileSystemLoader(prompts_dir))
template = env.get_template(template_name)
# Prepare context for rendering
context = {
'instance': instance,
'workspace_dir_name': workspace_dir_name,
'metadata': metadata, # Pass metadata if needed in templates
}
# Add specific context for swt-ci mode if needed
if mode == 'swt-ci':
context['test_instructions'] = (
test_instructions = (
f'The following command can be used to run the tests: `{list(MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE[instance.repo].values())[0]}`. Make sure they fail in the expected way.\n'
if mode.endswith('ci')
else ''
)
else:
context['test_instructions'] = '' # Ensure it's defined for other modes
instruction = f"""\
<uploaded_files>
/workspace/{workspace_dir_name}
</uploaded_files>
I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:
# Render the instruction
instruction = template.render(context)
<issue_description>
{instance.problem_statement}
</issue_description>
Can you help me implement the necessary changes to the repository to test whether the issue in <issue_description> was resolved?
I will take care of all changes to any of the non-test files. This means you DON'T have to modify the actual logic and ONLY have to update test logic and tests!
Your task is to make the minimal changes to tests files in the /workspace directory to reproduce the issue in the <issue_description>, i.e., such that the generated tests fail in the current state (where the issue is unresolved) and pass when the issue will be resolved.
Follow these steps to reproduce the issue:
1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.
2. Create a script `reproduction.py` to reproduce the error and execute it with `python reproduction.py` using the BashTool, to confirm the error
3. Edit the sourcecode of the repo to integrate your reproduction script into the test framework
4. Run the test framework and make sure your tests fail! Only submit FAILING tests! Never submit passing tests.
{test_instructions}Your thinking should be thorough and so it's fine if it's very long.
"""
else:
instruction = f"""
<uploaded_files>
/workspace/{workspace_dir_name}
</uploaded_files>
I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:
<issue_description>
{instance.problem_statement}
</issue_description>
Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?
I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.
Your task is to make the minimal changes to non-test files in the /workspace/{workspace_dir_name} directory to ensure the <issue_description> is satisfied.
Follow these phases to resolve the issue:
Phase 1. READING: read the problem and reword it in clearer terms
1.1 If there are code or config snippets. Express in words any best practices or conventions in them.
1.2 Hightlight message errors, method names, variables, file names, stack traces, and technical details.
1.3 Explain the problem in clear terms.
1.4 Enumerate the steps to reproduce the problem.
1.5 Hightlight any best practices to take into account when testing and fixing the issue
Phase 2. RUNNING: install and run the tests on the repository
2.1 Follow the readme
2.2 Install the environment and anything needed
2.2 Iterate and figure out how to run the tests
Phase 3. EXPLORATION: find the files that are related to the problem and possible solutions
3.1 Use `grep` to search for relevant methods, classes, keywords and error messages.
3.2 Identify all files related to the problem statement.
3.3 Propose the methods and files to fix the issue and explain why.
3.4 From the possible file locations, select the most likely location to fix the issue.
Phase 4. TEST CREATION: before implementing any fix, create a script to reproduce and verify the issue.
4.1 Look at existing test files in the repository to understand the test format/structure.
4.2 Create a minimal reproduction script that reproduces the located issue.
4.3 Run the reproduction script to confirm you are reproducing the issue.
4.4 Adjust the reproduction script as necessary.
Phase 5. FIX ANALYSIS: state clearly the problem and how to fix it
5.1 State clearly what the problem is.
5.2 State clearly where the problem is located.
5.3 State clearly how the test reproduces the issue.
5.4 State clearly the best practices to take into account in the fix.
5.5 State clearly how to fix the problem.
Phase 6. FIX IMPLEMENTATION: Edit the source code to implement your chosen solution.
6.1 Make minimal, focused changes to fix the issue.
Phase 7. VERIFICATION: Test your implementation thoroughly.
7.1 Run your reproduction script to verify the fix works.
7.2 Add edge cases to your test script to ensure comprehensive coverage.
7.3 Run existing tests related to the modified code to ensure you haven't broken anything.
8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance['base_commit']}.
8.1 Ensure you've fully addressed all requirements.
8.2 Run any tests in the repository related to:
8.2.1 The issue you are fixing
8.2.2 The files you modified
8.2.3 The functions you changed
8.3 If any tests fail, revise your implementation until all tests pass
Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.
"""
if RUN_WITH_BROWSING:
instruction += (
@@ -176,13 +202,9 @@ def get_instance_docker_image(
if swebench_official_image:
# Official SWE-Bench image
# swebench/sweb.eval.x86_64.django_1776_django-11333:v1
# SWE-bench-Live uses the same naming convention as SWE-Bench
if DATASET_TYPE == 'SWE-bench-Live':
docker_image_prefix = 'docker.io/starryzhang/'
elif DATASET_TYPE == 'SWE-bench':
docker_image_prefix = 'docker.io/swebench/'
docker_image_prefix = 'docker.io/swebench/'
repo, name = instance_id.split('__')
image_name = f'{docker_image_prefix.rstrip("/")}/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
logger.debug(f'Using official SWE-Bench image: {image_name}')
return image_name
else:
@@ -200,8 +222,7 @@ def get_config(
metadata: EvalMetadata,
) -> OpenHandsConfig:
# We use a different instance image for the each instance of swe-bench eval
use_swebench_official_image = DATASET_TYPE != 'SWE-Gym'
use_swebench_official_image = 'swe-gym' not in metadata.dataset.lower()
base_container_image = get_instance_docker_image(
instance['instance_id'],
swebench_official_image=use_swebench_official_image,
@@ -233,19 +254,15 @@ def get_config(
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(
update_llm_config_for_completions_logging(
metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
)
)
# get 'draft_editor' config if exists
config.set_llm_config(get_llm_config_arg('draft_editor'), 'draft_editor')
agent_config = AgentConfig(
enable_jupyter=False,
enable_browsing=RUN_WITH_BROWSING,
enable_llm_editor=ENABLE_LLM_EDITOR,
enable_llm_editor=False,
enable_mcp=False,
condenser=metadata.condenser_config,
enable_prompt_extensions=False,
@@ -318,12 +335,8 @@ def initialize_runtime(
runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
# inject the instance swe entry
if DATASET_TYPE == 'SWE-bench-Live':
entry_script_path = 'instance_swe_entry_live.sh'
else:
entry_script_path = 'instance_swe_entry.sh'
runtime.copy_to(
str(os.path.join(script_dir, f'scripts/setup/{entry_script_path}')),
str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
'/swe_util/',
)
@@ -343,14 +356,14 @@ def initialize_runtime(
logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')
action = CmdRunAction(command=f'source /swe_util/{entry_script_path}')
action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to source /swe_util/{entry_script_path}: {str(obs)}',
f'Failed to source /swe_util/instance_swe_entry.sh: {str(obs)}',
)
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
@@ -403,9 +416,9 @@ def initialize_runtime(
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
if DATASET_TYPE != 'Multimodal' and DATASET_TYPE != 'SWE-bench-Live':
if 'multimodal' not in metadata.dataset.lower():
# Only for non-multimodal datasets, we need to activate the testbed environment for Python
# SWE-Bench multimodal datasets and SWE-bench-Live are not using the testbed environment
# SWE-Bench multimodal datasets are not using the testbed environment
action = CmdRunAction(command='which python')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
@@ -647,13 +660,7 @@ def process_instance(
# ======= THIS IS SWE-Bench specific =======
# Get git patch
if DATASET_TYPE == 'SWE-bench-Live':
from evaluation.benchmarks.swe_bench.live_utils import (
complete_runtime as complete_runtime_fn,
)
else:
complete_runtime_fn = complete_runtime
return_val = complete_runtime_fn(runtime, instance)
return_val = complete_runtime(runtime, instance)
git_patch = return_val['git_patch']
logger.info(
f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
@@ -758,15 +765,11 @@ if __name__ == '__main__':
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenHands's repo
dataset = load_dataset(args.dataset, split=args.split)
# Set the global dataset type based on dataset name
set_dataset_type(args.dataset)
swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
logger.info(
f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
)
if DATASET_TYPE == 'SWE-Gym':
if 'SWE-Gym' in args.dataset:
with open(
os.path.join(
os.path.dirname(os.path.abspath(__file__)),

View File

@@ -192,8 +192,6 @@ def get_config(
dataset_name=metadata.dataset,
instance_id=instance['instance_id'],
)
oh_aci_li_cmd = '/openhands/micromamba/bin/micromamba run -n openhands poetry run pip install openhands-aci[llama]'
sandbox_config.runtime_extra_deps = oh_aci_li_cmd
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
sandbox_config.runtime_startup_env_vars = {
'REPO_PATH': f'/workspace/{workspace_dir_name}/',
@@ -218,7 +216,6 @@ def get_config(
enable_jupyter=False,
enable_browsing=RUN_WITH_BROWSING,
enable_llm_editor=False,
enable_mcp=os.environ.get('ENABLE_MCP', False),
condenser=metadata.condenser_config,
enable_prompt_extensions=False,
)

View File

@@ -1,33 +0,0 @@
import argparse
import json
def main(output_jsonl: str):
    """Convert an evaluation ``outputs.jsonl`` file into prediction records.

    Each input line is parsed as JSON and reduced to a record with the keys
    ``instance_id``, ``model_name_or_path`` and ``model_patch``, which is
    printed to stdout as one JSON object per line.

    Lines that cannot be parsed, or that lack one of the required fields, are
    reported on stderr and skipped, so stdout only ever contains valid
    prediction records.

    Args:
        output_jsonl: Path to the JSON-lines file to convert.
    """
    import sys  # local import: only needed for error reporting

    with open(output_jsonl, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            # Reset per line so a failure never reports the previous record.
            output = None
            try:
                output = json.loads(line)
                pred = {
                    'instance_id': output['instance_id'],
                    'model_name_or_path': output['metadata']['llm_config']['model'],
                    'model_patch': output['test_result']['git_patch'],
                }
            except Exception as e:
                # Identify the record by instance id when it was parsed far
                # enough to have one, otherwise by its line number.
                instance_id = (
                    output.get('instance_id') if isinstance(output, dict) else None
                )
                label = (
                    f'instance {instance_id}'
                    if instance_id is not None
                    else f'line {line_no}'
                )
                print(
                    f'Error while reading output of {label}: {e}',
                    file=sys.stderr,
                )
                # Do not fall through: `pred` is unset or belongs to an
                # earlier line.
                continue

            print(json.dumps(pred))
if __name__ == '__main__':
    # Command-line entry point: take the path of the outputs file from
    # --output_jsonl and hand it to main().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--output_jsonl',
        required=True,
        type=str,
        help='Path to the prediction file (.../outputs.jsonl)',
    )
    main(cli.parse_args().output_jsonl)

View File

@@ -1,41 +0,0 @@
#!/usr/bin/env bash

# Prepares /workspace for one benchmark instance:
#   1. requires SWE_INSTANCE_ID to be set,
#   2. checks that the instance exists in swe-bench-instance.json,
#   3. empties /workspace (or creates it),
#   4. copies the repository from /testbed to /workspace/$SWE_INSTANCE_ID.
# Exits with status 1 when the instance id is missing or unknown.

source ~/.bashrc
SWEUTIL_DIR=/swe_util

# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
# SWE_INSTANCE_ID=django__django-11099
if [ -z "$SWE_INSTANCE_ID" ]; then
    echo "Error: SWE_INSTANCE_ID is not set." >&2
    exit 1
fi

# Read swe-bench-instance.json and extract the item matching the instance id.
# The result is only used to verify that the instance is known.
item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' "$SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json")

if [[ -z "$item" ]]; then
    echo "No item found for the provided instance ID." >&2
    exit 1
fi

echo "WORKSPACE_NAME: $SWE_INSTANCE_ID"

# Clear the workspace (the glob is intentionally left unquoted so it expands)
if [ -d /workspace ]; then
    rm -rf /workspace/*
else
    mkdir /workspace
fi

# Copy repo to workspace; paths are quoted so an id containing whitespace or
# glob characters cannot be split or expanded.
if [ -d "/workspace/$SWE_INSTANCE_ID" ]; then
    rm -rf "/workspace/$SWE_INSTANCE_ID"
fi
mkdir -p /workspace
cp -r /testbed "/workspace/$SWE_INSTANCE_ID"

# SWE-bench-Live does not use conda to manage Python
# if [ -d /opt/miniconda3 ]; then
#     . /opt/miniconda3/etc/profile.d/conda.sh
#     conda activate testbed
# fi

View File

@@ -921,7 +921,7 @@ SPECS_PYDICOM.update(
SPECS_HUMANEVAL = {k: {'python': '3.9', 'test_cmd': 'python'} for k in ['1.0']}
# Constants - Task Instance Installation Environment
# Constants - Task Instance Instllation Environment
MAP_REPO_VERSION_TO_SPECS: dict[str, dict[str, Any]] = {
'astropy/astropy': SPECS_ASTROPY,
'dbt-labs/dbt-core': SPECS_DBT_CORE,

View File

@@ -539,7 +539,7 @@ if __name__ == '__main__':
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
llm_config.log_completions = True
# modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
# modify_params must be False for evaluation purpose, for reproducibility and accurancy of results
llm_config.modify_params = False
if llm_config is None:

View File

@@ -1,102 +0,0 @@
# VersiCode benchmark
This project is used to evaluate the performance of the model on VersiCode. It includes:
- data: the test data needed and the model outputs
- inference_utils: inference scripts for our tasks and models
- metric: scripts for calculating the various metrics
- output_processing: process the model output to facilitate the calculation of model metrics
# Details
1. **Prepare the environment**
```shell
#create conda environment
conda create -n VersiCode python==3.12
#install requirements
pip install -r requirements.txt
```
2. **Experiment Data**
To obtain the experimental data, please visit the Hugging Face link: https://huggingface.co/datasets/AstoneNg/VersiCode.
Locate the files `VersiCode_block_completion.json` and `VersiCode_migration.json` under the `experiment_data` directory, and place them in the `/data/test_data` directory of this project.
3. **Model inference**
```shell
#cd inference_utils directory
cd inference_utils
#The script file starting with 'test' is used to test the local model
#The script files starting with 'api' are used to test models called through an API
#block level code completion
#Modify the 10th and 12th lines of code to specify the base URL and model name
python api_test_block_completion.py
#Modify the 30th line of code to specify the local model path
python test_block.py
# code migration (migration order is 'old_to_new')
#Modify the 10th and 12th lines of code to specify the base URL and model name
python api_code_migration.py
#Modify the 30th line of code to specify the local model path
python test_migration.py
```
4. **Process output**
Process the output content of the model, remove redundant content, extract specified content for easy calculation of indicators.
```shell
#cd output_processing
cd output_processing
#Extract the content between <start> and <end>
#Modify the 8th and 9th lines of code to specify the model and task granularity
python clear_ans.py
#In the block completion task and the migration task, the calculation of the cdc@k metric needs to be targeted at the key lines.
#Modify lines 76 and 79 to specify the data path
python choose_core_line_from_block_versicode.py
python choose_core_line_from_migration_versicode.py
```
5. **Metric**
We have three metrics: pass@k, em@k and cdc@k. Due to our inability to automatically build a dynamic evaluation environment, we have not provided pass@k.
```shell
#cd metric
cd metric
#Modify lines 137-140 in migration task (compute_migration_cdc_score.py) or 143-145 in block and line completion task (compute_versicode_cdc_score.py and compute_versicode_em_score.py) of the code to specify the data path and calculate the k-value of the metric
python compute_migration_cdc_score.py
python compute_versicode_cdc_score.py
python compute_versicode_em_score.py
#Notes
#We found limitations in the ISM@k and PM@k metrics for evaluating code generation, so they are used only as reference in our experiments.
#Modify lines 261-265 in block and line completion task of the code to specify the data path and calculate the k-value of the metric
python compute_ism_pm_score.py
```
# Citation
```
@article{versicode,
author={Tongtong Wu and Weigang Wu and Xingyu Wang and Kang Xu and Suyu Ma and Bo Jiang and Ping Yang and Zhenchang Xing and Yuan-Fang Li and Gholamreza Haffari},
title = {VersiCode: Towards Version-controllable Code Generation},
journal = {CoRR},
volume = {abs/2406.07411},
year = {2024},
url = {https://arxiv.org/abs/2406.07411},
}
```
**Github url**: https://github.com/wutong8023/VersiCode
# Contributor
[Tongtong Wu](https://scholar.google.com/citations?hl=zh-CN&user=u1Qp8lUAAAAJ&view_op=list_works&sortby=pubdate), [Weigang Wu](https://scholar.google.com/citations?hl=zh-CN&user=UneIZo8AAAAJ), [Xingyu Wang](https://scholar.google.com/citations?hl=zh-CN&user=wqPJcxcAAAAJ), [Kang Xu](https://scholar.google.com/citations?hl=zh-CN&user=N1UUDi0AAAAJ), [Suyu Ma](https://scholar.google.com/citations?hl=zh-CN&user=NJHR1ukAAAAJ), [Bo Jiang](https://wutong8023.site/VersiCode/), [Ping Yang](https://scholar.google.com/citations?view_op=list_works&hl=en&hl=en&user=hrogvxoAAAAJ), [Zhenchang Xing](https://scholar.google.com/citations?hl=zh-CN&user=0vCxuH4AAAAJ), [Yuan-Fang Li](https://scholar.google.com/citations?hl=zh-CN&user=wufXO1kAAAAJ), [Gholamreza Haffari](https://scholar.google.com/citations?hl=zh-CN&user=Perjx5EAAAAJ)

View File

@@ -1,134 +0,0 @@
"""
GPT performs line level generation prediction and truncates overly long tokens
"""
import json
import os
import tiktoken
from openai import OpenAI
# Prompt token limit handed to truncate_text before each request.
max_tokens = 127000 # context sizes for reference: gpt-3.5 is 16k tokens, gpt-4o is 128k
# Model identifier sent with each request; also used as the name of the
# results sub-directory. Fill in before running.
model_name = ''
# Placeholder for the API key; fill in before running.
# NOTE(review): assigning '' overwrites any OPENAI_API_KEY already present in the environment.
os.environ['OPENAI_API_KEY'] = ''
# Shared API client used by predict().
client = OpenAI()
def truncate_text(text, max_tokens):
    """Clip ``text`` to at most ``max_tokens`` tokens of the ``cl100k_base`` encoding.

    The token count of the unclipped text is printed as a side effect.

    :param text: prompt text to clip
    :param max_tokens: maximum number of tokens to keep
    :return: the text decoded from the (possibly shortened) token sequence
    """
    codec = tiktoken.get_encoding('cl100k_base')
    # An empty disallowed set lets special-token text be encoded as plain text.
    token_ids = codec.encode(text, disallowed_special=())
    print(len(token_ids))

    kept = token_ids[:max_tokens] if len(token_ids) > max_tokens else token_ids
    return codec.decode(kept)
def predict(content, model_name):
    """Send ``content`` as a single user message and collect the sampled answers.

    :param content: prompt text
    :param model_name: model identifier passed to the chat completions API
    :return: ``str()`` of the list holding the message content of every
        returned choice (six choices are requested)
    """
    sampling_options = dict(
        frequency_penalty=0.1,
        max_tokens=128,
        logit_bias=None,
        logprobs=None,
        n=6,
        presence_penalty=0.0,
        seed=None,
        stop=None,
        stream=False,
        temperature=0.8,
        top_p=0.95,
    )
    response = client.chat.completions.create(
        model=model_name,
        messages=[{'role': 'user', 'content': content}],
        **sampling_options,
    )
    answers = [choice.message.content for choice in response.choices]
    return str(answers)
def bulid_prompt(description, old_version, old_code, new_version) -> str:
    """
    Build the code-migration prompt.
    :param description: functionality description of the code
    :param old_version: dependency together with its old version
    :param old_code: code written against the old version
    :param new_version: dependency together with the target new version
    :return: the prompt text with the four values filled in
    """
    return f"""
You are now a professional Python programming engineer. I will provide you with a code snippet and a description of its functionality,
including the dependencies and versions used in the code. Then, I will provide the same dependencies but with a specified new version.
Your task is to refactor the code using the methods provided by the specified new version and return the refactored code.
Please note that you only need to return the refactored code and enclose it with <start> and <end>:
###Functionality description of the code
{description}
###Dependency and old version
{old_version}
###Old version code
{old_code}
###Dependency and new version
{new_version}
###Refactored new code
"""
# Driver: run the migration prompt for every record that has no prediction
# yet, then write the (possibly partially) annotated data set to the results
# directory of the current model.
json_path = '../data/test_data/VersiCode_migration.json'

with open(json_path, 'r', encoding='utf-8') as fr:
    lodict = json.load(fr)
data_dict = lodict
data_list = data_dict

# enumerate() gives each record's 1-based position directly; looking it up
# with list.index() costs O(n) per record and reports the first match when
# two records are equal.
for position, data in enumerate(data_list, start=1):
    if 'model_output' in data:
        print(f'the {position} has already been predicted, skipping this data!')
        continue
    try:
        print(f'Predicting {position} ')
        old_version = data['dependency'] + data['old_version']  # package == x.x.x
        new_version = data['dependency'] + data['new_version']  # package == x.x.x
        description = data['description']  # functionality description
        old_code = data['old_code']  # code written for the old version
        instruction = bulid_prompt(description, old_version, old_code, new_version)
        truncated_text = truncate_text(instruction, max_tokens)
        prediction = predict(truncated_text, model_name)
        data['model_output'] = prediction
    except Exception as e:
        # Stop at the first failure; the save below keeps everything
        # predicted so far.
        print(f'error{e}')
        print('save current data')
        break

# Single save point, reached both after a full pass and after an early stop.
save_folder_path = os.path.join('../data/result_data/code_migration', model_name)
os.makedirs(save_folder_path, exist_ok=True)
save_json_path = os.path.join(save_folder_path, json_path.split('/')[-1])
with open(save_json_path, 'w', encoding='utf-8') as fw:
    json.dump(data_dict, fw, indent=4, ensure_ascii=False)

View File

@@ -1,141 +0,0 @@
"""
GPT performs line level generation prediction and truncates overly long tokens
"""
import json
import os
import tiktoken
from openai import OpenAI
# Prompt token limit handed to truncate_text before each request.
max_tokens = 127000 # context sizes for reference: gpt-3.5 is 16k tokens, gpt-4o is 128k
# Model identifier sent with each request; also used as the name of the
# results sub-directory. Fill in before running.
model_name = ''
# Placeholder for the API key; fill in before running.
# NOTE(review): assigning '' overwrites any OPENAI_API_KEY already present in the environment.
os.environ['OPENAI_API_KEY'] = ''
# Shared API client used by predict().
client = OpenAI()
def truncate_text(text, max_tokens):
    """Clip ``text`` to at most ``max_tokens`` tokens of the ``cl100k_base`` encoding.

    The token count of the unclipped text is printed as a side effect.

    :param text: prompt text to clip
    :param max_tokens: maximum number of tokens to keep
    :return: the text decoded from the (possibly shortened) token sequence
    """
    codec = tiktoken.get_encoding('cl100k_base')
    # An empty disallowed set lets special-token text be encoded as plain text.
    token_ids = codec.encode(text, disallowed_special=())
    print(len(token_ids))

    kept = token_ids[:max_tokens] if len(token_ids) > max_tokens else token_ids
    return codec.decode(kept)
def predict(content, model_name):
    """Send ``content`` as a single user message and collect the sampled answers.

    :param content: prompt text
    :param model_name: model identifier passed to the chat completions API
    :return: ``str()`` of the list holding the message content of every
        returned choice (six choices are requested)
    """
    sampling_options = dict(
        frequency_penalty=0.1,
        max_tokens=128,
        logit_bias=None,
        logprobs=None,
        n=6,
        presence_penalty=0.0,
        seed=None,
        stop=None,
        stream=False,
        temperature=0.8,
        top_p=0.95,
    )
    response = client.chat.completions.create(
        model=model_name,
        messages=[{'role': 'user', 'content': content}],
        **sampling_options,
    )
    answers = [choice.message.content for choice in response.choices]
    return str(answers)
def bulid_prompt(version, description) -> str:
    """
    Build the block-completion prompt: fixed instructions and one worked
    example, followed by the target description and dependency version.

    :param version: dependency-and-version text placed under the final
        "###dependeny and version" heading
    :param description: functional description placed under the final
        "###Function Description" heading
    :return: the complete prompt string
    """
    prompt = f"""
You are a professional Python engineer, and I will provide functional descriptions and versions of specified dependency packages.
You need to write code in Python to implement this feature based on the functional description and using the dependency package and version I specified.
Please note that you only need to return the code that implements the function, and do not return any other content.
Please use <start> and <end> to enclose the generated code. Here is an example:
###Function Description
The function of this code is to print the results predicted by calling the model using vllm.
###dependeny and version
vllm==0.3.3
###response:
<start>
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print("Prompt,Generated text")
<end>
###Function Description
{description}
###dependeny and version
{version}
###response:
"""
    return prompt
# Load the benchmark samples (a JSON list of dicts).
json_path = '../data/test_data/VersiCode_block_completion.json'
with open(json_path, 'r', encoding='utf-8') as fr:
    lodict = json.load(fr)
data_dict = lodict
data_list = data_dict

# Results go under the model's folder, reusing the input file name.
save_folder_path = os.path.join('../data/result_data/block_completion', model_name)
save_json_path = os.path.join(save_folder_path, os.path.basename(json_path))

# enumerate() yields each sample's true position; list.index() would rescan the
# list for every sample and report the first *equal* dict instead.
for position, data in enumerate(data_list, start=1):
    if 'model_output' in data:
        print(f'the {position} has already been predicted, skipping this data!')
        continue
    try:
        print(f'Predicting {position} ')
        version = data['dependency'] + data['version']  # package == x.x.x
        description = data['description']  # func description
        instruction = bulid_prompt(version, description)
        truncated_text = truncate_text(instruction, max_tokens)
        prediction = predict(truncated_text, model_name)
        data['model_output'] = prediction
    except Exception as e:
        # Stop at the first failure; everything predicted so far is saved below.
        print(f'error{e}')
        print('save current data')
        break

# Single save point, reached both after a full run and after an early stop.
os.makedirs(save_folder_path, exist_ok=True)
with open(save_json_path, 'w', encoding='utf-8') as fw:
    json.dump(data_dict, fw, indent=4, ensure_ascii=False)

View File

@@ -1,129 +0,0 @@
"""
block completion
"""
import copy
import gc
import json
import os
import time
from multiprocessing import Process
import tiktoken
import torch
from vllm import LLM, SamplingParams
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
def truncate_text(text, max_tokens):
    """Clip ``text`` to at most ``max_tokens`` tokens of the ``cl100k_base`` encoding.

    The token count of the full text is printed before clipping.

    :param text: text to clip
    :param max_tokens: maximum number of tokens to keep
    :return: the decoded (possibly shortened) token sequence
    """
    codec = tiktoken.get_encoding('cl100k_base')
    token_ids = codec.encode(text, disallowed_special=())
    print(len(token_ids))
    kept_ids = token_ids[:max_tokens] if len(token_ids) > max_tokens else token_ids
    return codec.decode(kept_ids)
# Model paths to evaluate; each one is passed to run_inference in turn.
model_list = ['/data2/base models/starcoder2-15b', '/data2/base models/CodeGemma-7B']
def run_inference(model_name, origin_data_list):
    """Generate block completions for every sample with one vLLM model and save them.

    Works on a deep copy of ``origin_data_list``. The sampled texts of each
    sample are stored, as ``str()`` of a list, under ``'model_output'`` and the
    whole list is written to
    ``../data/result_data/block_completion/<last component of model_name>/<input file name>``.

    :param model_name: model path handed to ``LLM``
    :param origin_data_list: list of sample dicts with ``dependency``,
        ``version`` and ``description`` keys
    """
    samples = copy.deepcopy(origin_data_list)
    prompts = [
        bulid_prompt(item['dependency'] + item['version'], item['description'])
        for item in samples
    ]

    sampling_params = SamplingParams(n=6, temperature=0.8, top_p=0.95, max_tokens=64)
    engine = LLM(
        model=model_name,
        tensor_parallel_size=4,
        gpu_memory_utilization=0.9,
        swap_space=20,
    )

    for result in engine.generate(prompts, sampling_params):
        # The request id is used as the index of the originating sample
        # (assumes ids are 0-based prompt positions - TODO confirm).
        sample_index = int(result.request_id)
        samples[sample_index]['model_output'] = str(
            [candidate.text for candidate in result.outputs]
        )

    out_dir = os.path.join(
        '../data/result_data/block_completion', model_name.split('/')[-1]
    )
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    out_path = os.path.join(out_dir, json_path.split('/')[-1])
    with open(out_path, 'w', encoding='utf-8') as fw:
        json.dump(samples, fw, indent=4, ensure_ascii=False)

    gc.collect()
    torch.cuda.empty_cache()
def bulid_prompt(version, description) -> str:
    """
    Build the block-completion prompt: fixed instructions and one worked
    example, followed by the target description and dependency version.

    :param version: dependency-and-version text placed under the final
        "###dependeny and version" heading
    :param description: functional description placed under the final
        "###Function Description" heading
    :return: the complete prompt string
    """
    prompt = f"""
You are a professional Python engineer, and I will provide functional descriptions and versions of specified dependency packages.
You need to write code in Python to implement this feature based on the functional description and using the dependency package and version I specified.
Please note that you only need to return the code that implements the function, and do not return any other content.
Please use <start> and <end> to enclose the generated code. Here is an example:
###Function Description
The function of this code is to print the results predicted by calling the model using vllm.
###dependeny and version
vllm==0.3.3
###response:
<start>
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print("Prompt,Generated text")
<end>
###Function Description
{description}
###dependeny and version
{version}
###response:
"""
    return prompt
# Kept at module level: run_inference reads json_path, and a child process that
# re-imports this module must see it too.
json_path = '../data/test_data/VersiCode_block_completion.json'
with open(json_path, 'r', encoding='utf-8') as fr:
    lodict = json.load(fr)
origin_data_list = lodict

# Guard the launcher: with the spawn/forkserver start methods each child
# re-imports this module, and an unguarded loop would start processes again.
if __name__ == '__main__':
    for model_name in model_list:
        # One process per model, run strictly one after another.
        process = Process(target=run_inference, args=(model_name, origin_data_list))
        process.start()
        process.join()
        # Pause before the next model (presumably to let GPU memory settle - confirm).
        time.sleep(120)

View File

@@ -1,122 +0,0 @@
"""
code migration
"""
import copy
import gc
import json
import os
import time
from multiprocessing import Process
import tiktoken
import torch
from vllm import LLM, SamplingParams
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
def truncate_text(text, max_tokens):
    """Clip ``text`` to at most ``max_tokens`` tokens of the ``cl100k_base`` encoding.

    The token count of the full text is printed before clipping.

    :param text: text to clip
    :param max_tokens: maximum number of tokens to keep
    :return: the decoded (possibly shortened) token sequence
    """
    codec = tiktoken.get_encoding('cl100k_base')
    token_ids = codec.encode(text, disallowed_special=())
    print(len(token_ids))
    kept_ids = token_ids[:max_tokens] if len(token_ids) > max_tokens else token_ids
    return codec.decode(kept_ids)
# Model paths to evaluate; each one is passed to run_inference in turn.
model_list = ['/data2/base models/starcoder2-15b', '/data2/base models/CodeGemma-7B']
def run_inference(model_name, origin_data_list):
    """Generate migrated code for every sample with one vLLM model and save it.

    Works on a deep copy of ``origin_data_list``. The sampled texts of each
    sample are stored, as ``str()`` of a list, under ``'model_output'`` and the
    whole list is written to
    ``../data/result_data/code_migration/<last component of model_name>/<input file name>``.

    :param model_name: model path handed to ``LLM``
    :param origin_data_list: list of sample dicts with ``dependency``,
        ``old_version``, ``new_version``, ``description`` and ``old_code`` keys
    """

    def _prompt_for(item):
        # Build the migration prompt of one sample.
        old_version = item['dependency'] + item['old_version']  # package == x.x.x
        new_version = item['dependency'] + item['new_version']  # package == x.x.x
        description = item['description']  # functional description
        old_code = item['old_code']  # old-version code
        return bulid_prompt(description, old_version, old_code, new_version)

    samples = copy.deepcopy(origin_data_list)
    prompts = [_prompt_for(item) for item in samples]

    sampling_params = SamplingParams(n=6, temperature=0.8, top_p=0.95, max_tokens=512)
    engine = LLM(
        model=model_name,
        tensor_parallel_size=4,
        gpu_memory_utilization=0.6,
        swap_space=40,
    )

    for result in engine.generate(prompts, sampling_params):
        # The request id is used as the index of the originating sample
        # (assumes ids are 0-based prompt positions - TODO confirm).
        sample_index = int(result.request_id)
        samples[sample_index]['model_output'] = str(
            [candidate.text for candidate in result.outputs]
        )

    out_dir = os.path.join(
        '../data/result_data/code_migration', model_name.split('/')[-1]
    )
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    out_path = os.path.join(out_dir, json_path.split('/')[-1])
    with open(out_path, 'w', encoding='utf-8') as fw:
        json.dump(samples, fw, indent=4, ensure_ascii=False)

    gc.collect()
    torch.cuda.empty_cache()
def bulid_prompt(description, old_version, old_code, new_version) -> str:
    """
    Build the code-migration prompt: fixed instructions followed by the four
    sample-specific sections.

    :param description: functionality description of the code
    :param old_version: dependency and old version text
    :param old_code: code written against the old version
    :param new_version: dependency and new version text
    :return: the complete prompt string
    """
    prompt = f"""
You are now a professional Python programming engineer. I will provide you with a code snippet and a description of its functionality,
including the dependencies and versions used in the code. Then, I will provide the same dependencies but with a specified new version.
Your task is to refactor the code using the methods provided by the specified new version and return the refactored code.
Please note that you only need to return the refactored code and enclose it with <start> and <end>:
###Functionality description of the code
{description}
###Dependency and old version
{old_version}
###Old version code
{old_code}
###Dependency and new version
{new_version}
###Refactored new code
"""
    return prompt
# Kept at module level: run_inference reads json_path, and a child process that
# re-imports this module must see it too.
json_path = '../data/test_data/VersiCode_migration.json'
with open(json_path, 'r', encoding='utf-8') as fr:
    lodict = json.load(fr)
origin_data_list = lodict

# Guard the launcher: with the spawn/forkserver start methods each child
# re-imports this module, and an unguarded loop would start processes again.
if __name__ == '__main__':
    for model_name in model_list:
        # One process per model, run strictly one after another.
        process = Process(target=run_inference, args=(model_name, origin_data_list))
        process.start()
        process.join()
        # Pause before the next model (presumably to let GPU memory settle - confirm).
        time.sleep(120)

View File

@@ -1,356 +0,0 @@
"""
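Evaluate block-level prediction quality:
1. check whether the output contains the correct function name
2. check whether the output is valid code
3. compute ISM and PM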
"""
import io
import json
import math
import os
import re
import tokenize
def is_code_valid(code):
    """Return True when ``code`` compiles as a Python module, False on any exception."""
    try:
        compile(code, '<string>', 'exec')
    except Exception:
        return False
    else:
        return True
def longest_common_prefix_between_lists_with_elements(list1, list2):
    """
    Find the pair of strings (one from each list) sharing the longest common prefix.

    :param list1: first list of strings
    :param list2: second list of strings
    :return: ``(length, (str1, str2))`` for the first pair that reaches the
        maximum prefix length, or ``(0, ())`` when no pair shares a prefix
    """
    best_len = 0
    best_pair = ()
    for left in list1:
        for right in list2:
            shared = 0
            # Walk both strings in lockstep until the first mismatch.
            for ch_left, ch_right in zip(left, right):
                if ch_left != ch_right:
                    break
                shared += 1
            if shared > best_len:
                best_len, best_pair = shared, (left, right)
    return best_len, best_pair
def _identifiers_or_lines(code: str) -> list:
    """Return the NAME tokens of ``code``, or its lines when it cannot be tokenized."""
    try:
        reader = io.BytesIO(code.encode('utf-8')).readline
        # tokenize.tokenize is lazy: errors surface only while iterating, so the
        # whole token stream is consumed inside the try block.
        return [
            tok.string
            for tok in tokenize.tokenize(reader)
            if tok.type == tokenize.NAME
        ]
    except Exception:
        return code.splitlines()


def get_token(ans_code: str, output_code: str):
    """
    Split both code strings into identifiers by lexical analysis.

    Each string is handled independently. When a string cannot be tokenized,
    its lines are returned in place of its identifiers, so the result is
    always a pair of lists.

    :param ans_code: reference (answer) code
    :param output_code: model-generated code
    :return: ``(identifiers_ans, identifiers_output)``
    """
    return _identifiers_or_lines(ans_code), _identifiers_or_lines(output_code)
def get_token_per_line(code: str):
    """
    Tokenize every line on its own and collect the identifiers of each line.

    :param code: source code string
    :return: one list per line holding that line's NAME tokens; a line that
        cannot be tokenized contributes its space-separated pieces instead
    """
    per_line = []
    for text in code.split('\n'):
        stream = tokenize.tokenize(io.BytesIO(text.encode('utf-8')).readline)
        try:
            names = [tok.string for tok in stream if tok.type == tokenize.NAME]
        except Exception:
            # The line alone is not tokenizable: fall back to a plain split.
            names = text.split(' ')
        per_line.append(names)
    return per_line
def get_ISM(answer_code: str, model_output_list: list, answer_name: str) -> list:
    """
    Compute the ISM score of every model output.

    An output scores 0 unless it contains ``answer_name`` as a whole word and
    compiles. Otherwise the score is the longest common prefix between any
    answer identifier and any output identifier, divided by the length of the
    longer string of that pair.

    :return: the scores sorted from highest to lowest
    """
    scores = []
    for candidate in model_output_list:
        if '```python' in candidate:
            # Drop markdown code fences.
            candidate = candidate.replace('```python', '').replace('```', '')
        has_name = re.search(rf'\b{re.escape(answer_name)}\b', candidate)
        if has_name and is_code_valid(candidate):
            ans_ids, out_ids = get_token(answer_code, candidate)
            prefix_len, pair = longest_common_prefix_between_lists_with_elements(
                ans_ids, out_ids
            )
            if prefix_len != 0:
                scores.append(prefix_len / max(len(pair[0]), len(pair[1])))
            else:
                scores.append(0)
        else:
            scores.append(0)
    scores.sort(reverse=True)
    return scores
def get_ISM_without_verification(
    answer_code: str, model_output_list: list, answer_name: str
) -> list:
    """
    Compute ISM scores without the whole-word and compile checks.

    An output scores 0 when ``answer_name`` does not occur in it as a
    substring. Otherwise the score is the longest common prefix between any
    answer identifier and any output identifier, divided by the length of the
    longer string of that pair.

    :return: the scores sorted from highest to lowest
    """
    scores = []
    for candidate in model_output_list:
        if answer_name in candidate:
            ans_ids, out_ids = get_token(answer_code, candidate)
            prefix_len, pair = longest_common_prefix_between_lists_with_elements(
                ans_ids, out_ids
            )
            if prefix_len != 0:
                scores.append(prefix_len / max(len(pair[0]), len(pair[1])))
            else:
                scores.append(0)
        else:
            scores.append(0)
    scores.sort(reverse=True)
    return scores
def longest_common_prefix_with_lengths(list1, list2):
    """
    Find the pair of sub-lists (one from each 2-D list) with the longest common prefix.

    :param list1: first list of lists
    :param list2: second list of lists
    :return: ``(prefix_length, len_a, len_b)`` where ``len_a`` / ``len_b`` are
        the lengths of the first pair of sub-lists reaching that prefix length;
        ``(0, 0, 0)`` when no pair shares a prefix
    """
    best = (0, 0, 0)
    for row_a in list1:
        for row_b in list2:
            shared = 0
            # Count equal leading elements.
            for item_a, item_b in zip(row_a, row_b):
                if item_a != item_b:
                    break
                shared += 1
            if shared > best[0]:
                best = (shared, len(row_a), len(row_b))
    return best
def get_PM(answer_code: str, model_output_list: list, answer_name: str) -> list:
    """
    Compute the PM score of every model output.

    An output scores 0 unless it contains ``answer_name`` as a whole word and
    compiles. Otherwise the score is the longest common prefix between the
    identifier list of any answer line and of any output line, divided by the
    length of the longer of those two identifier lists.

    :return: the scores sorted from highest to lowest
    """
    scores = []
    for candidate in model_output_list:
        if '```python' in candidate:
            # Drop markdown code fences.
            candidate = candidate.replace('```python', '').replace('```', '')
        has_name = re.search(rf'\b{re.escape(answer_name)}\b', candidate)
        if has_name and is_code_valid(candidate):
            answer_lines = get_token_per_line(answer_code)
            output_lines = get_token_per_line(candidate)
            prefix_len, len_a, len_b = longest_common_prefix_with_lengths(
                answer_lines, output_lines
            )
            longer = max(len_a, len_b)
            scores.append(prefix_len / longer if longer != 0 else 0)
        else:
            scores.append(0)
    scores.sort(reverse=True)
    return scores
def get_score(score_list: list, k):
    """
    Compute score@n,k: a weighted average of ``score_list`` in which the score
    at 1-based position ``i`` has weight ``C(n - i, k - 1)`` and the total is
    divided by ``C(n, k)``.

    :param score_list: per-sample scores; the callers in this file pass them
        sorted from highest to lowest
    :param k: subset size
    :return: the weighted score, or ``0.0`` when the list holds fewer than
        ``k`` scores (including an empty list), where no k-subset exists
    """
    n = len(score_list)
    if n < k:
        # C(n, k) is 0 here; return 0.0 instead of dividing by zero.
        return 0.0
    total = 0
    for i in range(1, n - k + 2):
        total += math.comb(n - i, k - 1) * score_list[i - 1]
    return total / math.comb(n, k)
# k of the reported ISM@k / PM@k scores.
k = 1
task = 'block' # block or line
# NOTE(review): other scripts of this benchmark spell the prefix 'VersiCode_' -
# confirm the casing matches the result files on disk.
json_name = f'Versicode_{task}_completion.json'
folder_path = f'../data/result_data/{task}_completion'
# Each entry of the results folder is treated as one model's directory.
model_list = os.listdir(folder_path)
for model in model_list:
    model_json_path = os.path.join(folder_path, model, json_name)
    with open(model_json_path, 'r', encoding='utf-8') as fr:
        lodict = json.load(fr)
    data_dict = lodict
    data_list = data_dict
    data_len = len(data_list)
    sum_ISM = 0
    sum_PM = 0
    for data in data_list:
        # model_output_list = eval(data['model_output'])
        # NOTE(review): eval() executes whatever expression the result file
        # holds; ast.literal_eval would parse a plain list literal safely.
        # Only the first cleaned output is kept for scoring.
        model_output_list = eval(data['model_output_clear'])[:1]
        temp_list = []
        # Strip markdown code fences from every output.
        for o in model_output_list:
            temp_out = o.replace('```python', '')
            temp_out = temp_out.replace('```', '')
            temp_list.append(temp_out)
        model_output_list = temp_list
        answer_code = data['code']
        answer_name = data['core_token']
        #
        # answer_code = data['new_code'] #code editing
        # answer_name = data['new_name'] #code editing
        # answer_code = data['old_code'] # code editing new to old
        # answer_name = data['old_name'] # code editing new to old
        #
        ISM_score_list = get_ISM(answer_code, model_output_list, answer_name)
        # ISM_score_without_verification_list = get_ISM_without_verification(answer_code, model_output_list, answer_name) # added
        PM_score_list = get_PM(answer_code, model_output_list, answer_name)
        # if not ISM_score_without_verification_list == ISM_score_list: # added
        #     for s in ISM_score_list: # added
        #         if s != ISM_score_without_verification_list[ISM_score_list.index(s)]: # added
        #             print('metadata:') # added
        #             print(data) # added
        #             print('answer:') # added
        #             print(model_output_list[ISM_score_list.index(s)]) # added
        #             flag = int(input('enter 1 to continue, 0 to quit')) # added
        #             if flag == 1:
        #                 continue
        ISM_score = get_score(ISM_score_list, k)
        PM_score = get_score(PM_score_list, k)
        sum_ISM += ISM_score
        sum_PM += PM_score
        # print(f"ISM score {ISM_score}")
        # print(f"PM score {PM_score}")
    # Report the mean over all samples of this model.
    print(f'{model}, {task} completion task, ISM@{k} score: {sum_ISM / data_len}')
    print(f'{model}, {task} completion task, PM@{k} score: {sum_PM / data_len}')
# def get_token(ans_code:str, output_code:str):
# """
# Tokenize both code strings into identifiers and return the two identifier lists
# :param ans_code:
# :param output_code:
# :return:
# """
# tokens_ans = tokenize.tokenize(io.BytesIO(ans_code.encode('utf-8')).readline)
# tokens_output = tokenize.tokenize(io.BytesIO(output_code.encode('utf-8')).readline)
# identifiers_ans = []
# identifiers_output = []
# for token in tokens_ans:
# if token.type == tokenize.NAME:
# identifiers_ans.append(token.string)
#
# for to in tokens_output:
# if to.type == tokenize.NAME:
# identifiers_output.append(to.string)
#
# return identifiers_ans, identifiers_output

View File

@@ -1,198 +0,0 @@
"""
Calculate the cdc score for migration
"""
import json
import math
import os
import re
# warnings.filterwarnings("ignore", category=SyntaxWarning)
def is_correct_parameter_count(function_name, correct_code, test_code):
    """
    Check that ``function_name`` is called with the same number of arguments
    in ``test_code`` as in ``correct_code``.

    Arguments are read from the first ``function_name(...)`` occurrence up to
    the first closing parenthesis and split on commas, so nested calls and
    commas inside literals are not handled.

    :param function_name: name to look for; matched literally
    :param correct_code: reference code
    :param test_code: code to check
    :return: True when the counts agree; when ``test_code`` has no such call,
        True only if zero arguments are expected and the name occurs in it
    """
    # Escape the name so characters such as "." match literally instead of
    # being read as regex syntax.
    pattern = rf'{re.escape(function_name)}\((.*?)\)'

    def _count_args(match):
        # Number of non-empty comma-separated pieces inside the parentheses.
        return len([p for p in match.group(1).strip().split(',') if p.strip()])

    correct_match = re.search(pattern, correct_code)
    # No call in the reference means zero arguments are expected.
    expected_count = _count_args(correct_match) if correct_match else 0

    test_match = re.search(pattern, test_code)
    if test_match:
        return _count_args(test_match) == expected_count
    # No parentheses in the tested code: accept a bare mention of the name.
    return expected_count == 0 and function_name in test_code
def check_keyword_parameters(function_name, correct_code, test_code):
    """
    Check that every keyword argument used in the reference call is also
    passed by keyword in the tested call.

    Only the first ``function_name(...)`` occurrence of each code string is
    inspected, up to the first closing parenthesis.

    :param function_name: name to look for; matched literally
    :param correct_code: reference code
    :param test_code: code to check
    :return: False when either code lacks the call or a reference keyword has
        no keyword-style counterpart in the tested call; True otherwise
    """
    # Escape the name so characters such as "." match literally instead of
    # being read as regex syntax.
    pattern = rf'{re.escape(function_name)}\((.*?)\)'

    def _split_args(match):
        # Non-empty, stripped comma-separated pieces inside the parentheses.
        return [p.strip() for p in match.group(1).strip().split(',') if p.strip()]

    correct_match = re.search(pattern, correct_code)
    if not correct_match:
        return False
    test_match = re.search(pattern, test_code)
    if not test_match:
        return False

    given_args = _split_args(test_match)
    for expected in _split_args(correct_match):
        if '=' not in expected:
            continue  # positional in the reference: nothing to enforce
        keyword = expected.split('=')[0].strip()
        if not any(keyword in given and '=' in given for given in given_args):
            return False
    return True
def with_correct(answer_code: str, model_output: str) -> bool:
    """
    Check that the answer and the model output agree on starting with ``with``.

    :param answer_code: reference code
    :param model_output: model-generated code
    :return: True when both strings start with ``'with'`` or neither does
    """
    return answer_code.startswith('with') == model_output.startswith('with')
def compute_block_score_k(
    answer: str,
    model_output: list,
    k: int,
    model_filled_code,
    core_line_in_core_block,
    core_line_in_output_clear,
):
    """
    Compute cdc@k for one sample.

    An output passes when all five conditions hold: it contains ``answer`` as
    a whole word, its filled code compiles, and its core line matches the
    reference core line in argument count, ``with`` usage and keyword
    arguments. (EM only needs the first condition.)

    :return: ``1 - C(n - c, k) / C(n, k)`` for ``n`` outputs of which ``c``
        pass; ``0.0`` when none passes
    """
    c = 0
    n = len(model_output)
    for index, code in enumerate(model_output):
        core_line = None
        if (
            re.search(rf'\b{re.escape(answer)}\b', code)
            and is_code_valid(model_filled_code[index])
            and is_correct_parameter_count(
                answer, core_line_in_core_block, core_line_in_output_clear[index]
            )
            and with_correct(core_line_in_core_block, core_line_in_output_clear[index])
            and check_keyword_parameters(
                answer, core_line_in_core_block, core_line_in_output_clear[index]
            )
        ):  # block
            c += 1
    # Nothing passed: the score is 0 even with fewer than k outputs, where the
    # shortcut below would otherwise report a perfect score.
    if c == 0:
        return 0.0
    if n - c < k:
        return 1.0
    return 1 - (math.comb(n - c, k)) / (math.comb(n, k))
def is_code_valid(code):
    """Return True when ``code`` compiles as a Python module, False on any exception."""
    try:
        compile(code, '<string>', 'exec')
    except Exception:
        return False
    else:
        return True
def compute_score_k(answer: str, model_output: list, k: int):
    """
    Compute a pass@k-style score: an output passes when, after removing
    markdown code fences, it contains ``answer`` as a whole word and compiles.

    :return: ``1 - C(n - c, k) / C(n, k)`` for ``n`` outputs of which ``c``
        pass; ``0.0`` when none passes
    """
    c = 0
    n = len(model_output)
    for output in model_output:
        if '```python' in output:
            output = output.replace('```python', '')
            output = output.replace('```', '')
        if re.search(rf'\b{re.escape(answer)}\b', output) and is_code_valid(output):
            c += 1
    # Nothing passed: the score is 0 even with fewer than k outputs, where the
    # shortcut below would otherwise report a perfect score.
    if c == 0:
        return 0.0
    if n - c < k:
        return 1.0
    return 1 - (math.comb(n - c, k)) / (math.comb(n, k))
k = 1 # cdc@k
json_name = 'VersiCode_migration.json'
task = 'migration'
folder_path = '../data/result_data/code_migration'
# Each entry of the results folder is treated as one model's directory.
model_list = os.listdir(folder_path)
for model in model_list:
    # if model != 'gpt-4o':
    #     continue
    model_json_path = os.path.join(folder_path, model, json_name)
    with open(model_json_path, 'r', encoding='utf-8') as fr:
        lodict = json.load(fr)
    data_list = lodict
    score_list = []
    for data in data_list:
        answer = data['new_name'] # old -> new
        # Used as stored (no eval) - presumably already a list of strings; confirm.
        model_output = data['model_output_clear'] # old -> new
        # The outputs themselves are the code that must compile.
        model_filled_code = model_output
        # core_line_in_core_block = data['core_line_in_new_core_block']# old -> new
        core_line_in_core_block = data['core_line_in_code'] # old -> new
        core_line_in_output_clear = data['core_line_in_output_clear'] # old -> new
        score_list.append(
            compute_block_score_k(
                answer,
                model_output,
                k,
                model_filled_code,
                core_line_in_core_block,
                core_line_in_output_clear,
            )
        )
    # Mean cdc@k over all samples of this model (raises ZeroDivisionError on
    # an empty result file).
    final_score = sum(score_list) / len(score_list)
    print(f'{model}, {task} task, cdc@{k} score: {final_score}')

View File

@@ -1,225 +0,0 @@
"""
Calculate the cdc score for line and block
"""
import json
import math
import os
import re
# warnings.filterwarnings("ignore", category=SyntaxWarning)
def is_code_valid(code):
    """Return True when ``code`` compiles as a Python module, False on any exception."""
    try:
        compile(code, '<string>', 'exec')
    except Exception:
        return False
    else:
        return True
def is_correct_parameter_count(function_name, correct_code, test_code):
    """
    Check that ``function_name`` is called with the same number of arguments
    in ``test_code`` as in ``correct_code``.

    Arguments are read from the first ``function_name(...)`` occurrence up to
    the first closing parenthesis and split on commas, so nested calls and
    commas inside literals are not handled.

    :param function_name: name to look for; matched literally
    :param correct_code: reference code
    :param test_code: code to check
    :return: True when the counts agree; when ``test_code`` has no such call,
        True only if zero arguments are expected and the name occurs in it
    """
    # Escape the name so characters such as "." match literally instead of
    # being read as regex syntax.
    pattern = rf'{re.escape(function_name)}\((.*?)\)'

    def _count_args(match):
        # Number of non-empty comma-separated pieces inside the parentheses.
        return len([p for p in match.group(1).strip().split(',') if p.strip()])

    correct_match = re.search(pattern, correct_code)
    # No call in the reference means zero arguments are expected.
    expected_count = _count_args(correct_match) if correct_match else 0

    test_match = re.search(pattern, test_code)
    if test_match:
        return _count_args(test_match) == expected_count
    # No parentheses in the tested code: accept a bare mention of the name.
    return expected_count == 0 and function_name in test_code
def check_keyword_parameters(function_name, correct_code, test_code):
    """
    Check that every keyword argument used in the reference call is also
    passed by keyword in the tested call.

    Only the first ``function_name(...)`` occurrence of each code string is
    inspected, up to the first closing parenthesis.

    :param function_name: name to look for; matched literally
    :param correct_code: reference code
    :param test_code: code to check
    :return: False when either code lacks the call or a reference keyword has
        no keyword-style counterpart in the tested call; True otherwise
    """
    # Escape the name so characters such as "." match literally instead of
    # being read as regex syntax.
    pattern = rf'{re.escape(function_name)}\((.*?)\)'

    def _split_args(match):
        # Non-empty, stripped comma-separated pieces inside the parentheses.
        return [p.strip() for p in match.group(1).strip().split(',') if p.strip()]

    correct_match = re.search(pattern, correct_code)
    if not correct_match:
        return False
    test_match = re.search(pattern, test_code)
    if not test_match:
        return False

    given_args = _split_args(test_match)
    for expected in _split_args(correct_match):
        if '=' not in expected:
            continue  # positional in the reference: nothing to enforce
        keyword = expected.split('=')[0].strip()
        if not any(keyword in given and '=' in given for given in given_args):
            return False
    return True
def with_correct(answer_code: str, model_output: str) -> bool:
    """
    Check that the answer and the model output agree on starting with ``with``.

    :param answer_code: reference code
    :param model_output: model-generated code
    :return: True when both strings start with ``'with'`` or neither does
    """
    return answer_code.startswith('with') == model_output.startswith('with')
def compute_line_score_k(
    answer: str, model_output: list, k: int, model_filled_code, core_line
):
    """
    Compute cdc@k for one line-completion sample.

    An output passes when it contains ``answer`` as a whole word, its filled
    code compiles, and it matches ``core_line`` in argument count, ``with``
    usage and keyword arguments.

    :return: ``1 - C(n - c, k) / C(n, k)`` for ``n`` outputs of which ``c``
        pass; ``0.0`` when none passes
    """
    c = 0
    n = len(model_output)
    for index, code in enumerate(model_output):
        if (
            re.search(rf'\b{re.escape(answer)}\b', code)
            and is_code_valid(model_filled_code[index])
            and is_correct_parameter_count(answer, core_line, code)
            and with_correct(core_line, code)
            and check_keyword_parameters(answer, core_line, code)
        ):  # line
            c += 1
    # Nothing passed: the score is 0 even with fewer than k outputs, where the
    # shortcut below would otherwise report a perfect score.
    if c == 0:
        return 0.0
    if n - c < k:
        return 1.0
    return 1 - (math.comb(n - c, k)) / (math.comb(n, k))
def compute_block_score_k(
    answer: str,
    model_output: list,
    k: int,
    model_filled_code,
    core_line_in_core_block,
    core_line_in_output_clear,
):
    """
    Compute cdc@k for one block-completion sample.

    An output passes when it contains ``answer`` as a whole word, its filled
    code compiles, and its core line matches the reference core line in
    argument count, ``with`` usage and keyword arguments.

    :return: ``1 - C(n - c, k) / C(n, k)`` for ``n`` outputs of which ``c``
        pass; ``0.0`` when none passes
    """
    c = 0
    n = len(model_output)
    for index, code in enumerate(model_output):
        if (
            re.search(rf'\b{re.escape(answer)}\b', code)
            and is_code_valid(model_filled_code[index])
            and is_correct_parameter_count(
                answer, core_line_in_core_block, core_line_in_output_clear[index]
            )
            and with_correct(core_line_in_core_block, core_line_in_output_clear[index])
            and check_keyword_parameters(
                answer, core_line_in_core_block, core_line_in_output_clear[index]
            )
        ):  # block
            c += 1
    # Nothing passed: the score is 0 even with fewer than k outputs, where the
    # shortcut below would otherwise report a perfect score.
    if c == 0:
        return 0.0
    if n - c < k:
        return 1.0
    return 1 - (math.comb(n - c, k)) / (math.comb(n, k))
def compute_score_k(answer: str, model_output: list, k: int):
    """
    Compute a pass@k-style score: an output passes when it contains ``answer``
    as a whole word and compiles.

    :return: ``1 - C(n - c, k) / C(n, k)`` for ``n`` outputs of which ``c``
        pass; ``0.0`` when none passes
    """
    c = 0
    n = len(model_output)
    for code in model_output:
        if re.search(rf'\b{re.escape(answer)}\b', code) and is_code_valid(
            code
        ):  # block
            # if re.search(rf'\b{re.escape(answer)}\b', code):#line
            c += 1
    # Nothing passed: the score is 0 even with fewer than k outputs, where the
    # shortcut below would otherwise report a perfect score.
    if c == 0:
        return 0.0
    if n - c < k:
        return 1.0
    return 1 - (math.comb(n - c, k)) / (math.comb(n, k))
k = 3 # cdc@k
task = 'block' # line or block
# NOTE(review): other scripts of this benchmark spell the prefix 'VersiCode_' -
# confirm the casing matches the result files on disk.
json_name = f'Versicode_{task}_completion.json'
folder_path = f'../data/result_data/{task}_completion'
# Each entry of the results folder is treated as one model's directory.
model_list = os.listdir(folder_path)
for model in model_list:
    model_json_path = os.path.join(folder_path, model, json_name)
    with open(model_json_path, 'r', encoding='utf-8') as fr:
        lodict = json.load(fr)
    data_list = lodict
    if task == 'line':
        score_list = []
        for data in data_list:
            answer = data['core_token']
            # NOTE(review): eval() executes whatever expression the result file
            # holds; ast.literal_eval would parse a plain list literal safely.
            model_output = eval(data['model_output_clear'])
            # Fill each output into the masked code so the whole snippet can be compiled.
            model_filled_code = [
                data['masked_code'].replace('<mask>', i) for i in model_output
            ]
            core_line = data['core_line']
            score_list.append(
                compute_line_score_k(
                    answer, model_output, k, model_filled_code, core_line
                )
            )
    else:
        score_list = []
        for data in data_list:
            answer = data['core_token']
            # NOTE(review): same eval() concern as in the line branch.
            model_output = eval(data['model_output_clear'])
            # For blocks the outputs themselves are the code that must compile.
            model_filled_code = eval(data['model_output_clear'])
            core_line = data['core_line']
            core_line_in_output_clear = data['core_line_in_output_clear']
            score_list.append(
                compute_block_score_k(
                    answer,
                    model_output,
                    k,
                    model_filled_code,
                    core_line,
                    core_line_in_output_clear,
                )
            )
    # Mean cdc@k over all samples of this model (raises ZeroDivisionError on
    # an empty result file).
    final_score = sum(score_list) / len(score_list)
    print(f'{model}, {task} completion task, cdc@{k} score: {final_score}')

View File

@@ -1,209 +0,0 @@
"""
Calculate the em score for line and block
"""
import json
import math
import os
import re
# warnings.filterwarnings("ignore", category=SyntaxWarning)
def is_code_valid(code):
    """Return True when ``code`` compiles as a Python module, False on any exception."""
    try:
        compile(code, '<string>', 'exec')
    except Exception:
        return False
    else:
        return True
def is_correct_parameter_count(function_name, correct_code, test_code):
    """
    Check that ``function_name`` is called with the same number of arguments
    in ``test_code`` as in ``correct_code``.

    Arguments are read from the first ``function_name(...)`` occurrence up to
    the first closing parenthesis and split on commas, so nested calls and
    commas inside literals are not handled.

    :param function_name: name to look for; matched literally
    :param correct_code: reference code
    :param test_code: code to check
    :return: True when the counts agree; when ``test_code`` has no such call,
        True only if zero arguments are expected and the name occurs in it
    """
    # Escape the name so characters such as "." match literally instead of
    # being read as regex syntax.
    pattern = rf'{re.escape(function_name)}\((.*?)\)'

    def _count_args(match):
        # Number of non-empty comma-separated pieces inside the parentheses.
        return len([p for p in match.group(1).strip().split(',') if p.strip()])

    correct_match = re.search(pattern, correct_code)
    # No call in the reference means zero arguments are expected.
    expected_count = _count_args(correct_match) if correct_match else 0

    test_match = re.search(pattern, test_code)
    if test_match:
        return _count_args(test_match) == expected_count
    # No parentheses in the tested code: accept a bare mention of the name.
    return expected_count == 0 and function_name in test_code
def check_keyword_parameters(function_name, correct_code, test_code):
    """
    Check that every keyword argument used in the reference call is also
    passed by keyword in the tested call.

    Only the first ``function_name(...)`` occurrence of each code string is
    inspected, up to the first closing parenthesis.

    :param function_name: name to look for; matched literally
    :param correct_code: reference code
    :param test_code: code to check
    :return: False when either code lacks the call or a reference keyword has
        no keyword-style counterpart in the tested call; True otherwise
    """
    # Escape the name so characters such as "." match literally instead of
    # being read as regex syntax.
    pattern = rf'{re.escape(function_name)}\((.*?)\)'

    def _split_args(match):
        # Non-empty, stripped comma-separated pieces inside the parentheses.
        return [p.strip() for p in match.group(1).strip().split(',') if p.strip()]

    correct_match = re.search(pattern, correct_code)
    if not correct_match:
        return False
    test_match = re.search(pattern, test_code)
    if not test_match:
        return False

    given_args = _split_args(test_match)
    for expected in _split_args(correct_match):
        if '=' not in expected:
            continue  # positional in the reference: nothing to enforce
        keyword = expected.split('=')[0].strip()
        if not any(keyword in given and '=' in given for given in given_args):
            return False
    return True
def with_correct(answer_code: str, model_output: str) -> bool:
    """
    Check that the answer and the model output agree on starting with ``with``.

    :param answer_code: reference code
    :param model_output: model-generated code
    :return: True when both strings start with ``'with'`` or neither does
    """
    return answer_code.startswith('with') == model_output.startswith('with')
def compute_line_score_k(
    answer: str, model_output: list, k: int, model_filled_code, core_line
):
    """
    Compute em@k for one line-completion sample: an output passes when it
    contains ``answer`` as a whole word.

    ``model_filled_code`` and ``core_line`` are accepted but not used.

    :return: ``1 - C(n - c, k) / C(n, k)`` for ``n`` outputs of which ``c``
        pass; ``0.0`` when none passes
    """
    n = len(model_output)
    c = sum(
        1 for code in model_output if re.search(rf'\b{re.escape(answer)}\b', code)
    )
    # Nothing passed: the score is 0 even with fewer than k outputs, where the
    # shortcut below would otherwise report a perfect score.
    if c == 0:
        return 0.0
    if n - c < k:
        return 1.0
    return 1 - (math.comb(n - c, k)) / (math.comb(n, k))
def compute_block_score_k(
    answer: str,
    model_output: list,
    k: int,
    model_filled_code,
    core_line_in_core_block,
    core_line_in_output_clear,
):
    """
    Compute em@k for one block-completion sample: an output passes when it
    contains ``answer`` as a whole word.

    ``model_filled_code``, ``core_line_in_core_block`` and
    ``core_line_in_output_clear`` are accepted but not used.

    :return: ``1 - C(n - c, k) / C(n, k)`` for ``n`` outputs of which ``c``
        pass; ``0.0`` when none passes
    """
    n = len(model_output)
    c = sum(
        1 for code in model_output if re.search(rf'\b{re.escape(answer)}\b', code)
    )
    # Nothing passed: the score is 0 even with fewer than k outputs, where the
    # shortcut below would otherwise report a perfect score.
    if c == 0:
        return 0.0
    if n - c < k:
        return 1.0
    return 1 - (math.comb(n - c, k)) / (math.comb(n, k))
def compute_score_k(answer: str, model_output: list, k: int):
    """
    Compute a pass@k-style score: an output passes when it contains ``answer``
    as a whole word and compiles.

    :return: ``1 - C(n - c, k) / C(n, k)`` for ``n`` outputs of which ``c``
        pass; ``0.0`` when none passes
    """
    c = 0
    n = len(model_output)
    for code in model_output:
        if re.search(rf'\b{re.escape(answer)}\b', code) and is_code_valid(
            code
        ):  # block
            # if re.search(rf'\b{re.escape(answer)}\b', code):#line
            c += 1
    # Nothing passed: the score is 0 even with fewer than k outputs, where the
    # shortcut below would otherwise report a perfect score.
    if c == 0:
        return 0.0
    if n - c < k:
        return 1.0
    return 1 - (math.comb(n - c, k)) / (math.comb(n, k))
k = 3 # em@k
task = 'block' # line or block
# NOTE(review): other scripts of this benchmark spell the prefix 'VersiCode_' -
# confirm the casing matches the result files on disk.
json_name = f'Versicode_{task}_completion.json'
folder_path = f'../data/result_data/{task}_completion'
# Each entry of the results folder is treated as one model's directory.
model_list = os.listdir(folder_path)
for model in model_list:
    model_json_path = os.path.join(folder_path, model, json_name)
    with open(model_json_path, 'r', encoding='utf-8') as fr:
        lodict = json.load(fr)
    data_list = lodict
    if task == 'line':
        score_list = []
        for data in data_list:
            answer = data['core_token']
            # NOTE(review): eval() executes whatever expression the result file
            # holds; ast.literal_eval would parse a plain list literal safely.
            model_output = eval(data['model_output_clear'])
            # Each output is substituted for the '<mask>' placeholder.
            model_filled_code = [
                data['masked_code'].replace('<mask>', i) for i in model_output
            ]
            core_line = data['core_line']
            score_list.append(
                compute_line_score_k(
                    answer, model_output, k, model_filled_code, core_line
                )
            )
    else:
        score_list = []
        for data in data_list:
            answer = data['core_token']
            # NOTE(review): same eval() concern as in the line branch.
            model_output = eval(data['model_output_clear'])
            model_filled_code = eval(data['model_output_clear'])
            core_line = data['core_line']
            core_line_in_output_clear = data['core_line_in_output_clear']
            score_list.append(
                compute_block_score_k(
                    answer,
                    model_output,
                    k,
                    model_filled_code,
                    core_line,
                    core_line_in_output_clear,
                )
            )
    # Mean em@k over all samples of this model (raises ZeroDivisionError on
    # an empty result file).
    final_score = sum(score_list) / len(score_list)
    print(f'{model}, {task} completion task, em@{k} score: {final_score}')

View File

@@ -1,99 +0,0 @@
"""
Find the line of code generated by the model using the block in the version code
"""
import json
import os
import random
import re
def process_line_mask(code_snippet, core_token):
    """Mask one randomly chosen line of ``code_snippet`` that uses ``core_token``.

    Comment lines, lines inside or containing triple-quoted strings, and the
    ``def task_function`` header are never candidates. A line qualifies when
    ``core_token`` appears as a whole word that is not directly followed by
    ``=``.

    Returns:
        ``(masked_code, masked_line)`` where the chosen line is replaced by
        ``<line_mask>`` (original indentation kept) and ``masked_line`` is the
        stripped original line; ``(None, None)`` when ``core_token`` is falsy
        or no line qualifies.
    """
    if not core_token:
        return None, None

    paired_triple_quotes = r"'''(.*?)'''|\"\"\"(.*?)\"\"\""
    token_usage = r'\b{}\b(?!\s*=)'.format(re.escape(core_token))

    lines = code_snippet.split('\n')
    candidate_indices = []
    inside_docstring = False

    for idx, text in enumerate(lines):
        has_triple_quote = '"""' in text or "'''" in text
        has_closed_pair = bool(re.findall(paired_triple_quotes, text))
        # An unpaired triple quote opens or closes a multi-line string.
        toggles_docstring = has_triple_quote and not has_closed_pair

        if inside_docstring:
            if toggles_docstring:
                inside_docstring = False
            continue
        if text.strip().startswith('#'):
            continue
        if has_closed_pair:
            continue
        if toggles_docstring:
            inside_docstring = True
            continue
        if re.search(r'\bdef\s+task_function\b', text):
            continue
        if re.search(token_usage, text):
            candidate_indices.append(idx)

    if not candidate_indices:
        return None, None

    chosen = random.choice(candidate_indices)
    original_line = lines[chosen]
    indent = re.match(r'^\s*', original_line).group(0)
    lines[chosen] = indent + '<line_mask>'
    return '\n'.join(lines), original_line.strip()
def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)
def save_json(file_path, data):
    """Write ``data`` to ``file_path`` as UTF-8 JSON, 4-space indented, non-ASCII kept as-is."""
    with open(file_path, mode='w', encoding='utf-8') as sink:
        json.dump(obj=data, fp=sink, indent=4, ensure_ascii=False)
# Post-processing driver: for each model's block-completion results, record
# which line of the reference code and of every model output uses the core
# token, then write the annotated records back to the same file.
if __name__ == '__main__':
    # Every directory entry is treated as a model name.
    model_list = os.listdir('../data/result_data/block_completion')
    for model in model_list:
        input_json_file = f'../data/result_data/block_completion/{model}/VersiCode_block_completion.json'
        # Results are rewritten in place.
        output_json_file = input_json_file
        data = load_json(input_json_file)
        for item in data:
            core_token = item['core_token']
            code = item['code']
            # Only the extracted line is needed; the masked code is discarded.
            _, core_line_in_code = process_line_mask(code, core_token)
            if core_line_in_code:
                item['core_line_in_code'] = core_line_in_code
            else:
                # Placeholder when no line of the reference code qualifies.
                item['core_line_in_code'] = 'N/A'
            model_output_clear = item['model_output_clear']
            core_line_in_output_list = []
            # NOTE(review): eval() executes whatever text is stored in the
            # results file; ast.literal_eval would parse the stringified list
            # without executing code.
            for entry in eval(model_output_clear):
                _, core_line_in_output = process_line_mask(entry, core_token)
                if core_line_in_output:
                    core_line_in_output_list.append(core_line_in_output)
                else:
                    core_line_in_output_list.append('N/A')
            # One entry per model output, in the same order as the outputs.
            item['core_line_in_output_clear'] = core_line_in_output_list
        save_json(output_json_file, data)
    print('Done!')

View File

@@ -1,102 +0,0 @@
"""
Find the line of code generated by the model using the block in the version code
"""
import json
import os
import random
import re
def process_line_mask(code_snippet, core_token):
    """Mask one randomly chosen line of ``code_snippet`` that uses ``core_token``.

    Comment lines, lines inside or containing triple-quoted strings, and the
    ``def task_function`` header are never candidates. A line qualifies when
    ``core_token`` appears as a whole word that is not directly followed by
    ``=``.

    Returns:
        ``(masked_code, masked_line)`` where the chosen line is replaced by
        ``<line_mask>`` (original indentation kept) and ``masked_line`` is the
        stripped original line; ``(None, None)`` when ``core_token`` is falsy
        or no line qualifies.
    """
    if not core_token:
        return None, None

    paired_triple_quotes = r"'''(.*?)'''|\"\"\"(.*?)\"\"\""
    token_usage = r'\b{}\b(?!\s*=)'.format(re.escape(core_token))

    lines = code_snippet.split('\n')
    candidate_indices = []
    inside_docstring = False

    for idx, text in enumerate(lines):
        has_triple_quote = '"""' in text or "'''" in text
        has_closed_pair = bool(re.findall(paired_triple_quotes, text))
        # An unpaired triple quote opens or closes a multi-line string.
        toggles_docstring = has_triple_quote and not has_closed_pair

        if inside_docstring:
            if toggles_docstring:
                inside_docstring = False
            continue
        if text.strip().startswith('#'):
            continue
        if has_closed_pair:
            continue
        if toggles_docstring:
            inside_docstring = True
            continue
        if re.search(r'\bdef\s+task_function\b', text):
            continue
        if re.search(token_usage, text):
            candidate_indices.append(idx)

    if not candidate_indices:
        return None, None

    chosen = random.choice(candidate_indices)
    original_line = lines[chosen]
    indent = re.match(r'^\s*', original_line).group(0)
    lines[chosen] = indent + '<line_mask>'
    return '\n'.join(lines), original_line.strip()
def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)
def save_json(file_path, data):
    """Write ``data`` to ``file_path`` as UTF-8 JSON, 4-space indented, non-ASCII kept as-is."""
    with open(file_path, mode='w', encoding='utf-8') as sink:
        json.dump(obj=data, fp=sink, indent=4, ensure_ascii=False)
# Post-processing driver: for each model's code-migration results, record the
# line of the old code that uses the old name and, for every model output, the
# line that uses the new name, then write the records back to the same file.
if __name__ == '__main__':
    # Every directory entry is treated as a model name.
    model_list = os.listdir('../data/result_data/code_migration')
    for model in model_list:
        input_json_file = (
            f'../data/result_data/code_migration/{model}/VersiCode_migration.json'
        )
        # Results are rewritten in place.
        output_json_file = input_json_file
        data = load_json(input_json_file)
        for item in data:
            # The reference code is searched for the pre-migration name.
            core_token = item['old_name']
            code = item['old_code']
            # Only the extracted line is needed; the masked code is discarded.
            _, core_line_in_code = process_line_mask(code, core_token)
            if core_line_in_code:
                item['core_line_in_code'] = core_line_in_code
            else:
                # Placeholder when no line of the old code qualifies.
                item['core_line_in_code'] = 'N/A'
            model_output_clear = item['model_output_clear']
            core_line_in_output_list = []
            # Model outputs are searched for the post-migration name instead.
            core_token = item['new_name']
            # NOTE(review): eval() executes whatever text is stored in the
            # results file; ast.literal_eval would parse the stringified list
            # without executing code.
            for entry in eval(model_output_clear):
                _, core_line_in_output = process_line_mask(entry, core_token)
                if core_line_in_output:
                    core_line_in_output_list.append(core_line_in_output)
                else:
                    core_line_in_output_list.append('N/A')
            # One entry per model output, in the same order as the outputs.
            item['core_line_in_output_clear'] = core_line_in_output_list
        save_json(output_json_file, data)
    print('Done!')

View File

@@ -1,38 +0,0 @@
"""
Clear the <start> and <end> tags generated by the model during inference
"""
import json
# Extracts the text between <start> and <end> from every raw model output and
# stores the cleaned list back into the results file as 'model_output_clear'.
model_name = ''  # must be set to the model's result directory before running
task = 'block_completion'
result_path = f'../data/result_data/{task}/{model_name}/VersiCode_block_completion.json'  # Modify the file according to the task format
with open(result_path, 'r', encoding='utf-8') as fr:
    lodict = json.load(fr)
# data_dict and data_list both alias the loaded object, so the per-record
# updates below are what gets written back at the end.
data_dict = lodict
data_list = data_dict
for data in data_list:
    temp_list = []
    # NOTE(review): eval() executes whatever text is stored in the results
    # file; ast.literal_eval would parse a stringified list without executing
    # code.
    model_output_list = eval(data['model_output'])
    for output in model_output_list:
        if '<start>' in output and '<end>' in output:
            # Take the span from just after the first <start> to the first <end>.
            start_index = output.find('<start>') + len('<start>')
            end_index = output.find('<end>')
            # Markdown code fences are stripped from the extracted span.
            content = (
                output[start_index:end_index]
                .replace('```python', '')
                .replace('```', '')
            )
        else:
            # Sentinel for outputs that lack either tag.
            content = 'no_answer'
        temp_list.append(content)
    # Stored as the string form of the list, not as a JSON array.
    data['model_output_clear'] = str(temp_list)
# Overwrites the input file in place.
with open(result_path, 'w', encoding='utf-8') as fw:
    json.dump(data_dict, fw, indent=4, ensure_ascii=False)

View File

@@ -1,146 +0,0 @@
aiohappyeyeballs==2.6.1
aiohttp==3.11.18
aiosignal==1.3.2
airportsdata==20250224
annotated-types==0.7.0
anyio==4.9.0
astor==0.8.1
attrs==25.3.0
blake3==1.0.4
cachetools==5.5.2
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
cloudpickle==3.1.1
compressed-tensors==0.9.3
cupy-cuda12x==13.4.1
Deprecated==1.2.18
depyf==0.18.0
dill==0.4.0
diskcache==5.6.3
distro==1.9.0
dnspython==2.7.0
einops==0.8.1
email_validator==2.2.0
fastapi==0.115.12
fastapi-cli==0.0.7
fastrlock==0.8.3
filelock==3.18.0
frozenlist==1.6.0
fsspec==2025.3.2
gguf==0.16.2
googleapis-common-protos==1.70.0
grpcio==1.71.0
h11==0.14.0
hf-xet==1.0.3
httpcore==1.0.8
httptools==0.6.4
httpx==0.28.1
huggingface-hub==0.30.2
idna==3.10
importlib_metadata==8.0.0
interegular==0.3.3
Jinja2==3.1.6
jiter==0.9.0
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
lark==1.2.2
llguidance==0.7.16
llvmlite==0.44.0
lm-format-enforcer==0.10.11
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
mistral_common==1.5.4
mpmath==1.3.0
msgpack==1.1.0
msgspec==0.19.0
multidict==6.4.3
nest-asyncio==1.6.0
networkx==3.4.2
ninja==1.11.1.4
numba==0.61.2
numpy==2.2.5
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
openai==1.75.0
opencv-python-headless==4.11.0.86
opentelemetry-api==1.26.0
opentelemetry-exporter-otlp==1.26.0
opentelemetry-exporter-otlp-proto-common==1.26.0
opentelemetry-exporter-otlp-proto-grpc==1.26.0
opentelemetry-exporter-otlp-proto-http==1.26.0
opentelemetry-proto==1.26.0
opentelemetry-sdk==1.26.0
opentelemetry-semantic-conventions==0.47b0
opentelemetry-semantic-conventions-ai==0.4.3
outlines==0.1.11
outlines_core==0.1.26
packaging==25.0
partial-json-parser==0.2.1.1.post5
pillow==11.2.1
prometheus-fastapi-instrumentator==7.1.0
prometheus_client==0.21.1
propcache==0.3.1
protobuf==4.25.6
psutil==7.0.0
py-cpuinfo==9.0.0
pycountry==24.6.1
pydantic==2.11.3
pydantic_core==2.33.1
Pygments==2.19.1
python-dotenv==1.1.0
python-json-logger==3.3.0
python-multipart==0.0.20
PyYAML==6.0.2
pyzmq==26.4.0
ray==2.43.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
rich==14.0.0
rich-toolkit==0.14.1
rpds-py==0.24.0
safetensors==0.5.3
scipy==1.15.2
sentencepiece==0.2.0
setuptools==75.8.0
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
starlette==0.46.2
sympy==1.13.1
tiktoken==0.9.0
tokenizers==0.21.1
torch==2.6.0
torchaudio==2.6.0
torchvision==0.21.0
tqdm==4.67.1
transformers==4.51.3
triton==3.2.0
typer==0.15.2
typing-inspection==0.4.0
typing_extensions==4.13.2
urllib3==2.4.0
uvicorn==0.34.2
uvloop==0.21.0
vllm==0.8.4
watchfiles==1.0.5
websockets==15.0.1
wheel==0.45.1
wrapt==1.17.2
xformers==0.0.29.post2
xgrammar==0.1.18
yarl==1.20.0
zipp==3.21.0

View File

@@ -212,7 +212,7 @@ if __name__ == '__main__':
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
# modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
# modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
llm_config.modify_params = False
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

View File

@@ -263,19 +263,8 @@ def prepare_dataset(
f'Randomly sampling {eval_n_limit} unique instances with random seed 42.'
)
def make_serializable(instance: pd.Series) -> dict:
    """Convert a row to a plain dict, turning ndarrays into lists and Timestamps into strings."""
    import numpy as np

    def _plain(value):
        # Other value types pass through untouched.
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, pd.Timestamp):
            return str(value)
        return value

    return {field: _plain(value) for field, value in instance.to_dict().items()}
new_dataset = [
make_serializable(instance)
instance
for _, instance in dataset.iterrows()
if str(instance[id_column]) not in finished_ids
]

View File

@@ -193,9 +193,9 @@ describe("ChatInput", () => {
it("should handle image paste correctly", () => {
const onSubmit = vi.fn();
const onFilesPaste = vi.fn();
const onImagePaste = vi.fn();
render(<ChatInput onSubmit={onSubmit} onFilesPaste={onFilesPaste} />);
render(<ChatInput onSubmit={onSubmit} onImagePaste={onImagePaste} />);
const input = screen.getByTestId("chat-input").querySelector("textarea");
expect(input).toBeTruthy();
@@ -213,8 +213,8 @@ describe("ChatInput", () => {
},
});
// Verify file paste was handled
expect(onFilesPaste).toHaveBeenCalledWith([file]);
// Verify image paste was handled
expect(onImagePaste).toHaveBeenCalledWith([file]);
});
it("should use the default maxRows value", () => {

View File

@@ -4,6 +4,7 @@ import userEvent from "@testing-library/user-event";
import { renderWithProviders } from "test-utils";
import type { Message } from "#/message";
import { SUGGESTIONS } from "#/utils/suggestions";
import { WsClientProviderStatus } from "#/context/ws-client-provider";
import { ChatInterface } from "#/components/features/chat/chat-interface";
// eslint-disable-next-line @typescript-eslint/no-unused-vars
@@ -18,7 +19,7 @@ describe("Empty state", () => {
const { useWsClient: useWsClientMock } = vi.hoisted(() => ({
useWsClient: vi.fn(() => ({
send: sendMock,
status: "CONNECTED",
status: WsClientProviderStatus.CONNECTED,
isLoadingMessages: false,
})),
}));
@@ -63,7 +64,7 @@ describe("Empty state", () => {
// this is to test that the message is in the UI before the socket is called
useWsClientMock.mockImplementation(() => ({
send: sendMock,
status: "CONNECTED",
status: WsClientProviderStatus.CONNECTED,
isLoadingMessages: false,
}));
const user = userEvent.setup();
@@ -86,7 +87,7 @@ describe("Empty state", () => {
async () => {
useWsClientMock.mockImplementation(() => ({
send: sendMock,
status: "CONNECTED",
status: WsClientProviderStatus.CONNECTED,
isLoadingMessages: false,
}));
const user = userEvent.setup();
@@ -100,7 +101,7 @@ describe("Empty state", () => {
useWsClientMock.mockImplementation(() => ({
send: sendMock,
status: "CONNECTED",
status: WsClientProviderStatus.CONNECTED,
isLoadingMessages: false,
}));
rerender(<ChatInterface />);

View File

@@ -478,7 +478,7 @@ describe("ConversationCard", () => {
title="Conversation 1"
selectedRepository={null}
lastUpdatedAt="2021-10-01T12:00:00Z"
conversationStatus="RUNNING"
status="RUNNING"
/>,
);

View File

@@ -48,7 +48,6 @@ describe("ConversationPanel", () => {
last_updated_at: "2021-10-01T12:00:00Z",
created_at: "2021-10-01T12:00:00Z",
status: "STOPPED" as const,
runtime_status: null,
url: null,
session_api_key: null,
},
@@ -61,7 +60,6 @@ describe("ConversationPanel", () => {
last_updated_at: "2021-10-02T12:00:00Z",
created_at: "2021-10-02T12:00:00Z",
status: "STOPPED" as const,
runtime_status: null,
url: null,
session_api_key: null,
},
@@ -74,7 +72,6 @@ describe("ConversationPanel", () => {
last_updated_at: "2021-10-03T12:00:00Z",
created_at: "2021-10-03T12:00:00Z",
status: "STOPPED" as const,
runtime_status: null,
url: null,
session_api_key: null,
},
@@ -161,7 +158,6 @@ describe("ConversationPanel", () => {
last_updated_at: "2021-10-01T12:00:00Z",
created_at: "2021-10-01T12:00:00Z",
status: "STOPPED" as const,
runtime_status: null,
url: null,
session_api_key: null,
},
@@ -174,7 +170,6 @@ describe("ConversationPanel", () => {
last_updated_at: "2021-10-02T12:00:00Z",
created_at: "2021-10-02T12:00:00Z",
status: "STOPPED" as const,
runtime_status: null,
url: null,
session_api_key: null,
},
@@ -187,7 +182,6 @@ describe("ConversationPanel", () => {
last_updated_at: "2021-10-03T12:00:00Z",
created_at: "2021-10-03T12:00:00Z",
status: "STOPPED" as const,
runtime_status: null,
url: null,
session_api_key: null,
},

View File

@@ -31,7 +31,7 @@ const renderRepoConnector = () => {
},
{
Component: () => <div data-testid="git-settings-screen" />,
path: "/settings/integrations",
path: "/settings/git",
},
],
},
@@ -50,13 +50,13 @@ const renderRepoConnector = () => {
const MOCK_RESPOSITORIES: GitRepository[] = [
{
id: "1",
id: 1,
full_name: "rbren/polaris",
git_provider: "github",
is_public: true,
},
{
id: "2",
id: 2,
full_name: "All-Hands-AI/OpenHands",
git_provider: "github",
is_public: true,

View File

@@ -94,13 +94,13 @@ describe("RepositorySelectionForm", () => {
it("shows loading indicator when repositories are being fetched", () => {
const MOCK_REPOS: GitRepository[] = [
{
id: "1",
id: 1,
full_name: "user/repo1",
git_provider: "github",
is_public: true,
},
{
id: "2",
id: 2,
full_name: "user/repo2",
git_provider: "github",
is_public: true,
@@ -122,13 +122,13 @@ describe("RepositorySelectionForm", () => {
it("shows dropdown when repositories are loaded", async () => {
const MOCK_REPOS: GitRepository[] = [
{
id: "1",
id: 1,
full_name: "user/repo1",
git_provider: "github",
is_public: true,
},
{
id: "2",
id: 2,
full_name: "user/repo2",
git_provider: "github",
is_public: true,
@@ -166,13 +166,13 @@ describe("RepositorySelectionForm", () => {
it("should call the search repos API when searching a URL", async () => {
const MOCK_REPOS: GitRepository[] = [
{
id: "1",
id: 1,
full_name: "user/repo1",
git_provider: "github",
is_public: true,
},
{
id: "2",
id: 2,
full_name: "user/repo2",
git_provider: "github",
is_public: true,
@@ -181,7 +181,7 @@ describe("RepositorySelectionForm", () => {
const MOCK_SEARCH_REPOS: GitRepository[] = [
{
id: "3",
id: 3,
full_name: "kubernetes/kubernetes",
git_provider: "github",
is_public: true,
@@ -228,7 +228,7 @@ describe("RepositorySelectionForm", () => {
it("should call onRepoSelection when a searched repository is selected", async () => {
const MOCK_SEARCH_REPOS: GitRepository[] = [
{
id: "3",
id: 3,
full_name: "kubernetes/kubernetes",
git_provider: "github",
is_public: true,

View File

@@ -19,10 +19,10 @@ const MOCK_TASK_1: SuggestedTask = {
};
const MOCK_RESPOSITORIES: GitRepository[] = [
{ id: "1", full_name: "repo1", git_provider: "github", is_public: true },
{ id: "2", full_name: "repo2", git_provider: "github", is_public: true },
{ id: "3", full_name: "repo3", git_provider: "gitlab", is_public: true },
{ id: "4", full_name: "repo4", git_provider: "gitlab", is_public: true },
{ id: 1, full_name: "repo1", git_provider: "github", is_public: true },
{ id: 2, full_name: "repo2", git_provider: "github", is_public: true },
{ id: 3, full_name: "repo3", git_provider: "gitlab", is_public: true },
{ id: 4, full_name: "repo4", git_provider: "gitlab", is_public: true },
];
const renderTaskCard = (task = MOCK_TASK_1) => {

Some files were not shown because too many files have changed in this diff Show More