mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-04-29 03:00:45 -04:00
Compare commits
57 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
907c65cc00 | ||
|
|
a6d1a4c98f | ||
|
|
a60ee09881 | ||
|
|
246107c618 | ||
|
|
5fa18511b3 | ||
|
|
a482182a9e | ||
|
|
58d22a1905 | ||
|
|
17bbfa29a1 | ||
|
|
5fe116cfb1 | ||
|
|
e9637d40b9 | ||
|
|
6de177521f | ||
|
|
9d36b80b96 | ||
|
|
b11e905988 | ||
|
|
39e5311233 | ||
|
|
651ed1c3c8 | ||
|
|
e27c2e9c99 | ||
|
|
cfe222e1d5 | ||
|
|
c872af4658 | ||
|
|
99fa6c6a4a | ||
|
|
3946f813a4 | ||
|
|
455e667739 | ||
|
|
2874041381 | ||
|
|
279e1d7abc | ||
|
|
a7e4a7aa63 | ||
|
|
2466d903df | ||
|
|
424cdf121a | ||
|
|
6972f4806f | ||
|
|
78cc552e3a | ||
|
|
a241b9ff98 | ||
|
|
c757d7c613 | ||
|
|
2b06e4e5d0 | ||
|
|
cf157c86b3 | ||
|
|
f2dc3663d7 | ||
|
|
e4e3e4abb8 | ||
|
|
22292f72cd | ||
|
|
f4ee3a4cb6 | ||
|
|
2df426732a | ||
|
|
e81623110d | ||
|
|
de81020a8d | ||
|
|
1146b6248b | ||
|
|
c3ddb26e43 | ||
|
|
3d853f7db3 | ||
|
|
027c642268 | ||
|
|
910b2a9b9e | ||
|
|
ea96ffca9b | ||
|
|
7ec407dc50 | ||
|
|
83b94786a3 | ||
|
|
786cde39fd | ||
|
|
ceb60b9a37 | ||
|
|
794408cd31 | ||
|
|
9aa89e8f2f | ||
|
|
3314b97cb2 | ||
|
|
8f47547b08 | ||
|
|
c5117bc48d | ||
|
|
851d88593c | ||
|
|
9908e1b285 | ||
|
|
793e142c4a |
2
.github/dependabot.yml
vendored
2
.github/dependabot.yml
vendored
@@ -18,7 +18,7 @@ updates:
|
||||
- "chromadb"
|
||||
browsergym:
|
||||
patterns:
|
||||
- "browsergym"
|
||||
- "browsergym*"
|
||||
security-all:
|
||||
applies-to: "security-updates"
|
||||
patterns:
|
||||
|
||||
3
.github/workflows/fe-unit-tests.yml
vendored
3
.github/workflows/fe-unit-tests.yml
vendored
@@ -24,7 +24,8 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
node-version: [20]
|
||||
node-version: [20, 22]
|
||||
fail-fast: true
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
3
.github/workflows/ghcr-build.yml
vendored
3
.github/workflows/ghcr-build.yml
vendored
@@ -68,6 +68,9 @@ jobs:
|
||||
- name: Set up Docker Buildx
|
||||
id: buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: "Set up docker layer caching"
|
||||
uses: satackey/action-docker-layer-caching@v0.0.11
|
||||
continue-on-error: true
|
||||
- name: Build and push app image
|
||||
if: "!github.event.pull_request.head.repo.fork"
|
||||
run: |
|
||||
|
||||
51
.github/workflows/openhands-resolver.yml
vendored
51
.github/workflows/openhands-resolver.yml
vendored
@@ -16,6 +16,10 @@ on:
|
||||
type: string
|
||||
default: "main"
|
||||
description: "Target branch to pull and create PR against"
|
||||
LLM_MODEL:
|
||||
required: false
|
||||
type: string
|
||||
default: "anthropic/claude-3-5-sonnet-20241022"
|
||||
base_container_image:
|
||||
required: false
|
||||
type: string
|
||||
@@ -23,15 +27,15 @@ on:
|
||||
description: "Custom sandbox env"
|
||||
secrets:
|
||||
LLM_MODEL:
|
||||
required: true
|
||||
required: false
|
||||
LLM_API_KEY:
|
||||
required: true
|
||||
LLM_BASE_URL:
|
||||
required: false
|
||||
PAT_TOKEN:
|
||||
required: true
|
||||
required: false
|
||||
PAT_USERNAME:
|
||||
required: true
|
||||
required: false
|
||||
|
||||
issues:
|
||||
types: [labeled]
|
||||
@@ -106,13 +110,14 @@ jobs:
|
||||
|
||||
- name: Check required environment variables
|
||||
env:
|
||||
LLM_MODEL: ${{ secrets.LLM_MODEL }}
|
||||
LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
|
||||
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
|
||||
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
|
||||
PAT_TOKEN: ${{ secrets.PAT_TOKEN }}
|
||||
PAT_USERNAME: ${{ secrets.PAT_USERNAME }}
|
||||
GITHUB_TOKEN: ${{ github.token }}
|
||||
run: |
|
||||
required_vars=("LLM_MODEL" "LLM_API_KEY" "PAT_TOKEN" "PAT_USERNAME")
|
||||
required_vars=("LLM_MODEL" "LLM_API_KEY")
|
||||
for var in "${required_vars[@]}"; do
|
||||
if [ -z "${!var}" ]; then
|
||||
echo "Error: Required environment variable $var is not set."
|
||||
@@ -120,6 +125,19 @@ jobs:
|
||||
fi
|
||||
done
|
||||
|
||||
# Check optional variables and warn about fallbacks
|
||||
if [ -z "$PAT_TOKEN" ]; then
|
||||
echo "Warning: PAT_TOKEN is not set, falling back to GITHUB_TOKEN"
|
||||
fi
|
||||
|
||||
if [ -z "$LLM_BASE_URL" ]; then
|
||||
echo "Warning: LLM_BASE_URL is not set, will use default API endpoint"
|
||||
fi
|
||||
|
||||
if [ -z "$PAT_USERNAME" ]; then
|
||||
echo "Warning: PAT_USERNAME is not set, will use openhands-agent"
|
||||
fi
|
||||
|
||||
- name: Set environment variables
|
||||
run: |
|
||||
if [ -n "${{ github.event.review.body }}" ]; then
|
||||
@@ -143,7 +161,7 @@ jobs:
|
||||
fi
|
||||
|
||||
echo "MAX_ITERATIONS=${{ inputs.max_iterations || 50 }}" >> $GITHUB_ENV
|
||||
echo "SANDBOX_ENV_GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }}" >> $GITHUB_ENV
|
||||
echo "SANDBOX_ENV_GITHUB_TOKEN=${{ secrets.PAT_TOKEN || github.token }}" >> $GITHUB_ENV
|
||||
echo "SANDBOX_ENV_BASE_CONTAINER_IMAGE=${{ inputs.base_container_image }}" >> $GITHUB_ENV
|
||||
|
||||
# Set branch variables
|
||||
@@ -152,7 +170,7 @@ jobs:
|
||||
- name: Comment on issue with start message
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||
github-token: ${{ secrets.PAT_TOKEN || github.token }}
|
||||
script: |
|
||||
const issueType = process.env.ISSUE_TYPE;
|
||||
github.rest.issues.createComment({
|
||||
@@ -177,9 +195,9 @@ jobs:
|
||||
|
||||
- name: Attempt to resolve issue
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
GITHUB_USERNAME: ${{ secrets.PAT_USERNAME }}
|
||||
LLM_MODEL: ${{ secrets.LLM_MODEL }}
|
||||
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN || github.token }}
|
||||
GITHUB_USERNAME: ${{ secrets.PAT_USERNAME || 'openhands-agent' }}
|
||||
LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
|
||||
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
|
||||
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
|
||||
PYTHONPATH: ""
|
||||
@@ -189,7 +207,7 @@ jobs:
|
||||
--issue-number ${{ env.ISSUE_NUMBER }} \
|
||||
--issue-type ${{ env.ISSUE_TYPE }} \
|
||||
--max-iterations ${{ env.MAX_ITERATIONS }} \
|
||||
--comment-id ${{ env.COMMENT_ID }} \
|
||||
--comment-id ${{ env.COMMENT_ID }}
|
||||
|
||||
- name: Check resolution result
|
||||
id: check_result
|
||||
@@ -211,9 +229,9 @@ jobs:
|
||||
- name: Create draft PR or push branch
|
||||
if: always() # Create PR or branch even if the previous steps fail
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
|
||||
GITHUB_USERNAME: ${{ secrets.PAT_USERNAME }}
|
||||
LLM_MODEL: ${{ secrets.LLM_MODEL }}
|
||||
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN || github.token }}
|
||||
GITHUB_USERNAME: ${{ secrets.PAT_USERNAME || 'openhands-agent' }}
|
||||
LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
|
||||
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
|
||||
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
|
||||
PYTHONPATH: ""
|
||||
@@ -221,7 +239,8 @@ jobs:
|
||||
if [ "${{ steps.check_result.outputs.RESOLUTION_SUCCESS }}" == "true" ]; then
|
||||
cd /tmp && python -m openhands.resolver.send_pull_request \
|
||||
--issue-number ${{ env.ISSUE_NUMBER }} \
|
||||
--pr-type draft | tee pr_result.txt && \
|
||||
--pr-type draft \
|
||||
--reviewer ${{ github.actor }} | tee pr_result.txt && \
|
||||
grep "draft created" pr_result.txt | sed 's/.*\///g' > pr_number.txt
|
||||
else
|
||||
cd /tmp && python -m openhands.resolver.send_pull_request \
|
||||
@@ -235,7 +254,7 @@ jobs:
|
||||
uses: actions/github-script@v7
|
||||
if: always() # Comment on issue even if the previous steps fail
|
||||
with:
|
||||
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||
github-token: ${{ secrets.PAT_TOKEN || github.token }}
|
||||
script: |
|
||||
const fs = require('fs');
|
||||
const issueNumber = ${{ env.ISSUE_NUMBER }};
|
||||
|
||||
2
.github/workflows/py-unit-tests.yml
vendored
2
.github/workflows/py-unit-tests.yml
vendored
@@ -42,7 +42,7 @@ jobs:
|
||||
- name: Build Environment
|
||||
run: make build
|
||||
- name: Run Tests
|
||||
run: poetry run pytest --forked --cov=openhands --cov-report=xml -svv ./tests/unit --ignore=tests/unit/test_memory.py
|
||||
run: poetry run pytest --forked -n auto --cov=openhands --cov-report=xml -svv ./tests/unit --ignore=tests/unit/test_memory.py
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v4
|
||||
env:
|
||||
|
||||
@@ -100,7 +100,7 @@ poetry run pytest ./tests/unit/test_*.py
|
||||
To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by
|
||||
setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.
|
||||
|
||||
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.14-nikolaik`
|
||||
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.15-nikolaik`
|
||||
|
||||
## Develop inside Docker container
|
||||
|
||||
|
||||
@@ -38,16 +38,16 @@ See the [Installation](https://docs.all-hands.dev/modules/usage/installation) gu
|
||||
system requirements and more information.
|
||||
|
||||
```bash
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik
|
||||
|
||||
docker run -it --pull=always \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
|
||||
-e LOG_ALL_EVENTS=true \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
-p 3000:3000 \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.14
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.15
|
||||
```
|
||||
|
||||
You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
|
||||
|
||||
@@ -7,7 +7,7 @@ services:
|
||||
image: openhands:latest
|
||||
container_name: openhands-app-${DATE:-}
|
||||
environment:
|
||||
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.14-nikolaik}
|
||||
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.15-nikolaik}
|
||||
- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
|
||||
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
|
||||
ports:
|
||||
|
||||
@@ -95,10 +95,10 @@ workspace_base = "./workspace"
|
||||
# AWS secret access key
|
||||
#aws_secret_access_key = ""
|
||||
|
||||
# API key to use
|
||||
# API key to use (For Headless / CLI only - In Web this is overridden by Session Init)
|
||||
api_key = "your-api-key"
|
||||
|
||||
# API base URL
|
||||
# API base URL (For Headless / CLI only - In Web this is overridden by Session Init)
|
||||
#base_url = ""
|
||||
|
||||
# API version
|
||||
@@ -131,7 +131,7 @@ embedding_model = "local"
|
||||
# Maximum number of output tokens
|
||||
#max_output_tokens = 0
|
||||
|
||||
# Model to use
|
||||
# Model to use. (For Headless / CLI only - In Web this is overridden by Session Init)
|
||||
model = "gpt-4o"
|
||||
|
||||
# Number of retries to attempt when an operation fails with the LLM.
|
||||
@@ -237,10 +237,10 @@ llm_config = 'gpt3'
|
||||
##############################################################################
|
||||
[security]
|
||||
|
||||
# Enable confirmation mode
|
||||
# Enable confirmation mode (For Headless / CLI only - In Web this is overridden by Session Init)
|
||||
#confirmation_mode = false
|
||||
|
||||
# The security analyzer to use
|
||||
# The security analyzer to use (For Headless / CLI only - In Web this is overridden by Session Init)
|
||||
#security_analyzer = ""
|
||||
|
||||
#################################### Eval ####################################
|
||||
|
||||
@@ -11,7 +11,7 @@ services:
|
||||
- BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
|
||||
- SANDBOX_API_HOSTNAME=host.docker.internal
|
||||
#
|
||||
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.14-nikolaik}
|
||||
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.15-nikolaik}
|
||||
- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
|
||||
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
|
||||
ports:
|
||||
|
||||
@@ -50,7 +50,7 @@ LLM_API_KEY="sk_test_12345"
|
||||
```bash
|
||||
docker run -it \
|
||||
--pull=always \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
|
||||
-e SANDBOX_USER_ID=$(id -u) \
|
||||
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
|
||||
-e LLM_API_KEY=$LLM_API_KEY \
|
||||
@@ -59,7 +59,7 @@ docker run -it \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app-$(date +%Y%m%d%H%M%S) \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.14 \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.15 \
|
||||
python -m openhands.core.cli
|
||||
```
|
||||
|
||||
|
||||
@@ -23,10 +23,75 @@ OpenHands provides a user-friendly Graphical User Interface (GUI) mode for inter
|
||||
|
||||
OpenHands automatically exports a `GITHUB_TOKEN` to the shell environment if it is available. This can happen in two ways:
|
||||
|
||||
1. Locally (OSS): The user directly inputs their GitHub token.
|
||||
2. Online (SaaS): The token is obtained through GitHub OAuth authentication.
|
||||
1. **Locally (OSS)**: The user directly inputs their GitHub token
|
||||
2. **Online (SaaS)**: The token is obtained through GitHub OAuth authentication
|
||||
|
||||
When you reach the `/app` route, the app checks if a token is present. If it finds one, it sets it in the environment for the agent to use.
|
||||
#### Setting Up a Local GitHub Token
|
||||
|
||||
1. **Generate a Personal Access Token (PAT)**:
|
||||
- Go to GitHub Settings > Developer Settings > Personal Access Tokens > Tokens (classic)
|
||||
- Click "Generate new token (classic)"
|
||||
- Required scopes:
|
||||
- `repo` (Full control of private repositories)
|
||||
- `workflow` (Update GitHub Action workflows)
|
||||
- `read:org` (Read organization data)
|
||||
|
||||
2. **Enter Token in OpenHands**:
|
||||
- Click the Settings button (gear icon) in the top right
|
||||
- Navigate to the "GitHub" section
|
||||
- Paste your token in the "GitHub Token" field
|
||||
- Click "Save" to apply the changes
|
||||
|
||||
#### Organizational Token Policies
|
||||
|
||||
If you're working with organizational repositories, additional setup may be required:
|
||||
|
||||
1. **Check Organization Requirements**:
|
||||
- Organization admins may enforce specific token policies
|
||||
- Some organizations require tokens to be created with SSO enabled
|
||||
- Review your organization's [token policy settings](https://docs.github.com/en/organizations/managing-programmatic-access-to-your-organization/setting-a-personal-access-token-policy-for-your-organization)
|
||||
|
||||
2. **Verify Organization Access**:
|
||||
- Go to your token settings on GitHub
|
||||
- Look for the organization under "Organization access"
|
||||
- If required, click "Enable SSO" next to your organization
|
||||
- Complete the SSO authorization process
|
||||
|
||||
#### OAuth Authentication (Online Mode)
|
||||
|
||||
When using OpenHands in online mode, the GitHub OAuth flow:
|
||||
|
||||
1. Requests the following permissions:
|
||||
- Repository access (read/write)
|
||||
- Workflow management
|
||||
- Organization read access
|
||||
|
||||
2. Authentication steps:
|
||||
- Click "Sign in with GitHub" when prompted
|
||||
- Review the requested permissions
|
||||
- Authorize OpenHands to access your GitHub account
|
||||
- If using an organization, authorize organization access if prompted
|
||||
|
||||
#### Troubleshooting
|
||||
|
||||
Common issues and solutions:
|
||||
|
||||
1. **Token Not Recognized**:
|
||||
- Ensure the token is properly saved in settings
|
||||
- Check that the token hasn't expired
|
||||
- Verify the token has the required scopes
|
||||
- Try regenerating the token
|
||||
|
||||
2. **Organization Access Denied**:
|
||||
- Check if SSO is required but not enabled
|
||||
- Verify organization membership
|
||||
- Contact organization admin if token policies are blocking access
|
||||
|
||||
3. **Verifying Token Works**:
|
||||
- The app will show a green checkmark if the token is valid
|
||||
- Try accessing a repository to confirm permissions
|
||||
- Check the browser console for any error messages
|
||||
- Use the "Test Connection" button in settings if available
|
||||
|
||||
### Advanced Settings
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ LLM_API_KEY="sk_test_12345"
|
||||
```bash
|
||||
docker run -it \
|
||||
--pull=always \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
|
||||
-e SANDBOX_USER_ID=$(id -u) \
|
||||
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
|
||||
-e LLM_API_KEY=$LLM_API_KEY \
|
||||
@@ -54,6 +54,6 @@ docker run -it \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app-$(date +%Y%m%d%H%M%S) \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.14 \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.15 \
|
||||
python -m openhands.core.main -t "write a bash script that prints hi"
|
||||
```
|
||||
|
||||
@@ -11,16 +11,16 @@
|
||||
The easiest way to run OpenHands is in Docker.
|
||||
|
||||
```bash
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik
|
||||
|
||||
docker run -it --rm --pull=always \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
|
||||
-e LOG_ALL_EVENTS=true \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
-p 3000:3000 \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.14
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.15
|
||||
```
|
||||
|
||||
You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), or using the [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action).
|
||||
|
||||
@@ -16,7 +16,7 @@ some flags being passed to `docker run` that make this possible:
|
||||
|
||||
```
|
||||
docker run # ...
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.11-nikolaik \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
# ...
|
||||
```
|
||||
|
||||
@@ -17,6 +17,7 @@ Check out [Notes for WSL on Windows Users](troubleshooting/windows) for some tro
|
||||
* [`make build` getting stuck on package installations](#make-build-getting-stuck-on-package-installations)
|
||||
* [Sessions are not restored](#sessions-are-not-restored)
|
||||
* [Connection to host.docker.internal timed out](#connection-to-host-docker-internal-timed-out)
|
||||
* [Error building runtime docker image](#error-building-runtime-docker-image)
|
||||
|
||||
### Unable to connect to Docker
|
||||
|
||||
@@ -178,3 +179,21 @@ which OpenHands makes use of when the main server is running inside a docker con
|
||||
* [Install Docker Desktop](https://www.docker.com/products/docker-desktop/)
|
||||
* Run OpenHands in [Development Mode](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md),
|
||||
So that the main server is not run inside a container, but still creates dockerized runtime sandboxes.
|
||||
|
||||
---
|
||||
### Error building runtime docker image
|
||||
|
||||
**Symptoms**
|
||||
Attempts to start a new session fail, and an errors with terms like the following appear in the logs:
|
||||
* `debian-security bookworm-security`
|
||||
* `InRelease At least one invalid signature was encountered.`
|
||||
|
||||
This seems to happen when the hash of an existing external library changes and your local docker instance has
|
||||
cached a previous version. To work around this, please try the following:
|
||||
|
||||
* Stop any containers where the name has the prefix `openhands-runtime-` :
|
||||
`docker ps --filter name=openhands-runtime- --filter status=running -aq | xargs docker stop`
|
||||
* Remove any containers where the name has the prefix `openhands-runtime-` :
|
||||
`docker rmi $(docker images --filter name=openhands-runtime- -q --no-trunc)`
|
||||
* Stop and Remove any containers / images where the name has the prefix `openhands-runtime-`
|
||||
* Prune containers / images : `docker container prune -f && docker image prune -f`
|
||||
|
||||
17
docs/package-lock.json
generated
17
docs/package-lock.json
generated
@@ -17,8 +17,8 @@
|
||||
"prism-react-renderer": "^2.4.0",
|
||||
"react": "^18.3.1",
|
||||
"react-dom": "^18.3.1",
|
||||
"react-icons": "^5.3.0",
|
||||
"react-use": "^17.5.1"
|
||||
"react-icons": "^5.4.0",
|
||||
"react-use": "^17.6.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@docusaurus/module-type-aliases": "^3.5.1",
|
||||
@@ -15155,9 +15155,10 @@
|
||||
}
|
||||
},
|
||||
"node_modules/react-icons": {
|
||||
"version": "5.3.0",
|
||||
"resolved": "https://registry.npmjs.org/react-icons/-/react-icons-5.3.0.tgz",
|
||||
"integrity": "sha512-DnUk8aFbTyQPSkCfF8dbX6kQjXA9DktMeJqfjrg6cK9vwQVMxmcA3BfP4QoiztVmEHtwlTgLFsPuH2NskKT6eg==",
|
||||
"version": "5.4.0",
|
||||
"resolved": "https://registry.npmjs.org/react-icons/-/react-icons-5.4.0.tgz",
|
||||
"integrity": "sha512-7eltJxgVt7X64oHh6wSWNwwbKTCtMfK35hcjvJS0yxEAhPM8oUKdS3+kqaW1vicIltw+kR2unHaa12S9pPALoQ==",
|
||||
"license": "MIT",
|
||||
"peerDependencies": {
|
||||
"react": "*"
|
||||
}
|
||||
@@ -15263,9 +15264,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/react-use": {
|
||||
"version": "17.5.1",
|
||||
"resolved": "https://registry.npmjs.org/react-use/-/react-use-17.5.1.tgz",
|
||||
"integrity": "sha512-LG/uPEVRflLWMwi3j/sZqR00nF6JGqTTDblkXK2nzXsIvij06hXl1V/MZIlwj1OKIQUtlh1l9jK8gLsRyCQxMg==",
|
||||
"version": "17.6.0",
|
||||
"resolved": "https://registry.npmjs.org/react-use/-/react-use-17.6.0.tgz",
|
||||
"integrity": "sha512-OmedEScUMKFfzn1Ir8dBxiLLSOzhKe/dPZwVxcujweSj45aNM7BEGPb9BEVIgVEqEXx6f3/TsXzwIktNgUR02g==",
|
||||
"dependencies": {
|
||||
"@types/js-cookie": "^2.2.6",
|
||||
"@xobotyi/scrollbar-width": "^1.9.5",
|
||||
|
||||
@@ -24,8 +24,8 @@
|
||||
"prism-react-renderer": "^2.4.0",
|
||||
"react": "^18.3.1",
|
||||
"react-dom": "^18.3.1",
|
||||
"react-icons": "^5.3.0",
|
||||
"react-use": "^17.5.1"
|
||||
"react-icons": "^5.4.0",
|
||||
"react-use": "^17.6.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@docusaurus/module-type-aliases": "^3.5.1",
|
||||
|
||||
@@ -4,12 +4,10 @@ This folder contains evaluation harness for evaluating agents on the Entity-dedu
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Start the evaluation
|
||||
|
||||
|
||||
```bash
|
||||
export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation)
|
||||
./evaluation/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit]
|
||||
@@ -37,7 +35,8 @@ For example,
|
||||
```
|
||||
|
||||
## Reference
|
||||
```
|
||||
|
||||
```bibtex
|
||||
@inproceedings{zhang2023entity,
|
||||
title={Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games},
|
||||
author={Zhang, Yizhe and Lu, Jiarui and Jaitly, Navdeep},
|
||||
|
||||
@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
if [ -z "$DATASET" ]; then
|
||||
echo "Dataset not specified, use default 'things'"
|
||||
@@ -34,12 +34,9 @@ if [ -z "$OPENAI_API_KEY" ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
|
||||
# We need to track the version of Agent in the evaluation to make sure results are comparable
|
||||
AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
|
||||
@@ -51,7 +48,7 @@ COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
|
||||
--max-iterations 20 \
|
||||
--OPENAI_API_KEY $OPENAI_API_KEY \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${DATASET}"
|
||||
--eval-note ${OPENHANDS_VERSION}_${DATASET}"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -4,7 +4,7 @@ This folder contains evaluation harness for evaluating agents on the [AgentBench
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Start the evaluation
|
||||
|
||||
|
||||
@@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
|
||||
@@ -31,7 +31,7 @@ COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poe
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 30 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
--eval-note $OPENHANDS_VERSION"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -10,7 +10,7 @@ Hugging Face dataset based on the
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local
|
||||
development environment and LLM.
|
||||
|
||||
## Start the evaluation
|
||||
|
||||
@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE=$AGENT_VERSION
|
||||
EVAL_NOTE=$OPENHANDS_VERSION
|
||||
|
||||
# Default to NOT use unit tests.
|
||||
if [ -z "$USE_UNIT_TESTS" ]; then
|
||||
|
||||
@@ -4,13 +4,14 @@ Implements evaluation of agents on BioCoder from the BioCoder benchmark introduc
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## BioCoder Docker Image
|
||||
|
||||
In the openhands branch of the Biocoder repository, we have slightly modified our original Docker image to work with the OpenHands environment. In the Docker image are testing scripts (`/testing/start_test_openhands.py` and aux files in `/testing_files/`) to assist with evaluation. Additionally, we have installed all dependencies, including OpenJDK, mamba (with Python 3.6), and many system libraries. Notably, we have **not** packaged all repositories into the image, so they are downloaded at runtime.
|
||||
|
||||
**Before first execution, pull our Docker image with the following command**
|
||||
|
||||
```bash
|
||||
docker pull public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0
|
||||
```
|
||||
@@ -19,7 +20,6 @@ To reproduce this image, please see the Dockerfile_Openopenhands in the `biocode
|
||||
|
||||
## Start the evaluation
|
||||
|
||||
|
||||
```bash
|
||||
./evaluation/benchmarks/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
|
||||
```
|
||||
@@ -47,7 +47,8 @@ with current OpenHands version, then your command would be:
|
||||
```
|
||||
|
||||
## Reference
|
||||
```
|
||||
|
||||
```bibtex
|
||||
@misc{tang2024biocoder,
|
||||
title={BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models},
|
||||
author={Xiangru Tang and Bill Qian and Rick Gao and Jiakang Chen and Xinyun Chen and Mark Gerstein},
|
||||
|
||||
@@ -21,10 +21,10 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
|
||||
@@ -33,7 +33,7 @@ COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${DATASET}"
|
||||
--eval-note ${OPENHANDS_VERSION}_${DATASET}"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
|
||||
@@ -31,7 +31,7 @@ COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 5 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION" \
|
||||
--eval-note $OPENHANDS_VERSION" \
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -7,7 +7,7 @@ If so, the browsing performance upper-bound of CodeActAgent will be the performa
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Run Inference
|
||||
|
||||
|
||||
@@ -20,13 +20,13 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE="$AGENT_VERSION"
|
||||
EVAL_NOTE="$OPENHANDS_VERSION"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
|
||||
@@ -4,19 +4,18 @@ This folder contains the evaluation harness that we built on top of the original
|
||||
|
||||
The evaluation consists of three steps:
|
||||
|
||||
1. Environment setup: [install python environment](../README.md#development-environment), [configure LLM config](../README.md#configure-openhands-and-your-llm).
|
||||
1. Environment setup: [install python environment](../../README.md#development-environment), [configure LLM config](../../README.md#configure-openhands-and-your-llm).
|
||||
2. [Run Evaluation](#run-inference-on-commit0-instances): Generate a edit patch for each Commit0 Repo, and get the evaluation results
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## OpenHands Commit0 Instance-level Docker Support
|
||||
|
||||
OpenHands supports using the Commit0 Docker for **[inference](#run-inference-on-commit0-instances).
|
||||
This is now the default behavior.
|
||||
|
||||
|
||||
## Run Inference on Commit0 Instances
|
||||
|
||||
Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the Commit0 set you are running on) for the [instance-level docker image](#openhands-commit0-instance-level-docker-support).
|
||||
|
||||
@@ -61,10 +61,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
|
||||
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
|
||||
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
echo "HF SPLIT: $SPLIT"
|
||||
@@ -75,7 +75,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
|
||||
export USE_HINT_TEXT=false
|
||||
fi
|
||||
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
|
||||
EVAL_NOTE="$AGENT_VERSION"
|
||||
EVAL_NOTE="$OPENHANDS_VERSION"
|
||||
# if not using Hint, add -no-hint to the eval note
|
||||
if [ "$USE_HINT_TEXT" = false ]; then
|
||||
EVAL_NOTE="$EVAL_NOTE-no-hint"
|
||||
|
||||
@@ -23,10 +23,10 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
|
||||
@@ -35,7 +35,7 @@ COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
|
||||
--max-iterations 10 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
--eval-note $OPENHANDS_VERSION"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -4,9 +4,10 @@ This folder contains evaluation harness for evaluating agents on the [GAIA bench
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Run the evaluation
|
||||
|
||||
We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA).
|
||||
Please accept the terms and make sure to have logged in on your computer by `huggingface-cli login` before running the evaluation.
|
||||
|
||||
@@ -41,6 +42,7 @@ For example,
|
||||
## Get score
|
||||
|
||||
Then you can get stats by running the following command:
|
||||
|
||||
```bash
|
||||
python ./evaluation/benchmarks/gaia/get_score.py \
|
||||
--file <path_to/output.json>
|
||||
|
||||
@@ -21,17 +21,17 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
if [ -z "$LEVELS" ]; then
|
||||
LEVELS="2023_level1"
|
||||
echo "Levels not specified, use default $LEVELS"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "LEVELS: $LEVELS"
|
||||
|
||||
@@ -42,7 +42,7 @@ COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
|
||||
--level $LEVELS \
|
||||
--data-split validation \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${LEVELS}"
|
||||
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -4,7 +4,7 @@ This folder contains evaluation harness we built on top of the original [Gorilla
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Run Inference on APIBench Instances
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
if [ -z "$HUBS" ]; then
|
||||
HUBS="hf,torch,tf"
|
||||
@@ -29,7 +29,7 @@ if [ -z "$HUBS" ]; then
|
||||
fi
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "HUBS: $HUBS"
|
||||
|
||||
@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \
|
||||
--hubs $HUBS \
|
||||
--data-split validation \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${LEVELS}"
|
||||
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
Implements the evaluation of agents on the GPQA benchmark introduced in [GPQA: A Graduate-Level Google-Proof Q&A Benchmark](https://arxiv.org/abs/2308.07124).
|
||||
|
||||
This code implements the evaluation of agents on the GPQA Benchmark with Open Book setting.
|
||||
|
||||
- The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web.
|
||||
- Even experts in the corresponding domains achieve only 65% accuracy.
|
||||
- State-of-the-art AI systems achieve only 39% accuracy on this challenging dataset.
|
||||
@@ -11,20 +12,24 @@ This code implements the evaluation of agents on the GPQA Benchmark with Open Bo
|
||||
Accurate solving of above graduate level questions would require both tool use (e.g., python for calculations) and web-search for finding related facts as information required for the questions might not be part of the LLM knowledge / training data.
|
||||
|
||||
Further references:
|
||||
- https://arxiv.org/pdf/2311.12022
|
||||
- https://paperswithcode.com/dataset/gpqa
|
||||
- https://github.com/idavidrein/gpqa
|
||||
|
||||
- <https://arxiv.org/pdf/2311.12022>
|
||||
- <https://paperswithcode.com/dataset/gpqa>
|
||||
- <https://github.com/idavidrein/gpqa>
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Run Inference on GPQA Benchmark
|
||||
|
||||
'gpqa_main', 'gqpa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options
|
||||
From the root of the OpenHands repo, run the following command:
|
||||
|
||||
```bash
|
||||
./evaluation/benchmarks/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass]
|
||||
```
|
||||
|
||||
You can replace `model_config_name` with any model you set up in `config.toml`.
|
||||
|
||||
- `model_config_name`: The model configuration name from `config.toml` that you want to evaluate.
|
||||
|
||||
@@ -27,10 +27,10 @@ if [ -z "$DATA_SPLIT" ]; then
|
||||
DATA_SPLIT="gpqa_diamond"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
|
||||
@@ -39,7 +39,7 @@ COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
|
||||
--max-iterations 10 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--data-split $DATA_SPLIT \
|
||||
--eval-note $AGENT_VERSION"
|
||||
--eval-note $OPENHANDS_VERSION"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -4,7 +4,7 @@ Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Run Inference on HumanEvalFix
|
||||
|
||||
@@ -14,13 +14,11 @@ Please follow instruction [here](../README.md#setup) to setup your local develop
|
||||
|
||||
You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`.
|
||||
|
||||
|
||||
## Examples
|
||||
|
||||
For each problem, OpenHands is given a set number of iterations to fix the failing code. The history field shows each iteration's response to correct its code that fails any test case.
|
||||
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"task_id": "Python/2",
|
||||
"instruction": "Please fix the function in Python__2.py such that all test cases pass.\nEnvironment has been set up for you to start working. You may assume all necessary tools are installed.\n\n# Problem Statement\ndef truncate_number(number: float) -> float:\n return number % 1.0 + 1.0\n\n\n\n\n\n\ndef check(truncate_number):\n assert truncate_number(3.5) == 0.5\n assert abs(truncate_number(1.33) - 0.33) < 1e-6\n assert abs(truncate_number(123.456) - 0.456) < 1e-6\n\ncheck(truncate_number)\n\nIMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\nYou should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\nYou SHOULD INCLUDE PROPER INDENTATION in your edit commands.\nWhen you think you have fixed the issue through code changes, please finish the interaction using the "finish" tool.\n",
|
||||
|
||||
@@ -58,10 +58,10 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
|
||||
@@ -69,7 +69,7 @@ COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
--eval-note $OPENHANDS_VERSION"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -4,9 +4,10 @@ This folder contains evaluation harness for evaluating agents on the logic reaso
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Run Inference on logic_reasoning
|
||||
|
||||
The following code will run inference on the first example of the ProofWriter dataset,
|
||||
|
||||
```bash
|
||||
|
||||
@@ -28,10 +28,10 @@ if [ -z "$DATASET" ]; then
|
||||
DATASET="ProofWriter"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
|
||||
@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
|
||||
--dataset $DATASET \
|
||||
--max-iterations 10 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
--eval-note $OPENHANDS_VERSION"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -4,7 +4,7 @@ This folder contains evaluation for [MiniWoB++](https://miniwob.farama.org/) ben
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Test if your environment works
|
||||
|
||||
@@ -42,7 +42,6 @@ poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/e
|
||||
|
||||
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
|
||||
|
||||
|
||||
## BrowsingAgent V1.0 result
|
||||
|
||||
Tested on BrowsingAgent V1.0
|
||||
|
||||
@@ -25,13 +25,13 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="BrowsingAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
|
||||
EVAL_NOTE="${OPENHANDS_VERSION}_${NOTE}"
|
||||
|
||||
COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
|
||||
@@ -18,10 +18,10 @@ checkout_eval_branch
|
||||
# Only 'CodeActAgent' is supported for MINT now
|
||||
AGENT="CodeActAgent"
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
|
||||
export PYTHONPATH=$(pwd)
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ For more details on the ML-Bench task and dataset, please refer to the paper: [M
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Run Inference on ML-Bench
|
||||
|
||||
|
||||
@@ -26,10 +26,10 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
|
||||
@@ -37,7 +37,7 @@ COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
--eval-note $OPENHANDS_VERSION"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
# ScienceAgentBench Evaluation with OpenHands
|
||||
|
||||
This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: https://arxiv.org/abs/2410.05080).
|
||||
This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: <https://arxiv.org/abs/2410.05080>).
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Setup ScienceAgentBench
|
||||
|
||||
@@ -45,6 +45,7 @@ After the inference is completed, you may use the following command to extract n
|
||||
```bash
|
||||
python post_proc.py [log_fname]
|
||||
```
|
||||
|
||||
- `log_fname`, e.g. `evaluation/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent.
|
||||
|
||||
Output will be write to e.g. `evaluation/.../output.converted.jsonl`
|
||||
|
||||
@@ -26,10 +26,10 @@ if [ -z "$USE_KNOWLEDGE" ]; then
|
||||
USE_KNOWLEDGE=false
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \
|
||||
@@ -38,7 +38,7 @@ COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py
|
||||
--use_knowledge $USE_KNOWLEDGE \
|
||||
--max-iterations 30 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION" \
|
||||
--eval-note $OPENHANDS_VERSION" \
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -6,20 +6,19 @@ This folder contains the evaluation harness that we built on top of the original
|
||||
|
||||
The evaluation consists of three steps:
|
||||
|
||||
1. Environment setup: [install python environment](../README.md#development-environment), [configure LLM config](../README.md#configure-openhands-and-your-llm), and [pull docker](#openhands-swe-bench-instance-level-docker-support).
|
||||
1. Environment setup: [install python environment](../../README.md#development-environment), [configure LLM config](../../README.md#configure-openhands-and-your-llm), and [pull docker](#openhands-swe-bench-instance-level-docker-support).
|
||||
2. [Run inference](#run-inference-on-swe-bench-instances): Generate a edit patch for each Github issue
|
||||
3. [Evaluate patches using SWE-Bench docker](#evaluate-generated-patches)
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## OpenHands SWE-Bench Instance-level Docker Support
|
||||
|
||||
OpenHands now support using the [official evaluation docker](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md) for both **[inference](#run-inference-on-swe-bench-instances) and [evaluation](#evaluate-generated-patches)**.
|
||||
This is now the default behavior.
|
||||
|
||||
|
||||
## Run Inference on SWE-Bench Instances
|
||||
|
||||
Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the SWE-Bench set you are running on) for the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support).
|
||||
@@ -52,7 +51,8 @@ default, it is set to 1.
|
||||
- `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`.
|
||||
|
||||
There are also two optional environment variables you can set.
|
||||
```
|
||||
|
||||
```bash
|
||||
export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Default to false. Ignore this if you are not sure.
|
||||
export USE_INSTANCE_IMAGE=true # if you want to use instance-level docker images. Default to true
|
||||
```
|
||||
@@ -127,6 +127,7 @@ With `output.jsonl` file, you can run `eval_infer.sh` to evaluate generated patc
|
||||
**This evaluation is performed using the official dockerized evaluation announced [here](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md).**
|
||||
|
||||
> If you want to evaluate existing results, you should first run this to clone existing outputs
|
||||
>
|
||||
>```bash
|
||||
>git clone https://huggingface.co/spaces/OpenHands/evaluation evaluation/evaluation_outputs
|
||||
>```
|
||||
@@ -143,6 +144,7 @@ Then you can run the following:
|
||||
```
|
||||
|
||||
The script now accepts optional arguments:
|
||||
|
||||
- `instance_id`: Specify a single instance to evaluate (optional)
|
||||
- `dataset_name`: The name of the dataset to use (default: `"princeton-nlp/SWE-bench_Lite"`)
|
||||
- `split`: The split of the dataset to use (default: `"test"`)
|
||||
@@ -179,7 +181,6 @@ To clean-up all existing runtimes that you've already started, run:
|
||||
ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh
|
||||
```
|
||||
|
||||
|
||||
## Visualize Results
|
||||
|
||||
First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
|
||||
@@ -189,6 +190,7 @@ git clone https://huggingface.co/spaces/OpenHands/evaluation
|
||||
```
|
||||
|
||||
**(optional) setup streamlit environment with conda**:
|
||||
|
||||
```bash
|
||||
cd evaluation
|
||||
conda create -n streamlit python=3.10
|
||||
|
||||
@@ -128,6 +128,11 @@ def process_file(file_path):
|
||||
for error, count in error_counter.items()
|
||||
},
|
||||
},
|
||||
'costs': {
|
||||
'main_agent': sum(main_agent_cost),
|
||||
'editor': sum(editor_cost),
|
||||
'total': sum(main_agent_cost) + sum(editor_cost),
|
||||
},
|
||||
'statistics': {
|
||||
'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
|
||||
'costs': {
|
||||
@@ -251,6 +256,7 @@ if __name__ == '__main__':
|
||||
print(
|
||||
f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
|
||||
)
|
||||
print(f"Total cost: {result['costs']['total']:.2f} USD")
|
||||
print('## Statistics')
|
||||
print(
|
||||
f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
|
||||
|
||||
104
evaluation/benchmarks/swe_bench/scripts/eval/verify_costs.py
Normal file
104
evaluation/benchmarks/swe_bench/scripts/eval/verify_costs.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import argparse
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
|
||||
|
||||
def verify_instance_costs(row: pd.Series) -> float:
|
||||
"""
|
||||
Verifies that the accumulated_cost matches the sum of individual costs in metrics.
|
||||
Also checks for duplicate consecutive costs which might indicate buggy counting.
|
||||
If the consecutive costs are identical, the file is affected by this bug:
|
||||
https://github.com/All-Hands-AI/OpenHands/issues/5383
|
||||
|
||||
Args:
|
||||
row: DataFrame row containing instance data with metrics
|
||||
Returns:
|
||||
float: The verified total cost for this instance (corrected if needed)
|
||||
"""
|
||||
try:
|
||||
metrics = row.get('metrics')
|
||||
if not metrics:
|
||||
logger.warning(f"Instance {row['instance_id']}: No metrics found")
|
||||
return 0.0
|
||||
|
||||
accumulated = metrics.get('accumulated_cost')
|
||||
costs = metrics.get('costs', [])
|
||||
|
||||
if accumulated is None:
|
||||
logger.warning(
|
||||
f"Instance {row['instance_id']}: No accumulated_cost in metrics"
|
||||
)
|
||||
return 0.0
|
||||
|
||||
# Check for duplicate consecutive costs and systematic even-odd pairs
|
||||
has_duplicate = False
|
||||
all_pairs_match = True
|
||||
|
||||
# Check each even-odd pair (0-1, 2-3, etc.)
|
||||
for i in range(0, len(costs) - 1, 2):
|
||||
if abs(costs[i]['cost'] - costs[i + 1]['cost']) < 1e-6:
|
||||
has_duplicate = True
|
||||
logger.debug(
|
||||
f"Instance {row['instance_id']}: Possible buggy double-counting detected! "
|
||||
f"Steps {i} and {i+1} have identical costs: {costs[i]['cost']:.2f}"
|
||||
)
|
||||
else:
|
||||
all_pairs_match = False
|
||||
break
|
||||
|
||||
# Calculate total cost, accounting for buggy double counting if detected
|
||||
if len(costs) >= 2 and has_duplicate and all_pairs_match:
|
||||
paired_steps_cost = sum(
|
||||
cost_entry['cost']
|
||||
for cost_entry in costs[: -1 if len(costs) % 2 else None]
|
||||
)
|
||||
real_paired_cost = paired_steps_cost / 2
|
||||
|
||||
unpaired_cost = costs[-1]['cost'] if len(costs) % 2 else 0
|
||||
total_cost = real_paired_cost + unpaired_cost
|
||||
|
||||
else:
|
||||
total_cost = sum(cost_entry['cost'] for cost_entry in costs)
|
||||
|
||||
if not abs(total_cost - accumulated) < 1e-6:
|
||||
logger.warning(
|
||||
f"Instance {row['instance_id']}: Cost mismatch: "
|
||||
f"accumulated: {accumulated:.2f}, sum of costs: {total_cost:.2f}, "
|
||||
)
|
||||
|
||||
return total_cost
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error verifying costs for instance {row.get('instance_id', 'UNKNOWN')}: {e}"
|
||||
)
|
||||
return 0.0
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Verify costs in SWE-bench output file'
|
||||
)
|
||||
parser.add_argument(
|
||||
'input_filepath', type=str, help='Path to the output.jsonl file'
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
# Load and verify the JSONL file
|
||||
df = pd.read_json(args.input_filepath, lines=True)
|
||||
logger.info(f'Loaded {len(df)} instances from {args.input_filepath}')
|
||||
|
||||
# Verify costs for each instance and sum up total
|
||||
total_cost = df.apply(verify_instance_costs, axis=1).sum()
|
||||
logger.info(f'Total verified cost across all instances: ${total_cost:.2f}')
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to process file: {e}')
|
||||
raise
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -55,10 +55,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
|
||||
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
|
||||
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
echo "SPLIT: $SPLIT"
|
||||
@@ -68,7 +68,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
|
||||
export USE_HINT_TEXT=false
|
||||
fi
|
||||
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
|
||||
EVAL_NOTE="$AGENT_VERSION"
|
||||
EVAL_NOTE="$OPENHANDS_VERSION"
|
||||
# if not using Hint, add -no-hint to the eval note
|
||||
if [ "$USE_HINT_TEXT" = false ]; then
|
||||
EVAL_NOTE="$EVAL_NOTE-no-hint"
|
||||
|
||||
@@ -4,7 +4,7 @@ This folder contains an evaluation harness we built on top of the original [Tool
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Run Inference on ToolQA Instances
|
||||
|
||||
|
||||
@@ -38,10 +38,10 @@ if [ -z "$WOLFRAM_APPID" ]; then
|
||||
echo "WOLFRAM_APPID not specified"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
echo "HARDNESS: $HARDNESS"
|
||||
@@ -56,7 +56,7 @@ COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \
|
||||
--wolfram_alpha_appid $WOLFRAM_APPID\
|
||||
--data-split validation \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${LEVELS}"
|
||||
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
|
||||
@@ -4,7 +4,7 @@ This folder contains evaluation for [WebArena](https://github.com/web-arena-x/we
|
||||
|
||||
## Setup Environment and LLM Configuration
|
||||
|
||||
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
|
||||
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
||||
|
||||
## Setup WebArena Environment
|
||||
|
||||
|
||||
@@ -27,13 +27,13 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="BrowsingAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE="$AGENT_VERSION"
|
||||
EVAL_NOTE="$OPENHANDS_VERSION"
|
||||
|
||||
COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
|
||||
@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
get_agent_version
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "AGENT_VERSION: $AGENT_VERSION"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE=$AGENT_VERSION
|
||||
EVAL_NOTE=$OPENHANDS_VERSION
|
||||
|
||||
# Default to NOT use unit tests.
|
||||
if [ -z "$USE_UNIT_TESTS" ]; then
|
||||
|
||||
@@ -39,8 +39,8 @@ checkout_original_branch() {
|
||||
git checkout $current_branch
|
||||
}
|
||||
|
||||
get_agent_version() {
|
||||
get_openhands_version() {
|
||||
# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
|
||||
# We need to track the version of Agent in the evaluation to make sure results are comparable
|
||||
AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
|
||||
OPENHANDS_VERSION=v$(poetry run python -c "from openhands import get_version; print(get_version())")
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ describe("Browser", () => {
|
||||
browser: {
|
||||
url: "https://example.com",
|
||||
screenshotSrc: "",
|
||||
updateCount: 0,
|
||||
},
|
||||
},
|
||||
});
|
||||
@@ -26,6 +27,7 @@ describe("Browser", () => {
|
||||
url: "https://example.com",
|
||||
screenshotSrc:
|
||||
"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mN0uGvyHwAFCAJS091fQwAAAABJRU5ErkJggg==",
|
||||
updateCount: 0,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
@@ -70,4 +70,12 @@ describe("ChatMessage", () => {
|
||||
);
|
||||
expect(screen.getByTestId("custom-component")).toBeInTheDocument();
|
||||
});
|
||||
|
||||
it("should apply correct styles to inline code", () => {
|
||||
render(<ChatMessage type="assistant" message="Here is some `inline code` text" />);
|
||||
const codeElement = screen.getByText("inline code");
|
||||
|
||||
expect(codeElement.tagName.toLowerCase()).toBe("code");
|
||||
expect(codeElement.closest("article")).not.toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -9,7 +9,7 @@ import { WsClientProviderStatus } from "#/context/ws-client-provider";
|
||||
import { ChatInterface } from "#/components/features/chat/chat-interface";
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
||||
const renderChatInterface = (messages: (Message | ErrorMessage)[]) =>
|
||||
const renderChatInterface = (messages: (Message)[]) =>
|
||||
renderWithProviders(<ChatInterface />);
|
||||
|
||||
describe("Empty state", () => {
|
||||
@@ -278,7 +278,7 @@ describe.skip("ChatInterface", () => {
|
||||
});
|
||||
|
||||
it("should render inline errors", () => {
|
||||
const messages: (Message | ErrorMessage)[] = [
|
||||
const messages: (Message)[] = [
|
||||
{
|
||||
sender: "assistant",
|
||||
content: "Hello",
|
||||
@@ -287,9 +287,10 @@ describe.skip("ChatInterface", () => {
|
||||
pending: true,
|
||||
},
|
||||
{
|
||||
error: true,
|
||||
id: "",
|
||||
message: "Something went wrong",
|
||||
type: "error",
|
||||
content: "Something went wrong",
|
||||
sender: "assistant",
|
||||
timestamp: new Date().toISOString(),
|
||||
},
|
||||
];
|
||||
renderChatInterface(messages);
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
import { render, screen } from "@testing-library/react";
|
||||
import { it, describe, expect, vi } from "vitest";
|
||||
import userEvent from "@testing-library/user-event";
|
||||
import { WaitlistModal } from "#/components/features/waitlist/waitlist-modal";
|
||||
import * as CaptureConsent from "#/utils/handle-capture-consent";
|
||||
|
||||
describe("WaitlistModal", () => {
|
||||
it("should render a tos checkbox that is unchecked by default", () => {
|
||||
render(<WaitlistModal ghToken={null} githubAuthUrl={null} />);
|
||||
const checkbox = screen.getByRole("checkbox");
|
||||
|
||||
expect(checkbox).not.toBeChecked();
|
||||
});
|
||||
|
||||
it("should only enable the GitHub button if the tos checkbox is checked", async () => {
|
||||
const user = userEvent.setup();
|
||||
render(<WaitlistModal ghToken={null} githubAuthUrl={null} />);
|
||||
const checkbox = screen.getByRole("checkbox");
|
||||
const button = screen.getByRole("button", { name: "Connect to GitHub" });
|
||||
|
||||
expect(button).toBeDisabled();
|
||||
|
||||
await user.click(checkbox);
|
||||
|
||||
expect(button).not.toBeDisabled();
|
||||
});
|
||||
|
||||
it("should set user analytics consent to true when the user checks the tos checkbox", async () => {
|
||||
const handleCaptureConsentSpy = vi.spyOn(
|
||||
CaptureConsent,
|
||||
"handleCaptureConsent",
|
||||
);
|
||||
|
||||
const user = userEvent.setup();
|
||||
render(<WaitlistModal ghToken={null} githubAuthUrl="mock-url" />);
|
||||
|
||||
const checkbox = screen.getByRole("checkbox");
|
||||
await user.click(checkbox);
|
||||
|
||||
const button = screen.getByRole("button", { name: "Connect to GitHub" });
|
||||
await user.click(button);
|
||||
|
||||
expect(handleCaptureConsentSpy).toHaveBeenCalledWith(true);
|
||||
});
|
||||
});
|
||||
@@ -4,8 +4,9 @@ import { screen, waitFor, within } from "@testing-library/react";
|
||||
import { renderWithProviders } from "test-utils";
|
||||
import userEvent from "@testing-library/user-event";
|
||||
import MainApp from "#/routes/_oh/route";
|
||||
import * as CaptureConsent from "#/utils/handle-capture-consent";
|
||||
import i18n from "#/i18n";
|
||||
import * as CaptureConsent from "#/utils/handle-capture-consent";
|
||||
import OpenHands from "#/api/open-hands";
|
||||
|
||||
describe("frontend/routes/_oh", () => {
|
||||
const RouteStub = createRoutesStub([{ Component: MainApp, path: "/" }]);
|
||||
@@ -60,13 +61,20 @@ describe("frontend/routes/_oh", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("should capture the user's consent", async () => {
|
||||
it("should render and capture the user's consent if oss mode", async () => {
|
||||
const user = userEvent.setup();
|
||||
const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
|
||||
const handleCaptureConsentSpy = vi.spyOn(
|
||||
CaptureConsent,
|
||||
"handleCaptureConsent",
|
||||
);
|
||||
|
||||
getConfigSpy.mockResolvedValue({
|
||||
APP_MODE: "oss",
|
||||
GITHUB_CLIENT_ID: "test-id",
|
||||
POSTHOG_CLIENT_KEY: "test-key",
|
||||
});
|
||||
|
||||
renderWithProviders(<RouteStub />);
|
||||
|
||||
// The user has not consented to tracking
|
||||
@@ -87,6 +95,23 @@ describe("frontend/routes/_oh", () => {
|
||||
).not.toBeInTheDocument();
|
||||
});
|
||||
|
||||
it("should not render the user consent form if saas mode", async () => {
|
||||
const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
|
||||
getConfigSpy.mockResolvedValue({
|
||||
APP_MODE: "saas",
|
||||
GITHUB_CLIENT_ID: "test-id",
|
||||
POSTHOG_CLIENT_KEY: "test-key",
|
||||
});
|
||||
|
||||
renderWithProviders(<RouteStub />);
|
||||
|
||||
await waitFor(() => {
|
||||
expect(
|
||||
screen.queryByTestId("user-capture-consent-form"),
|
||||
).not.toBeInTheDocument();
|
||||
});
|
||||
});
|
||||
|
||||
it("should not render the user consent form if the user has already made a decision", async () => {
|
||||
localStorage.setItem("analytics-consent", "true");
|
||||
renderWithProviders(<RouteStub />);
|
||||
|
||||
6
frontend/package-lock.json
generated
6
frontend/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "openhands-frontend",
|
||||
"version": "0.14.3",
|
||||
"version": "0.15.1",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "openhands-frontend",
|
||||
"version": "0.14.3",
|
||||
"version": "0.15.1",
|
||||
"dependencies": {
|
||||
"@monaco-editor/react": "^4.6.0",
|
||||
"@nextui-org/react": "^2.4.8",
|
||||
@@ -78,7 +78,7 @@
|
||||
"husky": "^9.1.6",
|
||||
"jsdom": "^25.0.1",
|
||||
"lint-staged": "^15.2.10",
|
||||
"msw": "^2.3.0-ws.rc-12",
|
||||
"msw": "^2.6.6",
|
||||
"postcss": "^8.4.47",
|
||||
"prettier": "^3.3.3",
|
||||
"tailwindcss": "^3.4.14",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "openhands-frontend",
|
||||
"version": "0.14.3",
|
||||
"version": "0.15.1",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"engines": {
|
||||
@@ -105,7 +105,7 @@
|
||||
"husky": "^9.1.6",
|
||||
"jsdom": "^25.0.1",
|
||||
"lint-staged": "^15.2.10",
|
||||
"msw": "^2.3.0-ws.rc-12",
|
||||
"msw": "^2.6.6",
|
||||
"postcss": "^8.4.47",
|
||||
"prettier": "^3.3.3",
|
||||
"tailwindcss": "^3.4.14",
|
||||
|
||||
@@ -47,7 +47,7 @@ export function ChatMessage({
|
||||
"rounded-xl relative",
|
||||
"flex flex-col gap-2",
|
||||
type === "user" && " max-w-[305px] p-4 bg-neutral-700 self-end",
|
||||
type === "assistant" && "pb-4 max-w-full bg-tranparent",
|
||||
type === "assistant" && "mt-6 max-w-full bg-tranparent",
|
||||
)}
|
||||
>
|
||||
<CopyToClipboardButton
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
import { useState, useEffect } from "react";
|
||||
import { useTranslation } from "react-i18next";
|
||||
|
||||
interface ErrorMessageProps {
|
||||
id?: string;
|
||||
message: string;
|
||||
}
|
||||
|
||||
export function ErrorMessage({ id, message }: ErrorMessageProps) {
|
||||
const { t, i18n } = useTranslation();
|
||||
const [showDetails, setShowDetails] = useState(true);
|
||||
const [headline, setHeadline] = useState("");
|
||||
const [details, setDetails] = useState(message);
|
||||
|
||||
useEffect(() => {
|
||||
if (id && i18n.exists(id)) {
|
||||
setHeadline(t(id));
|
||||
setDetails(message);
|
||||
setShowDetails(false);
|
||||
}
|
||||
}, [id, message, i18n.language]);
|
||||
|
||||
return (
|
||||
<div className="flex gap-2 items-center justify-start border-l-2 border-danger pl-2 my-2 py-2">
|
||||
<div className="text-sm leading-4 flex flex-col gap-2">
|
||||
{headline && <p className="text-danger font-bold">{headline}</p>}
|
||||
{headline && (
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => setShowDetails(!showDetails)}
|
||||
className="cursor-pointer text-left"
|
||||
>
|
||||
{showDetails
|
||||
? t("ERROR_MESSAGE$HIDE_DETAILS")
|
||||
: t("ERROR_MESSAGE$SHOW_DETAILS")}
|
||||
</button>
|
||||
)}
|
||||
{showDetails && <p className="text-neutral-300">{details}</p>}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
80
frontend/src/components/features/chat/expandable-message.tsx
Normal file
80
frontend/src/components/features/chat/expandable-message.tsx
Normal file
@@ -0,0 +1,80 @@
|
||||
import { useState, useEffect } from "react";
|
||||
import { useTranslation } from "react-i18next";
|
||||
import Markdown from "react-markdown";
|
||||
import remarkGfm from "remark-gfm";
|
||||
import { code } from "../markdown/code";
|
||||
import { ol, ul } from "../markdown/list";
|
||||
import ArrowUp from "#/icons/angle-up-solid.svg?react";
|
||||
import ArrowDown from "#/icons/angle-down-solid.svg?react";
|
||||
|
||||
interface ExpandableMessageProps {
|
||||
id?: string;
|
||||
message: string;
|
||||
type: string;
|
||||
}
|
||||
|
||||
export function ExpandableMessage({
|
||||
id,
|
||||
message,
|
||||
type,
|
||||
}: ExpandableMessageProps) {
|
||||
const { t, i18n } = useTranslation();
|
||||
const [showDetails, setShowDetails] = useState(true);
|
||||
const [headline, setHeadline] = useState("");
|
||||
const [details, setDetails] = useState(message);
|
||||
|
||||
useEffect(() => {
|
||||
if (id && i18n.exists(id)) {
|
||||
setHeadline(t(id));
|
||||
setDetails(message);
|
||||
setShowDetails(false);
|
||||
}
|
||||
}, [id, message, i18n.language]);
|
||||
|
||||
const border = type === "error" ? "border-danger" : "border-neutral-300";
|
||||
const textColor = type === "error" ? "text-danger" : "text-neutral-300";
|
||||
let arrowClasses = "h-4 w-4 ml-2 inline";
|
||||
if (type === "error") {
|
||||
arrowClasses += " fill-danger";
|
||||
} else {
|
||||
arrowClasses += " fill-neutral-300";
|
||||
}
|
||||
|
||||
return (
|
||||
<div
|
||||
className={`flex gap-2 items-center justify-start border-l-2 pl-2 my-2 py-2 ${border}`}
|
||||
>
|
||||
<div className="text-sm leading-4 flex flex-col gap-2 max-w-full">
|
||||
{headline && (
|
||||
<p className={`${textColor} font-bold`}>
|
||||
{headline}
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => setShowDetails(!showDetails)}
|
||||
className="cursor-pointer text-left"
|
||||
>
|
||||
{showDetails ? (
|
||||
<ArrowUp className={arrowClasses} />
|
||||
) : (
|
||||
<ArrowDown className={arrowClasses} />
|
||||
)}
|
||||
</button>
|
||||
</p>
|
||||
)}
|
||||
{showDetails && (
|
||||
<Markdown
|
||||
className="text-sm overflow-auto"
|
||||
components={{
|
||||
code,
|
||||
ul,
|
||||
ol,
|
||||
}}
|
||||
remarkPlugins={[remarkGfm]}
|
||||
>
|
||||
{details}
|
||||
</Markdown>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@@ -1,14 +1,10 @@
|
||||
import { ChatMessage } from "#/components/features/chat/chat-message";
|
||||
import { ConfirmationButtons } from "#/components/shared/buttons/confirmation-buttons";
|
||||
import { ImageCarousel } from "../images/image-carousel";
|
||||
import { ErrorMessage } from "./error-message";
|
||||
|
||||
const isErrorMessage = (
|
||||
message: Message | ErrorMessage,
|
||||
): message is ErrorMessage => "error" in message;
|
||||
import { ExpandableMessage } from "./expandable-message";
|
||||
|
||||
interface MessagesProps {
|
||||
messages: (Message | ErrorMessage)[];
|
||||
messages: Message[];
|
||||
isAwaitingUserConfirmation: boolean;
|
||||
}
|
||||
|
||||
@@ -16,18 +12,27 @@ export function Messages({
|
||||
messages,
|
||||
isAwaitingUserConfirmation,
|
||||
}: MessagesProps) {
|
||||
return messages.map((message, index) =>
|
||||
isErrorMessage(message) ? (
|
||||
<ErrorMessage key={index} id={message.id} message={message.message} />
|
||||
) : (
|
||||
return messages.map((message, index) => {
|
||||
if (message.type === "error" || message.type === "action") {
|
||||
return (
|
||||
<ExpandableMessage
|
||||
key={index}
|
||||
type={message.type}
|
||||
id={message.translationID}
|
||||
message={message.content}
|
||||
/>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<ChatMessage key={index} type={message.sender} message={message.content}>
|
||||
{message.imageUrls.length > 0 && (
|
||||
{message.imageUrls && message.imageUrls.length > 0 && (
|
||||
<ImageCarousel size="small" images={message.imageUrls} />
|
||||
)}
|
||||
{messages.length - 1 === index &&
|
||||
message.sender === "assistant" &&
|
||||
isAwaitingUserConfirmation && <ConfirmationButtons />}
|
||||
</ChatMessage>
|
||||
),
|
||||
);
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
import React from "react";
|
||||
import { useSelector } from "react-redux";
|
||||
|
||||
import { useFiles } from "#/context/files";
|
||||
import { cn } from "#/utils/utils";
|
||||
import { useListFiles } from "#/hooks/query/use-list-files";
|
||||
import { useListFile } from "#/hooks/query/use-list-file";
|
||||
import { Filename } from "./filename";
|
||||
import { RootState } from "#/store";
|
||||
|
||||
interface TreeNodeProps {
|
||||
path: string;
|
||||
@@ -20,6 +22,7 @@ function TreeNode({ path, defaultOpen = false }: TreeNodeProps) {
|
||||
selectedPath,
|
||||
} = useFiles();
|
||||
const [isOpen, setIsOpen] = React.useState(defaultOpen);
|
||||
const { curAgentState } = useSelector((state: RootState) => state.agent);
|
||||
|
||||
const isDirectory = path.endsWith("/");
|
||||
|
||||
@@ -39,6 +42,12 @@ function TreeNode({ path, defaultOpen = false }: TreeNodeProps) {
|
||||
}
|
||||
}, [fileContent, path]);
|
||||
|
||||
React.useEffect(() => {
|
||||
if (selectedPath === path && !isDirectory) {
|
||||
refetch();
|
||||
}
|
||||
}, [curAgentState, selectedPath, path, isDirectory]);
|
||||
|
||||
const fileParts = path.split("/");
|
||||
const filename =
|
||||
fileParts[fileParts.length - 1] || fileParts[fileParts.length - 2];
|
||||
|
||||
@@ -17,7 +17,20 @@ export function code({
|
||||
const match = /language-(\w+)/.exec(className || ""); // get the language
|
||||
|
||||
if (!match) {
|
||||
return <code className={className}>{children}</code>;
|
||||
return (
|
||||
<code
|
||||
className={className}
|
||||
style={{
|
||||
backgroundColor: "#2a3038",
|
||||
padding: "0.2em 0.4em",
|
||||
borderRadius: "4px",
|
||||
color: "#e6edf3",
|
||||
border: "1px solid #30363d",
|
||||
}}
|
||||
>
|
||||
{children}
|
||||
</code>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
|
||||
@@ -54,13 +54,13 @@ export function Sidebar() {
|
||||
|
||||
return (
|
||||
<>
|
||||
<aside className="px-1 flex flex-col gap-1">
|
||||
<aside className="h-[40px] md:h-auto px-1 flex flex-row md:flex-col gap-1">
|
||||
<div className="w-[34px] h-[34px] flex items-center justify-center">
|
||||
{user.isLoading && <LoadingSpinner size="small" />}
|
||||
{!user.isLoading && <AllHandsLogoButton onClick={handleClickLogo} />}
|
||||
</div>
|
||||
|
||||
<nav className="py-[18px] flex flex-col items-center gap-[18px]">
|
||||
<nav className="md:py-[18px] flex flex-row md:flex-col items-center gap-[18px]">
|
||||
<UserActions
|
||||
user={user.data ? { avatar_url: user.data.avatar_url } : undefined}
|
||||
onLogout={logout}
|
||||
|
||||
22
frontend/src/components/features/waitlist/tos-checkbox.tsx
Normal file
22
frontend/src/components/features/waitlist/tos-checkbox.tsx
Normal file
@@ -0,0 +1,22 @@
|
||||
interface TOSCheckboxProps {
|
||||
onChange: () => void;
|
||||
}
|
||||
|
||||
export function TOSCheckbox({ onChange }: TOSCheckboxProps) {
|
||||
return (
|
||||
<label className="flex items-center gap-2">
|
||||
<input type="checkbox" onChange={onChange} />
|
||||
<span>
|
||||
I accept the{" "}
|
||||
<a
|
||||
href="https://www.all-hands.dev/tos"
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="underline underline-offset-2 text-blue-500 hover:text-blue-700"
|
||||
>
|
||||
terms of service
|
||||
</a>
|
||||
</span>
|
||||
</label>
|
||||
);
|
||||
}
|
||||
@@ -1,3 +1,4 @@
|
||||
import React from "react";
|
||||
import GitHubLogo from "#/assets/branding/github-logo.svg?react";
|
||||
import AllHandsLogo from "#/assets/branding/all-hands-logo.svg?react";
|
||||
import { JoinWaitlistAnchor } from "./join-waitlist-anchor";
|
||||
@@ -5,6 +6,8 @@ import { WaitlistMessage } from "./waitlist-message";
|
||||
import { ModalBackdrop } from "#/components/shared/modals/modal-backdrop";
|
||||
import { ModalButton } from "#/components/shared/buttons/modal-button";
|
||||
import { ModalBody } from "#/components/shared/modals/modal-body";
|
||||
import { TOSCheckbox } from "./tos-checkbox";
|
||||
import { handleCaptureConsent } from "#/utils/handle-capture-consent";
|
||||
|
||||
interface WaitlistModalProps {
|
||||
ghToken: string | null;
|
||||
@@ -12,22 +15,30 @@ interface WaitlistModalProps {
|
||||
}
|
||||
|
||||
export function WaitlistModal({ ghToken, githubAuthUrl }: WaitlistModalProps) {
|
||||
const [isTosAccepted, setIsTosAccepted] = React.useState(false);
|
||||
|
||||
const handleGitHubAuth = () => {
|
||||
if (githubAuthUrl) {
|
||||
handleCaptureConsent(true);
|
||||
window.location.href = githubAuthUrl;
|
||||
}
|
||||
};
|
||||
|
||||
return (
|
||||
<ModalBackdrop>
|
||||
<ModalBody>
|
||||
<AllHandsLogo width={68} height={46} />
|
||||
<WaitlistMessage content={ghToken ? "waitlist" : "sign-in"} />
|
||||
|
||||
<TOSCheckbox onChange={() => setIsTosAccepted((prev) => !prev)} />
|
||||
|
||||
{!ghToken && (
|
||||
<ModalButton
|
||||
disabled={!isTosAccepted}
|
||||
text="Connect to GitHub"
|
||||
icon={<GitHubLogo width={20} height={20} />}
|
||||
className="bg-[#791B80] w-full"
|
||||
onClick={() => {
|
||||
if (githubAuthUrl) {
|
||||
window.location.href = githubAuthUrl;
|
||||
}
|
||||
}}
|
||||
onClick={handleGitHubAuth}
|
||||
/>
|
||||
)}
|
||||
{ghToken && <JoinWaitlistAnchor />}
|
||||
|
||||
@@ -5,7 +5,7 @@ import { NavTab } from "./nav-tab";
|
||||
interface ContainerProps {
|
||||
label?: string;
|
||||
labels?: {
|
||||
label: string;
|
||||
label: string | React.ReactNode;
|
||||
to: string;
|
||||
icon?: React.ReactNode;
|
||||
isBeta?: boolean;
|
||||
@@ -39,7 +39,7 @@ export function Container({
|
||||
{label}
|
||||
</div>
|
||||
)}
|
||||
<div className="overflow-scroll h-full rounded-b-xl">{children}</div>
|
||||
<div className="overflow-hidden h-full rounded-b-xl">{children}</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
7
frontend/src/components/layout/count-badge.tsx
Normal file
7
frontend/src/components/layout/count-badge.tsx
Normal file
@@ -0,0 +1,7 @@
|
||||
export function CountBadge({ count }: { count: number }) {
|
||||
return (
|
||||
<span className="text-[11px] leading-5 text-root-primary bg-neutral-400 px-1 rounded-xl">
|
||||
{count}
|
||||
</span>
|
||||
);
|
||||
}
|
||||
@@ -4,7 +4,7 @@ import { BetaBadge } from "./beta-badge";
|
||||
|
||||
interface NavTabProps {
|
||||
to: string;
|
||||
label: string;
|
||||
label: string | React.ReactNode;
|
||||
icon: React.ReactNode;
|
||||
isBeta?: boolean;
|
||||
}
|
||||
|
||||
@@ -38,6 +38,7 @@ interface WsClientProviderProps {
|
||||
enabled: boolean;
|
||||
token: string | null;
|
||||
ghToken: string | null;
|
||||
selectedRepository: string | null;
|
||||
settings: Settings | null;
|
||||
}
|
||||
|
||||
@@ -45,12 +46,14 @@ export function WsClientProvider({
|
||||
enabled,
|
||||
token,
|
||||
ghToken,
|
||||
selectedRepository,
|
||||
settings,
|
||||
children,
|
||||
}: React.PropsWithChildren<WsClientProviderProps>) {
|
||||
const sioRef = React.useRef<Socket | null>(null);
|
||||
const tokenRef = React.useRef<string | null>(token);
|
||||
const ghTokenRef = React.useRef<string | null>(ghToken);
|
||||
const selectedRepositoryRef = React.useRef<string | null>(selectedRepository);
|
||||
const disconnectRef = React.useRef<ReturnType<typeof setTimeout> | null>(
|
||||
null,
|
||||
);
|
||||
@@ -81,8 +84,11 @@ export function WsClientProvider({
|
||||
if (ghToken) {
|
||||
initEvent.github_token = ghToken;
|
||||
}
|
||||
if (selectedRepository) {
|
||||
initEvent.selected_repository = selectedRepository;
|
||||
}
|
||||
const lastEvent = lastEventRef.current;
|
||||
if (lastEvent && !Number.isNaN(parseInt(lastEvent.id as string, 10))) {
|
||||
if (lastEvent) {
|
||||
initEvent.latest_event_id = lastEvent.id;
|
||||
}
|
||||
send(initEvent);
|
||||
@@ -93,7 +99,9 @@ export function WsClientProvider({
|
||||
messageRateHandler.record(new Date().getTime());
|
||||
}
|
||||
setEvents((prevEvents) => [...prevEvents, event]);
|
||||
lastEventRef.current = event;
|
||||
if (!Number.isNaN(parseInt(event.id as string, 10))) {
|
||||
lastEventRef.current = event;
|
||||
}
|
||||
const extras = event.extras as Record<string, unknown>;
|
||||
if (extras?.agent_state === AgentState.INIT) {
|
||||
setStatus(WsClientProviderStatus.ACTIVE);
|
||||
@@ -156,6 +164,7 @@ export function WsClientProvider({
|
||||
sioRef.current = sio;
|
||||
tokenRef.current = token;
|
||||
ghTokenRef.current = ghToken;
|
||||
selectedRepositoryRef.current = selectedRepository;
|
||||
|
||||
return () => {
|
||||
sio.off("connect", handleConnect);
|
||||
@@ -164,7 +173,7 @@ export function WsClientProvider({
|
||||
sio.off("connect_failed", handleError);
|
||||
sio.off("disconnect", handleDisconnect);
|
||||
};
|
||||
}, [enabled, token, ghToken]);
|
||||
}, [enabled, token, ghToken, selectedRepository]);
|
||||
|
||||
// Strict mode mounts and unmounts each component twice, so we have to wait in the destructor
|
||||
// before actually disconnecting the socket and cancel the operation if the component gets remounted.
|
||||
|
||||
@@ -1782,14 +1782,6 @@
|
||||
"fr": "Privé",
|
||||
"tr": "Özel"
|
||||
},
|
||||
"ERROR_MESSAGE$SHOW_DETAILS": {
|
||||
"en": "Show details",
|
||||
"es": "Mostrar detalles"
|
||||
},
|
||||
"ERROR_MESSAGE$HIDE_DETAILS": {
|
||||
"en": "Hide details",
|
||||
"es": "Ocultar detalles"
|
||||
},
|
||||
"STATUS$STARTING_RUNTIME": {
|
||||
"en": "Starting Runtime...",
|
||||
"zh-CN": "启动运行时...",
|
||||
@@ -2012,5 +2004,41 @@
|
||||
"PROJECT_MENU_CARD_CONTEXT_MENU$DOWNLOAD_AS_ZIP_LABEL": {
|
||||
"en": "Download as .zip",
|
||||
"es": "Descargar como .zip"
|
||||
},
|
||||
"ACTION_MESSAGE$RUN": {
|
||||
"en": "Running a bash command"
|
||||
},
|
||||
"ACTION_MESSAGE$RUN_IPYTHON": {
|
||||
"en": "Running a Jupyter command"
|
||||
},
|
||||
"ACTION_MESSAGE$READ": {
|
||||
"en": "Reading the contents of a file"
|
||||
},
|
||||
"ACTION_MESSAGE$WRITE": {
|
||||
"en": "Writing to a file"
|
||||
},
|
||||
"ACTION_MESSAGE$BROWSE": {
|
||||
"en": "Browsing the web"
|
||||
},
|
||||
"OBSERVATION_MESSAGE$RUN": {
|
||||
"en": "Ran a bash command"
|
||||
},
|
||||
"OBSERVATION_MESSAGE$RUN_IPYTHON": {
|
||||
"en": "Ran a Jupyter command"
|
||||
},
|
||||
"OBSERVATION_MESSAGE$READ": {
|
||||
"en": "Read the contents of a file"
|
||||
},
|
||||
"OBSERVATION_MESSAGE$WRITE": {
|
||||
"en": "Wrote to a file"
|
||||
},
|
||||
"OBSERVATION_MESSAGE$BROWSE": {
|
||||
"en": "Browsing completed"
|
||||
},
|
||||
"EXPANDABLE_MESSAGE$SHOW_DETAILS": {
|
||||
"en": "Show details"
|
||||
},
|
||||
"EXPANDABLE_MESSAGE$HIDE_DETAILS": {
|
||||
"en": "Hide details"
|
||||
}
|
||||
}
|
||||
|
||||
1
frontend/src/icons/angle-down-solid.svg
Normal file
1
frontend/src/icons/angle-down-solid.svg
Normal file
@@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--!Font Awesome Free 6.7.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free Copyright 2024 Fonticons, Inc.--><path d="M201.4 374.6c12.5 12.5 32.8 12.5 45.3 0l160-160c12.5-12.5 12.5-32.8 0-45.3s-32.8-12.5-45.3 0L224 306.7 86.6 169.4c-12.5-12.5-32.8-12.5-45.3 0s-12.5 32.8 0 45.3l160 160z"/></svg>
|
||||
|
After Width: | Height: | Size: 400 B |
1
frontend/src/icons/angle-up-solid.svg
Normal file
1
frontend/src/icons/angle-up-solid.svg
Normal file
@@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--!Font Awesome Free 6.7.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free Copyright 2024 Fonticons, Inc.--><path d="M201.4 137.4c12.5-12.5 32.8-12.5 45.3 0l160 160c12.5 12.5 12.5 32.8 0 45.3s-32.8 12.5-45.3 0L224 205.3 86.6 342.6c-12.5 12.5-32.8 12.5-45.3 0s-12.5-32.8 0-45.3l160-160z"/></svg>
|
||||
|
After Width: | Height: | Size: 400 B |
@@ -32,8 +32,11 @@ code {
|
||||
margin: 0;
|
||||
font-size: 85%;
|
||||
white-space: break-spaces;
|
||||
background-color: var(--bg-neutral-muted);
|
||||
border-radius: 6px;
|
||||
background-color: #2a3038;
|
||||
border-radius: 4px;
|
||||
color: #e6edf3;
|
||||
border: 1px solid #30363d;
|
||||
letter-spacing: -0.2px;
|
||||
}
|
||||
|
||||
.markdown-body pre code {
|
||||
|
||||
11
frontend/src/message.d.ts
vendored
11
frontend/src/message.d.ts
vendored
@@ -1,13 +1,10 @@
|
||||
type Message = {
|
||||
sender: "user" | "assistant";
|
||||
content: string;
|
||||
imageUrls: string[];
|
||||
timestamp: string;
|
||||
imageUrls?: string[];
|
||||
type?: "thought" | "error" | "action";
|
||||
pending?: boolean;
|
||||
};
|
||||
|
||||
type ErrorMessage = {
|
||||
error: boolean;
|
||||
id?: string;
|
||||
message: string;
|
||||
translationID?: string;
|
||||
eventID?: number;
|
||||
};
|
||||
|
||||
@@ -6,7 +6,6 @@ import {
|
||||
WsClientProviderStatus,
|
||||
} from "#/context/ws-client-provider";
|
||||
import { createChatMessage } from "#/services/chat-service";
|
||||
import { getCloneRepoCommand } from "#/services/terminal-service";
|
||||
import { setCurrentAgentState } from "#/state/agent-slice";
|
||||
import { addUserMessage } from "#/state/chat-slice";
|
||||
import {
|
||||
@@ -37,11 +36,6 @@ export const useWSStatusChange = () => {
|
||||
send(createChatMessage(query, base64Files, timestamp));
|
||||
};
|
||||
|
||||
const dispatchCloneRepoCommand = (ghToken: string, repository: string) => {
|
||||
send(getCloneRepoCommand(ghToken, repository));
|
||||
dispatch(clearSelectedRepository());
|
||||
};
|
||||
|
||||
const dispatchInitialQuery = (query: string, additionalInfo: string) => {
|
||||
if (additionalInfo) {
|
||||
sendInitialQuery(`${query}\n\n[${additionalInfo}]`, files);
|
||||
@@ -57,7 +51,7 @@ export const useWSStatusChange = () => {
|
||||
let additionalInfo = "";
|
||||
|
||||
if (gitHubToken && selectedRepository) {
|
||||
dispatchCloneRepoCommand(gitHubToken, selectedRepository);
|
||||
dispatch(clearSelectedRepository());
|
||||
additionalInfo = `Repository ${selectedRepository} has been cloned to /workspace. Please check the /workspace for files.`;
|
||||
} else if (importedProjectZip) {
|
||||
// if there's an uploaded project zip, add it to the chat
|
||||
|
||||
@@ -21,6 +21,7 @@ import { useUserPrefs } from "#/context/user-prefs-context";
|
||||
import { useConversationConfig } from "#/hooks/query/use-conversation-config";
|
||||
import { Container } from "#/components/layout/container";
|
||||
import Security from "#/components/shared/modals/security/security";
|
||||
import { CountBadge } from "#/components/layout/count-badge";
|
||||
|
||||
function App() {
|
||||
const { token, gitHubToken } = useAuth();
|
||||
@@ -33,6 +34,8 @@ function App() {
|
||||
(state: RootState) => state.initalQuery,
|
||||
);
|
||||
|
||||
const { updateCount } = useSelector((state: RootState) => state.browser);
|
||||
|
||||
const { data: latestGitHubCommit } = useLatestRepoCommit({
|
||||
repository: selectedRepository,
|
||||
});
|
||||
@@ -64,26 +67,31 @@ function App() {
|
||||
enabled
|
||||
token={token}
|
||||
ghToken={gitHubToken}
|
||||
selectedRepository={selectedRepository}
|
||||
settings={settings}
|
||||
>
|
||||
<EventHandler>
|
||||
<div className="flex flex-col h-full gap-3">
|
||||
<div className="flex h-full overflow-auto gap-3">
|
||||
<Container className="w-[390px] max-h-full relative">
|
||||
<Container className="w-full md:w-[390px] max-h-full relative">
|
||||
<ChatInterface />
|
||||
</Container>
|
||||
|
||||
<div className="flex flex-col grow gap-3">
|
||||
<div className="hidden md:flex flex-col grow gap-3">
|
||||
<Container
|
||||
className="h-2/3"
|
||||
labels={[
|
||||
{ label: "Workspace", to: "", icon: <CodeIcon /> },
|
||||
{ label: "Jupyter", to: "jupyter", icon: <ListIcon /> },
|
||||
{
|
||||
label: "Browser",
|
||||
label: (
|
||||
<div className="flex items-center gap-1">
|
||||
Browser
|
||||
{updateCount > 0 && <CountBadge count={updateCount} />}
|
||||
</div>
|
||||
),
|
||||
to: "browser",
|
||||
icon: <GlobeIcon />,
|
||||
isBeta: true,
|
||||
},
|
||||
]}
|
||||
>
|
||||
|
||||
@@ -6,9 +6,9 @@ import { useIsAuthed } from "#/hooks/query/use-is-authed";
|
||||
import { useAuth } from "#/context/auth-context";
|
||||
import { useUserPrefs } from "#/context/user-prefs-context";
|
||||
import { useConfig } from "#/hooks/query/use-config";
|
||||
import { AnalyticsConsentFormModal } from "#/components/features/analytics/analytics-consent-form-modal";
|
||||
import { Sidebar } from "#/components/features/sidebar/sidebar";
|
||||
import { WaitlistModal } from "#/components/features/waitlist/waitlist-modal";
|
||||
import { AnalyticsConsentFormModal } from "#/components/features/analytics/analytics-consent-form-modal";
|
||||
|
||||
export function ErrorBoundary() {
|
||||
const error = useRouteError();
|
||||
@@ -79,18 +79,19 @@ export default function MainApp() {
|
||||
return (
|
||||
<div
|
||||
data-testid="root-layout"
|
||||
className="bg-root-primary p-3 h-screen min-w-[1024px] overflow-x-hidden flex gap-3"
|
||||
className="bg-root-primary p-3 h-screen md:min-w-[1024px] overflow-x-hidden flex flex-col md:flex-row gap-3"
|
||||
>
|
||||
<Sidebar />
|
||||
|
||||
<div className="h-full w-full relative">
|
||||
<div className="h-[calc(100%-50px)] md:h-full w-full relative">
|
||||
<Outlet />
|
||||
</div>
|
||||
|
||||
{isInWaitlist && (
|
||||
<WaitlistModal ghToken={gitHubToken} githubAuthUrl={gitHubAuthUrl} />
|
||||
)}
|
||||
{consentFormIsOpen && (
|
||||
|
||||
{config.data?.APP_MODE === "oss" && consentFormIsOpen && (
|
||||
<AnalyticsConsentFormModal
|
||||
onClose={() => setConsentFormIsOpen(false)}
|
||||
/>
|
||||
|
||||
@@ -1,14 +1,12 @@
|
||||
import {
|
||||
addAssistantMessage,
|
||||
addAssistantAction,
|
||||
addUserMessage,
|
||||
addErrorMessage,
|
||||
} from "#/state/chat-slice";
|
||||
import { appendSecurityAnalyzerInput } from "#/state/security-analyzer-slice";
|
||||
import { setCode, setActiveFilepath } from "#/state/code-slice";
|
||||
import { appendJupyterInput } from "#/state/jupyter-slice";
|
||||
import {
|
||||
ActionSecurityRisk,
|
||||
appendSecurityAnalyzerInput,
|
||||
} from "#/state/security-analyzer-slice";
|
||||
import { setCurStatusMessage } from "#/state/status-slice";
|
||||
import store from "#/store";
|
||||
import ActionType from "#/types/action-type";
|
||||
@@ -17,21 +15,16 @@ import {
|
||||
ObservationMessage,
|
||||
StatusMessage,
|
||||
} from "#/types/message";
|
||||
import EventLogger from "#/utils/event-logger";
|
||||
import { handleObservationMessage } from "./observations";
|
||||
|
||||
const messageActions = {
|
||||
[ActionType.BROWSE]: (message: ActionMessage) => {
|
||||
if (message.args.thought) {
|
||||
store.dispatch(addAssistantMessage(message.args.thought));
|
||||
} else {
|
||||
if (!message.args.thought && message.message) {
|
||||
store.dispatch(addAssistantMessage(message.message));
|
||||
}
|
||||
},
|
||||
[ActionType.BROWSE_INTERACTIVE]: (message: ActionMessage) => {
|
||||
if (message.args.thought) {
|
||||
store.dispatch(addAssistantMessage(message.args.thought));
|
||||
} else {
|
||||
if (!message.args.thought && message.message) {
|
||||
store.dispatch(addAssistantMessage(message.message));
|
||||
}
|
||||
},
|
||||
@@ -54,74 +47,28 @@ const messageActions = {
|
||||
store.dispatch(addAssistantMessage(message.args.content));
|
||||
}
|
||||
},
|
||||
[ActionType.FINISH]: (message: ActionMessage) => {
|
||||
store.dispatch(addAssistantMessage(message.message));
|
||||
},
|
||||
[ActionType.REJECT]: (message: ActionMessage) => {
|
||||
store.dispatch(addAssistantMessage(message.message));
|
||||
},
|
||||
[ActionType.DELEGATE]: (message: ActionMessage) => {
|
||||
store.dispatch(addAssistantMessage(message.message));
|
||||
},
|
||||
[ActionType.RUN]: (message: ActionMessage) => {
|
||||
if (message.args.hidden) return;
|
||||
if (message.args.thought) {
|
||||
store.dispatch(addAssistantMessage(message.args.thought));
|
||||
}
|
||||
},
|
||||
[ActionType.RUN_IPYTHON]: (message: ActionMessage) => {
|
||||
if (message.args.thought) {
|
||||
store.dispatch(addAssistantMessage(message.args.thought));
|
||||
}
|
||||
if (message.args.confirmation_state !== "rejected") {
|
||||
store.dispatch(appendJupyterInput(message.args.code));
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
function getRiskText(risk: ActionSecurityRisk) {
|
||||
switch (risk) {
|
||||
case ActionSecurityRisk.LOW:
|
||||
return "Low Risk";
|
||||
case ActionSecurityRisk.MEDIUM:
|
||||
return "Medium Risk";
|
||||
case ActionSecurityRisk.HIGH:
|
||||
return "High Risk";
|
||||
case ActionSecurityRisk.UNKNOWN:
|
||||
default:
|
||||
return "Unknown Risk";
|
||||
}
|
||||
}
|
||||
|
||||
export function handleActionMessage(message: ActionMessage) {
|
||||
if (message.args?.hidden) {
|
||||
return;
|
||||
}
|
||||
|
||||
if ("args" in message && "security_risk" in message.args) {
|
||||
store.dispatch(appendSecurityAnalyzerInput(message));
|
||||
}
|
||||
|
||||
if (
|
||||
(message.action === ActionType.RUN ||
|
||||
message.action === ActionType.RUN_IPYTHON) &&
|
||||
message.args.confirmation_state === "awaiting_confirmation"
|
||||
) {
|
||||
if (message.args.thought) {
|
||||
if (message.source === "agent") {
|
||||
if (message.args && message.args.thought) {
|
||||
store.dispatch(addAssistantMessage(message.args.thought));
|
||||
}
|
||||
if (message.args.command) {
|
||||
store.dispatch(
|
||||
addAssistantMessage(
|
||||
`Running this command now: \n\`\`\`\`bash\n${message.args.command}\n\`\`\`\`\nEstimated security risk: ${getRiskText(message.args.security_risk as unknown as ActionSecurityRisk)}`,
|
||||
),
|
||||
);
|
||||
} else if (message.args.code) {
|
||||
store.dispatch(
|
||||
addAssistantMessage(
|
||||
`Running this code now: \n\`\`\`\`python\n${message.args.code}\n\`\`\`\`\nEstimated security risk: ${getRiskText(message.args.security_risk as unknown as ActionSecurityRisk)}`,
|
||||
),
|
||||
);
|
||||
} else {
|
||||
store.dispatch(addAssistantMessage(message.message));
|
||||
}
|
||||
return;
|
||||
// @ts-expect-error TODO: fix
|
||||
store.dispatch(addAssistantAction(message));
|
||||
}
|
||||
|
||||
if (message.action in messageActions) {
|
||||
@@ -155,6 +102,10 @@ export function handleAssistantMessage(message: Record<string, unknown>) {
|
||||
} else if (message.status_update) {
|
||||
handleStatusMessage(message as unknown as StatusMessage);
|
||||
} else {
|
||||
EventLogger.error(`Unknown message type ${message}`);
|
||||
store.dispatch(
|
||||
addErrorMessage({
|
||||
message: "Unknown message type received",
|
||||
}),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,10 +2,14 @@ import { setCurrentAgentState } from "#/state/agent-slice";
|
||||
import { setUrl, setScreenshotSrc } from "#/state/browser-slice";
|
||||
import store from "#/store";
|
||||
import { ObservationMessage } from "#/types/message";
|
||||
import AgentState from "#/types/agent-state";
|
||||
import { appendOutput } from "#/state/command-slice";
|
||||
import { appendJupyterOutput } from "#/state/jupyter-slice";
|
||||
import ObservationType from "#/types/observation-type";
|
||||
import { addAssistantMessage } from "#/state/chat-slice";
|
||||
import {
|
||||
addAssistantMessage,
|
||||
addAssistantObservation,
|
||||
} from "#/state/chat-slice";
|
||||
|
||||
export function handleObservationMessage(message: ObservationMessage) {
|
||||
switch (message.observation) {
|
||||
@@ -46,4 +50,120 @@ export function handleObservationMessage(message: ObservationMessage) {
|
||||
store.dispatch(addAssistantMessage(message.message));
|
||||
break;
|
||||
}
|
||||
if (!message.extras?.hidden) {
|
||||
// Convert the message to the appropriate observation type
|
||||
const { observation } = message;
|
||||
const baseObservation = {
|
||||
...message,
|
||||
source: "agent" as const,
|
||||
};
|
||||
|
||||
switch (observation) {
|
||||
case "agent_state_changed":
|
||||
store.dispatch(
|
||||
addAssistantObservation({
|
||||
...baseObservation,
|
||||
observation: "agent_state_changed" as const,
|
||||
extras: {
|
||||
agent_state: (message.extras.agent_state as AgentState) || "idle",
|
||||
},
|
||||
}),
|
||||
);
|
||||
break;
|
||||
case "run":
|
||||
store.dispatch(
|
||||
addAssistantObservation({
|
||||
...baseObservation,
|
||||
observation: "run" as const,
|
||||
extras: {
|
||||
command: String(message.extras.command || ""),
|
||||
command_id: Number(message.extras.command_id || 0),
|
||||
exit_code: Number(message.extras.exit_code || 0),
|
||||
hidden: Boolean(message.extras.hidden),
|
||||
},
|
||||
}),
|
||||
);
|
||||
break;
|
||||
case "run_ipython":
|
||||
store.dispatch(
|
||||
addAssistantObservation({
|
||||
...baseObservation,
|
||||
observation: "run_ipython" as const,
|
||||
extras: {
|
||||
code: String(message.extras.code || ""),
|
||||
},
|
||||
}),
|
||||
);
|
||||
break;
|
||||
case "delegate":
|
||||
store.dispatch(
|
||||
addAssistantObservation({
|
||||
...baseObservation,
|
||||
observation: "delegate" as const,
|
||||
extras: {
|
||||
outputs:
|
||||
typeof message.extras.outputs === "object"
|
||||
? (message.extras.outputs as Record<string, unknown>)
|
||||
: {},
|
||||
},
|
||||
}),
|
||||
);
|
||||
break;
|
||||
case "browse":
|
||||
store.dispatch(
|
||||
addAssistantObservation({
|
||||
...baseObservation,
|
||||
observation: "browse" as const,
|
||||
extras: {
|
||||
url: String(message.extras.url || ""),
|
||||
screenshot: String(message.extras.screenshot || ""),
|
||||
error: Boolean(message.extras.error),
|
||||
open_page_urls: Array.isArray(message.extras.open_page_urls)
|
||||
? message.extras.open_page_urls
|
||||
: [],
|
||||
active_page_index: Number(message.extras.active_page_index || 0),
|
||||
dom_object:
|
||||
typeof message.extras.dom_object === "object"
|
||||
? (message.extras.dom_object as Record<string, unknown>)
|
||||
: {},
|
||||
axtree_object:
|
||||
typeof message.extras.axtree_object === "object"
|
||||
? (message.extras.axtree_object as Record<string, unknown>)
|
||||
: {},
|
||||
extra_element_properties:
|
||||
typeof message.extras.extra_element_properties === "object"
|
||||
? (message.extras.extra_element_properties as Record<
|
||||
string,
|
||||
unknown
|
||||
>)
|
||||
: {},
|
||||
last_browser_action: String(
|
||||
message.extras.last_browser_action || "",
|
||||
),
|
||||
last_browser_action_error:
|
||||
message.extras.last_browser_action_error,
|
||||
focused_element_bid: String(
|
||||
message.extras.focused_element_bid || "",
|
||||
),
|
||||
},
|
||||
}),
|
||||
);
|
||||
break;
|
||||
case "error":
|
||||
store.dispatch(
|
||||
addAssistantObservation({
|
||||
...baseObservation,
|
||||
observation: "error" as const,
|
||||
source: "user" as const,
|
||||
extras: {
|
||||
error_id: message.extras.error_id,
|
||||
},
|
||||
}),
|
||||
);
|
||||
break;
|
||||
default:
|
||||
// For any unhandled observation types, just ignore them
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,11 +10,3 @@ export function getGitHubTokenCommand(gitHubToken: string) {
|
||||
const event = getTerminalCommand(command, true);
|
||||
return event;
|
||||
}
|
||||
|
||||
export function getCloneRepoCommand(gitHubToken: string, repository: string) {
|
||||
const url = `https://${gitHubToken}@github.com/${repository}.git`;
|
||||
const dirName = repository.split("/")[1];
|
||||
const command = `git clone ${url} ${dirName} ; cd ${dirName} ; git checkout -b openhands-workspace`;
|
||||
const event = getTerminalCommand(command, true);
|
||||
return event;
|
||||
}
|
||||
|
||||
@@ -5,6 +5,8 @@ export const initialState = {
|
||||
url: "https://github.com/All-Hands-AI/OpenHands",
|
||||
// Base64-encoded screenshot of browser window (placeholder for now, will be replaced with the actual screenshot later)
|
||||
screenshotSrc: "",
|
||||
// Counter for browser updates
|
||||
updateCount: 0,
|
||||
};
|
||||
|
||||
export const browserSlice = createSlice({
|
||||
@@ -16,6 +18,7 @@ export const browserSlice = createSlice({
|
||||
},
|
||||
setScreenshotSrc: (state, action) => {
|
||||
state.screenshotSrc = action.payload;
|
||||
state.updateCount += 1;
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
@@ -1,6 +1,28 @@
|
||||
import { createSlice, PayloadAction } from "@reduxjs/toolkit";
|
||||
|
||||
type SliceState = { messages: (Message | ErrorMessage)[] };
|
||||
import { ActionSecurityRisk } from "#/state/security-analyzer-slice";
|
||||
import { OpenHandsObservation } from "#/types/core/observations";
|
||||
import { OpenHandsAction } from "#/types/core/actions";
|
||||
|
||||
type SliceState = { messages: Message[] };
|
||||
|
||||
const MAX_CONTENT_LENGTH = 1000;
|
||||
|
||||
const HANDLED_ACTIONS = ["run", "run_ipython", "write", "read", "browse"];
|
||||
|
||||
function getRiskText(risk: ActionSecurityRisk) {
|
||||
switch (risk) {
|
||||
case ActionSecurityRisk.LOW:
|
||||
return "Low Risk";
|
||||
case ActionSecurityRisk.MEDIUM:
|
||||
return "Medium Risk";
|
||||
case ActionSecurityRisk.HIGH:
|
||||
return "High Risk";
|
||||
case ActionSecurityRisk.UNKNOWN:
|
||||
default:
|
||||
return "Unknown Risk";
|
||||
}
|
||||
}
|
||||
|
||||
const initialState: SliceState = {
|
||||
messages: [],
|
||||
@@ -20,6 +42,7 @@ export const chatSlice = createSlice({
|
||||
}>,
|
||||
) {
|
||||
const message: Message = {
|
||||
type: "thought",
|
||||
sender: "user",
|
||||
content: action.payload.content,
|
||||
imageUrls: action.payload.imageUrls,
|
||||
@@ -40,6 +63,7 @@ export const chatSlice = createSlice({
|
||||
|
||||
addAssistantMessage(state, action: PayloadAction<string>) {
|
||||
const message: Message = {
|
||||
type: "thought",
|
||||
sender: "assistant",
|
||||
content: action.payload,
|
||||
imageUrls: [],
|
||||
@@ -49,12 +73,96 @@ export const chatSlice = createSlice({
|
||||
state.messages.push(message);
|
||||
},
|
||||
|
||||
addAssistantAction(state, action: PayloadAction<OpenHandsAction>) {
|
||||
const actionID = action.payload.action;
|
||||
if (!HANDLED_ACTIONS.includes(actionID)) {
|
||||
return;
|
||||
}
|
||||
const translationID = `ACTION_MESSAGE$${actionID.toUpperCase()}`;
|
||||
let text = "";
|
||||
if (actionID === "run") {
|
||||
text = `\`${action.payload.args.command}\``;
|
||||
} else if (actionID === "run_ipython") {
|
||||
text = `\`\`\`\n${action.payload.args.code}\n\`\`\``;
|
||||
} else if (actionID === "write") {
|
||||
let { content } = action.payload.args;
|
||||
if (content.length > MAX_CONTENT_LENGTH) {
|
||||
content = `${content.slice(0, MAX_CONTENT_LENGTH)}...`;
|
||||
}
|
||||
text = `${action.payload.args.path}\n${content}`;
|
||||
} else if (actionID === "read") {
|
||||
text = action.payload.args.path;
|
||||
} else if (actionID === "browse") {
|
||||
text = `Browsing ${action.payload.args.url}`;
|
||||
}
|
||||
if (actionID === "run" || actionID === "run_ipython") {
|
||||
if (
|
||||
action.payload.args.confirmation_state === "awaiting_confirmation"
|
||||
) {
|
||||
text += `\n\n${getRiskText(action.payload.args.security_risk as unknown as ActionSecurityRisk)}`;
|
||||
}
|
||||
}
|
||||
const message: Message = {
|
||||
type: "action",
|
||||
sender: "assistant",
|
||||
translationID,
|
||||
eventID: action.payload.id,
|
||||
content: text,
|
||||
imageUrls: [],
|
||||
timestamp: new Date().toISOString(),
|
||||
};
|
||||
state.messages.push(message);
|
||||
},
|
||||
|
||||
addAssistantObservation(
|
||||
state,
|
||||
observation: PayloadAction<OpenHandsObservation>,
|
||||
) {
|
||||
const observationID = observation.payload.observation;
|
||||
if (!HANDLED_ACTIONS.includes(observationID)) {
|
||||
return;
|
||||
}
|
||||
const translationID = `OBSERVATION_MESSAGE$${observationID.toUpperCase()}`;
|
||||
const causeID = observation.payload.cause;
|
||||
const causeMessage = state.messages.find(
|
||||
(message) => message.eventID === causeID,
|
||||
);
|
||||
if (!causeMessage) {
|
||||
return;
|
||||
}
|
||||
causeMessage.translationID = translationID;
|
||||
if (observationID === "run" || observationID === "run_ipython") {
|
||||
let { content } = observation.payload;
|
||||
if (content.length > MAX_CONTENT_LENGTH) {
|
||||
content = `${content.slice(0, MAX_CONTENT_LENGTH)}...`;
|
||||
}
|
||||
content = `\`\`\`\n${content}\n\`\`\``;
|
||||
causeMessage.content = content; // Observation content includes the action
|
||||
} else if (observationID === "browse") {
|
||||
let content = `**URL:** ${observation.payload.extras.url}\n`;
|
||||
if (observation.payload.extras.error) {
|
||||
content += `**Error:**\n${observation.payload.extras.error}\n`;
|
||||
}
|
||||
content += `**Output:**\n${observation.payload.content}`;
|
||||
if (content.length > MAX_CONTENT_LENGTH) {
|
||||
content = `${content.slice(0, MAX_CONTENT_LENGTH)}...`;
|
||||
}
|
||||
causeMessage.content = content;
|
||||
}
|
||||
},
|
||||
|
||||
addErrorMessage(
|
||||
state,
|
||||
action: PayloadAction<{ id?: string; message: string }>,
|
||||
) {
|
||||
const { id, message } = action.payload;
|
||||
state.messages.push({ id, message, error: true });
|
||||
state.messages.push({
|
||||
translationID: id,
|
||||
content: message,
|
||||
type: "error",
|
||||
sender: "assistant",
|
||||
timestamp: new Date().toISOString(),
|
||||
});
|
||||
},
|
||||
|
||||
clearMessages(state) {
|
||||
@@ -66,6 +174,8 @@ export const chatSlice = createSlice({
|
||||
export const {
|
||||
addUserMessage,
|
||||
addAssistantMessage,
|
||||
addAssistantAction,
|
||||
addAssistantObservation,
|
||||
addErrorMessage,
|
||||
clearMessages,
|
||||
} = chatSlice.actions;
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { OpenHandsActionEvent } from "./base";
|
||||
import { ActionSecurityRisk } from "#/state/security-analyzer-slice";
|
||||
|
||||
export interface UserMessageAction extends OpenHandsActionEvent<"message"> {
|
||||
source: "user";
|
||||
@@ -12,6 +13,7 @@ export interface CommandAction extends OpenHandsActionEvent<"run"> {
|
||||
source: "agent";
|
||||
args: {
|
||||
command: string;
|
||||
security_risk: ActionSecurityRisk;
|
||||
confirmation_state: "confirmed" | "rejected" | "awaiting_confirmation";
|
||||
thought: string;
|
||||
hidden?: boolean;
|
||||
@@ -32,6 +34,7 @@ export interface IPythonAction extends OpenHandsActionEvent<"run_ipython"> {
|
||||
source: "agent";
|
||||
args: {
|
||||
code: string;
|
||||
security_risk: ActionSecurityRisk;
|
||||
confirmation_state: "confirmed" | "rejected" | "awaiting_confirmation";
|
||||
kernel_init_code: string;
|
||||
thought: string;
|
||||
@@ -96,6 +99,23 @@ export interface ModifyTaskAction extends OpenHandsActionEvent<"modify_task"> {
|
||||
};
|
||||
}
|
||||
|
||||
export interface FileReadAction extends OpenHandsActionEvent<"read"> {
|
||||
source: "agent";
|
||||
args: {
|
||||
path: string;
|
||||
thought: string;
|
||||
};
|
||||
}
|
||||
|
||||
export interface FileWriteAction extends OpenHandsActionEvent<"write"> {
|
||||
source: "agent";
|
||||
args: {
|
||||
path: string;
|
||||
content: string;
|
||||
thought: string;
|
||||
};
|
||||
}
|
||||
|
||||
export interface RejectAction extends OpenHandsActionEvent<"reject"> {
|
||||
source: "agent";
|
||||
args: {
|
||||
@@ -112,6 +132,8 @@ export type OpenHandsAction =
|
||||
| DelegateAction
|
||||
| BrowseAction
|
||||
| BrowseInteractiveAction
|
||||
| FileReadAction
|
||||
| FileWriteAction
|
||||
| AddTaskAction
|
||||
| ModifyTaskAction
|
||||
| RejectAction;
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
type OpenHandsEventType =
|
||||
export type OpenHandsEventType =
|
||||
| "message"
|
||||
| "agent_state_changed"
|
||||
| "run"
|
||||
| "read"
|
||||
| "write"
|
||||
| "run_ipython"
|
||||
| "delegate"
|
||||
| "browse"
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
export interface ActionMessage {
|
||||
id: number;
|
||||
|
||||
// Either 'agent' or 'user'
|
||||
source: string;
|
||||
source: "agent" | "user";
|
||||
|
||||
// The action to be taken
|
||||
action: string;
|
||||
@@ -19,6 +21,9 @@ export interface ObservationMessage {
|
||||
// The type of observation
|
||||
observation: string;
|
||||
|
||||
id: number;
|
||||
cause: number;
|
||||
|
||||
// The observed data
|
||||
content: string;
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
/** @type {import('tailwindcss').Config} */
|
||||
const { nextui } = require("@nextui-org/react");
|
||||
import { nextui } from "@nextui-org/react";
|
||||
import typography from '@tailwindcss/typography';
|
||||
|
||||
export default {
|
||||
content: [
|
||||
"./src/**/*.{js,ts,jsx,tsx}",
|
||||
@@ -33,6 +35,6 @@ export default {
|
||||
}
|
||||
}
|
||||
}),
|
||||
require('@tailwindcss/typography'),
|
||||
typography,
|
||||
],
|
||||
};
|
||||
|
||||
@@ -4,6 +4,16 @@ __package_name__ = 'openhands_ai'
|
||||
|
||||
|
||||
def get_version():
|
||||
# Try getting the version from pyproject.toml
|
||||
try:
|
||||
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
with open(os.path.join(root_dir, 'pyproject.toml'), 'r') as f:
|
||||
for line in f:
|
||||
if line.startswith('version ='):
|
||||
return line.split('=')[1].strip().strip('"')
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from importlib.metadata import PackageNotFoundError, version
|
||||
|
||||
@@ -18,16 +28,6 @@ def get_version():
|
||||
except (ImportError, DistributionNotFound):
|
||||
pass
|
||||
|
||||
# Try getting the version from pyproject.toml
|
||||
try:
|
||||
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
with open(os.path.join(root_dir, 'pyproject.toml'), 'r') as f:
|
||||
for line in f:
|
||||
if line.startswith('version ='):
|
||||
return line.split('=')[1].strip().strip('"')
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
|
||||
return 'unknown'
|
||||
|
||||
|
||||
|
||||
@@ -1,28 +1,75 @@
|
||||
# CodeAct Agent Framework
|
||||
|
||||
This folder implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
|
||||
This folder is an implementation of OpenHands's main agent, the CodeAct Agent. It is based on ([CodeAct](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)), an idea of consolidating LLM agents' **act**ions into a unified **code** action space for both *simplicity* and *performance*.
|
||||
|
||||
The conceptual idea is illustrated below. At each turn, the agent can:
|
||||
## Overview
|
||||
|
||||
The CodeAct agent operates through a function calling interface. At each turn, the agent can:
|
||||
|
||||
1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
|
||||
2. **CodeAct**: Choose to perform the task by executing code
|
||||
- Execute any valid Linux `bash` command
|
||||
- Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.
|
||||
2. **CodeAct**: Execute actions through a set of well-defined tools:
|
||||
- Execute Linux `bash` commands with `execute_bash`
|
||||
- Run Python code in an [IPython](https://ipython.org/) environment with `execute_ipython_cell`
|
||||
- Interact with web browsers using `browser` and `web_read`
|
||||
- Edit files using `str_replace_editor` or `edit_file`
|
||||
|
||||

|
||||
|
||||
## Built-in Tools
|
||||
|
||||
The agent provides several built-in tools:
|
||||
|
||||
### 1. `execute_bash`
|
||||
- Execute any valid Linux bash command
|
||||
- Handles long-running commands by running them in background with output redirection
|
||||
- Supports interactive processes with STDIN input and process interruption
|
||||
- Handles command timeouts with automatic retry in background mode
|
||||
|
||||
### 2. `execute_ipython_cell`
|
||||
- Run Python code in an IPython environment
|
||||
- Supports magic commands like `%pip`
|
||||
- Variables are scoped to the IPython environment
|
||||
- Requires defining variables and importing packages before use
|
||||
|
||||
### 3. `web_read` and `browser`
|
||||
- `web_read`: Read and convert webpage content to markdown
|
||||
- `browser`: Interact with webpages through Python code
|
||||
- Supports common browser actions like navigation, clicking, form filling, scrolling
|
||||
- Handles file uploads and drag-and-drop operations
|
||||
|
||||
### 4. `str_replace_editor`
|
||||
- View, create and edit files through string replacement
|
||||
- Persistent state across command calls
|
||||
- File viewing with line numbers
|
||||
- String replacement with exact matching
|
||||
- Undo functionality for edits
|
||||
|
||||
### 5. `edit_file` (LLM-based)
|
||||
- Edit files using LLM-based content generation
|
||||
- Support for partial file edits with line ranges
|
||||
- Handles large files by editing specific sections
|
||||
- Append mode for adding content to files
|
||||
|
||||
## Configuration
|
||||
|
||||
Tools can be enabled/disabled through configuration parameters:
|
||||
- `codeact_enable_browsing`: Enable browser interaction tools
|
||||
- `codeact_enable_jupyter`: Enable IPython code execution
|
||||
- `codeact_enable_llm_editor`: Enable LLM-based file editing (falls back to string replacement editor if disabled)
|
||||
|
||||
## Micro-agents
|
||||
|
||||
The agent includes specialized micro-agents for specific tasks:
|
||||
|
||||
1. **npm**: Handles npm package installation with non-interactive shell workarounds
|
||||
2. **github**: Manages GitHub operations with API token support and PR creation guidelines
|
||||
3. **flarglebargle**: Easter egg response handler
|
||||
|
||||
## Adding New Tools
|
||||
|
||||
The CodeAct agent uses a function calling interface to define tools that the agent can use. Tools are defined in `function_calling.py` using the `ChatCompletionToolParam` class from `litellm`. Each tool consists of:
|
||||
|
||||
1. A description string that explains what the tool does and how to use it
|
||||
2. A tool definition using `ChatCompletionToolParam` that specifies:
|
||||
- The tool's name
|
||||
- The tool's parameters and their types
|
||||
- Required vs optional parameters
|
||||
|
||||
Here's an example of how a tool is defined:
|
||||
The CodeAct agent uses a function calling interface based on `litellm`'s `ChatCompletionToolParam`. To add a new tool:
|
||||
|
||||
1. Define the tool in `function_calling.py`:
|
||||
```python
|
||||
MyTool = ChatCompletionToolParam(
|
||||
type='function',
|
||||
@@ -47,20 +94,20 @@ MyTool = ChatCompletionToolParam(
|
||||
)
|
||||
```
|
||||
|
||||
To add a new tool:
|
||||
2. Add the tool to `get_tools()` in `function_calling.py`
|
||||
3. Implement the corresponding action handler in the agent class
|
||||
|
||||
1. Define your tool in `function_calling.py` following the pattern above
|
||||
2. Add your tool to the `get_tools()` function in `function_calling.py`
|
||||
3. Implement the corresponding action handler in the agent to process the tool's invocation
|
||||
## Implementation Details
|
||||
|
||||
The agent currently supports several built-in tools:
|
||||
- `execute_bash`: Execute bash commands
|
||||
- `execute_ipython_cell`: Run Python code in IPython
|
||||
- `browser`: Interact with a web browser
|
||||
- `str_replace_editor`: Edit files using string replacement
|
||||
- `edit_file`: Edit files using LLM-based editing
|
||||
The agent is implemented in two main files:
|
||||
|
||||
Tools can be enabled/disabled through configuration parameters:
|
||||
- `codeact_enable_browsing`: Enable browser interaction
|
||||
- `codeact_enable_jupyter`: Enable IPython code execution
|
||||
- `codeact_enable_llm_editor`: Enable LLM-based file editing (if disabled, uses string replacement editor instead)
|
||||
1. `codeact_agent.py`: Core agent implementation with:
|
||||
- Message history management
|
||||
- Tool execution handling
|
||||
- State management
|
||||
- Action/observation processing
|
||||
|
||||
2. `function_calling.py`: Tool definitions and function calling interface with:
|
||||
- Tool parameter specifications
|
||||
- Tool descriptions and examples
|
||||
- Function calling response parsing
|
||||
@@ -154,10 +154,7 @@ class CodeActAgent(Agent):
|
||||
BrowseInteractiveAction,
|
||||
BrowseURLAction,
|
||||
),
|
||||
) or (
|
||||
isinstance(action, (AgentFinishAction, CmdRunAction))
|
||||
and action.source == 'agent'
|
||||
):
|
||||
) or (isinstance(action, CmdRunAction) and action.source == 'agent'):
|
||||
tool_metadata = action.tool_call_metadata
|
||||
assert tool_metadata is not None, (
|
||||
'Tool call metadata should NOT be None when function calling is enabled. Action: '
|
||||
@@ -166,6 +163,7 @@ class CodeActAgent(Agent):
|
||||
|
||||
llm_response: ModelResponse = tool_metadata.model_response
|
||||
assistant_msg = llm_response.choices[0].message
|
||||
|
||||
# Add the LLM message (assistant) that initiated the tool calls
|
||||
# (overwrites any previous message with the same response_id)
|
||||
pending_tool_call_action_messages[llm_response.id] = Message(
|
||||
@@ -177,6 +175,33 @@ class CodeActAgent(Agent):
|
||||
tool_calls=assistant_msg.tool_calls,
|
||||
)
|
||||
return []
|
||||
elif isinstance(action, AgentFinishAction):
|
||||
role = 'user' if action.source == 'user' else 'assistant'
|
||||
|
||||
# when agent finishes, it has tool_metadata
|
||||
# which has already been executed, and it doesn't have a response
|
||||
# when the user finishes (/exit), we don't have tool_metadata
|
||||
tool_metadata = action.tool_call_metadata
|
||||
if tool_metadata is not None:
|
||||
# take the response message from the tool call
|
||||
assistant_msg = tool_metadata.model_response.choices[0].message
|
||||
content = assistant_msg.content or ''
|
||||
|
||||
# save content if any, to thought
|
||||
if action.thought:
|
||||
if action.thought != content:
|
||||
action.thought += '\n' + content
|
||||
else:
|
||||
action.thought = content
|
||||
|
||||
# remove the tool call metadata
|
||||
action.tool_call_metadata = None
|
||||
return [
|
||||
Message(
|
||||
role=role,
|
||||
content=[TextContent(text=action.thought)],
|
||||
)
|
||||
]
|
||||
elif isinstance(action, MessageAction):
|
||||
role = 'user' if action.source == 'user' else 'assistant'
|
||||
content = [TextContent(text=action.content or '')]
|
||||
@@ -373,6 +398,9 @@ class CodeActAgent(Agent):
|
||||
- Messages from the same role are combined to prevent consecutive same-role messages
|
||||
- For Anthropic models, specific messages are cached according to their documentation
|
||||
"""
|
||||
if not self.prompt_manager:
|
||||
raise Exception('Prompt Manager not instantiated.')
|
||||
|
||||
messages: list[Message] = [
|
||||
Message(
|
||||
role='system',
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user