Compare commits
57 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| aea92f3869 | |||
| cee88aff48 | |||
| 0c2283abcc | |||
| ef3e0c8dfe | |||
| 315d391414 | |||
| 95ef8965b7 | |||
| ab9fb50c4f | |||
| f866da6bf2 | |||
| 7229a16b45 | |||
| 19105a2a13 | |||
| fe486ad1f1 | |||
| 0ec6ed20cb | |||
| 794381c22b | |||
| 0c581ea946 | |||
| f7f4fcf98f | |||
| ab004478f6 | |||
| 340606e68a | |||
| daec23b5d7 | |||
| 587b4c311a | |||
| 7a86402c9c | |||
| 06d283dfa0 | |||
| a6a4246e30 | |||
| 4830b9a67d | |||
| d4489d62d7 | |||
| e41c020073 | |||
| 3f44c8436f | |||
| b740944075 | |||
| 5618a3eebb | |||
| a1ffe5c936 | |||
| f8376a9702 | |||
| 985a634d60 | |||
| e40681ca61 | |||
| 228e50df9c | |||
| fd805eb835 | |||
| 426350224b | |||
| 3e36911038 | |||
| 4c3ba62665 | |||
| f5e7c602dc | |||
| 2f32064778 | |||
| 5e85986f32 | |||
| 4f436922ca | |||
| d256348a46 | |||
| 6bdc5563cf | |||
| c2f46200c0 | |||
| e39bf80239 | |||
| 368a0248e3 | |||
| db9ceb380a | |||
| c64971d0c4 | |||
| 69fa580899 | |||
| e3411f743d | |||
| 2b65b8aff2 | |||
| 11f364c5e4 | |||
| 4e3a862571 | |||
| 50aa014876 | |||
| 500ab46918 | |||
| e311f3e70f | |||
| f68ad3695c |
@@ -0,0 +1,223 @@
|
||||
name: End-to-End Tests
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, labeled]
|
||||
branches:
|
||||
- main
|
||||
- develop
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
e2e-tests:
|
||||
if: contains(github.event.pull_request.labels.*.name, 'end-to-end') || github.event_name == 'workflow_dispatch'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 60
|
||||
|
||||
env:
|
||||
GITHUB_REPO_NAME: ${{ github.repository }}
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install poetry via pipx
|
||||
uses: abatilo/actions-poetry@v3
|
||||
with:
|
||||
poetry-version: 2.1.3
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.12'
|
||||
cache: 'poetry'
|
||||
|
||||
- name: Install system dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libgtk-3-0 libnotify4 libnss3 libxss1 libxtst6 xauth xvfb libgbm1 libasound2t64 netcat-openbsd
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '22'
|
||||
cache: 'npm'
|
||||
cache-dependency-path: 'frontend/package-lock.json'
|
||||
|
||||
- name: Setup environment for end-to-end tests
|
||||
run: |
|
||||
# Create test results directory
|
||||
mkdir -p test-results
|
||||
|
||||
# Create downloads directory for OpenHands (use a directory in the home folder)
|
||||
mkdir -p $HOME/downloads
|
||||
sudo chown -R $USER:$USER $HOME/downloads
|
||||
sudo chmod -R 755 $HOME/downloads
|
||||
|
||||
- name: Build OpenHands
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
LLM_MODEL: ${{ secrets.LLM_MODEL || 'gpt-4o' }}
|
||||
LLM_API_KEY: ${{ secrets.LLM_API_KEY || 'test-key' }}
|
||||
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
|
||||
INSTALL_DOCKER: 1
|
||||
RUNTIME: docker
|
||||
FRONTEND_PORT: 12000
|
||||
FRONTEND_HOST: 0.0.0.0
|
||||
BACKEND_HOST: 0.0.0.0
|
||||
BACKEND_PORT: 3000
|
||||
ENABLE_BROWSER: true
|
||||
INSTALL_PLAYWRIGHT: 1
|
||||
run: |
|
||||
# Fix poetry.lock file if needed
|
||||
echo "Fixing poetry.lock file if needed..."
|
||||
poetry lock
|
||||
|
||||
# Build OpenHands using make build
|
||||
echo "Running make build..."
|
||||
make build
|
||||
|
||||
# Install Chromium Headless Shell for Playwright (needed for pytest-playwright)
|
||||
echo "Installing Chromium Headless Shell for Playwright..."
|
||||
poetry run playwright install chromium-headless-shell
|
||||
|
||||
# Verify Playwright browsers are installed (for e2e tests only)
|
||||
echo "Verifying Playwright browsers installation for e2e tests..."
|
||||
BROWSER_CHECK=$(poetry run python tests/e2e/check_playwright.py 2>/dev/null)
|
||||
|
||||
if [ "$BROWSER_CHECK" != "chromium_found" ]; then
|
||||
echo "ERROR: Chromium browser not found or not working for e2e tests"
|
||||
echo "$BROWSER_CHECK"
|
||||
exit 1
|
||||
else
|
||||
echo "Playwright browsers are properly installed for e2e tests."
|
||||
fi
|
||||
|
||||
# Docker runtime will handle workspace directory creation
|
||||
|
||||
# Start the application using make run with custom parameters and reduced logging
|
||||
echo "Starting OpenHands using make run..."
|
||||
# Set environment variables to reduce logging verbosity
|
||||
export PYTHONUNBUFFERED=1
|
||||
export LOG_LEVEL=WARNING
|
||||
export UVICORN_LOG_LEVEL=warning
|
||||
export OPENHANDS_LOG_LEVEL=WARNING
|
||||
FRONTEND_PORT=12000 FRONTEND_HOST=0.0.0.0 BACKEND_HOST=0.0.0.0 make run > /tmp/openhands-e2e-test.log 2>&1 &
|
||||
|
||||
# Store the PID of the make run process
|
||||
MAKE_PID=$!
|
||||
echo "OpenHands started with PID: $MAKE_PID"
|
||||
|
||||
# Wait for the application to start
|
||||
echo "Waiting for OpenHands to start..."
|
||||
max_attempts=15
|
||||
attempt=1
|
||||
|
||||
while [ $attempt -le $max_attempts ]; do
|
||||
echo "Checking if OpenHands is running (attempt $attempt of $max_attempts)..."
|
||||
|
||||
# Check if the process is still running
|
||||
if ! ps -p $MAKE_PID > /dev/null; then
|
||||
echo "ERROR: OpenHands process has terminated unexpectedly"
|
||||
echo "Last 50 lines of the log:"
|
||||
tail -n 50 /tmp/openhands-e2e-test.log
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if frontend port is open
|
||||
if nc -z localhost 12000; then
|
||||
# Verify we can get HTML content
|
||||
if curl -s http://localhost:12000 | grep -q "<html"; then
|
||||
echo "SUCCESS: OpenHands is running and serving HTML content on port 12000"
|
||||
break
|
||||
else
|
||||
echo "Port 12000 is open but not serving HTML content yet"
|
||||
fi
|
||||
else
|
||||
echo "Frontend port 12000 is not open yet"
|
||||
fi
|
||||
|
||||
# Show log output on each attempt
|
||||
echo "Recent log output:"
|
||||
tail -n 20 /tmp/openhands-e2e-test.log
|
||||
|
||||
# Wait before next attempt
|
||||
echo "Waiting 10 seconds before next check..."
|
||||
sleep 10
|
||||
attempt=$((attempt + 1))
|
||||
|
||||
# Exit if we've reached the maximum number of attempts
|
||||
if [ $attempt -gt $max_attempts ]; then
|
||||
echo "ERROR: OpenHands failed to start after $max_attempts attempts"
|
||||
echo "Last 50 lines of the log:"
|
||||
tail -n 50 /tmp/openhands-e2e-test.log
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
# Final verification that the app is running
|
||||
if ! nc -z localhost 12000 || ! curl -s http://localhost:12000 | grep -q "<html"; then
|
||||
echo "ERROR: OpenHands is not running properly on port 12000"
|
||||
echo "Last 50 lines of the log:"
|
||||
tail -n 50 /tmp/openhands-e2e-test.log
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Print success message
|
||||
echo "OpenHands is running successfully on port 12000"
|
||||
|
||||
- name: Run end-to-end tests
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.E2E_TEST_GITHUB_TOKEN }}
|
||||
LLM_MODEL: ${{ secrets.LLM_MODEL || 'gpt-4o' }}
|
||||
LLM_API_KEY: ${{ secrets.LLM_API_KEY || 'test-key' }}
|
||||
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
|
||||
run: |
|
||||
# Check if the application is running
|
||||
if ! nc -z localhost 12000; then
|
||||
echo "ERROR: OpenHands is not running on port 12000"
|
||||
echo "Last 50 lines of the log:"
|
||||
tail -n 50 /tmp/openhands-e2e-test.log
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Run the tests with detailed output
|
||||
cd tests/e2e
|
||||
poetry run python -m pytest test_settings.py::test_github_token_configuration test_conversation.py::test_conversation_start -v --no-header --capture=no --timeout=600
|
||||
|
||||
- name: Upload test results
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: playwright-report
|
||||
path: tests/e2e/test-results/
|
||||
retention-days: 30
|
||||
|
||||
- name: Upload OpenHands logs
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: openhands-logs
|
||||
path: |
|
||||
/tmp/openhands-e2e-test.log
|
||||
/tmp/openhands-e2e-build.log
|
||||
/tmp/openhands-backend.log
|
||||
/tmp/openhands-frontend.log
|
||||
/tmp/backend-health-check.log
|
||||
/tmp/frontend-check.log
|
||||
/tmp/vite-config.log
|
||||
/tmp/makefile-contents.log
|
||||
retention-days: 30
|
||||
|
||||
- name: Cleanup
|
||||
if: always()
|
||||
run: |
|
||||
# Stop OpenHands processes
|
||||
echo "Stopping OpenHands processes..."
|
||||
pkill -f "python -m openhands.server" || true
|
||||
pkill -f "npm run dev" || true
|
||||
pkill -f "make run" || true
|
||||
|
||||
# Print process status for debugging
|
||||
echo "Checking if any OpenHands processes are still running:"
|
||||
ps aux | grep -E "openhands|npm run dev" || true
|
||||
@@ -51,8 +51,6 @@ jobs:
|
||||
run: PYTHONPATH=".:$PYTHONPATH" poetry run pytest --forked -n auto -svv ./tests/unit
|
||||
- name: Run Runtime Tests with CLIRuntime
|
||||
run: PYTHONPATH=".:$PYTHONPATH" TEST_RUNTIME=cli poetry run pytest -svv tests/runtime/test_bash.py
|
||||
- name: Run E2E Tests
|
||||
run: PYTHONPATH=".:$PYTHONPATH" poetry run pytest -svv tests/e2e
|
||||
|
||||
# Run specific Windows python tests
|
||||
test-on-windows:
|
||||
|
||||
@@ -254,3 +254,6 @@ containers/runtime/Dockerfile
|
||||
containers/runtime/project.tar.gz
|
||||
containers/runtime/code
|
||||
**/node_modules/
|
||||
|
||||
# test results
|
||||
test-results
|
||||
|
||||
@@ -159,7 +159,7 @@ poetry run pytest ./tests/unit/test_*.py
|
||||
To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker
|
||||
container image by setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.
|
||||
|
||||
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.51-nikolaik`
|
||||
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.53-nikolaik`
|
||||
|
||||
## Develop inside Docker container
|
||||
|
||||
|
||||
@@ -52,37 +52,63 @@ which comes with $20 in free credits for new users.
|
||||
|
||||
## 💻 Running OpenHands Locally
|
||||
|
||||
OpenHands can also run on your local system using Docker.
|
||||
See the [Running OpenHands](https://docs.all-hands.dev/usage/installation) guide for
|
||||
system requirements and more information.
|
||||
### Option 1: CLI Launcher (Recommended)
|
||||
|
||||
> [!WARNING]
|
||||
> On a public network? See our [Hardened Docker Installation Guide](https://docs.all-hands.dev/usage/runtimes/docker#hardened-docker-installation)
|
||||
> to secure your deployment by restricting network binding and implementing additional security measures.
|
||||
The easiest way to run OpenHands locally is using the CLI launcher with [uv](https://docs.astral.sh/uv/). This provides better isolation from your current project's virtual environment and is required for OpenHands' default MCP servers.
|
||||
|
||||
**Install uv** (if you haven't already):
|
||||
|
||||
See the [uv installation guide](https://docs.astral.sh/uv/getting-started/installation/) for the latest installation instructions for your platform.
|
||||
|
||||
**Launch OpenHands**:
|
||||
```bash
|
||||
# Launch the GUI server
|
||||
uvx --python 3.12 --from openhands-ai openhands serve
|
||||
|
||||
# Or launch the CLI
|
||||
uvx --python 3.12 --from openhands-ai openhands
|
||||
```
|
||||
|
||||
You'll find OpenHands running at [http://localhost:3000](http://localhost:3000) (for GUI mode)!
|
||||
|
||||
### Option 2: Docker
|
||||
|
||||
<details>
|
||||
<summary>Click to expand Docker command</summary>
|
||||
|
||||
You can also run OpenHands directly with Docker:
|
||||
|
||||
```bash
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik
|
||||
|
||||
docker run -it --rm --pull=always \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik \
|
||||
-e LOG_ALL_EVENTS=true \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
-v ~/.openhands:/.openhands \
|
||||
-p 3000:3000 \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.51
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.53
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
> **Note**: If you used OpenHands before version 0.44, you may want to run `mv ~/.openhands-state ~/.openhands` to migrate your conversation history to the new location.
|
||||
|
||||
You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
|
||||
> [!WARNING]
|
||||
> On a public network? See our [Hardened Docker Installation Guide](https://docs.all-hands.dev/usage/runtimes/docker#hardened-docker-installation)
|
||||
> to secure your deployment by restricting network binding and implementing additional security measures.
|
||||
|
||||
### Getting Started
|
||||
|
||||
When you open the application, you'll be asked to choose an LLM provider and add an API key.
|
||||
[Anthropic's Claude Sonnet 4](https://www.anthropic.com/api) (`anthropic/claude-sonnet-4-20250514`)
|
||||
works best, but you have [many options](https://docs.all-hands.dev/usage/llms).
|
||||
|
||||
See the [Running OpenHands](https://docs.all-hands.dev/usage/installation) guide for
|
||||
system requirements and more information.
|
||||
|
||||
## 💡 Other ways to run OpenHands
|
||||
|
||||
> [!WARNING]
|
||||
@@ -93,8 +119,8 @@ works best, but you have [many options](https://docs.all-hands.dev/usage/llms).
|
||||
> [OpenHands Cloud Helm Chart](https://github.com/all-Hands-AI/OpenHands-cloud)
|
||||
|
||||
You can [connect OpenHands to your local filesystem](https://docs.all-hands.dev/usage/runtimes/docker#connecting-to-your-filesystem),
|
||||
run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/usage/how-to/headless-mode),
|
||||
interact with it via a [friendly CLI](https://docs.all-hands.dev/usage/how-to/cli-mode),
|
||||
run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/usage/how-to/headless-mode),
|
||||
or run it on tagged issues with [a github action](https://docs.all-hands.dev/usage/how-to/github-action).
|
||||
|
||||
Visit [Running OpenHands](https://docs.all-hands.dev/usage/installation) for more information and setup instructions.
|
||||
|
||||
@@ -51,17 +51,17 @@ OpenHands也可以使用Docker在本地系统上运行。
|
||||
|
||||
|
||||
```bash
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik
|
||||
|
||||
docker run -it --rm --pull=always \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik \
|
||||
-e LOG_ALL_EVENTS=true \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
-v ~/.openhands:/.openhands \
|
||||
-p 3000:3000 \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.51
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.53
|
||||
```
|
||||
|
||||
> **注意**: 如果您在0.44版本之前使用过OpenHands,您可能需要运行 `mv ~/.openhands-state ~/.openhands` 来将对话历史迁移到新位置。
|
||||
|
||||
@@ -42,17 +42,17 @@ OpenHandsはDockerを利用してローカル環境でも実行できます。
|
||||
> 公共ネットワークで実行していますか?[Hardened Docker Installation Guide](https://docs.all-hands.dev/usage/runtimes/docker#hardened-docker-installation)を参照して、ネットワークバインディングの制限や追加のセキュリティ対策を実施してください。
|
||||
|
||||
```bash
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik
|
||||
|
||||
docker run -it --rm --pull=always \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik \
|
||||
-e LOG_ALL_EVENTS=true \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
-v ~/.openhands:/.openhands \
|
||||
-p 3000:3000 \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.51
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.53
|
||||
```
|
||||
|
||||
**注**: バージョン0.44以前のOpenHandsを使用していた場合は、会話履歴を移行するために `mv ~/.openhands-state ~/.openhands` を実行してください。
|
||||
|
||||
@@ -93,8 +93,7 @@ def build_vscode_extension():
|
||||
|
||||
|
||||
def build(setup_kwargs):
|
||||
"""
|
||||
This function is called by Poetry during the build process.
|
||||
"""This function is called by Poetry during the build process.
|
||||
`setup_kwargs` is a dictionary that will be passed to `setuptools.setup()`.
|
||||
"""
|
||||
print('--- Running custom Poetry build script (build_vscode.py) ---')
|
||||
|
||||
@@ -12,7 +12,7 @@ services:
|
||||
- SANDBOX_API_HOSTNAME=host.docker.internal
|
||||
- DOCKER_HOST_ADDR=host.docker.internal
|
||||
#
|
||||
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.51-nikolaik}
|
||||
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.53-nikolaik}
|
||||
- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
|
||||
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
|
||||
ports:
|
||||
|
||||
@@ -7,7 +7,7 @@ services:
|
||||
image: openhands:latest
|
||||
container_name: openhands-app-${DATE:-}
|
||||
environment:
|
||||
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik}
|
||||
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik}
|
||||
#- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} # enable this only if you want a specific non-root sandbox user but you will have to manually adjust permissions of ~/.openhands for this user
|
||||
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
|
||||
ports:
|
||||
|
||||
|
After Width: | Height: | Size: 56 KiB |
|
After Width: | Height: | Size: 55 KiB |
|
After Width: | Height: | Size: 55 KiB |
|
After Width: | Height: | Size: 56 KiB |
|
After Width: | Height: | Size: 28 KiB |
|
After Width: | Height: | Size: 28 KiB |
|
After Width: | Height: | Size: 30 KiB |
|
After Width: | Height: | Size: 28 KiB |
|
After Width: | Height: | Size: 53 KiB |
|
After Width: | Height: | Size: 52 KiB |
|
After Width: | Height: | Size: 29 KiB |
|
After Width: | Height: | Size: 26 KiB |
|
Before Width: | Height: | Size: 47 KiB |
|
Before Width: | Height: | Size: 44 KiB |
|
Before Width: | Height: | Size: 28 KiB |
|
Before Width: | Height: | Size: 28 KiB |
@@ -78,6 +78,14 @@ description: Complete guide for setting up Jira Data Center integration with Ope
|
||||
- **Service Account API Key**: The personal access token from Step 2 above
|
||||
- Ensure **Active** toggle is enabled
|
||||
|
||||
<Note>
|
||||
Workspace name is the host name of your Jira Data Center instance.
|
||||
|
||||
Eg: http://jira.all-hands.dev/projects/OH/issues/OH-77
|
||||
|
||||
Here the workspace name is **jira.all-hands.dev**.
|
||||
</Note>
|
||||
|
||||
3. **Complete OAuth Flow**
|
||||
- You'll be redirected to Jira Data Center to complete OAuth verification
|
||||
- Grant the necessary permissions to verify your workspace access. If you have access to multiple workspaces, select the correct one that you initially provided
|
||||
@@ -101,18 +109,18 @@ description: Complete guide for setting up Jira Data Center integration with Ope
|
||||
|
||||
<AccordionGroup>
|
||||
<Accordion title="Workspace link flow">
|
||||

|
||||

|
||||
</Accordion>
|
||||
|
||||
<Accordion title="Workspace Configure flow">
|
||||

|
||||

|
||||
</Accordion>
|
||||
|
||||
<Accordion title="Edit view as a user">
|
||||

|
||||

|
||||
</Accordion>
|
||||
|
||||
<Accordion title="Edit view as the workspace creator">
|
||||

|
||||

|
||||
</Accordion>
|
||||
</AccordionGroup>
|
||||
|
||||
@@ -15,28 +15,27 @@ description: Complete guide for setting up Jira Cloud integration with OpenHands
|
||||
- Go to **Directory** > **Users**
|
||||
|
||||
2. **Create OpenHands Service Account**
|
||||
- Click **Add user**
|
||||
- Email: `openhands@yourcompany.com` (replace with your preferred service account email)
|
||||
- Display name: `OpenHands Agent`
|
||||
- Send invitation: **No** (you'll set password manually)
|
||||
- Click **Add user**
|
||||
|
||||
3. **Configure Account**
|
||||
- Locate the created user and click on it
|
||||
- Set a secure password
|
||||
- Add to relevant Jira projects with appropriate permissions
|
||||
- Click **Service accounts**
|
||||
- Click **Create a service account**
|
||||
- Name: `OpenHands Agent`
|
||||
- Click **Next**
|
||||
- Select **User** role for Jira app
|
||||
- Click **Create**
|
||||
|
||||
### Step 2: Generate API Token
|
||||
|
||||
1. **Access API Token Management**
|
||||
- Log in as the OpenHands service account
|
||||
- Go to [API Tokens](https://id.atlassian.com/manage-profile/security/api-tokens)
|
||||
|
||||
2. **Create API Token**
|
||||
1. **Access Service Account Configuration**
|
||||
- Locate the created service account from above step and click on it
|
||||
- Click **Create API token**
|
||||
- Label: `OpenHands Cloud Integration`
|
||||
- Expiry: Set appropriate expiration (recommend 1 year)
|
||||
- Click **Create**
|
||||
- Set the expiry to 365 days (maximum allowed value)
|
||||
- Click **Next**
|
||||
- In **Select token scopes** screen, filter by following values
|
||||
- App: Jira
|
||||
- Scope type: Classic
|
||||
- Scope actions: Write, Read
|
||||
- Select `read:jira-work` and `write:jira-work` scopes
|
||||
- Click **Next**
|
||||
- Review and create API token
|
||||
- **Important**: Copy and securely store the token immediately
|
||||
|
||||
### Step 3: Configure Webhook
|
||||
@@ -83,6 +82,14 @@ description: Complete guide for setting up Jira Cloud integration with OpenHands
|
||||
- **Service Account API Key**: The API token from Step 2 above
|
||||
- Ensure **Active** toggle is enabled
|
||||
|
||||
<Note>
|
||||
Workspace name is the host name when accessing a resource in Jira Cloud.
|
||||
|
||||
Eg: https://all-hands.atlassian.net/browse/OH-55
|
||||
|
||||
Here the workspace name is **all-hands**.
|
||||
</Note>
|
||||
|
||||
3. **Complete OAuth Flow**
|
||||
- You'll be redirected to Jira Cloud to complete OAuth verification
|
||||
- Grant the necessary permissions to verify your workspace access.
|
||||
@@ -106,18 +113,18 @@ description: Complete guide for setting up Jira Cloud integration with OpenHands
|
||||
|
||||
<AccordionGroup>
|
||||
<Accordion title="Workspace link flow">
|
||||

|
||||

|
||||
</Accordion>
|
||||
|
||||
<Accordion title="Workspace Configure flow">
|
||||

|
||||

|
||||
</Accordion>
|
||||
|
||||
<Accordion title="Edit view as a user">
|
||||

|
||||

|
||||
</Accordion>
|
||||
|
||||
<Accordion title="Edit view as the workspace creator">
|
||||

|
||||

|
||||
</Accordion>
|
||||
</AccordionGroup>
|
||||
|
||||
@@ -28,7 +28,7 @@ description: Complete guide for setting up Linear integration with OpenHands Clo
|
||||
|
||||
1. **Access API Settings**
|
||||
- Log in as the service account
|
||||
- Go to **Settings** > **API**
|
||||
- Go to **Settings** > **Security & access**
|
||||
|
||||
2. **Create Personal API Key**
|
||||
- Click **Create new key**
|
||||
@@ -82,6 +82,14 @@ description: Complete guide for setting up Linear integration with OpenHands Clo
|
||||
- **Service Account API Key**: The API key from Step 2 above
|
||||
- Ensure **Active** toggle is enabled
|
||||
|
||||
<Note>
|
||||
Workspace name is the identifier after the host name when accessing a resource in Linear.
|
||||
|
||||
Eg: https://linear.app/allhands/issue/OH-37
|
||||
|
||||
Here the workspace name is **allhands**.
|
||||
</Note>
|
||||
|
||||
3. **Complete OAuth Flow**
|
||||
- You'll be redirected to Linear to complete OAuth verification
|
||||
- Grant the necessary permissions to verify your workspace access. If you have access to multiple workspaces, select the correct one that you initially provided
|
||||
@@ -105,15 +113,15 @@ description: Complete guide for setting up Linear integration with OpenHands Clo
|
||||
|
||||
<AccordionGroup>
|
||||
<Accordion title="Workspace link flow">
|
||||

|
||||

|
||||
</Accordion>
|
||||
|
||||
<Accordion title="Workspace Configure flow">
|
||||

|
||||

|
||||
</Accordion>
|
||||
|
||||
<Accordion title="Edit view as a user">
|
||||

|
||||

|
||||
</Accordion>
|
||||
|
||||
<Accordion title="Edit view as the workspace creator">
|
||||
|
||||
@@ -58,17 +58,18 @@ The OpenHands agent needs to identify which Git repository to work with when pro
|
||||
|
||||
### Platform Configuration Issues
|
||||
- **Webhook not triggering**: Verify the webhook URL is correct and the proper event types are selected (Comment, Issue updated)
|
||||
- **API authentication failing**: Check API key/token validity and ensure required scopes are granted
|
||||
- **API authentication failing**: Check API key/token validity and ensure required scopes are granted. If your current API token is expired, make sure to update it in the respective integration settings
|
||||
- **Permission errors**: Ensure the service account has access to relevant projects/teams and appropriate permissions
|
||||
|
||||
### Workspace Integration Issues
|
||||
- **Workspace linking requests credentials**: If there are no active workspace integrations for the workspace you specified, you need to configure it first. Contact your platform administrator that you want to integrate with (eg: Jira, Linear)
|
||||
- **OAuth flow fails**: Ensure you're signing in with the same Git provider account that contains the repositories you want OpenHands to work on
|
||||
- **Integration not found**: Verify the workspace name matches exactly and that platform configuration was completed first
|
||||
- **OAuth flow fails**: Make sure that you're authorizing with the correct account with proper workspace access
|
||||
|
||||
### General Issues
|
||||
- **Agent not responding**: Check webhook logs in your platform settings and verify service account status
|
||||
- **Authentication errors**: Verify Git provider permissions and OpenHands Cloud access
|
||||
- **Agent fails to identify git repo**: Ensure you're signing in with the same Git provider account that contains the repositories you want OpenHands to work on
|
||||
- **Partial functionality**: Ensure both platform configuration and workspace integration are properly completed
|
||||
|
||||
### Getting Help
|
||||
|
||||
@@ -20,27 +20,42 @@ for scripting.
|
||||
|
||||
### Running with Python
|
||||
|
||||
**Note** - OpenHands requires Python version 3.12 or higher (Python 3.14 is not currently supported) and `uvx` for the default `fetch` MCP server (more details below).
|
||||
**Note** - OpenHands requires Python version 3.12 or higher (Python 3.14 is not currently supported) and `uv` for the default `fetch` MCP server (more details below).
|
||||
|
||||
1. Install OpenHands using pip:
|
||||
```bash
|
||||
pip install openhands-ai
|
||||
```
|
||||
#### Recommended: Using uv
|
||||
|
||||
Or if you prefer not to manage your own Python environment, you can use `uvx`:
|
||||
We recommend using [uv](https://docs.astral.sh/uv/) for the best OpenHands experience. uv provides better isolation from your current project's virtual environment and is required for OpenHands' default MCP servers.
|
||||
|
||||
1. **Install uv** (if you haven't already):
|
||||
|
||||
See the [uv installation guide](https://docs.astral.sh/uv/getting-started/installation/) for the latest installation instructions for your platform.
|
||||
|
||||
2. **Launch OpenHands CLI**:
|
||||
```bash
|
||||
uvx --python 3.12 --from openhands-ai openhands
|
||||
```
|
||||
|
||||
<AccordionGroup>
|
||||
|
||||
<Accordion title="Alternative: Traditional pip installation">
|
||||
|
||||
If you prefer to use pip:
|
||||
|
||||
```bash
|
||||
# Install OpenHands
|
||||
pip install openhands-ai
|
||||
```
|
||||
|
||||
Note that you'll still need `uv` installed for the default MCP servers to work properly.
|
||||
|
||||
</Accordion>
|
||||
|
||||
<Accordion title="Create shell aliases for easy access across environments">
|
||||
|
||||
Add the following to your shell configuration file (`.bashrc`, `.zshrc`, etc.):
|
||||
|
||||
```bash
|
||||
# Add OpenHands aliases
|
||||
# Add OpenHands aliases (recommended)
|
||||
alias openhands="uvx --python 3.12 --from openhands-ai openhands"
|
||||
alias oh="uvx --python 3.12 --from openhands-ai openhands"
|
||||
```
|
||||
@@ -72,18 +87,19 @@ source ~/.bashrc # or source ~/.zshrc
|
||||
|
||||
</AccordionGroup>
|
||||
|
||||
2. Launch an interactive OpenHands conversation from the command line:
|
||||
3. Launch an interactive OpenHands conversation from the command line:
|
||||
```bash
|
||||
openhands
|
||||
# If using uvx (recommended)
|
||||
uvx --python 3.12 --from openhands-ai openhands
|
||||
```
|
||||
|
||||
<Note>
|
||||
If you have cloned the repository, you can also run the CLI directly using Poetry:
|
||||
|
||||
poetry run python -m openhands.cli.main
|
||||
poetry run openhands
|
||||
</Note>
|
||||
|
||||
3. Set your model, API key, and other preferences using the UI (or alternatively environment variables, below).
|
||||
4. Set your model, API key, and other preferences using the UI (or alternatively environment variables, below).
|
||||
|
||||
This command opens an interactive prompt where you can type tasks or commands and get responses from OpenHands.
|
||||
The first time you run the CLI, it will take you through configuring the required LLM
|
||||
@@ -103,7 +119,7 @@ The conversation history will be saved in `~/.openhands/sessions`.
|
||||
```bash
|
||||
docker run -it \
|
||||
--pull=always \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik \
|
||||
-e SANDBOX_USER_ID=$(id -u) \
|
||||
-e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
|
||||
-e LLM_API_KEY=$LLM_API_KEY \
|
||||
@@ -112,7 +128,7 @@ docker run -it \
|
||||
-v ~/.openhands:/.openhands \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app-$(date +%Y%m%d%H%M%S) \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.51 \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.53 \
|
||||
python -m openhands.cli.main --override-cli-mode true
|
||||
```
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ export GITHUB_TOKEN="your-token" # Required for repository operations
|
||||
# Run OpenHands
|
||||
docker run -it \
|
||||
--pull=always \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik \
|
||||
-e SANDBOX_USER_ID=$(id -u) \
|
||||
-e SANDBOX_VOLUMES=$SANDBOX_VOLUMES \
|
||||
-e LLM_API_KEY=$LLM_API_KEY \
|
||||
@@ -73,7 +73,7 @@ docker run -it \
|
||||
-v ~/.openhands:/.openhands \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app-$(date +%Y%m%d%H%M%S) \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.51 \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.53 \
|
||||
python -m openhands.core.main -t "write a bash script that prints hi"
|
||||
```
|
||||
|
||||
|
||||
@@ -68,23 +68,23 @@ Download and install the LM Studio desktop app from [lmstudio.ai](https://lmstud
|
||||
1. Check [the installation guide](/usage/local-setup) and ensure all prerequisites are met before running OpenHands, then run:
|
||||
|
||||
```bash
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik
|
||||
|
||||
docker run -it --rm --pull=always \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik \
|
||||
-e LOG_ALL_EVENTS=true \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
-v ~/.openhands:/.openhands \
|
||||
-p 3000:3000 \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.51
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.53
|
||||
```
|
||||
|
||||
2. Wait until the server is running (see log below):
|
||||
```
|
||||
Digest: sha256:e72f9baecb458aedb9afc2cd5bc935118d1868719e55d50da73190d3a85c674f
|
||||
Status: Image is up to date for docker.all-hands.dev/all-hands-ai/openhands:0.51
|
||||
Status: Image is up to date for docker.all-hands.dev/all-hands-ai/openhands:0.53
|
||||
Starting OpenHands...
|
||||
Running OpenHands as root
|
||||
14:22:13 - openhands:INFO: server_config.py:50 - Using config class None
|
||||
|
||||
@@ -66,9 +66,31 @@ A system with a modern processor and a minimum of **4GB RAM** is recommended to
|
||||
|
||||
### Start the App
|
||||
|
||||
#### Option 1: Using the CLI Launcher (Recommended)
|
||||
#### Option 1: Using the CLI Launcher with uv (Recommended)
|
||||
|
||||
If you have Python 3.12+ installed, you can use the CLI launcher for a simpler experience:
|
||||
We recommend using [uv](https://docs.astral.sh/uv/) for the best OpenHands experience. uv provides better isolation from your current project's virtual environment and is required for OpenHands' default MCP servers (like the [fetch MCP server](https://github.com/modelcontextprotocol/servers/tree/main/src/fetch)).
|
||||
|
||||
**Install uv** (if you haven't already):
|
||||
|
||||
See the [uv installation guide](https://docs.astral.sh/uv/getting-started/installation/) for the latest installation instructions for your platform.
|
||||
|
||||
**Launch OpenHands**:
|
||||
```bash
|
||||
# Launch the GUI server
|
||||
uvx --python 3.12 --from openhands-ai openhands serve
|
||||
|
||||
# Or with GPU support (requires nvidia-docker)
|
||||
uvx --python 3.12 --from openhands-ai openhands serve --gpu
|
||||
|
||||
# Or with current directory mounted
|
||||
uvx --python 3.12 --from openhands-ai openhands serve --mount-cwd
|
||||
```
|
||||
|
||||
This will automatically handle Docker requirements checking, image pulling, and launching the GUI server. The `--gpu` flag enables GPU support via nvidia-docker, and `--mount-cwd` mounts your current directory into the container.
|
||||
|
||||
<Accordion title="Alternative: Traditional pip installation">
|
||||
|
||||
If you prefer to use pip and have Python 3.12+ installed:
|
||||
|
||||
```bash
|
||||
# Install OpenHands
|
||||
@@ -76,34 +98,32 @@ pip install openhands-ai
|
||||
|
||||
# Launch the GUI server
|
||||
openhands serve
|
||||
|
||||
# Or with GPU support (requires nvidia-docker)
|
||||
openhands serve --gpu
|
||||
|
||||
# Or with current directory mounted
|
||||
openhands serve --mount-cwd
|
||||
```
|
||||
|
||||
Or using `uvx --python 3.12 --from openhands-ai openhands serve` if you have [uv](https://docs.astral.sh/uv/) installed.
|
||||
Note that you'll still need `uv` installed for the default MCP servers to work properly.
|
||||
|
||||
This will automatically handle Docker requirements checking, image pulling, and launching the GUI server. The `--gpu` flag enables GPU support via nvidia-docker, and `--mount-cwd` mounts your current directory into the container.
|
||||
</Accordion>
|
||||
|
||||
#### Option 2: Using Docker Directly
|
||||
|
||||
<Accordion title="Docker Command (Click to expand)">
|
||||
|
||||
```bash
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik
|
||||
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik
|
||||
|
||||
docker run -it --rm --pull=always \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.51-nikolaik \
|
||||
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.53-nikolaik \
|
||||
-e LOG_ALL_EVENTS=true \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
-v ~/.openhands:/.openhands \
|
||||
-p 3000:3000 \
|
||||
--add-host host.docker.internal:host-gateway \
|
||||
--name openhands-app \
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.51
|
||||
docker.all-hands.dev/all-hands-ai/openhands:0.53
|
||||
```
|
||||
|
||||
</Accordion>
|
||||
|
||||
> **Note**: If you used OpenHands before version 0.44, you may want to run `mv ~/.openhands-state ~/.openhands` to migrate your conversation history to the new location.
|
||||
|
||||
You'll find OpenHands running at http://localhost:3000!
|
||||
|
||||
@@ -506,7 +506,6 @@ def commit0_setup(dataset: pd.DataFrame, repo_split: str) -> pd.DataFrame:
|
||||
Returns:
|
||||
Filtered dataset based on split type
|
||||
"""
|
||||
|
||||
filtered_dataset = pd.concat(
|
||||
[
|
||||
dataset[dataset['repo'].str.split('/').str[1] == repo]
|
||||
|
||||
@@ -89,8 +89,7 @@ def get_config(
|
||||
def get_dv_query_for_real(
|
||||
datasets, question, domain_knowledge=None, workflow_tags=None
|
||||
):
|
||||
"""
|
||||
Prepare a structured query for the agent to execute on the specified datasets.
|
||||
"""Prepare a structured query for the agent to execute on the specified datasets.
|
||||
|
||||
This function constructs a query by compiling metadata from the provided datasets, along with any relevant domain knowledge and workflow tags.
|
||||
|
||||
@@ -104,7 +103,6 @@ def get_dv_query_for_real(
|
||||
query_to_dv: Query to be run on the dataset
|
||||
dataset_meta: Metadata of the dataset
|
||||
"""
|
||||
|
||||
dataset_meta = ''
|
||||
for dataset_metadata in datasets:
|
||||
dataset_meta += 'Dataset name: ' + dataset_metadata['name']
|
||||
@@ -140,8 +138,7 @@ def get_dv_query_for_real(
|
||||
|
||||
|
||||
def initialize_runtime(runtime: Runtime, data_files: list[str]):
|
||||
"""
|
||||
Initialize the runtime for the agent.
|
||||
"""Initialize the runtime for the agent.
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
@@ -231,8 +228,7 @@ def process_instance(
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
"""
|
||||
Process and evaluate a single instance of the dataset.
|
||||
"""Process and evaluate a single instance of the dataset.
|
||||
|
||||
This function executes the OpenHands agent
|
||||
for a specific instance of the dataset. It retrieves
|
||||
@@ -247,7 +243,6 @@ def process_instance(
|
||||
Returns:
|
||||
output: EvalOutput object
|
||||
"""
|
||||
|
||||
config = get_config(metadata)
|
||||
|
||||
# Setup the logger properly, so you can run
|
||||
@@ -356,8 +351,7 @@ def list_csv_files(list_of_datasets):
|
||||
|
||||
|
||||
def create_dataset(repo_location: str, split: str = 'test'):
|
||||
"""
|
||||
Create a dataset from the discoverybench repository
|
||||
"""Create a dataset from the discoverybench repository
|
||||
by walking through the repository and extracting metadata
|
||||
from the metadata_{}.json files
|
||||
|
||||
@@ -368,7 +362,6 @@ def create_dataset(repo_location: str, split: str = 'test'):
|
||||
Returns:
|
||||
df: DataFrame containing the dataset instances
|
||||
"""
|
||||
|
||||
data_dict = {}
|
||||
|
||||
data_location = os.path.join(repo_location, 'discoverybench', 'real', split)
|
||||
|
||||
@@ -105,8 +105,7 @@ def process_instance(
|
||||
log_dir: str | None = None,
|
||||
runtime_failure_count: int = 0,
|
||||
) -> EvalOutput:
|
||||
"""
|
||||
Evaluate agent performance on a SWE-bench problem instance.
|
||||
"""Evaluate agent performance on a SWE-bench problem instance.
|
||||
|
||||
Note that this signature differs from the expected input to `run_evaluation`. Use
|
||||
`functools.partial` to provide optional arguments before passing to the evaluation harness.
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
# Evaluate OpenHands on NoCode-bench
|
||||
|
||||
## LLM Setup
|
||||
|
||||
Please follow [here](../../README.md#setup).
|
||||
|
||||
|
||||
## Docker image download
|
||||
|
||||
Evaluating OpenHands on NoCode-bench need instance-level docker image.
|
||||
Please follow the instructions of NoCode-bench image setup to build or download all instance-level dokcer [here](https://github.com/NoCode-bench/NoCode-bench).
|
||||
|
||||
## Generate patch
|
||||
|
||||
Please follow the instructions [here](../swe_bench/README.md#running-locally-with-docker)
|
||||
For example,
|
||||
```bash
|
||||
bash ./evaluation/benchmarks/nocode_bench/scripts/run_infer_nc.sh llm.claude HEAD CodeActAgent 114 100 10 NoCode-bench/NoCode-bench_Verified test
|
||||
```
|
||||
The results will be generated in evaluation/evaluation_outputs/outputs/XXX/CodeActAgent/YYY/output.jsonl.
|
||||
|
||||
## Runing evaluation
|
||||
|
||||
First, install [NoCode-bench](https://github.com/NoCode-bench/NoCode-bench).
|
||||
|
||||
Second, convert the output.jsonl to patch.jsonl with [script](scripts/eval/convert.py).
|
||||
|
||||
```bash
|
||||
python evaluation/benchmarks/multi_swe_bench/scripts/eval/convert.py
|
||||
```
|
||||
|
||||
Finally, evaluate with NoCode-bench.
|
||||
|
||||
```bash
|
||||
export PYTHONPATH=$PYTHONPATH:$(pwd)
|
||||
python ./evaluation/eval.py \
|
||||
--predictions_path ./all_preds.jsonl \ # <path_to_your_predictions>
|
||||
--log_dir ./evaluation/logs \ # <path_to_your_log_dir>
|
||||
--bench_tasks NoCode-bench/NoCode-bench_Verified \ # <dataset_name>
|
||||
--max_workers 110 \ # <number_of_workers>
|
||||
--output_file eval_result.txt \ # <path_to_your_output_file>
|
||||
--image_level repo \ # <cache_image_level>
|
||||
--timeout 600 \ # <timeout_in_seconds>
|
||||
--proxy None # <proxy_if_needed>
|
||||
```
|
||||
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Utilities for handling binary files and patch generation in SWE-bench evaluation.
|
||||
"""
|
||||
|
||||
|
||||
def remove_binary_diffs(patch_text):
|
||||
"""
|
||||
Remove binary file diffs from a git patch.
|
||||
|
||||
Args:
|
||||
patch_text (str): The git patch text
|
||||
|
||||
Returns:
|
||||
str: The cleaned patch text with binary diffs removed
|
||||
"""
|
||||
lines = patch_text.splitlines()
|
||||
cleaned_lines = []
|
||||
block = []
|
||||
is_binary_block = False
|
||||
|
||||
for line in lines:
|
||||
if line.startswith('diff --git '):
|
||||
if block and not is_binary_block:
|
||||
cleaned_lines.extend(block)
|
||||
block = [line]
|
||||
is_binary_block = False
|
||||
elif 'Binary files' in line:
|
||||
is_binary_block = True
|
||||
block.append(line)
|
||||
else:
|
||||
block.append(line)
|
||||
|
||||
if block and not is_binary_block:
|
||||
cleaned_lines.extend(block)
|
||||
return '\n'.join(cleaned_lines)
|
||||
|
||||
|
||||
def remove_binary_files_from_git():
|
||||
"""
|
||||
Generate a bash command to remove binary files from git staging.
|
||||
|
||||
Returns:
|
||||
str: A bash command that removes binary files from git staging
|
||||
"""
|
||||
return """
|
||||
for file in $(git status --porcelain | grep -E "^(M| M|\\?\\?|A| A)" | cut -c4-); do
|
||||
if [ -f "$file" ] && (file "$file" | grep -q "executable" || git check-attr binary "$file" | grep -q "binary: set"); then
|
||||
git rm -f "$file" 2>/dev/null || rm -f "$file"
|
||||
echo "Removed: $file"
|
||||
fi
|
||||
done
|
||||
""".strip()
|
||||
@@ -0,0 +1,545 @@
|
||||
DOCPATH_PATTERNS = [
|
||||
r'docs/',
|
||||
r'^CHANGES\.rst$',
|
||||
r'doc/',
|
||||
r'ChangeLog',
|
||||
r'^changelog/',
|
||||
r'^CHANGES$',
|
||||
]
|
||||
|
||||
MATPLOTLIB_CONFIG = {
|
||||
k: {
|
||||
'python': '3.11',
|
||||
'conda_env': 'matplotlib_35',
|
||||
'install': 'python -m pip install -e .',
|
||||
'test_cmd': 'pytest -rA --color=no',
|
||||
}
|
||||
for k in ['3.5', '3.6', '3.7', '3.8', '3.9']
|
||||
}
|
||||
MATPLOTLIB_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'python': '3.8',
|
||||
'conda_env': 'matplotlib_31',
|
||||
'install': 'python -m pip install -e .',
|
||||
'test_cmd': 'pytest -rA --color=no',
|
||||
}
|
||||
for k in ['3.1', '3.2', '3.3', '3.4']
|
||||
}
|
||||
)
|
||||
MATPLOTLIB_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'python': '3.5',
|
||||
'install': 'python setup.py build; python setup.py install',
|
||||
'conda_env': 'matplotlib_11',
|
||||
'nonroot': True,
|
||||
'test_cmd': 'pytest -rA --color=no',
|
||||
}
|
||||
for k in ['2.0', '2.1', '2.2', '1.0', '1.1', '1.2', '1.3', '1.4', '1.5']
|
||||
}
|
||||
)
|
||||
for k in ['3.8', '3.9']:
|
||||
MATPLOTLIB_CONFIG[k]['install'] = (
|
||||
'python -m pip install --no-build-isolation -e ".[dev]"'
|
||||
)
|
||||
|
||||
|
||||
SYMPY_CONFIG = {}
|
||||
SYMPY_CONFIG.update(
|
||||
{
|
||||
'1.0': {
|
||||
'conda_env': 'sympy_10',
|
||||
'install': 'pip install -e .',
|
||||
'test_cmd': 'bin/test -C -v',
|
||||
# testfile -k testname
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
REQUESTS_CONFIG = {}
|
||||
REQUESTS_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'requests_227',
|
||||
'install': 'pip install -r requirements-dev.txt',
|
||||
'test_cmd': 'pytest -rA',
|
||||
}
|
||||
for k in ['2.27']
|
||||
}
|
||||
)
|
||||
REQUESTS_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'requests_226',
|
||||
'install': 'pip install -e .',
|
||||
'test_cmd': 'pytest -rA',
|
||||
}
|
||||
for k in ['2.26']
|
||||
}
|
||||
)
|
||||
|
||||
PYTEST_CONFIG = {}
|
||||
PYTEST_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'pytest_33',
|
||||
'install': 'pip install -e .',
|
||||
'test_cmd': 'pytest -v --color=no',
|
||||
}
|
||||
for k in ['4.4', '4.1', '3.7', '3.4', '3.3']
|
||||
}
|
||||
)
|
||||
|
||||
PYLINT_CONFIG = {}
|
||||
PYLINT_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'pylint_210',
|
||||
'install': 'pip install -r requirements_test.txt',
|
||||
'test_cmd': 'pytest -rA --color=no',
|
||||
}
|
||||
for k in [
|
||||
'2.10',
|
||||
'2.11',
|
||||
'2.13',
|
||||
'2.14',
|
||||
'2.15',
|
||||
'2.16',
|
||||
'2.17',
|
||||
'3.0',
|
||||
'3.1',
|
||||
'3.2',
|
||||
'3.3',
|
||||
]
|
||||
}
|
||||
)
|
||||
PYLINT_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'pylint_210',
|
||||
'pre_install': [
|
||||
r"sed -i 's/setuptools==[0-9.]\+/setuptools==58.0.0/' requirements_test_min.txt"
|
||||
],
|
||||
'install': 'pip install -r requirements_test.txt',
|
||||
'test_cmd': 'pytest -rA --color=no',
|
||||
}
|
||||
for k in ['3.0', '3.1', '3.2', '3.3']
|
||||
}
|
||||
)
|
||||
|
||||
ASTROPY_CONFIG = {}
|
||||
ASTROPY_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'astropy_11',
|
||||
'install': 'python -m pip install -e .[test] --verbose',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['1.1', '1.2', '1.3', '2.0']
|
||||
}
|
||||
)
|
||||
ASTROPY_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'astropy_30',
|
||||
'pre_install': """echo '[pytest]
|
||||
filterwarnings =
|
||||
ignore::DeprecationWarning' > pytest.ini""",
|
||||
'install': 'python -m pip install -e .[test] --verbose',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['3.0', '3.1', '3.2']
|
||||
}
|
||||
)
|
||||
ASTROPY_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'astropy_40',
|
||||
'pre_install': [
|
||||
r"""sed -i 's/requires = \["setuptools",/requires = \["setuptools==68.0.0",/' pyproject.toml"""
|
||||
],
|
||||
'install': 'python -m pip install -e .[test] --verbose',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['4.0']
|
||||
}
|
||||
)
|
||||
ASTROPY_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'astropy_41',
|
||||
'pre_install': [
|
||||
r"""sed -i 's/requires = \["setuptools",/requires = \["setuptools==68.0.0",/' pyproject.toml""",
|
||||
"""sed -i 's/^qt_no_exception_capture = 1$/; qt_no_exception_capture = 1/' setup.cfg""",
|
||||
r"""sed -i '/setuptools==68.0.0",/a \ "markupsafe==2.0.1",' pyproject.tomlsed -i '/setuptools==68.0.0",/a \ "markupsafe==2.0.1",' pyproject.toml""",
|
||||
],
|
||||
'install': 'python -m pip install -e .[test] --verbose',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['4.1']
|
||||
}
|
||||
)
|
||||
ASTROPY_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'astropy_42',
|
||||
'pre_install': [
|
||||
r"""sed -i 's/requires = \["setuptools",/requires = \["setuptools==68.0.0",/' pyproject.toml""",
|
||||
r"""sed -i '/setuptools==68.0.0",/a \ "markupsafe==2.0.1",' pyproject.tomlsed -i '/setuptools==68.0.0",/a \ "markupsafe==2.0.1",' pyproject.toml""",
|
||||
],
|
||||
'install': 'python -m pip install -e .[test] --verbose',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['4.2', '4.3', '5.0', '5.1']
|
||||
}
|
||||
)
|
||||
|
||||
ASTROPY_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'astropy_52',
|
||||
'pre_install': [
|
||||
r"""sed -i 's/requires = \["setuptools",/requires = \["setuptools==68.0.0",/' pyproject.toml"""
|
||||
],
|
||||
'install': 'python -m pip install -e .[test] --verbose',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['5.2', '5.3', '6.0', '6.1', '7.0']
|
||||
}
|
||||
)
|
||||
|
||||
DJANGO_CONFIG = {}
|
||||
DJANGO_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'install': 'pip install -e .',
|
||||
'conda_env': 'django_22',
|
||||
'test_cmd': 'python tests/runtests.py --verbosity 2',
|
||||
}
|
||||
for k in ['1.9', '2.2']
|
||||
}
|
||||
)
|
||||
DJANGO_CONFIG.update(
|
||||
{
|
||||
'3.2': {
|
||||
'install': 'pip install -e .',
|
||||
'conda_env': 'django_32',
|
||||
'test_cmd': 'python tests/runtests.py --verbosity 2',
|
||||
},
|
||||
'4.2': {
|
||||
'install': 'pip install -e .',
|
||||
'conda_env': 'django_42',
|
||||
'test_cmd': 'python tests/runtests.py --verbosity 2',
|
||||
},
|
||||
'5.1': {
|
||||
'install': 'pip install -e .',
|
||||
'conda_env': 'django_51',
|
||||
'test_cmd': 'python tests/runtests.py --verbosity 2',
|
||||
},
|
||||
}
|
||||
)
|
||||
SPHINX_CONFIG = {}
|
||||
SPHINX_CONFIG.update(
|
||||
{ # 1.x 版本问题,实际无用
|
||||
k: {
|
||||
'conda_env': 'sphinx_20',
|
||||
'install': 'python -m pip install -e .[test]',
|
||||
'pre_install': ["sed -i 's/pytest/pytest -rA/' tox.ini"],
|
||||
'test_cmd': 'tox --current-env -epy37 -v --',
|
||||
}
|
||||
for k in ['1.3', '1.4', '1.5', '1.6', '1.7', '1.8']
|
||||
}
|
||||
)
|
||||
SPHINX_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'sphinx_20',
|
||||
'install': 'python -m pip install -e .[test]',
|
||||
'pre_install': [
|
||||
"sed -i 's/pytest/pytest -rA/' tox.ini",
|
||||
"sed -i 's/Jinja2>=2.3/Jinja2<3.0/' setup.py",
|
||||
],
|
||||
'test_cmd': 'tox --current-env -epy37 -v --',
|
||||
}
|
||||
for k in ['2.0', '2.1', '2.2', '2.3', '2.4']
|
||||
}
|
||||
)
|
||||
SPHINX_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'sphinx_30',
|
||||
'install': 'python -m pip install -e .[test]',
|
||||
'pre_install': [
|
||||
"sed -i 's/pytest/pytest -rA/' tox.ini",
|
||||
"sed -i 's/Jinja2>=2.3/Jinja2<3.0/' setup.py",
|
||||
"sed -i 's/sphinxcontrib-applehelp/sphinxcontrib-applehelp<=1.0.7/' setup.py",
|
||||
"sed -i 's/sphinxcontrib-devhelp/sphinxcontrib-devhelp<=1.0.5/' setup.py",
|
||||
"sed -i 's/sphinxcontrib-qthelp/sphinxcontrib-qthelp<=1.0.6/' setup.py",
|
||||
"sed -i 's/alabaster>=0.7,<0.8/alabaster>=0.7,<0.7.12/' setup.py",
|
||||
"sed -i \"s/'packaging',/'packaging', 'markupsafe<=2.0.1',/\" setup.py",
|
||||
"sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp<=2.0.4/' setup.py",
|
||||
"sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml<=1.1.9/' setup.py",
|
||||
],
|
||||
'test_cmd': 'tox --current-env -epy37 -v --',
|
||||
}
|
||||
for k in ['3.0', '3.1', '3.2', '3.3', '3.4', '3.5', '4.0']
|
||||
}
|
||||
)
|
||||
SPHINX_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'sphinx_30',
|
||||
'install': 'python -m pip install -e .[test]',
|
||||
'pre_install': [
|
||||
"sed -i 's/pytest/pytest -rA/' tox.ini",
|
||||
"sed -i 's/Jinja2>=2.3/Jinja2<3.0/' setup.py",
|
||||
"sed -i 's/sphinxcontrib-applehelp/sphinxcontrib-applehelp<=1.0.7/' setup.py",
|
||||
"sed -i 's/sphinxcontrib-devhelp/sphinxcontrib-devhelp<=1.0.5/' setup.py",
|
||||
"sed -i 's/sphinxcontrib-qthelp/sphinxcontrib-qthelp<=1.0.6/' setup.py",
|
||||
"sed -i 's/alabaster>=0.7,<0.8/alabaster>=0.7,<0.7.12/' setup.py",
|
||||
"sed -i \"s/'packaging',/'packaging', 'markupsafe<=2.0.1',/\" setup.py",
|
||||
(
|
||||
"grep -q 'sphinxcontrib-htmlhelp>=2.0.0' setup.py && "
|
||||
"sed -i 's/sphinxcontrib-htmlhelp>=2.0.0/sphinxcontrib-htmlhelp>=2.0.0,<=2.0.4/' setup.py || "
|
||||
"sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp<=2.0.4/' setup.py"
|
||||
),
|
||||
(
|
||||
"grep -q 'sphinxcontrib-serializinghtml>=1.1.5' setup.py && "
|
||||
"sed -i 's/sphinxcontrib-serializinghtml>=1.1.5/sphinxcontrib-serializinghtml>=1.1.5,<=1.1.9/' setup.py || "
|
||||
"sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml<=1.1.9/' setup.py"
|
||||
),
|
||||
],
|
||||
'test_cmd': 'tox --current-env -epy37 -v --',
|
||||
}
|
||||
for k in ['4.1']
|
||||
}
|
||||
)
|
||||
SPHINX_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'sphinx_30',
|
||||
'install': 'python -m pip install -e .[test]',
|
||||
'pre_install': [
|
||||
"sed -i 's/pytest/pytest -rA/' tox.ini",
|
||||
"sed -i 's/Jinja2>=2.3/Jinja2<3.0/' setup.py",
|
||||
"sed -i 's/sphinxcontrib-applehelp/sphinxcontrib-applehelp<=1.0.7/' setup.py",
|
||||
"sed -i 's/sphinxcontrib-devhelp/sphinxcontrib-devhelp<=1.0.5/' setup.py",
|
||||
"sed -i 's/sphinxcontrib-qthelp/sphinxcontrib-qthelp<=1.0.6/' setup.py",
|
||||
"sed -i 's/alabaster>=0.7,<0.8/alabaster>=0.7,<0.7.12/' setup.py",
|
||||
"sed -i \"s/'packaging',/'packaging', 'markupsafe<=2.0.1',/\" setup.py",
|
||||
"sed -i 's/sphinxcontrib-htmlhelp>=2.0.0/sphinxcontrib-htmlhelp>=2.0.0,<=2.0.4/' setup.py",
|
||||
"sed -i 's/sphinxcontrib-serializinghtml>=1.1.5/sphinxcontrib-serializinghtml>=1.1.5,<=1.1.9/' setup.py",
|
||||
],
|
||||
'test_cmd': 'tox --current-env -epy37 -v --',
|
||||
}
|
||||
for k in ['4.2', '4.3', '4.4']
|
||||
}
|
||||
)
|
||||
SPHINX_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'sphinx_30',
|
||||
'install': 'python -m pip install -e .[test]',
|
||||
'pre_install': [
|
||||
"sed -i 's/pytest/pytest -rA/' tox.ini",
|
||||
],
|
||||
'test_cmd': 'tox --current-env -epy37 -v --',
|
||||
}
|
||||
for k in ['4.5', '5.0', '5.1', '5.2']
|
||||
}
|
||||
)
|
||||
SPHINX_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'sphinx_60',
|
||||
'install': 'python -m pip install -e .[test]',
|
||||
'pre_install': [
|
||||
"sed -i 's/pytest/pytest -rA/' tox.ini",
|
||||
],
|
||||
'test_cmd': 'tox --current-env -epy39 -v --',
|
||||
}
|
||||
for k in ['6.0', '6.2', '7.0', '7.1']
|
||||
}
|
||||
)
|
||||
SPHINX_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'sphinx_72',
|
||||
'install': 'python -m pip install -e .[test]',
|
||||
'pre_install': [
|
||||
"sed -i 's/pytest/pytest -rA/' tox.ini",
|
||||
'apt-get update && apt-get install -y graphviz',
|
||||
],
|
||||
'test_cmd': 'tox --current-env -epy39 -v --',
|
||||
}
|
||||
for k in ['7.2', '7.3', '7.4']
|
||||
}
|
||||
)
|
||||
SPHINX_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'sphinx_80',
|
||||
'install': 'python -m pip install -e .[test]',
|
||||
'pre_install': [
|
||||
"sed -i 's/pytest/pytest -rA/' tox.ini",
|
||||
],
|
||||
'test_cmd': 'tox --current-env -epy310 -v --',
|
||||
}
|
||||
for k in ['8.0', '8.1']
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
SKLEARN_CONFIG = {}
|
||||
SKLEARN_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'skl_020',
|
||||
'install': 'pip install -v --no-use-pep517 --no-build-isolation -e .',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['0.20', '0.21', '0.22']
|
||||
}
|
||||
)
|
||||
SKLEARN_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'skl_100',
|
||||
'install': 'pip install -v --no-use-pep517 --no-build-isolation -e .',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['0.23', '0.24', '1.00', '1.01', '1.02']
|
||||
}
|
||||
)
|
||||
SKLEARN_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'skl_104',
|
||||
'install': 'pip install -v --no-use-pep517 --no-build-isolation -e .',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['1.03', '1.04', '1.05']
|
||||
}
|
||||
)
|
||||
|
||||
SEABORN_CONFIG = {}
|
||||
SEABORN_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'seaborn_010',
|
||||
'install': 'pip install -e .[dev]',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['0.3', '0.4', '0.5', '0.6', '0.11', '0.12', '0.13', '0.14']
|
||||
}
|
||||
)
|
||||
|
||||
XARRAY_CONFIG = {}
|
||||
XARRAY_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'xarray_0014',
|
||||
'install': 'pip install -e .',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['0014', '0015', '0016']
|
||||
}
|
||||
)
|
||||
XARRAY_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'xarray_0017',
|
||||
'install': 'pip install -e .',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['0017', '0018', '0019', '0020', '0021']
|
||||
}
|
||||
)
|
||||
XARRAY_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'xarray_2203',
|
||||
'install': 'pip install -e .',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['2203', '2206', '2209', '2210', '2211', '2212']
|
||||
}
|
||||
)
|
||||
XARRAY_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'xarray_2303',
|
||||
'install': 'pip install -e .',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in [
|
||||
'2303',
|
||||
'2304',
|
||||
'2305',
|
||||
'2306',
|
||||
'2308',
|
||||
'2309',
|
||||
'2310',
|
||||
'2311',
|
||||
'2312',
|
||||
]
|
||||
}
|
||||
)
|
||||
XARRAY_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'xarray_2401',
|
||||
'install': 'pip install -e .',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['2401', '2402', '2403', '2405', '2407', '2409', '2410', '2411']
|
||||
}
|
||||
)
|
||||
|
||||
SKLEARN_CONFIG = {}
|
||||
SKLEARN_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'skl_020',
|
||||
'install': 'pip install -v --no-use-pep517 --no-build-isolation -e .',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['0.20', '0.21', '0.22']
|
||||
}
|
||||
)
|
||||
SKLEARN_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'skl_100',
|
||||
'install': 'pip install -v --no-use-pep517 --no-build-isolation -e .',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['0.23', '0.24', '1.00', '1.01', '1.02']
|
||||
}
|
||||
)
|
||||
SKLEARN_CONFIG.update(
|
||||
{
|
||||
k: {
|
||||
'conda_env': 'skl_104',
|
||||
'install': 'pip install -v --no-use-pep517 --no-build-isolation -e .',
|
||||
'test_cmd': 'pytest --color=no -rA',
|
||||
}
|
||||
for k in ['1.03', '1.04', '1.05', '1.06', '1.07']
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
MAP_REPO_TO_CONFIG = {
|
||||
'pydata/xarray': XARRAY_CONFIG,
|
||||
'mwaskom/seaborn': SEABORN_CONFIG,
|
||||
'scikit-learn/scikit-learn': SKLEARN_CONFIG,
|
||||
'sphinx-doc/sphinx': SPHINX_CONFIG,
|
||||
'django/django': DJANGO_CONFIG,
|
||||
'astropy/astropy': ASTROPY_CONFIG,
|
||||
'pylint-dev/pylint': PYLINT_CONFIG,
|
||||
'pytest-dev/pytest': PYTEST_CONFIG,
|
||||
'psf/requests': REQUESTS_CONFIG,
|
||||
'sympy/sympy': SYMPY_CONFIG,
|
||||
'matplotlib/matplotlib': MATPLOTLIB_CONFIG,
|
||||
}
|
||||
@@ -0,0 +1,65 @@
|
||||
<uploaded_files>
|
||||
/workspace/{{ workspace_dir_name }}
|
||||
</uploaded_files>
|
||||
|
||||
I've uploaded a python code repository in the directory {{ workspace_dir_name }}. Consider the following issue description:
|
||||
|
||||
<doc_change>
|
||||
{{ instance.problem_statement }}
|
||||
</doc_change>
|
||||
|
||||
Can you help me add the new features to the repository based on the changes in the <doc_change>?
|
||||
I've already taken care of all changes to any of the test files described in the <doc_change>. This means you DON'T have to modify the testing logic or any of the tests in any way!
|
||||
Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.
|
||||
Your task is to make the minimal changes to non-test files in the /workspace/{{ workspace_dir_name }} directory to implement the new features required by the documentation updates.
|
||||
|
||||
Follow these phases to resolve the issue:
|
||||
|
||||
Phase 1. READING: read the requirements and reword it in clearer terms
|
||||
1.1 If there are code or config snippets. Express in words any best practices or conventions in them.
|
||||
1.2 Hightlight method names, variables, file names, stack traces, and technical details, particularly those related to new features.
|
||||
1.3 Explain the new feature requirements in clear terms.
|
||||
1.4 Specify functional scope and expected behavior of new features.
|
||||
1.5 Hightlight any best practices to take into account when developing and testing the new feature.
|
||||
|
||||
Phase 2. RUNNING: install and run the functionality in the repository to validate the new features
|
||||
2.1 Follow the readme.
|
||||
2.2 Install the environment and anything needed.
|
||||
2.2 Iterate and figure out how to validate the newly added features.
|
||||
|
||||
Phase 3. EXPLORATION: find the files related to the new features and possible implementation solutions
|
||||
3.1 Use `grep` to search for relevant methods, classes, keywords and feature requirements.
|
||||
3.2 Identify all files related to the new features.
|
||||
3.3 Propose the methods and files to implement the new features and explain why.
|
||||
3.4 From the possible file locations, select the most likely location to implement the new features.
|
||||
|
||||
Phase 4. TEST CREATION: before implementing any new features, create a script to validate the feature's correctness.
|
||||
4.1 Look at existing test files in the repository to understand the test format/structure.
|
||||
4.2 Create a minimal validation script to verify the newly added features.
|
||||
4.3 Run the validation script to confirm the new features are successfully added and working as expected.
|
||||
4.4 Adjust the validation script as necessary to ensure the new features fully meet the requirements.
|
||||
|
||||
Phase 5. FEATURE ANALYSIS: state clearly the new feature and how to implement it
|
||||
5.1 State clearly what the new feature is.
|
||||
5.2 State clearly where the feature should be implemented.
|
||||
5.3 State clearly how the test validates the new feature.
|
||||
5.4 State clearly the best practices to take into account when implementing the new feature.
|
||||
5.5 State clearly how to implement the new feature.
|
||||
|
||||
Phase 6. FEATURE IMPLEMENTATION: edit the source code to implement your chosen solution for the new feature
|
||||
6.1 Make minimal, focused changes to implement the new feature.
|
||||
|
||||
Phase 7. VERIFICATION: Test your new feature thoroughly.
|
||||
7.1 Run your validation script to verify the new feature works as expected.
|
||||
7.2 Add edge cases to your test script to ensure comprehensive coverage of the new feature.
|
||||
7.3 Run existing tests related to the modified code to ensure you haven't broken anything.
|
||||
|
||||
Phase 8. FINAL REVIEW: Carefully re-read the feature requirements and compare your changes with the base commit {{ instance.base_commit }}
|
||||
8.1 Ensure you've fully implemented all required features.
|
||||
8.2 Run any tests in the repository related to:
|
||||
8.2.1 The new features you are adding
|
||||
8.2.2 The files you modified
|
||||
8.2.3 The functions you changed
|
||||
8.3 If any tests fail, revise your implementation until all tests pass and the new feature works as expected.
|
||||
|
||||
Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.
|
||||
@@ -0,0 +1,39 @@
|
||||
"""Mapping instance_id to resource_factor.
|
||||
|
||||
Different instances may have different resource requirements.
|
||||
e.g., some instances may require more memory/CPU to run inference.
|
||||
This file tracks the resource requirements of different instances.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
|
||||
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
DEFAULT_RUNTIME_RESOURCE_FACTOR = int(
|
||||
os.environ.get('DEFAULT_RUNTIME_RESOURCE_FACTOR', 1)
|
||||
)
|
||||
|
||||
# dataset to resource mapping
|
||||
_global_resource_mapping: dict[str, dict[str, float]] = {}
|
||||
|
||||
|
||||
def get_resource_mapping(dataset_name: str) -> dict[str, float]:
|
||||
if dataset_name not in _global_resource_mapping:
|
||||
file_path = os.path.join(CUR_DIR, f'{dataset_name}.json')
|
||||
if not os.path.exists(file_path):
|
||||
logger.info(f'Resource mapping for {dataset_name} not found.')
|
||||
return None
|
||||
|
||||
with open(file_path, 'r') as f:
|
||||
_global_resource_mapping[dataset_name] = json.load(f)
|
||||
logger.debug(f'Loaded resource mapping for {dataset_name}')
|
||||
return _global_resource_mapping[dataset_name]
|
||||
|
||||
|
||||
def get_instance_resource_factor(dataset_name: str, instance_id: str) -> int:
|
||||
resource_mapping = get_resource_mapping(dataset_name)
|
||||
if resource_mapping is None:
|
||||
return DEFAULT_RUNTIME_RESOURCE_FACTOR
|
||||
return int(resource_mapping.get(instance_id, DEFAULT_RUNTIME_RESOURCE_FACTOR))
|
||||
@@ -0,0 +1,909 @@
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from typing import Any, Literal
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import toml
|
||||
from datasets import load_dataset
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
|
||||
import openhands.agenthub
|
||||
from evaluation.benchmarks.nocode_bench.binary_patch_utils import (
|
||||
remove_binary_diffs,
|
||||
remove_binary_files_from_git,
|
||||
)
|
||||
from evaluation.benchmarks.nocode_bench.consistants import MAP_REPO_TO_CONFIG
|
||||
from evaluation.benchmarks.nocode_bench.resource.mapping import (
|
||||
get_instance_resource_factor,
|
||||
)
|
||||
from evaluation.benchmarks.nocode_bench.scripts.utils.evaluation_utils import (
|
||||
run_evaluation_nocode_bench,
|
||||
)
|
||||
from evaluation.utils.shared import (
|
||||
EvalException,
|
||||
EvalMetadata,
|
||||
EvalOutput,
|
||||
assert_and_raise,
|
||||
codeact_user_response,
|
||||
get_default_sandbox_config_for_eval,
|
||||
get_metrics,
|
||||
is_fatal_evaluation_error,
|
||||
make_metadata,
|
||||
prepare_dataset,
|
||||
reset_logger_for_multiprocessing,
|
||||
update_llm_config_for_completions_logging,
|
||||
)
|
||||
from openhands.controller.state.state import State
|
||||
from openhands.core.config import (
|
||||
AgentConfig,
|
||||
OpenHandsConfig,
|
||||
get_evaluation_parser,
|
||||
get_llm_config_arg,
|
||||
)
|
||||
from openhands.core.config.condenser_config import NoOpCondenserConfig
|
||||
from openhands.core.config.utils import get_condenser_config_arg
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.critic import AgentFinishedCritic
|
||||
from openhands.events.action import CmdRunAction, FileReadAction, MessageAction
|
||||
from openhands.events.observation import (
|
||||
CmdOutputObservation,
|
||||
ErrorObservation,
|
||||
FileReadObservation,
|
||||
)
|
||||
from openhands.events.serialization.event import event_from_dict, event_to_dict
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
from openhands.utils.shutdown_listener import sleep_if_should_continue
|
||||
|
||||
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
|
||||
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
|
||||
ENABLE_LLM_EDITOR = os.environ.get('ENABLE_LLM_EDITOR', 'false').lower() == 'true'
|
||||
BenchMode = Literal['swe', 'swt', 'swt-ci']
|
||||
|
||||
# Global variable to track dataset type
|
||||
DATASET_TYPE = 'nc_bench'
|
||||
|
||||
|
||||
def set_dataset_type(dataset_name: str) -> str:
|
||||
"""Set dataset type based on dataset name."""
|
||||
global DATASET_TYPE
|
||||
DATASET_TYPE = 'nc_bench'
|
||||
|
||||
logger.info(f'Dataset type set to: {DATASET_TYPE}')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
}
|
||||
|
||||
|
||||
def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
|
||||
return f'{instance.repo.split("/")[-1]}'
|
||||
|
||||
|
||||
def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
|
||||
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
|
||||
metadata.details['mode']
|
||||
|
||||
# Determine the template file based on mode and LLM
|
||||
|
||||
template_name = 'nc.j2'
|
||||
|
||||
# Set up Jinja2 environment
|
||||
# Assuming templates are in 'evaluation/benchmarks/swe_bench/prompts' relative to this script
|
||||
prompts_dir = os.path.join(os.path.dirname(__file__), 'prompts')
|
||||
env = Environment(loader=FileSystemLoader(prompts_dir))
|
||||
template = env.get_template(template_name)
|
||||
|
||||
# Prepare context for rendering
|
||||
context = {
|
||||
'instance': instance,
|
||||
'workspace_dir_name': workspace_dir_name,
|
||||
'metadata': metadata, # Pass metadata if needed in templates
|
||||
}
|
||||
|
||||
context['test_instructions'] = '' # Ensure it's defined for other modes
|
||||
|
||||
# Render the instruction
|
||||
instruction = template.render(context)
|
||||
|
||||
if RUN_WITH_BROWSING:
|
||||
instruction += (
|
||||
'<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
|
||||
)
|
||||
|
||||
if 'image_assets' in instance:
|
||||
assets = json.loads(instance['image_assets'])
|
||||
assert 'problem_statement' in assets, (
|
||||
'problem_statement is required in image_assets'
|
||||
)
|
||||
image_urls = assets['problem_statement']
|
||||
return MessageAction(content=instruction, image_urls=image_urls)
|
||||
return MessageAction(content=instruction)
|
||||
|
||||
|
||||
DEFAULT_DOCKER_IMAGE_PREFIX = os.environ.get(
|
||||
'EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/'
|
||||
)
|
||||
logger.info(f'Default docker image prefix: {DEFAULT_DOCKER_IMAGE_PREFIX}')
|
||||
|
||||
|
||||
def get_instance_docker_image(
|
||||
instance_id: str,
|
||||
swebench_official_image: bool = False,
|
||||
) -> str:
|
||||
if swebench_official_image:
|
||||
# Official NoCode-Bench image
|
||||
image_name = f'ncbench_{instance_id}:latest'.lower()
|
||||
logger.debug(f'Using official NoCode-Bench image: {image_name}')
|
||||
return image_name
|
||||
else:
|
||||
raise
|
||||
|
||||
|
||||
def get_config(
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
) -> OpenHandsConfig:
|
||||
# We use a different instance image for the each instance of NoCode-bench eval
|
||||
use_swebench_official_image = True
|
||||
|
||||
base_container_image = get_instance_docker_image(
|
||||
instance['instance_id'],
|
||||
swebench_official_image=use_swebench_official_image,
|
||||
)
|
||||
logger.info(
|
||||
f'Using instance container image: {base_container_image}. '
|
||||
f'Please make sure this image exists. '
|
||||
f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
|
||||
)
|
||||
|
||||
sandbox_config = get_default_sandbox_config_for_eval()
|
||||
sandbox_config.base_container_image = base_container_image
|
||||
sandbox_config.enable_auto_lint = True
|
||||
sandbox_config.use_host_network = False
|
||||
# Add platform to the sandbox config to solve issue 4401
|
||||
sandbox_config.platform = 'linux/amd64'
|
||||
sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
|
||||
dataset_name=metadata.dataset,
|
||||
instance_id=instance['instance_id'],
|
||||
)
|
||||
|
||||
config = OpenHandsConfig(
|
||||
default_agent=metadata.agent_class,
|
||||
run_as_openhands=False,
|
||||
max_iterations=metadata.max_iterations,
|
||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||
sandbox=sandbox_config,
|
||||
# do not mount workspace
|
||||
workspace_base=None,
|
||||
workspace_mount_path=None,
|
||||
)
|
||||
|
||||
config.set_llm_config(
|
||||
update_llm_config_for_completions_logging(
|
||||
metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
|
||||
)
|
||||
)
|
||||
# get 'draft_editor' config if exists
|
||||
config.set_llm_config(get_llm_config_arg('draft_editor'), 'draft_editor')
|
||||
|
||||
agent_config = AgentConfig(
|
||||
enable_jupyter=False,
|
||||
enable_browsing=RUN_WITH_BROWSING,
|
||||
enable_llm_editor=ENABLE_LLM_EDITOR,
|
||||
enable_mcp=False,
|
||||
condenser=metadata.condenser_config,
|
||||
enable_prompt_extensions=False,
|
||||
)
|
||||
config.set_agent_config(agent_config)
|
||||
return config
|
||||
|
||||
|
||||
def make_serializable(obj):
|
||||
if isinstance(obj, pd.Series):
|
||||
obj = obj.to_dict()
|
||||
if isinstance(obj, dict):
|
||||
return {k: make_serializable(v) for k, v in obj.items()}
|
||||
elif isinstance(obj, list):
|
||||
return [make_serializable(v) for v in obj]
|
||||
elif isinstance(obj, tuple):
|
||||
return tuple(make_serializable(v) for v in obj)
|
||||
elif isinstance(obj, np.ndarray):
|
||||
return obj.tolist()
|
||||
elif isinstance(obj, pd.Timestamp):
|
||||
return str(obj)
|
||||
else:
|
||||
return obj
|
||||
|
||||
|
||||
def initialize_runtime(
|
||||
runtime: Runtime,
|
||||
instance: pd.Series, # this argument is not required
|
||||
metadata: EvalMetadata,
|
||||
):
|
||||
"""Initialize the runtime for the agent.
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info('-' * 30)
|
||||
logger.info('BEGIN Runtime Initialization Fn')
|
||||
logger.info('-' * 30)
|
||||
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
|
||||
obs: CmdOutputObservation
|
||||
|
||||
# Set instance id and git configuration
|
||||
action = CmdRunAction(
|
||||
command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc && git config --global core.pager "" && git config --global diff.binary false"""
|
||||
)
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
obs.exit_code == 0,
|
||||
f'Failed to export SWE_INSTANCE_ID and configure git: {str(obs)}',
|
||||
)
|
||||
|
||||
action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
|
||||
|
||||
# inject the init script
|
||||
script_dir = os.path.dirname(__file__)
|
||||
|
||||
# inject the instance info
|
||||
action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
obs.exit_code == 0,
|
||||
f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
|
||||
)
|
||||
|
||||
swe_instance_json_name = 'swe-bench-instance.json'
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Construct the full path for the desired file name within the temporary directory
|
||||
temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
|
||||
# Write to the file with the desired name within the temporary directory
|
||||
|
||||
with open(temp_file_path, 'w') as f:
|
||||
if not isinstance(instance, dict):
|
||||
instance_dict = make_serializable(instance)
|
||||
else:
|
||||
instance_dict = dict(instance)
|
||||
|
||||
if DATASET_TYPE == 'nc_bench':
|
||||
config = MAP_REPO_TO_CONFIG.get(instance['repo'], {}).get(
|
||||
instance['version'], []
|
||||
)
|
||||
docker_conda_env_name = config['conda_env']
|
||||
instance_dict['conda_env'] = docker_conda_env_name
|
||||
|
||||
json.dump([instance_dict], f)
|
||||
|
||||
# Copy the file to the desired location
|
||||
runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
|
||||
|
||||
# inject the instance swe entry
|
||||
entry_script_path = 'instance_nc_entry.sh'
|
||||
|
||||
runtime.copy_to(
|
||||
str(os.path.join(script_dir, f'scripts/setup/{entry_script_path}')),
|
||||
'/swe_util/',
|
||||
)
|
||||
|
||||
action = CmdRunAction(command='cat ~/.bashrc')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')
|
||||
|
||||
action = CmdRunAction(command='source ~/.bashrc')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
if isinstance(obs, ErrorObservation):
|
||||
logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
|
||||
assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')
|
||||
|
||||
action = CmdRunAction(command=f'source /swe_util/{entry_script_path}')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
obs.exit_code == 0,
|
||||
f'Failed to source /swe_util/{entry_script_path}: {str(obs)}',
|
||||
)
|
||||
|
||||
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
obs.exit_code == 0,
|
||||
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
|
||||
)
|
||||
|
||||
action = CmdRunAction(command='git reset --hard')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {str(obs)}')
|
||||
|
||||
action = CmdRunAction(
|
||||
command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
|
||||
)
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')
|
||||
|
||||
if DATASET_TYPE != 'Multimodal' and DATASET_TYPE != 'SWE-bench-Live':
|
||||
# Only for non-multimodal datasets, we need to activate the testbed environment for Python
|
||||
# SWE-Bench multimodal datasets and SWE-bench-Live are not using the testbed environment
|
||||
action = CmdRunAction(command='which python')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
obs.exit_code == 0,
|
||||
f'Expected to find python interpreter, but got: {str(obs)}',
|
||||
)
|
||||
|
||||
logger.info('-' * 30)
|
||||
logger.info('END Runtime Initialization Fn')
|
||||
logger.info('-' * 30)
|
||||
|
||||
|
||||
def complete_runtime(
|
||||
runtime: Runtime,
|
||||
instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name
|
||||
) -> dict[str, Any]:
|
||||
"""Complete the runtime for the agent.
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
If you need to do something in the sandbox to get the correctness metric after
|
||||
the agent has run, modify this function.
|
||||
"""
|
||||
logger.info('-' * 30)
|
||||
logger.info('BEGIN Runtime Completion Fn')
|
||||
logger.info('-' * 30)
|
||||
obs: CmdOutputObservation
|
||||
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
|
||||
|
||||
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
if obs.exit_code == -1:
|
||||
# The previous command is still running
|
||||
# We need to kill previous command
|
||||
logger.info('The previous command is still running, trying to kill it...')
|
||||
action = CmdRunAction(command='C-c')
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
# Then run the command again
|
||||
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
if obs.exit_code == -1:
|
||||
# The previous command is still running
|
||||
# We need to kill previous command
|
||||
logger.info('The previous command is still running, trying to ctrl+z it...')
|
||||
action = CmdRunAction(command='C-z')
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
# Then run the command again
|
||||
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
assert_and_raise(
|
||||
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
|
||||
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
|
||||
)
|
||||
|
||||
action = CmdRunAction(command='git config --global core.pager ""')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
|
||||
f'Failed to git config --global core.pager "": {str(obs)}',
|
||||
)
|
||||
|
||||
# First check for any git repositories in subdirectories
|
||||
action = CmdRunAction(command='find . -type d -name .git -not -path "./.git"')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
|
||||
f'Failed to find git repositories: {str(obs)}',
|
||||
)
|
||||
|
||||
git_dirs = [p for p in obs.content.strip().split('\n') if p]
|
||||
if git_dirs:
|
||||
# Remove all .git directories in subdirectories
|
||||
for git_dir in git_dirs:
|
||||
action = CmdRunAction(command=f'rm -rf "{git_dir}"')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
|
||||
f'Failed to remove git directory {git_dir}: {str(obs)}',
|
||||
)
|
||||
|
||||
# add all files
|
||||
action = CmdRunAction(command='git add -A')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
|
||||
f'Failed to git add -A: {str(obs)}',
|
||||
)
|
||||
|
||||
# Remove binary files from git staging
|
||||
action = CmdRunAction(command=remove_binary_files_from_git())
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
|
||||
f'Failed to remove binary files: {str(obs)}',
|
||||
)
|
||||
|
||||
n_retries = 0
|
||||
git_patch = None
|
||||
while n_retries < 5:
|
||||
action = CmdRunAction(
|
||||
command=f'git diff --no-color --cached {instance["base_commit"]} > patch.diff'
|
||||
)
|
||||
action.set_hard_timeout(max(300 + 100 * n_retries, 600))
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
n_retries += 1
|
||||
if isinstance(obs, CmdOutputObservation):
|
||||
if obs.exit_code == 0:
|
||||
# Read the patch file
|
||||
action = FileReadAction(path='patch.diff')
|
||||
action.set_hard_timeout(max(300 + 100 * n_retries, 600))
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
if isinstance(obs, FileReadObservation):
|
||||
git_patch = obs.content
|
||||
break
|
||||
elif isinstance(obs, ErrorObservation):
|
||||
# Fall back to cat "patch.diff" to get the patch
|
||||
assert 'File could not be decoded as utf-8' in obs.content
|
||||
action = CmdRunAction(command='cat patch.diff')
|
||||
action.set_hard_timeout(max(300 + 100 * n_retries, 600))
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
assert isinstance(obs, CmdOutputObservation) and obs.exit_code == 0
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
git_patch = obs.content
|
||||
break
|
||||
else:
|
||||
assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
|
||||
else:
|
||||
logger.info('Failed to get git diff, retrying...')
|
||||
sleep_if_should_continue(10)
|
||||
elif isinstance(obs, ErrorObservation):
|
||||
logger.error(f'Error occurred: {obs.content}. Retrying...')
|
||||
sleep_if_should_continue(10)
|
||||
else:
|
||||
assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
|
||||
|
||||
assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')
|
||||
|
||||
# Remove binary diffs from the patch
|
||||
git_patch = remove_binary_diffs(git_patch)
|
||||
|
||||
logger.info('-' * 30)
|
||||
logger.info('END Runtime Completion Fn')
|
||||
logger.info('-' * 30)
|
||||
return {'git_patch': git_patch}
|
||||
|
||||
|
||||
def process_instance(
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
runtime_failure_count: int = 0,
|
||||
) -> EvalOutput:
|
||||
config = get_config(instance, metadata)
|
||||
|
||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
if reset_logger:
|
||||
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
|
||||
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
|
||||
else:
|
||||
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
|
||||
|
||||
# Increase resource_factor with increasing attempt_id
|
||||
if runtime_failure_count > 0:
|
||||
config.sandbox.remote_runtime_resource_factor = min(
|
||||
config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
|
||||
8,
|
||||
)
|
||||
logger.warning(
|
||||
f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
|
||||
)
|
||||
|
||||
metadata = copy.deepcopy(metadata)
|
||||
metadata.details['runtime_failure_count'] = runtime_failure_count
|
||||
metadata.details['remote_runtime_resource_factor'] = (
|
||||
config.sandbox.remote_runtime_resource_factor
|
||||
)
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
|
||||
try:
|
||||
initialize_runtime(runtime, instance, metadata)
|
||||
|
||||
message_action = get_instruction(instance, metadata)
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State | None = asyncio.run(
|
||||
run_controller(
|
||||
config=config,
|
||||
initial_user_action=message_action,
|
||||
runtime=runtime,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
metadata.agent_class
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
# if fatal error, throw EvalError to trigger re-run
|
||||
if is_fatal_evaluation_error(state.last_error):
|
||||
raise EvalException('Fatal error detected: ' + state.last_error)
|
||||
|
||||
# ======= THIS IS SWE-Bench specific =======
|
||||
# Get git patch
|
||||
if DATASET_TYPE == 'SWE-bench-Live':
|
||||
from evaluation.benchmarks.swe_bench.live_utils import (
|
||||
complete_runtime as complete_runtime_fn,
|
||||
)
|
||||
else:
|
||||
complete_runtime_fn = complete_runtime
|
||||
return_val = complete_runtime_fn(runtime, instance)
|
||||
git_patch = return_val['git_patch']
|
||||
logger.info(
|
||||
f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
|
||||
)
|
||||
finally:
|
||||
runtime.close()
|
||||
# ==========================================
|
||||
|
||||
# ======= Attempt to evaluate the agent's edits =======
|
||||
# we use eval_infer.sh to evaluate the agent's edits, not here
|
||||
# because the agent may alter the environment / testcases
|
||||
test_result = {
|
||||
'git_patch': git_patch,
|
||||
}
|
||||
|
||||
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
|
||||
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
|
||||
if state is None:
|
||||
raise ValueError('State should not be None.')
|
||||
|
||||
# NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
|
||||
histories = [event_to_dict(event) for event in state.history]
|
||||
metrics = get_metrics(state)
|
||||
|
||||
# Save the output
|
||||
instruction = message_action.content
|
||||
if message_action.image_urls:
|
||||
instruction += (
|
||||
'\n\n<image_urls>' + '\n'.join(message_action.image_urls) + '</image_urls>'
|
||||
)
|
||||
output = EvalOutput(
|
||||
instance_id=instance.instance_id,
|
||||
instruction=instruction,
|
||||
instance=instance.to_dict(), # SWE Bench specific
|
||||
test_result=test_result,
|
||||
metadata=metadata,
|
||||
history=histories,
|
||||
metrics=metrics,
|
||||
error=state.last_error if state and state.last_error else None,
|
||||
)
|
||||
return output
|
||||
|
||||
|
||||
def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
|
||||
file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'r') as file:
|
||||
data = toml.load(file)
|
||||
if 'selected_ids' in data:
|
||||
selected_ids = data['selected_ids']
|
||||
logger.info(
|
||||
f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
|
||||
)
|
||||
subset = dataset[dataset[filter_column].isin(selected_ids)]
|
||||
logger.info(f'Retained {subset.shape[0]} tasks after filtering')
|
||||
return subset
|
||||
if 'selected_repos' in data:
|
||||
# repos for the swe-bench instances:
|
||||
# ['astropy/astropy', 'django/django', 'matplotlib/matplotlib', 'mwaskom/seaborn', 'pallets/flask', 'psf/requests', 'pydata/xarray', 'pylint-dev/pylint', 'pytest-dev/pytest', 'scikit-learn/scikit-learn', 'sphinx-doc/sphinx', 'sympy/sympy']
|
||||
selected_repos = data['selected_repos']
|
||||
if isinstance(selected_repos, str):
|
||||
selected_repos = [selected_repos]
|
||||
assert isinstance(selected_repos, list)
|
||||
logger.info(
|
||||
f'Filtering {selected_repos} tasks from "selected_repos"...'
|
||||
)
|
||||
subset = dataset[dataset['repo'].isin(selected_repos)]
|
||||
logger.info(f'Retained {subset.shape[0]} tasks after filtering')
|
||||
return subset
|
||||
|
||||
skip_ids = os.environ.get('SKIP_IDS', '').split(',')
|
||||
if len(skip_ids) > 0:
|
||||
logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
|
||||
return dataset[~dataset[filter_column].isin(skip_ids)]
|
||||
return dataset
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = get_evaluation_parser()
|
||||
parser.add_argument(
|
||||
'--dataset',
|
||||
type=str,
|
||||
default='NoCode-bench/NoCode-bench_Verified',
|
||||
help='data set to evaluate on, either full-test or lite-test',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--split',
|
||||
type=str,
|
||||
default='test',
|
||||
help='split to evaluate on',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--mode',
|
||||
type=str,
|
||||
default='swe',
|
||||
choices=['swe', 'swt', 'swt-ci'],
|
||||
help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
|
||||
)
|
||||
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
# so we don't need to manage file uploading to OpenHands's repo
|
||||
|
||||
dataset = load_dataset(args.dataset, args.split)
|
||||
|
||||
# Set the global dataset type based on dataset name
|
||||
set_dataset_type(args.dataset)
|
||||
|
||||
swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
|
||||
logger.info(
|
||||
f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
|
||||
)
|
||||
|
||||
llm_config = None
|
||||
if args.llm_config:
|
||||
llm_config = get_llm_config_arg(args.llm_config)
|
||||
llm_config.log_completions = True
|
||||
# modify_params must be False for evaluation purpose, for reproducibility and accurancy of results
|
||||
llm_config.modify_params = False
|
||||
|
||||
if llm_config is None:
|
||||
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
|
||||
|
||||
# Get condenser config from environment variable
|
||||
condenser_name = os.environ.get('EVAL_CONDENSER')
|
||||
if condenser_name:
|
||||
condenser_config = get_condenser_config_arg(condenser_name)
|
||||
if condenser_config is None:
|
||||
raise ValueError(
|
||||
f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}'
|
||||
)
|
||||
else:
|
||||
# If no specific condenser config is provided via env var, default to NoOpCondenser
|
||||
condenser_config = NoOpCondenserConfig()
|
||||
logger.debug(
|
||||
'No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.'
|
||||
)
|
||||
|
||||
details = {'mode': args.mode}
|
||||
_agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
|
||||
|
||||
dataset_descrption = (
|
||||
args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
|
||||
)
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
dataset_descrption,
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
details=details,
|
||||
condenser_config=condenser_config,
|
||||
)
|
||||
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
print(f'### OUTPUT FILE: {output_file} ###')
|
||||
|
||||
# Run evaluation in iterative mode:
|
||||
# If a rollout fails to output AgentFinishAction, we will try again until it succeeds OR total 3 attempts have been made.
|
||||
ITERATIVE_EVAL_MODE = (
|
||||
os.environ.get('ITERATIVE_EVAL_MODE', 'false').lower() == 'true'
|
||||
)
|
||||
ITERATIVE_EVAL_MODE_MAX_ATTEMPTS = int(
|
||||
os.environ.get('ITERATIVE_EVAL_MODE_MAX_ATTEMPTS', '3')
|
||||
)
|
||||
|
||||
if not ITERATIVE_EVAL_MODE:
|
||||
# load the dataset
|
||||
instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)
|
||||
if len(instances) > 0 and not isinstance(
|
||||
instances['PASS2PASS'][instances['PASS2PASS'].index[0]], str
|
||||
):
|
||||
for col in ['PASS2PASS', 'FAIL2PASS']:
|
||||
instances[col] = instances[col].apply(lambda x: str(x))
|
||||
|
||||
run_evaluation_nocode_bench(
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
timeout_seconds=8
|
||||
* 60
|
||||
* 60, # 8 hour PER instance should be more than enough
|
||||
max_retries=5,
|
||||
)
|
||||
else:
|
||||
critic = AgentFinishedCritic()
|
||||
|
||||
def get_cur_output_file_path(attempt: int) -> str:
|
||||
return (
|
||||
f'{output_file.removesuffix(".jsonl")}.critic_attempt_{attempt}.jsonl'
|
||||
)
|
||||
|
||||
eval_ids = None
|
||||
for attempt in range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1):
|
||||
cur_output_file = get_cur_output_file_path(attempt)
|
||||
logger.info(
|
||||
f'Running evaluation with critic {critic.__class__.__name__} for attempt {attempt} of {ITERATIVE_EVAL_MODE_MAX_ATTEMPTS}.'
|
||||
)
|
||||
|
||||
# For deterministic eval, we set temperature to 0.1 for (>1) attempt
|
||||
# so hopefully we get slightly different results
|
||||
if attempt > 1 and metadata.llm_config.temperature == 0:
|
||||
logger.info(
|
||||
f'Detected temperature is 0 for (>1) attempt {attempt}. Setting temperature to 0.1...'
|
||||
)
|
||||
metadata.llm_config.temperature = 0.1
|
||||
|
||||
# Load instances - at first attempt, we evaluate all instances
|
||||
# On subsequent attempts, we only evaluate the instances that failed the previous attempt determined by critic
|
||||
instances = prepare_dataset(
|
||||
swe_bench_tests, cur_output_file, args.eval_n_limit, eval_ids=eval_ids
|
||||
)
|
||||
if len(instances) > 0 and not isinstance(
|
||||
instances['PASS2PASS'][instances['PASS2PASS'].index[0]], str
|
||||
):
|
||||
for col in ['PASS2PASS', 'FAIL2PASS']:
|
||||
instances[col] = instances[col].apply(lambda x: str(x))
|
||||
|
||||
# Run evaluation - but save them to cur_output_file
|
||||
logger.info(
|
||||
f'Evaluating {len(instances)} instances for attempt {attempt}...'
|
||||
)
|
||||
run_evaluation_nocode_bench(
|
||||
instances,
|
||||
metadata,
|
||||
cur_output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
timeout_seconds=8
|
||||
* 60
|
||||
* 60, # 8 hour PER instance should be more than enough
|
||||
max_retries=5,
|
||||
)
|
||||
|
||||
# When eval is done, we update eval_ids to the instances that failed the current attempt
|
||||
instances_failed = []
|
||||
logger.info(
|
||||
f'Use critic {critic.__class__.__name__} to check {len(instances)} instances for attempt {attempt}...'
|
||||
)
|
||||
with open(cur_output_file, 'r') as f:
|
||||
for line in f:
|
||||
instance = json.loads(line)
|
||||
try:
|
||||
history = [
|
||||
event_from_dict(event) for event in instance['history']
|
||||
]
|
||||
critic_result = critic.evaluate(
|
||||
history, instance['test_result'].get('git_patch', '')
|
||||
)
|
||||
if not critic_result.success:
|
||||
instances_failed.append(instance['instance_id'])
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f'Error loading history for instance {instance["instance_id"]}: {e}'
|
||||
)
|
||||
instances_failed.append(instance['instance_id'])
|
||||
logger.info(
|
||||
f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
|
||||
)
|
||||
eval_ids = instances_failed
|
||||
|
||||
# If no instances failed, we break
|
||||
if len(instances_failed) == 0:
|
||||
break
|
||||
|
||||
# Then we should aggregate the results from all attempts into the original output file
|
||||
# and remove the intermediate files
|
||||
logger.info(
|
||||
'Aggregating results from all attempts into the original output file...'
|
||||
)
|
||||
fout = open(output_file, 'w')
|
||||
added_instance_ids = set()
|
||||
for attempt in reversed(range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1)):
|
||||
cur_output_file = get_cur_output_file_path(attempt)
|
||||
if not os.path.exists(cur_output_file):
|
||||
logger.warning(
|
||||
f'Intermediate output file {cur_output_file} does not exist. Skipping...'
|
||||
)
|
||||
continue
|
||||
|
||||
with open(cur_output_file, 'r') as f:
|
||||
for line in f:
|
||||
instance = json.loads(line)
|
||||
# Also make sure git_patch is not empty - otherwise we fall back to previous attempt (empty patch is worse than anything else)
|
||||
if (
|
||||
instance['instance_id'] not in added_instance_ids
|
||||
and instance['test_result'].get('git_patch', '').strip()
|
||||
):
|
||||
fout.write(line)
|
||||
added_instance_ids.add(instance['instance_id'])
|
||||
logger.info(
|
||||
f'Aggregated instances from {cur_output_file}. Total instances added so far: {len(added_instance_ids)}'
|
||||
)
|
||||
fout.close()
|
||||
logger.info(
|
||||
f'Done! Total {len(added_instance_ids)} instances added to {output_file}'
|
||||
)
|
||||
@@ -0,0 +1,33 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
|
||||
def main(output_jsonl: str):
|
||||
with open(output_jsonl, 'r') as f:
|
||||
for line in f:
|
||||
try:
|
||||
output = json.loads(line)
|
||||
pred = {
|
||||
'instance_id': output['instance_id'],
|
||||
'model_name_or_path': output['metadata']['llm_config']['model'],
|
||||
'model_patch': output['test_result']['git_patch'],
|
||||
}
|
||||
except Exception as e:
|
||||
print(
|
||||
f'Error while reading output of instance {output["instance_id"]}: {e}'
|
||||
)
|
||||
|
||||
print(json.dumps(pred))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--output_jsonl',
|
||||
type=str,
|
||||
required=True,
|
||||
help='Path to the prediction file (.../outputs.jsonl)',
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args.output_jsonl)
|
||||
@@ -0,0 +1,104 @@
|
||||
import argparse
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
|
||||
|
||||
def verify_instance_costs(row: pd.Series) -> float:
|
||||
"""
|
||||
Verifies that the accumulated_cost matches the sum of individual costs in metrics.
|
||||
Also checks for duplicate consecutive costs which might indicate buggy counting.
|
||||
If the consecutive costs are identical, the file is affected by this bug:
|
||||
https://github.com/All-Hands-AI/OpenHands/issues/5383
|
||||
|
||||
Args:
|
||||
row: DataFrame row containing instance data with metrics
|
||||
Returns:
|
||||
float: The verified total cost for this instance (corrected if needed)
|
||||
"""
|
||||
try:
|
||||
metrics = row.get('metrics')
|
||||
if not metrics:
|
||||
logger.warning(f'Instance {row["instance_id"]}: No metrics found')
|
||||
return 0.0
|
||||
|
||||
accumulated = metrics.get('accumulated_cost')
|
||||
costs = metrics.get('costs', [])
|
||||
|
||||
if accumulated is None:
|
||||
logger.warning(
|
||||
f'Instance {row["instance_id"]}: No accumulated_cost in metrics'
|
||||
)
|
||||
return 0.0
|
||||
|
||||
# Check for duplicate consecutive costs and systematic even-odd pairs
|
||||
has_duplicate = False
|
||||
all_pairs_match = True
|
||||
|
||||
# Check each even-odd pair (0-1, 2-3, etc.)
|
||||
for i in range(0, len(costs) - 1, 2):
|
||||
if abs(costs[i]['cost'] - costs[i + 1]['cost']) < 1e-6:
|
||||
has_duplicate = True
|
||||
logger.debug(
|
||||
f'Instance {row["instance_id"]}: Possible buggy double-counting detected! '
|
||||
f'Steps {i} and {i + 1} have identical costs: {costs[i]["cost"]:.2f}'
|
||||
)
|
||||
else:
|
||||
all_pairs_match = False
|
||||
break
|
||||
|
||||
# Calculate total cost, accounting for buggy double counting if detected
|
||||
if len(costs) >= 2 and has_duplicate and all_pairs_match:
|
||||
paired_steps_cost = sum(
|
||||
cost_entry['cost']
|
||||
for cost_entry in costs[: -1 if len(costs) % 2 else None]
|
||||
)
|
||||
real_paired_cost = paired_steps_cost / 2
|
||||
|
||||
unpaired_cost = costs[-1]['cost'] if len(costs) % 2 else 0
|
||||
total_cost = real_paired_cost + unpaired_cost
|
||||
|
||||
else:
|
||||
total_cost = sum(cost_entry['cost'] for cost_entry in costs)
|
||||
|
||||
if not abs(total_cost - accumulated) < 1e-6:
|
||||
logger.warning(
|
||||
f'Instance {row["instance_id"]}: Cost mismatch: '
|
||||
f'accumulated: {accumulated:.2f}, sum of costs: {total_cost:.2f}, '
|
||||
)
|
||||
|
||||
return total_cost
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f'Error verifying costs for instance {row.get("instance_id", "UNKNOWN")}: {e}'
|
||||
)
|
||||
return 0.0
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Verify costs in SWE-bench output file'
|
||||
)
|
||||
parser.add_argument(
|
||||
'input_filepath', type=str, help='Path to the output.jsonl file'
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
# Load and verify the JSONL file
|
||||
df = pd.read_json(args.input_filepath, lines=True)
|
||||
logger.info(f'Loaded {len(df)} instances from {args.input_filepath}')
|
||||
|
||||
# Verify costs for each instance and sum up total
|
||||
total_cost = df.apply(verify_instance_costs, axis=1).sum()
|
||||
logger.info(f'Total verified cost across all instances: ${total_cost:.2f}')
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to process file: {e}')
|
||||
raise
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,146 @@
|
||||
#!/usr/bin/env bash
|
||||
set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
MODEL_CONFIG=$1
|
||||
COMMIT_HASH=$2
|
||||
AGENT=$3
|
||||
EVAL_LIMIT=$4
|
||||
MAX_ITER=$5
|
||||
NUM_WORKERS=$6
|
||||
DATASET=$7
|
||||
SPLIT=$8
|
||||
N_RUNS=$9
|
||||
MODE=${10}
|
||||
|
||||
|
||||
if [ -z "$NUM_WORKERS" ]; then
|
||||
NUM_WORKERS=1
|
||||
echo "Number of workers not specified, use default $NUM_WORKERS"
|
||||
fi
|
||||
checkout_eval_branch
|
||||
|
||||
if [ -z "$AGENT" ]; then
|
||||
echo "Agent not specified, use default CodeActAgent"
|
||||
AGENT="CodeActAgent"
|
||||
fi
|
||||
|
||||
if [ -z "$MAX_ITER" ]; then
|
||||
echo "MAX_ITER not specified, use default 100"
|
||||
MAX_ITER=100
|
||||
fi
|
||||
|
||||
if [ -z "$RUN_WITH_BROWSING" ]; then
|
||||
echo "RUN_WITH_BROWSING not specified, use default false"
|
||||
RUN_WITH_BROWSING=false
|
||||
fi
|
||||
|
||||
|
||||
if [ -z "$DATASET" ]; then
|
||||
echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
|
||||
DATASET="princeton-nlp/SWE-bench_Lite"
|
||||
fi
|
||||
|
||||
if [ -z "$SPLIT" ]; then
|
||||
echo "SPLIT not specified, use default test"
|
||||
SPLIT="test"
|
||||
fi
|
||||
|
||||
if [ -z "$MODE" ]; then
|
||||
MODE="swe"
|
||||
echo "MODE not specified, use default $MODE"
|
||||
fi
|
||||
|
||||
if [ -n "$EVAL_CONDENSER" ]; then
|
||||
echo "Using Condenser Config: $EVAL_CONDENSER"
|
||||
else
|
||||
echo "No Condenser Config provided via EVAL_CONDENSER, use default (NoOpCondenser)."
|
||||
fi
|
||||
|
||||
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
|
||||
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
|
||||
|
||||
get_openhands_version
|
||||
|
||||
echo "AGENT: $AGENT"
|
||||
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
|
||||
echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
echo "DATASET: $DATASET"
|
||||
echo "SPLIT: $SPLIT"
|
||||
echo "MAX_ITER: $MAX_ITER"
|
||||
echo "NUM_WORKERS: $NUM_WORKERS"
|
||||
echo "COMMIT_HASH: $COMMIT_HASH"
|
||||
echo "MODE: $MODE"
|
||||
echo "EVAL_CONDENSER: $EVAL_CONDENSER"
|
||||
|
||||
# Default to NOT use Hint
|
||||
if [ -z "$USE_HINT_TEXT" ]; then
|
||||
export USE_HINT_TEXT=false
|
||||
fi
|
||||
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
|
||||
EVAL_NOTE="$OPENHANDS_VERSION"
|
||||
# if not using Hint, add -no-hint to the eval note
|
||||
if [ "$USE_HINT_TEXT" = false ]; then
|
||||
EVAL_NOTE="$EVAL_NOTE-no-hint"
|
||||
fi
|
||||
|
||||
if [ "$RUN_WITH_BROWSING" = true ]; then
|
||||
EVAL_NOTE="$EVAL_NOTE-with-browsing"
|
||||
fi
|
||||
|
||||
if [ -n "$EXP_NAME" ]; then
|
||||
EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
|
||||
fi
|
||||
# if mode != swe, add mode to the eval note
|
||||
if [ "$MODE" != "swe" ]; then
|
||||
EVAL_NOTE="${EVAL_NOTE}-${MODE}"
|
||||
fi
|
||||
# Add condenser config to eval note if provided
|
||||
if [ -n "$EVAL_CONDENSER" ]; then
|
||||
EVAL_NOTE="${EVAL_NOTE}-${EVAL_CONDENSER}"
|
||||
fi
|
||||
|
||||
function run_eval() {
|
||||
local eval_note="${1}"
|
||||
COMMAND="poetry run python evaluation/benchmarks/nocode_bench/run_infer_nc.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $eval_note \
|
||||
--dataset $DATASET \
|
||||
--split $SPLIT \
|
||||
--mode $MODE"
|
||||
|
||||
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
|
||||
fi
|
||||
|
||||
# Run the command
|
||||
eval $COMMAND
|
||||
}
|
||||
|
||||
unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
|
||||
if [ -z "$N_RUNS" ]; then
|
||||
N_RUNS=1
|
||||
echo "N_RUNS not specified, use default $N_RUNS"
|
||||
fi
|
||||
|
||||
# Skip runs if the run number is in the SKIP_RUNS list
|
||||
# read from env variable SKIP_RUNS as a comma separated list of run numbers
|
||||
SKIP_RUNS=(${SKIP_RUNS//,/ })
|
||||
for i in $(seq 1 $N_RUNS); do
|
||||
if [[ " ${SKIP_RUNS[@]} " =~ " $i " ]]; then
|
||||
echo "Skipping run $i"
|
||||
continue
|
||||
fi
|
||||
current_eval_note="$EVAL_NOTE-run_$i"
|
||||
echo "EVAL_NOTE: $current_eval_note"
|
||||
run_eval $current_eval_note
|
||||
done
|
||||
|
||||
checkout_original_branch
|
||||
@@ -0,0 +1,54 @@
|
||||
"""This script compares gold patches with OpenHands-generated patches and check whether
|
||||
OpenHands found the right (set of) files to modify.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
def extract_modified_files(patch):
|
||||
modified_files = set()
|
||||
file_pattern = re.compile(r'^diff --git a/(.*?) b/')
|
||||
|
||||
for line in patch.split('\n'):
|
||||
match = file_pattern.match(line)
|
||||
if match:
|
||||
modified_files.add(match.group(1))
|
||||
|
||||
return modified_files
|
||||
|
||||
|
||||
def process_report(oh_output_file):
|
||||
succ = 0
|
||||
fail = 0
|
||||
for line in open(oh_output_file):
|
||||
line = json.loads(line)
|
||||
instance_id = line['instance_id']
|
||||
gold_patch = line['swe_instance']['patch']
|
||||
generated_patch = line['git_patch']
|
||||
gold_modified_files = extract_modified_files(gold_patch)
|
||||
# swe-bench lite only: a gold patch always contains exactly one file
|
||||
assert len(gold_modified_files) == 1
|
||||
generated_modified_files = extract_modified_files(generated_patch)
|
||||
|
||||
# Check if all files in gold_patch are also in generated_patch
|
||||
all_files_in_generated = gold_modified_files.issubset(generated_modified_files)
|
||||
if all_files_in_generated:
|
||||
succ += 1
|
||||
else:
|
||||
fail += 1
|
||||
print(
|
||||
f'{instance_id}: file mismatch, gold = {gold_modified_files}, generated = {generated_modified_files}'
|
||||
)
|
||||
print(
|
||||
f'\nSUMMARY: {succ} out of {succ + fail} instances found correct files to edit, success rate = {succ / float(succ + fail)}'
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--oh_output_file', help='Path to the OH output file')
|
||||
args = parser.parse_args()
|
||||
|
||||
process_report(args.oh_output_file)
|
||||
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
source ~/.bashrc
|
||||
SWEUTIL_DIR=/swe_util
|
||||
|
||||
if [ -z "$SWE_INSTANCE_ID" ]; then
|
||||
echo "Error: SWE_INSTANCE_ID is not set." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json)
|
||||
|
||||
|
||||
if [[ -z "$item" ]]; then
|
||||
echo "No item found for the provided instance ID."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REPO_NAME=$(echo "$item" | jq -r '.repo | split("/")[-1]')
|
||||
WORKSPACE_NAME="$REPO_NAME"
|
||||
|
||||
|
||||
echo "WORKSPACE_NAME: $WORKSPACE_NAME"
|
||||
|
||||
|
||||
# Clear the workspace
|
||||
if [ -d /workspace ]; then
|
||||
rm -rf /workspace/*
|
||||
else
|
||||
mkdir /workspace
|
||||
fi
|
||||
# Copy repo to workspace
|
||||
if [ -d /workspace/$WORKSPACE_NAME ]; then
|
||||
rm -rf /workspace/$WORKSPACE_NAME
|
||||
fi
|
||||
mkdir -p /workspace
|
||||
|
||||
SRC_DIR="/root/$REPO_NAME"
|
||||
DEST_DIR="/workspace/$WORKSPACE_NAME"
|
||||
|
||||
cp -r "$SRC_DIR" "$DEST_DIR"
|
||||
|
||||
|
||||
|
||||
echo ">> Extracting conda environment name..."
|
||||
CONDA_ENV_NAME=$(echo "$item" | jq -r '.conda_env // empty')
|
||||
|
||||
# Activate instance-specific environment
|
||||
if [ -d /opt/miniconda3 ]; then
|
||||
. /opt/miniconda3/etc/profile.d/conda.sh
|
||||
conda activate $CONDA_ENV_NAME
|
||||
fi
|
||||
@@ -0,0 +1,154 @@
|
||||
import json
|
||||
import multiprocessing as mp
|
||||
from typing import Awaitable, Callable, TextIO
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pydantic import SecretStr
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
EvalOutput,
|
||||
_process_instance_wrapper,
|
||||
_process_instance_wrapper_mp,
|
||||
)
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
|
||||
|
||||
def update_progress_nc(
|
||||
result: EvalOutput,
|
||||
pbar: tqdm,
|
||||
output_fp: TextIO,
|
||||
):
|
||||
"""Update the progress bar and write the result to the output file."""
|
||||
pbar.update(1)
|
||||
pbar.set_description(f'Instance {result.instance_id}')
|
||||
pbar.set_postfix_str(f'Test Result: {str(result.test_result)[:300]}...')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {result.instance_id}: '
|
||||
f'{str(result.test_result)[:300]}...\n'
|
||||
)
|
||||
|
||||
def make_serializable(obj):
|
||||
if isinstance(obj, pd.Series):
|
||||
return make_serializable(obj.to_dict())
|
||||
|
||||
if isinstance(obj, dict):
|
||||
return {k: make_serializable(v) for k, v in obj.items()}
|
||||
|
||||
elif isinstance(obj, (list, tuple, set)):
|
||||
converted = [make_serializable(v) for v in obj]
|
||||
if isinstance(obj, list):
|
||||
return converted
|
||||
elif isinstance(obj, tuple):
|
||||
return tuple(converted)
|
||||
else: # set
|
||||
return converted
|
||||
|
||||
elif isinstance(obj, np.ndarray):
|
||||
return obj.tolist()
|
||||
|
||||
elif isinstance(obj, np.generic):
|
||||
return obj.item()
|
||||
|
||||
elif isinstance(obj, pd.Timestamp):
|
||||
return obj.isoformat()
|
||||
|
||||
elif SecretStr is not None and isinstance(obj, SecretStr):
|
||||
return str(obj)
|
||||
|
||||
else:
|
||||
return obj
|
||||
|
||||
try:
|
||||
raw_data = result.model_dump(mode='python', round_trip=False)
|
||||
safe_data = make_serializable(raw_data)
|
||||
output_fp.write(json.dumps(safe_data, ensure_ascii=False) + '\n')
|
||||
output_fp.flush()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to write full result: {e}')
|
||||
|
||||
fallback = {
|
||||
'instance_id': result.instance_id,
|
||||
'model_patch': result.test_result.get('git_patch', ''),
|
||||
}
|
||||
try:
|
||||
output_fp.write(json.dumps(fallback, ensure_ascii=False) + '\n')
|
||||
output_fp.flush()
|
||||
logger.info(
|
||||
f'Wrote fallback result for instance {result.instance_id}: only instance_id and model_patch.'
|
||||
)
|
||||
except Exception as e2:
|
||||
logger.error(f'Failed to write fallback result: {e2}')
|
||||
|
||||
|
||||
def cleanup():
|
||||
print('Cleaning up child processes...')
|
||||
for process in mp.active_children():
|
||||
print(f'Terminating child process: {process.name}')
|
||||
process.terminate()
|
||||
process.join()
|
||||
|
||||
|
||||
def run_evaluation_nocode_bench(
|
||||
dataset: pd.DataFrame,
|
||||
metadata: EvalMetadata | None,
|
||||
output_file: str,
|
||||
num_workers: int,
|
||||
process_instance_func: Callable[
|
||||
[pd.Series, EvalMetadata, bool], Awaitable[EvalOutput]
|
||||
],
|
||||
max_retries: int = 5, # number of retries for each instance
|
||||
timeout_seconds: int | None = None,
|
||||
):
|
||||
use_multiprocessing = num_workers > 1
|
||||
|
||||
if metadata is not None:
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {metadata.agent_class}:\n'
|
||||
f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
|
||||
)
|
||||
else:
|
||||
logger.warning('Running evaluation without metadata.')
|
||||
logger.info(f'Evaluation started with {num_workers} workers.')
|
||||
|
||||
total_instances = len(dataset)
|
||||
pbar = tqdm(total=total_instances, desc='Instances processed')
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
try:
|
||||
if use_multiprocessing:
|
||||
with mp.Pool(num_workers) as pool:
|
||||
args_iter = (
|
||||
(
|
||||
process_instance_func,
|
||||
instance,
|
||||
metadata,
|
||||
True,
|
||||
max_retries,
|
||||
timeout_seconds,
|
||||
)
|
||||
for _, instance in dataset.iterrows()
|
||||
)
|
||||
results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter)
|
||||
for result in results:
|
||||
update_progress_nc(result, pbar, output_fp)
|
||||
else:
|
||||
for _, instance in dataset.iterrows():
|
||||
result = _process_instance_wrapper(
|
||||
process_instance_func=process_instance_func,
|
||||
instance=instance,
|
||||
metadata=metadata,
|
||||
use_mp=False,
|
||||
max_retries=max_retries,
|
||||
)
|
||||
update_progress_nc(result, pbar, output_fp)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print('\nKeyboardInterrupt received. Cleaning up...\n')
|
||||
cleanup()
|
||||
|
||||
output_fp.close()
|
||||
logger.info('\nEvaluation finished.\n')
|
||||
@@ -1,11 +1,8 @@
|
||||
"""
|
||||
Utilities for handling binary files and patch generation in SWE-bench evaluation.
|
||||
"""
|
||||
"""Utilities for handling binary files and patch generation in SWE-bench evaluation."""
|
||||
|
||||
|
||||
def remove_binary_diffs(patch_text):
|
||||
"""
|
||||
Remove binary file diffs from a git patch.
|
||||
"""Remove binary file diffs from a git patch.
|
||||
|
||||
Args:
|
||||
patch_text (str): The git patch text
|
||||
@@ -36,8 +33,7 @@ def remove_binary_diffs(patch_text):
|
||||
|
||||
|
||||
def remove_binary_files_from_git():
|
||||
"""
|
||||
Generate a bash command to remove binary files from git staging.
|
||||
"""Generate a bash command to remove binary files from git staging.
|
||||
|
||||
Returns:
|
||||
str: A bash command that removes binary files from git staging
|
||||
|
||||
@@ -111,8 +111,7 @@ def process_instance(
|
||||
runtime_failure_count: int = 0,
|
||||
conditional_imports: ConditionalImports | None = None,
|
||||
) -> EvalOutput:
|
||||
"""
|
||||
Evaluate agent performance on a SWE-bench problem instance.
|
||||
"""Evaluate agent performance on a SWE-bench problem instance.
|
||||
|
||||
Note that this signature differs from the expected input to `run_evaluation`. Use
|
||||
`functools.partial` to provide optional arguments before passing to the evaluation harness.
|
||||
|
||||
@@ -16,8 +16,7 @@ from openhands.core.logger import openhands_logger as logger
|
||||
|
||||
class LocEvaluator:
|
||||
def __init__(self, args):
|
||||
"""
|
||||
Localization evaluation.
|
||||
"""Localization evaluation.
|
||||
|
||||
Args:
|
||||
args: all main arguments
|
||||
@@ -76,8 +75,7 @@ class LocEvaluator:
|
||||
self.task_resolved = False
|
||||
|
||||
def _init_dir(self, directory_path):
|
||||
"""
|
||||
Check if a directory exists and create it if it doesn't.
|
||||
"""Check if a directory exists and create it if it doesn't.
|
||||
|
||||
Args:
|
||||
directory_path (str): Path to the directory to check/create
|
||||
@@ -207,8 +205,7 @@ class LocEvaluator:
|
||||
self._compute_avg_over_all()
|
||||
|
||||
def _write_to_json(self, data, file_name):
|
||||
"""
|
||||
Writes the current object data to a JSON file.
|
||||
"""Writes the current object data to a JSON file.
|
||||
|
||||
Returns:
|
||||
bool: True if writing was successful, False otherwise.
|
||||
@@ -225,8 +222,7 @@ class LocEvaluator:
|
||||
return False
|
||||
|
||||
def read_from_json(self, file_path):
|
||||
"""
|
||||
Reads data from a JSON file and loads it into the current object.
|
||||
"""Reads data from a JSON file and loads it into the current object.
|
||||
|
||||
Returns:
|
||||
dict: The loaded JSON data, or an empty dict if the file doesn't exist
|
||||
@@ -253,8 +249,7 @@ class LocEvaluator:
|
||||
return {}
|
||||
|
||||
def read_from_jsonl(self, file_path):
|
||||
"""
|
||||
Reads data from a JSON file and loads it into the current object.
|
||||
"""Reads data from a JSON file and loads it into the current object.
|
||||
|
||||
Returns:
|
||||
dict: The loaded JSON data, or an empty dict if the file doesn't exist
|
||||
@@ -294,8 +289,7 @@ class LocEvaluator:
|
||||
history_idx += 1
|
||||
|
||||
def _parse_string_to_dict(self, dict_string) -> dict:
|
||||
"""
|
||||
Convert a string representation of a dictionary to an actual dictionary.
|
||||
"""Convert a string representation of a dictionary to an actual dictionary.
|
||||
|
||||
Args:
|
||||
dict_string (str): String representation of a dictionary
|
||||
@@ -328,8 +322,7 @@ class LocEvaluator:
|
||||
return None
|
||||
|
||||
def _parse_value_from_args(self, argument_str: str, key: str) -> str:
|
||||
"""
|
||||
Parse a specific key's value from argument string.
|
||||
"""Parse a specific key's value from argument string.
|
||||
|
||||
Args:
|
||||
argument_str (str): The argument string containing key-value pairs
|
||||
@@ -407,8 +400,7 @@ class LocEvaluator:
|
||||
return ''
|
||||
|
||||
def _parse_path_from_args(self, argument_str: str) -> str:
|
||||
"""
|
||||
Parse path from argument string.
|
||||
"""Parse path from argument string.
|
||||
|
||||
Args:
|
||||
argument_str (str): The argument string containing path information
|
||||
@@ -419,8 +411,7 @@ class LocEvaluator:
|
||||
return self._parse_value_from_args(argument_str, 'path')
|
||||
|
||||
def _parse_func_names_from_str(self, code_patch) -> list:
|
||||
"""
|
||||
Parse function names from the new_str code patch.
|
||||
"""Parse function names from the new_str code patch.
|
||||
|
||||
Args:
|
||||
code_patch: Either a string (argument string) or already extracted new_str code
|
||||
@@ -801,8 +792,7 @@ class LocEvaluator:
|
||||
|
||||
|
||||
def swe_data_loader(args):
|
||||
"""
|
||||
Loading SWE-Bench data.
|
||||
"""Loading SWE-Bench data.
|
||||
|
||||
Args:
|
||||
args: Main arguments.
|
||||
@@ -834,8 +824,7 @@ def swe_data_loader(args):
|
||||
|
||||
|
||||
def infer_data_loader(args):
|
||||
"""
|
||||
Load instance IDs.
|
||||
"""Load instance IDs.
|
||||
|
||||
Args:
|
||||
args: Main arguments.
|
||||
@@ -868,8 +857,7 @@ def infer_data_loader(args):
|
||||
|
||||
|
||||
def infer_cost_calculator(args):
|
||||
"""
|
||||
Calculate total and average costs from metric JSON files with detailed output.
|
||||
"""Calculate total and average costs from metric JSON files with detailed output.
|
||||
|
||||
Args:
|
||||
args: Main arguments.
|
||||
|
||||
@@ -28,8 +28,7 @@ class LocalizationInfo:
|
||||
hunks_per_file: dict[str, int] # File -> number of hunks
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""
|
||||
Convert LocalizationInfo to a dictionary for JSON serialization.
|
||||
"""Convert LocalizationInfo to a dictionary for JSON serialization.
|
||||
|
||||
Returns:
|
||||
Dictionary representation of the localization information
|
||||
@@ -58,8 +57,7 @@ class LocalizationInfo:
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict[str, Any]) -> 'LocalizationInfo':
|
||||
"""
|
||||
Create LocalizationInfo from a dictionary (for loading from JSON).
|
||||
"""Create LocalizationInfo from a dictionary (for loading from JSON).
|
||||
|
||||
Args:
|
||||
data: Dictionary containing localization information
|
||||
@@ -91,8 +89,7 @@ class LocalizationInfo:
|
||||
|
||||
|
||||
class LocMeta:
|
||||
"""
|
||||
SWE-Bench dataset loader and ground-truth localization parser.
|
||||
"""SWE-Bench dataset loader and ground-truth localization parser.
|
||||
|
||||
This class handles loading SWE-Bench datasets and extracting ground-truth
|
||||
localization information from patches for code localization evaluation.
|
||||
@@ -104,8 +101,7 @@ class LocMeta:
|
||||
dataset_name: str = 'princeton-nlp/SWE-bench_Verified',
|
||||
split: str = 'test',
|
||||
):
|
||||
"""
|
||||
Initialize LocMeta with a SWE-Bench dataset.
|
||||
"""Initialize LocMeta with a SWE-Bench dataset.
|
||||
|
||||
Args:
|
||||
dataset_name: HuggingFace dataset name (e.g., "princeton-nlp/SWE-bench_Verified")
|
||||
@@ -124,8 +120,7 @@ class LocMeta:
|
||||
self._init_swe_dataset()
|
||||
|
||||
def _init_swe_dataset(self) -> None:
|
||||
"""
|
||||
Load and initialize the SWE-Bench dataset from HuggingFace.
|
||||
"""Load and initialize the SWE-Bench dataset from HuggingFace.
|
||||
Converts to pandas DataFrame for easy manipulation.
|
||||
"""
|
||||
try:
|
||||
@@ -150,8 +145,7 @@ class LocMeta:
|
||||
raise
|
||||
|
||||
def get_instance_by_id(self, instance_id: str) -> pd.Series:
|
||||
"""
|
||||
Retrieve a specific instance by its ID.
|
||||
"""Retrieve a specific instance by its ID.
|
||||
|
||||
Args:
|
||||
instance_id: The instance identifier
|
||||
@@ -169,8 +163,7 @@ class LocMeta:
|
||||
return self.df.iloc[idx]
|
||||
|
||||
def parse_instance_loc(self, instance: Union[pd.Series, str]) -> LocalizationInfo:
|
||||
"""
|
||||
Parse ground-truth localization information from a SWE-Bench instance.
|
||||
"""Parse ground-truth localization information from a SWE-Bench instance.
|
||||
|
||||
Args:
|
||||
instance: Either a pandas Series with instance data or an instance_id string
|
||||
@@ -218,8 +211,7 @@ class LocMeta:
|
||||
def _parse_file_patch_lines(
|
||||
self, file_patch: str
|
||||
) -> tuple[list[tuple[int, int]], int, int]:
|
||||
"""
|
||||
Parse line ranges and count changes from a single file patch.
|
||||
"""Parse line ranges and count changes from a single file patch.
|
||||
|
||||
Args:
|
||||
file_patch: Patch content for a single file
|
||||
@@ -253,8 +245,7 @@ class LocMeta:
|
||||
def _parse_code_structures_from_patch(
|
||||
self, file_patch: str, file_path: str
|
||||
) -> tuple[list[str], list[str]]:
|
||||
"""
|
||||
Extract function and class names from patch context (fallback method).
|
||||
"""Extract function and class names from patch context (fallback method).
|
||||
|
||||
Args:
|
||||
file_patch: Patch content for a single file
|
||||
@@ -311,8 +302,7 @@ class LocMeta:
|
||||
def _parse_patch_localization(
|
||||
self, patch_content: str, instance_id: str
|
||||
) -> LocalizationInfo:
|
||||
"""
|
||||
Parse localization information from a git patch (improved method).
|
||||
"""Parse localization information from a git patch (improved method).
|
||||
|
||||
Args:
|
||||
patch_content: The git patch content
|
||||
@@ -390,8 +380,7 @@ class LocMeta:
|
||||
def _extract_code_structures_from_patch(
|
||||
self, file_patch: str, file_path: str
|
||||
) -> tuple[list[str], list[str]]:
|
||||
"""
|
||||
Extract function and class names from patch context and content.
|
||||
"""Extract function and class names from patch context and content.
|
||||
|
||||
Args:
|
||||
file_patch: Patch content for a single file
|
||||
@@ -519,8 +508,7 @@ class LocMeta:
|
||||
def _parse_patch_localization_with_runtime(
|
||||
self, patch_content: str, instance_id: str, runtime: Runtime
|
||||
) -> LocalizationInfo:
|
||||
"""
|
||||
Parse localization information from a git patch using OpenHands runtime.
|
||||
"""Parse localization information from a git patch using OpenHands runtime.
|
||||
This is the superior method when runtime is available.
|
||||
|
||||
Args:
|
||||
@@ -596,8 +584,7 @@ class LocMeta:
|
||||
def parse_instance_loc_with_runtime(
|
||||
self, instance: Union[pd.Series, str], runtime: Runtime = None
|
||||
) -> LocalizationInfo:
|
||||
"""
|
||||
Parse ground-truth localization information using OpenHands runtime.
|
||||
"""Parse ground-truth localization information using OpenHands runtime.
|
||||
|
||||
Args:
|
||||
instance: Either a pandas Series with instance data or an instance_id string
|
||||
@@ -634,8 +621,7 @@ class LocMeta:
|
||||
def _analyze_source_code_with_runtime(
|
||||
self, runtime: Runtime, file_path: str, affected_lines: list[int]
|
||||
) -> tuple[list[str], list[str], dict[int, str], dict[int, str]]:
|
||||
"""
|
||||
Analyze source code using OpenHands runtime to find functions and classes.
|
||||
"""Analyze source code using OpenHands runtime to find functions and classes.
|
||||
|
||||
Args:
|
||||
runtime: OpenHands runtime object
|
||||
@@ -695,8 +681,7 @@ class LocMeta:
|
||||
def _parse_cython_content_with_line_mapping(
|
||||
self, content: str, affected_lines: list[int]
|
||||
) -> tuple[list[str], list[str], dict[int, str], dict[int, str]]:
|
||||
"""
|
||||
Parse Cython content to extract functions and classes with line mapping.
|
||||
"""Parse Cython content to extract functions and classes with line mapping.
|
||||
Since Cython files can't be parsed with Python's AST, we use regex-based parsing.
|
||||
|
||||
Args:
|
||||
@@ -828,8 +813,7 @@ class LocMeta:
|
||||
def _parse_python_content_with_line_mapping(
|
||||
self, content: str, affected_lines: list[int]
|
||||
) -> tuple[list[str], list[str], dict[int, str], dict[int, str]]:
|
||||
"""
|
||||
Parse Python content to extract functions and classes with accurate line mapping.
|
||||
"""Parse Python content to extract functions and classes with accurate line mapping.
|
||||
|
||||
Args:
|
||||
content: Python source code content
|
||||
@@ -914,8 +898,7 @@ class LocMeta:
|
||||
def _parse_python_content(
|
||||
self, content: str, affected_lines: list[int]
|
||||
) -> tuple[list[str], list[str], dict[int, str], dict[int, str]]:
|
||||
"""
|
||||
Parse Python content to extract functions and classes.
|
||||
"""Parse Python content to extract functions and classes.
|
||||
|
||||
Args:
|
||||
content: Python source code content
|
||||
@@ -989,8 +972,7 @@ class LocMeta:
|
||||
return [], [], {}, {}
|
||||
|
||||
def _split_patch_by_files(self, patch_content: str) -> dict[str, str]:
|
||||
"""
|
||||
Split a multi-file patch into individual file patches.
|
||||
"""Split a multi-file patch into individual file patches.
|
||||
|
||||
Args:
|
||||
patch_content: Complete patch content
|
||||
@@ -1049,8 +1031,7 @@ class LocMeta:
|
||||
def _empty_localization_info(
|
||||
self, instance_id: str = 'unknown'
|
||||
) -> LocalizationInfo:
|
||||
"""
|
||||
Return an empty LocalizationInfo object.
|
||||
"""Return an empty LocalizationInfo object.
|
||||
|
||||
Args:
|
||||
instance_id: Instance identifier
|
||||
@@ -1072,8 +1053,7 @@ class LocMeta:
|
||||
)
|
||||
|
||||
def get_dataset_statistics(self) -> dict[str, Any]:
|
||||
"""
|
||||
Get statistics about the loaded dataset.
|
||||
"""Get statistics about the loaded dataset.
|
||||
|
||||
Returns:
|
||||
Dictionary containing dataset statistics
|
||||
@@ -1095,8 +1075,7 @@ class LocMeta:
|
||||
return stats
|
||||
|
||||
def get_instances_by_repo(self, repo_name: str) -> pd.DataFrame:
|
||||
"""
|
||||
Get all instances for a specific repository.
|
||||
"""Get all instances for a specific repository.
|
||||
|
||||
Args:
|
||||
repo_name: Repository name (e.g., "django/django")
|
||||
|
||||
@@ -6,8 +6,7 @@ from openhands.core.logger import openhands_logger as logger
|
||||
|
||||
|
||||
def verify_instance_costs(row: pd.Series) -> float:
|
||||
"""
|
||||
Verifies that the accumulated_cost matches the sum of individual costs in metrics.
|
||||
"""Verifies that the accumulated_cost matches the sum of individual costs in metrics.
|
||||
Also checks for duplicate consecutive costs which might indicate buggy counting.
|
||||
If the consecutive costs are identical, the file is affected by this bug:
|
||||
https://github.com/All-Hands-AI/OpenHands/issues/5383
|
||||
|
||||
@@ -181,9 +181,7 @@ def distinct_methods_stats(tree, num_lines):
|
||||
|
||||
|
||||
def loops_stats(tree, num_lines):
|
||||
"""
|
||||
Calculate the average number of loops.
|
||||
"""
|
||||
"""Calculate the average number of loops."""
|
||||
total_loops = 0
|
||||
|
||||
def traverse(node):
|
||||
@@ -199,9 +197,7 @@ def loops_stats(tree, num_lines):
|
||||
|
||||
|
||||
def branches_stats(tree, num_lines):
|
||||
"""
|
||||
Calculate the average number of branches (conditional statements).
|
||||
"""
|
||||
"""Calculate the average number of branches (conditional statements)."""
|
||||
total_branches = 0
|
||||
|
||||
def traverse(node):
|
||||
|
||||
@@ -192,8 +192,7 @@ def run_mutation_testing(
|
||||
def grade_test_output(
|
||||
test_suite: str, instance: pd.Series, test_output: str, test_spec: TestSpec, runtime
|
||||
):
|
||||
"""
|
||||
Two-pass test grading with short-circuiting:
|
||||
"""Two-pass test grading with short-circuiting:
|
||||
1. Run all tests to identify passing/failing tests
|
||||
2. If no failing tests, evaluate coverage immediately
|
||||
3. Otherwise, run only passing tests for coverage analysis
|
||||
@@ -280,8 +279,7 @@ def process_instance(
|
||||
reset_logger: bool = True,
|
||||
log_dir: str | None = None,
|
||||
) -> EvalOutput:
|
||||
"""
|
||||
Evaluate agent performance on a TestGenEval problem instance.
|
||||
"""Evaluate agent performance on a TestGenEval problem instance.
|
||||
|
||||
Note that this signature differs from the expected input to `run_evaluation`. Use
|
||||
`functools.partial` to provide optional arguments before passing to the evaluation harness.
|
||||
@@ -453,8 +451,7 @@ def process_instance(
|
||||
|
||||
|
||||
def count_and_log_fields(evaluated_predictions, fields, key):
|
||||
"""
|
||||
Count and log the sum of specified fields in the evaluated predictions,
|
||||
"""Count and log the sum of specified fields in the evaluated predictions,
|
||||
ignoring fields with a value of -1. If all values for a field are -1,
|
||||
return -1.
|
||||
|
||||
|
||||
@@ -4,8 +4,7 @@ from evaluation.benchmarks.testgeneval.constants import TestStatus
|
||||
|
||||
|
||||
def parse_log_pytest(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with PyTest framework
|
||||
"""Parser for test logs generated with PyTest framework
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
@@ -26,8 +25,7 @@ def parse_log_pytest(log: str) -> dict[str, str]:
|
||||
|
||||
|
||||
def parse_log_pytest_options(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with PyTest framework with options
|
||||
"""Parser for test logs generated with PyTest framework with options
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
@@ -61,8 +59,7 @@ def parse_log_pytest_options(log: str) -> dict[str, str]:
|
||||
|
||||
|
||||
def parse_log_django(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with Django tester framework
|
||||
"""Parser for test logs generated with Django tester framework
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
@@ -141,8 +138,7 @@ def parse_log_django(log: str) -> dict[str, str]:
|
||||
|
||||
|
||||
def parse_log_pytest_v2(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with PyTest framework (Later Version)
|
||||
"""Parser for test logs generated with PyTest framework (Later Version)
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
@@ -170,8 +166,7 @@ def parse_log_pytest_v2(log: str) -> dict[str, str]:
|
||||
|
||||
|
||||
def parse_log_seaborn(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with seaborn testing framework
|
||||
"""Parser for test logs generated with seaborn testing framework
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
@@ -196,8 +191,7 @@ def parse_log_seaborn(log: str) -> dict[str, str]:
|
||||
|
||||
|
||||
def parse_log_sympy(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with Sympy framework
|
||||
"""Parser for test logs generated with Sympy framework
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
@@ -229,8 +223,7 @@ def parse_log_sympy(log: str) -> dict[str, str]:
|
||||
|
||||
|
||||
def parse_log_matplotlib(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with PyTest framework
|
||||
"""Parser for test logs generated with PyTest framework
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
|
||||
@@ -12,8 +12,7 @@ if sys.getrecursionlimit() < 10_000:
|
||||
|
||||
|
||||
def bleu(gold: list[str], pred: list[str]) -> float:
|
||||
"""
|
||||
Calculate BLEU score, using smoothing method 2 with auto reweighting, in the range of 0~100.
|
||||
"""Calculate BLEU score, using smoothing method 2 with auto reweighting, in the range of 0~100.
|
||||
|
||||
:param gold: list of gold tokens
|
||||
:param pred: list of predicted tokens
|
||||
@@ -30,8 +29,7 @@ def bleu(gold: list[str], pred: list[str]) -> float:
|
||||
|
||||
|
||||
def batch_bleu(golds: list[list[str]], preds: list[list[str]]) -> list[float]:
|
||||
"""
|
||||
Calculate BLEU score for a batch of sentences.
|
||||
"""Calculate BLEU score for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
@@ -43,8 +41,7 @@ def batch_bleu(golds: list[list[str]], preds: list[list[str]]) -> list[float]:
|
||||
|
||||
|
||||
def corpus_bleu(golds: list[list[str]], preds: list[list[str]]) -> float:
|
||||
"""
|
||||
Calculate corpus-level BLEU score for a batch of sentences.
|
||||
"""Calculate corpus-level BLEU score for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
@@ -63,8 +60,7 @@ def corpus_bleu(golds: list[list[str]], preds: list[list[str]]) -> float:
|
||||
def edit_sim(
|
||||
gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' '
|
||||
) -> float:
|
||||
"""
|
||||
Calculate char-level edit similarity, in the range of 0~100.
|
||||
"""Calculate char-level edit similarity, in the range of 0~100.
|
||||
|
||||
:param gold: gold sentence or list of gold tokens
|
||||
:param pred: predicted sentence or list of predicted tokens
|
||||
@@ -85,8 +81,7 @@ def batch_edit_sim(
|
||||
preds: list[Union[str, list[str]]],
|
||||
sep: str = ' ',
|
||||
) -> list[float]:
|
||||
"""
|
||||
Calculate char-level edit similarity for a batch of sentences.
|
||||
"""Calculate char-level edit similarity for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
@@ -102,8 +97,7 @@ T = TypeVar('T')
|
||||
|
||||
|
||||
def exact_match(gold: T, pred: T) -> float:
|
||||
"""
|
||||
Calculate exact match accuracy, in the range of {0, 100}.
|
||||
"""Calculate exact match accuracy, in the range of {0, 100}.
|
||||
|
||||
:param gold: gold sentence or list of gold tokens
|
||||
:param pred: predicted sentence or list of predicted tokens
|
||||
@@ -115,8 +109,7 @@ def exact_match(gold: T, pred: T) -> float:
|
||||
|
||||
|
||||
def batch_exact_match(golds: list[T], preds: list[T]) -> list[float]:
|
||||
"""
|
||||
Calculate exact match accuracy for a batch of sentences.
|
||||
"""Calculate exact match accuracy for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
@@ -130,8 +123,7 @@ def batch_exact_match(golds: list[T], preds: list[T]) -> list[float]:
|
||||
def rouge_l(
|
||||
gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' '
|
||||
) -> dict[str, float]:
|
||||
"""
|
||||
Calculate ROUGE-L F1, precision, and recall scores, in the range of 0~100.
|
||||
"""Calculate ROUGE-L F1, precision, and recall scores, in the range of 0~100.
|
||||
|
||||
:param gold: gold sentence or list of gold tokens
|
||||
:param pred: predicted sentence or list of predicted tokens
|
||||
@@ -156,8 +148,7 @@ def batch_rouge_l(
|
||||
preds: list[Union[str, list[str]]],
|
||||
sep: str = ' ',
|
||||
) -> dict[str, list[float]]:
|
||||
"""
|
||||
Calculate ROUGE-L F1, precision, and recall scores for a batch of sentences.
|
||||
"""Calculate ROUGE-L F1, precision, and recall scores for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
@@ -175,8 +166,7 @@ def accuracy(
|
||||
pred: list[str],
|
||||
ignore: Optional[Sequence[str]] = None,
|
||||
) -> float:
|
||||
"""
|
||||
Calculate token-level accuracy, in the range of 0~100.
|
||||
"""Calculate token-level accuracy, in the range of 0~100.
|
||||
If gold and pred are not the same length, the longer one would be truncated.
|
||||
|
||||
:param gold: list of gold tokens
|
||||
@@ -210,8 +200,7 @@ def batch_accuracy(
|
||||
preds: list[list[str]],
|
||||
ignore: Optional[Sequence[str]] = None,
|
||||
) -> list[float]:
|
||||
"""
|
||||
Calculate token-level accuracy for a batch of sentences.
|
||||
"""Calculate token-level accuracy for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
@@ -226,8 +215,7 @@ def batch_accuracy(
|
||||
def first_match_to_topk(
|
||||
first_match_list: list[int], k_values: list[int]
|
||||
) -> dict[int, list[float]]:
|
||||
"""
|
||||
Calculate top-k accuracy with the first match ranks (1-indexed).
|
||||
"""Calculate top-k accuracy with the first match ranks (1-indexed).
|
||||
|
||||
:param first_match: first match ranks (1-indexed)
|
||||
:param k_values: k values to consider
|
||||
@@ -237,8 +225,7 @@ def first_match_to_topk(
|
||||
|
||||
|
||||
def pass_at_k(n: int, c: int, k: int) -> float:
|
||||
"""
|
||||
Sample pass@k metric according to the Codex paper, but in the scale of 0~100.
|
||||
"""Sample pass@k metric according to the Codex paper, but in the scale of 0~100.
|
||||
:param n: total number of samples
|
||||
:param c: number of correct samples
|
||||
:param k: k in pass@$k$
|
||||
@@ -251,8 +238,7 @@ def pass_at_k(n: int, c: int, k: int) -> float:
|
||||
|
||||
|
||||
def self_bleu(samples: list[list[str]]) -> float:
|
||||
"""
|
||||
Calculate self-BLEU among the samples.
|
||||
"""Calculate self-BLEU among the samples.
|
||||
:param samples: the chosen m samples
|
||||
:return: self-BLEU
|
||||
"""
|
||||
@@ -274,8 +260,7 @@ def self_bleu(samples: list[list[str]]) -> float:
|
||||
|
||||
|
||||
def self_edit_distance(samples: list[Union[str, list[str]]], sep=' ') -> float:
|
||||
"""
|
||||
Calculate self-edit-distance among the samples.
|
||||
"""Calculate self-edit-distance among the samples.
|
||||
:param samples: the chosen m samples
|
||||
:param sep: the separator between tokens
|
||||
:return: self-edit-distance
|
||||
|
||||
@@ -30,8 +30,7 @@ def check_mutation(mutation_output):
|
||||
|
||||
|
||||
def count_methods(code_str):
|
||||
"""
|
||||
Counts the number of methods/functions in a given string of code.
|
||||
"""Counts the number of methods/functions in a given string of code.
|
||||
|
||||
Args:
|
||||
code_str (str): A string containing code.
|
||||
@@ -46,8 +45,7 @@ def count_methods(code_str):
|
||||
|
||||
|
||||
def get_lines_of_code(code_str):
|
||||
"""
|
||||
Extracts lines of code from a given string.
|
||||
"""Extracts lines of code from a given string.
|
||||
|
||||
Args:
|
||||
code_str (str): A string containing code.
|
||||
|
||||
@@ -7,8 +7,7 @@ import traceback
|
||||
|
||||
|
||||
def insert_line_in_string(input_string, new_str, insert_line):
|
||||
"""
|
||||
Inserts a new line into a string at the specified line number.
|
||||
"""Inserts a new line into a string at the specified line number.
|
||||
|
||||
:param input_string: The original string.
|
||||
:param new_str: The string to insert.
|
||||
@@ -29,8 +28,7 @@ def insert_line_in_string(input_string, new_str, insert_line):
|
||||
|
||||
|
||||
def print_string_diff(original, modified):
|
||||
"""
|
||||
Prints the differences between two strings line by line.
|
||||
"""Prints the differences between two strings line by line.
|
||||
|
||||
:param original: The original string.
|
||||
:param modified: The modified string.
|
||||
|
||||
@@ -37,8 +37,7 @@ def extract_preamble_classes_and_functions(code):
|
||||
current_position = 0
|
||||
|
||||
def extract_class_body(code: str, start_index: int) -> tuple[str, int]:
|
||||
"""
|
||||
Extracts the body of a class from the given code starting from the specified index.
|
||||
"""Extracts the body of a class from the given code starting from the specified index.
|
||||
Returns the class body and the end index of the class body.
|
||||
"""
|
||||
if not code or start_index < 0 or start_index >= len(code):
|
||||
@@ -168,8 +167,8 @@ def extract_preamble_classes_and_functions(code):
|
||||
def filter_passing_tests(
|
||||
test_content: str, test_output: str, repo: str
|
||||
) -> tuple[str, list[str], list[str]]:
|
||||
"""
|
||||
Filter tests based on their execution results.
|
||||
"""Filter tests based on their execution results.
|
||||
|
||||
Returns:
|
||||
Tuple containing:
|
||||
- Modified test content with only passing tests
|
||||
@@ -246,8 +245,7 @@ def filter_passing_tests(
|
||||
def filter_tests(
|
||||
test_content: str, test_output: str, repo: str
|
||||
) -> tuple[str, list[str], list[str]]:
|
||||
"""
|
||||
Filter tests using AST parsing to remove failing test functions from the test file.
|
||||
"""Filter tests using AST parsing to remove failing test functions from the test file.
|
||||
Non-test functions (e.g. setup or helper methods) and classes (even if all test methods are failing)
|
||||
are preserved.
|
||||
|
||||
|
||||
@@ -20,9 +20,7 @@ DIFF_MODIFIED_FILE_REGEX = r'--- a/(.*)'
|
||||
|
||||
@dataclass
|
||||
class TestSpec:
|
||||
"""
|
||||
A dataclass that represents a test specification for a single instance of SWE-bench.
|
||||
"""
|
||||
"""A dataclass that represents a test specification for a single instance of SWE-bench."""
|
||||
|
||||
instance_id: str
|
||||
id: str
|
||||
@@ -86,10 +84,7 @@ def make_test_setup(specs, env_name, repo_directory, includes_tox=False):
|
||||
|
||||
|
||||
def make_test_script_list(test_cmd, specs, env_name, repo_directory):
|
||||
"""
|
||||
Runs the tests.
|
||||
"""
|
||||
|
||||
"""Runs the tests."""
|
||||
includes_tox = 'tox' in test_cmd
|
||||
eval_commands = make_test_setup(specs, env_name, repo_directory, includes_tox)
|
||||
eval_commands += [
|
||||
@@ -104,10 +99,7 @@ def make_test_script_list(test_cmd, specs, env_name, repo_directory):
|
||||
|
||||
|
||||
def make_mutation_script_list(specs, env_name, repo_directory, mutation_timeout):
|
||||
"""
|
||||
Runs the tests.
|
||||
"""
|
||||
|
||||
"""Runs the tests."""
|
||||
eval_commands = make_test_setup(specs, env_name, repo_directory)
|
||||
eval_commands += [
|
||||
'cosmic-ray init mutation.toml mutation.sqlite',
|
||||
|
||||
@@ -11,8 +11,7 @@ from evaluation.benchmarks.testgeneval.constants import (
|
||||
|
||||
|
||||
def get_test_directives(instance: TestGenEvalInstance) -> list:
|
||||
"""
|
||||
Get test directives from the test_patch of a task instance
|
||||
"""Get test directives from the test_patch of a task instance
|
||||
|
||||
Args:
|
||||
instance (dict): task instance
|
||||
@@ -43,9 +42,7 @@ def get_test_directives(instance: TestGenEvalInstance) -> list:
|
||||
def load_testgeneval_dataset(
|
||||
name='kjain14/testgeneval', split='test', ids=None
|
||||
) -> list[TestGenEvalInstance]:
|
||||
"""
|
||||
Load SWE-bench dataset from Hugging Face Datasets or local .json/.jsonl file
|
||||
"""
|
||||
"""Load SWE-bench dataset from Hugging Face Datasets or local .json/.jsonl file"""
|
||||
# check that all instance IDs are in the dataset
|
||||
if ids:
|
||||
ids = set(ids)
|
||||
|
||||
@@ -24,9 +24,7 @@ class ActionType(Enum):
|
||||
|
||||
@dataclass
|
||||
class Selector:
|
||||
"""
|
||||
Represents either a direct anchor ID or a descriptive selector
|
||||
"""
|
||||
"""Represents either a direct anchor ID or a descriptive selector"""
|
||||
|
||||
value: str
|
||||
is_anchor: bool = False
|
||||
@@ -149,8 +147,7 @@ def find_matching_anchor(content: str, selector: str) -> str | None:
|
||||
|
||||
|
||||
def resolve_action(action: BrowserAction, content: str) -> BrowserAction:
|
||||
"""
|
||||
Resolve any descriptive selectors in the action to anchor IDs based on the content.
|
||||
"""Resolve any descriptive selectors in the action to anchor IDs based on the content.
|
||||
Returns a new action with resolved selectors.
|
||||
"""
|
||||
if isinstance(action, (InputAction, ClickAction)):
|
||||
@@ -174,8 +171,7 @@ def pre_login(
|
||||
save_screenshots=True,
|
||||
screenshots_dir='screenshots',
|
||||
):
|
||||
"""
|
||||
Logs in to all the websites that are needed for the evaluation.
|
||||
"""Logs in to all the websites that are needed for the evaluation.
|
||||
Once logged in, the sessions would be cached in the browser, so OpenHands
|
||||
agent doesn't need to log in to these websites again.
|
||||
"""
|
||||
|
||||
@@ -68,8 +68,7 @@ def get_config(
|
||||
|
||||
|
||||
def load_dependencies(runtime: Runtime) -> list[str]:
|
||||
"""
|
||||
Every task has a dependencies.yml file, which lists all the services that the
|
||||
"""Every task has a dependencies.yml file, which lists all the services that the
|
||||
task depends on. This function loads the file and returns all dependent service names.
|
||||
"""
|
||||
command = 'cat /utils/dependencies.yml'
|
||||
|
||||
@@ -11,9 +11,7 @@ import sys
|
||||
|
||||
|
||||
def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
|
||||
"""
|
||||
Calculate the cost of the model call.
|
||||
"""
|
||||
"""Calculate the cost of the model call."""
|
||||
if 'claude-3-5-sonnet' in model.lower():
|
||||
# https://www.anthropic.com/pricing#anthropic-api, accessed 12/11/2024
|
||||
return 0.000003 * prompt_tokens + 0.000015 * completion_tokens
|
||||
@@ -60,8 +58,7 @@ def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> fl
|
||||
|
||||
|
||||
def analyze_eval_json_file(filepath: str) -> tuple[int, int]:
|
||||
"""
|
||||
Analyze a single eval JSON file and extract the total and result from final_score.
|
||||
"""Analyze a single eval JSON file and extract the total and result from final_score.
|
||||
|
||||
Args:
|
||||
filepath: Path to the JSON file
|
||||
@@ -84,8 +81,7 @@ def analyze_eval_json_file(filepath: str) -> tuple[int, int]:
|
||||
|
||||
|
||||
def analyze_traj_json_file(filepath: str) -> tuple[int, float]:
|
||||
"""
|
||||
Analyze a single trajectory JSON file and extract the steps and tokens
|
||||
"""Analyze a single trajectory JSON file and extract the steps and tokens
|
||||
for each step. Then estimate the cost based on the tokens and the model type.
|
||||
Note: this is assuming there's no prompt caching at all.
|
||||
"""
|
||||
@@ -115,8 +111,7 @@ def analyze_traj_json_file(filepath: str) -> tuple[int, float]:
|
||||
def analyze_folder(
|
||||
folder_path: str,
|
||||
) -> tuple[dict[str, tuple[int, int]], dict[str, tuple[int, float]]]:
|
||||
"""
|
||||
Analyze all eval_*.json & traj_*.json files in the specified folder.
|
||||
"""Analyze all eval_*.json & traj_*.json files in the specified folder.
|
||||
|
||||
Args:
|
||||
folder_path: Path to the folder containing JSON files
|
||||
@@ -148,9 +143,7 @@ def analyze_folder(
|
||||
|
||||
|
||||
def get_task_nature_category(task_name: str) -> str:
|
||||
"""
|
||||
Get the nature category of the task.
|
||||
"""
|
||||
"""Get the nature category of the task."""
|
||||
task_nature = task_name.split('-')[0]
|
||||
if task_nature.lower() in ['sde', 'pm', 'ds', 'admin', 'hr', 'finance']:
|
||||
return task_nature
|
||||
@@ -159,8 +152,7 @@ def get_task_nature_category(task_name: str) -> str:
|
||||
|
||||
|
||||
def calculate_score(total: int, result: int) -> float:
|
||||
"""
|
||||
Calculate the score as a number between 0 and 1.
|
||||
"""Calculate the score as a number between 0 and 1.
|
||||
|
||||
Formula: score = (result / total) * 0.5 + (result // total) * 0.5
|
||||
Explanation:
|
||||
@@ -178,8 +170,7 @@ def calculate_score(total: int, result: int) -> float:
|
||||
|
||||
|
||||
def is_perfect_completion(total: int, result: int) -> bool:
|
||||
"""
|
||||
Check if the task achieved perfect completion.
|
||||
"""Check if the task achieved perfect completion.
|
||||
|
||||
Args:
|
||||
total: Total possible points
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
GPT performs line level generation prediction and truncates overly long tokens
|
||||
"""
|
||||
"""GPT performs line level generation prediction and truncates overly long tokens"""
|
||||
|
||||
import json
|
||||
import os
|
||||
@@ -56,8 +54,7 @@ def predict(content, model_name):
|
||||
|
||||
|
||||
def bulid_prompt(description, old_version, old_code, new_version) -> str:
|
||||
"""
|
||||
build prompt
|
||||
"""Build prompt
|
||||
:param version:
|
||||
:param description:
|
||||
:param masked_code:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
GPT performs line level generation prediction and truncates overly long tokens
|
||||
"""
|
||||
"""GPT performs line level generation prediction and truncates overly long tokens"""
|
||||
|
||||
import json
|
||||
import os
|
||||
@@ -56,8 +54,7 @@ def predict(content, model_name):
|
||||
|
||||
|
||||
def bulid_prompt(version, description) -> str:
|
||||
"""
|
||||
build prompt
|
||||
"""Build prompt
|
||||
:param version:
|
||||
:param description:
|
||||
:param masked_code:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
block completion
|
||||
"""
|
||||
"""block completion"""
|
||||
|
||||
import copy
|
||||
import gc
|
||||
@@ -79,8 +77,7 @@ def run_inference(model_name, origin_data_list):
|
||||
|
||||
|
||||
def bulid_prompt(version, description) -> str:
|
||||
"""
|
||||
build prompt
|
||||
"""Build prompt
|
||||
:param version:
|
||||
:param description:
|
||||
:param masked_code:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
code migration
|
||||
"""
|
||||
"""code migration"""
|
||||
|
||||
import copy
|
||||
import gc
|
||||
@@ -81,8 +79,7 @@ def run_inference(model_name, origin_data_list):
|
||||
|
||||
|
||||
def bulid_prompt(description, old_version, old_code, new_version) -> str:
|
||||
"""
|
||||
build prompt
|
||||
"""Build prompt
|
||||
:param version:
|
||||
:param description:
|
||||
:param masked_code:
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
"""
|
||||
评测block的预测能力
|
||||
"""评测block的预测能力
|
||||
1、判断是否包含正确的函数名
|
||||
2、判断是否合法
|
||||
3、计算ISM,和PM
|
||||
@@ -22,8 +21,7 @@ def is_code_valid(code):
|
||||
|
||||
|
||||
def longest_common_prefix_between_lists_with_elements(list1, list2):
|
||||
"""
|
||||
计算两个字符串列表中元素的最长前缀匹配长度
|
||||
"""计算两个字符串列表中元素的最长前缀匹配长度
|
||||
:param list1:
|
||||
:param list2:
|
||||
:return:
|
||||
@@ -46,8 +44,7 @@ def longest_common_prefix_between_lists_with_elements(list1, list2):
|
||||
|
||||
|
||||
def get_token(ans_code: str, output_code: str):
|
||||
"""
|
||||
对代码进行词法分析,分解成标识符,返回两个标识符列表
|
||||
"""对代码进行词法分析,分解成标识符,返回两个标识符列表
|
||||
:param ans_code:
|
||||
:param output_code:
|
||||
:return:
|
||||
@@ -94,8 +91,7 @@ def get_token(ans_code: str, output_code: str):
|
||||
|
||||
|
||||
def get_token_per_line(code: str):
|
||||
"""
|
||||
对每一行代码进行词法分析,记录每一行的标识符
|
||||
"""对每一行代码进行词法分析,记录每一行的标识符
|
||||
:param code: 代码字符串
|
||||
:return: 每一行的标识符列表组成的列表
|
||||
"""
|
||||
@@ -117,8 +113,7 @@ def get_token_per_line(code: str):
|
||||
|
||||
|
||||
def get_ISM(answer_code: str, model_output_list: list, answer_name: str) -> list:
|
||||
"""
|
||||
计算ISM,返回一个有序的得分列表
|
||||
"""计算ISM,返回一个有序的得分列表
|
||||
:return:
|
||||
"""
|
||||
score_list = []
|
||||
@@ -157,8 +152,7 @@ def get_ISM(answer_code: str, model_output_list: list, answer_name: str) -> list
|
||||
def get_ISM_without_verification(
|
||||
answer_code: str, model_output_list: list, answer_name: str
|
||||
) -> list:
|
||||
"""
|
||||
计算ISM,返回一个有序的得分列表
|
||||
"""计算ISM,返回一个有序的得分列表
|
||||
:return:
|
||||
"""
|
||||
score_list = []
|
||||
@@ -190,8 +184,7 @@ def get_ISM_without_verification(
|
||||
|
||||
|
||||
def longest_common_prefix_with_lengths(list1, list2):
|
||||
"""
|
||||
计算两个二维列表中每个子列表的最长前缀匹配长度,并记录拥有最长前缀匹配长度的两个子列表的长度
|
||||
"""计算两个二维列表中每个子列表的最长前缀匹配长度,并记录拥有最长前缀匹配长度的两个子列表的长度
|
||||
:param list1: 第一个二维列表
|
||||
:param list2: 第二个二维列表
|
||||
:return: 最长前缀匹配长度以及拥有最长前缀匹配长度的两个子列表的长度
|
||||
@@ -216,8 +209,7 @@ def longest_common_prefix_with_lengths(list1, list2):
|
||||
|
||||
|
||||
def get_PM(answer_code: str, model_output_list: list, answer_name: str) -> list:
|
||||
"""
|
||||
计算PM,返回一个有序的得分列表
|
||||
"""计算PM,返回一个有序的得分列表
|
||||
:return:
|
||||
"""
|
||||
score_list = []
|
||||
@@ -254,8 +246,7 @@ def get_PM(answer_code: str, model_output_list: list, answer_name: str) -> list:
|
||||
|
||||
|
||||
def get_score(score_list: list, k):
|
||||
"""
|
||||
计算score@n,k
|
||||
"""计算score@n,k
|
||||
:param score_list:
|
||||
:param k:
|
||||
:return:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
Calculate the cdc score for migration
|
||||
"""
|
||||
"""Calculate the cdc score for migration"""
|
||||
|
||||
import json
|
||||
import math
|
||||
@@ -11,8 +9,7 @@ import re
|
||||
|
||||
|
||||
def is_correct_parameter_count(function_name, correct_code, test_code):
|
||||
"""
|
||||
判断参数数量是否一致
|
||||
"""判断参数数量是否一致
|
||||
:param function_name:
|
||||
:param correct_code:
|
||||
:param test_code:
|
||||
@@ -43,8 +40,7 @@ def is_correct_parameter_count(function_name, correct_code, test_code):
|
||||
|
||||
|
||||
def check_keyword_parameters(function_name, correct_code, test_code):
|
||||
"""
|
||||
判断关键词参数赋值是否正确使用
|
||||
"""判断关键词参数赋值是否正确使用
|
||||
:param function_name:
|
||||
:param correct_code:
|
||||
:param test_code:
|
||||
@@ -82,8 +78,7 @@ def check_keyword_parameters(function_name, correct_code, test_code):
|
||||
|
||||
|
||||
def with_correct(answer_code: str, model_output: str) -> bool:
|
||||
"""
|
||||
当answer是with结构时,判断模型生成的是不是with结构
|
||||
"""当answer是with结构时,判断模型生成的是不是with结构
|
||||
:param answer_code:
|
||||
:param model_output:
|
||||
:return:
|
||||
@@ -105,9 +100,7 @@ def compute_block_score_k(
|
||||
core_line_in_core_block,
|
||||
core_line_in_output_clear,
|
||||
):
|
||||
"""
|
||||
cdc需要满足五个条件,em只需要满足第一个条件
|
||||
"""
|
||||
"""cdc需要满足五个条件,em只需要满足第一个条件"""
|
||||
c = 0
|
||||
n = len(model_output)
|
||||
for index, code in enumerate(model_output):
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
Calculate the cdc score for line and block
|
||||
"""
|
||||
"""Calculate the cdc score for line and block"""
|
||||
|
||||
import json
|
||||
import math
|
||||
@@ -19,8 +17,7 @@ def is_code_valid(code):
|
||||
|
||||
|
||||
def is_correct_parameter_count(function_name, correct_code, test_code):
|
||||
"""
|
||||
判断参数数量是否一致
|
||||
"""判断参数数量是否一致
|
||||
:param function_name:
|
||||
:param correct_code:
|
||||
:param test_code:
|
||||
@@ -51,8 +48,7 @@ def is_correct_parameter_count(function_name, correct_code, test_code):
|
||||
|
||||
|
||||
def check_keyword_parameters(function_name, correct_code, test_code):
|
||||
"""
|
||||
判断关键词参数赋值是否正确使用
|
||||
"""判断关键词参数赋值是否正确使用
|
||||
:param function_name:
|
||||
:param correct_code:
|
||||
:param test_code:
|
||||
@@ -90,8 +86,7 @@ def check_keyword_parameters(function_name, correct_code, test_code):
|
||||
|
||||
|
||||
def with_correct(answer_code: str, model_output: str) -> bool:
|
||||
"""
|
||||
当answer是with结构时,判断模型生成的是不是with结构
|
||||
"""当answer是with结构时,判断模型生成的是不是with结构
|
||||
:param answer_code:
|
||||
:param model_output:
|
||||
:return:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
Calculate the cdc score for line and block
|
||||
"""
|
||||
"""Calculate the cdc score for line and block"""
|
||||
|
||||
import json
|
||||
import math
|
||||
@@ -19,8 +17,7 @@ def is_code_valid(code):
|
||||
|
||||
|
||||
def is_correct_parameter_count(function_name, correct_code, test_code):
|
||||
"""
|
||||
判断参数数量是否一致
|
||||
"""判断参数数量是否一致
|
||||
:param function_name:
|
||||
:param correct_code:
|
||||
:param test_code:
|
||||
@@ -51,8 +48,7 @@ def is_correct_parameter_count(function_name, correct_code, test_code):
|
||||
|
||||
|
||||
def check_keyword_parameters(function_name, correct_code, test_code):
|
||||
"""
|
||||
判断关键词参数赋值是否正确使用
|
||||
"""判断关键词参数赋值是否正确使用
|
||||
:param function_name:
|
||||
:param correct_code:
|
||||
:param test_code:
|
||||
@@ -90,8 +86,7 @@ def check_keyword_parameters(function_name, correct_code, test_code):
|
||||
|
||||
|
||||
def with_correct(answer_code: str, model_output: str) -> bool:
|
||||
"""
|
||||
当answer是with结构时,判断模型生成的是不是with结构
|
||||
"""当answer是with结构时,判断模型生成的是不是with结构
|
||||
:param answer_code:
|
||||
:param model_output:
|
||||
:return:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
Find the line of code generated by the model using the block in the version code
|
||||
"""
|
||||
"""Find the line of code generated by the model using the block in the version code"""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
Find the line of code generated by the model using the block in the version code
|
||||
"""
|
||||
"""Find the line of code generated by the model using the block in the version code"""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
Clear the<start>and<end>generated by the model in inference
|
||||
"""
|
||||
"""Clear the<start>and<end>generated by the model in inference"""
|
||||
|
||||
import json
|
||||
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
node_modules
|
||||
outputs
|
||||
@@ -1,70 +0,0 @@
|
||||
# OpenHands - Regression Test Framework
|
||||
|
||||
OpenHands project is an open-source software engineering AI that can solve various software engineering tasks. This repository contains the regression test framework for OpenHands project.
|
||||
|
||||
## Running the Tests
|
||||
|
||||
To run the tests for OpenHands project, you can use the provided test runner script. Follow these steps:
|
||||
|
||||
1. Ensure you have Python 3.6 or higher installed on your system.
|
||||
2. Install the required dependencies by running the following command in your terminal:
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
3. Navigate to the root directory of the project.
|
||||
4. Run the test suite using the test runner script with the required arguments:
|
||||
```
|
||||
python evaluation/regression/run_tests.py --OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxx --model=gpt-4o
|
||||
```
|
||||
Replace `sk-xxxxxxxxxxxxxxxxxxxxxx` with your actual OpenAI API key. The default model is `gpt-4o`, but you can specify a different model if needed.
|
||||
|
||||
The test runner will discover and execute all the test cases in the `cases/` directory, and display the results of the test suite, including the status of each individual test case and the overall summary.
|
||||
|
||||
## Test Case Structure
|
||||
|
||||
The test cases for OpenHands project are organized in the `cases/` directory. Each test case has the following structure:
|
||||
|
||||
```
|
||||
cases/
|
||||
├── hello-world/
|
||||
│ ├── task.txt
|
||||
│ ├── outputs/
|
||||
│ │ └── codeact_agent/
|
||||
│ │ └── workspace/
|
||||
│ │ ├── hello_world.sh
|
||||
│ └── test_hello_world.py
|
||||
├── create_web_app/
|
||||
│ ├── task.txt
|
||||
│ ├── outputs/
|
||||
│ │ └── codeact_agent/
|
||||
│ │ └── workspace/
|
||||
│ │ ├── app.py
|
||||
│ │ ├── requirements.txt
|
||||
│ │ ├── static/
|
||||
│ │ └── templates/
|
||||
│ └── test_create_web_app.py
|
||||
└── ...
|
||||
```
|
||||
|
||||
- `task.txt`: This file contains the task description provided by the user.
|
||||
- `outputs/`: This directory contains the output generated by OpenHands for each agent.
|
||||
- `outputs/*/workspace/`: This directory contains the actual output files generated by OpenHands.
|
||||
- `test_*.py`: These are the test scripts that validate the output of OpenHands.
|
||||
|
||||
## Adding New Test Cases
|
||||
|
||||
To add a new test case to the regression test framework, follow the same steps as described in the previous sections.
|
||||
|
||||
## Customizing the Test Cases
|
||||
|
||||
The test cases can be customized by modifying the fixtures defined in the `conftest.py` file. The available fixtures are:
|
||||
|
||||
- `test_cases_dir`: The directory containing the test cases.
|
||||
- `task_file`: The path to the `task.txt` file for the current test case.
|
||||
- `workspace_dir`: The path to the `workspace/` directory for the current test case.
|
||||
- `model`: The model selected start the generation.
|
||||
- `run_test_case`: A fixture that runs OpenHands and generates the workspace for the current test case.
|
||||
|
||||
You can modify these fixtures to change the behavior of the test cases or add new ones as needed.
|
||||
|
||||
If you have any questions or need further assistance, feel free to reach out to the project maintainers.
|
||||
@@ -1 +0,0 @@
|
||||
Write an API server in node express which responds with a random number, and a frontend in React that displays the next number from the API
|
||||
@@ -1 +0,0 @@
|
||||
Write a simple hello world server in node Express
|
||||
@@ -1,2 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
echo "hello world"
|
||||
@@ -1 +0,0 @@
|
||||
Rewrite the script so that it prints the user's name, using the first argument. If there's no name, default to "world"
|
||||
@@ -1 +0,0 @@
|
||||
Write a bash script named "hello_world.sh" that prints "Hello, World!"
|
||||
@@ -1,20 +0,0 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from conftest import agents
|
||||
|
||||
|
||||
@pytest.mark.parametrize('agent', agents())
|
||||
def test_hello_world(task_file, run_test_case, agent):
|
||||
"""Test case for the "Hello, World!" Bash script using different agents."""
|
||||
# Run the test case for the specified agent
|
||||
workspace_dir = run_test_case(agent, 'hello-world')
|
||||
|
||||
# Validate the generated workspace
|
||||
assert os.path.exists(workspace_dir)
|
||||
assert os.path.isfile(os.path.join(workspace_dir, 'hello_world.sh'))
|
||||
|
||||
# Execute the hello_world.sh script
|
||||
os.chdir(workspace_dir)
|
||||
output = os.popen('bash hello_world.sh').read()
|
||||
assert output == 'Hello, World!\n'
|
||||
@@ -1,2 +0,0 @@
|
||||
def string_length(s):
|
||||
return len(s)
|
||||
@@ -1,2 +0,0 @@
|
||||
def to_lowercase(s):
|
||||
return s.lower()
|
||||
@@ -1,2 +0,0 @@
|
||||
def reverse_string(s):
|
||||
return s[::-1]
|
||||
@@ -1,7 +0,0 @@
|
||||
import random
|
||||
|
||||
|
||||
def scramble_string(s):
|
||||
s_list = list(s)
|
||||
random.shuffle(s_list)
|
||||
return ''.join(s_list)
|
||||
@@ -1,8 +0,0 @@
|
||||
def spongebob_case(s):
|
||||
result = ''
|
||||
for i, char in enumerate(s):
|
||||
if i % 2 == 0:
|
||||
result += char.lower()
|
||||
else:
|
||||
result += char.upper()
|
||||
return result
|
||||
@@ -1,2 +0,0 @@
|
||||
def to_uppercase(s):
|
||||
return s.upper()
|
||||
@@ -1,55 +0,0 @@
|
||||
import sys
|
||||
|
||||
|
||||
def print_help():
|
||||
help_text = """
|
||||
Usage: python string_cli.py <command> <string>
|
||||
|
||||
Commands:
|
||||
reverse - Reverses the input string.
|
||||
uppercase - Converts the input string to uppercase.
|
||||
lowercase - Converts the input string to lowercase.
|
||||
spongebob - Converts the input string to spongebob case.
|
||||
length - Returns the length of the input string.
|
||||
scramble - Randomly scrambles the characters in the input string.
|
||||
"""
|
||||
print(help_text)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) == 2 and sys.argv[1] == '--help':
|
||||
print_help()
|
||||
sys.exit(0)
|
||||
elif len(sys.argv) < 3:
|
||||
print('Usage: python string_cli.py <command> <string>')
|
||||
sys.exit(1)
|
||||
|
||||
command = sys.argv[1]
|
||||
input_string = sys.argv[2]
|
||||
|
||||
if command == 'reverse':
|
||||
from commands.reverse import reverse_string
|
||||
|
||||
print(reverse_string(input_string))
|
||||
elif command == 'uppercase':
|
||||
from commands.uppercase import to_uppercase
|
||||
|
||||
print(to_uppercase(input_string))
|
||||
elif command == 'lowercase':
|
||||
from commands.lowercase import to_lowercase
|
||||
|
||||
print(to_lowercase(input_string))
|
||||
elif command == 'spongebob':
|
||||
from commands.spongebob import spongebob_case
|
||||
|
||||
print(spongebob_case(input_string))
|
||||
elif command == 'length':
|
||||
from commands.length import string_length
|
||||
|
||||
print(string_length(input_string))
|
||||
elif command == 'scramble':
|
||||
from commands.scramble import scramble_string
|
||||
|
||||
print(scramble_string(input_string))
|
||||
else:
|
||||
print('Invalid command!')
|
||||
@@ -1 +0,0 @@
|
||||
Please rewrite the entire CLI in node.js
|
||||
@@ -1,2 +0,0 @@
|
||||
def string_length(s):
|
||||
return len(s)
|
||||
@@ -1,2 +0,0 @@
|
||||
def to_lowercase(s):
|
||||
return s.lower()
|
||||
@@ -1,2 +0,0 @@
|
||||
def reverse_string(s):
|
||||
return s[::-1]
|
||||
@@ -1,7 +0,0 @@
|
||||
import random
|
||||
|
||||
|
||||
def scramble_string(s):
|
||||
s_list = list(s)
|
||||
random.shuffle(s_list)
|
||||
return ''.join(s_list)
|
||||