Compare commits

..

105 Commits

Author SHA1 Message Date
openhands
a702360d3b fix: Avoid duplicate messages during browser search operations 2024-12-09 22:20:06 +00:00
tofarr
c872af4658 Doc: Added troubleshooting section for Nebulous docker errors (#5482) 2024-12-09 22:04:23 +00:00
OpenHands
99fa6c6a4a Fix issue #5186: [Bug]: Fix up inline code styles in chat window (#5226)
Co-authored-by: Graham Neubig <neubig@gmail.com>
Co-authored-by: amanape <83104063+amanape@users.noreply.github.com>
2024-12-09 16:33:25 -05:00
OpenHands
3946f813a4 Fix issue #5471: Resolver: LLM_MODEL should use "variable" instead of "secret" (#5477) 2024-12-09 16:08:45 -05:00
Engel Nyst
455e667739 add cost to summary (#5473) 2024-12-10 03:14:03 +08:00
Engel Nyst
2874041381 Fix stuck execution flow (#5458) 2024-12-08 22:39:32 +01:00
Engel Nyst
279e1d7abc Resolver minor tweaks (#5461) 2024-12-08 12:34:01 -05:00
Graham Neubig
a7e4a7aa63 Improve error message when issue/PR not found in resolver (#5455)
Co-authored-by: openhands <openhands@all-hands.dev>
2024-12-07 23:34:55 -05:00
Engel Nyst
2466d903df Update version (#5459) 2024-12-07 18:59:46 -05:00
Cheng Yang
424cdf121a Feat/better log: Add colorize function and TermColor enum for text coloring (#5410)
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2024-12-07 16:30:40 -05:00
Graham Neubig
6972f4806f Update resolver README.md to fix repo location (#5454) 2024-12-07 21:02:45 +00:00
Graham Neubig
78cc552e3a Fix syntax in external openhands-resolver.yml (#5453) 2024-12-07 20:46:20 +00:00
Graham Neubig
a241b9ff98 fix: Update frontend tests to support Node.js 22.x (#5444)
Co-authored-by: openhands <openhands@all-hands.dev>
2024-12-07 04:58:27 +01:00
Regis David Souza Mesquita
c757d7c613 Allows using the github-resolver without a PAT (#5278)
Co-authored-by: Rohit Malhotra <rohitvinodmalhotra@gmail.com>
2024-12-07 02:59:08 +00:00
Raj Maheshwari
2b06e4e5d0 [Feat] Custom MicroAgents. (#4983)
Co-authored-by: diwu-sf <di.wu@shadowfaxdata.com>
2024-12-06 17:11:06 -05:00
diwu-sf
cf157c86b3 rename socket.py to listen_socket.py to avoid circular import (#5373) 2024-12-06 20:13:41 +00:00
mamoodi
f2dc3663d7 Release 0.15.1 (#5437) 2024-12-06 14:02:29 -05:00
mamoodi
e4e3e4abb8 Revert "issue/4599-Add cursor position information on the bottom of the editor area" (#5440) 2024-12-06 18:16:28 +00:00
dependabot[bot]
22292f72cd chore(deps-dev): bump llama-index from 0.12.2 to 0.12.3 in the llama group (#5434)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-06 17:47:38 +01:00
Engel Nyst
f4ee3a4cb6 e2b take two (#5433) 2024-12-06 16:02:16 +00:00
STF-Zero
2df426732a issue/4599-Add cursor position information on the bottom of the editor area (#5379) 2024-12-06 15:42:15 +04:00
Engel Nyst
e81623110d Fix finish action (#5428) 2024-12-06 04:36:19 +01:00
tofarr
de81020a8d Feat: Introduce class for SessionInitData rather than using a dict (#5406) 2024-12-05 13:11:00 -07:00
Engel Nyst
1146b6248b Support multiline and default user messages (#5400) 2024-12-05 21:03:18 +01:00
tofarr
c3ddb26e43 Feat named imports (#5413) 2024-12-05 12:10:52 -07:00
dependabot[bot]
3d853f7db3 chore(deps-dev): bump chromadb from 0.5.20 to 0.5.23 in the chromadb group across 1 directory (#5420)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-05 18:44:04 +01:00
tofarr
027c642268 Fix duplicate events on reinit (#5424) 2024-12-05 10:09:53 -07:00
sp.wack
910b2a9b9e chore(frontend): Remove initial analytics modal and update waitlist modal (#5416) 2024-12-05 20:57:51 +04:00
Robert Brennan
ea96ffca9b fix messages (#5421) 2024-12-05 11:38:02 -05:00
sp.wack
7ec407dc50 chore(frontend): Update msw (#5367) 2024-12-05 18:53:50 +04:00
Graham Neubig
83b94786a3 docs: Update CodeAct agent documentation (#5418)
Co-authored-by: openhands <openhands@all-hands.dev>
2024-12-05 22:25:54 +08:00
dependabot[bot]
786cde39fd chore(deps): bump react-icons from 5.3.0 to 5.4.0 in /docs in the version-all group (#5404)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-05 10:57:47 +04:00
tofarr
ceb60b9a37 Prioritize version from pyproject.toml (#5412) 2024-12-04 21:34:07 +01:00
OpenHands
794408cd31 Fix issue #5383: [Bug]: LLM Cost is added to the metrics twice (#5396)
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2024-12-04 21:32:08 +01:00
tofarr
9aa89e8f2f Fix: Only send the last agent state changed event (#5411) 2024-12-04 19:18:47 +00:00
Engel Nyst
3314b97cb2 Fix e2b import (#5409) 2024-12-04 18:44:57 +00:00
Cheng Yang
8f47547b08 docs: fix markdown linting and broken links (#5401) 2024-12-05 01:28:04 +08:00
Ryan H. Tran
c5117bc48d Upgrade openhands-aci to v0.1.2 (#5397) 2024-12-05 01:25:24 +08:00
mamoodi
851d88593c Release 0.15.0 (#5402) 2024-12-04 10:08:22 -05:00
Xingyao Wang
9908e1b285 [Evaluation]: Log openhands version in eval output folder, instead of agent version (#5394) 2024-12-04 03:33:43 +00:00
Robert Brennan
793e142c4a Show all actions in the message window (#5190)
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Graham Neubig <neubig@gmail.com>
Co-authored-by: amanape <83104063+amanape@users.noreply.github.com>
2024-12-03 18:29:49 -05:00
Robert Brennan
d617f6f6d8 fix download zip (#5393)
Co-authored-by: mamoodi <mamoodiha@gmail.com>
2024-12-03 22:08:32 +00:00
sp.wack
438f19c80e fix(frontend): auth logic (#5390) 2024-12-03 14:25:16 -05:00
Robert Brennan
1b8104ba14 fix requests in error (#5389) 2024-12-03 17:38:08 +00:00
dependabot[bot]
f07a4c6074 chore(deps-dev): bump llama-index from 0.12.1 to 0.12.2 in the llama group (#5366)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-03 18:24:19 +01:00
tofarr
26a38fce98 Fix 401 on exit project (#5388) 2024-12-03 17:06:28 +00:00
Cheng Yang
05cc6d4fc3 docs: align docstrings with Google style (#5328) 2024-12-03 11:53:39 -05:00
sp.wack
43e074c3ed fix(frontend): Reduce delta calculating message rate and fix modal import (#5387) 2024-12-03 16:42:09 +00:00
tofarr
0dde1602c2 Feat: Multi tab support! (#5370)
Co-authored-by: Robert Brennan <accounts@rbren.io>
2024-12-03 09:25:39 -07:00
Xingyao Wang
d0b5dd3000 feat: display exact error for runtime requests exception handling (#5386) 2024-12-03 16:23:31 +00:00
Rohit Malhotra
bf2688de7e [Resolver][Bug]: Fix success list to str representation bug (#5351)
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2024-12-03 11:11:57 -05:00
Xingyao Wang
990f277132 misc: Support folder-level exp analysis for SWE-Bench summarize_outputs.py; Handle CrashLoopBackoff for RemoteRuntime (#5385) 2024-12-03 15:37:21 +00:00
Xingyao Wang
2f11634cca Add comprehensive analytics tracking (#5271)
Co-authored-by: openhands <openhands@all-hands.dev>
2024-12-03 02:02:07 +08:00
Mislav Balunovic
871c544b74 fix: asyncio issues with security analyzer + enable security analyzer in cli (#5356) 2024-12-02 21:57:37 +04:00
tofarr
92b38dcea1 Change the default value of keep_runtime_alive from True to False (#5288) 2024-12-02 09:56:41 -07:00
sp.wack
a378ff0965 chore(frontend): Migrate from Remix to React Router 7 (#5304) 2024-12-02 20:46:24 +04:00
sp.wack
5069a8700a feat(frontend): Integrate axios for client requests (#5255) 2024-12-02 16:34:30 +00:00
sp.wack
96c429df00 feat(frontend): Add default error handling to queries and mutations (#5360) 2024-12-02 20:08:24 +04:00
OpenHands
d96118af4d Fix issue #5363: [Bug]: Slack Invite Link Expired (#5364) 2024-12-02 23:09:43 +08:00
OpenHands
809b58de89 Fix issue #5086: [Bug]: resolver: Error finding issue with empty description (#5357)
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2024-12-02 09:23:16 -05:00
Engel Nyst
cd22817004 Switch dependency to browsergym-core (#5242)
Co-authored-by: openhands <openhands@all-hands.dev>
2024-12-02 08:52:02 -05:00
dependabot[bot]
4b633782e5 chore(deps-dev): bump typescript from 5.6.3 to 5.7.2 in /docs in the version-all group (#5298)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-02 12:00:17 +04:00
sp.wack
b9b6cfd406 refactor(frontend) Refactor and move components (#5290) 2024-12-02 05:47:02 +00:00
OpenHands
3e49f0f827 Fix issue #5277: [Bug]: AttributeError: 'EventStreamRuntime' object has no attribute 'sid' if runtime_extra_deps is not None (#5330)
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2024-12-01 17:05:01 -05:00
sp.wack
3e4220b66d chore: Handle eslint warnings (#5253) 2024-12-01 16:58:25 -05:00
Rohit Malhotra
64a7fef57e [Resolver]: Support custom sandbox (#5348) 2024-12-01 20:11:50 +00:00
OpenHands
59c57ac2f0 Fix issue #5263: [Bug]: resolver example should use "max_iterations: ${{ fromJson(vars.OPENHANDS_MAX_ITER || 50) }}" (#5349) 2024-12-01 14:48:50 -05:00
Graham Neubig
5672a317ac fix: Update frontend workflows to catch TypeScript errors (#5347)
Co-authored-by: openhands <openhands@all-hands.dev>
2024-12-01 17:45:54 +00:00
Graham Neubig
afc94a2f0c Microagent to fix issues with npm (#5314) 2024-12-01 10:47:29 -05:00
mamoodi
6a79f19c8f Clicking row should select file (#5312) 2024-12-01 10:46:16 -05:00
Cheng Yang
eb5f4f5ebc docs: add hyperlinks to directories and improve navigation (#5178)
Co-authored-by: Graham Neubig <neubig@gmail.com>
2024-12-01 09:26:44 -05:00
OpenHands
6ee9028d4a Fix issue #5337: [Bug]: lint-fix workflow is failing frontend and python steps (#5338)
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2024-11-30 19:18:13 -05:00
Rohit Malhotra
b156b237ec [Resolver] API Retry on guess success (#5187) 2024-11-30 12:53:26 -05:00
mamoodi
4c432d35e2 Fix slack link in docs (#5329) 2024-11-29 20:28:48 +01:00
Engel Nyst
7afdf0659e Update e2b (#5321) 2024-11-29 20:28:24 +01:00
tofarr
16a7dd52ae Fix: Session expired (#5305) 2024-11-29 09:08:47 -07:00
Engel Nyst
ea994b6209 More integration tests info (#5319) 2024-11-29 16:39:03 +01:00
ross
8f750de752 Add vscode url to runloop runtime (#5300) 2024-11-29 04:45:19 +01:00
mamoodi
59532c916e Place Configuration Options under Advanced Configuration (#5316) 2024-11-28 15:07:20 -05:00
Robert Brennan
3ac57a61a7 fix issue where message is none (#5307) 2024-11-28 02:02:52 +01:00
Cheng Yang
b808a639d9 docs: improve evaluation README with proper links and formatting (#5221) 2024-11-27 18:27:36 -05:00
S. Aniruddha
4374b4aba4 [feat(backend)] Alignment checker for browsing agent (#5105) 2024-11-27 22:27:04 +00:00
Xingyao Wang
4d3b035e00 feat(agent): add BrowseURLAction to CodeAct (produce markdown from URL) (#5285) 2024-11-27 21:55:57 +00:00
OpenHands
f0ca2239f3 Fix issue #5076: Integration test github action (#5077)
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2024-11-27 21:31:48 +01:00
Engel Nyst
082a55195f Add docker option, that's how many people run with (#5303) 2024-11-27 19:55:10 +00:00
Ryan H. Tran
9fab9ae8a6 Add fn call in response debug logging (#5301) 2024-11-27 20:29:35 +01:00
tofarr
1a06906743 Fix error message on invalid token (#5302) 2024-11-27 11:27:12 -07:00
tofarr
c70d160637 Fix for issue where exit session intermittently does not return to splash (#5291) 2024-11-27 15:30:39 +00:00
sp.wack
5d366129d1 refactor(frontend): App index route (mainly file explorer) (#5287) 2024-11-27 09:46:30 +04:00
mamoodi
9a96e9f1e4 Release 0.14.3 (#5289) 2024-11-26 15:17:23 -05:00
sp.wack
f4ef6ab50f refactor(frontend): app layout and event handler (#5279) 2024-11-26 23:56:37 +04:00
sp.wack
8fad6e6e36 refactor(frontend): Root layout route (#5275) 2024-11-26 23:46:07 +04:00
sp.wack
a8bb35eccb refactor(frontend): Root index route (#5276) 2024-11-26 22:06:46 +04:00
Robert Brennan
cac3b6d7f7 Refactor listen.py (#5281)
Co-authored-by: sp.wack <83104063+amanape@users.noreply.github.com>
2024-11-26 17:57:24 +00:00
tofarr
be6ca4a3ce Add event search endpoint with pagination and filtering (#4688)
Co-authored-by: AI Assistant <assistant@example.com>
2024-11-26 17:18:01 +00:00
Cheng Yang
71be744f2e Style/add return type hints (#5274) 2024-11-26 09:42:45 -07:00
Faraz Shamim
0aa4a7184f Fix Issue #3325 (#5004)
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2024-11-26 16:52:37 +01:00
Graham Neubig
12dd3352c5 Add remote runtime support to agent_bench (#5280)
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2024-11-26 13:45:49 +00:00
tofarr
c7d89713e8 Feat socket io (#5056)
Co-authored-by: Robert Brennan <accounts@rbren.io>
Co-authored-by: Robert Brennan <contact@rbren.io>
2024-11-26 00:12:28 +00:00
Xingyao Wang
0b05c296d8 chore(posthog): capture "push to" buttons (#5270) 2024-11-26 07:30:09 +08:00
Cheng Yang
3b18d77d31 Docs/improve agent controller docstrings (#5233) 2024-11-25 22:45:28 +01:00
Xingyao Wang
c9315d6447 chore: update demo video with newer interface + vscode (#5265) 2024-11-25 21:08:31 +00:00
OpenHands
6184b9d7f4 Fix issue #4820: [Bug]: litellm doesn't support function calling model from OpenRouter. bug cause codeactagent couldn't interact with internet solely without ask browser agent for help (#4822)
Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>
2024-11-25 16:26:27 +00:00
sp.wack
cd47100888 hotfix: Revert cache steps in Makefile (#5262) 2024-11-25 16:06:41 +00:00
OpenHands
d267c066e7 Fix issue #5179: [frontend]: Push to Github button should only push branch, but not creating a PR (#5181)
Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>
Co-authored-by: Graham Neubig <neubig@gmail.com>
2024-11-25 15:59:14 +00:00
414 changed files with 10876 additions and 26278 deletions

View File

@@ -1,4 +1,4 @@
name: Run Evaluation
name: Run SWE-Bench Evaluation
on:
pull_request:
@@ -58,24 +58,6 @@ jobs:
echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
echo "temperature = 0.0" >> config.toml
- name: Run integration test evaluation
env:
ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
RUNTIME: remote
SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
run: |
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES
# get evaluation report
REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
echo "REPORT_FILE: $REPORT_FILE"
echo "INTEGRATION_TEST_REPORT<<EOF" >> $GITHUB_ENV
cat $REPORT_FILE >> $GITHUB_ENV
echo >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
- name: Run SWE-Bench evaluation
env:
ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
@@ -143,9 +125,6 @@ jobs:
**SWE-Bench Evaluation Report**
${{ env.SWEBENCH_REPORT }}
---
**Integration Tests Evaluation Report**
${{ env.INTEGRATION_TEST_REPORT }}
---
You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).
- name: Post to a Slack channel

View File

@@ -24,7 +24,8 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
node-version: [20]
node-version: [20, 22]
fail-fast: true
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -35,6 +36,9 @@ jobs:
- name: Install dependencies
working-directory: ./frontend
run: npm ci
- name: Run TypeScript compilation
working-directory: ./frontend
run: npm run make-i18n && tsc
- name: Run tests and collect coverage
working-directory: ./frontend
run: npm run test:coverage

View File

@@ -291,7 +291,7 @@ jobs:
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \
RUN_AS_OPENHANDS=false \
poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
env:
@@ -368,7 +368,7 @@ jobs:
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \
RUN_AS_OPENHANDS=true \
poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
env:

158
.github/workflows/integration-runner.yml vendored Normal file
View File

@@ -0,0 +1,158 @@
name: Run Integration Tests
on:
pull_request:
types: [labeled]
workflow_dispatch:
inputs:
reason:
description: 'Reason for manual trigger'
required: true
default: ''
schedule:
- cron: '30 22 * * *' # Runs at 10:30pm UTC every day
env:
N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation
jobs:
run-integration-tests:
if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
runs-on: ubuntu-latest
permissions:
contents: "read"
id-token: "write"
pull-requests: "write"
issues: "write"
strategy:
matrix:
python-version: ["3.12"]
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install poetry via pipx
run: pipx install poetry
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: "poetry"
- name: Comment on PR if 'integration-test' label is present
if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
uses: KeisukeYamashita/create-comment@v1
with:
unique: false
comment: |
Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
- name: Install Python dependencies using Poetry
run: poetry install --without evaluation,llama-index
- name: Configure config.toml for testing with Haiku
env:
LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
run: |
echo "[llm.eval]" > config.toml
echo "model = \"$LLM_MODEL\"" >> config.toml
echo "api_key = \"$LLM_API_KEY\"" >> config.toml
echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
echo "temperature = 0.0" >> config.toml
- name: Build environment
run: make build
- name: Run integration test evaluation for Haiku
env:
SANDBOX_FORCE_REBUILD_RUNTIME: True
run: |
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'
# get integration tests report
REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
echo "REPORT_FILE: $REPORT_FILE_HAIKU"
echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> $GITHUB_ENV
cat $REPORT_FILE_HAIKU >> $GITHUB_ENV
echo >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
- name: Wait a little bit
run: sleep 10
- name: Configure config.toml for testing with DeepSeek
env:
LLM_MODEL: "litellm_proxy/deepseek-chat"
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
run: |
echo "[llm.eval]" > config.toml
echo "model = \"$LLM_MODEL\"" >> config.toml
echo "api_key = \"$LLM_API_KEY\"" >> config.toml
echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
echo "temperature = 0.0" >> config.toml
- name: Run integration test evaluation for DeepSeek
env:
SANDBOX_FORCE_REBUILD_RUNTIME: True
run: |
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'
# get integration tests report
REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> $GITHUB_ENV
cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV
echo >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
- name: Create archive of evaluation outputs
run: |
TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
cd evaluation/evaluation_outputs/outputs # Change to the outputs directory
tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* # Only include the actual result directories
- name: Upload evaluation results as artifact
uses: actions/upload-artifact@v4
id: upload_results_artifact
with:
name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
path: integration_tests_*.tar.gz
- name: Get artifact URLs
run: |
echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV
- name: Set timestamp and trigger reason
run: |
echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
else
echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV
fi
- name: Comment with results and artifact link
id: create_comment
uses: KeisukeYamashita/create-comment@v1
with:
# if triggered by PR, use PR number, otherwise use 5318 as fallback issue number for manual triggers
number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5318 }}
unique: false
comment: |
Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
Commit: ${{ github.sha }}
**Integration Tests Report (Haiku)**
Haiku LLM Test Results:
${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
---
**Integration Tests Report (DeepSeek)**
DeepSeek LLM Test Results:
${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
---
Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})

View File

@@ -5,9 +5,10 @@ on:
types: [labeled]
jobs:
lint-fix:
# Frontend lint fixes
lint-fix-frontend:
if: github.event.label.name == 'lint-fix'
name: Fix linting issues
name: Fix frontend linting issues
runs-on: ubuntu-latest
permissions:
contents: write
@@ -20,7 +21,6 @@ jobs:
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
# Frontend lint fixes
- name: Install Node.js 20
uses: actions/setup-node@v4
with:
@@ -34,7 +34,36 @@ jobs:
cd frontend
npm run lint:fix
# Python lint fixes
# Commit and push changes if any
- name: Check for changes
id: git-check
run: |
git diff --quiet || echo "changes=true" >> $GITHUB_OUTPUT
- name: Commit and push if there are changes
if: steps.git-check.outputs.changes == 'true'
run: |
git config --local user.email "openhands@all-hands.dev"
git config --local user.name "OpenHands Bot"
git add -A
git commit -m "🤖 Auto-fix frontend linting issues"
git push
# Python lint fixes
lint-fix-python:
if: github.event.label.name == 'lint-fix'
name: Fix Python linting issues
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
steps:
- uses: actions/checkout@v4
with:
ref: ${{ github.head_ref }}
repository: ${{ github.event.pull_request.head.repo.full_name }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Set up python
uses: actions/setup-python@v5
with:
@@ -58,5 +87,5 @@ jobs:
git config --local user.email "openhands@all-hands.dev"
git config --local user.name "OpenHands Bot"
git add -A
git commit -m "🤖 Auto-fix linting issues"
git commit -m "🤖 Auto-fix Python linting issues"
git push

View File

@@ -30,10 +30,11 @@ jobs:
run: |
cd frontend
npm install --frozen-lockfile
- name: Lint
- name: Lint and TypeScript compilation
run: |
cd frontend
npm run lint
npm run make-i18n && tsc
# Run lint on the python code
lint-python:

View File

@@ -16,17 +16,26 @@ on:
type: string
default: "main"
description: "Target branch to pull and create PR against"
LLM_MODEL:
required: false
type: string
default: "anthropic/claude-3-5-sonnet-20241022"
base_container_image:
required: false
type: string
default: ""
description: "Custom sandbox env"
secrets:
LLM_MODEL:
required: true
required: false
LLM_API_KEY:
required: true
LLM_BASE_URL:
required: false
PAT_TOKEN:
required: true
required: false
PAT_USERNAME:
required: true
required: false
issues:
types: [labeled]
@@ -101,13 +110,14 @@ jobs:
- name: Check required environment variables
env:
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
PAT_TOKEN: ${{ secrets.PAT_TOKEN }}
PAT_USERNAME: ${{ secrets.PAT_USERNAME }}
GITHUB_TOKEN: ${{ github.token }}
run: |
required_vars=("LLM_MODEL" "LLM_API_KEY" "PAT_TOKEN" "PAT_USERNAME")
required_vars=("LLM_MODEL" "LLM_API_KEY")
for var in "${required_vars[@]}"; do
if [ -z "${!var}" ]; then
echo "Error: Required environment variable $var is not set."
@@ -115,6 +125,19 @@ jobs:
fi
done
# Check optional variables and warn about fallbacks
if [ -z "$PAT_TOKEN" ]; then
echo "Warning: PAT_TOKEN is not set, falling back to GITHUB_TOKEN"
fi
if [ -z "$LLM_BASE_URL" ]; then
echo "Warning: LLM_BASE_URL is not set, will use default API endpoint"
fi
if [ -z "$PAT_USERNAME" ]; then
echo "Warning: PAT_USERNAME is not set, will use openhands-agent"
fi
- name: Set environment variables
run: |
if [ -n "${{ github.event.review.body }}" ]; then
@@ -138,7 +161,8 @@ jobs:
fi
echo "MAX_ITERATIONS=${{ inputs.max_iterations || 50 }}" >> $GITHUB_ENV
echo "SANDBOX_ENV_GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }}" >> $GITHUB_ENV
echo "SANDBOX_ENV_GITHUB_TOKEN=${{ secrets.PAT_TOKEN || github.token }}" >> $GITHUB_ENV
echo "SANDBOX_ENV_BASE_CONTAINER_IMAGE=${{ inputs.base_container_image }}" >> $GITHUB_ENV
# Set branch variables
echo "TARGET_BRANCH=${{ inputs.target_branch }}" >> $GITHUB_ENV
@@ -146,7 +170,7 @@ jobs:
- name: Comment on issue with start message
uses: actions/github-script@v7
with:
github-token: ${{secrets.GITHUB_TOKEN}}
github-token: ${{ secrets.PAT_TOKEN || github.token }}
script: |
const issueType = process.env.ISSUE_TYPE;
github.rest.issues.createComment({
@@ -171,9 +195,9 @@ jobs:
- name: Attempt to resolve issue
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_USERNAME: ${{ secrets.PAT_USERNAME }}
LLM_MODEL: ${{ secrets.LLM_MODEL }}
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN || github.token }}
GITHUB_USERNAME: ${{ secrets.PAT_USERNAME || 'openhands-agent' }}
LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
PYTHONPATH: ""
@@ -183,7 +207,7 @@ jobs:
--issue-number ${{ env.ISSUE_NUMBER }} \
--issue-type ${{ env.ISSUE_TYPE }} \
--max-iterations ${{ env.MAX_ITERATIONS }} \
--comment-id ${{ env.COMMENT_ID }} \
--comment-id ${{ env.COMMENT_ID }}
- name: Check resolution result
id: check_result
@@ -205,9 +229,9 @@ jobs:
- name: Create draft PR or push branch
if: always() # Create PR or branch even if the previous steps fail
env:
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
GITHUB_USERNAME: ${{ secrets.PAT_USERNAME }}
LLM_MODEL: ${{ secrets.LLM_MODEL }}
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN || github.token }}
GITHUB_USERNAME: ${{ secrets.PAT_USERNAME || 'openhands-agent' }}
LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
PYTHONPATH: ""
@@ -229,7 +253,7 @@ jobs:
uses: actions/github-script@v7
if: always() # Comment on issue even if the previous steps fail
with:
github-token: ${{secrets.GITHUB_TOKEN}}
github-token: ${{ secrets.PAT_TOKEN || github.token }}
script: |
const fs = require('fs');
const issueNumber = ${{ env.ISSUE_NUMBER }};

View File

@@ -21,14 +21,14 @@ There are many ways that you can contribute:
1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that may be ones to start on.
3. **Improve the Codebase** by sending [PRs](#sending-pull-requests-to-openhands) (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that may be ones to start on.
## What can I build?
Here are a few ways you can help improve the codebase.
#### UI/UX
We're always looking to improve the look and feel of the application. If you've got a small fix
for something that's bugging you, feel free to open up a PR that changes the `./frontend` directory.
for something that's bugging you, feel free to open up a PR that changes the [`./frontend`](./frontend) directory.
If you're looking to make a bigger change, add a new UI element, or significantly alter the style
of the application, please open an issue first, or better, join the #frontend channel in our Slack
@@ -46,7 +46,7 @@ We use the [SWE-bench](https://www.swebench.com/) benchmark to test our agent. Y
channel in Slack to learn more.
#### Adding a new agent
You may want to experiment with building new types of agents. You can add an agent to `openhands/agenthub`
You may want to experiment with building new types of agents. You can add an agent to [`openhands/agenthub`](./openhands/agenthub)
to help expand the capabilities of OpenHands.
#### Adding a new runtime
@@ -57,8 +57,8 @@ If you work for a company that provides a cloud-based runtime, you could help us
by implementing the [interface specified here](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/base.py).
#### Testing
When you write code, it is also good to write tests. Please navigate to the `tests` folder to see existing test suites.
At the moment, we have two kinds of tests: `unit` and `integration`. Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.
When you write code, it is also good to write tests. Please navigate to the [`./tests`](./tests) folder to see existing test suites.
At the moment, we have two kinds of tests: [`unit`](./tests/unit) and [`integration`](./evaluation/integration_tests). Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.
## Sending Pull Requests to OpenHands
@@ -103,7 +103,7 @@ Further, if you see an issue you like, please leave a "thumbs-up" or a comment,
### Making Pull Requests
We're generally happy to consider all PRs, with the evaluation process varying based on the type of change:
We're generally happy to consider all [PRs](https://github.com/All-Hands-AI/OpenHands/pulls), with the evaluation process varying based on the type of change:
#### For Small Improvements

View File

@@ -100,7 +100,7 @@ poetry run pytest ./tests/unit/test_*.py
To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by
setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.14-nikolaik`
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.15-nikolaik`
## Develop inside Docker container

View File

@@ -184,10 +184,6 @@ test:
@$(MAKE) -s test-frontend
build-frontend:
@echo "$(YELLOW)Cleaning TypeScript build cache...$(RESET)"
@cd frontend && npx tsc --build --clean
@echo "$(YELLOW)Cleaning Git cache for casing issues...$(RESET)"
@cd frontend && git rm -r --cached . && git add . && git commit -m "Fix Git cache" || echo "No changes to commit"
@echo "$(YELLOW)Building frontend...$(RESET)"
@cd frontend && npm run build

View File

@@ -12,7 +12,7 @@
<a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue"></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License"></a>
<br/>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2tom0er4l-JeNUGHt_AxpEfIBstbLPiw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2vbfigwev-G03twSpXaErwzYVD4CFiBg"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
<a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits"></a>
<br/>
@@ -38,16 +38,16 @@ See the [Installation](https://docs.all-hands.dev/modules/usage/installation) gu
system requirements and more information.
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik
docker run -it --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.14
docker.all-hands.dev/all-hands-ai/openhands:0.15
```
You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
@@ -82,7 +82,7 @@ troubleshooting resources, and advanced configuration options.
OpenHands is a community-driven project, and we welcome contributions from everyone. We do most of our communication
through Slack, so this is the best place to start, but we also are happy to have you contact us on Discord or Github:
- [Join our Slack workspace](https://join.slack.com/t/openhands-ai/shared_invite/zt-2tom0er4l-JeNUGHt_AxpEfIBstbLPiw) - Here we talk about research, architecture, and future development.
- [Join our Slack workspace](https://join.slack.com/t/openhands-ai/shared_invite/zt-2vbfigwev-G03twSpXaErwzYVD4CFiBg) - Here we talk about research, architecture, and future development.
- [Join our Discord server](https://discord.gg/ESHStjSjD4) - This is a community-run server for general discussion, questions, and feedback.
- [Read or post Github Issues](https://github.com/All-Hands-AI/OpenHands/issues) - Check out the issues we're working on, or add your own ideas.

View File

@@ -7,7 +7,7 @@ services:
image: openhands:latest
container_name: openhands-app-${DATE:-}
environment:
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.14-nikolaik}
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.15-nikolaik}
- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
ports:

View File

@@ -95,10 +95,10 @@ workspace_base = "./workspace"
# AWS secret access key
#aws_secret_access_key = ""
# API key to use
# API key to use (For Headless / CLI only - In Web this is overridden by Session Init)
api_key = "your-api-key"
# API base URL
# API base URL (For Headless / CLI only - In Web this is overridden by Session Init)
#base_url = ""
# API version
@@ -131,7 +131,7 @@ embedding_model = "local"
# Maximum number of output tokens
#max_output_tokens = 0
# Model to use
# Model to use. (For Headless / CLI only - In Web this is overridden by Session Init)
model = "gpt-4o"
# Number of retries to attempt when an operation fails with the LLM.
@@ -237,10 +237,10 @@ llm_config = 'gpt3'
##############################################################################
[security]
# Enable confirmation mode
# Enable confirmation mode (For Headless / CLI only - In Web this is overridden by Session Init)
#confirmation_mode = false
# The security analyzer to use
# The security analyzer to use (For Headless / CLI only - In Web this is overridden by Session Init)
#security_analyzer = ""
#################################### Eval ####################################

View File

@@ -11,7 +11,7 @@ services:
- BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
- SANDBOX_API_HOSTNAME=host.docker.internal
#
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.14-nikolaik}
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.15-nikolaik}
- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
ports:

View File

@@ -27,7 +27,7 @@ Pour plus de détails, veuillez consulter [ce document](https://github.com/All-H
Nous avons à la fois un espace de travail Slack pour la collaboration sur la construction d'OpenHands et un serveur Discord pour discuter de tout ce qui est lié, par exemple, à ce projet, LLM, agent, etc.
- [Espace de travail Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA)
- [Espace de travail Slack](https://join.slack.com/t/openhands-ai/shared_invite/zt-2vbfigwev-G03twSpXaErwzYVD4CFiBg)
- [Serveur Discord](https://discord.gg/ESHStjSjD4)
Si vous souhaitez contribuer, n'hésitez pas à rejoindre notre communauté. Simplifions ensemble l'ingénierie logicielle !

View File

@@ -27,7 +27,7 @@ OpenHands 是一个社区驱动的项目,我们欢迎每个人的贡献。无
我们有 Slack 工作区用于协作构建 OpenHands也有 Discord 服务器用于讨论任何相关的内容,例如此项目、大语言模型、代理等。
- [Slack 工作区](https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA)
- [Slack 工作区](https://join.slack.com/t/openhands-ai/shared_invite/zt-2vbfigwev-G03twSpXaErwzYVD4CFiBg)
- [Discord 服务器](https://discord.gg/ESHStjSjD4)
如果你想做出贡献,欢迎加入我们的社区。让我们一起简化软件工程!

View File

@@ -0,0 +1,465 @@
# Configuration Options
This guide details all configuration options available for OpenHands, helping you customize its behavior and integrate it with other services.
:::note
If you are running in [GUI Mode](https://docs.all-hands.dev/modules/usage/how-to/gui-mode), the settings available in the Settings UI will always
take precedence.
:::
---
# Table of Contents
1. [Core Configuration](#core-configuration)
- [API Keys](#api-keys)
- [Workspace](#workspace)
- [Debugging and Logging](#debugging-and-logging)
- [Session Management](#session-management)
- [Trajectories](#trajectories)
- [File Store](#file-store)
- [Task Management](#task-management)
- [Sandbox Configuration](#sandbox-configuration)
- [Miscellaneous](#miscellaneous)
2. [LLM Configuration](#llm-configuration)
- [AWS Credentials](#aws-credentials)
- [API Configuration](#api-configuration)
- [Custom LLM Provider](#custom-llm-provider)
- [Embeddings](#embeddings)
- [Message Handling](#message-handling)
- [Model Selection](#model-selection)
- [Retrying](#retrying)
- [Advanced Options](#advanced-options)
3. [Agent Configuration](#agent-configuration)
- [Microagent Configuration](#microagent-configuration)
- [Memory Configuration](#memory-configuration)
- [LLM Configuration](#llm-configuration-2)
- [ActionSpace Configuration](#actionspace-configuration)
- [Microagent Usage](#microagent-usage)
4. [Sandbox Configuration](#sandbox-configuration-2)
- [Execution](#execution)
- [Container Image](#container-image)
- [Networking](#networking)
- [Linting and Plugins](#linting-and-plugins)
- [Dependencies and Environment](#dependencies-and-environment)
- [Evaluation](#evaluation)
5. [Security Configuration](#security-configuration)
- [Confirmation Mode](#confirmation-mode)
- [Security Analyzer](#security-analyzer)
---
## Core Configuration
The core configuration options are defined in the `[core]` section of the `config.toml` file.
**API Keys**
- `e2b_api_key`
- Type: `str`
- Default: `""`
- Description: API key for E2B
- `modal_api_token_id`
- Type: `str`
- Default: `""`
- Description: API token ID for Modal
- `modal_api_token_secret`
- Type: `str`
- Default: `""`
- Description: API token secret for Modal
**Workspace**
- `workspace_base`
- Type: `str`
- Default: `"./workspace"`
- Description: Base path for the workspace
- `cache_dir`
- Type: `str`
- Default: `"/tmp/cache"`
- Description: Cache directory path
**Debugging and Logging**
- `debug`
- Type: `bool`
- Default: `false`
- Description: Enable debugging
- `disable_color`
- Type: `bool`
- Default: `false`
- Description: Disable color in terminal output
**Trajectories**
- `trajectories_path`
- Type: `str`
- Default: `"./trajectories"`
- Description: Path to store trajectories (can be a folder or a file). If it's a folder, the trajectories will be saved in a file named with the session id name and .json extension, in that folder.
**File Store**
- `file_store_path`
- Type: `str`
- Default: `"/tmp/file_store"`
- Description: File store path
- `file_store`
- Type: `str`
- Default: `"memory"`
- Description: File store type
- `file_uploads_allowed_extensions`
- Type: `list of str`
- Default: `[".*"]`
- Description: List of allowed file extensions for uploads
- `file_uploads_max_file_size_mb`
- Type: `int`
- Default: `0`
- Description: Maximum file size for uploads, in megabytes
- `file_uploads_restrict_file_types`
- Type: `bool`
- Default: `false`
- Description: Restrict file types for file uploads
- `file_uploads_allowed_extensions`
- Type: `list of str`
- Default: `[".*"]`
- Description: List of allowed file extensions for uploads
**Task Management**
- `max_budget_per_task`
- Type: `float`
- Default: `0.0`
- Description: Maximum budget per task (0.0 means no limit)
- `max_iterations`
- Type: `int`
- Default: `100`
- Description: Maximum number of iterations
**Sandbox Configuration**
- `workspace_mount_path_in_sandbox`
- Type: `str`
- Default: `"/workspace"`
- Description: Path to mount the workspace in the sandbox
- `workspace_mount_path`
- Type: `str`
- Default: `""`
- Description: Path to mount the workspace
- `workspace_mount_rewrite`
- Type: `str`
- Default: `""`
- Description: Path to rewrite the workspace mount path to. You can usually ignore this, it refers to special cases of running inside another container.
**Miscellaneous**
- `run_as_openhands`
- Type: `bool`
- Default: `true`
- Description: Run as OpenHands
- `runtime`
- Type: `str`
- Default: `"eventstream"`
- Description: Runtime environment
- `default_agent`
- Type: `str`
- Default: `"CodeActAgent"`
- Description: Name of the default agent
- `jwt_secret`
- Type: `str`
- Default: `uuid.uuid4().hex`
- Description: JWT secret for authentication. Please set it to your own value.
## LLM Configuration
The LLM (Large Language Model) configuration options are defined in the `[llm]` section of the `config.toml` file.
To use these with the docker command, pass in `-e LLM_<option>`. Example: `-e LLM_NUM_RETRIES`.
**AWS Credentials**
- `aws_access_key_id`
- Type: `str`
- Default: `""`
- Description: AWS access key ID
- `aws_region_name`
- Type: `str`
- Default: `""`
- Description: AWS region name
- `aws_secret_access_key`
- Type: `str`
- Default: `""`
- Description: AWS secret access key
**API Configuration**
- `api_key`
- Type: `str`
- Default: `None`
- Description: API key to use
- `base_url`
- Type: `str`
- Default: `""`
- Description: API base URL
- `api_version`
- Type: `str`
- Default: `""`
- Description: API version
- `input_cost_per_token`
- Type: `float`
- Default: `0.0`
- Description: Cost per input token
- `output_cost_per_token`
- Type: `float`
- Default: `0.0`
- Description: Cost per output token
**Custom LLM Provider**
- `custom_llm_provider`
- Type: `str`
- Default: `""`
- Description: Custom LLM provider
**Embeddings**
- `embedding_base_url`
- Type: `str`
- Default: `""`
- Description: Embedding API base URL
- `embedding_deployment_name`
- Type: `str`
- Default: `""`
- Description: Embedding deployment name
- `embedding_model`
- Type: `str`
- Default: `"local"`
- Description: Embedding model to use
**Message Handling**
- `max_message_chars`
- Type: `int`
- Default: `30000`
- Description: The approximate maximum number of characters in the content of an event included in the prompt to the LLM. Larger observations are truncated.
- `max_input_tokens`
- Type: `int`
- Default: `0`
- Description: Maximum number of input tokens
- `max_output_tokens`
- Type: `int`
- Default: `0`
- Description: Maximum number of output tokens
**Model Selection**
- `model`
- Type: `str`
- Default: `"claude-3-5-sonnet-20241022"`
- Description: Model to use
**Retrying**
- `num_retries`
- Type: `int`
- Default: `8`
- Description: Number of retries to attempt
- `retry_max_wait`
- Type: `int`
- Default: `120`
- Description: Maximum wait time (in seconds) between retry attempts
- `retry_min_wait`
- Type: `int`
- Default: `15`
- Description: Minimum wait time (in seconds) between retry attempts
- `retry_multiplier`
- Type: `float`
- Default: `2.0`
- Description: Multiplier for exponential backoff calculation
**Advanced Options**
- `drop_params`
- Type: `bool`
- Default: `false`
- Description: Drop any unmapped (unsupported) params without causing an exception
- `caching_prompt`
- Type: `bool`
- Default: `true`
- Description: Using the prompt caching feature if provided by the LLM and supported
- `ollama_base_url`
- Type: `str`
- Default: `""`
- Description: Base URL for the OLLAMA API
- `temperature`
- Type: `float`
- Default: `0.0`
- Description: Temperature for the API
- `timeout`
- Type: `int`
- Default: `0`
- Description: Timeout for the API
- `top_p`
- Type: `float`
- Default: `1.0`
- Description: Top p for the API
- `disable_vision`
- Type: `bool`
- Default: `None`
- Description: If model is vision capable, this option allows to disable image processing (useful for cost reduction)
## Agent Configuration
The agent configuration options are defined in the `[agent]` and `[agent.<agent_name>]` sections of the `config.toml` file.
**Microagent Configuration**
- `micro_agent_name`
- Type: `str`
- Default: `""`
- Description: Name of the micro agent to use for this agent
**Memory Configuration**
- `memory_enabled`
- Type: `bool`
- Default: `false`
- Description: Whether long-term memory (embeddings) is enabled
- `memory_max_threads`
- Type: `int`
- Default: `3`
- Description: The maximum number of threads indexing at the same time for embeddings
**LLM Configuration**
- `llm_config`
- Type: `str`
- Default: `'your-llm-config-group'`
- Description: The name of the LLM config to use
**ActionSpace Configuration**
- `function_calling`
- Type: `bool`
- Default: `true`
- Description: Whether function calling is enabled
- `codeact_enable_browsing`
- Type: `bool`
- Default: `false`
- Description: Whether browsing delegate is enabled in the action space (only works with function calling)
- `codeact_enable_llm_editor`
- Type: `bool`
- Default: `false`
- Description: Whether LLM editor is enabled in the action space (only works with function calling)
- `codeact_enable_jupyter`
- Type: `bool`
- Default: `false`
- Description: Whether Jupyter is enabled in the action space
**Microagent Usage**
- `use_microagents`
- Type: `bool`
- Default: `true`
- Description: Whether to use microagents at all
- `disabled_microagents`
- Type: `list of str`
- Default: `None`
- Description: A list of microagents to disable
## Sandbox Configuration
The sandbox configuration options are defined in the `[sandbox]` section of the `config.toml` file.
To use these with the docker command, pass in `-e SANDBOX_<option>`. Example: `-e SANDBOX_TIMEOUT`.
**Execution**
- `timeout`
- Type: `int`
- Default: `120`
- Description: Sandbox timeout in seconds
- `user_id`
- Type: `int`
- Default: `1000`
- Description: Sandbox user ID
**Container Image**
- `base_container_image`
- Type: `str`
- Default: `"nikolaik/python-nodejs:python3.12-nodejs22"`
- Description: Container image to use for the sandbox
**Networking**
- `use_host_network`
- Type: `bool`
- Default: `false`
- Description: Use host network
**Linting and Plugins**
- `enable_auto_lint`
- Type: `bool`
- Default: `false`
- Description: Enable auto linting after editing
- `initialize_plugins`
- Type: `bool`
- Default: `true`
- Description: Whether to initialize plugins
**Dependencies and Environment**
- `runtime_extra_deps`
- Type: `str`
- Default: `""`
- Description: Extra dependencies to install in the runtime image
- `runtime_startup_env_vars`
- Type: `dict`
- Default: `{}`
- Description: Environment variables to set at the launch of the runtime
**Evaluation**
- `browsergym_eval_env`
- Type: `str`
- Default: `""`
- Description: BrowserGym environment to use for evaluation
## Security Configuration
The security configuration options are defined in the `[security]` section of the `config.toml` file.
To use these with the docker command, pass in `-e SECURITY_<option>`. Example: `-e SECURITY_CONFIRMATION_MODE`.
**Confirmation Mode**
- `confirmation_mode`
- Type: `bool`
- Default: `false`
- Description: Enable confirmation mode
**Security Analyzer**
- `security_analyzer`
- Type: `str`
- Default: `""`
- Description: The security analyzer to use
---
> **Note**: Adjust configurations carefully, especially for memory, security, and network-related settings to ensure optimal performance and security.
Please note that the configuration options may be subject to change in future versions of OpenHands. It's recommended to refer to the official documentation for the most up-to-date information.

View File

@@ -50,7 +50,7 @@ LLM_API_KEY="sk_test_12345"
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -59,7 +59,7 @@ docker run -it \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.14 \
docker.all-hands.dev/all-hands-ai/openhands:0.15 \
python -m openhands.core.cli
```

View File

@@ -37,12 +37,15 @@ the [README for the OpenHands Resolver](https://github.com/All-Hands-AI/OpenHand
You can provide custom directions for OpenHands by following the [README for the resolver](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/resolver/README.md#providing-custom-instructions).
### Configure custom macro
### Custom configurations
To customize the default macro (`@openhands-agent`):
Github resolver will automatically check for valid [repository secrets](https://docs.github.com/en/actions/security-for-github-actions/security-guides/using-secrets-in-github-actions?tool=webui#creating-secrets-for-a-repository) or [repository variables](https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#creating-configuration-variables-for-a-repository) to customize its behavior. The customization options you can set are:
1. [Create a repository variable](https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#creating-configuration-variables-for-a-repository) named `OPENHANDS_MACRO`
2. Assign the variable a custom value
| **Attribute name** | **Type** | **Purpose** | **Example** |
| -------------------------------- | -------- | --------------------------------------------------------------------------------------------------- | ----------------------------------------------- |
| `OPENHANDS_MAX_ITER` | Variable | Set max limit for agent iterations | `OPENHANDS_MAX_ITER=10` |
| `OPENHANDS_MACRO` | Variable | Customize default macro for invoking the resolver | `OPENHANDS_MACRO=@resolveit` |
| `OPENHANDS_BASE_CONTAINER_IMAGE` | Variable | Custom Sandbox ([learn more](https://docs.all-hands.dev/modules/usage/how-to/custom-sandbox-guide)) | `OPENHANDS_BASE_CONTAINER_IMAGE="custom_image"` |
## Writing Effective .openhands_instructions Files
@@ -55,6 +58,7 @@ The `.openhands_instructions` file is a file that you can put in the root direct
2. **Repository Structure**: Explain the key directories and their purposes, especially highlighting where different types of code (e.g., frontend, backend) are located.
3. **Development Workflows**: Document the essential commands for:
- Building and setting up the project
- Running tests
- Linting and code quality checks
@@ -69,24 +73,29 @@ The `.openhands_instructions` file is a file that you can put in the root direct
```markdown
# Repository Overview
[Brief description of the project]
## General Setup
- Main build command
- Development environment setup
- Pre-commit checks
## Backend
- Location and structure
- Testing instructions
- Environment requirements
## Frontend
- Setup prerequisites
- Build and test commands
- Environment variables
## Additional Guidelines
- Code style requirements
- Special considerations
- Common workflows

View File

@@ -44,7 +44,7 @@ LLM_API_KEY="sk_test_12345"
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -54,6 +54,6 @@ docker run -it \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.14 \
docker.all-hands.dev/all-hands-ai/openhands:0.15 \
python -m openhands.core.main -t "write a bash script that prints hi"
```

View File

@@ -11,16 +11,16 @@
The easiest way to run OpenHands is in Docker.
```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik
docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.14
docker.all-hands.dev/all-hands-ai/openhands:0.15
```
You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), or using the [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action).

View File

@@ -16,7 +16,7 @@ some flags being passed to `docker run` that make this possible:
```
docker run # ...
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.11-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
-v /var/run/docker.sock:/var/run/docker.sock \
# ...
```

View File

@@ -17,6 +17,7 @@ Check out [Notes for WSL on Windows Users](troubleshooting/windows) for some tro
* [`make build` getting stuck on package installations](#make-build-getting-stuck-on-package-installations)
* [Sessions are not restored](#sessions-are-not-restored)
* [Connection to host.docker.internal timed out](#connection-to-host-docker-internal-timed-out)
* [Error building runtime docker image](#error-building-runtime-docker-image)
### Unable to connect to Docker
@@ -178,3 +179,21 @@ which OpenHands makes use of when the main server is running inside a docker con
* [Install Docker Desktop](https://www.docker.com/products/docker-desktop/)
* Run OpenHands in [Development Mode](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md),
So that the main server is not run inside a container, but still creates dockerized runtime sandboxes.
---
### Error building runtime docker image
**Symptoms**
Attempts to start a new session fail, and an errors with terms like the following appear in the logs:
* `debian-security bookworm-security`
* `InRelease At least one invalid signature was encountered.`
This seems to happen when the hash of an existing external library changes and your local docker instance has
cached a previous version. To work around this, please try the following:
* Stop any containers where the name has the prefix `openhands-runtime-` :
`docker ps --filter name=openhands-runtime- --filter status=running -aq | xargs docker stop`
* Remove any containers where the name has the prefix `openhands-runtime-` :
`docker rmi $(docker images --filter name=openhands-runtime- -q --no-trunc)`
* Stop and Remove any containers / images where the name has the prefix `openhands-runtime-`
* Prune containers / images : `docker container prune -f && docker image prune -f`

18
docs/package-lock.json generated
View File

@@ -17,14 +17,14 @@
"prism-react-renderer": "^2.4.0",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-icons": "^5.3.0",
"react-icons": "^5.4.0",
"react-use": "^17.5.1"
},
"devDependencies": {
"@docusaurus/module-type-aliases": "^3.5.1",
"@docusaurus/tsconfig": "^3.6.3",
"@docusaurus/types": "^3.5.1",
"typescript": "~5.6.3"
"typescript": "~5.7.2"
},
"engines": {
"node": ">=18.0"
@@ -15155,9 +15155,10 @@
}
},
"node_modules/react-icons": {
"version": "5.3.0",
"resolved": "https://registry.npmjs.org/react-icons/-/react-icons-5.3.0.tgz",
"integrity": "sha512-DnUk8aFbTyQPSkCfF8dbX6kQjXA9DktMeJqfjrg6cK9vwQVMxmcA3BfP4QoiztVmEHtwlTgLFsPuH2NskKT6eg==",
"version": "5.4.0",
"resolved": "https://registry.npmjs.org/react-icons/-/react-icons-5.4.0.tgz",
"integrity": "sha512-7eltJxgVt7X64oHh6wSWNwwbKTCtMfK35hcjvJS0yxEAhPM8oUKdS3+kqaW1vicIltw+kR2unHaa12S9pPALoQ==",
"license": "MIT",
"peerDependencies": {
"react": "*"
}
@@ -16985,9 +16986,10 @@
}
},
"node_modules/typescript": {
"version": "5.6.3",
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.3.tgz",
"integrity": "sha512-hjcS1mhfuyi4WW8IWtjP7brDrG2cuDZukyrYrSauoXGNgx0S7zceP07adYkJycEr56BOUTNPzbInooiN3fn1qw==",
"version": "5.7.2",
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.7.2.tgz",
"integrity": "sha512-i5t66RHxDvVN40HfDd1PsEThGNnlMCMT3jMUuoh9/0TaqWevNontacunWyN02LA9/fIbEWlcHZcgTKb9QoaLfg==",
"license": "Apache-2.0",
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"

View File

@@ -24,14 +24,14 @@
"prism-react-renderer": "^2.4.0",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-icons": "^5.3.0",
"react-icons": "^5.4.0",
"react-use": "^17.5.1"
},
"devDependencies": {
"@docusaurus/module-type-aliases": "^3.5.1",
"@docusaurus/tsconfig": "^3.6.3",
"@docusaurus/types": "^3.5.1",
"typescript": "~5.6.3"
"typescript": "~5.7.2"
},
"browserslist": {
"production": [

View File

@@ -100,6 +100,11 @@ const sidebars: SidebarsConfig = {
label: 'Runtime Configuration',
id: 'usage/runtimes',
},
{
type: 'doc',
label: 'Configuration Options',
id: 'usage/configuration-options',
},
{
type: 'doc',
label: 'Custom Sandbox',

View File

@@ -8,7 +8,7 @@ function CustomFooter() {
<footer className="custom-footer">
<div className="footer-content">
<div className="footer-icons">
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA" target="_blank" rel="noopener noreferrer">
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2vbfigwev-G03twSpXaErwzYVD4CFiBg" target="_blank" rel="noopener noreferrer">
<FaSlack />
</a>
<a href="https://discord.gg/ESHStjSjD4" target="_blank" rel="noopener noreferrer">

View File

@@ -23,7 +23,7 @@ export function HomepageHeader() {
<a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" /></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License" /></a>
<br/>
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2vbfigwev-G03twSpXaErwzYVD4CFiBg"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
<a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community" /></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits" /></a>
<br/>

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -6,9 +6,9 @@ This folder contains code and resources to run experiments and evaluations.
### Setup
Before starting evaluation, follow the instructions here [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to setup your local development environment and LLM.
Before starting evaluation, follow the instructions [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to setup your local development environment and LLM.
Once you are done with setup, you can follow the benchmark-specific instructions in each subdirectory of the evaluation directory.
Once you are done with setup, you can follow the benchmark-specific instructions in each subdirectory of the [evaluation directory](#supported-benchmarks).
Generally these will involve running `run_infer.py` to perform inference with the agents.
### Implementing and Evaluating an Agent
@@ -42,7 +42,7 @@ temperature = 0.0
## Supported Benchmarks
The OpenHands evaluation harness supports a wide variety of benchmarks across software engineering, web browsing, and miscellaneous assistance tasks.
The OpenHands evaluation harness supports a wide variety of benchmarks across [software engineering](#software-engineering), [web browsing](#web-browsing), and [miscellaneous assistance](#misc-assistance) tasks.
### Software Engineering
@@ -83,7 +83,7 @@ You can start your own fork of [our huggingface evaluation outputs](https://hugg
To learn more about how to integrate your benchmark into OpenHands, check out [tutorial here](https://docs.all-hands.dev/modules/usage/how-to/evaluation-harness). Briefly,
- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/benchmarks/swe_bench` should contain
- Each subfolder contains a specific benchmark or experiment. For example, [`evaluation/benchmarks/swe_bench`](./benchmarks/swe_bench) should contain
all the preprocessing/evaluation/analysis scripts.
- Raw data and experimental records should not be stored within this repo.
- For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.

View File

@@ -4,12 +4,10 @@ This folder contains evaluation harness for evaluating agents on the Entity-dedu
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Start the evaluation
```bash
export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation)
./evaluation/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit]
@@ -37,7 +35,8 @@ For example,
```
## Reference
```
```bibtex
@inproceedings{zhang2023entity,
title={Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games},
author={Zhang, Yizhe and Lu, Jiarui and Jaitly, Navdeep},

View File

@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
if [ -z "$DATASET" ]; then
echo "Dataset not specified, use default 'things'"
@@ -34,12 +34,9 @@ if [ -z "$OPENAI_API_KEY" ]; then
exit 1
fi
# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
# We need to track the version of Agent in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
@@ -51,7 +48,7 @@ COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
--max-iterations 20 \
--OPENAI_API_KEY $OPENAI_API_KEY \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${DATASET}"
--eval-note ${OPENHANDS_VERSION}_${DATASET}"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -4,7 +4,7 @@ This folder contains evaluation harness for evaluating agents on the [AgentBench
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Start the evaluation
@@ -36,3 +36,21 @@ You can update the arguments in the script `evaluation/benchmarks/agent_bench/sc
```bash
./evaluation/benchmarks/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1
```
## Run with Remote Runtime (experimental)
You can run the evaluation using a remote runtime instead of a local Docker container. This is useful when you want to run the evaluation in a cloud environment or when you don't have Docker installed locally.
To use the remote runtime, set the following environment variables:
```bash
# Required environment variables
export ALLHANDS_API_KEY="your-api-key" # Contact the team to get an API key
export RUNTIME=remote
export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
# Run the evaluation
./evaluation/benchmarks/agent_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 1
```
The remote runtime will build a container image and run the evaluation in a cloud environment. The results will be saved locally in the same way as when running with a local runtime.

View File

@@ -43,12 +43,16 @@ def get_config(
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='eventstream',
runtime=os.environ.get('RUNTIME', 'eventstream'),
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
base_container_image='python:3.12-bookworm',
base_container_image='python:3.12-slim',
enable_auto_lint=True,
use_host_network=False,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_init_timeout=3600,
),
# do not mount workspace
workspace_base=None,

View File

@@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
@@ -31,7 +31,7 @@ COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poe
--llm-config $MODEL_CONFIG \
--max-iterations 30 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -10,7 +10,7 @@ Hugging Face dataset based on the
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local
Please follow instruction [here](../../README.md#setup) to setup your local
development environment and LLM.
## Start the evaluation

View File

@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE=$AGENT_VERSION
EVAL_NOTE=$OPENHANDS_VERSION
# Default to NOT use unit tests.
if [ -z "$USE_UNIT_TESTS" ]; then

View File

@@ -4,13 +4,14 @@ Implements evaluation of agents on BioCoder from the BioCoder benchmark introduc
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## BioCoder Docker Image
In the openhands branch of the Biocoder repository, we have slightly modified our original Docker image to work with the OpenHands environment. In the Docker image are testing scripts (`/testing/start_test_openhands.py` and aux files in `/testing_files/`) to assist with evaluation. Additionally, we have installed all dependencies, including OpenJDK, mamba (with Python 3.6), and many system libraries. Notably, we have **not** packaged all repositories into the image, so they are downloaded at runtime.
**Before first execution, pull our Docker image with the following command**
```bash
docker pull public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0
```
@@ -19,7 +20,6 @@ To reproduce this image, please see the Dockerfile_Openopenhands in the `biocode
## Start the evaluation
```bash
./evaluation/benchmarks/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
```
@@ -47,7 +47,8 @@ with current OpenHands version, then your command would be:
```
## Reference
```
```bibtex
@misc{tang2024biocoder,
title={BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models},
author={Xiangru Tang and Bill Qian and Rick Gao and Jiakang Chen and Xinyun Chen and Mark Gerstein},

View File

@@ -21,10 +21,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
@@ -33,7 +33,7 @@ COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${DATASET}"
--eval-note ${OPENHANDS_VERSION}_${DATASET}"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

File diff suppressed because one or more lines are too long

View File

@@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
@@ -31,7 +31,7 @@ COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
--llm-config $MODEL_CONFIG \
--max-iterations 5 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION" \
--eval-note $OPENHANDS_VERSION" \
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -7,7 +7,7 @@ If so, the browsing performance upper-bound of CodeActAgent will be the performa
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Run Inference

View File

@@ -20,13 +20,13 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"
COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \
--agent-cls $AGENT \

View File

@@ -4,19 +4,18 @@ This folder contains the evaluation harness that we built on top of the original
The evaluation consists of three steps:
1. Environment setup: [install python environment](../README.md#development-environment), [configure LLM config](../README.md#configure-openhands-and-your-llm).
1. Environment setup: [install python environment](../../README.md#development-environment), [configure LLM config](../../README.md#configure-openhands-and-your-llm).
2. [Run Evaluation](#run-inference-on-commit0-instances): Generate a edit patch for each Commit0 Repo, and get the evaluation results
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## OpenHands Commit0 Instance-level Docker Support
OpenHands supports using the Commit0 Docker for **[inference](#run-inference-on-commit0-instances).
This is now the default behavior.
## Run Inference on Commit0 Instances
Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the Commit0 set you are running on) for the [instance-level docker image](#openhands-commit0-instance-level-docker-support).

View File

@@ -61,10 +61,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "HF SPLIT: $SPLIT"
@@ -75,7 +75,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
export USE_HINT_TEXT=false
fi
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"
# if not using Hint, add -no-hint to the eval note
if [ "$USE_HINT_TEXT" = false ]; then
EVAL_NOTE="$EVAL_NOTE-no-hint"

View File

@@ -23,10 +23,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
@@ -35,7 +35,7 @@ COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
--max-iterations 10 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -4,9 +4,10 @@ This folder contains evaluation harness for evaluating agents on the [GAIA bench
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Run the evaluation
We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA).
Please accept the terms and make sure to have logged in on your computer by `huggingface-cli login` before running the evaluation.
@@ -41,6 +42,7 @@ For example,
## Get score
Then you can get stats by running the following command:
```bash
python ./evaluation/benchmarks/gaia/get_score.py \
--file <path_to/output.json>

View File

@@ -21,17 +21,17 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
if [ -z "$LEVELS" ]; then
LEVELS="2023_level1"
echo "Levels not specified, use default $LEVELS"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "LEVELS: $LEVELS"
@@ -42,7 +42,7 @@ COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
--level $LEVELS \
--data-split validation \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -4,7 +4,7 @@ This folder contains evaluation harness we built on top of the original [Gorilla
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Run Inference on APIBench Instances

View File

@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
if [ -z "$HUBS" ]; then
HUBS="hf,torch,tf"
@@ -29,7 +29,7 @@ if [ -z "$HUBS" ]; then
fi
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "HUBS: $HUBS"
@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \
--hubs $HUBS \
--data-split validation \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -3,6 +3,7 @@
Implements the evaluation of agents on the GPQA benchmark introduced in [GPQA: A Graduate-Level Google-Proof Q&A Benchmark](https://arxiv.org/abs/2308.07124).
This code implements the evaluation of agents on the GPQA Benchmark with Open Book setting.
- The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web.
- Even experts in the corresponding domains achieve only 65% accuracy.
- State-of-the-art AI systems achieve only 39% accuracy on this challenging dataset.
@@ -11,20 +12,24 @@ This code implements the evaluation of agents on the GPQA Benchmark with Open Bo
Accurate solving of above graduate level questions would require both tool use (e.g., python for calculations) and web-search for finding related facts as information required for the questions might not be part of the LLM knowledge / training data.
Further references:
- https://arxiv.org/pdf/2311.12022
- https://paperswithcode.com/dataset/gpqa
- https://github.com/idavidrein/gpqa
- <https://arxiv.org/pdf/2311.12022>
- <https://paperswithcode.com/dataset/gpqa>
- <https://github.com/idavidrein/gpqa>
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Run Inference on GPQA Benchmark
'gpqa_main', 'gqpa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options
From the root of the OpenHands repo, run the following command:
```bash
./evaluation/benchmarks/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass]
```
You can replace `model_config_name` with any model you set up in `config.toml`.
- `model_config_name`: The model configuration name from `config.toml` that you want to evaluate.

View File

@@ -27,10 +27,10 @@ if [ -z "$DATA_SPLIT" ]; then
DATA_SPLIT="gpqa_diamond"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
@@ -39,7 +39,7 @@ COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--data-split $DATA_SPLIT \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -4,7 +4,7 @@ Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Run Inference on HumanEvalFix
@@ -14,13 +14,11 @@ Please follow instruction [here](../README.md#setup) to setup your local develop
You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`.
## Examples
For each problem, OpenHands is given a set number of iterations to fix the failing code. The history field shows each iteration's response to correct its code that fails any test case.
```
```json
{
"task_id": "Python/2",
"instruction": "Please fix the function in Python__2.py such that all test cases pass.\nEnvironment has been set up for you to start working. You may assume all necessary tools are installed.\n\n# Problem Statement\ndef truncate_number(number: float) -> float:\n return number % 1.0 + 1.0\n\n\n\n\n\n\ndef check(truncate_number):\n assert truncate_number(3.5) == 0.5\n assert abs(truncate_number(1.33) - 0.33) < 1e-6\n assert abs(truncate_number(123.456) - 0.456) < 1e-6\n\ncheck(truncate_number)\n\nIMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\nYou should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\nYou SHOULD INCLUDE PROPER INDENTATION in your edit commands.\nWhen you think you have fixed the issue through code changes, please finish the interaction using the "finish" tool.\n",

View File

@@ -58,10 +58,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
@@ -69,7 +69,7 @@ COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -4,9 +4,10 @@ This folder contains evaluation harness for evaluating agents on the logic reaso
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Run Inference on logic_reasoning
The following code will run inference on the first example of the ProofWriter dataset,
```bash

View File

@@ -28,10 +28,10 @@ if [ -z "$DATASET" ]; then
DATASET="ProofWriter"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
--dataset $DATASET \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -4,7 +4,7 @@ This folder contains evaluation for [MiniWoB++](https://miniwob.farama.org/) ben
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Test if your environment works
@@ -42,7 +42,6 @@ poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/e
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
## BrowsingAgent V1.0 result
Tested on BrowsingAgent V1.0

View File

@@ -25,13 +25,13 @@ if [ -z "$AGENT" ]; then
AGENT="BrowsingAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
EVAL_NOTE="${OPENHANDS_VERSION}_${NOTE}"
COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \
--agent-cls $AGENT \

View File

@@ -18,10 +18,10 @@ checkout_eval_branch
# Only 'CodeActAgent' is supported for MINT now
AGENT="CodeActAgent"
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
export PYTHONPATH=$(pwd)

View File

@@ -12,7 +12,7 @@ For more details on the ML-Bench task and dataset, please refer to the paper: [M
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Run Inference on ML-Bench

View File

@@ -26,10 +26,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
@@ -37,7 +37,7 @@ COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -1,10 +1,10 @@
# ScienceAgentBench Evaluation with OpenHands
This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: https://arxiv.org/abs/2410.05080).
This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: <https://arxiv.org/abs/2410.05080>).
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Setup ScienceAgentBench
@@ -45,6 +45,7 @@ After the inference is completed, you may use the following command to extract n
```bash
python post_proc.py [log_fname]
```
- `log_fname`, e.g. `evaluation/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent.
Output will be write to e.g. `evaluation/.../output.converted.jsonl`

View File

@@ -26,10 +26,10 @@ if [ -z "$USE_KNOWLEDGE" ]; then
USE_KNOWLEDGE=false
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \
@@ -38,7 +38,7 @@ COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py
--use_knowledge $USE_KNOWLEDGE \
--max-iterations 30 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION" \
--eval-note $OPENHANDS_VERSION" \
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -6,20 +6,19 @@ This folder contains the evaluation harness that we built on top of the original
The evaluation consists of three steps:
1. Environment setup: [install python environment](../README.md#development-environment), [configure LLM config](../README.md#configure-openhands-and-your-llm), and [pull docker](#openhands-swe-bench-instance-level-docker-support).
1. Environment setup: [install python environment](../../README.md#development-environment), [configure LLM config](../../README.md#configure-openhands-and-your-llm), and [pull docker](#openhands-swe-bench-instance-level-docker-support).
2. [Run inference](#run-inference-on-swe-bench-instances): Generate a edit patch for each Github issue
3. [Evaluate patches using SWE-Bench docker](#evaluate-generated-patches)
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## OpenHands SWE-Bench Instance-level Docker Support
OpenHands now support using the [official evaluation docker](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md) for both **[inference](#run-inference-on-swe-bench-instances) and [evaluation](#evaluate-generated-patches)**.
This is now the default behavior.
## Run Inference on SWE-Bench Instances
Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the SWE-Bench set you are running on) for the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support).
@@ -52,7 +51,8 @@ default, it is set to 1.
- `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`.
There are also two optional environment variables you can set.
```
```bash
export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Default to false. Ignore this if you are not sure.
export USE_INSTANCE_IMAGE=true # if you want to use instance-level docker images. Default to true
```
@@ -127,6 +127,7 @@ With `output.jsonl` file, you can run `eval_infer.sh` to evaluate generated patc
**This evaluation is performed using the official dockerized evaluation announced [here](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md).**
> If you want to evaluate existing results, you should first run this to clone existing outputs
>
>```bash
>git clone https://huggingface.co/spaces/OpenHands/evaluation evaluation/evaluation_outputs
>```
@@ -143,6 +144,7 @@ Then you can run the following:
```
The script now accepts optional arguments:
- `instance_id`: Specify a single instance to evaluate (optional)
- `dataset_name`: The name of the dataset to use (default: `"princeton-nlp/SWE-bench_Lite"`)
- `split`: The split of the dataset to use (default: `"test"`)
@@ -179,7 +181,6 @@ To clean-up all existing runtimes that you've already started, run:
ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh
```
## Visualize Results
First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
@@ -189,6 +190,7 @@ git clone https://huggingface.co/spaces/OpenHands/evaluation
```
**(optional) setup streamlit environment with conda**:
```bash
cd evaluation
conda create -n streamlit python=3.10

View File

@@ -1,8 +1,12 @@
#!/usr/bin/env python3
import argparse
import glob
import json
import os
from collections import Counter
import pandas as pd
from openhands.events.serialization import event_from_dict
from openhands.events.utils import get_pairs_from_events
@@ -10,25 +14,21 @@ ERROR_KEYWORDS = [
'Agent encountered an error while processing the last action',
'APIError',
'Action execution failed',
'litellm.Timeout: APITimeoutError',
]
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('output_file', type=str, help='The file to summarize')
args = parser.parse_args()
with open(args.output_file, 'r') as file:
def process_file(file_path):
with open(file_path, 'r') as file:
lines = file.readlines()
num_lines = len(lines)
num_error_lines = 0
num_agent_stuck_in_loop = 0
num_resolved = 0
num_empty_patch = 0
num_unfinished_runs = 0
error_counter = Counter()
main_agent_cost = []
editor_cost = []
num_turns = []
@@ -36,6 +36,11 @@ if __name__ == '__main__':
for line in lines:
_d = json.loads(line)
if 'metrics' not in _d or _d['metrics'] is None:
# this is a failed run
num_unfinished_runs += 1
continue
# Cost
costs = _d['metrics'].get('costs', [])
_cur_main_agent_cost = 0
@@ -89,30 +94,186 @@ if __name__ == '__main__':
num_error_lines += 1
break
# print the error counter (with percentage)
print(
f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
)
print(
f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
)
print(
f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
)
print(
f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
)
assert len(num_turns) == num_lines
assert len(main_agent_cost) == num_lines
assert len(editor_cost) == num_lines
print('## Statistics')
print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
print(
f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
return {
'file_path': file_path,
'total_instances': num_lines,
'resolved': {
'count': num_resolved,
'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
},
'empty_patches': {
'count': num_empty_patch,
'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
},
'unfinished_runs': {
'count': num_unfinished_runs,
'percentage': (num_unfinished_runs / num_lines * 100)
if num_lines > 0
else 0,
},
'errors': {
'total': num_error_lines,
'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
'stuck_in_loop': {
'count': num_agent_stuck_in_loop,
'percentage': (num_agent_stuck_in_loop / num_lines * 100)
if num_lines > 0
else 0,
},
'breakdown': {
str(error): {
'count': count,
'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
}
for error, count in error_counter.items()
},
},
'costs': {
'main_agent': sum(main_agent_cost),
'editor': sum(editor_cost),
'total': sum(main_agent_cost) + sum(editor_cost),
},
'statistics': {
'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
'costs': {
'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
if num_lines > 0
else 0,
},
},
}
def aggregate_directory(input_path) -> pd.DataFrame:
# Process all output.jsonl files in subdirectories
pattern = os.path.join(input_path, '**/output.jsonl')
files = glob.glob(pattern, recursive=True)
print(f'Processing {len(files)} files from directory {input_path}')
# Process each file silently and collect results
results = []
for file_path in files:
try:
result = process_file(file_path)
results.append(result)
except Exception as e:
print(f'Error processing {file_path}: {str(e)}')
import traceback
traceback.print_exc()
continue
# Convert results to pandas DataFrame and sort by resolve rate
df = pd.DataFrame(results)
# Extract directory name from file path
df['directory'] = df['file_path'].apply(
lambda x: os.path.basename(os.path.dirname(x))
)
print('## Detailed error breakdown:')
for error, count in error_counter.items():
print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])
df = df.sort_values('resolve_rate', ascending=False)
return df
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'input_path', type=str, help='The file or directory to summarize'
)
parser.add_argument(
'--output',
type=str,
help='Output JSONL file for results',
default='summary_results.jsonl',
)
args = parser.parse_args()
if os.path.isdir(args.input_path):
df = aggregate_directory(args.input_path)
# Create the summary string
columns = [
'directory',
'resolve_rate',
'empty_patch_rate',
'unfinished_rate',
'error_rate',
'avg_turns',
'avg_cost',
'total_instances',
]
summary_str = df[columns].to_string(
float_format=lambda x: '{:.2f}'.format(x),
formatters={
'directory': lambda x: x[:90]
}, # Truncate directory names to 20 chars
index=False,
)
# Print to console
print('\nResults summary (sorted by resolve rate):')
print(summary_str)
# Save to text file
txt_output = args.output.rsplit('.', 1)[0] + '.txt'
with open(txt_output, 'w') as f:
f.write('Results summary (sorted by resolve rate):\n')
f.write(summary_str)
# Save
df.to_json(args.output, lines=True, orient='records')
df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
else:
# Process single file with detailed output
results = []
try:
result = process_file(args.input_path)
results.append(result)
# Print detailed results for single file
print(f'\nResults for {args.input_path}:')
print(
f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)"
)
print(
f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
)
print(
f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
)
print(
f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
)
print(
f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
)
print(f"Total cost: {result['costs']['total']:.2f} USD")
print('## Statistics')
print(
f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
)
print(
f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
)
print(
f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
)
print(
f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
)
print('## Detailed error breakdown:')
for error, data in result['errors']['breakdown'].items():
print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")
except Exception as e:
print(f'Error processing {args.input_path}: {str(e)}')

View File

@@ -55,10 +55,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "SPLIT: $SPLIT"
@@ -68,7 +68,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
export USE_HINT_TEXT=false
fi
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"
# if not using Hint, add -no-hint to the eval note
if [ "$USE_HINT_TEXT" = false ]; then
EVAL_NOTE="$EVAL_NOTE-no-hint"

View File

@@ -4,7 +4,7 @@ This folder contains an evaluation harness we built on top of the original [Tool
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Run Inference on ToolQA Instances

View File

@@ -38,10 +38,10 @@ if [ -z "$WOLFRAM_APPID" ]; then
echo "WOLFRAM_APPID not specified"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "HARDNESS: $HARDNESS"
@@ -56,7 +56,7 @@ COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \
--wolfram_alpha_appid $WOLFRAM_APPID\
--data-split validation \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"

View File

@@ -4,7 +4,7 @@ This folder contains evaluation for [WebArena](https://github.com/web-arena-x/we
## Setup Environment and LLM Configuration
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
## Setup WebArena Environment

View File

@@ -27,13 +27,13 @@ if [ -z "$AGENT" ]; then
AGENT="BrowsingAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"
COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
--agent-cls $AGENT \

View File

@@ -48,13 +48,19 @@ def get_config(
# use default base_container_image
enable_auto_lint=True,
use_host_network=False,
timeout=100,
timeout=300,
# Add platform to the sandbox config to solve issue 4401
platform='linux/amd64',
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_init_timeout=3600,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
# debug
debug=True,
)
config.set_llm_config(
update_llm_config_for_completions_logging(
@@ -107,31 +113,37 @@ def process_instance(
# =============================================
# create sandbox and run the agent
# =============================================
runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
try:
test_class.initialize_runtime(runtime)
test_class.initialize_runtime(runtime)
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=MessageAction(content=instruction),
runtime=runtime,
fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=MessageAction(content=instruction),
runtime=runtime,
fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
)
)
)
if state is None:
raise ValueError('State should not be None.')
if state is None:
raise ValueError('State should not be None.')
# # =============================================
# # result evaluation
# # =============================================
# # =============================================
# # result evaluation
# # =============================================
histories = [event_to_dict(event) for event in state.history]
test_result: TestResult = test_class.verify_result(runtime, histories)
metrics = state.metrics.get() if state.metrics else None
histories = state.history
# some basic check
logger.info(f'Total events in history: {len(histories)}')
assert len(histories) > 0, 'History should not be empty'
test_result: TestResult = test_class.verify_result(runtime, histories)
metrics = state.metrics.get() if state.metrics else None
finally:
runtime.close()
# Save the output
output = EvalOutput(
@@ -139,7 +151,7 @@ def process_instance(
instance=instance.to_dict(),
instruction=instruction,
metadata=metadata,
history=histories,
history=[event_to_dict(event) for event in histories],
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result=test_result.model_dump(),
@@ -206,6 +218,8 @@ if __name__ == '__main__':
)
df = pd.read_json(output_file, lines=True, orient='records')
# record success and reason for failure for the final report
df['success'] = df['test_result'].apply(lambda x: x['success'])
df['reason'] = df['test_result'].apply(lambda x: x['reason'])
logger.info('-' * 100)
@@ -219,9 +233,16 @@ if __name__ == '__main__':
)
logger.info('-' * 100)
# record cost for each instance, with 3 decimal places
df['cost'] = df['metrics'].apply(lambda x: round(x['accumulated_cost'], 3))
logger.info(f'Total cost: USD {df["cost"].sum():.2f}')
report_file = os.path.join(metadata.eval_output_dir, 'report.md')
with open(report_file, 'w') as f:
f.write(
f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n'
)
f.write(df[['instance_id', 'success', 'reason']].to_markdown(index=False))
f.write(f'\nTotal cost: USD {df["cost"].sum():.2f}\n')
f.write(
df[['instance_id', 'success', 'reason', 'cost']].to_markdown(index=False)
)

View File

@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE=$AGENT_VERSION
EVAL_NOTE=$OPENHANDS_VERSION
# Default to NOT use unit tests.
if [ -z "$USE_UNIT_TESTS" ]; then

View File

@@ -108,6 +108,8 @@ class Test(BaseIntegrationTest):
@classmethod
def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
from openhands.core.logger import openhands_logger as logger
# check if the "The answer is OpenHands is all you need!" is in any message
message_actions = [
event
@@ -116,19 +118,29 @@ class Test(BaseIntegrationTest):
event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
)
]
for event in message_actions:
if isinstance(event, AgentDelegateObservation):
content = event.content
elif isinstance(event, AgentFinishAction):
content = event.outputs.get('content', '')
elif isinstance(event, MessageAction):
content = event.content
else:
raise ValueError(f'Unknown event type: {type(event)}')
logger.debug(f'Total message-like events: {len(message_actions)}')
if 'OpenHands is all you need!' in content:
return TestResult(success=True)
for event in message_actions:
try:
if isinstance(event, AgentDelegateObservation):
content = event.content
elif isinstance(event, AgentFinishAction):
content = event.outputs.get('content', '')
elif isinstance(event, MessageAction):
content = event.content
else:
logger.warning(f'Unexpected event type: {type(event)}')
continue
if 'OpenHands is all you need!' in content:
return TestResult(success=True)
except Exception as e:
logger.error(f'Error processing event: {e}')
logger.debug(
f'Total messages: {len(message_actions)}. Messages: {message_actions}'
)
return TestResult(
success=False,
reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.',
)

View File

@@ -14,7 +14,9 @@ class Test(BaseIntegrationTest):
@classmethod
def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
# check if the "The answer is OpenHands is all you need!" is in any message
from openhands.core.logger import openhands_logger as logger
# check if the license information is in any message
message_actions = [
event
for event in histories
@@ -22,23 +24,35 @@ class Test(BaseIntegrationTest):
event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
)
]
for event in message_actions:
if isinstance(event, AgentDelegateObservation):
content = event.content
elif isinstance(event, AgentFinishAction):
content = event.outputs.get('content', '')
elif isinstance(event, MessageAction):
content = event.content
else:
raise ValueError(f'Unknown event type: {type(event)}')
logger.info(f'Total message-like events: {len(message_actions)}')
if (
'non-commercial' in content
or 'MIT' in content
or 'Apache 2.0' in content
):
return TestResult(success=True)
for event in message_actions:
try:
if isinstance(event, AgentDelegateObservation):
content = event.content
elif isinstance(event, AgentFinishAction):
content = event.outputs.get('content', '')
if event.thought:
content += f'\n\n{event.thought}'
elif isinstance(event, MessageAction):
content = event.content
else:
logger.warning(f'Unexpected event type: {type(event)}')
continue
if (
'non-commercial' in content
or 'MIT' in content
or 'Apache 2.0' in content
):
return TestResult(success=True)
except Exception as e:
logger.error(f'Error processing event: {e}')
logger.debug(
f'Total messages: {len(message_actions)}. Messages: {message_actions}'
)
return TestResult(
success=False,
reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.',
)

View File

@@ -39,8 +39,8 @@ checkout_original_branch() {
git checkout $current_branch
}
get_agent_version() {
get_openhands_version() {
# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
# We need to track the version of Agent in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
OPENHANDS_VERSION=v$(poetry run python -c "from openhands import get_version; print(get_version())")
}

1
frontend/.gitignore vendored
View File

@@ -7,3 +7,4 @@ node_modules/
/playwright-report/
/blob-report/
/playwright/.cache/
.react-router/

View File

@@ -9,6 +9,7 @@ This is the frontend of the OpenHands project. It is a React application that pr
- Remix SPA Mode (React + Vite + React Router)
- TypeScript
- Redux
- TanStack Query
- Tailwind CSS
- i18next
- React Testing Library
@@ -85,7 +86,7 @@ frontend
├── src
│ ├── api # API calls
│ ├── assets
│ ├── components # Reusable components
│ ├── components
│ ├── context # Local state management
│ ├── hooks # Custom hooks
│ ├── i18n # Internationalization
@@ -99,6 +100,18 @@ frontend
└── .env.sample # Sample environment variables
```
#### Components
Components are organized into folders based on their **domain**, **feature**, or **shared functionality**.
```sh
components
├── features # Domain-specific components
├── layout
├── modals
└── ui # Shared UI components
```
### Features
- Real-time updates with WebSockets

View File

@@ -1,7 +1,8 @@
import { screen } from "@testing-library/react";
import { describe, it, expect } from "vitest";
import { renderWithProviders } from "../../test-utils";
import BrowserPanel from "#/components/browser";
import { BrowserPanel } from "#/components/features/browser/browser";
describe("Browser", () => {
it("renders a message if no screenshotSrc is provided", () => {

View File

@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { describe, it, expect, test } from "vitest";
import { ChatMessage } from "#/components/chat-message";
import { ChatMessage } from "#/components/features/chat/chat-message";
describe("ChatMessage", () => {
it("should render a user message", () => {
@@ -70,4 +70,12 @@ describe("ChatMessage", () => {
);
expect(screen.getByTestId("custom-component")).toBeInTheDocument();
});
it("should apply correct styles to inline code", () => {
render(<ChatMessage type="assistant" message="Here is some `inline code` text" />);
const codeElement = screen.getByText("inline code");
expect(codeElement.tagName.toLowerCase()).toBe("code");
expect(codeElement.closest("article")).not.toBeNull();
});
});

View File

@@ -1,7 +1,7 @@
import userEvent from "@testing-library/user-event";
import { fireEvent, render, screen } from "@testing-library/react";
import { describe, afterEach, vi, it, expect } from "vitest";
import { ChatInput } from "#/components/chat-input";
import { ChatInput } from "#/components/features/chat/chat-input";
describe("ChatInput", () => {
const onSubmitMock = vi.fn();

View File

@@ -2,13 +2,14 @@ import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
import { act, screen, waitFor, within } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { renderWithProviders } from "test-utils";
import { ChatInterface } from "#/components/chat-interface";
import { addUserMessage } from "#/state/chat-slice";
import { SUGGESTIONS } from "#/utils/suggestions";
import * as ChatSlice from "#/state/chat-slice";
import { WsClientProviderStatus } from "#/context/ws-client-provider";
import { ChatInterface } from "#/components/features/chat/chat-interface";
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const renderChatInterface = (messages: (Message | ErrorMessage)[]) =>
const renderChatInterface = (messages: (Message)[]) =>
renderWithProviders(<ChatInterface />);
describe("Empty state", () => {
@@ -17,12 +18,16 @@ describe("Empty state", () => {
}));
const { useWsClient: useWsClientMock } = vi.hoisted(() => ({
useWsClient: vi.fn(() => ({ send: sendMock, runtimeActive: true })),
useWsClient: vi.fn(() => ({
send: sendMock,
status: WsClientProviderStatus.ACTIVE,
isLoadingMessages: false,
})),
}));
beforeAll(() => {
vi.mock("@remix-run/react", async (importActual) => ({
...(await importActual<typeof import("@remix-run/react")>()),
vi.mock("react-router", async (importActual) => ({
...(await importActual<typeof import("react-router")>()),
useRouteLoaderData: vi.fn(() => ({})),
}));
@@ -51,6 +56,7 @@ describe("Empty state", () => {
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
}),
);
});
@@ -84,7 +90,8 @@ describe("Empty state", () => {
// this is to test that the message is in the UI before the socket is called
useWsClientMock.mockImplementation(() => ({
send: sendMock,
runtimeActive: false, // mock an inactive runtime setup
status: WsClientProviderStatus.ACTIVE,
isLoadingMessages: false,
}));
const addUserMessageSpy = vi.spyOn(ChatSlice, "addUserMessage");
const user = userEvent.setup();
@@ -113,7 +120,8 @@ describe("Empty state", () => {
async () => {
useWsClientMock.mockImplementation(() => ({
send: sendMock,
runtimeActive: false, // mock an inactive runtime setup
status: WsClientProviderStatus.ACTIVE,
isLoadingMessages: false,
}));
const user = userEvent.setup();
const { rerender } = renderWithProviders(<ChatInterface />, {
@@ -130,7 +138,8 @@ describe("Empty state", () => {
useWsClientMock.mockImplementation(() => ({
send: sendMock,
runtimeActive: true, // mock an active runtime setup
status: WsClientProviderStatus.ACTIVE,
isLoadingMessages: false,
}));
rerender(<ChatInterface />);
@@ -164,12 +173,14 @@ describe.skip("ChatInterface", () => {
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
{
sender: "assistant",
content: "Hi",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
];
renderChatInterface(messages);
@@ -203,6 +214,7 @@ describe.skip("ChatInterface", () => {
content: "Here are some images",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
];
const { rerender } = renderChatInterface(messages);
@@ -215,6 +227,7 @@ describe.skip("ChatInterface", () => {
content: "Here are some images",
imageUrls: ["image1", "image2"],
timestamp: new Date().toISOString(),
pending: true,
},
];
@@ -236,12 +249,14 @@ describe.skip("ChatInterface", () => {
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
{
sender: "user",
content: "Hi",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
];
const { rerender } = renderChatInterface(messages);
@@ -254,6 +269,7 @@ describe.skip("ChatInterface", () => {
content: "How can I help you?",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
});
rerender(<ChatInterface />);
@@ -262,17 +278,19 @@ describe.skip("ChatInterface", () => {
});
it("should render inline errors", () => {
const messages: (Message | ErrorMessage)[] = [
const messages: (Message)[] = [
{
sender: "assistant",
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
{
error: true,
id: "",
message: "Something went wrong",
type: "error",
content: "Something went wrong",
sender: "assistant",
timestamp: new Date().toISOString(),
},
];
renderChatInterface(messages);
@@ -281,6 +299,70 @@ describe.skip("ChatInterface", () => {
expect(within(error).getByText("Something went wrong")).toBeInTheDocument();
});
it("should render both GitHub buttons initially when ghToken is available", () => {
vi.mock("react-router", async (importActual) => ({
...(await importActual<typeof import("react-router")>()),
useRouteLoaderData: vi.fn(() => ({ ghToken: "test-token" })),
}));
const messages: Message[] = [
{
sender: "assistant",
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
];
renderChatInterface(messages);
const pushButton = screen.getByRole("button", { name: "Push to Branch" });
const prButton = screen.getByRole("button", { name: "Push & Create PR" });
expect(pushButton).toBeInTheDocument();
expect(prButton).toBeInTheDocument();
expect(pushButton).toHaveTextContent("Push to Branch");
expect(prButton).toHaveTextContent("Push & Create PR");
});
it("should render only 'Push changes to PR' button after PR is created", async () => {
vi.mock("react-router", async (importActual) => ({
...(await importActual<typeof import("react-router")>()),
useRouteLoaderData: vi.fn(() => ({ ghToken: "test-token" })),
}));
const messages: Message[] = [
{
sender: "assistant",
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
];
const { rerender } = renderChatInterface(messages);
const user = userEvent.setup();
// Click the "Push & Create PR" button
const prButton = screen.getByRole("button", { name: "Push & Create PR" });
await user.click(prButton);
// Re-render to trigger state update
rerender(<ChatInterface />);
// Verify only one button is shown
const pushToPrButton = screen.getByRole("button", {
name: "Push changes to PR",
});
expect(pushToPrButton).toBeInTheDocument();
expect(
screen.queryByRole("button", { name: "Push to Branch" }),
).not.toBeInTheDocument();
expect(
screen.queryByRole("button", { name: "Push & Create PR" }),
).not.toBeInTheDocument();
});
it("should render feedback actions if there are more than 3 messages", () => {
const messages: Message[] = [
{
@@ -288,18 +370,21 @@ describe.skip("ChatInterface", () => {
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
{
sender: "user",
content: "Hi",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
{
sender: "assistant",
content: "How can I help you?",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
];
const { rerender } = renderChatInterface(messages);
@@ -310,6 +395,7 @@ describe.skip("ChatInterface", () => {
content: "I need help",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
});
rerender(<ChatInterface />);

View File

@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, test, vi } from "vitest";
import { AccountSettingsContextMenu } from "#/components/context-menu/account-settings-context-menu";
import { AccountSettingsContextMenu } from "#/components/features/context-menu/account-settings-context-menu";
describe("AccountSettingsContextMenu", () => {
const user = userEvent.setup();

View File

@@ -1,7 +1,7 @@
import { describe, it, expect, vi } from "vitest";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { ContextMenuListItem } from "#/components/context-menu/context-menu-list-item";
import { ContextMenuListItem } from "#/components/features/context-menu/context-menu-list-item";
describe("ContextMenuListItem", () => {
it("should render the component with the children", () => {

View File

@@ -0,0 +1,45 @@
import { render, screen } from "@testing-library/react";
import { it, describe, expect, vi } from "vitest";
import userEvent from "@testing-library/user-event";
import { WaitlistModal } from "#/components/features/waitlist/waitlist-modal";
import * as CaptureConsent from "#/utils/handle-capture-consent";
describe("WaitlistModal", () => {
it("should render a tos checkbox that is unchecked by default", () => {
render(<WaitlistModal ghToken={null} githubAuthUrl={null} />);
const checkbox = screen.getByRole("checkbox");
expect(checkbox).not.toBeChecked();
});
it("should only enable the GitHub button if the tos checkbox is checked", async () => {
const user = userEvent.setup();
render(<WaitlistModal ghToken={null} githubAuthUrl={null} />);
const checkbox = screen.getByRole("checkbox");
const button = screen.getByRole("button", { name: "Connect to GitHub" });
expect(button).toBeDisabled();
await user.click(checkbox);
expect(button).not.toBeDisabled();
});
it("should set user analytics consent to true when the user checks the tos checkbox", async () => {
const handleCaptureConsentSpy = vi.spyOn(
CaptureConsent,
"handleCaptureConsent",
);
const user = userEvent.setup();
render(<WaitlistModal ghToken={null} githubAuthUrl="mock-url" />);
const checkbox = screen.getByRole("checkbox");
await user.click(checkbox);
const button = screen.getByRole("button", { name: "Connect to GitHub" });
await user.click(button);
expect(handleCaptureConsentSpy).toHaveBeenCalledWith(true);
});
});

View File

@@ -1,7 +1,7 @@
import { render, screen, within } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { FeedbackActions } from "#/components/feedback-actions";
import { FeedbackActions } from "#/components/features/feedback/feedback-actions";
describe("FeedbackActions", () => {
const user = userEvent.setup();

View File

@@ -1,8 +1,8 @@
import { render, screen } from "@testing-library/react";
import { screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { renderWithProviders } from "test-utils";
import { FeedbackForm } from "#/components/feedback-form";
import { FeedbackForm } from "#/components/features/feedback/feedback-form";
describe("FeedbackForm", () => {
const user = userEvent.setup();

View File

@@ -1,7 +1,7 @@
import { screen } from "@testing-library/react";
import { renderWithProviders } from "test-utils";
import { describe, afterEach, vi, it, expect } from "vitest";
import ExplorerTree from "#/components/file-explorer/explorer-tree";
import { ExplorerTree } from "#/components/features/file-explorer/explorer-tree";
const FILES = ["file-1-1.ts", "folder-1-2"];

View File

@@ -4,8 +4,8 @@ import { renderWithProviders } from "test-utils";
import { describe, it, expect, vi, Mock, afterEach } from "vitest";
import toast from "#/utils/toast";
import AgentState from "#/types/agent-state";
import FileExplorer from "#/components/file-explorer/file-explorer";
import OpenHands from "#/api/open-hands";
import { FileExplorer } from "#/components/features/file-explorer/file-explorer";
const toastSpy = vi.spyOn(toast, "error");
const uploadFilesSpy = vi.spyOn(OpenHands, "uploadFiles");

View File

@@ -2,7 +2,7 @@ import { screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { renderWithProviders } from "test-utils";
import { vi, describe, afterEach, it, expect } from "vitest";
import TreeNode from "#/components/file-explorer/tree-node";
import TreeNode from "#/components/features/file-explorer/tree-node";
import OpenHands from "#/api/open-hands";
const getFileSpy = vi.spyOn(OpenHands, "getFile");

View File

@@ -1,7 +1,7 @@
import { ImagePreview } from "#/components/features/images/image-preview";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { describe, expect, it, vi } from "vitest";
import { ImagePreview } from "#/components/image-preview";
describe("ImagePreview", () => {
it("should render an image", () => {

View File

@@ -1,7 +1,7 @@
import { render, screen, within } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
import { InteractiveChatBox } from "#/components/interactive-chat-box";
import { InteractiveChatBox } from "#/components/features/chat/interactive-chat-box";
describe("InteractiveChatBox", () => {
const onSubmitMock = vi.fn();

View File

@@ -1,7 +1,7 @@
import { render, screen, act } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { describe, it, vi, expect } from "vitest";
import BaseModal from "#/components/modals/base-modal/base-modal";
import { BaseModal } from "#/components/shared/modals/base-modal/base-modal";
describe("BaseModal", () => {
it("should render if the modal is open", () => {

View File

@@ -1,7 +1,7 @@
import { describe, it, expect } from "vitest";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { ModelSelector } from "#/components/modals/settings/model-selector";
import { ModelSelector } from "#/components/shared/modals/settings/model-selector";
describe("ModelSelector", () => {
const models = {

View File

@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { SuggestionItem } from "#/components/suggestion-item";
import { SuggestionItem } from "#/components/features/suggestions/suggestion-item";
describe("SuggestionItem", () => {
const suggestionItem = { label: "suggestion1", value: "a long text value" };

View File

@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { Suggestions } from "#/components/suggestions";
import { Suggestions } from "#/components/features/suggestions/suggestions";
describe("Suggestions", () => {
const firstSuggestion = {

View File

@@ -2,7 +2,7 @@ import { act, screen } from "@testing-library/react";
import { renderWithProviders } from "test-utils";
import { vi, describe, afterEach, it, expect } from "vitest";
import { Command, appendInput, appendOutput } from "#/state/command-slice";
import Terminal from "#/components/terminal/terminal";
import Terminal from "#/components/features/terminal/terminal";
global.ResizeObserver = vi.fn().mockImplementation(() => ({
observe: vi.fn(),

View File

@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { UploadImageInput } from "#/components/upload-image-input";
import { UploadImageInput } from "#/components/features/images/upload-image-input";
describe("UploadImageInput", () => {
const user = userEvent.setup();

View File

@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import { describe, expect, it, test, vi, afterEach } from "vitest";
import userEvent from "@testing-library/user-event";
import { UserActions } from "#/components/user-actions";
import { UserActions } from "#/components/features/sidebar/user-actions";
describe("UserActions", () => {
const user = userEvent.setup();

View File

@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { UserAvatar } from "#/components/user-avatar";
import { UserAvatar } from "#/components/features/sidebar/user-avatar";
describe("UserAvatar", () => {
const onClickMock = vi.fn();

Some files were not shown because too many files have changed in this diff Show More