Compare commits

..

3 Commits

Author SHA1 Message Date
Xingyao Wang f5d86e8132 Merge branch 'main' into fix-cli-command-interruption 2025-08-22 09:26:13 -04:00
openhands d615fe26c0 cli: refine Ctrl+C behavior and async safety
- Double-press Ctrl+C within 400ms to send interrupt to running command
- Single Ctrl+C pauses the agent (legacy behavior), whether or not a command is running
- Honor CLI config in dialogs and avoid blocking event loop via to_thread
- Debounce interrupts with asyncio.Lock to prevent races
- Use bounded reverse search on EventStream with EventFilter; rely on exit_code
- Pass config through start_pause_listener; remove ad-hoc OpenHandsConfig()
- Update help text for clarity

Co-authored-by: openhands <openhands@all-hands.dev>
2025-08-21 19:54:57 +00:00
openhands 01f28f6269 Fix issue #10434: Add command interruption support to CLI
- Enhanced Ctrl+C behavior to detect running commands and provide user options
- Added is_command_running() function to analyze event stream for active commands
- Modified process_agent_pause() to handle command interruption vs agent pause
- Added _handle_command_interrupt() with user confirmation dialog offering:
  * Kill running command (send Ctrl+C to command)
  * Continue waiting for command completion
  * Pause the entire agent
- Updated help documentation with new keyboard shortcuts section
- Maintains backward compatibility: Ctrl+C still pauses agent when no command running
- All existing CLI tests pass (237 tests)

Co-authored-by: openhands <openhands@all-hands.dev>
2025-08-16 23:29:04 +00:00
220 changed files with 3109 additions and 8462 deletions
-1
View File
@@ -187,7 +187,6 @@ jobs:
test_settings.py::test_github_token_configuration \
test_conversation.py::test_conversation_start \
test_browsing_catchphrase.py::test_browsing_catchphrase \
test_multi_conversation_resume.py::test_multi_conversation_resume \
-v --no-header --capture=no --timeout=900
- name: Upload test results
+1 -1
View File
@@ -15,7 +15,7 @@ jobs:
stale-issue-message: 'This issue is stale because it has been open for 40 days with no activity. Remove the stale label or leave a comment, otherwise it will be closed in 10 days.'
stale-pr-message: 'This PR is stale because it has been open for 40 days with no activity. Remove the stale label or leave a comment, otherwise it will be closed in 10 days.'
days-before-stale: 40
exempt-issue-labels: roadmap,backlog
exempt-issue-labels: 'roadmap'
close-issue-message: 'This issue was automatically closed due to 50 days of inactivity. We do this to help keep the issues somewhat manageable and focus on active issues.'
close-pr-message: 'This PR was closed because it had no activity for 50 days. If you feel this was closed in error, and you would like to continue the PR, please resubmit or let us know.'
days-before-close: 10
-2
View File
@@ -257,5 +257,3 @@ containers/runtime/code
# test results
test-results
.sessions
.eval_sessions
+2 -3
View File
@@ -363,11 +363,10 @@ classpath = "my_package.my_module.MyCustomAgent"
#confirmation_mode = false
# The security analyzer to use (For Headless / CLI only - In Web this is overridden by Session Init)
# Available options: 'llm' (default), 'invariant'
#security_analyzer = "llm"
#security_analyzer = ""
# Whether to enable security analyzer
#enable_security_analyzer = true
#enable_security_analyzer = false
#################################### Condenser #################################
# Condensers control how conversation history is managed and compressed when
+13 -13
View File
@@ -58,34 +58,34 @@ RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs
# Default is 60000, but we've seen up to 200000
RUN sed -i 's/^UID_MAX.*/UID_MAX 1000000/' /etc/login.defs
RUN groupadd --gid $OPENHANDS_USER_ID openhands
RUN groupadd --gid $OPENHANDS_USER_ID app
RUN useradd -l -m -u $OPENHANDS_USER_ID --gid $OPENHANDS_USER_ID -s /bin/bash openhands && \
usermod -aG openhands openhands && \
usermod -aG app openhands && \
usermod -aG sudo openhands && \
echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
RUN chown -R openhands:openhands /app && chmod -R 770 /app
RUN sudo chown -R openhands:openhands $WORKSPACE_BASE && sudo chmod -R 770 $WORKSPACE_BASE
RUN chown -R openhands:app /app && chmod -R 770 /app
RUN sudo chown -R openhands:app $WORKSPACE_BASE && sudo chmod -R 770 $WORKSPACE_BASE
USER openhands
ENV VIRTUAL_ENV=/app/.venv \
PATH="/app/.venv/bin:$PATH" \
PYTHONPATH='/app'
COPY --chown=openhands:openhands --chmod=770 --from=backend-builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
COPY --chown=openhands:app --chmod=770 --from=backend-builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
COPY --chown=openhands:openhands --chmod=770 ./microagents ./microagents
COPY --chown=openhands:openhands --chmod=770 ./openhands ./openhands
COPY --chown=openhands:openhands --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
COPY --chown=openhands:openhands pyproject.toml poetry.lock README.md MANIFEST.in LICENSE ./
COPY --chown=openhands:app --chmod=770 ./microagents ./microagents
COPY --chown=openhands:app --chmod=770 ./openhands ./openhands
COPY --chown=openhands:app --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
COPY --chown=openhands:app pyproject.toml poetry.lock README.md MANIFEST.in LICENSE ./
# This is run as "openhands" user, and will create __pycache__ with openhands:openhands ownership
RUN python openhands/core/download.py # No-op to download assets
# Add this line to set group ownership of all files/directories not already in "app" group
# openhands:openhands -> openhands:openhands
RUN find /app \! -group openhands -exec chgrp openhands {} +
# openhands:openhands -> openhands:app
RUN find /app \! -group app -exec chgrp app {} +
COPY --chown=openhands:openhands --chmod=770 --from=frontend-builder /app/build ./frontend/build
COPY --chown=openhands:openhands --chmod=770 ./containers/app/entrypoint.sh /app/entrypoint.sh
COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/build ./frontend/build
COPY --chown=openhands:app --chmod=770 ./containers/app/entrypoint.sh /app/entrypoint.sh
USER root
+1 -1
View File
@@ -54,7 +54,7 @@ else
fi
fi
fi
usermod -aG openhands enduser
usermod -aG app enduser
# get the user group of /var/run/docker.sock and set openhands to that group
DOCKER_SOCKET_GID=$(stat -c '%g' /var/run/docker.sock)
echo "Docker socket group id: $DOCKER_SOCKET_GID"
@@ -1,5 +1,5 @@
---
title: Jira Data Center Integration (Coming soon...)
title: Jira Data Center Integration (Beta)
description: Complete guide for setting up Jira Data Center integration with OpenHands Cloud, including service account creation, personal access token generation, webhook configuration, and workspace integration setup.
---
@@ -1,5 +1,5 @@
---
title: Jira Cloud Integration (Coming soon...)
title: Jira Cloud Integration
description: Complete guide for setting up Jira Cloud integration with OpenHands Cloud, including service account creation, API token generation, webhook configuration, and workspace integration setup.
---
@@ -1,5 +1,5 @@
---
title: Linear Integration (Coming soon...)
title: Linear Integration
description: Complete guide for setting up Linear integration with OpenHands Cloud, including service account creation, API key generation, webhook configuration, and workspace integration setup.
---
@@ -1,5 +1,5 @@
---
title: Project Management Tool Integrations (Coming soon...)
title: Project Management Tool Integrations
description: Overview of OpenHands Cloud integrations with project management platforms including Jira Cloud, Jira Data Center, and Linear. Learn about setup requirements, usage methods, and troubleshooting.
---
@@ -18,9 +18,9 @@ Integration requires two levels of setup:
2. **Workspace Integration** - Self-service configuration through the OpenHands Cloud UI to link your OpenHands account to the target workspace
### Platform-Specific Setup Guides:
- [Jira Cloud Integration (Coming soon...)](./jira-integration.md)
- [Jira Data Center Integration (Coming soon...)](./jira-dc-integration.md)
- [Linear Integration (Coming soon...)](./linear-integration.md)
- [Jira Cloud Integration](./jira-integration.md)
- [Jira Data Center Integration](./jira-dc-integration.md)
- [Linear Integration](./linear-integration.md)
## Usage
-52
View File
@@ -1,52 +0,0 @@
# Confirmation Mode and Security Analyzers
OpenHands provides a security framework to help protect users from potentially risky actions through **Confirmation Mode** and **Security Analyzers**. This system analyzes agent actions and prompts users for confirmation when high-risk operations are detected.
## Overview
The security system consists of two main components:
1. **Confirmation Mode**: When enabled, the agent will pause and ask for user confirmation before executing actions that are flagged as high-risk by the security analyzer.
2. **Security Analyzers**: These are modules that evaluate the risk level of agent actions and determine whether user confirmation is required.
## Configuration
### CLI
In CLI mode, confirmation is enabled by default. You will have the option to use the LLM Analyzer, which automatically confirms LOW and MEDIUM risk actions and only prompts for HIGH risk actions.
## Security Analyzers
OpenHands includes multiple analyzers:
- **No Analyzer**: Do not use any security analyzer. The agent will prompt you to confirm *EVERY* action.
- **LLM Risk Analyzer** (default): Uses the same LLM as the agent to assess action risk levels
- **Invariant Analyzer**: Uses Invariant Labs' policy engine to evaluate action traces against security policies
### LLM Risk Analyzer
The default analyzer that leverages the agent's LLM to evaluate the security risk of each action. It considers the action type, parameters, and context to assign risk levels.
### Invariant Analyzer
An advanced analyzer that:
- Collects conversation events and parses them into a trace
- Checks the trace against an Invariant policy to classify risk (low, medium, high)
- Manages an Invariant server container automatically if needed
- Supports optional browsing-alignment and harmful-content checks
## How It Works
1. **Action Analysis**: When the agent wants to perform an action, the selected security analyzer evaluates its risk level.
2. **Risk Assessment**: The analyzer returns one of three risk levels:
- **LOW**: Action proceeds without confirmation
- **MEDIUM**: Action proceeds without confirmation (may be configurable in the future)
- **HIGH**: Action is paused, and user confirmation is requested
3. **User Confirmation**: For high-risk actions, a confirmation dialog appears with:
- Description of the action
- Risk assessment explanation
- Options to approve or deny the action
4. **Action Execution**: Based on user response:
- **Approve**: Action proceeds as planned
- **Deny**: Action is cancelled
+7 -1
View File
@@ -87,13 +87,19 @@ source ~/.bashrc # or source ~/.zshrc
</AccordionGroup>
3. Launch an interactive OpenHands conversation from the command line:
```bash
# If using uvx (recommended)
uvx --python 3.12 --from openhands-ai openhands
```
<Note>
If you have cloned the repository, you can also run the CLI directly using Poetry:
poetry run openhands
</Note>
3. Set your model, API key, and other preferences using the UI (or alternatively environment variables, below).
4. Set your model, API key, and other preferences using the UI (or alternatively environment variables, below).
This command opens an interactive prompt where you can type tasks or commands and get responses from OpenHands.
The first time you run the CLI, it will take you through configuring the required LLM
+1 -8
View File
@@ -45,13 +45,6 @@ A system with a modern processor and a minimum of **4GB RAM** is recommended to
1. [Install WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
2. Run `wsl --version` in PowerShell and confirm `Default Version: 2`.
**Ubuntu (Linux Distribution)**
1. Install Ubuntu: `wsl --install -d Ubuntu` in PowerShell as Administrator.
2. Restart computer when prompted.
3. Open Ubuntu from Start menu to complete setup.
4. Verify installation: `wsl --list` should show Ubuntu.
**Docker Desktop**
1. [Install Docker Desktop on Windows](https://docs.docker.com/desktop/setup/install/windows-install).
@@ -60,7 +53,7 @@ A system with a modern processor and a minimum of **4GB RAM** is recommended to
- Resources > WSL Integration: `Enable integration with my default WSL distro` is enabled.
<Note>
The docker command below to start the app must be run inside the WSL terminal. Use `wsl -d Ubuntu` in PowerShell or search "Ubuntu" in the Start menu to access the Ubuntu terminal.
The docker command below to start the app must be run inside the WSL terminal.
</Note>
**Alternative: Windows without WSL**
+1 -1
View File
@@ -22,7 +22,7 @@ SDK to spawn and control these sandboxes.
You can use the E2B CLI to create a custom sandbox with a Dockerfile. Read the full guide
[here](https://e2b.dev/docs/guide/custom-sandbox). The premade OpenHands sandbox for E2B is set up in the `containers`
directory, and it's called `openhands`.
directory, and it's called `openhands`.
## Debugging
+11 -8
View File
@@ -9,8 +9,8 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -61,15 +61,18 @@ AGENT_CLS_TO_INST_SUFFIX = {
def get_config(
metadata: EvalMetadata,
) -> OpenHandsConfig:
# Create config with EDA-specific container image
config = get_openhands_config_for_eval(
metadata=metadata,
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
# Override the container image for EDA
config.sandbox.base_container_image = 'python:3.12-bookworm'
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
agent_config.enable_prompt_extensions = False
+13 -6
View File
@@ -17,8 +17,8 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -41,12 +41,19 @@ from openhands.utils.async_utils import call_async_from_sync
def get_config(
metadata: EvalMetadata,
) -> OpenHandsConfig:
# Create config with agent_bench-specific container image
config = get_openhands_config_for_eval(metadata=metadata)
# Override the container image for agent_bench
config.sandbox.base_container_image = 'python:3.12-slim'
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-slim'
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
agent_config.enable_prompt_extensions = False
@@ -18,7 +18,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -51,10 +50,15 @@ def get_config(
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.11-bookworm'
config = get_openhands_config_for_eval(
metadata=metadata,
sandbox_config=sandbox_config,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
+8 -4
View File
@@ -16,7 +16,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -62,10 +61,15 @@ def get_config(
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
+8 -4
View File
@@ -19,7 +19,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -76,10 +75,15 @@ def get_config(
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
@@ -12,7 +12,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -41,8 +40,14 @@ def get_config(
)
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = get_openhands_config_for_eval(
metadata=metadata, runtime='docker', sandbox_config=sandbox_config
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
+9 -5
View File
@@ -17,7 +17,6 @@ from evaluation.utils.shared import (
codeact_user_response,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -115,11 +114,16 @@ def get_config(
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = base_container_image
config = get_openhands_config_for_eval(
metadata=metadata,
sandbox_config=sandbox_config,
runtime=os.environ.get('RUNTIME', 'docker'),
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
max_iterations=metadata.max_iterations,
enable_browser=RUN_WITH_BROWSING,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(
update_llm_config_for_completions_logging(
@@ -18,7 +18,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -66,10 +65,15 @@ def get_config(
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
+8 -4
View File
@@ -23,7 +23,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -61,10 +60,15 @@ def get_config(
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'nikolaik/python-nodejs:python3.12-nodejs22'
config = get_openhands_config_for_eval(
metadata=metadata,
sandbox_config=sandbox_config,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
if metadata.agent_config:
+8 -4
View File
@@ -13,7 +13,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -44,10 +43,15 @@ def get_config(
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
+8 -4
View File
@@ -31,7 +31,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -65,10 +64,15 @@ def get_config(
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
@@ -24,7 +24,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -86,10 +85,15 @@ def get_config(
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
@@ -16,7 +16,6 @@ import ruamel.yaml
from evaluation.utils.shared import (
EvalMetadata,
get_default_sandbox_config_for_eval,
get_openhands_config_for_eval,
make_metadata,
)
from openhands.core.config import (
@@ -38,10 +37,15 @@ def get_config(
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
@@ -23,7 +23,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -49,10 +48,15 @@ def get_config(
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
@@ -11,7 +11,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -53,10 +52,15 @@ def get_config(
'$OH_INTERPRETER_PATH -m pip install scitools-pyke'
)
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
+8 -4
View File
@@ -14,7 +14,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -59,10 +58,15 @@ def get_config(
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'xingyaoww/od-eval-miniwob:v1.0'
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(
update_llm_config_for_completions_logging(
+8 -4
View File
@@ -16,7 +16,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -111,10 +110,15 @@ def get_config(
f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}'
)
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
+8 -4
View File
@@ -27,7 +27,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -81,10 +80,15 @@ def get_config(
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'public.ecr.aws/i5g0m1f6/ml-bench'
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
@@ -23,7 +23,6 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
get_default_sandbox_config_for_eval,
get_openhands_config_for_eval,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
@@ -88,9 +87,13 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> OpenHandsConfig:
dataset_name=metadata.dataset,
instance_id=instance['instance_id'],
)
config = get_openhands_config_for_eval(
config = OpenHandsConfig(
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox_config=sandbox_config,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
return config
@@ -21,7 +21,6 @@ from evaluation.utils.shared import (
codeact_user_response,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
is_fatal_evaluation_error,
make_metadata,
prepare_dataset,
@@ -342,11 +341,16 @@ def get_config(
instance_id=instance['instance_id'],
)
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
max_iterations=metadata.max_iterations,
enable_browser=RUN_WITH_BROWSING,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox_config=sandbox_config,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(
update_llm_config_for_completions_logging(
@@ -31,7 +31,6 @@ from evaluation.utils.shared import (
codeact_user_response,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
is_fatal_evaluation_error,
make_metadata,
prepare_dataset,
@@ -175,10 +174,15 @@ def get_config(
instance_id=instance['instance_id'],
)
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
max_iterations=metadata.max_iterations,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox_config=sandbox_config,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(
@@ -13,7 +13,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -65,10 +64,16 @@ def get_config(
sandbox_config.base_container_image = (
'docker.io/xingyaoww/openhands-eval-scienceagentbench'
)
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox_config=sandbox_config,
max_budget_per_task=4,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(
update_llm_config_for_completions_logging(
@@ -19,7 +19,6 @@ from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
get_default_sandbox_config_for_eval,
get_openhands_config_for_eval,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
@@ -84,9 +83,13 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> OpenHandsConfig:
dataset_name=metadata.dataset,
instance_id=instance['instance_id'],
)
config = get_openhands_config_for_eval(
config = OpenHandsConfig(
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox_config=sandbox_config,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
return config
+8 -4
View File
@@ -32,7 +32,6 @@ from evaluation.utils.shared import (
codeact_user_response,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
is_fatal_evaluation_error,
make_metadata,
prepare_dataset,
@@ -228,11 +227,16 @@ def get_config(
instance_id=instance['instance_id'],
)
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
max_iterations=metadata.max_iterations,
enable_browser=RUN_WITH_BROWSING,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox_config=sandbox_config,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(
@@ -20,7 +20,6 @@ from evaluation.utils.shared import (
codeact_user_response,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
is_fatal_evaluation_error,
make_metadata,
prepare_dataset,
@@ -200,11 +199,16 @@ def get_config(
'REPO_PATH': f'/workspace/{workspace_dir_name}/',
}
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
max_iterations=metadata.max_iterations,
enable_browser=RUN_WITH_BROWSING,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox_config=sandbox_config,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(
update_llm_config_for_completions_logging(
@@ -13,7 +13,6 @@ N_RUNS=${4:-1}
export EXP_NAME=$EXP_NAME
# use 2x resources for rollout since some codebases are pretty resource-intensive
export DEFAULT_RUNTIME_RESOURCE_FACTOR=2
export ITERATIVE_EVAL_MODE=false
echo "MODEL: $MODEL"
echo "EXP_NAME: $EXP_NAME"
DATASET="SWE-Gym/SWE-Gym" # change this to the "/SWE-Gym-Lite" if you want to rollout the lite subset
+13 -15
View File
@@ -37,7 +37,6 @@ from evaluation.benchmarks.testgeneval.utils import load_testgeneval_dataset
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
get_openhands_config_for_eval,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
@@ -59,21 +58,20 @@ def get_config(instance: pd.Series) -> OpenHandsConfig:
f'Invalid container image for instance {instance["instance_id_swebench"]}.'
)
logger.info(f'Using instance container image: {base_container_image}.')
# Create custom sandbox config for testgeneval with specific requirements
sandbox_config = SandboxConfig(
base_container_image=base_container_image,
use_host_network=False,
timeout=1800, # Longer timeout than default (300)
api_key=os.environ.get('ALLHANDS_API_KEY'),
remote_runtime_api_url=os.environ.get(
'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
return OpenHandsConfig(
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'eventstream'),
sandbox=SandboxConfig(
base_container_image=base_container_image,
use_host_network=False,
timeout=1800,
api_key=os.environ.get('ALLHANDS_API_KEY'),
remote_runtime_api_url=os.environ.get(
'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
),
),
)
return get_openhands_config_for_eval(
sandbox_config=sandbox_config,
runtime=os.environ.get('RUNTIME', 'docker'), # Different default runtime
workspace_base=None,
workspace_mount_path=None,
)
+22 -20
View File
@@ -25,7 +25,6 @@ from evaluation.utils.shared import (
assert_and_raise,
codeact_user_response,
get_metrics,
get_openhands_config_for_eval,
is_fatal_evaluation_error,
make_metadata,
prepare_dataset,
@@ -127,26 +126,29 @@ def get_config(
f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
)
sandbox_config = SandboxConfig(
base_container_image=base_container_image,
enable_auto_lint=True,
use_host_network=False,
# large enough timeout, since some testcases take very long to run
timeout=300,
# Add platform to the sandbox config to solve issue 4401
platform='linux/amd64',
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get(
'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
max_iterations=metadata.max_iterations,
runtime=os.environ.get('RUNTIME', 'eventstream'),
sandbox=SandboxConfig(
base_container_image=base_container_image,
enable_auto_lint=True,
use_host_network=False,
# large enough timeout, since some testcases take very long to run
timeout=300,
# Add platform to the sandbox config to solve issue 4401
platform='linux/amd64',
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get(
'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
),
keep_runtime_alive=False,
remote_runtime_init_timeout=3600,
),
keep_runtime_alive=False,
remote_runtime_init_timeout=3600,
)
config = get_openhands_config_for_eval(
metadata=metadata,
sandbox_config=sandbox_config,
runtime=os.environ.get('RUNTIME', 'docker'),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(
update_llm_config_for_completions_logging(
@@ -12,10 +12,7 @@ import tempfile
import yaml
from browsing import pre_login
from evaluation.utils.shared import (
get_default_sandbox_config_for_eval,
get_openhands_config_for_eval,
)
from evaluation.utils.shared import get_default_sandbox_config_for_eval
from openhands.controller.state.state import State
from openhands.core.config import (
LLMConfig,
@@ -45,17 +42,19 @@ def get_config(
sandbox_config.enable_auto_lint = True
# If the web services are running on the host machine, this must be set to True
sandbox_config.use_host_network = True
config = get_openhands_config_for_eval(
config = OpenHandsConfig(
run_as_openhands=False,
max_budget_per_task=4,
max_iterations=100,
save_trajectory_path=os.path.join(
mount_path_on_host, f'traj_{task_short_name}.json'
),
sandbox=sandbox_config,
# we mount trajectories path so that trajectories, generated by OpenHands
# controller, can be accessible to the evaluator file in the runtime container
sandbox_config=sandbox_config,
workspace_mount_path=mount_path_on_host,
workspace_mount_path_in_sandbox='/outputs',
)
config.save_trajectory_path = os.path.join(
mount_path_on_host, f'traj_{task_short_name}.json'
)
config.max_budget_per_task = 4
config.set_llm_config(llm_config)
if agent_config:
config.set_agent_config(agent_config)
+8 -4
View File
@@ -12,7 +12,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -45,10 +44,15 @@ def get_config(
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
@@ -20,7 +20,6 @@ from evaluation.utils.shared import (
codeact_user_response,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
is_fatal_evaluation_error,
make_metadata,
prepare_dataset,
@@ -161,11 +160,16 @@ def get_config(
instance_id=instance['instance_id'],
)
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
max_iterations=metadata.max_iterations,
enable_browser=RUN_WITH_BROWSING,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox_config=sandbox_config,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(
update_llm_config_for_completions_logging(
@@ -13,7 +13,6 @@ from evaluation.utils.shared import (
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -74,10 +73,16 @@ def get_config(
'VWA_WIKIPEDIA': f'{base_url}:8888',
'VWA_HOMEPAGE': f'{base_url}:4399',
}
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
attach_to_existing=True,
)
config.set_llm_config(
update_llm_config_for_completions_logging(
@@ -1,212 +0,0 @@
# WebArena CDP Integration Implementation Plan
## Overview
This document outlines the proper solution for integrating OpenHands with the official WebArena evaluation harness using Chrome DevTools Protocol (CDP) session logging.
## The Problem
WebArena evaluators require:
1. Live browser state (DOM, cookies, localStorage, etc.)
2. CDPSession object for making CDP calls
3. Page object for accessing current URL, title, content
OpenHands only provides:
1. Action/observation pairs in text format
2. No live browser state
3. No CDP access during evaluation
## The Solution: CDP Session Logging
### Phase 1: Capture Browser State During Inference
**Modify `openhands/runtime/browser/browser_env.py`:**
```python
class BrowserEnv:
def __init__(self, ...):
# ... existing code ...
self.cdp_logger = CDPSessionLogger() if should_log_cdp() else None
def step(self, action):
# ... existing action execution ...
# Log CDP state after each action
if self.cdp_logger:
self.cdp_logger.capture_state_snapshot(f"after_action_{action.action}")
# ... return observation ...
def close(self):
# Save final CDP session
if self.cdp_logger:
instance_id = get_current_instance_id() # from evaluation context
self.cdp_logger.save_session(instance_id)
```
**Add CDP Logger Integration:**
```python
class CDPSessionLogger:
def attach_to_browsergym_env(self, env):
"""Attach to BrowserGym environment's Playwright page."""
# Access the underlying Playwright page from BrowserGym
playwright_page = env.page # or however BrowserGym exposes it
self.attach_to_page(playwright_page)
def capture_state_snapshot(self, trigger: str):
"""Capture complete browser state using CDP."""
# DOM snapshot (key for WebArena evaluators)
dom_snapshot = self.cdp_session.send("DOMSnapshot.captureSnapshot", {
"computedStyles": [],
"includeDOMRects": True,
"includePaintOrder": True,
})
# All other state (cookies, localStorage, etc.)
# ... as shown in POC ...
```
### Phase 2: Mock Objects for Evaluation
**Create Mock Page/CDPSession:**
```python
class MockCDPSession:
def __init__(self, saved_state):
self.saved_state = saved_state
def send(self, method: str, params=None):
"""Return saved state instead of making live CDP calls."""
if method == "DOMSnapshot.captureSnapshot":
return self.saved_state["dom_snapshot"]
elif method == "Network.getAllCookies":
return self.saved_state["cookies"]
# ... handle all CDP methods WebArena uses ...
class MockPage:
def __init__(self, saved_state):
self.saved_state = saved_state
def url(self): return self.saved_state["final_url"]
def title(self): return self.saved_state["final_title"]
def context(self): return MockBrowserContext(self.saved_state)
# ... implement all Page methods WebArena uses ...
```
### Phase 3: Updated Evaluation Script
**Modify `eval_infer.py`:**
```python
def evaluate_with_official_webarena_harness(instance_data, config_file):
"""Use official WebArena evaluators with saved CDP state."""
# Load saved CDP session
cdp_integration = WebArenaCDPIntegration()
mock_page, mock_client = cdp_integration.create_mock_page_and_client(
instance_data["instance_id"]
)
# Convert OpenHands trajectory to WebArena format
trajectory = convert_openhands_trajectory_to_webarena_format(instance_data)
# Use official WebArena evaluator with mock objects
evaluator = evaluator_router(config_file)
score = evaluator(
trajectory=trajectory,
config_file=config_file,
page=mock_page, # Mock page with saved state
client=mock_client, # Mock CDP session with saved state
)
return score
```
## Implementation Steps
### Step 1: Integrate CDP Logger into BrowserEnv
1. **Add CDP logging to `browser_env.py`:**
- Detect when running WebArena evaluation
- Attach CDP logger to BrowserGym's Playwright page
- Capture state snapshots after each action
- Save final session with instance ID
2. **Environment variable setup:**
```bash
export WEBARENA_CDP_LOGGING=true
export WEBARENA_CDP_SESSION_DIR=/tmp/cdp_sessions
```
### Step 2: Create Mock Objects
1. **Implement `MockCDPSession`:**
- Handle all CDP methods WebArena evaluators use
- Return saved state instead of making live calls
- Support `DOMSnapshot.captureSnapshot`, `Network.getAllCookies`, etc.
2. **Implement `MockPage`:**
- Provide saved URL, title, content
- Mock JavaScript evaluation with saved state
- Support element queries using DOM snapshot
### Step 3: Update Evaluation Pipeline
1. **Modify `run_infer.py`:**
- Enable CDP logging for WebArena tasks
- Ensure instance IDs are properly set
- Save CDP sessions to accessible location
2. **Update `eval_infer.py`:**
- Load saved CDP sessions
- Create mock objects
- Use official WebArena evaluators
- Remove all heuristic evaluation logic
### Step 4: Testing and Validation
1. **Test with known tasks:**
- Run inference with CDP logging
- Verify CDP sessions are saved correctly
- Test evaluation with mock objects
- Compare results with expected outcomes
2. **Validate DOM snapshot format:**
- Ensure saved DOM snapshots match WebArena expectations
- Test all CDP methods used by evaluators
- Verify JavaScript evaluation works correctly
## Benefits of This Approach
1. **✅ Uses Official WebArena Evaluation:** No heuristics or approximations
2. **✅ Preserves Exact Browser State:** DOM, cookies, localStorage, etc.
3. **✅ No Live Browser Needed:** Evaluation works offline with saved state
4. **✅ Scalable:** Can evaluate many instances without browser overhead
5. **✅ Accurate:** Evaluators get exactly the state they expect
## File Structure
```
/tmp/cdp_sessions/
├── webarena.1.json # CDP session for task 1
├── webarena.2.json # CDP session for task 2
├── webarena.3.json # CDP session for task 3
└── webarena.4.json # CDP session for task 4
evaluation/benchmarks/webarena/
├── run_infer.py # Modified to enable CDP logging
├── eval_infer.py # Uses mock objects with saved state
├── cdp_integration.py # Mock Page/CDPSession implementation
└── IMPLEMENTATION_PLAN.md # This document
```
## Next Steps
1. **Implement CDP logger integration in `browser_env.py`**
2. **Create comprehensive mock objects**
3. **Update evaluation scripts**
4. **Test with actual WebArena tasks**
5. **Validate results against expected outcomes**
This approach solves the fundamental problem: WebArena evaluators need live browser state, but OpenHands only provides action/observation pairs. By capturing and replaying the exact browser state, we can use the official WebArena evaluation harness without any compromises.
+7 -48
View File
@@ -6,21 +6,11 @@ This folder contains evaluation for [WebArena](https://github.com/web-arena-x/we
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.
Make sure to install the evaluation dependencies:
```bash
poetry install --with evaluation
```
## Setup WebArena Environment
WebArena requires access to websites containing pre-populated content. You can either:
1. **Use an existing WebArena environment** (recommended for evaluation): Set the `WEBARENA_BASE_URL` environment variable to point to an existing WebArena server.
2. **Set up your own environment**: Follow [this document](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) to set up your own WebArena environment through local servers or AWS EC2 instances.
The WebArena evaluation package is already installed with the evaluation dependencies, so you don't need to clone the WebArena repository separately.
WebArena requires you to set up websites containing pre-populated content that is accessible via URL to the machine running the OpenHands agents.
Follow [this document](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) to set up your own WebArena environment through local servers or AWS EC2 instances.
Take note of the base URL (`$WEBARENA_BASE_URL`) of the machine where the environment is installed.
## Test if your environment works
@@ -31,51 +21,20 @@ Follow the WebArena environment setup guide carefully, and make sure the URL fie
## Run Evaluation
### Step 1: Run Inference
Before running, you must provide an LLM config in a local config.toml and pass its name to run_infer.sh:
1) Create config.toml in the repo root (this file is gitignored):
```toml
[llm.eval_openai]
model = "gpt-4o"
api_key = "sk-..." # Your OpenAI API key
```
2) Ensure Docker is installed and running (the first run will build a browser-enabled runtime image).
```bash
export WEBARENA_BASE_URL=<YOUR_SERVER_URL_HERE>
export OPENAI_API_KEY="yourkey" # this key is required for some WebArena validators that utilize LLMs
# args: MODEL_CONFIG COMMIT_HASH AGENT EVAL_LIMIT NUM_WORKERS
bash evaluation/benchmarks/webarena/scripts/run_infer.sh llm.eval_openai HEAD BrowsingAgent 3 1
bash evaluation/benchmarks/webarena/scripts/run_infer.sh
```
Results will be in `evaluation/evaluation_outputs/outputs/webarena/`
### Step 2: Evaluate Results
To calculate the success rate, run:
To evaluate the results and calculate success rate using the official WebArena harness, you must have the official WebArena repo and its Python dependencies available locally:
1) Clone the official repo and install deps (one-time):
```bash
cd /workspace/project
git clone https://github.com/web-arena-x/webarena
cd webarena && pip install -e .
```sh
poetry run python evaluation/benchmarks/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
```
2) Then run the evaluator:
```bash
poetry run python evaluation/benchmarks/webarena/eval_infer.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
```
Notes:
- The evaluator expects WEBARENA_BASE_URL to be set and the WebArena services to be reachable.
- If you skip installing the official harness, you can still inspect output.jsonl manually or write your own scorer, but the script above will fail without the harness.
## Submit your evaluation results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
@@ -1,283 +0,0 @@
#!/usr/bin/env python3
"""
BrowserGym State Capture for WebArena Evaluation
This module leverages BrowserGym's existing state capture capabilities to save
browser state for proper WebArena evaluation. BrowserGym already provides:
- extract_dom_snapshot() - exactly what WebArena evaluators need
- Direct Playwright page access via env.page
- CDP session access via page.context.new_cdp_session()
This is much simpler than our original CDP logging approach because BrowserGym
already has all the infrastructure we need.
"""
import json
from pathlib import Path
from typing import Any, Optional
import browsergym.core.observation as obs
class BrowserGymStateCapture:
    """Capture, persist and reload the final browser state of a BrowserGym run.

    The state is collected from the environment's ``page`` object through the
    ``browsergym.core.observation`` helpers (bound to ``obs`` in this module)
    plus a few CDP calls, and stored as one JSON file per instance ID inside
    ``output_dir``.
    """

    def __init__(self, output_dir: str = '/tmp/webarena_states'):
        """Create the capture helper and make sure ``output_dir`` exists.

        Args:
            output_dir: Directory that receives one ``<instance_id>.json`` per
                saved state. Created (with parents) if missing.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.current_instance_id: str | None = None

    def set_instance_id(self, instance_id: str) -> None:
        """Set the current WebArena instance ID used when saving state."""
        self.current_instance_id = instance_id

    def capture_final_state(self, browsergym_env) -> dict[str, Any]:
        """Collect the browser state of ``browsergym_env`` into a dict.

        Args:
            browsergym_env: Object exposing a ``page`` attribute; the page must
                provide ``url``, ``title()`` and ``context.new_cdp_session()``.

        Returns:
            A dict with the keys ``instance_id``, ``final_url``,
            ``final_title``, ``dom_snapshot``, ``screenshot``, ``axtree``,
            ``focused_element``, ``cookies``, ``local_storage`` and
            ``session_storage``.

        Raises:
            RuntimeError: If ``browsergym_env`` has no ``page`` attribute.
        """
        if not hasattr(browsergym_env, 'page'):
            raise RuntimeError('BrowserGym environment does not have page attribute')

        page = browsergym_env.page

        # Observation-derived state, obtained through the BrowserGym helpers.
        state: dict[str, Any] = {
            'instance_id': self.current_instance_id,
            'final_url': page.url,
            'final_title': page.title(),
            # NOTE(review): the original author states this matches the format
            # WebArena evaluators expect from CDP -- not verifiable from here.
            'dom_snapshot': obs.extract_dom_snapshot(page),
            # Additional state that might be useful
            'screenshot': obs.extract_screenshot(page),
            'axtree': obs.extract_merged_axtree(page),
            'focused_element': obs.extract_focused_element_bid(page),
        }

        # Cookies and web storage come from a dedicated CDP session. This part
        # is best-effort: a failure is reported and defaults are filled in.
        cdp_session = None
        try:
            cdp_session = page.context.new_cdp_session(page)

            state['cookies'] = cdp_session.send('Network.getAllCookies')

            local_storage = cdp_session.send(
                'Runtime.evaluate',
                {'expression': 'JSON.stringify(localStorage)', 'returnByValue': True},
            )
            state['local_storage'] = local_storage.get('result', {}).get('value', '{}')

            session_storage = cdp_session.send(
                'Runtime.evaluate',
                {'expression': 'JSON.stringify(sessionStorage)', 'returnByValue': True},
            )
            state['session_storage'] = session_storage.get('result', {}).get(
                'value', '{}'
            )
        except Exception as e:
            print(f'Warning: Could not capture additional state via CDP: {e}')
            # Only fill in what is missing: values captured before the failure
            # are real data and must not be replaced by empty defaults.
            state.setdefault('cookies', {'cookies': []})
            state.setdefault('local_storage', '{}')
            state.setdefault('session_storage', '{}')
        finally:
            # Always release the CDP session, including when a send() failed.
            if cdp_session is not None:
                try:
                    cdp_session.detach()
                except Exception as e:
                    print(f'Warning: Could not detach CDP session: {e}')

        return state

    def save_state(self, browsergym_env) -> str:
        """Capture the current browser state and write it to disk as JSON.

        Values that are not JSON-serializable are stored via ``str()``.

        Returns:
            The path of the written file, as a string.

        Raises:
            RuntimeError: If no instance ID has been set.
        """
        if self.current_instance_id is None:
            raise RuntimeError('Instance ID not set. Call set_instance_id() first.')

        state = self.capture_final_state(browsergym_env)

        state_file = self.output_dir / f'{self.current_instance_id}.json'
        # Explicit encoding keeps the file independent of the platform locale.
        with open(state_file, 'w', encoding='utf-8') as f:
            json.dump(state, f, indent=2, default=str)

        print(f'✅ Saved browser state to: {state_file}')
        return str(state_file)

    def load_state(self, instance_id: str) -> dict[str, Any]:
        """Load the saved browser state for ``instance_id`` from disk.

        Raises:
            FileNotFoundError: If no state file exists for ``instance_id``.
        """
        state_file = self.output_dir / f'{instance_id}.json'
        if not state_file.exists():
            raise FileNotFoundError(f'State file not found: {state_file}')

        with open(state_file, 'r', encoding='utf-8') as f:
            state = json.load(f)

        return state
class _CallableStr(str):
    """A ``str`` that also returns its plain value when called with no arguments."""

    def __call__(self) -> str:
        return str(self)


class MockPageForWebArena:
    """Stand-in for a browser page that answers from a saved state dict.

    ``saved_state`` is expected to hold the keys written by the state capture
    in this module (``final_url``, ``final_title``, ``local_storage``,
    ``session_storage``); missing keys fall back to empty defaults.
    """

    def __init__(self, saved_state: dict[str, Any]):
        self.saved_state = saved_state
        self._url = saved_state.get('final_url', '')
        self._title = saved_state.get('final_title', '')
        self._context = MockBrowserContextForWebArena(saved_state)

    @property
    def url(self) -> str:
        """Saved final URL.

        The capture code in this module reads ``page.url`` as an attribute, so
        the mock exposes it the same way. The returned string is also callable
        so that existing ``page.url()`` callers keep working.
        """
        return _CallableStr(self._url)

    def title(self) -> str:
        """Return the saved final page title."""
        return self._title

    @property
    def context(self):
        """Mock browser context built from the same saved state."""
        return self._context

    def evaluate(self, expression: str) -> Any:
        """Answer a JavaScript expression from the saved state.

        Matching is by substring, checked in this order: ``window.location.href``,
        ``document.title``, ``localStorage``, ``sessionStorage``. Any other
        expression yields ``None``.
        """
        if 'window.location.href' in expression:
            return self._url
        elif 'document.title' in expression:
            return self._title
        elif 'localStorage' in expression:
            return self.saved_state.get('local_storage', '{}')
        elif 'sessionStorage' in expression:
            return self.saved_state.get('session_storage', '{}')
        return None
class MockCDPSessionForWebArena:
    """Replay-only CDP session: every ``send`` is answered from a saved state dict."""

    def __init__(self, saved_state: dict[str, Any]):
        self.saved_state = saved_state

    def send(self, method: str, params: Optional[dict] = None) -> dict[str, Any]:
        """Return the saved answer for a CDP ``method``.

        Supported methods are ``DOMSnapshot.captureSnapshot``,
        ``Network.getAllCookies`` and ``Runtime.evaluate``. For the latter the
        ``expression`` parameter is matched by substring. Anything unrecognised
        yields an empty dict.
        """
        recorded = self.saved_state

        if method == 'DOMSnapshot.captureSnapshot':
            return recorded.get('dom_snapshot', {})

        if method == 'Network.getAllCookies':
            return recorded.get('cookies', {'cookies': []})

        # Only Runtime.evaluate with an expression is left to handle.
        if method != 'Runtime.evaluate' or not params or 'expression' not in params:
            return {}

        script = params['expression']
        # (substring to look for, saved-state key, fallback) -- order matters,
        # the first matching substring wins.
        lookups = (
            ('localStorage', 'local_storage', '{}'),
            ('sessionStorage', 'session_storage', '{}'),
            ('window.location.href', 'final_url', ''),
            ('document.title', 'final_title', ''),
        )
        for needle, key, fallback in lookups:
            if needle in script:
                return {'result': {'value': recorded.get(key, fallback)}}
        return {}

    def detach(self):
        """No-op; there is no live session to release."""
        pass
class MockBrowserContextForWebArena:
    """Browser-context stand-in that hands out mock CDP sessions."""

    def __init__(self, saved_state: dict[str, Any]):
        # Kept by reference so every session created later sees the same dict.
        self.saved_state = saved_state

    def new_cdp_session(self, page) -> MockCDPSessionForWebArena:
        """Create a mock CDP session over the saved state; ``page`` is ignored."""
        session = MockCDPSessionForWebArena(self.saved_state)
        return session
def integrate_with_openhands_browser_env() -> str:
    """Return an illustrative code snippet for wiring state capture into browser_env.py.

    Nothing is executed or patched here: the function only returns the example
    source text as a string, sketching a ``browser_process`` loop that creates a
    ``BrowserGymStateCapture`` when ``WEBARENA_EVALUATION`` is set and saves the
    state once a step reports ``terminated``.
    """
    # This would be added to browser_env.py in the browser_process method.
    # The text below is documentation only; it is never evaluated by this module.
    example_integration = """
def browser_process(self) -> None:
env = gym.make('browsergym/openended', ...)
obs, info = env.reset()
# Add state capture for WebArena evaluation
state_capture = None
if os.getenv('WEBARENA_EVALUATION'):
state_capture = BrowserGymStateCapture()
while should_continue():
if self.browser_side.poll(timeout=0.01):
unique_request_id, action_data = self.browser_side.recv()
# Handle WebArena instance ID setting
if unique_request_id == 'SET_WEBARENA_INSTANCE':
if state_capture:
state_capture.set_instance_id(action_data['instance_id'])
continue
action = action_data['action']
obs, reward, terminated, truncated, info = env.step(action)
# Capture final state when task completes
if terminated and state_capture:
state_capture.save_state(env)
# ... rest of existing code ...
"""
    return example_integration
def demonstrate_integration():
    """Print a short summary of the approach and its integration steps to stdout."""
    report = (
        '🚀 BrowserGym State Capture for WebArena',
        '=' * 50,
        # Advantages
        '✅ Key advantages of this approach:',
        " 1. Uses BrowserGym's existing observation functions",
        ' 2. extract_dom_snapshot() already returns WebArena-compatible format',
        ' 3. No custom CDP logging needed',
        ' 4. Minimal changes to OpenHands browser_env.py',
        ' 5. Leverages existing, tested BrowserGym infrastructure',
        # Steps (the leading newline produces a blank separator line)
        '\n📋 Integration steps:',
        ' 1. Add BrowserGymStateCapture to browser_env.py',
        ' 2. Capture state when WebArena tasks complete',
        ' 3. Use MockPageForWebArena and MockCDPSessionForWebArena in eval_infer.py',
        ' 4. Official WebArena evaluators work with mock objects',
        # Closing remark
        '\n🎯 This is much simpler than custom CDP logging because',
        ' BrowserGym already provides everything we need!',
    )
    for line in report:
        print(line)
# When run as a script, just print the integration summary.
if __name__ == '__main__':
    demonstrate_integration()
@@ -1,359 +0,0 @@
#!/usr/bin/env python3
"""
WebArena evaluation script for OpenHands outputs using official WebArena evaluation harness.
This script evaluates the results from run_infer.py using the official WebArena evaluation code.
This script requires:
1. Official WebArena repository cloned to /workspace/project/webarena
2. WebArena environment variables properly configured
3. Authentication files set up for WebArena sites
4. Docker containers running for WebArena sites
"""
import argparse
import json
import os
import sys
from typing import Any
# Set up environment variables for WebArena.
# Each site URL is derived from WEBARENA_BASE_URL plus a fixed port. Nothing is
# exported when WEBARENA_BASE_URL is unset or empty.
# NOTE(review): these are set before the harness import below, presumably
# because the harness reads them at import time -- confirm.
WEBARENA_BASE_URL = os.environ.get('WEBARENA_BASE_URL', '')
if WEBARENA_BASE_URL:
    os.environ['REDDIT'] = f'{WEBARENA_BASE_URL}:9999'
    os.environ['SHOPPING'] = f'{WEBARENA_BASE_URL}:7770'
    os.environ['SHOPPING_ADMIN'] = f'{WEBARENA_BASE_URL}:7780'
    os.environ['GITLAB'] = f'{WEBARENA_BASE_URL}:8023'
    os.environ['WIKIPEDIA'] = f'{WEBARENA_BASE_URL}:8888'
    os.environ['MAP'] = f'{WEBARENA_BASE_URL}:3000'
    os.environ['HOMEPAGE'] = f'{WEBARENA_BASE_URL}:4399'
# Add the webarena path to sys.path to import its modules.
# The checkout location is hard-coded; it is inserted at position 0 so it takes
# precedence over other entries on sys.path.
WEBARENA_PATH = '/workspace/project/webarena'
sys.path.insert(0, WEBARENA_PATH)
# Importing this module terminates the process with exit status 1 when the
# harness cannot be imported.
try:
    from browser_env import ScriptBrowserEnv, create_stop_action
    from browser_env.actions import Action
    from browser_env.utils import StateInfo
    from evaluation_harness import evaluator_router
    print('✅ WebArena evaluation harness imported successfully')
except ImportError as e:
    print(f'❌ Failed to import WebArena evaluation harness: {e}')
    print('Make sure the WebArena repository is cloned to /workspace/project/webarena')
    print('and all dependencies are installed.')
    sys.exit(1)
def load_config_file(config_path: str) -> dict[str, Any]:
    """Load a WebArena task config from a JSON file.

    Args:
        config_path: Path of the JSON config file.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly so non-ASCII task text does not depend on the
    # platform's locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def convert_openhands_action_to_webarena(action_data: dict[str, Any]) -> Action:
    """Map one OpenHands action record onto a WebArena ``Action``.

    The record's ``action`` string selects the mapping and ``args`` supplies its
    parameters. A ``browse`` without a URL and any unrecognised action both
    produce the ``'none'`` action.
    """
    kind = action_data.get('action', '')
    payload = action_data.get('args', {})

    if kind == 'browse':
        target = payload.get('url', '')
        if target:
            return Action(action_type='goto', coordinate=[0, 0], text=target)

    if kind == 'click':
        return Action(action_type='click', coordinate=payload.get('coordinate', [0, 0]))

    if kind == 'type':
        typed = payload.get('text', '')
        return Action(action_type='type', text=typed, coordinate=[0, 0])

    if kind == 'key':
        pressed = payload.get('key', '')
        return Action(action_type='key', text=pressed, coordinate=[0, 0])

    if kind == 'scroll':
        where = payload.get('coordinate', [0, 0])
        which_way = payload.get('direction', 'down')
        return Action(action_type='scroll', coordinate=where, text=which_way)

    if kind == 'finish':
        return create_stop_action('')

    # Default fallback for unknown actions (and for 'browse' with no URL)
    return Action(action_type='none', coordinate=[0, 0])
def convert_openhands_trajectory_to_webarena_format(
    openhands_output: dict[str, Any],
) -> list[Any]:
    """Build a WebArena-style trajectory from an OpenHands output record.

    The result starts with a placeholder initial ``StateInfo``. For every entry
    of ``openhands_output['history']`` with at least two elements, the first
    element is converted to a WebArena action and the second becomes the
    following ``StateInfo``. Shorter entries are skipped.
    """
    # Seed the trajectory with the initial state
    steps = [
        StateInfo(
            observation={'text': 'Initial state'}, info={'observation_metadata': {}}
        )
    ]

    for entry in openhands_output.get('history', []):
        if len(entry) < 2:
            # Not an (action, observation) pair -- nothing to convert.
            continue

        raw_action = entry[0]
        raw_observation = entry[1]

        # Action first, then the state that resulted from it.
        steps.append(convert_openhands_action_to_webarena(raw_action))
        steps.append(
            StateInfo(
                observation={'text': raw_observation.get('content', '')},
                info={'observation_metadata': raw_observation.get('extras', {})},
            )
        )

    return steps
def evaluate_with_official_webarena_harness(
    instance_data: dict[str, Any], config_file_path: str
) -> dict[str, Any]:
    """Score one OpenHands result with the official WebArena evaluator.

    Steps performed:
      1. Load the task config and print its intent and start URL.
      2. Convert the OpenHands history into a WebArena trajectory.
      3. Open a headless ``ScriptBrowserEnv``, reset it with the task config
         and replay every ``Action`` of the trajectory.
      4. Call the evaluator returned by ``evaluator_router`` on the resulting
         page and a fresh CDP session.

    Args:
        instance_data: One parsed record of the OpenHands output file.
        config_file_path: Path of the WebArena task config for this instance.

    Returns:
        A result dict. On success it holds ``instance_id``, ``score``,
        ``success``, ``trajectory_length``, ``evaluator``,
        ``evaluation_type`` and ``intent``. If anything raises, an error
        record with ``score`` 0.0 and the exception text under ``error`` is
        returned instead of propagating the exception.
    """
    instance_id = instance_data.get('instance_id', 'unknown')
    print(f'\n🔍 Evaluating instance: {instance_id}')

    try:
        # Task description, printed for the operator's benefit.
        task_cfg = load_config_file(config_file_path)
        intent = task_cfg.get('intent', '')
        start_url = task_cfg.get('start_url', '')
        print(f' Task: {intent}')
        print(f' Start URL: {start_url}')

        trajectory = convert_openhands_trajectory_to_webarena_format(instance_data)
        print(f' Converted trajectory with {len(trajectory)} steps')

        evaluator = evaluator_router(config_file_path)
        evaluator_name = type(evaluator).__name__
        print(f' Using evaluator: {evaluator_name}')

        env = ScriptBrowserEnv(
            headless=True,
            slow_mo=0,
            observation_type='accessibility_tree',
            current_viewport_only=True,
            viewport_size={'width': 1280, 'height': 720},
        )
        try:
            env.reset(options={'config_file': config_file_path})

            # The evaluator inspects the live browser, so the recorded actions
            # are replayed to bring the page to its final state.
            for index, step in enumerate(trajectory):
                if not isinstance(step, Action):
                    continue
                try:
                    _, _, done, _ = env.step(step)
                except Exception as replay_error:
                    # A failing step is reported and replay moves on.
                    print(f' Warning: Error replaying step {index}: {replay_error}')
                    continue
                if done:
                    break

            score = evaluator(
                trajectory=trajectory,
                config_file=config_file_path,
                page=env.page,
                client=env.page.context.new_cdp_session(env.page),
            )
            passed = score == 1.0
            outcome = {
                'instance_id': instance_id,
                'score': score,
                'success': passed,
                'trajectory_length': len(trajectory),
                'evaluator': evaluator_name,
                'evaluation_type': 'official_webarena_harness',
                'intent': intent,
            }
            verdict = '✅ PASS' if passed else '❌ FAIL'
            print(f' Result: {verdict} (score: {score})')
            return outcome
        finally:
            # Always release the browser, whether evaluation succeeded or not.
            env.close()
    except Exception as failure:
        print(f' ❌ Error evaluating {instance_id}: {failure}')
        return {
            'instance_id': instance_id,
            'score': 0.0,
            'success': False,
            'error': str(failure),
            'evaluator': 'error',
            'evaluation_type': 'error',
        }
def main():
    """Command-line entry point: score an OpenHands WebArena run.

    Reads the OpenHands ``output.jsonl`` file given on the command line,
    evaluates every record whose WebArena config file exists under
    ``--config_dir`` with ``evaluate_with_official_webarena_harness``, writes
    an aggregate JSON report to ``--results_file`` and prints a summary.

    Records without a matching config file are reported with score 0.0 and
    ``evaluation_type`` ``'config_error'``. The process exits with status 1
    when ``WEBARENA_BASE_URL`` is not set.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate WebArena results using ONLY the official WebArena evaluation harness'
    )
    parser.add_argument(
        'output_file', type=str, help='Path to OpenHands output.jsonl file'
    )
    parser.add_argument(
        '--results_file',
        type=str,
        default='webarena_official_eval_results.json',
        help='Path to save evaluation results',
    )
    parser.add_argument(
        '--config_dir',
        type=str,
        default='/workspace/project/webarena/config_files/examples',
        help='Directory containing WebArena config files',
    )
    args = parser.parse_args()

    print('🚀 Starting WebArena Evaluation with Official WebArena Harness ONLY')
    print(f'📁 Output file: {args.output_file}')
    print(f'📁 Config directory: {args.config_dir}')

    # Verify WebArena environment is properly set up
    if not WEBARENA_BASE_URL:
        print('❌ WEBARENA_BASE_URL environment variable not set')
        print('Please set WEBARENA_BASE_URL to your WebArena server URL')
        sys.exit(1)
    print(f'🌐 WebArena base URL: {WEBARENA_BASE_URL}')

    # Load OpenHands results (one JSON object per non-blank line)
    results = []
    with open(args.output_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                results.append(json.loads(line))
    print(f'📊 Found {len(results)} instances to evaluate')

    # Evaluate each instance using ONLY official WebArena evaluation harness
    evaluation_results = []
    total_score = 0.0
    for result in results:
        instance_id = result.get('instance_id', 'unknown')

        # Accept either plain numeric id ("8") or legacy prefixed id
        # ("webarena.8"). The id may be stored as a JSON number, so it is
        # converted to str before splitting instead of assuming a string.
        task_num = str(instance_id).split('.')[-1]
        config_file = os.path.join(args.config_dir, f'{task_num}.json')

        if os.path.exists(config_file):
            eval_result = evaluate_with_official_webarena_harness(result, config_file)
            evaluation_results.append(eval_result)
            total_score += eval_result.get('score', 0.0)
        else:
            print(f'\n🔍 Evaluating instance: {instance_id}')
            print(f' ⚠️ Config file not found: {config_file}')
            evaluation_results.append(
                {
                    'instance_id': instance_id,
                    'score': 0.0,
                    'success': False,
                    'error': f'Config file not found: {config_file}',
                    'evaluation_type': 'config_error',
                }
            )

    # Calculate final metrics (guarding against an empty input file)
    total_instances = len(evaluation_results)
    success_count = sum(1 for r in evaluation_results if r.get('success', False))
    success_rate = success_count / total_instances if total_instances > 0 else 0.0
    average_score = total_score / total_instances if total_instances > 0 else 0.0

    # Save results
    final_results = {
        'evaluation_method': 'official_webarena_harness_only',
        'webarena_base_url': WEBARENA_BASE_URL,
        'total_instances': total_instances,
        'success_count': success_count,
        'success_rate': success_rate,
        'average_score': average_score,
        'individual_results': evaluation_results,
    }
    with open(args.results_file, 'w', encoding='utf-8') as f:
        json.dump(final_results, f, indent=2)

    # Print summary
    print('\n' + '=' * 70)
    print('🎯 WEBARENA EVALUATION RESULTS (Official Harness ONLY)')
    print('=' * 70)
    print(f'📊 Total instances: {total_instances}')
    print(f'✅ Successful: {success_count}')
    print(f'❌ Failed: {total_instances - success_count}')
    print(f'📈 Success rate: {success_rate:.2%}')
    print(f'📊 Average score: {average_score:.4f}')
    print(f'💾 Results saved to: {args.results_file}')
    print('=' * 70)

    # Print individual results
    print('\n📋 Individual Results:')
    for result in evaluation_results:
        status = '✅ PASS' if result.get('success', False) else '❌ FAIL'
        score = result.get('score', 0.0)
        instance_id = result.get('instance_id', 'unknown')
        evaluator = result.get('evaluator', 'unknown')
        error = result.get('error', '')
        if error:
            print(f' {instance_id}: {status} (score: {score:.2f}) - Error: {error}')
        else:
            print(
                f' {instance_id}: {status} (score: {score:.2f}) - Evaluator: {evaluator}'
            )

    # Print requirements if there were errors
    error_count = sum(1 for r in evaluation_results if r.get('error'))
    if error_count > 0:
        print('\n' + '⚠️' * 20)
        print('EVALUATION ERRORS DETECTED')
        print('⚠️' * 20)
        print('This evaluation requires:')
        print('1. WebArena Docker containers running and accessible')
        print('2. Authentication files (.auth/) properly set up')
        print('3. All WebArena dependencies installed')
        print('4. Proper network access to WebArena sites')
        print('\nPlease resolve these issues for accurate evaluation.')
        print('⚠️' * 20)
if __name__ == '__main__':
main()
@@ -1,211 +0,0 @@
#!/usr/bin/env python3
"""
WebArena Evaluation Script
This script evaluates WebArena task results using the official WebArena evaluation harness
with BrowserGym state capture. It loads saved browser state and creates mock objects
that provide the exact state WebArena evaluators need.
This approach leverages BrowserGym's existing observation functions (extract_dom_snapshot, etc.)
which already provide WebArena-compatible state capture.
"""
import json
import os
import sys
from pathlib import Path
from typing import Any
# Add WebArena to path
sys.path.insert(0, '/workspace/project/webarena')
def convert_openhands_trajectory_to_webarena_format(
    instance_data: dict[str, Any],
) -> list[Any]:
    """Build a WebArena-style trajectory from an OpenHands history.

    Every ``history`` entry whose ``source`` is ``'agent'`` becomes an action
    dict (``action_type`` ``'browser_action'``); every entry whose ``source``
    is ``'user'`` becomes a state dict with an ``observation`` key. Entries
    from any other source are dropped. If the resulting list is non-empty and
    does not end with an action, a final ``'stop'`` action is appended whose
    timestamp is one more than that of the last element.

    Args:
        instance_data: One parsed record of the OpenHands output file.

    Returns:
        The list of action / state dicts, in history order.
    """
    steps: list[Any] = []

    for event in instance_data.get('history', []):
        origin = event.get('source')
        if origin not in ('agent', 'user'):
            continue

        text = event.get('message', {}).get('content', '')
        stamp = event.get('timestamp', 0)
        if origin == 'agent':
            # Agent messages are treated as browser actions.
            steps.append(
                {
                    'action_type': 'browser_action',
                    'content': text,
                    'timestamp': stamp,
                }
            )
        else:
            # User messages are treated as observations / state info.
            steps.append({'observation': text, 'timestamp': stamp})

    # Close the trajectory with a stop action when it ends on a state.
    if steps and not steps[-1].get('action_type'):
        last_stamp = steps[-1].get('timestamp', 0)
        steps.append(
            {
                'action_type': 'stop',
                'content': 'Task completed',
                'timestamp': last_stamp + 1,
            }
        )

    return steps
def evaluate_with_browsergym_state_capture(
    instance_data: dict[str, Any], config_file: str
) -> float:
    """Score one instance with the WebArena evaluator on saved browser state.

    The state saved for the instance is loaded through
    ``BrowserGymStateCapture`` and wrapped in ``MockPageForWebArena`` /
    ``MockCDPSessionForWebArena`` objects, which are handed to the evaluator
    returned by ``evaluator_router`` together with the converted trajectory.

    Args:
        instance_data: One parsed record of the OpenHands output file.
        config_file: Path of the WebArena task config for this instance.

    Returns:
        The evaluator's score, or 0.0 when the helper modules cannot be
        imported, no saved state exists for the instance, or any other
        exception is raised (a traceback is printed in that last case).
    """
    try:
        # Imported lazily so a missing module is reported instead of crashing.
        from browsergym_state_capture import (
            BrowserGymStateCapture,
            MockCDPSessionForWebArena,
            MockPageForWebArena,
        )
        from evaluation_harness import evaluator_router

        instance_id = instance_data.get('instance_id', 'unknown')
        capture = BrowserGymStateCapture()
        try:
            saved_state = capture.load_state(instance_id)
        except FileNotFoundError:
            print(f' ❌ No saved browser state found for {instance_id}')
            print(' Make sure inference was run with browser_logging_dir enabled')
            return 0.0
        else:
            print(f' ✅ Loaded browser state for {instance_id}')

        # Stand-ins for the live Playwright page / CDP session.
        page_stub = MockPageForWebArena(saved_state)
        client_stub = MockCDPSessionForWebArena(saved_state)

        trajectory = convert_openhands_trajectory_to_webarena_format(instance_data)
        evaluator = evaluator_router(config_file)

        return evaluator(
            trajectory=trajectory,
            config_file=config_file,
            page=page_stub,
            client=client_stub,
        )
    except ImportError as import_error:
        print(f' ❌ Could not import BrowserGym state capture: {import_error}')
        print(' Make sure browsergym_state_capture.py is available')
        return 0.0
    except Exception as failure:
        print(f' ❌ Evaluation failed: {failure}')
        import traceback

        traceback.print_exc()
        return 0.0
def main():
    """Main evaluation function.

    Expects exactly one command-line argument, the path of an OpenHands
    ``output.jsonl`` file. Every record whose config file
    ``<instance_id>.json`` exists in the WebArena examples directory is scored
    with ``evaluate_with_browsergym_state_capture``; records without a config
    file are skipped with a warning. A summary is printed at the end.

    Exits with status 1 on wrong usage or when the output file is missing.
    """
    if len(sys.argv) != 2:
        print('Usage: python eval_infer.py <output_file>')
        sys.exit(1)

    output_file = sys.argv[1]
    if not os.path.exists(output_file):
        print(f'❌ Output file not found: {output_file}')
        sys.exit(1)

    print('🔍 WebArena Evaluation (BrowserGym State Capture)')
    print('=' * 60)

    # Load results. Blank lines (e.g. a trailing newline at the end of the
    # file) are skipped instead of being fed to json.loads, which would raise.
    with open(output_file, 'r', encoding='utf-8') as f:
        results = [json.loads(line) for line in f if line.strip()]
    print(f'📊 Evaluating {len(results)} WebArena tasks...')

    # WebArena config files
    config_dir = Path('/workspace/project/webarena/config_files/examples')

    total_score = 0
    evaluated_count = 0
    for result in results:
        instance_id = result.get('instance_id', 'unknown')

        # Find corresponding config file
        config_file = config_dir / f'{instance_id}.json'
        if not config_file.exists():
            print(f'⚠️ Config file not found for {instance_id}')
            continue

        print(f'\n🧪 Evaluating {instance_id}...')
        try:
            # Use official WebArena evaluation with BrowserGym state capture
            score = evaluate_with_browsergym_state_capture(result, str(config_file))
            print(f' Score: {score}')
            total_score += score
            evaluated_count += 1
        except Exception as e:
            print(f' ❌ Evaluation failed: {e}')

    if evaluated_count > 0:
        average_score = total_score / evaluated_count
        print('\n📈 Results Summary:')
        print(f' Tasks evaluated: {evaluated_count}')
        print(f' Total score: {total_score}')
        print(f' Average score: {average_score:.3f}')
        print(
            f' Pass rate: {total_score}/{evaluated_count} ({100 * total_score / evaluated_count:.1f}%)'
        )
    else:
        print('\n❌ No tasks could be evaluated')

    print('\n🎯 Evaluation Method:')
    print(' - Uses official WebArena evaluation harness')
    print(' - Loads browser state captured by BrowserGym during inference')
    print(' - Creates mock Page/CDPSession objects with exact browser state')
    print(' - WebArena evaluators get the exact state they need')
    print('\n💡 To enable browser state capture during inference:')
    print(' export WEBARENA_BROWSER_LOGGING_DIR=/tmp/webarena_states')
if __name__ == '__main__':
main()
@@ -0,0 +1,33 @@
import argparse
import json
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
import gymnasium as gym
# Command-line interface: a single positional argument, the path of the
# output.jsonl file written by the inference run.
parser = argparse.ArgumentParser(description='Calculate average reward.')
parser.add_argument('output_path', type=str, help='path to output.jsonl')
args = parser.parse_args()

if __name__ == '__main__':
    # Every registered WebArena task counts toward the denominator of the
    # success rate, whether or not it appears in the output file.
    env_ids = [
        env_id
        for env_id in gym.envs.registry.keys()
        if env_id.startswith('browsergym/webarena')
    ]
    total_num = len(env_ids)
    print('Total number of tasks: ', total_num)

    total_reward = 0
    total_cost = 0
    actual_num = 0
    with open(args.output_path, 'r') as f:
        for line in f:
            # Tolerate blank lines such as a trailing newline.
            if not line.strip():
                continue
            data = json.loads(line)
            actual_num += 1
            total_cost += data['metrics']['accumulated_cost']
            # The inference script stores the reward under
            # test_result['reward']; a bare numeric test_result is still
            # accepted so files in that shape keep working.
            test_result = data['test_result']
            if isinstance(test_result, dict):
                total_reward += test_result['reward']
            else:
                total_reward += test_result

    # Guard both divisions: the registry or the output file may be empty.
    avg_reward = total_reward / total_num if total_num else 0.0
    print('Success Rate: ', avg_reward)

    avg_cost = total_cost / actual_num if actual_num else 0.0
    print('Avg Cost: ', avg_cost)
    print('Actual number of tasks finished: ', actual_num)
+64 -158
View File
@@ -1,17 +1,18 @@
import asyncio
import json
import os
from typing import Any
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
import gymnasium as gym
import pandas as pd
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -21,32 +22,29 @@ from openhands.controller.state.state import State
from openhands.core.config import (
OpenHandsConfig,
get_llm_config_arg,
parse_arguments,
)
from openhands.core.config.arg_utils import get_evaluation_parser
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import (
BrowseInteractiveAction,
CmdRunAction,
MessageAction,
)
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.runtime.browser.browser_env import (
BROWSER_EVAL_GET_GOAL_ACTION,
BROWSER_EVAL_GET_REWARDS_ACTION,
)
from openhands.utils.async_utils import call_async_from_sync
SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
'BrowsingAgent': codeact_user_response,
}
# Global variable to store task configs
TASK_CONFIGS = {}
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
def get_config(
metadata: EvalMetadata,
task_config: dict,
env_id: str,
) -> OpenHandsConfig:
base_url = os.environ.get('WEBARENA_BASE_URL', None)
openai_api_key = os.environ.get('OPENAI_API_KEY', None)
@@ -55,7 +53,7 @@ def get_config(
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
# Remove browsergym_eval_env dependency - we'll use regular browser environment
sandbox_config.browsergym_eval_env = env_id
sandbox_config.runtime_startup_env_vars = {
'BASE_URL': base_url,
'OPENAI_API_KEY': openai_api_key,
@@ -67,11 +65,15 @@ def get_config(
'MAP': f'{base_url}:3000',
'HOMEPAGE': f'{base_url}:4399',
}
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='docker',
sandbox_config=sandbox_config,
enable_browser=True,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
@@ -79,59 +81,30 @@ def get_config(
return config
def get_instruction(task_config: dict) -> MessageAction:
"""Create the instruction message for the agent based on the task config."""
intent = task_config.get('intent', 'Complete the task')
start_url = task_config.get('start_url', 'about:blank')
# BrowserGym WebArena already handles URL substitution, so we can use start_url directly
# Create a comprehensive instruction that includes the task and starting point
instruction = f"""You are a web browsing agent. Your task is: {intent}
Please start by navigating to: {start_url}
Complete the task by interacting with the webpage as needed. Use the browser tool to navigate, click, fill forms, and perform other web interactions to accomplish the goal."""
return MessageAction(content=instruction)
def initialize_runtime(
runtime: Runtime,
task_config: dict,
) -> None:
) -> dict:
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
Also performs initial navigation to the task's start_url because USE_NAV is disabled during evaluation.
"""
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
# Ensure workspace exists
# Set instance id
action = CmdRunAction(command='mkdir -p /workspace')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
assert obs.exit_code == 0
# Navigate to the configured start_url so the page is ready for the agent
try:
from openhands.events.action import BrowseInteractiveAction
start_url = task_config.get('start_url')
if start_url:
browse_action = BrowseInteractiveAction(
browser_actions=f'goto("{start_url}")',
return_axtree=True,
)
runtime.browse_interactive(browse_action)
else:
logger.warning(
'No start_url found in task_config; skipping initial navigation'
)
except Exception as e:
logger.error(f'Failed to perform initial navigation: {e}')
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
goal = obs.content
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
return goal
def complete_runtime(
@@ -139,40 +112,22 @@ def complete_runtime(
) -> dict[str, Any]:
"""Complete the runtime for the agent.
This function is called after the agent has run.
Since we're using the official webarena evaluation, we don't need to get rewards here.
This function is called after the agent has run.
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
obs: CmdOutputObservation
# Capture the final accessibility tree for WebArena evaluation
try:
# Create a browser action to get the current page state with accessibility tree
from openhands.events.action import BrowseInteractiveAction
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
# Use a no-op action that returns the accessibility tree
final_browse_action = BrowseInteractiveAction(
browser_actions='noop()', # No-op action to just get current state
return_axtree=True, # Ensure we get the accessibility tree
)
# Execute the action to get the final observation with accessibility tree
final_obs = runtime.browse_interactive(final_browse_action)
# Extract the accessibility tree from the observation
final_axtree = None
if hasattr(final_obs, 'axtree_object') and final_obs.axtree_object:
final_axtree = final_obs.axtree_object
logger.info('Successfully captured final accessibility tree')
else:
logger.warning('No accessibility tree found in final observation')
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return {'final_accessibility_tree': final_axtree}
except Exception as e:
logger.error(f'Error capturing final accessibility tree: {e}')
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return {'final_accessibility_tree': None}
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return {
'rewards': json.loads(obs.content),
}
def process_instance(
@@ -180,34 +135,31 @@ def process_instance(
metadata: EvalMetadata,
reset_logger: bool = True,
):
task_id = instance.instance_id
task_config = TASK_CONFIGS.get(task_id, {})
config = get_config(metadata, task_config)
env_id = instance.instance_id
config = get_config(metadata, env_id)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, str(task_id), log_dir)
reset_logger_for_multiprocessing(logger, env_id, log_dir)
else:
logger.info(f'Starting evaluation for task {task_id}.')
logger.info(f'Starting evaluation for instance {env_id}.')
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, task_config)
# Get the proper instruction message
message_action = get_instruction(task_config)
task_str = initialize_runtime(runtime)
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=message_action,
initial_user_action=MessageAction(content=task_str),
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
metadata.agent_class
],
)
)
# ======= Attempt to evaluate the agent's environment impact =======
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
@@ -223,6 +175,7 @@ def process_instance(
return_val = complete_runtime(runtime)
logger.info(f'Return value from complete_runtime: {return_val}')
reward = max(return_val['rewards'])
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
@@ -231,90 +184,43 @@ def process_instance(
# Save the output
output = EvalOutput(
instance_id=str(task_id),
instance_id=env_id,
instruction=instruction,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result={
'task_config': task_config, # Store task config for later evaluation
'final_accessibility_tree': return_val.get('final_accessibility_tree')
if return_val
else None,
'reward': reward,
},
)
return output
if __name__ == '__main__':
parser = get_evaluation_parser()
args = parser.parse_args()
args = parse_arguments()
# Set up WebArena environment variables for BrowserGym
base_url = os.environ.get('WEBARENA_BASE_URL', None)
if not base_url:
raise ValueError('WEBARENA_BASE_URL must be set')
# Set up the WA_ prefixed environment variables that BrowserGym expects
os.environ['WA_SHOPPING'] = f'{base_url}:7770/'
os.environ['WA_SHOPPING_ADMIN'] = f'{base_url}:7780/admin'
os.environ['WA_REDDIT'] = f'{base_url}:9999'
os.environ['WA_GITLAB'] = f'{base_url}:8023'
os.environ['WA_WIKIPEDIA'] = (
f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing'
)
os.environ['WA_MAP'] = f'{base_url}:3000'
os.environ['WA_HOMEPAGE'] = f'{base_url}:4399'
# Load webarena task configs from BrowserGym
from browsergym.webarena.config import TASK_IDS
from browsergym.webarena.task import GenericWebArenaTask
task_configs = []
# Load a subset of tasks for testing (first 10 tasks)
test_task_ids = list(TASK_IDS)[:10] # Use first 10 tasks for testing
for task_id in test_task_ids:
try:
# Create a temporary task to get the config
temp_task = GenericWebArenaTask(seed=42, task_id=task_id)
# Get the first (and likely only) task config for this task_id
if temp_task.task_configs:
task_config = temp_task.task_configs[0]
task_configs.append({'task_id': task_id, 'task_config': task_config})
except Exception as e:
print(f'Warning: Could not load task {task_id}: {e}')
continue
if not task_configs:
raise ValueError('No task configs could be loaded from BrowserGym WebArena')
print(f'Found {len(task_configs)} task configs from BrowserGym WebArena')
# Store task configs globally for process_instance to access
for task in task_configs:
TASK_CONFIGS[str(task['task_id'])] = task['task_config']
# Create dataset from task configs
dataset = pd.DataFrame(
[{'instance_id': str(task['task_id'])} for task in task_configs]
{
'instance_id': [
id
for id in gym.envs.registry.keys()
if id.startswith('browsergym/webarena')
]
}
)
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config, args.config_file)
llm_config = get_llm_config_arg(args.llm_config)
# modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
if llm_config:
llm_config.modify_params = False
llm_config.modify_params = False
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
metadata = make_metadata(
llm_config,
'webarena',
args.dataset_name,
args.agent_cls,
args.max_iterations,
args.eval_note,
@@ -38,7 +38,7 @@ EVAL_NOTE="$OPENHANDS_VERSION"
COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 30 \
--max-iterations 15 \
--eval-num-workers $NUM_WORKERS \
--eval-note $EVAL_NOTE"
@@ -1,19 +0,0 @@
#!/usr/bin/env bash
# WebArena environment configuration
# This script checks the environment variables needed for WebArena evaluation
# and prints a short status report. It does not modify the environment.

# Check if WEBARENA_BASE_URL is set
if [ -z "$WEBARENA_BASE_URL" ]; then
    echo "Warning: WEBARENA_BASE_URL is not set. Please set it to the base URL where webarena services are hosted."
    echo "Example: export WEBARENA_BASE_URL=http://your-webarena-host"
fi

# Check if OPENAI_API_KEY is set
if [ -z "$OPENAI_API_KEY" ]; then
    echo "Warning: OPENAI_API_KEY is not set. Please set it to your OpenAI API key."
fi

echo "WebArena environment configured:"
echo " WEBARENA_BASE_URL: $WEBARENA_BASE_URL"
# Report only whether the key is present. Combining ${VAR:+[SET]} with
# ${VAR:-[NOT SET]} would print the key itself whenever it is set, because
# ${VAR:-default} expands to the variable's value when it is non-empty.
if [ -n "$OPENAI_API_KEY" ]; then
    echo " OPENAI_API_KEY: [SET]"
else
    echo " OPENAI_API_KEY: [NOT SET]"
fi
+10 -5
View File
@@ -10,7 +10,6 @@ from evaluation.utils.shared import (
EvalOutput,
get_default_sandbox_config_for_eval,
get_metrics,
get_openhands_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -46,12 +45,18 @@ def get_config(
) -> OpenHandsConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.platform = 'linux/amd64'
config = get_openhands_config_for_eval(
metadata=metadata,
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox_config=sandbox_config,
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
# debug
debug=True,
)
config.debug = True
config.set_llm_config(
update_llm_config_for_completions_logging(
metadata.llm_config, metadata.eval_output_dir, instance_id
@@ -1,209 +0,0 @@
#!/usr/bin/env python3
"""
Script to aggregate token usage metrics from LLM completion files.
Usage:
python aggregate_token_usage.py <directory_path> [--input-cost <cost>] [--output-cost <cost>] [--cached-cost <cost>]
Arguments:
directory_path: Path to the directory containing completion files
--input-cost: Cost per input token (default: 0.0)
--output-cost: Cost per output token (default: 0.0)
--cached-cost: Cost per cached token (default: 0.0)
"""
import argparse
import json
import os
from pathlib import Path
def _token_count(value):
    """Return ``value`` as a token count, treating a missing or null field as 0."""
    return value or 0


def _find_usage(data):
    """Return the ``usage`` entry of ``response`` or ``fncall_response``, else None."""
    if not isinstance(data, dict):
        return None
    for key in ('response', 'fncall_response'):
        section = data.get(key)
        if isinstance(section, dict) and 'usage' in section:
            return section['usage']
    return None


def aggregate_token_usage(
    directory_path, input_cost=0.0, output_cost=0.0, cached_cost=0.0
):
    """
    Aggregate token usage metrics from all JSON completion files in the directory.

    Every ``*.json`` file below ``directory_path`` (searched recursively) is
    parsed. Files that carry a ``usage`` entry under ``response`` or
    ``fncall_response`` contribute their token counts and, when present, their
    ``cost`` value. Files without usage data are ignored; files that cannot be
    read or parsed are counted in ``files_with_errors``. A report is printed
    to stdout.

    Args:
        directory_path (str): Path to directory containing completion files
        input_cost (float): Cost per input token
        output_cost (float): Cost per output token
        cached_cost (float): Cost per cached token

    Returns:
        dict: Totals with the keys ``input_tokens`` (non-cached prompt
        tokens), ``output_tokens``, ``cached_tokens``, ``total_tokens``,
        ``files_processed``, ``files_with_errors`` and ``cost``.
    """
    # Initialize counters
    totals = {
        'input_tokens': 0,
        'output_tokens': 0,
        'cached_tokens': 0,
        'total_tokens': 0,
        'files_processed': 0,
        'files_with_errors': 0,
        'cost': 0,
    }

    # Find all JSON files recursively
    json_files = list(Path(directory_path).rglob('*.json'))
    print(f'Found {len(json_files)} JSON files to process...')

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Look for usage data in response or fncall_response
            usage_data = _find_usage(data)
            if not usage_data:
                continue

            # Extract token counts. A field that is present but null is
            # treated as 0 so it does not turn a valid file into an error.
            completion_tokens = _token_count(usage_data.get('completion_tokens'))
            prompt_tokens = _token_count(usage_data.get('prompt_tokens'))
            cached_tokens = _token_count(usage_data.get('cached_tokens'))

            # Handle cases where cached_tokens might be in prompt_tokens_details
            if cached_tokens == 0:
                details = usage_data.get('prompt_tokens_details')
                if isinstance(details, dict):
                    cached_tokens = _token_count(details.get('cached_tokens'))

            # Resolve the cost before touching the totals so that a file is
            # either counted completely or not at all.
            file_cost = _token_count(data.get('cost'))

            # Update totals; input_tokens holds the non-cached prompt tokens
            totals['input_tokens'] += prompt_tokens - cached_tokens
            totals['output_tokens'] += completion_tokens
            totals['cached_tokens'] += cached_tokens
            totals['total_tokens'] += prompt_tokens + completion_tokens
            totals['cost'] += file_cost
            totals['files_processed'] += 1

            # Progress indicator
            if totals['files_processed'] % 1000 == 0:
                print(f'Processed {totals["files_processed"]} files...')
        except Exception as e:
            totals['files_with_errors'] += 1
            if totals['files_with_errors'] <= 5:  # Only show first 5 errors
                print(f'Error processing {json_file}: {e}')

    # Calculate costs
    input_cost_total = totals['input_tokens'] * input_cost
    output_cost_total = totals['output_tokens'] * output_cost
    cached_cost_total = totals['cached_tokens'] * cached_cost
    total_cost = input_cost_total + output_cost_total + cached_cost_total

    # Print results
    print('\n' + '=' * 60)
    print('TOKEN USAGE AGGREGATION RESULTS')
    print('=' * 60)
    print(f'Files processed: {totals["files_processed"]:,}')
    print(f'Files with errors: {totals["files_with_errors"]:,}')
    print()
    print('TOKEN COUNTS:')
    print(f' Input tokens (non-cached): {totals["input_tokens"]:,}')
    print(f' Output tokens: {totals["output_tokens"]:,}')
    print(f' Cached tokens: {totals["cached_tokens"]:,}')
    print(f' Total tokens: {totals["total_tokens"]:,}')
    print(f' Total costs (based on returned value): ${totals["cost"]:.6f}')
    print()

    if input_cost > 0 or output_cost > 0 or cached_cost > 0:
        print('COST CALCULATED BASED ON PROVIDED RATE:')
        print(
            f' Input cost: ${input_cost_total:.6f} ({totals["input_tokens"]:,} × ${input_cost:.6f})'
        )
        print(
            f' Output cost: ${output_cost_total:.6f} ({totals["output_tokens"]:,} × ${output_cost:.6f})'
        )
        print(
            f' Cached cost: ${cached_cost_total:.6f} ({totals["cached_tokens"]:,} × ${cached_cost:.6f})'
        )
        print(f' Total cost: ${total_cost:.6f}')
        print()

    print('SUMMARY:')
    print(
        f' Total input tokens: {totals["input_tokens"] + totals["cached_tokens"]:,}'
    )
    print(f' Total output tokens: {totals["output_tokens"]:,}')
    print(f' Grand total tokens: {totals["total_tokens"]:,}')

    return totals
def main():
    """Command-line entry point for the token usage aggregation script.

    Parses the completions directory and the optional per-token cost rates,
    verifies that the given path exists and is a directory, and then runs
    ``aggregate_token_usage`` on it.

    Returns:
        int: 0 when aggregation finishes, 1 when the path is invalid or the
        aggregation raises an exception.
    """
    cli = argparse.ArgumentParser(
        description='Aggregate token usage metrics from LLM completion files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python aggregate_token_usage.py /path/to/completions
python aggregate_token_usage.py /path/to/completions --input-cost 0.000001 --output-cost 0.000002
python aggregate_token_usage.py /path/to/completions --input-cost 0.000001 --output-cost 0.000002 --cached-cost 0.0000005
""",
    )
    cli.add_argument(
        'directory_path', help='Path to directory containing completion files'
    )
    # The three rate options differ only in flag name and help text.
    rate_options = (
        ('--input-cost', 'Cost per input token (default: 0.0)'),
        ('--output-cost', 'Cost per output token (default: 0.0)'),
        ('--cached-cost', 'Cost per cached token (default: 0.0)'),
    )
    for flag, description in rate_options:
        cli.add_argument(flag, type=float, default=0.0, help=description)

    args = cli.parse_args()
    target = args.directory_path

    # Reject a missing path and a non-directory path with distinct messages.
    if not os.path.exists(target):
        print(f"Error: Directory '{target}' does not exist.")
        return 1
    if not os.path.isdir(target):
        print(f"Error: '{target}' is not a directory.")
        return 1

    # Any failure inside the aggregation is reported and mapped to exit code 1.
    try:
        aggregate_token_usage(
            target, args.input_cost, args.output_cost, args.cached_cost
        )
    except Exception as err:
        print(f'Error during aggregation: {err}')
        return 1
    else:
        return 0
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    # The bare `exit` helper is injected by the `site` module for interactive
    # use and is unavailable under `python -S`; raising SystemExit directly
    # works everywhere and needs no import.
    raise SystemExit(main())
-84
View File
@@ -188,14 +188,6 @@ def make_metadata(
pathlib.Path(os.path.join(eval_output_path, 'logs')).mkdir(
parents=True, exist_ok=True
)
# Allow overriding the evaluation output directory via env for smoke runs
override_output_dir = os.environ.get('EVAL_OUTPUT_DIR')
if override_output_dir:
eval_output_path = override_output_dir
pathlib.Path(eval_output_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(os.path.join(eval_output_path, 'logs')).mkdir(
parents=True, exist_ok=True
)
logger.info(f'Using evaluation output directory: {eval_output_path}')
metadata = EvalMetadata(
@@ -711,79 +703,3 @@ def get_default_sandbox_config_for_eval() -> SandboxConfig:
remote_runtime_enable_retries=True,
remote_runtime_class='sysbox',
)
def get_openhands_config_for_eval(
    metadata: EvalMetadata | None = None,
    sandbox_config: SandboxConfig | None = None,
    runtime: str | None = None,
    max_iterations: int | None = None,
    default_agent: str | None = None,
    enable_browser: bool = False,
    workspace_base: str | None = None,
    workspace_mount_path: str | None = None,
):
    """Build an OpenHandsConfig for an evaluation run.

    Explicit arguments win; missing values are taken from ``metadata`` when it
    is given, and otherwise from built-in fallbacks. The resulting config
    always uses a local file store located at ``.eval_sessions`` under the
    current working directory and sets ``run_as_openhands`` to False.

    Args:
        metadata: Optional EvalMetadata; supplies ``max_iterations`` and
            ``agent_class`` when the matching arguments are None.
        sandbox_config: Sandbox configuration. When None, the result of
            get_default_sandbox_config_for_eval() is used.
        runtime: Runtime name. When None, the RUNTIME environment variable is
            used, falling back to 'docker'.
        max_iterations: Iteration limit. When None and not provided by
            metadata, 50 is used.
        default_agent: Agent class name. When None and not provided by
            metadata, 'CodeActAgent' is used.
        enable_browser: Passed through to the config unchanged.
        workspace_base: Passed through to the config unchanged.
        workspace_mount_path: Passed through to the config unchanged.

    Returns:
        The constructed OpenHandsConfig instance.
    """
    # Imported here rather than at module level to avoid a circular import.
    from openhands.core.config.openhands_config import (
        OpenHandsConfig as _OHConfig,  # type: ignore
    )

    if sandbox_config is None:
        sandbox_config = get_default_sandbox_config_for_eval()

    # Resolve the iteration limit: argument -> metadata -> 50.
    if max_iterations is None and metadata is not None:
        max_iterations = metadata.max_iterations
    # Resolve the agent name: argument -> metadata -> 'CodeActAgent'.
    if default_agent is None and metadata is not None:
        default_agent = metadata.agent_class

    # Resolve the runtime: argument -> $RUNTIME -> 'docker'.
    if runtime is None:
        runtime = os.environ.get('RUNTIME', 'docker')

    if default_agent is None:
        default_agent = 'CodeActAgent'
    if max_iterations is None:
        max_iterations = 50

    # Absolute path of .eval_sessions inside the current working directory.
    session_store_path = os.path.abspath('.eval_sessions')

    return _OHConfig(
        default_agent=default_agent,
        run_as_openhands=False,
        runtime=runtime,
        max_iterations=max_iterations,
        enable_browser=enable_browser,
        sandbox=sandbox_config,
        workspace_base=workspace_base,
        workspace_mount_path=workspace_mount_path,
        file_store='local',
        file_store_path=session_store_path,
    )
@@ -14,32 +14,21 @@ import { Conversation } from "#/api/open-hands.types";
// Mock hooks
const mockUseUserProviders = vi.fn();
const mockUseGitRepositories = vi.fn();
const mockUseUserRepositories = vi.fn();
const mockUseConfig = vi.fn();
const mockUseRepositoryMicroagents = vi.fn();
const mockUseMicroagentManagementConversations = vi.fn();
vi.mock("#/hooks/use-user-providers", () => ({
useUserProviders: () => mockUseUserProviders(),
}));
vi.mock("#/hooks/query/use-git-repositories", () => ({
useGitRepositories: () => mockUseGitRepositories(),
vi.mock("#/hooks/query/use-user-repositories", () => ({
useUserRepositories: () => mockUseUserRepositories(),
}));
vi.mock("#/hooks/query/use-config", () => ({
useConfig: () => mockUseConfig(),
}));
vi.mock("#/hooks/query/use-repository-microagents", () => ({
useRepositoryMicroagents: () => mockUseRepositoryMicroagents(),
}));
vi.mock("#/hooks/query/use-microagent-management-conversations", () => ({
useMicroagentManagementConversations: () =>
mockUseMicroagentManagementConversations(),
}));
describe("MicroagentManagement", () => {
const RouterStub = createRoutesStub([
{
@@ -185,7 +174,7 @@ describe("MicroagentManagement", () => {
providers: ["github"],
});
mockUseGitRepositories.mockReturnValue({
mockUseUserRepositories.mockReturnValue({
data: {
pages: [
{
@@ -207,18 +196,6 @@ describe("MicroagentManagement", () => {
},
});
mockUseRepositoryMicroagents.mockReturnValue({
data: mockMicroagents,
isLoading: false,
isError: false,
});
mockUseMicroagentManagementConversations.mockReturnValue({
data: mockConversations,
isLoading: false,
isError: false,
});
// Setup default mock for retrieveUserGitRepositories
vi.spyOn(OpenHands, "retrieveUserGitRepositories").mockResolvedValue({
data: [...mockRepositories],
@@ -250,7 +227,7 @@ describe("MicroagentManagement", () => {
it("should display loading state when fetching repositories", async () => {
// Mock loading state
mockUseGitRepositories.mockReturnValue({
mockUseUserRepositories.mockReturnValue({
data: undefined,
isLoading: true,
isError: false,
@@ -268,7 +245,7 @@ describe("MicroagentManagement", () => {
it("should handle error when fetching repositories", async () => {
// Mock error state
mockUseGitRepositories.mockReturnValue({
mockUseUserRepositories.mockReturnValue({
data: undefined,
isLoading: false,
isError: true,
@@ -281,7 +258,7 @@ describe("MicroagentManagement", () => {
// Wait for the error to be handled
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
});
@@ -290,7 +267,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Check that tabs are rendered
@@ -308,7 +285,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded and rendered
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Check that repository names are displayed
@@ -323,7 +300,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -332,7 +309,10 @@ describe("MicroagentManagement", () => {
// Wait for microagents to be fetched
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
"user",
"repo2",
);
});
// Check that microagents are displayed
@@ -345,17 +325,19 @@ describe("MicroagentManagement", () => {
it("should display loading state when fetching microagents", async () => {
const user = userEvent.setup();
mockUseRepositoryMicroagents.mockReturnValue({
data: undefined,
isLoading: true,
isError: false,
});
const getRepositoryMicroagentsSpy = vi.spyOn(
OpenHands,
"getRepositoryMicroagents",
);
getRepositoryMicroagentsSpy.mockImplementation(
() => new Promise(() => {}), // Never resolves
);
renderMicroagentManagement();
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -368,17 +350,19 @@ describe("MicroagentManagement", () => {
it("should handle error when fetching microagents", async () => {
const user = userEvent.setup();
mockUseRepositoryMicroagents.mockReturnValue({
data: undefined,
isLoading: false,
isError: true,
});
const getRepositoryMicroagentsSpy = vi.spyOn(
OpenHands,
"getRepositoryMicroagents",
);
getRepositoryMicroagentsSpy.mockRejectedValue(
new Error("Failed to fetch microagents"),
);
renderMicroagentManagement();
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -387,23 +371,23 @@ describe("MicroagentManagement", () => {
// Wait for the error to be handled
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(getRepositoryMicroagentsSpy).toHaveBeenCalledWith("user", "repo2");
});
});
it("should display empty state when no microagents are found", async () => {
const user = userEvent.setup();
mockUseRepositoryMicroagents.mockReturnValue({
data: [],
isLoading: false,
isError: false,
});
const getRepositoryMicroagentsSpy = vi.spyOn(
OpenHands,
"getRepositoryMicroagents",
);
getRepositoryMicroagentsSpy.mockResolvedValue([]);
renderMicroagentManagement();
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -412,7 +396,7 @@ describe("MicroagentManagement", () => {
// Wait for microagents to be fetched
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(getRepositoryMicroagentsSpy).toHaveBeenCalledWith("user", "repo2");
});
// Check that no microagents are displayed
@@ -426,7 +410,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -435,7 +419,10 @@ describe("MicroagentManagement", () => {
// Wait for microagents to be fetched
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
"user",
"repo2",
);
});
// Check that microagent cards display correct information
@@ -462,7 +449,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded and processed
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Wait for repositories to be displayed in the accordion
@@ -481,7 +468,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded and processed
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Wait for repositories to be displayed in the accordion
@@ -505,7 +492,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click the first add microagent button
@@ -526,7 +513,7 @@ describe("MicroagentManagement", () => {
it("should display empty state when no repositories are found", async () => {
// Mock empty repositories
mockUseGitRepositories.mockReturnValue({
mockUseUserRepositories.mockReturnValue({
data: {
pages: [
{
@@ -546,7 +533,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Check that empty state messages are displayed
@@ -563,7 +550,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -572,11 +559,14 @@ describe("MicroagentManagement", () => {
// Wait for microagents to be fetched for first repo
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
"user",
"repo2",
);
});
// Check that the hook was called
expect(mockUseRepositoryMicroagents).toHaveBeenCalledTimes(1);
// Check that the API call was made
expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledTimes(1);
});
it("should display ready to add microagent message in main area", async () => {
@@ -601,7 +591,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Check that search input is rendered
@@ -621,7 +611,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Initially only repositories with .openhands should be visible
@@ -652,7 +642,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Type in search input with uppercase
@@ -675,7 +665,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Type in search input with partial match
@@ -701,7 +691,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Type in search input
@@ -734,7 +724,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Type in search input with non-existent repository name
@@ -762,7 +752,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Type in search input with special characters
@@ -783,7 +773,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Filter to show only repo2
@@ -798,7 +788,10 @@ describe("MicroagentManagement", () => {
// Wait for microagents to be fetched
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
"user",
"repo2",
);
});
// Check that microagents are displayed
@@ -815,7 +808,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Type in search input with leading/trailing whitespace
@@ -835,7 +828,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
const searchInput = screen.getByRole("textbox", {
@@ -860,14 +853,14 @@ describe("MicroagentManagement", () => {
});
// Search conversations functionality tests
describe("Microagent management conversations functionality", () => {
it("should call useMicroagentManagementConversations API when repository is expanded", async () => {
describe("Search conversations functionality", () => {
it("should call searchConversations API when repository is expanded", async () => {
const user = userEvent.setup();
renderMicroagentManagement();
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -876,8 +869,15 @@ describe("MicroagentManagement", () => {
// Wait for both microagents and conversations to be fetched
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
"user",
"repo2",
);
expect(OpenHands.searchConversations).toHaveBeenCalledWith(
"user/repo2/.openhands",
"microagent_management",
1000,
);
});
});
@@ -887,7 +887,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -896,8 +896,8 @@ describe("MicroagentManagement", () => {
// Wait for both queries to complete
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalled();
expect(OpenHands.searchConversations).toHaveBeenCalled();
});
// Check that microagents are displayed
@@ -917,22 +917,23 @@ describe("MicroagentManagement", () => {
it("should show loading state when both microagents and conversations are loading", async () => {
const user = userEvent.setup();
mockUseRepositoryMicroagents.mockReturnValue({
data: undefined,
isLoading: true,
isError: false,
});
mockUseMicroagentManagementConversations.mockReturnValue({
data: undefined,
isLoading: true,
isError: false,
});
const getRepositoryMicroagentsSpy = vi.spyOn(
OpenHands,
"getRepositoryMicroagents",
);
const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
// Make both queries never resolve
getRepositoryMicroagentsSpy.mockImplementation(
() => new Promise(() => {}),
);
searchConversationsSpy.mockImplementation(() => new Promise(() => {}));
renderMicroagentManagement();
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -949,7 +950,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -958,8 +959,8 @@ describe("MicroagentManagement", () => {
// Wait for both queries to complete
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalled();
expect(OpenHands.searchConversations).toHaveBeenCalled();
});
// Check that loading spinner is not displayed
@@ -974,7 +975,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -983,8 +984,8 @@ describe("MicroagentManagement", () => {
// Wait for both queries to complete
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalled();
expect(OpenHands.searchConversations).toHaveBeenCalled();
});
// Check that microagent file paths are displayed for microagents
@@ -1009,22 +1010,21 @@ describe("MicroagentManagement", () => {
it("should show learn this repo component when no microagents and no conversations", async () => {
const user = userEvent.setup();
mockUseRepositoryMicroagents.mockReturnValue({
data: [],
isLoading: false,
isError: false,
});
mockUseMicroagentManagementConversations.mockReturnValue({
data: [],
isLoading: false,
isError: false,
});
const getRepositoryMicroagentsSpy = vi.spyOn(
OpenHands,
"getRepositoryMicroagents",
);
const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
// Mock both queries to return empty arrays
getRepositoryMicroagentsSpy.mockResolvedValue([]);
searchConversationsSpy.mockResolvedValue([]);
renderMicroagentManagement();
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -1033,8 +1033,8 @@ describe("MicroagentManagement", () => {
// Wait for both queries to complete
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(getRepositoryMicroagentsSpy).toHaveBeenCalled();
expect(searchConversationsSpy).toHaveBeenCalled();
});
// Check that the learn this repo component is displayed
@@ -1046,22 +1046,21 @@ describe("MicroagentManagement", () => {
it("should show learn this repo component when only conversations exist but no microagents", async () => {
const user = userEvent.setup();
mockUseRepositoryMicroagents.mockReturnValue({
data: [],
isLoading: false,
isError: false,
});
mockUseMicroagentManagementConversations.mockReturnValue({
data: [...mockConversations],
isLoading: false,
isError: false,
});
const getRepositoryMicroagentsSpy = vi.spyOn(
OpenHands,
"getRepositoryMicroagents",
);
const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
// Mock microagents to return empty array, conversations to return data
getRepositoryMicroagentsSpy.mockResolvedValue([]);
searchConversationsSpy.mockResolvedValue([...mockConversations]);
renderMicroagentManagement();
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -1070,8 +1069,8 @@ describe("MicroagentManagement", () => {
// Wait for both queries to complete
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(getRepositoryMicroagentsSpy).toHaveBeenCalled();
expect(searchConversationsSpy).toHaveBeenCalled();
});
// Check that conversations are displayed
@@ -1089,22 +1088,21 @@ describe("MicroagentManagement", () => {
it("should show learn this repo component when only microagents exist but no conversations", async () => {
const user = userEvent.setup();
mockUseRepositoryMicroagents.mockReturnValue({
data: [...mockMicroagents],
isLoading: false,
isError: false,
});
mockUseMicroagentManagementConversations.mockReturnValue({
data: [],
isLoading: false,
isError: false,
});
const getRepositoryMicroagentsSpy = vi.spyOn(
OpenHands,
"getRepositoryMicroagents",
);
const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
// Mock microagents to return data, conversations to return empty array
getRepositoryMicroagentsSpy.mockResolvedValue([...mockMicroagents]);
searchConversationsSpy.mockResolvedValue([]);
renderMicroagentManagement();
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -1113,8 +1111,8 @@ describe("MicroagentManagement", () => {
// Wait for both queries to complete
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(getRepositoryMicroagentsSpy).toHaveBeenCalled();
expect(searchConversationsSpy).toHaveBeenCalled();
});
// Check that microagents are displayed
@@ -1132,17 +1130,16 @@ describe("MicroagentManagement", () => {
it("should handle error when fetching conversations", async () => {
const user = userEvent.setup();
mockUseMicroagentManagementConversations.mockReturnValue({
data: undefined,
isLoading: false,
isError: true,
});
const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
searchConversationsSpy.mockRejectedValue(
new Error("Failed to fetch conversations"),
);
renderMicroagentManagement();
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -1151,7 +1148,11 @@ describe("MicroagentManagement", () => {
// Wait for the error to be handled
await waitFor(() => {
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(searchConversationsSpy).toHaveBeenCalledWith(
"user/repo2/.openhands",
"microagent_management",
1000,
);
});
// Check that the learn this repo component is displayed (since conversations failed)
@@ -1162,22 +1163,27 @@ describe("MicroagentManagement", () => {
});
// Also check that the microagents query was called successfully
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
"user",
"repo2",
);
});
it("should handle error when fetching microagents but conversations succeed", async () => {
const user = userEvent.setup();
mockUseRepositoryMicroagents.mockReturnValue({
data: undefined,
isLoading: false,
isError: true,
});
const getRepositoryMicroagentsSpy = vi.spyOn(
OpenHands,
"getRepositoryMicroagents",
);
getRepositoryMicroagentsSpy.mockRejectedValue(
new Error("Failed to fetch microagents"),
);
renderMicroagentManagement();
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -1186,7 +1192,10 @@ describe("MicroagentManagement", () => {
// Wait for the error to be handled
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(getRepositoryMicroagentsSpy).toHaveBeenCalledWith(
"user",
"repo2",
);
});
// Check that the learn this repo component is displayed (since microagents failed)
@@ -1196,22 +1205,28 @@ describe("MicroagentManagement", () => {
expect(learnThisRepo).toBeInTheDocument();
});
it("should call useMicroagentManagementConversations with correct parameters", async () => {
it("should call searchConversations with correct parameters", async () => {
const user = userEvent.setup();
const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
renderMicroagentManagement();
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
const repoAccordion = screen.getByTestId("repository-name-tooltip");
await user.click(repoAccordion);
// Wait for useMicroagentManagementConversations to be called
// Wait for searchConversations to be called
await waitFor(() => {
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(searchConversationsSpy).toHaveBeenCalledWith(
"user/repo2/.openhands",
"microagent_management",
1000,
);
});
});
@@ -1221,7 +1236,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -1230,8 +1245,8 @@ describe("MicroagentManagement", () => {
// Wait for both queries to complete
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalled();
expect(OpenHands.searchConversations).toHaveBeenCalled();
});
// Check that conversations display correct information
@@ -1248,7 +1263,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion
@@ -1257,8 +1272,15 @@ describe("MicroagentManagement", () => {
// Wait for both queries to be called for first repo
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
"user",
"repo2",
);
expect(OpenHands.searchConversations).toHaveBeenCalledWith(
"user/repo2/.openhands",
"microagent_management",
1000,
);
});
// Check that both microagents and conversations are displayed
@@ -1282,7 +1304,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded and processed
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Wait for repositories to be displayed in the accordion
@@ -1303,7 +1325,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded and processed
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Wait for repositories to be displayed in the accordion
@@ -1365,7 +1387,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded and processed
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Wait for repositories to be displayed in the accordion
@@ -1396,7 +1418,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded and processed
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Wait for repositories to be displayed in the accordion
@@ -1426,7 +1448,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded and processed
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Wait for repositories to be displayed in the accordion
@@ -1466,7 +1488,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded and processed
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Wait for repositories to be displayed in the accordion
@@ -1500,7 +1522,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded and processed
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Wait for repositories to be displayed in the accordion
@@ -1533,7 +1555,7 @@ describe("MicroagentManagement", () => {
// Wait for repositories to be loaded and processed
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Wait for repositories to be displayed in the accordion
@@ -2387,22 +2409,19 @@ describe("MicroagentManagement", () => {
const user = userEvent.setup();
// Setup mocks before rendering
mockUseRepositoryMicroagents.mockReturnValue({
data: [],
isLoading: false,
isError: false,
});
mockUseMicroagentManagementConversations.mockReturnValue({
data: [],
isLoading: false,
isError: false,
});
const getRepositoryMicroagentsSpy = vi.spyOn(
OpenHands,
"getRepositoryMicroagents",
);
const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
getRepositoryMicroagentsSpy.mockResolvedValue([]);
searchConversationsSpy.mockResolvedValue([]);
renderMicroagentManagement();
// Wait for repositories to be loaded
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
// Find and click on the first repository accordion to expand it
@@ -2411,8 +2430,8 @@ describe("MicroagentManagement", () => {
// Wait for microagents and conversations to be fetched
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(getRepositoryMicroagentsSpy).toHaveBeenCalled();
expect(searchConversationsSpy).toHaveBeenCalled();
});
// Verify the learn this repo trigger is displayed when no microagents exist
@@ -2432,22 +2451,19 @@ describe("MicroagentManagement", () => {
const user = userEvent.setup();
// Setup mocks
mockUseRepositoryMicroagents.mockReturnValue({
data: [],
isLoading: false,
isError: false,
});
mockUseMicroagentManagementConversations.mockReturnValue({
data: [],
isLoading: false,
isError: false,
});
const getRepositoryMicroagentsSpy = vi.spyOn(
OpenHands,
"getRepositoryMicroagents",
);
const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
getRepositoryMicroagentsSpy.mockResolvedValue([]);
searchConversationsSpy.mockResolvedValue([]);
renderMicroagentManagement();
// Wait for repositories and expand accordion
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
const repoAccordion = screen.getByTestId("repository-name-tooltip");
@@ -2480,36 +2496,35 @@ describe("MicroagentManagement", () => {
const user = userEvent.setup();
// Setup mocks with existing microagents (should NOT show trigger)
mockUseRepositoryMicroagents.mockReturnValue({
data: [
{
name: "test-microagent",
created_at: "2021-10-01",
git_provider: "github",
path: ".openhands/microagents/test",
},
],
isLoading: false,
isError: false,
});
mockUseMicroagentManagementConversations.mockReturnValue({
data: [],
isLoading: false,
isError: false,
});
const getRepositoryMicroagentsSpy = vi.spyOn(
OpenHands,
"getRepositoryMicroagents",
);
const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
// Mock with existing microagent
getRepositoryMicroagentsSpy.mockResolvedValue([
{
name: "test-microagent",
created_at: "2021-10-01",
git_provider: "github",
path: ".openhands/microagents/test",
},
]);
searchConversationsSpy.mockResolvedValue([]);
renderMicroagentManagement();
await waitFor(() => {
expect(mockUseGitRepositories).toHaveBeenCalled();
expect(mockUseUserRepositories).toHaveBeenCalled();
});
const repoAccordion = screen.getByTestId("repository-name-tooltip");
await user.click(repoAccordion);
await waitFor(() => {
expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
expect(getRepositoryMicroagentsSpy).toHaveBeenCalled();
expect(searchConversationsSpy).toHaveBeenCalled();
});
// Should NOT show the learn this repo trigger when microagents exist
+22 -44
View File
@@ -79,35 +79,6 @@ describe("Content", () => {
expect(screen.getByTestId("set-indicator")).toBeInTheDocument();
});
});
// Verifies that the security analyzer input is rendered only while the
// confirmation-mode switch is checked: hidden initially, shown after the first
// click, and hidden again after the second click.
it("should conditionally show security analyzer based on confirmation mode", async () => {
renderLlmSettingsScreen();
// Wait for the settings screen to mount before querying its controls
await screen.findByTestId("llm-settings-screen");
const confirmation = screen.getByTestId("enable-confirmation-mode-switch");
// Initially confirmation mode is false, so security analyzer should not be visible
expect(confirmation).not.toBeChecked();
expect(
screen.queryByTestId("security-analyzer-input"),
).not.toBeInTheDocument();
// Enable confirmation mode
await userEvent.click(confirmation);
expect(confirmation).toBeChecked();
// Security analyzer should now be visible
// (getByTestId throws when the element is missing, so this call is the assertion)
screen.getByTestId("security-analyzer-input");
// Disable confirmation mode again
await userEvent.click(confirmation);
expect(confirmation).not.toBeChecked();
// Security analyzer should be hidden again
expect(
screen.queryByTestId("security-analyzer-input"),
).not.toBeInTheDocument();
});
});
describe("Advanced form", () => {
@@ -136,6 +107,7 @@ describe("Content", () => {
within(advancedForm).getByTestId("llm-api-key-input");
within(advancedForm).getByTestId("llm-api-key-help-anchor-advanced");
within(advancedForm).getByTestId("agent-input");
within(advancedForm).getByTestId("enable-confirmation-mode-switch");
within(advancedForm).getByTestId("enable-memory-condenser-switch");
await userEvent.click(advancedSwitch);
@@ -158,6 +130,9 @@ describe("Content", () => {
const baseUrl = screen.getByTestId("base-url-input");
const apiKey = screen.getByTestId("llm-api-key-input");
const agent = screen.getByTestId("agent-input");
const confirmation = screen.getByTestId(
"enable-confirmation-mode-switch",
);
const condensor = screen.getByTestId("enable-memory-condenser-switch");
expect(model).toHaveValue("openhands/claude-sonnet-4-20250514");
@@ -165,7 +140,15 @@ describe("Content", () => {
expect(apiKey).toHaveValue("");
expect(apiKey).toHaveProperty("placeholder", "");
expect(agent).toHaveValue("CodeActAgent");
expect(confirmation).not.toBeChecked();
expect(condensor).toBeChecked();
// security analyzer stays hidden until confirmation mode is enabled
expect(
screen.queryByTestId("security-analyzer-input"),
).not.toBeInTheDocument();
await userEvent.click(confirmation);
screen.getByTestId("security-analyzer-input");
});
it("should render the advanced form if existings settings are advanced", async () => {
@@ -194,7 +177,7 @@ describe("Content", () => {
agent: "CoActAgent",
confirmation_mode: true,
enable_default_condenser: false,
security_analyzer: "none",
security_analyzer: "mock-invariant",
});
renderLlmSettingsScreen();
@@ -220,7 +203,7 @@ describe("Content", () => {
expect(agent).toHaveValue("CoActAgent");
expect(confirmation).toBeChecked();
expect(condensor).not.toBeChecked();
expect(securityAnalyzer).toHaveValue("SETTINGS$SECURITY_ANALYZER_NONE");
expect(securityAnalyzer).toHaveValue("mock-invariant");
});
});
});
@@ -310,7 +293,7 @@ describe("Form submission", () => {
// select security analyzer
const securityAnalyzer = screen.getByTestId("security-analyzer-input");
await userEvent.click(securityAnalyzer);
const securityAnalyzerOption = screen.getByText("SETTINGS$SECURITY_ANALYZER_NONE");
const securityAnalyzerOption = screen.getByText("mock-invariant");
await userEvent.click(securityAnalyzerOption);
const submitButton = screen.getByTestId("submit-button");
@@ -323,7 +306,7 @@ describe("Form submission", () => {
agent: "CoActAgent",
confirmation_mode: true,
enable_default_condenser: false,
security_analyzer: null,
security_analyzer: "mock-invariant",
}),
);
});
@@ -392,10 +375,8 @@ describe("Form submission", () => {
const baseUrl = await screen.findByTestId("base-url-input");
const apiKey = await screen.findByTestId("llm-api-key-input");
const agent = await screen.findByTestId("agent-input");
const condensor = await screen.findByTestId("enable-memory-condenser-switch");
// Confirmation mode switch is now in basic settings, always visible
const confirmation = await screen.findByTestId("enable-confirmation-mode-switch");
const condensor = await screen.findByTestId("enable-memory-condenser-switch");
// enter custom model
await userEvent.type(model, "-mini");
@@ -470,17 +451,14 @@ describe("Form submission", () => {
// select security analyzer
const securityAnalyzer = await screen.findByTestId("security-analyzer-input");
await userEvent.click(securityAnalyzer);
const securityAnalyzerOption = screen.getByText("SETTINGS$SECURITY_ANALYZER_NONE");
const securityAnalyzerOption = screen.getByText("mock-invariant");
await userEvent.click(securityAnalyzerOption);
expect(securityAnalyzer).toHaveValue("SETTINGS$SECURITY_ANALYZER_NONE");
expect(securityAnalyzer).toHaveValue("mock-invariant");
expect(submitButton).not.toBeDisabled();
// revert back to original value
await userEvent.click(securityAnalyzer);
const originalSecurityAnalyzerOption = screen.getByText("SETTINGS$SECURITY_ANALYZER_LLM_DEFAULT");
await userEvent.click(originalSecurityAnalyzerOption);
expect(securityAnalyzer).toHaveValue("SETTINGS$SECURITY_ANALYZER_LLM_DEFAULT");
await userEvent.clear(securityAnalyzer);
expect(securityAnalyzer).toHaveValue("");
expect(submitButton).toBeDisabled();
});
@@ -574,7 +552,7 @@ describe("Form submission", () => {
expect.objectContaining({
llm_model: "openhands/claude-sonnet-4-20250514",
llm_base_url: "",
confirmation_mode: true, // Confirmation mode is now a basic setting, should be preserved
confirmation_mode: false,
}),
);
});
@@ -107,7 +107,9 @@ describe("Content", () => {
expect(screen.queryByTestId("add-secret-button")).not.toBeInTheDocument(),
);
const button = await screen.findByTestId("connect-git-button");
expect(button).toHaveAttribute("href", "/settings/integrations");
await userEvent.click(button);
screen.getByTestId("git-settings-screen");
});
it("should render an empty table when there are no existing secrets", async () => {
@@ -29,5 +29,23 @@ describe("hasAdvancedSettingsSet", () => {
}),
).toBe(true);
});
// Default settings with only CONFIRMATION_MODE switched on must be reported
// as having advanced settings set.
test("CONFIRMATION_MODE is true", () => {
expect(
hasAdvancedSettingsSet({
...DEFAULT_SETTINGS,
CONFIRMATION_MODE: true,
}),
).toBe(true);
});
// Default settings with only SECURITY_ANALYZER given a value must be reported
// as having advanced settings set.
test("SECURITY_ANALYZER is set", () => {
expect(
hasAdvancedSettingsSet({
...DEFAULT_SETTINGS,
SECURITY_ANALYZER: "test",
}),
).toBe(true);
});
});
});
+291 -378
View File
File diff suppressed because it is too large Load Diff
+25 -25
View File
@@ -11,17 +11,17 @@
"@heroui/use-infinite-scroll": "^2.2.10",
"@microlink/react-json-view": "^1.26.2",
"@monaco-editor/react": "^4.7.0-rc.0",
"@react-router/node": "^7.8.2",
"@react-router/serve": "^7.8.2",
"@react-types/shared": "^3.32.0",
"@react-router/node": "^7.8.0",
"@react-router/serve": "^7.8.0",
"@react-types/shared": "^3.31.0",
"@reduxjs/toolkit": "^2.8.2",
"@stripe/react-stripe-js": "^3.9.2",
"@stripe/stripe-js": "^7.9.0",
"@stripe/react-stripe-js": "^3.9.0",
"@stripe/stripe-js": "^7.8.0",
"@tailwindcss/postcss": "^4.1.12",
"@tailwindcss/vite": "^4.1.12",
"@tanstack/react-query": "^5.85.5",
"@tanstack/react-query": "^5.85.3",
"@uidotdev/usehooks": "^2.4.1",
"@vitejs/plugin-react": "^5.0.1",
"@vitejs/plugin-react": "^5.0.0",
"@xterm/addon-fit": "^0.10.0",
"@xterm/xterm": "^5.4.0",
"axios": "^1.11.0",
@@ -29,32 +29,32 @@
"date-fns": "^4.1.0",
"eslint-config-airbnb-typescript": "^18.0.0",
"framer-motion": "^12.23.12",
"i18next": "^25.4.2",
"i18next": "^25.3.6",
"i18next-browser-languagedetector": "^8.2.0",
"i18next-http-backend": "^3.0.2",
"isbot": "^5.1.30",
"jose": "^6.0.13",
"lucide-react": "^0.542.0",
"isbot": "^5.1.29",
"jose": "^6.0.12",
"lucide-react": "^0.539.0",
"monaco-editor": "^0.52.2",
"posthog-js": "^1.260.3",
"posthog-js": "^1.260.1",
"react": "^19.1.1",
"react-dom": "^19.1.1",
"react-highlight": "^0.15.0",
"react-hot-toast": "^2.6.0",
"react-i18next": "^15.7.2",
"react-hot-toast": "^2.5.1",
"react-i18next": "^15.6.1",
"react-icons": "^5.5.0",
"react-markdown": "^10.1.0",
"react-redux": "^9.2.0",
"react-router": "^7.8.2",
"react-router": "^7.8.0",
"react-select": "^5.10.2",
"react-syntax-highlighter": "^15.6.6",
"react-syntax-highlighter": "^15.6.1",
"react-textarea-autosize": "^8.5.9",
"remark-breaks": "^4.0.0",
"remark-gfm": "^4.0.1",
"sirv-cli": "^3.0.1",
"socket.io-client": "^4.8.1",
"tailwind-merge": "^3.3.1",
"vite": "^7.1.3",
"vite": "^7.1.1",
"web-vitals": "^5.1.0",
"ws": "^8.18.2"
},
@@ -88,17 +88,17 @@
"@babel/traverse": "^7.28.3",
"@babel/types": "^7.28.2",
"@mswjs/socket.io-binding": "^0.2.0",
"@playwright/test": "^1.55.0",
"@react-router/dev": "^7.8.2",
"@playwright/test": "^1.54.2",
"@react-router/dev": "^7.8.0",
"@tailwindcss/typography": "^0.5.16",
"@tanstack/eslint-plugin-query": "^5.83.1",
"@testing-library/dom": "^10.4.1",
"@testing-library/jest-dom": "^6.8.0",
"@testing-library/jest-dom": "^6.7.0",
"@testing-library/react": "^16.3.0",
"@testing-library/user-event": "^14.6.1",
"@types/node": "^24.3.0",
"@types/react": "^19.1.11",
"@types/react-dom": "^19.1.8",
"@types/node": "^24.2.0",
"@types/react": "^19.1.9",
"@types/react-dom": "^19.1.7",
"@types/react-highlight": "^0.12.8",
"@types/react-syntax-highlighter": "^15.5.13",
"@types/ws": "^8.18.1",
@@ -117,7 +117,7 @@
"eslint-plugin-prettier": "^5.5.4",
"eslint-plugin-react": "^7.37.5",
"eslint-plugin-react-hooks": "^4.6.2",
"eslint-plugin-unused-imports": "^4.2.0",
"eslint-plugin-unused-imports": "^4.1.4",
"husky": "^9.1.7",
"jsdom": "^26.1.0",
"lint-staged": "^16.1.4",
@@ -126,7 +126,7 @@
"stripe": "^18.4.0",
"tailwindcss": "^4.1.8",
"typescript": "^5.9.2",
"vite-plugin-svgr": "^4.5.0",
"vite-plugin-svgr": "^4.2.0",
"vite-tsconfig-paths": "^5.1.4",
"vitest": "^3.0.2"
},
-21
View File
@@ -726,27 +726,6 @@ class OpenHands {
);
return data;
}
/**
 * Fetch the microagent-management conversations for one repository.
 *
 * Issues a GET to `/api/microagent-management/conversations` and returns the
 * `results` array of the response body.
 *
 * @param selectedRepository Sent as the `selected_repository` query parameter.
 * @param pageId Optional; sent as `page_id` only when truthy.
 * @param limit Sent as the `limit` query parameter (defaults to 100).
 * @returns The `results` list from the response.
 */
static async getMicroagentManagementConversations(
  selectedRepository: string,
  pageId?: string,
  limit: number = 100,
): Promise<Conversation[]> {
  // `page_id` is appended last and only when a page id was supplied.
  const query: Record<string, string | number> = {
    limit,
    selected_repository: selectedRepository,
    ...(pageId ? { page_id: pageId } : {}),
  };

  const response = await openHands.get<ResultSet<Conversation>>(
    "/api/microagent-management/conversations",
    { params: query },
  );

  return response.data.results;
}
}
export default OpenHands;
@@ -1,5 +1,4 @@
import { useMemo } from "react";
import { StylesConfig } from "react-select";
import { Provider } from "../../types/settings";
import { ReactSelectDropdown, SelectOption } from "./react-select-dropdown";
@@ -12,8 +11,6 @@ export interface GitProviderDropdownProps {
disabled?: boolean;
isLoading?: boolean;
onChange?: (provider: Provider | null) => void;
classNamePrefix?: string;
styles?: StylesConfig<SelectOption, false>;
}
export function GitProviderDropdown({
@@ -25,8 +22,6 @@ export function GitProviderDropdown({
disabled = false,
isLoading = false,
onChange,
classNamePrefix,
styles,
}: GitProviderDropdownProps) {
const options: SelectOption[] = useMemo(
() =>
@@ -58,8 +53,6 @@ export function GitProviderDropdown({
isSearchable={false}
isLoading={isLoading}
onChange={handleChange}
classNamePrefix={classNamePrefix}
styles={styles}
/>
);
}
@@ -1,5 +1,5 @@
import { useMemo } from "react";
import Select, { StylesConfig } from "react-select";
import Select from "react-select";
import { cn } from "#/utils/utils";
import { SelectOptionBase, getCustomStyles } from "./react-select-styles";
@@ -17,8 +17,6 @@ export interface ReactSelectDropdownProps {
isSearchable?: boolean;
isLoading?: boolean;
onChange?: (option: SelectOption | null) => void;
classNamePrefix?: string;
styles?: StylesConfig<SelectOption, false>;
}
export function ReactSelectDropdown({
@@ -33,8 +31,6 @@ export function ReactSelectDropdown({
isSearchable = true,
isLoading = false,
onChange,
classNamePrefix,
styles,
}: ReactSelectDropdownProps) {
const customStyles = useMemo(() => getCustomStyles<SelectOption>(), []);
@@ -50,9 +46,8 @@ export function ReactSelectDropdown({
isSearchable={isSearchable}
isLoading={isLoading}
onChange={onChange}
styles={styles || customStyles}
styles={customStyles}
className="w-full"
classNamePrefix={classNamePrefix}
/>
{errorMessage && (
<p className="text-red-500 text-sm mt-1">{errorMessage}</p>
@@ -90,26 +90,3 @@ export const getCustomStyles = <T extends SelectOptionBase>(): StylesConfig<
color: "#B7BDC2", // tertiary-light
}),
});
/**
 * react-select styles for the git provider dropdown on the microagent
 * management page.
 *
 * Starts from `getCustomStyles<T>()` and replaces only the `control` slot.
 */
export const getGitProviderMicroagentManagementCustomStyles = <
  T extends SelectOptionBase,
>(): StylesConfig<T, false> => {
  const baseStyles = getCustomStyles<T>();

  return {
    ...baseStyles,
    control: (provided, state) => {
      const { isDisabled } = state;

      return {
        ...provided,
        backgroundColor: isDisabled ? "#363636" : "#454545", // darker tertiary when disabled
        border: "1px solid #717888",
        borderRadius: "0.125rem",
        minHeight: "2.5rem",
        padding: "0 0.5rem",
        boxShadow: "none",
        opacity: isDisabled ? 0.6 : 1,
        cursor: isDisabled ? "not-allowed" : "pointer",
        // Keep the border colour unchanged on hover.
        "&:hover": {
          borderColor: "#717888",
        },
        // Targets the value container rendered with the
        // "git-provider-dropdown" class-name prefix.
        "& .git-provider-dropdown__value-container": {
          padding: "2px 0",
        },
      };
    },
  };
};
@@ -9,7 +9,6 @@ import { CopyToClipboardButton } from "#/components/shared/buttons/copy-to-clipb
import { anchor } from "../markdown/anchor";
import { OpenHandsSourceType } from "#/types/core/base";
import { paragraph } from "../markdown/paragraph";
import { TooltipButton } from "#/components/shared/buttons/tooltip-button";
interface ChatMessageProps {
type: OpenHandsSourceType;
@@ -17,7 +16,6 @@ interface ChatMessageProps {
actions?: Array<{
icon: React.ReactNode;
onClick: () => void;
tooltip?: string;
}>;
}
@@ -68,35 +66,17 @@ export function ChatMessage({
"items-center gap-1",
)}
>
{actions?.map((action, index) =>
action.tooltip ? (
<TooltipButton
key={index}
tooltip={action.tooltip}
ariaLabel={action.tooltip}
placement="top"
>
<button
type="button"
onClick={action.onClick}
className="button-base p-1 cursor-pointer"
aria-label={`Action ${index + 1}`}
>
{action.icon}
</button>
</TooltipButton>
) : (
<button
key={index}
type="button"
onClick={action.onClick}
className="button-base p-1 cursor-pointer"
aria-label={`Action ${index + 1}`}
>
{action.icon}
</button>
),
)}
{actions?.map((action, index) => (
<button
key={index}
type="button"
onClick={action.onClick}
className="button-base p-1 cursor-pointer"
aria-label={`Action ${index + 1}`}
>
{action.icon}
</button>
))}
<CopyToClipboardButton
isHidden={!isHovering}
@@ -72,9 +72,6 @@ const getRecallObservationContent = (event: RecallObservation): string => {
if (event.extras.repo_instructions) {
content += `\n\n**Repository Instructions:**\n\n${event.extras.repo_instructions}`;
}
if (event.extras.conversation_instructions) {
content += `\n\n**Conversation Instructions:**\n\n${event.extras.conversation_instructions}`;
}
if (event.extras.additional_agent_instructions) {
content += `\n\n**Additional Instructions:**\n\n${event.extras.additional_agent_instructions}`;
}
@@ -46,7 +46,6 @@ interface EventMessageProps {
actions?: Array<{
icon: React.ReactNode;
onClick: () => void;
tooltip?: string;
}>;
isInLast10Actions: boolean;
}
@@ -1,5 +1,4 @@
import React from "react";
import { useTranslation } from "react-i18next";
import { createPortal } from "react-dom";
import { OpenHandsAction } from "#/types/core/actions";
import { OpenHandsObservation } from "#/types/core/observations";
@@ -25,17 +24,6 @@ import { AgentState } from "#/types/agent-state";
import { getFirstPRUrl } from "#/utils/parse-pr-url";
import MemoryIcon from "#/icons/memory_icon.svg?react";
/**
 * Type guard for error payloads: a non-null object whose `error` property is
 * strictly `true`.
 */
const isErrorEvent = (
  evt: unknown,
): evt is { error: true; message: string } => {
  // Anything that is not a real object (including null) cannot be an error event.
  if (typeof evt !== "object" || evt === null) {
    return false;
  }
  return "error" in evt && evt.error === true;
};
/**
 * True when the value is an OpenHands agent-state-change observation whose
 * `extras.agent_state` is `AgentState.ERROR`.
 */
const isAgentStatusError = (evt: unknown): boolean => {
  // Both guards must pass before `extras` can be read.
  if (!isOpenHandsEvent(evt) || !isAgentStateChangeObservation(evt)) {
    return false;
  }
  return evt.extras.agent_state === AgentState.ERROR;
};
interface MessagesProps {
messages: (OpenHandsAction | OpenHandsObservation)[];
isAwaitingUserConfirmation: boolean;
@@ -43,11 +31,8 @@ interface MessagesProps {
export const Messages: React.FC<MessagesProps> = React.memo(
({ messages, isAwaitingUserConfirmation }) => {
const {
createConversationAndSubscribe,
isPending,
unsubscribeFromConversation,
} = useCreateConversationAndSubscribeMultiple();
const { createConversationAndSubscribe, isPending } =
useCreateConversationAndSubscribeMultiple();
const { getOptimisticUserMessage } = useOptimisticUserMessage();
const { conversationId } = useConversationId();
const { data: conversation } = useUserConversation(conversationId);
@@ -63,8 +48,6 @@ export const Messages: React.FC<MessagesProps> = React.memo(
EventMicroagentStatus[]
>([]);
const { t } = useTranslation();
const actionHasObservationPair = React.useCallback(
(event: OpenHandsAction | OpenHandsObservation): boolean => {
if (isOpenHandsAction(event)) {
@@ -110,6 +93,20 @@ export const Messages: React.FC<MessagesProps> = React.memo(
const handleMicroagentEvent = React.useCallback(
(socketEvent: unknown, microagentConversationId: string) => {
// Handle error events
const isErrorEvent = (
evt: unknown,
): evt is { error: true; message: string } =>
typeof evt === "object" &&
evt !== null &&
"error" in evt &&
evt.error === true;
const isAgentStatusError = (evt: unknown): boolean =>
isOpenHandsEvent(evt) &&
isAgentStateChangeObservation(evt) &&
evt.extras.agent_state === AgentState.ERROR;
if (isErrorEvent(socketEvent) || isAgentStatusError(socketEvent)) {
setMicroagentStatuses((prev) =>
prev.map((statusEntry) =>
@@ -122,11 +119,7 @@ export const Messages: React.FC<MessagesProps> = React.memo(
isOpenHandsEvent(socketEvent) &&
isAgentStateChangeObservation(socketEvent)
) {
// Handle completion states
if (
socketEvent.extras.agent_state === AgentState.FINISHED ||
socketEvent.extras.agent_state === AgentState.AWAITING_USER_INPUT
) {
if (socketEvent.extras.agent_state === AgentState.FINISHED) {
setMicroagentStatuses((prev) =>
prev.map((statusEntry) =>
statusEntry.conversationId === microagentConversationId
@@ -134,8 +127,6 @@ export const Messages: React.FC<MessagesProps> = React.memo(
: statusEntry,
),
);
unsubscribeFromConversation(microagentConversationId);
}
} else if (
isOpenHandsEvent(socketEvent) &&
@@ -156,27 +147,9 @@ export const Messages: React.FC<MessagesProps> = React.memo(
),
);
}
unsubscribeFromConversation(microagentConversationId);
} else {
// For any other event, transition from WAITING to CREATING if still waiting
setMicroagentStatuses((prev) => {
const currentStatus = prev.find(
(entry) => entry.conversationId === microagentConversationId,
)?.status;
if (currentStatus === MicroagentStatus.WAITING) {
return prev.map((statusEntry) =>
statusEntry.conversationId === microagentConversationId
? { ...statusEntry, status: MicroagentStatus.CREATING }
: statusEntry,
);
}
return prev; // No change needed
});
}
},
[setMicroagentStatuses, unsubscribeFromConversation],
[setMicroagentStatuses],
);
const handleLaunchMicroagent = (
@@ -205,13 +178,13 @@ export const Messages: React.FC<MessagesProps> = React.memo(
},
onSuccessCallback: (newConversationId: string) => {
setShowLaunchMicroagentModal(false);
// Update status with conversation ID - start with WAITING
// Update status with conversation ID
setMicroagentStatuses((prev) => [
...prev.filter((status) => status.eventId !== selectedEventId),
{
eventId: selectedEventId,
conversationId: newConversationId,
status: MicroagentStatus.WAITING,
status: MicroagentStatus.CREATING,
},
]);
},
@@ -246,7 +219,6 @@ export const Messages: React.FC<MessagesProps> = React.memo(
setSelectedEventId(message.id);
setShowLaunchMicroagentModal(true);
},
tooltip: t("MICROAGENT$ADD_TO_MEMORY"),
},
]
: undefined
@@ -76,10 +76,6 @@ export function LaunchMicroagentModal({
</button>
</div>
<span className="text-sm text-[#A3A3A3] font-normal leading-5">
{t("MICROAGENT$DEFINITION")}
</span>
<form
data-testid="launch-microagent-modal"
onSubmit={onSubmit}
@@ -19,8 +19,6 @@ export function MicroagentStatusIndicator({
const getStatusText = () => {
switch (status) {
case MicroagentStatus.WAITING:
return t("MICROAGENT$STATUS_WAITING");
case MicroagentStatus.CREATING:
return t("MICROAGENT$STATUS_CREATING");
case MicroagentStatus.COMPLETED:
@@ -37,8 +35,6 @@ export function MicroagentStatusIndicator({
const getStatusIcon = () => {
switch (status) {
case MicroagentStatus.WAITING:
return <Spinner size="sm" />;
case MicroagentStatus.CREATING:
return <Spinner size="sm" />;
case MicroagentStatus.COMPLETED:
@@ -10,11 +10,6 @@ interface ConversationCreatedToastProps {
onClose: () => void;
}
// Props for the "conversation starting" toast body.
interface ConversationStartingToastProps {
// Id used to build the `/conversations/<id>` link shown in the toast
conversationId: string;
// Invoked when the toast's close button is clicked
onClose: () => void;
}
function ConversationCreatedToast({
conversationId,
onClose,
@@ -42,33 +37,6 @@ function ConversationCreatedToast({
);
}
/**
 * Toast body shown while a conversation is starting: a spinner, a translated
 * status line, a link that opens the conversation in a new tab, and a close
 * button wired to `onClose`.
 */
function ConversationStartingToast({
  conversationId,
  onClose,
}: ConversationStartingToastProps) {
  const { t } = useTranslation();

  const conversationHref = `/conversations/${conversationId}`;
  const startingLabel = t("MICROAGENT$CONVERSATION_STARTING");
  const viewLabel = t("MICROAGENT$VIEW_CONVERSATION");

  return (
    <div className="flex items-start gap-2">
      <Spinner size="sm" />
      <div>
        {startingLabel}
        <br />
        <a
          href={conversationHref}
          target="_blank"
          rel="noopener noreferrer"
          className="underline"
        >
          {viewLabel}
        </a>
      </div>
      <button type="button" onClick={onClose}>
        <CloseIcon />
      </button>
    </div>
  );
}
interface ConversationFinishedToastProps {
conversationId: string;
onClose: () => void;
@@ -110,18 +78,10 @@ function ConversationErroredToast({
errorMessage,
onClose,
}: ConversationErroredToastProps) {
const { t } = useTranslation();
// Check if the error message is a translation key
const displayMessage =
errorMessage === "MICROAGENT$UNKNOWN_ERROR"
? t(errorMessage)
: errorMessage;
return (
<div className="flex items-start gap-2">
<SuccessIndicator status="error" />
<div>{displayMessage}</div>
<div>{errorMessage}</div>
<button type="button" onClick={onClose}>
<CloseIcon />
</button>
@@ -176,18 +136,3 @@ export const renderConversationErroredToast = (
duration: 5000,
},
);
/**
 * Show the "conversation starting" toast for the given conversation.
 *
 * The toast id is `starting-<conversationId>`, and the close button dismisses
 * the toast by its own id.
 */
export const renderConversationStartingToast = (conversationId: string) => {
  const startingToastOptions = {
    ...TOAST_OPTIONS,
    id: `starting-${conversationId}`,
    duration: 10000, // Show for 10 seconds or until dismissed
  };

  return toast(
    (activeToast) => (
      <ConversationStartingToast
        conversationId={conversationId}
        onClose={() => toast.dismiss(activeToast.id)}
      />
    ),
    startingToastOptions,
  );
};
@@ -7,10 +7,11 @@ import { ConversationCard } from "../conversation-panel/conversation-card";
import { Provider } from "#/types/settings";
interface ControlsProps {
setSecurityOpen: (isOpen: boolean) => void;
showSecurityLock: boolean;
}
export function Controls({ showSecurityLock }: ControlsProps) {
export function Controls({ setSecurityOpen, showSecurityLock }: ControlsProps) {
const { data: conversation } = useActiveConversation();
const [contextMenuOpen, setContextMenuOpen] = React.useState(false);
@@ -20,7 +21,9 @@ export function Controls({ showSecurityLock }: ControlsProps) {
<AgentControlBar />
<AgentStatusBar />
{showSecurityLock && <SecurityLock />}
{showSecurityLock && (
<SecurityLock onClick={() => setSecurityOpen(true)} />
)}
</div>
<ConversationCard
@@ -1,28 +1,17 @@
import { IoLockClosed } from "react-icons/io5";
import { Tooltip } from "@heroui/react";
import { useTranslation } from "react-i18next";
import { Link } from "react-router";
import { I18nKey } from "#/i18n/declaration";
export function SecurityLock() {
const { t } = useTranslation();
interface SecurityLockProps {
onClick: () => void;
}
export function SecurityLock({ onClick }: SecurityLockProps) {
return (
<Tooltip
content={
<div className="max-w-xs p-2">
{t(I18nKey.SETTINGS$CONFIRMATION_MODE_LOCK_TOOLTIP)}
</div>
}
placement="top"
<div
className="cursor-pointer hover:opacity-80 transition-all"
style={{ marginRight: "8px" }}
onClick={onClick}
>
<Link
to="/settings"
className="mr-2 cursor-pointer hover:opacity-80 transition-all"
aria-label={t(I18nKey.SETTINGS$TITLE)}
>
<IoLockClosed size={20} />
</Link>
</Tooltip>
<IoLockClosed size={20} />
</div>
);
}
@@ -23,9 +23,9 @@ export function ConfirmStopModal({
<ModalBackdrop>
<ModalBody className="items-start border border-tertiary">
<div className="flex flex-col gap-2">
<BaseModalTitle title={t(I18nKey.CONVERSATION$CONFIRM_PAUSE)} />
<BaseModalTitle title={t(I18nKey.CONVERSATION$CONFIRM_STOP)} />
<BaseModalDescription
description={t(I18nKey.CONVERSATION$PAUSE_WARNING)}
description={t(I18nKey.CONVERSATION$STOP_WARNING)}
/>
</div>
<div
@@ -129,7 +129,7 @@ export function ConversationCardContextMenu({
{onStop && (
<ContextMenuListItem testId="stop-button" onClick={onStop}>
<ContextMenuIconText icon={Power} text={t(I18nKey.BUTTON$PAUSE)} />
<ContextMenuIconText icon={Power} text={t(I18nKey.BUTTON$STOP)} />
</ContextMenuListItem>
)}
@@ -1,6 +1,4 @@
import { ConversationStatus } from "#/types/conversation-status";
import ArchivedIcon from "./state-indicators/archived.svg?react";
import ErrorIcon from "./state-indicators/error.svg?react";
import RunningIcon from "./state-indicators/running.svg?react";
import StartingIcon from "./state-indicators/starting.svg?react";
import StoppedIcon from "./state-indicators/stopped.svg?react";
@@ -11,8 +9,6 @@ const CONVERSATION_STATUS_INDICATORS: Record<ConversationStatus, SVGIcon> = {
STOPPED: StoppedIcon,
RUNNING: RunningIcon,
STARTING: StartingIcon,
ARCHIVED: ArchivedIcon,
ERROR: ErrorIcon,
};
interface ConversationStateIndicatorProps {
@@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px" fill="#A7A9AC"><path d="M0 0h24v24H0V0z" fill="none"/><path d="M17 7h-4v1.9h4c1.71 0 3.1 1.39 3.1 3.1 0 1.43-.98 2.63-2.31 2.98l1.46 1.46C20.88 15.61 22 13.95 22 12c0-2.76-2.24-5-5-5zm-1 4h-2.19l2 2H16zM2 4.27l3.11 3.11C3.29 8.12 2 9.91 2 12c0 2.76 2.24 5 5 5h4v-1.9H7c-1.71 0-3.1-1.39-3.1-3.1 0-1.59 1.21-2.9 2.76-3.07L8.73 11H8v2h2.73L13 15.27V17h1.73l4.01 4L20 19.74 3.27 3 2 4.27z"/><path d="M0 24V0" fill="none"/></svg>

Before

Width:  |  Height:  |  Size: 512 B

@@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px" fill="#e7000b"><path d="M0 0h24v24H0z" fill="none"/><path d="M12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm1 15h-2v-2h2v2zm0-4h-2V7h2v6z"/></svg>

Before

Width:  |  Height:  |  Size: 254 B

@@ -17,7 +17,7 @@ export function MicroagentManagementAccordionTitle({
<TooltipButton
tooltip={repository.full_name}
ariaLabel={repository.full_name}
className="text-white text-base font-normal bg-transparent p-0 min-w-0 h-auto cursor-pointer truncate max-w-[194px] translate-y-[-1px]"
className="text-white text-base font-normal bg-transparent p-0 min-w-0 h-auto cursor-pointer truncate max-w-[200px] translate-y-[-1px]"
testId="repository-name-tooltip"
placement="bottom"
>
@@ -32,7 +32,6 @@ import {
} from "#/utils/custom-toast-handlers";
import { getFirstPRUrl } from "#/utils/parse-pr-url";
import { I18nKey } from "#/i18n/declaration";
import { useUserProviders } from "#/hooks/use-user-providers";
// Handle error events
const isErrorEvent = (evt: unknown): evt is { error: true; message: string } =>
@@ -66,10 +65,16 @@ const getConversationInstructions = (
gitProvider: Provider,
) => `Create a microagent for the repository ${repositoryName} by following the steps below:
- Step 1: Create a markdown file inside the .openhands/microagents folder with the name of the microagent (The microagent must be created in the .openhands/microagents folder and should be able to perform the described task when triggered). This is the instructions about what the microagent should do: ${formData.query}. ${
- Step 1: Create a markdown file inside the .openhands/microagents folder with the name of the microagent (The microagent must be created in the .openhands/microagents folder and should be able to perform the described task when triggered).
- This is the instructions about what the microagent should do: ${formData.query}
${
formData.triggers && formData.triggers.length > 0
? `This is the triggers of the microagent: ${formData.triggers.join(", ")}`
: "Please be noted that the microagent doesn't have any triggers."
? `
- This is the triggers of the microagent: ${formData.triggers.join(", ")}
`
: "- Please be noted that the microagent doesn't have any triggers."
}
- Step 2: Create a new branch for the repository ${repositoryName}, must avoid duplicated branches.
@@ -86,10 +91,16 @@ const getUpdateConversationInstructions = (
) => `Update the microagent for the repository ${repositoryName} by following the steps below:
- Step 1: Update the microagent. This is the path of the microagent: ${formData.microagentPath} (The updated microagent must be in the .openhands/microagents folder and should be able to perform the described task when triggered). This is the updated instructions about what the microagent should do: ${formData.query}. ${
- Step 1: Update the microagent. This is the path of the microagent: ${formData.microagentPath} (The updated microagent must be in the .openhands/microagents folder and should be able to perform the described task when triggered).
- This is the updated instructions about what the microagent should do: ${formData.query}
${
formData.triggers && formData.triggers.length > 0
? `This is the triggers of the microagent: ${formData.triggers.join(", ")}`
: "Please be noted that the microagent doesn't have any triggers."
? `
- This is the triggers of the microagent: ${formData.triggers.join(", ")}
`
: "- Please be noted that the microagent doesn't have any triggers."
}
- Step 2: Create a new branch for the repository ${repositoryName}, must avoid duplicated branches.
@@ -108,8 +119,6 @@ export function MicroagentManagementContent() {
learnThisRepoModalVisible,
} = useSelector((state: RootState) => state.microagentManagement);
const { providers } = useUserProviders();
const { t } = useTranslation();
const dispatch = useDispatch();
@@ -173,7 +182,11 @@ export function MicroagentManagementContent() {
// Check if agent has finished and we have a PR
if (isOpenHandsEvent(socketEvent) && isFinishAction(socketEvent)) {
const prUrl = getFirstPRUrl(socketEvent.args.final_thought || "");
if (!prUrl) {
if (prUrl) {
displaySuccessToast(
t(I18nKey.MICROAGENT_MANAGEMENT$PR_READY_FOR_REVIEW),
);
} else {
// Agent finished but no PR found
displaySuccessToast(t(I18nKey.MICROAGENT_MANAGEMENT$PR_NOT_CREATED));
}
@@ -277,12 +290,6 @@ export function MicroagentManagementContent() {
const repositoryName = repository.full_name;
const gitProvider = repository.git_provider;
const createMicroagent = {
repo: repositoryName,
git_provider: gitProvider,
title: formData.query,
};
// Launch a new conversation to help the user understand the repo
createConversationAndSubscribe({
query: formData.query,
@@ -292,7 +299,6 @@ export function MicroagentManagementContent() {
branch: formData.selectedBranch,
gitProvider,
},
createMicroagent,
onSuccessCallback: () => {
hideLearnThisRepoModal();
},
@@ -323,18 +329,11 @@ export function MicroagentManagementContent() {
</>
);
const providersAreSet = providers.length > 0;
if (width < 1024) {
return (
<div className="w-full h-full flex flex-col gap-6">
<div className="w-full rounded-lg border border-[#525252] bg-[#24272E] max-h-[494px] min-h-[494px]">
{providersAreSet && (
<MicroagentManagementSidebar
isSmallerScreen
providers={providers}
/>
)}
<MicroagentManagementSidebar isSmallerScreen />
</div>
<div className="w-full rounded-lg border border-[#525252] bg-[#24272E] flex-1 min-h-[494px]">
<MicroagentManagementMain />
@@ -346,7 +345,7 @@ export function MicroagentManagementContent() {
return (
<div className="w-full h-full flex rounded-lg border border-[#525252] bg-[#24272E] overflow-hidden">
{providersAreSet && <MicroagentManagementSidebar providers={providers} />}
<MicroagentManagementSidebar />
<div className="flex-1">
<MicroagentManagementMain />
</div>
@@ -8,7 +8,7 @@ import { BrandButton } from "../settings/brand-button";
import { I18nKey } from "#/i18n/declaration";
import { RootState } from "#/store";
import XIcon from "#/icons/x.svg?react";
import { cn, getRepoMdCreatePrompt } from "#/utils/utils";
import { cn } from "#/utils/utils";
import { LearnThisRepoFormData } from "#/types/microagent-management";
import { Branch } from "#/types/git";
import { useRepositoryBranches } from "#/hooks/query/use-repository-branches";
@@ -76,25 +76,23 @@ export function MicroagentManagementLearnThisRepoModal({
const onSubmit = (event: React.FormEvent<HTMLFormElement>) => {
event.preventDefault();
const finalQuery = getRepoMdCreatePrompt(
selectedRepository?.git_provider || "github",
query.trim(),
);
if (!query.trim()) {
return;
}
onConfirm({
query: finalQuery,
query: query.trim(),
selectedBranch: selectedBranch?.name || "",
});
};
const handleConfirm = () => {
const finalQuery = getRepoMdCreatePrompt(
selectedRepository?.git_provider || "github",
query.trim(),
);
if (!query.trim()) {
return;
}
onConfirm({
query: finalQuery,
query: query.trim(),
selectedBranch: selectedBranch?.name || "",
});
};
@@ -246,6 +244,7 @@ export function MicroagentManagementLearnThisRepoModal({
onClick={handleConfirm}
testId="confirm-button"
isDisabled={
!query.trim() ||
isLoading ||
isLoadingBranches ||
!selectedBranch ||
@@ -59,10 +59,8 @@ export function MicroagentManagementMicroagentCard({
if (runtimeStatus === "STATUS$ERROR") {
return t(I18nKey.MICROAGENT$STATUS_ERROR);
}
if (conversationStatus === "RUNNING") {
return runtimeStatus === "STATUS$READY"
? t(I18nKey.MICROAGENT$STATUS_OPENING_PR)
: t(I18nKey.COMMON$STARTING);
if (conversationStatus === "RUNNING" && runtimeStatus === "STATUS$READY") {
return t(I18nKey.MICROAGENT$STATUS_OPENING_PR);
}
return "";
}, [conversationStatus, runtimeStatus, t, hasPr]);
@@ -1,16 +1,13 @@
import { useTranslation } from "react-i18next";
import { useEffect } from "react";
import { useDispatch, useSelector } from "react-redux";
import { Spinner } from "@heroui/react";
import { MicroagentManagementMicroagentCard } from "./microagent-management-microagent-card";
import { MicroagentManagementLearnThisRepo } from "./microagent-management-learn-this-repo";
import { useRepositoryMicroagents } from "#/hooks/query/use-repository-microagents";
import { useMicroagentManagementConversations } from "#/hooks/query/use-microagent-management-conversations";
import { useSearchConversations } from "#/hooks/query/use-search-conversations";
import { GitRepository } from "#/types/git";
import { RootState } from "#/store";
import { setSelectedMicroagentItem } from "#/state/microagent-management-slice";
import { cn } from "#/utils/utils";
import { I18nKey } from "#/i18n/declaration";
interface MicroagentManagementRepoMicroagentsProps {
repository: GitRepository;
@@ -25,8 +22,6 @@ export function MicroagentManagementRepoMicroagents({
const dispatch = useDispatch();
const { t } = useTranslation();
const { full_name: repositoryName } = repository;
// Extract owner and repo from repositoryName (format: "owner/repo")
@@ -42,9 +37,9 @@ export function MicroagentManagementRepoMicroagents({
data: conversations,
isLoading: isLoadingConversations,
isError: isErrorConversations,
} = useMicroagentManagementConversations(
} = useSearchConversations(
repositoryName,
undefined,
"microagent_management",
1000,
true,
);
@@ -108,47 +103,34 @@ export function MicroagentManagementRepoMicroagents({
const numberOfMicroagents = microagents?.length || 0;
const numberOfConversations = conversations?.length || 0;
const totalItems = numberOfMicroagents + numberOfConversations;
const hasMicroagents = numberOfMicroagents > 0;
const hasConversations = numberOfConversations > 0;
return (
<div>
{totalItems === 0 && (
<MicroagentManagementLearnThisRepo repository={repository} />
)}
{/* Render microagents */}
{hasMicroagents && (
<div className="flex flex-col">
<span className="text-md text-white font-medium leading-5 mb-4">
{t(I18nKey.MICROAGENT_MANAGEMENT$EXISTING_MICROAGENTS)}
</span>
{microagents?.map((microagent) => (
<div key={microagent.name} className="pb-4 last:pb-0">
<MicroagentManagementMicroagentCard
microagent={microagent}
repository={repository}
/>
</div>
))}
</div>
)}
{numberOfMicroagents > 0 &&
microagents?.map((microagent) => (
<div key={microagent.name} className="pb-4 last:pb-0">
<MicroagentManagementMicroagentCard
microagent={microagent}
repository={repository}
/>
</div>
))}
{/* Render conversations */}
{hasConversations && (
<div className={cn("flex flex-col", hasMicroagents && "mt-4")}>
<span className="text-md text-white font-medium leading-5 mb-4">
{t(I18nKey.MICROAGENT_MANAGEMENT$OPEN_MICROAGENT_PULL_REQUESTS)}
</span>
{conversations?.map((conversation) => (
<div key={conversation.conversation_id} className="pb-4 last:pb-0">
<MicroagentManagementMicroagentCard
conversation={conversation}
repository={repository}
/>
</div>
))}
</div>
)}
{numberOfConversations > 0 &&
conversations?.map((conversation) => (
<div key={conversation.conversation_id} className="pb-4 last:pb-0">
<MicroagentManagementMicroagentCard
conversation={conversation}
repository={repository}
/>
</div>
))}
</div>
);
}
@@ -1,12 +1,15 @@
import { useState, useMemo } from "react";
import { useTranslation } from "react-i18next";
import { Accordion, AccordionItem } from "@heroui/react";
import { MicroagentManagementRepoMicroagents } from "./microagent-management-repo-microagents";
import { GitRepository } from "#/types/git";
import { cn } from "#/utils/utils";
import { TabType } from "#/types/microagent-management";
import { MicroagentManagementNoRepositories } from "./microagent-management-no-repositories";
import { I18nKey } from "#/i18n/declaration";
import { DOCUMENTATION_URL } from "#/utils/constants";
import { MicroagentManagementAccordionTitle } from "./microagent-management-accordion-title";
import { sanitizeQuery } from "#/utils/sanitize-query";
type MicroagentManagementRepositoriesProps = {
repositories: GitRepository[];
@@ -18,9 +21,23 @@ export function MicroagentManagementRepositories({
tabType,
}: MicroagentManagementRepositoriesProps) {
const { t } = useTranslation();
const [searchQuery, setSearchQuery] = useState("");
const numberOfRepoMicroagents = repositories.length;
// Filter repositories based on search query
const filteredRepositories = useMemo(() => {
if (!searchQuery.trim()) {
return repositories;
}
const sanitizedQuery = sanitizeQuery(searchQuery);
return repositories.filter((repository) => {
const sanitizedRepoName = sanitizeQuery(repository.full_name);
return sanitizedRepoName.includes(sanitizedQuery);
});
}, [repositories, searchQuery]);
if (numberOfRepoMicroagents === 0) {
if (tabType === "personal") {
return (
@@ -56,6 +73,25 @@ export function MicroagentManagementRepositories({
return (
<div className="flex flex-col gap-4 w-full">
{/* Search Input */}
<div className="flex flex-col gap-2 w-full">
<label htmlFor="repository-search" className="sr-only">
{t(I18nKey.COMMON$SEARCH_REPOSITORIES)}
</label>
<input
id="repository-search"
name="repository-search"
type="text"
placeholder={`${t(I18nKey.COMMON$SEARCH_REPOSITORIES)}...`}
value={searchQuery}
onChange={(e) => setSearchQuery(e.target.value)}
className={cn(
"bg-tertiary border border-[#717888] bg-[#454545] w-full rounded-sm p-2 placeholder:italic placeholder:text-tertiary-alt",
"disabled:bg-[#2D2F36] disabled:border-[#2D2F36] disabled:cursor-not-allowed",
)}
/>
</div>
{/* Repositories Accordion */}
<Accordion
variant="splitted"
@@ -68,7 +104,7 @@ export function MicroagentManagementRepositories({
}}
selectionMode="multiple"
>
{repositories.map((repository) => (
{filteredRepositories.map((repository) => (
<AccordionItem
key={repository.id}
aria-label={repository.full_name}
@@ -1,109 +1,59 @@
import { useEffect, useState, useMemo } from "react";
import { useEffect } from "react";
import { useDispatch } from "react-redux";
import { useTranslation } from "react-i18next";
import { Spinner } from "@heroui/react";
import { MicroagentManagementSidebarHeader } from "./microagent-management-sidebar-header";
import { MicroagentManagementSidebarTabs } from "./microagent-management-sidebar-tabs";
import { useGitRepositories } from "#/hooks/query/use-git-repositories";
import { GitProviderDropdown } from "#/components/common/git-provider-dropdown";
import { useUserRepositories } from "#/hooks/query/use-user-repositories";
import { useUserProviders } from "#/hooks/use-user-providers";
import {
setPersonalRepositories,
setOrganizationRepositories,
setRepositories,
} from "#/state/microagent-management-slice";
import { GitRepository } from "#/types/git";
import { Provider } from "#/types/settings";
import { cn } from "#/utils/utils";
import { sanitizeQuery } from "#/utils/sanitize-query";
import { I18nKey } from "#/i18n/declaration";
import { getGitProviderMicroagentManagementCustomStyles } from "#/components/common/react-select-styles";
interface MicroagentManagementSidebarProps {
isSmallerScreen?: boolean;
providers: Provider[];
}
export function MicroagentManagementSidebar({
isSmallerScreen = false,
providers,
}: MicroagentManagementSidebarProps) {
const [selectedProvider, setSelectedProvider] = useState<Provider | null>(
providers.length > 0 ? providers[0] : null,
);
const [searchQuery, setSearchQuery] = useState("");
const dispatch = useDispatch();
const { t } = useTranslation();
const { data: repositories, isLoading } = useGitRepositories({
provider: selectedProvider,
pageSize: 200,
enabled: !!selectedProvider,
});
// Auto-select provider if there's only one
useEffect(() => {
if (providers.length > 0 && !selectedProvider) {
setSelectedProvider(providers[0]);
}
}, [providers, selectedProvider]);
const handleProviderChange = (provider: Provider | null) => {
setSelectedProvider(provider);
setSearchQuery("");
};
// Filter repositories based on search query
const filteredRepositories = useMemo(() => {
if (!repositories?.pages) return null;
// Flatten all pages to get all repositories
const allRepositories = repositories.pages.flatMap((page) => page.data);
if (!searchQuery.trim()) {
return allRepositories;
}
const sanitizedQuery = sanitizeQuery(searchQuery);
return allRepositories.filter((repository: GitRepository) => {
const sanitizedRepoName = sanitizeQuery(repository.full_name);
return sanitizedRepoName.includes(sanitizedQuery);
});
}, [repositories, searchQuery, selectedProvider]);
const { providers } = useUserProviders();
const selectedProvider = providers.length > 0 ? providers[0] : null;
const { data: repositories, isLoading } =
useUserRepositories(selectedProvider);
useEffect(() => {
if (!filteredRepositories?.length) {
dispatch(setPersonalRepositories([]));
dispatch(setOrganizationRepositories([]));
dispatch(setRepositories([]));
return;
if (repositories?.pages) {
const personalRepos: GitRepository[] = [];
const organizationRepos: GitRepository[] = [];
const otherRepos: GitRepository[] = [];
// Flatten all pages to get all repositories
const allRepositories = repositories.pages.flatMap((page) => page.data);
allRepositories.forEach((repo: GitRepository) => {
const hasOpenHandsSuffix = repo.full_name.endsWith("/.openhands");
if (repo.owner_type === "user" && hasOpenHandsSuffix) {
personalRepos.push(repo);
} else if (repo.owner_type === "organization" && hasOpenHandsSuffix) {
organizationRepos.push(repo);
} else {
otherRepos.push(repo);
}
});
dispatch(setPersonalRepositories(personalRepos));
dispatch(setOrganizationRepositories(organizationRepos));
dispatch(setRepositories(otherRepos));
}
const personalRepos: GitRepository[] = [];
const organizationRepos: GitRepository[] = [];
const otherRepos: GitRepository[] = [];
filteredRepositories.forEach((repo: GitRepository) => {
const hasOpenHandsSuffix =
selectedProvider === "gitlab"
? repo.full_name.endsWith("/openhands-config")
: repo.full_name.endsWith("/.openhands");
if (repo.owner_type === "user" && hasOpenHandsSuffix) {
personalRepos.push(repo);
} else if (repo.owner_type === "organization" && hasOpenHandsSuffix) {
organizationRepos.push(repo);
} else {
otherRepos.push(repo);
}
});
dispatch(setPersonalRepositories(personalRepos));
dispatch(setOrganizationRepositories(organizationRepos));
dispatch(setRepositories(otherRepos));
}, [filteredRepositories, selectedProvider, dispatch]);
}, [repositories, dispatch]);
return (
<div
@@ -113,41 +63,6 @@ export function MicroagentManagementSidebar({
)}
>
<MicroagentManagementSidebarHeader />
{/* Provider Selection */}
{providers.length > 1 && (
<div className="mt-6">
<GitProviderDropdown
providers={providers}
value={selectedProvider}
placeholder="Select Provider"
onChange={handleProviderChange}
className="w-full"
classNamePrefix="git-provider-dropdown"
styles={getGitProviderMicroagentManagementCustomStyles()}
/>
</div>
)}
{/* Search Input */}
<div className="flex flex-col gap-2 w-full mt-6">
<label htmlFor="repository-search" className="sr-only">
{t(I18nKey.COMMON$SEARCH_REPOSITORIES)}
</label>
<input
id="repository-search"
name="repository-search"
type="text"
placeholder={`${t(I18nKey.COMMON$SEARCH_REPOSITORIES)}...`}
value={searchQuery}
onChange={(e) => setSearchQuery(e.target.value)}
className={cn(
"bg-tertiary border border-[#717888] bg-[#454545] w-full rounded-sm p-2 placeholder:italic placeholder:text-tertiary-alt",
"disabled:bg-[#2D2F36] disabled:border-[#2D2F36] disabled:cursor-not-allowed h-10 box-shadow-none outline-none",
)}
/>
</div>
{isLoading ? (
<div className="flex flex-col items-center justify-center gap-4 flex-1">
<Spinner size="sm" />
@@ -1,7 +1,8 @@
import { Tooltip } from "@heroui/react";
import { useTranslation } from "react-i18next";
import ConfirmIcon from "#/assets/confirm";
import RejectIcon from "#/assets/reject";
import { I18nKey } from "#/i18n/declaration";
import { cn } from "#/utils/utils";
interface ActionTooltipProps {
type: "confirm" | "reject";
@@ -11,35 +12,25 @@ interface ActionTooltipProps {
export function ActionTooltip({ type, onClick }: ActionTooltipProps) {
const { t } = useTranslation();
const isConfirm = type === "confirm";
const ariaLabel = isConfirm
? t(I18nKey.ACTION$CONFIRM)
: t(I18nKey.ACTION$REJECT);
const content = isConfirm
? t(I18nKey.CHAT_INTERFACE$USER_CONFIRMED)
: t(I18nKey.CHAT_INTERFACE$USER_REJECTED);
const buttonLabel = isConfirm
? `${t(I18nKey.CHAT_INTERFACE$INPUT_CONTINUE_MESSAGE)} ⌘↩`
: `${t(I18nKey.BUTTON$CANCEL)} ⇧⌘⌫`;
const content =
type === "confirm"
? t(I18nKey.CHAT_INTERFACE$USER_CONFIRMED)
: t(I18nKey.CHAT_INTERFACE$USER_REJECTED);
return (
<Tooltip content={content} closeDelay={100}>
<button
data-testid={`action-${type}-button`}
type="button"
aria-label={ariaLabel}
className={cn(
"rounded px-2 h-6.5 text-sm font-medium leading-5 cursor-pointer hover:opacity-80",
aria-label={
type === "confirm"
? "bg-tertiary text-white"
: "bg-white text-[#0D0F11]",
)}
? t(I18nKey.ACTION$CONFIRM)
: t(I18nKey.ACTION$REJECT)
}
className="bg-tertiary rounded-full p-1 hover:bg-base-secondary"
onClick={onClick}
>
{buttonLabel}
{type === "confirm" ? <ConfirmIcon /> : <RejectIcon />}
</button>
</Tooltip>
);
@@ -1,120 +1,31 @@
import { useDispatch, useSelector } from "react-redux";
import { useCallback, useEffect } from "react";
import { useTranslation } from "react-i18next";
import { I18nKey } from "#/i18n/declaration";
import { AgentState } from "#/types/agent-state";
import { generateAgentStateChangeEvent } from "#/services/agent-state-service";
import { useWsClient } from "#/context/ws-client-provider";
import { ActionTooltip } from "../action-tooltip";
import { isOpenHandsAction } from "#/types/core/guards";
import { ActionSecurityRisk } from "#/state/security-analyzer-slice";
import { RiskAlert } from "#/components/shared/risk-alert";
import WarningIcon from "#/icons/u-warning.svg?react";
import { RootState } from "#/store";
import { addSubmittedEventId } from "#/state/event-message-slice";
export function ConfirmationButtons() {
const submittedEventIds = useSelector(
(state: RootState) => state.eventMessage.submittedEventIds,
);
const dispatch = useDispatch();
const { t } = useTranslation();
const { send } = useWsClient();
const { send, parsedEvents } = useWsClient();
// Find the most recent action awaiting confirmation
const awaitingAction = parsedEvents
.slice()
.reverse()
.find((ev) => {
if (!isOpenHandsAction(ev) || ev.source !== "agent") return false;
const args = ev.args as Record<string, unknown>;
return args?.confirmation_state === "awaiting_confirmation";
});
const handleStateChange = useCallback(
(state: AgentState) => {
if (!awaitingAction) {
return;
}
dispatch(addSubmittedEventId(awaitingAction.id));
send(generateAgentStateChangeEvent(state));
},
[send],
);
// Handle keyboard shortcuts
useEffect(() => {
if (!awaitingAction) {
return undefined;
}
const handleCancelShortcut = (event: KeyboardEvent) => {
if (event.shiftKey && event.metaKey && event.key === "Backspace") {
event.preventDefault();
handleStateChange(AgentState.USER_REJECTED);
}
};
const handleContinueShortcut = (event: KeyboardEvent) => {
if (event.metaKey && event.key === "Enter") {
event.preventDefault();
handleStateChange(AgentState.USER_CONFIRMED);
}
};
const handleKeyDown = (event: KeyboardEvent) => {
// Cancel: Shift+Cmd+Backspace (⇧⌘⌫)
handleCancelShortcut(event);
// Continue: Cmd+Enter (⌘↩)
handleContinueShortcut(event);
};
document.addEventListener("keydown", handleKeyDown);
return () => document.removeEventListener("keydown", handleKeyDown);
}, [awaitingAction, handleStateChange]);
if (!awaitingAction || submittedEventIds.includes(awaitingAction.id)) {
return null;
}
const { args } = awaitingAction as { args: Record<string, unknown> };
const risk = args?.security_risk;
const isHighRisk =
typeof risk === "string"
? risk.toLowerCase() === "high"
: Number(risk) === ActionSecurityRisk.HIGH;
const handleStateChange = (state: AgentState) => {
const event = generateAgentStateChangeEvent(state);
send(event);
};
return (
<div className="flex flex-col gap-2 pt-4">
{isHighRisk && (
<RiskAlert
content={t(I18nKey.CHAT_INTERFACE$HIGH_RISK_WARNING)}
icon={<WarningIcon width={16} height={16} color="#fff" />}
severity="high"
title={t(I18nKey.COMMON$HIGH_RISK)}
<div className="flex justify-between items-center pt-4">
<p>{t(I18nKey.CHAT_INTERFACE$USER_ASK_CONFIRMATION)}</p>
<div className="flex items-center gap-3">
<ActionTooltip
type="confirm"
onClick={() => handleStateChange(AgentState.USER_CONFIRMED)}
/>
<ActionTooltip
type="reject"
onClick={() => handleStateChange(AgentState.USER_REJECTED)}
/>
)}
<div className="flex justify-between items-center">
<p className="text-sm font-normal text-white">
{t(I18nKey.CHAT_INTERFACE$USER_ASK_CONFIRMATION)}
</p>
<div className="flex items-center gap-3">
<ActionTooltip
type="reject"
onClick={() => handleStateChange(AgentState.USER_REJECTED)}
/>
<ActionTooltip
type="confirm"
onClick={() => handleStateChange(AgentState.USER_CONFIRMED)}
/>
</div>
</div>
</div>
);
@@ -93,14 +93,14 @@ function SecurityInvariant() {
(risk: ActionSecurityRisk) => {
switch (risk) {
case ActionSecurityRisk.LOW:
return t(I18nKey.SECURITY$LOW_RISK);
return t(I18nKey.SECURITY_ANALYZER$LOW_RISK);
case ActionSecurityRisk.MEDIUM:
return t(I18nKey.SECURITY$MEDIUM_RISK);
return t(I18nKey.SECURITY_ANALYZER$MEDIUM_RISK);
case ActionSecurityRisk.HIGH:
return t(I18nKey.SECURITY$HIGH_RISK);
return t(I18nKey.SECURITY_ANALYZER$HIGH_RISK);
case ActionSecurityRisk.UNKNOWN:
default:
return t(I18nKey.SECURITY$UNKNOWN_RISK);
return t(I18nKey.SECURITY_ANALYZER$UNKNOWN_RISK);
}
},
[t],
@@ -1,36 +0,0 @@
import { ReactNode } from "react";
import { cn } from "#/utils/utils";
/** Props accepted by the RiskAlert banner component. */
interface RiskAlertProps {
// Extra class names passed to cn() after the banner's base classes.
className?: string;
// Body of the alert; rendered in a normal-weight span after the title.
content: ReactNode;
// Optional leading icon; nothing is rendered for it when the value is falsy.
icon?: ReactNode;
// Risk level. Only "high" produces output; "medium" and "low" make the component return null.
severity: "high" | "medium" | "low";
// Short heading; rendered in a bold span before the content.
title: string;
}
/**
 * Inline banner that warns about a risky action.
 *
 * Renders an optional icon, a bold title and the alert content on one row.
 * Returns null for every severity other than "high".
 */
export function RiskAlert({
  className,
  content,
  icon,
  severity,
  title,
}: RiskAlertProps) {
  // Only the high-risk variant has a visual treatment today. To support the
  // other severities, add them here (for example as cva variants).
  if (severity !== "high") {
    return null;
  }

  // Base banner classes first, caller-supplied classes last.
  const bannerClassName = cn(
    "flex items-center gap-3.5 bg-[#4A0709] border border-[#FF0006] text-red-400 rounded-xl px-3.5 h-13 text-sm text-white",
    className,
  );

  return (
    <div className={bannerClassName}>
      {icon && <span className="">{icon}</span>}
      <span className="font-bold">{title}</span>
      <span className="font-normal">{content}</span>
    </div>
  );
}
@@ -33,7 +33,6 @@ interface ConversationSubscriptionsContextType {
sessionApiKey: string | null;
providersSet: ("github" | "gitlab" | "bitbucket" | "enterprise_sso")[];
baseUrl: string;
socketPath?: string;
onEvent?: (event: unknown, conversationId: string) => void;
}) => void;
unsubscribeFromConversation: (conversationId: string) => void;
@@ -96,10 +95,10 @@ export function ConversationSubscriptionsProvider({
[],
);
const unsubscribeFromConversation = useCallback((conversationId: string) => {
// Use functional update to access current socket data and perform cleanup
setConversationSockets((prev) => {
const socketData = prev[conversationId];
const unsubscribeFromConversation = useCallback(
(conversationId: string) => {
// Get a local reference to the socket data to avoid race conditions
const socketData = conversationSockets[conversationId];
if (socketData) {
const { socket } = socketData;
@@ -113,23 +112,24 @@ export function ConversationSubscriptionsProvider({
socket.disconnect();
}
// Update state to remove the socket
setConversationSockets((prev) => {
const newSockets = { ...prev };
delete newSockets[conversationId];
return newSockets;
});
// Remove from active IDs
setActiveConversationIds((prev) =>
prev.filter((id) => id !== conversationId),
);
// Clean up event handler reference
delete eventHandlersRef.current[conversationId];
// Remove the socket from state
const newSockets = { ...prev };
delete newSockets[conversationId];
return newSockets;
}
return prev; // No change if socket not found
});
// Remove from active IDs
setActiveConversationIds((prev) =>
prev.filter((id) => id !== conversationId),
);
}, []);
},
[conversationSockets],
);
const subscribeToConversation = useCallback(
(options: {
@@ -137,17 +137,10 @@ export function ConversationSubscriptionsProvider({
sessionApiKey: string | null;
providersSet: ("github" | "gitlab" | "bitbucket" | "enterprise_sso")[];
baseUrl: string;
socketPath?: string;
onEvent?: (event: unknown, conversationId: string) => void;
}) => {
const {
conversationId,
sessionApiKey,
providersSet,
baseUrl,
socketPath,
onEvent,
} = options;
const { conversationId, sessionApiKey, providersSet, baseUrl, onEvent } =
options;
// If already subscribed, don't create a new subscription
if (conversationSockets[conversationId]) {
@@ -180,7 +173,9 @@ export function ConversationSubscriptionsProvider({
if (isErrorEvent(event) || isAgentStatusError(event)) {
renderConversationErroredToast(
conversationId,
isErrorEvent(event) ? event.message : "MICROAGENT$UNKNOWN_ERROR",
isErrorEvent(event)
? event.message
: "Unknown error, please try again",
);
} else if (isStatusUpdate(event)) {
if (event.type === "info" && event.id === "STATUS$STARTING_RUNTIME") {
@@ -204,7 +199,6 @@ export function ConversationSubscriptionsProvider({
// Create socket connection
const socket = io(baseUrl, {
transports: ["websocket"],
path: socketPath ?? "/socket.io",
query: {
conversation_id: conversationId,
session_api_key: sessionApiKey,
+3 -12
View File
@@ -317,24 +317,15 @@ export function WsClientProvider({
session_api_key: conversation.session_api_key, // Have to set here because socketio doesn't support custom headers. :(
};
let baseUrl: string | null = null;
let socketPath: string;
let baseUrl = null;
if (conversation.url && !conversation.url.startsWith("/")) {
const u = new URL(conversation.url);
baseUrl = u.host;
const pathBeforeApi = u.pathname.split("/api/conversations")[0] || "/";
// Socket.IO server default path is /socket.io; prefix with pathBeforeApi for path mode
socketPath = `${pathBeforeApi.replace(/\/$/, "")}/socket.io`;
baseUrl = new URL(conversation.url).host;
} else {
baseUrl =
(import.meta.env.VITE_BACKEND_BASE_URL as string | undefined) ||
window?.location.host;
socketPath = "/socket.io";
baseUrl = import.meta.env.VITE_BACKEND_BASE_URL || window?.location.host;
}
sio = io(baseUrl, {
transports: ["websocket"],
path: socketPath,
query,
});
@@ -19,8 +19,6 @@ const saveSettingsMutationFn = async (settings: Partial<PostSettings>) => {
: settings.llm_api_key?.trim() || undefined,
remote_runtime_resource_factor: settings.REMOTE_RUNTIME_RESOURCE_FACTOR,
enable_default_condenser: settings.ENABLE_DEFAULT_CONDENSER,
condenser_max_size:
settings.CONDENSER_MAX_SIZE ?? DEFAULT_SETTINGS.CONDENSER_MAX_SIZE,
enable_sound_notifications: settings.ENABLE_SOUND_NOTIFICATIONS,
user_consents_to_analytics: settings.user_consents_to_analytics,
provider_tokens_set: settings.PROVIDER_TOKENS_SET,
@@ -1,27 +0,0 @@
import { useQuery } from "@tanstack/react-query";
import OpenHands from "#/api/open-hands";
/**
 * Query hook that loads the microagent-management conversations of a repository
 * through OpenHands.getMicroagentManagementConversations.
 *
 * @param selectedRepository Repository to query; the query stays disabled while this is empty.
 * @param pageId Optional page identifier forwarded to the API call.
 * @param limit Maximum number of conversations requested (defaults to 100).
 * @param cacheDisabled When true, both staleTime and gcTime are set to 0.
 * @returns The result object produced by useQuery.
 */
export const useMicroagentManagementConversations = (
  selectedRepository: string,
  pageId?: string,
  limit: number = 100,
  cacheDisabled: boolean = false,
) => {
  const oneMinuteMs = 1000 * 60;
  // 5 minutes fresh / 15 minutes retained, unless caching is switched off.
  const staleTime = cacheDisabled ? 0 : oneMinuteMs * 5;
  const gcTime = cacheDisabled ? 0 : oneMinuteMs * 15;

  const fetchConversations = () =>
    OpenHands.getMicroagentManagementConversations(
      selectedRepository,
      pageId,
      limit,
    );

  return useQuery({
    queryKey: [
      "conversations",
      "microagent-management",
      pageId,
      limit,
      selectedRepository,
    ],
    queryFn: fetchConversations,
    enabled: Boolean(selectedRepository),
    staleTime,
    gcTime,
  });
};
-2
View File
@@ -22,8 +22,6 @@ const getSettingsQueryFn = async (): Promise<Settings> => {
REMOTE_RUNTIME_RESOURCE_FACTOR: apiSettings.remote_runtime_resource_factor,
PROVIDER_TOKENS_SET: apiSettings.provider_tokens_set,
ENABLE_DEFAULT_CONDENSER: apiSettings.enable_default_condenser,
CONDENSER_MAX_SIZE:
apiSettings.condenser_max_size ?? DEFAULT_SETTINGS.CONDENSER_MAX_SIZE,
ENABLE_SOUND_NOTIFICATIONS: apiSettings.enable_sound_notifications,
ENABLE_PROACTIVE_CONVERSATION_STARTERS:
apiSettings.enable_proactive_conversation_starters,
@@ -1,27 +1,14 @@
import React from "react";
import { useQueries, type Query } from "@tanstack/react-query";
import toast from "react-hot-toast";
import { AxiosError } from "axios";
import { useCreateConversation } from "./mutation/use-create-conversation";
import { useUserProviders } from "./use-user-providers";
import { useConversationSubscriptions } from "#/context/conversation-subscriptions-provider";
import { Provider } from "#/types/settings";
import { CreateMicroagent, Conversation } from "#/api/open-hands.types";
import OpenHands from "#/api/open-hands";
import { renderConversationStartingToast } from "#/components/features/chat/microagent/microagent-status-toast";
interface ConversationData {
conversationId: string;
sessionApiKey: string | null;
baseUrl: string;
socketPath: string;
onEventCallback?: (event: unknown, conversationId: string) => void;
}
import { CreateMicroagent } from "#/api/open-hands.types";
/**
* Custom hook to create a conversation and subscribe to it, supporting multiple subscriptions.
* This version waits for conversation status to be "RUNNING" before establishing WebSocket connection.
* Shows immediate toast feedback and polls conversation status until ready.
* This extends the functionality of useCreateConversationAndSubscribe to allow subscribing to
* multiple conversations simultaneously.
*/
export const useCreateConversationAndSubscribeMultiple = () => {
const { mutate: createConversation, isPending } = useCreateConversation();
@@ -33,88 +20,6 @@ export const useCreateConversationAndSubscribeMultiple = () => {
activeConversationIds,
} = useConversationSubscriptions();
// Store conversation data immediately after creation
const [createdConversations, setCreatedConversations] = React.useState<
Record<string, ConversationData>
>({});
// Get conversation IDs that need polling
const conversationIdsToWatch = Object.keys(createdConversations);
// Poll each conversation until it's ready
const conversationQueries = useQueries({
queries: conversationIdsToWatch.map((conversationId) => ({
queryKey: ["conversation-ready-poll", conversationId],
queryFn: () => OpenHands.getConversation(conversationId),
enabled: !!conversationId,
refetchInterval: (query: Query<Conversation | null, AxiosError>) => {
const status = query.state.data?.status;
if (status === "STARTING") {
return 3000; // Poll every 3 seconds while STARTING
}
return false; // Stop polling once not STARTING
},
retry: false,
})),
});
// Extract stable values from queries for dependency array
const queryStatuses = conversationQueries.map((query) => query.data?.status);
const queryDataExists = conversationQueries.map((query) => !!query.data);
// Effect to handle subscription when conversations are ready
React.useEffect(() => {
conversationQueries.forEach((query, index) => {
const conversationId = conversationIdsToWatch[index];
const conversationData = createdConversations[conversationId];
if (!query.data || !conversationData) return;
const { status, url, session_api_key: sessionApiKey } = query.data;
let { baseUrl } = conversationData;
if (url && !url.startsWith("/")) {
baseUrl = new URL(url).host;
}
if (status === "RUNNING") {
// Conversation is ready - subscribe to WebSocket
subscribeToConversation({
conversationId,
sessionApiKey,
providersSet: providers,
baseUrl,
socketPath: conversationData.socketPath,
onEvent: conversationData.onEventCallback,
});
// Remove from created conversations (cleanup)
setCreatedConversations((prev) => {
const newCreated = { ...prev };
delete newCreated[conversationId];
return newCreated;
});
} else if (status === "STOPPED") {
// Dismiss the starting toast
toast.dismiss(`starting-${conversationId}`);
// Remove from created conversations (cleanup)
setCreatedConversations((prev) => {
const newCreated = { ...prev };
delete newCreated[conversationId];
return newCreated;
});
}
});
}, [
queryStatuses,
queryDataExists,
conversationIdsToWatch,
createdConversations,
subscribeToConversation,
providers,
]);
const createConversationAndSubscribe = React.useCallback(
({
query,
@@ -144,46 +49,33 @@ export const useCreateConversationAndSubscribeMultiple = () => {
},
{
onSuccess: (data) => {
// Show immediate toast to let user know something is happening
renderConversationStartingToast(data.conversation_id);
// Call the success callback immediately
if (onSuccessCallback) {
onSuccessCallback(data.conversation_id);
}
// Only handle immediate post-creation tasks here
let baseUrl = "";
let socketPath: string;
if (data?.url && !data.url.startsWith("/")) {
const u = new URL(data.url);
baseUrl = u.host;
const pathBeforeApi =
u.pathname.split("/api/conversations")[0] || "/";
socketPath = `${pathBeforeApi.replace(/\/$/, "")}/socket.io`;
baseUrl = new URL(data.url).host;
} else {
baseUrl =
(import.meta.env.VITE_BACKEND_BASE_URL as string | undefined) ||
window?.location.host;
socketPath = "/socket.io";
}
// Store conversation data for polling and eventual subscription
setCreatedConversations((prev) => ({
...prev,
[data.conversation_id]: {
conversationId: data.conversation_id,
sessionApiKey: data.session_api_key,
baseUrl,
socketPath,
onEventCallback,
},
}));
// Subscribe to the conversation
subscribeToConversation({
conversationId: data.conversation_id,
sessionApiKey: data.session_api_key,
providersSet: providers,
baseUrl,
onEvent: onEventCallback,
});
// Call the success callback if provided
if (onSuccessCallback) {
onSuccessCallback(data.conversation_id);
}
},
},
);
},
[createConversation],
[createConversation, subscribeToConversation, providers],
);
return {
+11 -22
View File
@@ -97,8 +97,6 @@ export enum I18nKey {
SETTINGS$BASE_URL = "SETTINGS$BASE_URL",
SETTINGS$AGENT = "SETTINGS$AGENT",
SETTINGS$ENABLE_MEMORY_CONDENSATION = "SETTINGS$ENABLE_MEMORY_CONDENSATION",
SETTINGS$CONDENSER_MAX_SIZE = "SETTINGS$CONDENSER_MAX_SIZE",
SETTINGS$CONDENSER_MAX_SIZE_TOOLTIP = "SETTINGS$CONDENSER_MAX_SIZE_TOOLTIP",
SETTINGS$LANGUAGE = "SETTINGS$LANGUAGE",
ACTION$PUSH_TO_BRANCH = "ACTION$PUSH_TO_BRANCH",
ACTION$PUSH_CREATE_PR = "ACTION$PUSH_CREATE_PR",
@@ -131,6 +129,7 @@ export enum I18nKey {
CONVERSATION$REPOSITORY = "CONVERSATION$REPOSITORY",
CONVERSATION$BRANCH = "CONVERSATION$BRANCH",
CONVERSATION$GIT_PROVIDER = "CONVERSATION$GIT_PROVIDER",
ACCOUNT_SETTINGS$TITLE = "ACCOUNT_SETTINGS$TITLE",
WORKSPACE$TERMINAL_TAB_LABEL = "WORKSPACE$TERMINAL_TAB_LABEL",
WORKSPACE$BROWSER_TAB_LABEL = "WORKSPACE$BROWSER_TAB_LABEL",
WORKSPACE$JUPYTER_TAB_LABEL = "WORKSPACE$JUPYTER_TAB_LABEL",
@@ -327,7 +326,6 @@ export enum I18nKey {
USER$ACCOUNT_SETTINGS = "USER$ACCOUNT_SETTINGS",
JUPYTER$OUTPUT_LABEL = "JUPYTER$OUTPUT_LABEL",
BUTTON$STOP = "BUTTON$STOP",
BUTTON$PAUSE = "BUTTON$PAUSE",
BUTTON$EDIT_TITLE = "BUTTON$EDIT_TITLE",
BUTTON$DOWNLOAD_VIA_VSCODE = "BUTTON$DOWNLOAD_VIA_VSCODE",
BUTTON$DISPLAY_COST = "BUTTON$DISPLAY_COST",
@@ -339,8 +337,6 @@ export enum I18nKey {
LANDING$RECENT_CONVERSATION = "LANDING$RECENT_CONVERSATION",
CONVERSATION$CONFIRM_DELETE = "CONVERSATION$CONFIRM_DELETE",
CONVERSATION$CONFIRM_STOP = "CONVERSATION$CONFIRM_STOP",
CONVERSATION$CONFIRM_PAUSE = "CONVERSATION$CONFIRM_PAUSE",
CONVERSATION$PAUSE_WARNING = "CONVERSATION$PAUSE_WARNING",
CONVERSATION$STOP_WARNING = "CONVERSATION$STOP_WARNING",
CONVERSATION$METRICS_INFO = "CONVERSATION$METRICS_INFO",
CONVERSATION$CREATED = "CONVERSATION$CREATED",
@@ -361,7 +357,6 @@ export enum I18nKey {
CHAT_INTERFACE$INPUT_PLACEHOLDER = "CHAT_INTERFACE$INPUT_PLACEHOLDER",
CHAT_INTERFACE$INPUT_CONTINUE_MESSAGE = "CHAT_INTERFACE$INPUT_CONTINUE_MESSAGE",
CHAT_INTERFACE$USER_ASK_CONFIRMATION = "CHAT_INTERFACE$USER_ASK_CONFIRMATION",
CHAT_INTERFACE$HIGH_RISK_WARNING = "CHAT_INTERFACE$HIGH_RISK_WARNING",
CHAT_INTERFACE$USER_CONFIRMED = "CHAT_INTERFACE$USER_CONFIRMED",
CHAT_INTERFACE$USER_REJECTED = "CHAT_INTERFACE$USER_REJECTED",
CHAT_INTERFACE$INPUT_SEND_MESSAGE_BUTTON_CONTENT = "CHAT_INTERFACE$INPUT_SEND_MESSAGE_BUTTON_CONTENT",
@@ -376,6 +371,10 @@ export enum I18nKey {
CHAT_INTERFACE$MESSAGE_ARIA_LABEL = "CHAT_INTERFACE$MESSAGE_ARIA_LABEL",
CHAT_INTERFACE$CHAT_CONVERSATION = "CHAT_INTERFACE$CHAT_CONVERSATION",
CHAT_INTERFACE$UNKNOWN_SENDER = "CHAT_INTERFACE$UNKNOWN_SENDER",
SECURITY_ANALYZER$UNKNOWN_RISK = "SECURITY_ANALYZER$UNKNOWN_RISK",
SECURITY_ANALYZER$LOW_RISK = "SECURITY_ANALYZER$LOW_RISK",
SECURITY_ANALYZER$MEDIUM_RISK = "SECURITY_ANALYZER$MEDIUM_RISK",
SECURITY_ANALYZER$HIGH_RISK = "SECURITY_ANALYZER$HIGH_RISK",
SETTINGS$MODEL_TOOLTIP = "SETTINGS$MODEL_TOOLTIP",
SETTINGS$AGENT_TOOLTIP = "SETTINGS$AGENT_TOOLTIP",
SETTINGS$LANGUAGE_TOOLTIP = "SETTINGS$LANGUAGE_TOOLTIP",
@@ -386,12 +385,9 @@ export enum I18nKey {
SETTINGS$REFRESH_LLM_API_KEY = "SETTINGS$REFRESH_LLM_API_KEY",
SETTINGS$CONFIRMATION_MODE = "SETTINGS$CONFIRMATION_MODE",
SETTINGS$CONFIRMATION_MODE_TOOLTIP = "SETTINGS$CONFIRMATION_MODE_TOOLTIP",
SETTINGS$CONFIRMATION_MODE_LOCK_TOOLTIP = "SETTINGS$CONFIRMATION_MODE_LOCK_TOOLTIP",
SETTINGS$AGENT_SELECT_ENABLED = "SETTINGS$AGENT_SELECT_ENABLED",
SETTINGS$SECURITY_ANALYZER = "SETTINGS$SECURITY_ANALYZER",
SETTINGS$SECURITY_ANALYZER_PLACEHOLDER = "SETTINGS$SECURITY_ANALYZER_PLACEHOLDER",
SETTINGS$SECURITY_ANALYZER_TOOLTIP = "SETTINGS$SECURITY_ANALYZER_TOOLTIP",
SETTINGS$SECURITY_ANALYZER_DESCRIPTION = "SETTINGS$SECURITY_ANALYZER_DESCRIPTION",
SETTINGS$DONT_KNOW_API_KEY = "SETTINGS$DONT_KNOW_API_KEY",
SETTINGS$CLICK_FOR_INSTRUCTIONS = "SETTINGS$CLICK_FOR_INSTRUCTIONS",
SETTINGS$SAVED = "SETTINGS$SAVED",
@@ -478,6 +474,7 @@ export enum I18nKey {
PROJECT_MENU_CARD_CONTEXT_MENU$DOWNLOAD_FILES_LABEL = "PROJECT_MENU_CARD_CONTEXT_MENU$DOWNLOAD_FILES_LABEL",
PROJECT_MENU_CARD$OPEN = "PROJECT_MENU_CARD$OPEN",
ACTION_BUTTON$RESUME = "ACTION_BUTTON$RESUME",
ACTION_BUTTON$PAUSE = "ACTION_BUTTON$PAUSE",
BROWSER$SCREENSHOT_ALT = "BROWSER$SCREENSHOT_ALT",
ERROR_TOAST$CLOSE_BUTTON_LABEL = "ERROR_TOAST$CLOSE_BUTTON_LABEL",
FILE_EXPLORER$UPLOAD = "FILE_EXPLORER$UPLOAD",
@@ -516,6 +513,7 @@ export enum I18nKey {
STATUS$CONNECTED = "STATUS$CONNECTED",
BROWSER$NO_PAGE_LOADED = "BROWSER$NO_PAGE_LOADED",
USER$AVATAR_PLACEHOLDER = "USER$AVATAR_PLACEHOLDER",
ACCOUNT_SETTINGS$SETTINGS = "ACCOUNT_SETTINGS$SETTINGS",
ACCOUNT_SETTINGS$LOGOUT = "ACCOUNT_SETTINGS$LOGOUT",
SETTINGS_FORM$ADVANCED_OPTIONS_LABEL = "SETTINGS_FORM$ADVANCED_OPTIONS_LABEL",
CONVERSATION$NO_CONVERSATIONS = "CONVERSATION$NO_CONVERSATIONS",
@@ -575,6 +573,8 @@ export enum I18nKey {
ENTERPRISE_SSO$CONNECT_TO_ENTERPRISE_SSO = "ENTERPRISE_SSO$CONNECT_TO_ENTERPRISE_SSO",
AUTH$SIGN_IN_WITH_IDENTITY_PROVIDER = "AUTH$SIGN_IN_WITH_IDENTITY_PROVIDER",
WAITLIST$JOIN_WAITLIST = "WAITLIST$JOIN_WAITLIST",
ACCOUNT_SETTINGS$ADDITIONAL_SETTINGS = "ACCOUNT_SETTINGS$ADDITIONAL_SETTINGS",
ACCOUNT_SETTINGS$DISCONNECT_FROM_GITHUB = "ACCOUNT_SETTINGS$DISCONNECT_FROM_GITHUB",
CONVERSATION$DELETE_WARNING = "CONVERSATION$DELETE_WARNING",
FEEDBACK$TITLE = "FEEDBACK$TITLE",
FEEDBACK$DESCRIPTION = "FEEDBACK$DESCRIPTION",
@@ -781,6 +781,8 @@ export enum I18nKey {
PROJECT_MANAGEMENT$SVC_ACC_EMAIL_VALIDATION_ERROR = "PROJECT_MANAGEMENT$SVC_ACC_EMAIL_VALIDATION_ERROR",
PROJECT_MANAGEMENT$SVC_ACC_API_KEY_VALIDATION_ERROR = "PROJECT_MANAGEMENT$SVC_ACC_API_KEY_VALIDATION_ERROR",
MICROAGENT_MANAGEMENT$ERROR_LOADING_MICROAGENT_CONTENT = "MICROAGENT_MANAGEMENT$ERROR_LOADING_MICROAGENT_CONTENT",
SETTINGS$MCP_ERROR_ENV_INVALID_FORMAT = "SETTINGS$MCP_ERROR_ENV_INVALID_FORMAT",
SETTINGS$MCP_ERROR_URL_DUPLICATE = "SETTINGS$MCP_ERROR_URL_DUPLICATE",
SETTINGS$MCP_SERVER_TYPE_SSE = "SETTINGS$MCP_SERVER_TYPE_SSE",
SETTINGS$MCP_SERVER_TYPE_STDIO = "SETTINGS$MCP_SERVER_TYPE_STDIO",
SETTINGS$MCP_SERVER_TYPE_SHTTP = "SETTINGS$MCP_SERVER_TYPE_SHTTP",
@@ -792,8 +794,6 @@ export enum I18nKey {
SETTINGS$MCP_ERROR_NAME_DUPLICATE = "SETTINGS$MCP_ERROR_NAME_DUPLICATE",
SETTINGS$MCP_ERROR_COMMAND_REQUIRED = "SETTINGS$MCP_ERROR_COMMAND_REQUIRED",
SETTINGS$MCP_ERROR_COMMAND_NO_SPACES = "SETTINGS$MCP_ERROR_COMMAND_NO_SPACES",
SETTINGS$MCP_ERROR_URL_DUPLICATE = "SETTINGS$MCP_ERROR_URL_DUPLICATE",
SETTINGS$MCP_ERROR_ENV_INVALID_FORMAT = "SETTINGS$MCP_ERROR_ENV_INVALID_FORMAT",
SETTINGS$MCP_SERVER_TYPE = "SETTINGS$MCP_SERVER_TYPE",
SETTINGS$MCP_API_KEY_PLACEHOLDER = "SETTINGS$MCP_API_KEY_PLACEHOLDER",
SETTINGS$MCP_COMMAND_ARGUMENTS = "SETTINGS$MCP_COMMAND_ARGUMENTS",
@@ -814,15 +814,4 @@ export enum I18nKey {
MICROAGENT_MANAGEMENT$PR_READY_FOR_REVIEW = "MICROAGENT_MANAGEMENT$PR_READY_FOR_REVIEW",
MICROAGENT_MANAGEMENT$PR_NOT_CREATED = "MICROAGENT_MANAGEMENT$PR_NOT_CREATED",
MICROAGENT_MANAGEMENT$ERROR_CREATING_MICROAGENT = "MICROAGENT_MANAGEMENT$ERROR_CREATING_MICROAGENT",
MICROAGENT$STATUS_WAITING = "MICROAGENT$STATUS_WAITING",
MICROAGENT$UNKNOWN_ERROR = "MICROAGENT$UNKNOWN_ERROR",
MICROAGENT$CONVERSATION_STARTING = "MICROAGENT$CONVERSATION_STARTING",
MICROAGENT_MANAGEMENT$EXISTING_MICROAGENTS = "MICROAGENT_MANAGEMENT$EXISTING_MICROAGENTS",
MICROAGENT_MANAGEMENT$OPEN_MICROAGENT_PULL_REQUESTS = "MICROAGENT_MANAGEMENT$OPEN_MICROAGENT_PULL_REQUESTS",
SETTINGS$SECURITY_ANALYZER_LLM_DEFAULT = "SETTINGS$SECURITY_ANALYZER_LLM_DEFAULT",
SETTINGS$SECURITY_ANALYZER_NONE = "SETTINGS$SECURITY_ANALYZER_NONE",
SETTINGS$SECURITY_ANALYZER_INVARIANT = "SETTINGS$SECURITY_ANALYZER_INVARIANT",
COMMON$HIGH_RISK = "COMMON$HIGH_RISK",
MICROAGENT$DEFINITION = "MICROAGENT$DEFINITION",
MICROAGENT$ADD_TO_MEMORY = "MICROAGENT$ADD_TO_MEMORY",
}

Some files were not shown because too many files have changed in this diff Show More