Fix WebArena evaluation compatibility with BrowserGym 0.14.2

- Upgrade browsergym packages from 0.13.3 to 0.14.2 in pyproject.toml - Update browser tool to include new enable_autocomplete_menu parameter for fill() action - Fix BrowserEnv pre_observation_delay parameter compatibility - Fix mypy type annotations in browsergym_state_capture.py - Successfully validated WebArena evaluation on 3 examples with proper: * Accessibility tree parsing (273, 65, 65 nodes respectively) * Automatic login functionality (credentials filled correctly) * Task completion attempts (13, 32, 32 interaction steps) Co-authored-by: openhands <openhands@all-hands.dev>
Apply ruff formatting fixes
2026-04-29 03:00:45 -04:00 · 2025-09-06 13:01:21 +00:00 · 2025-09-06 01:15:50 +00:00 · 2025-09-06 01:15:50 +00:00 · 2025-09-06 01:15:50 +00:00 · 2025-09-06 01:15:50 +00:00
220 changed files with 8464 additions and 3111 deletions
@@ -187,6 +187,7 @@ jobs:
            test_settings.py::test_github_token_configuration \
            test_conversation.py::test_conversation_start \
            test_browsing_catchphrase.py::test_browsing_catchphrase \
+            test_multi_conversation_resume.py::test_multi_conversation_resume \
            -v --no-header --capture=no --timeout=900

      - name: Upload test results
@@ -15,7 +15,7 @@ jobs:
          stale-issue-message: 'This issue is stale because it has been open for 40 days with no activity. Remove the stale label or leave a comment, otherwise it will be closed in 10 days.'
          stale-pr-message: 'This PR is stale because it has been open for 40 days with no activity. Remove the stale label or leave a comment, otherwise it will be closed in 10 days.'
          days-before-stale: 40
-          exempt-issue-labels: 'roadmap'
+          exempt-issue-labels: roadmap,backlog
          close-issue-message: 'This issue was automatically closed due to 50 days of inactivity. We do this to help keep the issues somewhat manageable and focus on active issues.'
          close-pr-message: 'This PR was closed because it had no activity for 50 days. If you feel this was closed in error, and you would like to continue the PR, please resubmit or let us know.'
          days-before-close: 10
@@ -257,3 +257,5 @@ containers/runtime/code

 # test results
 test-results
+.sessions
+.eval_sessions
@@ -363,10 +363,11 @@ classpath = "my_package.my_module.MyCustomAgent"
 #confirmation_mode = false

 # The security analyzer to use (For Headless / CLI only -  In Web this is overridden by Session Init)
-#security_analyzer = ""
+# Available options: 'llm' (default), 'invariant'
+#security_analyzer = "llm"

 # Whether to enable security analyzer
-#enable_security_analyzer = false
+#enable_security_analyzer = true

 #################################### Condenser #################################
 # Condensers control how conversation history is managed and compressed when
@@ -58,34 +58,34 @@ RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs
 # Default is 60000, but we've seen up to 200000
 RUN sed -i 's/^UID_MAX.*/UID_MAX 1000000/' /etc/login.defs

-RUN groupadd --gid $OPENHANDS_USER_ID app
+RUN groupadd --gid $OPENHANDS_USER_ID openhands
 RUN useradd -l -m -u $OPENHANDS_USER_ID --gid $OPENHANDS_USER_ID -s /bin/bash openhands && \
-    usermod -aG app openhands && \
+    usermod -aG openhands openhands && \
    usermod -aG sudo openhands && \
    echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
-RUN chown -R openhands:app /app && chmod -R 770 /app
-RUN sudo chown -R openhands:app $WORKSPACE_BASE && sudo chmod -R 770 $WORKSPACE_BASE
+RUN chown -R openhands:openhands /app && chmod -R 770 /app
+RUN sudo chown -R openhands:openhands $WORKSPACE_BASE && sudo chmod -R 770 $WORKSPACE_BASE
 USER openhands

 ENV VIRTUAL_ENV=/app/.venv \
    PATH="/app/.venv/bin:$PATH" \
    PYTHONPATH='/app'

-COPY --chown=openhands:app --chmod=770 --from=backend-builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
+COPY --chown=openhands:openhands --chmod=770 --from=backend-builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}

-COPY --chown=openhands:app --chmod=770 ./microagents ./microagents
-COPY --chown=openhands:app --chmod=770 ./openhands ./openhands
-COPY --chown=openhands:app --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
-COPY --chown=openhands:app pyproject.toml poetry.lock README.md MANIFEST.in LICENSE ./
+COPY --chown=openhands:openhands --chmod=770 ./microagents ./microagents
+COPY --chown=openhands:openhands --chmod=770 ./openhands ./openhands
+COPY --chown=openhands:openhands --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
+COPY --chown=openhands:openhands pyproject.toml poetry.lock README.md MANIFEST.in LICENSE ./

 # This is run as "openhands" user, and will create __pycache__ with openhands:openhands ownership
 RUN python openhands/core/download.py # No-op to download assets
 # Add this line to set group ownership of all files/directories not already in "app" group
-# openhands:openhands -> openhands:app
-RUN find /app \! -group app -exec chgrp app {} +
+# <any other group> -> openhands (files keep their owner; only the group is changed)
+RUN find /app \! -group openhands -exec chgrp openhands {} +

-COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/build ./frontend/build
-COPY --chown=openhands:app --chmod=770 ./containers/app/entrypoint.sh /app/entrypoint.sh
+COPY --chown=openhands:openhands --chmod=770 --from=frontend-builder /app/build ./frontend/build
+COPY --chown=openhands:openhands --chmod=770 ./containers/app/entrypoint.sh /app/entrypoint.sh

 USER root

@@ -54,7 +54,7 @@ else
      fi
    fi
  fi
-  usermod -aG app enduser
+  usermod -aG openhands enduser
  # get the user group of /var/run/docker.sock and set openhands to that group
  DOCKER_SOCKET_GID=$(stat -c '%g' /var/run/docker.sock)
  echo "Docker socket group id: $DOCKER_SOCKET_GID"
@@ -1,5 +1,5 @@
 ---
-title: Jira Data Center Integration (Beta)
+title: Jira Data Center Integration (Coming soon...)
 description: Complete guide for setting up Jira Data Center integration with OpenHands Cloud, including service account creation, personal access token generation, webhook configuration, and workspace integration setup.
 ---

@@ -1,5 +1,5 @@
 ---
-title: Jira Cloud Integration
+title: Jira Cloud Integration (Coming soon...)
 description: Complete guide for setting up Jira Cloud integration with OpenHands Cloud, including service account creation, API token generation, webhook configuration, and workspace integration setup.
 ---

@@ -1,5 +1,5 @@
 ---
-title: Linear Integration
+title: Linear Integration (Coming soon...)
 description: Complete guide for setting up Linear integration with OpenHands Cloud, including service account creation, API key generation, webhook configuration, and workspace integration setup.
 ---

@@ -1,5 +1,5 @@
 ---
-title: Project Management Tool Integrations
+title: Project Management Tool Integrations (Coming soon...)
 description: Overview of OpenHands Cloud integrations with project management platforms including Jira Cloud, Jira Data Center, and Linear. Learn about setup requirements, usage methods, and troubleshooting.
 ---

@@ -18,9 +18,9 @@ Integration requires two levels of setup:
 2. **Workspace Integration** - Self-service configuration through the OpenHands Cloud UI to link your OpenHands account to the target workspace

 ### Platform-Specific Setup Guides:
- [Jira Cloud Integration](./jira-integration.md)
- [Jira Data Center Integration](./jira-dc-integration.md)
- [Linear Integration](./linear-integration.md)
+- [Jira Cloud Integration (Coming soon...)](./jira-integration.md)
+- [Jira Data Center Integration (Coming soon...)](./jira-dc-integration.md)
+- [Linear Integration (Coming soon...)](./linear-integration.md)

 ## Usage

@@ -0,0 +1,52 @@
+# Confirmation Mode and Security Analyzers
+
+OpenHands provides a security framework to help protect users from potentially risky actions through **Confirmation Mode** and **Security Analyzers**. This system analyzes agent actions and prompts users for confirmation when high-risk operations are detected.
+
+## Overview
+
+The security system consists of two main components:
+
+1. **Confirmation Mode**: When enabled, the agent will pause and ask for user confirmation before executing actions that are flagged as high-risk by the security analyzer.
+
+2. **Security Analyzers**: These are modules that evaluate the risk level of agent actions and determine whether user confirmation is required.
+
+## Configuration
+
+### CLI
+In CLI mode, confirmation is enabled by default. You will have the option to use the LLM Analyzer, which automatically confirms LOW and MEDIUM risk actions and only prompts for HIGH risk actions.
+
+## Security Analyzers
+
+OpenHands includes multiple analyzers:
+
+- **No Analyzer**: Do not use any security analyzer. The agent will prompt you to confirm *EVERY* action.
+- **LLM Risk Analyzer** (default): Uses the same LLM as the agent to assess action risk levels
+- **Invariant Analyzer**: Uses Invariant Labs' policy engine to evaluate action traces against security policies
+
+### LLM Risk Analyzer
+The default analyzer that leverages the agent's LLM to evaluate the security risk of each action. It considers the action type, parameters, and context to assign risk levels.
+
+### Invariant Analyzer
+An advanced analyzer that:
+- Collects conversation events and parses them into a trace
+- Checks the trace against an Invariant policy to classify risk (low, medium, high)
+- Manages an Invariant server container automatically if needed
+- Supports optional browsing-alignment and harmful-content checks
+
+## How It Works
+
+1. **Action Analysis**: When the agent wants to perform an action, the selected security analyzer evaluates its risk level.
+
+2. **Risk Assessment**: The analyzer returns one of three risk levels:
+   - **LOW**: Action proceeds without confirmation
+   - **MEDIUM**: Action proceeds without confirmation (may be configurable in future)
+   - **HIGH**: Action is paused, and user confirmation is requested
+
+3. **User Confirmation**: For high-risk actions, a confirmation dialog appears with:
+   - Description of the action
+   - Risk assessment explanation
+   - Options to approve or deny the action
+
+4. **Action Execution**: Based on user response:
+   - **Approve**: Action proceeds as planned
+   - **Deny**: Action is cancelled
@@ -87,19 +87,13 @@ source ~/.bashrc  # or source ~/.zshrc

 </AccordionGroup>

-3. Launch an interactive OpenHands conversation from the command line:
-```bash
-# If using uvx (recommended)
-uvx --python 3.12 --from openhands-ai openhands
-```
-
 <Note>
  If you have cloned the repository, you can also run the CLI directly using Poetry:

  poetry run openhands
 </Note>

-4. Set your model, API key, and other preferences using the UI (or alternatively environment variables, below).
+3. Set your model, API key, and other preferences using the UI (or alternatively environment variables, below).

 This command opens an interactive prompt where you can type tasks or commands and get responses from OpenHands.
 The first time you run the CLI, it will take you through configuring the required LLM
@@ -45,6 +45,13 @@ A system with a modern processor and a minimum of **4GB RAM** is recommended to
  1. [Install WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
  2. Run `wsl --version` in powershell and confirm `Default Version: 2`.

+  **Ubuntu (Linux Distribution)**
+
+  1. Install Ubuntu: run `wsl --install -d Ubuntu` in PowerShell as Administrator.
+  2. Restart your computer when prompted.
+  3. Open Ubuntu from the Start menu to complete setup.
+  4. Verify the installation: `wsl --list` should show Ubuntu.
+
  **Docker Desktop**

  1. [Install Docker Desktop on Windows](https://docs.docker.com/desktop/setup/install/windows-install).
@@ -53,7 +60,7 @@ A system with a modern processor and a minimum of **4GB RAM** is recommended to
  - Resources > WSL Integration: `Enable integration with my default WSL distro` is enabled.

  <Note>
-  The docker command below to start the app must be run inside the WSL terminal.
+  The docker command below to start the app must be run inside the WSL terminal. Use `wsl -d Ubuntu` in PowerShell or search for "Ubuntu" in the Start menu to open the Ubuntu terminal.
  </Note>

  **Alternative: Windows without WSL**
@@ -22,7 +22,7 @@ SDK to spawn and control these sandboxes.

 You can use the E2B CLI to create a custom sandbox with a Dockerfile. Read the full guide
 [here](https://e2b.dev/docs/guide/custom-sandbox). The premade OpenHands sandbox for E2B is set up in the `containers`
-directory. and it's called `openhands`.
+directory, and it's called `openhands`.

 ## Debugging

@@ -9,8 +9,8 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
-    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -61,18 +61,15 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
    metadata: EvalMetadata,
 ) -> OpenHandsConfig:
-    sandbox_config = get_default_sandbox_config_for_eval()
-    sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    # Create config with EDA-specific container image
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
    )
+
+    # Override the container image for EDA
+    config.sandbox.base_container_image = 'python:3.12-bookworm'
+
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
    agent_config.enable_prompt_extensions = False
@@ -17,8 +17,8 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
-    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -41,19 +41,12 @@ from openhands.utils.async_utils import call_async_from_sync
 def get_config(
    metadata: EvalMetadata,
 ) -> OpenHandsConfig:
-    sandbox_config = get_default_sandbox_config_for_eval()
-    sandbox_config.base_container_image = 'python:3.12-slim'
+    # Create config with agent_bench-specific container image
+    config = get_openhands_config_for_eval(metadata=metadata)
+
+    # Override the container image for agent_bench
+    config.sandbox.base_container_image = 'python:3.12-slim'

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        runtime=os.environ.get('RUNTIME', 'docker'),
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
-    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
    agent_config.enable_prompt_extensions = False
@@ -18,6 +18,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -50,15 +51,10 @@ def get_config(
 ) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.11-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
+        sandbox_config=sandbox_config,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -61,15 +62,10 @@ def get_config(
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -19,6 +19,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -75,15 +76,10 @@ def get_config(
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -12,6 +12,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -40,14 +41,8 @@ def get_config(
    )
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        workspace_base=None,
-        workspace_mount_path=None,
+    config = get_openhands_config_for_eval(
+        metadata=metadata, runtime='docker', sandbox_config=sandbox_config
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -114,16 +115,11 @@ def get_config(
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = base_container_image

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
-        enable_browser=RUN_WITH_BROWSING,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
+        sandbox_config=sandbox_config,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        enable_browser=RUN_WITH_BROWSING,
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
@@ -18,6 +18,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -65,15 +66,10 @@ def get_config(
 ) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -23,6 +23,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -60,15 +61,10 @@ def get_config(
 ) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'nikolaik/python-nodejs:python3.12-nodejs22'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
+        sandbox_config=sandbox_config,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
    )
    config.set_llm_config(metadata.llm_config)
    if metadata.agent_config:
@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -43,15 +44,10 @@ def get_config(
 ) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -31,6 +31,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -64,15 +65,10 @@ def get_config(
 ) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -24,6 +24,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -85,15 +86,10 @@ def get_config(
 ) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -16,6 +16,7 @@ import ruamel.yaml
 from evaluation.utils.shared import (
    EvalMetadata,
    get_default_sandbox_config_for_eval,
+    get_openhands_config_for_eval,
    make_metadata,
 )
 from openhands.core.config import (
@@ -37,15 +38,10 @@ def get_config(
 ) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -23,6 +23,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -48,15 +49,10 @@ def get_config(
 ) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -52,15 +53,10 @@ def get_config(
        '$OH_INTERPRETER_PATH -m pip install scitools-pyke'
    )

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -58,15 +59,10 @@ def get_config(
 ) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'xingyaoww/od-eval-miniwob:v1.0'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -110,15 +111,10 @@ def get_config(
        f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}'
    )

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -27,6 +27,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -80,15 +81,10 @@ def get_config(
 ) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'public.ecr.aws/i5g0m1f6/ml-bench'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -23,6 +23,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    get_default_sandbox_config_for_eval,
+    get_openhands_config_for_eval,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
@@ -87,13 +88,9 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> OpenHandsConfig:
        dataset_name=metadata.dataset,
        instance_id=instance['instance_id'],
    )
-    config = OpenHandsConfig(
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    return config

@@ -21,6 +21,7 @@ from evaluation.utils.shared import (
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    is_fatal_evaluation_error,
    make_metadata,
    prepare_dataset,
@@ -341,16 +342,11 @@ def get_config(
        instance_id=instance['instance_id'],
    )

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        enable_browser=RUN_WITH_BROWSING,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
@@ -31,6 +31,7 @@ from evaluation.utils.shared import (
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    is_fatal_evaluation_error,
    make_metadata,
    prepare_dataset,
@@ -174,15 +175,10 @@ def get_config(
        instance_id=instance['instance_id'],
    )

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )

    config.set_llm_config(
@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -64,16 +65,10 @@ def get_config(
    sandbox_config.base_container_image = (
        'docker.io/xingyaoww/openhands-eval-scienceagentbench'
    )
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        max_budget_per_task=4,
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
@@ -19,6 +19,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    get_default_sandbox_config_for_eval,
+    get_openhands_config_for_eval,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
@@ -83,13 +84,9 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> OpenHandsConfig:
        dataset_name=metadata.dataset,
        instance_id=instance['instance_id'],
    )
-    config = OpenHandsConfig(
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    return config

@@ -32,6 +32,7 @@ from evaluation.utils.shared import (
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    is_fatal_evaluation_error,
    make_metadata,
    prepare_dataset,
@@ -227,16 +228,11 @@ def get_config(
        instance_id=instance['instance_id'],
    )

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        enable_browser=RUN_WITH_BROWSING,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )

    config.set_llm_config(
@@ -20,6 +20,7 @@ from evaluation.utils.shared import (
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    is_fatal_evaluation_error,
    make_metadata,
    prepare_dataset,
@@ -199,16 +200,11 @@ def get_config(
        'REPO_PATH': f'/workspace/{workspace_dir_name}/',
    }

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        enable_browser=RUN_WITH_BROWSING,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
@@ -13,6 +13,7 @@ N_RUNS=${4:-1}
 export EXP_NAME=$EXP_NAME
 # use 2x resources for rollout since some codebases are pretty resource-intensive
 export DEFAULT_RUNTIME_RESOURCE_FACTOR=2
+export ITERATIVE_EVAL_MODE=false
 echo "MODEL: $MODEL"
 echo "EXP_NAME: $EXP_NAME"
 DATASET="SWE-Gym/SWE-Gym"  # change this to the "/SWE-Gym-Lite" if you want to rollout the lite subset
@@ -37,6 +37,7 @@ from evaluation.benchmarks.testgeneval.utils import load_testgeneval_dataset
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    get_openhands_config_for_eval,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
@@ -58,20 +59,21 @@ def get_config(instance: pd.Series) -> OpenHandsConfig:
        f'Invalid container image for instance {instance["instance_id_swebench"]}.'
    )
    logger.info(f'Using instance container image: {base_container_image}.')
-    return OpenHandsConfig(
-        run_as_openhands=False,
-        runtime=os.environ.get('RUNTIME', 'eventstream'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            use_host_network=False,
-            timeout=1800,
-            api_key=os.environ.get('ALLHANDS_API_KEY'),
-            remote_runtime_api_url=os.environ.get(
-                'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
-            ),
+
+    # Create custom sandbox config for testgeneval with specific requirements
+    sandbox_config = SandboxConfig(
+        base_container_image=base_container_image,
+        use_host_network=False,
+        timeout=1800,  # Longer timeout than default (300)
+        api_key=os.environ.get('ALLHANDS_API_KEY'),
+        remote_runtime_api_url=os.environ.get(
+            'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
        ),
-        workspace_base=None,
-        workspace_mount_path=None,
+    )
+
+    return get_openhands_config_for_eval(
+        sandbox_config=sandbox_config,
+        runtime=os.environ.get('RUNTIME', 'docker'),  # Different default runtime
    )


@@ -25,6 +25,7 @@ from evaluation.utils.shared import (
    assert_and_raise,
    codeact_user_response,
    get_metrics,
+    get_openhands_config_for_eval,
    is_fatal_evaluation_error,
    make_metadata,
    prepare_dataset,
@@ -126,29 +127,26 @@ def get_config(
        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
    )

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
-        runtime=os.environ.get('RUNTIME', 'eventstream'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            # Add platform to the sandbox config to solve issue 4401
-            platform='linux/amd64',
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get(
-                'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
-            ),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
+    sandbox_config = SandboxConfig(
+        base_container_image=base_container_image,
+        enable_auto_lint=True,
+        use_host_network=False,
+        # large enough timeout, since some testcases take very long to run
+        timeout=300,
+        # Add platform to the sandbox config to solve issue 4401
+        platform='linux/amd64',
+        api_key=os.environ.get('ALLHANDS_API_KEY', None),
+        remote_runtime_api_url=os.environ.get(
+            'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
        ),
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        keep_runtime_alive=False,
+        remote_runtime_init_timeout=3600,
+    )
+
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
+        sandbox_config=sandbox_config,
+        runtime=os.environ.get('RUNTIME', 'docker'),
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
@@ -12,7 +12,10 @@ import tempfile
 import yaml
 from browsing import pre_login

-from evaluation.utils.shared import get_default_sandbox_config_for_eval
+from evaluation.utils.shared import (
+    get_default_sandbox_config_for_eval,
+    get_openhands_config_for_eval,
+)
 from openhands.controller.state.state import State
 from openhands.core.config import (
    LLMConfig,
@@ -42,19 +45,17 @@ def get_config(
    sandbox_config.enable_auto_lint = True
    # If the web services are running on the host machine, this must be set to True
    sandbox_config.use_host_network = True
-    config = OpenHandsConfig(
-        run_as_openhands=False,
-        max_budget_per_task=4,
+    config = get_openhands_config_for_eval(
        max_iterations=100,
-        save_trajectory_path=os.path.join(
-            mount_path_on_host, f'traj_{task_short_name}.json'
-        ),
-        sandbox=sandbox_config,
        # we mount trajectories path so that trajectories, generated by OpenHands
        # controller, can be accessible to the evaluator file in the runtime container
+        sandbox_config=sandbox_config,
        workspace_mount_path=mount_path_on_host,
-        workspace_mount_path_in_sandbox='/outputs',
    )
+    config.save_trajectory_path = os.path.join(
+        mount_path_on_host, f'traj_{task_short_name}.json'
+    )
+    config.max_budget_per_task = 4
    config.set_llm_config(llm_config)
    if agent_config:
        config.set_agent_config(agent_config)
@@ -12,6 +12,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -44,15 +45,10 @@ def get_config(
 ) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -20,6 +20,7 @@ from evaluation.utils.shared import (
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    is_fatal_evaluation_error,
    make_metadata,
    prepare_dataset,
@@ -160,16 +161,11 @@ def get_config(
        instance_id=instance['instance_id'],
    )

-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        enable_browser=RUN_WITH_BROWSING,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -73,16 +74,10 @@ def get_config(
        'VWA_WIKIPEDIA': f'{base_url}:8888',
        'VWA_HOMEPAGE': f'{base_url}:4399',
    }
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
-        attach_to_existing=True,
+        sandbox_config=sandbox_config,
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
@@ -0,0 +1,212 @@
+# WebArena CDP Integration Implementation Plan
+
+## Overview
+
+This document outlines the proper solution for integrating OpenHands with the official WebArena evaluation harness using Chrome DevTools Protocol (CDP) session logging.
+
+## The Problem
+
+WebArena evaluators require:
+1. Live browser state (DOM, cookies, localStorage, etc.)
+2. CDPSession object for making CDP calls
+3. Page object for accessing current URL, title, content
+
+OpenHands only provides:
+1. Action/observation pairs in text format
+2. No live browser state
+3. No CDP access during evaluation
+
+## The Solution: CDP Session Logging
+
+### Phase 1: Capture Browser State During Inference
+
+**Modify `openhands/runtime/browser/browser_env.py`:**
+
+```python
+class BrowserEnv:
+    def __init__(self, ...):
+        # ... existing code ...
+        self.cdp_logger = CDPSessionLogger() if should_log_cdp() else None
+
+    def step(self, action):
+        # ... existing action execution ...
+
+        # Log CDP state after each action
+        if self.cdp_logger:
+            self.cdp_logger.capture_state_snapshot(f"after_action_{action.action}")
+
+        # ... return observation ...
+
+    def close(self):
+        # Save final CDP session
+        if self.cdp_logger:
+            instance_id = get_current_instance_id()  # from evaluation context
+            self.cdp_logger.save_session(instance_id)
+```
+
+**Add CDP Logger Integration:**
+
+```python
+class CDPSessionLogger:
+    def attach_to_browsergym_env(self, env):
+        """Attach to BrowserGym environment's Playwright page."""
+        # Access the underlying Playwright page from BrowserGym
+        playwright_page = env.page  # or however BrowserGym exposes it
+        self.attach_to_page(playwright_page)
+
+    def capture_state_snapshot(self, trigger: str):
+        """Capture complete browser state using CDP."""
+        # DOM snapshot (key for WebArena evaluators)
+        dom_snapshot = self.cdp_session.send("DOMSnapshot.captureSnapshot", {
+            "computedStyles": [],
+            "includeDOMRects": True,
+            "includePaintOrder": True,
+        })
+
+        # All other state (cookies, localStorage, etc.)
+        # ... as shown in POC ...
+```
+
+### Phase 2: Mock Objects for Evaluation
+
+**Create Mock Page/CDPSession:**
+
+```python
+class MockCDPSession:
+    def __init__(self, saved_state):
+        self.saved_state = saved_state
+
+    def send(self, method: str, params=None):
+        """Return saved state instead of making live CDP calls."""
+        if method == "DOMSnapshot.captureSnapshot":
+            return self.saved_state["dom_snapshot"]
+        elif method == "Network.getAllCookies":
+            return self.saved_state["cookies"]
+        # ... handle all CDP methods WebArena uses ...
+
+class MockPage:
+    def __init__(self, saved_state):
+        self.saved_state = saved_state
+
+    def url(self): return self.saved_state["final_url"]
+    def title(self): return self.saved_state["final_title"]
+    def context(self): return MockBrowserContext(self.saved_state)
+    # ... implement all Page methods WebArena uses ...
+```
+
+### Phase 3: Updated Evaluation Script
+
+**Modify `eval_infer.py`:**
+
+```python
+def evaluate_with_official_webarena_harness(instance_data, config_file):
+    """Use official WebArena evaluators with saved CDP state."""
+
+    # Load saved CDP session
+    cdp_integration = WebArenaCDPIntegration()
+    mock_page, mock_client = cdp_integration.create_mock_page_and_client(
+        instance_data["instance_id"]
+    )
+
+    # Convert OpenHands trajectory to WebArena format
+    trajectory = convert_openhands_trajectory_to_webarena_format(instance_data)
+
+    # Use official WebArena evaluator with mock objects
+    evaluator = evaluator_router(config_file)
+    score = evaluator(
+        trajectory=trajectory,
+        config_file=config_file,
+        page=mock_page,        # Mock page with saved state
+        client=mock_client,    # Mock CDP session with saved state
+    )
+
+    return score
+```
+
+## Implementation Steps
+
+### Step 1: Integrate CDP Logger into BrowserEnv
+
+1. **Add CDP logging to `browser_env.py`:**
+   - Detect when running WebArena evaluation
+   - Attach CDP logger to BrowserGym's Playwright page
+   - Capture state snapshots after each action
+   - Save final session with instance ID
+
+2. **Environment variable setup:**
+   ```bash
+   export WEBARENA_CDP_LOGGING=true
+   export WEBARENA_CDP_SESSION_DIR=/tmp/cdp_sessions
+   ```
+
+### Step 2: Create Mock Objects
+
+1. **Implement `MockCDPSession`:**
+   - Handle all CDP methods WebArena evaluators use
+   - Return saved state instead of making live calls
+   - Support `DOMSnapshot.captureSnapshot`, `Network.getAllCookies`, etc.
+
+2. **Implement `MockPage`:**
+   - Provide saved URL, title, content
+   - Mock JavaScript evaluation with saved state
+   - Support element queries using DOM snapshot
+
+### Step 3: Update Evaluation Pipeline
+
+1. **Modify `run_infer.py`:**
+   - Enable CDP logging for WebArena tasks
+   - Ensure instance IDs are properly set
+   - Save CDP sessions to accessible location
+
+2. **Update `eval_infer.py`:**
+   - Load saved CDP sessions
+   - Create mock objects
+   - Use official WebArena evaluators
+   - Remove all heuristic evaluation logic
+
+### Step 4: Testing and Validation
+
+1. **Test with known tasks:**
+   - Run inference with CDP logging
+   - Verify CDP sessions are saved correctly
+   - Test evaluation with mock objects
+   - Compare results with expected outcomes
+
+2. **Validate DOM snapshot format:**
+   - Ensure saved DOM snapshots match WebArena expectations
+   - Test all CDP methods used by evaluators
+   - Verify JavaScript evaluation works correctly
+
+## Benefits of This Approach
+
+1. **✅ Uses Official WebArena Evaluation:** No heuristics or approximations
+2. **✅ Preserves Exact Browser State:** DOM, cookies, localStorage, etc.
+3. **✅ No Live Browser Needed:** Evaluation works offline with saved state
+4. **✅ Scalable:** Can evaluate many instances without browser overhead
+5. **✅ Accurate:** Evaluators get exactly the state they expect
+
+## File Structure
+
+```
+/tmp/cdp_sessions/
+├── webarena.1.json          # CDP session for task 1
+├── webarena.2.json          # CDP session for task 2
+├── webarena.3.json          # CDP session for task 3
+└── webarena.4.json          # CDP session for task 4
+
+evaluation/benchmarks/webarena/
+├── run_infer.py             # Modified to enable CDP logging
+├── eval_infer.py            # Uses mock objects with saved state
+├── cdp_integration.py       # Mock Page/CDPSession implementation
+└── IMPLEMENTATION_PLAN.md   # This document
+```
+
+## Next Steps
+
+1. **Implement CDP logger integration in `browser_env.py`**
+2. **Create comprehensive mock objects**
+3. **Update evaluation scripts**
+4. **Test with actual WebArena tasks**
+5. **Validate results against expected outcomes**
+
+This approach solves the fundamental problem: WebArena evaluators need live browser state, but OpenHands only provides action/observation pairs. By capturing and replaying the exact browser state, we can use the official WebArena evaluation harness without any compromises.
@@ -6,11 +6,21 @@ This folder contains evaluation for [WebArena](https://github.com/web-arena-x/we

 Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

+Make sure to install the evaluation dependencies:
+
+```bash
+poetry install --with evaluation
+```
+
 ## Setup WebArena Environment

-WebArena requires you to set up websites containing pre-populated content that is accessible via URL to the machine running the OpenHands agents.
-Follow [this document](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) to set up your own WebArena environment through local servers or AWS EC2 instances.
-Take note of the base URL (`$WEBARENA_BASE_URL`) of the machine where the environment is installed.
+WebArena requires access to websites containing pre-populated content. You can either:
+
+1. **Use an existing WebArena environment** (recommended for evaluation): Set the `WEBARENA_BASE_URL` environment variable to point to an existing WebArena server.
+
+2. **Set up your own environment**: Follow [this document](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) to set up your own WebArena environment through local servers or AWS EC2 instances.
+
+The WebArena evaluation package is already installed with the evaluation dependencies, so you don't need to clone the WebArena repository to run inference; the official repository is only needed for scoring (see "Step 2: Evaluate Results" below).

 ## Test if your environment works

@@ -21,20 +31,51 @@ Follow the WebArena environment setup guide carefully, and make sure the URL fie

 ## Run Evaluation

+### Step 1: Run Inference
+Before running, you must provide an LLM config in a local config.toml and pass its name to run_infer.sh:
+
+1) Create config.toml in the repo root (this file is gitignored):
+
+```toml
+[llm.eval_openai]
+model = "gpt-4o"
+api_key = "sk-..."   # Your OpenAI API key
+```
+
+2) Ensure Docker is installed and running (the first run will build a browser-enabled runtime image).
+
+
 ```bash
 export WEBARENA_BASE_URL=<YOUR_SERVER_URL_HERE>
 export OPENAI_API_KEY="yourkey" # this key is required for some WebArena validators that utilize LLMs
-bash evaluation/benchmarks/webarena/scripts/run_infer.sh
+# args: MODEL_CONFIG  COMMIT_HASH  AGENT  EVAL_LIMIT  NUM_WORKERS
+bash evaluation/benchmarks/webarena/scripts/run_infer.sh llm.eval_openai HEAD BrowsingAgent 3 1
 ```

 Results will be in `evaluation/evaluation_outputs/outputs/webarena/`

-To calculate the success rate, run:
+### Step 2: Evaluate Results

-```sh
-poetry run python evaluation/benchmarks/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
+To evaluate the results and calculate success rate using the official WebArena harness, you must have the official WebArena repo and its Python dependencies available locally:
+
+1) Clone the official repo and install deps (one-time):
+
+```bash
+cd /workspace/project
+git clone https://github.com/web-arena-x/webarena
+cd webarena && pip install -e .
 ```

+2) Then run the evaluator:
+
+```bash
+poetry run python evaluation/benchmarks/webarena/eval_infer.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
+```
+
+Notes:
+- The evaluator expects WEBARENA_BASE_URL to be set and the WebArena services to be reachable.
+- If you skip installing the official harness, you can still inspect output.jsonl manually or write your own scorer, but the script above will fail without the harness.
+
 ## Submit your evaluation results

 You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
@@ -0,0 +1,283 @@
+#!/usr/bin/env python3
+"""
+BrowserGym State Capture for WebArena Evaluation
+
+This module leverages BrowserGym's existing state capture capabilities to save
+browser state for proper WebArena evaluation. BrowserGym already provides:
+- extract_dom_snapshot() - exactly what WebArena evaluators need
+- Direct Playwright page access via env.page
+- CDP session access via page.context.new_cdp_session()
+
+This is much simpler than our original CDP logging approach because BrowserGym
+already has all the infrastructure we need.
+"""
+
+import json
+from pathlib import Path
+from typing import Any, Optional
+
+import browsergym.core.observation as obs
+
+
+class BrowserGymStateCapture:
+    """
+    Captures browser state using BrowserGym's existing observation functions.
+
+    One JSON file per instance ID is written below ``output_dir`` so the state
+    can be loaded again later without a live browser.
+    """
+
+    def __init__(self, output_dir: str = '/tmp/webarena_states'):
+        """
+        Prepare the output directory.
+
+        Args:
+            output_dir: Directory the state files are written to. It is created
+                (including parents) if it does not exist yet.
+        """
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        # Set through set_instance_id(); used as the base name of the state file.
+        self.current_instance_id: str | None = None
+
+    def set_instance_id(self, instance_id: str) -> None:
+        """Set the current WebArena instance ID for state saving."""
+        self.current_instance_id = instance_id
+
+    def capture_final_state(self, browsergym_env) -> dict[str, Any]:
+        """
+        Capture the final browser state using BrowserGym's observation functions.
+
+        Args:
+            browsergym_env: Environment object exposing the browser page as
+                ``browsergym_env.page``.
+
+        Returns:
+            A dict with the instance ID, final URL and title, DOM snapshot,
+            screenshot, accessibility tree, focused element, cookies and the
+            JSON-encoded localStorage / sessionStorage.
+
+        Raises:
+            RuntimeError: If the environment has no ``page`` attribute.
+        """
+        if not hasattr(browsergym_env, 'page'):
+            raise RuntimeError('BrowserGym environment does not have page attribute')
+
+        page = browsergym_env.page
+
+        # Use BrowserGym's existing observation extraction functions
+        state = {
+            'instance_id': self.current_instance_id,
+            'final_url': page.url,
+            'final_title': page.title(),
+            'dom_snapshot': obs.extract_dom_snapshot(page),
+            # Additional state that might be useful
+            'screenshot': obs.extract_screenshot(page),
+            'axtree': obs.extract_merged_axtree(page),
+            'focused_element': obs.extract_focused_element_bid(page),
+        }
+
+        # Get additional browser state via CDP. This part is best-effort: any
+        # failure falls back to empty defaults instead of aborting the capture.
+        try:
+            cdp_session = page.context.new_cdp_session(page)
+            try:
+                # Get cookies
+                state['cookies'] = cdp_session.send('Network.getAllCookies')
+
+                # Get localStorage
+                local_storage = cdp_session.send(
+                    'Runtime.evaluate',
+                    {
+                        'expression': 'JSON.stringify(localStorage)',
+                        'returnByValue': True,
+                    },
+                )
+                state['local_storage'] = local_storage.get('result', {}).get(
+                    'value', '{}'
+                )
+
+                # Get sessionStorage
+                session_storage = cdp_session.send(
+                    'Runtime.evaluate',
+                    {
+                        'expression': 'JSON.stringify(sessionStorage)',
+                        'returnByValue': True,
+                    },
+                )
+                state['session_storage'] = session_storage.get('result', {}).get(
+                    'value', '{}'
+                )
+            finally:
+                # Always release the session, even when one of the calls above
+                # raised; otherwise the CDP session stays attached to the page.
+                cdp_session.detach()
+
+        except Exception as e:
+            print(f'Warning: Could not capture additional state via CDP: {e}')
+            state['cookies'] = {'cookies': []}
+            state['local_storage'] = '{}'
+            state['session_storage'] = '{}'
+
+        return state
+
+    def save_state(self, browsergym_env) -> str:
+        """
+        Save the current browser state to disk.
+
+        Args:
+            browsergym_env: Environment passed on to capture_final_state().
+
+        Returns:
+            Path of the written ``<instance_id>.json`` file, as a string.
+
+        Raises:
+            RuntimeError: If no instance ID has been set.
+        """
+        if self.current_instance_id is None:
+            raise RuntimeError('Instance ID not set. Call set_instance_id() first.')
+
+        state = self.capture_final_state(browsergym_env)
+
+        # Save to file
+        state_file = self.output_dir / f'{self.current_instance_id}.json'
+        with open(state_file, 'w', encoding='utf-8') as f:
+            # NOTE(review): default=str stringifies every value json cannot
+            # encode. The screenshot is presumably an image array, so it would
+            # be stored as a lossy repr -- confirm whether it needs a real
+            # encoding (or should be dropped) before anything relies on it.
+            json.dump(state, f, indent=2, default=str)
+
+        print(f'✅ Saved browser state to: {state_file}')
+        return str(state_file)
+
+    def load_state(self, instance_id: str) -> dict[str, Any]:
+        """
+        Load saved browser state from disk.
+
+        Args:
+            instance_id: Instance whose ``<instance_id>.json`` file is read.
+
+        Returns:
+            The parsed state dict.
+
+        Raises:
+            FileNotFoundError: If no state file exists for ``instance_id``.
+        """
+        state_file = self.output_dir / f'{instance_id}.json'
+
+        if not state_file.exists():
+            raise FileNotFoundError(f'State file not found: {state_file}')
+
+        with open(state_file, 'r', encoding='utf-8') as f:
+            state = json.load(f)
+
+        return state
+
+
+class _CallableStr(str):
+    """A ``str`` that returns its own value when called."""
+
+    def __call__(self) -> str:
+        return str(self)
+
+
+class MockPageForWebArena:
+    """
+    Mock Page object that provides saved browser state for WebArena evaluation.
+
+    It serves the values stored in ``saved_state`` (final URL, final title,
+    localStorage, sessionStorage) in place of a live browser page.
+    """
+
+    def __init__(self, saved_state: dict[str, Any]):
+        """
+        Args:
+            saved_state: Previously captured state; ``final_url`` and
+                ``final_title`` default to an empty string when missing.
+        """
+        self.saved_state = saved_state
+        self._url = saved_state.get('final_url', '')
+        self._title = saved_state.get('final_title', '')
+        self._context = MockBrowserContextForWebArena(saved_state)
+
+    @property
+    def url(self) -> _CallableStr:
+        """
+        The saved final URL.
+
+        The real page object is read as an attribute (``page.url``), so the mock
+        has to behave like a string on plain attribute access. The returned
+        string is also callable, which keeps existing ``page.url()`` calls
+        working.
+        """
+        return _CallableStr(self._url)
+
+    def title(self) -> str:
+        """Return the saved final page title."""
+        return self._title
+
+    @property
+    def context(self):
+        """The mock browser context built from the same saved state."""
+        return self._context
+
+    def evaluate(self, expression: str) -> Any:
+        """
+        Mock JavaScript evaluation using saved state.
+
+        The expression is not executed; it is matched by substring, in this
+        order: ``window.location.href``, ``document.title``, ``localStorage``,
+        ``sessionStorage``. Anything else yields ``None``.
+        """
+        if 'window.location.href' in expression:
+            return self._url
+        elif 'document.title' in expression:
+            return self._title
+        elif 'localStorage' in expression:
+            return self.saved_state.get('local_storage', '{}')
+        elif 'sessionStorage' in expression:
+            return self.saved_state.get('session_storage', '{}')
+        return None
+
+
+class MockCDPSessionForWebArena:
+    """
+    Stand-in for a CDP session that answers from previously saved state.
+    No browser is contacted; every reply is looked up in ``saved_state``.
+    """
+
+    def __init__(self, saved_state: dict[str, Any]):
+        self.saved_state = saved_state
+
+    def send(self, method: str, params: Optional[dict] = None) -> dict[str, Any]:
+        """
+        Answer a CDP call from the saved state.
+
+        Handled methods are ``DOMSnapshot.captureSnapshot``,
+        ``Network.getAllCookies`` and ``Runtime.evaluate``. For the latter the
+        expression is matched by substring, not executed. Every other call
+        yields an empty dict.
+        """
+        saved = self.saved_state
+
+        if method == 'DOMSnapshot.captureSnapshot':
+            return saved.get('dom_snapshot', {})
+
+        if method == 'Network.getAllCookies':
+            return saved.get('cookies', {'cookies': []})
+
+        if method != 'Runtime.evaluate' or not params or 'expression' not in params:
+            return {}
+
+        script = params['expression']
+        # (substring looked for, saved_state key, fallback) -- first match wins.
+        lookups = (
+            ('localStorage', 'local_storage', '{}'),
+            ('sessionStorage', 'session_storage', '{}'),
+            ('window.location.href', 'final_url', ''),
+            ('document.title', 'final_title', ''),
+        )
+        for needle, key, fallback in lookups:
+            if needle in script:
+                return {'result': {'value': saved.get(key, fallback)}}
+
+        return {}
+
+    def detach(self):
+        """No-op; there is no real session to release."""
+        pass
+
+
+class MockBrowserContextForWebArena:
+    """Browser-context stand-in that hands out mock CDP sessions."""
+
+    def __init__(self, saved_state: dict[str, Any]):
+        self.saved_state = saved_state
+
+    def new_cdp_session(self, page) -> MockCDPSessionForWebArena:
+        """Build a mock CDP session over the saved state; ``page`` is not used."""
+        session = MockCDPSessionForWebArena(self.saved_state)
+        return session
+
+
+def integrate_with_openhands_browser_env():
+    """
+    Return an illustrative code snippet, as a string, showing where state
+    capture could be hooked into the BrowserGym loop of browser_env.py.
+    The snippet is documentation only; nothing in it is executed here.
+    """
+
+    # This would be added to browser_env.py in the browser_process method
+    return """
+    def browser_process(self) -> None:
+        env = gym.make('browsergym/openended', ...)
+        obs, info = env.reset()
+
+        # Add state capture for WebArena evaluation
+        state_capture = None
+        if os.getenv('WEBARENA_EVALUATION'):
+            state_capture = BrowserGymStateCapture()
+
+        while should_continue():
+            if self.browser_side.poll(timeout=0.01):
+                unique_request_id, action_data = self.browser_side.recv()
+
+                # Handle WebArena instance ID setting
+                if unique_request_id == 'SET_WEBARENA_INSTANCE':
+                    if state_capture:
+                        state_capture.set_instance_id(action_data['instance_id'])
+                    continue
+
+                action = action_data['action']
+                obs, reward, terminated, truncated, info = env.step(action)
+
+                # Capture final state when task completes
+                if terminated and state_capture:
+                    state_capture.save_state(env)
+
+                # ... rest of existing code ...
+    """
+
+
+def demonstrate_integration():
+    """Print a short overview of the approach and its integration steps."""
+    overview = (
+        '🚀 BrowserGym State Capture for WebArena',
+        '=' * 50,
+        '✅ Key advantages of this approach:',
+        "   1. Uses BrowserGym's existing observation functions",
+        '   2. extract_dom_snapshot() already returns WebArena-compatible format',
+        '   3. No custom CDP logging needed',
+        '   4. Minimal changes to OpenHands browser_env.py',
+        '   5. Leverages existing, tested BrowserGym infrastructure',
+        '\n📋 Integration steps:',
+        '   1. Add BrowserGymStateCapture to browser_env.py',
+        '   2. Capture state when WebArena tasks complete',
+        '   3. Use MockPageForWebArena and MockCDPSessionForWebArena in eval_infer.py',
+        '   4. Official WebArena evaluators work with mock objects',
+        '\n🎯 This is much simpler than custom CDP logging because',
+        '   BrowserGym already provides everything we need!',
+    )
+    # One print call per entry, in order.
+    for message in overview:
+        print(message)
+
+
+if __name__ == '__main__':
+    demonstrate_integration()
@@ -0,0 +1,359 @@
+#!/usr/bin/env python3
+"""
+WebArena evaluation script for OpenHands outputs using official WebArena evaluation harness.
+This script evaluates the results from run_infer.py using the official WebArena evaluation code.
+
+This script requires:
+1. Official WebArena repository cloned to /workspace/project/webarena
+2. WebArena environment variables properly configured
+3. Authentication files set up for WebArena sites
+4. Docker containers running for WebArena sites
+"""
+
+import argparse
+import json
+import os
+import sys
+from typing import Any
+
+# Set up environment variables for WebArena: every site is addressed as
+# "<base url>:<port>" and published under its own variable name.
+WEBARENA_BASE_URL = os.environ.get('WEBARENA_BASE_URL', '')
+if WEBARENA_BASE_URL:
+    # Drop a trailing slash so "http://host/" does not turn into "http://host/:9999"
+    _site_root = WEBARENA_BASE_URL.rstrip('/')
+    for _site_var, _site_port in (
+        ('REDDIT', 9999),
+        ('SHOPPING', 7770),
+        ('SHOPPING_ADMIN', 7780),
+        ('GITLAB', 8023),
+        ('WIKIPEDIA', 8888),
+        ('MAP', 3000),
+        ('HOMEPAGE', 4399),
+    ):
+        os.environ[_site_var] = f'{_site_root}:{_site_port}'
+
+# Add the webarena path to sys.path to import its modules
+WEBARENA_PATH = '/workspace/project/webarena'
+sys.path.insert(0, WEBARENA_PATH)
+
+try:
+    from browser_env import ScriptBrowserEnv, create_stop_action
+    from browser_env.actions import Action
+    from browser_env.utils import StateInfo
+    from evaluation_harness import evaluator_router
+
+    print('✅ WebArena evaluation harness imported successfully')
+except ImportError as e:
+    print(f'❌ Failed to import WebArena evaluation harness: {e}')
+    print('Make sure the WebArena repository is cloned to /workspace/project/webarena')
+    print('and all dependencies are installed.')
+    sys.exit(1)
+
+
+def load_config_file(config_path: str) -> dict[str, Any]:
+    """Load a WebArena task config file.
+
+    Args:
+        config_path: Path to the JSON config file.
+
+    Returns:
+        The parsed JSON content of the file.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        json.JSONDecodeError: If the file does not contain valid JSON.
+    """
+    # JSON is UTF-8 by specification; do not rely on the platform's locale
+    # encoding, which would garble or reject non-ASCII task text.
+    with open(config_path, encoding='utf-8') as f:
+        return json.load(f)
+
+
+def convert_openhands_action_to_webarena(action_data: dict[str, Any]) -> Action:
+    """Map one OpenHands action record onto a WebArena ``Action``.
+
+    ``action_data['action']`` selects the mapping and ``action_data['args']``
+    supplies its values. A ``browse`` record without a URL, and any action
+    name not handled here, yields the ``none`` action.
+    """
+    kind = action_data.get('action', '')
+    params = action_data.get('args', {})
+
+    if kind == 'browse':
+        target = params.get('url', '')
+        if target:
+            return Action(action_type='goto', coordinate=[0, 0], text=target)
+        # An empty URL is handled like an unknown action (fallback at the end)
+
+    if kind == 'click':
+        return Action(action_type='click', coordinate=params.get('coordinate', [0, 0]))
+
+    if kind == 'type':
+        return Action(action_type='type', text=params.get('text', ''), coordinate=[0, 0])
+
+    if kind == 'key':
+        return Action(action_type='key', text=params.get('key', ''), coordinate=[0, 0])
+
+    if kind == 'scroll':
+        position = params.get('coordinate', [0, 0])
+        heading = params.get('direction', 'down')
+        return Action(action_type='scroll', coordinate=position, text=heading)
+
+    if kind == 'finish':
+        return create_stop_action('')
+
+    # Nothing matched: emit the placeholder action
+    return Action(action_type='none', coordinate=[0, 0])
+
+
+def convert_openhands_trajectory_to_webarena_format(
+    openhands_output: dict[str, Any],
+) -> list[Any]:
+    """
+    Turn an OpenHands result record into a WebArena-style trajectory.
+
+    Input:  ``openhands_output['history']`` holds [action, observation] pairs.
+    Output: a list that starts with a placeholder StateInfo and then, for each
+            pair, appends the converted Action followed by a StateInfo built
+            from the observation's ``content`` and ``extras``.
+
+    Entries with fewer than two elements are ignored.
+    """
+    # The trajectory always opens with a synthetic initial state
+    steps: list[Any] = [
+        StateInfo(
+            observation={'text': 'Initial state'}, info={'observation_metadata': {}}
+        )
+    ]
+
+    for pair in openhands_output.get('history', []):
+        if len(pair) < 2:
+            continue
+
+        raw_action, raw_observation = pair[0], pair[1]
+
+        # Action first, then the state that resulted from it
+        steps.append(convert_openhands_action_to_webarena(raw_action))
+        steps.append(
+            StateInfo(
+                observation={'text': raw_observation.get('content', '')},
+                info={'observation_metadata': raw_observation.get('extras', {})},
+            )
+        )
+
+    return steps
+
+
+def _is_webarena_action(step: Any) -> bool:
+    """Tell replayable actions apart from state snapshots in a trajectory."""
+    try:
+        return isinstance(step, Action)
+    except TypeError:
+        # NOTE(review): isinstance() is not supported when ``Action`` is a
+        # TypedDict; fall back to the key that only actions carry. Confirm
+        # against the installed browser_env package.
+        return isinstance(step, dict) and 'action_type' in step
+
+
+def evaluate_with_official_webarena_harness(
+    instance_data: dict[str, Any], config_file_path: str
+) -> dict[str, Any]:
+    """
+    Evaluate a single WebArena instance using the official evaluation harness.
+
+    This function:
+    1. Converts OpenHands trajectory to WebArena format
+    2. Sets up a browser environment
+    3. Replays the trajectory to reach the final state
+    4. Runs the official WebArena evaluator
+
+    Args:
+        instance_data: One record of the OpenHands output file.
+        config_file_path: Path of the WebArena task config for this instance.
+
+    Returns:
+        A result dict with ``instance_id``, ``score`` and ``success``. On
+        success it also carries ``trajectory_length``, ``evaluator``,
+        ``evaluation_type`` and ``intent``; on any failure it carries
+        ``error`` and the score is 0.0. Exceptions are reported in the result
+        rather than propagated.
+    """
+
+    instance_id = instance_data.get('instance_id', 'unknown')
+    print(f'\n🔍 Evaluating instance: {instance_id}')
+
+    try:
+        # Load config to understand the task
+        config_data = load_config_file(config_file_path)
+        intent = config_data.get('intent', '')
+        start_url = config_data.get('start_url', '')
+
+        print(f'   Task: {intent}')
+        print(f'   Start URL: {start_url}')
+
+        # Convert OpenHands trajectory to WebArena format
+        trajectory = convert_openhands_trajectory_to_webarena_format(instance_data)
+        print(f'   Converted trajectory with {len(trajectory)} steps')
+
+        # Get the evaluator for this config
+        evaluator = evaluator_router(config_file_path)
+        print(f'   Using evaluator: {type(evaluator).__name__}')
+
+        # Create browser environment for evaluation
+        env = ScriptBrowserEnv(
+            headless=True,
+            slow_mo=0,
+            observation_type='accessibility_tree',
+            current_viewport_only=True,
+            viewport_size={'width': 1280, 'height': 720},
+        )
+
+        try:
+            # Initialize the environment with the task
+            env.reset(options={'config_file': config_file_path})
+
+            # Replay the trajectory to reach the final state
+            # This is necessary because the evaluator needs the actual browser state
+            for i, step in enumerate(trajectory):
+                if not _is_webarena_action(step):
+                    continue
+                try:
+                    # Accept both step() shapes: (obs, reward, done, info) and
+                    # (obs, reward, terminated, truncated, info). A fixed
+                    # 4-name unpack fails on the latter *after* the action has
+                    # run, so every step would be reported as an error and
+                    # the termination flag would never be seen.
+                    _obs, _reward, done, *_rest = env.step(step)
+                except Exception as e:
+                    # Best effort: a failing step must not abort the replay
+                    print(f'   Warning: Error replaying step {i}: {e}')
+                    continue
+                if done:
+                    break
+
+            # Run the official evaluation
+            score = evaluator(
+                trajectory=trajectory,
+                config_file=config_file_path,
+                page=env.page,
+                client=env.page.context.new_cdp_session(env.page),
+            )
+
+            result = {
+                'instance_id': instance_id,
+                'score': score,
+                'success': score == 1.0,
+                'trajectory_length': len(trajectory),
+                'evaluator': type(evaluator).__name__,
+                'evaluation_type': 'official_webarena_harness',
+                'intent': intent,
+            }
+
+            print(
+                f'   Result: {"✅ PASS" if score == 1.0 else "❌ FAIL"} (score: {score})'
+            )
+            return result
+
+        finally:
+            # Always release the browser, also when replay or scoring failed
+            env.close()
+
+    except Exception as e:
+        print(f'   ❌ Error evaluating {instance_id}: {e}')
+        return {
+            'instance_id': instance_id,
+            'score': 0.0,
+            'success': False,
+            'error': str(e),
+            'evaluator': 'error',
+            'evaluation_type': 'error',
+        }
+
+
+def main():
+    """Evaluate every instance of an OpenHands output file with the official harness.
+
+    Parses the command line, pairs each result with ``<config_dir>/<task>.json``,
+    scores it via ``evaluate_with_official_webarena_harness``, writes the
+    aggregate report to ``--results_file`` and prints a summary. Instances
+    without a config file are recorded with score 0.0. Exits with status 1
+    when ``WEBARENA_BASE_URL`` is not set.
+    """
+    parser = argparse.ArgumentParser(
+        description='Evaluate WebArena results using ONLY the official WebArena evaluation harness'
+    )
+    parser.add_argument(
+        'output_file', type=str, help='Path to OpenHands output.jsonl file'
+    )
+    parser.add_argument(
+        '--results_file',
+        type=str,
+        default='webarena_official_eval_results.json',
+        help='Path to save evaluation results',
+    )
+    parser.add_argument(
+        '--config_dir',
+        type=str,
+        default='/workspace/project/webarena/config_files/examples',
+        help='Directory containing WebArena config files',
+    )
+
+    args = parser.parse_args()
+
+    print('🚀 Starting WebArena Evaluation with Official WebArena Harness ONLY')
+    print(f'📁 Output file: {args.output_file}')
+    print(f'📁 Config directory: {args.config_dir}')
+
+    # Verify WebArena environment is properly set up
+    if not WEBARENA_BASE_URL:
+        print('❌ WEBARENA_BASE_URL environment variable not set')
+        print('Please set WEBARENA_BASE_URL to your WebArena server URL')
+        sys.exit(1)
+
+    print(f'🌐 WebArena base URL: {WEBARENA_BASE_URL}')
+
+    # Load OpenHands results (JSON Lines; blank lines are skipped)
+    with open(args.output_file, encoding='utf-8') as f:
+        results = [json.loads(line) for line in f if line.strip()]
+
+    print(f'📊 Found {len(results)} instances to evaluate')
+
+    # Evaluate each instance using ONLY official WebArena evaluation harness
+    evaluation_results = []
+    total_score = 0.0
+
+    for result in results:
+        instance_id = result.get('instance_id', 'unknown')
+
+        # Find corresponding config file.
+        # Accept either plain numeric id ("8") or legacy prefixed id ("webarena.8");
+        # str() keeps ids that were serialized as JSON numbers from crashing .split().
+        task_num = str(instance_id).split('.')[-1]
+        config_file = f'{args.config_dir}/{task_num}.json'
+
+        if os.path.exists(config_file):
+            eval_result = evaluate_with_official_webarena_harness(result, config_file)
+            evaluation_results.append(eval_result)
+            total_score += eval_result.get('score', 0.0)
+        else:
+            print(f'\n🔍 Evaluating instance: {instance_id}')
+            print(f'   ⚠️  Config file not found: {config_file}')
+            evaluation_results.append(
+                {
+                    'instance_id': instance_id,
+                    'score': 0.0,
+                    'success': False,
+                    'error': f'Config file not found: {config_file}',
+                    'evaluation_type': 'config_error',
+                }
+            )
+
+    # Calculate final metrics (guarded against an empty input file)
+    total_instances = len(evaluation_results)
+    success_count = sum(1 for r in evaluation_results if r.get('success', False))
+    success_rate = success_count / total_instances if total_instances > 0 else 0.0
+    average_score = total_score / total_instances if total_instances > 0 else 0.0
+
+    # Save results
+    final_results = {
+        'evaluation_method': 'official_webarena_harness_only',
+        'webarena_base_url': WEBARENA_BASE_URL,
+        'total_instances': total_instances,
+        'success_count': success_count,
+        'success_rate': success_rate,
+        'average_score': average_score,
+        'individual_results': evaluation_results,
+    }
+
+    with open(args.results_file, 'w', encoding='utf-8') as f:
+        json.dump(final_results, f, indent=2)
+
+    # Print summary
+    print('\n' + '=' * 70)
+    print('🎯 WEBARENA EVALUATION RESULTS (Official Harness ONLY)')
+    print('=' * 70)
+    print(f'📊 Total instances: {total_instances}')
+    print(f'✅ Successful: {success_count}')
+    print(f'❌ Failed: {total_instances - success_count}')
+    print(f'📈 Success rate: {success_rate:.2%}')
+    print(f'📊 Average score: {average_score:.4f}')
+    print(f'💾 Results saved to: {args.results_file}')
+    print('=' * 70)
+
+    # Print individual results
+    print('\n📋 Individual Results:')
+    for result in evaluation_results:
+        status = '✅ PASS' if result.get('success', False) else '❌ FAIL'
+        score = result.get('score', 0.0)
+        instance_id = result.get('instance_id', 'unknown')
+        evaluator = result.get('evaluator', 'unknown')
+        error = result.get('error', '')
+        if error:
+            print(f'   {instance_id}: {status} (score: {score:.2f}) - Error: {error}')
+        else:
+            print(
+                f'   {instance_id}: {status} (score: {score:.2f}) - Evaluator: {evaluator}'
+            )
+
+    # Print requirements if there were errors
+    error_count = sum(1 for r in evaluation_results if r.get('error'))
+    if error_count > 0:
+        print('\n' + '⚠️' * 20)
+        print('EVALUATION ERRORS DETECTED')
+        print('⚠️' * 20)
+        print('This evaluation requires:')
+        print('1. WebArena Docker containers running and accessible')
+        print('2. Authentication files (.auth/) properly set up')
+        print('3. All WebArena dependencies installed')
+        print('4. Proper network access to WebArena sites')
+        print('\nPlease resolve these issues for accurate evaluation.')
+        print('⚠️' * 20)
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+"""
+WebArena Evaluation Script
+
+This script evaluates WebArena task results using the official WebArena evaluation harness
+with BrowserGym state capture. It loads saved browser state and creates mock objects
+that provide the exact state WebArena evaluators need.
+
+This approach leverages BrowserGym's existing observation functions (extract_dom_snapshot, etc.)
+which already provide WebArena-compatible state capture.
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Any
+
+# Add WebArena to path
+sys.path.insert(0, '/workspace/project/webarena')
+
+
+def _iter_history_events(history):
+    """Yield event dicts from a flat event list or from [action, observation] pairs."""
+    for entry in history:
+        if isinstance(entry, dict):
+            yield entry
+        elif isinstance(entry, (list, tuple)):
+            # Pair-style history: look at each member of the pair in order
+            for event in entry:
+                if isinstance(event, dict):
+                    yield event
+
+
+def _event_content(event):
+    """Return the event's message text; tolerate a plain-string or missing message."""
+    message = event.get('message')
+    if isinstance(message, dict):
+        return message.get('content', '')
+    if isinstance(message, str):
+        return message
+    return ''
+
+
+def convert_openhands_trajectory_to_webarena_format(
+    instance_data: dict[str, Any],
+) -> list[Any]:
+    """
+    Convert OpenHands trajectory format to WebArena trajectory format.
+
+    WebArena expects a list of alternating Action and StateInfo objects.
+    OpenHands provides action/observation pairs in text format; this function
+    emits plain dicts standing in for those objects.
+
+    ``instance_data['history']`` may be a flat list of event dicts or a list
+    of [action, observation] pairs. Events whose ``source`` is ``'agent'``
+    become action dicts, events whose ``source`` is ``'user'`` become state
+    dicts, and everything else is ignored. When the last emitted item is a
+    state, a closing ``stop`` action is appended.
+
+    Returns:
+        The trajectory as a list of dicts (empty when there is no history).
+    """
+    trajectory = []
+
+    for event in _iter_history_events(instance_data.get('history', [])):
+        source = event.get('source')
+
+        if source == 'agent':
+            # This is an agent action
+            trajectory.append(
+                {
+                    'action_type': 'browser_action',
+                    'content': _event_content(event),
+                    'timestamp': event.get('timestamp', 0),
+                }
+            )
+
+        elif source == 'user':
+            # This might be an observation or state info
+            trajectory.append(
+                {
+                    'observation': _event_content(event),
+                    'timestamp': event.get('timestamp', 0),
+                }
+            )
+
+    # Add a final stop action if needed
+    if trajectory and not trajectory[-1].get('action_type'):
+        last_timestamp = trajectory[-1].get('timestamp', 0)
+        # Only numeric timestamps can be advanced; any other kind (e.g. an
+        # ISO-8601 string) is reused unchanged instead of raising TypeError.
+        if isinstance(last_timestamp, (int, float)):
+            stop_timestamp = last_timestamp + 1
+        else:
+            stop_timestamp = last_timestamp
+        trajectory.append(
+            {
+                'action_type': 'stop',
+                'content': 'Task completed',
+                'timestamp': stop_timestamp,
+            }
+        )
+
+    return trajectory
+
+
+def evaluate_with_browsergym_state_capture(
+    instance_data: dict[str, Any], config_file: str
+) -> float:
+    """
+    Score one instance with the official WebArena evaluator, using saved state.
+
+    The browser state stored for the instance is loaded and wrapped in mock
+    Page/CDPSession objects, which are passed to the evaluator together with
+    the converted trajectory.
+
+    Returns the evaluator's score. Returns 0.0 when the required modules
+    cannot be imported, when no saved state exists for the instance, or when
+    anything else fails (the traceback is printed in that last case).
+    """
+    try:
+        # Both imports are deferred so a missing dependency is reported per
+        # instance instead of failing at module import time
+        from browsergym_state_capture import (
+            BrowserGymStateCapture,
+            MockCDPSessionForWebArena,
+            MockPageForWebArena,
+        )
+        from evaluation_harness import evaluator_router
+
+        task_id = instance_data.get('instance_id', 'unknown')
+        capture = BrowserGymStateCapture()
+
+        # Fetch the state captured during inference
+        try:
+            snapshot = capture.load_state(task_id)
+        except FileNotFoundError:
+            print(f'   ❌ No saved browser state found for {task_id}')
+            print('      Make sure inference was run with browser_logging_dir enabled')
+            return 0.0
+        else:
+            print(f'   ✅ Loaded browser state for {task_id}')
+
+        # Stand-ins for the live browser objects, backed by the snapshot
+        page_stub = MockPageForWebArena(snapshot)
+        cdp_stub = MockCDPSessionForWebArena(snapshot)
+
+        steps = convert_openhands_trajectory_to_webarena_format(instance_data)
+        official_evaluator = evaluator_router(config_file)
+
+        # The evaluator only ever sees the mock objects
+        return official_evaluator(
+            trajectory=steps,
+            config_file=config_file,
+            page=page_stub,
+            client=cdp_stub,
+        )
+
+    except ImportError as e:
+        print(f'   ❌ Could not import BrowserGym state capture: {e}')
+        print('      Make sure browsergym_state_capture.py is available')
+        return 0.0
+    except Exception as e:
+        print(f'   ❌ Evaluation failed: {e}')
+        import traceback
+
+        traceback.print_exc()
+        return 0.0
+
+
+def main():
+    """Main evaluation function.
+
+    Usage: ``python eval_infer.py <output_file>``. Reads the JSON Lines output
+    file, looks up each instance's config in the WebArena examples directory,
+    scores it with ``evaluate_with_browsergym_state_capture`` and prints a
+    summary. Instances without a config file are skipped. Exits with status 1
+    on a wrong argument count or a missing output file.
+    """
+    if len(sys.argv) != 2:
+        print('Usage: python eval_infer.py <output_file>')
+        sys.exit(1)
+
+    output_file = sys.argv[1]
+
+    if not os.path.exists(output_file):
+        print(f'❌ Output file not found: {output_file}')
+        sys.exit(1)
+
+    print('🔍 WebArena Evaluation (BrowserGym State Capture)')
+    print('=' * 60)
+
+    # Load results; blank lines (e.g. a trailing newline) are not JSON records
+    with open(output_file, encoding='utf-8') as f:
+        results = [json.loads(line) for line in f if line.strip()]
+
+    print(f'📊 Evaluating {len(results)} WebArena tasks...')
+
+    # WebArena config files
+    config_dir = Path('/workspace/project/webarena/config_files/examples')
+
+    total_score = 0
+    evaluated_count = 0
+
+    for result in results:
+        instance_id = result.get('instance_id', 'unknown')
+
+        # Find corresponding config file: exact id first, then the trailing
+        # component of a legacy prefixed id ("webarena.8" -> "8.json")
+        config_file = config_dir / f'{instance_id}.json'
+        if not config_file.exists():
+            task_num = str(instance_id).split('.')[-1]
+            config_file = config_dir / f'{task_num}.json'
+
+        if not config_file.exists():
+            print(f'⚠️  Config file not found for {instance_id}')
+            continue
+
+        print(f'\n🧪 Evaluating {instance_id}...')
+
+        try:
+            # Use official WebArena evaluation with BrowserGym state capture
+            score = evaluate_with_browsergym_state_capture(result, str(config_file))
+
+            print(f'   Score: {score}')
+            total_score += score
+            evaluated_count += 1
+
+        except Exception as e:
+            # One failing instance must not stop the remaining evaluations
+            print(f'   ❌ Evaluation failed: {e}')
+
+    if evaluated_count > 0:
+        average_score = total_score / evaluated_count
+        print('\n📈 Results Summary:')
+        print(f'   Tasks evaluated: {evaluated_count}')
+        print(f'   Total score: {total_score}')
+        print(f'   Average score: {average_score:.3f}')
+        print(
+            f'   Pass rate: {total_score}/{evaluated_count} ({100 * total_score / evaluated_count:.1f}%)'
+        )
+    else:
+        print('\n❌ No tasks could be evaluated')
+
+    print('\n🎯 Evaluation Method:')
+    print('   - Uses official WebArena evaluation harness')
+    print('   - Loads browser state captured by BrowserGym during inference')
+    print('   - Creates mock Page/CDPSession objects with exact browser state')
+    print('   - WebArena evaluators get the exact state they need')
+
+    print('\n💡 To enable browser state capture during inference:')
+    print('   export WEBARENA_BROWSER_LOGGING_DIR=/tmp/webarena_states')
+
+
+if __name__ == '__main__':
+    main()
@@ -1,33 +0,0 @@
-import argparse
-import json
-
-import browsergym.webarena  # noqa F401 register webarena tasks as gym environments
-import gymnasium as gym
-
-parser = argparse.ArgumentParser(description='Calculate average reward.')
-parser.add_argument('output_path', type=str, help='path to output.jsonl')
-
-args = parser.parse_args()
-
-if __name__ == '__main__':
-    env_ids = [
-        id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
-    ]
-    total_num = len(env_ids)
-    print('Total number of tasks: ', total_num)
-    total_reward = 0
-    total_cost = 0
-    actual_num = 0
-    with open(args.output_path, 'r') as f:
-        for line in f:
-            data = json.loads(line)
-            actual_num += 1
-            total_cost += data['metrics']['accumulated_cost']
-            total_reward += data['test_result']
-
-    avg_reward = total_reward / total_num
-    print('Success Rate: ', avg_reward)
-
-    avg_cost = total_cost / actual_num
-    print('Avg Cost: ', avg_cost)
-    print('Actual number of tasks finished: ', actual_num)
@@ -1,18 +1,17 @@
 import asyncio
-import json
 import os
 from typing import Any

-import browsergym.webarena  # noqa F401 register webarena tasks as gym environments
-import gymnasium as gym
 import pandas as pd

 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    codeact_user_response,
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -22,29 +21,32 @@ from openhands.controller.state.state import State
 from openhands.core.config import (
    OpenHandsConfig,
    get_llm_config_arg,
-    parse_arguments,
 )
+from openhands.core.config.arg_utils import get_evaluation_parser
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import (
-    BrowseInteractiveAction,
    CmdRunAction,
    MessageAction,
 )
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.base import Runtime
-from openhands.runtime.browser.browser_env import (
-    BROWSER_EVAL_GET_GOAL_ACTION,
-    BROWSER_EVAL_GET_REWARDS_ACTION,
-)
 from openhands.utils.async_utils import call_async_from_sync

-SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
+SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
+
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'BrowsingAgent': codeact_user_response,
+}
+
+# Global variable to store task configs
+TASK_CONFIGS = {}


 def get_config(
    metadata: EvalMetadata,
-    env_id: str,
+    task_config: dict,
 ) -> OpenHandsConfig:
    base_url = os.environ.get('WEBARENA_BASE_URL', None)
    openai_api_key = os.environ.get('OPENAI_API_KEY', None)
@@ -53,7 +55,7 @@ def get_config(

    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'
-    sandbox_config.browsergym_eval_env = env_id
+    # Remove browsergym_eval_env dependency - we'll use regular browser environment
    sandbox_config.runtime_startup_env_vars = {
        'BASE_URL': base_url,
        'OPENAI_API_KEY': openai_api_key,
@@ -65,15 +67,11 @@ def get_config(
        'MAP': f'{base_url}:3000',
        'HOMEPAGE': f'{base_url}:4399',
    }
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
+        enable_browser=True,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
@@ -81,30 +79,59 @@ def get_config(
    return config


+def get_instruction(task_config: dict) -> MessageAction:
+    """Build the initial user message for the agent from a task config.
+
+    Reads ``intent`` (default ``'Complete the task'``) and ``start_url``
+    (default ``'about:blank'``) from ``task_config`` and returns a
+    ``MessageAction`` whose content states the task, the page to open first
+    and general guidance on using the browser tool.
+    """
+    goal = task_config.get('intent', 'Complete the task')
+    landing_url = task_config.get('start_url', 'about:blank')
+
+    # start_url is used verbatim.
+    # NOTE(review): presumably any URL placeholder substitution has already
+    # happened upstream (BrowserGym WebArena) -- confirm against the loader.
+    paragraphs = (
+        f'You are a web browsing agent. Your task is: {goal}',
+        f'Please start by navigating to: {landing_url}',
+        'Complete the task by interacting with the webpage as needed. Use the browser tool to navigate, click, fill forms, and perform other web interactions to accomplish the goal.',
+    )
+
+    # Paragraphs are separated by one blank line each
+    return MessageAction(content='\n\n'.join(paragraphs))
+
+
 def initialize_runtime(
    runtime: Runtime,
-) -> dict:
+    task_config: dict,
+) -> None:
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.
+    Also performs initial navigation to the task's start_url because USE_NAV is disabled during evaluation.
    """
    logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
    obs: CmdOutputObservation

-    # Set instance id
+    # Ensure workspace exists
    action = CmdRunAction(command='mkdir -p /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    assert obs.exit_code == 0

-    action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    goal = obs.content
+    # Navigate to the configured start_url so the page is ready for the agent
+    try:
+        from openhands.events.action import BrowseInteractiveAction
+
+        start_url = task_config.get('start_url')
+        if start_url:
+            browse_action = BrowseInteractiveAction(
+                browser_actions=f'goto("{start_url}")',
+                return_axtree=True,
+            )
+            runtime.browse_interactive(browse_action)
+        else:
+            logger.warning(
+                'No start_url found in task_config; skipping initial navigation'
+            )
+    except Exception as e:
+        logger.error(f'Failed to perform initial navigation: {e}')

    logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
-    return goal


 def complete_runtime(
@@ -112,22 +139,40 @@ def complete_runtime(
 ) -> dict[str, Any]:
    """Complete the runtime for the agent.

-    This function is called before the runtime is used to run the agent.
-    If you need to do something in the sandbox to get the correctness metric after
-    the agent has run, modify this function.
+    This function is called after the agent has run.
+    Since we're using the official webarena evaluation, we don't need to get rewards here.
    """
    logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
-    obs: CmdOutputObservation

-    action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    # Capture the final accessibility tree for WebArena evaluation
+    try:
+        # Create a browser action to get the current page state with accessibility tree
+        from openhands.events.action import BrowseInteractiveAction

-    logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
-    return {
-        'rewards': json.loads(obs.content),
-    }
+        # Use a no-op action that returns the accessibility tree
+        final_browse_action = BrowseInteractiveAction(
+            browser_actions='noop()',  # No-op action to just get current state
+            return_axtree=True,  # Ensure we get the accessibility tree
+        )
+
+        # Execute the action to get the final observation with accessibility tree
+        final_obs = runtime.browse_interactive(final_browse_action)
+
+        # Extract the accessibility tree from the observation
+        final_axtree = None
+        if hasattr(final_obs, 'axtree_object') and final_obs.axtree_object:
+            final_axtree = final_obs.axtree_object
+            logger.info('Successfully captured final accessibility tree')
+        else:
+            logger.warning('No accessibility tree found in final observation')
+
+        logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
+        return {'final_accessibility_tree': final_axtree}
+
+    except Exception as e:
+        logger.error(f'Error capturing final accessibility tree: {e}')
+        logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
+        return {'final_accessibility_tree': None}


 def process_instance(
@@ -135,31 +180,34 @@ def process_instance(
    metadata: EvalMetadata,
    reset_logger: bool = True,
 ):
-    env_id = instance.instance_id
-    config = get_config(metadata, env_id)
+    task_id = instance.instance_id
+    task_config = TASK_CONFIGS.get(task_id, {})
+    config = get_config(metadata, task_config)

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
-        reset_logger_for_multiprocessing(logger, env_id, log_dir)
+        reset_logger_for_multiprocessing(logger, str(task_id), log_dir)
    else:
-        logger.info(f'Starting evaluation for instance {env_id}.')
+        logger.info(f'Starting evaluation for task {task_id}.')

    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)
-    task_str = initialize_runtime(runtime)
+    initialize_runtime(runtime, task_config)
+
+    # Get the proper instruction message
+    message_action = get_instruction(task_config)

    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            initial_user_action=MessageAction(content=task_str),
+            initial_user_action=message_action,
            runtime=runtime,
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                metadata.agent_class
+            ],
        )
    )
-    # ======= Attempt to evaluate the agent's environment impact =======
-
-    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
-    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
        raise ValueError('State should not be None.')
@@ -175,7 +223,6 @@ def process_instance(

    return_val = complete_runtime(runtime)
    logger.info(f'Return value from complete_runtime: {return_val}')
-    reward = max(return_val['rewards'])

    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
@@ -184,43 +231,90 @@ def process_instance(

    # Save the output
    output = EvalOutput(
-        instance_id=env_id,
+        instance_id=str(task_id),
        instruction=instruction,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
        test_result={
-            'reward': reward,
+            'task_config': task_config,  # Store task config for later evaluation
+            'final_accessibility_tree': return_val.get('final_accessibility_tree')
+            if return_val
+            else None,
        },
    )
    return output


 if __name__ == '__main__':
-    args = parse_arguments()
+    parser = get_evaluation_parser()
+    args = parser.parse_args()

+    # Set up WebArena environment variables for BrowserGym
+    base_url = os.environ.get('WEBARENA_BASE_URL', None)
+    if not base_url:
+        raise ValueError('WEBARENA_BASE_URL must be set')
+
+    # Set up the WA_ prefixed environment variables that BrowserGym expects
+    os.environ['WA_SHOPPING'] = f'{base_url}:7770/'
+    os.environ['WA_SHOPPING_ADMIN'] = f'{base_url}:7780/admin'
+    os.environ['WA_REDDIT'] = f'{base_url}:9999'
+    os.environ['WA_GITLAB'] = f'{base_url}:8023'
+    os.environ['WA_WIKIPEDIA'] = (
+        f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing'
+    )
+    os.environ['WA_MAP'] = f'{base_url}:3000'
+    os.environ['WA_HOMEPAGE'] = f'{base_url}:4399'
+
+    # Load webarena task configs from BrowserGym
+    from browsergym.webarena.config import TASK_IDS
+    from browsergym.webarena.task import GenericWebArenaTask
+
+    task_configs = []
+
+    # Load a subset of tasks for testing (first 10 tasks)
+    test_task_ids = list(TASK_IDS)[:10]  # Use first 10 tasks for testing
+
+    for task_id in test_task_ids:
+        try:
+            # Create a temporary task to get the config
+            temp_task = GenericWebArenaTask(seed=42, task_id=task_id)
+
+            # Get the first (and likely only) task config for this task_id
+            if temp_task.task_configs:
+                task_config = temp_task.task_configs[0]
+                task_configs.append({'task_id': task_id, 'task_config': task_config})
+        except Exception as e:
+            print(f'Warning: Could not load task {task_id}: {e}')
+            continue
+
+    if not task_configs:
+        raise ValueError('No task configs could be loaded from BrowserGym WebArena')
+
+    print(f'Found {len(task_configs)} task configs from BrowserGym WebArena')
+
+    # Store task configs globally for process_instance to access
+    for task in task_configs:
+        TASK_CONFIGS[str(task['task_id'])] = task['task_config']
+
+    # Create dataset from task configs
    dataset = pd.DataFrame(
-        {
-            'instance_id': [
-                id
-                for id in gym.envs.registry.keys()
-                if id.startswith('browsergym/webarena')
-            ]
-        }
+        [{'instance_id': str(task['task_id'])} for task in task_configs]
    )

    llm_config = None
    if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config = get_llm_config_arg(args.llm_config, args.config_file)
        # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
-        llm_config.modify_params = False
+        if llm_config:
+            llm_config.modify_params = False
    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
-        args.dataset_name,
+        'webarena',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
@@ -38,7 +38,7 @@ EVAL_NOTE="$OPENHANDS_VERSION"
 COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
-  --max-iterations 15 \
+  --max-iterations 30 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $EVAL_NOTE"

@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+# WebArena environment configuration
+# Checks the variables needed for WebArena evaluation; the API key is reported only as set/not set
+
+# Check if WEBARENA_BASE_URL is set
+if [ -z "$WEBARENA_BASE_URL" ]; then
+    echo "Warning: WEBARENA_BASE_URL is not set. Please set it to the base URL where webarena services are hosted."
+    echo "Example: export WEBARENA_BASE_URL=http://your-webarena-host"
+fi
+
+# Check if OPENAI_API_KEY is set
+if [ -z "$OPENAI_API_KEY" ]; then
+    echo "Warning: OPENAI_API_KEY is not set. Please set it to your OpenAI API key."
+fi
+
+echo "WebArena environment configured:"
+echo "  WEBARENA_BASE_URL: $WEBARENA_BASE_URL"
+echo "  OPENAI_API_KEY: $([ -n "$OPENAI_API_KEY" ] && echo '[SET]' || echo '[NOT SET]')"
@@ -10,6 +10,7 @@ from evaluation.utils.shared import (
    EvalOutput,
    get_default_sandbox_config_for_eval,
    get_metrics,
+    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -45,18 +46,12 @@ def get_config(
 ) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.platform = 'linux/amd64'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
        runtime=os.environ.get('RUNTIME', 'docker'),
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
-        # debug
-        debug=True,
+        sandbox_config=sandbox_config,
    )
+    config.debug = True
    config.set_llm_config(
        update_llm_config_for_completions_logging(
            metadata.llm_config, metadata.eval_output_dir, instance_id
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+"""
+Script to aggregate token usage metrics from LLM completion files.
+
+Usage:
+    python aggregate_token_usage.py <directory_path> [--input-cost <cost>] [--output-cost <cost>] [--cached-cost <cost>]
+
+Arguments:
+    directory_path: Path to the directory containing completion files
+    --input-cost: Cost per input token (default: 0.0)
+    --output-cost: Cost per output token (default: 0.0)
+    --cached-cost: Cost per cached token (default: 0.0)
+"""
+
+import argparse
+import json
+import os
+from pathlib import Path
+
+
+def aggregate_token_usage(
+    directory_path, input_cost=0.0, output_cost=0.0, cached_cost=0.0
+):
+    """Aggregate token usage metrics from all JSON completion files in the directory.
+
+    Args:
+        directory_path (str): Path to directory containing completion files
+        input_cost (float): Cost per input token
+        output_cost (float): Cost per output token
+        cached_cost (float): Cost per cached token
+
+    Returns:
+        dict: Token totals, processed/errored file counts and summed reported cost
+    """
+    totals = {
+        'input_tokens': 0,
+        'output_tokens': 0,
+        'cached_tokens': 0,
+        'total_tokens': 0,
+        'files_processed': 0,
+        'files_with_errors': 0,
+        'cost': 0,
+    }
+
+    # Find all JSON files recursively
+    json_files = list(Path(directory_path).rglob('*.json'))
+
+    print(f'Found {len(json_files)} JSON files to process...')
+
+    for json_file in json_files:
+        try:
+            with open(json_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+            # Look for usage data in response or fncall_response
+            usage_data = None
+            if (
+                'response' in data
+                and isinstance(data['response'], dict)
+                and 'usage' in data['response']
+            ):
+                usage_data = data['response']['usage']
+            elif (
+                'fncall_response' in data
+                and isinstance(data['fncall_response'], dict)
+                and 'usage' in data['fncall_response']
+            ):
+                usage_data = data['fncall_response']['usage']
+
+            if usage_data:
+                # Extract token counts; `or 0` also covers keys present with a null value
+                completion_tokens = usage_data.get('completion_tokens') or 0
+                prompt_tokens = usage_data.get('prompt_tokens') or 0
+                cached_tokens = usage_data.get('cached_tokens') or 0
+
+                # Handle cases where cached_tokens might be in prompt_tokens_details
+                if cached_tokens == 0 and 'prompt_tokens_details' in usage_data:
+                    details = usage_data['prompt_tokens_details']
+                    if isinstance(details, dict) and 'cached_tokens' in details:
+                        cached_tokens = details.get('cached_tokens', 0) or 0
+
+                # Calculate non-cached input tokens
+                non_cached_input = prompt_tokens - cached_tokens
+
+                # Update totals
+                totals['input_tokens'] += non_cached_input
+                totals['output_tokens'] += completion_tokens
+                totals['cached_tokens'] += cached_tokens
+                totals['total_tokens'] += prompt_tokens + completion_tokens
+
+            if 'cost' in data:
+                totals['cost'] += data['cost'] or 0
+            totals['files_processed'] += 1
+
+            # Progress indicator
+            if totals['files_processed'] % 1000 == 0:
+                print(f'Processed {totals["files_processed"]} files...')
+
+        except Exception as e:
+            totals['files_with_errors'] += 1
+            if totals['files_with_errors'] <= 5:  # Only show first 5 errors
+                print(f'Error processing {json_file}: {e}')
+
+    # Calculate costs
+    input_cost_total = totals['input_tokens'] * input_cost
+    output_cost_total = totals['output_tokens'] * output_cost
+    cached_cost_total = totals['cached_tokens'] * cached_cost
+    total_cost = input_cost_total + output_cost_total + cached_cost_total
+
+    # Print results
+    print('\n' + '=' * 60)
+    print('TOKEN USAGE AGGREGATION RESULTS')
+    print('=' * 60)
+    print(f'Files processed: {totals["files_processed"]:,}')
+    print(f'Files with errors: {totals["files_with_errors"]:,}')
+    print()
+    print('TOKEN COUNTS:')
+    print(f'  Input tokens (non-cached):             {totals["input_tokens"]:,}')
+    print(f'  Output tokens:                         {totals["output_tokens"]:,}')
+    print(f'  Cached tokens:                         {totals["cached_tokens"]:,}')
+    print(f'  Total tokens:                          {totals["total_tokens"]:,}')
+    print(f'  Total costs (based on returned value): ${totals["cost"]:.6f}')
+    print()
+
+    if input_cost > 0 or output_cost > 0 or cached_cost > 0:
+        print('COST CALCULATED BASED ON PROVIDED RATE:')
+        print(
+            f'  Input cost:   ${input_cost_total:.6f} ({totals["input_tokens"]:,} × ${input_cost:.6f})'
+        )
+        print(
+            f'  Output cost:  ${output_cost_total:.6f} ({totals["output_tokens"]:,} × ${output_cost:.6f})'
+        )
+        print(
+            f'  Cached cost:  ${cached_cost_total:.6f} ({totals["cached_tokens"]:,} × ${cached_cost:.6f})'
+        )
+        print(f'  Total cost:   ${total_cost:.6f}')
+        print()
+
+    print('SUMMARY:')
+    print(
+        f'  Total input tokens:  {totals["input_tokens"] + totals["cached_tokens"]:,}'
+    )
+    print(f'  Total output tokens: {totals["output_tokens"]:,}')
+    print(f'  Grand total tokens:  {totals["total_tokens"]:,}')
+
+    return totals
+
+
+def main():
+    """Run the CLI and return a process exit status (0 on success, 1 on failure)."""
+    parser = argparse.ArgumentParser(
+        description='Aggregate token usage metrics from LLM completion files',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python aggregate_token_usage.py /path/to/completions
+  python aggregate_token_usage.py /path/to/completions --input-cost 0.000001 --output-cost 0.000002
+  python aggregate_token_usage.py /path/to/completions --input-cost 0.000001 --output-cost 0.000002 --cached-cost 0.0000005
+        """,
+    )
+    parser.add_argument(
+        'directory_path', help='Path to directory containing completion files'
+    )
+
+    parser.add_argument(
+        '--input-cost',
+        type=float,
+        default=0.0,
+        help='Cost per input token (default: 0.0)',
+    )
+
+    parser.add_argument(
+        '--output-cost',
+        type=float,
+        default=0.0,
+        help='Cost per output token (default: 0.0)',
+    )
+
+    parser.add_argument(
+        '--cached-cost',
+        type=float,
+        default=0.0,
+        help='Cost per cached token (default: 0.0)',
+    )
+
+    args = parser.parse_args()
+
+    # Validate directory path
+    if not os.path.exists(args.directory_path):
+        print(f"Error: Directory '{args.directory_path}' does not exist.")
+        return 1
+
+    if not os.path.isdir(args.directory_path):
+        print(f"Error: '{args.directory_path}' is not a directory.")
+        return 1
+
+    # Run aggregation; any exception is printed and mapped to exit status 1
+    try:
+        aggregate_token_usage(
+            args.directory_path, args.input_cost, args.output_cost, args.cached_cost
+        )
+        return 0
+    except Exception as e:
+        print(f'Error during aggregation: {e}')
+        return 1
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())
@@ -188,6 +188,14 @@ def make_metadata(
    pathlib.Path(os.path.join(eval_output_path, 'logs')).mkdir(
        parents=True, exist_ok=True
    )
+    # Allow overriding the evaluation output directory via env for smoke runs
+    override_output_dir = os.environ.get('EVAL_OUTPUT_DIR')
+    if override_output_dir:
+        eval_output_path = override_output_dir
+        pathlib.Path(eval_output_path).mkdir(parents=True, exist_ok=True)
+        pathlib.Path(os.path.join(eval_output_path, 'logs')).mkdir(
+            parents=True, exist_ok=True
+        )
    logger.info(f'Using evaluation output directory: {eval_output_path}')

    metadata = EvalMetadata(
@@ -703,3 +711,79 @@ def get_default_sandbox_config_for_eval() -> SandboxConfig:
        remote_runtime_enable_retries=True,
        remote_runtime_class='sysbox',
    )
+
+
+def get_openhands_config_for_eval(
+    metadata: EvalMetadata | None = None,
+    sandbox_config: SandboxConfig | None = None,
+    runtime: str | None = None,
+    max_iterations: int | None = None,
+    default_agent: str | None = None,
+    enable_browser: bool = False,
+    workspace_base: str | None = None,
+    workspace_mount_path: str | None = None,
+):
+    """Create an OpenHandsConfig with common patterns used across evaluation scripts.
+
+    This function provides a standardized way to create OpenHands configurations
+    for evaluation runs, with sensible defaults that match the patterns used in
+    most run_infer.py scripts. Individual evaluation scripts can override specific
+    attributes as needed.
+
+    Args:
+        metadata: EvalMetadata containing agent class, max iterations, etc.
+        sandbox_config: Custom sandbox config. If None, uses get_default_sandbox_config_for_eval()
+        runtime: Runtime type. If None, uses environment RUNTIME or 'docker'
+        max_iterations: Max iterations for the agent. If None, uses metadata.max_iterations, else 50
+        default_agent: Agent class name. If None, uses metadata.agent_class, else 'CodeActAgent'
+        enable_browser: Whether to enable browser functionality
+        workspace_base: Workspace base path. Defaults to None
+        workspace_mount_path: Workspace mount path. Defaults to None
+
+    Returns:
+        OpenHandsConfig: Configured for evaluation with eval-specific overrides applied
+    """
+    # Defer import to avoid circular imports at module load time
+    from openhands.core.config.openhands_config import (
+        OpenHandsConfig as _OHConfig,  # type: ignore
+    )
+
+    # Use provided sandbox config or get default
+    if sandbox_config is None:
+        sandbox_config = get_default_sandbox_config_for_eval()
+
+    # Extract values from metadata if provided (explicit arguments take precedence)
+    if metadata is not None:
+        if max_iterations is None:
+            max_iterations = metadata.max_iterations
+        if default_agent is None:
+            default_agent = metadata.agent_class
+
+    # Use environment runtime or default
+    if runtime is None:
+        runtime = os.environ.get('RUNTIME', 'docker')
+
+    # Provide sensible defaults if still None
+    if default_agent is None:
+        default_agent = 'CodeActAgent'
+    if max_iterations is None:
+        max_iterations = 50
+
+    # Session store: .eval_sessions under the current working directory (absolute)
+    eval_store = os.path.abspath(os.path.join(os.getcwd(), '.eval_sessions'))
+
+    # Create the base config with evaluation-specific overrides
+    config = _OHConfig(
+        default_agent=default_agent,
+        run_as_openhands=False,
+        runtime=runtime,
+        max_iterations=max_iterations,
+        enable_browser=enable_browser,
+        sandbox=sandbox_config,
+        workspace_base=workspace_base,
+        workspace_mount_path=workspace_mount_path,
+        file_store='local',
+        file_store_path=eval_store,
+    )
+
+    return config
@@ -14,21 +14,32 @@ import { Conversation } from "#/api/open-hands.types";

 // Mock hooks
 const mockUseUserProviders = vi.fn();
-const mockUseUserRepositories = vi.fn();
+const mockUseGitRepositories = vi.fn();
 const mockUseConfig = vi.fn();
+const mockUseRepositoryMicroagents = vi.fn();
+const mockUseMicroagentManagementConversations = vi.fn();

 vi.mock("#/hooks/use-user-providers", () => ({
  useUserProviders: () => mockUseUserProviders(),
 }));

-vi.mock("#/hooks/query/use-user-repositories", () => ({
-  useUserRepositories: () => mockUseUserRepositories(),
+vi.mock("#/hooks/query/use-git-repositories", () => ({
+  useGitRepositories: () => mockUseGitRepositories(),
 }));

 vi.mock("#/hooks/query/use-config", () => ({
  useConfig: () => mockUseConfig(),
 }));

+vi.mock("#/hooks/query/use-repository-microagents", () => ({
+  useRepositoryMicroagents: () => mockUseRepositoryMicroagents(),
+}));
+
+vi.mock("#/hooks/query/use-microagent-management-conversations", () => ({
+  useMicroagentManagementConversations: () =>
+    mockUseMicroagentManagementConversations(),
+}));
+
 describe("MicroagentManagement", () => {
  const RouterStub = createRoutesStub([
    {
@@ -174,7 +185,7 @@ describe("MicroagentManagement", () => {
      providers: ["github"],
    });

-    mockUseUserRepositories.mockReturnValue({
+    mockUseGitRepositories.mockReturnValue({
      data: {
        pages: [
          {
@@ -196,6 +207,18 @@ describe("MicroagentManagement", () => {
      },
    });

+    mockUseRepositoryMicroagents.mockReturnValue({
+      data: mockMicroagents,
+      isLoading: false,
+      isError: false,
+    });
+
+    mockUseMicroagentManagementConversations.mockReturnValue({
+      data: mockConversations,
+      isLoading: false,
+      isError: false,
+    });
+
    // Setup default mock for retrieveUserGitRepositories
    vi.spyOn(OpenHands, "retrieveUserGitRepositories").mockResolvedValue({
      data: [...mockRepositories],
@@ -227,7 +250,7 @@ describe("MicroagentManagement", () => {

  it("should display loading state when fetching repositories", async () => {
    // Mock loading state
-    mockUseUserRepositories.mockReturnValue({
+    mockUseGitRepositories.mockReturnValue({
      data: undefined,
      isLoading: true,
      isError: false,
@@ -245,7 +268,7 @@ describe("MicroagentManagement", () => {

  it("should handle error when fetching repositories", async () => {
    // Mock error state
-    mockUseUserRepositories.mockReturnValue({
+    mockUseGitRepositories.mockReturnValue({
      data: undefined,
      isLoading: false,
      isError: true,
@@ -258,7 +281,7 @@ describe("MicroagentManagement", () => {

    // Wait for the error to be handled
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });
  });

@@ -267,7 +290,7 @@ describe("MicroagentManagement", () => {

    // Wait for repositories to be loaded
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });

    // Check that tabs are rendered
@@ -285,7 +308,7 @@ describe("MicroagentManagement", () => {

    // Wait for repositories to be loaded and rendered
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });

    // Check that repository names are displayed
@@ -300,7 +323,7 @@ describe("MicroagentManagement", () => {

    // Wait for repositories to be loaded
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });

    // Find and click on the first repository accordion
@@ -309,10 +332,7 @@ describe("MicroagentManagement", () => {

    // Wait for microagents to be fetched
    await waitFor(() => {
-      expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
-        "user",
-        "repo2",
-      );
+      expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
    });

    // Check that microagents are displayed
@@ -325,19 +345,17 @@ describe("MicroagentManagement", () => {

  it("should display loading state when fetching microagents", async () => {
    const user = userEvent.setup();
-    const getRepositoryMicroagentsSpy = vi.spyOn(
-      OpenHands,
-      "getRepositoryMicroagents",
-    );
-    getRepositoryMicroagentsSpy.mockImplementation(
-      () => new Promise(() => {}), // Never resolves
-    );
+    mockUseRepositoryMicroagents.mockReturnValue({
+      data: undefined,
+      isLoading: true,
+      isError: false,
+    });

    renderMicroagentManagement();

    // Wait for repositories to be loaded
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });

    // Find and click on the first repository accordion
@@ -350,19 +368,17 @@ describe("MicroagentManagement", () => {

  it("should handle error when fetching microagents", async () => {
    const user = userEvent.setup();
-    const getRepositoryMicroagentsSpy = vi.spyOn(
-      OpenHands,
-      "getRepositoryMicroagents",
-    );
-    getRepositoryMicroagentsSpy.mockRejectedValue(
-      new Error("Failed to fetch microagents"),
-    );
+    mockUseRepositoryMicroagents.mockReturnValue({
+      data: undefined,
+      isLoading: false,
+      isError: true,
+    });

    renderMicroagentManagement();

    // Wait for repositories to be loaded
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });

    // Find and click on the first repository accordion
@@ -371,23 +387,23 @@ describe("MicroagentManagement", () => {

    // Wait for the error to be handled
    await waitFor(() => {
-      expect(getRepositoryMicroagentsSpy).toHaveBeenCalledWith("user", "repo2");
+      expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
    });
  });

  it("should display empty state when no microagents are found", async () => {
    const user = userEvent.setup();
-    const getRepositoryMicroagentsSpy = vi.spyOn(
-      OpenHands,
-      "getRepositoryMicroagents",
-    );
-    getRepositoryMicroagentsSpy.mockResolvedValue([]);
+    mockUseRepositoryMicroagents.mockReturnValue({
+      data: [],
+      isLoading: false,
+      isError: false,
+    });

    renderMicroagentManagement();

    // Wait for repositories to be loaded
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });

    // Find and click on the first repository accordion
@@ -396,7 +412,7 @@ describe("MicroagentManagement", () => {

    // Wait for microagents to be fetched
    await waitFor(() => {
-      expect(getRepositoryMicroagentsSpy).toHaveBeenCalledWith("user", "repo2");
+      expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
    });

    // Check that no microagents are displayed
@@ -410,7 +426,7 @@ describe("MicroagentManagement", () => {

    // Wait for repositories to be loaded
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });

    // Find and click on the first repository accordion
@@ -419,10 +435,7 @@ describe("MicroagentManagement", () => {

    // Wait for microagents to be fetched
    await waitFor(() => {
-      expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
-        "user",
-        "repo2",
-      );
+      expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
    });

    // Check that microagent cards display correct information
@@ -449,7 +462,7 @@ describe("MicroagentManagement", () => {

    // Wait for repositories to be loaded and processed
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });

    // Wait for repositories to be displayed in the accordion
@@ -468,7 +481,7 @@ describe("MicroagentManagement", () => {

    // Wait for repositories to be loaded and processed
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });

    // Wait for repositories to be displayed in the accordion
@@ -492,7 +505,7 @@ describe("MicroagentManagement", () => {

    // Wait for repositories to be loaded
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });

    // Find and click the first add microagent button
@@ -513,7 +526,7 @@ describe("MicroagentManagement", () => {

  it("should display empty state when no repositories are found", async () => {
    // Mock empty repositories
-    mockUseUserRepositories.mockReturnValue({
+    mockUseGitRepositories.mockReturnValue({
      data: {
        pages: [
          {
@@ -533,7 +546,7 @@ describe("MicroagentManagement", () => {

    // Wait for repositories to be loaded
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });

    // Check that empty state messages are displayed
@@ -550,7 +563,7 @@ describe("MicroagentManagement", () => {

    // Wait for repositories to be loaded
    await waitFor(() => {
-      expect(mockUseUserRepositories).toHaveBeenCalled();
+      expect(mockUseGitRepositories).toHaveBeenCalled();
    });

    // Find and click on the first repository accordion
@@ -559,14 +572,11 @@ describe("MicroagentManagement", () => {

    // Wait for microagents to be fetched for first repo
    await waitFor(() => {
-      expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
-        "user",
-        "repo2",
-      );
+      expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
    });

-    // Check that the API call was made
-    expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledTimes(1);
+    // Check that the hook was called
+    expect(mockUseRepositoryMicroagents).toHaveBeenCalledTimes(1);
  });

  it("should display ready to add microagent message in main area", async () => {
@@ -591,7 +601,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Check that search input is rendered
@@ -611,7 +621,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Initially only repositories with .openhands should be visible
@@ -642,7 +652,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Type in search input with uppercase
@@ -665,7 +675,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Type in search input with partial match
@@ -691,7 +701,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Type in search input
@@ -724,7 +734,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Type in search input with non-existent repository name
@@ -752,7 +762,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Type in search input with special characters
@@ -773,7 +783,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Filter to show only repo2
@@ -788,10 +798,7 @@ describe("MicroagentManagement", () => {

      // Wait for microagents to be fetched
      await waitFor(() => {
-        expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
-          "user",
-          "repo2",
-        );
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
      });

      // Check that microagents are displayed
@@ -808,7 +815,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Type in search input with leading/trailing whitespace
@@ -828,7 +835,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      const searchInput = screen.getByRole("textbox", {
@@ -853,14 +860,14 @@ describe("MicroagentManagement", () => {
  });

  // Search conversations functionality tests
-  describe("Search conversations functionality", () => {
-    it("should call searchConversations API when repository is expanded", async () => {
+  describe("Microagent management conversations functionality", () => {
+    it("should call useMicroagentManagementConversations API when repository is expanded", async () => {
      const user = userEvent.setup();
      renderMicroagentManagement();

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
@@ -869,15 +876,8 @@ describe("MicroagentManagement", () => {

      // Wait for both microagents and conversations to be fetched
      await waitFor(() => {
-        expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
-          "user",
-          "repo2",
-        );
-        expect(OpenHands.searchConversations).toHaveBeenCalledWith(
-          "user/repo2/.openhands",
-          "microagent_management",
-          1000,
-        );
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });
    });

@@ -887,7 +887,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
@@ -896,8 +896,8 @@ describe("MicroagentManagement", () => {

      // Wait for both queries to complete
      await waitFor(() => {
-        expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalled();
-        expect(OpenHands.searchConversations).toHaveBeenCalled();
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });

      // Check that microagents are displayed
@@ -917,23 +917,22 @@ describe("MicroagentManagement", () => {

    it("should show loading state when both microagents and conversations are loading", async () => {
      const user = userEvent.setup();
-      const getRepositoryMicroagentsSpy = vi.spyOn(
-        OpenHands,
-        "getRepositoryMicroagents",
-      );
-      const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
-
-      // Make both queries never resolve
-      getRepositoryMicroagentsSpy.mockImplementation(
-        () => new Promise(() => {}),
-      );
-      searchConversationsSpy.mockImplementation(() => new Promise(() => {}));
+      mockUseRepositoryMicroagents.mockReturnValue({
+        data: undefined,
+        isLoading: true,
+        isError: false,
+      });
+      mockUseMicroagentManagementConversations.mockReturnValue({
+        data: undefined,
+        isLoading: true,
+        isError: false,
+      });

      renderMicroagentManagement();

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
@@ -950,7 +949,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
@@ -959,8 +958,8 @@ describe("MicroagentManagement", () => {

      // Wait for both queries to complete
      await waitFor(() => {
-        expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalled();
-        expect(OpenHands.searchConversations).toHaveBeenCalled();
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });

      // Check that loading spinner is not displayed
@@ -975,7 +974,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
@@ -984,8 +983,8 @@ describe("MicroagentManagement", () => {

      // Wait for both queries to complete
      await waitFor(() => {
-        expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalled();
-        expect(OpenHands.searchConversations).toHaveBeenCalled();
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });

      // Check that microagent file paths are displayed for microagents
@@ -1010,21 +1009,22 @@ describe("MicroagentManagement", () => {

    it("should show learn this repo component when no microagents and no conversations", async () => {
      const user = userEvent.setup();
-      const getRepositoryMicroagentsSpy = vi.spyOn(
-        OpenHands,
-        "getRepositoryMicroagents",
-      );
-      const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
-
-      // Mock both queries to return empty arrays
-      getRepositoryMicroagentsSpy.mockResolvedValue([]);
-      searchConversationsSpy.mockResolvedValue([]);
+      mockUseRepositoryMicroagents.mockReturnValue({
+        data: [],
+        isLoading: false,
+        isError: false,
+      });
+      mockUseMicroagentManagementConversations.mockReturnValue({
+        data: [],
+        isLoading: false,
+        isError: false,
+      });

      renderMicroagentManagement();

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
@@ -1033,8 +1033,8 @@ describe("MicroagentManagement", () => {

      // Wait for both queries to complete
      await waitFor(() => {
-        expect(getRepositoryMicroagentsSpy).toHaveBeenCalled();
-        expect(searchConversationsSpy).toHaveBeenCalled();
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });

      // Check that the learn this repo component is displayed
@@ -1046,21 +1046,22 @@ describe("MicroagentManagement", () => {

    it("should show learn this repo component when only conversations exist but no microagents", async () => {
      const user = userEvent.setup();
-      const getRepositoryMicroagentsSpy = vi.spyOn(
-        OpenHands,
-        "getRepositoryMicroagents",
-      );
-      const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
-
-      // Mock microagents to return empty array, conversations to return data
-      getRepositoryMicroagentsSpy.mockResolvedValue([]);
-      searchConversationsSpy.mockResolvedValue([...mockConversations]);
+      mockUseRepositoryMicroagents.mockReturnValue({
+        data: [],
+        isLoading: false,
+        isError: false,
+      });
+      mockUseMicroagentManagementConversations.mockReturnValue({
+        data: [...mockConversations],
+        isLoading: false,
+        isError: false,
+      });

      renderMicroagentManagement();

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
@@ -1069,8 +1070,8 @@ describe("MicroagentManagement", () => {

      // Wait for both queries to complete
      await waitFor(() => {
-        expect(getRepositoryMicroagentsSpy).toHaveBeenCalled();
-        expect(searchConversationsSpy).toHaveBeenCalled();
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });

      // Check that conversations are displayed
@@ -1088,21 +1089,22 @@ describe("MicroagentManagement", () => {

    it("should show learn this repo component when only microagents exist but no conversations", async () => {
      const user = userEvent.setup();
-      const getRepositoryMicroagentsSpy = vi.spyOn(
-        OpenHands,
-        "getRepositoryMicroagents",
-      );
-      const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
-
-      // Mock microagents to return data, conversations to return empty array
-      getRepositoryMicroagentsSpy.mockResolvedValue([...mockMicroagents]);
-      searchConversationsSpy.mockResolvedValue([]);
+      mockUseRepositoryMicroagents.mockReturnValue({
+        data: [...mockMicroagents],
+        isLoading: false,
+        isError: false,
+      });
+      mockUseMicroagentManagementConversations.mockReturnValue({
+        data: [],
+        isLoading: false,
+        isError: false,
+      });

      renderMicroagentManagement();

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
@@ -1111,8 +1113,8 @@ describe("MicroagentManagement", () => {

      // Wait for both queries to complete
      await waitFor(() => {
-        expect(getRepositoryMicroagentsSpy).toHaveBeenCalled();
-        expect(searchConversationsSpy).toHaveBeenCalled();
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });

      // Check that microagents are displayed
@@ -1130,16 +1132,17 @@ describe("MicroagentManagement", () => {

    it("should handle error when fetching conversations", async () => {
      const user = userEvent.setup();
-      const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
-      searchConversationsSpy.mockRejectedValue(
-        new Error("Failed to fetch conversations"),
-      );
+      mockUseMicroagentManagementConversations.mockReturnValue({
+        data: undefined,
+        isLoading: false,
+        isError: true,
+      });

      renderMicroagentManagement();

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
@@ -1148,11 +1151,7 @@ describe("MicroagentManagement", () => {

      // Wait for the error to be handled
      await waitFor(() => {
-        expect(searchConversationsSpy).toHaveBeenCalledWith(
-          "user/repo2/.openhands",
-          "microagent_management",
-          1000,
-        );
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });

      // Check that the learn this repo component is displayed (since conversations failed)
@@ -1163,27 +1162,22 @@ describe("MicroagentManagement", () => {
      });

      // Also check that the microagents query was called successfully
-      expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
-        "user",
-        "repo2",
-      );
+      expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
    });

    it("should handle error when fetching microagents but conversations succeed", async () => {
      const user = userEvent.setup();
-      const getRepositoryMicroagentsSpy = vi.spyOn(
-        OpenHands,
-        "getRepositoryMicroagents",
-      );
-      getRepositoryMicroagentsSpy.mockRejectedValue(
-        new Error("Failed to fetch microagents"),
-      );
+      mockUseRepositoryMicroagents.mockReturnValue({
+        data: undefined,
+        isLoading: false,
+        isError: true,
+      });

      renderMicroagentManagement();

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
@@ -1192,10 +1186,7 @@ describe("MicroagentManagement", () => {

      // Wait for the error to be handled
      await waitFor(() => {
-        expect(getRepositoryMicroagentsSpy).toHaveBeenCalledWith(
-          "user",
-          "repo2",
-        );
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
      });

      // Check that the learn this repo component is displayed (since microagents failed)
@@ -1205,28 +1196,22 @@ describe("MicroagentManagement", () => {
      expect(learnThisRepo).toBeInTheDocument();
    });

-    it("should call searchConversations with correct parameters", async () => {
+    it("should call useMicroagentManagementConversations with correct parameters", async () => {
      const user = userEvent.setup();
-      const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
-
      renderMicroagentManagement();

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
      const repoAccordion = screen.getByTestId("repository-name-tooltip");
      await user.click(repoAccordion);

-      // Wait for searchConversations to be called
+      // Wait for useMicroagentManagementConversations to be called
      await waitFor(() => {
-        expect(searchConversationsSpy).toHaveBeenCalledWith(
-          "user/repo2/.openhands",
-          "microagent_management",
-          1000,
-        );
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });
    });

@@ -1236,7 +1221,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
@@ -1245,8 +1230,8 @@ describe("MicroagentManagement", () => {

      // Wait for both queries to complete
      await waitFor(() => {
-        expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalled();
-        expect(OpenHands.searchConversations).toHaveBeenCalled();
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });

      // Check that conversations display correct information
@@ -1263,7 +1248,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion
@@ -1272,15 +1257,8 @@ describe("MicroagentManagement", () => {

      // Wait for both queries to be called for first repo
      await waitFor(() => {
-        expect(OpenHands.getRepositoryMicroagents).toHaveBeenCalledWith(
-          "user",
-          "repo2",
-        );
-        expect(OpenHands.searchConversations).toHaveBeenCalledWith(
-          "user/repo2/.openhands",
-          "microagent_management",
-          1000,
-        );
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });

      // Check that both microagents and conversations are displayed
@@ -1304,7 +1282,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded and processed
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Wait for repositories to be displayed in the accordion
@@ -1325,7 +1303,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded and processed
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Wait for repositories to be displayed in the accordion
@@ -1387,7 +1365,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded and processed
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Wait for repositories to be displayed in the accordion
@@ -1418,7 +1396,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded and processed
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Wait for repositories to be displayed in the accordion
@@ -1448,7 +1426,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded and processed
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Wait for repositories to be displayed in the accordion
@@ -1488,7 +1466,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded and processed
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Wait for repositories to be displayed in the accordion
@@ -1522,7 +1500,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded and processed
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Wait for repositories to be displayed in the accordion
@@ -1555,7 +1533,7 @@ describe("MicroagentManagement", () => {

      // Wait for repositories to be loaded and processed
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Wait for repositories to be displayed in the accordion
@@ -2409,19 +2387,22 @@ describe("MicroagentManagement", () => {
      const user = userEvent.setup();

      // Setup mocks before rendering
-      const getRepositoryMicroagentsSpy = vi.spyOn(
-        OpenHands,
-        "getRepositoryMicroagents",
-      );
-      const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
-      getRepositoryMicroagentsSpy.mockResolvedValue([]);
-      searchConversationsSpy.mockResolvedValue([]);
+      mockUseRepositoryMicroagents.mockReturnValue({
+        data: [],
+        isLoading: false,
+        isError: false,
+      });
+      mockUseMicroagentManagementConversations.mockReturnValue({
+        data: [],
+        isLoading: false,
+        isError: false,
+      });

      renderMicroagentManagement();

      // Wait for repositories to be loaded
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      // Find and click on the first repository accordion to expand it
@@ -2430,8 +2411,8 @@ describe("MicroagentManagement", () => {

      // Wait for microagents and conversations to be fetched
      await waitFor(() => {
-        expect(getRepositoryMicroagentsSpy).toHaveBeenCalled();
-        expect(searchConversationsSpy).toHaveBeenCalled();
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });

      // Verify the learn this repo trigger is displayed when no microagents exist
@@ -2451,19 +2432,22 @@ describe("MicroagentManagement", () => {
      const user = userEvent.setup();

      // Setup mocks
-      const getRepositoryMicroagentsSpy = vi.spyOn(
-        OpenHands,
-        "getRepositoryMicroagents",
-      );
-      const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
-      getRepositoryMicroagentsSpy.mockResolvedValue([]);
-      searchConversationsSpy.mockResolvedValue([]);
+      mockUseRepositoryMicroagents.mockReturnValue({
+        data: [],
+        isLoading: false,
+        isError: false,
+      });
+      mockUseMicroagentManagementConversations.mockReturnValue({
+        data: [],
+        isLoading: false,
+        isError: false,
+      });

      renderMicroagentManagement();

      // Wait for repositories and expand accordion
      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      const repoAccordion = screen.getByTestId("repository-name-tooltip");
@@ -2496,35 +2480,36 @@ describe("MicroagentManagement", () => {
      const user = userEvent.setup();

      // Setup mocks with existing microagents (should NOT show trigger)
-      const getRepositoryMicroagentsSpy = vi.spyOn(
-        OpenHands,
-        "getRepositoryMicroagents",
-      );
-      const searchConversationsSpy = vi.spyOn(OpenHands, "searchConversations");
-
-      // Mock with existing microagent
-      getRepositoryMicroagentsSpy.mockResolvedValue([
-        {
-          name: "test-microagent",
-          created_at: "2021-10-01",
-          git_provider: "github",
-          path: ".openhands/microagents/test",
-        },
-      ]);
-      searchConversationsSpy.mockResolvedValue([]);
+      mockUseRepositoryMicroagents.mockReturnValue({
+        data: [
+          {
+            name: "test-microagent",
+            created_at: "2021-10-01",
+            git_provider: "github",
+            path: ".openhands/microagents/test",
+          },
+        ],
+        isLoading: false,
+        isError: false,
+      });
+      mockUseMicroagentManagementConversations.mockReturnValue({
+        data: [],
+        isLoading: false,
+        isError: false,
+      });

      renderMicroagentManagement();

      await waitFor(() => {
-        expect(mockUseUserRepositories).toHaveBeenCalled();
+        expect(mockUseGitRepositories).toHaveBeenCalled();
      });

      const repoAccordion = screen.getByTestId("repository-name-tooltip");
      await user.click(repoAccordion);

      await waitFor(() => {
-        expect(getRepositoryMicroagentsSpy).toHaveBeenCalled();
-        expect(searchConversationsSpy).toHaveBeenCalled();
+        expect(mockUseRepositoryMicroagents).toHaveBeenCalled();
+        expect(mockUseMicroagentManagementConversations).toHaveBeenCalled();
      });

      // Should NOT show the learn this repo trigger when microagents exist
@@ -79,6 +79,35 @@ describe("Content", () => {
        expect(screen.getByTestId("set-indicator")).toBeInTheDocument();
      });
    });
+
+    it("should conditionally show security analyzer based on confirmation mode", async () => {
+      renderLlmSettingsScreen();
+      await screen.findByTestId("llm-settings-screen");
+
+      const confirmation = screen.getByTestId("enable-confirmation-mode-switch");
+
+      // Initially confirmation mode is false, so security analyzer should not be visible
+      expect(confirmation).not.toBeChecked();
+      expect(
+        screen.queryByTestId("security-analyzer-input"),
+      ).not.toBeInTheDocument();
+
+      // Enable confirmation mode
+      await userEvent.click(confirmation);
+      expect(confirmation).toBeChecked();
+
+      // Security analyzer should now be visible
+      screen.getByTestId("security-analyzer-input");
+
+      // Disable confirmation mode again
+      await userEvent.click(confirmation);
+      expect(confirmation).not.toBeChecked();
+
+      // Security analyzer should be hidden again
+      expect(
+        screen.queryByTestId("security-analyzer-input"),
+      ).not.toBeInTheDocument();
+    });
  });

  describe("Advanced form", () => {
@@ -107,7 +136,6 @@ describe("Content", () => {
      within(advancedForm).getByTestId("llm-api-key-input");
      within(advancedForm).getByTestId("llm-api-key-help-anchor-advanced");
      within(advancedForm).getByTestId("agent-input");
-      within(advancedForm).getByTestId("enable-confirmation-mode-switch");
      within(advancedForm).getByTestId("enable-memory-condenser-switch");

      await userEvent.click(advancedSwitch);
@@ -130,9 +158,6 @@ describe("Content", () => {
      const baseUrl = screen.getByTestId("base-url-input");
      const apiKey = screen.getByTestId("llm-api-key-input");
      const agent = screen.getByTestId("agent-input");
-      const confirmation = screen.getByTestId(
-        "enable-confirmation-mode-switch",
-      );
      const condensor = screen.getByTestId("enable-memory-condenser-switch");

      expect(model).toHaveValue("openhands/claude-sonnet-4-20250514");
@@ -140,15 +165,7 @@ describe("Content", () => {
      expect(apiKey).toHaveValue("");
      expect(apiKey).toHaveProperty("placeholder", "");
      expect(agent).toHaveValue("CodeActAgent");
-      expect(confirmation).not.toBeChecked();
      expect(condensor).toBeChecked();
-
-      // check that security analyzer is present
-      expect(
-        screen.queryByTestId("security-analyzer-input"),
-      ).not.toBeInTheDocument();
-      await userEvent.click(confirmation);
-      screen.getByTestId("security-analyzer-input");
    });

    it("should render the advanced form if existings settings are advanced", async () => {
@@ -177,7 +194,7 @@ describe("Content", () => {
        agent: "CoActAgent",
        confirmation_mode: true,
        enable_default_condenser: false,
-        security_analyzer: "mock-invariant",
+        security_analyzer: "none",
      });

      renderLlmSettingsScreen();
@@ -203,7 +220,7 @@ describe("Content", () => {
        expect(agent).toHaveValue("CoActAgent");
        expect(confirmation).toBeChecked();
        expect(condensor).not.toBeChecked();
-        expect(securityAnalyzer).toHaveValue("mock-invariant");
+        expect(securityAnalyzer).toHaveValue("SETTINGS$SECURITY_ANALYZER_NONE");
      });
    });
  });
@@ -293,7 +310,7 @@ describe("Form submission", () => {
    // select security analyzer
    const securityAnalyzer = screen.getByTestId("security-analyzer-input");
    await userEvent.click(securityAnalyzer);
-    const securityAnalyzerOption = screen.getByText("mock-invariant");
+    const securityAnalyzerOption = screen.getByText("SETTINGS$SECURITY_ANALYZER_NONE");
    await userEvent.click(securityAnalyzerOption);

    const submitButton = screen.getByTestId("submit-button");
@@ -306,7 +323,7 @@ describe("Form submission", () => {
        agent: "CoActAgent",
        confirmation_mode: true,
        enable_default_condenser: false,
-        security_analyzer: "mock-invariant",
+        security_analyzer: null,
      }),
    );
  });
@@ -375,9 +392,11 @@ describe("Form submission", () => {
    const baseUrl = await screen.findByTestId("base-url-input");
    const apiKey = await screen.findByTestId("llm-api-key-input");
    const agent = await screen.findByTestId("agent-input");
-    const confirmation = await screen.findByTestId("enable-confirmation-mode-switch");
    const condensor = await screen.findByTestId("enable-memory-condenser-switch");

+    // Confirmation mode switch is now in basic settings, always visible
+    const confirmation = await screen.findByTestId("enable-confirmation-mode-switch");
+
    // enter custom model
    await userEvent.type(model, "-mini");
    expect(model).toHaveValue("openai/gpt-4o-mini");
@@ -451,14 +470,17 @@ describe("Form submission", () => {
    // select security analyzer
    const securityAnalyzer = await screen.findByTestId("security-analyzer-input");
    await userEvent.click(securityAnalyzer);
-    const securityAnalyzerOption = screen.getByText("mock-invariant");
+    const securityAnalyzerOption = screen.getByText("SETTINGS$SECURITY_ANALYZER_NONE");
    await userEvent.click(securityAnalyzerOption);
-    expect(securityAnalyzer).toHaveValue("mock-invariant");
+    expect(securityAnalyzer).toHaveValue("SETTINGS$SECURITY_ANALYZER_NONE");

    expect(submitButton).not.toBeDisabled();

-    await userEvent.clear(securityAnalyzer);
-    expect(securityAnalyzer).toHaveValue("");
+    // revert back to original value
+    await userEvent.click(securityAnalyzer);
+    const originalSecurityAnalyzerOption = screen.getByText("SETTINGS$SECURITY_ANALYZER_LLM_DEFAULT");
+    await userEvent.click(originalSecurityAnalyzerOption);
+    expect(securityAnalyzer).toHaveValue("SETTINGS$SECURITY_ANALYZER_LLM_DEFAULT");
    expect(submitButton).toBeDisabled();
  });

@@ -552,7 +574,7 @@ describe("Form submission", () => {
      expect.objectContaining({
        llm_model: "openhands/claude-sonnet-4-20250514",
        llm_base_url: "",
-        confirmation_mode: false,
+        confirmation_mode: true, // Confirmation mode is now a basic setting, should be preserved
      }),
    );
  });
@@ -107,9 +107,7 @@ describe("Content", () => {
      expect(screen.queryByTestId("add-secret-button")).not.toBeInTheDocument(),
    );
    const button = await screen.findByTestId("connect-git-button");
-    await userEvent.click(button);
-
-    screen.getByTestId("git-settings-screen");
+    expect(button).toHaveAttribute("href", "/settings/integrations");
  });

  it("should render an empty table when there are no existing secrets", async () => {
@@ -29,23 +29,5 @@ describe("hasAdvancedSettingsSet", () => {
        }),
      ).toBe(true);
    });
-
-    test("CONFIRMATION_MODE is true", () => {
-      expect(
-        hasAdvancedSettingsSet({
-          ...DEFAULT_SETTINGS,
-          CONFIRMATION_MODE: true,
-        }),
-      ).toBe(true);
-    });
-
-    test("SECURITY_ANALYZER is set", () => {
-      expect(
-        hasAdvancedSettingsSet({
-          ...DEFAULT_SETTINGS,
-          SECURITY_ANALYZER: "test",
-        }),
-      ).toBe(true);
-    });
  });
 });
@@ -11,17 +11,17 @@
    "@heroui/use-infinite-scroll": "^2.2.10",
    "@microlink/react-json-view": "^1.26.2",
    "@monaco-editor/react": "^4.7.0-rc.0",
-    "@react-router/node": "^7.8.0",
-    "@react-router/serve": "^7.8.0",
-    "@react-types/shared": "^3.31.0",
+    "@react-router/node": "^7.8.2",
+    "@react-router/serve": "^7.8.2",
+    "@react-types/shared": "^3.32.0",
    "@reduxjs/toolkit": "^2.8.2",
-    "@stripe/react-stripe-js": "^3.9.0",
-    "@stripe/stripe-js": "^7.8.0",
+    "@stripe/react-stripe-js": "^3.9.2",
+    "@stripe/stripe-js": "^7.9.0",
    "@tailwindcss/postcss": "^4.1.12",
    "@tailwindcss/vite": "^4.1.12",
-    "@tanstack/react-query": "^5.85.3",
+    "@tanstack/react-query": "^5.85.5",
    "@uidotdev/usehooks": "^2.4.1",
-    "@vitejs/plugin-react": "^5.0.0",
+    "@vitejs/plugin-react": "^5.0.1",
    "@xterm/addon-fit": "^0.10.0",
    "@xterm/xterm": "^5.4.0",
    "axios": "^1.11.0",
@@ -29,32 +29,32 @@
    "date-fns": "^4.1.0",
    "eslint-config-airbnb-typescript": "^18.0.0",
    "framer-motion": "^12.23.12",
-    "i18next": "^25.3.6",
+    "i18next": "^25.4.2",
    "i18next-browser-languagedetector": "^8.2.0",
    "i18next-http-backend": "^3.0.2",
-    "isbot": "^5.1.29",
-    "jose": "^6.0.12",
-    "lucide-react": "^0.539.0",
+    "isbot": "^5.1.30",
+    "jose": "^6.0.13",
+    "lucide-react": "^0.542.0",
    "monaco-editor": "^0.52.2",
-    "posthog-js": "^1.260.1",
+    "posthog-js": "^1.260.3",
    "react": "^19.1.1",
    "react-dom": "^19.1.1",
    "react-highlight": "^0.15.0",
-    "react-hot-toast": "^2.5.1",
-    "react-i18next": "^15.6.1",
+    "react-hot-toast": "^2.6.0",
+    "react-i18next": "^15.7.2",
    "react-icons": "^5.5.0",
    "react-markdown": "^10.1.0",
    "react-redux": "^9.2.0",
-    "react-router": "^7.8.0",
+    "react-router": "^7.8.2",
    "react-select": "^5.10.2",
-    "react-syntax-highlighter": "^15.6.1",
+    "react-syntax-highlighter": "^15.6.6",
    "react-textarea-autosize": "^8.5.9",
    "remark-breaks": "^4.0.0",
    "remark-gfm": "^4.0.1",
    "sirv-cli": "^3.0.1",
    "socket.io-client": "^4.8.1",
    "tailwind-merge": "^3.3.1",
-    "vite": "^7.1.1",
+    "vite": "^7.1.3",
    "web-vitals": "^5.1.0",
    "ws": "^8.18.2"
  },
@@ -88,17 +88,17 @@
    "@babel/traverse": "^7.28.3",
    "@babel/types": "^7.28.2",
    "@mswjs/socket.io-binding": "^0.2.0",
-    "@playwright/test": "^1.54.2",
-    "@react-router/dev": "^7.8.0",
+    "@playwright/test": "^1.55.0",
+    "@react-router/dev": "^7.8.2",
    "@tailwindcss/typography": "^0.5.16",
    "@tanstack/eslint-plugin-query": "^5.83.1",
    "@testing-library/dom": "^10.4.1",
-    "@testing-library/jest-dom": "^6.7.0",
+    "@testing-library/jest-dom": "^6.8.0",
    "@testing-library/react": "^16.3.0",
    "@testing-library/user-event": "^14.6.1",
-    "@types/node": "^24.2.0",
-    "@types/react": "^19.1.9",
-    "@types/react-dom": "^19.1.7",
+    "@types/node": "^24.3.0",
+    "@types/react": "^19.1.11",
+    "@types/react-dom": "^19.1.8",
    "@types/react-highlight": "^0.12.8",
    "@types/react-syntax-highlighter": "^15.5.13",
    "@types/ws": "^8.18.1",
@@ -117,7 +117,7 @@
    "eslint-plugin-prettier": "^5.5.4",
    "eslint-plugin-react": "^7.37.5",
    "eslint-plugin-react-hooks": "^4.6.2",
-    "eslint-plugin-unused-imports": "^4.1.4",
+    "eslint-plugin-unused-imports": "^4.2.0",
    "husky": "^9.1.7",
    "jsdom": "^26.1.0",
    "lint-staged": "^16.1.4",
@@ -126,7 +126,7 @@
    "stripe": "^18.4.0",
    "tailwindcss": "^4.1.8",
    "typescript": "^5.9.2",
-    "vite-plugin-svgr": "^4.2.0",
+    "vite-plugin-svgr": "^4.5.0",
    "vite-tsconfig-paths": "^5.1.4",
    "vitest": "^3.0.2"
  },
@@ -726,6 +726,27 @@ class OpenHands {
    );
    return data;
  }
+
+  static async getMicroagentManagementConversations( // GET /api/microagent-management/conversations and return its `results` list
+    selectedRepository: string, // sent as the `selected_repository` query parameter
+    pageId?: string, // optional; sent as `page_id` only when truthy
+    limit: number = 100, // sent as the `limit` query parameter
+  ): Promise<Conversation[]> {
+    const params: Record<string, string | number> = {
+      limit,
+      selected_repository: selectedRepository,
+    };
+
+    if (pageId) {
+      params.page_id = pageId;
+    }
+
+    const { data } = await openHands.get<ResultSet<Conversation>>(
+      "/api/microagent-management/conversations",
+      { params },
+    );
+    return data.results; // unwraps the ResultSet: callers receive only `results`
+  }
 }

 export default OpenHands;
@@ -1,4 +1,5 @@
 import { useMemo } from "react";
+import { StylesConfig } from "react-select";
 import { Provider } from "../../types/settings";
 import { ReactSelectDropdown, SelectOption } from "./react-select-dropdown";

@@ -11,6 +12,8 @@ export interface GitProviderDropdownProps {
  disabled?: boolean;
  isLoading?: boolean;
  onChange?: (provider: Provider | null) => void;
+  classNamePrefix?: string;
+  styles?: StylesConfig<SelectOption, false>;
 }

 export function GitProviderDropdown({
@@ -22,6 +25,8 @@ export function GitProviderDropdown({
  disabled = false,
  isLoading = false,
  onChange,
+  classNamePrefix,
+  styles,
 }: GitProviderDropdownProps) {
  const options: SelectOption[] = useMemo(
    () =>
@@ -53,6 +58,8 @@ export function GitProviderDropdown({
      isSearchable={false}
      isLoading={isLoading}
      onChange={handleChange}
+      classNamePrefix={classNamePrefix}
+      styles={styles}
    />
  );
 }
@@ -1,5 +1,5 @@
 import { useMemo } from "react";
-import Select from "react-select";
+import Select, { StylesConfig } from "react-select";
 import { cn } from "#/utils/utils";
 import { SelectOptionBase, getCustomStyles } from "./react-select-styles";

@@ -17,6 +17,8 @@ export interface ReactSelectDropdownProps {
  isSearchable?: boolean;
  isLoading?: boolean;
  onChange?: (option: SelectOption | null) => void;
+  classNamePrefix?: string;
+  styles?: StylesConfig<SelectOption, false>;
 }

 export function ReactSelectDropdown({
@@ -31,6 +33,8 @@ export function ReactSelectDropdown({
  isSearchable = true,
  isLoading = false,
  onChange,
+  classNamePrefix,
+  styles,
 }: ReactSelectDropdownProps) {
  const customStyles = useMemo(() => getCustomStyles<SelectOption>(), []);

@@ -46,8 +50,9 @@ export function ReactSelectDropdown({
        isSearchable={isSearchable}
        isLoading={isLoading}
        onChange={onChange}
-        styles={customStyles}
+        styles={styles || customStyles}
        className="w-full"
+        classNamePrefix={classNamePrefix}
      />
      {errorMessage && (
        <p className="text-red-500 text-sm mt-1">{errorMessage}</p>
@@ -90,3 +90,26 @@ export const getCustomStyles = <T extends SelectOptionBase>(): StylesConfig<
    color: "#B7BDC2", // tertiary-light
  }),
 });
+
+export const getGitProviderMicroagentManagementCustomStyles = <
+  T extends SelectOptionBase,
+>(): StylesConfig<T, false> => ({
+  ...getCustomStyles<T>(), // start from the shared styles; only the `control` slot is replaced below
+  control: (provided, state) => ({
+    ...provided,
+    backgroundColor: state.isDisabled ? "#363636" : "#454545", // darker tertiary when disabled
+    border: "1px solid #717888",
+    borderRadius: "0.125rem",
+    minHeight: "2.5rem",
+    padding: "0 0.5rem",
+    boxShadow: "none",
+    opacity: state.isDisabled ? 0.6 : 1,
+    cursor: state.isDisabled ? "not-allowed" : "pointer",
+    "&:hover": {
+      borderColor: "#717888", // same colour as the resting border, so hovering does not change it
+    },
+    "& .git-provider-dropdown__value-container": { // NOTE(review): presumably relies on classNamePrefix="git-provider-dropdown" at the call site — confirm
+      padding: "2px 0",
+    },
+  }),
+});
@@ -9,6 +9,7 @@ import { CopyToClipboardButton } from "#/components/shared/buttons/copy-to-clipb
 import { anchor } from "../markdown/anchor";
 import { OpenHandsSourceType } from "#/types/core/base";
 import { paragraph } from "../markdown/paragraph";
+import { TooltipButton } from "#/components/shared/buttons/tooltip-button";

 interface ChatMessageProps {
  type: OpenHandsSourceType;
@@ -16,6 +17,7 @@ interface ChatMessageProps {
  actions?: Array<{
    icon: React.ReactNode;
    onClick: () => void;
+    tooltip?: string;
  }>;
 }

@@ -66,17 +68,35 @@ export function ChatMessage({
          "items-center gap-1",
        )}
      >
-        {actions?.map((action, index) => (
-          <button
-            key={index}
-            type="button"
-            onClick={action.onClick}
-            className="button-base p-1 cursor-pointer"
-            aria-label={`Action ${index + 1}`}
-          >
-            {action.icon}
-          </button>
-        ))}
+        {actions?.map((action, index) =>
+          action.tooltip ? (
+            <TooltipButton
+              key={index}
+              tooltip={action.tooltip}
+              ariaLabel={action.tooltip}
+              placement="top"
+            >
+              <button
+                type="button"
+                onClick={action.onClick}
+                className="button-base p-1 cursor-pointer"
+                aria-label={`Action ${index + 1}`}
+              >
+                {action.icon}
+              </button>
+            </TooltipButton>
+          ) : (
+            <button
+              key={index}
+              type="button"
+              onClick={action.onClick}
+              className="button-base p-1 cursor-pointer"
+              aria-label={`Action ${index + 1}`}
+            >
+              {action.icon}
+            </button>
+          ),
+        )}

        <CopyToClipboardButton
          isHidden={!isHovering}
@@ -72,6 +72,9 @@ const getRecallObservationContent = (event: RecallObservation): string => {
    if (event.extras.repo_instructions) {
      content += `\n\n**Repository Instructions:**\n\n${event.extras.repo_instructions}`;
    }
+    if (event.extras.conversation_instructions) {
+      content += `\n\n**Conversation Instructions:**\n\n${event.extras.conversation_instructions}`;
+    }
    if (event.extras.additional_agent_instructions) {
      content += `\n\n**Additional Instructions:**\n\n${event.extras.additional_agent_instructions}`;
    }
@@ -46,6 +46,7 @@ interface EventMessageProps {
  actions?: Array<{
    icon: React.ReactNode;
    onClick: () => void;
+    tooltip?: string;
  }>;
  isInLast10Actions: boolean;
 }
@@ -1,4 +1,5 @@
 import React from "react";
+import { useTranslation } from "react-i18next";
 import { createPortal } from "react-dom";
 import { OpenHandsAction } from "#/types/core/actions";
 import { OpenHandsObservation } from "#/types/core/observations";
@@ -24,6 +25,17 @@ import { AgentState } from "#/types/agent-state";
 import { getFirstPRUrl } from "#/utils/parse-pr-url";
 import MemoryIcon from "#/icons/memory_icon.svg?react";

+const isErrorEvent = (evt: unknown): evt is { error: true; message: string } => // type guard applied to incoming socket events
+  typeof evt === "object" &&
+  evt !== null &&
+  "error" in evt &&
+  evt.error === true; // NOTE: only `error === true` is checked; `message` is asserted by the type but not verified here
+
+const isAgentStatusError = (evt: unknown): boolean => // true only for an agent-state-change observation reporting AgentState.ERROR
+  isOpenHandsEvent(evt) &&
+  isAgentStateChangeObservation(evt) &&
+  evt.extras.agent_state === AgentState.ERROR;
+
 interface MessagesProps {
  messages: (OpenHandsAction | OpenHandsObservation)[];
  isAwaitingUserConfirmation: boolean;
@@ -31,8 +43,11 @@ interface MessagesProps {

 export const Messages: React.FC<MessagesProps> = React.memo(
  ({ messages, isAwaitingUserConfirmation }) => {
-    const { createConversationAndSubscribe, isPending } =
-      useCreateConversationAndSubscribeMultiple();
+    const {
+      createConversationAndSubscribe,
+      isPending,
+      unsubscribeFromConversation,
+    } = useCreateConversationAndSubscribeMultiple();
    const { getOptimisticUserMessage } = useOptimisticUserMessage();
    const { conversationId } = useConversationId();
    const { data: conversation } = useUserConversation(conversationId);
@@ -48,6 +63,8 @@ export const Messages: React.FC<MessagesProps> = React.memo(
      EventMicroagentStatus[]
    >([]);

+    const { t } = useTranslation();
+
    const actionHasObservationPair = React.useCallback(
      (event: OpenHandsAction | OpenHandsObservation): boolean => {
        if (isOpenHandsAction(event)) {
@@ -93,20 +110,6 @@ export const Messages: React.FC<MessagesProps> = React.memo(

    const handleMicroagentEvent = React.useCallback(
      (socketEvent: unknown, microagentConversationId: string) => {
-        // Handle error events
-        const isErrorEvent = (
-          evt: unknown,
-        ): evt is { error: true; message: string } =>
-          typeof evt === "object" &&
-          evt !== null &&
-          "error" in evt &&
-          evt.error === true;
-
-        const isAgentStatusError = (evt: unknown): boolean =>
-          isOpenHandsEvent(evt) &&
-          isAgentStateChangeObservation(evt) &&
-          evt.extras.agent_state === AgentState.ERROR;
-
        if (isErrorEvent(socketEvent) || isAgentStatusError(socketEvent)) {
          setMicroagentStatuses((prev) =>
            prev.map((statusEntry) =>
@@ -119,7 +122,11 @@ export const Messages: React.FC<MessagesProps> = React.memo(
          isOpenHandsEvent(socketEvent) &&
          isAgentStateChangeObservation(socketEvent)
        ) {
-          if (socketEvent.extras.agent_state === AgentState.FINISHED) {
+          // Handle completion states
+          if (
+            socketEvent.extras.agent_state === AgentState.FINISHED ||
+            socketEvent.extras.agent_state === AgentState.AWAITING_USER_INPUT
+          ) {
            setMicroagentStatuses((prev) =>
              prev.map((statusEntry) =>
                statusEntry.conversationId === microagentConversationId
@@ -127,6 +134,8 @@ export const Messages: React.FC<MessagesProps> = React.memo(
                  : statusEntry,
              ),
            );
+
+            unsubscribeFromConversation(microagentConversationId);
          }
        } else if (
          isOpenHandsEvent(socketEvent) &&
@@ -147,9 +156,27 @@ export const Messages: React.FC<MessagesProps> = React.memo(
              ),
            );
          }
+
+          unsubscribeFromConversation(microagentConversationId);
+        } else {
+          // For any other event, transition from WAITING to CREATING if still waiting
+          setMicroagentStatuses((prev) => {
+            const currentStatus = prev.find(
+              (entry) => entry.conversationId === microagentConversationId,
+            )?.status;
+
+            if (currentStatus === MicroagentStatus.WAITING) {
+              return prev.map((statusEntry) =>
+                statusEntry.conversationId === microagentConversationId
+                  ? { ...statusEntry, status: MicroagentStatus.CREATING }
+                  : statusEntry,
+              );
+            }
+            return prev; // No change needed
+          });
        }
      },
-      [setMicroagentStatuses],
+      [setMicroagentStatuses, unsubscribeFromConversation],
    );

    const handleLaunchMicroagent = (
@@ -178,13 +205,13 @@ export const Messages: React.FC<MessagesProps> = React.memo(
        },
        onSuccessCallback: (newConversationId: string) => {
          setShowLaunchMicroagentModal(false);
-          // Update status with conversation ID
+          // Update status with conversation ID - start with WAITING
          setMicroagentStatuses((prev) => [
            ...prev.filter((status) => status.eventId !== selectedEventId),
            {
              eventId: selectedEventId,
              conversationId: newConversationId,
-              status: MicroagentStatus.CREATING,
+              status: MicroagentStatus.WAITING,
            },
          ]);
        },
@@ -219,6 +246,7 @@ export const Messages: React.FC<MessagesProps> = React.memo(
                        setSelectedEventId(message.id);
                        setShowLaunchMicroagentModal(true);
                      },
+                      tooltip: t("MICROAGENT$ADD_TO_MEMORY"),
                    },
                  ]
                : undefined
@@ -76,6 +76,10 @@ export function LaunchMicroagentModal({
            </button>
          </div>

+          <span className="text-sm text-[#A3A3A3] font-normal leading-5">
+            {t("MICROAGENT$DEFINITION")}
+          </span>
+
          <form
            data-testid="launch-microagent-modal"
            onSubmit={onSubmit}
@@ -19,6 +19,8 @@ export function MicroagentStatusIndicator({

  const getStatusText = () => {
    switch (status) {
+      case MicroagentStatus.WAITING:
+        return t("MICROAGENT$STATUS_WAITING");
      case MicroagentStatus.CREATING:
        return t("MICROAGENT$STATUS_CREATING");
      case MicroagentStatus.COMPLETED:
@@ -35,6 +37,8 @@ export function MicroagentStatusIndicator({

  const getStatusIcon = () => {
    switch (status) {
+      case MicroagentStatus.WAITING:
+        return <Spinner size="sm" />;
      case MicroagentStatus.CREATING:
        return <Spinner size="sm" />;
      case MicroagentStatus.COMPLETED:
@@ -10,6 +10,11 @@ interface ConversationCreatedToastProps {
  onClose: () => void;
 }

+interface ConversationStartingToastProps {
+  conversationId: string; // used to build the /conversations/<id> link shown in the toast
+  onClose: () => void; // called when the toast's close button is clicked
+}
+
 function ConversationCreatedToast({
  conversationId,
  onClose,
@@ -37,6 +42,33 @@ function ConversationCreatedToast({
  );
 }

+function ConversationStartingToast({ // toast content: spinner, MICROAGENT$CONVERSATION_STARTING text, link to the conversation, close button
+  conversationId, // interpolated into the link href below
+  onClose, // wired to the close button's onClick
+}: ConversationStartingToastProps) {
+  const { t } = useTranslation();
+  return (
+    <div className="flex items-start gap-2">
+      <Spinner size="sm" />
+      <div>
+        {t("MICROAGENT$CONVERSATION_STARTING")}
+        <br />
+        <a
+          href={`/conversations/${conversationId}`}
+          target="_blank"
+          rel="noopener noreferrer"
+          className="underline"
+        >
+          {t("MICROAGENT$VIEW_CONVERSATION")}
+        </a>
+      </div>
+      <button type="button" onClick={onClose}>
+        <CloseIcon />
+      </button>
+    </div>
+  );
+}
+
 interface ConversationFinishedToastProps {
  conversationId: string;
  onClose: () => void;
@@ -78,10 +110,18 @@ function ConversationErroredToast({
  errorMessage,
  onClose,
 }: ConversationErroredToastProps) {
+  const { t } = useTranslation();
+
+  // Check if the error message is a translation key
+  const displayMessage =
+    errorMessage === "MICROAGENT$UNKNOWN_ERROR"
+      ? t(errorMessage)
+      : errorMessage;
+
  return (
    <div className="flex items-start gap-2">
      <SuccessIndicator status="error" />
-      <div>{errorMessage}</div>
+      <div>{displayMessage}</div>
      <button type="button" onClick={onClose}>
        <CloseIcon />
      </button>
@@ -136,3 +176,18 @@ export const renderConversationErroredToast = (
      duration: 5000,
    },
  );
+
+export const renderConversationStartingToast = (conversationId: string) => // shows ConversationStartingToast as a custom toast and returns toast()'s result
+  toast(
+    (toastInstance) => (
+      <ConversationStartingToast
+        conversationId={conversationId}
+        onClose={() => toast.dismiss(toastInstance.id)}
+      />
+    ),
+    {
+      ...TOAST_OPTIONS,
+      id: `starting-${conversationId}`, // fixed id per conversation; NOTE(review): presumably lets a repeat call reuse the same toast instead of stacking — confirm
+      duration: 10000, // Show for 10 seconds or until dismissed
+    },
+  );
@@ -7,11 +7,10 @@ import { ConversationCard } from "../conversation-panel/conversation-card";
 import { Provider } from "#/types/settings";

 interface ControlsProps {
-  setSecurityOpen: (isOpen: boolean) => void;
  showSecurityLock: boolean;
 }

-export function Controls({ setSecurityOpen, showSecurityLock }: ControlsProps) {
+export function Controls({ showSecurityLock }: ControlsProps) {
  const { data: conversation } = useActiveConversation();
  const [contextMenuOpen, setContextMenuOpen] = React.useState(false);

@@ -21,9 +20,7 @@ export function Controls({ setSecurityOpen, showSecurityLock }: ControlsProps) {
        <AgentControlBar />
        <AgentStatusBar />

-        {showSecurityLock && (
-          <SecurityLock onClick={() => setSecurityOpen(true)} />
-        )}
+        {showSecurityLock && <SecurityLock />}
      </div>

      <ConversationCard
@@ -1,17 +1,28 @@
 import { IoLockClosed } from "react-icons/io5";
+import { Tooltip } from "@heroui/react";
+import { useTranslation } from "react-i18next";
+import { Link } from "react-router";
+import { I18nKey } from "#/i18n/declaration";

-interface SecurityLockProps {
-  onClick: () => void;
-}
+export function SecurityLock() {
+  const { t } = useTranslation();

-export function SecurityLock({ onClick }: SecurityLockProps) {
  return (
-    <div
-      className="cursor-pointer hover:opacity-80 transition-all"
-      style={{ marginRight: "8px" }}
-      onClick={onClick}
+    <Tooltip
+      content={
+        <div className="max-w-xs p-2">
+          {t(I18nKey.SETTINGS$CONFIRMATION_MODE_LOCK_TOOLTIP)}
+        </div>
+      }
+      placement="top"
    >
-      <IoLockClosed size={20} />
-    </div>
+      <Link
+        to="/settings"
+        className="mr-2 cursor-pointer hover:opacity-80 transition-all"
+        aria-label={t(I18nKey.SETTINGS$TITLE)}
+      >
+        <IoLockClosed size={20} />
+      </Link>
+    </Tooltip>
  );
 }
@@ -23,9 +23,9 @@ export function ConfirmStopModal({
    <ModalBackdrop>
      <ModalBody className="items-start border border-tertiary">
        <div className="flex flex-col gap-2">
-          <BaseModalTitle title={t(I18nKey.CONVERSATION$CONFIRM_STOP)} />
+          <BaseModalTitle title={t(I18nKey.CONVERSATION$CONFIRM_PAUSE)} />
          <BaseModalDescription
-            description={t(I18nKey.CONVERSATION$STOP_WARNING)}
+            description={t(I18nKey.CONVERSATION$PAUSE_WARNING)}
          />
        </div>
        <div
@@ -129,7 +129,7 @@ export function ConversationCardContextMenu({

      {onStop && (
        <ContextMenuListItem testId="stop-button" onClick={onStop}>
-          <ContextMenuIconText icon={Power} text={t(I18nKey.BUTTON$STOP)} />
+          <ContextMenuIconText icon={Power} text={t(I18nKey.BUTTON$PAUSE)} />
        </ContextMenuListItem>
      )}

@@ -1,4 +1,6 @@
 import { ConversationStatus } from "#/types/conversation-status";
+import ArchivedIcon from "./state-indicators/archived.svg?react";
+import ErrorIcon from "./state-indicators/error.svg?react";
 import RunningIcon from "./state-indicators/running.svg?react";
 import StartingIcon from "./state-indicators/starting.svg?react";
 import StoppedIcon from "./state-indicators/stopped.svg?react";
@@ -9,6 +11,8 @@ const CONVERSATION_STATUS_INDICATORS: Record<ConversationStatus, SVGIcon> = {
  STOPPED: StoppedIcon,
  RUNNING: RunningIcon,
  STARTING: StartingIcon,
+  ARCHIVED: ArchivedIcon,
+  ERROR: ErrorIcon,
 };

 interface ConversationStateIndicatorProps {
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px" fill="#A7A9AC"><path d="M0 0h24v24H0V0z" fill="none"/><path d="M17 7h-4v1.9h4c1.71 0 3.1 1.39 3.1 3.1 0 1.43-.98 2.63-2.31 2.98l1.46 1.46C20.88 15.61 22 13.95 22 12c0-2.76-2.24-5-5-5zm-1 4h-2.19l2 2H16zM2 4.27l3.11 3.11C3.29 8.12 2 9.91 2 12c0 2.76 2.24 5 5 5h4v-1.9H7c-1.71 0-3.1-1.39-3.1-3.1 0-1.59 1.21-2.9 2.76-3.07L8.73 11H8v2h2.73L13 15.27V17h1.73l4.01 4L20 19.74 3.27 3 2 4.27z"/><path d="M0 24V0" fill="none"/></svg>
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px" fill="#e7000b"><path d="M0 0h24v24H0z" fill="none"/><path d="M12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm1 15h-2v-2h2v2zm0-4h-2V7h2v6z"/></svg>
@@ -17,7 +17,7 @@ export function MicroagentManagementAccordionTitle({
        <TooltipButton
          tooltip={repository.full_name}
          ariaLabel={repository.full_name}
-          className="text-white text-base font-normal bg-transparent p-0 min-w-0 h-auto cursor-pointer truncate max-w-[200px] translate-y-[-1px]"
+          className="text-white text-base font-normal bg-transparent p-0 min-w-0 h-auto cursor-pointer truncate max-w-[194px] translate-y-[-1px]"
          testId="repository-name-tooltip"
          placement="bottom"
        >
@@ -32,6 +32,7 @@ import {
 } from "#/utils/custom-toast-handlers";
 import { getFirstPRUrl } from "#/utils/parse-pr-url";
 import { I18nKey } from "#/i18n/declaration";
+import { useUserProviders } from "#/hooks/use-user-providers";

 // Handle error events
 const isErrorEvent = (evt: unknown): evt is { error: true; message: string } =>
@@ -65,16 +66,10 @@ const getConversationInstructions = (
  gitProvider: Provider,
 ) => `Create a microagent for the repository ${repositoryName} by following the steps below:

- Step 1: Create a markdown file inside the .openhands/microagents folder with the name of the microagent (The microagent must be created in the .openhands/microagents folder and should be able to perform the described task when triggered).
-
- This is the instructions about what the microagent should do: ${formData.query}
-
-${
+- Step 1: Create a markdown file inside the .openhands/microagents folder with the name of the microagent (The microagent must be created in the .openhands/microagents folder and should be able to perform the described task when triggered). This is the instructions about what the microagent should do: ${formData.query}. ${
  formData.triggers && formData.triggers.length > 0
-    ? `
- This is the triggers of the microagent: ${formData.triggers.join(", ")}
-`
-    : "- Please be noted that the microagent doesn't have any triggers."
+    ? `This is the triggers of the microagent: ${formData.triggers.join(", ")}`
+    : "Please be noted that the microagent doesn't have any triggers."
 }

 - Step 2: Create a new branch for the repository ${repositoryName}, must avoid duplicated branches.
@@ -91,16 +86,10 @@ const getUpdateConversationInstructions = (
 ) => `Update the microagent for the repository ${repositoryName} by following the steps below:


- Step 1: Update the microagent. This is the path of the microagent: ${formData.microagentPath} (The updated microagent must be in the .openhands/microagents folder and should be able to perform the described task when triggered).
-
- This is the updated instructions about what the microagent should do: ${formData.query}
-
-${
+- Step 1: Update the microagent. This is the path of the microagent: ${formData.microagentPath} (The updated microagent must be in the .openhands/microagents folder and should be able to perform the described task when triggered). This is the updated instructions about what the microagent should do: ${formData.query}. ${
  formData.triggers && formData.triggers.length > 0
-    ? `
- This is the triggers of the microagent: ${formData.triggers.join(", ")}
-`
-    : "- Please be noted that the microagent doesn't have any triggers."
+    ? `This is the triggers of the microagent: ${formData.triggers.join(", ")}`
+    : "Please be noted that the microagent doesn't have any triggers."
 }

 - Step 2: Create a new branch for the repository ${repositoryName}, must avoid duplicated branches.
@@ -119,6 +108,8 @@ export function MicroagentManagementContent() {
    learnThisRepoModalVisible,
  } = useSelector((state: RootState) => state.microagentManagement);

+  const { providers } = useUserProviders();
+
  const { t } = useTranslation();

  const dispatch = useDispatch();
@@ -182,11 +173,7 @@ export function MicroagentManagementContent() {
      // Check if agent has finished and we have a PR
      if (isOpenHandsEvent(socketEvent) && isFinishAction(socketEvent)) {
        const prUrl = getFirstPRUrl(socketEvent.args.final_thought || "");
-        if (prUrl) {
-          displaySuccessToast(
-            t(I18nKey.MICROAGENT_MANAGEMENT$PR_READY_FOR_REVIEW),
-          );
-        } else {
+        if (!prUrl) {
          // Agent finished but no PR found
          displaySuccessToast(t(I18nKey.MICROAGENT_MANAGEMENT$PR_NOT_CREATED));
        }
@@ -290,6 +277,12 @@ export function MicroagentManagementContent() {
    const repositoryName = repository.full_name;
    const gitProvider = repository.git_provider;

+    const createMicroagent = {
+      repo: repositoryName,
+      git_provider: gitProvider,
+      title: formData.query,
+    };
+
    // Launch a new conversation to help the user understand the repo
    createConversationAndSubscribe({
      query: formData.query,
@@ -299,6 +292,7 @@ export function MicroagentManagementContent() {
        branch: formData.selectedBranch,
        gitProvider,
      },
+      createMicroagent,
      onSuccessCallback: () => {
        hideLearnThisRepoModal();
      },
@@ -329,11 +323,18 @@ export function MicroagentManagementContent() {
    </>
  );

+  const providersAreSet = providers.length > 0;
+
  if (width < 1024) {
    return (
      <div className="w-full h-full flex flex-col gap-6">
        <div className="w-full rounded-lg border border-[#525252] bg-[#24272E] max-h-[494px] min-h-[494px]">
-          <MicroagentManagementSidebar isSmallerScreen />
+          {providersAreSet && (
+            <MicroagentManagementSidebar
+              isSmallerScreen
+              providers={providers}
+            />
+          )}
        </div>
        <div className="w-full rounded-lg border border-[#525252] bg-[#24272E] flex-1 min-h-[494px]">
          <MicroagentManagementMain />
@@ -345,7 +346,7 @@ export function MicroagentManagementContent() {

  return (
    <div className="w-full h-full flex rounded-lg border border-[#525252] bg-[#24272E] overflow-hidden">
-      <MicroagentManagementSidebar />
+      {providersAreSet && <MicroagentManagementSidebar providers={providers} />}
      <div className="flex-1">
        <MicroagentManagementMain />
      </div>
@@ -8,7 +8,7 @@ import { BrandButton } from "../settings/brand-button";
 import { I18nKey } from "#/i18n/declaration";
 import { RootState } from "#/store";
 import XIcon from "#/icons/x.svg?react";
-import { cn } from "#/utils/utils";
+import { cn, getRepoMdCreatePrompt } from "#/utils/utils";
 import { LearnThisRepoFormData } from "#/types/microagent-management";
 import { Branch } from "#/types/git";
 import { useRepositoryBranches } from "#/hooks/query/use-repository-branches";
@@ -76,23 +76,25 @@ export function MicroagentManagementLearnThisRepoModal({
  const onSubmit = (event: React.FormEvent<HTMLFormElement>) => {
    event.preventDefault();

-    if (!query.trim()) {
-      return;
-    }
+    const finalQuery = getRepoMdCreatePrompt(
+      selectedRepository?.git_provider || "github",
+      query.trim(),
+    );

    onConfirm({
-      query: query.trim(),
+      query: finalQuery,
      selectedBranch: selectedBranch?.name || "",
    });
  };

  const handleConfirm = () => {
-    if (!query.trim()) {
-      return;
-    }
+    const finalQuery = getRepoMdCreatePrompt(
+      selectedRepository?.git_provider || "github",
+      query.trim(),
+    );

    onConfirm({
-      query: query.trim(),
+      query: finalQuery,
      selectedBranch: selectedBranch?.name || "",
    });
  };
@@ -244,7 +246,6 @@ export function MicroagentManagementLearnThisRepoModal({
            onClick={handleConfirm}
            testId="confirm-button"
            isDisabled={
-              !query.trim() ||
              isLoading ||
              isLoadingBranches ||
              !selectedBranch ||
@@ -59,8 +59,10 @@ export function MicroagentManagementMicroagentCard({
    if (runtimeStatus === "STATUS$ERROR") {
      return t(I18nKey.MICROAGENT$STATUS_ERROR);
    }
-    if (conversationStatus === "RUNNING" && runtimeStatus === "STATUS$READY") {
-      return t(I18nKey.MICROAGENT$STATUS_OPENING_PR);
+    if (conversationStatus === "RUNNING") {
+      return runtimeStatus === "STATUS$READY"
+        ? t(I18nKey.MICROAGENT$STATUS_OPENING_PR)
+        : t(I18nKey.COMMON$STARTING);
    }
    return "";
  }, [conversationStatus, runtimeStatus, t, hasPr]);
@@ -1,13 +1,16 @@
+import { useTranslation } from "react-i18next";
 import { useEffect } from "react";
 import { useDispatch, useSelector } from "react-redux";
 import { Spinner } from "@heroui/react";
 import { MicroagentManagementMicroagentCard } from "./microagent-management-microagent-card";
 import { MicroagentManagementLearnThisRepo } from "./microagent-management-learn-this-repo";
 import { useRepositoryMicroagents } from "#/hooks/query/use-repository-microagents";
-import { useSearchConversations } from "#/hooks/query/use-search-conversations";
+import { useMicroagentManagementConversations } from "#/hooks/query/use-microagent-management-conversations";
 import { GitRepository } from "#/types/git";
 import { RootState } from "#/store";
 import { setSelectedMicroagentItem } from "#/state/microagent-management-slice";
+import { cn } from "#/utils/utils";
+import { I18nKey } from "#/i18n/declaration";

 interface MicroagentManagementRepoMicroagentsProps {
  repository: GitRepository;
@@ -22,6 +25,8 @@ export function MicroagentManagementRepoMicroagents({

  const dispatch = useDispatch();

+  const { t } = useTranslation();
+
  const { full_name: repositoryName } = repository;

  // Extract owner and repo from repositoryName (format: "owner/repo")
@@ -37,9 +42,9 @@ export function MicroagentManagementRepoMicroagents({
    data: conversations,
    isLoading: isLoadingConversations,
    isError: isErrorConversations,
-  } = useSearchConversations(
+  } = useMicroagentManagementConversations(
    repositoryName,
-    "microagent_management",
+    undefined,
    1000,
    true,
  );
@@ -103,34 +108,47 @@ export function MicroagentManagementRepoMicroagents({
  const numberOfMicroagents = microagents?.length || 0;
  const numberOfConversations = conversations?.length || 0;
  const totalItems = numberOfMicroagents + numberOfConversations;
+  const hasMicroagents = numberOfMicroagents > 0;
+  const hasConversations = numberOfConversations > 0;

  return (
    <div>
      {totalItems === 0 && (
        <MicroagentManagementLearnThisRepo repository={repository} />
      )}
-
      {/* Render microagents */}
-      {numberOfMicroagents > 0 &&
-        microagents?.map((microagent) => (
-          <div key={microagent.name} className="pb-4 last:pb-0">
-            <MicroagentManagementMicroagentCard
-              microagent={microagent}
-              repository={repository}
-            />
-          </div>
-        ))}
+      {hasMicroagents && (
+        <div className="flex flex-col">
+          <span className="text-md text-white font-medium leading-5 mb-4">
+            {t(I18nKey.MICROAGENT_MANAGEMENT$EXISTING_MICROAGENTS)}
+          </span>
+          {microagents?.map((microagent) => (
+            <div key={microagent.name} className="pb-4 last:pb-0">
+              <MicroagentManagementMicroagentCard
+                microagent={microagent}
+                repository={repository}
+              />
+            </div>
+          ))}
+        </div>
+      )}

      {/* Render conversations */}
-      {numberOfConversations > 0 &&
-        conversations?.map((conversation) => (
-          <div key={conversation.conversation_id} className="pb-4 last:pb-0">
-            <MicroagentManagementMicroagentCard
-              conversation={conversation}
-              repository={repository}
-            />
-          </div>
-        ))}
+      {hasConversations && (
+        <div className={cn("flex flex-col", hasMicroagents && "mt-4")}>
+          <span className="text-md text-white font-medium leading-5 mb-4">
+            {t(I18nKey.MICROAGENT_MANAGEMENT$OPEN_MICROAGENT_PULL_REQUESTS)}
+          </span>
+          {conversations?.map((conversation) => (
+            <div key={conversation.conversation_id} className="pb-4 last:pb-0">
+              <MicroagentManagementMicroagentCard
+                conversation={conversation}
+                repository={repository}
+              />
+            </div>
+          ))}
+        </div>
+      )}
    </div>
  );
 }
@@ -1,15 +1,12 @@
-import { useState, useMemo } from "react";
 import { useTranslation } from "react-i18next";
 import { Accordion, AccordionItem } from "@heroui/react";
 import { MicroagentManagementRepoMicroagents } from "./microagent-management-repo-microagents";
 import { GitRepository } from "#/types/git";
-import { cn } from "#/utils/utils";
 import { TabType } from "#/types/microagent-management";
 import { MicroagentManagementNoRepositories } from "./microagent-management-no-repositories";
 import { I18nKey } from "#/i18n/declaration";
 import { DOCUMENTATION_URL } from "#/utils/constants";
 import { MicroagentManagementAccordionTitle } from "./microagent-management-accordion-title";
-import { sanitizeQuery } from "#/utils/sanitize-query";

 type MicroagentManagementRepositoriesProps = {
  repositories: GitRepository[];
@@ -21,23 +18,9 @@ export function MicroagentManagementRepositories({
  tabType,
 }: MicroagentManagementRepositoriesProps) {
  const { t } = useTranslation();
-  const [searchQuery, setSearchQuery] = useState("");

  const numberOfRepoMicroagents = repositories.length;

-  // Filter repositories based on search query
-  const filteredRepositories = useMemo(() => {
-    if (!searchQuery.trim()) {
-      return repositories;
-    }
-
-    const sanitizedQuery = sanitizeQuery(searchQuery);
-    return repositories.filter((repository) => {
-      const sanitizedRepoName = sanitizeQuery(repository.full_name);
-      return sanitizedRepoName.includes(sanitizedQuery);
-    });
-  }, [repositories, searchQuery]);
-
  if (numberOfRepoMicroagents === 0) {
    if (tabType === "personal") {
      return (
@@ -73,25 +56,6 @@ export function MicroagentManagementRepositories({

  return (
    <div className="flex flex-col gap-4 w-full">
-      {/* Search Input */}
-      <div className="flex flex-col gap-2 w-full">
-        <label htmlFor="repository-search" className="sr-only">
-          {t(I18nKey.COMMON$SEARCH_REPOSITORIES)}
-        </label>
-        <input
-          id="repository-search"
-          name="repository-search"
-          type="text"
-          placeholder={`${t(I18nKey.COMMON$SEARCH_REPOSITORIES)}...`}
-          value={searchQuery}
-          onChange={(e) => setSearchQuery(e.target.value)}
-          className={cn(
-            "bg-tertiary border border-[#717888] bg-[#454545] w-full rounded-sm p-2 placeholder:italic placeholder:text-tertiary-alt",
-            "disabled:bg-[#2D2F36] disabled:border-[#2D2F36] disabled:cursor-not-allowed",
-          )}
-        />
-      </div>
-
      {/* Repositories Accordion */}
      <Accordion
        variant="splitted"
@@ -104,7 +68,7 @@ export function MicroagentManagementRepositories({
        }}
        selectionMode="multiple"
      >
-        {filteredRepositories.map((repository) => (
+        {repositories.map((repository) => (
          <AccordionItem
            key={repository.id}
            aria-label={repository.full_name}
@@ -1,59 +1,109 @@
-import { useEffect } from "react";
+import { useEffect, useState, useMemo } from "react";
 import { useDispatch } from "react-redux";
 import { useTranslation } from "react-i18next";
 import { Spinner } from "@heroui/react";
 import { MicroagentManagementSidebarHeader } from "./microagent-management-sidebar-header";
 import { MicroagentManagementSidebarTabs } from "./microagent-management-sidebar-tabs";
-import { useUserRepositories } from "#/hooks/query/use-user-repositories";
-import { useUserProviders } from "#/hooks/use-user-providers";
+import { useGitRepositories } from "#/hooks/query/use-git-repositories";
+import { GitProviderDropdown } from "#/components/common/git-provider-dropdown";
 import {
  setPersonalRepositories,
  setOrganizationRepositories,
  setRepositories,
 } from "#/state/microagent-management-slice";
 import { GitRepository } from "#/types/git";
+import { Provider } from "#/types/settings";
 import { cn } from "#/utils/utils";
+import { sanitizeQuery } from "#/utils/sanitize-query";
+import { I18nKey } from "#/i18n/declaration";
+import { getGitProviderMicroagentManagementCustomStyles } from "#/components/common/react-select-styles";

 interface MicroagentManagementSidebarProps {
  isSmallerScreen?: boolean;
+  providers: Provider[];
 }

 export function MicroagentManagementSidebar({
  isSmallerScreen = false,
+  providers,
 }: MicroagentManagementSidebarProps) {
+  const [selectedProvider, setSelectedProvider] = useState<Provider | null>(
+    providers.length > 0 ? providers[0] : null,
+  );
+
+  const [searchQuery, setSearchQuery] = useState("");
+
  const dispatch = useDispatch();
+
  const { t } = useTranslation();
-  const { providers } = useUserProviders();
-  const selectedProvider = providers.length > 0 ? providers[0] : null;
-  const { data: repositories, isLoading } =
-    useUserRepositories(selectedProvider);
+
+  const { data: repositories, isLoading } = useGitRepositories({
+    provider: selectedProvider,
+    pageSize: 200,
+    enabled: !!selectedProvider,
+  });
+
+  // Auto-select the first available provider when none is selected yet
+  useEffect(() => {
+    if (providers.length > 0 && !selectedProvider) {
+      setSelectedProvider(providers[0]);
+    }
+  }, [providers, selectedProvider]);
+
+  const handleProviderChange = (provider: Provider | null) => {
+    setSelectedProvider(provider);
+    setSearchQuery("");
+  };
+
+  // Filter repositories based on search query
+  const filteredRepositories = useMemo(() => {
+    if (!repositories?.pages) return null;
+
+    // Flatten all pages to get all repositories
+    const allRepositories = repositories.pages.flatMap((page) => page.data);
+
+    if (!searchQuery.trim()) {
+      return allRepositories;
+    }
+
+    const sanitizedQuery = sanitizeQuery(searchQuery);
+    return allRepositories.filter((repository: GitRepository) => {
+      const sanitizedRepoName = sanitizeQuery(repository.full_name);
+      return sanitizedRepoName.includes(sanitizedQuery);
+    });
+  }, [repositories, searchQuery, selectedProvider]);

  useEffect(() => {
-    if (repositories?.pages) {
-      const personalRepos: GitRepository[] = [];
-      const organizationRepos: GitRepository[] = [];
-      const otherRepos: GitRepository[] = [];
-
-      // Flatten all pages to get all repositories
-      const allRepositories = repositories.pages.flatMap((page) => page.data);
-
-      allRepositories.forEach((repo: GitRepository) => {
-        const hasOpenHandsSuffix = repo.full_name.endsWith("/.openhands");
-
-        if (repo.owner_type === "user" && hasOpenHandsSuffix) {
-          personalRepos.push(repo);
-        } else if (repo.owner_type === "organization" && hasOpenHandsSuffix) {
-          organizationRepos.push(repo);
-        } else {
-          otherRepos.push(repo);
-        }
-      });
-
-      dispatch(setPersonalRepositories(personalRepos));
-      dispatch(setOrganizationRepositories(organizationRepos));
-      dispatch(setRepositories(otherRepos));
+    if (!filteredRepositories?.length) {
+      dispatch(setPersonalRepositories([]));
+      dispatch(setOrganizationRepositories([]));
+      dispatch(setRepositories([]));
+      return;
    }
-  }, [repositories, dispatch]);
+
+    const personalRepos: GitRepository[] = [];
+    const organizationRepos: GitRepository[] = [];
+    const otherRepos: GitRepository[] = [];
+
+    filteredRepositories.forEach((repo: GitRepository) => {
+      const hasOpenHandsSuffix =
+        selectedProvider === "gitlab"
+          ? repo.full_name.endsWith("/openhands-config")
+          : repo.full_name.endsWith("/.openhands");
+
+      if (repo.owner_type === "user" && hasOpenHandsSuffix) {
+        personalRepos.push(repo);
+      } else if (repo.owner_type === "organization" && hasOpenHandsSuffix) {
+        organizationRepos.push(repo);
+      } else {
+        otherRepos.push(repo);
+      }
+    });
+
+    dispatch(setPersonalRepositories(personalRepos));
+    dispatch(setOrganizationRepositories(organizationRepos));
+    dispatch(setRepositories(otherRepos));
+  }, [filteredRepositories, selectedProvider, dispatch]);

  return (
    <div
@@ -63,6 +113,41 @@ export function MicroagentManagementSidebar({
      )}
    >
      <MicroagentManagementSidebarHeader />
+
+      {/* Provider Selection */}
+      {providers.length > 1 && (
+        <div className="mt-6">
+          <GitProviderDropdown
+            providers={providers}
+            value={selectedProvider}
+            placeholder="Select Provider"
+            onChange={handleProviderChange}
+            className="w-full"
+            classNamePrefix="git-provider-dropdown"
+            styles={getGitProviderMicroagentManagementCustomStyles()}
+          />
+        </div>
+      )}
+
+      {/* Search Input */}
+      <div className="flex flex-col gap-2 w-full mt-6">
+        <label htmlFor="repository-search" className="sr-only">
+          {t(I18nKey.COMMON$SEARCH_REPOSITORIES)}
+        </label>
+        <input
+          id="repository-search"
+          name="repository-search"
+          type="text"
+          placeholder={`${t(I18nKey.COMMON$SEARCH_REPOSITORIES)}...`}
+          value={searchQuery}
+          onChange={(e) => setSearchQuery(e.target.value)}
+          className={cn(
+            "bg-tertiary border border-[#717888] bg-[#454545] w-full rounded-sm p-2 placeholder:italic placeholder:text-tertiary-alt",
+            "disabled:bg-[#2D2F36] disabled:border-[#2D2F36] disabled:cursor-not-allowed h-10 box-shadow-none outline-none",
+          )}
+        />
+      </div>
+
      {isLoading ? (
        <div className="flex flex-col items-center justify-center gap-4 flex-1">
          <Spinner size="sm" />
@@ -1,8 +1,7 @@
 import { Tooltip } from "@heroui/react";
 import { useTranslation } from "react-i18next";
-import ConfirmIcon from "#/assets/confirm";
-import RejectIcon from "#/assets/reject";
 import { I18nKey } from "#/i18n/declaration";
+import { cn } from "#/utils/utils";

 interface ActionTooltipProps {
  type: "confirm" | "reject";
@@ -12,25 +11,35 @@ interface ActionTooltipProps {
 export function ActionTooltip({ type, onClick }: ActionTooltipProps) {
  const { t } = useTranslation();

-  const content =
-    type === "confirm"
-      ? t(I18nKey.CHAT_INTERFACE$USER_CONFIRMED)
-      : t(I18nKey.CHAT_INTERFACE$USER_REJECTED);
+  const isConfirm = type === "confirm";
+
+  const ariaLabel = isConfirm
+    ? t(I18nKey.ACTION$CONFIRM)
+    : t(I18nKey.ACTION$REJECT);
+
+  const content = isConfirm
+    ? t(I18nKey.CHAT_INTERFACE$USER_CONFIRMED)
+    : t(I18nKey.CHAT_INTERFACE$USER_REJECTED);
+
+  const buttonLabel = isConfirm
+    ? `${t(I18nKey.CHAT_INTERFACE$INPUT_CONTINUE_MESSAGE)} ⌘↩`
+    : `${t(I18nKey.BUTTON$CANCEL)} ⇧⌘⌫`;

  return (
    <Tooltip content={content} closeDelay={100}>
      <button
        data-testid={`action-${type}-button`}
        type="button"
-        aria-label={
+        aria-label={ariaLabel}
+        className={cn(
+          "rounded px-2 h-6.5 text-sm font-medium leading-5 cursor-pointer hover:opacity-80",
          type === "confirm"
-            ? t(I18nKey.ACTION$CONFIRM)
-            : t(I18nKey.ACTION$REJECT)
-        }
-        className="bg-tertiary rounded-full p-1 hover:bg-base-secondary"
+            ? "bg-tertiary text-white"
+            : "bg-white text-[#0D0F11]",
+        )}
        onClick={onClick}
      >
-        {type === "confirm" ? <ConfirmIcon /> : <RejectIcon />}
+        {buttonLabel}
      </button>
    </Tooltip>
  );
@@ -1,31 +1,120 @@
+import { useDispatch, useSelector } from "react-redux";
+import { useCallback, useEffect } from "react";
 import { useTranslation } from "react-i18next";
 import { I18nKey } from "#/i18n/declaration";
 import { AgentState } from "#/types/agent-state";
 import { generateAgentStateChangeEvent } from "#/services/agent-state-service";
 import { useWsClient } from "#/context/ws-client-provider";
 import { ActionTooltip } from "../action-tooltip";
+import { isOpenHandsAction } from "#/types/core/guards";
+import { ActionSecurityRisk } from "#/state/security-analyzer-slice";
+import { RiskAlert } from "#/components/shared/risk-alert";
+import WarningIcon from "#/icons/u-warning.svg?react";
+import { RootState } from "#/store";
+import { addSubmittedEventId } from "#/state/event-message-slice";

 export function ConfirmationButtons() {
-  const { t } = useTranslation();
-  const { send } = useWsClient();
+  const submittedEventIds = useSelector(
+    (state: RootState) => state.eventMessage.submittedEventIds,
+  );

-  const handleStateChange = (state: AgentState) => {
-    const event = generateAgentStateChangeEvent(state);
-    send(event);
-  };
+  const dispatch = useDispatch();
+
+  const { t } = useTranslation();
+
+  const { send, parsedEvents } = useWsClient();
+
+  // Find the most recent action awaiting confirmation
+  const awaitingAction = parsedEvents
+    .slice()
+    .reverse()
+    .find((ev) => {
+      if (!isOpenHandsAction(ev) || ev.source !== "agent") return false;
+      const args = ev.args as Record<string, unknown>;
+      return args?.confirmation_state === "awaiting_confirmation";
+    });
+
+  // Records the pending action as submitted, then sends the new agent state.
+  const handleStateChange = useCallback(
+    (state: AgentState) => {
+      if (!awaitingAction) return;
+
+      dispatch(addSubmittedEventId(awaitingAction.id));
+      send(generateAgentStateChangeEvent(state));
+    },
+    // awaitingAction is a dependency so the callback never acts on a stale id.
+    [awaitingAction, dispatch, send],
+  );
+
+  // Handle keyboard shortcuts
+  useEffect(() => {
+    if (!awaitingAction) {
+      return undefined;
+    }
+
+    const handleCancelShortcut = (event: KeyboardEvent) => {
+      if (event.shiftKey && event.metaKey && event.key === "Backspace") {
+        event.preventDefault();
+        handleStateChange(AgentState.USER_REJECTED);
+      }
+    };
+
+    const handleContinueShortcut = (event: KeyboardEvent) => {
+      if (event.metaKey && event.key === "Enter") {
+        event.preventDefault();
+        handleStateChange(AgentState.USER_CONFIRMED);
+      }
+    };
+
+    const handleKeyDown = (event: KeyboardEvent) => {
+      // Cancel: Shift+Cmd+Backspace (⇧⌘⌫)
+      handleCancelShortcut(event);
+      // Continue: Cmd+Enter (⌘↩)
+      handleContinueShortcut(event);
+    };
+
+    document.addEventListener("keydown", handleKeyDown);
+
+    return () => document.removeEventListener("keydown", handleKeyDown);
+  }, [awaitingAction, handleStateChange]);
+
+  if (!awaitingAction || submittedEventIds.includes(awaitingAction.id)) {
+    return null;
+  }
+
+  const { args } = awaitingAction as { args: Record<string, unknown> };
+
+  const risk = args?.security_risk;
+
+  const isHighRisk =
+    typeof risk === "string"
+      ? risk.toLowerCase() === "high"
+      : Number(risk) === ActionSecurityRisk.HIGH;

  return (
-    <div className="flex justify-between items-center pt-4">
-      <p>{t(I18nKey.CHAT_INTERFACE$USER_ASK_CONFIRMATION)}</p>
-      <div className="flex items-center gap-3">
-        <ActionTooltip
-          type="confirm"
-          onClick={() => handleStateChange(AgentState.USER_CONFIRMED)}
-        />
-        <ActionTooltip
-          type="reject"
-          onClick={() => handleStateChange(AgentState.USER_REJECTED)}
+    <div className="flex flex-col gap-2 pt-4">
+      {isHighRisk && (
+        <RiskAlert
+          content={t(I18nKey.CHAT_INTERFACE$HIGH_RISK_WARNING)}
+          icon={<WarningIcon width={16} height={16} color="#fff" />}
+          severity="high"
+          title={t(I18nKey.COMMON$HIGH_RISK)}
        />
+      )}
+      <div className="flex justify-between items-center">
+        <p className="text-sm font-normal text-white">
+          {t(I18nKey.CHAT_INTERFACE$USER_ASK_CONFIRMATION)}
+        </p>
+        <div className="flex items-center gap-3">
+          <ActionTooltip
+            type="reject"
+            onClick={() => handleStateChange(AgentState.USER_REJECTED)}
+          />
+          <ActionTooltip
+            type="confirm"
+            onClick={() => handleStateChange(AgentState.USER_CONFIRMED)}
+          />
+        </div>
      </div>
    </div>
  );
@@ -93,14 +93,14 @@ function SecurityInvariant() {
    (risk: ActionSecurityRisk) => {
      switch (risk) {
        case ActionSecurityRisk.LOW:
-          return t(I18nKey.SECURITY_ANALYZER$LOW_RISK);
+          return t(I18nKey.SECURITY$LOW_RISK);
        case ActionSecurityRisk.MEDIUM:
-          return t(I18nKey.SECURITY_ANALYZER$MEDIUM_RISK);
+          return t(I18nKey.SECURITY$MEDIUM_RISK);
        case ActionSecurityRisk.HIGH:
-          return t(I18nKey.SECURITY_ANALYZER$HIGH_RISK);
+          return t(I18nKey.SECURITY$HIGH_RISK);
        case ActionSecurityRisk.UNKNOWN:
        default:
-          return t(I18nKey.SECURITY_ANALYZER$UNKNOWN_RISK);
+          return t(I18nKey.SECURITY$UNKNOWN_RISK);
      }
    },
    [t],
@@ -0,0 +1,36 @@
+import { ReactNode } from "react";
+import { cn } from "#/utils/utils";
+
+interface RiskAlertProps {
+  className?: string;
+  content: ReactNode;
+  icon?: ReactNode;
+  severity: "high" | "medium" | "low";
+  title: string;
+}
+
+export function RiskAlert({
+  className,
+  content,
+  icon,
+  severity,
+  title,
+}: RiskAlertProps) {
+  // Only the "high" severity is rendered for now; any other severity returns null. To support more risk levels, add them here (e.g. as cva variants of this component).
+  if (severity === "high") {
+    return (
+      <div
+        className={cn(
+          "flex items-center gap-3.5 bg-[#4A0709] border border-[#FF0006] text-red-400 rounded-xl px-3.5 h-13 text-sm text-white",
+          className,
+        )}
+      >
+        {icon && <span className="">{icon}</span>}
+        <span className="font-bold">{title}</span>
+        <span className="font-normal">{content}</span>
+      </div>
+    );
+  }
+
+  return null;
+}
@@ -33,6 +33,7 @@ interface ConversationSubscriptionsContextType {
    sessionApiKey: string | null;
    providersSet: ("github" | "gitlab" | "bitbucket" | "enterprise_sso")[];
    baseUrl: string;
+    socketPath?: string;
    onEvent?: (event: unknown, conversationId: string) => void;
  }) => void;
  unsubscribeFromConversation: (conversationId: string) => void;
@@ -95,10 +96,10 @@ export function ConversationSubscriptionsProvider({
    [],
  );

-  const unsubscribeFromConversation = useCallback(
-    (conversationId: string) => {
-      // Get a local reference to the socket data to avoid race conditions
-      const socketData = conversationSockets[conversationId];
+  const unsubscribeFromConversation = useCallback((conversationId: string) => {
+    // Use functional update to access current socket data and perform cleanup
+    setConversationSockets((prev) => {
+      const socketData = prev[conversationId];

      if (socketData) {
        const { socket } = socketData;
@@ -112,24 +113,23 @@ export function ConversationSubscriptionsProvider({
          socket.disconnect();
        }

-        // Update state to remove the socket
-        setConversationSockets((prev) => {
-          const newSockets = { ...prev };
-          delete newSockets[conversationId];
-          return newSockets;
-        });
-
-        // Remove from active IDs
-        setActiveConversationIds((prev) =>
-          prev.filter((id) => id !== conversationId),
-        );
-
        // Clean up event handler reference
        delete eventHandlersRef.current[conversationId];
+
+        // Remove the socket from state
+        const newSockets = { ...prev };
+        delete newSockets[conversationId];
+        return newSockets;
      }
-    },
-    [conversationSockets],
-  );
+
+      return prev; // No change if socket not found
+    });
+
+    // Remove from active IDs
+    setActiveConversationIds((prev) =>
+      prev.filter((id) => id !== conversationId),
+    );
+  }, []);

  const subscribeToConversation = useCallback(
    (options: {
@@ -137,10 +137,17 @@ export function ConversationSubscriptionsProvider({
      sessionApiKey: string | null;
      providersSet: ("github" | "gitlab" | "bitbucket" | "enterprise_sso")[];
      baseUrl: string;
+      socketPath?: string;
      onEvent?: (event: unknown, conversationId: string) => void;
    }) => {
-      const { conversationId, sessionApiKey, providersSet, baseUrl, onEvent } =
-        options;
+      const {
+        conversationId,
+        sessionApiKey,
+        providersSet,
+        baseUrl,
+        socketPath,
+        onEvent,
+      } = options;

      // If already subscribed, don't create a new subscription
      if (conversationSockets[conversationId]) {
@@ -173,9 +180,7 @@ export function ConversationSubscriptionsProvider({
        if (isErrorEvent(event) || isAgentStatusError(event)) {
          renderConversationErroredToast(
            conversationId,
-            isErrorEvent(event)
-              ? event.message
-              : "Unknown error, please try again",
+            isErrorEvent(event) ? event.message : "MICROAGENT$UNKNOWN_ERROR",
          );
        } else if (isStatusUpdate(event)) {
          if (event.type === "info" && event.id === "STATUS$STARTING_RUNTIME") {
@@ -199,6 +204,7 @@ export function ConversationSubscriptionsProvider({
        // Create socket connection
        const socket = io(baseUrl, {
          transports: ["websocket"],
+          path: socketPath ?? "/socket.io",
          query: {
            conversation_id: conversationId,
            session_api_key: sessionApiKey,
@@ -317,15 +317,24 @@ export function WsClientProvider({
      session_api_key: conversation.session_api_key, // Have to set here because socketio doesn't support custom headers. :(
    };

-    let baseUrl = null;
+    let baseUrl: string | null = null;
+    let socketPath: string;
    if (conversation.url && !conversation.url.startsWith("/")) {
-      baseUrl = new URL(conversation.url).host;
+      const u = new URL(conversation.url);
+      baseUrl = u.host;
+      const pathBeforeApi = u.pathname.split("/api/conversations")[0] || "/";
+      // Socket.IO server default path is /socket.io; prefix with pathBeforeApi for path mode
+      socketPath = `${pathBeforeApi.replace(/\/$/, "")}/socket.io`;
    } else {
-      baseUrl = import.meta.env.VITE_BACKEND_BASE_URL || window?.location.host;
+      baseUrl =
+        (import.meta.env.VITE_BACKEND_BASE_URL as string | undefined) ||
+        window?.location.host;
+      socketPath = "/socket.io";
    }

    sio = io(baseUrl, {
      transports: ["websocket"],
+      path: socketPath,
      query,
    });

@@ -19,6 +19,8 @@ const saveSettingsMutationFn = async (settings: Partial<PostSettings>) => {
        : settings.llm_api_key?.trim() || undefined,
    remote_runtime_resource_factor: settings.REMOTE_RUNTIME_RESOURCE_FACTOR,
    enable_default_condenser: settings.ENABLE_DEFAULT_CONDENSER,
+    condenser_max_size:
+      settings.CONDENSER_MAX_SIZE ?? DEFAULT_SETTINGS.CONDENSER_MAX_SIZE,
    enable_sound_notifications: settings.ENABLE_SOUND_NOTIFICATIONS,
    user_consents_to_analytics: settings.user_consents_to_analytics,
    provider_tokens_set: settings.PROVIDER_TOKENS_SET,
@@ -0,0 +1,39 @@
+import { useQuery } from "@tanstack/react-query";
+import OpenHands from "#/api/open-hands";
+
+/**
+ * Query hook that fetches the microagent-management conversations of a
+ * repository via `OpenHands.getMicroagentManagementConversations`.
+ *
+ * The query is enabled only while `selectedRepository` is non-empty. When
+ * `cacheDisabled` is true, `staleTime` and `gcTime` are both 0; otherwise
+ * they are 5 and 15 minutes respectively.
+ */
+export const useMicroagentManagementConversations = (
+  selectedRepository: string,
+  pageId?: string,
+  limit: number = 100,
+  cacheDisabled: boolean = false,
+) => {
+  const minuteMs = 1000 * 60;
+  const fetchConversations = () =>
+    OpenHands.getMicroagentManagementConversations(
+      selectedRepository,
+      pageId,
+      limit,
+    );
+
+  return useQuery({
+    queryKey: [
+      "conversations",
+      "microagent-management",
+      pageId,
+      limit,
+      selectedRepository,
+    ],
+    queryFn: fetchConversations,
+    enabled: Boolean(selectedRepository),
+    staleTime: cacheDisabled ? 0 : 5 * minuteMs,
+    gcTime: cacheDisabled ? 0 : 15 * minuteMs,
+  });
+};
@@ -22,6 +22,8 @@ const getSettingsQueryFn = async (): Promise<Settings> => {
    REMOTE_RUNTIME_RESOURCE_FACTOR: apiSettings.remote_runtime_resource_factor,
    PROVIDER_TOKENS_SET: apiSettings.provider_tokens_set,
    ENABLE_DEFAULT_CONDENSER: apiSettings.enable_default_condenser,
+    CONDENSER_MAX_SIZE:
+      apiSettings.condenser_max_size ?? DEFAULT_SETTINGS.CONDENSER_MAX_SIZE,
    ENABLE_SOUND_NOTIFICATIONS: apiSettings.enable_sound_notifications,
    ENABLE_PROACTIVE_CONVERSATION_STARTERS:
      apiSettings.enable_proactive_conversation_starters,
@@ -1,14 +1,27 @@
 import React from "react";
+import { useQueries, type Query } from "@tanstack/react-query";
+import toast from "react-hot-toast";
+import { AxiosError } from "axios";
 import { useCreateConversation } from "./mutation/use-create-conversation";
 import { useUserProviders } from "./use-user-providers";
 import { useConversationSubscriptions } from "#/context/conversation-subscriptions-provider";
 import { Provider } from "#/types/settings";
-import { CreateMicroagent } from "#/api/open-hands.types";
+import { CreateMicroagent, Conversation } from "#/api/open-hands.types";
+import OpenHands from "#/api/open-hands";
+import { renderConversationStartingToast } from "#/components/features/chat/microagent/microagent-status-toast";
+
+interface ConversationData {
+  conversationId: string;
+  sessionApiKey: string | null;
+  baseUrl: string;
+  socketPath: string;
+  onEventCallback?: (event: unknown, conversationId: string) => void;
+}

 /**
 * Custom hook to create a conversation and subscribe to it, supporting multiple subscriptions.
- * This extends the functionality of useCreateConversationAndSubscribe to allow subscribing to
- * multiple conversations simultaneously.
+ * This version waits for conversation status to be "RUNNING" before establishing WebSocket connection.
+ * Shows immediate toast feedback and polls conversation status until ready.
 */
 export const useCreateConversationAndSubscribeMultiple = () => {
  const { mutate: createConversation, isPending } = useCreateConversation();
@@ -20,6 +33,88 @@ export const useCreateConversationAndSubscribeMultiple = () => {
    activeConversationIds,
  } = useConversationSubscriptions();

+  // Store conversation data immediately after creation
+  const [createdConversations, setCreatedConversations] = React.useState<
+    Record<string, ConversationData>
+  >({});
+
+  // Get conversation IDs that need polling
+  const conversationIdsToWatch = Object.keys(createdConversations);
+
+  // Poll each conversation until it's ready
+  const conversationQueries = useQueries({
+    queries: conversationIdsToWatch.map((conversationId) => ({
+      queryKey: ["conversation-ready-poll", conversationId],
+      queryFn: () => OpenHands.getConversation(conversationId),
+      enabled: !!conversationId,
+      refetchInterval: (query: Query<Conversation | null, AxiosError>) => {
+        const status = query.state.data?.status;
+        if (status === "STARTING") {
+          return 3000; // Poll every 3 seconds while STARTING
+        }
+        return false; // Stop polling once not STARTING
+      },
+      retry: false,
+    })),
+  });
+
+  // Per-query status/data flags for the effect deps (note: these arrays are recreated on every render)
+  const queryStatuses = conversationQueries.map((query) => query.data?.status);
+  const queryDataExists = conversationQueries.map((query) => !!query.data);
+
+  // Effect to handle subscription when conversations are ready
+  React.useEffect(() => {
+    conversationQueries.forEach((query, index) => {
+      const conversationId = conversationIdsToWatch[index];
+      const conversationData = createdConversations[conversationId];
+
+      if (!query.data || !conversationData) return;
+
+      const { status, url, session_api_key: sessionApiKey } = query.data;
+
+      let { baseUrl } = conversationData;
+      if (url && !url.startsWith("/")) {
+        baseUrl = new URL(url).host;
+      }
+
+      if (status === "RUNNING") {
+        // Conversation is ready - subscribe to WebSocket
+        subscribeToConversation({
+          conversationId,
+          sessionApiKey,
+          providersSet: providers,
+          baseUrl,
+          socketPath: conversationData.socketPath,
+          onEvent: conversationData.onEventCallback,
+        });
+
+        // Remove from created conversations (cleanup)
+        setCreatedConversations((prev) => {
+          const newCreated = { ...prev };
+          delete newCreated[conversationId];
+          return newCreated;
+        });
+      } else if (status === "STOPPED") {
+        // Dismiss the starting toast
+        toast.dismiss(`starting-${conversationId}`);
+
+        // Remove from created conversations (cleanup)
+        setCreatedConversations((prev) => {
+          const newCreated = { ...prev };
+          delete newCreated[conversationId];
+          return newCreated;
+        });
+      }
+    });
+  }, [
+    queryStatuses,
+    queryDataExists,
+    conversationIdsToWatch,
+    createdConversations,
+    subscribeToConversation,
+    providers,
+  ]);
+
  const createConversationAndSubscribe = React.useCallback(
    ({
      query,
@@ -49,33 +144,46 @@ export const useCreateConversationAndSubscribeMultiple = () => {
        },
        {
          onSuccess: (data) => {
+            // Show a "starting" toast right away so the user gets feedback before the conversation is ready
+            renderConversationStartingToast(data.conversation_id);
+
+            // Notify the caller as soon as the conversation exists (before any WebSocket subscription)
+            if (onSuccessCallback) {
+              onSuccessCallback(data.conversation_id);
+            }
+
+            // Work out the socket host and path now; they are stored below rather than used to subscribe here
            let baseUrl = "";
+            let socketPath: string;
            if (data?.url && !data.url.startsWith("/")) {
-              baseUrl = new URL(data.url).host;
+              const u = new URL(data.url); // absolute URL: take host and path prefix from it
+              baseUrl = u.host;
+              const pathBeforeApi =
+                u.pathname.split("/api/conversations")[0] || "/"; // path prefix preceding "/api/conversations"
+              socketPath = `${pathBeforeApi.replace(/\/$/, "")}/socket.io`; // strip one trailing slash, then append "/socket.io"
            } else {
              baseUrl =
                (import.meta.env.VITE_BACKEND_BASE_URL as string | undefined) ||
                window?.location.host;
+              socketPath = "/socket.io"; // missing or relative URL: default socket path
            }

-            // Subscribe to the conversation
-            subscribeToConversation({
-              conversationId: data.conversation_id,
-              sessionApiKey: data.session_api_key,
-              providersSet: providers,
-              baseUrl,
-              onEvent: onEventCallback,
-            });
-
-            // Call the success callback if provided
-            if (onSuccessCallback) {
-              onSuccessCallback(data.conversation_id);
-            }
+            // Store conversation data, keyed by conversation id, for polling and eventual subscription
+            setCreatedConversations((prev) => ({
+              ...prev,
+              [data.conversation_id]: {
+                conversationId: data.conversation_id,
+                sessionApiKey: data.session_api_key,
+                baseUrl,
+                socketPath,
+                onEventCallback,
+              },
+            }));
          },
        },
      );
    },
-    [createConversation, subscribeToConversation, providers],
+    [createConversation],
  );

  return {
@@ -97,6 +97,8 @@ export enum I18nKey {
  SETTINGS$BASE_URL = "SETTINGS$BASE_URL",
  SETTINGS$AGENT = "SETTINGS$AGENT",
  SETTINGS$ENABLE_MEMORY_CONDENSATION = "SETTINGS$ENABLE_MEMORY_CONDENSATION",
+  SETTINGS$CONDENSER_MAX_SIZE = "SETTINGS$CONDENSER_MAX_SIZE",
+  SETTINGS$CONDENSER_MAX_SIZE_TOOLTIP = "SETTINGS$CONDENSER_MAX_SIZE_TOOLTIP",
  SETTINGS$LANGUAGE = "SETTINGS$LANGUAGE",
  ACTION$PUSH_TO_BRANCH = "ACTION$PUSH_TO_BRANCH",
  ACTION$PUSH_CREATE_PR = "ACTION$PUSH_CREATE_PR",
@@ -129,7 +131,6 @@ export enum I18nKey {
  CONVERSATION$REPOSITORY = "CONVERSATION$REPOSITORY",
  CONVERSATION$BRANCH = "CONVERSATION$BRANCH",
  CONVERSATION$GIT_PROVIDER = "CONVERSATION$GIT_PROVIDER",
-  ACCOUNT_SETTINGS$TITLE = "ACCOUNT_SETTINGS$TITLE",
  WORKSPACE$TERMINAL_TAB_LABEL = "WORKSPACE$TERMINAL_TAB_LABEL",
  WORKSPACE$BROWSER_TAB_LABEL = "WORKSPACE$BROWSER_TAB_LABEL",
  WORKSPACE$JUPYTER_TAB_LABEL = "WORKSPACE$JUPYTER_TAB_LABEL",
@@ -326,6 +327,7 @@ export enum I18nKey {
  USER$ACCOUNT_SETTINGS = "USER$ACCOUNT_SETTINGS",
  JUPYTER$OUTPUT_LABEL = "JUPYTER$OUTPUT_LABEL",
  BUTTON$STOP = "BUTTON$STOP",
+  BUTTON$PAUSE = "BUTTON$PAUSE",
  BUTTON$EDIT_TITLE = "BUTTON$EDIT_TITLE",
  BUTTON$DOWNLOAD_VIA_VSCODE = "BUTTON$DOWNLOAD_VIA_VSCODE",
  BUTTON$DISPLAY_COST = "BUTTON$DISPLAY_COST",
@@ -337,6 +339,8 @@ export enum I18nKey {
  LANDING$RECENT_CONVERSATION = "LANDING$RECENT_CONVERSATION",
  CONVERSATION$CONFIRM_DELETE = "CONVERSATION$CONFIRM_DELETE",
  CONVERSATION$CONFIRM_STOP = "CONVERSATION$CONFIRM_STOP",
+  CONVERSATION$CONFIRM_PAUSE = "CONVERSATION$CONFIRM_PAUSE",
+  CONVERSATION$PAUSE_WARNING = "CONVERSATION$PAUSE_WARNING",
  CONVERSATION$STOP_WARNING = "CONVERSATION$STOP_WARNING",
  CONVERSATION$METRICS_INFO = "CONVERSATION$METRICS_INFO",
  CONVERSATION$CREATED = "CONVERSATION$CREATED",
@@ -357,6 +361,7 @@ export enum I18nKey {
  CHAT_INTERFACE$INPUT_PLACEHOLDER = "CHAT_INTERFACE$INPUT_PLACEHOLDER",
  CHAT_INTERFACE$INPUT_CONTINUE_MESSAGE = "CHAT_INTERFACE$INPUT_CONTINUE_MESSAGE",
  CHAT_INTERFACE$USER_ASK_CONFIRMATION = "CHAT_INTERFACE$USER_ASK_CONFIRMATION",
+  CHAT_INTERFACE$HIGH_RISK_WARNING = "CHAT_INTERFACE$HIGH_RISK_WARNING",
  CHAT_INTERFACE$USER_CONFIRMED = "CHAT_INTERFACE$USER_CONFIRMED",
  CHAT_INTERFACE$USER_REJECTED = "CHAT_INTERFACE$USER_REJECTED",
  CHAT_INTERFACE$INPUT_SEND_MESSAGE_BUTTON_CONTENT = "CHAT_INTERFACE$INPUT_SEND_MESSAGE_BUTTON_CONTENT",
@@ -371,10 +376,6 @@ export enum I18nKey {
  CHAT_INTERFACE$MESSAGE_ARIA_LABEL = "CHAT_INTERFACE$MESSAGE_ARIA_LABEL",
  CHAT_INTERFACE$CHAT_CONVERSATION = "CHAT_INTERFACE$CHAT_CONVERSATION",
  CHAT_INTERFACE$UNKNOWN_SENDER = "CHAT_INTERFACE$UNKNOWN_SENDER",
-  SECURITY_ANALYZER$UNKNOWN_RISK = "SECURITY_ANALYZER$UNKNOWN_RISK",
-  SECURITY_ANALYZER$LOW_RISK = "SECURITY_ANALYZER$LOW_RISK",
-  SECURITY_ANALYZER$MEDIUM_RISK = "SECURITY_ANALYZER$MEDIUM_RISK",
-  SECURITY_ANALYZER$HIGH_RISK = "SECURITY_ANALYZER$HIGH_RISK",
  SETTINGS$MODEL_TOOLTIP = "SETTINGS$MODEL_TOOLTIP",
  SETTINGS$AGENT_TOOLTIP = "SETTINGS$AGENT_TOOLTIP",
  SETTINGS$LANGUAGE_TOOLTIP = "SETTINGS$LANGUAGE_TOOLTIP",
@@ -385,9 +386,12 @@ export enum I18nKey {
  SETTINGS$REFRESH_LLM_API_KEY = "SETTINGS$REFRESH_LLM_API_KEY",
  SETTINGS$CONFIRMATION_MODE = "SETTINGS$CONFIRMATION_MODE",
  SETTINGS$CONFIRMATION_MODE_TOOLTIP = "SETTINGS$CONFIRMATION_MODE_TOOLTIP",
+  SETTINGS$CONFIRMATION_MODE_LOCK_TOOLTIP = "SETTINGS$CONFIRMATION_MODE_LOCK_TOOLTIP",
  SETTINGS$AGENT_SELECT_ENABLED = "SETTINGS$AGENT_SELECT_ENABLED",
  SETTINGS$SECURITY_ANALYZER = "SETTINGS$SECURITY_ANALYZER",
  SETTINGS$SECURITY_ANALYZER_PLACEHOLDER = "SETTINGS$SECURITY_ANALYZER_PLACEHOLDER",
+  SETTINGS$SECURITY_ANALYZER_TOOLTIP = "SETTINGS$SECURITY_ANALYZER_TOOLTIP",
+  SETTINGS$SECURITY_ANALYZER_DESCRIPTION = "SETTINGS$SECURITY_ANALYZER_DESCRIPTION",
  SETTINGS$DONT_KNOW_API_KEY = "SETTINGS$DONT_KNOW_API_KEY",
  SETTINGS$CLICK_FOR_INSTRUCTIONS = "SETTINGS$CLICK_FOR_INSTRUCTIONS",
  SETTINGS$SAVED = "SETTINGS$SAVED",
@@ -474,7 +478,6 @@ export enum I18nKey {
  PROJECT_MENU_CARD_CONTEXT_MENU$DOWNLOAD_FILES_LABEL = "PROJECT_MENU_CARD_CONTEXT_MENU$DOWNLOAD_FILES_LABEL",
  PROJECT_MENU_CARD$OPEN = "PROJECT_MENU_CARD$OPEN",
  ACTION_BUTTON$RESUME = "ACTION_BUTTON$RESUME",
-  ACTION_BUTTON$PAUSE = "ACTION_BUTTON$PAUSE",
  BROWSER$SCREENSHOT_ALT = "BROWSER$SCREENSHOT_ALT",
  ERROR_TOAST$CLOSE_BUTTON_LABEL = "ERROR_TOAST$CLOSE_BUTTON_LABEL",
  FILE_EXPLORER$UPLOAD = "FILE_EXPLORER$UPLOAD",
@@ -513,7 +516,6 @@ export enum I18nKey {
  STATUS$CONNECTED = "STATUS$CONNECTED",
  BROWSER$NO_PAGE_LOADED = "BROWSER$NO_PAGE_LOADED",
  USER$AVATAR_PLACEHOLDER = "USER$AVATAR_PLACEHOLDER",
-  ACCOUNT_SETTINGS$SETTINGS = "ACCOUNT_SETTINGS$SETTINGS",
  ACCOUNT_SETTINGS$LOGOUT = "ACCOUNT_SETTINGS$LOGOUT",
  SETTINGS_FORM$ADVANCED_OPTIONS_LABEL = "SETTINGS_FORM$ADVANCED_OPTIONS_LABEL",
  CONVERSATION$NO_CONVERSATIONS = "CONVERSATION$NO_CONVERSATIONS",
@@ -573,8 +575,6 @@ export enum I18nKey {
  ENTERPRISE_SSO$CONNECT_TO_ENTERPRISE_SSO = "ENTERPRISE_SSO$CONNECT_TO_ENTERPRISE_SSO",
  AUTH$SIGN_IN_WITH_IDENTITY_PROVIDER = "AUTH$SIGN_IN_WITH_IDENTITY_PROVIDER",
  WAITLIST$JOIN_WAITLIST = "WAITLIST$JOIN_WAITLIST",
-  ACCOUNT_SETTINGS$ADDITIONAL_SETTINGS = "ACCOUNT_SETTINGS$ADDITIONAL_SETTINGS",
-  ACCOUNT_SETTINGS$DISCONNECT_FROM_GITHUB = "ACCOUNT_SETTINGS$DISCONNECT_FROM_GITHUB",
  CONVERSATION$DELETE_WARNING = "CONVERSATION$DELETE_WARNING",
  FEEDBACK$TITLE = "FEEDBACK$TITLE",
  FEEDBACK$DESCRIPTION = "FEEDBACK$DESCRIPTION",
@@ -781,8 +781,6 @@ export enum I18nKey {
  PROJECT_MANAGEMENT$SVC_ACC_EMAIL_VALIDATION_ERROR = "PROJECT_MANAGEMENT$SVC_ACC_EMAIL_VALIDATION_ERROR",
  PROJECT_MANAGEMENT$SVC_ACC_API_KEY_VALIDATION_ERROR = "PROJECT_MANAGEMENT$SVC_ACC_API_KEY_VALIDATION_ERROR",
  MICROAGENT_MANAGEMENT$ERROR_LOADING_MICROAGENT_CONTENT = "MICROAGENT_MANAGEMENT$ERROR_LOADING_MICROAGENT_CONTENT",
-  SETTINGS$MCP_ERROR_ENV_INVALID_FORMAT = "SETTINGS$MCP_ERROR_ENV_INVALID_FORMAT",
-  SETTINGS$MCP_ERROR_URL_DUPLICATE = "SETTINGS$MCP_ERROR_URL_DUPLICATE",
  SETTINGS$MCP_SERVER_TYPE_SSE = "SETTINGS$MCP_SERVER_TYPE_SSE",
  SETTINGS$MCP_SERVER_TYPE_STDIO = "SETTINGS$MCP_SERVER_TYPE_STDIO",
  SETTINGS$MCP_SERVER_TYPE_SHTTP = "SETTINGS$MCP_SERVER_TYPE_SHTTP",
@@ -794,6 +792,8 @@ export enum I18nKey {
  SETTINGS$MCP_ERROR_NAME_DUPLICATE = "SETTINGS$MCP_ERROR_NAME_DUPLICATE",
  SETTINGS$MCP_ERROR_COMMAND_REQUIRED = "SETTINGS$MCP_ERROR_COMMAND_REQUIRED",
  SETTINGS$MCP_ERROR_COMMAND_NO_SPACES = "SETTINGS$MCP_ERROR_COMMAND_NO_SPACES",
+  SETTINGS$MCP_ERROR_URL_DUPLICATE = "SETTINGS$MCP_ERROR_URL_DUPLICATE",
+  SETTINGS$MCP_ERROR_ENV_INVALID_FORMAT = "SETTINGS$MCP_ERROR_ENV_INVALID_FORMAT",
  SETTINGS$MCP_SERVER_TYPE = "SETTINGS$MCP_SERVER_TYPE",
  SETTINGS$MCP_API_KEY_PLACEHOLDER = "SETTINGS$MCP_API_KEY_PLACEHOLDER",
  SETTINGS$MCP_COMMAND_ARGUMENTS = "SETTINGS$MCP_COMMAND_ARGUMENTS",
@@ -814,4 +814,15 @@ export enum I18nKey {
  MICROAGENT_MANAGEMENT$PR_READY_FOR_REVIEW = "MICROAGENT_MANAGEMENT$PR_READY_FOR_REVIEW",
  MICROAGENT_MANAGEMENT$PR_NOT_CREATED = "MICROAGENT_MANAGEMENT$PR_NOT_CREATED",
  MICROAGENT_MANAGEMENT$ERROR_CREATING_MICROAGENT = "MICROAGENT_MANAGEMENT$ERROR_CREATING_MICROAGENT",
+  MICROAGENT$STATUS_WAITING = "MICROAGENT$STATUS_WAITING",
+  MICROAGENT$UNKNOWN_ERROR = "MICROAGENT$UNKNOWN_ERROR",
+  MICROAGENT$CONVERSATION_STARTING = "MICROAGENT$CONVERSATION_STARTING",
+  MICROAGENT_MANAGEMENT$EXISTING_MICROAGENTS = "MICROAGENT_MANAGEMENT$EXISTING_MICROAGENTS",
+  MICROAGENT_MANAGEMENT$OPEN_MICROAGENT_PULL_REQUESTS = "MICROAGENT_MANAGEMENT$OPEN_MICROAGENT_PULL_REQUESTS",
+  SETTINGS$SECURITY_ANALYZER_LLM_DEFAULT = "SETTINGS$SECURITY_ANALYZER_LLM_DEFAULT",
+  SETTINGS$SECURITY_ANALYZER_NONE = "SETTINGS$SECURITY_ANALYZER_NONE",
+  SETTINGS$SECURITY_ANALYZER_INVARIANT = "SETTINGS$SECURITY_ANALYZER_INVARIANT",
+  COMMON$HIGH_RISK = "COMMON$HIGH_RISK",
+  MICROAGENT$DEFINITION = "MICROAGENT$DEFINITION",
+  MICROAGENT$ADD_TO_MEMORY = "MICROAGENT$ADD_TO_MEMORY",
 }
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px" fill="#A7A9AC"><path d="M0 0h24v24H0V0z" fill="none"/><path d="M17 7h-4v1.9h4c1.71 0 3.1 1.39 3.1 3.1 0 1.43-.98 2.63-2.31 2.98l1.46 1.46C20.88 15.61 22 13.95 22 12c0-2.76-2.24-5-5-5zm-1 4h-2.19l2 2H16zM2 4.27l3.11 3.11C3.29 8.12 2 9.91 2 12c0 2.76 2.24 5 5 5h4v-1.9H7c-1.71 0-3.1-1.39-3.1-3.1 0-1.59 1.21-2.9 2.76-3.07L8.73 11H8v2h2.73L13 15.27V17h1.73l4.01 4L20 19.74 3.27 3 2 4.27z"/><path d="M0 24V0" fill="none"/></svg>
				`@@ -0,0 +1 @@`
				`<svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 0 24 24" width="24px" fill="#e7000b"><path d="M0 0h24v24H0z" fill="none"/><path d="M12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm1 15h-2v-2h2v2zm0-4h-2V7h2v6z"/></svg>`