chore: add back accidentally removed repo info (#5532 )

Fix: Redis listener attached at startup (#5516 )
Add docker layer caching to ghcr build (#5517 )
2026-04-29 03:00:45 -04:00 · 2024-12-12 05:51:05 +08:00 · 2024-12-11 09:39:57 -05:00 · 2024-12-11 09:39:09 -05:00 · 2024-12-11 01:05:29 -08:00 · 2024-12-10 21:55:02 +00:00
141 changed files with 1856 additions and 749 deletions
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -18,7 +18,7 @@ updates:
          - "chromadb"
      browsergym:
        patterns:
-          - "browsergym"
+          - "browsergym*"
      security-all:
        applies-to: "security-updates"
        patterns:
--- a/.github/workflows/fe-unit-tests.yml
+++ b/.github/workflows/fe-unit-tests.yml
@@ -24,7 +24,8 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        node-version: [20]
+        node-version: [20, 22]
+      fail-fast: true
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/.github/workflows/ghcr-build.yml
+++ b/.github/workflows/ghcr-build.yml
@@ -68,6 +68,9 @@ jobs:
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3
+      - name: "Set up docker layer caching"
+        uses: satackey/action-docker-layer-caching@v0.0.11
+        continue-on-error: true
      - name: Build and push app image
        if: "!github.event.pull_request.head.repo.fork"
        run: |
--- a/.github/workflows/openhands-resolver.yml
+++ b/.github/workflows/openhands-resolver.yml
@@ -16,6 +16,10 @@ on:
        type: string
        default: "main"
        description: "Target branch to pull and create PR against"
+      LLM_MODEL:
+        required: false
+        type: string
+        default: "anthropic/claude-3-5-sonnet-20241022"
      base_container_image:
        required: false
        type: string
@@ -23,15 +27,15 @@ on:
        description: "Custom sandbox env"
    secrets:
      LLM_MODEL:
-        required: true
+        required: false
      LLM_API_KEY:
        required: true
      LLM_BASE_URL:
        required: false
      PAT_TOKEN:
-        required: true
+        required: false
      PAT_USERNAME:
-        required: true
+        required: false

  issues:
    types: [labeled]
@@ -106,13 +110,14 @@ jobs:

      - name: Check required environment variables
        env:
-          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          PAT_TOKEN: ${{ secrets.PAT_TOKEN }}
          PAT_USERNAME: ${{ secrets.PAT_USERNAME }}
+          GITHUB_TOKEN: ${{ github.token }}
        run: |
-          required_vars=("LLM_MODEL" "LLM_API_KEY" "PAT_TOKEN" "PAT_USERNAME")
+          required_vars=("LLM_MODEL" "LLM_API_KEY")
          for var in "${required_vars[@]}"; do
            if [ -z "${!var}" ]; then
              echo "Error: Required environment variable $var is not set."
@@ -120,6 +125,19 @@ jobs:
            fi
          done

+          # Check optional variables and warn about fallbacks
+          if [ -z "$PAT_TOKEN" ]; then
+            echo "Warning: PAT_TOKEN is not set, falling back to GITHUB_TOKEN"
+          fi
+
+          if [ -z "$LLM_BASE_URL" ]; then
+            echo "Warning: LLM_BASE_URL is not set, will use default API endpoint"
+          fi
+
+          if [ -z "$PAT_USERNAME" ]; then
+            echo "Warning: PAT_USERNAME is not set, will use openhands-agent"
+          fi
+
      - name: Set environment variables
        run: |
          if [ -n "${{ github.event.review.body }}" ]; then
@@ -143,7 +161,7 @@ jobs:
          fi

          echo "MAX_ITERATIONS=${{ inputs.max_iterations || 50 }}" >> $GITHUB_ENV
-          echo "SANDBOX_ENV_GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }}" >> $GITHUB_ENV
+          echo "SANDBOX_ENV_GITHUB_TOKEN=${{ secrets.PAT_TOKEN || github.token }}" >> $GITHUB_ENV
          echo "SANDBOX_ENV_BASE_CONTAINER_IMAGE=${{ inputs.base_container_image }}" >> $GITHUB_ENV

          # Set branch variables
@@ -152,7 +170,7 @@ jobs:
      - name: Comment on issue with start message
        uses: actions/github-script@v7
        with:
-          github-token: ${{secrets.GITHUB_TOKEN}}
+          github-token: ${{ secrets.PAT_TOKEN || github.token }}
          script: |
            const issueType = process.env.ISSUE_TYPE;
            github.rest.issues.createComment({
@@ -177,9 +195,9 @@ jobs:

      - name: Attempt to resolve issue
        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          GITHUB_USERNAME: ${{ secrets.PAT_USERNAME }}
-          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          GITHUB_TOKEN: ${{ secrets.PAT_TOKEN || github.token }}
+          GITHUB_USERNAME: ${{ secrets.PAT_USERNAME || 'openhands-agent' }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          PYTHONPATH: ""
@@ -189,7 +207,7 @@ jobs:
            --issue-number ${{ env.ISSUE_NUMBER }} \
            --issue-type ${{ env.ISSUE_TYPE }} \
            --max-iterations ${{ env.MAX_ITERATIONS }} \
-            --comment-id ${{ env.COMMENT_ID }} \
+            --comment-id ${{ env.COMMENT_ID }}

      - name: Check resolution result
        id: check_result
@@ -211,9 +229,9 @@ jobs:
      - name: Create draft PR or push branch
        if: always() # Create PR or branch even if the previous steps fail
        env:
-          GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
-          GITHUB_USERNAME: ${{ secrets.PAT_USERNAME }}
-          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          GITHUB_TOKEN: ${{ secrets.PAT_TOKEN || github.token }}
+          GITHUB_USERNAME: ${{ secrets.PAT_USERNAME || 'openhands-agent' }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL || inputs.LLM_MODEL }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          PYTHONPATH: ""
@@ -221,7 +239,8 @@ jobs:
          if [ "${{ steps.check_result.outputs.RESOLUTION_SUCCESS }}" == "true" ]; then
            cd /tmp && python -m openhands.resolver.send_pull_request \
              --issue-number ${{ env.ISSUE_NUMBER }} \
-              --pr-type draft | tee pr_result.txt && \
+              --pr-type draft \
+              --reviewer ${{ github.actor }} | tee pr_result.txt && \
              grep "draft created" pr_result.txt | sed 's/.*\///g' > pr_number.txt
          else
            cd /tmp && python -m openhands.resolver.send_pull_request \
@@ -235,7 +254,7 @@ jobs:
        uses: actions/github-script@v7
        if: always() # Comment on issue even if the previous steps fail
        with:
-          github-token: ${{secrets.GITHUB_TOKEN}}
+          github-token: ${{ secrets.PAT_TOKEN || github.token }}
          script: |
            const fs = require('fs');
            const issueNumber = ${{ env.ISSUE_NUMBER }};
--- a/.github/workflows/py-unit-tests.yml
+++ b/.github/workflows/py-unit-tests.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Build Environment
        run: make build
      - name: Run Tests
-        run: poetry run pytest --forked --cov=openhands --cov-report=xml -svv ./tests/unit --ignore=tests/unit/test_memory.py
+        run: poetry run pytest --forked -n auto --cov=openhands --cov-report=xml -svv ./tests/unit --ignore=tests/unit/test_memory.py
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
--- a/Development.md
+++ b/Development.md
@@ -100,7 +100,7 @@ poetry run pytest ./tests/unit/test_*.py
 To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by
 setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.

-Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.14-nikolaik`
+Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.15-nikolaik`

 ## Develop inside Docker container

--- a/README.md
+++ b/README.md
@@ -38,16 +38,16 @@ See the [Installation](https://docs.all-hands.dev/modules/usage/installation) gu
 system requirements and more information.

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik

 docker run -it --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.14
+    docker.all-hands.dev/all-hands-ai/openhands:0.15
 ```

 You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
--- a/compose.yml
+++ b/compose.yml
@@ -7,7 +7,7 @@ services:
    image: openhands:latest
    container_name: openhands-app-${DATE:-}
    environment:
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.14-nikolaik}
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.15-nikolaik}
      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
    ports:
--- a/config.template.toml
+++ b/config.template.toml
@@ -95,10 +95,10 @@ workspace_base = "./workspace"
 # AWS secret access key
 #aws_secret_access_key = ""

-# API key to use
+# API key to use (For Headless / CLI only -  In Web this is overridden by Session Init)
 api_key = "your-api-key"

-# API base URL
+# API base URL (For Headless / CLI only -  In Web this is overridden by Session Init)
 #base_url = ""

 # API version
@@ -131,7 +131,7 @@ embedding_model = "local"
 # Maximum number of output tokens
 #max_output_tokens = 0

-# Model to use
+# Model to use (For Headless / CLI only -  In Web this is overridden by Session Init)
 model = "gpt-4o"

 # Number of retries to attempt when an operation fails with the LLM.
@@ -237,10 +237,10 @@ llm_config = 'gpt3'
 ##############################################################################
 [security]

-# Enable confirmation mode
+# Enable confirmation mode (For Headless / CLI only -  In Web this is overridden by Session Init)
 #confirmation_mode = false

-# The security analyzer to use
+# The security analyzer to use (For Headless / CLI only -  In Web this is overridden by Session Init)
 #security_analyzer = ""

 #################################### Eval ####################################
--- a/containers/dev/compose.yml
+++ b/containers/dev/compose.yml
@@ -11,7 +11,7 @@ services:
      - BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
      - SANDBOX_API_HOSTNAME=host.docker.internal
      #
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.14-nikolaik}
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.15-nikolaik}
      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
    ports:
--- a/docs/modules/usage/how-to/cli-mode.md
+++ b/docs/modules/usage/how-to/cli-mode.md
@@ -50,7 +50,7 @@ LLM_API_KEY="sk_test_12345"
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -59,7 +59,7 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.14 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.15 \
    python -m openhands.core.cli
 ```

--- a/docs/modules/usage/how-to/gui-mode.md
+++ b/docs/modules/usage/how-to/gui-mode.md
@@ -23,10 +23,75 @@ OpenHands provides a user-friendly Graphical User Interface (GUI) mode for inter

 OpenHands automatically exports a `GITHUB_TOKEN` to the shell environment if it is available. This can happen in two ways:

-1. Locally (OSS): The user directly inputs their GitHub token.
-2. Online (SaaS): The token is obtained through GitHub OAuth authentication.
+1. **Locally (OSS)**: The user directly inputs their GitHub token
+2. **Online (SaaS)**: The token is obtained through GitHub OAuth authentication

-When you reach the `/app` route, the app checks if a token is present. If it finds one, it sets it in the environment for the agent to use.
+#### Setting Up a Local GitHub Token
+
+1. **Generate a Personal Access Token (PAT)**:
+   - Go to GitHub Settings > Developer Settings > Personal Access Tokens > Tokens (classic)
+   - Click "Generate new token (classic)"
+   - Required scopes:
+     - `repo` (Full control of private repositories)
+     - `workflow` (Update GitHub Action workflows)
+     - `read:org` (Read organization data)
+
+2. **Enter Token in OpenHands**:
+   - Click the Settings button (gear icon) in the top right
+   - Navigate to the "GitHub" section
+   - Paste your token in the "GitHub Token" field
+   - Click "Save" to apply the changes
+
+#### Organizational Token Policies
+
+If you're working with organizational repositories, additional setup may be required:
+
+1. **Check Organization Requirements**:
+   - Organization admins may enforce specific token policies
+   - Some organizations require tokens to be created with SSO enabled
+   - Review your organization's [token policy settings](https://docs.github.com/en/organizations/managing-programmatic-access-to-your-organization/setting-a-personal-access-token-policy-for-your-organization)
+
+2. **Verify Organization Access**:
+   - Go to your token settings on GitHub
+   - Look for the organization under "Organization access"
+   - If required, click "Enable SSO" next to your organization
+   - Complete the SSO authorization process
+
+#### OAuth Authentication (Online Mode)
+
+When using OpenHands in online mode, the GitHub OAuth flow:
+
+1. Requests the following permissions:
+   - Repository access (read/write)
+   - Workflow management
+   - Organization read access
+
+2. Authentication steps:
+   - Click "Sign in with GitHub" when prompted
+   - Review the requested permissions
+   - Authorize OpenHands to access your GitHub account
+   - If using an organization, authorize organization access if prompted
+
+#### Troubleshooting
+
+Common issues and solutions:
+
+1. **Token Not Recognized**:
+   - Ensure the token is properly saved in settings
+   - Check that the token hasn't expired
+   - Verify the token has the required scopes
+   - Try regenerating the token
+
+2. **Organization Access Denied**:
+   - Check if SSO is required but not enabled
+   - Verify organization membership
+   - Contact organization admin if token policies are blocking access
+
+3. **Verifying Token Works**:
+   - The app will show a green checkmark if the token is valid
+   - Try accessing a repository to confirm permissions
+   - Check the browser console for any error messages
+   - Use the "Test Connection" button in settings if available

 ### Advanced Settings

--- a/docs/modules/usage/how-to/headless-mode.md
+++ b/docs/modules/usage/how-to/headless-mode.md
@@ -44,7 +44,7 @@ LLM_API_KEY="sk_test_12345"
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -54,6 +54,6 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.14 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.15 \
    python -m openhands.core.main -t "write a bash script that prints hi"
 ```
--- a/docs/modules/usage/installation.mdx
+++ b/docs/modules/usage/installation.mdx
@@ -11,16 +11,16 @@
 The easiest way to run OpenHands is in Docker.

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.14
+    docker.all-hands.dev/all-hands-ai/openhands:0.15
 ```

 You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), or using the [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action).
--- a/docs/modules/usage/runtimes.md
+++ b/docs/modules/usage/runtimes.md
@@ -16,7 +16,7 @@ some flags being passed to `docker run` that make this possible:

 ```
 docker run # ...
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.11-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
    -v /var/run/docker.sock:/var/run/docker.sock \
    # ...
 ```
--- a/docs/modules/usage/troubleshooting/troubleshooting.md
+++ b/docs/modules/usage/troubleshooting/troubleshooting.md
@@ -17,6 +17,7 @@ Check out [Notes for WSL on Windows Users](troubleshooting/windows) for some tro
 * [`make build` getting stuck on package installations](#make-build-getting-stuck-on-package-installations)
 * [Sessions are not restored](#sessions-are-not-restored)
 * [Connection to host.docker.internal timed out](#connection-to-host-docker-internal-timed-out)
+* [Error building runtime docker image](#error-building-runtime-docker-image)

 ### Unable to connect to Docker

@@ -178,3 +179,21 @@ which OpenHands makes use of when the main server is running inside a docker con
 * [Install Docker Desktop](https://www.docker.com/products/docker-desktop/)
 * Run OpenHands in [Development Mode](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md),
  So that the main server is not run inside a container, but still creates dockerized runtime sandboxes.
+
+---
+### Error building runtime docker image
+
+**Symptoms**
+Attempts to start a new session fail, and errors with terms like the following appear in the logs:
+* `debian-security bookworm-security`
+* `InRelease At least one invalid signature was encountered.`
+
+This seems to happen when the hash of an existing external library changes and your local docker instance has
+cached a previous version. To work around this, please try the following:
+
+* Stop any containers where the name has the prefix `openhands-runtime-` :
+  `docker ps --filter name=openhands-runtime- --filter status=running -aq | xargs docker stop`
+* Remove any images where the name has the prefix `openhands-runtime-` :
+  `docker rmi $(docker images --filter name=openhands-runtime- -q --no-trunc)`
+* Stop and Remove any containers / images where the name has the prefix `openhands-runtime-`
+* Prune containers / images : `docker container prune -f && docker image prune -f`
--- a/docs/package-lock.json
+++ b/docs/package-lock.json
@@ -17,8 +17,8 @@
        "prism-react-renderer": "^2.4.0",
        "react": "^18.3.1",
        "react-dom": "^18.3.1",
-        "react-icons": "^5.3.0",
-        "react-use": "^17.5.1"
+        "react-icons": "^5.4.0",
+        "react-use": "^17.6.0"
      },
      "devDependencies": {
        "@docusaurus/module-type-aliases": "^3.5.1",
@@ -15155,9 +15155,10 @@
      }
    },
    "node_modules/react-icons": {
-      "version": "5.3.0",
-      "resolved": "https://registry.npmjs.org/react-icons/-/react-icons-5.3.0.tgz",
-      "integrity": "sha512-DnUk8aFbTyQPSkCfF8dbX6kQjXA9DktMeJqfjrg6cK9vwQVMxmcA3BfP4QoiztVmEHtwlTgLFsPuH2NskKT6eg==",
+      "version": "5.4.0",
+      "resolved": "https://registry.npmjs.org/react-icons/-/react-icons-5.4.0.tgz",
+      "integrity": "sha512-7eltJxgVt7X64oHh6wSWNwwbKTCtMfK35hcjvJS0yxEAhPM8oUKdS3+kqaW1vicIltw+kR2unHaa12S9pPALoQ==",
+      "license": "MIT",
      "peerDependencies": {
        "react": "*"
      }
@@ -15263,9 +15264,9 @@
      }
    },
    "node_modules/react-use": {
-      "version": "17.5.1",
-      "resolved": "https://registry.npmjs.org/react-use/-/react-use-17.5.1.tgz",
-      "integrity": "sha512-LG/uPEVRflLWMwi3j/sZqR00nF6JGqTTDblkXK2nzXsIvij06hXl1V/MZIlwj1OKIQUtlh1l9jK8gLsRyCQxMg==",
+      "version": "17.6.0",
+      "resolved": "https://registry.npmjs.org/react-use/-/react-use-17.6.0.tgz",
+      "integrity": "sha512-OmedEScUMKFfzn1Ir8dBxiLLSOzhKe/dPZwVxcujweSj45aNM7BEGPb9BEVIgVEqEXx6f3/TsXzwIktNgUR02g==",
      "dependencies": {
        "@types/js-cookie": "^2.2.6",
        "@xobotyi/scrollbar-width": "^1.9.5",
--- a/docs/package.json
+++ b/docs/package.json
@@ -24,8 +24,8 @@
    "prism-react-renderer": "^2.4.0",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",
-    "react-icons": "^5.3.0",
-    "react-use": "^17.5.1"
+    "react-icons": "^5.4.0",
+    "react-use": "^17.6.0"
  },
  "devDependencies": {
    "@docusaurus/module-type-aliases": "^3.5.1",
--- a/evaluation/benchmarks/EDA/README.md
+++ b/evaluation/benchmarks/EDA/README.md
@@ -4,12 +4,10 @@ This folder contains evaluation harness for evaluating agents on the Entity-dedu

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
-
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Start the evaluation

-
 ```bash
 export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation)
 ./evaluation/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit]
@@ -37,7 +35,8 @@ For example,
 ```

 ## Reference
-```
+
+```bibtex
@inproceedings{zhang2023entity,
  title={Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games},
  author={Zhang, Yizhe and Lu, Jiarui and Jaitly, Navdeep},
--- a/evaluation/benchmarks/EDA/scripts/run_infer.sh
+++ b/evaluation/benchmarks/EDA/scripts/run_infer.sh
@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

-get_agent_version
+get_openhands_version

 if [ -z "$DATASET" ]; then
  echo "Dataset not specified, use default 'things'"
@@ -34,12 +34,9 @@ if [ -z "$OPENAI_API_KEY" ]; then
  exit 1
 fi

-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"

@@ -51,7 +48,7 @@ COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
  --max-iterations 20 \
  --OPENAI_API_KEY $OPENAI_API_KEY \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note ${AGENT_VERSION}_${DATASET}"
+  --eval-note ${OPENHANDS_VERSION}_${DATASET}"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/agent_bench/README.md
+++ b/evaluation/benchmarks/agent_bench/README.md
@@ -4,7 +4,7 @@ This folder contains evaluation harness for evaluating agents on the [AgentBench

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Start the evaluation

--- a/evaluation/benchmarks/agent_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/agent_bench/scripts/run_infer.sh
@@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

 COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
@@ -31,7 +31,7 @@ COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poe
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+  --eval-note $OPENHANDS_VERSION"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/aider_bench/README.md
+++ b/evaluation/benchmarks/aider_bench/README.md
@@ -10,7 +10,7 @@ Hugging Face dataset based on the

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local
+Please follow the instructions [here](../../README.md#setup) to set up your local
 development environment and LLM.

 ## Start the evaluation
--- a/evaluation/benchmarks/aider_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/aider_bench/scripts/run_infer.sh
@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-EVAL_NOTE=$AGENT_VERSION
+EVAL_NOTE=$OPENHANDS_VERSION

 # Default to NOT use unit tests.
 if [ -z "$USE_UNIT_TESTS" ]; then
--- a/evaluation/benchmarks/biocoder/README.md
+++ b/evaluation/benchmarks/biocoder/README.md
@@ -4,13 +4,14 @@ Implements evaluation of agents on BioCoder from the BioCoder benchmark introduc

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## BioCoder Docker Image

 In the openhands branch of the Biocoder repository, we have slightly modified our original Docker image to work with the OpenHands environment. In the Docker image are testing scripts (`/testing/start_test_openhands.py` and aux files in `/testing_files/`) to assist with evaluation. Additionally, we have installed all dependencies, including OpenJDK, mamba (with Python 3.6), and many system libraries. Notably, we have **not** packaged all repositories into the image, so they are downloaded at runtime.

 **Before first execution, pull our Docker image with the following command**
+
 ```bash
 docker pull public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0
 ```
@@ -19,7 +20,6 @@ To reproduce this image, please see the Dockerfile_Openopenhands in the `biocode

 ## Start the evaluation

-
 ```bash
 ./evaluation/benchmarks/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
 ```
@@ -47,7 +47,8 @@ with current OpenHands version, then your command would be:
 ```

 ## Reference
-```
+
+```bibtex
@misc{tang2024biocoder,
      title={BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models},
      author={Xiangru Tang and Bill Qian and Rick Gao and Jiakang Chen and Xinyun Chen and Mark Gerstein},
--- a/evaluation/benchmarks/biocoder/scripts/run_infer.sh
+++ b/evaluation/benchmarks/biocoder/scripts/run_infer.sh
@@ -21,10 +21,10 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"

@@ -33,7 +33,7 @@ COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note ${AGENT_VERSION}_${DATASET}"
+  --eval-note ${OPENHANDS_VERSION}_${DATASET}"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/bird/README.md
+++ b/evaluation/benchmarks/bird/README.md
--- a/evaluation/benchmarks/bird/scripts/run_infer.sh
+++ b/evaluation/benchmarks/bird/scripts/run_infer.sh
@@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

 COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
@@ -31,7 +31,7 @@ COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
  --llm-config $MODEL_CONFIG \
  --max-iterations 5 \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION" \
+  --eval-note $OPENHANDS_VERSION" \

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/browsing_delegation/README.md
+++ b/evaluation/benchmarks/browsing_delegation/README.md
@@ -7,7 +7,7 @@ If so, the browsing performance upper-bound of CodeActAgent will be the performa

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference

--- a/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh
+++ b/evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh
@@ -20,13 +20,13 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-EVAL_NOTE="$AGENT_VERSION"
+EVAL_NOTE="$OPENHANDS_VERSION"

 COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \
  --agent-cls $AGENT \
--- a/evaluation/benchmarks/commit0_bench/README.md
+++ b/evaluation/benchmarks/commit0_bench/README.md
@@ -4,19 +4,18 @@ This folder contains the evaluation harness that we built on top of the original

 The evaluation consists of three steps:

-1. Environment setup: [install python environment](../README.md#development-environment), [configure LLM config](../README.md#configure-openhands-and-your-llm).
+1. Environment setup: [install python environment](../../README.md#development-environment), [configure LLM config](../../README.md#configure-openhands-and-your-llm).
 2. [Run Evaluation](#run-inference-on-commit0-instances): Generate a edit patch for each Commit0 Repo, and get the evaluation results

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## OpenHands Commit0 Instance-level Docker Support

 OpenHands supports using the Commit0 Docker for **[inference](#run-inference-on-commit0-instances).
 This is now the default behavior.

-
 ## Run Inference on Commit0 Instances

 Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the Commit0 set you are running on) for the [instance-level docker image](#openhands-commit0-instance-level-docker-support).
--- a/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/commit0_bench/scripts/run_infer.sh
@@ -61,10 +61,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
 export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
 echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"
 echo "HF SPLIT: $SPLIT"
@@ -75,7 +75,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
  export USE_HINT_TEXT=false
 fi
 echo "USE_HINT_TEXT: $USE_HINT_TEXT"
-EVAL_NOTE="$AGENT_VERSION"
+EVAL_NOTE="$OPENHANDS_VERSION"
 # if not using Hint, add -no-hint to the eval note
 if [ "$USE_HINT_TEXT" = false ]; then
  EVAL_NOTE="$EVAL_NOTE-no-hint"
--- a/evaluation/benchmarks/discoverybench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/discoverybench/scripts/run_infer.sh
@@ -23,10 +23,10 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

 COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
@@ -35,7 +35,7 @@ COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
  --max-iterations 10 \
  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+  --eval-note $OPENHANDS_VERSION"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/gaia/README.md
+++ b/evaluation/benchmarks/gaia/README.md
@@ -4,9 +4,10 @@ This folder contains evaluation harness for evaluating agents on the [GAIA bench

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Run the evaluation
+
 We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA).
 Please accept the terms and make sure to have logged in on your computer by `huggingface-cli login` before running the evaluation.

@@ -41,6 +42,7 @@ For example,
 ## Get score

 Then you can get stats by running the following command:
+
 ```bash
 python ./evaluation/benchmarks/gaia/get_score.py \
 --file <path_to/output.json>
--- a/evaluation/benchmarks/gaia/scripts/run_infer.sh
+++ b/evaluation/benchmarks/gaia/scripts/run_infer.sh
@@ -21,17 +21,17 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

-get_agent_version
+get_openhands_version

 if [ -z "$LEVELS" ]; then
  LEVELS="2023_level1"
  echo "Levels not specified, use default $LEVELS"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "LEVELS: $LEVELS"

@@ -42,7 +42,7 @@ COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
  --level $LEVELS \
  --data-split validation \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note ${AGENT_VERSION}_${LEVELS}"
+  --eval-note ${OPENHANDS_VERSION}_${LEVELS}"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/gorilla/README.md
+++ b/evaluation/benchmarks/gorilla/README.md
@@ -4,7 +4,7 @@ This folder contains evaluation harness we built on top of the original [Gorilla

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference on APIBench Instances

--- a/evaluation/benchmarks/gorilla/scripts/run_infer.sh
+++ b/evaluation/benchmarks/gorilla/scripts/run_infer.sh
@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

-get_agent_version
+get_openhands_version

 if [ -z "$HUBS" ]; then
  HUBS="hf,torch,tf"
@@ -29,7 +29,7 @@ if [ -z "$HUBS" ]; then
 fi

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "HUBS: $HUBS"

@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \
  --hubs $HUBS \
  --data-split validation \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note ${AGENT_VERSION}_${LEVELS}"
+  --eval-note ${OPENHANDS_VERSION}_${LEVELS}"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/gpqa/README.md
+++ b/evaluation/benchmarks/gpqa/README.md
@@ -3,6 +3,7 @@
 Implements the evaluation of agents on the GPQA benchmark introduced in [GPQA: A Graduate-Level Google-Proof Q&A Benchmark](https://arxiv.org/abs/2308.07124).

 This code implements the evaluation of agents on the GPQA Benchmark with Open Book setting.
+
 - The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web.
 - Even experts in the corresponding domains achieve only 65% accuracy.
 - State-of-the-art AI systems achieve only 39% accuracy on this challenging dataset.
@@ -11,20 +12,24 @@ This code implements the evaluation of agents on the GPQA Benchmark with Open Bo
 Accurate solving of above graduate level questions would require both tool use (e.g., python for calculations) and web-search for finding related facts as information required for the questions might not be part of the LLM knowledge / training data.

 Further references:
- https://arxiv.org/pdf/2311.12022
- https://paperswithcode.com/dataset/gpqa
- https://github.com/idavidrein/gpqa
+
+- <https://arxiv.org/pdf/2311.12022>
+- <https://paperswithcode.com/dataset/gpqa>
+- <https://github.com/idavidrein/gpqa>

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference on GPQA Benchmark
+
-'gpqa_main', 'gqpa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options
+'gpqa_main', 'gpqa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options
 From the root of the OpenHands repo, run the following command:
+
 ```bash
 ./evaluation/benchmarks/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass]
 ```
+
 You can replace `model_config_name` with any model you set up in `config.toml`.

 - `model_config_name`: The model configuration name from `config.toml` that you want to evaluate.
--- a/evaluation/benchmarks/gpqa/scripts/run_infer.sh
+++ b/evaluation/benchmarks/gpqa/scripts/run_infer.sh
@@ -27,10 +27,10 @@ if [ -z "$DATA_SPLIT" ]; then
  DATA_SPLIT="gpqa_diamond"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

 COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
@@ -39,7 +39,7 @@ COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
  --max-iterations 10 \
  --eval-num-workers $NUM_WORKERS \
  --data-split $DATA_SPLIT \
-  --eval-note $AGENT_VERSION"
+  --eval-note $OPENHANDS_VERSION"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/humanevalfix/README.md
+++ b/evaluation/benchmarks/humanevalfix/README.md
@@ -4,7 +4,7 @@ Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference on HumanEvalFix

@@ -14,13 +14,11 @@ Please follow instruction [here](../README.md#setup) to setup your local develop

 You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`.

-
 ## Examples

 For each problem, OpenHands is given a set number of iterations to fix the failing code. The history field shows each iteration's response to correct its code that fails any test case.

-
-```
+```json
 {
    "task_id": "Python/2",
    "instruction": "Please fix the function in Python__2.py such that all test cases pass.\nEnvironment has been set up for you to start working. You may assume all necessary tools are installed.\n\n# Problem Statement\ndef truncate_number(number: float) -> float:\n    return number % 1.0 + 1.0\n\n\n\n\n\n\ndef check(truncate_number):\n    assert truncate_number(3.5) == 0.5\n    assert abs(truncate_number(1.33) - 0.33) < 1e-6\n    assert abs(truncate_number(123.456) - 0.456) < 1e-6\n\ncheck(truncate_number)\n\nIMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\nYou should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\nYou SHOULD INCLUDE PROPER INDENTATION in your edit commands.\nWhen you think you have fixed the issue through code changes, please finish the interaction using the "finish" tool.\n",
--- a/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh
+++ b/evaluation/benchmarks/humanevalfix/scripts/run_infer.sh
@@ -58,10 +58,10 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

 COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
@@ -69,7 +69,7 @@ COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+  --eval-note $OPENHANDS_VERSION"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/logic_reasoning/README.md
+++ b/evaluation/benchmarks/logic_reasoning/README.md
@@ -4,9 +4,10 @@ This folder contains evaluation harness for evaluating agents on the logic reaso

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference on logic_reasoning
+
 The following code will run inference on the first example of the ProofWriter dataset,

 ```bash
--- a/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh
+++ b/evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh
@@ -28,10 +28,10 @@ if [ -z "$DATASET" ]; then
  DATASET="ProofWriter"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

 COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
  --dataset $DATASET \
  --max-iterations 10 \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+  --eval-note $OPENHANDS_VERSION"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/miniwob/README.md
+++ b/evaluation/benchmarks/miniwob/README.md
@@ -4,7 +4,7 @@ This folder contains evaluation for [MiniWoB++](https://miniwob.farama.org/) ben

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Test if your environment works

@@ -42,7 +42,6 @@ poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/e

 You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).

-
 ## BrowsingAgent V1.0 result

 Tested on BrowsingAgent V1.0
--- a/evaluation/benchmarks/miniwob/scripts/run_infer.sh
+++ b/evaluation/benchmarks/miniwob/scripts/run_infer.sh
@@ -25,13 +25,13 @@ if [ -z "$AGENT" ]; then
  AGENT="BrowsingAgent"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
+EVAL_NOTE="${OPENHANDS_VERSION}_${NOTE}"

 COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \
  --agent-cls $AGENT \
--- a/evaluation/benchmarks/mint/scripts/run_infer.sh
+++ b/evaluation/benchmarks/mint/scripts/run_infer.sh
@@ -18,10 +18,10 @@ checkout_eval_branch
 # Only 'CodeActAgent' is supported for MINT now
 AGENT="CodeActAgent"

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"

 export PYTHONPATH=$(pwd)

--- a/evaluation/benchmarks/ml_bench/README.md
+++ b/evaluation/benchmarks/ml_bench/README.md
@@ -12,7 +12,7 @@ For more details on the ML-Bench task and dataset, please refer to the paper: [M

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference on ML-Bench

--- a/evaluation/benchmarks/ml_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/ml_bench/scripts/run_infer.sh
@@ -26,10 +26,10 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

 COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
@@ -37,7 +37,7 @@ COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+  --eval-note $OPENHANDS_VERSION"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/scienceagentbench/README.md
+++ b/evaluation/benchmarks/scienceagentbench/README.md
@@ -1,10 +1,10 @@
 # ScienceAgentBench Evaluation with OpenHands

-This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: https://arxiv.org/abs/2410.05080).
+This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: <https://arxiv.org/abs/2410.05080>).

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Setup ScienceAgentBench

@@ -45,6 +45,7 @@ After the inference is completed, you may use the following command to extract n
 ```bash
 python post_proc.py [log_fname]
 ```
+
 - `log_fname`, e.g. `evaluation/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent.

-Output will be write to e.g. `evaluation/.../output.converted.jsonl`
+Output will be written to e.g. `evaluation/.../output.converted.jsonl`
--- a/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh
@@ -26,10 +26,10 @@ if [ -z "$USE_KNOWLEDGE" ]; then
  USE_KNOWLEDGE=false
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

 COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \
@@ -38,7 +38,7 @@ COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py
  --use_knowledge $USE_KNOWLEDGE \
  --max-iterations 30 \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION" \
+  --eval-note $OPENHANDS_VERSION" \

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@@ -6,20 +6,19 @@ This folder contains the evaluation harness that we built on top of the original

 The evaluation consists of three steps:

-1. Environment setup: [install python environment](../README.md#development-environment), [configure LLM config](../README.md#configure-openhands-and-your-llm), and [pull docker](#openhands-swe-bench-instance-level-docker-support).
+1. Environment setup: [install python environment](../../README.md#development-environment), [configure LLM config](../../README.md#configure-openhands-and-your-llm), and [pull docker](#openhands-swe-bench-instance-level-docker-support).
 2. [Run inference](#run-inference-on-swe-bench-instances): Generate a edit patch for each Github issue
 3. [Evaluate patches using SWE-Bench docker](#evaluate-generated-patches)

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## OpenHands SWE-Bench Instance-level Docker Support

 OpenHands now support using the [official evaluation docker](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md) for both **[inference](#run-inference-on-swe-bench-instances) and [evaluation](#evaluate-generated-patches)**.
 This is now the default behavior.

-
 ## Run Inference on SWE-Bench Instances

 Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the SWE-Bench set you are running on) for the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support).
@@ -52,7 +51,8 @@ default, it is set to 1.
 - `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`.

 There are also two optional environment variables you can set.
-```
+
+```bash
 export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Default to false. Ignore this if you are not sure.
 export USE_INSTANCE_IMAGE=true # if you want to use instance-level docker images. Default to true
 ```
@@ -127,6 +127,7 @@ With `output.jsonl` file, you can run `eval_infer.sh` to evaluate generated patc
 **This evaluation is performed using the official dockerized evaluation announced [here](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md).**

 > If you want to evaluate existing results, you should first run this to clone existing outputs
+>
 >```bash
 >git clone https://huggingface.co/spaces/OpenHands/evaluation evaluation/evaluation_outputs
 >```
@@ -143,6 +144,7 @@ Then you can run the following:
 ```

 The script now accepts optional arguments:
+
 - `instance_id`: Specify a single instance to evaluate (optional)
 - `dataset_name`: The name of the dataset to use (default: `"princeton-nlp/SWE-bench_Lite"`)
 - `split`: The split of the dataset to use (default: `"test"`)
@@ -179,7 +181,6 @@ To clean-up all existing runtimes that you've already started, run:
 ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh
 ```

-
 ## Visualize Results

 First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
@@ -189,6 +190,7 @@ git clone https://huggingface.co/spaces/OpenHands/evaluation
 ```

 **(optional) setup streamlit environment with conda**:
+
 ```bash
 cd evaluation
 conda create -n streamlit python=3.10
--- a/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py
@@ -128,6 +128,11 @@ def process_file(file_path):
                for error, count in error_counter.items()
            },
        },
+        'costs': {
+            'main_agent': sum(main_agent_cost),
+            'editor': sum(editor_cost),
+            'total': sum(main_agent_cost) + sum(editor_cost),
+        },
        'statistics': {
            'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
            'costs': {
@@ -251,6 +256,7 @@ if __name__ == '__main__':
            print(
                f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
            )
+            print(f"Total cost: {result['costs']['total']:.2f} USD")
            print('## Statistics')
            print(
                f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
--- a/evaluation/benchmarks/swe_bench/scripts/eval/verify_costs.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/verify_costs.py
@@ -0,0 +1,104 @@
+import argparse
+
+import pandas as pd
+
+from openhands.core.logger import openhands_logger as logger
+
+
+def verify_instance_costs(row: pd.Series) -> float:
+    """
+    Verifies that the accumulated_cost matches the sum of individual costs in metrics.
+    Also checks for duplicate consecutive costs which might indicate buggy counting.
+    If the consecutive costs are identical, the file is affected by this bug:
+    https://github.com/All-Hands-AI/OpenHands/issues/5383
+
+    Args:
+        row: DataFrame row containing instance data with metrics
+    Returns:
+        float: The verified total cost for this instance (corrected if needed)
+    """
+    try:
+        metrics = row.get('metrics')
+        if not metrics:
+            logger.warning(f"Instance {row['instance_id']}: No metrics found")
+            return 0.0
+
+        accumulated = metrics.get('accumulated_cost')
+        costs = metrics.get('costs', [])
+
+        if accumulated is None:
+            logger.warning(
+                f"Instance {row['instance_id']}: No accumulated_cost in metrics"
+            )
+            return 0.0
+
+        # Check for duplicate consecutive costs and systematic even-odd pairs
+        has_duplicate = False
+        all_pairs_match = True
+
+        # Check each even-odd pair (0-1, 2-3, etc.)
+        for i in range(0, len(costs) - 1, 2):
+            if abs(costs[i]['cost'] - costs[i + 1]['cost']) < 1e-6:
+                has_duplicate = True
+                logger.debug(
+                    f"Instance {row['instance_id']}: Possible buggy double-counting detected! "
+                    f"Steps {i} and {i+1} have identical costs: {costs[i]['cost']:.2f}"
+                )
+            else:
+                all_pairs_match = False
+                break
+
+        # Calculate total cost, accounting for buggy double counting if detected
+        if len(costs) >= 2 and has_duplicate and all_pairs_match:
+            paired_steps_cost = sum(
+                cost_entry['cost']
+                for cost_entry in costs[: -1 if len(costs) % 2 else None]
+            )
+            real_paired_cost = paired_steps_cost / 2
+
+            unpaired_cost = costs[-1]['cost'] if len(costs) % 2 else 0
+            total_cost = real_paired_cost + unpaired_cost
+
+        else:
+            total_cost = sum(cost_entry['cost'] for cost_entry in costs)
+
+        if not abs(total_cost - accumulated) < 1e-6:
+            logger.warning(
+                f"Instance {row['instance_id']}: Cost mismatch: "
+                f"accumulated: {accumulated:.2f}, sum of costs: {total_cost:.2f}, "
+            )
+
+        return total_cost
+
+    except Exception as e:
+        logger.error(
+            f"Error verifying costs for instance {row.get('instance_id', 'UNKNOWN')}: {e}"
+        )
+        return 0.0
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Verify costs in SWE-bench output file'
+    )
+    parser.add_argument(
+        'input_filepath', type=str, help='Path to the output.jsonl file'
+    )
+    args = parser.parse_args()
+
+    try:
+        # Load and verify the JSONL file
+        df = pd.read_json(args.input_filepath, lines=True)
+        logger.info(f'Loaded {len(df)} instances from {args.input_filepath}')
+
+        # Verify costs for each instance and sum up total
+        total_cost = df.apply(verify_instance_costs, axis=1).sum()
+        logger.info(f'Total verified cost across all instances: ${total_cost:.2f}')
+
+    except Exception as e:
+        logger.error(f'Failed to process file: {e}')
+        raise
+
+
+if __name__ == '__main__':
+    main()
--- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
@@ -55,10 +55,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
 export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
 echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"
 echo "SPLIT: $SPLIT"
@@ -68,7 +68,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
  export USE_HINT_TEXT=false
 fi
 echo "USE_HINT_TEXT: $USE_HINT_TEXT"
-EVAL_NOTE="$AGENT_VERSION"
+EVAL_NOTE="$OPENHANDS_VERSION"
 # if not using Hint, add -no-hint to the eval note
 if [ "$USE_HINT_TEXT" = false ]; then
  EVAL_NOTE="$EVAL_NOTE-no-hint"
--- a/evaluation/benchmarks/toolqa/README.md
+++ b/evaluation/benchmarks/toolqa/README.md
@@ -4,7 +4,7 @@ This folder contains an evaluation harness we built on top of the original [Tool

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Run Inference on ToolQA Instances

--- a/evaluation/benchmarks/toolqa/scripts/run_infer.sh
+++ b/evaluation/benchmarks/toolqa/scripts/run_infer.sh
@@ -38,10 +38,10 @@ if [ -z "$WOLFRAM_APPID" ]; then
  echo "WOLFRAM_APPID not specified"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"
 echo "HARDNESS: $HARDNESS"
@@ -56,7 +56,7 @@ COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \
  --wolfram_alpha_appid $WOLFRAM_APPID\
  --data-split validation \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note ${AGENT_VERSION}_${LEVELS}"
+  --eval-note ${OPENHANDS_VERSION}_${LEVELS}"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/benchmarks/webarena/README.md
+++ b/evaluation/benchmarks/webarena/README.md
@@ -4,7 +4,7 @@ This folder contains evaluation for [WebArena](https://github.com/web-arena-x/we

 ## Setup Environment and LLM Configuration

-Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

 ## Setup WebArena Environment

--- a/evaluation/benchmarks/webarena/scripts/run_infer.sh
+++ b/evaluation/benchmarks/webarena/scripts/run_infer.sh
@@ -27,13 +27,13 @@ if [ -z "$AGENT" ]; then
  AGENT="BrowsingAgent"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-EVAL_NOTE="$AGENT_VERSION"
+EVAL_NOTE="$OPENHANDS_VERSION"

 COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
  --agent-cls $AGENT \
--- a/evaluation/integration_tests/scripts/run_infer.sh
+++ b/evaluation/integration_tests/scripts/run_infer.sh
@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
  AGENT="CodeActAgent"
 fi

-get_agent_version
+get_openhands_version

 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-EVAL_NOTE=$AGENT_VERSION
+EVAL_NOTE=$OPENHANDS_VERSION

 # Default to NOT use unit tests.
 if [ -z "$USE_UNIT_TESTS" ]; then
--- a/evaluation/utils/version_control.sh
+++ b/evaluation/utils/version_control.sh
@@ -39,8 +39,8 @@ checkout_original_branch() {
    git checkout $current_branch
 }

-get_agent_version() {
-    # IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
-    # We need to track the version of Agent in the evaluation to make sure results are comparable
-    AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_openhands_version() {
+    # IMPORTANT: Agent prompts change fairly often in the rapidly evolving OpenHands codebase, so we record
+    # the OpenHands version (OPENHANDS_VERSION="v<get_version()>") to keep evaluation results comparable.
+    OPENHANDS_VERSION=v$(poetry run python -c "from openhands import get_version; print(get_version())")
 }
--- a/frontend/tests/components/browser.test.tsx
+++ b/frontend/tests/components/browser.test.tsx
@@ -11,6 +11,7 @@ describe("Browser", () => {
        browser: {
          url: "https://example.com",
          screenshotSrc: "",
+          updateCount: 0,
        },
      },
    });
@@ -26,6 +27,7 @@ describe("Browser", () => {
          url: "https://example.com",
          screenshotSrc:
            "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mN0uGvyHwAFCAJS091fQwAAAABJRU5ErkJggg==",
+          updateCount: 0,
        },
      },
    });
--- a/frontend/tests/components/chat-message.test.tsx
+++ b/frontend/tests/components/chat-message.test.tsx
@@ -70,4 +70,12 @@ describe("ChatMessage", () => {
    );
    expect(screen.getByTestId("custom-component")).toBeInTheDocument();
  });
+
+  it("should apply correct styles to inline code", () => {
+    render(<ChatMessage type="assistant" message="Here is some `inline code` text" />);
+    const codeElement = screen.getByText("inline code");
+
+    expect(codeElement.tagName.toLowerCase()).toBe("code");
+    expect(codeElement.closest("article")).not.toBeNull();
+  });
 });
--- a/frontend/tests/components/chat/chat-interface.test.tsx
+++ b/frontend/tests/components/chat/chat-interface.test.tsx
@@ -9,7 +9,7 @@ import { WsClientProviderStatus } from "#/context/ws-client-provider";
 import { ChatInterface } from "#/components/features/chat/chat-interface";

 // eslint-disable-next-line @typescript-eslint/no-unused-vars
-const renderChatInterface = (messages: (Message | ErrorMessage)[]) =>
+const renderChatInterface = (messages: (Message)[]) =>
  renderWithProviders(<ChatInterface />);

 describe("Empty state", () => {
@@ -278,7 +278,7 @@ describe.skip("ChatInterface", () => {
  });

  it("should render inline errors", () => {
-    const messages: (Message | ErrorMessage)[] = [
+    const messages: (Message)[] = [
      {
        sender: "assistant",
        content: "Hello",
@@ -287,9 +287,10 @@ describe.skip("ChatInterface", () => {
        pending: true,
      },
      {
-        error: true,
-        id: "",
-        message: "Something went wrong",
+        type: "error",
+        content: "Something went wrong",
+        sender: "assistant",
+        timestamp: new Date().toISOString(),
      },
    ];
    renderChatInterface(messages);
--- a/frontend/tests/components/features/waitlist-modal.test.tsx
+++ b/frontend/tests/components/features/waitlist-modal.test.tsx
@@ -0,0 +1,52 @@
+import { render, screen } from "@testing-library/react";
+import { afterEach, it, describe, expect, vi } from "vitest";
+import userEvent from "@testing-library/user-event";
+import { WaitlistModal } from "#/components/features/waitlist/waitlist-modal";
+import * as CaptureConsent from "#/utils/handle-capture-consent";
+
+// Covers the terms-of-service gate in front of the GitHub sign-in button.
+describe("WaitlistModal", () => {
+  // Undo vi.spyOn() wrappers after every test so a spy created in one test
+  // cannot leak into the tests that run after it.
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it("should render a tos checkbox that is unchecked by default", () => {
+    render(<WaitlistModal ghToken={null} githubAuthUrl={null} />);
+    const checkbox = screen.getByRole("checkbox");
+
+    expect(checkbox).not.toBeChecked();
+  });
+
+  it("should only enable the GitHub button if the tos checkbox is checked", async () => {
+    const user = userEvent.setup();
+    render(<WaitlistModal ghToken={null} githubAuthUrl={null} />);
+    const checkbox = screen.getByRole("checkbox");
+    const button = screen.getByRole("button", { name: "Connect to GitHub" });
+
+    expect(button).toBeDisabled();
+
+    await user.click(checkbox);
+
+    expect(button).not.toBeDisabled();
+  });
+
+  it("should set user analytics consent to true when the user checks the tos checkbox", async () => {
+    const handleCaptureConsentSpy = vi.spyOn(
+      CaptureConsent,
+      "handleCaptureConsent",
+    );
+
+    const user = userEvent.setup();
+    render(<WaitlistModal ghToken={null} githubAuthUrl="mock-url" />);
+
+    const checkbox = screen.getByRole("checkbox");
+    await user.click(checkbox);
+
+    const button = screen.getByRole("button", { name: "Connect to GitHub" });
+    await user.click(button);
+
+    expect(handleCaptureConsentSpy).toHaveBeenCalledWith(true);
+  });
+});
--- a/frontend/tests/routes/_oh.test.tsx
+++ b/frontend/tests/routes/_oh.test.tsx
@@ -4,8 +4,9 @@ import { screen, waitFor, within } from "@testing-library/react";
 import { renderWithProviders } from "test-utils";
 import userEvent from "@testing-library/user-event";
 import MainApp from "#/routes/_oh/route";
-import * as CaptureConsent from "#/utils/handle-capture-consent";
 import i18n from "#/i18n";
+import * as CaptureConsent from "#/utils/handle-capture-consent";
+import OpenHands from "#/api/open-hands";

 describe("frontend/routes/_oh", () => {
  const RouteStub = createRoutesStub([{ Component: MainApp, path: "/" }]);
@@ -60,13 +61,20 @@ describe("frontend/routes/_oh", () => {
    });
  });

-  it("should capture the user's consent", async () => {
+  it("should render and capture the user's consent if oss mode", async () => {
    const user = userEvent.setup();
+    const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
    const handleCaptureConsentSpy = vi.spyOn(
      CaptureConsent,
      "handleCaptureConsent",
    );

+    getConfigSpy.mockResolvedValue({
+      APP_MODE: "oss",
+      GITHUB_CLIENT_ID: "test-id",
+      POSTHOG_CLIENT_KEY: "test-key",
+    });
+
    renderWithProviders(<RouteStub />);

    // The user has not consented to tracking
@@ -87,6 +95,23 @@ describe("frontend/routes/_oh", () => {
    ).not.toBeInTheDocument();
  });

+  it("should not render the user consent form if saas mode", async () => {
+    const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
+    getConfigSpy.mockResolvedValue({
+      APP_MODE: "saas",
+      GITHUB_CLIENT_ID: "test-id",
+      POSTHOG_CLIENT_KEY: "test-key",
+    });
+
+    renderWithProviders(<RouteStub />);
+
+    await waitFor(() => {
+      expect(
+        screen.queryByTestId("user-capture-consent-form"),
+      ).not.toBeInTheDocument();
+    });
+  });
+
  it("should not render the user consent form if the user has already made a decision", async () => {
    localStorage.setItem("analytics-consent", "true");
    renderWithProviders(<RouteStub />);
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -1,12 +1,12 @@
 {
  "name": "openhands-frontend",
-  "version": "0.14.3",
+  "version": "0.15.1",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "openhands-frontend",
-      "version": "0.14.3",
+      "version": "0.15.1",
      "dependencies": {
        "@monaco-editor/react": "^4.6.0",
        "@nextui-org/react": "^2.4.8",
@@ -78,7 +78,7 @@
        "husky": "^9.1.6",
        "jsdom": "^25.0.1",
        "lint-staged": "^15.2.10",
-        "msw": "^2.3.0-ws.rc-12",
+        "msw": "^2.6.6",
        "postcss": "^8.4.47",
        "prettier": "^3.3.3",
        "tailwindcss": "^3.4.14",
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
 {
  "name": "openhands-frontend",
-  "version": "0.14.3",
+  "version": "0.15.1",
  "private": true,
  "type": "module",
  "engines": {
@@ -105,7 +105,7 @@
    "husky": "^9.1.6",
    "jsdom": "^25.0.1",
    "lint-staged": "^15.2.10",
-    "msw": "^2.3.0-ws.rc-12",
+    "msw": "^2.6.6",
    "postcss": "^8.4.47",
    "prettier": "^3.3.3",
    "tailwindcss": "^3.4.14",
--- a/frontend/src/components/features/chat/chat-message.tsx
+++ b/frontend/src/components/features/chat/chat-message.tsx
@@ -47,7 +47,7 @@ export function ChatMessage({
        "rounded-xl relative",
        "flex flex-col gap-2",
        type === "user" && " max-w-[305px] p-4 bg-neutral-700 self-end",
-        type === "assistant" && "pb-4 max-w-full bg-tranparent",
+        type === "assistant" && "mt-6 max-w-full bg-transparent",
      )}
    >
      <CopyToClipboardButton
--- a/frontend/src/components/features/chat/error-message.tsx
+++ b/frontend/src/components/features/chat/error-message.tsx
@@ -1,42 +0,0 @@
-import { useState, useEffect } from "react";
-import { useTranslation } from "react-i18next";
-
-interface ErrorMessageProps {
-  id?: string;
-  message: string;
-}
-
-export function ErrorMessage({ id, message }: ErrorMessageProps) {
-  const { t, i18n } = useTranslation();
-  const [showDetails, setShowDetails] = useState(true);
-  const [headline, setHeadline] = useState("");
-  const [details, setDetails] = useState(message);
-
-  useEffect(() => {
-    if (id && i18n.exists(id)) {
-      setHeadline(t(id));
-      setDetails(message);
-      setShowDetails(false);
-    }
-  }, [id, message, i18n.language]);
-
-  return (
-    <div className="flex gap-2 items-center justify-start border-l-2 border-danger pl-2 my-2 py-2">
-      <div className="text-sm leading-4 flex flex-col gap-2">
-        {headline && <p className="text-danger font-bold">{headline}</p>}
-        {headline && (
-          <button
-            type="button"
-            onClick={() => setShowDetails(!showDetails)}
-            className="cursor-pointer text-left"
-          >
-            {showDetails
-              ? t("ERROR_MESSAGE$HIDE_DETAILS")
-              : t("ERROR_MESSAGE$SHOW_DETAILS")}
-          </button>
-        )}
-        {showDetails && <p className="text-neutral-300">{details}</p>}
-      </div>
-    </div>
-  );
-}
--- a/frontend/src/components/features/chat/expandable-message.tsx
+++ b/frontend/src/components/features/chat/expandable-message.tsx
@@ -0,0 +1,96 @@
+import { useState, useEffect } from "react";
+import { useTranslation } from "react-i18next";
+import Markdown from "react-markdown";
+import remarkGfm from "remark-gfm";
+import { code } from "../markdown/code";
+import { ol, ul } from "../markdown/list";
+import ArrowUp from "#/icons/angle-up-solid.svg?react";
+import ArrowDown from "#/icons/angle-down-solid.svg?react";
+
+interface ExpandableMessageProps {
+  /** Optional i18n key; if it resolves, its translation becomes the headline. */
+  id?: string;
+  /** Markdown source rendered as the collapsible details. */
+  message: string;
+  /** Message kind; "error" selects the danger accent color. */
+  type: string;
+}
+
+/**
+ * Chat entry with an optional translated headline and a collapsible Markdown
+ * body.
+ *
+ * When `id` is a known translation key, the headline is shown and the body
+ * starts collapsed behind an arrow toggle. Otherwise only the body is shown.
+ */
+export function ExpandableMessage({
+  id,
+  message,
+  type,
+}: ExpandableMessageProps) {
+  const { t, i18n } = useTranslation();
+
+  // Derive the headline from the current props instead of mirroring them into
+  // state, so a reused component instance never shows a previous message.
+  const headline = id && i18n.exists(id) ? t(id) : "";
+  const hasHeadline = Boolean(headline);
+  const [showDetails, setShowDetails] = useState(!hasHeadline);
+
+  // Reset the toggle whenever the displayed message (or language) changes:
+  // collapsed when a headline summarizes it, expanded when there is none.
+  useEffect(() => {
+    setShowDetails(!hasHeadline);
+  }, [id, message, i18n.language, hasHeadline]);
+
+  const isError = type === "error";
+  const border = isError ? "border-danger" : "border-neutral-300";
+  const textColor = isError ? "text-danger" : "text-neutral-300";
+  const arrowClasses = `h-4 w-4 ml-2 inline ${
+    isError ? "fill-danger" : "fill-neutral-300"
+  }`;
+
+  return (
+    <div
+      className={`flex gap-2 items-center justify-start border-l-2 pl-2 my-2 py-2 ${border}`}
+    >
+      <div className="text-sm leading-4 flex flex-col gap-2 max-w-full">
+        {hasHeadline && (
+          <p className={`${textColor} font-bold`}>
+            {headline}
+            {/* The button only holds an icon, so it needs an accessible name. */}
+            <button
+              type="button"
+              onClick={() => setShowDetails((prev) => !prev)}
+              aria-label={
+                showDetails
+                  ? t("EXPANDABLE_MESSAGE$HIDE_DETAILS")
+                  : t("EXPANDABLE_MESSAGE$SHOW_DETAILS")
+              }
+              aria-expanded={showDetails}
+              className="cursor-pointer text-left"
+            >
+              {showDetails ? (
+                <ArrowUp className={arrowClasses} />
+              ) : (
+                <ArrowDown className={arrowClasses} />
+              )}
+            </button>
+          </p>
+        )}
+        {showDetails && (
+          <Markdown
+            className="text-sm overflow-auto"
+            components={{
+              code,
+              ul,
+              ol,
+            }}
+            remarkPlugins={[remarkGfm]}
+          >
+            {message}
+          </Markdown>
+        )}
+      </div>
+    </div>
+  );
+}
--- a/frontend/src/components/features/chat/messages.tsx
+++ b/frontend/src/components/features/chat/messages.tsx
@@ -1,14 +1,10 @@
 import { ChatMessage } from "#/components/features/chat/chat-message";
 import { ConfirmationButtons } from "#/components/shared/buttons/confirmation-buttons";
 import { ImageCarousel } from "../images/image-carousel";
-import { ErrorMessage } from "./error-message";
-
-const isErrorMessage = (
-  message: Message | ErrorMessage,
-): message is ErrorMessage => "error" in message;
+import { ExpandableMessage } from "./expandable-message";

 interface MessagesProps {
-  messages: (Message | ErrorMessage)[];
+  messages: Message[];
  isAwaitingUserConfirmation: boolean;
 }

@@ -16,18 +12,27 @@ export function Messages({
  messages,
  isAwaitingUserConfirmation,
 }: MessagesProps) {
-  return messages.map((message, index) =>
-    isErrorMessage(message) ? (
-      <ErrorMessage key={index} id={message.id} message={message.message} />
-    ) : (
+  return messages.map((message, index) => {
+    if (message.type === "error" || message.type === "action") {
+      return (
+        <ExpandableMessage
+          key={index}
+          type={message.type}
+          id={message.translationID}
+          message={message.content}
+        />
+      );
+    }
+
+    return (
      <ChatMessage key={index} type={message.sender} message={message.content}>
-        {message.imageUrls.length > 0 && (
+        {message.imageUrls && message.imageUrls.length > 0 && (
          <ImageCarousel size="small" images={message.imageUrls} />
        )}
        {messages.length - 1 === index &&
          message.sender === "assistant" &&
          isAwaitingUserConfirmation && <ConfirmationButtons />}
      </ChatMessage>
-    ),
-  );
+    );
+  });
 }
--- a/frontend/src/components/features/file-explorer/tree-node.tsx
+++ b/frontend/src/components/features/file-explorer/tree-node.tsx
@@ -1,10 +1,12 @@
 import React from "react";
+import { useSelector } from "react-redux";

 import { useFiles } from "#/context/files";
 import { cn } from "#/utils/utils";
 import { useListFiles } from "#/hooks/query/use-list-files";
 import { useListFile } from "#/hooks/query/use-list-file";
 import { Filename } from "./filename";
+import { RootState } from "#/store";

 interface TreeNodeProps {
  path: string;
@@ -20,6 +22,7 @@ function TreeNode({ path, defaultOpen = false }: TreeNodeProps) {
    selectedPath,
  } = useFiles();
  const [isOpen, setIsOpen] = React.useState(defaultOpen);
+  const { curAgentState } = useSelector((state: RootState) => state.agent);

  const isDirectory = path.endsWith("/");

@@ -39,6 +42,12 @@ function TreeNode({ path, defaultOpen = false }: TreeNodeProps) {
    }
  }, [fileContent, path]);

+  React.useEffect(() => {
+    if (selectedPath === path && !isDirectory) {
+      refetch();
+    }
+  }, [curAgentState, selectedPath, path, isDirectory]);
+
  const fileParts = path.split("/");
  const filename =
    fileParts[fileParts.length - 1] || fileParts[fileParts.length - 2];
--- a/frontend/src/components/features/markdown/code.tsx
+++ b/frontend/src/components/features/markdown/code.tsx
@@ -17,7 +17,20 @@ export function code({
  const match = /language-(\w+)/.exec(className || ""); // get the language

  if (!match) {
-    return <code className={className}>{children}</code>;
+    return (
+      <code
+        className={className}
+        style={{
+          backgroundColor: "#2a3038",
+          padding: "0.2em 0.4em",
+          borderRadius: "4px",
+          color: "#e6edf3",
+          border: "1px solid #30363d",
+        }}
+      >
+        {children}
+      </code>
+    );
  }

  return (
--- a/frontend/src/components/features/sidebar/sidebar.tsx
+++ b/frontend/src/components/features/sidebar/sidebar.tsx
@@ -54,13 +54,13 @@ export function Sidebar() {

  return (
    <>
-      <aside className="px-1 flex flex-col gap-1">
+      <aside className="h-[40px] md:h-auto px-1 flex flex-row md:flex-col gap-1">
        <div className="w-[34px] h-[34px] flex items-center justify-center">
          {user.isLoading && <LoadingSpinner size="small" />}
          {!user.isLoading && <AllHandsLogoButton onClick={handleClickLogo} />}
        </div>

-        <nav className="py-[18px] flex flex-col items-center gap-[18px]">
+        <nav className="md:py-[18px] flex flex-row md:flex-col items-center gap-[18px]">
          <UserActions
            user={user.data ? { avatar_url: user.data.avatar_url } : undefined}
            onLogout={logout}
--- a/frontend/src/components/features/waitlist/tos-checkbox.tsx
+++ b/frontend/src/components/features/waitlist/tos-checkbox.tsx
@@ -0,0 +1,27 @@
+type TOSCheckboxProps = {
+  // Invoked on every toggle of the checkbox.
+  onChange: () => void;
+};
+
+/**
+ * Checkbox labelled with a link to the terms of service.
+ */
+export function TOSCheckbox(props: TOSCheckboxProps) {
+  const termsLink = (
+    <a
+      href="https://www.all-hands.dev/tos"
+      target="_blank"
+      rel="noopener noreferrer"
+      className="underline underline-offset-2 text-blue-500 hover:text-blue-700"
+    >
+      terms of service
+    </a>
+  );
+
+  return (
+    <label className="flex items-center gap-2">
+      <input type="checkbox" onChange={props.onChange} />
+      <span>I accept the {termsLink}</span>
+    </label>
+  );
+}
--- a/frontend/src/components/features/waitlist/waitlist-modal.tsx
+++ b/frontend/src/components/features/waitlist/waitlist-modal.tsx
@@ -1,3 +1,4 @@
+import React from "react";
 import GitHubLogo from "#/assets/branding/github-logo.svg?react";
 import AllHandsLogo from "#/assets/branding/all-hands-logo.svg?react";
 import { JoinWaitlistAnchor } from "./join-waitlist-anchor";
@@ -5,6 +6,8 @@ import { WaitlistMessage } from "./waitlist-message";
 import { ModalBackdrop } from "#/components/shared/modals/modal-backdrop";
 import { ModalButton } from "#/components/shared/buttons/modal-button";
 import { ModalBody } from "#/components/shared/modals/modal-body";
+import { TOSCheckbox } from "./tos-checkbox";
+import { handleCaptureConsent } from "#/utils/handle-capture-consent";

 interface WaitlistModalProps {
  ghToken: string | null;
@@ -12,22 +15,30 @@ interface WaitlistModalProps {
 }

 export function WaitlistModal({ ghToken, githubAuthUrl }: WaitlistModalProps) {
+  const [isTosAccepted, setIsTosAccepted] = React.useState(false);
+
+  const handleGitHubAuth = () => {
+    if (githubAuthUrl) {
+      handleCaptureConsent(true);
+      window.location.href = githubAuthUrl;
+    }
+  };
+
  return (
    <ModalBackdrop>
      <ModalBody>
        <AllHandsLogo width={68} height={46} />
        <WaitlistMessage content={ghToken ? "waitlist" : "sign-in"} />

+        <TOSCheckbox onChange={() => setIsTosAccepted((prev) => !prev)} />
+
        {!ghToken && (
          <ModalButton
+            disabled={!isTosAccepted}
            text="Connect to GitHub"
            icon={<GitHubLogo width={20} height={20} />}
            className="bg-[#791B80] w-full"
-            onClick={() => {
-              if (githubAuthUrl) {
-                window.location.href = githubAuthUrl;
-              }
-            }}
+            onClick={handleGitHubAuth}
          />
        )}
        {ghToken && <JoinWaitlistAnchor />}
--- a/frontend/src/components/layout/container.tsx
+++ b/frontend/src/components/layout/container.tsx
@@ -5,7 +5,7 @@ import { NavTab } from "./nav-tab";
 interface ContainerProps {
  label?: string;
  labels?: {
-    label: string;
+    label: string | React.ReactNode;
    to: string;
    icon?: React.ReactNode;
    isBeta?: boolean;
@@ -39,7 +39,7 @@ export function Container({
          {label}
        </div>
      )}
-      <div className="overflow-scroll h-full rounded-b-xl">{children}</div>
+      <div className="overflow-hidden h-full rounded-b-xl">{children}</div>
    </div>
  );
 }
--- a/frontend/src/components/layout/count-badge.tsx
+++ b/frontend/src/components/layout/count-badge.tsx
@@ -0,0 +1,13 @@
+interface CountBadgeProps {
+  // Number displayed inside the badge.
+  count: number;
+}
+
+/** Small rounded pill that displays a numeric count. */
+export function CountBadge(props: CountBadgeProps) {
+  const { count } = props;
+  const pillClasses =
+    "text-[11px] leading-5 text-root-primary bg-neutral-400 px-1 rounded-xl";
+
+  return <span className={pillClasses}>{count}</span>;
+}
--- a/frontend/src/components/layout/nav-tab.tsx
+++ b/frontend/src/components/layout/nav-tab.tsx
@@ -4,7 +4,7 @@ import { BetaBadge } from "./beta-badge";

 interface NavTabProps {
  to: string;
-  label: string;
+  label: string | React.ReactNode;
  icon: React.ReactNode;
  isBeta?: boolean;
 }
--- a/frontend/src/context/ws-client-provider.tsx
+++ b/frontend/src/context/ws-client-provider.tsx
@@ -38,6 +38,7 @@ interface WsClientProviderProps {
  enabled: boolean;
  token: string | null;
  ghToken: string | null;
+  selectedRepository: string | null;
  settings: Settings | null;
 }

@@ -45,12 +46,14 @@ export function WsClientProvider({
  enabled,
  token,
  ghToken,
+  selectedRepository,
  settings,
  children,
 }: React.PropsWithChildren<WsClientProviderProps>) {
  const sioRef = React.useRef<Socket | null>(null);
  const tokenRef = React.useRef<string | null>(token);
  const ghTokenRef = React.useRef<string | null>(ghToken);
+  const selectedRepositoryRef = React.useRef<string | null>(selectedRepository);
  const disconnectRef = React.useRef<ReturnType<typeof setTimeout> | null>(
    null,
  );
@@ -81,8 +84,11 @@ export function WsClientProvider({
    if (ghToken) {
      initEvent.github_token = ghToken;
    }
+    if (selectedRepository) {
+      initEvent.selected_repository = selectedRepository;
+    }
    const lastEvent = lastEventRef.current;
-    if (lastEvent && !Number.isNaN(parseInt(lastEvent.id as string, 10))) {
+    if (lastEvent) {
      initEvent.latest_event_id = lastEvent.id;
    }
    send(initEvent);
@@ -93,7 +99,9 @@ export function WsClientProvider({
      messageRateHandler.record(new Date().getTime());
    }
    setEvents((prevEvents) => [...prevEvents, event]);
-    lastEventRef.current = event;
+    if (!Number.isNaN(parseInt(event.id as string, 10))) {
+      lastEventRef.current = event;
+    }
    const extras = event.extras as Record<string, unknown>;
    if (extras?.agent_state === AgentState.INIT) {
      setStatus(WsClientProviderStatus.ACTIVE);
@@ -156,6 +164,7 @@ export function WsClientProvider({
    sioRef.current = sio;
    tokenRef.current = token;
    ghTokenRef.current = ghToken;
+    selectedRepositoryRef.current = selectedRepository;

    return () => {
      sio.off("connect", handleConnect);
@@ -164,7 +173,7 @@ export function WsClientProvider({
      sio.off("connect_failed", handleError);
      sio.off("disconnect", handleDisconnect);
    };
-  }, [enabled, token, ghToken]);
+  }, [enabled, token, ghToken, selectedRepository]);

  // Strict mode mounts and unmounts each component twice, so we have to wait in the destructor
  // before actually disconnecting the socket and cancel the operation if the component gets remounted.
--- a/frontend/src/i18n/translation.json
+++ b/frontend/src/i18n/translation.json
@@ -1782,14 +1782,6 @@
    "fr": "Privé",
    "tr": "Özel"
  },
-  "ERROR_MESSAGE$SHOW_DETAILS": {
-    "en": "Show details",
-    "es": "Mostrar detalles"
-  },
-  "ERROR_MESSAGE$HIDE_DETAILS": {
-    "en": "Hide details",
-    "es": "Ocultar detalles"
-  },
  "STATUS$STARTING_RUNTIME": {
    "en": "Starting Runtime...",
    "zh-CN": "启动运行时...",
@@ -2012,5 +2004,41 @@
  "PROJECT_MENU_CARD_CONTEXT_MENU$DOWNLOAD_AS_ZIP_LABEL": {
    "en": "Download as .zip",
    "es": "Descargar como .zip"
+  },
+  "ACTION_MESSAGE$RUN": {
+    "en": "Running a bash command"
+  },
+  "ACTION_MESSAGE$RUN_IPYTHON": {
+    "en": "Running a Jupyter command"
+  },
+  "ACTION_MESSAGE$READ": {
+    "en": "Reading the contents of a file"
+  },
+  "ACTION_MESSAGE$WRITE": {
+    "en": "Writing to a file"
+  },
+  "ACTION_MESSAGE$BROWSE": {
+    "en": "Browsing the web"
+  },
+  "OBSERVATION_MESSAGE$RUN": {
+    "en": "Ran a bash command"
+  },
+  "OBSERVATION_MESSAGE$RUN_IPYTHON": {
+    "en": "Ran a Jupyter command"
+  },
+  "OBSERVATION_MESSAGE$READ": {
+    "en": "Read the contents of a file"
+  },
+  "OBSERVATION_MESSAGE$WRITE": {
+    "en": "Wrote to a file"
+  },
+  "OBSERVATION_MESSAGE$BROWSE": {
+    "en": "Browsing completed"
+  },
+  "EXPANDABLE_MESSAGE$SHOW_DETAILS": {
+    "en": "Show details"
+  },
+  "EXPANDABLE_MESSAGE$HIDE_DETAILS": {
+    "en": "Hide details"
  }
 }
--- a/frontend/src/icons/angle-down-solid.svg
+++ b/frontend/src/icons/angle-down-solid.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--!Font Awesome Free 6.7.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free Copyright 2024 Fonticons, Inc.--><path d="M201.4 374.6c12.5 12.5 32.8 12.5 45.3 0l160-160c12.5-12.5 12.5-32.8 0-45.3s-32.8-12.5-45.3 0L224 306.7 86.6 169.4c-12.5-12.5-32.8-12.5-45.3 0s-12.5 32.8 0 45.3l160 160z"/></svg>
--- a/frontend/src/icons/angle-up-solid.svg
+++ b/frontend/src/icons/angle-up-solid.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--!Font Awesome Free 6.7.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free Copyright 2024 Fonticons, Inc.--><path d="M201.4 137.4c12.5-12.5 32.8-12.5 45.3 0l160 160c12.5 12.5 12.5 32.8 0 45.3s-32.8 12.5-45.3 0L224 205.3 86.6 342.6c-12.5 12.5-32.8 12.5-45.3 0s-12.5-32.8 0-45.3l160-160z"/></svg>
--- a/frontend/src/index.css
+++ b/frontend/src/index.css
@@ -32,8 +32,11 @@ code {
  margin: 0;
  font-size: 85%;
  white-space: break-spaces;
-  background-color: var(--bg-neutral-muted);
-  border-radius: 6px;
+  background-color: #2a3038;
+  border-radius: 4px;
+  color: #e6edf3;
+  border: 1px solid #30363d;
+  letter-spacing: -0.2px;
 }

 .markdown-body pre code {
--- a/frontend/src/message.d.ts
+++ b/frontend/src/message.d.ts
@@ -1,13 +1,10 @@
 type Message = {
  sender: "user" | "assistant";
  content: string;
-  imageUrls: string[];
  timestamp: string;
+  imageUrls?: string[];
+  type?: "thought" | "error" | "action";
  pending?: boolean;
-};
-
-type ErrorMessage = {
-  error: boolean;
-  id?: string;
-  message: string;
+  translationID?: string;
+  eventID?: number;
 };
--- a/frontend/src/routes/_oh.app/hooks/use-ws-status-change.ts
+++ b/frontend/src/routes/_oh.app/hooks/use-ws-status-change.ts
@@ -6,7 +6,6 @@ import {
  WsClientProviderStatus,
 } from "#/context/ws-client-provider";
 import { createChatMessage } from "#/services/chat-service";
-import { getCloneRepoCommand } from "#/services/terminal-service";
 import { setCurrentAgentState } from "#/state/agent-slice";
 import { addUserMessage } from "#/state/chat-slice";
 import {
@@ -37,11 +36,6 @@ export const useWSStatusChange = () => {
    send(createChatMessage(query, base64Files, timestamp));
  };

-  const dispatchCloneRepoCommand = (ghToken: string, repository: string) => {
-    send(getCloneRepoCommand(ghToken, repository));
-    dispatch(clearSelectedRepository());
-  };
-
  const dispatchInitialQuery = (query: string, additionalInfo: string) => {
    if (additionalInfo) {
      sendInitialQuery(`${query}\n\n[${additionalInfo}]`, files);
@@ -57,7 +51,7 @@ export const useWSStatusChange = () => {
    let additionalInfo = "";

    if (gitHubToken && selectedRepository) {
-      dispatchCloneRepoCommand(gitHubToken, selectedRepository);
+      dispatch(clearSelectedRepository());
      additionalInfo = `Repository ${selectedRepository} has been cloned to /workspace. Please check the /workspace for files.`;
    } else if (importedProjectZip) {
      // if there's an uploaded project zip, add it to the chat
--- a/frontend/src/routes/_oh.app/route.tsx
+++ b/frontend/src/routes/_oh.app/route.tsx
@@ -21,6 +21,7 @@ import { useUserPrefs } from "#/context/user-prefs-context";
 import { useConversationConfig } from "#/hooks/query/use-conversation-config";
 import { Container } from "#/components/layout/container";
 import Security from "#/components/shared/modals/security/security";
+import { CountBadge } from "#/components/layout/count-badge";

 function App() {
  const { token, gitHubToken } = useAuth();
@@ -33,6 +34,8 @@ function App() {
    (state: RootState) => state.initalQuery,
  );

+  const { updateCount } = useSelector((state: RootState) => state.browser);
+
  const { data: latestGitHubCommit } = useLatestRepoCommit({
    repository: selectedRepository,
  });
@@ -64,26 +67,31 @@ function App() {
      enabled
      token={token}
      ghToken={gitHubToken}
+      selectedRepository={selectedRepository}
      settings={settings}
    >
      <EventHandler>
        <div className="flex flex-col h-full gap-3">
          <div className="flex h-full overflow-auto gap-3">
-            <Container className="w-[390px] max-h-full relative">
+            <Container className="w-full md:w-[390px] max-h-full relative">
              <ChatInterface />
            </Container>

-            <div className="flex flex-col grow gap-3">
+            <div className="hidden md:flex flex-col grow gap-3">
              <Container
                className="h-2/3"
                labels={[
                  { label: "Workspace", to: "", icon: <CodeIcon /> },
                  { label: "Jupyter", to: "jupyter", icon: <ListIcon /> },
                  {
-                    label: "Browser",
+                    label: (
+                      <div className="flex items-center gap-1">
+                        Browser
+                        {updateCount > 0 && <CountBadge count={updateCount} />}
+                      </div>
+                    ),
                    to: "browser",
                    icon: <GlobeIcon />,
-                    isBeta: true,
                  },
                ]}
              >
--- a/frontend/src/routes/_oh/route.tsx
+++ b/frontend/src/routes/_oh/route.tsx
@@ -6,9 +6,9 @@ import { useIsAuthed } from "#/hooks/query/use-is-authed";
 import { useAuth } from "#/context/auth-context";
 import { useUserPrefs } from "#/context/user-prefs-context";
 import { useConfig } from "#/hooks/query/use-config";
-import { AnalyticsConsentFormModal } from "#/components/features/analytics/analytics-consent-form-modal";
 import { Sidebar } from "#/components/features/sidebar/sidebar";
 import { WaitlistModal } from "#/components/features/waitlist/waitlist-modal";
+import { AnalyticsConsentFormModal } from "#/components/features/analytics/analytics-consent-form-modal";

 export function ErrorBoundary() {
  const error = useRouteError();
@@ -79,18 +79,19 @@ export default function MainApp() {
  return (
    <div
      data-testid="root-layout"
-      className="bg-root-primary p-3 h-screen min-w-[1024px] overflow-x-hidden flex gap-3"
+      className="bg-root-primary p-3 h-screen md:min-w-[1024px] overflow-x-hidden flex flex-col md:flex-row gap-3"
    >
      <Sidebar />

-      <div className="h-full w-full relative">
+      <div className="h-[calc(100%-50px)] md:h-full w-full relative">
        <Outlet />
      </div>

      {isInWaitlist && (
        <WaitlistModal ghToken={gitHubToken} githubAuthUrl={gitHubAuthUrl} />
      )}
-      {consentFormIsOpen && (
+
+      {config.data?.APP_MODE === "oss" && consentFormIsOpen && (
        <AnalyticsConsentFormModal
          onClose={() => setConsentFormIsOpen(false)}
        />
--- a/frontend/src/services/actions.ts
+++ b/frontend/src/services/actions.ts
@@ -1,14 +1,12 @@
 import {
  addAssistantMessage,
+  addAssistantAction,
  addUserMessage,
  addErrorMessage,
 } from "#/state/chat-slice";
+import { appendSecurityAnalyzerInput } from "#/state/security-analyzer-slice";
 import { setCode, setActiveFilepath } from "#/state/code-slice";
 import { appendJupyterInput } from "#/state/jupyter-slice";
-import {
-  ActionSecurityRisk,
-  appendSecurityAnalyzerInput,
-} from "#/state/security-analyzer-slice";
 import { setCurStatusMessage } from "#/state/status-slice";
 import store from "#/store";
 import ActionType from "#/types/action-type";
@@ -17,21 +15,16 @@ import {
  ObservationMessage,
  StatusMessage,
 } from "#/types/message";
-import EventLogger from "#/utils/event-logger";
 import { handleObservationMessage } from "./observations";

 const messageActions = {
  [ActionType.BROWSE]: (message: ActionMessage) => {
-    if (message.args.thought) {
-      store.dispatch(addAssistantMessage(message.args.thought));
-    } else {
+    if (!message.args.thought && message.message) {
      store.dispatch(addAssistantMessage(message.message));
    }
  },
  [ActionType.BROWSE_INTERACTIVE]: (message: ActionMessage) => {
-    if (message.args.thought) {
-      store.dispatch(addAssistantMessage(message.args.thought));
-    } else {
+    if (!message.args.thought && message.message) {
      store.dispatch(addAssistantMessage(message.message));
    }
  },
@@ -54,74 +47,28 @@ const messageActions = {
      store.dispatch(addAssistantMessage(message.args.content));
    }
  },
-  [ActionType.FINISH]: (message: ActionMessage) => {
-    store.dispatch(addAssistantMessage(message.message));
-  },
-  [ActionType.REJECT]: (message: ActionMessage) => {
-    store.dispatch(addAssistantMessage(message.message));
-  },
-  [ActionType.DELEGATE]: (message: ActionMessage) => {
-    store.dispatch(addAssistantMessage(message.message));
-  },
-  [ActionType.RUN]: (message: ActionMessage) => {
-    if (message.args.hidden) return;
-    if (message.args.thought) {
-      store.dispatch(addAssistantMessage(message.args.thought));
-    }
-  },
  [ActionType.RUN_IPYTHON]: (message: ActionMessage) => {
-    if (message.args.thought) {
-      store.dispatch(addAssistantMessage(message.args.thought));
-    }
    if (message.args.confirmation_state !== "rejected") {
      store.dispatch(appendJupyterInput(message.args.code));
    }
  },
 };

-function getRiskText(risk: ActionSecurityRisk) {
-  switch (risk) {
-    case ActionSecurityRisk.LOW:
-      return "Low Risk";
-    case ActionSecurityRisk.MEDIUM:
-      return "Medium Risk";
-    case ActionSecurityRisk.HIGH:
-      return "High Risk";
-    case ActionSecurityRisk.UNKNOWN:
-    default:
-      return "Unknown Risk";
-  }
-}
-
 export function handleActionMessage(message: ActionMessage) {
+  if (message.args?.hidden) {
+    return;
+  }
+
  if ("args" in message && "security_risk" in message.args) {
    store.dispatch(appendSecurityAnalyzerInput(message));
  }

-  if (
-    (message.action === ActionType.RUN ||
-      message.action === ActionType.RUN_IPYTHON) &&
-    message.args.confirmation_state === "awaiting_confirmation"
-  ) {
-    if (message.args.thought) {
+  if (message.source === "agent") {
+    if (message.args && message.args.thought) {
      store.dispatch(addAssistantMessage(message.args.thought));
    }
-    if (message.args.command) {
-      store.dispatch(
-        addAssistantMessage(
-          `Running this command now: \n\`\`\`\`bash\n${message.args.command}\n\`\`\`\`\nEstimated security risk: ${getRiskText(message.args.security_risk as unknown as ActionSecurityRisk)}`,
-        ),
-      );
-    } else if (message.args.code) {
-      store.dispatch(
-        addAssistantMessage(
-          `Running this code now: \n\`\`\`\`python\n${message.args.code}\n\`\`\`\`\nEstimated security risk: ${getRiskText(message.args.security_risk as unknown as ActionSecurityRisk)}`,
-        ),
-      );
-    } else {
-      store.dispatch(addAssistantMessage(message.message));
-    }
-    return;
+    // @ts-expect-error TODO: fix
+    store.dispatch(addAssistantAction(message));
  }

  if (message.action in messageActions) {
@@ -155,6 +102,10 @@ export function handleAssistantMessage(message: Record<string, unknown>) {
  } else if (message.status_update) {
    handleStatusMessage(message as unknown as StatusMessage);
  } else {
-    EventLogger.error(`Unknown message type ${message}`);
+    store.dispatch(
+      addErrorMessage({
+        message: "Unknown message type received",
+      }),
+    );
  }
 }
--- a/frontend/src/services/observations.ts
+++ b/frontend/src/services/observations.ts
@@ -2,10 +2,14 @@ import { setCurrentAgentState } from "#/state/agent-slice";
 import { setUrl, setScreenshotSrc } from "#/state/browser-slice";
 import store from "#/store";
 import { ObservationMessage } from "#/types/message";
+import AgentState from "#/types/agent-state";
 import { appendOutput } from "#/state/command-slice";
 import { appendJupyterOutput } from "#/state/jupyter-slice";
 import ObservationType from "#/types/observation-type";
-import { addAssistantMessage } from "#/state/chat-slice";
+import {
+  addAssistantMessage,
+  addAssistantObservation,
+} from "#/state/chat-slice";

 export function handleObservationMessage(message: ObservationMessage) {
  switch (message.observation) {
@@ -46,4 +50,120 @@ export function handleObservationMessage(message: ObservationMessage) {
      store.dispatch(addAssistantMessage(message.message));
      break;
  }
+  if (!message.extras?.hidden) {
+    // Convert the message to the appropriate observation type
+    const { observation } = message;
+    const baseObservation = {
+      ...message,
+      source: "agent" as const,
+    };
+
+    switch (observation) {
+      case "agent_state_changed":
+        store.dispatch(
+          addAssistantObservation({
+            ...baseObservation,
+            observation: "agent_state_changed" as const,
+            extras: {
+              agent_state: (message.extras.agent_state as AgentState) || "idle",
+            },
+          }),
+        );
+        break;
+      case "run":
+        store.dispatch(
+          addAssistantObservation({
+            ...baseObservation,
+            observation: "run" as const,
+            extras: {
+              command: String(message.extras.command || ""),
+              command_id: Number(message.extras.command_id || 0),
+              exit_code: Number(message.extras.exit_code || 0),
+              hidden: Boolean(message.extras.hidden),
+            },
+          }),
+        );
+        break;
+      case "run_ipython":
+        store.dispatch(
+          addAssistantObservation({
+            ...baseObservation,
+            observation: "run_ipython" as const,
+            extras: {
+              code: String(message.extras.code || ""),
+            },
+          }),
+        );
+        break;
+      case "delegate":
+        store.dispatch(
+          addAssistantObservation({
+            ...baseObservation,
+            observation: "delegate" as const,
+            extras: {
+              outputs:
+                typeof message.extras.outputs === "object"
+                  ? (message.extras.outputs as Record<string, unknown>)
+                  : {},
+            },
+          }),
+        );
+        break;
+      case "browse":
+        store.dispatch(
+          addAssistantObservation({
+            ...baseObservation,
+            observation: "browse" as const,
+            extras: {
+              url: String(message.extras.url || ""),
+              screenshot: String(message.extras.screenshot || ""),
+              error: Boolean(message.extras.error),
+              open_page_urls: Array.isArray(message.extras.open_page_urls)
+                ? message.extras.open_page_urls
+                : [],
+              active_page_index: Number(message.extras.active_page_index || 0),
+              dom_object:
+                typeof message.extras.dom_object === "object"
+                  ? (message.extras.dom_object as Record<string, unknown>)
+                  : {},
+              axtree_object:
+                typeof message.extras.axtree_object === "object"
+                  ? (message.extras.axtree_object as Record<string, unknown>)
+                  : {},
+              extra_element_properties:
+                typeof message.extras.extra_element_properties === "object"
+                  ? (message.extras.extra_element_properties as Record<
+                      string,
+                      unknown
+                    >)
+                  : {},
+              last_browser_action: String(
+                message.extras.last_browser_action || "",
+              ),
+              last_browser_action_error:
+                message.extras.last_browser_action_error,
+              focused_element_bid: String(
+                message.extras.focused_element_bid || "",
+              ),
+            },
+          }),
+        );
+        break;
+      case "error":
+        store.dispatch(
+          addAssistantObservation({
+            ...baseObservation,
+            observation: "error" as const,
+            source: "user" as const,
+            extras: {
+              error_id: message.extras.error_id,
+            },
+          }),
+        );
+        break;
+      default:
+        // For any unhandled observation types, just ignore them
+        break;
+    }
+  }
 }
--- a/frontend/src/services/terminal-service.ts
+++ b/frontend/src/services/terminal-service.ts
@@ -10,11 +10,3 @@ export function getGitHubTokenCommand(gitHubToken: string) {
  const event = getTerminalCommand(command, true);
  return event;
 }
-
-export function getCloneRepoCommand(gitHubToken: string, repository: string) {
-  const url = `https://${gitHubToken}@github.com/${repository}.git`;
-  const dirName = repository.split("/")[1];
-  const command = `git clone ${url} ${dirName} ; cd ${dirName} ; git checkout -b openhands-workspace`;
-  const event = getTerminalCommand(command, true);
-  return event;
-}
--- a/frontend/src/state/browser-slice.ts
+++ b/frontend/src/state/browser-slice.ts
@@ -5,6 +5,8 @@ export const initialState = {
  url: "https://github.com/All-Hands-AI/OpenHands",
  // Base64-encoded screenshot of browser window (placeholder for now, will be replaced with the actual screenshot later)
  screenshotSrc: "",
+  // Counter for browser updates
+  updateCount: 0,
 };

 export const browserSlice = createSlice({
@@ -16,6 +18,7 @@ export const browserSlice = createSlice({
    },
    setScreenshotSrc: (state, action) => {
      state.screenshotSrc = action.payload;
+      state.updateCount += 1;
    },
  },
 });
--- a/frontend/src/state/chat-slice.ts
+++ b/frontend/src/state/chat-slice.ts
@@ -1,6 +1,28 @@
 import { createSlice, PayloadAction } from "@reduxjs/toolkit";

-type SliceState = { messages: (Message | ErrorMessage)[] };
+import { ActionSecurityRisk } from "#/state/security-analyzer-slice";
+import { OpenHandsObservation } from "#/types/core/observations";
+import { OpenHandsAction } from "#/types/core/actions";
+
+type SliceState = { messages: Message[] };
+
+const MAX_CONTENT_LENGTH = 1000;
+
+const HANDLED_ACTIONS = ["run", "run_ipython", "write", "read", "browse"];
+
+function getRiskText(risk: ActionSecurityRisk) {
+  switch (risk) {
+    case ActionSecurityRisk.LOW:
+      return "Low Risk";
+    case ActionSecurityRisk.MEDIUM:
+      return "Medium Risk";
+    case ActionSecurityRisk.HIGH:
+      return "High Risk";
+    case ActionSecurityRisk.UNKNOWN:
+    default:
+      return "Unknown Risk";
+  }
+}

 const initialState: SliceState = {
  messages: [],
@@ -20,6 +42,7 @@ export const chatSlice = createSlice({
      }>,
    ) {
      const message: Message = {
+        type: "thought",
        sender: "user",
        content: action.payload.content,
        imageUrls: action.payload.imageUrls,
@@ -40,6 +63,7 @@ export const chatSlice = createSlice({

    addAssistantMessage(state, action: PayloadAction<string>) {
      const message: Message = {
+        type: "thought",
        sender: "assistant",
        content: action.payload,
        imageUrls: [],
@@ -49,12 +73,96 @@ export const chatSlice = createSlice({
      state.messages.push(message);
    },

+    addAssistantAction(state, action: PayloadAction<OpenHandsAction>) {
+      const actionID = action.payload.action;
+      if (!HANDLED_ACTIONS.includes(actionID)) {
+        return;
+      }
+      const translationID = `ACTION_MESSAGE$${actionID.toUpperCase()}`;
+      let text = "";
+      if (actionID === "run") {
+        text = `\`${action.payload.args.command}\``;
+      } else if (actionID === "run_ipython") {
+        text = `\`\`\`\n${action.payload.args.code}\n\`\`\``;
+      } else if (actionID === "write") {
+        let { content } = action.payload.args;
+        if (content.length > MAX_CONTENT_LENGTH) {
+          content = `${content.slice(0, MAX_CONTENT_LENGTH)}...`;
+        }
+        text = `${action.payload.args.path}\n${content}`;
+      } else if (actionID === "read") {
+        text = action.payload.args.path;
+      } else if (actionID === "browse") {
+        text = `Browsing ${action.payload.args.url}`;
+      }
+      if (actionID === "run" || actionID === "run_ipython") {
+        if (
+          action.payload.args.confirmation_state === "awaiting_confirmation"
+        ) {
+          text += `\n\n${getRiskText(action.payload.args.security_risk as unknown as ActionSecurityRisk)}`;
+        }
+      }
+      const message: Message = {
+        type: "action",
+        sender: "assistant",
+        translationID,
+        eventID: action.payload.id,
+        content: text,
+        imageUrls: [],
+        timestamp: new Date().toISOString(),
+      };
+      state.messages.push(message);
+    },
+
+    addAssistantObservation(
+      state,
+      observation: PayloadAction<OpenHandsObservation>,
+    ) {
+      const observationID = observation.payload.observation;
+      if (!HANDLED_ACTIONS.includes(observationID)) {
+        return;
+      }
+      const translationID = `OBSERVATION_MESSAGE$${observationID.toUpperCase()}`;
+      const causeID = observation.payload.cause;
+      const causeMessage = state.messages.find(
+        (message) => message.eventID === causeID,
+      );
+      if (!causeMessage) {
+        return;
+      }
+      causeMessage.translationID = translationID;
+      if (observationID === "run" || observationID === "run_ipython") {
+        let { content } = observation.payload;
+        if (content.length > MAX_CONTENT_LENGTH) {
+          content = `${content.slice(0, MAX_CONTENT_LENGTH)}...`;
+        }
+        content = `\`\`\`\n${content}\n\`\`\``;
+        causeMessage.content = content; // Observation content includes the action
+      } else if (observationID === "browse") {
+        let content = `**URL:** ${observation.payload.extras.url}\n`;
+        if (observation.payload.extras.error) {
+          content += `**Error:**\n${observation.payload.extras.error}\n`;
+        }
+        content += `**Output:**\n${observation.payload.content}`;
+        if (content.length > MAX_CONTENT_LENGTH) {
+          content = `${content.slice(0, MAX_CONTENT_LENGTH)}...`;
+        }
+        causeMessage.content = content;
+      }
+    },
+
    addErrorMessage(
      state,
      action: PayloadAction<{ id?: string; message: string }>,
    ) {
      const { id, message } = action.payload;
-      state.messages.push({ id, message, error: true });
+      state.messages.push({
+        translationID: id,
+        content: message,
+        type: "error",
+        sender: "assistant",
+        timestamp: new Date().toISOString(),
+      });
    },

    clearMessages(state) {
@@ -66,6 +174,8 @@ export const chatSlice = createSlice({
 export const {
  addUserMessage,
  addAssistantMessage,
+  addAssistantAction,
+  addAssistantObservation,
  addErrorMessage,
  clearMessages,
 } = chatSlice.actions;
--- a/frontend/src/types/core/actions.ts
+++ b/frontend/src/types/core/actions.ts
@@ -1,4 +1,5 @@
 import { OpenHandsActionEvent } from "./base";
+import { ActionSecurityRisk } from "#/state/security-analyzer-slice";

 export interface UserMessageAction extends OpenHandsActionEvent<"message"> {
  source: "user";
@@ -12,6 +13,7 @@ export interface CommandAction extends OpenHandsActionEvent<"run"> {
  source: "agent";
  args: {
    command: string;
+    security_risk: ActionSecurityRisk;
    confirmation_state: "confirmed" | "rejected" | "awaiting_confirmation";
    thought: string;
    hidden?: boolean;
@@ -32,6 +34,7 @@ export interface IPythonAction extends OpenHandsActionEvent<"run_ipython"> {
  source: "agent";
  args: {
    code: string;
+    security_risk: ActionSecurityRisk;
    confirmation_state: "confirmed" | "rejected" | "awaiting_confirmation";
    kernel_init_code: string;
    thought: string;
@@ -96,6 +99,23 @@ export interface ModifyTaskAction extends OpenHandsActionEvent<"modify_task"> {
  };
 }

+export interface FileReadAction extends OpenHandsActionEvent<"read"> {
+  source: "agent";
+  args: {
+    path: string;
+    thought: string;
+  };
+}
+
+export interface FileWriteAction extends OpenHandsActionEvent<"write"> {
+  source: "agent";
+  args: {
+    path: string;
+    content: string;
+    thought: string;
+  };
+}
+
 export interface RejectAction extends OpenHandsActionEvent<"reject"> {
  source: "agent";
  args: {
@@ -112,6 +132,8 @@ export type OpenHandsAction =
  | DelegateAction
  | BrowseAction
  | BrowseInteractiveAction
+  | FileReadAction
+  | FileWriteAction
  | AddTaskAction
  | ModifyTaskAction
  | RejectAction;
--- a/frontend/src/types/core/base.ts
+++ b/frontend/src/types/core/base.ts
@@ -1,7 +1,9 @@
-type OpenHandsEventType =
+export type OpenHandsEventType =
  | "message"
  | "agent_state_changed"
  | "run"
+  | "read"
+  | "write"
  | "run_ipython"
  | "delegate"
  | "browse"
--- a/frontend/src/types/message.tsx
+++ b/frontend/src/types/message.tsx
@@ -1,6 +1,8 @@
 export interface ActionMessage {
+  id: number;
+
  // Either 'agent' or 'user'
-  source: string;
+  source: "agent" | "user";

  // The action to be taken
  action: string;
@@ -19,6 +21,9 @@ export interface ObservationMessage {
  // The type of observation
  observation: string;

+  id: number;
+  cause: number;
+
  // The observed data
  content: string;

--- a/frontend/tailwind.config.js
+++ b/frontend/tailwind.config.js
@@ -1,5 +1,7 @@
 /** @type {import('tailwindcss').Config} */
-const { nextui } = require("@nextui-org/react");
+import { nextui } from "@nextui-org/react";
+import typography from '@tailwindcss/typography';
+
 export default {
  content: [
    "./src/**/*.{js,ts,jsx,tsx}",
@@ -33,6 +35,6 @@ export default {
        }
      }
    }),
-    require('@tailwindcss/typography'),
+    typography,
  ],
 };
--- a/openhands/init.py
+++ b/openhands/init.py
@@ -4,6 +4,16 @@ __package_name__ = 'openhands_ai'


 def get_version():
+    # Try getting the version from pyproject.toml
+    try:
+        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        with open(os.path.join(root_dir, 'pyproject.toml'), 'r') as f:
+            for line in f:
+                if line.startswith('version ='):
+                    return line.split('=')[1].strip().strip('"')
+    except FileNotFoundError:
+        pass
+
    try:
        from importlib.metadata import PackageNotFoundError, version

@@ -18,16 +28,6 @@ def get_version():
    except (ImportError, DistributionNotFound):
        pass

-    # Try getting the version from pyproject.toml
-    try:
-        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-        with open(os.path.join(root_dir, 'pyproject.toml'), 'r') as f:
-            for line in f:
-                if line.startswith('version ='):
-                    return line.split('=')[1].strip().strip('"')
-    except FileNotFoundError:
-        pass
-
    return 'unknown'


--- a/openhands/agenthub/codeact_agent/README.md
+++ b/openhands/agenthub/codeact_agent/README.md
@@ -1,28 +1,75 @@
 # CodeAct Agent Framework

-This folder implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
+This folder is an implementation of OpenHands's main agent, the CodeAct Agent. It is based on CodeAct ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)), an idea that consolidates LLM agents' **act**ions into a unified **code** action space for both *simplicity* and *performance*.

-The conceptual idea is illustrated below. At each turn, the agent can:
+## Overview
+
+The CodeAct agent operates through a function calling interface. At each turn, the agent can:

 1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
-2. **CodeAct**: Choose to perform the task by executing code
-   - Execute any valid Linux `bash` command
-   - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.
+2. **CodeAct**: Execute actions through a set of well-defined tools:
+   - Execute Linux `bash` commands with `execute_bash`
+   - Run Python code in an [IPython](https://ipython.org/) environment with `execute_ipython_cell`
+   - Interact with web browsers using `browser` and `web_read`
+   - Edit files using `str_replace_editor` or `edit_file`

 ![image](https://github.com/All-Hands-AI/OpenHands/assets/38853559/92b622e3-72ad-4a61-8f41-8c040b6d5fb3)

+## Built-in Tools
+
+The agent provides several built-in tools:
+
+### 1. `execute_bash`
+- Execute any valid Linux bash command
+- Handles long-running commands by running them in background with output redirection
+- Supports interactive processes with STDIN input and process interruption
+- Handles command timeouts with automatic retry in background mode
+
+### 2. `execute_ipython_cell`
+- Run Python code in an IPython environment
+- Supports magic commands like `%pip`
+- Variables are scoped to the IPython environment
+- Requires defining variables and importing packages before use
+
+### 3. `web_read` and `browser`
+- `web_read`: Read and convert webpage content to markdown
+- `browser`: Interact with webpages through Python code
+- Supports common browser actions like navigation, clicking, form filling, scrolling
+- Handles file uploads and drag-and-drop operations
+
+### 4. `str_replace_editor`
+- View, create and edit files through string replacement
+- Persistent state across command calls
+- File viewing with line numbers
+- String replacement with exact matching
+- Undo functionality for edits
+
+### 5. `edit_file` (LLM-based)
+- Edit files using LLM-based content generation
+- Support for partial file edits with line ranges
+- Handles large files by editing specific sections
+- Append mode for adding content to files
+
+## Configuration
+
+Tools can be enabled/disabled through configuration parameters:
+- `codeact_enable_browsing`: Enable browser interaction tools
+- `codeact_enable_jupyter`: Enable IPython code execution
+- `codeact_enable_llm_editor`: Enable LLM-based file editing (falls back to string replacement editor if disabled)
+
+## Micro-agents
+
+The agent includes specialized micro-agents for specific tasks:
+
+1. **npm**: Handles npm package installation with non-interactive shell workarounds
+2. **github**: Manages GitHub operations with API token support and PR creation guidelines
+3. **flarglebargle**: Easter egg response handler
+
 ## Adding New Tools

-The CodeAct agent uses a function calling interface to define tools that the agent can use. Tools are defined in `function_calling.py` using the `ChatCompletionToolParam` class from `litellm`. Each tool consists of:
-
-1. A description string that explains what the tool does and how to use it
-2. A tool definition using `ChatCompletionToolParam` that specifies:
-   - The tool's name
-   - The tool's parameters and their types
-   - Required vs optional parameters
-
-Here's an example of how a tool is defined:
+The CodeAct agent uses a function calling interface based on `litellm`'s `ChatCompletionToolParam`. To add a new tool:

+1. Define the tool in `function_calling.py`:
 ```python
 MyTool = ChatCompletionToolParam(
    type='function',
@@ -47,20 +94,20 @@ MyTool = ChatCompletionToolParam(
 )
 ```

-To add a new tool:
+2. Add the tool to `get_tools()` in `function_calling.py`
+3. Implement the corresponding action handler in the agent class

-1. Define your tool in `function_calling.py` following the pattern above
-2. Add your tool to the `get_tools()` function in `function_calling.py`
-3. Implement the corresponding action handler in the agent to process the tool's invocation
+## Implementation Details

-The agent currently supports several built-in tools:
- `execute_bash`: Execute bash commands
- `execute_ipython_cell`: Run Python code in IPython
- `browser`: Interact with a web browser
- `str_replace_editor`: Edit files using string replacement
- `edit_file`: Edit files using LLM-based editing
+The agent is implemented in two main files:

-Tools can be enabled/disabled through configuration parameters:
- `codeact_enable_browsing`: Enable browser interaction
- `codeact_enable_jupyter`: Enable IPython code execution
- `codeact_enable_llm_editor`: Enable LLM-based file editing (if disabled, uses string replacement editor instead)
+1. `codeact_agent.py`: Core agent implementation with:
+   - Message history management
+   - Tool execution handling
+   - State management
+   - Action/observation processing
+
+2. `function_calling.py`: Tool definitions and function calling interface with:
+   - Tool parameter specifications
+   - Tool descriptions and examples
+   - Function calling response parsing
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -154,10 +154,7 @@ class CodeActAgent(Agent):
                BrowseInteractiveAction,
                BrowseURLAction,
            ),
-        ) or (
-            isinstance(action, (AgentFinishAction, CmdRunAction))
-            and action.source == 'agent'
-        ):
+        ) or (isinstance(action, CmdRunAction) and action.source == 'agent'):
            tool_metadata = action.tool_call_metadata
            assert tool_metadata is not None, (
                'Tool call metadata should NOT be None when function calling is enabled. Action: '
@@ -166,6 +163,7 @@ class CodeActAgent(Agent):

            llm_response: ModelResponse = tool_metadata.model_response
            assistant_msg = llm_response.choices[0].message
+
            # Add the LLM message (assistant) that initiated the tool calls
            # (overwrites any previous message with the same response_id)
            pending_tool_call_action_messages[llm_response.id] = Message(
@@ -177,6 +175,33 @@ class CodeActAgent(Agent):
                tool_calls=assistant_msg.tool_calls,
            )
            return []
+        elif isinstance(action, AgentFinishAction):
+            role = 'user' if action.source == 'user' else 'assistant'
+
+            # when agent finishes, it has tool_metadata
+            # which has already been executed, and it doesn't have a response
+            # when the user finishes (/exit), we don't have tool_metadata
+            tool_metadata = action.tool_call_metadata
+            if tool_metadata is not None:
+                # take the response message from the tool call
+                assistant_msg = tool_metadata.model_response.choices[0].message
+                content = assistant_msg.content or ''
+
+                # save content if any, to thought
+                if action.thought:
+                    if action.thought != content:
+                        action.thought += '\n' + content
+                else:
+                    action.thought = content
+
+                # remove the tool call metadata
+                action.tool_call_metadata = None
+            return [
+                Message(
+                    role=role,
+                    content=[TextContent(text=action.thought)],
+                )
+            ]
        elif isinstance(action, MessageAction):
            role = 'user' if action.source == 'user' else 'assistant'
            content = [TextContent(text=action.content or '')]
@@ -373,6 +398,9 @@ class CodeActAgent(Agent):
            - Messages from the same role are combined to prevent consecutive same-role messages
            - For Anthropic models, specific messages are cached according to their documentation
        """
+        if not self.prompt_manager:
+            raise Exception('Prompt Manager not instantiated.')
+
        messages: list[Message] = [
            Message(
                role='system',
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Xingyao Wang	907c65cc00	chore: add back accidentally removed repo info (#5532 )	2024-12-12 05:51:05 +08:00
tofarr	a6d1a4c98f	Fix: Redis listener attached at startup (#5516 )	2024-12-11 09:39:57 -05:00
Robert Brennan	a60ee09881	Add docker layer caching to ghcr build (#5517 )	2024-12-11 09:39:09 -05:00
Graham Neubig	246107c618	Parallize Python Unit tests (#5499 )	2024-12-11 01:05:29 -08:00
Robert Brennan	5fa18511b3	minor fixes for when commands time out (#5518 )	2024-12-10 21:55:02 +00:00
Rohit Malhotra	a482182a9e	Remove Beta label from Browser tab (#5484 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-12-10 15:10:00 -05:00
tofarr	58d22a1905	Fix for issue where double scroll hides save button (#5488 )	2024-12-10 19:50:49 +00:00
dependabot[bot]	17bbfa29a1	chore(deps): bump react-use from 17.5.1 to 17.6.0 in /docs in the version-all group (#5505 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-12-10 23:40:20 +04:00
tofarr	5fe116cfb1	Make layout responsive for mobile devices (#5475 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-12-11 03:31:20 +08:00
Xingyao Wang	e9637d40b9	Add browser observations to chat interface (#5514 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-12-11 03:30:44 +08:00
OpenHands	6de177521f	Fix issue #5450 : In openhands-resolver.yml, request code review from the person who initiated the workflow (#5451 ) Co-authored-by: Graham Neubig <neubig@gmail.com>	2024-12-10 12:19:55 -05:00
Xingyao Wang	9d36b80b96	Fix duplicate search messages in web browsing actions (#5511 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-12-10 15:51:03 +00:00
Engel Nyst	b11e905988	Verify costs script (#5469 )	2024-12-10 14:20:53 +01:00
dependabot[bot]	39e5311233	chore(deps-dev): bump the llama group across 1 directory with 3 updates (#5503 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-12-10 07:00:45 +00:00
Engel Nyst	651ed1c3c8	Dependabot config for any browsergym-* package (#5501 )	2024-12-10 01:27:11 -05:00
tofarr	e27c2e9c99	Fix: Auto-refresh file content when selected file changes (#5476 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-12-09 21:17:41 -05:00
Rohit Malhotra	cfe222e1d5	Fix issue #5162 : docs: Improve GitHub token setup documentation in UI… (#5491 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-12-09 21:14:55 -05:00
tofarr	c872af4658	Doc: Added troubleshooting section for Nebulous docker errors (#5482 )	2024-12-09 22:04:23 +00:00
OpenHands	99fa6c6a4a	Fix issue #5186 : [Bug]: Fix up inline code styles in chat window (#5226 ) Co-authored-by: Graham Neubig <neubig@gmail.com> Co-authored-by: amanape <83104063+amanape@users.noreply.github.com>	2024-12-09 16:33:25 -05:00
OpenHands	3946f813a4	Fix issue #5471 : Resolver: LLM_MODEL should use "variable" instead of "secret" (#5477 )	2024-12-09 16:08:45 -05:00
Engel Nyst	455e667739	add cost to summary (#5473 )	2024-12-10 03:14:03 +08:00
Engel Nyst	2874041381	Fix stuck execution flow (#5458 )	2024-12-08 22:39:32 +01:00
Engel Nyst	279e1d7abc	Resolver minor tweaks (#5461 )	2024-12-08 12:34:01 -05:00
Graham Neubig	a7e4a7aa63	Improve error message when issue/PR not found in resolver (#5455 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-12-07 23:34:55 -05:00
Engel Nyst	2466d903df	Update version (#5459 )	2024-12-07 18:59:46 -05:00
Cheng Yang	424cdf121a	Feat/better log: Add colorize function and TermColor enum for text coloring (#5410 ) Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>	2024-12-07 16:30:40 -05:00
Graham Neubig	6972f4806f	Update resolver README.md to fix repo location (#5454 )	2024-12-07 21:02:45 +00:00
Graham Neubig	78cc552e3a	Fix syntax in external openhands-resolver.yml (#5453 )	2024-12-07 20:46:20 +00:00
Graham Neubig	a241b9ff98	fix: Update frontend tests to support Node.js 22.x (#5444 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-12-07 04:58:27 +01:00
Regis David Souza Mesquita	c757d7c613	Allows using the github-resolver without a PAT (#5278 ) Co-authored-by: Rohit Malhotra <rohitvinodmalhotra@gmail.com>	2024-12-07 02:59:08 +00:00
Raj Maheshwari	2b06e4e5d0	[Feat] Custom MicroAgents. (#4983 ) Co-authored-by: diwu-sf <di.wu@shadowfaxdata.com>	2024-12-06 17:11:06 -05:00
diwu-sf	cf157c86b3	rename socket.py to listen_socket.py to avoid circular import (#5373 )	2024-12-06 20:13:41 +00:00
mamoodi	f2dc3663d7	Release 0.15.1 (#5437 )	2024-12-06 14:02:29 -05:00
mamoodi	e4e3e4abb8	Revert "issue/4599-Add cursor position information on the bottom of the editor area" (#5440 )	2024-12-06 18:16:28 +00:00
dependabot[bot]	22292f72cd	chore(deps-dev): bump llama-index from 0.12.2 to 0.12.3 in the llama group (#5434 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-12-06 17:47:38 +01:00
Engel Nyst	f4ee3a4cb6	e2b take two (#5433 )	2024-12-06 16:02:16 +00:00
STF-Zero	2df426732a	issue/4599-Add cursor position information on the bottom of the editor area (#5379 )	2024-12-06 15:42:15 +04:00
Engel Nyst	e81623110d	Fix finish action (#5428 )	2024-12-06 04:36:19 +01:00
tofarr	de81020a8d	Feat: Introduce class for SessionInitData rather than using a dict (#5406 )	2024-12-05 13:11:00 -07:00
Engel Nyst	1146b6248b	Support multiline and default user messages (#5400 )	2024-12-05 21:03:18 +01:00
tofarr	c3ddb26e43	Feat named imports (#5413 )	2024-12-05 12:10:52 -07:00
dependabot[bot]	3d853f7db3	chore(deps-dev): bump chromadb from 0.5.20 to 0.5.23 in the chromadb group across 1 directory (#5420 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-12-05 18:44:04 +01:00
tofarr	027c642268	Fix duplicate events on reinit (#5424 )	2024-12-05 10:09:53 -07:00
sp.wack	910b2a9b9e	chore(frontend): Remove initial analytics modal and update waitlist modal (#5416 )	2024-12-05 20:57:51 +04:00
Robert Brennan	ea96ffca9b	fix messages (#5421 )	2024-12-05 11:38:02 -05:00
sp.wack	7ec407dc50	chore(frontend): Update `msw` (#5367 )	2024-12-05 18:53:50 +04:00
Graham Neubig	83b94786a3	docs: Update CodeAct agent documentation (#5418 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-12-05 22:25:54 +08:00
dependabot[bot]	786cde39fd	chore(deps): bump react-icons from 5.3.0 to 5.4.0 in /docs in the version-all group (#5404 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-12-05 10:57:47 +04:00
tofarr	ceb60b9a37	Prioritize version from pyproject.toml (#5412 )	2024-12-04 21:34:07 +01:00
OpenHands	794408cd31	Fix issue #5383 : [Bug]: LLM Cost is added to the `metrics` twice (#5396 ) Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>	2024-12-04 21:32:08 +01:00
tofarr	9aa89e8f2f	Fix: Only send the last agent state changed event (#5411 )	2024-12-04 19:18:47 +00:00
Engel Nyst	3314b97cb2	Fix e2b import (#5409 )	2024-12-04 18:44:57 +00:00
Cheng Yang	8f47547b08	docs: fix markdown linting and broken links (#5401 )	2024-12-05 01:28:04 +08:00
Ryan H. Tran	c5117bc48d	Upgrade `openhands-aci` to v0.1.2 (#5397 )	2024-12-05 01:25:24 +08:00
mamoodi	851d88593c	Release 0.15.0 (#5402 )	2024-12-04 10:08:22 -05:00
Xingyao Wang	9908e1b285	[Evaluation]: Log openhands version in eval output folder, instead of agent version (#5394 )	2024-12-04 03:33:43 +00:00
Robert Brennan	793e142c4a	Show all actions in the message window (#5190 ) Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Graham Neubig <neubig@gmail.com> Co-authored-by: amanape <83104063+amanape@users.noreply.github.com>	2024-12-03 18:29:49 -05:00
				`@@ -0,0 +1 @@`
				`<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--!Font Awesome Free 6.7.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free Copyright 2024 Fonticons, Inc.--><path d="M201.4 374.6c12.5 12.5 32.8 12.5 45.3 0l160-160c12.5-12.5 12.5-32.8 0-45.3s-32.8-12.5-45.3 0L224 306.7 86.6 169.4c-12.5-12.5-32.8-12.5-45.3 0s-12.5 32.8 0 45.3l160 160z"/></svg>`