migrate to use OH version

Merge branch 'main' into eval/visualcodebench
Log restart reason if runtime reports it (#6455 )
2026-04-29 03:00:45 -04:00 · 2025-01-26 15:24:35 -05:00 · 2025-01-26 15:14:28 -05:00 · 2025-01-25 07:20:18 +01:00 · 2025-01-24 19:03:00 +00:00 · 2025-01-24 18:43:02 +00:00
148 changed files with 4635 additions and 1234 deletions
@@ -160,7 +160,6 @@ jobs:
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml
-
      - name: Run integration test evaluation for DelegatorAgent (DeepSeek)
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
@@ -174,12 +173,42 @@ jobs:
          cat $REPORT_FILE_DELEGATOR_DEEPSEEK >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
+      # -------------------------------------------------------------
+      # Run VisualBrowsingAgent tests for DeepSeek, limited to t05 and t06
+      - name: Wait a little bit (again)
+        run: sleep 5
+
+      - name: Configure config.toml for testing VisualBrowsingAgent (DeepSeek)
+        env:
+          LLM_MODEL: "litellm_proxy/deepseek-chat"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          MAX_ITERATIONS: 15
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+      - name: Run integration test evaluation for VisualBrowsingAgent (DeepSeek)
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD VisualBrowsingAgent '' 15 $N_PROCESSES "t05_simple_browsing,t06_github_pr_browsing.py" 'visualbrowsing_deepseek_run'
+
+          # Find and export the visual browsing agent test results
+          REPORT_FILE_VISUALBROWSING_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/VisualBrowsingAgent/deepseek*_maxiter_15_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE_VISUALBROWSING_DEEPSEEK: $REPORT_FILE_VISUALBROWSING_DEEPSEEK"
+          echo "INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE_VISUALBROWSING_DEEPSEEK >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV

      - name: Create archive of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          cd evaluation/evaluation_outputs/outputs  # Change to the outputs directory
-          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/DelegatorAgent/*  # Only include the actual result directories
+          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/DelegatorAgent/* integration_tests/VisualBrowsingAgent/* # Only include the actual result directories

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
@@ -227,4 +256,7 @@ jobs:
              **Integration Tests Report Delegator (DeepSeek)**
              ${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK }}
              ---
+              **Integration Tests Report VisualBrowsing (DeepSeek)**
+              ${{ env.INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK }}
+              ---
              Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
@@ -84,6 +84,10 @@ jobs:
        run: |
          python -m pip index versions openhands-ai > openhands_versions.txt
          OPENHANDS_VERSION=$(head -n 1 openhands_versions.txt | awk '{print $2}' | tr -d '()')
+          # Ensure requirements.txt ends with newline before appending
+          if [ -f requirements.txt ] && [ -s requirements.txt ]; then
+            sed -i -e '$a\' requirements.txt
+          fi
          echo "openhands-ai==${OPENHANDS_VERSION}" >> requirements.txt
          cat requirements.txt

@@ -176,6 +176,7 @@ evaluation/gorilla/data
 evaluation/toolqa/data
 evaluation/scienceagentbench/benchmark
 evaluation/commit0_bench/repos
+evaluation/visualcodebench/

 # openhands resolver
 output/
@@ -100,7 +100,7 @@ poetry run pytest ./tests/unit/test_*.py
 To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by
 setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.

-Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.20-nikolaik`
+Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.21-nikolaik`

 ## Develop inside Docker container

@@ -39,21 +39,21 @@ Learn more at [docs.all-hands.dev](https://docs.all-hands.dev), or jump to the [
 ## ⚡ Quick Start

 The easiest way to run OpenHands is in Docker.
-See the [Installation](https://docs.all-hands.dev/modules/usage/installation) guide for
+See the [Running OpenHands](https://docs.all-hands.dev/modules/usage/installation) guide for
 system requirements and more information.

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands-state:/.openhands-state \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.20
+    docker.all-hands.dev/all-hands-ai/openhands:0.21
 ```

 You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
@@ -69,7 +69,7 @@ run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules
 interact with it via a [friendly CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode),
 or run it on tagged issues with [a github action](https://docs.all-hands.dev/modules/usage/how-to/github-action).

-Visit [Installation](https://docs.all-hands.dev/modules/usage/installation) for more information and setup instructions.
+Visit [Running OpenHands](https://docs.all-hands.dev/modules/usage/installation) for more information and setup instructions.

 > [!CAUTION]
 > OpenHands is meant to be run by a single user on their local workstation.
@@ -39,6 +39,11 @@ workspace_base = "./workspace"
 # If it's a folder, the session id will be used as the file name
 #save_trajectory_path="./trajectories"

+# Path to a trajectory file to replay (must be a file, not a folder)
+# If provided, the trajectory will be loaded and replayed before the
+# agent responds to any user instruction
+#replay_trajectory_path = ""
+
 # File store path
 #file_store_path = "/tmp/file_store"

@@ -70,7 +75,7 @@ workspace_base = "./workspace"
 #run_as_openhands = true

 # Runtime environment
-#runtime = "eventstream"
+#runtime = "docker"

 # Name of the default agent
 #default_agent = "CodeActAgent"
@@ -11,7 +11,7 @@ services:
      - BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
      - SANDBOX_API_HOSTNAME=host.docker.internal
      #
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.20-nikolaik}
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.21-nikolaik}
      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
    ports:
@@ -7,7 +7,7 @@ services:
    image: openhands:latest
    container_name: openhands-app-${DATE:-}
    environment:
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik}
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik}
      #- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} # enable this only if you want a specific non-root sandbox user but you will have to manually adjust permissions of openhands-state for this user
      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
    ports:
@@ -373,7 +373,7 @@ Les options de configuration de l'agent sont définies dans les sections `[agent
  - Description : Si l'éditeur LLM est activé dans l'espace d'action (fonctionne uniquement avec l'appel de fonction)

 **Utilisation du micro-agent**
- `use_microagents`
+- `enable_prompt_extensions`
  - Type : `bool`
  - Valeur par défaut : `true`
  - Description : Indique si l'utilisation des micro-agents est activée ou non
@@ -52,7 +52,7 @@ LLM_API_KEY="sk_test_12345"
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -61,7 +61,7 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.20 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.21 \
    python -m openhands.core.cli
 ```

@@ -46,7 +46,7 @@ LLM_API_KEY="sk_test_12345"
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -56,6 +56,6 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.20 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.21 \
    python -m openhands.core.main -t "write a bash script that prints hi" --no-auto-continue
 ```
@@ -13,16 +13,16 @@
 La façon la plus simple d'exécuter OpenHands est avec Docker.

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.20
+    docker.all-hands.dev/all-hands-ai/openhands:0.21
 ```

 Vous pouvez également exécuter OpenHands en mode [headless scriptable](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), en tant que [CLI interactive](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), ou en utilisant l'[Action GitHub OpenHands](https://docs.all-hands.dev/modules/usage/how-to/github-action).
@@ -13,7 +13,7 @@ C'est le Runtime par défaut qui est utilisé lorsque vous démarrez OpenHands.

 ```
 docker run # ...
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -v /var/run/docker.sock:/var/run/docker.sock \
    # ...
 ```
@@ -50,7 +50,7 @@ LLM_API_KEY="sk_test_12345"
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -59,7 +59,7 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.20 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.21 \
    python -m openhands.core.cli
 ```

@@ -47,7 +47,7 @@ LLM_API_KEY="sk_test_12345"
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -57,6 +57,6 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.20 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.21 \
    python -m openhands.core.main -t "write a bash script that prints hi" --no-auto-continue
 ```
@@ -11,16 +11,16 @@
 在 Docker 中运行 OpenHands 是最简单的方式。

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.20
+    docker.all-hands.dev/all-hands-ai/openhands:0.21
 ```

 你也可以在可脚本化的[无头模式](https://docs.all-hands.dev/modules/usage/how-to/headless-mode)下运行 OpenHands，作为[交互式 CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode)，或使用 [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action)。
@@ -11,7 +11,7 @@

 ```
 docker run # ...
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -v /var/run/docker.sock:/var/run/docker.sock \
    # ...
 ```
@@ -55,6 +55,11 @@ The core configuration options are defined in the `[core]` section of the `confi
  - Default: `"./trajectories"`
  - Description: Path to store trajectories (can be a folder or a file). If it's a folder, the trajectories will be saved in a file named with the session id name and .json extension, in that folder.

+- `replay_trajectory_path`
+  - Type: `str`
+  - Default: `""`
+  - Description: Path to a trajectory file to load and replay. If given, it must point to a trajectory file in JSON format. The actions in the trajectory file are replayed before any user instruction is executed.
+
 ### File Store
 - `file_store_path`
  - Type: `str`
@@ -1,6 +1,6 @@
 # Getting Started with OpenHands

-So you've [installed OpenHands](./installation) and have
+So you've [run OpenHands](./installation) and have
 [set up your LLM](./installation#setup). Now what?

 OpenHands can help you tackle a wide variety of engineering tasks. But the technology
@@ -35,7 +35,7 @@ To run OpenHands in CLI mode with Docker:
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -45,7 +45,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.20 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.21 \
    python -m openhands.core.cli
 ```

@@ -32,7 +32,7 @@ To run OpenHands in Headless mode with Docker:
 ```bash
 docker run -it \
    --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -e LLM_API_KEY=$LLM_API_KEY \
@@ -43,7 +43,7 @@ docker run -it \
    -v ~/.openhands-state:/.openhands-state \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    docker.all-hands.dev/all-hands-ai/openhands:0.20 \
+    docker.all-hands.dev/all-hands-ai/openhands:0.21 \
    python -m openhands.core.main -t "write a bash script that prints hi"
 ```

@@ -1,27 +1,66 @@
-# Installation
+# Running OpenHands

 ## System Requirements

- Docker version 26.0.0+ or Docker Desktop 4.31.0+.
- You must be using Linux or Mac OS.
-  - If you are on Windows, you must use [WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
+- MacOS with [Docker Desktop support](https://docs.docker.com/desktop/setup/install/mac-install/#system-requirements)
+- Linux
+- Windows with [WSL](https://learn.microsoft.com/en-us/windows/wsl/install) and [Docker Desktop support](https://docs.docker.com/desktop/setup/install/windows-install/#system-requirements)

-## Start the app
+## Prerequisites
+
+<details>
+  <summary>MacOS</summary>
+  ### Docker Desktop
+
+  1. [Install Docker Desktop on Mac](https://docs.docker.com/desktop/setup/install/mac-install).
+  2. Open Docker Desktop, go to `Settings > Advanced` and ensure `Allow the default Docker socket to be used` is enabled.
+</details>
+
+<details>
+  <summary>Linux</summary>
+
+  :::note
+  Tested with Ubuntu 22.04.
+  :::
+
+  ### Docker Desktop
+
+  1. [Install Docker Desktop on Linux](https://docs.docker.com/desktop/setup/install/linux/).
+
+</details>
+
+<details>
+  <summary>Windows</summary>
+  ### WSL
+
+  1. [Install WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
+  2. Run `wsl --version` in PowerShell and confirm `Default Version: 2`.
+
+  ### Docker Desktop
+
+  1. [Install Docker Desktop on Windows](https://docs.docker.com/desktop/setup/install/windows-install).
+  2. Open Docker Desktop, go to `Settings` and confirm the following:
+  - General: `Use the WSL 2 based engine` is enabled.
+  - Resources > WSL Integration: `Enable integration with my default WSL distro` is enabled.
+
+</details>
+
+## Start the App

 The easiest way to run OpenHands is in Docker.

 ```bash
-docker pull docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik

 docker run -it --rm --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -e LOG_ALL_EVENTS=true \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v ~/.openhands-state:/.openhands-state \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app \
-    docker.all-hands.dev/all-hands-ai/openhands:0.20
+    docker.all-hands.dev/all-hands-ai/openhands:0.21
 ```

 You'll find OpenHands running at http://localhost:3000!
@@ -16,7 +16,7 @@ some flags being passed to `docker run` that make this possible:

 ```
 docker run # ...
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.21-nikolaik \
    -v /var/run/docker.sock:/var/run/docker.sock \
    # ...
 ```
@@ -5,7 +5,7 @@ const sidebars: SidebarsConfig = {
  docsSidebar: [
    {
      type: 'doc',
-      label: 'Installation',
+      label: 'Running OpenHands',
      id: 'usage/installation',
    },
    {
@@ -8,6 +8,9 @@ Please follow instruction [here](../../README.md#setup) to setup your local deve

 ## Test if your environment works

+Follow the instructions at https://miniwob.farama.org/content/getting_started/ and https://miniwob.farama.org/content/viewing/
+to set up the MiniWoB server in your local environment at http://localhost:8080/miniwob/
+
 Access with browser the above MiniWoB URLs and see if they load correctly.

 ## Run Evaluation
@@ -71,7 +71,7 @@ def process_git_patch(patch):
    return patch


-def get_config(instance: pd.Series) -> AppConfig:
+def get_config(metadata: EvalMetadata, instance: pd.Series) -> AppConfig:
    # We use a different instance image for each instance of swe-bench eval
    base_container_image = get_instance_docker_image(instance['instance_id'])
    logger.info(
@@ -132,7 +132,7 @@ def process_instance(
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

-    config = get_config(instance)
+    config = get_config(metadata, instance)
    instance_id = instance.instance_id
    model_patch = instance['model_patch']
    test_spec: TestSpec = instance['test_spec']
@@ -158,6 +158,7 @@ def get_config(
        codeact_enable_browsing=RUN_WITH_BROWSING,
        codeact_enable_llm_editor=False,
        condenser=metadata.condenser_config,
+        enable_prompt_extensions=False,
    )
    config.set_agent_config(agent_config)
    return config
@@ -0,0 +1,674 @@
+from collections import Counter
+from copy import deepcopy
+from difflib import SequenceMatcher
+from io import BytesIO
+
+from bs4 import BeautifulSoup, Comment, NavigableString, Tag
+import cv2
+import numpy as np
+import torch
+from colormath.color_conversions import convert_color
+from colormath.color_diff import delta_e_cie2000
+from colormath.color_objects import LabColor, sRGBColor
+from PIL import Image, ImageChops, ImageColor
+from scipy.optimize import linear_sum_assignment
+from transformers import CLIPModel, CLIPProcessor
+
+from openhands.core.logger import openhands_logger as logger
+
+
+def calculate_similarity(block1, block2):
+    """Calculate text similarity between two blocks using SequenceMatcher."""
+    text_similarity = SequenceMatcher(None, block1['text'], block2['text']).ratio()
+    return text_similarity
+
+
+def adjust_cost_for_context(cost_matrix, consecutive_bonus=1.0, window_size=20):
+    """Adjust cost matrix by considering context similarity."""
+    if window_size <= 0:
+        return cost_matrix
+
+    n, m = cost_matrix.shape
+    adjusted_cost_matrix = np.copy(cost_matrix)
+
+    for i in range(n):
+        for j in range(m):
+            if adjusted_cost_matrix[i][j] >= -0.5:
+                continue
+            nearby_matrix = cost_matrix[
+                max(0, i - window_size) : min(n, i + window_size + 1),
+                max(0, j - window_size) : min(m, j + window_size + 1),
+            ]
+            flattened_array = nearby_matrix.flatten()
+            sorted_array = np.sort(flattened_array)[::-1]
+            sorted_array = np.delete(
+                sorted_array, np.where(sorted_array == cost_matrix[i, j])[0][0]
+            )
+            top_k_elements = sorted_array[-window_size * 2 :]
+            bonus = consecutive_bonus * np.sum(top_k_elements)
+            adjusted_cost_matrix[i][j] += bonus
+    return adjusted_cost_matrix
+
+
+def create_cost_matrix(A, B):
+    """Create cost matrix for block matching."""
+    n = len(A)
+    m = len(B)
+    cost_matrix = np.zeros((n, m))
+    for i in range(n):
+        for j in range(m):
+            cost_matrix[i, j] = -calculate_similarity(A[i], B[j])
+    return cost_matrix
+
+
+def calculate_distance_max_1d(x1, y1, x2, y2):
+    """Calculate maximum 1D distance between points."""
+    return max(abs(x2 - x1), abs(y2 - y1))
+
+
+def calculate_ratio(h1, h2):
+    """Calculate ratio between two heights."""
+    return max(h1, h2) / min(h1, h2)
+
+
+def rgb_to_lab(rgb):
+    """Convert RGB color to Lab color space."""
+    rgb_color = sRGBColor(rgb[0], rgb[1], rgb[2], is_upscaled=True)
+    lab_color = convert_color(rgb_color, LabColor)
+    return lab_color
+
+
+def color_similarity_ciede2000(rgb1, rgb2):
+    """Calculate color similarity using CIEDE2000 formula."""
+    lab1 = rgb_to_lab(rgb1)
+    lab2 = rgb_to_lab(rgb2)
+    delta_e = delta_e_cie2000(lab1, lab2)
+    similarity = max(0, 1 - (delta_e / 100))
+    return similarity
+
+
+def merge_blocks_wo_check(block1, block2):
+    """Merge two blocks without additional checks."""
+    merged_text = block1['text'] + ' ' + block2['text']
+    x_min = min(block1['bbox'][0], block2['bbox'][0])
+    y_min = min(block1['bbox'][1], block2['bbox'][1])
+    x_max = max(
+        block1['bbox'][0] + block1['bbox'][2], block2['bbox'][0] + block2['bbox'][2]
+    )
+    y_max = max(
+        block1['bbox'][1] + block1['bbox'][3], block2['bbox'][1] + block2['bbox'][3]
+    )
+    merged_bbox = (x_min, y_min, x_max - x_min, y_max - y_min)
+    merged_color = tuple(
+        (color1 + color2) // 2
+        for color1, color2 in zip(block1['color'], block2['color'])
+    )
+    return {'text': merged_text, 'bbox': merged_bbox, 'color': merged_color}
+
+
+def find_maximum_matching(A, B, consecutive_bonus, window_size):
+    """Find maximum matching between two sets of blocks."""
+    cost_matrix = create_cost_matrix(A, B)
+    cost_matrix = adjust_cost_for_context(cost_matrix, consecutive_bonus, window_size)
+    row_ind, col_ind = linear_sum_assignment(cost_matrix)
+    current_cost = cost_matrix[row_ind, col_ind].tolist()
+    return list(zip(row_ind, col_ind)), current_cost, cost_matrix
+
+
+def remove_indices(lst, indices):
+    """Remove indices from list in reverse order."""
+    for index in sorted(indices, reverse=True):
+        if index < len(lst):
+            lst.pop(index)
+    return lst
+
+
+def merge_blocks_by_list(blocks, merge_list):
+    """Merge blocks according to merge list."""
+    pop_list = []
+    while merge_list:
+        i = merge_list[0][0]
+        j = merge_list[0][1]
+        blocks[i] = merge_blocks_wo_check(blocks[i], blocks[j])
+        pop_list.append(j)
+        merge_list.pop(0)
+        if merge_list:
+            new_merge_list = []
+            for k in range(len(merge_list)):
+                if (
+                    merge_list[k][0] != i
+                    and merge_list[k][1] != i
+                    and merge_list[k][0] != j
+                    and merge_list[k][1] != j
+                ):
+                    new_merge_list.append(merge_list[k])
+            merge_list = new_merge_list
+    remove_indices(blocks, pop_list)
+    return blocks
+
+
+def difference_of_means(list1, list2):
+    """Calculate difference of means between two lists."""
+    counter1 = Counter(list1)
+    counter2 = Counter(list2)
+
+    for element in set(list1) & set(list2):
+        common_count = min(counter1[element], counter2[element])
+        counter1[element] -= common_count
+        counter2[element] -= common_count
+
+    unique_list1 = [item for item in counter1.elements()]
+    unique_list2 = [item for item in counter2.elements()]
+
+    mean_list1 = sum(unique_list1) / len(unique_list1) if unique_list1 else 0
+    mean_list2 = sum(unique_list2) / len(unique_list2) if unique_list2 else 0
+
+    if mean_list1 - mean_list2 > 0:
+        if min(unique_list1) > min(unique_list2):
+            return mean_list1 - mean_list2
+        return 0.0
+    return mean_list1 - mean_list2
+
+
+def find_possible_merge(A, B, consecutive_bonus, window_size, debug=False):
+    """Repeatedly merge adjacent blocks in A and in B while that improves the matching.
+
+    Each pass tries, for A and then for B, merging every adjacent pair
+    ``(i, i + 1)`` on a deep copy, re-solves the matching, and keeps the
+    candidates whose ``difference_of_means`` gain exceeds 0.05. The kept
+    candidates are applied best-first through ``merge_blocks_by_list``. The
+    loop stops after a pass that changes neither list.
+
+    Args:
+        A: First list of blocks. Merges are applied through
+            ``merge_blocks_by_list``, which modifies the list it is given.
+        B: Second list of blocks; handled the same way as ``A``.
+        consecutive_bonus: Context bonus used only for the final matching.
+        window_size: Context window used only for the final matching.
+        debug: Not referenced in the function body.
+
+    Returns:
+        ``(A, B, matching)``: the two merged block lists and the final list of
+        ``(index_in_A, index_in_B)`` pairs.
+    """
+    # The merge search itself runs with no context bonus and a window of 1.
+    merge_bonus = 0.0
+    merge_windows = 1
+
+    # Sort key: the gain stored as the third item of each merge candidate.
+    def sortFn(value):
+        return value[2]
+
+    while True:
+        A_changed = False
+        B_changed = False
+
+        # Baseline matching for this pass.
+        matching, current_cost, cost_matrix = find_maximum_matching(
+            A, B, merge_bonus, merge_windows
+        )
+
+        if len(A) >= 2:
+            merge_list = []
+            for i in range(len(A) - 1):
+                # Trial-merge blocks i and i + 1 on a copy so A stays untouched.
+                new_A = deepcopy(A)
+                new_A[i] = merge_blocks_wo_check(new_A[i], new_A[i + 1])
+                new_A.pop(i + 1)
+                updated_matching, updated_cost, _ = find_maximum_matching(
+                    new_A, B, merge_bonus, merge_windows
+                )
+                diff = difference_of_means(current_cost, updated_cost)
+                # Keep only merges that lower the cost by a clear margin.
+                if diff > 0.05:
+                    merge_list.append([i, i + 1, diff])
+
+            # Apply the most beneficial merges first.
+            merge_list.sort(key=sortFn, reverse=True)
+            if merge_list:
+                A_changed = True
+                A = merge_blocks_by_list(A, merge_list)
+                # Refresh the baseline so B's candidates are judged against the merged A.
+                matching, current_cost, cost_matrix = find_maximum_matching(
+                    A, B, merge_bonus, merge_windows
+                )
+
+        if len(B) >= 2:
+            merge_list = []
+            for i in range(len(B) - 1):
+                # Same trial-merge procedure, now on adjacent blocks of B.
+                new_B = deepcopy(B)
+                new_B[i] = merge_blocks_wo_check(new_B[i], new_B[i + 1])
+                new_B.pop(i + 1)
+                updated_matching, updated_cost, _ = find_maximum_matching(
+                    A, new_B, merge_bonus, merge_windows
+                )
+                diff = difference_of_means(current_cost, updated_cost)
+                if diff > 0.05:
+                    merge_list.append([i, i + 1, diff])
+
+            merge_list.sort(key=sortFn, reverse=True)
+            if merge_list:
+                B_changed = True
+                B = merge_blocks_by_list(B, merge_list)
+                matching, current_cost, cost_matrix = find_maximum_matching(
+                    A, B, merge_bonus, merge_windows
+                )
+
+        # Stop once a full pass produced no merge on either side.
+        if not A_changed and not B_changed:
+            break
+
+    # Final matching uses the caller-supplied context parameters.
+    matching, _, _ = find_maximum_matching(A, B, consecutive_bonus, window_size)
+    return A, B, matching
+
+
+def merge_blocks_by_bbox(blocks):
+    """Merge blocks with same bounding box."""
+    merged_blocks = {}
+    for block in blocks:
+        bbox = tuple(block['bbox'])
+        if bbox in merged_blocks:
+            existing_block = merged_blocks[bbox]
+            existing_block['text'] += ' ' + block['text']
+            existing_block['color'] = [
+                (ec + c) / 2 for ec, c in zip(existing_block['color'], block['color'])
+            ]
+        else:
+            merged_blocks[bbox] = block
+    return list(merged_blocks.values())
+
+
+def mask_bounding_boxes_with_inpainting(image, bounding_boxes):
+    """Mask bounding boxes in image using inpainting."""
+    image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+    mask = np.zeros(image_cv.shape[:2], dtype=np.uint8)
+    height, width = image_cv.shape[:2]
+
+    for bbox in bounding_boxes:
+        x_ratio, y_ratio, w_ratio, h_ratio = bbox
+        x = int(x_ratio * width)
+        y = int(y_ratio * height)
+        w = int(w_ratio * width)
+        h = int(h_ratio * height)
+        mask[y : y + h, x : x + w] = 255
+
+    inpainted_image = cv2.inpaint(image_cv, mask, 3, cv2.INPAINT_TELEA)
+    return Image.fromarray(cv2.cvtColor(inpainted_image, cv2.COLOR_BGR2RGB))
+
+
+def rescale_and_mask(image, blocks):
+    """Rescale image and mask blocks."""
+    if blocks:
+        image = mask_bounding_boxes_with_inpainting(image, blocks)
+
+    width, height = image.size
+    if width < height:
+        new_size = (width, width)
+    else:
+        new_size = (height, height)
+
+    return image.resize(new_size, Image.LANCZOS)
+
+
+def calculate_clip_similarity(image1, image2, blocks1, blocks2):
+    """Return the CLIP cosine similarity of two images after masking blocks.
+
+    ``blocks1`` / ``blocks2`` are lists of dicts whose 'bbox' entries are
+    inpainted out of ``image1`` / ``image2`` respectively before embedding.
+    The CLIP model and processor are loaded on every call.
+    """
+    checkpoint = 'openai/clip-vit-base-patch32'
+    model = CLIPModel.from_pretrained(checkpoint)
+    processor = CLIPProcessor.from_pretrained(checkpoint)
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    model = model.to(device)
+
+    # Mask and preprocess both images as one batch
+    masked_pair = [
+        rescale_and_mask(image1, [block['bbox'] for block in blocks1]),
+        rescale_and_mask(image2, [block['bbox'] for block in blocks2]),
+    ]
+    inputs = processor(images=masked_pair, return_tensors='pt', padding=True)
+    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
+
+    with torch.no_grad():
+        features = model.get_image_features(**inputs)
+        first, second = features[0:1], features[1:2]
+        # Unit-normalise so the dot product below is the cosine similarity.
+        first = first / first.norm(dim=-1, keepdim=True)
+        second = second / second.norm(dim=-1, keepdim=True)
+        similarity = (first @ second.T).item()
+
+    return similarity
+
+
+def rgb_to_hex(rgb):
+    """Format the first three channels of ``rgb`` as an uppercase 'RRGGBB' string."""
+    template = '{:02X}' * 3
+    return template.format(*rgb)
+
+
+class ColorPool:
+    """Finite pool of distinct hex colors that are handed out one at a time.
+
+    The pool holds 16**3 colors built from the channel levels 10, 26, ..., 250,
+    each shifted by ``offset`` modulo 256, stored as 'RRGGBB' strings.
+    """
+
+    def __init__(self, offset=0):
+        shifted_levels = [(level + offset) % 256 for level in range(10, 251, 16)]
+        self.color_pool = [
+            rgb_to_hex((red, green, blue))
+            for red in shifted_levels
+            for green in shifted_levels
+            for blue in shifted_levels
+        ]
+
+    def pop_color(self):
+        """Remove and return the last color; raise NotImplementedError once empty."""
+        if not self.color_pool:
+            raise NotImplementedError
+        return self.color_pool.pop()
+
+
+def process_html_str(html_str, offset=0):
+    """Recolor an HTML document so each text element has a unique color.
+
+    Every element gets a fully transparent background. Every tag listed in
+    ``text_tags`` then receives its own color popped from ``ColorPool(offset)``
+    and ``opacity: 1.0``. All overrides are written as ``!important`` inline
+    styles.
+
+    Args:
+        html_str: HTML source to process.
+        offset: Channel shift forwarded to ``ColorPool``.
+
+    Returns:
+        The modified document serialized back to a string.
+
+    Raises:
+        NotImplementedError: Propagated from ``ColorPool.pop_color`` when the
+            document has more text tags than the pool has colors.
+    """
+    soup = BeautifulSoup(html_str, 'html.parser')
+
+    def update_style(element, property_name, value):
+        """Replace (or add) one inline-style declaration on ``element``."""
+        kept = []
+        for declaration in element.attrs.get('style', '').split(';'):
+            declaration = declaration.strip()
+            if not declaration:
+                continue
+            # Compare the full property name: a prefix test would also drop
+            # unrelated declarations such as `color-scheme` when setting `color`.
+            name = declaration.partition(':')[0].strip().lower()
+            if name == property_name:
+                continue
+            kept.append(declaration)
+        kept.append(f"{property_name}: {value} !important")
+        element['style'] = '; '.join(kept).strip()
+
+    # Set background color of all elements to transparent white
+    for element in soup.find_all(True):
+        update_style(element, 'background-color', 'rgba(255, 255, 255, 0.0)')
+
+    color_pool = ColorPool(offset)
+    text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span', 'a', 'b', 'li',
+                 'table', 'td', 'th', 'button', 'footer', 'header', 'figcaption']
+
+    for tag in soup.find_all(text_tags):
+        color = f"#{color_pool.pop_color()}"
+        update_style(tag, 'color', color)
+        update_style(tag, 'opacity', '1.0')
+
+    return str(soup)
+
+
+def similar(n1, n2):
+    """Return True when the two numbers differ by at most 8."""
+    return -8 <= n1 - n2 <= 8
+
+
+def find_different_pixels(image1, image2):
+    """Find pixels where image2 shows image1's color shifted by 50 per channel.
+
+    A pixel is selected when, for each of R, G and B,
+    ``(channel1 + 50) % 256`` is within 8 of the corresponding channel in
+    ``image2``. The shift of 50 matches the second color offset used by
+    ``get_blocks_from_html``; the tolerance of 8 matches ``similar``.
+
+    Args:
+        image1: PIL image.
+        image2: PIL image of the same size.
+
+    Returns:
+        An ``(N, 2)`` integer array of ``(y, x)`` coordinates ordered by x and
+        then y, or ``None`` if the sizes differ or no pixel qualifies.
+    """
+    if image1.size != image2.size:
+        logger.warning("Images are not the same size")
+        return None
+
+    # int16 keeps the additions/subtractions below from wrapping around as
+    # they would in uint8.
+    pixels1 = np.asarray(image1.convert('RGB'), dtype=np.int16)
+    pixels2 = np.asarray(image2.convert('RGB'), dtype=np.int16)
+
+    shifted = (pixels1 + 50) % 256
+    matches = (np.abs(shifted - pixels2) <= 8).all(axis=-1)
+
+    # argwhere on the transposed mask yields (x, y) rows in x-major order;
+    # reversing the columns gives (y, x). The copy makes the array contiguous,
+    # which the structured `.view` in the block extraction step requires.
+    coords = np.ascontiguousarray(np.argwhere(matches.T)[:, ::-1])
+    return coords if coords.size else None
+
+
+def extract_text_with_color(html_str):
+    """Extract the text of an HTML body together with its inline text color.
+
+    Args:
+        html_str: HTML source to parse.
+
+    Returns:
+        A nested list mirroring the tag tree of ``<body>`` whose leaves are
+        ``(text, color)`` tuples, or ``[]`` when the document has no body.
+        Text without an inline color inherits its parent's color, starting
+        from '#000000'.
+    """
+    def get_color(tag):
+        """Return the tag's inline text color as a '#...' string, or None."""
+        if 'style' not in tag.attrs:
+            return None
+        styles = tag['style'].split(';')
+        color_style = [s for s in styles if 'color' in s and 'background-color' not in s]
+        if not color_style:
+            return None
+
+        # Use the last matching declaration.
+        parts = color_style[-1].split(':')
+        if len(parts) < 2:
+            # Declaration without a value (e.g. "color"): nothing to parse.
+            return None
+        color = parts[1].strip().replace(" !important", "")
+        if not color:
+            return None
+        if color.startswith('#'):
+            return color
+        try:
+            if color.startswith('rgb'):
+                color = tuple(map(int, color[4:-1].split(',')))
+            else:
+                color = ImageColor.getrgb(color)
+            return '#{:02x}{:02x}{:02x}'.format(*color)
+        except (ValueError, IndexError):
+            # IndexError covers tuples with fewer than three channels.
+            logger.warning(f"Unable to identify or convert color: {color}")
+            return None
+
+    def extract_text_recursive(element, parent_color='#000000'):
+        """Walk ``element`` and collect (text, color) leaves; comments are skipped."""
+        if isinstance(element, Comment):
+            return None
+        elif isinstance(element, NavigableString):
+            text = element.strip()
+            return (text, parent_color) if text else None
+        elif isinstance(element, Tag):
+            current_color = get_color(element) or parent_color
+            children_texts = filter(None, [extract_text_recursive(child, current_color)
+                                        for child in element.children])
+            return list(children_texts)
+        return None
+
+    soup = BeautifulSoup(html_str, 'html.parser')
+    body = soup.body
+    return extract_text_recursive(body) if body else []
+
+
+def flatten_tree(tree):
+    """Return the non-list leaves of a nested list, depth-first, left to right.
+
+    A non-list ``tree`` is treated as a single leaf.
+    """
+    def walk(node):
+        if not isinstance(node, list):
+            yield node
+            return
+        for child in node:
+            yield from walk(child)
+
+    return list(walk(tree))
+
+
+def get_blocks_from_image_diff_pixels(image, html_text_color_tree, different_pixels):
+    """Locate text blocks in an image by their (unique) text color.
+
+    For every ``(text, hex_color)`` item, pixels within ±4 per channel of the
+    color are intersected with ``different_pixels``; the bounding box and mean
+    color of the surviving pixels form one block.
+
+    Args:
+        image: Image convertible with ``np.array`` (treated as RGB).
+        html_text_color_tree: Iterable of ``(text, '#rrggbb')`` items. Items
+            whose color cannot be parsed are skipped.
+        different_pixels: ``(N, 2)`` array of ``(row, col)`` coordinates.
+
+    Returns:
+        List of dicts with 'text' (lowercased), 'bbox' as
+        ``(left, top, width, height)`` ratios of the image size, and 'color'
+        as an RGB int tuple.
+    """
+    image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+    # NOTE: in this function "x" is the row axis (height) and "y" the column
+    # axis (width), matching the (row, col) order returned by np.where.
+    x_w = image_cv.shape[0]
+    y_w = image_cv.shape[1]
+
+    def hex_to_bgr(hex_color):
+        hex_color = hex_color.lstrip('#')
+        rgb = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
+        return rgb[::-1]
+
+    def get_intersect(arr1, arr2):
+        # View each row as one structured element so intersect1d compares rows.
+        arr1_reshaped = arr1.view([('', arr1.dtype)] * arr1.shape[1])
+        arr2_reshaped = arr2.view([('', arr2.dtype)] * arr2.shape[1])
+        common_rows = np.intersect1d(arr1_reshaped, arr2_reshaped)
+        return common_rows.view(arr1.dtype).reshape(-1, arr1.shape[1])
+
+    blocks = []
+    for item in html_text_color_tree:
+        try:
+            color = np.array(hex_to_bgr(item[1]), dtype=np.int16)
+        except Exception:
+            # Best effort: skip items whose color is missing or unparsable.
+            continue
+
+        # Apply the tolerance in int16 and clamp to the channel range; doing
+        # this in uint8 wraps around (0 - 4 -> 252), yielding an empty range.
+        lower = np.clip(color - 4, 0, 255).astype(np.uint8)
+        upper = np.clip(color + 4, 0, 255).astype(np.uint8)
+        mask = cv2.inRange(image_cv, lower, upper)
+        coords = np.column_stack(np.where(mask > 0))
+        coords = get_intersect(coords, different_pixels)
+
+        if coords.size == 0:
+            continue
+
+        x_min, y_min = np.min(coords, axis=0)
+        x_max, y_max = np.max(coords, axis=0)
+
+        # Get average color of the matched pixels from the original image
+        colors = image_cv[coords[:, 0], coords[:, 1]]
+        avg_color = tuple(map(int, np.mean(colors, axis=0)))[::-1]  # Convert BGR to RGB
+
+        blocks.append({
+            'text': item[0].lower(),
+            'bbox': (y_min / y_w, x_min / x_w, (y_max - y_min + 1) / y_w, (x_max - x_min + 1) / x_w),
+            'color': avg_color
+        })
+
+    return blocks
+
+
+def get_blocks_from_html(html_str, image1):
+    """Extract text blocks for a page from its HTML and a screenshot.
+
+    The HTML is recolored twice (offsets 0 and 50). Rendering the second
+    variant is still a TODO, so a solid (255, 0, 0) image of the same size
+    currently stands in for it when looking for shifted pixels.
+
+    Returns:
+        A list of block dicts, or ``[]`` when no shifted pixels are found or
+        block extraction raises.
+    """
+    # Recolored variants of the document for the two offsets
+    recolored_base = process_html_str(html_str, offset=0)
+    html_str_2 = process_html_str(html_str, offset=50)
+
+    # TODO: Screenshot html_str_2
+    filter_color = (255, 0, 0)
+    image2 = Image.new("RGB", image1.size, filter_color)
+
+    # Pixels whose color differs by the expected shift between the two images
+    shifted_pixels = find_different_pixels(image1, image2)
+    if shifted_pixels is None:
+        logger.warning("Unable to get pixels with different colors")
+        return []
+
+    # Text/color pairs from the offset-0 document
+    text_colors = flatten_tree(extract_text_with_color(recolored_base))
+    try:
+        return get_blocks_from_image_diff_pixels(image1, text_colors, shifted_pixels)
+    except Exception as e:
+        logger.warning(f"Unable to get blocks: {e}")
+        return []
+
+
+def evaluate(task, generated_img):
+    """Decide whether a generated screenshot matches the reference screenshot.
+
+    Text blocks are extracted for both pages, merged and matched. The final
+    score is the equal-weight mean of size, text, position, color and CLIP
+    scores. When either page yields no blocks, the decision falls back to
+    CLIP similarity or an exact pixel match; when no block pair is similar
+    enough, it falls back to CLIP similarity alone.
+
+    Args:
+        task: Dict with 'post_image' (reference image), 'post_html'
+            (reference HTML) and 'gen_html' (generated HTML).
+        generated_img: Screenshot of the generated page.
+
+    Returns:
+        True when the generated page is considered a match.
+    """
+    # Load reference image
+    post_image = task['post_image']
+
+    # Extract blocks from HTML and images
+    post_blocks = get_blocks_from_html(task['post_html'], post_image)
+    gen_blocks = get_blocks_from_html(task['gen_html'], generated_img)
+
+    logger.debug(f'block details {post_blocks} {gen_blocks}')
+    if not post_blocks or not gen_blocks:
+        # Fallback to basic CLIP and pixel comparison if no blocks available
+        clip_score = calculate_clip_similarity(post_image, generated_img, [], [])
+        logger.info(f'CLIP similarity score: {clip_score}')
+
+        # Pixel comparison
+        diff = ImageChops.difference(generated_img, post_image)
+        pixel_match = not diff.getbbox()
+        logger.info(
+            f"Pixel difference analysis: {'No difference' if pixel_match else 'Differences found'}"
+        )
+
+        return clip_score > 0.95 or pixel_match
+
+    # Merge blocks with same bounding boxes
+    post_blocks = merge_blocks_by_bbox(post_blocks)
+    gen_blocks = merge_blocks_by_bbox(gen_blocks)
+
+    # Find optimal block matching
+    consecutive_bonus, window_size = 0.1, 1
+    gen_blocks_m, post_blocks_m, matching = find_possible_merge(
+        gen_blocks, deepcopy(post_blocks), consecutive_bonus, window_size
+    )
+
+    # Filter matches with low similarity
+    filtered_matching = []
+    for i, j in matching:
+        text_similarity = calculate_similarity(gen_blocks_m[i], post_blocks_m[j])
+        if text_similarity >= 0.5:
+            filtered_matching.append([i, j, text_similarity])
+    matching = filtered_matching
+
+    if not matching:
+        logger.warning('No matching blocks found')
+        # Each block list must be paired with the image it was extracted
+        # from, otherwise the wrong regions are masked before embedding.
+        clip_score = calculate_clip_similarity(
+            post_image, generated_img, post_blocks, gen_blocks
+        )
+        return clip_score > 0.95
+
+    # Indices of matched blocks (sets for O(1) membership tests)
+    indices1 = {item[0] for item in matching}
+    indices2 = {item[1] for item in matching}
+
+    # Calculate unmatched areas
+    unmatched_area_1 = sum(
+        block['bbox'][2] * block['bbox'][3]
+        for i, block in enumerate(gen_blocks_m)
+        if i not in indices1
+    )
+    unmatched_area_2 = sum(
+        block['bbox'][2] * block['bbox'][3]
+        for j, block in enumerate(post_blocks_m)
+        if j not in indices2
+    )
+    total_unmatched_area = unmatched_area_1 + unmatched_area_2
+
+    # Calculate metrics for matched blocks
+    matched_areas = []
+    text_scores = []
+    position_scores = []
+    color_scores = []
+
+    for i, j, text_similarity in matching:
+        gen_block = gen_blocks_m[i]
+        post_block = post_blocks_m[j]
+
+        # Area
+        block_area = (
+            gen_block['bbox'][2] * gen_block['bbox'][3]
+            + post_block['bbox'][2] * post_block['bbox'][3]
+        )
+        matched_areas.append(block_area)
+
+        # Position similarity (computed from the block centers)
+        position_similarity = 1 - calculate_distance_max_1d(
+            gen_block['bbox'][0] + gen_block['bbox'][2] / 2,
+            gen_block['bbox'][1] + gen_block['bbox'][3] / 2,
+            post_block['bbox'][0] + post_block['bbox'][2] / 2,
+            post_block['bbox'][1] + post_block['bbox'][3] / 2,
+        )
+
+        # Color similarity
+        color_similarity = color_similarity_ciede2000(
+            gen_block['color'], post_block['color']
+        )
+
+        text_scores.append(text_similarity)
+        position_scores.append(position_similarity)
+        color_scores.append(color_similarity)
+
+    # Calculate final scores
+    total_area = sum(matched_areas) + total_unmatched_area
+    size_score = sum(matched_areas) / total_area if total_area > 0 else 0
+    text_score = np.mean(text_scores) if text_scores else 0
+    position_score = np.mean(position_scores) if position_scores else 0
+    color_score = np.mean(color_scores) if color_scores else 0
+    clip_score = calculate_clip_similarity(
+        post_image, generated_img, post_blocks, gen_blocks
+    )
+
+    # Combine scores with equal weights
+    final_score = 0.2 * (
+        size_score + text_score + position_score + color_score + clip_score
+    )
+
+    logger.info('Evaluation scores:')
+    logger.info(f'- Size score: {size_score:.3f}')
+    logger.info(f'- Text score: {text_score:.3f}')
+    logger.info(f'- Position score: {position_score:.3f}')
+    logger.info(f'- Color score: {color_score:.3f}')
+    logger.info(f'- CLIP score: {clip_score:.3f}')
+    logger.info(f'- Final score: {final_score:.3f}')
+
+    return final_score > 0.8  # Consider it a match if final score > 80%
+
+
+def png_to_bytes(png):
+    """Serialize an image to PNG and return the encoded bytes."""
+    with BytesIO() as stream:
+        png.save(stream, format='PNG')
+        return stream.getvalue()
+
+
+def bytes_to_image(image_bytes):
+    """Open encoded image bytes as a Pillow Image object."""
+    stream = BytesIO(image_bytes)
+    return Image.open(stream)
+
+
+if __name__ == '__main__':
+    # Manual smoke test: score sample 1's "prev" page against its "post" page.
+    first_image = Image.open('./evaluation/visualcodebench/data/1/post.png')
+    image = Image.open('./evaluation/visualcodebench/data/1/prev.png')
+
+    # Context managers close the files even if reading fails.
+    with open('./evaluation/visualcodebench/data/1/post/index.html', 'r') as html_file:
+        first_html = html_file.read()
+
+    with open('./evaluation/visualcodebench/data/1/prev/index.html', 'r') as html_file:
+        gen_html = html_file.read()
+
+    sample = {'post_image': first_image, "post_html": first_html, "gen_html": gen_html}
+
+    evaluate(sample, image)
+
@@ -0,0 +1,97 @@
+import base64
+import os
+from io import BytesIO
+
+import pandas as pd
+from huggingface_hub import snapshot_download
+from PIL import PngImagePlugin
+from tqdm import tqdm
+
+from openhands.core.logger import openhands_logger as logger
+
+REPO_DOWNLOAD_DIR = (
+    './evaluation/visualcodebench/'  # Directory to store the downloaded repository
+)
+
+
+def download_repository():
+    """
+    Download a full snapshot of the VisualCodeBench dataset repository from
+    the Hugging Face Hub into REPO_DOWNLOAD_DIR.
+
+    Any failure is logged and then re-raised.
+    """
+    repo_id = 'rvmalhot/VisualCodeBench'
+    download_kwargs = {
+        'repo_id': repo_id,
+        'local_dir': REPO_DOWNLOAD_DIR,
+        'repo_type': 'dataset',
+        'ignore_patterns': None,  # Download all files
+    }
+    try:
+        logger.info(f"Downloading repository '{repo_id}'...")
+        snapshot_download(**download_kwargs)
+        logger.info(f"Repository downloaded to '{REPO_DOWNLOAD_DIR}'.")
+    except Exception as err:
+        logger.error(f"Error downloading repository '{repo_id}': {err}")
+        raise err
+
+
+def format_task_dict(example):
+    """Convert one dataset row into a task dict.
+
+    The task carries a 'skip' flag that is True unless both the 'prev' and
+    'post' directories of the instance exist under REPO_DOWNLOAD_DIR.
+    """
+    instance_id = example['id']
+    snapshot_dirs = [
+        os.path.join(REPO_DOWNLOAD_DIR, f'data/{instance_id}/{name}')
+        for name in ('prev', 'post')
+    ]
+    # Skip instances whose 'prev' or 'post' snapshot directory is missing
+    skip = not all(os.path.exists(path) for path in snapshot_dirs)
+
+    task = {'instance_id': instance_id}
+    for key in (
+        'prev_image',
+        'post_image',
+        'changes',
+        'prev_code_files',
+        'post_code_files',
+    ):
+        task[key] = example[key]
+    task['skip'] = skip
+
+    return task
+
+
+def prepare_visualcodebench(dataset):
+    """Build a DataFrame of tasks from the dataset's 'train' split.
+
+    Rows flagged 'skip' by ``format_task_dict`` are dropped, and the 'skip'
+    key is removed from the rows that are kept.
+    """
+    logger.info('Processing dataset')
+    rows = []
+    for example in tqdm(dataset['train']):
+        task = format_task_dict(example)
+        if not task.pop('skip'):
+            rows.append(task)
+
+    return pd.DataFrame(rows)
+
+
+def pil_image_to_base64(image: PngImagePlugin.PngImageFile) -> list[str]:
+    """
+    Converts a PIL PNG image to a Base64 data URL.
+
+    Parameters:
+    - image (PngImagePlugin.PngImageFile): The PIL image to convert.
+
+    Returns:
+    - list[str]: A one-element list holding the image encoded as a
+      'data:image/png;base64,...' URL.
+
+    Raises:
+    - ValueError: If `image` is not a PngImagePlugin.PngImageFile instance.
+    """
+    if not isinstance(image, PngImagePlugin.PngImageFile):
+        raise ValueError(
+            'The provided image is not a PIL.PngImagePlugin.PngImageFile instance.'
+        )
+
+    buffered = BytesIO()
+    image.save(buffered, format='PNG')
+    img_bytes = buffered.getvalue()
+    img_base64 = base64.b64encode(img_bytes).decode('utf-8')
+    base64_with_prefix = f'data:image/png;base64,{img_base64}'
+    # Wrapped in a list so the result can be passed directly as `image_urls`.
+    return [base64_with_prefix]
@@ -0,0 +1,247 @@
+# FILE: run_infer.py
+
+import asyncio
+import os
+import shutil
+import tempfile
+from functools import partial
+
+import pandas as pd
+from datasets import load_dataset
+
+# from evaluation.benchmarks.visualcodebench.eval import capture_screenshot
+from evaluation.benchmarks.visualcodebench.prepare import (
+    REPO_DOWNLOAD_DIR,
+    download_repository,
+    pil_image_to_base64,
+    prepare_visualcodebench,
+)
+from evaluation.utils.shared import (
+    EvalMetadata,
+    assert_and_raise,
+    codeact_user_response,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+)
+from openhands.core.config.utils import parse_arguments
+from openhands.core.logger import openhands_logger as logger  # Import OpenHands logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action.commands import CmdRunAction
+from openhands.events.action.message import MessageAction
+from openhands.events.observation.commands import CmdOutputObservation
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+
+# Define workspace and output directories
+WORKSPACE_DIR = './workspace'
+
+FAKE_RESPONSES = {
+    'CodeActAgent': partial(codeact_user_response, encapsulate_solution=True),
+}
+
+
+def get_config(
+    metadata: EvalMetadata,
+) -> AppConfig:
+    """Build the AppConfig for one evaluation run from its metadata.
+
+    The agent class, iteration limit and LLM config come from ``metadata``;
+    the sandbox uses the 'python:3.12-bookworm' base image and no workspace
+    is mounted.
+    """
+    sandbox_config = SandboxConfig(
+        base_container_image='python:3.12-bookworm',
+        enable_auto_lint=True,
+        use_host_network=False,
+    )
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=sandbox_config,
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)
+    return config
+
+
+def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # provides 'instance_id', used as the workspace dir name
+):
+    """Initialize the runtime for the agent.
+
+    This function is called before the runtime is used to run the agent. It
+    creates /workspace/<instance_id>, copies the instance's prev/index.html
+    into it and changes into that directory. A non-zero exit code of either
+    shell command is reported through ``assert_and_raise``.
+    """
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Initialization Fn')
+    logger.info('-' * 30)
+    workspace_dir_name = instance['instance_id']
+    obs: CmdOutputObservation
+
+    # The f-prefix is required: without it a literal '{workspace_dir_name}'
+    # directory is created instead of the per-instance one.
+    action = CmdRunAction(command=f'mkdir -p /workspace/{workspace_dir_name}')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        obs.exit_code == 0,
+        f'Failed to create /workspace/{workspace_dir_name}: {str(obs)}',
+    )
+
+    file_path = REPO_DOWNLOAD_DIR + f'data/{workspace_dir_name}/prev/index.html'
+    runtime.copy_to(file_path, f'/workspace/{workspace_dir_name}')
+    logger.info(f'Copied code file for instance {workspace_dir_name}')
+
+    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        obs.exit_code == 0,
+        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
+    )
+
+    logger.info('-' * 30)
+    logger.info('END Runtime Initialization Fn')
+    logger.info('-' * 30)
+
+
+def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
+) -> str:
+    """Post-run hook (work in progress).
+
+    Currently this only copies the instance's reference 'post' folder into a
+    temporary directory, which is discarded on exit; nothing is returned yet.
+    """
+    # TODO: extract edited HTML file from agent workspace
+    # temp_zip = runtime.copy_from(f'/workspace/{instance.instance_id}')
+    # file_name = f'/workspace/{instance.instance_id}/index.html'
+    # with zipfile.ZipFile(temp_zip, 'r') as zip_ref:
+    #     if file_name in zip_ref.namelist():
+    #         with zip_ref.open(file_name) as file:
+    #             file_content = file.read().decode('utf-8')  # Decode bytes to string
+    #     else:
+    #         raise FileNotFoundError(f"'{file_name}' not found in the ZIP archive.")
+
+    reference_dir = f'{REPO_DOWNLOAD_DIR}data/{instance.instance_id}/post/'
+    with tempfile.TemporaryDirectory() as scratch_dir:
+        shutil.copytree(reference_dir, scratch_dir, dirs_exist_ok=True)
+
+        # image = capture_screenshot(tmpdir)
+        # if image is not None:
+        #     shutil.copy(os.path.join(tmpdir, 'final_screenshot.png'), REPO_DOWNLOAD_DIR)
+
+
+def process_instance(
+    instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True
+):
+    """Run the agent on a single VisualCodeBench instance.
+
+    Builds the instruction from the instance's 'changes', sends it to the
+    agent together with the 'prev_image' screenshot inside a freshly created
+    runtime, and calls ``complete_runtime`` afterwards. The runtime is always
+    closed. No evaluation output is returned yet (TODO).
+
+    Raises:
+        ValueError: If the controller returns no state.
+    """
+    config = get_config(metadata)
+
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if not reset_logger:
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
+    else:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
+
+    # =============================================
+    # build instruction
+    # =============================================
+    task_prompt = (
+        f"Modify the HTML/CSS according to the following instruction:\n\n"
+        f"{instance['changes']}\n\n"
+    )
+    autonomy_notice = (
+        'IMPORTANT: You should ONLY interact with the environment provided '
+        'to you AND NEVER ASK FOR HUMAN HELP.\n'
+    )
+    instruction = task_prompt + autonomy_notice
+
+    # =============================================
+    # create sandbox and run the agent
+    # =============================================
+    runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+
+    try:
+        initialize_runtime(runtime, instance=instance)
+
+        action = MessageAction(
+            content=instruction,
+            image_urls=pil_image_to_base64(instance['prev_image']),
+        )
+        controller_run = run_controller(
+            config=config,
+            initial_user_action=action,
+            runtime=runtime,
+            fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
+        )
+        state: State | None = asyncio.run(controller_run)
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        # =============================================
+        # result evaluation
+        # =============================================
+        return_val = complete_runtime(runtime, instance)
+        logger.info(f'Return value {return_val}')
+    finally:
+        runtime.close()
+
+    # TODO: return EVAL output
+
+
+def main():
+    """Entry point: download the dataset, prepare the tasks and run the evaluation."""
+    args = parse_arguments()
+
+    banner = f"\n{'='*80}\nStarting VisualCodeBench Evaluation\n{'='*80}"
+    logger.info(banner)
+    logger.info(f'Agent: {args.agent_cls}')
+    logger.info(f'Model: {args.llm_config}')
+    logger.info(f'Max iterations: {args.max_iterations}')
+    logger.info(f'Eval limit: {args.eval_n_limit}')
+    logger.info(f'Num workers: {args.eval_num_workers}\n')
+    logger.info(f'Eval output: {args.eval_output_dir}\n')
+
+    # Step 1: fetch the dataset repository (images, HTML snapshots)
+    logger.info('Downloading repository...')
+    download_repository()
+
+    # Step 2: load the dataset from the downloaded directory
+    logger.info('Loading dataset...')
+    raw_dataset = load_dataset(REPO_DOWNLOAD_DIR)
+
+    # Step 3: resolve the LLM config and build the run metadata
+    llm_config = get_llm_config_arg(args.llm_config)
+    if llm_config is None:
+        logger.error(f'Could not find LLM config: {args.llm_config}')
+        raise ValueError(f'Could not find LLM config: {args.llm_config}')
+
+    metadata = make_metadata(
+        llm_config,
+        'VisualCodeBench',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        'evaluation/output/',
+    )
+
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+    tasks = prepare_visualcodebench(raw_dataset)
+    instances = prepare_dataset(tasks, output_file, eval_n_limit=args.eval_n_limit)
+
+    # Step 4: run the evaluation over all prepared instances
+    run_evaluation(
+        instances, metadata, output_file, args.eval_num_workers, process_instance
+    )
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Run the VisualCodeBench inference/evaluation script with the given LLM
+# config, commit, agent class, instance limit and (optional) worker count.
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
+# Check if required arguments are provided
+if [ "$#" -lt 4 ]; then
+    echo "Usage: $0 [model_config] [commit_hash] [agent_cls] [eval_limit] [num_workers]"
+    echo "Example: $0 llm.eval_gpt_4o_mini HEAD CodeActAgent 5 1"
+    exit 1
+fi
+
+MODEL_CONFIG=$1
+COMMIT_HASH=$2
+# Stored as AGENT because the rest of the script reads $AGENT; assigning the
+# argument to a different variable silently forced the default agent.
+AGENT=$3
+EVAL_LIMIT=$4
+NUM_WORKERS=${5:-1}  # Default to 1 worker if not specified
+
+# Checkout the specified commit
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+get_openhands_version
+
+echo "AGENT: $AGENT"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+COMMAND="export PYTHONPATH=evaluation/benchmarks/visualcodebench:\$PYTHONPATH && poetry run python evaluation/benchmarks/visualcodebench/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 5 \
+  --eval-num-workers $NUM_WORKERS \
+  --eval-note $OPENHANDS_VERSION"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval "$COMMAND"
@@ -0,0 +1,167 @@
+import http
+import os
+import socket
+import socketserver
+import threading
+import time
+from io import BytesIO
+
+import requests
+from PIL import Image, ImageChops
+from playwright.sync_api import sync_playwright
+
+from openhands.core.logger import openhands_logger as logger
+
+
+def get_free_port():
+    """Ask the OS for an unused TCP port number and return it.
+
+    The probing socket is closed before returning, so the port is only
+    known to have been free at the time of the call.
+    """
+    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        probe.bind(('', 0))
+        _host, port = probe.getsockname()
+        return port
+    finally:
+        probe.close()
+
+
+def start_http_server(tmpdir):
+    """Create (but do not start) an HTTP server that serves files from ``tmpdir``.
+
+    Returns:
+        ``(server, port)``: the bound ``socketserver.TCPServer`` and the port
+        it listens on. The caller is responsible for running ``serve_forever``
+        and for shutting the server down.
+    """
+    # `import http` alone does not load the `http.server` submodule, so import
+    # it explicitly instead of relying on another module having done so.
+    import http.server
+
+    port = get_free_port()
+
+    class CustomHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
+        def translate_path(self, path):
+            # Serve files from the specified directory instead of the current working directory
+            path = super().translate_path(path)
+            relative_path = os.path.relpath(path, os.getcwd())
+            return os.path.join(tmpdir, relative_path)
+
+    handler = CustomHTTPRequestHandler
+    server = socketserver.TCPServer(('', port), handler)
+    return server, port
+
+
+def capture_screenshot(tmpdir):
+    """Serve ``tmpdir`` over HTTP, screenshot its index page and return the image.
+
+    The screenshot is written to 'final_screenshot.png' inside ``tmpdir`` and
+    fully loaded into memory before the server is shut down.
+
+    Raises:
+        RuntimeError: If the local server does not answer.
+    """
+    server, port = start_http_server(tmpdir)
+    server_thread = threading.Thread(target=server.serve_forever, daemon=True)
+    server_thread.start()
+    time.sleep(10)
+
+    image = None
+    server_url = f'http://localhost:{port}/'
+    screenshot_path = os.path.join(tmpdir, 'final_screenshot.png')
+    try:
+        if not is_server_reachable(server_url):
+            raise RuntimeError(f'Server not reachable at {server_url}')
+
+        capture_screenshot_playwright(server_url, screenshot_path)
+        image = Image.open(screenshot_path)
+        # Force the pixel data to be read now, while the file still exists.
+        image.load()
+    finally:
+        # Shut down the server and clean up
+        server.shutdown()
+        server.server_close()
+
+    return image
+
+
+def is_server_reachable(url):
+    """
+    Check if the local server is reachable.
+
+    Returns True only for an HTTP 200 response within 5 seconds. Any other
+    status code, or any request failure (connection error, timeout, ...),
+    is logged and reported as False.
+    """
+    try:
+        response = requests.get(url, timeout=5)  # Set a 5-second timeout
+    except requests.RequestException as e:
+        # RequestException also covers read timeouts, which ConnectionError
+        # alone would let escape from this boolean check.
+        logger.error(f'Failed to connect to server at {url}: {e}')
+        return False
+
+    if response.status_code == 200:
+        logger.info(f'Server is reachable at {url}')
+        return True
+
+    logger.warning(
+        f'Server responded with status code {response.status_code} at {url}'
+    )
+    return False
+
+
+def capture_screenshot_playwright(url, screenshot_path):
+    """Capture a full-page screenshot of ``url`` with Playwright (Chromium).
+
+    Navigation and network-idle timeouts are logged and tolerated; the
+    screenshot is still attempted afterwards.
+
+    Returns:
+        True when the screenshot was saved, False if any other error occurred.
+    """
+    page_timeout_ms = 60 * 1000  # 60 seconds, used for navigation and load state
+    try:
+        with sync_playwright() as p:
+            logger.info('Launching browser...')
+            browser = p.chromium.launch(timeout=10000)  # 10 seconds for browser launch
+
+            logger.info('Creating a new page...')
+            page = browser.new_page()
+
+            logger.info(f'Navigating to URL: {url}')
+            try:
+                page.goto(url, timeout=page_timeout_ms)
+                logger.info('Page navigation completed.')
+            except Exception as e:
+                logger.warning(f'Page navigation timed out. {e}. Continuing...')
+
+            logger.info('Waiting for network to be idle...')
+            try:
+                page.wait_for_load_state('networkidle', timeout=page_timeout_ms)
+                logger.info('Page load state reached.')
+            except Exception as e:
+                logger.warning(f'Page load state timed out. {e}. Continuing...')
+
+            logger.info('Capturing screenshot...')
+            # Capture the whole page, not just the viewport
+            page.screenshot(path=screenshot_path, full_page=True)
+
+            logger.info(f'Screenshot saved to {screenshot_path}')
+            browser.close()
+            return True
+    except Exception as e:
+        logger.error(f'Error capturing screenshot with Playwright: {e}')
+        return False
+
+
+def evaluate(task, screenshot_path):
+    """Compare generated screenshot with post_image using CLIP score.
+
+    Args:
+        task: Dict whose 'post_image' entry holds the encoded reference image
+            as bytes.
+        screenshot_path: Path of the generated screenshot.
+
+    Returns:
+        True when the CLIP cosine similarity exceeds 0.95. If the CLIP path
+        raises, True only when the two images are pixel-identical; False if
+        that fallback fails as well.
+    """
+    try:
+        import torch
+        from transformers import CLIPModel, CLIPProcessor
+
+        # Load CLIP model and processor
+        model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
+        processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
+
+        # Load images
+        post_image = Image.open(BytesIO(task['post_image']))
+        generated_img = Image.open(screenshot_path)
+
+        # Process images
+        inputs = processor(
+            images=[post_image, generated_img], return_tensors='pt', padding=True
+        )
+
+        # Inference only: skip autograd bookkeeping for the forward pass.
+        with torch.no_grad():
+            # Get image features
+            image_features = model.get_image_features(**inputs)
+
+            # Calculate cosine similarity
+            similarity = torch.nn.functional.cosine_similarity(
+                image_features[0].unsqueeze(0), image_features[1].unsqueeze(0)
+            ).item()
+
+        logger.info(f'CLIP similarity score: {similarity}')
+
+        return similarity > 0.95  # Consider it a match if similarity > 95%
+    except Exception as e:
+        logger.error(f'Error in CLIP evaluation: {e}')
+        # Fallback to pixel comparison if CLIP fails
+        try:
+            post_image = Image.open(BytesIO(task['post_image']))
+            generated_img = Image.open(screenshot_path)
+
+            # Compare images directly without converting to bytes
+            diff = ImageChops.difference(generated_img, post_image)
+            logger.info(
+                f"Pixel difference analysis: {'No difference' if not diff.getbbox() else 'Differences found'}"
+            )
+            # getbbox() is falsy when the difference image is entirely zero.
+            return not diff.getbbox()
+        except Exception as ex:
+            logger.error(f'Error in fallback evaluation: {ex}')
+            return False
@@ -0,0 +1,50 @@
+# VisualWebArena Evaluation with OpenHands Browsing Agents
+
+This folder contains evaluation for [VisualWebArena](https://github.com/web-arena-x/visualwebarena) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym) for easy evaluation of how well an agent capable of browsing can perform on realistic web browsing tasks.
+
+## Setup Environment and LLM Configuration
+
+Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.
+
+## Setup VisualWebArena Environment
+
+VisualWebArena requires you to set up websites containing pre-populated content that is accessible via URL to the machine running the OpenHands agents.
+Follow [this document](https://github.com/web-arena-x/visualwebarena/blob/main/environment_docker/README.md) to set up your own VisualWebArena environment through local servers or AWS EC2 instances.
+Take note of the base URL (`$VISUALWEBARENA_BASE_URL`) of the machine where the environment is installed.
+
+## Test if your environment works
+
+Open the VisualWebArena website URLs in a browser (`$VISUALWEBARENA_BASE_URL` on ports 9980, 7770, 7780, 9999, 8023, 8888 and 4399) and check that they load correctly.
+If you cannot access a website, make sure the firewall on your server allows public access to these ports.
+Check the network security policy if you are using an AWS machine.
+Follow the VisualWebArena environment setup guide carefully, and make sure the URL fields are populated with the correct base URL of your server.
+
+## Run Evaluation
+
+```bash
+export VISUALWEBARENA_BASE_URL=<YOUR_SERVER_URL_HERE>
+export OPENAI_API_KEY="yourkey" # this OpenAI API key is required for some visualWebArena validators that utilize LLMs
+export OPENAI_BASE_URL="https://api.openai.com/v1/" # base URL for OpenAI model used for VisualWebArena evaluation
+bash evaluation/benchmarks/visualwebarena/scripts/run_infer.sh llm.claude HEAD VisualBrowsingAgent
+```
+
+Results will be in `evaluation/evaluation_outputs/outputs/visualwebarena/`
+
+To calculate the success rate, run:
+
+```sh
+poetry run python evaluation/benchmarks/visualwebarena/get_success_rate.py evaluation/evaluation_outputs/outputs/visualwebarena/SOME_AGENT/EXP_NAME/output.jsonl
+```
+
+## Submit your evaluation results
+
+You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
+
+## VisualBrowsingAgent V1.0 result
+
+Tested on VisualBrowsingAgent V1.0
+
+VisualWebArena, 910 tasks (high cost, single run due to fixed task), max step 15. Resolve rates are:
+
+- GPT4o: 26.15%
+- Claude-3.5 Sonnet: 25.27%
@@ -0,0 +1,40 @@
+import argparse
+import json
+
+import browsergym.visualwebarena  # noqa F401 register visualwebarena tasks as gym environments
+import gymnasium as gym
+
+# Command-line interface: a single positional argument pointing at the
+# output.jsonl file produced by run_infer.py.
+parser = argparse.ArgumentParser(description='Calculate average reward.')
+parser.add_argument('output_path', type=str, help='path to output.jsonl')
+
+if __name__ == '__main__':
+    # Parse inside the main guard so importing this module never touches argv.
+    args = parser.parse_args()
+
+    # Every registered VisualWebArena gym environment counts as one task.
+    env_ids = [
+        env_id
+        for env_id in gym.envs.registry.keys()
+        if env_id.startswith('browsergym/visualwebarena')
+    ]
+    total_num = len(env_ids)
+    print('Total number of tasks: ', total_num)
+    total_reward = 0
+    total_cost = 0
+    actual_num = 0
+    with open(args.output_path, 'r') as f:
+        for line in f:
+            if not line.strip():
+                # Tolerate blank lines (e.g. a trailing newline) in the jsonl.
+                continue
+            data = json.loads(line)
+            # Cost is accumulated for every recorded instance, even failed ones.
+            total_cost += data['metrics']['accumulated_cost']
+            reward = data['test_result']['reward']
+            # A negative reward marks an instance whose evaluation failed;
+            # it contributes neither reward nor to the finished-task count.
+            if reward >= 0:
+                actual_num += 1
+                total_reward += reward
+
+    # The success rate is relative to ALL registered tasks, not only the
+    # finished ones, so unfinished or failed tasks count as unsuccessful.
+    avg_reward = total_reward / total_num if total_num else 0.0
+    print('Total reward: ', total_reward)
+    print('Success Rate: ', avg_reward)
+
+    # Guard against an output file without any successfully evaluated instance.
+    avg_cost = total_cost / actual_num if actual_num else 0.0
+    print('Avg Cost: ', avg_cost)
+    print('Total Cost: ', total_cost)
+    print('Actual number of tasks finished: ', actual_num)
@@ -0,0 +1,254 @@
+import asyncio
+import json
+import os
+from typing import Any
+
+import browsergym.visualwebarena  # noqa F401 register visualwebarena tasks as gym environments
+import gymnasium as gym
+import pandas as pd
+
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    compatibility_for_eval_history_pairs,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+    update_llm_config_for_completions_logging,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    parse_arguments,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import (
+    BrowseInteractiveAction,
+    CmdRunAction,
+    MessageAction,
+)
+from openhands.events.observation import CmdOutputObservation
+from openhands.runtime.base import Runtime
+from openhands.runtime.browser.browser_env import (
+    BROWSER_EVAL_GET_GOAL_ACTION,
+    BROWSER_EVAL_GET_REWARDS_ACTION,
+)
+from openhands.utils.async_utils import call_async_from_sync
+
+# Agent classes this benchmark is meant to be run with.
+# NOTE(review): not referenced anywhere else in this file, so the agent class
+# passed on the command line is not actually validated against it.
+SUPPORTED_AGENT_CLS = {'VisualBrowsingAgent'}
+# Canned "fake user" reply per agent class. Despite the `_FN` suffix the values
+# are plain strings, not callables.
+# NOTE(review): also unreferenced in this file - confirm whether it should be
+# wired into run_controller or removed.
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'VisualBrowsingAgent': 'Continue the task. IMPORTANT: do not talk to the user until you have finished the task',
+}
+
+
+def get_config(
+    metadata: EvalMetadata,
+    env_id: str,
+) -> AppConfig:
+    """Build the AppConfig used to run a single VisualWebArena task.
+
+    Reads VISUALWEBARENA_BASE_URL, OPENAI_API_KEY and OPENAI_BASE_URL from the
+    process environment (all three are mandatory) and forwards them, together
+    with the per-site VWA_* URLs derived from the base URL, to the sandbox as
+    startup environment variables.
+
+    Args:
+        metadata: Evaluation metadata (agent class, iteration limit, LLM
+            config, output directory).
+        env_id: BrowserGym environment id of the task to run.
+
+    Returns:
+        The fully populated AppConfig with its LLM config set up for
+        completions logging.
+
+    Raises:
+        AssertionError: If one of the required environment variables is unset.
+    """
+    vwa_url = os.environ.get('VISUALWEBARENA_BASE_URL', None)
+    api_key = os.environ.get('OPENAI_API_KEY', None)
+    api_base = os.environ.get('OPENAI_BASE_URL', None)
+    assert vwa_url is not None, 'VISUALWEBARENA_BASE_URL must be set'
+    assert api_key is not None, 'OPENAI_API_KEY must be set'
+    assert api_base is not None, 'OPENAI_BASE_URL must be set'
+
+    # Environment handed to the sandbox at startup: credentials plus one URL
+    # per VisualWebArena site, each served on its own port of the base URL.
+    startup_env = {
+        'BASE_URL': vwa_url,
+        'OPENAI_API_KEY': api_key,
+        'OPENAI_BASE_URL': api_base,
+        'VWA_CLASSIFIEDS': f'{vwa_url}:9980',
+        'VWA_CLASSIFIEDS_RESET_TOKEN': '4b61655535e7ed388f0d40a93600254c',
+        'VWA_SHOPPING': f'{vwa_url}:7770',
+        'VWA_SHOPPING_ADMIN': f'{vwa_url}:7780/admin',
+        'VWA_REDDIT': f'{vwa_url}:9999',
+        'VWA_GITLAB': f'{vwa_url}:8023',
+        'VWA_WIKIPEDIA': f'{vwa_url}:8888',
+        'VWA_HOMEPAGE': f'{vwa_url}:4399',
+    }
+    sandbox_config = SandboxConfig(
+        base_container_image='python:3.12-bookworm',
+        enable_auto_lint=True,
+        use_host_network=False,
+        browsergym_eval_env=env_id,
+        runtime_startup_env_vars=startup_env,
+        timeout=300,
+    )
+
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        runtime='docker',
+        max_iterations=metadata.max_iterations,
+        sandbox=sandbox_config,
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+        attach_to_existing=True,
+    )
+
+    # Route the LLM completions log of this task into the eval output dir.
+    llm_config = update_llm_config_for_completions_logging(
+        metadata.llm_config,
+        metadata.eval_output_dir,
+        env_id,
+    )
+    config.set_llm_config(llm_config)
+    return config
+
+
+def initialize_runtime(
+    runtime: Runtime,
+) -> tuple[str, list]:
+    """Prepare the sandbox and fetch the task goal before the agent starts.
+
+    Creates /workspace inside the runtime, then asks the browser environment
+    for the goal of the current task.
+
+    Args:
+        runtime: Connected runtime in which the task will be executed.
+
+    Returns:
+        A ``(goal, goal_image_urls)`` tuple: the content of the goal
+        observation and the image URLs attached to it (an empty list when the
+        observation carries no ``goal_image_urls`` attribute).
+
+    Raises:
+        AssertionError: If creating /workspace exits with a non-zero code.
+    """
+    banner = '-' * 50
+    logger.info(f'{banner} BEGIN Runtime Initialization Fn {banner}')
+
+    # Make sure the workspace directory exists inside the sandbox.
+    mkdir_action = CmdRunAction(command='mkdir -p /workspace')
+    logger.info(mkdir_action, extra={'msg_type': 'ACTION'})
+    mkdir_obs = runtime.run_action(mkdir_action)
+    assert mkdir_obs.exit_code == 0
+
+    # Query the browser environment for the goal of the current task.
+    goal_action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
+    logger.info(goal_action, extra={'msg_type': 'ACTION'})
+    goal_obs = runtime.run_action(goal_action)
+    logger.info(goal_obs, extra={'msg_type': 'OBSERVATION'})
+
+    goal = goal_obs.content
+    # Not every observation exposes goal images; default to none.
+    goal_image_urls = getattr(goal_obs, 'goal_image_urls', [])
+
+    logger.info(f'{banner} END Runtime Initialization Fn {banner}')
+    return goal, goal_image_urls
+
+
+def complete_runtime(
+    runtime: Runtime,
+) -> dict[str, Any]:
+    """Collect the task rewards from the browser environment.
+
+    Meant to be called once the agent has finished running, while the runtime
+    is still open. Extend this function if more work must happen inside the
+    sandbox to compute the correctness metric.
+
+    Args:
+        runtime: The runtime the agent was executed in.
+
+    Returns:
+        A dict with a single ``'rewards'`` key holding the JSON-decoded
+        content of the rewards observation.
+    """
+    banner = '-' * 50
+    logger.info(f'{banner} BEGIN Runtime Completion Fn {banner}')
+
+    # Ask the browser environment to report the rewards for this task.
+    rewards_action = BrowseInteractiveAction(
+        browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION
+    )
+    logger.info(rewards_action, extra={'msg_type': 'ACTION'})
+    rewards_obs = runtime.run_action(rewards_action)
+    logger.info(rewards_obs, extra={'msg_type': 'OBSERVATION'})
+
+    logger.info(f'{banner} END Runtime Completion Fn {banner}')
+    # The observation content is a JSON document; decode it for the caller.
+    return {'rewards': json.loads(rewards_obs.content)}
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+) -> EvalOutput:
+    """Run the agent on one VisualWebArena task and score the outcome.
+
+    Args:
+        instance: Dataset row; ``instance.instance_id`` is the BrowserGym
+            environment id of the task.
+        metadata: Evaluation metadata shared by all instances.
+        reset_logger: When True, redirect logging to a per-instance file under
+            ``<eval_output_dir>/infer_logs`` (needed for multiprocessing).
+
+    Returns:
+        The EvalOutput for this instance. ``test_result['reward']`` is the
+        maximum reported reward, or -1.0 when reward evaluation failed.
+
+    Raises:
+        ValueError: If the controller returns no final state.
+    """
+    env_id = instance.instance_id
+
+    config = get_config(metadata, env_id)
+
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, env_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {env_id}.')
+
+    runtime = create_runtime(config)
+    # Everything after runtime creation is wrapped in try/finally so that the
+    # runtime is released even when connecting, running or scoring fails.
+    try:
+        call_async_from_sync(runtime.connect)
+        task_str, goal_image_urls = initialize_runtime(runtime)
+        initial_user_action = MessageAction(
+            content=task_str, image_urls=goal_image_urls
+        )
+        state: State | None = asyncio.run(
+            run_controller(
+                config=config,
+                initial_user_action=initial_user_action,
+                runtime=runtime,
+            )
+        )
+        # ======= Attempt to evaluate the agent's environment impact =======
+
+        # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        metrics = state.metrics.get() if state.metrics else None
+
+        # Instruction obtained from the first message from the USER
+        instruction = ''
+        for event in state.history:
+            if isinstance(event, MessageAction):
+                instruction = event.content
+                break
+
+        try:
+            return_val = complete_runtime(runtime)
+            logger.info(f'Return value from complete_runtime: {return_val}')
+            reward = max(return_val['rewards'])
+        except Exception as e:
+            # Best effort: keep the instance in the output, but leave a trace
+            # of why the reward could not be computed.
+            logger.error(f'Failed to evaluate rewards for instance {env_id}: {e}')
+            reward = -1.0  # kept -1 to identify instances for which evaluation failed.
+
+        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+        # for compatibility with the existing output format, we can remake the pairs here
+        # remove when it becomes unnecessary
+        histories = compatibility_for_eval_history_pairs(state.history)
+
+        # Save the output
+        return EvalOutput(
+            instance_id=env_id,
+            instruction=instruction,
+            metadata=metadata,
+            history=histories,
+            metrics=metrics,
+            error=state.last_error or None,
+            test_result={
+                'reward': reward,
+            },
+        )
+    finally:
+        runtime.close()
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+
+    # One evaluation instance per registered VisualWebArena gym environment.
+    vwa_env_ids = [
+        env_name
+        for env_name in gym.envs.registry.keys()
+        if env_name.startswith('browsergym/visualwebarena')
+    ]
+    dataset = pd.DataFrame({'instance_id': vwa_env_ids})
+
+    # Resolve the LLM config named on the command line; it is mandatory.
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else None
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
+
+    metadata = make_metadata(
+        llm_config,
+        'visualwebarena',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+    )
+
+    # Results are written to output.jsonl inside the evaluation output directory.
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+    instances = prepare_dataset(dataset, output_file, args.eval_n_limit)
+
+    run_evaluation(
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+    )
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Run the VisualWebArena evaluation.
+#
+# Usage: run_infer.sh MODEL_CONFIG COMMIT_HASH [AGENT] [EVAL_LIMIT] [NUM_WORKERS]
+#   MODEL_CONFIG  name of the LLM config group to use (e.g. llm.claude)
+#   COMMIT_HASH   version to evaluate; consumed by checkout_eval_branch
+#   AGENT         agent class (default: VisualBrowsingAgent)
+#   EVAL_LIMIT    optional cap on the number of instances to evaluate
+#   NUM_WORKERS   number of parallel workers (default: 1)
+set -eo pipefail
+
+# Provides checkout_eval_branch and get_openhands_version.
+source "evaluation/utils/version_control.sh"
+
+# configure browsing agent
+export USE_NAV="true"
+export USE_CONCISE_ANSWER="true"
+
+MODEL_CONFIG=$1
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+NUM_WORKERS=$5
+
+if [ -z "$NUM_WORKERS" ]; then
+  NUM_WORKERS=1
+  echo "Number of workers not specified, use default $NUM_WORKERS"
+fi
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default VisualBrowsingAgent"
+  AGENT="VisualBrowsingAgent"
+fi
+
+get_openhands_version
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $OPENHANDS_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+EVAL_NOTE="${OPENHANDS_VERSION}"
+
+# Build the command as an array and quote every expansion, so that arguments
+# containing spaces or shell metacharacters are passed through verbatim
+# instead of being re-parsed by `eval`.
+COMMAND=(
+  poetry run python evaluation/benchmarks/visualwebarena/run_infer.py
+  --agent-cls "$AGENT"
+  --llm-config "$MODEL_CONFIG"
+  --max-iterations 15
+  --eval-num-workers "$NUM_WORKERS"
+  --eval-note "$EVAL_NOTE"
+)
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
+fi
+
+# Run the command
+"${COMMAND[@]}"
@@ -35,6 +35,7 @@ from openhands.utils.async_utils import call_async_from_sync
 FAKE_RESPONSES = {
    'CodeActAgent': fake_user_response,
    'DelegatorAgent': fake_user_response,
+    'VisualBrowsingAgent': fake_user_response,
 }


@@ -355,7 +355,9 @@ def _process_instance_wrapper(
            )
            # e is likely an EvalException, so we can't directly infer it from type
            # but rather check if it's a fatal error
-            if is_fatal_runtime_error(str(e)):
+            # But it can also be AgentRuntime**Error (e.g., swe_bench/eval_infer.py)
+            _error_str = type(e).__name__ + ': ' + str(e)
+            if is_fatal_runtime_error(_error_str):
                runtime_failure_count += 1
                msg += f'Runtime disconnected error detected for instance {instance.instance_id}, runtime failure count: {runtime_failure_count}'
                msg += '\n' + '-' * 10 + '\n'
@@ -531,6 +533,7 @@ def is_fatal_runtime_error(error: str | None) -> bool:
        return False

    FATAL_RUNTIME_ERRORS = [
+        AgentRuntimeTimeoutError,
        AgentRuntimeUnavailableError,
        AgentRuntimeDisconnectedError,
        AgentRuntimeNotFoundError,
@@ -37,7 +37,6 @@ describe("Browser", () => {
        browser: {
          url: "https://example.com",
          screenshotSrc: "",
-          updateCount: 0,
        },
      },
    });
@@ -53,7 +52,6 @@ describe("Browser", () => {
          url: "https://example.com",
          screenshotSrc:
            "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mN0uGvyHwAFCAJS091fQwAAAABJRU5ErkJggg==",
-          updateCount: 0,
        },
      },
    });
@@ -1,4 +1,4 @@
-import { render, screen, within } from "@testing-library/react";
+import { render, screen, waitFor, within } from "@testing-library/react";
 import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
 import {
  QueryClientProvider,
@@ -7,10 +7,12 @@ import {
 } from "@tanstack/react-query";
 import userEvent from "@testing-library/user-event";
 import { createRoutesStub } from "react-router";
+import React from "react";
 import { ConversationPanel } from "#/components/features/conversation-panel/conversation-panel";
 import OpenHands from "#/api/open-hands";
 import { AuthProvider } from "#/context/auth-context";
 import { clickOnEditButton } from "./utils";
+import { queryClientConfig } from "#/query-client-config";

 describe("ConversationPanel", () => {
  const onCloseMock = vi.fn();
@@ -231,4 +233,47 @@ describe("ConversationPanel", () => {

    expect(onCloseMock).toHaveBeenCalledOnce();
  });
+
+  it("should refetch data on rerenders", async () => {
+    // We need to simulate the toggling of the component to test the refetching
+    function PanelWithToggle() {
+      const [isOpen, setIsOpen] = React.useState(true);
+      return (
+        <>
+          <button type="button" onClick={() => setIsOpen((prev) => !prev)}>
+            Toggle
+          </button>
+          {isOpen && <ConversationPanel onClose={onCloseMock} />}
+        </>
+      );
+    }
+
+    const MyRouterStub = createRoutesStub([
+      {
+        Component: PanelWithToggle,
+        path: "/",
+      },
+    ]);
+
+    const getUserConversationsSpy = vi.spyOn(OpenHands, "getUserConversations");
+    render(<MyRouterStub />, {
+      wrapper: ({ children }) => (
+        <AuthProvider>
+          <QueryClientProvider client={new QueryClient(queryClientConfig)}>
+            {children}
+          </QueryClientProvider>
+        </AuthProvider>
+      ),
+    });
+
+    await waitFor(() => expect(getUserConversationsSpy).toHaveBeenCalledOnce());
+
+    const button = screen.getByText("Toggle");
+    await userEvent.click(button);
+    await userEvent.click(button);
+
+    await waitFor(() =>
+      expect(getUserConversationsSpy).toHaveBeenCalledTimes(2),
+    );
+  });
 });
@@ -8,6 +8,9 @@ import { MULTI_CONVERSATION_UI } from "#/utils/feature-flags";
 import OpenHands from "#/api/open-hands";
 import { MOCK_USER_PREFERENCES } from "#/mocks/handlers";

+// These tests will now fail because the conversation panel is rendered through a portal
+// and technically not a child of the Sidebar component.
+
 const renderSidebar = () => {
  const RouterStub = createRoutesStub([
    {
@@ -152,11 +155,13 @@ describe("Sidebar", () => {
      const settingsModal = screen.getByTestId("ai-config-modal");

      // Click the advanced options switch to show the API key input
-      const advancedOptionsSwitch = within(settingsModal).getByTestId("advanced-option-switch");
+      const advancedOptionsSwitch = within(settingsModal).getByTestId(
+        "advanced-option-switch",
+      );
      await user.click(advancedOptionsSwitch);

      const apiKeyInput = within(settingsModal).getByLabelText(/API\$KEY/i);
-      await user.type(apiKeyInput, "SET");
+      await user.type(apiKeyInput, "**********");

      const saveButton = within(settingsModal).getByTestId(
        "save-settings-button",
@@ -1,12 +1,13 @@
 import { render, screen, within } from "@testing-library/react";
 import userEvent from "@testing-library/user-event";
 import { afterEach, describe, expect, it, vi } from "vitest";
-import { FeedbackActions } from "#/components/features/feedback/feedback-actions";
+import { TrajectoryActions } from "#/components/features/trajectory/trajectory-actions";

-describe("FeedbackActions", () => {
+describe("TrajectoryActions", () => {
  const user = userEvent.setup();
  const onPositiveFeedback = vi.fn();
  const onNegativeFeedback = vi.fn();
+  const onExportTrajectory = vi.fn();

  afterEach(() => {
    vi.clearAllMocks();
@@ -14,9 +15,10 @@ describe("FeedbackActions", () => {

  it("should render correctly", () => {
    render(
-      <FeedbackActions
+      <TrajectoryActions
        onPositiveFeedback={onPositiveFeedback}
        onNegativeFeedback={onNegativeFeedback}
+        onExportTrajectory={onExportTrajectory}
      />,
    );

@@ -27,9 +29,10 @@ describe("FeedbackActions", () => {

  it("should call onPositiveFeedback when positive feedback is clicked", async () => {
    render(
-      <FeedbackActions
+      <TrajectoryActions
        onPositiveFeedback={onPositiveFeedback}
        onNegativeFeedback={onNegativeFeedback}
+        onExportTrajectory={onExportTrajectory}
      />,
    );

@@ -41,9 +44,10 @@ describe("FeedbackActions", () => {

  it("should call onNegativeFeedback when negative feedback is clicked", async () => {
    render(
-      <FeedbackActions
+      <TrajectoryActions
        onPositiveFeedback={onPositiveFeedback}
        onNegativeFeedback={onNegativeFeedback}
+        onExportTrajectory={onExportTrajectory}
      />,
    );

@@ -52,4 +56,19 @@ describe("FeedbackActions", () => {

    expect(onNegativeFeedback).toHaveBeenCalled();
  });
+
+  // Clicking the export button must trigger the onExportTrajectory callback.
+  it("should call onExportTrajectory when the export button is clicked", async () => {
+    render(
+      <TrajectoryActions
+        onPositiveFeedback={onPositiveFeedback}
+        onNegativeFeedback={onNegativeFeedback}
+        onExportTrajectory={onExportTrajectory}
+      />,
+    );
+
+    const exportButton = screen.getByTestId("export-trajectory");
+    await user.click(exportButton);
+
+    expect(onExportTrajectory).toHaveBeenCalled();
+  });
 });
@@ -1,20 +1,20 @@
 import { describe, it, expect } from "vitest";
 import store from "../src/store";
 import {
-  setInitialQuery,
-  clearInitialQuery,
+  setInitialPrompt,
+  clearInitialPrompt,
 } from "../src/state/initial-query-slice";

 describe("Initial Query Behavior", () => {
-  it("should clear initial query when clearInitialQuery is dispatched", () => {
+  it("should clear initial query when clearInitialPrompt is dispatched", () => {
    // Set up initial query in the store
-    store.dispatch(setInitialQuery("test query"));
-    expect(store.getState().initialQuery.initialQuery).toBe("test query");
+    store.dispatch(setInitialPrompt("test query"));
+    expect(store.getState().initialQuery.initialPrompt).toBe("test query");

    // Clear the initial query
-    store.dispatch(clearInitialQuery());
+    store.dispatch(clearInitialPrompt());

    // Verify initial query is cleared
-    expect(store.getState().initialQuery.initialQuery).toBeNull();
+    expect(store.getState().initialQuery.initialPrompt).toBeNull();
  });
 });
@@ -1,6 +1,6 @@
 {
  "name": "openhands-frontend",
-  "version": "0.20.0",
+  "version": "0.21.0",
  "private": true,
  "type": "module",
  "engines": {
@@ -20,6 +20,7 @@
    "axios": "^1.7.9",
    "clsx": "^2.1.1",
    "eslint-config-airbnb-typescript": "^18.0.0",
+    "framer-motion": "^12.0.1",
    "i18next": "^24.2.1",
    "i18next-browser-languagedetector": "^8.0.2",
    "i18next-http-backend": "^3.0.1",
@@ -42,7 +43,7 @@
    "sirv-cli": "^3.0.0",
    "socket.io-client": "^4.8.1",
    "tailwind-merge": "^2.6.0",
-    "vite": "^6.0.7",
+    "vite": "^5.4.11",
    "web-vitals": "^3.5.2",
    "ws": "^8.18.0"
  },
@@ -79,7 +80,8 @@
    "@playwright/test": "^1.49.1",
    "@react-router/dev": "^7.1.2",
    "@tailwindcss/typography": "^0.5.16",
-    "@tanstack/eslint-plugin-query": "^5.62.16",
+    "@tanstack/eslint-plugin-query": "^5.64.2",
+    "@testing-library/dom": "^10.4.0",
    "@testing-library/jest-dom": "^6.6.1",
    "@testing-library/react": "^16.2.0",
    "@testing-library/user-event": "^14.6.0",
@@ -100,7 +102,7 @@
    "eslint-config-prettier": "^10.0.1",
    "eslint-plugin-import": "^2.29.1",
    "eslint-plugin-jsx-a11y": "^6.10.2",
-    "eslint-plugin-prettier": "^5.2.2",
+    "eslint-plugin-prettier": "^5.2.3",
    "eslint-plugin-react": "^7.37.4",
    "eslint-plugin-react-hooks": "^4.6.2",
    "husky": "^9.1.6",
@@ -10,6 +10,7 @@ import {
  AuthenticateResponse,
  Conversation,
  ResultSet,
+  GetTrajectoryResponse,
 } from "./open-hands.types";
 import { openHands } from "./open-hands-axios";
 import { ApiSettings } from "#/services/settings";
@@ -243,10 +244,14 @@ class OpenHands {
  static async createConversation(
    githubToken?: string,
    selectedRepository?: string,
+    initialUserMsg?: string,
+    imageUrls?: string[],
  ): Promise<Conversation> {
    const body = {
      github_token: githubToken,
      selected_repository: selectedRepository,
+      initial_user_msg: initialUserMsg,
+      image_urls: imageUrls,
    };

    const { data } = await openHands.post<Conversation>(
@@ -354,6 +359,15 @@ class OpenHands {

    return response.data.items;
  }
+
+  /**
+   * Fetch the trajectory of a conversation.
+   * Issues a GET request to `/api/conversations/{conversationId}/trajectory`.
+   * @param conversationId ID of the conversation whose trajectory is requested
+   * @returns The response body, typed as GetTrajectoryResponse
+   */
+  static async getTrajectory(
+    conversationId: string,
+  ): Promise<GetTrajectoryResponse> {
+    const { data } = await openHands.get<GetTrajectoryResponse>(
+      `/api/conversations/${conversationId}/trajectory`,
+    );
+    return data;
+  }
 }

 export default OpenHands;
@@ -55,6 +55,11 @@ export interface GetVSCodeUrlResponse {
  error?: string;
 }

+/** Response body of the conversation trajectory endpoint. */
+export interface GetTrajectoryResponse {
+  // Trajectory entries; their shape is left untyped here.
+  // NOTE(review): presumably null when no trajectory is available - confirm
+  // against the backend.
+  trajectory: unknown[] | null;
+  // Optional error message.
+  error?: string;
+}
+
 export interface AuthenticateResponse {
  message?: string;
  error?: string;
@@ -23,7 +23,7 @@ export const AGENT_STATUS_MAP: {
  },
  [AgentState.AWAITING_USER_INPUT]: {
    message: I18nKey.CHAT_INTERFACE$AGENT_AWAITING_USER_INPUT_MESSAGE,
-    indicator: IndicatorColor.ORANGE,
+    indicator: IndicatorColor.BLUE,
  },
  [AgentState.PAUSED]: {
    message: I18nKey.CHAT_INTERFACE$AGENT_PAUSED_MESSAGE,
@@ -1,8 +1,10 @@
 import { useDispatch, useSelector } from "react-redux";
+import toast from "react-hot-toast";
 import React from "react";
 import posthog from "posthog-js";
+import { useParams } from "react-router";
 import { convertImageToBase64 } from "#/utils/convert-image-to-base-64";
-import { FeedbackActions } from "../feedback/feedback-actions";
+import { TrajectoryActions } from "../trajectory/trajectory-actions";
 import { createChatMessage } from "#/services/chat-service";
 import { InteractiveChatBox } from "./interactive-chat-box";
 import { addUserMessage } from "#/state/chat-slice";
@@ -19,6 +21,8 @@ import { ActionSuggestions } from "./action-suggestions";
 import { ContinueButton } from "#/components/shared/buttons/continue-button";
 import { ScrollToBottomButton } from "#/components/shared/buttons/scroll-to-bottom-button";
 import { LoadingSpinner } from "#/components/shared/loading-spinner";
+import { useGetTrajectory } from "#/hooks/mutation/use-get-trajectory";
+import { downloadTrajectory } from "#/utils/download-files";

 function getEntryPoint(
  hasRepository: boolean | null,
@@ -47,6 +51,8 @@ export function ChatInterface() {
  const { selectedRepository, importedProjectZip } = useSelector(
    (state: RootState) => state.initialQuery,
  );
+  const params = useParams();
+  const { mutate: getTrajectory } = useGetTrajectory();

  const handleSendMessage = async (content: string, files: File[]) => {
    if (messages.length === 0) {
@@ -90,6 +96,25 @@ export function ChatInterface() {
    setFeedbackPolarity(polarity);
  };

+  // Export handler: fetches the trajectory of the current conversation and
+  // hands it to downloadTrajectory; failures are surfaced as error toasts.
+  const onClickExportTrajectoryButton = () => {
+    // Without a conversation id in the route there is nothing to fetch.
+    if (!params.conversationId) {
+      toast.error("ConversationId unknown, cannot download trajectory");
+      return;
+    }
+
+    getTrajectory(params.conversationId, {
+      onSuccess: async (data) => {
+        await downloadTrajectory(
+          // The guard above already ensured the id is set; the fallback only
+          // satisfies the type checker inside this callback.
+          params.conversationId ?? "unknown",
+          data.trajectory,
+        );
+      },
+      onError: (error) => {
+        toast.error(error.message);
+      },
+    });
+  };
+
  const isWaitingForUserInput =
    curAgentState === AgentState.AWAITING_USER_INPUT ||
    curAgentState === AgentState.FINISHED;
@@ -129,13 +154,14 @@ export function ChatInterface() {

      <div className="flex flex-col gap-[6px] px-4 pb-4">
        <div className="flex justify-between relative">
-          <FeedbackActions
+          <TrajectoryActions
            onPositiveFeedback={() =>
              onClickShareFeedbackActionButton("positive")
            }
            onNegativeFeedback={() =>
              onClickShareFeedbackActionButton("negative")
            }
+            onExportTrajectory={() => onClickExportTrajectoryButton()}
          />

          <div className="absolute left-1/2 transform -translate-x-1/2 bottom-0">
@@ -6,24 +6,19 @@ import { cn } from "#/utils/utils";
 import { ul, ol } from "../markdown/list";
 import { CopyToClipboardButton } from "#/components/shared/buttons/copy-to-clipboard-button";
 import { anchor } from "../markdown/anchor";
-import { JumpToFileButton } from "#/components/shared/buttons/jump-to-file-button";
-import { useFiles } from "#/context/files";

 interface ChatMessageProps {
  type: "user" | "assistant";
  message: string;
-  filePath?: string;
 }

 export function ChatMessage({
  type,
  message,
-  filePath,
  children,
 }: React.PropsWithChildren<ChatMessageProps>) {
  const [isHovering, setIsHovering] = React.useState(false);
  const [isCopy, setIsCopy] = React.useState(false);
-  const { setSelectedPath } = useFiles();

  const handleCopyToClipboard = async () => {
    await navigator.clipboard.writeText(message);
@@ -62,12 +57,6 @@ export function ChatMessage({
        onClick={handleCopyToClipboard}
        mode={isCopy ? "copied" : "copy"}
      />
-      {filePath && (
-        <JumpToFileButton
-          filePath={filePath}
-          onClick={() => setSelectedPath(filePath)}
-        />
-      )}
      <Markdown
        className="text-sm overflow-auto"
        components={{
@@ -12,15 +12,22 @@ interface MessagesProps {
 export const Messages: React.FC<MessagesProps> = React.memo(
  ({ messages, isAwaitingUserConfirmation }) =>
    messages.map((message, index) => {
+      const shouldShowConfirmationButtons =
+        messages.length - 1 === index &&
+        message.sender === "assistant" &&
+        isAwaitingUserConfirmation;
+
      if (message.type === "error" || message.type === "action") {
        return (
-          <ExpandableMessage
-            key={index}
-            type={message.type}
-            id={message.translationID}
-            message={message.content}
-            success={message.success}
-          />
+          <div key={index}>
+            <ExpandableMessage
+              type={message.type}
+              id={message.translationID}
+              message={message.content}
+              success={message.success}
+            />
+            {shouldShowConfirmationButtons && <ConfirmationButtons />}
+          </div>
        );
      }

@@ -29,14 +36,11 @@ export const Messages: React.FC<MessagesProps> = React.memo(
          key={index}
          type={message.sender}
          message={message.content}
-          filePath={message.filePath}
        >
          {message.imageUrls && message.imageUrls.length > 0 && (
            <ImageCarousel size="small" images={message.imageUrls} />
          )}
-          {messages.length - 1 === index &&
-            message.sender === "assistant" &&
-            isAwaitingUserConfirmation && <ConfirmationButtons />}
+          {shouldShowConfirmationButtons && <ConfirmationButtons />}
        </ChatMessage>
      );
    }),
@@ -43,7 +43,7 @@ export function AgentStatusBar() {

  React.useEffect(() => {
    if (status === WsClientProviderStatus.DISCONNECTED) {
-      setStatusMessage("Trying to reconnect...");
+      setStatusMessage("Connecting...");
    } else {
      setStatusMessage(AGENT_STATUS_MAP[curAgentState].message);
    }
@@ -1,28 +1,36 @@
 import ThumbsUpIcon from "#/icons/thumbs-up.svg?react";
 import ThumbDownIcon from "#/icons/thumbs-down.svg?react";
-import { FeedbackActionButton } from "#/components/shared/buttons/feedback-action-button";
+import ExportIcon from "#/icons/export.svg?react";
+import { TrajectoryActionButton } from "#/components/shared/buttons/trajectory-action-button";

-interface FeedbackActionsProps {
+interface TrajectoryActionsProps {
  onPositiveFeedback: () => void;
  onNegativeFeedback: () => void;
+  onExportTrajectory: () => void;
 }

-export function FeedbackActions({
+export function TrajectoryActions({
  onPositiveFeedback,
  onNegativeFeedback,
-}: FeedbackActionsProps) {
+  onExportTrajectory,
+}: TrajectoryActionsProps) {
  return (
    <div data-testid="feedback-actions" className="flex gap-1">
-      <FeedbackActionButton
+      <TrajectoryActionButton
        testId="positive-feedback"
        onClick={onPositiveFeedback}
        icon={<ThumbsUpIcon width={15} height={15} />}
      />
-      <FeedbackActionButton
+      <TrajectoryActionButton
        testId="negative-feedback"
        onClick={onNegativeFeedback}
        icon={<ThumbDownIcon width={15} height={15} />}
      />
+      <TrajectoryActionButton
+        testId="export-trajectory"
+        onClick={onExportTrajectory}
+        icon={<ExportIcon width={15} height={15} />}
+      />
    </div>
  );
 }
@@ -1,32 +0,0 @@
-import React from "react";
-import { useTranslation } from "react-i18next";
-import { VscGoToFile } from "react-icons/vsc";
-import { I18nKey } from "#/i18n/declaration";
-import { ActionTooltip } from "#/components/shared/action-tooltip";
-import { cn } from "#/utils/utils";
-
-interface JumpToFileButtonProps {
-  filePath: string;
-  onClick: () => void;
-}
-
-export function JumpToFileButton({ filePath, onClick }: JumpToFileButtonProps) {
-  const { t } = useTranslation();
-
-  return (
-    <ActionTooltip content={t(I18nKey.CHAT$JUMP_TO_FILE_TOOLTIP, { path: filePath })} side="top">
-      <button
-        type="button"
-        data-testid="jump-to-file-button"
-        onClick={onClick}
-        className={cn(
-          "absolute top-2 right-12 p-2 rounded-lg",
-          "text-neutral-400 hover:text-neutral-200 hover:bg-neutral-700",
-          "transition-colors duration-200"
-        )}
-      >
-        <VscGoToFile size={16} />
-      </button>
-    </ActionTooltip>
-  );
-}
@@ -1,14 +1,14 @@
-interface FeedbackActionButtonProps {
+interface TrajectoryActionButtonProps {
  testId?: string;
  onClick: () => void;
  icon: React.ReactNode;
 }

-export function FeedbackActionButton({
+export function TrajectoryActionButton({
  testId,
  onClick,
  icon,
-}: FeedbackActionButtonProps) {
+}: TrajectoryActionButtonProps) {
  return (
    <button
      type="button"
@@ -171,7 +171,7 @@ export function SettingsForm({

          <APIKeyInput
            isDisabled={!!disabled}
-            isSet={settings.LLM_API_KEY === "SET"}
+            isSet={settings.LLM_API_KEY === "**********"}
          />

          {showAdvancedOptions && (
@@ -34,7 +34,7 @@ export function SettingsProvider({ children }: SettingsProviderProps) {
      ...newSettings,
    };

-    if (updatedSettings.LLM_API_KEY === "SET") {
+    if (updatedSettings.LLM_API_KEY === "**********") {
      delete updatedSettings.LLM_API_KEY;
    }

@@ -11,15 +11,11 @@ import { hydrateRoot } from "react-dom/client";
 import { Provider } from "react-redux";
 import posthog from "posthog-js";
 import "./i18n";
-import {
-  QueryCache,
-  QueryClient,
-  QueryClientProvider,
-} from "@tanstack/react-query";
-import toast from "react-hot-toast";
+import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
 import store from "./store";
 import { useConfig } from "./hooks/query/use-config";
 import { AuthProvider } from "./context/auth-context";
+import { queryClientConfig } from "./query-client-config";
 import { SettingsProvider } from "./context/settings-context";

 function PosthogInit() {
@@ -50,27 +46,7 @@ async function prepareApp() {
  }
 }

-const QUERY_KEYS_TO_IGNORE = ["authenticated", "hosts"];
-const queryClient = new QueryClient({
-  queryCache: new QueryCache({
-    onError: (error, query) => {
-      if (!QUERY_KEYS_TO_IGNORE.some((key) => query.queryKey.includes(key))) {
-        toast.error(error.message);
-      }
-    },
-  }),
-  defaultOptions: {
-    queries: {
-      staleTime: 1000 * 60 * 5, // 5 minutes
-      gcTime: 1000 * 60 * 15, // 15 minutes
-    },
-    mutations: {
-      onError: (error) => {
-        toast.error(error.message);
-      },
-    },
-  },
-});
+const queryClient = new QueryClient(queryClientConfig);

 prepareApp().then(() =>
  startTransition(() => {
@@ -3,7 +3,7 @@ import { useNavigate } from "react-router";
 import posthog from "posthog-js";
 import { useDispatch, useSelector } from "react-redux";
 import OpenHands from "#/api/open-hands";
-import { setInitialQuery } from "#/state/initial-query-slice";
+import { setInitialPrompt } from "#/state/initial-query-slice";
 import { RootState } from "#/store";
 import { useAuth } from "#/context/auth-context";

@@ -18,7 +18,7 @@ export const useCreateConversation = () => {
  );

  return useMutation({
-    mutationFn: (variables: { q?: string }) => {
+    mutationFn: async (variables: { q?: string }) => {
      if (
        !variables.q?.trim() &&
        !selectedRepository &&
@@ -28,10 +28,13 @@ export const useCreateConversation = () => {
        throw new Error("No query provided");
      }

-      if (variables.q) dispatch(setInitialQuery(variables.q));
+      if (variables.q) dispatch(setInitialPrompt(variables.q));
+
      return OpenHands.createConversation(
        gitHubToken || undefined,
        selectedRepository || undefined,
+        variables.q,
+        files,
      );
    },
    onSuccess: async ({ conversation_id: conversationId }, { q }) => {
@@ -0,0 +1,7 @@
+import { useMutation } from "@tanstack/react-query";
+import OpenHands from "#/api/open-hands";
+
+/**
+ * Mutation hook that fetches a trajectory on demand.
+ * The value passed to `mutate` is forwarded unchanged to
+ * `OpenHands.getTrajectory` (presumably the conversation id - confirm against the API client).
+ */
+export const useGetTrajectory = () =>
+  useMutation({
+    mutationFn: (cid: string) => OpenHands.getTrajectory(cid),
+  });
@@ -6,7 +6,7 @@ import { useConfig } from "./use-config";
 import OpenHands from "#/api/open-hands";

 export const useGitHubUser = () => {
-  const { gitHubToken, setUserId } = useAuth();
+  const { gitHubToken, setUserId, logout } = useAuth();
  const { data: config } = useConfig();

  const user = useQuery({
@@ -29,5 +29,11 @@ export const useGitHubUser = () => {
    }
  }, [user.data]);

+  React.useEffect(() => {
+    if (user.isError) {
+      logout();
+    }
+  }, [user.isError]);
+
  return user;
 };
@@ -9,5 +9,6 @@ export const useUserConversations = () => {
    queryKey: ["user", "conversations"],
    queryFn: OpenHands.getUserConversations,
    enabled: !!userIsAuthenticated,
+    staleTime: 0,
  });
 };
@@ -4492,21 +4492,7 @@
        "tr": "İstemcinin hazır olması bekleniyor...",
        "ja": "クライアントの準備を待機中"
    },
-    "CHAT$JUMP_TO_FILE_TOOLTIP": {
-    "en": "Jump to file: {{path}}",
-    "zh-CN": "跳转到文件：{{path}}",
-    "de": "Zur Datei springen: {{path}}",
-    "ko-KR": "파일로 이동: {{path}}",
-    "no": "Hopp til fil: {{path}}",
-    "zh-TW": "跳轉到文件：{{path}}",
-    "it": "Vai al file: {{path}}",
-    "pt": "Ir para o arquivo: {{path}}",
-    "es": "Ir al archivo: {{path}}",
-    "ar": "انتقل إلى الملف: {{path}}",
-    "fr": "Aller au fichier: {{path}}",
-    "tr": "Dosyaya git: {{path}}"
-  },
-  "SUGGESTIONS$WHAT_TO_BUILD": {
+    "SUGGESTIONS$WHAT_TO_BUILD": {
        "en": "What do you want to build?",
        "ja": "何を開発しますか？",
        "zh-CN": "你想要构建什么？",
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-download"><path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/><polyline points="7 10 12 15 17 10"/><line x1="12" x2="12" y1="15" y2="3"/></svg>
@@ -8,5 +8,4 @@ type Message = {
  pending?: boolean;
  translationID?: string;
  eventID?: number;
-  filePath?: string;
 };
@@ -141,7 +141,7 @@ export const handlers = [
      { id: 2, full_name: "octocat/earth" },
    ]),
  ),
-  http.get("https://api.github.com/user", () => {
+  http.get("/api/github/user", () => {
    const user: GitHubUser = {
      id: 1,
      login: "octocat",
@@ -0,0 +1,25 @@
+import { QueryClientConfig, QueryCache } from "@tanstack/react-query";
+import toast from "react-hot-toast";
+
+// Query-key entries whose failures must not raise an error toast.
+const QUERY_KEYS_TO_IGNORE = ["authenticated", "hosts"];
+
+/**
+ * Shared React Query client configuration.
+ * - Failed queries show a toast with the error message, unless the query key
+ *   contains one of QUERY_KEYS_TO_IGNORE.
+ * - Failed mutations always show a toast with the error message.
+ * - Queries default to a 5 minute stale time and a 15 minute gc time.
+ */
+export const queryClientConfig: QueryClientConfig = {
+  queryCache: new QueryCache({
+    onError: (error, query) => {
+      // Skip the toast when any ignored entry appears in this query's key.
+      if (!QUERY_KEYS_TO_IGNORE.some((key) => query.queryKey.includes(key))) {
+        toast.error(error.message);
+      }
+    },
+  }),
+  defaultOptions: {
+    queries: {
+      staleTime: 1000 * 60 * 5, // 5 minutes
+      gcTime: 1000 * 60 * 15, // 15 minutes
+    },
+    mutations: {
+      onError: (error) => {
+        toast.error(error.message);
+      },
+    },
+  },
+};
@@ -1,10 +1,8 @@
 import React from "react";
-import { useWSStatusChange } from "./hooks/use-ws-status-change";
 import { useHandleWSEvents } from "./hooks/use-handle-ws-events";
 import { useHandleRuntimeActive } from "./hooks/use-handle-runtime-active";

 export function EventHandler({ children }: React.PropsWithChildren) {
-  useWSStatusChange();
  useHandleWSEvents();
  useHandleRuntimeActive();

@@ -1,68 +0,0 @@
-import React from "react";
-import { useDispatch, useSelector } from "react-redux";
-import {
-  useWsClient,
-  WsClientProviderStatus,
-} from "#/context/ws-client-provider";
-import { createChatMessage } from "#/services/chat-service";
-import { setCurrentAgentState } from "#/state/agent-slice";
-import { addUserMessage } from "#/state/chat-slice";
-import { clearFiles, clearInitialQuery } from "#/state/initial-query-slice";
-import { RootState } from "#/store";
-import { AgentState } from "#/types/agent-state";
-
-export const useWSStatusChange = () => {
-  const { send, status } = useWsClient();
-  const { curAgentState } = useSelector((state: RootState) => state.agent);
-  const dispatch = useDispatch();
-
-  const statusRef = React.useRef<WsClientProviderStatus | null>(null);
-
-  const { files, initialQuery } = useSelector(
-    (state: RootState) => state.initialQuery,
-  );
-
-  const sendInitialQuery = (query: string, base64Files: string[]) => {
-    const timestamp = new Date().toISOString();
-    send(createChatMessage(query, base64Files, timestamp));
-  };
-
-  const dispatchInitialQuery = (query: string) => {
-    sendInitialQuery(query, files);
-    dispatch(clearFiles()); // reset selected files
-    dispatch(clearInitialQuery()); // reset initial query
-  };
-
-  const handleAgentInit = () => {
-    if (initialQuery) {
-      dispatchInitialQuery(initialQuery);
-    }
-  };
-  React.useEffect(() => {
-    if (curAgentState === AgentState.INIT) {
-      handleAgentInit();
-    }
-  }, [curAgentState]);
-
-  React.useEffect(() => {
-    if (statusRef.current === status) {
-      return; // This is a check because of strict mode - if the status did not change, don't do anything
-    }
-    statusRef.current = status;
-
-    if (status !== WsClientProviderStatus.DISCONNECTED && initialQuery) {
-      dispatch(
-        addUserMessage({
-          content: initialQuery,
-          imageUrls: files,
-          timestamp: new Date().toISOString(),
-          pending: true,
-        }),
-      );
-    }
-
-    if (status === WsClientProviderStatus.DISCONNECTED) {
-      dispatch(setCurrentAgentState(AgentState.STOPPED));
-    }
-  }, [status]);
-};
@@ -11,8 +11,7 @@ import {
  useConversation,
 } from "#/context/conversation-context";
 import { Controls } from "#/components/features/controls/controls";
-import { RootState } from "#/store";
-import { clearMessages } from "#/state/chat-slice";
+import { clearMessages, addUserMessage } from "#/state/chat-slice";
 import { clearTerminal } from "#/state/command-slice";
 import { useEffectOnce } from "#/hooks/use-effect-once";
 import CodeIcon from "#/icons/code.svg?react";
@@ -33,11 +32,12 @@ import {
 import Security from "#/components/shared/modals/security/security";
 import { useEndSession } from "#/hooks/use-end-session";
 import { useUserConversation } from "#/hooks/query/use-user-conversation";
-import { CountBadge } from "#/components/layout/count-badge";
 import { ServedAppLabel } from "#/components/layout/served-app-label";
 import { TerminalStatusLabel } from "#/components/features/terminal/terminal-status-label";
 import { useSettings } from "#/hooks/query/use-settings";
 import { MULTI_CONVERSATION_UI } from "#/utils/feature-flags";
+import { clearFiles, clearInitialPrompt } from "#/state/initial-query-slice";
+import { RootState } from "#/store";

 function AppContent() {
  useConversationConfig();
@@ -48,11 +48,13 @@ function AppContent() {
  const { data: conversation, isFetched } = useUserConversation(
    conversationId || null,
  );
+  const { initialPrompt, files } = useSelector(
+    (state: RootState) => state.initialQuery,
+  );
  const dispatch = useDispatch();
  const endSession = useEndSession();

  const [width, setWidth] = React.useState(window.innerWidth);
-  const { updateCount } = useSelector((state: RootState) => state.browser);

  const secrets = React.useMemo(
    () => [gitHubToken].filter((secret) => secret !== null),
@@ -77,6 +79,18 @@ function AppContent() {
    dispatch(clearMessages());
    dispatch(clearTerminal());
    dispatch(clearJupyter());
+    if (conversationId && (initialPrompt || files.length > 0)) {
+      dispatch(
+        addUserMessage({
+          content: initialPrompt || "",
+          imageUrls: files || [],
+          timestamp: new Date().toISOString(),
+          pending: true,
+        }),
+      );
+      dispatch(clearInitialPrompt());
+      dispatch(clearFiles());
+    }
  }, [conversationId]);

  useEffectOnce(() => {
@@ -144,7 +158,6 @@ function AppContent() {
                    label: (
                      <div className="flex items-center gap-1">
                        {t(I18nKey.BROWSER$TITLE)}
-                        {updateCount > 0 && <CountBadge count={updateCount} />}
                      </div>
                    ),
                    to: "browser",
@@ -5,8 +5,6 @@ export const initialState = {
  url: "https://github.com/All-Hands-AI/OpenHands",
  // Base64-encoded screenshot of browser window (placeholder for now, will be replaced with the actual screenshot later)
  screenshotSrc: "",
-  // Counter for browser updates
-  updateCount: 0,
 };

 export const browserSlice = createSlice({
@@ -18,7 +16,6 @@ export const browserSlice = createSlice({
    },
    setScreenshotSrc: (state, action) => {
      state.screenshotSrc = action.payload;
-      state.updateCount += 1;
    },
  },
 });
@@ -166,9 +166,8 @@ export const chatSlice = createSlice({
        }\n\nOutput:\n\`\`\`\n${content.trim() || "[Command finished execution with no output]"}\n\`\`\``;
        causeMessage.content = content; // Observation content includes the action
      } else if (observationID === "read" || observationID === "edit") {
-        const { content, extras } = observation.payload;
+        const { content } = observation.payload;
        causeMessage.content = `\`\`\`${observationID === "edit" ? "diff" : "python"}\n${content}\n\`\`\``; // Content is already truncated by the ACI
-        causeMessage.filePath = extras.path;
      } else if (observationID === "browse") {
        let content = `**URL:** ${observation.payload.extras.url}\n`;
        if (observation.payload.extras.error) {
@@ -2,14 +2,14 @@ import { createSlice, PayloadAction } from "@reduxjs/toolkit";

 type SliceState = {
  files: string[]; // base64 encoded images
-  initialQuery: string | null;
+  initialPrompt: string | null;
  selectedRepository: string | null;
  importedProjectZip: string | null; // base64 encoded zip
 };

 const initialState: SliceState = {
  files: [],
-  initialQuery: null,
+  initialPrompt: null,
  selectedRepository: null,
  importedProjectZip: null,
 };
@@ -27,11 +27,11 @@ export const selectedFilesSlice = createSlice({
    clearFiles(state) {
      state.files = [];
    },
-    setInitialQuery(state, action: PayloadAction<string>) {
-      state.initialQuery = action.payload;
+    setInitialPrompt(state, action: PayloadAction<string>) {
+      state.initialPrompt = action.payload;
    },
-    clearInitialQuery(state) {
-      state.initialQuery = null;
+    clearInitialPrompt(state) {
+      state.initialPrompt = null;
    },
    setSelectedRepository(state, action: PayloadAction<string | null>) {
      state.selectedRepository = action.payload;
@@ -49,8 +49,8 @@ export const {
  addFile,
  removeFile,
  clearFiles,
-  setInitialQuery,
-  clearInitialQuery,
+  setInitialPrompt,
+  clearInitialPrompt,
  setSelectedRepository,
  clearSelectedRepository,
  setImportedProjectZip,
@@ -26,6 +26,18 @@ interface FileSystemDirectoryHandle {
  ): Promise<FileSystemFileHandle>;
 }

+/**
+ * Options object accepted by `Window.showSaveFilePicker` as declared in this file.
+ */
+interface SaveFilePickerOptions {
+  // File name offered to the user, e.g. "trajectory-<id>.json"
+  suggestedName?: string;
+  // Selectable file types; `accept` maps a MIME type to its extensions,
+  // e.g. { "application/json": [".json"] }
+  types?: Array<{
+    description?: string;
+    accept: Record<string, string[]>;
+  }>;
+  // NOTE(review): presumably hides the picker's "all files" entry - confirm against the File System Access API docs
+  excludeAcceptAllOption?: boolean;
+}
+
 interface Window {
  showDirectoryPicker(): Promise<FileSystemDirectoryHandle>;
+  showSaveFilePicker(
+    options?: SaveFilePickerOptions,
+  ): Promise<FileSystemFileHandle>;
 }
@@ -22,6 +22,13 @@ function isFileSystemAccessSupported(): boolean {
  return "showDirectoryPicker" in window;
 }

+/**
+ * Checks if the Save File Picker API is supported
+ * @returns true when `showSaveFilePicker` is a property of `window`
+ */
+function isSaveFilePickerSupported(): boolean {
+  return "showSaveFilePicker" in window;
+}
+
 /**
 * Creates subdirectories and returns the final directory handle
 */
@@ -162,6 +169,39 @@ async function processBatch(
  };
 }

+/**
+ * Saves a trajectory as a pretty-printed JSON file through the browser's
+ * save-file picker.
+ * @param conversationId Used to build the suggested file name (`trajectory-<id>.json`)
+ * @param data Trajectory payload, serialized with `JSON.stringify` using a 2-space indent
+ * @throws Error prefixed with "Failed to download file:" when the picker is
+ * unsupported or when picking, writing or closing the file fails
+ */
+export async function downloadTrajectory(
+  conversationId: string,
+  data: unknown[] | null,
+): Promise<void> {
+  try {
+    if (!isSaveFilePickerSupported()) {
+      // Only a single file is saved here, so the message refers to files
+      throw new Error(
+        "Your browser doesn't support saving files. Please use Chrome, Edge, or another browser that supports the File System Access API.",
+      );
+    }
+    const options = {
+      suggestedName: `trajectory-${conversationId}.json`,
+      types: [
+        {
+          description: "JSON File",
+          accept: {
+            "application/json": [".json"],
+          },
+        },
+      ],
+    };
+
+    const handle = await window.showSaveFilePicker(options);
+    const writable = await handle.createWritable();
+    await writable.write(JSON.stringify(data, null, 2));
+    await writable.close();
+  } catch (error) {
+    // Re-wrap every failure (including the unsupported-browser error above)
+    // so callers see one consistent message prefix
+    throw new Error(
+      `Failed to download file: ${error instanceof Error ? error.message : String(error)}`,
+    );
+  }
+}
+
 /**
 * Downloads files from the workspace one by one
 * @param initialPath Initial path to start downloading from. If not provided, downloads from root
@@ -0,0 +1,65 @@
+---
+name: add_openhands_repo_instruction
+type: task
+version: 1.0.0
+author: openhands
+agent: CodeActAgent
+inputs:
+  - name: REPO_FOLDER_NAME
+    description: "Name of the repository folder under /workspace for the agent to work on"
+    required: false
+---
+
+Please browse the current repository under /workspace/{{ REPO_FOLDER_NAME }}, look at the documentation and relevant code, and understand the purpose of this repository.
+
+Specifically, I want you to create a `.openhands/microagents/repo.md`  file. This file should contain succinct information that summarizes (1) the purpose of this repository, (2) the general setup of this repo, and (3) a brief description of the structure of this repo.
+
+Here's an example:
+```markdown
+---
+name: repo
+type: repo
+agent: CodeActAgent
+---
+
+This repository contains the code for runtime-API, an automated AI software engineer. It has a Python backend
+(in the `openhands` directory) and React frontend (in the `frontend` directory).
+
+## General Setup:
+To set up the entire repo, including frontend and backend, run `make build`.
+You don't need to do this unless the user asks you to, or if you're trying to run the entire application.
+
+Before pushing any changes, you should ensure that any lint errors or simple test errors have been fixed.
+
+* If you've made changes to the backend, you should run `pre-commit run --all-files --config ./dev_config/python/.pre-commit-config.yaml`
+* If you've made changes to the frontend, you should run `cd frontend && npm run lint:fix && npm run build ; cd ..`
+
+If either command fails, it may have automatically fixed some issues. You should fix any issues that weren't automatically fixed,
+then re-run the command to ensure it passes.
+
+## Repository Structure
+Backend:
+- Located in the `openhands` directory
+- Testing:
+  - All tests are in `tests/unit/test_*.py`
+  - To test new code, run `poetry run pytest tests/unit/test_xxx.py` where `xxx` is the appropriate file for the current functionality
+  - Write all tests with pytest
+
+Frontend:
+- Located in the `frontend` directory
+- Prerequisites: A recent version of NodeJS / NPM
+- Setup: Run `npm install` in the frontend directory
+- Testing:
+  - Run tests: `npm run test`
+  - To run specific tests: `npm run test -- -t "TestName"`
+- Building:
+  - Build for production: `npm run build`
+- Environment Variables:
+  - Set in `frontend/.env` or as environment variables
+  - Available variables: VITE_BACKEND_HOST, VITE_USE_TLS, VITE_INSECURE_SKIP_VERIFY, VITE_FRONTEND_PORT
+- Internationalization:
+  - Generate i18n declaration file: `npm run make-i18n`
+```
+
+Now, please write a similar markdown for the current repository.
+Read all the GitHub workflows under .github/ of the repository (if this folder exists) to understand the CI checks (e.g., linter, pre-commit), and include those in the repo.md file.
@@ -20,7 +20,7 @@ The key classes in OpenHands are:
    * Sandbox: the part of the runtime responsible for running commands, e.g. inside of Docker
 * Server: brokers OpenHands sessions over HTTP, e.g. to drive the frontend
    * Session: holds a single EventStream, a single AgentController, and a single Runtime. Generally represents a single task (but potentially including several user prompts)
-    * SessionManager: keeps a list of active sessions, and ensures requests are routed to the correct Session
+    * ConversationManager: keeps a list of active sessions, and ensures requests are routed to the correct Session

 ## Control Flow
 Here's the basic loop (in pseudocode) that drives agents.
@@ -12,6 +12,7 @@ from openhands.agenthub import (  # noqa: E402
    codeact_agent,
    delegator_agent,
    dummy_agent,
+    visualbrowsing_agent,
 )

 __all__ = [
@@ -19,6 +20,7 @@ __all__ = [
    'delegator_agent',
    'dummy_agent',
    'browsing_agent',
+    'visualbrowsing_agent',
 ]

 for agent in all_microagents.values():
@@ -277,16 +277,11 @@ class CodeActAgent(Agent):
            # if it doesn't have tool call metadata, it was triggered by a user action
            if obs.tool_call_metadata is None:
                text = truncate_content(
-                    f'\nObserved result of command executed by user:\n{obs.content}',
+                    f'\nObserved result of command executed by user:\n{obs.to_agent_observation()}',
                    max_message_chars,
                )
            else:
-                text = truncate_content(
-                    obs.content
-                    + f'\n[Python Interpreter: {obs.metadata.py_interpreter_path}]',
-                    max_message_chars,
-                )
-            text += f'\n[Command finished with exit code {obs.exit_code}]'
+                text = truncate_content(obs.to_agent_observation(), max_message_chars)
            message = Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, IPythonRunCellObservation):
            text = obs.content
@@ -32,6 +32,7 @@ from openhands.events.tool import ToolCallMetadata
 _BASH_DESCRIPTION = """Execute a bash command in the terminal.
 * Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.
 * Interact with running process: If a bash command returns exit code `-1`, this means the process is not yet finished. By setting `is_input` to `true`, the assistant can interact with the running process and send empty `command` to retrieve any additional logs, or send additional text (set `command` to the text) to STDIN of the running process, or send command like `C-c` (Ctrl+C), `C-d` (Ctrl+D), `C-z` (Ctrl+Z) to interrupt the process.
+* One command at a time: You can only execute one bash command at a time. If you need to run multiple commands sequentially, you can use `&&` or `;` to chain them together.
 """

 CmdRunTool = ChatCompletionToolParam(
@@ -44,7 +45,7 @@ CmdRunTool = ChatCompletionToolParam(
            'properties': {
                'command': {
                    'type': 'string',
-                    'description': 'The bash command to execute. Can be empty string to view additional logs when previous exit code is `-1`. Can be `C-c` (Ctrl+C) to interrupt the currently running process.',
+                    'description': 'The bash command to execute. Can be empty string to view additional logs when previous exit code is `-1`. Can be `C-c` (Ctrl+C) to interrupt the currently running process. Note: You can only execute one bash command at a time. If you need to run multiple commands sequentially, you can use `&&` or `;` to chain them together.',
                },
                'is_input': {
                    'type': 'string',
@@ -80,7 +81,7 @@ IPythonTool = ChatCompletionToolParam(
    ),
 )

-_FILE_EDIT_DESCRIPTION = """Edit a file.
+_FILE_EDIT_DESCRIPTION = """Edit a file in plain-text format.
 * The assistant can edit files by specifying the file path and providing a draft of the new file content.
 * The draft content doesn't need to be exactly the same as the existing file; the assistant may skip unchanged lines using comments like `# unchanged` to indicate unchanged sections.
 * IMPORTANT: For large files (e.g., > 300 lines), specify the range of lines to edit using `start` and `end` (1-indexed, inclusive). The range should be smaller than 300 lines.
@@ -216,7 +217,7 @@ LLMBasedFileEditTool = ChatCompletionToolParam(
    ),
 )

-_STR_REPLACE_EDITOR_DESCRIPTION = """Custom editing tool for viewing, creating and editing files
+_STR_REPLACE_EDITOR_DESCRIPTION = """Custom editing tool for viewing, creating and editing files in plain-text format
 * State is persistent across command calls and discussions with the user
 * If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep
 * The `create` command cannot be used if the specified `path` already exists as a file
@@ -1,6 +1,7 @@
 You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
 <IMPORTANT>
 * If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
+* You should start exploring the file system with your view command, unless you need to explore more deeply.
 * When configuring git credentials, use "openhands" as the user.name and "openhands@all-hands.dev" as the user.email by default, unless explicitly instructed otherwise.
-* The assistant MUST NOT include comments in the code unless they are necessary to describe non-obvious behavior.
+* You MUST NOT include comments in the code unless they are necessary to describe non-obvious behavior.
 </IMPORTANT>
@@ -0,0 +1,7 @@
+# Browsing Agent Framework
+
+This folder implements the AgentLab [generic agent](https://github.com/ServiceNow/AgentLab/tree/main/src/agentlab/agents/generic_agent) that enables full-featured web browsing. The observations given to the agent include a set-of-marks annotated screenshot of the web page, the accessibility tree of the web page, and all the thoughts and actions from previous steps.
+
+## Test run
+
+Note that for browsing tasks, GPT-4/Claude is usually a requirement to get reasonable results, due to the complexity of the web page structures. This agent has been evaluated on the VisualWebArena benchmark. The CodeAct agent does not call this VisualBrowsingAgent; it has built-in support for browsing (e.g., via the browse_url and browser tools).
@@ -0,0 +1,6 @@
+from openhands.agenthub.visualbrowsing_agent.visualbrowsing_agent import (
+    VisualBrowsingAgent,
+)
+from openhands.controller.agent import Agent
+
+Agent.register('VisualBrowsingAgent', VisualBrowsingAgent)
@@ -0,0 +1,306 @@
+from browsergym.core.action.highlevel import HighLevelActionSet
+from browsergym.utils.obs import flatten_axtree_to_str
+
+from openhands.agenthub.browsing_agent.response_parser import BrowsingResponseParser
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import AgentConfig
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.message import ImageContent, Message, TextContent
+from openhands.events.action import (
+    Action,
+    AgentFinishAction,
+    BrowseInteractiveAction,
+    MessageAction,
+)
+from openhands.events.event import EventSource
+from openhands.events.observation import BrowserOutputObservation
+from openhands.events.observation.observation import Observation
+from openhands.llm.llm import LLM
+from openhands.runtime.plugins import (
+    PluginRequirement,
+)
+
+
+def get_error_prefix(obs: BrowserOutputObservation) -> str:
+    """Build the prompt section describing the last browser action error.
+
+    Parameters:
+    - obs (BrowserOutputObservation): observation whose
+      `last_browser_action_error` text is reported
+
+    Returns:
+    - An empty string when the error text contains 'timeout', otherwise a
+      markdown section holding the error text.
+    """
+    last_error = obs.last_browser_action_error
+    # temporary fix for OneStopMarket to ignore timeout errors
+    is_timeout = 'timeout' in last_error
+    return '' if is_timeout else f'## Error from previous action:\n{last_error}\n'
+
+
+def create_goal_prompt(goal: str, image_urls: list[str] | None):
+    """Compose the instruction/goal part of the user prompt.
+
+    Parameters:
+    - goal (str): the task description inserted under '## Goal:'
+    - image_urls (list[str] | None): optional goal images; one numbered
+      placeholder line is added to the text for each of them
+
+    Returns:
+    - (goal_txt, goal_image_urls): the prompt text and the list of image
+      URLs (empty when `image_urls` is None)
+    """
+    goal_image_urls = [] if image_urls is None else list(image_urls)
+    # One "Images: ..." line per goal image, numbered from 1.
+    image_lines = ''.join(
+        f'Images: Goal input image ({position})\n'
+        for position in range(1, len(goal_image_urls) + 1)
+    )
+    instructions: str = f"""\
+# Instructions
+Review the current state of the page and all other information to find the best possible next action to accomplish your goal. Your answer will be interpreted and executed by a program, make sure to follow the formatting instructions.
+
+## Goal:
+{goal}
+"""
+    return instructions + image_lines + '\n', goal_image_urls
+
+
+def create_observation_prompt(
+    axtree_txt: str,
+    tabs: str,
+    focused_element: str,
+    error_prefix: str,
+    som_screenshot: str | None,
+):
+    """Compose the observation part of the user prompt.
+
+    Parameters:
+    - axtree_txt (str): pre-formatted accessibility tree section
+    - tabs (str): pre-formatted open-tabs section
+    - focused_element (str): pre-formatted focused-element section
+    - error_prefix (str): pre-formatted error section (may be empty)
+    - som_screenshot (str | None): set-of-marks screenshot; treated as
+      absent when None or empty
+
+    Returns:
+    - (txt_observation, screenshot_url): the prompt text and the screenshot
+      to attach, or None when no screenshot is available
+    """
+    txt_observation = f"""
+# Observation of current step:
+{tabs}{axtree_txt}{focused_element}{error_prefix}
+"""
+
+    # screenshot + som: will be a non-empty string if present in observation
+    screenshot_url = None
+    if som_screenshot:
+        # The parenthetical note is closed here; the original text left the
+        # opening parenthesis unbalanced.
+        txt_observation += 'Image: Current page screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
+        screenshot_url = som_screenshot
+    else:
+        logger.info('SOM Screenshot not present in observation!')
+    txt_observation += '\n'
+    return txt_observation, screenshot_url
+
+
+def get_tabs(obs: BrowserOutputObservation) -> str:
+    """Render the list of open browser tabs as a prompt section.
+
+    Parameters:
+    - obs (BrowserOutputObservation): supplies `open_pages_urls` and
+      `active_page_index`
+
+    Returns:
+    - A '## Currently open tabs:' section with one entry per URL; the entry
+      whose index equals `active_page_index` is marked '(active tab)'.
+    """
+
+    def describe_tab(index, url) -> str:
+        marker = ' (active tab)' if index == obs.active_page_index else ''
+        return f'Tab {index}{marker}:\nURL: {url}\n'
+
+    sections = ['\n## Currently open tabs:']
+    sections.extend(
+        describe_tab(index, url) for index, url in enumerate(obs.open_pages_urls)
+    )
+    return '\n'.join(sections) + '\n'
+
+
+def get_axtree(axtree_txt: str) -> str:
+    """Wrap a flattened accessibility tree in its prompt section.
+
+    Parameters:
+    - axtree_txt (str): the flattened accessibility tree text
+
+    Returns:
+    - A '## AXTree:' section made of two explanatory notes (about bids and
+      about the "visible" tag) followed by the tree text.
+    """
+    notes = (
+        'Note: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions.\n\n',
+        'Note: You can only interact with visible elements. If the "visible" tag is not present, the element is not visible on the page.\n\n',
+    )
+    preamble = ''.join(notes)
+    return f'\n## AXTree:\n{preamble}{axtree_txt}\n'
+
+
+def get_action_prompt(action_set: HighLevelActionSet) -> str:
+    """Render the '# Action space:' section of the prompt.
+
+    Parameters:
+    - action_set (HighLevelActionSet): its `describe()` output (requested
+      without long descriptions and without examples) is embedded verbatim
+
+    Returns:
+    - The section text: a generic note about the action set followed by the
+      action descriptions.
+    """
+    description = action_set.describe(with_long_description=False, with_examples=False)
+    preface = """\
+Note: This action set allows you to interact with your environment. Most of them are python function executing playwright code. The primary way of referring to elements in the page is through bid which are specified in your observations.
+
+"""
+    return f'# Action space:\n{preface}{description}\n'
+
+
+def get_history_prompt(prev_actions: list[BrowseInteractiveAction]) -> str:
+    """Render the history of previous browsing steps as a prompt section.
+
+    Parameters:
+    - prev_actions (list[BrowseInteractiveAction]): earlier actions, oldest
+      first; each contributes its `thought` and `browser_actions`
+
+    Returns:
+    - A '# History ...' section with one numbered '## step N' entry per
+      action (only the header when the list is empty).
+    """
+    history_prompt = ['# History of all previous interactions with the task:\n']
+    for step_number, action in enumerate(prev_actions, start=1):
+        history_prompt.append(f'## step {step_number}')
+        # "Output" was misspelled "Ouput" in the text sent to the LLM.
+        history_prompt.append(
+            f'\nOutput thought and action: {action.thought} ```{action.browser_actions}```\n'
+        )
+    return '\n'.join(history_prompt) + '\n'
+
+
+class VisualBrowsingAgent(Agent):
+    """VisualBrowsing Agent that uses webpage screenshots during browsing.
+
+    On every step the agent builds a single user prompt from the goal, the
+    latest browser observation (open tabs, accessibility tree, focused
+    element, last error and, when present, the set-of-marks screenshot), the
+    history of previous actions and the action-space description, then asks
+    the LLM for the next BrowserGym action.
+    """
+
+    VERSION = '1.0'
+
+    sandbox_plugins: list[PluginRequirement] = []
+    response_parser = BrowsingResponseParser()
+
+    def __init__(
+        self,
+        llm: LLM,
+        config: AgentConfig,
+    ) -> None:
+        """Initializes a new instance of the VisualBrowsingAgent class.
+
+        Parameters:
+        - llm (LLM): The llm to be used by this agent
+        - config (AgentConfig): The agent configuration, forwarded to the base class
+        """
+        super().__init__(llm, config)
+        # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
+        # see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
+        action_subsets = [
+            'chat',
+            'bid',
+            'nav',
+            'tab',
+            'infeas',
+        ]
+        self.action_space = HighLevelActionSet(
+            subsets=action_subsets,
+            strict=False,  # less strict on the parsing of the actions
+            multiaction=False,
+        )
+        # The prompt fragments below do not depend on the state, so they are
+        # built once here and reused by every step.
+        self.action_prompt = get_action_prompt(self.action_space)
+        self.abstract_example = f"""
+# Abstract Example
+
+Here is an abstract version of the answer with description of the content of each tag. Make sure you follow this structure, but replace the content with your answer:
+
+You must mandatorily think step by step. If you need to make calculations such as coordinates, write them here. Describe the effect that your previous action had on the current content of the page. In summary the next action I will perform is ```{self.action_space.example_action(abstract=True)}```
+"""
+        self.concrete_example = """
+# Concrete Example
+
+Here is a concrete example of how to format your answer. Make sure to generate the action in the correct format ensuring that the action is present inside ``````:
+
+Let's think step-by-step. From previous action I tried to set the value of year to "2022", using select_option, but it doesn't appear to be in the form. It may be a dynamic dropdown, I will try using click with the bid "324" and look at the response from the page. In summary the next action I will perform is ```click('324')```
+"""
+        self.hints = """
+Note:
+* Make sure to use bid to identify elements when using commands.
+* Interacting with combobox, dropdowns and auto-complete fields can be tricky, sometimes you need to use select_option, while other times you need to use fill or click and wait for the reaction of the page.
+
+"""
+        self.reset()
+
+    def reset(self) -> None:
+        """Resets the VisualBrowsingAgent, including its cost and error counters."""
+        super().reset()
+        self.cost_accumulator = 0
+        self.error_accumulator = 0
+
+    def step(self, state: State) -> Action:
+        """Performs one step using the VisualBrowsingAgent.
+
+        This includes gathering information on previous steps and prompting the model to make a browsing command to execute.
+
+        Parameters:
+        - state (State): used to get updated info
+
+        Returns:
+        - BrowseInteractiveAction(browsergym_command) - BrowserGym commands to run
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - AgentFinishAction() - end the interaction
+        """
+        messages: list[Message] = []
+        prev_actions = []
+        cur_axtree_txt = ''
+        error_prefix = ''
+        focused_element = ''
+        tabs = ''
+        # Defaults to "no screenshot" so the prompt can still be built when the
+        # last observation is not a BrowserOutputObservation; previously the
+        # name was unbound in that case and building the prompt raised
+        # UnboundLocalError.
+        set_of_marks: str | None = None
+        last_obs = None
+        last_action = None
+
+        if len(state.history) == 1:
+            # for visualwebarena, webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
+            # initialize and retrieve the first observation by issuing an noop OP
+            # For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
+            return BrowseInteractiveAction(browser_actions='noop(1000)')
+
+        for event in state.history:
+            if isinstance(event, BrowseInteractiveAction):
+                prev_actions.append(event)
+                last_action = event
+            elif isinstance(event, MessageAction) and event.source == EventSource.AGENT:
+                # agent has responded, task finished.
+                return AgentFinishAction(outputs={'content': event.content})
+            elif isinstance(event, Observation):
+                last_obs = event
+
+        if len(prev_actions) >= 1:  # ignore noop()
+            prev_actions = prev_actions[1:]  # remove the first noop action
+
+        # if the final BrowserInteractiveAction exec BrowserGym's send_msg_to_user,
+        # we should also send a message back to the user in OpenHands and call it a day
+        if (
+            isinstance(last_action, BrowseInteractiveAction)
+            and last_action.browsergym_send_msg_to_user
+        ):
+            return MessageAction(last_action.browsergym_send_msg_to_user)
+
+        history_prompt = get_history_prompt(prev_actions)
+        if isinstance(last_obs, BrowserOutputObservation):
+            if last_obs.error:
+                # add error recovery prompt prefix
+                error_prefix = get_error_prefix(last_obs)
+                # Only errors that are actually reported (non-empty prefix)
+                # count towards the failure budget.
+                if len(error_prefix) > 0:
+                    self.error_accumulator += 1
+                    if self.error_accumulator > 5:
+                        return MessageAction(
+                            'Too many errors encountered. Task failed.'
+                        )
+            focused_element = '## Focused element:\nNone\n'
+            if last_obs.focused_element_bid is not None:
+                focused_element = (
+                    f"## Focused element:\nbid='{last_obs.focused_element_bid}'\n"
+                )
+            tabs = get_tabs(last_obs)
+            try:
+                # IMPORTANT: keep AX Tree of full webpage, add visible and clickable tags
+                cur_axtree_txt = flatten_axtree_to_str(
+                    last_obs.axtree_object,
+                    extra_properties=last_obs.extra_element_properties,
+                    with_visible=True,
+                    with_clickable=True,
+                    with_center_coords=False,
+                    with_bounding_box_coords=False,
+                    filter_visible_only=False,
+                    filter_with_bid_only=False,
+                    filter_som_only=False,
+                )
+                cur_axtree_txt = get_axtree(axtree_txt=cur_axtree_txt)
+            except Exception as e:
+                logger.error(
+                    'Error when trying to process the accessibility tree: %s', e
+                )
+                return MessageAction('Error encountered when browsing.')
+            set_of_marks = last_obs.set_of_marks
+        goal, image_urls = state.get_current_user_intent()
+
+        if goal is None:
+            goal = state.inputs['task']
+        goal_txt, goal_images = create_goal_prompt(goal, image_urls)
+        observation_txt, som_screenshot = create_observation_prompt(
+            cur_axtree_txt, tabs, focused_element, error_prefix, set_of_marks
+        )
+        # The user message interleaves text and images: goal text, goal images,
+        # observation text, screenshot, then the remaining static instructions.
+        human_prompt = [TextContent(type='text', text=goal_txt)]
+        if len(goal_images) > 0:
+            human_prompt.append(ImageContent(image_urls=goal_images))
+        human_prompt.append(TextContent(type='text', text=observation_txt))
+        if som_screenshot is not None:
+            human_prompt.append(ImageContent(image_urls=[som_screenshot]))
+        remaining_content = f"""
+{history_prompt}\
+{self.action_prompt}\
+{self.hints}\
+{self.abstract_example}\
+{self.concrete_example}\
+"""
+        human_prompt.append(TextContent(type='text', text=remaining_content))
+
+        system_msg = """\
+You are an agent trying to solve a web task based on the content of the page and user instructions. You can interact with the page and explore, and send messages to the user when you finish the task. Each time you submit an action it will be sent to the browser and you will receive a new page.
+""".strip()
+
+        messages.append(Message(role='system', content=[TextContent(text=system_msg)]))
+        messages.append(Message(role='user', content=human_prompt))
+
+        flat_messages = self.llm.format_messages_for_llm(messages)
+
+        # The stop sequences end generation right after the closing parenthesis
+        # of the action call inside the ``` fence.
+        response = self.llm.completion(
+            messages=flat_messages,
+            temperature=0.0,
+            stop=[')```', ')\n```'],
+        )
+
+        return self.response_parser.parse(response)
@@ -12,6 +12,7 @@ from litellm.exceptions import (
 )

 from openhands.controller.agent import Agent
+from openhands.controller.replay import ReplayManager
 from openhands.controller.state.state import State, TrafficControlState
 from openhands.controller.stuck import StuckDetector
 from openhands.core.config import AgentConfig, LLMConfig
@@ -90,6 +91,7 @@ class AgentController:
        is_delegate: bool = False,
        headless_mode: bool = True,
        status_callback: Callable | None = None,
+        replay_events: list[Event] | None = None,
    ):
        """Initializes a new instance of the AgentController class.

@@ -108,6 +110,7 @@ class AgentController:
            is_delegate: Whether this controller is a delegate.
            headless_mode: Whether the agent is run in headless mode.
            status_callback: Optional callback function to handle status updates.
+            replay_events: A list of logs to replay.
        """
        self.id = sid
        self.agent = agent
@@ -139,6 +142,9 @@ class AgentController:
        self._stuck_detector = StuckDetector(self.state)
        self.status_callback = status_callback

+        # replay-related
+        self._replay_manager = ReplayManager(replay_events)
+
    async def close(self) -> None:
        """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream.

@@ -234,6 +240,11 @@ class AgentController:
            await self._react_to_exception(reported)

    def should_step(self, event: Event) -> bool:
+        """
+        Whether the agent should take a step based on an event. In general,
+        the agent should take a step if it receives a message from the user,
+        or observes something in the environment (after acting).
+        """
        # it might be the delegate's day in the sun
        if self.delegate is not None:
            return False
@@ -490,10 +501,6 @@ class AgentController:
            EventSource.ENVIRONMENT,
        )

-        if new_state == AgentState.INIT and self.state.resume_state:
-            await self.set_agent_state_to(self.state.resume_state)
-            self.state.resume_state = None
-
    def get_agent_state(self) -> AgentState:
        """Returns the current state of the agent.

@@ -641,42 +648,50 @@ class AgentController:

        self.update_state_before_step()
        action: Action = NullAction()
-        try:
-            action = self.agent.step(self.state)
-            if action is None:
-                raise LLMNoActionError('No action was returned')
-        except (
-            LLMMalformedActionError,
-            LLMNoActionError,
-            LLMResponseError,
-            FunctionCallValidationError,
-            FunctionCallNotExistsError,
-        ) as e:
-            self.event_stream.add_event(
-                ErrorObservation(
-                    content=str(e),
-                ),
-                EventSource.AGENT,
-            )
-            return
-        except (ContextWindowExceededError, BadRequestError) as e:
-            # FIXME: this is a hack until a litellm fix is confirmed
-            # Check if this is a nested context window error
-            error_str = str(e).lower()
-            if (
-                'contextwindowexceedederror' in error_str
-                or 'prompt is too long' in error_str
-                or isinstance(e, ContextWindowExceededError)
-            ):
-                # When context window is exceeded, keep roughly half of agent interactions
-                self.state.history = self._apply_conversation_window(self.state.history)

-                # Save the ID of the first event in our truncated history for future reloading
-                if self.state.history:
-                    self.state.start_id = self.state.history[0].id
-                # Don't add error event - let the agent retry with reduced context
+        if self._replay_manager.should_replay():
+            # in replay mode, we don't let the agent to proceed
+            # instead, we replay the action from the replay trajectory
+            action = self._replay_manager.step()
+        else:
+            try:
+                action = self.agent.step(self.state)
+                if action is None:
+                    raise LLMNoActionError('No action was returned')
+            except (
+                LLMMalformedActionError,
+                LLMNoActionError,
+                LLMResponseError,
+                FunctionCallValidationError,
+                FunctionCallNotExistsError,
+            ) as e:
+                self.event_stream.add_event(
+                    ErrorObservation(
+                        content=str(e),
+                    ),
+                    EventSource.AGENT,
+                )
                return
-            raise
+            except (ContextWindowExceededError, BadRequestError) as e:
+                # FIXME: this is a hack until a litellm fix is confirmed
+                # Check if this is a nested context window error
+                error_str = str(e).lower()
+                if (
+                    'contextwindowexceedederror' in error_str
+                    or 'prompt is too long' in error_str
+                    or isinstance(e, ContextWindowExceededError)
+                ):
+                    # When context window is exceeded, keep roughly half of agent interactions
+                    self.state.history = self._apply_conversation_window(
+                        self.state.history
+                    )
+
+                    # Save the ID of the first event in our truncated history for future reloading
+                    if self.state.history:
+                        self.state.start_id = self.state.history[0].id
+                    # Don't add error event - let the agent retry with reduced context
+                    return
+                raise

        if action.runnable:
            if self.state.confirmation_mode and (
@@ -0,0 +1,52 @@
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action.action import Action
+from openhands.events.event import Event, EventSource
+
+
+class ReplayManager:
+    """Drives the replay of a previously recorded trajectory.
+
+    The manager holds a list of events and a cursor into it. Only actions
+    whose source is not the user are replayed; every other event (messages
+    from the user, observations) is skipped. Replaying may give unexpected or
+    even erroneous results if any action is non-deterministic, or if the
+    state before the replay differs from the initial state of the recorded
+    trajectory.
+    """
+
+    def __init__(self, replay_events: list[Event] | None):
+        self.replay_index = 0
+        self.replay_events = replay_events
+        # Replay mode is on only for a non-empty list of events.
+        self.replay_mode = bool(replay_events)
+        if replay_events:
+            logger.info(f'Replay logs loaded, events length = {len(replay_events)}')
+
+    def _replayable(self) -> bool:
+        """Whether the event under the cursor is an action not issued by the user."""
+        events = self.replay_events
+        if events is None or self.replay_index >= len(events):
+            return False
+        candidate = events[self.replay_index]
+        return isinstance(candidate, Action) and candidate.source != EventSource.USER
+
+    def should_replay(self) -> bool:
+        """
+        Whether the controller is in trajectory replay mode and there is still
+        an action left to replay. Note: once the replay is finished, the user
+        and the agent can continue to message/act.
+
+        As a side effect, this method advances "replay_index" past events that
+        cannot be replayed, stopping at the next replayable action (if any).
+        """
+        if not self.replay_mode:
+            return False
+
+        assert self.replay_events is not None
+        total = len(self.replay_events)
+        while self.replay_index < total and not self._replayable():
+            self.replay_index += 1
+
+        return self._replayable()
+
+    def step(self) -> Action:
+        """Return the action under the cursor and move the cursor past it."""
+        assert self.replay_events is not None
+        current = self.replay_events[self.replay_index]
+        assert isinstance(current, Action)
+        self.replay_index += 1
+        return current
@@ -27,6 +27,6 @@ class AgentConfig(BaseModel):
    memory_enabled: bool = Field(default=False)
    memory_max_threads: int = Field(default=3)
    llm_config: str | None = Field(default=None)
-    enable_prompt_extensions: bool = Field(default=False)
+    enable_prompt_extensions: bool = Field(default=True)
    disabled_microagents: list[str] | None = Field(default=None)
    condenser: CondenserConfig = Field(default_factory=NoOpCondenserConfig)
@@ -28,6 +28,7 @@ class AppConfig(BaseModel):
        file_store: Type of file store to use.
        file_store_path: Path to the file store.
        save_trajectory_path: Either a folder path to store trajectories with auto-generated filenames, or a designated trajectory file path.
+        replay_trajectory_path: Path to load trajectory and replay. If provided, trajectory would be replayed first before user's instruction.
        workspace_base: Base path for the workspace. Defaults to `./workspace` as absolute path.
        workspace_mount_path: Path to mount the workspace. Defaults to `workspace_base`.
        workspace_mount_path_in_sandbox: Path to mount the workspace in sandbox. Defaults to `/workspace`.
@@ -55,6 +56,7 @@ class AppConfig(BaseModel):
    file_store: str = Field(default='local')
    file_store_path: str = Field(default='/tmp/openhands_file_store')
    save_trajectory_path: str | None = Field(default=None)
+    replay_trajectory_path: str | None = Field(default=None)
    workspace_base: str | None = Field(default=None)
    workspace_mount_path: str | None = Field(default=None)
    workspace_mount_path_in_sandbox: str = Field(default='/workspace')
@@ -1,8 +1,8 @@
 from __future__ import annotations

 import os
-
 from typing import Any
+
 from pydantic import BaseModel, Field, SecretStr

 from openhands.core.logger import LOG_DIR
@@ -39,12 +39,12 @@ class LLMConfig(BaseModel):
        drop_params: Drop any unmapped (unsupported) params without causing an exception.
        modify_params: Modify params allows litellm to do transformations like adding a default message, when a message is empty.
        disable_vision: If model is vision capable, this option allows to disable image processing (useful for cost reduction).
-        reasoning_effort: The effort to put into reasoning. This is a string that can be one of 'low', 'medium', 'high', or 'none'. Exclusive for o1 models.
        caching_prompt: Use the prompt caching feature if provided by the LLM and supported by the provider.
        log_completions: Whether to log LLM completions to the state.
        log_completions_folder: The folder to log LLM completions to. Required if log_completions is True.
        custom_tokenizer: A custom tokenizer to use for token counting.
        native_tool_calling: Whether to use native tool calling if supported by the model. Can be True, False, or not set.
+        reasoning_effort: The effort to put into reasoning. This is a string that can be one of 'low', 'medium', 'high', or 'none'. Exclusive for o1 models.
    """

    model: str = Field(default='claude-3-5-sonnet-20241022')
@@ -85,7 +85,8 @@ class LLMConfig(BaseModel):
    log_completions_folder: str = Field(default=os.path.join(LOG_DIR, 'completions'))
    custom_tokenizer: str | None = Field(default=None)
    native_tool_calling: bool | None = Field(default=None)
-    
+    reasoning_effort: str | None = Field(default=None)
+
    model_config = {'extra': 'forbid'}

    def model_post_init(self, __context: Any):
@@ -60,7 +60,7 @@ class SandboxConfig(BaseModel):
    runtime_startup_env_vars: dict[str, str] = Field(default_factory=dict)
    browsergym_eval_env: str | None = Field(default=None)
    platform: str | None = Field(default=None)
-    close_delay: int = Field(default=900)
+    close_delay: int = Field(default=15)
    remote_runtime_resource_factor: int = Field(default=1)
    enable_gpu: bool = Field(default=False)
    docker_runtime_kwargs: str | None = Field(default=None)
@@ -9,7 +9,7 @@ from uuid import uuid4

 import toml
 from dotenv import load_dotenv
-from pydantic import BaseModel, ValidationError
+from pydantic import BaseModel, SecretStr, ValidationError

 from openhands.core import logger
 from openhands.core.config.agent_config import AgentConfig
@@ -192,7 +192,7 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
                                    custom_fields[k] = v
                            merged_llm_dict = generic_llm_fields.copy()
                            merged_llm_dict.update(custom_fields)
-                            
+
                            custom_llm_config = LLMConfig(**merged_llm_dict)
                            cfg.set_llm_config(custom_llm_config, nested_key)

@@ -287,8 +287,10 @@ def finalize_config(cfg: AppConfig):
        pathlib.Path(cfg.cache_dir).mkdir(parents=True, exist_ok=True)

    if not cfg.jwt_secret:
-        cfg.jwt_secret = get_or_create_jwt_secret(
-            get_file_store(cfg.file_store, cfg.file_store_path)
+        cfg.jwt_secret = SecretStr(
+            get_or_create_jwt_secret(
+                get_file_store(cfg.file_store, cfg.file_store_path)
+            )
        )


@@ -2,6 +2,7 @@ import asyncio
 import json
 import os
 import sys
+from pathlib import Path
 from typing import Callable, Protocol

 import openhands.agenthub  # noqa F401 (we import this to get the agents registered)
@@ -22,10 +23,11 @@ from openhands.core.setup import (
    generate_sid,
 )
 from openhands.events import EventSource, EventStreamSubscriber
-from openhands.events.action import MessageAction
+from openhands.events.action import MessageAction, NullAction
 from openhands.events.action.action import Action
 from openhands.events.event import Event
 from openhands.events.observation import AgentStateChangedObservation
+from openhands.events.serialization import event_from_dict
 from openhands.events.serialization.event import event_to_trajectory
 from openhands.runtime.base import Runtime

@@ -101,7 +103,17 @@ async def run_controller(
    if agent is None:
        agent = create_agent(runtime, config)

-    controller, initial_state = create_controller(agent, runtime, config)
+    replay_events: list[Event] | None = None
+    if config.replay_trajectory_path:
+        logger.info('Trajectory replay is enabled')
+        assert isinstance(initial_user_action, NullAction)
+        replay_events, initial_user_action = load_replay_log(
+            config.replay_trajectory_path
+        )
+
+    controller, initial_state = create_controller(
+        agent, runtime, config, replay_events=replay_events
+    )

    assert isinstance(
        initial_user_action, Action
@@ -194,21 +206,64 @@ def auto_continue_response(
    return message


+def load_replay_log(trajectory_path: str) -> tuple[list[Event] | None, Action]:
+    """
+    Load a trajectory from the given path, deserialize it into a list of
+    events, and return two things:
+    1) A list of events except the first action
+    2) First action (user message, a.k.a. initial task)
+
+    Raises:
+        ValueError: if the path does not exist or is not a file, the content
+            is not valid JSON, the JSON is not a list, the list is empty, or
+            the first event is not a MessageAction.
+    """
+    path = Path(trajectory_path).resolve()
+
+    if not path.exists():
+        raise ValueError(f'Trajectory file not found: {path}')
+
+    if not path.is_file():
+        raise ValueError(f'Trajectory path is a directory, not a file: {path}')
+
+    try:
+        with open(path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+    except json.JSONDecodeError as e:
+        # Chain the original error so the parse location is not lost.
+        raise ValueError(f'Invalid JSON format in {trajectory_path}: {e}') from e
+
+    if not isinstance(data, list):
+        raise ValueError(f'Expected a list in {path}, got {type(data).__name__}')
+    if not data:
+        # An empty trajectory has no initial task; report it explicitly
+        # instead of failing with an IndexError below.
+        raise ValueError(f'Trajectory in {path} contains no events')
+
+    events = []
+    for item in data:
+        event = event_from_dict(item)
+        # cannot add an event with _id to event stream
+        event._id = None  # type: ignore[attr-defined]
+        events.append(event)
+
+    first_event = events[0]
+    # Explicit check rather than `assert`, which is stripped under `python -O`.
+    if not isinstance(first_event, MessageAction):
+        raise ValueError(
+            f'Expected the first event in {path} to be a MessageAction, '
+            f'got {type(first_event).__name__}'
+        )
+    return events[1:], first_event
+
+
 if __name__ == '__main__':
    args = parse_arguments()

+    config = setup_config_from_args(args)
+
    # Determine the task
+    task_str = ''
    if args.file:
        task_str = read_task_from_file(args.file)
    elif args.task:
        task_str = args.task
    elif not sys.stdin.isatty():
        task_str = read_task_from_stdin()
+
+    initial_user_action: Action = NullAction()
+    if config.replay_trajectory_path:
+        if task_str:
+            raise ValueError(
+                'User-specified task is not supported under trajectory replay mode'
+            )
+    elif task_str:
+        initial_user_action = MessageAction(content=task_str)
    else:
        raise ValueError('No task provided. Please specify a task through -t, -f.')
-    initial_user_action: MessageAction = MessageAction(content=task_str)
-
-    config = setup_config_from_args(args)

    # Set session name
    session_name = args.name
@@ -4,10 +4,6 @@ __all__ = ['ActionType']


 class ActionTypeSchema(BaseModel):
-    INIT: str = Field(default='initialize')
-    """Initializes the agent. Only sent by client.
-    """
-
    MESSAGE: str = Field(default='message')
    """Represents a message.
    """
@@ -6,10 +6,6 @@ class AgentState(str, Enum):
    """The agent is loading.
    """

-    INIT = 'init'
-    """The agent is initialized.
-    """
-
    RUNNING = 'running'
    """The agent is running.
    """
@@ -11,6 +11,7 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.events import EventStream
+from openhands.events.event import Event
 from openhands.llm.llm import LLM
 from openhands.runtime import get_runtime_cls
 from openhands.runtime.base import Runtime
@@ -78,7 +79,11 @@ def create_agent(runtime: Runtime, config: AppConfig) -> Agent:


 def create_controller(
-    agent: Agent, runtime: Runtime, config: AppConfig, headless_mode: bool = True
+    agent: Agent,
+    runtime: Runtime,
+    config: AppConfig,
+    headless_mode: bool = True,
+    replay_events: list[Event] | None = None,
 ) -> Tuple[AgentController, State | None]:
    event_stream = runtime.event_stream
    initial_state = None
@@ -101,6 +106,7 @@ def create_controller(
        initial_state=initial_state,
        headless_mode=headless_mode,
        confirmation_mode=config.security.confirmation_mode,
+        replay_events=replay_events,
    )
    return (controller, initial_state)

@@ -24,6 +24,8 @@ class FileReadSource(str, Enum):

@dataclass
 class Event:
+    INVALID_ID = -1
+
    @property
    def message(self) -> str | None:
        if hasattr(self, '_message'):
@@ -34,7 +36,7 @@ class Event:
    def id(self) -> int:
        if hasattr(self, '_id'):
            return self._id  # type: ignore[attr-defined]
-        return -1
+        return Event.INVALID_ID

    @property
    def timestamp(self):
@@ -12,9 +12,11 @@ class BrowserOutputObservation(Observation):

    url: str
    trigger_by_action: str
-    screenshot: str = field(repr=False)  # don't show in repr
+    screenshot: str = field(repr=False, default='')  # don't show in repr
+    set_of_marks: str = field(default='', repr=False)  # don't show in repr
    error: bool = False
    observation: str = ObservationType.BROWSE
+    goal_image_urls: list = field(default_factory=list)
    # do not include in the memory
    open_pages_urls: list = field(default_factory=list)
    active_page_index: int = -1
@@ -149,16 +149,18 @@ class CmdOutputObservation(Observation):
            f'**CmdOutputObservation (source={self.source}, exit code={self.exit_code}, '
            f'metadata={json.dumps(self.metadata.model_dump(), indent=2)})**\n'
            '--BEGIN AGENT OBSERVATION--\n'
-            f'{self._to_agent_observation()}\n'
+            f'{self.to_agent_observation()}\n'
            '--END AGENT OBSERVATION--'
        )

-    def _to_agent_observation(self) -> str:
+    def to_agent_observation(self) -> str:
        ret = f'{self.metadata.prefix}{self.content}{self.metadata.suffix}'
        if self.metadata.working_dir:
            ret += f'\n[Current working directory: {self.metadata.working_dir}]'
        if self.metadata.py_interpreter_path:
            ret += f'\n[Python interpreter: {self.metadata.py_interpreter_path}]'
+        if self.metadata.exit_code != -1:
+            ret += f'\n[Command finished with exit code {self.metadata.exit_code}]'
        return ret


@@ -152,6 +152,12 @@ class LLM(RetryMixin, DebugMixin):
            temperature=self.config.temperature,
            top_p=self.config.top_p,
            drop_params=self.config.drop_params,
+            # add reasoning_effort, only if the model is supported
+            **(
+                {'reasoning_effort': self.config.reasoning_effort}
+                if self.config.model.lower() in REASONING_EFFORT_SUPPORTED_MODELS
+                else {}
+            ),
        )

        self._completion_unwrapped = self._completion
@@ -217,10 +223,6 @@ class LLM(RetryMixin, DebugMixin):
                        'anthropic-beta': 'prompt-caching-2024-07-31',
                    }

-            # Set reasoning effort for models that support it
-            if self.config.model.lower() in REASONING_EFFORT_SUPPORTED_MODELS:
-                kwargs['reasoning_effort'] = self.config.reasoning_effort
-
            # set litellm modify_params to the configured value
            # True by default to allow litellm to do transformations like adding a default message, when a message is empty
            # NOTE: this setting is global; unlike drop_params, it cannot be overridden in the litellm completion partial
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
rohitvinodmalhotra@gmail.com	532a284d5c	migrate to use OH version	2025-01-26 15:24:35 -05:00
rohitvinodmalhotra@gmail.com	43f6104967	Merge branch 'main' into eval/visualcodebench	2025-01-26 15:14:28 -05:00
Ray Myers	e619929909	Log restart reason if runtime reports it (#6455 )	2025-01-25 07:20:18 +01:00
Ryan H. Tran	93753ac2e0	Upgrade openhands-aci to 0.1.9 (#6450 )	2025-01-24 19:03:00 +00:00
Robert Brennan	38e19d214d	Fix up conversation initialization (#6430 ) Co-authored-by: openhands <openhands@all-hands.dev>	2025-01-24 18:43:02 +00:00
dependabot[bot]	19a4f1c3ec	chore(deps-dev): bump llama-index from 0.12.12 to 0.12.13 in the llama group (#6448 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-01-24 16:16:53 +00:00
Rohit Malhotra	45a048f9e3	NIT: Remove unused param (#6446 )	2025-01-24 14:51:09 +00:00
sp.wack	358d9cb3f4	hotfix(frontend): Logout and clear token if retrieving user fails (#6436 )	2025-01-24 09:49:50 -05:00
Xingyao Wang	e6a2fd3fd4	feat: add prompt to prevent agent execute multiple bash command at the same time (#6428 )	2025-01-24 22:43:34 +08:00
OpenHands	c2f308f397	Fix issue #5620 : [Bug]: Resolver fails when the existing requirements.txt does not end in a newline character (#6327 )	2025-01-24 09:36:59 -05:00
Rohit Malhotra	a1f1c802d9	[Fix]: Fix bugs for target_branch param on resolver (#5745 ) Co-authored-by: openhands <openhands@all-hands.dev>	2025-01-23 21:36:20 -05:00
Xiaohua Zhang	ad2237d7dd	feat: vscode support for modal runtime (#6442 ) Co-authored-by: Xiaohua Zhang <xiaohua.dev@gmail.com>	2025-01-24 01:39:07 +00:00
Xiaohua Zhang	aa0cd51967	fix(frontend): display confirmation buttons for explandable messages (#6426 ) Co-authored-by: Xiaohua Zhang <xiaohua.dev@gmail.com>	2025-01-23 20:14:52 -05:00
Graham Neubig	081a1305f0	Fix resolver linting issues (#6401 ) Co-authored-by: openhands <openhands@all-hands.dev>	2025-01-23 18:21:11 -05:00
Xiaohua Zhang	9912e28576	chore: update config template to use docker runtime by default (#6435 ) Co-authored-by: Xiaohua Zhang <xiaohua.dev@gmail.com>	2025-01-23 22:24:00 +00:00
tofarr	b19a33ccad	Fix: Filtering conversations with no created at (#6414 )	2025-01-23 15:09:57 -07:00
tofarr	21e912d6fb	Feat remove redis (#6278 ) Co-authored-by: openhands <openhands@all-hands.dev>	2025-01-23 14:33:16 -07:00
Robert Brennan	0dd9b95dbe	change message to connecting (#6433 )	2025-01-23 20:42:41 +00:00
Aditya Bharat Soni	aebb583779	Support for VisualWebArena evaluation in OpenHands (#4773 ) Co-authored-by: Xingyao Wang <xingyao@all-hands.dev> Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Graham Neubig <neubig@gmail.com>	2025-01-23 20:18:30 +00:00
chuckbutkus	2ff9ba1229	AWS necessary changes only (#6375 ) Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>	2025-01-23 13:10:11 -05:00
Michael Jewell	a7e6068ba8	build: add required dependencies to package.json (#6423 )	2025-01-23 10:07:12 -05:00
dependabot[bot]	24adcee9e3	chore(deps-dev): bump the llama group with 2 updates (#6411 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-01-23 14:54:27 +00:00
tofarr	21d4ba0bbd	Feat: Stop runtimes rather than delete them (#6403 ) Co-authored-by: openhands <openhands@all-hands.dev>	2025-01-23 07:43:02 -07:00
tofarr	5ba9a6d321	Feat: Better mechanism for attaching middleware (#6365 )	2025-01-23 07:31:43 -07:00
tofarr	aa223734d4	One more SecretStr fix (#6419 )	2025-01-22 18:21:14 -07:00
sp.wack	053723a4d4	fix(frontend): Refetch conversations when toggling the conversation panel (#6190 )	2025-01-22 18:19:01 +00:00
mamoodi	5a6dbac5a3	Release 0.21.0 (#6392 ) Co-authored-by: Calvin Smith <email@cjsmith.io> Co-authored-by: Calvin Smith <calvin@all-hands.dev> Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>	2025-01-22 11:26:12 -05:00
Robert Brennan	93d74e9b41	make export button more stylistically consistent (#6412 )	2025-01-22 11:18:43 -05:00
tofarr	1337d03816	Example usage of httpx (#6325 )	2025-01-22 16:06:43 +00:00
Robert Brennan	04e36df4d7	remove dead code (#6386 )	2025-01-22 10:26:59 -05:00
Boxuan Li	f9ba16b648	Edit tool prompt tweaking: only plain-text format is supported (#6067 ) Co-authored-by: Graham Neubig <neubig@gmail.com> Co-authored-by: mamoodi <mamoodiha@gmail.com>	2025-01-21 18:22:01 -08:00
Engel Nyst	f0dbb02ee1	Adjust prompt to use view command (#5506 ) Co-authored-by: openhands <openhands@all-hands.dev>	2025-01-21 23:50:39 +01:00
tofarr	318c811817	Added check to shutdown hook (#6402 )	2025-01-21 22:32:46 +00:00
Xingyao Wang	b468150f2a	fix(codeact): make sure agent sees the prefix/suffix as part of observation (#6400 )	2025-01-21 21:54:57 +00:00
Engel Nyst	b9a3f1c753	Fix eval on remote runtime (#6398 )	2025-01-21 20:49:30 +00:00
tofarr	09e8a1eeba	Fix: Keeping runtimes alive again (For now) (#6395 )	2025-01-21 19:20:35 +00:00
Xingyao Wang	ff3880c76d	fix(remote_runtime): define runtime_id first to fix attrbute error (#6393 )	2025-01-21 18:13:43 +00:00
Calvin Smith	8bd7613724	fix: Settings modal properly tracks if an API key is set (#6394 ) Co-authored-by: Calvin Smith <calvin@all-hands.dev>	2025-01-21 11:04:30 -07:00
Engel Nyst	5b7fcfbe1a	Disable prompt extensions in SWE-bench (#6391 )	2025-01-21 17:18:30 +00:00
Robert Brennan	8ae36481df	Fix API key again (#6390 )	2025-01-21 17:00:59 +00:00
Robert Brennan	25fdb0c3bf	fix api key value (#6388 )	2025-01-21 16:15:28 +00:00
louria	7f57dbebda	Update MiniWoB README (#6385 )	2025-01-21 16:26:47 +01:00
dependabot[bot]	54589d7e83	chore(deps-dev): bump pre-commit from 4.0.1 to 4.1.0 in the pre-commit group (#6384 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-01-21 15:10:20 +00:00
Boxuan Li	b7f34c3f8d	(feat) Add button to export trajectory on chat panel (#6378 )	2025-01-21 22:10:00 +08:00
dependabot[bot]	210eeee94a	chore(deps-dev): bump the eslint group in /frontend with 2 updates (#6358 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-01-21 13:46:56 +04:00
Robert Brennan	509892cf0e	Revert changes to config defaults (#6370 )	2025-01-21 04:23:21 +01:00
Engel Nyst	89963e93d8	Re-add reasoning effort (#6371 )	2025-01-21 04:22:48 +01:00
tofarr	b6804f9e1e	Fix: Static assets should not have the same rate limit (#6360 ) Co-authored-by: Robert Brennan <accounts@rbren.io> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>	2025-01-20 21:55:49 +00:00
mamoodi	d30211da18	Update running OpenHands guide with detailed prerequisites (#6366 )	2025-01-20 13:53:14 -05:00
Boxuan Li	06121bf20f	chore(deps): Revert vite upgrade (#6349 )	2025-01-20 19:11:32 +01:00
tofarr	541a445dfc	Fix: API meta for OpenHands (#6295 )	2025-01-20 09:47:57 -07:00
dependabot[bot]	03e496fb60	chore(deps): bump the version-all group with 7 updates (#6359 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-01-20 17:04:22 +01:00
Xingyao Wang	1b6e444ecb	feat(remote runtime): do not resume runtime if not keep_runtime_alive (#6355 ) Co-authored-by: Robert Brennan <accounts@rbren.io>	2025-01-19 21:42:00 +00:00
Xingyao Wang	2b04ee2e62	feat(eval): reliability improvement for SWE-Bench eval_infer (#6347 )	2025-01-18 14:02:59 -05:00
Boxuan Li	4383be1ab4	(feat) Add trajectory replay for headless mode (#6215 )	2025-01-18 05:48:22 +00:00
tofarr	b4d20e3e18	Feat: settings default (#6328 ) Co-authored-by: Engel Nyst <enyst@users.noreply.github.com> Co-authored-by: openhands <openhands@all-hands.dev>	2025-01-17 20:17:18 -07:00
mamoodi	532c7cdf02	Attempt to fix doc deploy (#6337 )	2025-01-18 00:16:47 +00:00
mamoodi	987861b5e7	Remove broken browser counter logic (#6334 ) Co-authored-by: openhands <openhands@all-hands.dev>	2025-01-17 22:41:31 +00:00
Calvin Smith	f07ec7a09c	fix: Conversation creation accessing secret without unwrapping (#6335 ) Co-authored-by: Calvin Smith <calvin@all-hands.dev>	2025-01-17 22:16:57 +00:00
Xingyao Wang	b1fa6301f0	feat: add prompt for generating repo.md for an arbiratry repo (#6034 ) Co-authored-by: Graham Neubig <neubig@gmail.com>	2025-01-17 21:47:27 +00:00
openhands	e249b920ff	feat: adapt Design2Code block detection for in-memory evaluation	2024-11-30 19:28:22 +00:00
rohitvinodmalhotra@gmail.com	d920a69f69	adding back server code	2024-11-30 14:00:25 -05:00
openhands	a8ce888981	refactor: adapt Design2Code evaluation metrics	2024-11-30 17:17:05 +00:00
rohitvinodmalhotra@gmail.com	e22ddc0dd6	uncomment agent run	2024-11-26 17:00:07 -05:00
rohitvinodmalhotra@gmail.com	c370912f12	adding eval scripts	2024-11-26 16:57:19 -05:00
				`@@ -0,0 +1 @@`
				`<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-download"><path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/><polyline points="7 10 12 15 17 10"/><line x1="12" x2="12" y1="15" y2="3"/></svg>`