random changes to agent

Simplify makefile (#4591)
fix(frontend): Record events sent to WS (#4596)
2026-04-29 03:00:45 -04:00 · 2024-10-28 16:19:15 -04:00 · 2024-10-28 13:10:32 -04:00 · 2024-10-28 15:53:31 +00:00 · 2024-10-28 17:26:28 +04:00 · 2024-10-28 16:42:17 +04:00
459 changed files with 18261 additions and 38570 deletions
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -1,21 +1,35 @@
-# To get started with Dependabot version updates, you'll need to specify which
-# package ecosystems to update and where the package manifests are located.
-# Please see the documentation for all configuration options:
-# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
-
 version: 2
 updates:
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "daily"
-    open-pull-requests-limit: 20
+    open-pull-requests-limit: 1
+    groups:
+      # put packages in their own group if they have a history of breaking the build or needing to be reverted
+      pre-commit:
+        patterns:
+          - "pre-commit"
+      llama:
+        patterns:
+          - "llama*"
+      chromadb:
+        patterns:
+          - "chromadb"
+      security-all:
+        applies-to: "security-updates"
+        patterns:
+          - "*"
+      version-all:
+        applies-to: "version-updates"
+        patterns:
+          - "*"

  - package-ecosystem: "npm"
    directory: "/frontend"
    schedule:
      interval: "daily"
-    open-pull-requests-limit: 20
+    open-pull-requests-limit: 1
    groups:
      docusaurus:
        patterns:
@@ -23,12 +37,21 @@ updates:
      eslint:
        patterns:
          - "*eslint*"
+      security-all:
+        applies-to: "security-updates"
+        patterns:
+          - "*"
+      version-all:
+        applies-to: "version-updates"
+        patterns:
+          - "*"

  - package-ecosystem: "npm"
    directory: "/docs"
    schedule:
-      interval: "daily"
-    open-pull-requests-limit: 20
+      interval: "weekly"
+      day: "wednesday"
+    open-pull-requests-limit: 1
    groups:
      docusaurus:
        patterns:
@@ -36,3 +59,11 @@ updates:
      eslint:
        patterns:
          - "*eslint*"
+      security-all:
+        applies-to: "security-updates"
+        patterns:
+          - "*"
+      version-all:
+        applies-to: "version-updates"
+        patterns:
+          - "*"
--- a/.github/workflows/ghcr-build.yml
+++ b/.github/workflows/ghcr-build.yml
@@ -88,14 +88,6 @@ jobs:
          hash_from_app_image=$(cat docker-outputs.txt | grep "Hash for docker build directory" | awk -F "): " '{print $2}' | uniq | head -n1)
          echo "hash_from_app_image=$hash_from_app_image" >> $GITHUB_OUTPUT
          echo "Hash from app image: $hash_from_app_image"
-      # This test should move when we have a test suite for the app image
-      - name: Test docker in App Image
-        run: |
-          # Lowercase the repository owner
-          export REPO_OWNER=${{ github.repository_owner }}
-          REPO_OWNER=$(echo $REPO_OWNER | tr '[:upper:]' '[:lower:]')
-
-          docker run -e SANDBOX_USER_ID=0 -v /var/run/docker.sock:/var/run/docker.sock ghcr.io/${REPO_OWNER}/openhands:${{ env.RELEVANT_SHA }} /bin/bash -c "docker run hello-world"

  # Builds the runtime Docker images
  ghcr_build_runtime:
@@ -384,78 +376,6 @@ jobs:
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

-  # Run integration tests with the eventstream runtime Docker image
-  runtime_integration_tests_on_linux:
-    name: RT Integration Tests (Linux)
-    runs-on: ubuntu-latest
-    needs: [ghcr_build_runtime]
-    strategy:
-      fail-fast: false
-      matrix:
-        base_image: ['nikolaik']
-    steps:
-      - uses: actions/checkout@v4
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: true
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: false
-          swap-storage: true
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3
-      # Forked repos can't push to GHCR, so we need to download the image as an artifact
-      - name: Download runtime image for fork
-        if: github.event.pull_request.head.repo.fork
-        uses: actions/download-artifact@v4
-        with:
-          name: runtime-${{ matrix.base_image }}
-          path: /tmp
-      - name: Load runtime image for fork
-        if: github.event.pull_request.head.repo.fork
-        run: |
-          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
-      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cache/pypoetry
-            ~/.virtualenvs
-          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-poetry-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-      - name: Install poetry via pipx
-        run: pipx install poetry
-      - name: Install Python dependencies using Poetry
-        run: make install-python-dependencies
-      - name: Run integration tests
-        run: |
-          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
-          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
-
-          TEST_RUNTIME=eventstream \
-          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
-          TEST_IN_CI=true \
-          TEST_ONLY=true \
-          ./tests/integration/regenerate.sh
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v4
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-
  # The two following jobs (named identically) are to check whether all the runtime tests have passed as the
  # "All Runtime Tests Passed" is a required job for PRs to merge
  # Due to this bug: https://github.com/actions/runner/issues/2566, we want to create a job that runs when the
@@ -464,7 +384,7 @@ jobs:
    name: All Runtime Tests Passed
    if: ${{ !cancelled() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux, verify_hash_equivalence_in_runtime_and_app]
+    needs: [test_runtime_root, test_runtime_oh, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: All tests passed
        run: echo "All runtime tests have passed successfully!"
@@ -473,7 +393,7 @@ jobs:
    name: All Runtime Tests Passed
    if: ${{ cancelled() || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux, verify_hash_equivalence_in_runtime_and_app]
+    needs: [test_runtime_root, test_runtime_oh, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: Some tests failed
        run: |
--- a/.github/workflows/py-unit-tests-mac.yml
+++ b/.github/workflows/py-unit-tests-mac.yml
@@ -0,0 +1,96 @@
+# Workflow that runs python unit tests on mac
+name: Run Python Unit Tests Mac
+
+# This job is flaky so only run it nightly
+on:
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  # Run python unit tests on macOS
+  test-on-macos:
+    name: Python Unit Tests on macOS
+    runs-on: macos-14
+    env:
+      INSTALL_DOCKER: '1' # Set to '0' to skip Docker installation
+    strategy:
+      matrix:
+        python-version: ['3.12']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.virtualenvs
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Install Python dependencies using Poetry
+        run: poetry install --without evaluation,llama-index
+      - name: Install & Start Docker
+        if: env.INSTALL_DOCKER == '1'
+        run: |
+          INSTANCE_NAME="colima-${GITHUB_RUN_ID}"
+
+          # Uninstall colima to upgrade to the latest version
+          if brew list colima &>/dev/null; then
+            brew uninstall colima
+            # also remove colima's dependency: go
+            brew uninstall go@1.21
+          fi
+          rm -rf ~/.colima ~/.lima
+          brew install --HEAD colima
+          brew install docker
+
+          start_colima() {
+            # Pick a random SSH port in the range 10000-20000 (availability is not checked)
+            RANDOM_PORT=$((RANDOM % 10001 + 10000))
+
+            # Start Colima on that port; return non-zero on failure so the caller can retry
+            if ! colima start --network-address --arch x86_64 --cpu=1 --memory=1 --verbose --ssh-port $RANDOM_PORT; then
+              echo "Failed to start Colima."
+              return 1
+            fi
+            return 0
+          }
+
+          # Try to start Colima up to 5 times:
+          ATTEMPT_LIMIT=5
+          for ((i=1; i<=ATTEMPT_LIMIT; i++)); do
+
+            if start_colima; then
+              echo "Colima started successfully."
+              break
+            else
+              colima stop -f
+              sleep 10
+              colima delete -f
+              if [ $i -eq $ATTEMPT_LIMIT ]; then
+                exit 1
+              fi
+              sleep 10
+            fi
+          done
+
+          # For testcontainers to find the Colima socket
+          # https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
+          sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
+      - name: Build Environment
+        run: make build
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Run Tests
+        run: poetry run pytest --forked --cov=openhands --cov-report=xml ./tests/unit --ignore=tests/unit/test_memory.py
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/py-unit-tests.yml
+++ b/.github/workflows/py-unit-tests.yml
@@ -16,94 +16,6 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  # Run python unit tests on macOS
-  test-on-macos:
-    name: Python Unit Tests on macOS
-    runs-on: macos-12
-    env:
-      INSTALL_DOCKER: '1' # Set to '0' to skip Docker installation
-    strategy:
-      matrix:
-        python-version: ['3.12']
-    steps:
-      - uses: actions/checkout@v4
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cache/pypoetry
-            ~/.virtualenvs
-          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-poetry-
-      - name: Install poetry via pipx
-        run: pipx install poetry
-      - name: Install Python dependencies using Poetry
-        run: poetry install --without evaluation,llama-index
-      - name: Install & Start Docker
-        if: env.INSTALL_DOCKER == '1'
-        run: |
-          INSTANCE_NAME="colima-${GITHUB_RUN_ID}"
-
-          # Uninstall colima to upgrade to the latest version
-          if brew list colima &>/dev/null; then
-            brew uninstall colima
-            # unlinking colima dependency: go
-            brew uninstall go@1.21
-          fi
-          rm -rf ~/.colima ~/.lima
-          brew install --HEAD colima
-          brew install docker
-
-          start_colima() {
-            # Find a free port in the range 10000-20000
-            RANDOM_PORT=$((RANDOM % 10001 + 10000))
-
-            # Original line:
-            if ! colima start --network-address --arch x86_64 --cpu=1 --memory=1 --verbose --ssh-port $RANDOM_PORT; then
-              echo "Failed to start Colima."
-              return 1
-            fi
-            return 0
-          }
-
-          # Attempt to start Colima for 5 total attempts:
-          ATTEMPT_LIMIT=5
-          for ((i=1; i<=ATTEMPT_LIMIT; i++)); do
-
-            if start_colima; then
-              echo "Colima started successfully."
-              break
-            else
-              colima stop -f
-              sleep 10
-              colima delete -f
-              if [ $i -eq $ATTEMPT_LIMIT ]; then
-                exit 1
-              fi
-              sleep 10
-            fi
-          done
-
-          # For testcontainers to find the Colima socket
-          # https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
-          sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
-      - name: Build Environment
-        run: make build
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Run Tests
-        run: poetry run pytest --forked --cov=openhands --cov-report=xml ./tests/unit --ignore=tests/unit/test_memory.py
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v4
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-
  # Run python unit tests on Linux
  test-on-linux:
    name: Python Unit Tests on Linux
--- a/.github/workflows/regenerate_integration_tests.yml
+++ b/.github/workflows/regenerate_integration_tests.yml
@@ -1,73 +0,0 @@
-
-name: Regenerate Integration Tests
-
-on:
-  workflow_dispatch:
-    inputs:
-      debug:
-        description: 'Enable debug mode'
-        type: boolean
-        default: true
-      log_to_file:
-        description: 'Enable logging to file'
-        type: boolean
-        default: true
-      force_regenerate_tests:
-        description: 'Force regeneration of tests'
-        type: boolean
-        default: false
-      force_use_llm:
-        description: 'Force use of LLM'
-        type: boolean
-        default: false
-
-jobs:
-  regenerate_integration_tests:
-    if: github.ref != 'refs/heads/main'
-    runs-on: ubuntu-latest
-
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Set up Docker Buildx
-      id: buildx
-      uses: docker/setup-buildx-action@v3
-    - name: Set up Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: "3.12"
-    - name: Cache Poetry dependencies
-      uses: actions/cache@v4
-      with:
-        path: |
-          ~/.cache/pypoetry
-          ~/.virtualenvs
-        key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-        restore-keys: |
-          ${{ runner.os }}-poetry-
-    - name: Install poetry via pipx
-      run: pipx install poetry
-    - name: Install Python dependencies using Poetry
-      run: make install-python-dependencies
-    - name: Build Environment
-      run: make build
-    - name: Regenerate integration tests
-      run: |
-        DEBUG=${{ inputs.debug }} \
-        LOG_TO_FILE=${{ inputs.log_to_file }} \
-        FORCE_REGENERATE=${{ inputs.force_regenerate_tests }} \
-        FORCE_USE_LLM=${{ inputs.force_use_llm }} \
-        ./tests/integration/regenerate.sh
-    - name: Commit changes
-      run: |
-        if git diff --quiet --exit-code; then
-          echo "No changes to commit"
-          exit 0
-        fi
-
-        git config --global user.name 'github-actions[bot]'
-        git config --global user.email 'github-actions[bot]@users.noreply.github.com'
-        git add .
-        # run it twice in case pre-commit makes changes
-        git commit -am "Regenerate integration tests" || git commit -am "Regenerate integration tests"
-        git push
--- a/.gitignore
+++ b/.gitignore
@@ -178,7 +178,6 @@ evaluation/toolqa/data
 # frontend

 # dependencies
-frontend/node_modules
 frontend/.pnp
 frontend/bun.lockb
 frontend/yarn.lock
@@ -228,3 +227,4 @@ runtime_*.tar
 containers/runtime/Dockerfile
 containers/runtime/project.tar.gz
 containers/runtime/code
+**/node_modules/
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,14 +2,6 @@

 Thanks for your interest in contributing to OpenHands! We welcome and appreciate contributions.

-## How Can I Contribute?
-
-There are many ways that you can contribute:
-
-1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
-2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
-3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that may be ones to start on.
-
 ## Understanding OpenHands's CodeBase

 To understand the codebase, please refer to the README in each module:
@@ -19,79 +11,61 @@ To understand the codebase, please refer to the README in each module:
   - [agenthub](./openhands/agenthub/README.md)
   - [server](./openhands/server/README.md)

+## Setting up your development environment

+We have a separate doc [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) that tells you how to set up a development workflow.
+
+## How can I contribute?
+
+There are many ways that you can contribute:
+
+1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
+2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
+3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that may be ones to start on.
+
+## What can I build?
+Here are a few ways you can help improve the codebase.
+
+#### UI/UX
+We're always looking to improve the look and feel of the application. If you've got a small fix
+for something that's bugging you, feel free to open up a PR that changes the `./frontend` directory.
+
+If you're looking to make a bigger change, add a new UI element, or significantly alter the style
+of the application, please open an issue first, or better, join the #frontend channel in our Slack
+to gather consensus from our design team.
+
+#### Improving the agent
+Our main agent is the CodeAct agent. You can [see its prompts here](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub/codeact_agent).
+
+Changes to these prompts, and to the underlying behavior in Python, can have a huge impact on user experience.
+You can try modifying the prompts to see how they change the behavior of the agent as you use the app
+locally, but we will need to do an end-to-end evaluation of any changes here to ensure that the agent
+is getting better over time.
+
+We use the [SWE-bench](https://www.swebench.com/) benchmark to test our agent. You can join the #evaluation
+channel in Slack to learn more.
+
+#### Adding a new agent
+You may want to experiment with building new types of agents. You can add an agent to `openhands/agenthub`
+to help expand the capabilities of OpenHands.
+
+#### Adding a new runtime
+The agent needs a place to run code and commands. When you run OpenHands on your laptop, it uses a Docker container
+to do this by default. But there are other ways of creating a sandbox for the agent.
+
+If you work for a company that provides a cloud-based runtime, you could help us add support for that runtime
+by implementing the [interface specified here](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/runtime.py).
+
+#### Testing
 When you write code, it is also good to write tests. Please navigate to the `tests` folder to see existing test suites.
 At the moment, we have two kinds of tests: `unit` and `integration`. Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.

 ## Sending Pull Requests to OpenHands

-### 1. Fork the Official Repository
-Fork the [OpenHands repository](https://github.com/All-Hands-AI/OpenHands) into your own account.
-Clone your own forked repository into your local environment:
+You'll need to fork our repository to send us a Pull Request. You can learn more
+about how to fork a GitHub repo and open a PR with your changes in [this article](https://medium.com/swlh/forks-and-pull-requests-how-to-contribute-to-github-repos-8843fac34ce8).

-```shell
-git clone git@github.com:<YOUR-USERNAME>/OpenHands.git
-```
-
-### 2. Configure Git
-
-Set the official repository as your [upstream](https://www.atlassian.com/git/tutorials/git-forks-and-upstreams) to synchronize with the latest update in the official repository.
-Add the original repository as upstream:
-
-```shell
-cd OpenHands
-git remote add upstream git@github.com:All-Hands-AI/OpenHands.git
-```
-
-Verify that the remote is set:
-
-```shell
-git remote -v
-```
-
-You should see both `origin` and `upstream` in the output.
-
-### 3. Synchronize with Official Repository
-Synchronize latest commit with official repository before coding:
-
-```shell
-git fetch upstream
-git checkout main
-git merge upstream/main
-git push origin main
-```
-
-### 4. Set up the Development Environment
-
-We have a separate doc [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) that tells you how to set up a development workflow.
-
-### 5. Write Code and Commit It
-
-Once you have done this, you can write code, test it, and commit it to a branch (replace `my_branch` with an appropriate name):
-
-```shell
-git checkout -b my_branch
-git add .
-git commit
-git push origin my_branch
-```
-
-### 6. Open a Pull Request
-
-* On GitHub, go to the page of your forked repository, and create a Pull Request:
-   - Click on `Branches`
-   - Click on the `...` beside your branch and click on `New pull request`
-   - Set `base repository` to `All-Hands-AI/OpenHands`
-   - Set `base` to `main`
-   - Click `Create pull request`
-
-The PR should appear in [OpenHands PRs](https://github.com/All-Hands-AI/OpenHands/pulls).
-
-Then the OpenHands team will review your code.
-
-## PR Rules
-
-### 1. Pull Request title
+### Pull Request title
 As described [here](https://github.com/commitizen/conventional-commit-types/blob/master/index.json), a valid PR title should begin with one of the following prefixes:

 - `feat`: A new feature
@@ -112,6 +86,9 @@ For example, a PR title could be:

 You may also check out previous PRs in the [PR list](https://github.com/All-Hands-AI/OpenHands/pulls).

-### 2. Pull Request description
+### Pull Request description
 - If your PR is small (such as a typo fix), you can go brief.
 - If it contains a lot of changes, it's better to write more details.
+
+If your changes are user-facing (e.g. a new feature in the UI, a change in behavior, or a bugfix),
+please include a short message that we can add to our changelog.
--- a/Development.md
+++ b/Development.md
@@ -5,12 +5,14 @@ Otherwise, you can clone the OpenHands project directly.

 ## Start the server for development
 ### 1. Requirements
-* Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install)  [ Ubuntu <= 22.04]
+* Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install)  [Ubuntu <= 22.04]
 * [Docker](https://docs.docker.com/engine/install/) (For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
 * [Python](https://www.python.org/downloads/) = 3.12
 * [NodeJS](https://nodejs.org/en/download/package-manager) >= 18.17.1
 * [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8
-* netcat => sudo apt-get install netcat
+* OS-specific dependencies:
+  - Ubuntu: build-essential => `sudo apt-get install build-essential`
+  - WSL: netcat => `sudo apt-get install netcat`

 Make sure you have all these dependencies installed before moving on to `make build`.

@@ -91,9 +93,6 @@ To run tests, refer to the following:
 poetry run pytest ./tests/unit/test_*.py
 ```

-#### Integration tests
-Please refer to [this README](./tests/integration/README.md) for details.
-
 ### 9. Add or update dependency
 1. Add your dependency in `pyproject.toml` or use `poetry add xxx`
 2. Update the poetry.lock file via `poetry lock --no-update`
--- a/4
+++ b/4
@@ -195,7 +195,7 @@ start-backend:
 # Start frontend
 start-frontend:
 	@echo "$(YELLOW)Starting frontend...$(RESET)"
-	@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST_PORT) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run start -- --port $(FRONTEND_PORT)
+	@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST_PORT) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run dev -- --port $(FRONTEND_PORT) --host $(BACKEND_HOST)

 # Common setup for running the app (non-callable)
 _run_setup:
@@ -214,7 +214,7 @@ _run_setup:
 run:
 	@echo "$(YELLOW)Running the app...$(RESET)"
 	@$(MAKE) -s _run_setup
-	@cd frontend && echo "$(BLUE)Starting frontend with npm...$(RESET)" && npm run start -- --port $(FRONTEND_PORT)
+	@$(MAKE) -s start-frontend
 	@echo "$(GREEN)Application started successfully.$(RESET)"

 # Run the app (in docker)
--- a/README.md
+++ b/README.md
@@ -42,10 +42,10 @@ system requirements and more information.
 ```bash
 export WORKSPACE_BASE=$(pwd)/workspace

-docker pull ghcr.io/all-hands-ai/runtime:0.9-nikolaik
+docker pull ghcr.io/all-hands-ai/runtime:0.11-nikolaik

 docker run -it --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.11-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
@@ -53,7 +53,7 @@ docker run -it --pull=always \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9
+    ghcr.io/all-hands-ai/openhands:0.11
 ```

 You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
--- a/config.template.toml
+++ b/config.template.toml
@@ -172,11 +172,9 @@ model = "gpt-4o"
 #disable_vision = true

 [llm.gpt4o-mini]
-# API key to use
 api_key = "your-api-key"
+model = "gpt-4o"

-# Model to use
-model = "gpt-4o-mini"

 #################################### Agent ###################################
 # Configuration for agents (group name starts with 'agent')
--- a/containers/app/Dockerfile
+++ b/containers/app/Dockerfile
@@ -8,7 +8,7 @@ RUN npm install -g npm@10.5.1
 RUN npm ci

 COPY ./frontend ./
-RUN npm run make-i18n && npm run build
+RUN npm run build

 FROM python:3.12.3-slim AS backend-builder

@@ -46,14 +46,6 @@ RUN mkdir -p $WORKSPACE_BASE
 RUN apt-get update -y \
    && apt-get install -y curl ssh sudo

-# Install Docker - https://docs.docker.com/engine/install/debian/
-RUN apt-get install ca-certificates curl \
-    && curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc \
-    && chmod a+r /etc/apt/keyrings/docker.asc \
-    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian bookworm stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null \
-    && apt-get update \
-    && apt install -y docker-ce
-
 # Default is 1000, but OSX is often 501
 RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs
 # Default is 60000, but we've seen up to 200000
@@ -90,7 +82,7 @@ RUN python openhands/core/download.py # No-op to download assets
 # openhands:openhands -> openhands:app
 RUN find /app \! -group app -exec chgrp app {} +

-COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/build/client ./frontend/build
+COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/build ./frontend/build
 COPY --chown=openhands:app --chmod=770 ./containers/app/entrypoint.sh /app/entrypoint.sh

 USER root
--- a/docs/modules/usage/architecture/runtime.md
+++ b/docs/modules/usage/architecture/runtime.md
@@ -21,7 +21,7 @@ The OpenHands Runtime system uses a client-server architecture implemented with
 graph TD
    A[User-provided Custom Docker Image] --> B[OpenHands Backend]
    B -->|Builds| C[OH Runtime Image]
-    C -->|Launches| D[Runtime Client]
+    C -->|Launches| D[Action Executor]
    D -->|Initializes| E[Browser]
    D -->|Initializes| F[Bash Shell]
    D -->|Initializes| G[Plugins]
@@ -49,10 +49,10 @@ graph TD
 1. User Input: The user provides a custom base Docker image
 2. Image Building: OpenHands builds a new Docker image (the "OH runtime image") based on the user-provided image. This new image includes OpenHands-specific code, primarily the "runtime client"
 3. Container Launch: When OpenHands starts, it launches a Docker container using the OH runtime image
-4. Client Initialization: The runtime client initializes inside the container, setting up necessary components like a bash shell and loading any specified plugins
-5. Communication: The OpenHands backend (`runtime.py`) communicates with the runtime client over RESTful API, sending actions and receiving observations
+4. Action Execution Server Initialization: The action execution server initializes an `ActionExecutor` inside the container, setting up necessary components like a bash shell and loading any specified plugins
+5. Communication: The OpenHands backend (`openhands/runtime/impl/eventstream/eventstream_runtime.py`) communicates with the action execution server over a RESTful API, sending actions and receiving observations
 6. Action Execution: The runtime client receives actions from the backend, executes them in the sandboxed environment, and sends back observations
-7. Observation Return: The client sends execution results back to the OpenHands backend as observations
+7. Observation Return: The action execution server sends execution results back to the OpenHands backend as observations


 The role of the client:
@@ -70,74 +70,46 @@ Check out the [relevant code](https://github.com/All-Hands-AI/OpenHands/blob/mai

 ### Image Tagging System

-OpenHands uses a dual-tagging system for its runtime images to balance reproducibility with flexibility:
+OpenHands uses a dual-tagging system for its runtime images to balance reproducibility with flexibility.
+Tags may be in one of 2 formats:

-1. Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`.
-   Example: `runtime:abc123def456`
+- **Generic**: `oh_v{openhands_version}_{16_digit_lock_hash}` (e.g.: `oh_v0.9.9_1234567890abcdef`)
+- **Specific**: `oh_v{openhands_version}_{16_digit_lock_hash}_{16_digit_source_hash}`
+  (e.g.: `oh_v0.9.9_1234567890abcdef_1234567890abcdef`)

-   - This tag is based on the MD5 hash of the Docker build folder, which includes the source code (of runtime client and related dependencies) and Dockerfile
-   - Identical hash tags guarantee that the images were built with exactly the same source code and Dockerfile
-   - This ensures reproducibility; the same hash always means the same image contents
+#### Lock Hash

-2. Generic tag: `{target_image_repo}:{target_image_tag}`.
-   Example: `runtime:oh_v0.9.3_ubuntu_tag_22.04`
+This hash is built from the first 16 digits of the MD5 of:
+- The name of the base image upon which the image was built (e.g.: `nikolaik/python-nodejs:python3.12-nodejs22`)
+- The content of the `pyproject.toml` included in the image.
+- The content of the `poetry.lock` included in the image.

-   - This tag follows the format: `runtime:oh_v{OH_VERSION}_{BASE_IMAGE_NAME}_tag_{BASE_IMAGE_TAG}`
-   - It represents the latest build for a particular base image and OpenHands version combination
-   - This tag is updated whenever a new image is built from the same base image, even if the source code changes
+This effectively gives a hash for the dependencies of OpenHands independent of the source code.

-The hash-based tag ensures reproducibility, while the generic tag provides a stable reference to the latest version of a particular configuration. This dual-tagging approach allows OpenHands to efficiently manage both development and production environments.
+#### Source Hash

-### Build Process
+This is the first 16 digits of the MD5 of the directory hash for the source directory. This gives a hash
+for only the OpenHands source.

-1. Image Naming Convention:
-   - Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`.
-     Example: `runtime:abc123def456`
-   - Generic tag: `{target_image_repo}:{target_image_tag}`.
-     Example: `runtime:oh_v0.9.3_ubuntu_tag_22.04`
+#### Build Process

-2. Build Process:
-   - a. Convert the base image name to an OH runtime image name
-      Example: `ubuntu:22.04` -> `runtime:oh_v0.9.3_ubuntu_tag_22.04`
-   - b. Generate a build context (Dockerfile and OpenHands source code) and calculate its hash
-   - c. Check for an existing image with the calculated hash
-   - d. If not found, check for a recent compatible image to use as a base
-   - e. If no compatible image exists, build from scratch using the original base image
-   - f. Tag the new image with both hash-based and generic tags
+When generating an image...

-3. Image Reuse and Rebuilding Logic:
-   The system follows these steps to determine whether to build a new image or use an existing one from a user-provided (base) image (e.g., `ubuntu:22.04`):
-   - a. If an image exists with the same hash (e.g., `runtime:abc123def456`), it will be reused as is
-   - b. If the exact hash is not found, the system will try to rebuild using the latest generic image (e.g., `runtime:oh_v0.9.3_ubuntu_tag_22.04`) as a base. This saves time by leveraging existing dependencies
-   - c. If neither the hash-tagged nor the generic-tagged image is found, the system will build the image completely from scratch
+- OpenHands first checks whether an image with the same **Specific** tag exists. If there is such an image,
+  no build is performed - the existing image is used.
+- OpenHands next checks whether an image with the **Generic** tag exists. If there is such an image,
+  OpenHands builds a new image based upon it, bypassing all installation steps (like `poetry install` and
+  `apt-get`) except a final operation to copy the current source code. The new image is tagged with a
+  **Specific** tag only.
+- If neither a **Specific** nor **Generic** tag exists, a brand new image is built based upon the base
+  image (which is a slower operation). This new image is tagged with both the **Generic** and **Specific**
+  tags.

-4. Caching and Efficiency:
-   - The system attempts to reuse existing images when possible to save build time
-   - If an exact match (by hash) is found, it's used without rebuilding
-   - If a compatible image is found, it's used as a base for rebuilding, saving time on dependency installation
-
-Here's a flowchart illustrating the build process:
-
-```mermaid
-flowchart TD
-    A[Start] --> B{Convert base image name}
-    B --> |ubuntu:22.04 -> runtime:oh_v0.9.3_ubuntu_tag_22.04| C[Generate build context and hash]
-    C --> D{Check for existing image with hash}
-    D -->|Found runtime:abc123def456| E[Use existing image]
-    D -->|Not found| F{Check for runtime:oh_v0.9.3_ubuntu_tag_22.04}
-    F -->|Found| G[Rebuild based on recent image]
-    F -->|Not found| H[Build from scratch]
-    G --> I[Tag with hash and generic tags]
-    H --> I
-    E --> J[End]
-    I --> J
-```
-
-This approach ensures that:
+This dual-tagging approach allows OpenHands to efficiently manage both development and production environments.

 1. Identical source code and Dockerfile always produce the same image (via hash-based tags)
 2. The system can quickly rebuild images when minor changes occur (by leveraging recent compatible images)
-3. The generic tag (e.g., `runtime:oh_v0.9.3_ubuntu_tag_22.04`) always points to the latest build for a particular base image and OpenHands version combination
+3. The generic tag (e.g., `runtime:oh_v0.9.3_1234567890abcdef`) always points to the latest build for a particular base image and OpenHands version combination

 ## Runtime Plugin System

--- a/docs/modules/usage/how-to/cli-mode.md
+++ b/docs/modules/usage/how-to/cli-mode.md
@@ -57,7 +57,7 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9 \
+    ghcr.io/all-hands-ai/openhands:0.11 \
    python -m openhands.core.cli
 ```

--- a/docs/modules/usage/how-to/custom-sandbox-guide.md
+++ b/docs/modules/usage/how-to/custom-sandbox-guide.md
@@ -1,81 +1,64 @@
 # Custom Sandbox

-The sandbox is where the agent does its work. Instead of running commands directly on your computer
-(which could be dangerous), the agent runs them inside of a Docker container.
+The sandbox is where the agent performs its tasks. Instead of running commands directly on your computer
+(which could be risky), the agent runs them inside a Docker container.

 The default OpenHands sandbox (`python-nodejs:python3.12-nodejs22`
 from [nikolaik/python-nodejs](https://hub.docker.com/r/nikolaik/python-nodejs)) comes with some packages installed such
-as python and Node.js but your use case may need additional software installed by default.
+as python and Node.js but may need other software installed by default.

-There are two ways you can do so:
+You have two options for customization:

-1. Use an existing image from docker hub.
-2. Creating your own custom docker image and using it.
+1. Use an existing image with the required software.
+2. Create your own custom Docker image.

-If you want to take the first approach, you can skip the `Create Your Docker Image` section.
-
-## Setup
-
-Make sure you are able to run OpenHands using the [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) first.
+If you choose the first option, you can skip the `Create Your Docker Image` section.

 ## Create Your Docker Image

-To create a custom docker image, it must be debian/ubuntu based.
+To create a custom Docker image, it must be Debian based.

-For example, if we want OpenHands to have access to the `node` binary, we would use the following Dockerfile:
+For example, if you want OpenHands to have `ruby` installed, create a `Dockerfile` with the following content:

 ```dockerfile
-# Start with latest ubuntu image
-FROM ubuntu:latest
+FROM debian:latest

-# Run needed updates
-RUN apt-get update && apt-get install -y
-
-# Install node
-RUN apt-get install -y nodejs
+# Install required packages
+RUN apt-get update && apt-get install -y ruby
 ```

-Next build your docker image with the name of your choice, for example `custom_image`.
+Save this file in a folder. Then, build your Docker image (e.g., named custom-image) by navigating to the folder in
+the terminal and running:
+```bash
+docker build -t custom-image .
+```

-To do this you can create a directory and put your file inside it with the name `Dockerfile`, and inside the directory run the following command:
+This will produce a new image called `custom-image`, which will be available in Docker.
+
+> Note that in the configuration described in this document, OpenHands will run as user "openhands" inside the
+> sandbox and thus all packages installed via the Dockerfile should be available to all users on the system, not just root.
+
+## Using the Development Workflow
+
+### Setup
+
+First, ensure you can run OpenHands by following the instructions in [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
+
+### Specify the Base Sandbox Image
+
+In the `config.toml` file within the OpenHands directory, set the `sandbox_base_container_image` to the image you want to use.
+This can be an image you’ve already pulled or one you’ve built:

 ```bash
-docker build -t custom_image .
-```
-
-This will produce a new image called ```custom_image``` that will be available in Docker Engine.
-
-> Note that in the configuration described in this document, OpenHands will run as user "openhands" inside the sandbox and thus all packages installed via the docker file should be available to all users on the system, not just root.
->
-> Installing with apt-get above installs node for all users.
-
-## Specify your sandbox image in config.toml file
-
-OpenHands configuration occurs via the top-level `config.toml` file.
-
-Create a `config.toml` file in the OpenHands directory and enter these contents:
-
-```toml
 [core]
-workspace_base="./workspace"
-run_as_openhands=true
-sandbox_base_container_image="custom_image"
+...
+sandbox_base_container_image="custom-image"
 ```

-For `sandbox_base_container_image`, you can specify either:
+### Run

-1. The name of your custom image that you built in the previous step (e.g., `”custom_image”`)
-2. A pre-existing image from Docker Hub (e.g., `”node:20”` if you want a sandbox with Node.js pre-installed)
-
-## Run
 Run OpenHands by running ```make run``` in the top level directory.

-Navigate to ```localhost:3001``` and check if your desired dependencies are available.
-
-In the case of the example above, running ```node -v``` in the terminal produces ```v20.15.0```.
-
-Congratulations!
-
 ## Technical Explanation

 Please refer to [custom docker image section of the runtime documentation](https://docs.all-hands.dev/modules/usage/architecture/runtime#advanced-how-openhands-builds-and-maintains-od-runtime-images) for more details.
--- a/docs/modules/usage/how-to/evaluation-harness.md
+++ b/docs/modules/usage/how-to/evaluation-harness.md
@@ -134,9 +134,11 @@ To create an evaluation workflow for your benchmark, follow these steps:

 4. Create a function to process each instance:
   ```python
+   from openhands.utils.async_utils import call_async_from_sync
   def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
       config = get_config(instance, metadata)
       runtime = create_runtime(config)
+       call_async_from_sync(runtime.connect)
       initialize_runtime(runtime, instance)

       instruction = get_instruction(instance, metadata)
--- a/docs/modules/usage/how-to/headless-mode.md
+++ b/docs/modules/usage/how-to/headless-mode.md
@@ -51,6 +51,6 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9 \
+    ghcr.io/all-hands-ai/openhands:0.11 \
    python -m openhands.core.main -t "write a bash script that prints hi"
 ```
--- a/docs/modules/usage/installation.mdx
+++ b/docs/modules/usage/installation.mdx
@@ -14,10 +14,10 @@ existing code that you'd like to modify.
 ```bash
 export WORKSPACE_BASE=$(pwd)/workspace

-docker pull ghcr.io/all-hands-ai/runtime:0.9-nikolaik
+docker pull ghcr.io/all-hands-ai/runtime:0.11-nikolaik

 docker run -it --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.11-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
@@ -25,7 +25,7 @@ docker run -it --pull=always \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9
+    ghcr.io/all-hands-ai/openhands:0.11
 ```

 You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), or using the [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action).
--- a/docs/modules/usage/llms/azure-llms.md
+++ b/docs/modules/usage/llms/azure-llms.md
@@ -5,7 +5,7 @@ OpenHands uses LiteLLM to make calls to Azure's chat models. You can find their
 ## Azure OpenAI Configuration

 When running OpenHands, you'll need to set the following environment variable using `-e` in the
-[docker run command](/modules/usage/installation):
+[docker run command](/modules/usage/installation#start-the-app):

 ```
 LLM_API_VERSION="<api-version>"              # e.g. "2023-05-15"
@@ -37,7 +37,7 @@ OpenHands uses llama-index for embeddings. You can find their documentation on A
 ### Azure OpenAI Configuration

 When running OpenHands, set the following environment variables using `-e` in the
-[docker run command](/modules/usage/installation):
+[docker run command](/modules/usage/installation#start-the-app):

 ```
 LLM_EMBEDDING_MODEL="azureopenai"
--- a/docs/modules/usage/llms/google-llms.md
+++ b/docs/modules/usage/llms/google-llms.md
@@ -16,7 +16,7 @@ If the model is not in the list, toggle `Advanced Options`, and enter it in `Cus
 ## VertexAI - Google Cloud Platform Configs

 To use Vertex AI through Google Cloud Platform when running OpenHands, you'll need to set the following environment
-variables using `-e` in the [docker run command](/modules/usage/installation):
+variables using `-e` in the [docker run command](/modules/usage/installation#start-the-app):

 ```
 GOOGLE_APPLICATION_CREDENTIALS="<json-dump-of-gcp-service-account-json>"
--- a/docs/modules/usage/llms/llms.md
+++ b/docs/modules/usage/llms/llms.md
@@ -48,7 +48,7 @@ The following can be set in the OpenHands UI through the Settings:
 - `Base URL` (through `Advanced Settings`)

 There are some settings that may be necessary for some LLMs/providers that cannot be set through the UI. Instead, these
-can be set through environment variables passed to the [docker run command](/modules/usage/installation)
+can be set through environment variables passed to the [docker run command](/modules/usage/installation#start-the-app)
 using `-e`:

 - `LLM_API_VERSION`
--- a/docs/package-lock.json
+++ b/docs/package-lock.json
@@ -12,7 +12,7 @@
        "@docusaurus/plugin-content-pages": "^3.5.2",
        "@docusaurus/preset-classic": "^3.5.2",
        "@docusaurus/theme-mermaid": "^3.5.2",
-        "@mdx-js/react": "^3.0.0",
+        "@mdx-js/react": "^3.1.0",
        "clsx": "^2.0.0",
        "prism-react-renderer": "^2.4.0",
        "react": "^18.3.1",
@@ -2883,9 +2883,9 @@
      }
    },
    "node_modules/@mdx-js/react": {
-      "version": "3.0.1",
-      "resolved": "https://registry.npmjs.org/@mdx-js/react/-/react-3.0.1.tgz",
-      "integrity": "sha512-9ZrPIU4MGf6et1m1ov3zKf+q9+deetI51zprKB1D/z3NOb+rUxxtEl3mCjW5wTGh6VhRdwPueh1oRzi6ezkA8A==",
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/@mdx-js/react/-/react-3.1.0.tgz",
+      "integrity": "sha512-QjHtSaoameoalGnKDT3FoIl4+9RwyTmo9ZJGBdLOks/YOiWHoRDI3PUwEzOE7kEmGcV3AFcp9K6dYu9rEuKLAQ==",
      "dependencies": {
        "@types/mdx": "^2.0.0"
      },
--- a/docs/package.json
+++ b/docs/package.json
@@ -19,7 +19,7 @@
    "@docusaurus/plugin-content-pages": "^3.5.2",
    "@docusaurus/preset-classic": "^3.5.2",
    "@docusaurus/theme-mermaid": "^3.5.2",
-    "@mdx-js/react": "^3.0.0",
+    "@mdx-js/react": "^3.1.0",
    "clsx": "^2.0.0",
    "prism-react-renderer": "^2.4.0",
    "react": "^18.3.1",
--- a/docs/static/img/settings-advanced.png
+++ b/docs/static/img/settings-advanced.png
--- a/docs/static/img/settings-screenshot.png
+++ b/docs/static/img/settings-screenshot.png
--- a/docs/yarn.lock
+++ b/docs/yarn.lock
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -23,6 +23,7 @@ from openhands.core.config import (
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import MessageAction
+from openhands.utils.async_utils import call_async_from_sync

 game = None

@@ -119,6 +120,7 @@ def process_instance(

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)

    state: State | None = asyncio.run(
        run_controller(
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -2,19 +2,47 @@

 This folder contains code and resources to run experiments and evaluations.

-## Logistics
+## For Benchmark Users

-To better organize the evaluation folder, we should follow the rules below:
+### Setup

- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain
-all the preprocessing/evaluation/analysis scripts.
- Raw data and experimental records should not be stored within this repo.
- For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
- Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.
+Before starting evaluation, follow the instructions [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to set up your local development environment and LLM.
+
+Once you are done with setup, you can follow the benchmark-specific instructions in each subdirectory of the evaluation directory.
+Generally these will involve running `run_infer.py` to perform inference with the agents.
+
+### Implementing and Evaluating an Agent
+
+To add an agent to OpenHands, you will need to implement it in the [agenthub directory](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub). There is a README there with more information.
+
+To evaluate an agent, you can provide the agent's name to the `run_infer.py` program.
+
+### Evaluating Different LLMs
+
+OpenHands in development mode uses `config.toml` to keep track of most configuration.
+Here's an example configuration file you can use to define and use multiple LLMs:
+
+```toml
+[llm]
+# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
+model = "gpt-4o-2024-05-13"
+api_key = "sk-XXX"
+
+[llm.eval_gpt4_1106_preview_llm]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[llm.eval_some_openai_compatible_model_llm]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```

 ## Supported Benchmarks

-To learn more about how to integrate your benchmark into OpenHands, check out [tutorial here](https://docs.all-hands.dev/modules/usage/how-to/evaluation-harness).
+The OpenHands evaluation harness supports a wide variety of benchmarks across software engineering, web browsing, and miscellaneous assistance tasks.

 ### Software Engineering

@@ -41,36 +69,19 @@ To learn more about how to integrate your benchmark into OpenHands, check out [t
 - Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
 - ProofWriter: [`evaluation/logic_reasoning`](./logic_reasoning)

-## Before everything begins: Setup Environment and LLM Configuration
-
-Please follow instruction [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to setup your local development environment and LLM.
-
-OpenHands in development mode uses `config.toml` to keep track of most configurations.
-
-Here's an example configuration file you can use to define and use multiple LLMs:
-
-```toml
-[llm]
-# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
-model = "gpt-4o-2024-05-13"
-api_key = "sk-XXX"
-
-[llm.eval_gpt4_1106_preview_llm]
-model = "gpt-4-1106-preview"
-api_key = "XXX"
-temperature = 0.0
-
-[llm.eval_some_openai_compatible_model_llm]
-model = "openai/MODEL_NAME"
-base_url = "https://OPENAI_COMPATIBLE_URL/v1"
-api_key = "XXX"
-temperature = 0.0
-```
-
-### Result Visualization
+## Result Visualization

 Check [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization of existing experimental results.

-### Upload your results
-
 You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results to our hosted huggingface repo via PR following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
+
+## For Benchmark Developers
+
+To learn more about how to integrate your benchmark into OpenHands, check out the [tutorial here](https://docs.all-hands.dev/modules/usage/how-to/evaluation-harness). Briefly:
+
+- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain
+all the preprocessing/evaluation/analysis scripts.
+- Raw data and experimental records should not be stored within this repo.
+- For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
+- Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.
+
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -32,7 +32,8 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync


 def get_config(
@@ -210,6 +211,7 @@ def process_instance(
    # =============================================

    runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)

    initialize_runtime(runtime, instance=instance)

--- a/evaluation/aider_bench/run_infer.py
+++ b/evaluation/aider_bench/run_infer.py
@@ -32,7 +32,8 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 # Configure visibility of unit tests to the Agent.
 USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true'
@@ -48,13 +49,14 @@ def get_config(
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
-        runtime='eventstream',
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
            base_container_image='python:3.11-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
            timeout=100,
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
        ),
        # do not mount workspace
        workspace_base=None,
@@ -186,7 +188,9 @@ def process_instance(
        signature_file=f'{instance.instance_name}.py',
    )
    if USE_UNIT_TESTS:
-        print(f'\nInstruction to run test_file: {instance.instance_name}_test.py\n')
+        logger.info(
+            f'\nInstruction to run test_file: {instance.instance_name}_test.py\n'
+        )
        instruction += (
            f'Use `python -m unittest {instance.instance_name}_test.py` to run the test_file '
            'and verify the correctness of your solution. DO NOT EDIT the test file.\n\n'
@@ -204,6 +208,7 @@ def process_instance(
    # =============================================

    runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)

    initialize_runtime(runtime, instance=instance)

--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -29,7 +29,8 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': functools.partial(
@@ -275,7 +276,7 @@ def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    runtime = create_runtime(config)
-
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -32,7 +32,8 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync


 def codeact_user_response(state: State) -> str:
@@ -403,6 +404,7 @@ def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -28,7 +28,8 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data')

@@ -142,6 +143,7 @@ def process_instance(
    logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -25,6 +25,7 @@ from openhands.core.config import (
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import MessageAction
+from openhands.utils.async_utils import call_async_from_sync

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
@@ -81,6 +82,7 @@ def process_instance(

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    state: State | None = asyncio.run(
        run_controller(
            config=config,
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -48,6 +48,7 @@ from openhands.events.action import (
    MessageAction,
 )
 from openhands.events.observation import Observation
+from openhands.utils.async_utils import call_async_from_sync

 ACTION_FORMAT = """
 <<FINAL_ANSWER||
@@ -215,7 +216,7 @@ Ok now its time to start solving the question. Good luck!
 """

    runtime = create_runtime(config)
-
+    call_async_from_sync(runtime.connect)
    state: State | None = asyncio.run(
        run_controller(
            config=config,
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -37,7 +37,8 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 IMPORT_HELPER = {
    'python': [
@@ -233,6 +234,7 @@ def process_instance(

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance)
    state: State | None = asyncio.run(
        run_controller(
--- a/evaluation/integration_tests/README.md
+++ b/evaluation/integration_tests/README.md
@@ -0,0 +1,69 @@
+# Integration tests
+
+This directory implements the integration tests that [were previously run in CI](https://github.com/All-Hands-AI/OpenHands/tree/23d3becf1d6f5d07e592f7345750c314a826b4e9/tests/integration).
+
+[PR 3985](https://github.com/All-Hands-AI/OpenHands/pull/3985) introduced LLM-based editing, which requires access to an LLM to perform edits. Hence, we removed the integration tests from CI and intend to run them as a nightly evaluation to ensure the quality of the OpenHands software.
+
+## To add new tests
+
+Each test is a file named like `tXX_testname.py` where `XX` is a number.
+Make sure the file name of each test starts with `t` and ends with `.py`.
+
+Each test should be structured as a class named `Test` that subclasses [`BaseIntegrationTest`](./tests/base.py), where you need to set `INSTRUCTION` (the task given to the agent) and implement `initialize_runtime`, which sets up the runtime environment before the test, and `verify_result`, which takes a `Runtime` and the history of `Event`s and returns a `TestResult`. See [t01_fix_simple_typo.py](./tests/t01_fix_simple_typo.py) and [t05_simple_browsing.py](./tests/t05_simple_browsing.py) for two representative examples.
+
+```python
+class TestResult(BaseModel):
+    success: bool
+    reason: str | None = None
+
+
+class BaseIntegrationTest(ABC):
+    """Base class for integration tests."""
+
+    INSTRUCTION: str
+
+    @classmethod
+    @abstractmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """Initialize the runtime for the test to run."""
+        pass
+
+    @classmethod
+    @abstractmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Verify the result of the test.
+
+        This method will be called after the agent performs the task on the runtime.
+        """
+        pass
+```
+
+
+## Setup Environment and LLM Configuration
+
+Please follow the instructions [here](../README.md#setup) to set up your local
+development environment and LLM.
+
+## Start the evaluation
+
+```bash
+./evaluation/integration_tests/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
+```
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
+    your LLM settings, as defined in your `config.toml`.
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version
+    you would like to evaluate. It could also be a release tag like `0.9.0`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks,
+    defaulting to `CodeActAgent`.
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit`
+    instances. By default, the script runs every integration test found under
+    `./tests`. Note: in order to use `eval_limit`, you must also set `agent`.
+- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
+- `eval_ids`, e.g. `"t01_fix_simple_typo,t05_simple_browsing"`, limits the evaluation to instances with the
+    given IDs (comma separated; an ID is the test file name without `.py`).
+
+Example:
+```bash
+./evaluation/integration_tests/scripts/run_infer.sh llm.claude-35-sonnet-eval HEAD CodeActAgent
+```
--- a/evaluation/integration_tests/init.py
+++ b/evaluation/integration_tests/init.py
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -0,0 +1,213 @@
+import asyncio
+import importlib.util
+import os
+
+import pandas as pd
+
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    codeact_user_response,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    parse_arguments,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+
+FAKE_RESPONSES = {
+    'CodeActAgent': codeact_user_response,
+}
+
+
+def get_config(
+    metadata: EvalMetadata,
+    instance_id: str,  # only used to name the per-instance LLM-completions log folder
+) -> AppConfig:  # returns a new AppConfig built from `metadata` for one test instance
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            # no base_container_image is passed, so the SandboxConfig default is used
+            enable_auto_lint=True,
+            use_host_network=False,
+            timeout=100,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    if metadata.llm_config.log_completions:  # NOTE: this branch mutates metadata.llm_config in place
+        metadata.llm_config.log_completions_folder = os.path.join(
+            metadata.eval_output_dir, 'llm_completions', instance_id
+        )
+        logger.info(
+            f'Logging LLM completions for instance {instance_id} to '
+            f'{metadata.llm_config.log_completions_folder}'
+        )
+    config.set_llm_config(metadata.llm_config)
+    return config
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+) -> EvalOutput:
+    config = get_config(metadata, instance.instance_id)
+
+    # Set up the logger properly, so the evaluation can be parallelized with multi-processing
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir)
+    else:
+        logger.info(
+            f'\nStarting evaluation for instance {str(instance.instance_id)}.\n'
+        )
+
+    # =============================================
+    # load the test module from instance.file_path; it must define a `Test` class
+    # =============================================
+    instance_id = instance.instance_id
+    spec = importlib.util.spec_from_file_location(instance_id, instance.file_path)
+    test_module = importlib.util.module_from_spec(spec)  # NOTE(review): spec is not checked for None
+    spec.loader.exec_module(test_module)
+    assert hasattr(
+        test_module, 'Test'
+    ), f'Test module {instance_id} does not have a Test class'
+
+    test_class: type[BaseIntegrationTest] = test_module.Test
+    assert issubclass(
+        test_class, BaseIntegrationTest
+    ), f'Test class {instance_id} does not inherit from BaseIntegrationTest'
+
+    instruction = test_class.INSTRUCTION
+
+    # =============================================
+    # create the runtime, connect to it, and let the test prepare it
+    # =============================================
+
+    runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+
+    test_class.initialize_runtime(runtime)
+
+    # Run the agent on the instruction (similar to the `main` function) and get the final task state
+    state: State | None = asyncio.run(
+        run_controller(
+            config=config,
+            initial_user_action=MessageAction(content=instruction),
+            runtime=runtime,
+            fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
+        )
+    )
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    # =============================================
+    # result evaluation: the test class inspects the runtime and the event history
+    # =============================================
+
+    histories = state.history.get_events()
+    test_result: TestResult = test_class.verify_result(runtime, histories)
+    metrics = state.metrics.get() if state.metrics else None
+
+    # Build the output record that is returned to the caller
+    output = EvalOutput(
+        instance_id=str(instance.instance_id),
+        instance=instance.to_dict(),
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,  # `state` is never None here (checked above)
+        test_result=test_result.model_dump(),
+    )
+    return output
+
+
+def load_integration_tests() -> pd.DataFrame:
+    """Load tests from python files (t*.py) under ./tests; instance_id is the file name without '.py'"""
+    test_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tests')
+    test_files = [
+        os.path.join(test_dir, f)
+        for f in os.listdir(test_dir)
+        if f.startswith('t') and f.endswith('.py')
+    ]
+    df = pd.DataFrame(test_files, columns=['file_path'])
+    # Remove only the extension: rstrip('.py') strips any trailing '.', 'p' or 'y'
+    # characters, so e.g. 't06_copy.py' would become 't06_co'.
+    df['instance_id'] = df['file_path'].apply(
+        lambda x: os.path.splitext(os.path.basename(x))[0]
+    )
+    return df
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    integration_tests = load_integration_tests()
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+
+    if llm_config is None:  # also reached when no --llm_config was given at all
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
+
+    metadata = make_metadata(
+        llm_config,
+        'integration_tests',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+    )
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+
+    # Split the comma-separated --eval-ids value into a list of instance IDs, if provided
+    eval_ids = None
+    if args.eval_ids:
+        eval_ids = str(args.eval_ids).split(',')
+        logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n')
+
+    instances = prepare_dataset(
+        integration_tests,
+        output_file,
+        args.eval_n_limit,
+        eval_ids=eval_ids,
+    )
+
+    run_evaluation(
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+    )
+
+    df = pd.read_json(output_file, lines=True, orient='records')  # presumably written by run_evaluation; one JSON record per line
+    df['success'] = df['test_result'].apply(lambda x: x['success'])
+    df['reason'] = df['test_result'].apply(lambda x: x['reason'])
+    logger.info('-' * 100)
+    logger.info(
+        f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})'
+    )
+    logger.info(
+        '\nEvaluation Results:'
+        + '\n'
+        + df[['instance_id', 'success', 'reason']].to_string(index=False)
+    )
+    logger.info('-' * 100)
--- a/evaluation/integration_tests/scripts/run_infer.sh
+++ b/evaluation/integration_tests/scripts/run_infer.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
+MODEL_CONFIG=$1
+COMMIT_HASH=$2  # not referenced below; presumably read by checkout_eval_branch
+AGENT=$3
+EVAL_LIMIT=$4
+NUM_WORKERS=$5
+EVAL_IDS=$6
+
+if [ -z "$NUM_WORKERS" ]; then
+  NUM_WORKERS=1
+  echo "Number of workers not specified, use default $NUM_WORKERS"
+fi
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+get_agent_version
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+EVAL_NOTE=$AGENT_VERSION
+
+# Default to NOT using unit tests.
+if [ -z "$USE_UNIT_TESTS" ]; then
+  export USE_UNIT_TESTS=false
+fi
+echo "USE_UNIT_TESTS: $USE_UNIT_TESTS"
+# If unit tests are used, append "-w-test" to EVAL_NOTE
+if [ "$USE_UNIT_TESTS" = true ]; then
+  EVAL_NOTE=$EVAL_NOTE-w-test
+fi
+
+# export PYTHONPATH=evaluation/integration_tests:\$PYTHONPATH
+COMMAND="poetry run python evaluation/integration_tests/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 10 \
+  --eval-num-workers $NUM_WORKERS \
+  --eval-note $EVAL_NOTE"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+if [ -n "$EVAL_IDS" ]; then
+  echo "EVAL_IDS: $EVAL_IDS"
+  COMMAND="$COMMAND --eval-ids $EVAL_IDS"
+fi
+
+# Run the command (quoted so eval parses the string once, with no prior word splitting or globbing)
+eval "$COMMAND"
--- a/evaluation/integration_tests/tests/init.py
+++ b/evaluation/integration_tests/tests/init.py
--- a/evaluation/integration_tests/tests/base.py
+++ b/evaluation/integration_tests/tests/base.py
@@ -0,0 +1,32 @@
+from abc import ABC, abstractmethod
+
+from pydantic import BaseModel
+
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class TestResult(BaseModel):  # outcome returned by BaseIntegrationTest.verify_result
+    success: bool  # True when the test passed
+    reason: str | None = None  # optional explanation, used for failures; defaults to None
+
+
+class BaseIntegrationTest(ABC):
+    """Base class for integration tests."""
+
+    INSTRUCTION: str  # task prompt for the agent; subclasses set it as a class attribute
+
+    @classmethod
+    @abstractmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """Prepare the runtime (e.g. create the files the task needs) before the agent runs."""
+        pass
+
+    @classmethod
+    @abstractmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Verify the result of the test and return it as a TestResult.
+
+        This method will be called after the agent performs the task on the runtime.
+        """
+        pass
--- a/evaluation/integration_tests/tests/t01_fix_simple_typo.py
+++ b/evaluation/integration_tests/tests/t01_fix_simple_typo.py
@@ -0,0 +1,39 @@
+import os
+import tempfile
+
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from openhands.events.action import CmdRunAction
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    INSTRUCTION = 'Fix typos in bad.txt.'
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        # write a file containing typos locally, then copy it into /workspace
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_file_path = os.path.join(temp_dir, 'bad.txt')
+            with open(temp_file_path, 'w') as f:
+                f.write('This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!')
+
+            # Copy the file to the desired location
+            runtime.copy_to(temp_file_path, '/workspace')
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        # read /workspace/bad.txt back from the runtime
+        action = CmdRunAction(command='cat /workspace/bad.txt', keep_prompt=False)
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False, reason=f'Failed to run command: {obs.content}'
+            )
+        # the content must match the corrected text exactly (after strip and CRLF -> LF)
+        if (
+            obs.content.strip().replace('\r\n', '\n')
+            == 'This is a stupid typo.\nReally?\nNo more typos!\nEnjoy!'
+        ):
+            return TestResult(success=True)
+        return TestResult(success=False, reason=f'File not fixed: {obs.content}')
--- a/evaluation/integration_tests/tests/t02_add_bash_hello.py
+++ b/evaluation/integration_tests/tests/t02_add_bash_hello.py
@@ -0,0 +1,40 @@
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import assert_and_raise
+from openhands.events.action import CmdRunAction
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    INSTRUCTION = "Write a shell script '/workspace/hello.sh' that prints 'hello'."
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)  # make sure the target directory exists
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        # check if the file /workspace/hello.sh exists (cat fails if it is missing)
+        action = CmdRunAction(command='cat /workspace/hello.sh', keep_prompt=False)
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to cat /workspace/hello.sh: {obs.content}.',
+            )
+
+        # execute the script
+        action = CmdRunAction(command='bash /workspace/hello.sh', keep_prompt=False)
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to execute /workspace/hello.sh: {obs.content}.',
+            )
+        if obs.content.strip() != 'hello':  # output must be exactly 'hello' after stripping whitespace
+            return TestResult(
+                success=False, reason=f'Script did not print "hello": {obs.content}.'
+            )
+        return TestResult(success=True)
--- a/evaluation/integration_tests/tests/t03_jupyter_write_file.py
+++ b/evaluation/integration_tests/tests/t03_jupyter_write_file.py
@@ -0,0 +1,43 @@
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import assert_and_raise
+from openhands.events.action import CmdRunAction
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    INSTRUCTION = "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        # check if the file /workspace/test.txt exists (cat fails if it is missing)
+        action = CmdRunAction(command='cat /workspace/test.txt', keep_prompt=False)
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to cat /workspace/test.txt: {obs.content}.',
+            )
+
+        # read the file content; NOTE(review): this repeats the exact command above and looks redundant
+        action = CmdRunAction(command='cat /workspace/test.txt', keep_prompt=False)
+        obs = runtime.run_action(action)
+
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to cat /workspace/test.txt: {obs.content}.',
+            )
+
+        if 'hello world' not in obs.content.strip():  # substring match, not exact equality
+            return TestResult(
+                success=False,
+                reason=f'File did not contain "hello world": {obs.content}.',
+            )
+        return TestResult(success=True)
--- a/evaluation/integration_tests/tests/t04_git_staging.py
+++ b/evaluation/integration_tests/tests/t04_git_staging.py
@@ -0,0 +1,58 @@
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import assert_and_raise
+from openhands.events.action import CmdRunAction
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    INSTRUCTION = 'Write a git commit message for the current staging area and commit the changes.'
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        # git init (no explicit cd: assumes the runtime's working directory is /workspace — TODO confirm)
+        action = CmdRunAction(command='git init', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        # create hello.py
+        action = CmdRunAction(
+            command='echo \'print("hello world")\' > hello.py', keep_prompt=False
+        )
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        # git add hello.py (stage it so there is something for the agent to commit)
+        action = CmdRunAction(command='git add hello.py', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        # check if the file /workspace/hello.py exists
+        action = CmdRunAction(command='cat /workspace/hello.py', keep_prompt=False)
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to cat /workspace/hello.py: {obs.content}.',
+            )
+
+        # check that nothing is left to commit (staging area and working tree are clean)
+        action = CmdRunAction(command='git status', keep_prompt=False)
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False, reason=f'Failed to git status: {obs.content}.'
+            )
+        if 'nothing to commit, working tree clean' in obs.content.strip():
+            return TestResult(success=True)
+
+        return TestResult(
+            success=False,
+            reason=f'Failed to check for "nothing to commit, working tree clean": {obs.content}.',
+        )
--- a/evaluation/integration_tests/tests/t05_simple_browsing.py
+++ b/evaluation/integration_tests/tests/t05_simple_browsing.py
@@ -0,0 +1,134 @@
+import os
+import tempfile
+
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import assert_and_raise
+from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
+from openhands.events.event import Event
+from openhands.events.observation import AgentDelegateObservation
+from openhands.runtime.base import Runtime
+
+HTML_FILE = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>The Ultimate Answer</title>
+    <style>
+        body {
+            display: flex;
+            justify-content: center;
+            align-items: center;
+            height: 100vh;
+            margin: 0;
+            background: linear-gradient(to right, #1e3c72, #2a5298);
+            color: #fff;
+            font-family: 'Arial', sans-serif;
+            text-align: center;
+        }
+        .container {
+            text-align: center;
+            padding: 20px;
+            background: rgba(255, 255, 255, 0.1);
+            border-radius: 10px;
+            box-shadow: 0 0 10px rgba(0, 0, 0, 0.2);
+        }
+        h1 {
+            font-size: 36px;
+            margin-bottom: 20px;
+        }
+        p {
+            font-size: 18px;
+            margin-bottom: 30px;
+        }
+        #showButton {
+            padding: 10px 20px;
+            font-size: 16px;
+            color: #1e3c72;
+            background: #fff;
+            border: none;
+            border-radius: 5px;
+            cursor: pointer;
+            transition: background 0.3s ease;
+        }
+        #showButton:hover {
+            background: #f0f0f0;
+        }
+        #result {
+            margin-top: 20px;
+            font-size: 24px;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>The Ultimate Answer</h1>
+        <p>Click the button to reveal the answer to life, the universe, and everything.</p>
+        <button id="showButton">Click me</button>
+        <div id="result"></div>
+    </div>
+    <script>
+        document.getElementById('showButton').addEventListener('click', function() {
+            document.getElementById('result').innerText = 'The answer is OpenHands is all you need!';
+        });
+    </script>
+</body>
+</html>
+"""
+
+
+class Test(BaseIntegrationTest):
+    INSTRUCTION = 'Browse localhost:8000, and tell me the ultimate answer to life.'
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        action = CmdRunAction(command='mkdir -p /tmp/server', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        # write HTML_FILE to a local index.html and copy it into /tmp/server
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_file_path = os.path.join(temp_dir, 'index.html')
+            with open(temp_file_path, 'w') as f:
+                f.write(HTML_FILE)
+            # Copy the file to the desired location
+            runtime.copy_to(temp_file_path, '/tmp/server')
+
+        # serve /tmp/server over HTTP on port 8000 in the background
+        action = CmdRunAction(
+            command='cd /tmp/server && nohup python3 -m http.server 8000 &',
+            keep_prompt=False,
+        )
+        obs = runtime.run_action(action)  # NOTE(review): result is not checked, unlike the commands above
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        # check if the "The answer is OpenHands is all you need!" is in any message
+        message_actions = [
+            event
+            for event in histories
+            if isinstance(
+                event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
+            )
+        ]
+        for event in message_actions:
+            if isinstance(event, AgentDelegateObservation):
+                content = event.content
+            elif isinstance(event, AgentFinishAction):
+                content = event.outputs.get('content', '')
+            elif isinstance(event, MessageAction):
+                content = event.content
+            else:  # defensive: unreachable because of the isinstance filter above
+                raise ValueError(f'Unknown event type: {type(event)}')
+
+            if 'OpenHands is all you need!' in content:
+                return TestResult(success=True)
+        return TestResult(
+            success=False,
+            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
+        )
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -29,7 +29,8 @@ from openhands.events.action import (
    MessageAction,
 )
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
@@ -202,6 +203,7 @@ def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
--- a/evaluation/miniwob/README.md
+++ b/evaluation/miniwob/README.md
@@ -1,4 +1,4 @@
-# WebArena Evaluation with OpenHands Browsing Agents
+# Mini-World of Bits Evaluation with OpenHands Browsing Agents

 This folder contains evaluation for [MiniWoB++](https://miniwob.farama.org/) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym) for easy evaluation of how well an agent capable of browsing can perform on synthetic web browsing tasks.

--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -30,11 +30,12 @@ from openhands.events.action import (
    MessageAction,
 )
 from openhands.events.observation import CmdOutputObservation
+from openhands.runtime.base import Runtime
 from openhands.runtime.browser.browser_env import (
    BROWSER_EVAL_GET_GOAL_ACTION,
    BROWSER_EVAL_GET_REWARDS_ACTION,
 )
-from openhands.runtime.runtime import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 SUPPORTED_AGENT_CLS = {'BrowsingAgent'}

@@ -127,6 +128,7 @@ def process_instance(
        logger.info(f'Starting evaluation for instance {env_id}.')

    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    task_str = initialize_runtime(runtime)
    state: State | None = asyncio.run(
        run_controller(
--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -32,7 +32,8 @@ from openhands.events.action import (
    MessageAction,
 )
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync


 def codeact_user_response_mint(state: State, task: Task, task_config: dict[str, int]):
@@ -176,6 +177,7 @@ def process_instance(
    )

    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime)

    state: State | None = asyncio.run(
--- a/evaluation/mint/tasks/reasoning.py
+++ b/evaluation/mint/tasks/reasoning.py
@@ -131,11 +131,9 @@ class MultipleChoiceTask(Task):


 def compare_two_numbers(p, gt):
-    if isinstance(p, int) or isinstance(p, float):
+    if isinstance(p, (int, float)):
        pass
-    elif isinstance(p, list) or isinstance(p, bool) or isinstance(p, str):
-        return False
-    elif isinstance(p, tuple) or isinstance(p, complex) or isinstance(p, dict):
+    elif isinstance(p, (bool, complex, dict, list, str, tuple)):
        return False
    else:
        raise ValueError(p)
@@ -227,8 +225,8 @@ class TheoremqaTask(Task):
            prediction = prediction.replace('°', '')

        # Detect the boolean keyword in the generation
-        if prediction in ['true', 'yes', 'false', 'no']:
-            if prediction == 'true' or prediction == 'yes':
+        if prediction in ('true', 'yes', 'false', 'no'):
+            if prediction in ('true', 'yes'):
                prediction = 'True'
            else:
                prediction = 'False'
@@ -342,7 +340,7 @@ class TheoremqaTask(Task):
        answer_type = self._answer_type
        gt = self.extract_answer(self.reference)

-        if isinstance(prediction, (str, int, float)) or isinstance(prediction, list):
+        if isinstance(prediction, (str, int, float, list)):
            # Comparing prediction against the reference
            if answer_type in ['bool', 'option', 'Option']:
                cur_correct = int(prediction == f'({gt})') or int(prediction == gt)
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -41,7 +41,8 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 config = load_app_config()

@@ -233,6 +234,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance)

    # Run the agent
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -28,6 +28,7 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime
 from openhands.events.action import CmdRunAction
 from openhands.events.observation import CmdOutputObservation
+from openhands.utils.async_utils import call_async_from_sync

 # TODO: migrate all swe-bench docker to ghcr.io/openhands
 DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
@@ -128,7 +129,7 @@ def process_instance(
        )

    runtime = create_runtime(config)
-
+    call_async_from_sync(runtime.connect)
    # Get patch and save it to /tmp/patch.diff
    with tempfile.TemporaryDirectory() as temp_dir:
        # Patch file
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -11,6 +11,7 @@ from datasets import load_dataset
 import openhands.agenthub
 from evaluation.swe_bench.prompt import CODEACT_SWE_PROMPT
 from evaluation.utils.shared import (
+    EvalException,
    EvalMetadata,
    EvalOutput,
    assert_and_raise,
@@ -32,8 +33,9 @@ from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation, ErrorObservation
 from openhands.events.serialization.event import event_to_dict
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
 from openhands.runtime.utils.shutdown_listener import sleep_if_should_continue
+from openhands.utils.async_utils import call_async_from_sync

 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
 USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
@@ -80,8 +82,10 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
            instruction += f'# Hints\n{instance.hints_text}\n\n'
        instruction += (
            'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
-            'You should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\n'
-            'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
+            'You should NOT modify any existing test case files. You SHOULD add new test in a NEW file to reproduce the issue.\n'
+            'You should verify that the issue is resolved and any new tests you create pass successfully.\n'
+            'You should NEVER use web browsing or any other web-based tools.\n'
+            'You should ALWAYS use the default Python interpreter available in the <execute_bash> environment to run code related to the provided issue and/or repository.\n'
        )

    # NOTE: You can actually set slightly different instruction for different agents
@@ -122,7 +126,6 @@ def get_config(
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
-        max_budget_per_task=4,
        max_iterations=metadata.max_iterations,
        runtime=os.environ.get('RUNTIME', 'eventstream'),
        sandbox=SandboxConfig(
@@ -131,6 +134,8 @@ def get_config(
            use_host_network=False,
            # large enough timeout, since some testcases take very long to run
            timeout=300,
+            # Add platform to the sandbox config to solve issue 4401
+            platform='linux/amd64',
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
            keep_remote_runtime_alive=False,
@@ -139,6 +144,14 @@ def get_config(
        workspace_base=None,
        workspace_mount_path=None,
    )
+    if metadata.llm_config.log_completions:
+        metadata.llm_config.log_completions_folder = os.path.join(
+            metadata.eval_output_dir, 'llm_completions', instance['instance_id']
+        )
+        logger.info(
+            f'Logging LLM completions for instance {instance["instance_id"]} to '
+            f'{metadata.llm_config.log_completions_folder}'
+        )
    config.set_llm_config(metadata.llm_config)
    return config

@@ -166,7 +179,7 @@ def initialize_runtime(
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
-        obs.exit_code == 0, f'Failed to export SWE_INSTANCE_ID: {obs.content}'
+        obs.exit_code == 0, f'Failed to export SWE_INSTANCE_ID: {str(obs)}'
    )

    action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
@@ -174,7 +187,7 @@ def initialize_runtime(
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {obs.content}')
+    assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')

    if USE_INSTANCE_IMAGE:
        # inject the init script
@@ -188,7 +201,7 @@ def initialize_runtime(
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert_and_raise(
            obs.exit_code == 0,
-            f'Failed to create /swe_util/eval_data/instances: {obs.content}',
+            f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
        )

        swe_instance_json_name = 'swe-bench-instance.json'
@@ -215,16 +228,16 @@ def initialize_runtime(
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {obs.content}')
+        assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')

        action = CmdRunAction(command='source ~/.bashrc')
        action.timeout = 600
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert_and_raise(
-            obs.exit_code == 0, f'Failed to source ~/.bashrc: {obs.content}'
-        )
+        if isinstance(obs, ErrorObservation):
+            logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
+        assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')

        action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
        action.timeout = 3600
@@ -233,7 +246,7 @@ def initialize_runtime(
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert_and_raise(
            obs.exit_code == 0,
-            f'Failed to source /swe_util/instance_swe_entry.sh: {obs.content}',
+            f'Failed to source /swe_util/instance_swe_entry.sh: {str(obs)}',
        )
    else:
        action = CmdRunAction(command='source /swe_util/swe_entry.sh')
@@ -243,7 +256,7 @@ def initialize_runtime(
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert_and_raise(
            obs.exit_code == 0,
-            f'Failed to source /swe_util/swe_entry.sh: {obs.content}',
+            f'Failed to source /swe_util/swe_entry.sh: {str(obs)}',
        )

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
@@ -253,7 +266,7 @@ def initialize_runtime(
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
-        f'Failed to cd to /workspace/{workspace_dir_name}: {obs.content}',
+        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git reset --hard')
@@ -261,7 +274,7 @@ def initialize_runtime(
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {obs.content}')
+    assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {str(obs)}')

    action = CmdRunAction(
        command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
@@ -270,7 +283,7 @@ def initialize_runtime(
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {obs.content}')
+    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

    logger.info('-' * 30)
    logger.info('END Runtime Initialization Fn')
@@ -300,7 +313,7 @@ def complete_runtime(
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
-        f'Failed to cd to /workspace/{workspace_dir_name}: {obs.content}',
+        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git config --global core.pager ""')
@@ -310,7 +323,7 @@ def complete_runtime(
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
-        f'Failed to git config --global core.pager "": {obs.content}',
+        f'Failed to git config --global core.pager "": {str(obs)}',
    )

    action = CmdRunAction(command='git add -A')
@@ -318,7 +331,7 @@ def complete_runtime(
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(obs.exit_code == 0, f'Failed to git add -A: {obs.content}')
+    assert_and_raise(obs.exit_code == 0, f'Failed to git add -A: {str(obs)}')

    n_retries = 0
    git_patch = None
@@ -343,7 +356,9 @@ def complete_runtime(
            logger.error(f'Error occurred: {obs.content}. Retrying...')
            sleep_if_should_continue(10)
        else:
-            assert_and_raise(False, f'Unexpected observation type: {type(obs)}')
+            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
+
+    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')

    logger.info('-' * 30)
    logger.info('END Runtime Completion Fn')
@@ -366,6 +381,7 @@ def process_instance(
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)

    try:
        initialize_runtime(runtime, instance)
@@ -384,6 +400,13 @@ def process_instance(
            )
        )

+        # if fatal error, throw EvalError to trigger re-run
+        if (
+            state.last_error
+            and 'fatal error during agent execution' in state.last_error
+        ):
+            raise EvalException('Fatal error detected: ' + state.last_error)
+
        # ======= THIS IS SWE-Bench specific =======
        # Get git patch
        return_val = complete_runtime(runtime, instance)
@@ -419,7 +442,6 @@ def process_instance(
        metadata=metadata,
        history=histories,
        metrics=metrics,
-        llm_completions=state.extra_data.get('llm_completions', []),
        error=state.last_error if state and state.last_error else None,
    )
    return output
@@ -472,14 +494,13 @@ if __name__ == '__main__':

    details = {}
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
-    if hasattr(_agent_cls, 'system_message'):
-        details['system_message'] = _agent_cls.system_message
-    if hasattr(_agent_cls, 'in_context_example'):
-        details['in_context_example'] = _agent_cls.in_context_example

+    dataset_descrption = (
+        args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
+    )
    metadata = make_metadata(
        llm_config,
-        'swe-bench-lite',
+        dataset_descrption,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
--- a/evaluation/swe_bench/scripts/eval/compare_outputs.py
+++ b/evaluation/swe_bench/scripts/eval/compare_outputs.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+import argparse
+
+import pandas as pd
+
+parser = argparse.ArgumentParser(
+    description='Compare two swe_bench output JSONL files and print the resolved diff'
+)
+parser.add_argument('input_file_1', type=str)
+parser.add_argument('input_file_2', type=str)
+args = parser.parse_args()
+
+df1 = pd.read_json(args.input_file_1, orient='records', lines=True)
+df2 = pd.read_json(args.input_file_2, orient='records', lines=True)
+
+
+# Get the intersection of the instance_ids
+df = pd.merge(df1, df2, on='instance_id', how='inner')
+
+
+def _get_resolved(report):
+    """Return the report's 'resolved' flag; False when the report is missing."""
+    # A missing report arrives as None or float NaN after the merge; treat any
+    # non-dict as unresolved and coerce to bool so the `~` masks below stay valid.
+    if not isinstance(report, dict):
+        return False
+    return bool(report.get('resolved', False))
+
+
+df['resolved_x'] = df['report_x'].apply(_get_resolved)
+df['resolved_y'] = df['report_y'].apply(_get_resolved)
+df['diff'] = df.apply(lambda x: x['resolved_x'] != x['resolved_y'], axis=1)
+
+df_diff = df[df['diff']].sort_values(
+    by=['resolved_x', 'resolved_y'], ascending=[False, False]
+)
+# skip if any of the resolved is nan, which means one of the eval is not finished yet
+df_diff = df_diff[df_diff['resolved_x'].notna() & df_diff['resolved_y'].notna()]
+
+print(f'X={args.input_file_1}')
+print(f'Y={args.input_file_2}')
+print(f'# diff={df_diff.shape[0]}')
+df_diff = df_diff[['instance_id', 'resolved_x', 'resolved_y', 'report_x', 'report_y']]
+
+# x resolved but y not
+print('-' * 100)
+df_diff_x_only = df_diff[df_diff['resolved_x'] & ~df_diff['resolved_y']].sort_values(
+    by='instance_id'
+)
+print(f'# x resolved but y not={df_diff_x_only.shape[0]}')
+print(df_diff_x_only[['instance_id', 'report_x', 'report_y']])
+
+# y resolved but x not
+print('-' * 100)
+df_diff_y_only = df_diff[~df_diff['resolved_x'] & df_diff['resolved_y']].sort_values(
+    by='instance_id'
+)
+print(f'# y resolved but x not={df_diff_y_only.shape[0]}')
+print(df_diff_y_only[['instance_id', 'report_x', 'report_y']])
+# print the instance_id lists for both directions of the diff
+print('-' * 100)
+print('Instances that x resolved but y not:')
+print(df_diff_x_only['instance_id'].tolist())
+
+print('-' * 100)
+print('Instances that y resolved but x not:')
+print(df_diff_y_only['instance_id'].tolist())
--- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py
+++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py
@@ -3,6 +3,9 @@ import argparse
 import json
 from collections import Counter

+from openhands.events.serialization import event_from_dict
+from openhands.events.utils import get_pairs_from_events
+
 ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
@@ -26,8 +29,37 @@ if __name__ == '__main__':

    error_counter = Counter()

+    main_agent_cost = []
+    editor_cost = []
+    num_turns = []
+
    for line in lines:
        _d = json.loads(line)
+
+        # Cost
+        costs = _d['metrics'].get('costs', [])
+        _cur_main_agent_cost = 0
+        _cur_editor_cost = 0
+        for cost in costs:
+            if isinstance(cost, float):
+                # backward compatible
+                _cur_main_agent_cost += cost
+            else:
+                if 'draft_editor' in cost['model']:
+                    _cur_editor_cost += cost['cost']
+                else:
+                    _cur_main_agent_cost += cost['cost']
+
+        main_agent_cost.append(_cur_main_agent_cost)
+        editor_cost.append(_cur_editor_cost)
+
+        # Turn status
+        history = _d.get('history', [])
+        events = [event_from_dict(event) for event in history]
+        pairs = get_pairs_from_events(events)
+        num_turns.append(len(pairs))
+
+        # Patch & resolve status
        patch = _d.get('test_result', {}).get('git_patch', '')
        if patch == '':
            num_empty_patch += 1
@@ -38,6 +70,7 @@ if __name__ == '__main__':
        if resolved:
            num_resolved += 1

+        # Error
        error = _d.get('error', None)

        if error is not None and isinstance(error, str):
@@ -70,7 +103,17 @@ if __name__ == '__main__':
    print(
        f'# of loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
    )
+    assert len(num_turns) == num_lines
+    assert len(main_agent_cost) == num_lines
+    assert len(editor_cost) == num_lines
+    print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
+    print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
+    print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
+    print(
+        f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
+    )
    print('-' * 100)
    print('Detailed error breakdown:')
    for error, count in error_counter.items():
        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
+    print('-' * 100)
--- a/evaluation/swe_bench/scripts/run_infer.sh
+++ b/evaluation/swe_bench/scripts/run_infer.sh
@@ -25,8 +25,8 @@ if [ -z "$AGENT" ]; then
 fi

 if [ -z "$MAX_ITER" ]; then
-  echo "MAX_ITER not specified, use default 30"
-  MAX_ITER=30
+  echo "MAX_ITER not specified, use default 100"
+  MAX_ITER=100
 fi

 if [ -z "$USE_INSTANCE_IMAGE" ]; then
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -25,7 +25,8 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
@@ -103,6 +104,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
    logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -61,7 +61,6 @@ class EvalOutput(BaseModel):
    history: (
        list[dict[str, Any]] | list[tuple[dict[str, Any], dict[str, Any]]] | None
    ) = None
-    llm_completions: list[dict[str, Any]] | None = None
    metrics: dict[str, Any] | None = None
    error: str | None = None

--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -30,11 +30,12 @@ from openhands.events.action import (
    MessageAction,
 )
 from openhands.events.observation import CmdOutputObservation
+from openhands.runtime.base import Runtime
 from openhands.runtime.browser.browser_env import (
    BROWSER_EVAL_GET_GOAL_ACTION,
    BROWSER_EVAL_GET_REWARDS_ACTION,
 )
-from openhands.runtime.runtime import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 SUPPORTED_AGENT_CLS = {'BrowsingAgent'}

@@ -143,6 +144,7 @@ def process_instance(
        logger.info(f'Starting evaluation for instance {env_id}.')

    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    task_str = initialize_runtime(runtime)

    state: State | None = asyncio.run(
--- a/frontend/.env.sample
+++ b/frontend/.env.sample
@@ -1,6 +1,2 @@
 VITE_BACKEND_BASE_URL="localhost:3000" # Backend URL without protocol (e.g. localhost:3000)
 VITE_MOCK_API="false" # true or false
-
-# GitHub OAuth
-VITE_GITHUB_CLIENT_ID=""
-VITE_APP_MODE="oss" # "oss" or "saas"
--- a/frontend/tests/components/chat-message.test.tsx
+++ b/frontend/tests/components/chat-message.test.tsx
@@ -0,0 +1,73 @@
+import { render, screen } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+import { describe, it, expect, test } from "vitest";
+import { ChatMessage } from "#/components/chat-message";
+
+describe("ChatMessage", () => {
+  it("should render a user message", () => {
+    render(<ChatMessage type="user" message="Hello, World!" />);
+    expect(screen.getByTestId("user-message")).toBeInTheDocument();
+    expect(screen.getByText("Hello, World!")).toBeInTheDocument();
+  });
+
+  it("should render an assistant message", () => {
+    render(<ChatMessage type="assistant" message="Hello, World!" />);
+    expect(screen.getByTestId("assistant-message")).toBeInTheDocument();
+    expect(screen.getByText("Hello, World!")).toBeInTheDocument();
+  });
+
+  it.skip("should support code syntax highlighting", () => {
+    const code = "```js\nconsole.log('Hello, World!')\n```";
+    render(<ChatMessage type="user" message={code} />);
+
+    // SyntaxHighlighter breaks the code blocks into "tokens"
+    expect(screen.getByText("console")).toBeInTheDocument();
+    expect(screen.getByText("log")).toBeInTheDocument();
+    expect(screen.getByText("'Hello, World!'")).toBeInTheDocument();
+  });
+
+  it.todo("should support markdown content");
+
+  it("should render the copy to clipboard button when the user hovers over the message", async () => {
+    const user = userEvent.setup();
+    render(<ChatMessage type="user" message="Hello, World!" />);
+    const message = screen.getByText("Hello, World!");
+
+    expect(screen.getByTestId("copy-to-clipboard")).not.toBeVisible();
+
+    await user.hover(message);
+
+    expect(screen.getByTestId("copy-to-clipboard")).toBeVisible();
+  });
+
+  it("should copy content to clipboard", async () => {
+    const user = userEvent.setup();
+    render(<ChatMessage type="user" message="Hello, World!" />);
+    const copyToClipboardButton = screen.getByTestId("copy-to-clipboard");
+
+    await user.click(copyToClipboardButton);
+    // Must be awaited: otherwise the test ends before `.resolves` is checked.
+    await expect(navigator.clipboard.readText()).resolves.toBe("Hello, World!");
+  });
+
+  // BUG: vi.useFakeTimers() seems to break the tests
+  it.todo(
+    "should display a checkmark for 200ms and disable the button after copying content to clipboard",
+  );
+
+  it.skip("should display an error toast if copying content to clipboard fails", async () => {});
+
+  test.todo("push a toast after successfully copying content to clipboard");
+
+  it("should render a component passed as a prop", () => {
+    function Component() {
+      return <div data-testid="custom-component">Custom Component</div>;
+    }
+    render(
+      <ChatMessage type="user" message="Hello, World">
+        <Component />
+      </ChatMessage>,
+    );
+    expect(screen.getByTestId("custom-component")).toBeInTheDocument();
+  });
+});
--- a/frontend/tests/components/chat/Chat.test.tsx
+++ b/frontend/tests/components/chat/Chat.test.tsx
@@ -1,28 +0,0 @@
-import { screen } from "@testing-library/react";
-import { describe, expect, it } from "vitest";
-import { renderWithProviders } from "test-utils";
-import Chat from "#/components/chat/Chat";
-
-const MESSAGES: Message[] = [
-  {
-    sender: "assistant",
-    content: "Hello!",
-    imageUrls: [],
-    timestamp: new Date().toISOString(),
-  },
-  {
-    sender: "user",
-    content: "Hi!",
-    imageUrls: [],
-    timestamp: new Date().toISOString(),
-  },
-];
-
-describe("Chat", () => {
-  it("should render chat messages", () => {
-    renderWithProviders(<Chat messages={MESSAGES} />);
-
-    const messages = screen.getAllByTestId("article");
-    expect(messages).toHaveLength(MESSAGES.length);
-  });
-});
--- a/frontend/tests/components/chat/ChatInput.test.tsx
+++ b/frontend/tests/components/chat/ChatInput.test.tsx
@@ -1,119 +0,0 @@
-import userEvent from "@testing-library/user-event";
-import { render, screen } from "@testing-library/react";
-import { describe, afterEach, vi, it, expect } from "vitest";
-import ChatInput from "#/components/chat/ChatInput";
-
-describe.skip("ChatInput", () => {
-  afterEach(() => {
-    vi.clearAllMocks();
-  });
-
-  const onSendMessage = vi.fn();
-
-  it("should render a textarea", () => {
-    render(<ChatInput onSendMessage={onSendMessage} />);
-    expect(screen.getByRole("textbox")).toBeInTheDocument();
-  });
-
-  it("should be able to be set as disabled", async () => {
-    const user = userEvent.setup();
-    render(<ChatInput disabled onSendMessage={onSendMessage} />);
-
-    const textarea = screen.getByRole("textbox");
-    const button = screen.getByRole("button");
-
-    expect(textarea).not.toBeDisabled(); // user can still type
-    expect(button).toBeDisabled(); // user cannot submit
-
-    await user.type(textarea, "Hello, world!");
-    await user.keyboard("{Enter}");
-
-    expect(onSendMessage).not.toHaveBeenCalled();
-  });
-
-  it("should render with a placeholder", () => {
-    render(<ChatInput onSendMessage={onSendMessage} />);
-
-    const textarea = screen.getByPlaceholderText(
-      /CHAT_INTERFACE\$INPUT_PLACEHOLDER/i,
-    );
-    expect(textarea).toBeInTheDocument();
-  });
-
-  it("should render a send button", () => {
-    render(<ChatInput onSendMessage={onSendMessage} />);
-    expect(screen.getByRole("button")).toBeInTheDocument();
-  });
-
-  it("should call sendChatMessage with the input when the send button is clicked", async () => {
-    const user = userEvent.setup();
-    render(<ChatInput onSendMessage={onSendMessage} />);
-
-    const textarea = screen.getByRole("textbox");
-    const button = screen.getByRole("button");
-
-    await user.type(textarea, "Hello, world!");
-    await user.click(button);
-
-    expect(onSendMessage).toHaveBeenCalledWith("Hello, world!", []);
-    // Additionally, check if it was called exactly once
-    expect(onSendMessage).toHaveBeenCalledTimes(1);
-  });
-
-  it("should be able to send a message when the enter key is pressed", async () => {
-    const user = userEvent.setup();
-    render(<ChatInput onSendMessage={onSendMessage} />);
-    const textarea = screen.getByRole("textbox");
-
-    await user.type(textarea, "Hello, world!");
-    await user.keyboard("{Enter}");
-
-    expect(onSendMessage).toHaveBeenCalledWith("Hello, world!", []);
-  });
-
-  it("should NOT send a message when shift + enter is pressed", async () => {
-    const user = userEvent.setup();
-    render(<ChatInput onSendMessage={onSendMessage} />);
-    const textarea = screen.getByRole("textbox");
-
-    await user.type(textarea, "Hello, world!");
-    await user.keyboard("{Shift>} {Enter}"); // Shift + Enter
-
-    expect(onSendMessage).not.toHaveBeenCalled();
-  });
-
-  it("should NOT send an empty message", async () => {
-    const user = userEvent.setup();
-    render(<ChatInput onSendMessage={onSendMessage} />);
-    const textarea = screen.getByRole("textbox");
-    const button = screen.getByRole("button");
-
-    await user.type(textarea, " ");
-
-    // with enter key
-    await user.keyboard("{Enter}");
-    expect(onSendMessage).not.toHaveBeenCalled();
-
-    // with button click
-    await user.click(button);
-    expect(onSendMessage).not.toHaveBeenCalled();
-  });
-
-  it("should clear the input message after sending a message", async () => {
-    const user = userEvent.setup();
-    render(<ChatInput onSendMessage={onSendMessage} />);
-    const textarea = screen.getByRole("textbox");
-    const button = screen.getByRole("button");
-
-    await user.type(textarea, "Hello, world!");
-    expect(textarea).toHaveValue("Hello, world!");
-
-    await user.click(button);
-    expect(textarea).toHaveValue("");
-  });
-
-  // this is already implemented but need to figure out how to test it
-  it.todo(
-    "should NOT send a message when the enter key is pressed while composing",
-  );
-});
--- a/frontend/tests/components/chat/ChatInterface.test.tsx
+++ b/frontend/tests/components/chat/ChatInterface.test.tsx
@@ -1,148 +0,0 @@
-import { screen, act } from "@testing-library/react";
-import { describe, expect, it, vi } from "vitest";
-import userEvent from "@testing-library/user-event";
-import { renderWithProviders } from "test-utils";
-import { createMemoryRouter, RouterProvider } from "react-router-dom";
-import { addAssistantMessage } from "#/state/chatSlice";
-import AgentState from "#/types/AgentState";
-import ChatInterface from "#/components/chat/ChatInterface";
-
-const router = createMemoryRouter([
-  {
-    path: "/",
-    element: <ChatInterface />,
-  },
-]);
-
-/// <reference types="vitest" />
-
-interface CustomMatchers<R = unknown> {
-  toMatchMessageEvent(expected: string): R;
-}
-
-declare module "vitest" {
-  interface Assertion<T> extends CustomMatchers<T> {}
-  // @ts-expect-error - recursively references itself
-  interface AsymmetricMatchersContaining extends CustomMatchers {}
-}
-
-// This is for the scrollview ref in Chat.tsx
-// TODO: Move this into test setup
-HTMLElement.prototype.scrollTo = vi.fn().mockImplementation(() => {});
-const TEST_TIMESTAMP = new Date().toISOString();
-
-describe.skip("ChatInterface", () => {
-  // TODO: replace below with e.g. fake timers
-  // https://vitest.dev/guide/mocking#timers
-  // https://vitest.dev/api/vi.html#vi-usefaketimers
-  // Custom matcher for testing message events
-  expect.extend({
-    toMatchMessageEvent(received, expected) {
-      const receivedObj = JSON.parse(received);
-      const expectedObj = JSON.parse(expected);
-
-      // Compare everything except the timestamp
-      const { timestamp: receivedTimestamp, ...receivedRest } =
-        receivedObj.args;
-      const { timestamp: expectedTimestamp, ...expectedRest } =
-        expectedObj.args;
-
-      const pass =
-        this.equals(receivedRest, expectedRest) &&
-        typeof receivedTimestamp === "string";
-
-      return {
-        pass,
-        message: () =>
-          pass
-            ? `expected ${received} not to match the structure of ${expected} (ignoring exact timestamp)`
-            : `expected ${received} to match the structure of ${expected} (ignoring exact timestamp)`,
-      };
-    },
-  });
-
-  it("should render empty message list and input", () => {
-    renderWithProviders(<ChatInterface />);
-    expect(screen.queryAllByTestId("article")).toHaveLength(0);
-  });
-
-  it("should render user and assistant messages", () => {
-    const { store } = renderWithProviders(<RouterProvider router={router} />, {
-      preloadedState: {
-        chat: {
-          messages: [
-            {
-              sender: "user",
-              content: "Hello",
-              imageUrls: [],
-              timestamp: TEST_TIMESTAMP,
-            },
-          ],
-        },
-      },
-    });
-
-    expect(screen.getAllByTestId("article")).toHaveLength(1);
-    expect(screen.getByText("Hello")).toBeInTheDocument();
-
-    act(() => {
-      // simulate assistant response
-      store.dispatch(addAssistantMessage("Hello to you!"));
-    });
-
-    expect(screen.getAllByTestId("article")).toHaveLength(2);
-    expect(screen.getByText("Hello to you!")).toBeInTheDocument();
-  });
-
-  it("should send the user message as an event to the Session when the agent state is INIT", async () => {
-    const user = userEvent.setup();
-    renderWithProviders(<RouterProvider router={router} />, {
-      preloadedState: {
-        agent: {
-          curAgentState: AgentState.INIT,
-        },
-      },
-    });
-
-    const input = screen.getByRole("textbox");
-    await user.type(input, "my message");
-    await user.keyboard("{Enter}");
-  });
-
-  it("should send the user message as an event to the Session when the agent state is AWAITING_USER_INPUT", async () => {
-    const user = userEvent.setup();
-    renderWithProviders(<RouterProvider router={router} />, {
-      preloadedState: {
-        agent: {
-          curAgentState: AgentState.AWAITING_USER_INPUT,
-        },
-      },
-    });
-
-    const input = screen.getByRole("textbox");
-    await user.type(input, "my message");
-    await user.keyboard("{Enter}");
-  });
-
-  it("should disable the user input if agent is not initialized", async () => {
-    const user = userEvent.setup();
-    renderWithProviders(<RouterProvider router={router} />, {
-      preloadedState: {
-        agent: {
-          curAgentState: AgentState.LOADING,
-        },
-      },
-    });
-
-    const input = screen.getByRole("textbox");
-    await user.type(input, "my message");
-    await user.keyboard("{Enter}");
-    const submitButton = screen.getByLabelText(
-      "CHAT_INTERFACE$TOOLTIP_SEND_MESSAGE",
-    );
-
-    expect(submitButton).toBeDisabled();
-  });
-
-  it.todo("test scroll-related behaviour");
-});
--- a/frontend/tests/components/chat/ChatMessage.test.tsx
+++ b/frontend/tests/components/chat/ChatMessage.test.tsx
@@ -1,200 +0,0 @@
-import { fireEvent, render, screen, within } from "@testing-library/react";
-import { describe, it, expect, vi } from "vitest";
-import userEvent from "@testing-library/user-event";
-import toast from "#/utils/toast";
-import ChatMessage from "#/components/chat/ChatMessage";
-
-describe("Message", () => {
-  it("should render a user message", () => {
-    render(
-      <ChatMessage
-        message={{
-          sender: "user",
-          content: "Hello",
-          imageUrls: [],
-          timestamp: new Date().toISOString(),
-        }}
-        isLastMessage={false}
-      />,
-    );
-
-    expect(screen.getByTestId("article")).toBeInTheDocument();
-    expect(screen.getByTestId("article")).toHaveClass("self-end"); // user message should be on the right side
-  });
-
-  it("should render an assistant message", () => {
-    render(
-      <ChatMessage
-        message={{
-          sender: "assistant",
-          content: "Hi",
-          imageUrls: [],
-          timestamp: new Date().toISOString(),
-        }}
-        isLastMessage={false}
-      />,
-    );
-
-    expect(screen.getByTestId("article")).toBeInTheDocument();
-    expect(screen.getByTestId("article")).not.toHaveClass("self-end"); // assistant message should be on the left side
-  });
-
-  it("should render markdown content", () => {
-    render(
-      <ChatMessage
-        message={{
-          sender: "user",
-          content: "```js\nconsole.log('Hello')\n```",
-          imageUrls: [],
-          timestamp: new Date().toISOString(),
-        }}
-        isLastMessage={false}
-      />,
-    );
-
-    // SyntaxHighlighter breaks the code blocks into "tokens"
-    expect(screen.getByText("console")).toBeInTheDocument();
-    expect(screen.getByText("log")).toBeInTheDocument();
-    expect(screen.getByText("'Hello'")).toBeInTheDocument();
-  });
-
-  describe("copy to clipboard", () => {
-    const toastInfoSpy = vi.spyOn(toast, "info");
-    const toastErrorSpy = vi.spyOn(toast, "error");
-
-    it("should copy any message to clipboard", async () => {
-      const user = userEvent.setup();
-      render(
-        <ChatMessage
-          message={{
-            sender: "user",
-            content: "Hello",
-            imageUrls: [],
-            timestamp: new Date().toISOString(),
-          }}
-          isLastMessage={false}
-        />,
-      );
-
-      const message = screen.getByTestId("article");
-      let copyButton = within(message).queryByTestId("copy-button");
-      expect(copyButton).not.toBeInTheDocument();
-
-      // I am using `fireEvent` here because `userEvent.hover()` seems to interfere with the
-      // `userEvent.click()` call later on
-      fireEvent.mouseEnter(message);
-
-      copyButton = within(message).getByTestId("copy-button");
-      await user.click(copyButton);
-
-      expect(navigator.clipboard.readText()).resolves.toBe("Hello");
-      expect(toastInfoSpy).toHaveBeenCalled();
-    });
-
-    it("should show an error message when the message cannot be copied", async () => {
-      const user = userEvent.setup();
-      render(
-        <ChatMessage
-          message={{
-            sender: "user",
-            content: "Hello",
-            imageUrls: [],
-            timestamp: new Date().toISOString(),
-          }}
-          isLastMessage={false}
-        />,
-      );
-
-      const message = screen.getByTestId("article");
-      fireEvent.mouseEnter(message);
-
-      const copyButton = within(message).getByTestId("copy-button");
-      const clipboardSpy = vi
-        .spyOn(navigator.clipboard, "writeText")
-        .mockRejectedValue(new Error("Failed to copy"));
-
-      await user.click(copyButton);
-
-      expect(clipboardSpy).toHaveBeenCalled();
-      expect(toastErrorSpy).toHaveBeenCalled();
-    });
-  });
-
-  describe("confirmation buttons", () => {
-    const expectButtonsNotToBeRendered = () => {
-      expect(
-        screen.queryByTestId("action-confirm-button"),
-      ).not.toBeInTheDocument();
-      expect(
-        screen.queryByTestId("action-reject-button"),
-      ).not.toBeInTheDocument();
-    };
-
-    it.skip("should display confirmation buttons for the last assistant message", () => {
-      // it should not render buttons if the message is not the last one
-      const { rerender } = render(
-        <ChatMessage
-          message={{
-            sender: "assistant",
-            content: "Are you sure?",
-            imageUrls: [],
-            timestamp: new Date().toISOString(),
-          }}
-          isLastMessage={false}
-          awaitingUserConfirmation
-        />,
-      );
-      expectButtonsNotToBeRendered();
-
-      // it should not render buttons if the message is not from the assistant
-      rerender(
-        <ChatMessage
-          message={{
-            sender: "user",
-            content: "Yes",
-            imageUrls: [],
-            timestamp: new Date().toISOString(),
-          }}
-          isLastMessage
-          awaitingUserConfirmation
-        />,
-      );
-      expectButtonsNotToBeRendered();
-
-      // it should not render buttons if the message is not awaiting user confirmation
-      rerender(
-        <ChatMessage
-          message={{
-            sender: "assistant",
-            content: "Are you sure?",
-            imageUrls: [],
-            timestamp: new Date().toISOString(),
-          }}
-          isLastMessage
-          awaitingUserConfirmation={false}
-        />,
-      );
-      expectButtonsNotToBeRendered();
-
-      // it should render buttons if all conditions are met
-      rerender(
-        <ChatMessage
-          message={{
-            sender: "assistant",
-            content: "Are you sure?",
-            imageUrls: [],
-            timestamp: new Date().toISOString(),
-          }}
-          isLastMessage
-          awaitingUserConfirmation
-        />,
-      );
-
-      const confirmButton = screen.getByTestId("action-confirm-button");
-      const rejectButton = screen.getByTestId("action-reject-button");
-
-      expect(confirmButton).toBeInTheDocument();
-      expect(rejectButton).toBeInTheDocument();
-    });
-  });
-});
--- a/frontend/tests/components/chat/chat-input.test.tsx
+++ b/frontend/tests/components/chat/chat-input.test.tsx
@@ -0,0 +1,161 @@
+import userEvent from "@testing-library/user-event";
+import { render, screen } from "@testing-library/react";
+import { describe, afterEach, vi, it, expect } from "vitest";
+import { ChatInput } from "#/components/chat-input";
+
+describe("ChatInput", () => {
+  const onSubmitMock = vi.fn();
+
+  afterEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it("should render a textarea", () => {
+    render(<ChatInput onSubmit={onSubmitMock} />);
+    expect(screen.getByTestId("chat-input")).toBeInTheDocument();
+    expect(screen.getByRole("textbox")).toBeInTheDocument();
+  });
+
+  it("should call onSubmit when the user types and presses enter", async () => {
+    const user = userEvent.setup();
+    render(<ChatInput onSubmit={onSubmitMock} />);
+    const textarea = screen.getByRole("textbox");
+
+    await user.type(textarea, "Hello, world!");
+    await user.keyboard("{Enter}");
+
+    expect(onSubmitMock).toHaveBeenCalledWith("Hello, world!");
+  });
+
+  it("should call onSubmit when pressing the submit button", async () => {
+    const user = userEvent.setup();
+    render(<ChatInput onSubmit={onSubmitMock} />);
+    const textarea = screen.getByRole("textbox");
+    const button = screen.getByRole("button");
+
+    await user.type(textarea, "Hello, world!");
+    await user.click(button);
+
+    expect(onSubmitMock).toHaveBeenCalledWith("Hello, world!");
+  });
+
+  it("should not call onSubmit when the message is empty", async () => {
+    const user = userEvent.setup();
+    render(<ChatInput onSubmit={onSubmitMock} />);
+    const button = screen.getByRole("button");
+
+    await user.click(button);
+    expect(onSubmitMock).not.toHaveBeenCalled();
+
+    await user.keyboard("{Enter}");
+    expect(onSubmitMock).not.toHaveBeenCalled();
+  });
+
+  it("should disable submit", async () => {
+    const user = userEvent.setup();
+    render(<ChatInput disabled onSubmit={onSubmitMock} />);
+
+    const button = screen.getByRole("button");
+    const textarea = screen.getByRole("textbox");
+
+    await user.type(textarea, "Hello, world!");
+
+    expect(button).toBeDisabled();
+    await user.click(button);
+    expect(onSubmitMock).not.toHaveBeenCalled();
+
+    await user.keyboard("{Enter}");
+    expect(onSubmitMock).not.toHaveBeenCalled();
+  });
+
+  it("should render a placeholder", () => {
+    render(
+      <ChatInput placeholder="Enter your message" onSubmit={onSubmitMock} />,
+    );
+
+    const textarea = screen.getByPlaceholderText("Enter your message");
+    expect(textarea).toBeInTheDocument();
+  });
+
+  it("should create a newline instead of submitting when shift + enter is pressed", async () => {
+    const user = userEvent.setup();
+    render(<ChatInput onSubmit={onSubmitMock} />);
+    const textarea = screen.getByRole("textbox");
+
+    await user.type(textarea, "Hello, world!");
+    // Descriptors must be adjacent: a space between them is typed into the textarea as a literal character.
+    await user.keyboard("{Shift>}{Enter}{/Shift}"); // hold Shift, press Enter, release Shift
+    expect(onSubmitMock).not.toHaveBeenCalled();
+    // expect(textarea).toHaveValue("Hello, world!\n");
+  });
+
+  it("should clear the input message after sending a message", async () => {
+    const user = userEvent.setup();
+    render(<ChatInput onSubmit={onSubmitMock} />);
+    const textarea = screen.getByRole("textbox");
+    const button = screen.getByRole("button");
+
+    await user.type(textarea, "Hello, world!");
+    await user.keyboard("{Enter}");
+    expect(textarea).toHaveValue("");
+
+    await user.type(textarea, "Hello, world!");
+    await user.click(button);
+    expect(textarea).toHaveValue("");
+  });
+
+  it("should hide the submit button", () => {
+    render(<ChatInput onSubmit={onSubmitMock} showButton={false} />);
+    expect(screen.queryByRole("button")).not.toBeInTheDocument();
+  });
+
+  it("should call onChange when the user types", async () => {
+    const user = userEvent.setup();
+    const onChangeMock = vi.fn();
+    render(<ChatInput onSubmit={onSubmitMock} onChange={onChangeMock} />);
+    const textarea = screen.getByRole("textbox");
+
+    await user.type(textarea, "Hello, world!");
+
+    expect(onChangeMock).toHaveBeenCalledTimes("Hello, world!".length);
+  });
+
+  it("should have set the passed value", () => {
+    render(<ChatInput value="Hello, world!" onSubmit={onSubmitMock} />);
+    const textarea = screen.getByRole("textbox");
+
+    expect(textarea).toHaveValue("Hello, world!");
+  });
+
+  it("should display the stop button and trigger the callback", async () => {
+    const user = userEvent.setup();
+    const onStopMock = vi.fn();
+    render(
+      <ChatInput onSubmit={onSubmitMock} button="stop" onStop={onStopMock} />,
+    );
+    const stopButton = screen.getByTestId("stop-button");
+
+    await user.click(stopButton);
+    expect(onStopMock).toHaveBeenCalledOnce();
+  });
+
+  it("should call onFocus and onBlur when the textarea is focused and blurred", async () => {
+    const user = userEvent.setup();
+    const onFocusMock = vi.fn();
+    const onBlurMock = vi.fn();
+    render(
+      <ChatInput
+        onSubmit={onSubmitMock}
+        onFocus={onFocusMock}
+        onBlur={onBlurMock}
+      />,
+    );
+    const textarea = screen.getByRole("textbox");
+
+    await user.click(textarea);
+    expect(onFocusMock).toHaveBeenCalledOnce();
+
+    await user.tab();
+    expect(onBlurMock).toHaveBeenCalledOnce();
+  });
+});
--- a/frontend/tests/components/chat/chat-interface.test.tsx
+++ b/frontend/tests/components/chat/chat-interface.test.tsx
@@ -0,0 +1,185 @@
+import { afterEach, describe, expect, it, vi } from "vitest";
+import { render, screen, within } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+import { ChatInterface } from "#/components/chat-interface";
+import { SocketProvider } from "#/context/socket";
+
+// eslint-disable-next-line @typescript-eslint/no-unused-vars -- `messages` is accepted but not yet passed to <ChatInterface />
+const renderChatInterface = (messages: (Message | ErrorMessage)[]) =>
+  render(<ChatInterface />, { wrapper: SocketProvider });
+
+describe.skip("ChatInterface", () => {
+  afterEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it.todo("should render suggestions if empty");
+
+  it("should render messages", () => {
+    const messages: Message[] = [
+      {
+        sender: "user",
+        content: "Hello",
+        imageUrls: [],
+        timestamp: new Date().toISOString(),
+      },
+      {
+        sender: "assistant",
+        content: "Hi",
+        imageUrls: [],
+        timestamp: new Date().toISOString(),
+      },
+    ];
+    renderChatInterface(messages);
+
+    expect(screen.getAllByTestId(/-message/)).toHaveLength(2);
+  });
+
+  it("should render a chat input", () => {
+    const messages: Message[] = [];
+    renderChatInterface(messages);
+
+    expect(screen.getByTestId("chat-input")).toBeInTheDocument();
+  });
+
+  it.todo("should call socket send when submitting a message", async () => {
+    const user = userEvent.setup();
+    const messages: Message[] = [];
+    renderChatInterface(messages);
+
+    const input = screen.getByTestId("chat-input");
+    await user.type(input, "Hello");
+    await user.keyboard("{Enter}");
+
+    // spy on send and expect to have been called
+  });
+
+  it("should render an image carousel with a message", () => {
+    let messages: Message[] = [
+      {
+        sender: "assistant",
+        content: "Here are some images",
+        imageUrls: [],
+        timestamp: new Date().toISOString(),
+      },
+    ];
+    const { rerender } = renderChatInterface(messages);
+
+    expect(screen.queryByTestId("image-carousel")).not.toBeInTheDocument();
+
+    messages = [
+      {
+        sender: "assistant",
+        content: "Here are some images",
+        imageUrls: ["image1", "image2"],
+        timestamp: new Date().toISOString(),
+      },
+    ];
+
+    rerender(<ChatInterface />);
+
+    const imageCarousel = screen.getByTestId("image-carousel");
+    expect(imageCarousel).toBeInTheDocument();
+    expect(within(imageCarousel).getAllByTestId("image-preview")).toHaveLength(
+      2,
+    );
+  });
+
+  it.todo("should render confirmation buttons");
+
+  it("should render a 'continue' action when there are more than 2 messages and awaiting user input", () => {
+    const messages: Message[] = [
+      {
+        sender: "assistant",
+        content: "Hello",
+        imageUrls: [],
+        timestamp: new Date().toISOString(),
+      },
+      {
+        sender: "user",
+        content: "Hi",
+        imageUrls: [],
+        timestamp: new Date().toISOString(),
+      },
+    ];
+    const { rerender } = renderChatInterface(messages);
+    expect(
+      screen.queryByTestId("continue-action-button"),
+    ).not.toBeInTheDocument();
+
+    messages.push({
+      sender: "assistant",
+      content: "How can I help you?",
+      imageUrls: [],
+      timestamp: new Date().toISOString(),
+    });
+
+    rerender(<ChatInterface />);
+
+    expect(screen.getByTestId("continue-action-button")).toBeInTheDocument();
+  });
+
+  it("should render inline errors", () => {
+    const messages: (Message | ErrorMessage)[] = [
+      {
+        sender: "assistant",
+        content: "Hello",
+        imageUrls: [],
+        timestamp: new Date().toISOString(),
+      },
+      {
+        error: "Woops!",
+        message: "Something went wrong",
+      },
+    ];
+    renderChatInterface(messages);
+
+    const error = screen.getByTestId("error-message");
+    expect(within(error).getByText("Woops!")).toBeInTheDocument();
+    expect(within(error).getByText("Something went wrong")).toBeInTheDocument();
+  });
+
+  it("should render feedback actions if there are more than 3 messages", () => {
+    const messages: Message[] = [
+      {
+        sender: "assistant",
+        content: "Hello",
+        imageUrls: [],
+        timestamp: new Date().toISOString(),
+      },
+      {
+        sender: "user",
+        content: "Hi",
+        imageUrls: [],
+        timestamp: new Date().toISOString(),
+      },
+      {
+        sender: "assistant",
+        content: "How can I help you?",
+        imageUrls: [],
+        timestamp: new Date().toISOString(),
+      },
+    ];
+    const { rerender } = renderChatInterface(messages);
+    expect(screen.queryByTestId("feedback-actions")).not.toBeInTheDocument();
+
+    messages.push({
+      sender: "user",
+      content: "I need help",
+      imageUrls: [],
+      timestamp: new Date().toISOString(),
+    });
+
+    rerender(<ChatInterface />);
+
+    expect(screen.getByTestId("feedback-actions")).toBeInTheDocument();
+  });
+
+  describe("feedback", () => {
+    it.todo("should open the feedback modal when a feedback action is clicked");
+    it.todo(
+      "should submit feedback and hide the actions when feedback is shared",
+    );
+    it.todo("should render the actions once more after new messages are added");
+  });
+});
--- a/frontend/tests/components/context-menu/account-settings-context-menu.test.tsx
+++ b/frontend/tests/components/context-menu/account-settings-context-menu.test.tsx
@@ -0,0 +1,99 @@
+import { render, screen } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+import { afterEach, describe, expect, it, test, vi } from "vitest";
+import { AccountSettingsContextMenu } from "#/components/context-menu/account-settings-context-menu";
+
+describe("AccountSettingsContextMenu", () => {
+  const handleAccountSettings = vi.fn();
+  const handleLogout = vi.fn();
+  const handleClose = vi.fn();
+  const user = userEvent.setup();
+
+  afterEach(() => {
+    handleAccountSettings.mockClear();
+    handleLogout.mockClear();
+    handleClose.mockClear();
+  });
+
+  it("should always render the right options", () => {
+    render(
+      <AccountSettingsContextMenu
+        onClickAccountSettings={handleAccountSettings}
+        onLogout={handleLogout}
+        onClose={handleClose}
+        isLoggedIn
+      />,
+    );
+    // The menu container and both of its entries must be present.
+    expect(
+      screen.getByTestId("account-settings-context-menu"),
+    ).toBeInTheDocument();
+    expect(screen.getByText("Account Settings")).toBeInTheDocument();
+    expect(screen.getByText("Logout")).toBeInTheDocument();
+  });
+
+  it("should call onClickAccountSettings when the account settings option is clicked", async () => {
+    render(
+      <AccountSettingsContextMenu
+        onClickAccountSettings={handleAccountSettings}
+        onLogout={handleLogout}
+        onClose={handleClose}
+        isLoggedIn
+      />,
+    );
+    // Click the "Account Settings" entry via the shared user session.
+    const settingsEntry = screen.getByText("Account Settings");
+    await user.click(settingsEntry);
+
+    expect(handleAccountSettings).toHaveBeenCalledOnce();
+  });
+
+  it("should call onLogout when the logout option is clicked", async () => {
+    render(
+      <AccountSettingsContextMenu
+        onClickAccountSettings={handleAccountSettings}
+        onLogout={handleLogout}
+        onClose={handleClose}
+        isLoggedIn
+      />,
+    );
+    // Click the "Logout" entry while logged in.
+    const logoutEntry = screen.getByText("Logout");
+    await user.click(logoutEntry);
+
+    expect(handleLogout).toHaveBeenCalledOnce();
+  });
+
+  test("onLogout should be disabled if the user is not logged in", async () => {
+    render(
+      <AccountSettingsContextMenu
+        onClickAccountSettings={handleAccountSettings}
+        onLogout={handleLogout}
+        onClose={handleClose}
+        isLoggedIn={false}
+      />,
+    );
+    // Same click as above, but rendered with isLoggedIn={false}.
+    const logoutEntry = screen.getByText("Logout");
+    await user.click(logoutEntry);
+
+    expect(handleLogout).not.toHaveBeenCalled();
+  });
+
+  it("should call onClose when clicking outside of the element", async () => {
+    render(
+      <AccountSettingsContextMenu
+        onClickAccountSettings={handleAccountSettings}
+        onLogout={handleLogout}
+        onClose={handleClose}
+        isLoggedIn
+      />,
+    );
+    // Click inside the menu first, then on document.body, which lies outside it.
+    const settingsEntry = screen.getByText("Account Settings");
+    await user.click(settingsEntry);
+    await user.click(document.body);
+
+    expect(handleClose).toHaveBeenCalledOnce();
+  });
+});
--- a/frontend/tests/components/context-menu/context-menu-list-item.test.tsx
+++ b/frontend/tests/components/context-menu/context-menu-list-item.test.tsx
@@ -0,0 +1,41 @@
+import { describe, it, expect, vi } from "vitest";
+import { render, screen } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+import { ContextMenuListItem } from "#/components/context-menu/context-menu-list-item";
+
+describe("ContextMenuListItem", () => {
+  it("should render the component with the children", () => {
+    render(<ContextMenuListItem onClick={vi.fn}>Test</ContextMenuListItem>);
+    // Both the item wrapper and its child text must be present.
+    expect(screen.getByTestId("context-menu-list-item")).toBeInTheDocument();
+    expect(screen.getByText("Test")).toBeInTheDocument();
+  });
+
+  it("should call the onClick callback when clicked", async () => {
+    const handleClick = vi.fn();
+    const user = userEvent.setup();
+    render(
+      <ContextMenuListItem onClick={handleClick}>Test</ContextMenuListItem>,
+    );
+    // Click the rendered item through a simulated user session.
+    const listItem = screen.getByTestId("context-menu-list-item");
+    await user.click(listItem);
+
+    expect(handleClick).toHaveBeenCalledOnce();
+  });
+
+  it("should not call the onClick callback when clicked and the button is disabled", async () => {
+    const handleClick = vi.fn();
+    const user = userEvent.setup();
+    render(
+      <ContextMenuListItem onClick={handleClick} isDisabled>
+        Test
+      </ContextMenuListItem>,
+    );
+    // Same click as above, but the item is rendered with isDisabled.
+    const listItem = screen.getByTestId("context-menu-list-item");
+    await user.click(listItem);
+
+    expect(handleClick).not.toHaveBeenCalled();
+  });
+});
--- a/frontend/tests/components/feedback-actions.test.tsx
+++ b/frontend/tests/components/feedback-actions.test.tsx
@@ -0,0 +1,55 @@
+import { render, screen, within } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+import { afterEach, describe, expect, it, vi } from "vitest";
+import { FeedbackActions } from "#/components/feedback-actions";
+
+describe("FeedbackActions", () => {
+  const handlePositive = vi.fn();
+  const handleNegative = vi.fn();
+  const user = userEvent.setup();
+
+  afterEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it("should render correctly", () => {
+    render(
+      <FeedbackActions
+        onPositiveFeedback={handlePositive}
+        onNegativeFeedback={handleNegative}
+      />,
+    );
+    // getByTestId throws when nothing matches, so each lookup doubles as an assertion.
+    const scoped = within(screen.getByTestId("feedback-actions"));
+    scoped.getByTestId("positive-feedback");
+    scoped.getByTestId("negative-feedback");
+  });
+
+  it("should call onPositiveFeedback when positive feedback is clicked", async () => {
+    render(
+      <FeedbackActions
+        onPositiveFeedback={handlePositive}
+        onNegativeFeedback={handleNegative}
+      />,
+    );
+    // Click the positive control through the shared user session.
+    const positiveControl = screen.getByTestId("positive-feedback");
+    await user.click(positiveControl);
+
+    expect(handlePositive).toHaveBeenCalled();
+  });
+
+  it("should call onNegativeFeedback when negative feedback is clicked", async () => {
+    render(
+      <FeedbackActions
+        onPositiveFeedback={handlePositive}
+        onNegativeFeedback={handleNegative}
+      />,
+    );
+    // Click the negative control through the shared user session.
+    const negativeControl = screen.getByTestId("negative-feedback");
+    await user.click(negativeControl);
+
+    expect(handleNegative).toHaveBeenCalled();
+  });
+});
--- a/frontend/tests/components/feedback-form.test.tsx
+++ b/frontend/tests/components/feedback-form.test.tsx
@@ -0,0 +1,108 @@
+import { render, screen } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+import { afterEach, describe, expect, it, vi } from "vitest";
+import { FeedbackForm } from "#/components/feedback-form";
+
+describe("FeedbackForm", () => {
+  const handleSubmit = vi.fn();
+  const handleClose = vi.fn();
+  const user = userEvent.setup();
+
+  afterEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it("should render correctly", () => {
+    render(<FeedbackForm onSubmit={handleSubmit} onClose={handleClose} />);
+    // Each getBy* query throws if its target is missing, so the lookups are the assertions.
+    screen.getByLabelText("Email");
+    screen.getByLabelText("Private");
+    screen.getByLabelText("Public");
+
+    screen.getByRole("button", { name: "Submit" });
+    screen.getByRole("button", { name: "Cancel" });
+  });
+
+  it("should switch between private and public permissions", async () => {
+    render(<FeedbackForm onSubmit={handleSubmit} onClose={handleClose} />);
+    const privateOption = screen.getByLabelText("Private");
+    const publicOption = screen.getByLabelText("Public");
+
+    expect(privateOption).toBeChecked(); // the form starts on "private"
+    expect(publicOption).not.toBeChecked();
+
+    await user.click(publicOption);
+    expect(publicOption).toBeChecked();
+    expect(privateOption).not.toBeChecked();
+
+    await user.click(privateOption);
+    expect(privateOption).toBeChecked();
+    expect(publicOption).not.toBeChecked();
+  });
+
+  it("should call onSubmit when the form is submitted", async () => {
+    render(<FeedbackForm onSubmit={handleSubmit} onClose={handleClose} />);
+    const emailField = screen.getByLabelText("Email");
+
+    await user.type(emailField, "test@test.test");
+    await user.click(screen.getByRole("button", { name: "Submit" }));
+
+    expect(handleSubmit).toHaveBeenCalledWith("private", "test@test.test"); // permissions left at the default
+  });
+
+  it("should not call onSubmit when the email is invalid", async () => {
+    render(<FeedbackForm onSubmit={handleSubmit} onClose={handleClose} />);
+    const emailField = screen.getByLabelText("Email");
+    const submit = screen.getByRole("button", { name: "Submit" });
+    // First attempt: the email field is still empty.
+    await user.click(submit);
+
+    expect(handleSubmit).not.toHaveBeenCalled();
+    // Second attempt: the field holds a value that is not an email address.
+    await user.type(emailField, "test");
+    await user.click(submit);
+
+    expect(handleSubmit).not.toHaveBeenCalled();
+  });
+
+  it("should submit public permissions when the public radio is checked", async () => {
+    render(<FeedbackForm onSubmit={handleSubmit} onClose={handleClose} />);
+    const emailField = screen.getByLabelText("Email");
+    const publicOption = screen.getByLabelText("Public");
+
+    await user.type(emailField, "test@test.test");
+    await user.click(publicOption);
+    await user.click(screen.getByRole("button", { name: "Submit" }));
+
+    expect(handleSubmit).toHaveBeenCalledWith("public", "test@test.test");
+  });
+
+  it("should call onClose when the close button is clicked", async () => {
+    render(<FeedbackForm onSubmit={handleSubmit} onClose={handleClose} />);
+    await user.click(screen.getByRole("button", { name: "Cancel" }));
+    // Cancelling must invoke onClose without submitting anything.
+    expect(handleSubmit).not.toHaveBeenCalled();
+    expect(handleClose).toHaveBeenCalled();
+  });
+
+  it("should disable the buttons if isSubmitting is true", () => {
+    const view = render(
+      <FeedbackForm onSubmit={handleSubmit} onClose={handleClose} />,
+    );
+    const submit = screen.getByRole("button", { name: "Submit" });
+    const cancel = screen.getByRole("button", { name: "Cancel" });
+
+    expect(submit).not.toBeDisabled();
+    expect(cancel).not.toBeDisabled();
+    // Re-render with isSubmitting set; the earlier button references are checked again.
+    view.rerender(
+      <FeedbackForm
+        onSubmit={handleSubmit}
+        onClose={handleClose}
+        isSubmitting
+      />,
+    );
+    expect(submit).toBeDisabled();
+    expect(cancel).toBeDisabled();
+  });
+});
--- a/frontend/tests/components/file-explorer/FileExplorer.test.tsx
+++ b/frontend/tests/components/file-explorer/FileExplorer.test.tsx
@@ -16,7 +16,7 @@ vi.mock("../../services/fileService", async () => ({
 }));

 const renderFileExplorerWithRunningAgentState = () =>
-  renderWithProviders(<FileExplorer />, {
+  renderWithProviders(<FileExplorer error={null} />, {
    preloadedState: {
      agent: {
        curAgentState: AgentState.RUNNING,
--- a/frontend/tests/components/image-preview.test.tsx
+++ b/frontend/tests/components/image-preview.test.tsx
@@ -0,0 +1,37 @@
+import { render, screen } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+import { describe, expect, it, vi } from "vitest";
+import { ImagePreview } from "#/components/image-preview";
+
+describe("ImagePreview", () => {
+  it("should render an image", () => {
+    render(
+      <ImagePreview src="https://example.com/image.jpg" onRemove={vi.fn} />,
+    );
+    const img = screen.getByRole("img");
+
+    expect(screen.getByTestId("image-preview")).toBeInTheDocument();
+    expect(img).toHaveAttribute("src", "https://example.com/image.jpg");
+  });
+
+  it("should call onRemove when the close button is clicked", async () => {
+    const user = userEvent.setup();
+    const onRemoveMock = vi.fn();
+    render(
+      <ImagePreview
+        src="https://example.com/image.jpg"
+        onRemove={onRemoveMock}
+      />,
+    );
+
+    const closeButton = screen.getByRole("button");
+    await user.click(closeButton);
+
+    expect(onRemoveMock).toHaveBeenCalledOnce();
+  });
+
+  it("should not display the close button when onRemove is not provided", () => {
+    render(<ImagePreview src="https://example.com/image.jpg" />);
+    expect(screen.queryByRole("button")).not.toBeInTheDocument(); // queryByRole yields null rather than throwing
+  });
+});
--- a/frontend/tests/components/interactive-chat-box.test.tsx
+++ b/frontend/tests/components/interactive-chat-box.test.tsx
@@ -0,0 +1,119 @@
+import { render, screen, within } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
+import { InteractiveChatBox } from "#/components/interactive-chat-box";
+
+describe("InteractiveChatBox", () => {
+  const onSubmitMock = vi.fn();
+  const onStopMock = vi.fn();
+
+  beforeAll(() => {
+    global.URL.createObjectURL = vi
+      .fn()
+      .mockReturnValue("blob:http://example.com");
+  });
+
+  afterEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it("should render", () => {
+    render(<InteractiveChatBox onSubmit={onSubmitMock} onStop={onStopMock} />);
+
+    const chatBox = screen.getByTestId("interactive-chat-box");
+    within(chatBox).getByTestId("chat-input");
+    within(chatBox).getByTestId("upload-image-input");
+  });
+
+  it("should display the image previews when images are uploaded", async () => {
+    const user = userEvent.setup();
+    render(<InteractiveChatBox onSubmit={onSubmitMock} onStop={onStopMock} />);
+
+    const file = new File(["(⌐□_□)"], "chucknorris.png", { type: "image/png" });
+    const input = screen.getByTestId("upload-image-input");
+
+    expect(screen.queryAllByTestId("image-preview")).toHaveLength(0);
+
+    await user.upload(input, file);
+    expect(screen.queryAllByTestId("image-preview")).toHaveLength(1);
+
+    const files = [
+      new File(["(⌐□_□)"], "chucknorris2.png", { type: "image/png" }),
+      new File(["(⌐□_□)"], "chucknorris3.png", { type: "image/png" }),
+    ];
+
+    await user.upload(input, files);
+    expect(screen.queryAllByTestId("image-preview")).toHaveLength(3);
+  });
+
+  it("should remove the image preview when the close button is clicked", async () => {
+    const user = userEvent.setup();
+    render(<InteractiveChatBox onSubmit={onSubmitMock} onStop={onStopMock} />);
+
+    const file = new File(["(⌐□_□)"], "chucknorris.png", { type: "image/png" });
+    const input = screen.getByTestId("upload-image-input");
+
+    await user.upload(input, file);
+    expect(screen.queryAllByTestId("image-preview")).toHaveLength(1);
+
+    const imagePreview = screen.getByTestId("image-preview");
+    const closeButton = within(imagePreview).getByRole("button");
+    await user.click(closeButton);
+
+    expect(screen.queryAllByTestId("image-preview")).toHaveLength(0);
+  });
+
+  it("should call onSubmit with the message and images", async () => {
+    const user = userEvent.setup();
+    render(<InteractiveChatBox onSubmit={onSubmitMock} onStop={onStopMock} />);
+
+    const textarea = within(screen.getByTestId("chat-input")).getByRole(
+      "textbox",
+    );
+    const input = screen.getByTestId("upload-image-input");
+    const file = new File(["(⌐□_□)"], "chucknorris.png", { type: "image/png" });
+
+    await user.upload(input, file);
+    await user.type(textarea, "Hello, world!");
+    await user.keyboard("{Enter}");
+
+    expect(onSubmitMock).toHaveBeenCalledWith("Hello, world!", [file]);
+
+    // clear images after submission
+    expect(screen.queryAllByTestId("image-preview")).toHaveLength(0);
+  });
+
+  it("should disable the submit button", async () => {
+    const user = userEvent.setup();
+    render(
+      <InteractiveChatBox
+        isDisabled
+        onSubmit={onSubmitMock}
+        onStop={onStopMock}
+      />,
+    );
+
+    const button = screen.getByRole("button");
+    expect(button).toBeDisabled();
+
+    await user.click(button);
+    expect(onSubmitMock).not.toHaveBeenCalled();
+  });
+
+  it("should display the stop button if set and call onStop when clicked", async () => {
+    const user = userEvent.setup();
+    render(
+      <InteractiveChatBox
+        mode="stop"
+        onSubmit={onSubmitMock}
+        onStop={onStopMock}
+      />,
+    );
+
+    const stopButton = screen.getByTestId("stop-button");
+    expect(stopButton).toBeInTheDocument();
+
+    await user.click(stopButton);
+    expect(onStopMock).toHaveBeenCalledOnce();
+  });
+});
--- a/frontend/tests/components/modals/feeback/FeedbackModal.test.tsx
+++ b/frontend/tests/components/modals/feeback/FeedbackModal.test.tsx
@@ -1,193 +0,0 @@
-import { render, screen, within } from "@testing-library/react";
-import { Mock, afterEach, describe, expect, it, vi } from "vitest";
-import userEvent from "@testing-library/user-event";
-import toast from "react-hot-toast";
-import FeedbackModal from "#/components/modals/feedback/FeedbackModal";
-import OpenHands from "#/api/open-hands";
-
-describe.skip("FeedbackModal", () => {
-  Storage.prototype.setItem = vi.fn();
-  Storage.prototype.getItem = vi.fn();
-
-  vi.mock("#/services/feedbackService", () => ({
-    sendFeedback: vi.fn(),
-  }));
-
-  vi.mock("#/services/auth", () => ({
-    getToken: vi.fn().mockReturnValue("some-token"),
-  }));
-  // mock Session class
-  vi.mock("#/services/session", () => ({
-    default: {
-      _history: [
-        { args: { LLM_API_KEY: "DANGER-key-should-not-be-here" } },
-        { content: "Hello" },
-      ],
-    },
-  }));
-
-  afterEach(() => {
-    vi.clearAllMocks();
-  });
-
-  it("should render the feedback model when open", () => {
-    const { rerender } = render(
-      <FeedbackModal
-        polarity="positive"
-        isOpen={false}
-        onOpenChange={vi.fn}
-        onSendFeedback={vi.fn}
-      />,
-    );
-    expect(screen.queryByTestId("feedback-modal")).not.toBeInTheDocument();
-
-    rerender(
-      <FeedbackModal
-        polarity="positive"
-        isOpen
-        onOpenChange={vi.fn}
-        onSendFeedback={vi.fn}
-      />,
-    );
-    expect(screen.getByTestId("feedback-modal")).toBeInTheDocument();
-  });
-
-  it("should display an error if the email is invalid when submitting", async () => {
-    const user = userEvent.setup();
-    render(
-      <FeedbackModal
-        polarity="positive"
-        isOpen
-        onOpenChange={vi.fn}
-        onSendFeedback={vi.fn}
-      />,
-    );
-
-    const submitButton = screen.getByRole("button", {
-      name: "FEEDBACK$SHARE_LABEL",
-    });
-
-    await user.click(submitButton);
-
-    expect(screen.getByTestId("invalid-email-message")).toBeInTheDocument();
-    expect(OpenHands.sendFeedback).not.toHaveBeenCalled();
-  });
-
-  it("should call sendFeedback with the correct data when the share button is clicked", async () => {
-    const user = userEvent.setup();
-    render(
-      <FeedbackModal
-        polarity="negative"
-        isOpen
-        onOpenChange={vi.fn}
-        onSendFeedback={vi.fn}
-      />,
-    );
-
-    const submitButton = screen.getByRole("button", {
-      name: "FEEDBACK$SHARE_LABEL",
-    });
-
-    const email = "example@example.com";
-    const emailInput = screen.getByTestId("email-input");
-    await user.type(emailInput, email);
-
-    // select public
-    const permissionsGroup = screen.getByTestId("permissions-group");
-    const publicOption = within(permissionsGroup).getByRole("radio", {
-      name: "FEEDBACK$PUBLIC_LABEL",
-    });
-    expect(publicOption).not.toBeChecked();
-    await user.click(publicOption);
-    expect(publicOption).toBeChecked();
-
-    await user.click(submitButton);
-
-    expect(
-      screen.queryByTestId("invalid-email-message"),
-    ).not.toBeInTheDocument();
-
-    expect(OpenHands.sendFeedback).toHaveBeenCalledWith({
-      email,
-      permissions: "public",
-      feedback: "negative",
-      trajectory: [{ args: {} }, { content: "Hello" }], // api key should be removed
-      token: "some-token",
-      version: "1.0",
-    });
-  });
-
-  it("should store the users email in local state for later use", async () => {
-    const email = "example@example.com";
-
-    const user = userEvent.setup();
-    const { rerender } = render(
-      <FeedbackModal
-        polarity="negative"
-        isOpen
-        onOpenChange={vi.fn}
-        onSendFeedback={vi.fn}
-      />,
-    );
-
-    expect(localStorage.getItem).toHaveBeenCalledWith("feedback-email");
-    const emailInput = screen.getByTestId("email-input");
-    expect(emailInput).toHaveValue("");
-
-    await user.type(emailInput, email);
-    expect(emailInput).toHaveValue(email);
-
-    const submitButton = screen.getByRole("button", {
-      name: "FEEDBACK$SHARE_LABEL",
-    });
-    await user.click(submitButton);
-
-    expect(localStorage.setItem).toHaveBeenCalledWith("feedback-email", email);
-
-    rerender(
-      <FeedbackModal
-        polarity="positive"
-        isOpen
-        onOpenChange={vi.fn}
-        onSendFeedback={vi.fn}
-      />,
-    );
-
-    const emailInputAfterClose = screen.getByTestId("email-input");
-    expect(emailInputAfterClose).toHaveValue(email);
-  });
-
-  // TODO: figure out how to properly mock toast
-  it.skip("should display a success toast when the feedback is shared successfully", async () => {
-    (OpenHands.sendFeedback as Mock).mockResolvedValue({
-      statusCode: 200,
-      body: {
-        message: "Feedback shared",
-        feedback_id: "some-id",
-        password: "some-password",
-      },
-    });
-
-    const user = userEvent.setup();
-    render(
-      <FeedbackModal
-        polarity="negative"
-        isOpen
-        onOpenChange={vi.fn}
-        onSendFeedback={vi.fn}
-      />,
-    );
-
-    const submitButton = screen.getByRole("button", {
-      name: "FEEDBACK$SHARE_LABEL",
-    });
-
-    const email = "example@example.com";
-    const emailInput = screen.getByTestId("email-input");
-    await user.type(emailInput, email);
-
-    await user.click(submitButton);
-
-    expect(toast).toHaveBeenCalled();
-  });
-});
--- a/frontend/tests/components/settings/ai-config-form.test.tsx
+++ b/frontend/tests/components/settings/ai-config-form.test.tsx
@@ -0,0 +1,9 @@
+import { describe, it } from "vitest";
+
+describe("AIConfigForm", () => {
+  it.todo("should render the AI config form");
+  it.todo("should toggle the advanced settings when clicked");
+  it.todo("should call the onSubmit callback when the form is submitted");
+  it.todo("should call the onReset callback when the reset button is clicked");
+  it.todo("should call the onClose callback when the close button is clicked");
+});
--- a/frontend/tests/components/settings/dropdown-input.test.tsx
+++ b/frontend/tests/components/settings/dropdown-input.test.tsx
@@ -0,0 +1,9 @@
+import { describe, it } from "vitest";
+
+describe("DropdownInput", () => {
+  it.todo("should render the input");
+  it.todo("should render the placeholder");
+  it.todo("should render the dropdown when clicked");
+  it.todo("should select an option when clicked");
+  it.todo("should filter the options when typing");
+});
--- a/frontend/tests/components/settings/model-selector.test.tsx
+++ b/frontend/tests/components/settings/model-selector.test.tsx
@@ -0,0 +1,12 @@
+import { describe, it } from "vitest";
+
+describe("ModelSelector", () => {
+  it.todo("should render the model selector");
+  it.todo("should display and select the providers");
+  it.todo("should display and select the models");
+  it.todo("should disable the models if a provider is not selected");
+  it.todo("should disable the inputs if isDisabled is true");
+  it.todo(
+    "should set the selected model and provider if the currentModel prop is set",
+  );
+});
--- a/frontend/tests/components/upload-image-input.test.tsx
+++ b/frontend/tests/components/upload-image-input.test.tsx
@@ -0,0 +1,71 @@
+import { render, screen } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+import { afterEach, describe, expect, it, vi } from "vitest";
+import { UploadImageInput } from "#/components/upload-image-input";
+
+describe("UploadImageInput", () => {
+  const user = userEvent.setup();
+  const onUploadMock = vi.fn();
+
+  afterEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it("should render an input", () => {
+    render(<UploadImageInput onUpload={onUploadMock} />);
+    expect(screen.getByTestId("upload-image-input")).toBeInTheDocument();
+  });
+
+  it("should call onUpload when a file is selected", async () => {
+    render(<UploadImageInput onUpload={onUploadMock} />);
+
+    const file = new File(["(⌐□_□)"], "chucknorris.png", { type: "image/png" });
+    const input = screen.getByTestId("upload-image-input");
+
+    await user.upload(input, file);
+
+    expect(onUploadMock).toHaveBeenNthCalledWith(1, [file]);
+  });
+
+  it("should call onUpload when multiple files are selected", async () => {
+    render(<UploadImageInput onUpload={onUploadMock} />);
+
+    const files = [
+      new File(["(⌐□_□)"], "chucknorris.png", { type: "image/png" }),
+      new File(["(⌐□_□)"], "chucknorris2.png", { type: "image/png" }),
+    ];
+    const input = screen.getByTestId("upload-image-input");
+
+    await user.upload(input, files);
+
+    expect(onUploadMock).toHaveBeenNthCalledWith(1, files);
+  });
+
+  it("should not upload any file that is not an image", async () => {
+    render(<UploadImageInput onUpload={onUploadMock} />);
+
+    const file = new File(["(⌐□_□)"], "chucknorris.txt", {
+      type: "text/plain",
+    });
+    const input = screen.getByTestId("upload-image-input");
+
+    await user.upload(input, file);
+
+    expect(onUploadMock).not.toHaveBeenCalled();
+  });
+
+  it("should render custom labels", () => {
+    const { rerender } = render(<UploadImageInput onUpload={onUploadMock} />);
+    expect(screen.getByTestId("default-label")).toBeInTheDocument();
+
+    function CustomLabel() {
+      return <span>Custom label</span>;
+    }
+    rerender(
+      <UploadImageInput onUpload={onUploadMock} label={<CustomLabel />} />,
+    );
+
+    expect(screen.getByText("Custom label")).toBeInTheDocument();
+    expect(screen.queryByTestId("default-label")).not.toBeInTheDocument();
+  });
+});
--- a/frontend/tests/components/user-actions.test.tsx
+++ b/frontend/tests/components/user-actions.test.tsx
@@ -0,0 +1,132 @@
+import { render, screen } from "@testing-library/react";
+import { describe, expect, it, test, vi, afterEach } from "vitest";
+import userEvent from "@testing-library/user-event";
+import * as Remix from "@remix-run/react";
+import { UserActions } from "#/components/user-actions";
+
+describe("UserActions", () => {
+  const user = userEvent.setup();
+  const onClickAccountSettingsMock = vi.fn();
+  const onLogoutMock = vi.fn();
+
+  const useFetcherSpy = vi.spyOn(Remix, "useFetcher");
+  // @ts-expect-error - Only returning the relevant properties for the test
+  useFetcherSpy.mockReturnValue({ state: "idle" });
+
+  afterEach(() => {
+    onClickAccountSettingsMock.mockClear();
+    onLogoutMock.mockClear();
+    useFetcherSpy.mockClear();
+  });
+
+  it("should render", () => {
+    render(
+      <UserActions
+        onClickAccountSettings={onClickAccountSettingsMock}
+        onLogout={onLogoutMock}
+      />,
+    );
+
+    expect(screen.getByTestId("user-actions")).toBeInTheDocument();
+    expect(screen.getByTestId("user-avatar")).toBeInTheDocument();
+  });
+
+  it("should toggle the user menu when the user avatar is clicked", async () => {
+    render(
+      <UserActions
+        onClickAccountSettings={onClickAccountSettingsMock}
+        onLogout={onLogoutMock}
+      />,
+    );
+
+    const userAvatar = screen.getByTestId("user-avatar");
+    await user.click(userAvatar);
+
+    expect(
+      screen.getByTestId("account-settings-context-menu"),
+    ).toBeInTheDocument();
+
+    await user.click(userAvatar);
+
+    expect(
+      screen.queryByTestId("account-settings-context-menu"),
+    ).not.toBeInTheDocument();
+  });
+
+  it("should call onClickAccountSettings and close the menu when the account settings option is clicked", async () => {
+    render(
+      <UserActions
+        onClickAccountSettings={onClickAccountSettingsMock}
+        onLogout={onLogoutMock}
+      />,
+    );
+
+    const userAvatar = screen.getByTestId("user-avatar");
+    await user.click(userAvatar);
+
+    const accountSettingsOption = screen.getByText("Account Settings");
+    await user.click(accountSettingsOption);
+
+    expect(onClickAccountSettingsMock).toHaveBeenCalledOnce();
+    expect(
+      screen.queryByTestId("account-settings-context-menu"),
+    ).not.toBeInTheDocument();
+  });
+
+  it("should call onLogout and close the menu when the logout option is clicked", async () => {
+    render(
+      <UserActions
+        onClickAccountSettings={onClickAccountSettingsMock}
+        onLogout={onLogoutMock}
+        user={{ avatar_url: "https://example.com/avatar.png" }}
+      />,
+    );
+
+    const userAvatar = screen.getByTestId("user-avatar");
+    await user.click(userAvatar);
+
+    const logoutOption = screen.getByText("Logout");
+    await user.click(logoutOption);
+
+    expect(onLogoutMock).toHaveBeenCalledOnce();
+    expect(
+      screen.queryByTestId("account-settings-context-menu"),
+    ).not.toBeInTheDocument();
+  });
+
+  test("onLogout should not be called when the user is not logged in", async () => {
+    render(
+      <UserActions
+        onClickAccountSettings={onClickAccountSettingsMock}
+        onLogout={onLogoutMock}
+      />,
+    );
+
+    const userAvatar = screen.getByTestId("user-avatar");
+    await user.click(userAvatar);
+
+    const logoutOption = screen.getByText("Logout");
+    await user.click(logoutOption);
+
+    expect(onLogoutMock).not.toHaveBeenCalled();
+  });
+
+  it("should display the loading spinner", async () => {
+    // @ts-expect-error - Only returning the relevant properties for the test
+    useFetcherSpy.mockReturnValue({ state: "loading" });
+
+    render(
+      <UserActions
+        onClickAccountSettings={onClickAccountSettingsMock}
+        onLogout={onLogoutMock}
+        user={{ avatar_url: "https://example.com/avatar.png" }}
+      />,
+    );
+
+    const userAvatar = screen.getByTestId("user-avatar");
+    await user.click(userAvatar);
+
+    expect(screen.getByTestId("loading-spinner")).toBeInTheDocument();
+    expect(screen.queryByAltText("user avatar")).not.toBeInTheDocument();
+  });
+});
--- a/frontend/tests/components/user-avatar.test.tsx
+++ b/frontend/tests/components/user-avatar.test.tsx
@@ -0,0 +1,68 @@
+import { render, screen } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+import { afterEach, describe, expect, it, vi } from "vitest";
+import { UserAvatar } from "#/components/user-avatar";
+
+describe("UserAvatar", () => {
+  const onClickMock = vi.fn();
+
+  afterEach(() => {
+    onClickMock.mockClear();
+  });
+
+  it("(default) should render the placeholder avatar when the user is logged out", () => {
+    render(<UserAvatar onClick={onClickMock} />);
+    expect(screen.getByTestId("user-avatar")).toBeInTheDocument();
+    expect(
+      screen.getByLabelText("user avatar placeholder"),
+    ).toBeInTheDocument();
+  });
+
+  it("should call onClick when clicked", async () => {
+    const user = userEvent.setup();
+    render(<UserAvatar onClick={onClickMock} />);
+
+    const userAvatarContainer = screen.getByTestId("user-avatar");
+    await user.click(userAvatarContainer);
+
+    expect(onClickMock).toHaveBeenCalledOnce();
+  });
+
+  it("should display the user's avatar when available", () => {
+    render(
+      <UserAvatar
+        onClick={onClickMock}
+        avatarUrl="https://example.com/avatar.png"
+      />,
+    );
+
+    expect(screen.getByAltText("user avatar")).toBeInTheDocument();
+    expect(
+      screen.queryByLabelText("user avatar placeholder"),
+    ).not.toBeInTheDocument();
+  });
+
+  it("should display a loading spinner instead of an avatar when isLoading is true", () => {
+    const { rerender } = render(<UserAvatar onClick={onClickMock} />);
+    expect(screen.queryByTestId("loading-spinner")).not.toBeInTheDocument();
+    expect(
+      screen.getByLabelText("user avatar placeholder"),
+    ).toBeInTheDocument();
+
+    rerender(<UserAvatar onClick={onClickMock} isLoading />);
+    expect(screen.getByTestId("loading-spinner")).toBeInTheDocument();
+    expect(
+      screen.queryByLabelText("user avatar placeholder"),
+    ).not.toBeInTheDocument();
+
+    rerender(
+      <UserAvatar
+        onClick={onClickMock}
+        avatarUrl="https://example.com/avatar.png"
+        isLoading
+      />,
+    );
+    expect(screen.getByTestId("loading-spinner")).toBeInTheDocument();
+    expect(screen.queryByAltText("user avatar")).not.toBeInTheDocument();
+  });
+});
--- a/frontend/tests/hooks/use-click-outside-element.test.tsx
+++ b/frontend/tests/hooks/use-click-outside-element.test.tsx
@@ -0,0 +1,36 @@
+import { render, screen } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+import { expect, test, vi } from "vitest";
+import { useClickOutsideElement } from "#/hooks/useClickOutsideElement";
+
+interface ClickOutsideTestComponentProps {
+  callback: () => void;
+}
+
+function ClickOutsideTestComponent({
+  callback,
+}: ClickOutsideTestComponentProps) {
+  const ref = useClickOutsideElement<HTMLDivElement>(callback);
+
+  return (
+    <div>
+      <div data-testid="inside-element" ref={ref} />
+      <div data-testid="outside-element" />
+    </div>
+  );
+}
+
+test("call the callback when the element is clicked outside", async () => {
+  const user = userEvent.setup();
+  const callback = vi.fn();
+  render(<ClickOutsideTestComponent callback={callback} />);
+
+  const insideElement = screen.getByTestId("inside-element");
+  const outsideElement = screen.getByTestId("outside-element");
+
+  await user.click(insideElement);
+  expect(callback).not.toHaveBeenCalled();
+
+  await user.click(outsideElement);
+  expect(callback).toHaveBeenCalled();
+});
--- a/frontend/tests/routes/_oh.test.tsx
+++ b/frontend/tests/routes/_oh.test.tsx
@@ -0,0 +1,35 @@
+import { describe, it, test } from "vitest";
+
+describe("frontend/routes/_oh", () => {
+  describe("brand logo", () => {
+    it.todo("should not do anything if the user is in the main screen");
+    it.todo(
+      "should be clickable and redirect to the main screen if the user is not in the main screen",
+    );
+  });
+
+  describe("user menu", () => {
+    it.todo("should open the user menu when clicked");
+
+    describe("logged out", () => {
+      it.todo("should display a placeholder");
+      test.todo("the logout option in the user menu should be disabled");
+    });
+
+    describe("logged in", () => {
+      it.todo("should display the user's avatar");
+      it.todo("should log the user out when the logout option is clicked");
+    });
+  });
+
+  describe("config", () => {
+    it.todo("should open the config modal when clicked");
+    it.todo(
+      "should not save the config and close the config modal when the close button is clicked",
+    );
+    it.todo(
+      "should save the config when the save button is clicked and close the modal",
+    );
+    it.todo("should warn the user about saving the config when in /app");
+  });
+});
--- a/frontend/tests/routes/app._index.test.tsx
+++ b/frontend/tests/routes/app._index.test.tsx
@@ -1,43 +0,0 @@
-import { createRemixStub } from "@remix-run/testing";
-import { describe, expect, it } from "vitest";
-import { screen, within } from "@testing-library/react";
-import { renderWithProviders } from "test-utils";
-import userEvent from "@testing-library/user-event";
-import CodeEditor from "#/routes/app._index/route";
-
-const RemixStub = createRemixStub([{ path: "/app", Component: CodeEditor }]);
-
-describe.skip("CodeEditor", () => {
-  it("should render", async () => {
-    renderWithProviders(<RemixStub initialEntries={["/app"]} />);
-    await screen.findByTestId("file-explorer");
-    expect(screen.getByTestId("code-editor-empty-message")).toBeInTheDocument();
-  });
-
-  it("should retrieve the files", async () => {
-    renderWithProviders(<RemixStub initialEntries={["/app"]} />);
-    const explorer = await screen.findByTestId("file-explorer");
-
-    const files = within(explorer).getAllByTestId("tree-node");
-    // request mocked with msw
-    expect(files).toHaveLength(3);
-  });
-
-  it("should open a file", async () => {
-    const user = userEvent.setup();
-    renderWithProviders(<RemixStub initialEntries={["/app"]} />);
-    const explorer = await screen.findByTestId("file-explorer");
-
-    const files = within(explorer).getAllByTestId("tree-node");
-    await user.click(files[0]);
-
-    // check if the file is opened
-    expect(
-      screen.queryByTestId("code-editor-empty-message"),
-    ).not.toBeInTheDocument();
-    const editor = await screen.findByTestId("code-editor");
-    expect(
-      within(editor).getByText(/content of file1.ts/i),
-    ).toBeInTheDocument();
-  });
-});
--- a/frontend/tests/routes/app.test.tsx
+++ b/frontend/tests/routes/app.test.tsx
@@ -1,56 +0,0 @@
-import { createRemixStub } from "@remix-run/testing";
-import { beforeAll, describe, expect, it, vi } from "vitest";
-import { render, screen, waitFor } from "@testing-library/react";
-import { ws } from "msw";
-import { setupServer } from "msw/node";
-import App from "#/routes/app";
-import AgentState from "#/types/AgentState";
-import { AgentStateChangeObservation } from "#/types/core/observations";
-
-const RemixStub = createRemixStub([{ path: "/app", Component: App }]);
-
-describe.skip("App", () => {
-  const agent = ws.link("ws://localhost:3001/ws");
-  const server = setupServer();
-
-  beforeAll(() => {
-    // mock `dom.scrollTo`
-    HTMLElement.prototype.scrollTo = vi.fn().mockImplementation(() => {});
-  });
-
-  it("should render", async () => {
-    render(<RemixStub initialEntries={["/app"]} />);
-
-    await waitFor(() => {
-      expect(screen.getByTestId("app")).toBeInTheDocument();
-      expect(
-        screen.getByText(/INITIALIZING_AGENT_LOADING_MESSAGE/i),
-      ).toBeInTheDocument();
-    });
-  });
-
-  it("should establish a ws connection and send the init message", async () => {
-    server.use(
-      agent.addEventListener("connection", ({ client }) => {
-        client.send(
-          JSON.stringify({
-            id: 1,
-            cause: 0,
-            message: "AGENT_INIT_MESSAGE",
-            source: "agent",
-            timestamp: new Date().toISOString(),
-            observation: "agent_state_changed",
-            content: "AGENT_INIT_MESSAGE",
-            extras: { agent_state: AgentState.INIT },
-          } satisfies AgentStateChangeObservation),
-        );
-      }),
-    );
-
-    render(<RemixStub initialEntries={["/app"]} />);
-
-    await waitFor(() => {
-      expect(screen.getByText(/AGENT_INIT_MESSAGE/i)).toBeInTheDocument();
-    });
-  });
-});
--- a/frontend/tests/routes/home.test.tsx
+++ b/frontend/tests/routes/home.test.tsx
@@ -1,50 +0,0 @@
-import { createRemixStub } from "@remix-run/testing";
-import { describe, expect, it } from "vitest";
-import { render, screen } from "@testing-library/react";
-import userEvent from "@testing-library/user-event";
-import Home from "#/routes/_index/route";
-
-const renderRemixStub = (config?: { authenticated: boolean }) =>
-  createRemixStub([
-    {
-      path: "/",
-      Component: Home,
-      loader: () => ({
-        ghToken: config?.authenticated ? "ghp_123456" : null,
-      }),
-    },
-  ]);
-
-describe.skip("Home (_index)", () => {
-  it("should render", async () => {
-    const RemixStub = renderRemixStub();
-    render(<RemixStub />);
-    await screen.findByText(/let's start building/i);
-  });
-
-  it("should load the gh repos if a token is present", async () => {
-    const user = userEvent.setup();
-    const RemixStub = renderRemixStub({ authenticated: true });
-    render(<RemixStub />);
-
-    const repos = await screen.findByPlaceholderText(
-      /select a github project/i,
-    );
-    await user.click(repos);
-    // mocked responses from msw
-    screen.getByText(/octocat\/hello-world/i);
-    screen.getByText(/octocat\/earth/i);
-  });
-
-  it("should not load the gh repos if a token is not present", async () => {
-    const RemixStub = renderRemixStub();
-    render(<RemixStub />);
-
-    const repos = await screen.findByPlaceholderText(
-      /select a github project/i,
-    );
-    await userEvent.click(repos);
-    expect(screen.queryByText(/octocat\/hello-world/i)).not.toBeInTheDocument();
-    expect(screen.queryByText(/octocat\/earth/i)).not.toBeInTheDocument();
-  });
-});
--- a/frontend/tests/routes/root.test.tsx
+++ b/frontend/tests/routes/root.test.tsx
@@ -1,40 +0,0 @@
-import { describe, expect, it } from "vitest";
-import { createRemixStub } from "@remix-run/testing";
-import { render, screen } from "@testing-library/react";
-import userEvent from "@testing-library/user-event";
-import App, { clientLoader } from "#/root";
-
-const RemixStub = createRemixStub([
-  {
-    path: "/",
-    Component: App,
-    loader: clientLoader,
-  },
-]);
-
-describe.skip("Root", () => {
-  it("should render", async () => {
-    render(<RemixStub />);
-    await screen.findByTestId("link-to-main");
-  });
-
-  describe("Auth Modal", () => {
-    it("should display the auth modal on first time visit", async () => {
-      render(<RemixStub />);
-      await screen.findByTestId("auth-modal");
-    });
-
-    it("should close the auth modal on accepting the terms", async () => {
-      const user = userEvent.setup();
-      render(<RemixStub />);
-      await screen.findByTestId("auth-modal");
-      await user.click(screen.getByTestId("accept-terms"));
-      await user.click(screen.getByRole("button", { name: /continue/i }));
-
-      expect(screen.queryByTestId("auth-modal")).not.toBeInTheDocument();
-      expect(screen.getByTestId("link-to-main")).toBeInTheDocument();
-    });
-
-    it.todo("should not display the auth modal on subsequent visits");
-  });
-});
--- a/frontend/global.d.ts
+++ b/frontend/global.d.ts
@@ -0,0 +1,4 @@
+interface Window {
+  __APP_MODE__?: "saas" | "oss";
+  __GITHUB_CLIENT_ID__?: string | null;
+}
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -1,12 +1,12 @@
 {
  "name": "openhands-frontend",
-  "version": "0.9.8",
+  "version": "0.11.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "openhands-frontend",
-      "version": "0.9.8",
+      "version": "0.11.0",
      "dependencies": {
        "@monaco-editor/react": "^4.6.0",
        "@nextui-org/react": "^2.4.8",
@@ -35,7 +35,8 @@
        "react-markdown": "^9.0.1",
        "react-redux": "^9.1.2",
        "react-router-dom": "^6.26.1",
-        "react-syntax-highlighter": "^15.5.0",
+        "react-syntax-highlighter": "^15.6.1",
+        "react-textarea-autosize": "^8.5.4",
        "remark-gfm": "^4.0.0",
        "sirv-cli": "^3.0.0",
        "tailwind-merge": "^2.5.4",
@@ -47,10 +48,10 @@
        "@remix-run/dev": "^2.11.2",
        "@remix-run/testing": "^2.11.2",
        "@tailwindcss/typography": "^0.5.15",
-        "@testing-library/jest-dom": "^6.5.0",
+        "@testing-library/jest-dom": "^6.6.1",
        "@testing-library/react": "^16.0.1",
        "@testing-library/user-event": "^14.5.2",
-        "@types/node": "^22.7.5",
+        "@types/node": "^22.7.6",
        "@types/react": "^18.3.11",
        "@types/react-dom": "^18.3.0",
        "@types/react-highlight": "^0.12.8",
@@ -1601,9 +1602,9 @@
      }
    },
    "node_modules/@jspm/core": {
-      "version": "2.0.1",
-      "resolved": "https://registry.npmjs.org/@jspm/core/-/core-2.0.1.tgz",
-      "integrity": "sha512-Lg3PnLp0QXpxwLIAuuJboLeRaIhrgJjeuh797QADg3xz8wGLugQOS5DpsE8A6i6Adgzf+bacllkKZG3J0tGfDw==",
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/@jspm/core/-/core-2.1.0.tgz",
+      "integrity": "sha512-3sRl+pkyFY/kLmHl0cgHiFp2xEqErA8N3ECjMs7serSUBmoJ70lBa0PG5t0IM6WJgdZNyyI0R8YFfi5wM8+mzg==",
      "dev": true
    },
    "node_modules/@mdx-js/mdx": {
@@ -3560,15 +3561,15 @@
      }
    },
    "node_modules/@react-aria/grid": {
-      "version": "3.10.4",
-      "resolved": "https://registry.npmjs.org/@react-aria/grid/-/grid-3.10.4.tgz",
-      "integrity": "sha512-3AjJ0hwRhOCIHThIZrGWrjAuKDpaZuBkODW3dvgLqtsNm3tL46DI6U9O3vfp8lNbrWMsXJgjRXwvXvdv0/gwCA==",
+      "version": "3.10.5",
+      "resolved": "https://registry.npmjs.org/@react-aria/grid/-/grid-3.10.5.tgz",
+      "integrity": "sha512-9sLa+rpLgRZk7VX+tvdSudn1tdVgolVzhDLGWd95yS4UtPVMihTMGBrRoByY57Wxvh1V+7Ptw8kc6tsRSotYKg==",
      "dependencies": {
-        "@react-aria/focus": "^3.18.3",
+        "@react-aria/focus": "^3.18.4",
        "@react-aria/i18n": "^3.12.3",
-        "@react-aria/interactions": "^3.22.3",
+        "@react-aria/interactions": "^3.22.4",
        "@react-aria/live-announcer": "^3.4.0",
-        "@react-aria/selection": "^3.20.0",
+        "@react-aria/selection": "^3.20.1",
        "@react-aria/utils": "^3.25.3",
        "@react-stately/collections": "^3.11.0",
        "@react-stately/grid": "^3.9.3",
@@ -3584,11 +3585,11 @@
      }
    },
    "node_modules/@react-aria/grid/node_modules/@react-aria/focus": {
-      "version": "3.18.3",
-      "resolved": "https://registry.npmjs.org/@react-aria/focus/-/focus-3.18.3.tgz",
-      "integrity": "sha512-WKUElg+5zS0D3xlVn8MntNnkzJql2J6MuzAMP8Sv5WTgFDse/XGR842dsxPTIyKKdrWVCRegCuwa4m3n/GzgJw==",
+      "version": "3.18.4",
+      "resolved": "https://registry.npmjs.org/@react-aria/focus/-/focus-3.18.4.tgz",
+      "integrity": "sha512-91J35077w9UNaMK1cpMUEFRkNNz0uZjnSwiyBCFuRdaVuivO53wNC9XtWSDNDdcO5cGy87vfJRVAiyoCn/mjqA==",
      "dependencies": {
-        "@react-aria/interactions": "^3.22.3",
+        "@react-aria/interactions": "^3.22.4",
        "@react-aria/utils": "^3.25.3",
        "@react-types/shared": "^3.25.0",
        "@swc/helpers": "^0.5.0",
@@ -3617,9 +3618,9 @@
      }
    },
    "node_modules/@react-aria/grid/node_modules/@react-aria/interactions": {
-      "version": "3.22.3",
-      "resolved": "https://registry.npmjs.org/@react-aria/interactions/-/interactions-3.22.3.tgz",
-      "integrity": "sha512-RRUb/aG+P0IKTIWikY/SylB6bIbLZeztnZY2vbe7RAG5MgVaCgn5HQ45SI15GlTmhsFG8CnF6slJsUFJiNHpbQ==",
+      "version": "3.22.4",
+      "resolved": "https://registry.npmjs.org/@react-aria/interactions/-/interactions-3.22.4.tgz",
+      "integrity": "sha512-E0vsgtpItmknq/MJELqYJwib+YN18Qag8nroqwjk1qOnBa9ROIkUhWJerLi1qs5diXq9LHKehZDXRlwPvdEFww==",
      "dependencies": {
        "@react-aria/ssr": "^3.9.6",
        "@react-aria/utils": "^3.25.3",
@@ -3631,13 +3632,13 @@
      }
    },
    "node_modules/@react-aria/grid/node_modules/@react-aria/selection": {
-      "version": "3.20.0",
-      "resolved": "https://registry.npmjs.org/@react-aria/selection/-/selection-3.20.0.tgz",
-      "integrity": "sha512-h3giMcXo4SMZRL5HrqZvOLNTsdh5jCXwLUx0wpj/2EF0tcYQL6WDfn1iJ+rHARkUIs7X70fUV8iwlbUySZy1xg==",
+      "version": "3.20.1",
+      "resolved": "https://registry.npmjs.org/@react-aria/selection/-/selection-3.20.1.tgz",
+      "integrity": "sha512-My0w8UC/7PAkz/1yZUjr2VRuzDZz1RrbgTqP36j5hsJx8RczDTjI4TmKtQNKG0ggaP4w83G2Og5JPTq3w3LMAw==",
      "dependencies": {
-        "@react-aria/focus": "^3.18.3",
+        "@react-aria/focus": "^3.18.4",
        "@react-aria/i18n": "^3.12.3",
-        "@react-aria/interactions": "^3.22.3",
+        "@react-aria/interactions": "^3.22.4",
        "@react-aria/utils": "^3.25.3",
        "@react-stately/selection": "^3.17.0",
        "@react-types/shared": "^3.25.0",
@@ -4110,12 +4111,12 @@
      }
    },
    "node_modules/@react-aria/toggle": {
-      "version": "3.10.8",
-      "resolved": "https://registry.npmjs.org/@react-aria/toggle/-/toggle-3.10.8.tgz",
-      "integrity": "sha512-N6WTgE8ByMYY+ZygUUPGON2vW5NrxwU91H98+Nozl+Rq6ZYR2fD9i8oRtLtrYPxjU2HmaFwDyQdWvmMJZuDxig==",
+      "version": "3.10.9",
+      "resolved": "https://registry.npmjs.org/@react-aria/toggle/-/toggle-3.10.9.tgz",
+      "integrity": "sha512-dtfnyIU2/kcH9rFAiB48diSmaXDv45K7UCuTkMQLjbQa3QHC1oYNbleVN/VdGyAMBsIWtfl8L4uuPrAQmDV/bg==",
      "dependencies": {
-        "@react-aria/focus": "^3.18.3",
-        "@react-aria/interactions": "^3.22.3",
+        "@react-aria/focus": "^3.18.4",
+        "@react-aria/interactions": "^3.22.4",
        "@react-aria/utils": "^3.25.3",
        "@react-stately/toggle": "^3.7.8",
        "@react-types/checkbox": "^3.8.4",
@@ -4127,11 +4128,11 @@
      }
    },
    "node_modules/@react-aria/toggle/node_modules/@react-aria/focus": {
-      "version": "3.18.3",
-      "resolved": "https://registry.npmjs.org/@react-aria/focus/-/focus-3.18.3.tgz",
-      "integrity": "sha512-WKUElg+5zS0D3xlVn8MntNnkzJql2J6MuzAMP8Sv5WTgFDse/XGR842dsxPTIyKKdrWVCRegCuwa4m3n/GzgJw==",
+      "version": "3.18.4",
+      "resolved": "https://registry.npmjs.org/@react-aria/focus/-/focus-3.18.4.tgz",
+      "integrity": "sha512-91J35077w9UNaMK1cpMUEFRkNNz0uZjnSwiyBCFuRdaVuivO53wNC9XtWSDNDdcO5cGy87vfJRVAiyoCn/mjqA==",
      "dependencies": {
-        "@react-aria/interactions": "^3.22.3",
+        "@react-aria/interactions": "^3.22.4",
        "@react-aria/utils": "^3.25.3",
        "@react-types/shared": "^3.25.0",
        "@swc/helpers": "^0.5.0",
@@ -4142,9 +4143,9 @@
      }
    },
    "node_modules/@react-aria/toggle/node_modules/@react-aria/interactions": {
-      "version": "3.22.3",
-      "resolved": "https://registry.npmjs.org/@react-aria/interactions/-/interactions-3.22.3.tgz",
-      "integrity": "sha512-RRUb/aG+P0IKTIWikY/SylB6bIbLZeztnZY2vbe7RAG5MgVaCgn5HQ45SI15GlTmhsFG8CnF6slJsUFJiNHpbQ==",
+      "version": "3.22.4",
+      "resolved": "https://registry.npmjs.org/@react-aria/interactions/-/interactions-3.22.4.tgz",
+      "integrity": "sha512-E0vsgtpItmknq/MJELqYJwib+YN18Qag8nroqwjk1qOnBa9ROIkUhWJerLi1qs5diXq9LHKehZDXRlwPvdEFww==",
      "dependencies": {
        "@react-aria/ssr": "^3.9.6",
        "@react-aria/utils": "^3.25.3",
@@ -5814,9 +5815,9 @@
      }
    },
    "node_modules/@testing-library/jest-dom": {
-      "version": "6.5.0",
-      "resolved": "https://registry.npmjs.org/@testing-library/jest-dom/-/jest-dom-6.5.0.tgz",
-      "integrity": "sha512-xGGHpBXYSHUUr6XsKBfs85TWlYKpTc37cSBBVrXcib2MkHLboWlkClhWF37JKlDb9KEq3dHs+f2xR7XJEWGBxA==",
+      "version": "6.6.1",
+      "resolved": "https://registry.npmjs.org/@testing-library/jest-dom/-/jest-dom-6.6.1.tgz",
+      "integrity": "sha512-mNYIiAuP4yJwV2zBRQCV7PHoQwbb6/8TfMpPcwSUzcSVDJHWOXt6hjNtIN1v5knDmimYnjJxKhsoVd4LVGIO+w==",
      "dev": true,
      "dependencies": {
        "@adobe/css-tools": "^4.4.0",
@@ -6028,9 +6029,9 @@
      }
    },
    "node_modules/@types/node": {
-      "version": "22.7.5",
-      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.7.5.tgz",
-      "integrity": "sha512-jML7s2NAzMWc//QSJ1a3prpk78cOPchGvXJsC3C6R6PSMoooztvRVQEz89gmBTBY1SPMaqo5teB4uNHPdetShQ==",
+      "version": "22.7.6",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.7.6.tgz",
+      "integrity": "sha512-/d7Rnj0/ExXDMcioS78/kf1lMzYk4BZV8MZGTBKzTGZ6/406ukkbYlIsZmMPhcR5KlkunDHQLrtAVmSq7r+mSw==",
      "devOptional": true,
      "dependencies": {
        "undici-types": "~6.19.2"
@@ -6619,9 +6620,9 @@
      }
    },
    "node_modules/acorn": {
-      "version": "8.12.1",
-      "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.12.1.tgz",
-      "integrity": "sha512-tcpGyI9zbizT9JbV6oYE477V6mTlXvvi0T0G3SNIYE2apm/G5huBa1+K89VGeovbg+jycCrfhl3ADxErOuO6Jg==",
+      "version": "8.13.0",
+      "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.13.0.tgz",
+      "integrity": "sha512-8zSiw54Oxrdym50NlZ9sUusyO1Z1ZchgRLWRaK6c86XJFClyCgFKetdowBg5bKxyp/u+CDBJG4Mpp0m3HLZl9w==",
      "dev": true,
      "bin": {
        "acorn": "bin/acorn"
@@ -7325,9 +7326,9 @@
      }
    },
    "node_modules/caniuse-lite": {
-      "version": "1.0.30001668",
-      "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001668.tgz",
-      "integrity": "sha512-nWLrdxqCdblixUO+27JtGJJE/txpJlyUy5YN1u53wLZkP0emYCo5zgS6QYft7VUYR42LGgi/S5hdLZTrnyIddw==",
+      "version": "1.0.30001669",
+      "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001669.tgz",
+      "integrity": "sha512-DlWzFDJqstqtIVx1zeSpIMLjunf5SmwOw0N2Ck/QSQdS8PLS4+9HrLaYei4w8BIAL7IB/UEDu889d8vhCTPA0w==",
      "funding": [
        {
          "type": "opencollective",
@@ -8396,9 +8397,9 @@
      "integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow=="
    },
    "node_modules/electron-to-chromium": {
-      "version": "1.5.36",
-      "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.36.tgz",
-      "integrity": "sha512-HYTX8tKge/VNp6FGO+f/uVDmUkq+cEfcxYhKf15Akc4M5yxt5YmorwlAitKWjWhWQnKcDRBAQKXkhqqXMqcrjw=="
+      "version": "1.5.39",
+      "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.39.tgz",
+      "integrity": "sha512-4xkpSR6CjuiaNyvwiWDI85N9AxsvbPawB8xc7yzLPonYTuP19BVgYweKyUMFtHEZgIcHWMt1ks5Cqx2m+6/Grg=="
    },
    "node_modules/emoji-regex": {
      "version": "9.2.2",
@@ -9840,9 +9841,9 @@
      }
    },
    "node_modules/framer-motion": {
-      "version": "11.11.8",
-      "resolved": "https://registry.npmjs.org/framer-motion/-/framer-motion-11.11.8.tgz",
-      "integrity": "sha512-mnGQNEoz99GtFXBBPw+Ag5K4FcfP5XrXxrxHz+iE4Lmg7W3sf2gKmGuvfkZCW/yIfcdv5vJd6KiSPETH1Pw68Q==",
+      "version": "11.11.9",
+      "resolved": "https://registry.npmjs.org/framer-motion/-/framer-motion-11.11.9.tgz",
+      "integrity": "sha512-XpdZseuCrZehdHGuW22zZt3SF5g6AHJHJi7JwQIigOznW4Jg1n0oGPMJQheMaKLC+0rp5gxUKMRYI6ytd3q4RQ==",
      "peer": true,
      "dependencies": {
        "tslib": "^2.4.0"
@@ -10321,9 +10322,9 @@
      }
    },
    "node_modules/hast-util-to-jsx-runtime": {
-      "version": "2.3.1",
-      "resolved": "https://registry.npmjs.org/hast-util-to-jsx-runtime/-/hast-util-to-jsx-runtime-2.3.1.tgz",
-      "integrity": "sha512-Rbemi1rzrkysSin0FDHZfsxYPoqLGHFfxFm28aOBHPibT7aqjy7kUgY636se9xbuCWUsFpWAYlmtGHQakiqtEA==",
+      "version": "2.3.2",
+      "resolved": "https://registry.npmjs.org/hast-util-to-jsx-runtime/-/hast-util-to-jsx-runtime-2.3.2.tgz",
+      "integrity": "sha512-1ngXYb+V9UT5h+PxNRa1O1FYguZK/XL+gkeqvp7EdHlB9oHUG0eYRo/vY5inBdcqo3RkPMC58/H94HvkbfGdyg==",
      "dependencies": {
        "@types/estree": "^1.0.0",
        "@types/hast": "^3.0.0",
@@ -11043,6 +11044,11 @@
        "node": "*"
      }
    },
+    "node_modules/highlightjs-vue": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/highlightjs-vue/-/highlightjs-vue-1.0.0.tgz",
+      "integrity": "sha512-PDEfEF102G23vHmPhLyPboFCD+BkMGu+GuJe2d9/eH4FsCwvgBpnc9n0pGE+ffKdph38s6foEZiEjdgHdzp+IA=="
+    },
    "node_modules/hosted-git-info": {
      "version": "6.1.1",
      "resolved": "https://registry.npmjs.org/hosted-git-info/-/hosted-git-info-6.1.1.tgz",
@@ -20734,12 +20740,13 @@
      }
    },
    "node_modules/react-syntax-highlighter": {
-      "version": "15.5.0",
-      "resolved": "https://registry.npmjs.org/react-syntax-highlighter/-/react-syntax-highlighter-15.5.0.tgz",
-      "integrity": "sha512-+zq2myprEnQmH5yw6Gqc8lD55QHnpKaU8TOcFeC/Lg/MQSs8UknEA0JC4nTZGFAXC2J2Hyj/ijJ7NlabyPi2gg==",
+      "version": "15.6.1",
+      "resolved": "https://registry.npmjs.org/react-syntax-highlighter/-/react-syntax-highlighter-15.6.1.tgz",
+      "integrity": "sha512-OqJ2/vL7lEeV5zTJyG7kmARppUjiB9h9udl4qHQjjgEos66z00Ia0OckwYfRxCSFrW8RJIBnsBwQsHZbVPspqg==",
      "dependencies": {
        "@babel/runtime": "^7.3.1",
        "highlight.js": "^10.4.1",
+        "highlightjs-vue": "^1.0.0",
        "lowlight": "^1.17.0",
        "prismjs": "^1.27.0",
        "refractor": "^3.6.0"
@@ -22713,13 +22720,17 @@
      }
    },
    "node_modules/string.prototype.includes": {
-      "version": "2.0.0",
-      "resolved": "https://registry.npmjs.org/string.prototype.includes/-/string.prototype.includes-2.0.0.tgz",
-      "integrity": "sha512-E34CkBgyeqNDcrbU76cDjL5JLcVrtSdYq0MEh/B10r17pRP4ciHLwTgnuLV8Ay6cgEMLkcBkFCKyFZ43YldYzg==",
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/string.prototype.includes/-/string.prototype.includes-2.0.1.tgz",
+      "integrity": "sha512-o7+c9bW6zpAdJHTtujeePODAhkuicdAryFsfVKwA+wGw89wJ4GTY484WTucM9hLtDEOpOvI+aHnzqnC5lHp4Rg==",
      "dev": true,
      "dependencies": {
-        "define-properties": "^1.1.3",
-        "es-abstract": "^1.17.5"
+        "call-bind": "^1.0.7",
+        "define-properties": "^1.2.1",
+        "es-abstract": "^1.23.3"
+      },
+      "engines": {
+        "node": ">= 0.4"
      }
    },
    "node_modules/string.prototype.matchall": {
@@ -23499,9 +23510,9 @@
      }
    },
    "node_modules/tslib": {
-      "version": "2.7.0",
-      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz",
-      "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA=="
+      "version": "2.8.0",
+      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.0.tgz",
+      "integrity": "sha512-jWVzBLplnCmoaTr13V9dYbiQ99wvZRd0vNWaDRg+aVYRcjDF3nDksxFDE/+fkXnKhpnUUkmx5pK/v8mCtLVqZA=="
    },
    "node_modules/turbo-stream": {
      "version": "2.4.0",
@@ -23661,9 +23672,9 @@
      }
    },
    "node_modules/undici": {
-      "version": "6.20.0",
-      "resolved": "https://registry.npmjs.org/undici/-/undici-6.20.0.tgz",
-      "integrity": "sha512-AITZfPuxubm31Sx0vr8bteSalEbs9wQb/BOBi9FPlD9Qpd6HxZ4Q0+hI742jBhkPb4RT2v5MQzaW5VhRVyj+9A==",
+      "version": "6.20.1",
+      "resolved": "https://registry.npmjs.org/undici/-/undici-6.20.1.tgz",
+      "integrity": "sha512-AjQF1QsmqfJys+LXfGTNum+qw4S88CojRInG/6t31W/1fk6G59s92bnAvGz5Cmur+kQv2SURXEvvudLmbrE8QA==",
      "engines": {
        "node": ">=18.17"
      }
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
 {
  "name": "openhands-frontend",
-  "version": "0.9.8",
+  "version": "0.11.0",
  "private": true,
  "type": "module",
  "engines": {
@@ -34,7 +34,8 @@
    "react-markdown": "^9.0.1",
    "react-redux": "^9.1.2",
    "react-router-dom": "^6.26.1",
-    "react-syntax-highlighter": "^15.5.0",
+    "react-syntax-highlighter": "^15.6.1",
+    "react-textarea-autosize": "^8.5.4",
    "remark-gfm": "^4.0.0",
    "sirv-cli": "^3.0.0",
    "tailwind-merge": "^2.5.4",
@@ -46,7 +47,7 @@
    "dev": "npm run make-i18n && VITE_MOCK_API=false remix vite:dev",
    "dev:mock": "npm run make-i18n && VITE_MOCK_API=true remix vite:dev",
    "build": "npm run make-i18n && tsc && remix vite:build",
-    "start": "npx sirv-cli build/client/ --single",
+    "start": "npx sirv-cli build/ --single",
    "test": "vitest run",
    "test:coverage": "npm run make-i18n && vitest run --coverage",
    "dev_wsl": "VITE_WATCH_USE_POLLING=true vite",
@@ -72,10 +73,10 @@
    "@remix-run/dev": "^2.11.2",
    "@remix-run/testing": "^2.11.2",
    "@tailwindcss/typography": "^0.5.15",
-    "@testing-library/jest-dom": "^6.5.0",
+    "@testing-library/jest-dom": "^6.6.1",
    "@testing-library/react": "^16.0.1",
    "@testing-library/user-event": "^14.5.2",
-    "@types/node": "^22.7.5",
+    "@types/node": "^22.7.6",
    "@types/react": "^18.3.11",
    "@types/react-dom": "^18.3.0",
    "@types/react-highlight": "^0.12.8",
--- a/frontend/public/config.json
+++ b/frontend/public/config.json
@@ -0,0 +1,4 @@
+{
+  "APP_MODE": "oss",
+  "GITHUB_CLIENT_ID": ""
+}
--- a/frontend/public/waitlist.json
+++ b/frontend/public/waitlist.json
@@ -1,3 +0,0 @@
-{
-  "users": []
-}
--- a/frontend/src/api/open-hands.ts
+++ b/frontend/src/api/open-hands.ts
@@ -6,6 +6,7 @@ import {
  FeedbackResponse,
  GitHubAccessTokenResponse,
  ErrorResponse,
+  GetConfigResponse,
 } from "./open-hands.types";

 /**
@@ -60,6 +61,15 @@ class OpenHands {
    return response.json();
  }

+  /** Fetch the app config JSON (APP_MODE, GITHUB_CLIENT_ID); resolves to the parsed response body. */
+  static async getConfig(): Promise<GetConfigResponse> {
+    // Root-relative: a bare "config.json" would resolve against the current route on nested paths.
+    const response = await fetch("/config.json", {
+      headers: { "Cache-Control": "no-cache" },
+    });
+    return response.json();
+  }
+
  /**
   * Retrieve the list of files available in the workspace
   * @param token User token provided by the server
@@ -71,7 +81,9 @@ class OpenHands {
    if (path) url.searchParams.append("path", path);

    const response = await fetch(url.toString(), {
-      headers: OpenHands.generateHeaders(token),
+      headers: {
+        Authorization: `Bearer ${token}`,
+      },
    });

    return response.json();
@@ -87,7 +99,9 @@ class OpenHands {
    const url = new URL(`${OpenHands.BASE_URL}/api/select-file`);
    url.searchParams.append("file", path);
    const response = await fetch(url.toString(), {
-      headers: OpenHands.generateHeaders(token),
+      headers: {
+        Authorization: `Bearer ${token}`,
+      },
    });

    const data = await response.json();
@@ -109,7 +123,10 @@ class OpenHands {
    const response = await fetch(`${OpenHands.BASE_URL}/api/save-file`, {
      method: "POST",
      body: JSON.stringify({ filePath: path, content }),
-      headers: OpenHands.generateHeaders(token),
+      headers: {
+        Authorization: `Bearer ${token}`,
+        "Content-Type": "application/json",
+      },
    });

    return response.json();
@@ -130,8 +147,10 @@ class OpenHands {

    const response = await fetch(`${OpenHands.BASE_URL}/api/upload-files`, {
      method: "POST",
-      headers: OpenHands.generateHeaders(token),
      body: formData,
+      headers: {
+        Authorization: `Bearer ${token}`,
+      },
    });

    return response.json();
@@ -144,8 +163,11 @@ class OpenHands {
   */
  static async getWorkspaceZip(token: string): Promise<Blob> {
    const response = await fetch(`${OpenHands.BASE_URL}/api/zip-directory`, {
-      headers: OpenHands.generateHeaders(token),
+      headers: {
+        Authorization: `Bearer ${token}`,
+      },
    });
+
    return response.blob();
  }

@@ -158,12 +180,14 @@ class OpenHands {
  static async sendFeedback(
    token: string,
    data: Feedback,
-    // TODO: Type the response
  ): Promise<FeedbackResponse> {
    const response = await fetch(`${OpenHands.BASE_URL}/api/submit-feedback`, {
      method: "POST",
-      headers: OpenHands.generateHeaders(token),
      body: JSON.stringify(data),
+      headers: {
+        Authorization: `Bearer ${token}`,
+        "Content-Type": "application/json",
+      },
    });

    return response.json();
@@ -177,23 +201,32 @@ class OpenHands {
  static async getGitHubAccessToken(
    code: string,
  ): Promise<GitHubAccessTokenResponse> {
-    const response = await fetch(`${OpenHands.BASE_URL}/github/callback`, {
+    const response = await fetch(`${OpenHands.BASE_URL}/api/github/callback`, {
      method: "POST",
      body: JSON.stringify({ code }),
+      headers: {
+        "Content-Type": "application/json",
+      },
    });

    return response.json();
  }

  /**
-   * Generate the headers for the request
-   * @param token User token provided by the server
-   * @returns Headers for the request
+   * Check if the user is authenticated
+   * @param login The user's GitHub login handle
+   * @returns Whether the user is authenticated
   */
-  private static generateHeaders(token: string) {
-    return {
-      Authorization: `Bearer ${token}`,
-    };
+  static async isAuthenticated(login: string): Promise<boolean> {
+    const response = await fetch(`${OpenHands.BASE_URL}/api/authenticate`, {
+      method: "POST",
+      body: JSON.stringify({ login }),
+      headers: {
+        "Content-Type": "application/json",
+      },
+    });
+
+    return response.status === 200;
  }
 }

--- a/frontend/src/api/open-hands.types.ts
+++ b/frontend/src/api/open-hands.types.ts
@@ -35,3 +35,8 @@ export interface Feedback {
  permissions: "public" | "private";
  trajectory: unknown[];
 }
+
+export interface GetConfigResponse { // shape that OpenHands.getConfig() resolves to (parsed config.json)
+  APP_MODE: "saas" | "oss"; // the config.json added under frontend/public ships "oss"
+  GITHUB_CLIENT_ID: string | null; // the config.json added under frontend/public ships "" here
+}
--- a/frontend/src/assets/chevron-left.tsx
+++ b/frontend/src/assets/chevron-left.tsx
@@ -0,0 +1,28 @@
+interface ChevronLeftProps {
+  width?: number; // rendered width of the <svg>; defaults to 20
+  height?: number; // rendered height of the <svg>; defaults to 20
+  active?: boolean; // truthy -> fill #D4D4D4, otherwise #525252
+}
+/** Left-pointing chevron icon rendered as an inline SVG. */
+export function ChevronLeft({
+  width = 20,
+  height = 20,
+  active,
+}: ChevronLeftProps) {
+  return (
+    <svg
+      width={width}
+      height={height}
+      viewBox="0 0 20 20" // path is drawn on a 20x20 grid; a fixed viewBox lets width/height scale it
+      fill="none"
+      xmlns="http://www.w3.org/2000/svg"
+    >
+      <path
+        fillRule="evenodd"
+        clipRule="evenodd"
+        d="M11.204 15.0037L6.65511 9.99993L11.204 4.99617L12.1289 5.83701L8.34444 9.99993L12.1289 14.1628L11.204 15.0037Z"
+        fill={active ? "#D4D4D4" : "#525252"}
+      />
+    </svg>
+  );
+}
--- a/frontend/src/assets/chevron-right.tsx
+++ b/frontend/src/assets/chevron-right.tsx
@@ -0,0 +1,28 @@
+interface ChevronRightProps {
+  width?: number; // rendered width of the <svg>; defaults to 20
+  height?: number; // rendered height of the <svg>; defaults to 20
+  active?: boolean; // truthy -> fill #D4D4D4, otherwise #525252
+}
+/** Right-pointing chevron icon rendered as an inline SVG. */
+export function ChevronRight({
+  width = 20,
+  height = 20,
+  active,
+}: ChevronRightProps) {
+  return (
+    <svg
+      width={width}
+      height={height}
+      viewBox="0 0 20 20" // path is drawn on a 20x20 grid; a fixed viewBox lets width/height scale it
+      fill="none"
+      xmlns="http://www.w3.org/2000/svg"
+    >
+      <path
+        fillRule="evenodd"
+        clipRule="evenodd"
+        d="M8.79602 4.99634L13.3449 10.0001L8.79602 15.0038L7.87109 14.163L11.6556 10.0001L7.87109 5.83718L8.79602 4.99634Z"
+        fill={active ? "#D4D4D4" : "#525252"}
+      />
+    </svg>
+  );
+}
--- a/frontend/src/assets/close.svg
+++ b/frontend/src/assets/close.svg
@@ -0,0 +1,5 @@
+<svg width="11" height="11" viewBox="0 0 11 11" fill="none" xmlns="http://www.w3.org/2000/svg">
+  <path fill-rule="evenodd" clip-rule="evenodd"
+    d="M5.69949 5.72974L7.91965 7.9505L8.35077 7.51999L6.13001 5.29922L8.35077 3.07907L7.92026 2.64795L5.69949 4.86871L3.47934 2.64795L3.04883 3.07907L5.26898 5.29922L3.04883 7.51938L3.47934 7.9505L5.69949 5.72974Z"
+    fill="black" />
+</svg>
--- a/frontend/src/components/attach-image-label.tsx
+++ b/frontend/src/components/attach-image-label.tsx
@@ -0,0 +1,16 @@
+import Clip from "#/assets/clip.svg?react";
+
+/**
+ * Inline label: a 16x16 clip icon followed by the text "Attach images".
+ */
+export function AttachImageLabel() {
+  const labelClasses =
+    "flex self-start items-center text-[#A3A3A3] text-xs leading-[18px] -tracking-[0.08px] cursor-pointer";
+
+  return (
+    <div className={labelClasses}>
+      <Clip width={16} height={16} />
+      Attach images
+    </div>
+  );
+}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Robert Brennan	c0d7f6d56b	random changes to agent	2024-10-28 16:19:15 -04:00
Robert Brennan	fdb385ab93	Simplify makefile (#4591 )	2024-10-28 13:10:32 -04:00
sp.wack	13d101e092	fix(frontend): Record events sent to WS (#4596 )	2024-10-28 15:53:31 +00:00
sp.wack	6cf3728247	test(frontend): Test, refactor, and improve the chat interface (#4549 )	2024-10-28 17:26:28 +04:00
sp.wack	ae188458ef	chore(frontend): Remove root level `package.json` (#4590 )	2024-10-28 16:42:17 +04:00
Robert Brennan	a20da54e3a	Remove verbose log from agent controller (#4585 )	2024-10-27 15:50:23 +00:00
Mahmoud Sehsah	2a6740f4ba	fix(builder): Build the runtime with docker version that contains (-) in the version name (#4580 )	2024-10-27 02:54:52 +01:00
Ryan H. Tran	5ba7bc6be1	Mention `build-essential` dependency for ubuntu in dev doc (#4511 )	2024-10-26 20:17:43 -05:00
Xingyao Wang	98d4884ced	fix(controller): stop when run into loop (#4579 )	2024-10-26 19:40:58 -05:00
Xingyao Wang	be3cbb045e	fix(controllor): make agent controller stops when encounter fatal observation (#4573 )	2024-10-26 13:28:27 -05:00
dependabot[bot]	8bfd2fcf4f	chore(deps): bump the version-all group across 1 directory with 8 updates (#4564 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-25 20:07:16 +02:00
tofarr	d4e3982a6b	Small refactor : EventStream as a dataclass (#4557 )	2024-10-25 17:31:20 +00:00
Xingyao Wang	1f23dc89b6	fix(eval): add runtime.connect to all eval harness (#4565 )	2024-10-26 00:41:30 +08:00
Xingyao Wang	7340b78962	feat(eval): rewrite log_completions to save completions to directory (#4566 )	2024-10-25 16:36:11 +00:00
tofarr	c3da25febc	Fix for docker leak (#4560 )	2024-10-25 15:53:39 +00:00
Robert Brennan	8d2b2d4318	Refactor runtime to add a `connect` method (#4410 ) Co-authored-by: Tim O'Farrell <tofarr@gmail.com>	2024-10-25 09:02:19 -04:00
tofarr	c4f5c07be1	Refactor: shorter syntax (#4558 )	2024-10-25 06:45:28 -06:00
Xingyao Wang	349e2dbe50	refactor: move bash related logic into `BashSession` for cleaner code (#4527 ) Co-authored-by: Tim O'Farrell <tofarr@gmail.com>	2024-10-25 20:44:25 +08:00
Xingyao Wang	dcd4b04f57	feat(llm): update prompt caching list to include new sonnet (#4552 )	2024-10-25 20:36:35 +08:00
sp.wack	78eacc4489	fix(frontend): Fix loader checking unset config variable in window (#4546 ) Co-authored-by: Robert Brennan <accounts@rbren.io>	2024-10-25 08:14:40 -04:00
tofarr	60990c128a	Feature: Minor refactor of SessionManager to make it a dataclass (#4553 )	2024-10-24 14:32:05 -06:00
Robert Brennan	c4c25ea229	Minor fixes for GitHub credential exchange (#4554 )	2024-10-24 16:29:03 -04:00
tofarr	930726f4e8	Fix for issue where we hammer docker needlessly (#4551 )	2024-10-24 20:03:35 +00:00
tofarr	ee2c2ff2b8	Feat changed "is_confirmed" to "confirmation_state" (#4508 )	2024-10-24 13:35:14 -06:00
Robert Brennan	8c064fe3df	add catch all route, disable caching (#4547 )	2024-10-24 15:06:17 -04:00
sp.wack	e878741ae7	test(frontend): Test, refactor, and improve the chat input (#4535 )	2024-10-24 18:19:41 +04:00
tofarr	90e2bf4883	Split bash commands by the new line character (#4462 )	2024-10-24 07:44:38 -06:00
dependabot[bot]	615b94cf2f	chore(deps): bump the version-all group across 1 directory with 19 updates (#4531 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-23 21:28:02 +02:00
Graham Neubig	ce2430180f	Update README.md to fix miniwob name (#4534 )	2024-10-23 18:24:43 +00:00
Xingyao Wang	eaea94cc1b	fix(remote runtime): retry on 429 error on remote build & log retries (#4532 )	2024-10-24 02:07:11 +08:00
sp.wack	385cc8f512	[ALL-561] feat(frontend\|backend): Display error messages in the chat (#4509 )	2024-10-23 18:56:00 +04:00
Xingyao Wang	2d5b360505	refactor: re-organize different runtime implementations into an impl folder (#4346 ) Co-authored-by: Graham Neubig <neubig@gmail.com>	2024-10-23 10:10:03 +00:00
mamoodi	9b6fd239d0	Release 0.11.0 (#4523 )	2024-10-22 16:43:13 -04:00
sp.wack	dd15845b91	[ALL-570] fix(frontend): Don't wrap filenames in the file explorer (#4521 )	2024-10-22 23:31:42 +04:00
sp.wack	64adb64fef	[ALL-597] fix(frontend): Fetch `config.json` locally (#4522 )	2024-10-22 23:31:29 +04:00
Yashwanth S C	6573304014	fix(frontend): Error when API key is not entered is not clear (#4429 )	2024-10-22 22:23:09 +04:00
sp.wack	29ddcdaf46	[ALL-469] fix(frontend): Indicate that import projects require zips (#4515 )	2024-10-22 22:15:08 +04:00
mamoodi	d0bbad8eda	Remove settings base container as it is not supported (#4520 )	2024-10-22 18:14:59 +00:00
sp.wack	7b81df2a94	[ALL-596] fix(frontend): Fix import project from sending request before runtime is active (#4513 )	2024-10-22 18:04:49 +00:00
mamoodi	550044454c	Revert docker install in OpenHands app image (#4519 )	2024-10-22 13:46:19 -04:00
sp.wack	3927fc3616	[ALL-594] chore(frontend): Add frontend error handling for failed requests (#4501 )	2024-10-22 20:05:59 +04:00
sp.wack	864f81bc71	test(frontend): User actions and friends (#4497 )	2024-10-22 20:04:07 +04:00
Graham Neubig	54250e3fe2	Update evaluation README.md structure (#4516 )	2024-10-22 14:42:22 +00:00
Xingyao Wang	da548d308c	[agent] LLM-based editing (#3985 ) Co-authored-by: Tim O'Farrell <tofarr@gmail.com> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com> Co-authored-by: Robert Brennan <accounts@rbren.io> Co-authored-by: Graham Neubig <neubig@gmail.com>	2024-10-22 04:51:44 +08:00
sp.wack	6fe5482b20	[ALL-571] chore(frontend): Move `saas`-related configs to `config.json` (#4496 )	2024-10-21 14:59:20 +00:00
dependabot[bot]	520586a89c	chore(deps): bump @mdx-js/react from 3.0.1 to 3.1.0 in /docs in the version-all group (#4478 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-21 09:21:33 +04:00
Xingyao Wang	263798584e	fix(runtime): replace codec error in pexcept (#4493 )	2024-10-20 12:51:05 +08:00
Alejandro Cuadron Lafuente	a9a593bb21	[Fix] Added support to specify the platform on which the runtime image should be built. (#4402 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Xingyao Wang <xingyao@all-hands.dev> Co-authored-by: mamoodi <mamoodiha@gmail.com> Co-authored-by: tofarr <tofarr@gmail.com> Co-authored-by: Robert Brennan <contact@rbren.io>	2024-10-20 09:19:05 +08:00
tobitege	6471d0f94d	.gitignore: ignore all `node_modules` folders (#4491 )	2024-10-20 09:17:45 +08:00
sp.wack	5cc16cb82a	fix(frontend): Fix waitlist logic (#4492 )	2024-10-19 14:20:54 -04:00
Robert Brennan	cc68756b26	fix freeze on zip-files endpoint (#4487 )	2024-10-18 15:29:07 -04:00
Xingyao Wang	126bf316bc	fix(docker): Dockerfile failed to build on RemoteRuntime (#4481 ) Co-authored-by: tofarr <tofarr@gmail.com>	2024-10-19 03:28:39 +08:00
Xingyao Wang	91308ba4dc	feat: clean-up retries RemoteRuntime & add FatalErrorObservation (#4485 )	2024-10-18 17:23:13 +00:00
Graham Neubig	b660aa99b8	Fix issue #4480 : '[Bug]: Being blocked by cloudflare results in futile retries (#4482 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-10-18 13:04:34 -04:00
sp.wack	cf793582a7	[ALL-543] feat(frontend): Setup auth route, replace loading spinner, add new route (#4448 )	2024-10-18 19:32:46 +04:00
Robert Brennan	56fe905241	reduce dependabot frequency (#4305 )	2024-10-18 11:21:15 -04:00
mamoodi	02abf60433	Run flaky mac tests nightly (#4470 )	2024-10-18 10:38:40 -04:00
mamoodi	e6a5e39047	Update docs associated with new UI (#4469 )	2024-10-18 10:19:56 -04:00
mamoodi	feee509de7	Update leftover versions (#4468 )	2024-10-18 09:28:53 -04:00
Robert Brennan	fd6facbf03	update contributing docs (#4438 ) Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>	2024-10-18 05:08:54 +02:00
dependabot[bot]	1ea3087eec	chore(deps): bump modal from 0.64.182 to 0.64.192 (#4460 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-18 05:02:20 +02:00
dependabot[bot]	2e09b4f95e	chore(deps-dev): bump torch from 2.2.2 to 2.5.0 (#4459 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-18 05:01:27 +02:00
mamoodi	d2d55f5ea2	Update custom sandbox doc (#4332 )	2024-10-17 18:23:57 -04:00
mamoodi	0e467b1429	Release 0.10.0 (#4463 )	2024-10-17 22:23:40 +00:00
Xingyao Wang	ec3152b6e1	linter: only lint on updated lines in the new file (#4409 )	2024-10-17 15:57:03 -04:00
sp.wack	642e01b673	fix(frontend): Update build directory and referenced paths (#4461 )	2024-10-17 23:24:49 +04:00
sp.wack	6cb174b7d1	[ALL-557] feat(frontend): Add save and discard actions to the editor (#4442 ) Co-authored-by: mamoodi <mamoodiha@gmail.com>	2024-10-17 17:14:55 +00:00
Robert Brennan	154854bbe3	run in dev mode in makefile (#4452 )	2024-10-17 12:40:47 -04:00
sp.wack	678630c5bd	fix(frontend): Catch config fetch error and set default fallback (#4453 )	2024-10-17 16:17:44 +00:00
dependabot[bot]	ad800bf373	chore(deps): bump litellm from 1.49.5 to 1.49.6 (#4458 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-17 17:51:45 +02:00
dependabot[bot]	206788a0e8	chore(deps): bump react-syntax-highlighter from 15.5.0 to 15.6.1 in /frontend (#4457 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-17 15:31:07 +00:00
dependabot[bot]	ca3fbb2a80	chore(deps-dev): bump @types/node from 22.7.5 to 22.7.6 in /frontend (#4455 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-17 15:29:23 +00:00
dependabot[bot]	cc500a622a	chore(deps-dev): bump @testing-library/jest-dom from 6.5.0 to 6.6.1 in /frontend (#4456 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-17 15:29:01 +00:00
tofarr	5fb3dece93	Feat: Divided docker layer to make it easier to cache (#4313 ) Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>	2024-10-17 15:08:56 +00:00
sp.wack	83c096b974	[ALL-551] chore(frontend): Retrieve `APP_MODE` from the server (#4423 )	2024-10-17 18:35:21 +04:00
Xingyao Wang	015df47e53	chore: remove integration tests from CI to unblock (#4451 )	2024-10-17 14:19:53 +00:00
Jiayi Pan	c1b323a076	Show actual dataset name in swebench log directory (#4417 )	2024-10-17 10:32:38 +08:00
Xingyao Wang	84a578ad20	[test] remove integration tests from CI & move them into evaluation (#4447 )	2024-10-17 05:38:23 +08:00
dependabot[bot]	8e5db345b2	chore(deps): bump boto3 from 1.35.40 to 1.35.42 (#4445 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-16 22:51:40 +02:00
dependabot[bot]	f61266841c	chore(deps): bump browsergym from 0.8.0 to 0.8.1 (#4437 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-16 22:50:39 +02:00
dependabot[bot]	277d991b37	chore(deps): bump fastapi from 0.115.0 to 0.115.2 (#4370 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-16 22:02:26 +02:00
Engel Nyst	20aa66d5e2	Bump Mac version in CI (#4441 )	2024-10-16 21:52:21 +02:00
dependabot[bot]	9bc6252967	chore(deps): bump anthropic from 0.36.0 to 0.36.1 (#4436 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-16 21:25:00 +02:00
Alejandro Cuadron Lafuente	bb416009c5	[Fix] Fixed the inputs to the ManagerAgent (#4427 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Xingyao Wang <xingyao@all-hands.dev> Co-authored-by: mamoodi <mamoodiha@gmail.com> Co-authored-by: tofarr <tofarr@gmail.com> Co-authored-by: Robert Brennan <contact@rbren.io>	2024-10-16 20:47:46 +02:00
Robert Brennan	226ea545fa	Add workflow scope to GitHub authentication URL (#4439 ) Co-authored-by: openhands <openhands@all-hands.dev>	2024-10-16 14:41:46 -04:00
tofarr	e12bff5189	Fix: Removed flaky test (#4444 )	2024-10-16 18:10:27 +00:00
dependabot[bot]	23d3becf1d	chore(deps): bump litellm from 1.49.4 to 1.49.5 (#4431 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-16 18:36:39 +02:00
Robert Brennan	be79ccdb39	fix default host (#4413 )	2024-10-16 10:56:42 -04:00
sp.wack	2277897f86	feat(frontend): Improve file based routing (#4317 )	2024-10-16 18:54:15 +04:00
tofarr	be9619be3a	Feat faster unit tests 2 (#4418 )	2024-10-16 08:40:53 -06:00
tofarr	cb58dab82b	Fix loop graceful shutdown (#4394 )	2024-10-16 08:40:33 -06:00
sp.wack	8ab293a667	fix(frontend): Fix request headers (#4422 )	2024-10-16 14:22:18 +00:00