fix templates

template fixes
split blocks
2026-04-29 03:00:45 -04:00 · 2024-10-27 18:51:22 +01:00 · 2024-10-27 17:23:44 +01:00 · 2024-10-27 15:47:32 +01:00 · 2024-10-27 15:04:13 +01:00 · 2024-10-27 09:10:03 +01:00
728 changed files with 42647 additions and 45000 deletions
--- a/.devcontainer/on_create.sh
+++ b/.devcontainer/on_create.sh
@@ -2,7 +2,5 @@
 sudo apt update
 sudo apt install -y netcat
 sudo add-apt-repository -y ppa:deadsnakes/ppa
-sudo apt install -y python3.11
-curl -sSL https://install.python-poetry.org | python3.11 -
-# chromadb requires SQLite > 3.35 but SQLite in Python3.11.9 comes with 3.31.1
-sudo cp /opt/conda/lib/libsqlite3.so.0 /lib/x86_64-linux-gnu/libsqlite3.so.0
+sudo apt install -y python3.12
+curl -sSL https://install.python-poetry.org | python3.12 -
--- a/.github/ISSUE_TEMPLATE/bug_template.yml
+++ b/.github/ISSUE_TEMPLATE/bug_template.yml
@@ -5,71 +5,55 @@ labels: ['bug']
 body:
  - type: markdown
    attributes:
-      value: Thank you for taking the time to fill out this bug report. We greatly appreciate your effort to complete this template fully. Please provide as much information as possible to help us understand and address the issue effectively.
+      value: Thank you for taking the time to fill out this bug report. Please provide as much information as possible to help us understand and address the issue effectively.

  - type: checkboxes
    attributes:
      label: Is there an existing issue for the same bug?
      description: Please check if an issue already exists for the bug you encountered.
      options:
-      - label: I have checked the troubleshooting document at https://docs.all-hands.dev/modules/usage/troubleshooting
-        required: true
      - label: I have checked the existing issues.
        required: true

  - type: textarea
    id: bug-description
    attributes:
-      label: Describe the bug
-      description: Provide a short description of the problem.
+      label: Describe the bug and reproduction steps
+      description: Provide a description of the issue along with any reproduction steps.
    validations:
      required: true

-  - type: textarea
-    id: current-version
+  - type: dropdown
+    id: installation
    attributes:
-      label: Current OpenHands version
-      description: What version of OpenHands are you using? If you're running in docker, tell us the tag you're using (e.g. ghcr.io/all-hands-ai/openhands:0.3.1).
-      render: bash
-    validations:
-      required: true
+      label: OpenHands Installation
+      description: How are you running OpenHands?
+      options:
+        - Docker command in README
+        - Development workflow
+      default: 0

-  - type: textarea
-    id: config
+  - type: input
+    id: openhands-version
    attributes:
-      label: Installation and Configuration
-      description: Please provide any commands you ran and any configuration (redacting API keys)
-      render: bash
-    validations:
-      required: true
+      label: OpenHands Version
+      description: What version of OpenHands are you using?
+      placeholder: ex. 0.9.8, main, etc.

-  - type: textarea
-    id: model-agent
-    attributes:
-      label: Model and Agent
-      description: What model and agent are you using? You can see these settings in the UI by clicking the settings wheel.
-      placeholder: |
-        - Model:
-        - Agent:
-
-  - type: textarea
-    id: os-version
+  - type: dropdown
+    id: os
    attributes:
      label: Operating System
-      description: What Operating System are you using? Linux, Mac OS, WSL on Windows
-
-  - type: textarea
-    id: repro-steps
-    attributes:
-      label: Reproduction Steps
-      description: Please list the steps to reproduce the issue.
-      placeholder: |
-        1.
-        2.
-        3.
+      options:
+        - MacOS
+        - Linux
+        - WSL on Windows

  - type: textarea
    id: additional-context
    attributes:
      label: Logs, Errors, Screenshots, and Additional Context
-      description: If you want to share the chat history you can click the thumbs-down (👎) button above the input field and you will get a shareable link (you can also click thumbs up when things are going well of course!). LLM logs will be stored in the `logs/llm/default` folder. Please add any additional context about the problem here.
+      description: Please provide any additional information or context you think might help. If you want to share the
+        chat history, you can click the thumbs-down (👎) button above the input field and you will get a shareable link
+        (you can also click thumbs up when things are going well, of course!). LLM logs are stored in the
+        `logs/llm/default` folder.
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -1,21 +1,35 @@
-# To get started with Dependabot version updates, you'll need to specify which
-# package ecosystems to update and where the package manifests are located.
-# Please see the documentation for all configuration options:
-# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
-
 version: 2
 updates:
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "daily"
-    open-pull-requests-limit: 20
+    open-pull-requests-limit: 1
+    groups:
+      # put packages in their own group if they have a history of breaking the build or needing to be reverted
+      pre-commit:
+        patterns:
+          - "pre-commit"
+      llama:
+        patterns:
+          - "llama*"
+      chromadb:
+        patterns:
+          - "chromadb"
+      security-all:
+        applies-to: "security-updates"
+        patterns:
+          - "*"
+      version-all:
+        applies-to: "version-updates"
+        patterns:
+          - "*"

  - package-ecosystem: "npm"
    directory: "/frontend"
    schedule:
      interval: "daily"
-    open-pull-requests-limit: 20
+    open-pull-requests-limit: 1
    groups:
      docusaurus:
        patterns:
@@ -23,12 +37,21 @@ updates:
      eslint:
        patterns:
          - "*eslint*"
+      security-all:
+        applies-to: "security-updates"
+        patterns:
+          - "*"
+      version-all:
+        applies-to: "version-updates"
+        patterns:
+          - "*"

  - package-ecosystem: "npm"
    directory: "/docs"
    schedule:
-      interval: "daily"
-    open-pull-requests-limit: 20
+      interval: "weekly"
+      day: "wednesday"
+    open-pull-requests-limit: 1
    groups:
      docusaurus:
        patterns:
@@ -36,3 +59,11 @@ updates:
      eslint:
        patterns:
          - "*eslint*"
+      security-all:
+        applies-to: "security-updates"
+        patterns:
+          - "*"
+      version-all:
+        applies-to: "version-updates"
+        patterns:
+          - "*"
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -14,6 +14,11 @@ on:
    branches:
      - main

+# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  # Build the documentation website
  build:
@@ -32,7 +37,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: '3.12'
      - name: Generate Python Docs
        run: rm -rf docs/modules/python && pip install pydoc-markdown && pydoc-markdown
      - name: Install dependencies
--- a/.github/workflows/dummy-agent-test.yml
+++ b/.github/workflows/dummy-agent-test.yml
@@ -9,6 +9,11 @@ on:
    - main
  pull_request:

+# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  test:
    runs-on: ubuntu-latest
@@ -36,7 +41,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: '3.12'
          cache: 'poetry'
      - name: Install Python dependencies using Poetry
        run: poetry install --without evaluation,llama-index
--- a/.github/workflows/fe-unit-tests.yml
+++ b/.github/workflows/fe-unit-tests.yml
@@ -12,6 +12,11 @@ on:
      - 'frontend/**'
      -  '.github/workflows/fe-unit-tests.yml'

+# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  # Run frontend unit tests
  fe-test:
--- a/.github/workflows/ghcr-build.yml
+++ b/.github/workflows/ghcr-build.yml
@@ -1,12 +1,6 @@
-# Workflow that builds, tests and then pushes the runtime docker images to the ghcr.io repository
+# Workflow that builds, tests and then pushes the OpenHands and runtime docker images to the ghcr.io repository
 name: Build, Test and Publish RT Image

-# Only run one workflow of the same group at a time.
-# There can be at most one running and one pending job in a concurrency group at any time.
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-
 # Always run on "main"
 # Always run on tags
 # Always run on PRs
@@ -25,8 +19,14 @@ on:
        required: true
        default: ''

+# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 env:
-  BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST: nikolaik/python-nodejs:python3.11-nodejs22
+  BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST: nikolaik/python-nodejs:python3.12-nodejs22
+  RELEVANT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}

 jobs:
  # Builds the OpenHands Docker images
@@ -83,13 +83,12 @@ jobs:
          export REPO_OWNER=${{ github.repository_owner }}
          REPO_OWNER=$(echo $REPO_OWNER | tr '[:upper:]' '[:lower:]')
          # Run the build script in the app image
-          docker run -e SANDBOX_USER_ID=0 -v /var/run/docker.sock:/var/run/docker.sock ghcr.io/${REPO_OWNER}/openhands:${{ github.sha }} /bin/bash -c "mkdir -p containers/runtime; python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild" 2>&1 | tee docker-outputs.txt
+          docker run -e SANDBOX_USER_ID=0 -v /var/run/docker.sock:/var/run/docker.sock ghcr.io/${REPO_OWNER}/openhands:${{ env.RELEVANT_SHA }} /bin/bash -c "mkdir -p containers/runtime; python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild" 2>&1 | tee docker-outputs.txt
          # Get the hash from the build script
          hash_from_app_image=$(cat docker-outputs.txt | grep "Hash for docker build directory" | awk -F "): " '{print $2}' | uniq | head -n1)
          echo "hash_from_app_image=$hash_from_app_image" >> $GITHUB_OUTPUT
          echo "Hash from app image: $hash_from_app_image"

-
  # Builds the runtime Docker images
  ghcr_build_runtime:
    name: Build Image
@@ -100,7 +99,7 @@ jobs:
    strategy:
      matrix:
        base_image:
-          - image: 'nikolaik/python-nodejs:python3.11-nodejs22'
+          - image: 'nikolaik/python-nodejs:python3.12-nodejs22'
            tag: nikolaik
    steps:
      - name: Checkout
@@ -135,7 +134,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: '3.12'
      - name: Cache Poetry dependencies
        uses: actions/cache@v4
        with:
@@ -160,7 +159,7 @@ jobs:
        if: github.event.pull_request.head.repo.fork
        uses: docker/build-push-action@v6
        with:
-          tags: ghcr.io/all-hands-ai/runtime:${{ github.sha }}-${{ matrix.base_image.tag }}
+          tags: ghcr.io/all-hands-ai/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image.tag }}
          outputs: type=docker,dest=/tmp/runtime-${{ matrix.base_image.tag }}.tar
          context: containers/runtime
      - name: Upload runtime image for fork
@@ -192,7 +191,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: '3.12'
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Install Python dependencies using Poetry
@@ -271,7 +270,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: '3.12'
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Install Python dependencies using Poetry
@@ -284,7 +283,7 @@ jobs:
          # Install to be able to retry on failures for flaky tests
          poetry run pip install pytest-rerunfailures

-          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
+          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')

          SKIP_CONTAINER_LOGS=true \
@@ -293,7 +292,7 @@ jobs:
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=false \
-          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -349,7 +348,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: '3.12'
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Install Python dependencies using Poetry
@@ -362,7 +361,7 @@ jobs:
          # Install to be able to retry on failures for flaky tests
          poetry run pip install pytest-rerunfailures

-          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
+          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')

          SKIP_CONTAINER_LOGS=true \
@@ -371,79 +370,7 @@ jobs:
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=true \
-          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v4
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-
-  # Run integration tests with the eventstream runtime Docker image
-  runtime_integration_tests_on_linux:
-    name: RT Integration Tests (Linux)
-    runs-on: ubuntu-latest
-    needs: [ghcr_build_runtime]
-    strategy:
-      fail-fast: false
-      matrix:
-        base_image: ['nikolaik']
-    steps:
-      - uses: actions/checkout@v4
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: true
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: false
-          swap-storage: true
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3
-      # Forked repos can't push to GHCR, so we need to download the image as an artifact
-      - name: Download runtime image for fork
-        if: github.event.pull_request.head.repo.fork
-        uses: actions/download-artifact@v4
-        with:
-          name: runtime-${{ matrix.base_image }}
-          path: /tmp
-      - name: Load runtime image for fork
-        if: github.event.pull_request.head.repo.fork
-        run: |
-          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
-      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cache/pypoetry
-            ~/.virtualenvs
-          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-poetry-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-      - name: Install poetry via pipx
-        run: pipx install poetry
-      - name: Install Python dependencies using Poetry
-        run: make install-python-dependencies
-      - name: Run integration tests
-        run: |
-          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
-          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
-
-          TEST_RUNTIME=eventstream \
-          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
-          TEST_IN_CI=true \
-          TEST_ONLY=true \
-          ./tests/integration/regenerate.sh
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -457,7 +384,7 @@ jobs:
    name: All Runtime Tests Passed
    if: ${{ !cancelled() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux, verify_hash_equivalence_in_runtime_and_app]
+    needs: [test_runtime_root, test_runtime_oh, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: All tests passed
        run: echo "All runtime tests have passed successfully!"
@@ -466,7 +393,7 @@ jobs:
    name: All Runtime Tests Passed
    if: ${{ cancelled() || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux, verify_hash_equivalence_in_runtime_and_app]
+    needs: [test_runtime_root, test_runtime_oh, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: Some tests failed
        run: |
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -10,6 +10,11 @@ on:
    - main
  pull_request:

+# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  # Run lint on the frontend code
  lint-frontend:
@@ -41,9 +46,9 @@ jobs:
      - name: Set up python
        uses: actions/setup-python@v5
        with:
-          python-version: 3.11
+          python-version: 3.12
          cache: 'pip'
      - name: Install pre-commit
        run: pip install pre-commit==3.7.0
      - name: Run pre-commit hooks
-        run: pre-commit run --files openhands/**/* agenthub/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
+        run: pre-commit run --files openhands/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
--- a/.github/workflows/py-unit-tests-mac.yml
+++ b/.github/workflows/py-unit-tests-mac.yml
@@ -0,0 +1,96 @@
+# Workflow that runs python unit tests on mac
+name: Run Python Unit Tests Mac
+
+# This job is flaky so only run it nightly
+on:
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  # Run python unit tests on macOS
+  test-on-macos:
+    name: Python Unit Tests on macOS
+    runs-on: macos-14
+    env:
+      INSTALL_DOCKER: '1' # Set to '0' to skip Docker installation
+    strategy:
+      matrix:
+        python-version: ['3.12']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.virtualenvs
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Install Python dependencies using Poetry
+        run: poetry install --without evaluation,llama-index
+      - name: Install & Start Docker
+        if: env.INSTALL_DOCKER == '1'
+        run: |
+          INSTANCE_NAME="colima-${GITHUB_RUN_ID}"
+
+          # Uninstall colima to upgrade to the latest version
+          if brew list colima &>/dev/null; then
+            brew uninstall colima
+            # unlinking colima dependency: go
+            brew uninstall go@1.21
+          fi
+          rm -rf ~/.colima ~/.lima
+          brew install --HEAD colima
+          brew install docker
+
+          start_colima() {
+            # Pick a random SSH port in the range 10000-20000 (not checked for availability)
+            RANDOM_PORT=$((RANDOM % 10001 + 10000))
+
+            # Start Colima on the chosen SSH port; return non-zero on failure so the caller can retry
+            if ! colima start --network-address --arch x86_64 --cpu=1 --memory=1 --verbose --ssh-port $RANDOM_PORT; then
+              echo "Failed to start Colima."
+              return 1
+            fi
+            return 0
+          }
+
+          # Try to start Colima, up to ATTEMPT_LIMIT times in total:
+          ATTEMPT_LIMIT=5
+          for ((i=1; i<=ATTEMPT_LIMIT; i++)); do
+
+            if start_colima; then
+              echo "Colima started successfully."
+              break
+            else
+              colima stop -f
+              sleep 10
+              colima delete -f
+              if [ $i -eq $ATTEMPT_LIMIT ]; then
+                exit 1
+              fi
+              sleep 10
+            fi
+          done
+
+          # For testcontainers to find the Colima socket
+          # https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
+          sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
+      - name: Build Environment
+        run: make build
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Run Tests
+        run: poetry run pytest --forked --cov=openhands --cov-report=xml ./tests/unit --ignore=tests/unit/test_memory.py
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/py-unit-tests.yml
+++ b/.github/workflows/py-unit-tests.yml
@@ -10,95 +10,12 @@ on:
      - main
  pull_request:

+# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
-  # Run python unit tests on macOS
-  test-on-macos:
-    name: Python Unit Tests on macOS
-    runs-on: macos-12
-    env:
-      INSTALL_DOCKER: '1' # Set to '0' to skip Docker installation
-    strategy:
-      matrix:
-        python-version: ['3.11']
-    steps:
-      - uses: actions/checkout@v4
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Cache Poetry dependencies
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cache/pypoetry
-            ~/.virtualenvs
-          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-poetry-
-      - name: Install poetry via pipx
-        run: pipx install poetry
-      - name: Install Python dependencies using Poetry
-        run: poetry install --without evaluation,llama-index
-      - name: Install & Start Docker
-        if: env.INSTALL_DOCKER == '1'
-        run: |
-          INSTANCE_NAME="colima-${GITHUB_RUN_ID}"
-
-          # Uninstall colima to upgrade to the latest version
-          if brew list colima &>/dev/null; then
-            brew uninstall colima
-            # unlinking colima dependency: go
-            brew uninstall go@1.21
-          fi
-          rm -rf ~/.colima ~/.lima
-          brew install --HEAD colima
-          brew install docker
-
-          start_colima() {
-            # Find a free port in the range 10000-20000
-            RANDOM_PORT=$((RANDOM % 10001 + 10000))
-
-            # Original line:
-            if ! colima start --network-address --arch x86_64 --cpu=1 --memory=1 --verbose --ssh-port $RANDOM_PORT; then
-              echo "Failed to start Colima."
-              return 1
-            fi
-            return 0
-          }
-
-          # Attempt to start Colima for 5 total attempts:
-          ATTEMPT_LIMIT=5
-          for ((i=1; i<=ATTEMPT_LIMIT; i++)); do
-
-            if start_colima; then
-              echo "Colima started successfully."
-              break
-            else
-              colima stop -f
-              sleep 10
-              colima delete -f
-              if [ $i -eq $ATTEMPT_LIMIT ]; then
-                exit 1
-              fi
-              sleep 10
-            fi
-          done
-
-          # For testcontainers to find the Colima socket
-          # https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
-          sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
-      - name: Build Environment
-        run: make build
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=openhands --cov-report=xml ./tests/unit
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v4
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-
  # Run python unit tests on Linux
  test-on-linux:
    name: Python Unit Tests on Linux
@@ -107,7 +24,7 @@ jobs:
      INSTALL_DOCKER: '0' # Set to '0' to skip Docker installation
    strategy:
      matrix:
-        python-version: ['3.11']
+        python-version: ['3.12']
    steps:
      - uses: actions/checkout@v4
      - name: Set up Docker Buildx
@@ -125,7 +42,7 @@ jobs:
      - name: Build Environment
        run: make build
      - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=openhands --cov-report=xml -svv ./tests/unit
+        run: poetry run pytest --forked --cov=openhands --cov-report=xml -svv ./tests/unit --ignore=tests/unit/test_memory.py
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
--- a/.github/workflows/pypi-release.yml
+++ b/.github/workflows/pypi-release.yml
@@ -17,7 +17,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
-          python-version: 3.11
+          python-version: 3.12
      - name: Install Poetry
        uses: snok/install-poetry@v1.4.1
        with:
@@ -26,6 +26,6 @@ jobs:
      - name: Install Poetry Dependencies
        run: poetry install --no-interaction --no-root
      - name: Build poetry project
-        run: poetry build -v
+        run: ./build.sh
      - name: publish
        run: poetry publish -u __token__ -p ${{ secrets.PYPI_TOKEN }}
--- a/.github/workflows/regenerate_integration_tests.yml
+++ b/.github/workflows/regenerate_integration_tests.yml
@@ -1,73 +0,0 @@
-
-name: Regenerate Integration Tests
-
-on:
-  workflow_dispatch:
-    inputs:
-      debug:
-        description: 'Enable debug mode'
-        type: boolean
-        default: true
-      log_to_file:
-        description: 'Enable logging to file'
-        type: boolean
-        default: true
-      force_regenerate_tests:
-        description: 'Force regeneration of tests'
-        type: boolean
-        default: false
-      force_use_llm:
-        description: 'Force use of LLM'
-        type: boolean
-        default: false
-
-jobs:
-  regenerate_integration_tests:
-    if: github.ref != 'refs/heads/main'
-    runs-on: ubuntu-latest
-
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Set up Docker Buildx
-      id: buildx
-      uses: docker/setup-buildx-action@v3
-    - name: Set up Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: "3.11"
-    - name: Cache Poetry dependencies
-      uses: actions/cache@v4
-      with:
-        path: |
-          ~/.cache/pypoetry
-          ~/.virtualenvs
-        key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-        restore-keys: |
-          ${{ runner.os }}-poetry-
-    - name: Install poetry via pipx
-      run: pipx install poetry
-    - name: Install Python dependencies using Poetry
-      run: make install-python-dependencies
-    - name: Build Environment
-      run: make build
-    - name: Regenerate integration tests
-      run: |
-        DEBUG=${{ inputs.debug }} \
-        LOG_TO_FILE=${{ inputs.log_to_file }} \
-        FORCE_REGENERATE_TESTS=${{ inputs.force_regenerate_tests }} \
-        FORCE_USE_LLM=${{ inputs.force_use_llm }} \
-        ./tests/integration/regenerate.sh
-    - name: Commit changes
-      run: |
-        if git diff --quiet --exit-code; then
-          echo "No changes to commit"
-          exit 0
-        fi
-
-        git config --global user.name 'github-actions[bot]'
-        git config --global user.email 'github-actions[bot]@users.noreply.github.com'
-        git add .
-        # run it twice in case pre-commit makes changes
-        git commit -am "Regenerate integration tests" || git commit -am "Regenerate integration tests"
-        git push
--- a/.github/workflows/review-pr.yml
+++ b/.github/workflows/review-pr.yml
@@ -21,7 +21,7 @@ jobs:
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
-        python-version: '3.11'
+        python-version: '3.12'
    - name: install git, github cli
      run: |
        sudo apt-get install -y git gh
--- a/.gitignore
+++ b/.gitignore
@@ -121,6 +121,7 @@ celerybeat.pid

 # Environments
 .env
+frontend/.env
 .venv
 env/
 venv/
@@ -177,7 +178,6 @@ evaluation/toolqa/data
 # frontend

 # dependencies
-frontend/node_modules
 frontend/.pnp
 frontend/bun.lockb
 frontend/yarn.lock
@@ -227,3 +227,4 @@ runtime_*.tar
 containers/runtime/Dockerfile
 containers/runtime/project.tar.gz
 containers/runtime/code
+**/node_modules/
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,95 +2,70 @@

 Thanks for your interest in contributing to OpenHands! We welcome and appreciate contributions.

-## How Can I Contribute?
-
-There are many ways that you can contribute:
-
-1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
-2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
-3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issue](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) issues that may be ones to start on.
-
 ## Understanding OpenHands's CodeBase

 To understand the codebase, please refer to the README in each module:
 - [frontend](./frontend/README.md)
- [agenthub](./agenthub/README.md)
 - [evaluation](./evaluation/README.md)
 - [openhands](./openhands/README.md)
-    - [server](./openhands/server/README.md)
+   - [agenthub](./openhands/agenthub/README.md)
+   - [server](./openhands/server/README.md)

+## Setting up your development environment
+
+We have a separate doc [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) that tells you how to set up a development workflow.
+
+## How can I contribute?
+
+There are many ways that you can contribute:
+
+1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
+2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
+3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that may be ones to start on.
+
+## What can I build?
+Here are a few ways you can help improve the codebase.
+
+#### UI/UX
+We're always looking to improve the look and feel of the application. If you've got a small fix
+for something that's bugging you, feel free to open up a PR that changes the `./frontend` directory.
+
+If you're looking to make a bigger change, add a new UI element, or significantly alter the style
+of the application, please open an issue first or, better yet, join the #frontend channel in our Slack
+to gather consensus from our design team.
+
+#### Improving the agent
+Our main agent is the CodeAct agent. You can [see its prompts here](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub/codeact_agent).
+
+Changes to these prompts, and to the underlying behavior in Python, can have a huge impact on user experience.
+You can try modifying the prompts to see how they change the behavior of the agent as you use the app
+locally, but we will need to do an end-to-end evaluation of any changes here to ensure that the agent
+is getting better over time.
+
+We use the [SWE-bench](https://www.swebench.com/) benchmark to test our agent. You can join the #evaluation
+channel in Slack to learn more.
+
+#### Adding a new agent
+You may want to experiment with building new types of agents. You can add an agent to `openhands/agenthub`
+to help expand the capabilities of OpenHands.
+
+#### Adding a new runtime
+The agent needs a place to run code and commands. When you run OpenHands on your laptop, it uses a Docker container
+to do this by default. But there are other ways of creating a sandbox for the agent.
+
+If you work for a company that provides a cloud-based runtime, you could help us add support for that runtime
+by implementing the [interface specified here](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/runtime.py).
+
+#### Testing
 When you write code, it is also good to write tests. Please navigate to the `tests` folder to see existing test suites.
 At the moment, we have two kinds of tests: `unit` and `integration`. Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.

 ## Sending Pull Requests to OpenHands

-### 1. Fork the Official Repository
-Fork the [OpenHands repository](https://github.com/All-Hands-AI/OpenHands) into your own account.
-Clone your own forked repository into your local environment:
+You'll need to fork our repository to send us a Pull Request. You can learn more
+about how to fork a GitHub repo and open a PR with your changes in [this article](https://medium.com/swlh/forks-and-pull-requests-how-to-contribute-to-github-repos-8843fac34ce8).

-```shell
-git clone git@github.com:<YOUR-USERNAME>/OpenHands.git
-```
-
-### 2. Configure Git
-
-Set the official repository as your [upstream](https://www.atlassian.com/git/tutorials/git-forks-and-upstreams) to synchronize with the latest update in the official repository.
-Add the original repository as upstream:
-
-```shell
-cd OpenHands
-git remote add upstream git@github.com:All-Hands-AI/OpenHands.git
-```
-
-Verify that the remote is set:
-
-```shell
-git remote -v
-```
-
-You should see both `origin` and `upstream` in the output.
-
-### 3. Synchronize with Official Repository
-Synchronize latest commit with official repository before coding:
-
-```shell
-git fetch upstream
-git checkout main
-git merge upstream/main
-git push origin main
-```
-
-### 4. Set up the Development Environment
-
-We have a separate doc [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) that tells you how to set up a development workflow.
-
-### 5. Write Code and Commit It
-
-Once you have done this, you can write code, test it, and commit it to a branch (replace `my_branch` with an appropriate name):
-
-```shell
-git checkout -b my_branch
-git add .
-git commit
-git push origin my_branch
-```
-
-### 6. Open a Pull Request
-
-* On GitHub, go to the page of your forked repository, and create a Pull Request:
-   - Click on `Branches`
-   - Click on the `...` beside your branch and click on `New pull request`
-   - Set `base repository` to `All-Hands-AI/OpenHands`
-   - Set `base` to `main`
-   - Click `Create pull request`
-
-The PR should appear in [OpenHands PRs](https://github.com/All-Hands-AI/OpenHands/pulls).
-
-Then the OpenHands team will review your code.
-
-## PR Rules
-
-### 1. Pull Request title
+### Pull Request title
 As described [here](https://github.com/commitizen/conventional-commit-types/blob/master/index.json), a valid PR title should begin with one of the following prefixes:

 - `feat`: A new feature
@@ -111,6 +86,9 @@ For example, a PR title could be:

 You may also check out previous PRs in the [PR list](https://github.com/All-Hands-AI/OpenHands/pulls).

-### 2. Pull Request description
+### Pull Request description
 - If your PR is small (such as a typo fix), you can go brief.
 - If it contains a lot of changes, it's better to write more details.
+
+If your changes are user-facing (e.g. a new feature in the UI, a change in behavior, or a bugfix),
+please include a short message that we can add to our changelog.
--- a/CREDITS.md
+++ b/CREDITS.md
@@ -2,7 +2,7 @@

 ## Contributors

-We would like to thank all the [contributors](https://github.com/All-Hands-AI/OpenHands/graphs/contributors) who have helped make OpenHands possible. Your dedication and hard work are greatly appreciated.
+We would like to thank all the [contributors](https://github.com/All-Hands-AI/OpenHands/graphs/contributors) who have helped make OpenHands possible. We greatly appreciate your dedication and hard work.

 ## Open Source Projects

@@ -10,7 +10,7 @@ OpenHands includes and adapts the following open source projects. We are gratefu

 #### [SWE Agent](https://github.com/princeton-nlp/swe-agent)
   - License: MIT License
-   - Description: Adapted for use in OpenHands's agenthub
+   - Description: Adapted for use in OpenHands's agent hub

 #### [Aider](https://github.com/paul-gauthier/aider)
   - License: Apache License 2.0
--- a/Development.md
+++ b/Development.md
@@ -5,12 +5,14 @@ Otherwise, you can clone the OpenHands project directly.

 ## Start the server for development
 ### 1. Requirements
-* Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install)  [ Ubuntu <= 22.04]
+* Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install)  [Ubuntu <= 22.04]
 * [Docker](https://docs.docker.com/engine/install/) (For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
-* [Python](https://www.python.org/downloads/) = 3.11
+* [Python](https://www.python.org/downloads/) = 3.12
 * [NodeJS](https://nodejs.org/en/download/package-manager) >= 18.17.1
 * [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8
-* netcat => sudo apt-get install netcat
+* OS-specific dependencies:
+  - Ubuntu: build-essential => `sudo apt-get install build-essential`
+  - WSL: netcat => `sudo apt-get install netcat`

 Make sure you have all these dependencies installed before moving on to `make build`.

@@ -22,8 +24,8 @@ If you want to develop without system admin/sudo access to upgrade/install `Pyth
 curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
 bash Miniforge3-$(uname)-$(uname -m).sh

-# Install Python 3.11, nodejs, and poetry
-mamba install python=3.11
+# Install Python 3.12, nodejs, and poetry
+mamba install python=3.12
 mamba install conda-forge::nodejs
 mamba install conda-forge::poetry
 ```
@@ -91,13 +93,15 @@ To run tests, refer to the following:
 poetry run pytest ./tests/unit/test_*.py
 ```

-#### Integration tests
-Please refer to [this README](./tests/integration/README.md) for details.
-
 ### 9. Add or update dependency
 1. Add your dependency in `pyproject.toml` or use `poetry add xxx`
 2. Update the poetry.lock file via `poetry lock --no-update`

+### 10. Use existing Docker image
+To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image. Follow these steps:
+1. Set the `SANDBOX_RUNTIME_CONTAINER_IMAGE` environment variable to the desired Docker image.
+2. Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik`
+
 ## Develop inside Docker container

 TL;DR
--- a/6
+++ b/6
@@ -10,7 +10,7 @@ DEFAULT_WORKSPACE_DIR = "./workspace"
 DEFAULT_MODEL = "gpt-4o"
 CONFIG_FILE = config.toml
 PRE_COMMIT_CONFIG_PATH = "./dev_config/python/.pre-commit-config.yaml"
-PYTHON_VERSION = 3.11
+PYTHON_VERSION = 3.12

 # ANSI color codes
 GREEN=$(shell tput -Txterm setaf 2)
@@ -195,7 +195,7 @@ start-backend:
 # Start frontend
 start-frontend:
 	@echo "$(YELLOW)Starting frontend...$(RESET)"
-	@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST_PORT) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run start
+	@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST_PORT) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run dev -- --port $(FRONTEND_PORT)

 # Common setup for running the app (non-callable)
 _run_setup:
@@ -214,7 +214,7 @@ _run_setup:
 run:
 	@echo "$(YELLOW)Running the app...$(RESET)"
 	@$(MAKE) -s _run_setup
-	@cd frontend && echo "$(BLUE)Starting frontend with npm...$(RESET)" && npm run start -- --port $(FRONTEND_PORT)
+	@cd frontend && echo "$(BLUE)Starting frontend with npm...$(RESET)" && npm run dev -- --port $(FRONTEND_PORT)
 	@echo "$(GREEN)Application started successfully.$(RESET)"

 # Run the app (in docker)
--- a/README.md
+++ b/README.md
@@ -36,16 +36,16 @@ Learn more at [docs.all-hands.dev](https://docs.all-hands.dev), or jump to the [
 The easiest way to run OpenHands is in Docker. You can change `WORKSPACE_BASE` below to
 point OpenHands to existing code that you'd like to modify.

-See the [Getting Started](https://docs.all-hands.dev/modules/usage/getting-started) guide for
+See the [Installation](https://docs.all-hands.dev/modules/usage/installation) guide for
 system requirements and more information.

 ```bash
 export WORKSPACE_BASE=$(pwd)/workspace

-docker pull ghcr.io/all-hands-ai/runtime:0.9-nikolaik
+docker pull ghcr.io/all-hands-ai/runtime:0.11-nikolaik

 docker run -it --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.11-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
@@ -53,7 +53,7 @@ docker run -it --pull=always \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9
+    ghcr.io/all-hands-ai/openhands:0.11
 ```

 You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
@@ -65,7 +65,7 @@ You'll need a model provider and API key. One option that works well: [Claude 3.
 You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
 or as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode).

-Visit [Getting Started](https://docs.all-hands.dev/modules/usage/getting-started) for more information and setup instructions.
+Visit [Installation](https://docs.all-hands.dev/modules/usage/installation) for more information and setup instructions.

 If you want to modify the OpenHands source code, check out [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).

@@ -120,8 +120,8 @@ For a list of open source projects and licenses used in OpenHands, please see ou
 ## 📚 Cite

 ```
-@misc{opendevin,
-      title={{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}},
+@misc{openhands,
+      title={{OpenHands: An Open Platform for AI Software Developers as Generalist Agents}},
      author={Xingyao Wang and Boxuan Li and Yufan Song and Frank F. Xu and Xiangru Tang and Mingchen Zhuge and Jiayi Pan and Yueqi Song and Bowen Li and Jaskirat Singh and Hoang H. Tran and Fuqiang Li and Ren Ma and Mingzhang Zheng and Bill Qian and Yanjun Shao and Niklas Muennighoff and Yizhe Zhang and Binyuan Hui and Junyang Lin and Robert Brennan and Hao Peng and Heng Ji and Graham Neubig},
      year={2024},
      eprint={2407.16741},
--- a/agenthub/browsing_agent/response_parser.py
+++ b/agenthub/browsing_agent/response_parser.py
@@ -1,88 +0,0 @@
-import ast
-
-from openhands.controller.action_parser import ActionParser, ResponseParser
-from openhands.core.logger import openhands_logger as logger
-from openhands.events.action import (
-    Action,
-    BrowseInteractiveAction,
-)
-
-
-class BrowsingResponseParser(ResponseParser):
-    def __init__(self):
-        # Need to pay attention to the item order in self.action_parsers
-        super().__init__()
-        self.action_parsers = [BrowsingActionParserMessage()]
-        self.default_parser = BrowsingActionParserBrowseInteractive()
-
-    def parse(self, response: str) -> Action:
-        action_str = self.parse_response(response)
-        return self.parse_action(action_str)
-
-    def parse_response(self, response) -> str:
-        action_str = response['choices'][0]['message']['content']
-        if action_str is None:
-            return ''
-        action_str = action_str.strip()
-        if action_str and not action_str.endswith('```'):
-            action_str = action_str + ')```'
-        logger.debug(action_str)
-        return action_str
-
-    def parse_action(self, action_str: str) -> Action:
-        for action_parser in self.action_parsers:
-            if action_parser.check_condition(action_str):
-                return action_parser.parse(action_str)
-        return self.default_parser.parse(action_str)
-
-
-class BrowsingActionParserMessage(ActionParser):
-    """Parser action:
-    - BrowseInteractiveAction(browser_actions) - unexpected response format, message back to user
-    """
-
-    def __init__(
-        self,
-    ):
-        pass
-
-    def check_condition(self, action_str: str) -> bool:
-        return '```' not in action_str
-
-    def parse(self, action_str: str) -> Action:
-        msg = f'send_msg_to_user("""{action_str}""")'
-        return BrowseInteractiveAction(
-            browser_actions=msg,
-            thought=action_str,
-            browsergym_send_msg_to_user=action_str,
-        )
-
-
-class BrowsingActionParserBrowseInteractive(ActionParser):
-    """Parser action:
-    - BrowseInteractiveAction(browser_actions) - handle send message to user function call in BrowserGym
-    """
-
-    def __init__(
-        self,
-    ):
-        pass
-
-    def check_condition(self, action_str: str) -> bool:
-        return True
-
-    def parse(self, action_str: str) -> Action:
-        thought = action_str.split('```')[0].strip()
-        action_str = action_str.split('```')[1].strip()
-        msg_content = ''
-        for sub_action in action_str.split('\n'):
-            if 'send_msg_to_user(' in sub_action:
-                tree = ast.parse(sub_action)
-                args = tree.body[0].value.args  # type: ignore
-                msg_content = args[0].value
-
-        return BrowseInteractiveAction(
-            browser_actions=action_str,
-            thought=thought,
-            browsergym_send_msg_to_user=msg_content,
-        )
--- a/agenthub/codeact_agent/system_prompt.j2
+++ b/agenthub/codeact_agent/system_prompt.j2
@@ -1,52 +0,0 @@
-{% set MINIMAL_SYSTEM_PREFIX %}
-A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions.
-The assistant can use a Python environment with <execute_ipython>, e.g.:
-<execute_ipython>
-print("Hello World!")
-</execute_ipython>
-The assistant can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
-If a bash command returns exit code `-1`, this means the process is not yet finished.
-The assistant must then send a second <execute_bash>. The second <execute_bash> can be empty
-(which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process,
-or it can contain the text `ctrl+c` to interrupt the process.
-
-For commands that may run indefinitely, the output should be redirected to a file and the command run
-in the background, e.g. <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
-If a command execution result says "Command timed out. Sending SIGINT to the process",
-the assistant should retry running the command in the background.
-{% endset %}
-{% set BROWSING_PREFIX %}
-The assistant can browse the Internet with <execute_browse> and </execute_browse>.
-For example, <execute_browse> Tell me the usa's president using google search </execute_browse>.
-Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
-{% endset %}
-{% set PIP_INSTALL_PREFIX %}
-The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
-{% endset %}
-{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX %}
-{% set COMMAND_DOCS %}
-Apart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:
-{{ agent_skills_docs }}
-IMPORTANT:
- `open_file` only returns the first 100 lines of the file by default! The assistant MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
- The assistant shall adhere to THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRING PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write the line out, with all leading spaces before the code!
- Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
- Any code issued should be less than 50 lines to avoid context being cut off!
- After EVERY `create_file` the method `append_file` shall be used to write the FIRST content!
- For `edit_file_by_replace` NEVER provide empty parameters!
- For `edit_file_by_replace` the file must be read fully before any replacements!
-{% endset %}
-{% set SYSTEM_SUFFIX %}
-Responses should be concise.
-The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
-Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
-If the assistant is finished with the task you MUST include <finish></finish> in your response.
-IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_browse> whenever possible.
-The assistant should utilize full file paths and the `pwd` command to prevent path-related errors.
-The assistant must avoid apologies and thanks in its responses.
-
-{% endset %}
-{# Combine all parts without newlines between them #}
-{{ SYSTEM_PREFIX -}}
-{{- COMMAND_DOCS -}}
-{{- SYSTEM_SUFFIX }}
--- a/build.sh
+++ b/build.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -e
+
+cp pyproject.toml poetry.lock openhands
+poetry build -v
--- a/config.template.toml
+++ b/config.template.toml
@@ -13,6 +13,10 @@
 # API key for E2B
 #e2b_api_key = ""

+# API token ID and secret for Modal
+#modal_api_token_id = ""
+#modal_api_token_secret = ""
+
 # Base path for the workspace
 workspace_base = "./workspace"

@@ -28,6 +32,9 @@ workspace_base = "./workspace"
 # Enable saving and restoring the session when run from CLI
 #enable_cli_session = false

+# Path to store trajectories
+#trajectories_path="./trajectories"
+
 # File store path
 #file_store_path = "/tmp/file_store"

@@ -165,11 +172,9 @@ model = "gpt-4o"
 #disable_vision = true

 [llm.gpt4o-mini]
-# API key to use
 api_key = "your-api-key"
+model = "gpt-4o"

-# Model to use
-model = "gpt-4o-mini"

 #################################### Agent ###################################
 # Configuration for agents (group name starts with 'agent')
@@ -185,7 +190,7 @@ model = "gpt-4o-mini"
 #memory_enabled = false

 # Memory maximum threads
-#memory_max_threads = 2
+#memory_max_threads = 3

 # LLM config group to use
 #llm_config = 'your-llm-config-group'
@@ -206,7 +211,7 @@ llm_config = 'gpt3'
 #user_id = 1000

 # Container image to use for the sandbox
-#base_container_image = "nikolaik/python-nodejs:python3.11-nodejs22"
+#base_container_image = "nikolaik/python-nodejs:python3.12-nodejs22"

 # Use host network
 #use_host_network = false
--- a/containers/app/Dockerfile
+++ b/containers/app/Dockerfile
@@ -8,7 +8,7 @@ RUN npm install -g npm@10.5.1
 RUN npm ci

 COPY ./frontend ./
-RUN npm run make-i18n && npm run build
+RUN npm run build

 FROM python:3.12.3-slim AS backend-builder

@@ -28,7 +28,7 @@ COPY ./pyproject.toml ./poetry.lock ./
 RUN touch README.md
 RUN export POETRY_CACHE_DIR && poetry install --without evaluation,llama-index --no-root && rm -rf $POETRY_CACHE_DIR

-FROM python:3.12.3-slim AS runtime
+FROM python:3.12.3-slim AS openhands-app

 WORKDIR /app

@@ -69,7 +69,7 @@ RUN playwright install --with-deps chromium

 COPY --chown=openhands:app --chmod=770 ./openhands ./openhands
 COPY --chown=openhands:app --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
-COPY --chown=openhands:app --chmod=770 ./agenthub ./agenthub
+COPY --chown=openhands:app --chmod=770 ./openhands/agenthub ./openhands/agenthub
 COPY --chown=openhands:app ./pyproject.toml ./pyproject.toml
 COPY --chown=openhands:app ./poetry.lock ./poetry.lock
 COPY --chown=openhands:app ./README.md ./README.md
@@ -82,7 +82,7 @@ RUN python openhands/core/download.py # No-op to download assets
 # openhands:openhands -> openhands:app
 RUN find /app \! -group app -exec chgrp app {} +

-COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/dist ./frontend/dist
+COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/build ./frontend/build
 COPY --chown=openhands:app --chmod=770 ./containers/app/entrypoint.sh /app/entrypoint.sh

 USER root
--- a/containers/build.sh
+++ b/containers/build.sh
@@ -44,10 +44,10 @@ OPENHANDS_BUILD_VERSION="dev"
 cache_tag_base="buildcache"
 cache_tag="$cache_tag_base"

-if [[ -n $GITHUB_SHA ]]; then
-  git_hash=$(git rev-parse --short "$GITHUB_SHA")
+if [[ -n $RELEVANT_SHA ]]; then
+  git_hash=$(git rev-parse --short "$RELEVANT_SHA")
  tags+=("$git_hash")
-  tags+=("$GITHUB_SHA")
+  tags+=("$RELEVANT_SHA")
 fi

 if [[ -n $GITHUB_REF_NAME ]]; then
--- a/containers/dev/Dockerfile
+++ b/containers/dev/Dockerfile
@@ -55,18 +55,18 @@ RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | d
  && apt-get clean \
  && apt-get autoremove -y

-# Python 3.11
+# Python 3.12
 RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
-    && apt-get install -y python3.11 python3.11-venv python3.11-dev python3-pip \
-    && ln -s /usr/bin/python3.11 /usr/bin/python
+    && apt-get install -y python3.12 python3.12-venv python3.12-dev python3-pip \
+    && ln -s /usr/bin/python3.12 /usr/bin/python

 # NodeJS >= 18.17.1
 RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
    && apt-get install -y nodejs

 # Poetry >= 1.8
-RUN curl -fsSL https://install.python-poetry.org | python3.11 - \
+RUN curl -fsSL https://install.python-poetry.org | python3.12 - \
    && ln -s ~/.local/bin/poetry /usr/local/bin/poetry

 #
--- a/containers/runtime/README.md
+++ b/containers/runtime/README.md
@@ -3,10 +3,10 @@
 This folder builds a runtime image (sandbox), which will use a dynamically generated `Dockerfile`
 that depends on the `base_image` **AND** a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that is based on the current commit of `openhands`.

-The following command will generate a `Dockerfile` file for `nikolaik/python-nodejs:python3.11-nodejs22` (the default base image), an updated `config.sh` and the runtime source distribution files/folders into `containers/runtime`:
+The following command will generate a `Dockerfile` file for `nikolaik/python-nodejs:python3.12-nodejs22` (the default base image), an updated `config.sh` and the runtime source distribution files/folders into `containers/runtime`:

 ```bash
 poetry run python3 openhands/runtime/utils/runtime_build.py \
-    --base_image nikolaik/python-nodejs:python3.11-nodejs22 \
+    --base_image nikolaik/python-nodejs:python3.12-nodejs22 \
    --build_folder containers/runtime
 ```
--- a/dev_config/python/.pre-commit-config.yaml
+++ b/dev_config/python/.pre-commit-config.yaml
@@ -38,6 +38,6 @@ repos:
      - id: mypy
        additional_dependencies:
          [types-requests, types-setuptools, types-pyyaml, types-toml]
-        entry: mypy --config-file dev_config/python/mypy.ini openhands/ agenthub/
+        entry: mypy --config-file dev_config/python/mypy.ini openhands/
        always_run: true
        pass_filenames: false
--- a/docs/modules/usage/about.md
+++ b/docs/modules/usage/about.md
@@ -1,7 +1,3 @@
---
-sidebar_position: 8
---
-
 # 📚 Misc

 ## ⭐️ Research Strategy
--- a/docs/modules/usage/agents.md
+++ b/docs/modules/usage/agents.md
@@ -1,7 +1,3 @@
---
-sidebar_position: 3
---
-
 # 🧠 Main Agent and Capabilities

 ## CodeActAgent
--- a/docs/modules/usage/architecture/backend.mdx
+++ b/docs/modules/usage/architecture/backend.mdx
@@ -1,7 +1,3 @@
---
-sidebar_position: 7
---
-
 # 🏛️ System Architecture

 <div style={{ textAlign: 'center' }}>
--- a/docs/modules/usage/architecture/runtime.md
+++ b/docs/modules/usage/architecture/runtime.md
@@ -21,7 +21,7 @@ The OpenHands Runtime system uses a client-server architecture implemented with
 graph TD
    A[User-provided Custom Docker Image] --> B[OpenHands Backend]
    B -->|Builds| C[OH Runtime Image]
-    C -->|Launches| D[Runtime Client]
+    C -->|Launches| D[Action Executor]
    D -->|Initializes| E[Browser]
    D -->|Initializes| F[Bash Shell]
    D -->|Initializes| G[Plugins]
@@ -49,10 +49,10 @@ graph TD
 1. User Input: The user provides a custom base Docker image
 2. Image Building: OpenHands builds a new Docker image (the "OH runtime image") based on the user-provided image. This new image includes OpenHands-specific code, primarily the "runtime client"
 3. Container Launch: When OpenHands starts, it launches a Docker container using the OH runtime image
-4. Client Initialization: The runtime client initializes inside the container, setting up necessary components like a bash shell and loading any specified plugins
-5. Communication: The OpenHands backend (`runtime.py`) communicates with the runtime client over RESTful API, sending actions and receiving observations
+4. Action Execution Server Initialization: The action execution server initializes an `ActionExecutor` inside the container, setting up necessary components like a bash shell and loading any specified plugins
+5. Communication: The OpenHands backend (`openhands/runtime/impl/eventstream/eventstream_runtime.py`) communicates with the action execution server over RESTful API, sending actions and receiving observations
 6. Action Execution: The runtime client receives actions from the backend, executes them in the sandboxed environment, and sends back observations
-7. Observation Return: The client sends execution results back to the OpenHands backend as observations
+7. Observation Return: The action execution server sends execution results back to the OpenHands backend as observations


 The role of the client:
@@ -70,74 +70,46 @@ Check out the [relevant code](https://github.com/All-Hands-AI/OpenHands/blob/mai

 ### Image Tagging System

-OpenHands uses a dual-tagging system for its runtime images to balance reproducibility with flexibility:
+OpenHands uses a dual-tagging system for its runtime images to balance reproducibility with flexibility.
+Tags may be in one of 2 formats:

-1. Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`.
-   Example: `runtime:abc123def456`
+- **Generic**: `oh_v{openhands_version}_{16_digit_lock_hash}` (e.g.: `oh_v0.9.9_1234567890abcdef`)
+- **Specific**: `oh_v{openhands_version}_{16_digit_lock_hash}_{16_digit_source_hash}`
+  (e.g.: `oh_v0.9.9_1234567890abcdef_1234567890abcdef`)

-   - This tag is based on the MD5 hash of the Docker build folder, which includes the source code (of runtime client and related dependencies) and Dockerfile
-   - Identical hash tags guarantee that the images were built with exactly the same source code and Dockerfile
-   - This ensures reproducibility; the same hash always means the same image contents
+#### Lock Hash

-2. Generic tag: `{target_image_repo}:{target_image_tag}`.
-   Example: `runtime:oh_v0.9.3_ubuntu_tag_22.04`
+This hash is built from the first 16 digits of the MD5 of:
+- The name of the base image upon which the image was built (e.g.: `nikolaik/python-nodejs:python3.12-nodejs22`).
+- The content of the `pyproject.toml` included in the image.
+- The content of the `poetry.lock` included in the image.

-   - This tag follows the format: `runtime:oh_v{OH_VERSION}_{BASE_IMAGE_NAME}_tag_{BASE_IMAGE_TAG}`
-   - It represents the latest build for a particular base image and OpenHands version combination
-   - This tag is updated whenever a new image is built from the same base image, even if the source code changes
+This effectively gives a hash for the dependencies of OpenHands independent of the source code.

-The hash-based tag ensures reproducibility, while the generic tag provides a stable reference to the latest version of a particular configuration. This dual-tagging approach allows OpenHands to efficiently manage both development and production environments.
+#### Source Hash

-### Build Process
+This is the first 16 digits of the MD5 of the directory hash for the source directory. This gives a hash
+for only the OpenHands source.

-1. Image Naming Convention:
-   - Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`.
-     Example: `runtime:abc123def456`
-   - Generic tag: `{target_image_repo}:{target_image_tag}`.
-     Example: `runtime:oh_v0.9.3_ubuntu_tag_22.04`
+#### Build Process

-2. Build Process:
-   - a. Convert the base image name to an OH runtime image name
-      Example: `ubuntu:22.04` -> `runtime:oh_v0.9.3_ubuntu_tag_22.04`
-   - b. Generate a build context (Dockerfile and OpenHands source code) and calculate its hash
-   - c. Check for an existing image with the calculated hash
-   - d. If not found, check for a recent compatible image to use as a base
-   - e. If no compatible image exists, build from scratch using the original base image
-   - f. Tag the new image with both hash-based and generic tags
+When generating an image...

-3. Image Reuse and Rebuilding Logic:
-   The system follows these steps to determine whether to build a new image or use an existing one from a user-provided (base) image (e.g., `ubuntu:22.04`):
-   - a. If an image exists with the same hash (e.g., `runtime:abc123def456`), it will be reused as is
-   - b. If the exact hash is not found, the system will try to rebuild using the latest generic image (e.g., `runtime:oh_v0.9.3_ubuntu_tag_22.04`) as a base. This saves time by leveraging existing dependencies
-   - c. If neither the hash-tagged nor the generic-tagged image is found, the system will build the image completely from scratch
+- OpenHands first checks whether an image with the same **Specific** tag exists. If there is such an image,
+  no build is performed - the existing image is used.
+- OpenHands next checks whether an image with the **Generic** tag exists. If there is such an image,
+  OpenHands builds a new image based upon it, bypassing all installation steps (like `poetry install` and
+  `apt-get`) except a final operation to copy the current source code. The new image is tagged with a
+  **Specific** tag only.
+- If neither a **Specific** nor **Generic** tag exists, a brand new image is built based upon the base
+  image (which is a slower operation). This new image is tagged with both the **Generic** and **Specific**
+  tags.

-4. Caching and Efficiency:
-   - The system attempts to reuse existing images when possible to save build time
-   - If an exact match (by hash) is found, it's used without rebuilding
-   - If a compatible image is found, it's used as a base for rebuilding, saving time on dependency installation
-
-Here's a flowchart illustrating the build process:
-
-```mermaid
-flowchart TD
-    A[Start] --> B{Convert base image name}
-    B --> |ubuntu:22.04 -> runtime:oh_v0.9.3_ubuntu_tag_22.04| C[Generate build context and hash]
-    C --> D{Check for existing image with hash}
-    D -->|Found runtime:abc123def456| E[Use existing image]
-    D -->|Not found| F{Check for runtime:oh_v0.9.3_ubuntu_tag_22.04}
-    F -->|Found| G[Rebuild based on recent image]
-    F -->|Not found| H[Build from scratch]
-    G --> I[Tag with hash and generic tags]
-    H --> I
-    E --> J[End]
-    I --> J
-```
-
-This approach ensures that:
+This dual-tagging approach allows OpenHands to efficiently manage both development and production environments. It ensures that:

 1. Identical source code and Dockerfile always produce the same image (via hash-based tags)
 2. The system can quickly rebuild images when minor changes occur (by leveraging recent compatible images)
-3. The generic tag (e.g., `runtime:oh_v0.9.3_ubuntu_tag_22.04`) always points to the latest build for a particular base image and OpenHands version combination
+3. The generic tag (e.g., `runtime:oh_v0.9.3_1234567890abcdef`) always points to the latest build for a particular base image and OpenHands version combination

 ## Runtime Plugin System

--- a/docs/modules/usage/feedback.md
+++ b/docs/modules/usage/feedback.md
@@ -1,7 +1,3 @@
---
-sidebar_position: 5
---
-
 # ✅ Providing Feedback

 When using OpenHands, you will encounter cases where things work well, and others where they don't. We encourage you to provide feedback when you use OpenHands to help give feedback to the development team, and perhaps more importantly, create an open corpus of coding agent training examples -- Share-OpenHands!
--- a/docs/modules/usage/getting-started.mdx
+++ b/docs/modules/usage/getting-started.mdx
@@ -1,67 +1,111 @@
---
-sidebar_position: 2
---
+# Getting Started with OpenHands

-# Getting Started
+So you've [installed OpenHands](./installation) and have
+[set up your LLM](./installation#setup). Now what?

-## System Requirements
+OpenHands can help you tackle a wide variety of engineering tasks. But the technology
+is still new, and we're a long way off from having agents that can take on large, complicated
+engineering tasks without any guidance. So it's important to get a feel for what the agent
+does well, and where it might need some help.

-* Docker version 26.0.0+ or Docker Desktop 4.31.0+.
-* You must be using Linux or Mac OS.
-  * If you are on Windows, you must use [WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
+## Hello World

-## Installation
+The first thing you might want to try is a simple "hello world" example.
+This can be more complicated than it sounds!

-The easiest way to run OpenHands is in Docker. You can change `WORKSPACE_BASE` below to point OpenHands to
-existing code that you'd like to modify.
+Try prompting the agent with:
+> Please write a bash script hello.sh that prints "hello world!"

-```bash
-export WORKSPACE_BASE=$(pwd)/workspace
+You should see that the agent not only writes the script, it sets the correct
+permissions and runs the script to check the output.

-docker pull ghcr.io/all-hands-ai/runtime:0.9-nikolaik
+You can continue prompting the agent to refine your code. This is a great way to
+work with agents. Start simple, and iterate.

-docker run -it --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
-    -e SANDBOX_USER_ID=$(id -u) \
-    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-    -v $WORKSPACE_BASE:/opt/workspace_base \
-    -v /var/run/docker.sock:/var/run/docker.sock \
-    -p 3000:3000 \
-    --add-host host.docker.internal:host-gateway \
-    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9
-```
+> Please modify hello.sh so that it accepts a name as the first argument, but defaults to "world"

-You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), or using the [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action).
+You can also work in any language you need, though the agent might need to spend some
+time setting up its environment!

-## Setup
+> Please convert hello.sh to a Ruby script, and run it

-After running the command above, you'll find OpenHands running at [http://localhost:3000](http://localhost:3000).
+## Building From Scratch

-The agent will have access to the `./workspace` folder to do its work. You can copy existing code here, or change `WORKSPACE_BASE` in the
-command to point to an existing folder.
+Agents do exceptionally well at "greenfield" tasks (tasks where they don't need
+any context about an existing codebase) and they can just start from scratch.

-Upon launching OpenHands, you'll see a settings modal. You **must** select an `LLM Provider` and `LLM Model` and enter a corresponding `API Key`.
-These can be changed at any time by selecting the `Settings` button (gear icon) in the UI.
+It's best to start with a simple task, and then iterate on it. It's also best to be
+as specific as possible about what you want, what the tech stack should be, etc.

-If the required `LLM Model` does not exist in the list, you can toggle `Advanced Options` and manually enter it with the correct prefix
-in the `Custom Model` text box.
-The `Advanced Options` also allow you to specify a `Base URL` if required.
+For example, we might build a TODO app:

-<div style={{ display: 'flex', justifyContent: 'center', gap: '20px' }}>
-  <img src="/img/settings-screenshot.png" alt="settings-modal" width="340" />
-  <img src="/img/settings-advanced.png" alt="settings-modal" width="335" />
-</div>
+> Please build a basic TODO list app in React. It should be frontend-only, and all state
+> should be kept in localStorage.

-## Versions
+We can keep iterating on the app once the skeleton is there:

-The command above pulls the most recent stable release of OpenHands. You have other options as well:
- For a specific release, use `ghcr.io/all-hands-ai/openhands:$VERSION`, replacing $VERSION with the version number.
- We use semver, and release major, minor, and patch tags. So `0.9` will automatically point to the latest `0.9.x` release, and `0` will point to the latest `0.x.x` release.
- For the most up-to-date development version, you can use `ghcr.io/all-hands-ai/openhands:main`. This version is unstable and is recommended for testing or development purposes only.
+> Please allow adding an optional due date to every task

-You can choose the tag that best suits your needs based on stability requirements and desired features.
+Just like with normal development, it's good to commit and push your code frequently.
+This way you can always revert back to an old state if the agent goes off track.
+You can ask the agent to commit and push for you:

-For the development workflow, see [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
+> Please commit the changes and push them to a new branch called "feature/due-dates"

-Are you having trouble? Check out our [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting).
+
+## Adding New Code
+
+OpenHands can also do a great job adding new code to an existing code base.
+
+For example, you can ask OpenHands to add a new GitHub action to your project
+which lints your code. OpenHands may take a peek at your codebase to see what language
+it should use, but then it can just drop a new file into `./.github/workflows/lint.yml`.
+
+> Please add a GitHub action that lints the code in this repository
+
+Some tasks might require a bit more context. While OpenHands can use `ls` and `grep`
+to search through your codebase, providing context up front allows it to move faster,
+and more accurately. And it'll cost you fewer tokens!
+
+> Please modify ./backend/api/routes.js to add a new route that returns a list of all tasks
+
+> Please add a new React component that displays a list of Widgets to the ./frontend/components
+> directory. It should use the existing Widget component.
+
+## Refactoring
+
+OpenHands does great at refactoring existing code, especially in small chunks.
+You probably don't want to try rearchitecting your whole codebase, but breaking up
+long files and functions, renaming variables, etc. tend to work very well.
+
+> Please rename all the single-letter variables in ./app.go
+
+> Please break the function `build_and_deploy_widgets` into two functions, `build_widgets` and `deploy_widgets` in widget.php
+
+> Please break ./api/routes.js into separate files for each route
+
+## Bug Fixes
+
+OpenHands can also help you track down and fix bugs in your code. But, as any
+developer knows, bug fixing can be extremely tricky, and often OpenHands will need more context.
+It helps if you've diagnosed the bug, but want OpenHands to figure out the logic.
+
+> Currently the email field in the `/subscribe` endpoint is rejecting .io domains. Please fix this.
+
+> The `search_widgets` function in ./app.py is doing a case-sensitive search. Please make it case-insensitive.
+
+It often helps to do test-driven development when bugfixing with an agent.
+You can ask the agent to write a new test, and then iterate until it fixes the bug:
+
+> The `hello` function crashes on the empty string. Please write a test that reproduces this bug, then fix the code so it passes.
+
+## More
+
+OpenHands is capable of helping out on just about any coding task. But it takes some practice
+to get the most out of it. Remember to:
+* Keep your tasks small
+* Be as specific as possible
+* Provide as much context as possible
+* Commit and push frequently
+
+See [Prompting Best Practices](./prompting-best-practices) for more tips on how to get the most out of OpenHands.
--- a/docs/modules/usage/how-to/cli-mode.md
+++ b/docs/modules/usage/how-to/cli-mode.md
@@ -57,7 +57,7 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9 \
+    ghcr.io/all-hands-ai/openhands:0.11 \
    python -m openhands.core.cli
 ```

--- a/docs/modules/usage/how-to/custom-sandbox-guide.md
+++ b/docs/modules/usage/how-to/custom-sandbox-guide.md
@@ -1,81 +1,64 @@
 # Custom Sandbox

-The sandbox is where the agent does its work. Instead of running commands directly on your computer
-(which could be dangerous), the agent runs them inside of a Docker container.
+The sandbox is where the agent performs its tasks. Instead of running commands directly on your computer
+(which could be risky), the agent runs them inside a Docker container.

-The default OpenHands sandbox (`python-nodejs:python3.11-nodejs22`
+The default OpenHands sandbox (`python-nodejs:python3.12-nodejs22`
 from [nikolaik/python-nodejs](https://hub.docker.com/r/nikolaik/python-nodejs)) comes with some packages installed such
-as python and Node.js but your use case may need additional software installed by default.
+as Python and Node.js, but you may need other software installed by default.

-There are two ways you can do so:
+You have two options for customization:

-1. Use an existing image from docker hub.
-2. Creating your own custom docker image and using it.
+1. Use an existing image with the required software.
+2. Create your own custom Docker image.

-If you want to take the first approach, you can skip the `Create Your Docker Image` section.
-
-## Setup
-
-Make sure you are able to run OpenHands using the [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) first.
+If you choose the first option, you can skip the `Create Your Docker Image` section.

 ## Create Your Docker Image

-To create a custom docker image, it must be debian/ubuntu based.
+Your custom Docker image must be Debian-based.

-For example, if we want OpenHands to have access to the `node` binary, we would use the following Dockerfile:
+For example, if you want OpenHands to have `ruby` installed, create a `Dockerfile` with the following content:

 ```dockerfile
-# Start with latest ubuntu image
-FROM ubuntu:latest
+FROM debian:latest

-# Run needed updates
-RUN apt-get update && apt-get install -y
-
-# Install node
-RUN apt-get install -y nodejs
+# Install required packages
+RUN apt-get update && apt-get install -y ruby
 ```

-Next build your docker image with the name of your choice, for example `custom_image`.
+Save this file in a folder. Then, build your Docker image (e.g., named `custom-image`) by navigating to the folder in
+the terminal and running:
+```bash
+docker build -t custom-image .
+```

-To do this you can create a directory and put your file inside it with the name `Dockerfile`, and inside the directory run the following command:
+This will produce a new image called `custom-image`, which will be available in Docker.
+
+> Note that in the configuration described in this document, OpenHands will run as user "openhands" inside the
+> sandbox and thus all packages installed via the Dockerfile should be available to all users on the system, not just root.
+
+## Using the Development Workflow
+
+### Setup
+
+First, ensure you can run OpenHands by following the instructions in [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
+
+### Specify the Base Sandbox Image
+
+In the `config.toml` file within the OpenHands directory, set the `sandbox_base_container_image` to the image you want to use.
+This can be an image you’ve already pulled or one you’ve built:

 ```bash
-docker build -t custom_image .
-```
-
-This will produce a new image called ```custom_image``` that will be available in Docker Engine.
-
-> Note that in the configuration described in this document, OpenHands will run as user "openhands" inside the sandbox and thus all packages installed via the docker file should be available to all users on the system, not just root.
->
-> Installing with apt-get above installs node for all users.
-
-## Specify your sandbox image in config.toml file
-
-OpenHands configuration occurs via the top-level `config.toml` file.
-
-Create a `config.toml` file in the OpenHands directory and enter these contents:
-
-```toml
 [core]
-workspace_base="./workspace"
-run_as_openhands=true
-sandbox_base_container_image="custom_image"
+...
+sandbox_base_container_image="custom-image"
 ```

-For `sandbox_base_container_image`, you can specify either:
+### Run

-1. The name of your custom image that you built in the previous step (e.g., `”custom_image”`)
-2. A pre-existing image from Docker Hub (e.g., `”node:20”` if you want a sandbox with Node.js pre-installed)
-
-## Run
 Run OpenHands by running ```make run``` in the top level directory.

-Navigate to ```localhost:3001``` and check if your desired dependencies are available.
-
-In the case of the example above, running ```node -v``` in the terminal produces ```v20.15.0```.
-
-Congratulations!
-
 ## Technical Explanation

 Please refer to [custom docker image section of the runtime documentation](https://docs.all-hands.dev/modules/usage/architecture/runtime#advanced-how-openhands-builds-and-maintains-od-runtime-images) for more details.
--- a/docs/modules/usage/how-to/evaluation-harness.md
+++ b/docs/modules/usage/how-to/evaluation-harness.md
@@ -84,7 +84,7 @@ To create an evaluation workflow for your benchmark, follow these steps:

 1. Import relevant OpenHands utilities:
   ```python
-    import agenthub
+    import openhands.agenthub
    from evaluation.utils.shared import (
        EvalMetadata,
        EvalOutput,
@@ -134,9 +134,11 @@ To create an evaluation workflow for your benchmark, follow these steps:

 4. Create a function to process each instance:
   ```python
+   from openhands.utils.async_utils import call_async_from_sync
   def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
       config = get_config(instance, metadata)
-       runtime = create_runtime(config, sid=instance.instance_id)
+       runtime = create_runtime(config)
+       call_async_from_sync(runtime.connect)
       initialize_runtime(runtime, instance)

       instruction = get_instruction(instance, metadata)
--- a/docs/modules/usage/how-to/gui-mode.md
+++ b/docs/modules/usage/how-to/gui-mode.md
@@ -0,0 +1,51 @@
+# GUI Mode
+
+## Introduction
+
+OpenHands provides a user-friendly Graphical User Interface (GUI) mode for interacting with the AI assistant. This mode offers an intuitive way to set up the environment, manage settings, and communicate with the AI.
+
+## Installation and Setup
+
+1. Follow the instructions in the [Installation](../installation) guide to install OpenHands.
+
+2. After running the command, access OpenHands at [http://localhost:3000](http://localhost:3000).
+
+## Interacting with the GUI
+
+### Initial Setup
+
+1. Upon first launch, you'll see a settings modal.
+2. Select an `LLM Provider` and `LLM Model` from the dropdown menus.
+3. Enter the corresponding `API Key` for your chosen provider.
+4. Click "Save" to apply the settings.
+
+### Advanced Settings
+
+1. Toggle `Advanced Options` to access additional settings.
+2. Use the `Custom Model` text box to manually enter a model if it's not in the list.
+3. Specify a `Base URL` if required by your LLM provider.
+
+### Main Interface
+
+The main interface consists of several key components:
+
+1. **Chat Window**: The central area where you can view the conversation history with the AI assistant.
+2. **Input Box**: Located at the bottom of the screen, use this to type your messages or commands to the AI.
+3. **Send Button**: Click this to send your message to the AI.
+4. **Settings Button**: A gear icon that opens the settings modal, allowing you to adjust your configuration at any time.
+5. **Workspace Panel**: Displays the files and folders in your workspace, allowing you to navigate and view files, or the agent's past commands or web browsing history.
+
+### Interacting with the AI
+
+1. Type your question, request, or task description in the input box.
+2. Click the send button or press Enter to submit your message.
+3. The AI will process your input and provide a response in the chat window.
+4. You can continue the conversation by asking follow-up questions or providing additional information.
+
+## Tips for Effective Use
+
+1. Be specific in your requests to get the most accurate and helpful responses, as described in the [prompting best practices](../prompting-best-practices).
+2. Use the workspace panel to explore your project structure.
+3. Use one of the recommended models, as described in the [LLMs section](../llms/llms.md).
+
+Remember, the GUI mode of OpenHands is designed to make your interaction with the AI assistant as smooth and intuitive as possible. Don't hesitate to explore its features to maximize your productivity.
--- a/docs/modules/usage/how-to/headless-mode.md
+++ b/docs/modules/usage/how-to/headless-mode.md
@@ -51,6 +51,6 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9 \
+    ghcr.io/all-hands-ai/openhands:0.11 \
    python -m openhands.core.main -t "write a bash script that prints hi"
 ```
--- a/docs/modules/usage/installation.mdx
+++ b/docs/modules/usage/installation.mdx
@@ -0,0 +1,63 @@
+# Installation
+
+## System Requirements
+
+* Docker version 26.0.0+ or Docker Desktop 4.31.0+.
+* You must be using Linux or Mac OS.
+  * If you are on Windows, you must use [WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
+
+## Start the app
+
+The easiest way to run OpenHands is in Docker. You can change `WORKSPACE_BASE` below to point OpenHands to
+existing code that you'd like to modify.
+
+```bash
+export WORKSPACE_BASE=$(pwd)/workspace
+
+docker pull ghcr.io/all-hands-ai/runtime:0.11-nikolaik
+
+docker run -it --pull=always \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.11-nikolaik \
+    -e SANDBOX_USER_ID=$(id -u) \
+    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
+    -v $WORKSPACE_BASE:/opt/workspace_base \
+    -v /var/run/docker.sock:/var/run/docker.sock \
+    -p 3000:3000 \
+    --add-host host.docker.internal:host-gateway \
+    --name openhands-app-$(date +%Y%m%d%H%M%S) \
+    ghcr.io/all-hands-ai/openhands:0.11
+```
+
+You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), or using the [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action).
+
+## Setup
+
+After running the command above, you'll find OpenHands running at [http://localhost:3000](http://localhost:3000).
+
+The agent will have access to the `./workspace` folder to do its work. You can copy existing code here, or change `WORKSPACE_BASE` in the
+command to point to an existing folder.
+
+Upon launching OpenHands, you'll see a settings modal. You **must** select an `LLM Provider` and `LLM Model` and enter a corresponding `API Key`.
+These can be changed at any time by selecting the `Settings` button (gear icon) in the UI.
+
+If the required `LLM Model` does not exist in the list, you can toggle `Advanced Options` and manually enter it with the correct prefix
+in the `Custom Model` text box.
+The `Advanced Options` also allow you to specify a `Base URL` if required.
+
+<div style={{ display: 'flex', justifyContent: 'center', gap: '20px' }}>
+  <img src="/img/settings-screenshot.png" alt="settings-modal" width="340" />
+  <img src="/img/settings-advanced.png" alt="settings-modal" width="335" />
+</div>
+
+## Versions
+
+The command above pulls the most recent stable release of OpenHands. You have other options as well:
+- For a specific release, use `ghcr.io/all-hands-ai/openhands:$VERSION`, replacing $VERSION with the version number.
+- We use semver, and release major, minor, and patch tags. So `0.11` will automatically point to the latest `0.11.x` release, and `0` will point to the latest `0.x.x` release.
+- For the most up-to-date development version, you can use `ghcr.io/all-hands-ai/openhands:main`. This version is unstable and is recommended for testing or development purposes only.
+
+You can choose the tag that best suits your needs based on stability requirements and desired features.
+
+For the development workflow, see [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
+
+Are you having trouble? Check out our [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting).
--- a/docs/modules/usage/llms/azure-llms.md
+++ b/docs/modules/usage/llms/azure-llms.md
@@ -5,7 +5,7 @@ OpenHands uses LiteLLM to make calls to Azure's chat models. You can find their
 ## Azure OpenAI Configuration

 When running OpenHands, you'll need to set the following environment variable using `-e` in the
-[docker run command](/modules/usage/getting-started#installation):
+[docker run command](/modules/usage/installation#start-the-app):

 ```
 LLM_API_VERSION="<api-version>"              # e.g. "2023-05-15"
@@ -37,7 +37,7 @@ OpenHands uses llama-index for embeddings. You can find their documentation on A
 ### Azure OpenAI Configuration

 When running OpenHands, set the following environment variables using `-e` in the
-[docker run command](/modules/usage/getting-started#installation):
+[docker run command](/modules/usage/installation#start-the-app):

 ```
 LLM_EMBEDDING_MODEL="azureopenai"
--- a/docs/modules/usage/llms/google-llms.md
+++ b/docs/modules/usage/llms/google-llms.md
@@ -16,7 +16,7 @@ If the model is not in the list, toggle `Advanced Options`, and enter it in `Cus
 ## VertexAI - Google Cloud Platform Configs

 To use Vertex AI through Google Cloud Platform when running OpenHands, you'll need to set the following environment
-variables using `-e` in the [docker run command](/modules/usage/getting-started#installation):
+variables using `-e` in the [docker run command](/modules/usage/installation#start-the-app):

 ```
 GOOGLE_APPLICATION_CREDENTIALS="<json-dump-of-gcp-service-account-json>"
--- a/docs/modules/usage/llms/llms.md
+++ b/docs/modules/usage/llms/llms.md
@@ -1,16 +1,25 @@
---
-sidebar_position: 3
---
-
 # 🤖 LLM Backends

 OpenHands can connect to any LLM supported by LiteLLM. However, it requires a powerful model to work.
-The following are verified by the community to work with OpenHands:

-* claude-3-5-sonnet (recommended)
-* gemini-1.5-pro / gemini-1.5-flash
-* gpt-4 / gpt-4o
-* llama-3.1-405b / hermes-3-llama-3.1-405b
+## Model Recommendations
+
+Based on a recent evaluation of language models for coding tasks (using the SWE-bench dataset), we can provide some recommendations for model selection. The full analysis can be found in [this blog article](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed).
+
+When choosing a model, consider both the quality of outputs and the associated costs. Here's a summary of the findings:
+
+- Claude 3.5 Sonnet is the best by a fair amount, achieving a 27% resolve rate with the default agent in OpenHands.
+- GPT-4o lags behind, and o1-mini actually performed somewhat worse than GPT-4o. A brief analysis of the results suggested that o1-mini was sometimes "overthinking" things, performing extra environment configuration tasks when it could have simply gone ahead and finished the task.
+- Finally, the strongest open models were Llama 3.1 405B and deepseek-v2.5, and they performed reasonably, even besting some of the closed models.
+
+Please refer to the [full article](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) for more details.
+
+Based on these findings and community feedback, the following models have been verified to work reasonably well with OpenHands:
+
+- claude-3-5-sonnet (recommended)
+- gpt-4 / gpt-4o
+- llama-3.1-405b
+- deepseek-v2.5

 :::warning
 OpenHands will issue many prompts to the LLM you configure. Most of these LLMs cost money, so be sure to set spending
@@ -32,29 +41,30 @@ models driving it. However, if you do find ones that work, please add them to th
 ## LLM Configuration

 The following can be set in the OpenHands UI through the Settings:
-* `LLM Provider`
-* `LLM Model`
-* `API Key`
-* `Base URL` (through `Advanced Settings`)
+
+- `LLM Provider`
+- `LLM Model`
+- `API Key`
+- `Base URL` (through `Advanced Settings`)

 There are some settings that may be necessary for some LLMs/providers that cannot be set through the UI. Instead, these
-can be set through environment variables passed to the [docker run command](/modules/usage/getting-started#installation)
+can be set through environment variables passed to the [docker run command](/modules/usage/installation#start-the-app)
 using `-e`:

-* `LLM_API_VERSION`
-* `LLM_EMBEDDING_MODEL`
-* `LLM_EMBEDDING_DEPLOYMENT_NAME`
-* `LLM_DROP_PARAMS`
-* `LLM_DISABLE_VISION`
-* `LLM_CACHING_PROMPT`
+- `LLM_API_VERSION`
+- `LLM_EMBEDDING_MODEL`
+- `LLM_EMBEDDING_DEPLOYMENT_NAME`
+- `LLM_DROP_PARAMS`
+- `LLM_DISABLE_VISION`
+- `LLM_CACHING_PROMPT`

 We have a few guides for running OpenHands with specific model providers:

-* [Azure](llms/azure-llms)
-* [Google](llms/google-llms)
-* [Groq](llms/groq)
-* [OpenAI](llms/openai-llms)
-* [OpenRouter](llms/openrouter)
+- [Azure](llms/azure-llms)
+- [Google](llms/google-llms)
+- [Groq](llms/groq)
+- [OpenAI](llms/openai-llms)
+- [OpenRouter](llms/openrouter)

 ### API retries and rate limits

@@ -62,10 +72,10 @@ LLM providers typically have rate limits, sometimes very low, and may require re

 You can customize these options as you need for the provider you're using. Check their documentation, and set the following environment variables to control the number of retries and the time between retries:

-* `LLM_NUM_RETRIES` (Default of 8)
-* `LLM_RETRY_MIN_WAIT` (Default of 15 seconds)
-* `LLM_RETRY_MAX_WAIT` (Default of 120 seconds)
-* `LLM_RETRY_MULTIPLIER` (Default of 2)
+- `LLM_NUM_RETRIES` (Default of 8)
+- `LLM_RETRY_MIN_WAIT` (Default of 15 seconds)
+- `LLM_RETRY_MAX_WAIT` (Default of 120 seconds)
+- `LLM_RETRY_MULTIPLIER` (Default of 2)

 If you are running OpenHands in development mode, you can also set these options in the `config.toml` file:

--- a/docs/modules/usage/prompting-best-practices.md
+++ b/docs/modules/usage/prompting-best-practices.md
@@ -0,0 +1,41 @@
+# Prompting Best Practices
+
+When working with the OpenHands AI software developer, it's crucial to provide clear and effective prompts. This guide outlines best practices for creating prompts that will yield the most accurate and useful responses.
+
+## Characteristics of Good Prompts
+
+Good prompts are:
+
+1. **Concrete**: They explain exactly what functionality should be added or what error needs to be fixed.
+2. **Location-specific**: If known, they explain the locations in the code base that should be modified.
+3. **Appropriately scoped**: They should be the size of a single feature, typically not exceeding 100 lines of code.
+
+## Examples
+
+### Good Prompt Examples
+
+1. "Add a function `calculate_average` in `utils/math_operations.py` that takes a list of numbers as input and returns their average."
+
+2. "Fix the TypeError in `frontend/src/components/UserProfile.tsx` occurring on line 42. The error suggests we're trying to access a property of undefined."
+
+3. "Implement input validation for the email field in the registration form. Update `frontend/src/components/RegistrationForm.tsx` to check if the email is in a valid format before submission."
+
+### Bad Prompt Examples
+
+1. "Make the code better." (Too vague, not concrete)
+
+2. "Rewrite the entire backend to use a different framework." (Not appropriately scoped)
+
+3. "There's a bug somewhere in the user authentication. Can you find and fix it?" (Lacks specificity and location information)
+
+## Tips for Effective Prompting
+
+1. Be as specific as possible about the desired outcome or the problem to be solved.
+2. Provide context, including relevant file paths and line numbers if available.
+3. Break down large tasks into smaller, manageable prompts.
+4. Include any relevant error messages or logs.
+5. Specify the programming language or framework if it's not obvious from the context.
+
+Remember, the more precise and informative your prompt is, the better the AI can assist you in developing or modifying the OpenHands software.
+
+See [Getting Started with OpenHands](./getting-started) for more examples of helpful prompts.
--- a/docs/modules/usage/troubleshooting/troubleshooting.md
+++ b/docs/modules/usage/troubleshooting/troubleshooting.md
@@ -1,7 +1,3 @@
---
-sidebar_position: 4
---
-
 # 🚧 Troubleshooting

 There are some error messages that frequently get reported by users.
--- a/docs/modules/usage/upgrade-guide.md
+++ b/docs/modules/usage/upgrade-guide.md
@@ -1,7 +1,3 @@
---
-sidebar_position: 8
---
-
 # ⬆️ Upgrade Guide

 ## 0.8.0 (2024-07-13)
--- a/docs/package-lock.json
+++ b/docs/package-lock.json
@@ -12,7 +12,7 @@
        "@docusaurus/plugin-content-pages": "^3.5.2",
        "@docusaurus/preset-classic": "^3.5.2",
        "@docusaurus/theme-mermaid": "^3.5.2",
-        "@mdx-js/react": "^3.0.0",
+        "@mdx-js/react": "^3.1.0",
        "clsx": "^2.0.0",
        "prism-react-renderer": "^2.4.0",
        "react": "^18.3.1",
@@ -24,7 +24,7 @@
        "@docusaurus/module-type-aliases": "^3.5.1",
        "@docusaurus/tsconfig": "^3.5.2",
        "@docusaurus/types": "^3.5.1",
-        "typescript": "~5.6.2"
+        "typescript": "~5.6.3"
      },
      "engines": {
        "node": ">=18.0"
@@ -2883,9 +2883,9 @@
      }
    },
    "node_modules/@mdx-js/react": {
-      "version": "3.0.1",
-      "resolved": "https://registry.npmjs.org/@mdx-js/react/-/react-3.0.1.tgz",
-      "integrity": "sha512-9ZrPIU4MGf6et1m1ov3zKf+q9+deetI51zprKB1D/z3NOb+rUxxtEl3mCjW5wTGh6VhRdwPueh1oRzi6ezkA8A==",
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/@mdx-js/react/-/react-3.1.0.tgz",
+      "integrity": "sha512-QjHtSaoameoalGnKDT3FoIl4+9RwyTmo9ZJGBdLOks/YOiWHoRDI3PUwEzOE7kEmGcV3AFcp9K6dYu9rEuKLAQ==",
      "dependencies": {
        "@types/mdx": "^2.0.0"
      },
@@ -14853,9 +14853,9 @@
      }
    },
    "node_modules/typescript": {
-      "version": "5.6.2",
-      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.2.tgz",
-      "integrity": "sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==",
+      "version": "5.6.3",
+      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.3.tgz",
+      "integrity": "sha512-hjcS1mhfuyi4WW8IWtjP7brDrG2cuDZukyrYrSauoXGNgx0S7zceP07adYkJycEr56BOUTNPzbInooiN3fn1qw==",
      "bin": {
        "tsc": "bin/tsc",
        "tsserver": "bin/tsserver"
--- a/docs/package.json
+++ b/docs/package.json
@@ -19,7 +19,7 @@
    "@docusaurus/plugin-content-pages": "^3.5.2",
    "@docusaurus/preset-classic": "^3.5.2",
    "@docusaurus/theme-mermaid": "^3.5.2",
-    "@mdx-js/react": "^3.0.0",
+    "@mdx-js/react": "^3.1.0",
    "clsx": "^2.0.0",
    "prism-react-renderer": "^2.4.0",
    "react": "^18.3.1",
@@ -31,7 +31,7 @@
    "@docusaurus/module-type-aliases": "^3.5.1",
    "@docusaurus/tsconfig": "^3.5.2",
    "@docusaurus/types": "^3.5.1",
-    "typescript": "~5.6.2"
+    "typescript": "~5.6.3"
  },
  "browserslist": {
    "production": [
--- a/docs/sidebars.ts
+++ b/docs/sidebars.ts
@@ -3,15 +3,30 @@ import type { SidebarsConfig } from "@docusaurus/plugin-content-docs";
 const sidebars: SidebarsConfig = {
  apiSidebar: [require("./modules/python/sidebar.json")],
  docsSidebar: [
+    {
+      type: 'doc',
+      label: 'Installation',
+      id: 'usage/installation',
+    },
    {
      type: 'doc',
      label: 'Getting Started',
      id: 'usage/getting-started',
    },
+    {
+      type: 'doc',
+      label: 'Prompting Best Practices',
+      id: 'usage/prompting-best-practices',
+    },
    {
      type: 'category',
      label: 'Usage Methods',
      items: [
+        {
+          type: 'doc',
+          label: 'GUI Mode',
+          id: 'usage/how-to/gui-mode',
+        },
        {
          type: 'doc',
          label: 'CLI Mode',
--- a/docs/static/img/settings-advanced.png
+++ b/docs/static/img/settings-advanced.png
--- a/docs/static/img/settings-screenshot.png
+++ b/docs/static/img/settings-screenshot.png
--- a/docs/yarn.lock
+++ b/docs/yarn.lock
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -22,6 +22,8 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction
+from openhands.utils.async_utils import call_async_from_sync

 game = None

@@ -62,7 +64,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=False,
            use_host_network=False,
        ),
@@ -117,12 +119,13 @@ def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = create_runtime(config, sid=instance['text'].strip())
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)

    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
--- a/evaluation/EDA/scripts/run_infer.sh
+++ b/evaluation/EDA/scripts/run_infer.sh
@@ -36,7 +36,7 @@ fi

 # IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
 # We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -50,7 +50,6 @@ COMMAND="poetry run python evaluation/EDA/run_infer.py \
  --data-split test \
  --max-iterations 20 \
  --OPENAI_API_KEY $OPENAI_API_KEY \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${DATASET}"

--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -2,19 +2,47 @@

 This folder contains code and resources to run experiments and evaluations.

-## Logistics
+## For Benchmark Users

-To better organize the evaluation folder, we should follow the rules below:
+### Setup

- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain
-all the preprocessing/evaluation/analysis scripts.
- Raw data and experimental records should not be stored within this repo.
- For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
- Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.
+Before starting evaluation, follow the instructions [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to set up your local development environment and LLM.
+
+Once you are done with setup, you can follow the benchmark-specific instructions in each subdirectory of the evaluation directory.
+Generally these will involve running `run_infer.py` to perform inference with the agents.
+
+### Implementing and Evaluating an Agent
+
+To add an agent to OpenHands, you will need to implement it in the [agenthub directory](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub). There is a README there with more information.
+
+To evaluate an agent, you can provide the agent's name to the `run_infer.py` program.
+
+### Evaluating Different LLMs
+
+OpenHands in development mode uses `config.toml` to keep track of most configuration.
+Here's an example configuration file you can use to define and use multiple LLMs:
+
+```toml
+[llm]
+# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
+model = "gpt-4o-2024-05-13"
+api_key = "sk-XXX"
+
+[llm.eval_gpt4_1106_preview_llm]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[llm.eval_some_openai_compatible_model_llm]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```

 ## Supported Benchmarks

-To learn more about how to integrate your benchmark into OpenHands, check out [tutorial here](https://docs.all-hands.dev/modules/usage/how-to/evaluation-harness).
+The OpenHands evaluation harness supports a wide variety of benchmarks across software engineering, web browsing, and miscellaneous assistance tasks.

 ### Software Engineering

@@ -41,36 +69,19 @@ To learn more about how to integrate your benchmark into OpenHands, check out [t
 - Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
 - ProofWriter: [`evaluation/logic_reasoning`](./logic_reasoning)

-## Before everything begins: Setup Environment and LLM Configuration
-
-Please follow instruction [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to setup your local development environment and LLM.
-
-OpenHands in development mode uses `config.toml` to keep track of most configurations.
-
-Here's an example configuration file you can use to define and use multiple LLMs:
-
-```toml
-[llm]
-# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
-model = "gpt-4o-2024-05-13"
-api_key = "sk-XXX"
-
-[llm.eval_gpt4_1106_preview_llm]
-model = "gpt-4-1106-preview"
-api_key = "XXX"
-temperature = 0.0
-
-[llm.eval_some_openai_compatible_model_llm]
-model = "openai/MODEL_NAME"
-base_url = "https://OPENAI_COMPATIBLE_URL/v1"
-api_key = "XXX"
-temperature = 0.0
-```
-
-### Result Visualization
+## Result Visualization

 Check [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization of existing experimental results.

-### Upload your results
-
 You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results to our hosted huggingface repo via PR following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
+
+## For Benchmark Developers
+
+To learn more about how to integrate your benchmark into OpenHands, check out the [tutorial here](https://docs.all-hands.dev/modules/usage/how-to/evaluation-harness). Briefly,
+
+- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain
+all the preprocessing/evaluation/analysis scripts.
+- Raw data and experimental records should not be stored within this repo.
+- For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
+- Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.
+
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -32,7 +32,8 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync


 def get_config(
@@ -44,7 +45,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -209,7 +210,8 @@ def process_instance(
    # create sandbox and run the agent
    # =============================================

-    runtime: Runtime = create_runtime(config, sid=instance.instance_id)
+    runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)

    initialize_runtime(runtime, instance=instance)

@@ -217,7 +219,7 @@ def process_instance(
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
        )
--- a/evaluation/agent_bench/scripts/run_infer.sh
+++ b/evaluation/agent_bench/scripts/run_infer.sh
@@ -30,7 +30,6 @@ COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run pyt
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION"

--- a/evaluation/aider_bench/README.md
+++ b/evaluation/aider_bench/README.md
@@ -59,15 +59,13 @@ You can update the arguments in the script
 ## Summarize Results

 ```bash
-poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] [model_name]
-# with optional SKIP_NUM
-poetry run python SKIP_NUM=12 ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] [model_name]
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
 ```

 Full example:

 ```bash
-poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl claude-3-5-sonnet@20240620
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl
 ```

 This will list the instances that passed and the instances that failed. For each
@@ -81,21 +79,3 @@ outcome of the tests. If there are no syntax or indentation errors, you can
 expect to see something like "`..F...EF..`", where "`.`" means the test case
 passed, "`E`" means there was an error while executing the test case and "`F`"
 means some assertion failed and some returned output was not as expected.
-
-## Visualization
-
-If the required Python libraries are installed (`matplotlib.pyplot` and `seaborn`),
-the `summarize_results.py` script will also generate two histograms to
-the output folder.
-
-### Cost Histogram
-
-The cost histogram shows the number of successful and failed instances per cost point.
-
-![Cost Histogram](./examples/cost_histogram.png)
-
-### Actions Histogram
-
-The actions histogram shows per number of actions the number of successful and failed instances.
-
-![Actions Histogram](./examples/actions_histogram.png)
--- a/evaluation/aider_bench/examples/actions_histogram.png
+++ b/evaluation/aider_bench/examples/actions_histogram.png
--- a/evaluation/aider_bench/examples/cost_histogram.png
+++ b/evaluation/aider_bench/examples/cost_histogram.png
--- a/evaluation/aider_bench/run_infer.py
+++ b/evaluation/aider_bench/run_infer.py
@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import os
 import tempfile
 from typing import Any
@@ -24,13 +25,15 @@ from openhands.core.config import (
    AppConfig,
    SandboxConfig,
    get_llm_config_arg,
+    load_from_toml,
    parse_arguments,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 # Configure visibility of unit tests to the Agent.
 USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true'
@@ -46,19 +49,27 @@ def get_config(
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
-        runtime='eventstream',
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
            base_container_image='python:3.11-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
            timeout=100,
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
        ),
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    config.set_llm_config(metadata.llm_config)
+
+    # copy 'draft_editor' config if exists
+    config_copy = copy.deepcopy(config)
+    load_from_toml(config_copy)
+    if 'draft_editor' in config_copy.llms:
+        config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor')
+
    return config


@@ -129,7 +140,7 @@ def complete_runtime(
        logger.info(f'Running test file: {script_name}')

    action = CmdRunAction(
-        command=f'python -m unittest {script_name}',
+        command=f'python3 -m unittest {script_name}',
        keep_prompt=False,
    )
    logger.info(action, extra={'msg_type': 'ACTION'})
@@ -177,7 +188,9 @@ def process_instance(
        signature_file=f'{instance.instance_name}.py',
    )
    if USE_UNIT_TESTS:
-        print(f'\nInstruction to run test_file: {instance.instance_name}_test.py\n')
+        logger.info(
+            f'\nInstruction to run test_file: {instance.instance_name}_test.py\n'
+        )
        instruction += (
            f'Use `python -m unittest {instance.instance_name}_test.py` to run the test_file '
            'and verify the correctness of your solution. DO NOT EDIT the test file.\n\n'
@@ -194,7 +207,8 @@ def process_instance(
    # create sandbox and run the agent
    # =============================================

-    runtime: Runtime = create_runtime(config, sid=str(instance.instance_id))
+    runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)

    initialize_runtime(runtime, instance=instance)

@@ -202,7 +216,7 @@ def process_instance(
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
        )
--- a/evaluation/aider_bench/scripts/run_infer.sh
+++ b/evaluation/aider_bench/scripts/run_infer.sh
@@ -27,19 +27,24 @@ echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run python evaluation/aider_bench/run_infer.py \
-  --agent-cls $AGENT \
-  --llm-config $MODEL_CONFIG \
-  --max-iterations 30 \
-  --max-chars 10000000 \
-  --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+EVAL_NOTE=$AGENT_VERSION

 # Default to NOT use unit tests.
 if [ -z "$USE_UNIT_TESTS" ]; then
  export USE_UNIT_TESTS=false
 fi
 echo "USE_UNIT_TESTS: $USE_UNIT_TESTS"
+# If unit tests are used, append a "-w-test" suffix to EVAL_NOTE
+if [ "$USE_UNIT_TESTS" = true ]; then
+  EVAL_NOTE=$EVAL_NOTE-w-test
+fi
+
+COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run python evaluation/aider_bench/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 30 \
+  --eval-num-workers $NUM_WORKERS \
+  --eval-note $EVAL_NOTE"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
--- a/evaluation/aider_bench/scripts/summarize_results.py
+++ b/evaluation/aider_bench/scripts/summarize_results.py
@@ -1,61 +1,25 @@
-import json
-import os
-import sys
+import argparse

 import numpy as np
 import pandas as pd

-# Try to import visualization libraries
-visualization_available = False
-try:
-    import matplotlib.pyplot as plt
-    import seaborn as sns

-    visualization_available = True
-except ImportError:
-    print(
-        '\n*** WARNING: libraries matplotlib and/or seaborn are not installed.\n*** Visualization will not be available!\n'
-    )
-
-
-def show_usage():
-    print(
-        'Usage: poetry run python summarize_results.py <path_to_output_jsonl_file> <model_name>'
-    )
-    print(
-        'Example:\npoetry run python summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl claude-3-5-sonnet@20240620\n'
-    )
-
-
-def print_error(message: str):
-    print(f'\n***\n*** ERROR: {message}\n***\n')
-    show_usage()
-
-
-def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
+def extract_test_results(df: pd.DataFrame) -> tuple[list[str], list[str]]:
    passed = []
    failed = []
-    with open(res_file_path, 'r') as file:
-        for line in file:
-            data = json.loads(line.strip())
-            instance_id = data['instance_id']
-            resolved = False
-            if 'test_result' in data and 'exit_code' in data['test_result']:
-                resolved = data['test_result']['exit_code'] == 0
-            if resolved:
-                passed.append(instance_id)
-            else:
-                failed.append(instance_id)
+    for _, row in df.iterrows():
+        instance_id = row['instance_id']
+        resolved = False
+        if 'test_result' in row and 'exit_code' in row['test_result']:
+            resolved = row['test_result']['exit_code'] == 0
+        if resolved:
+            passed.append(instance_id)
+        else:
+            failed.append(instance_id)
    return passed, failed


-def visualize_results(json_file_path: str, model: str, output_dir: str):
-    # based on a Colab notebook by RajMaheshwari
-    with open(json_file_path, 'r') as f:
-        data = [json.loads(line) for line in f]
-
-    df = pd.DataFrame.from_records(data)
-
+def visualize_results(df: pd.DataFrame):
    df1 = pd.DataFrame()
    df1['cost'] = df['metrics'].apply(pd.Series)['accumulated_cost']
    df1['result'] = (
@@ -67,60 +31,35 @@ def visualize_results(json_file_path: str, model: str, output_dir: str):
    total = df.shape[0]
    resolve_rate = round((passed / total) * 100, 2)

-    print('Number of passed tests:', f'{passed}/{total}')
+    print('Number of passed tests:', f'{passed}/{total} {resolve_rate:.2f}%')
+    print('\nDescriptive statistics for number of actions:')
+    print(df1['actions'].describe())
+    print('\nDescriptive statistics for costs:')
+    print(df1['cost'].describe())

-    if not visualization_available:
-        return resolve_rate
+    # Bin counts for actions
+    action_bins = pd.cut(df1['actions'], bins=range(0, 32, 2))
+    print('\nAction bin counts:')
+    print(action_bins.value_counts().sort_index())

-    # Cost histogram
-    plt.figure(figsize=(10, 6))
-    bins = 10
-    mx = pd.Series.max(df1['cost'])
-    g = sns.histplot(df1, x='cost', bins=bins, hue='result', multiple='stack')
-    x_ticks = np.around(np.linspace(0, mx, bins + 1), 3)
-    g.set_xticks(x_ticks)
-    g.set_xlabel('Cost in $')
-    g.set_title(f'MODEL: {model}, RESOLVE_RATE: {resolve_rate}%', size=9)
-    plt.tight_layout()
-    plt.savefig(os.path.join(output_dir, 'cost_histogram.png'))
-    plt.close()
-
-    # Actions histogram
-    plt.figure(figsize=(10, 6))
-    bins = np.arange(0, 31, 2)
-    g = sns.histplot(df1, x='actions', bins=bins, hue='result', multiple='stack')
-    g.set_xticks(bins)
-    g.set_xlabel('# of actions')
-    g.set_title(f'MODEL: {model}, RESOLVE_RATE: {resolve_rate}%', size=9)
-    plt.tight_layout()
-    plt.savefig(os.path.join(output_dir, 'actions_histogram.png'))
-    plt.close()
+    # Bin counts for costs
+    cost_bins = pd.cut(df1['cost'], bins=10)
+    print('\nCost bin counts:')
+    print(cost_bins.value_counts().sort_index())

    return resolve_rate


 if __name__ == '__main__':
-    if len(sys.argv) != 3:
-        print_error('Argument(s) missing!')
-        sys.exit(1)
+    parser = argparse.ArgumentParser(description='Summarize AiderBench results')
+    parser.add_argument('input_filepath', type=str, help='Path to the JSONL file')
+    args = parser.parse_args()

-    json_file_path = sys.argv[1]
-    model_name = sys.argv[2]
+    # Create DataFrame from JSONL file
+    df = pd.read_json(args.input_filepath, lines=True)

-    if not os.path.exists(json_file_path):
-        print_error('Output file does not exist!')
-        sys.exit(1)
-    if not os.path.isfile(json_file_path):
-        print_error('Path-to-output-file is not a file!')
-        sys.exit(1)
-
-    output_dir = os.path.dirname(json_file_path)
-    if not os.access(output_dir, os.W_OK):
-        print_error('Output folder is not writable!')
-        sys.exit(1)
-
-    passed_tests, failed_tests = extract_test_results(json_file_path)
-    resolve_rate = visualize_results(json_file_path, model_name, output_dir)
+    passed_tests, failed_tests = extract_test_results(df)
+    resolve_rate = visualize_results(df)

    print(
        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {resolve_rate:.2f}%'
@@ -129,7 +68,3 @@ if __name__ == '__main__':
    print(passed_tests)
    print('FAILED TESTS:')
    print(failed_tests)
-    print(
-        '\nVisualization results were saved as cost_histogram.png and actions_histogram.png'
-    )
-    print('in folder: ', output_dir)
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -27,9 +27,10 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': functools.partial(
@@ -274,18 +275,15 @@ def process_instance(
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    # use a session id for concurrent evaluation
-    sid = instance.instance_id.replace('/', '__')
-
-    runtime = create_runtime(config, sid=sid)
-
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
--- a/evaluation/biocoder/scripts/run_infer.sh
+++ b/evaluation/biocoder/scripts/run_infer.sh
@@ -32,7 +32,6 @@ COMMAND="poetry run python evaluation/biocoder/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${DATASET}"

--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -32,7 +32,8 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync


 def codeact_user_response(state: State) -> str:
@@ -75,7 +76,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -402,14 +403,15 @@ def process_instance(
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    runtime = create_runtime(config, sid=instance_id)
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
            ],
--- a/evaluation/bird/scripts/run_infer.sh
+++ b/evaluation/bird/scripts/run_infer.sh
@@ -30,7 +30,6 @@ COMMAND="poetry run python evaluation/bird/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 5 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION" \

--- a/evaluation/browsing_delegation/run_infer.py
+++ b/evaluation/browsing_delegation/run_infer.py
@@ -23,6 +23,7 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction

 # Only CodeActAgent can delegate to BrowsingAgent
 SUPPORTED_AGENT_CLS = {'CodeActAgent'}
@@ -40,7 +41,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=False,
            use_host_network=False,
        ),
@@ -71,12 +72,12 @@ def process_instance(
        f'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
    )

-    runtime = create_runtime(config, sid=instance.instance_id)
+    runtime = create_runtime(config)

    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
        )
    )
--- a/evaluation/browsing_delegation/scripts/run_infer.sh
+++ b/evaluation/browsing_delegation/scripts/run_infer.sh
@@ -32,7 +32,6 @@ COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 1 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $EVAL_NOTE"

--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -28,7 +28,8 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data')

@@ -51,7 +52,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -141,14 +142,15 @@ def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX.get(metadata.agent_class, '')
    logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

-    runtime = create_runtime(config, sid=instance['instance_id'])
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
--- a/evaluation/gaia/scripts/run_infer.sh
+++ b/evaluation/gaia/scripts/run_infer.sh
@@ -41,7 +41,6 @@ COMMAND="poetry run python ./evaluation/gaia/run_infer.py \
  --max-iterations 30 \
  --level $LEVELS \
  --data-split validation \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${LEVELS}"

--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -24,6 +24,8 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction
+from openhands.utils.async_utils import call_async_from_sync

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
@@ -43,7 +45,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -79,11 +81,12 @@ def process_instance(
    # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = create_runtime(config, sid=instance_id)
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                metadata.agent_class
--- a/evaluation/gorilla/scripts/run_infer.sh
+++ b/evaluation/gorilla/scripts/run_infer.sh
@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/gorilla/run_infer.py \
  --max-iterations 30 \
  --hubs $HUBS \
  --data-split validation \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${LEVELS}"

--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -48,6 +48,7 @@ from openhands.events.action import (
    MessageAction,
 )
 from openhands.events.observation import Observation
+from openhands.utils.async_utils import call_async_from_sync

 ACTION_FORMAT = """
 <<FINAL_ANSWER||
@@ -65,7 +66,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -214,12 +215,12 @@ Again do not quit without reporting the answer first.
 Ok now its time to start solving the question. Good luck!
 """

-    runtime = create_runtime(config, sid=f'gptq_{str(instance.instance_id)}')
-
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                metadata.agent_class
--- a/evaluation/gpqa/scripts/run_infer.sh
+++ b/evaluation/gpqa/scripts/run_infer.sh
@@ -37,7 +37,6 @@ COMMAND="poetry run python evaluation/gpqa/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --data-split $DATA_SPLIT \
  --eval-note $AGENT_VERSION"
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -35,9 +35,10 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 IMPORT_HELPER = {
    'python': [
@@ -86,7 +87,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -99,7 +100,7 @@ def get_config(


 def _get_instance_id(instance: pd.Series) -> str:
-    return instance.task_id.replace('/', '__')
+    return instance.instance_id.replace('/', '__')


 def initialize_runtime(
@@ -206,9 +207,9 @@ def process_instance(
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
-        reset_logger_for_multiprocessing(logger, instance.task_id, log_dir)
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
-        logger.info(f'Starting evaluation for instance {instance.task_id}.')
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # Create file with HumanEvalFix problem
    # Prompt reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L509
@@ -232,12 +233,13 @@ def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance)
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                metadata.agent_class
@@ -257,7 +259,7 @@ def process_instance(

    # Save the output
    output = EvalOutput(
-        instance_id=instance.task_id,
+        instance_id=instance.instance_id,
        instruction=instruction,
        metadata=metadata,
        history=histories,
--- a/evaluation/humanevalfix/scripts/run_infer.sh
+++ b/evaluation/humanevalfix/scripts/run_infer.sh
@@ -68,7 +68,6 @@ COMMAND="poetry run python evaluation/humanevalfix/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION"

--- a/evaluation/integration_tests/README.md
+++ b/evaluation/integration_tests/README.md
@@ -0,0 +1,69 @@
+# Integration tests
+
+This directory implements the integration tests that [were previously run in CI](https://github.com/All-Hands-AI/OpenHands/tree/23d3becf1d6f5d07e592f7345750c314a826b4e9/tests/integration).
+
+[PR 3985](https://github.com/All-Hands-AI/OpenHands/pull/3985) introduced LLM-based editing, which requires access to an LLM to perform edits. Hence, we removed the integration tests from CI and intend to run them as a nightly evaluation to ensure the quality of OpenHands.
+
+## To add new tests
+
+Each test is a file named like `tXX_testname.py` where `XX` is a number.
+Make sure the name of each test file starts with `t` and ends with `.py`.
+
+Each test should be structured as a subclass of [`BaseIntegrationTest`](./tests/base.py), where you need to implement `initialize_runtime`, which sets up the runtime environment before the test, and `verify_result`, which takes a `Runtime` and the history of `Event`s and returns a `TestResult`. See [t01_fix_simple_typo.py](./tests/t01_fix_simple_typo.py) and [t05_simple_browsing.py](./tests/t05_simple_browsing.py) for two representative examples.
+
+```python
+class TestResult(BaseModel):
+    success: bool
+    reason: str | None = None
+
+
+class BaseIntegrationTest(ABC):
+    """Base class for integration tests."""
+
+    INSTRUCTION: str
+
+    @classmethod
+    @abstractmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """Initialize the runtime for the test to run."""
+        pass
+
+    @classmethod
+    @abstractmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Verify the result of the test.
+
+        This method will be called after the agent performs the task on the runtime.
+        """
+        pass
+```
+
+
+## Setup Environment and LLM Configuration
+
+Please follow the instructions [here](../README.md#setup) to set up your local
+development environment and LLM.
+
+## Start the evaluation
+
+```bash
+./evaluation/integration_tests/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
+```
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
+    your LLM settings, as defined in your `config.toml`.
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version
+    you would like to evaluate. It could also be a release tag like `0.9.0`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks,
+    defaulting to `CodeActAgent`.
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit`
+    instances. By default, the script evaluates all integration tests found
+    under `./tests`. Note: in order to use `eval_limit`, you must also set `agent`.
+- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
+- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the
+    given IDs (comma separated).
+
+Example:
+```bash
+./evaluation/integration_tests/scripts/run_infer.sh llm.claude-35-sonnet-eval HEAD CodeActAgent
+```
--- a/agenthub/micro/_instructions/actions/kill.md
+++ b/agenthub/micro/_instructions/actions/kill.md
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -0,0 +1,213 @@
+import asyncio
+import importlib.util
+import os
+
+import pandas as pd
+
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    codeact_user_response,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    parse_arguments,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+
+FAKE_RESPONSES = {
+    'CodeActAgent': codeact_user_response,
+}
+
+
+def get_config(
+    metadata: EvalMetadata,
+    instance_id: str,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        runtime='eventstream',
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            # use default base_container_image
+            enable_auto_lint=True,
+            use_host_network=False,
+            timeout=100,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    if metadata.llm_config.log_completions:
+        metadata.llm_config.log_completions_folder = os.path.join(
+            metadata.eval_output_dir, 'llm_completions', instance_id
+        )
+        logger.info(
+            f'Logging LLM completions for instance {instance_id} to '
+            f'{metadata.llm_config.log_completions_folder}'
+        )
+    config.set_llm_config(metadata.llm_config)
+    return config
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+) -> EvalOutput:
+    config = get_config(metadata, instance.instance_id)
+
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir)
+    else:
+        logger.info(
+            f'\nStarting evaluation for instance {str(instance.instance_id)}.\n'
+        )
+
+    # =============================================
+    # import test instance
+    # =============================================
+    instance_id = instance.instance_id
+    spec = importlib.util.spec_from_file_location(instance_id, instance.file_path)
+    test_module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(test_module)
+    assert hasattr(
+        test_module, 'Test'
+    ), f'Test module {instance_id} does not have a Test class'
+
+    test_class: type[BaseIntegrationTest] = test_module.Test
+    assert issubclass(
+        test_class, BaseIntegrationTest
+    ), f'Test class {instance_id} does not inherit from BaseIntegrationTest'
+
+    instruction = test_class.INSTRUCTION
+
+    # =============================================
+    # create sandbox and run the agent
+    # =============================================
+
+    runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+
+    test_class.initialize_runtime(runtime)
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    state: State | None = asyncio.run(
+        run_controller(
+            config=config,
+            initial_user_action=MessageAction(content=instruction),
+            runtime=runtime,
+            fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
+        )
+    )
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    # # =============================================
+    # # result evaluation
+    # # =============================================
+
+    histories = state.history.get_events()
+    test_result: TestResult = test_class.verify_result(runtime, histories)
+    metrics = state.metrics.get() if state.metrics else None
+
+    # Save the output
+    output = EvalOutput(
+        instance_id=str(instance.instance_id),
+        instance=instance.to_dict(),
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result=test_result.model_dump(),
+    )
+    return output
+
+
+def load_integration_tests() -> pd.DataFrame:
+    """Load tests from `t*.py` files under ./tests; `instance_id` is the file stem."""
+    test_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tests')
+    test_files = [
+        os.path.join(test_dir, f)
+        for f in os.listdir(test_dir)
+        if f.startswith('t') and f.endswith('.py')
+    ]
+    df = pd.DataFrame(test_files, columns=['file_path'])
+    # splitext, not rstrip('.py'), which strips chars: 't06_copy.py' -> 't06_co'
+    df['instance_id'] = df['file_path'].apply(
+        lambda x: os.path.splitext(os.path.basename(x))[0]
+    )
+    return df
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    integration_tests = load_integration_tests()
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
+
+    metadata = make_metadata(
+        llm_config,
+        'integration_tests',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+    )
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+
+    # Parse dataset IDs if provided
+    eval_ids = None
+    if args.eval_ids:
+        eval_ids = str(args.eval_ids).split(',')
+        logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n')
+
+    instances = prepare_dataset(
+        integration_tests,
+        output_file,
+        args.eval_n_limit,
+        eval_ids=eval_ids,
+    )
+
+    run_evaluation(
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+    )
+
+    df = pd.read_json(output_file, lines=True, orient='records')
+    df['success'] = df['test_result'].apply(lambda x: x['success'])
+    df['reason'] = df['test_result'].apply(lambda x: x['reason'])
+    logger.info('-' * 100)
+    logger.info(
+        f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})'
+    )
+    logger.info(
+        '\nEvaluation Results:'
+        + '\n'
+        + df[['instance_id', 'success', 'reason']].to_string(index=False)
+    )
+    logger.info('-' * 100)
--- a/evaluation/integration_tests/scripts/run_infer.sh
+++ b/evaluation/integration_tests/scripts/run_infer.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
+MODEL_CONFIG=$1
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+NUM_WORKERS=$5
+EVAL_IDS=$6
+
+if [ -z "$NUM_WORKERS" ]; then
+  NUM_WORKERS=1
+  echo "Number of workers not specified, use default $NUM_WORKERS"
+fi
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+get_agent_version
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+EVAL_NOTE=$AGENT_VERSION
+
+# Default to NOT use unit tests.
+if [ -z "$USE_UNIT_TESTS" ]; then
+  export USE_UNIT_TESTS=false
+fi
+echo "USE_UNIT_TESTS: $USE_UNIT_TESTS"
+# If unit tests are used, append "-w-test" to EVAL_NOTE
+if [ "$USE_UNIT_TESTS" = true ]; then
+  EVAL_NOTE=$EVAL_NOTE-w-test
+fi
+
+# export PYTHONPATH=evaluation/integration_tests:\$PYTHONPATH
+COMMAND="poetry run python evaluation/integration_tests/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 10 \
+  --eval-num-workers $NUM_WORKERS \
+  --eval-note $EVAL_NOTE"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+if [ -n "$EVAL_IDS" ]; then
+  echo "EVAL_IDS: $EVAL_IDS"
+  COMMAND="$COMMAND --eval-ids $EVAL_IDS"
+fi
+
+# Run the command
+eval $COMMAND
--- a/evaluation/integration_tests/tests/init.py
+++ b/evaluation/integration_tests/tests/init.py
--- a/evaluation/integration_tests/tests/base.py
+++ b/evaluation/integration_tests/tests/base.py
@@ -0,0 +1,32 @@
+from abc import ABC, abstractmethod
+
+from pydantic import BaseModel
+
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class TestResult(BaseModel):
+    success: bool
+    reason: str | None = None
+
+
+class BaseIntegrationTest(ABC):
+    """Base class for integration tests."""
+
+    INSTRUCTION: str
+
+    @classmethod
+    @abstractmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        """Initialize the runtime for the test to run."""
+        pass
+
+    @classmethod
+    @abstractmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        """Verify the result of the test.
+
+        This method will be called after the agent performs the task on the runtime.
+        """
+        pass
--- a/evaluation/integration_tests/tests/t01_fix_simple_typo.py
+++ b/evaluation/integration_tests/tests/t01_fix_simple_typo.py
@@ -0,0 +1,39 @@
+import os
+import tempfile
+
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from openhands.events.action import CmdRunAction
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    INSTRUCTION = 'Fix typos in bad.txt.'
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        # create a file with a typo in /workspace/bad.txt
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_file_path = os.path.join(temp_dir, 'bad.txt')
+            with open(temp_file_path, 'w') as f:
+                f.write('This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!')
+
+            # Copy the file to the desired location
+            runtime.copy_to(temp_file_path, '/workspace')
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        # check if the file /workspace/bad.txt has been fixed
+        action = CmdRunAction(command='cat /workspace/bad.txt', keep_prompt=False)
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False, reason=f'Failed to run command: {obs.content}'
+            )
+        # check if the file /workspace/bad.txt has been fixed
+        if (
+            obs.content.strip().replace('\r\n', '\n')
+            == 'This is a stupid typo.\nReally?\nNo more typos!\nEnjoy!'
+        ):
+            return TestResult(success=True)
+        return TestResult(success=False, reason=f'File not fixed: {obs.content}')
--- a/evaluation/integration_tests/tests/t02_add_bash_hello.py
+++ b/evaluation/integration_tests/tests/t02_add_bash_hello.py
@@ -0,0 +1,40 @@
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import assert_and_raise
+from openhands.events.action import CmdRunAction
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    INSTRUCTION = "Write a shell script '/workspace/hello.sh' that prints 'hello'."
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        # check if the file /workspace/hello.sh exists
+        action = CmdRunAction(command='cat /workspace/hello.sh', keep_prompt=False)
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to cat /workspace/hello.sh: {obs.content}.',
+            )
+
+        # execute the script
+        action = CmdRunAction(command='bash /workspace/hello.sh', keep_prompt=False)
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to execute /workspace/hello.sh: {obs.content}.',
+            )
+        if obs.content.strip() != 'hello':
+            return TestResult(
+                success=False, reason=f'Script did not print "hello": {obs.content}.'
+            )
+        return TestResult(success=True)
--- a/evaluation/integration_tests/tests/t03_jupyter_write_file.py
+++ b/evaluation/integration_tests/tests/t03_jupyter_write_file.py
@@ -0,0 +1,43 @@
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import assert_and_raise
+from openhands.events.action import CmdRunAction
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    INSTRUCTION = "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        # check if the file /workspace/test.txt exists
+        action = CmdRunAction(command='cat /workspace/test.txt', keep_prompt=False)
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to cat /workspace/test.txt: {obs.content}.',
+            )
+
+        # read the file again to check its contents
+        action = CmdRunAction(command='cat /workspace/test.txt', keep_prompt=False)
+        obs = runtime.run_action(action)
+
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to cat /workspace/test.txt: {obs.content}.',
+            )
+
+        if 'hello world' not in obs.content.strip():
+            return TestResult(
+                success=False,
+                reason=f'File did not contain "hello world": {obs.content}.',
+            )
+        return TestResult(success=True)
--- a/evaluation/integration_tests/tests/t04_git_staging.py
+++ b/evaluation/integration_tests/tests/t04_git_staging.py
@@ -0,0 +1,58 @@
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import assert_and_raise
+from openhands.events.action import CmdRunAction
+from openhands.events.event import Event
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    INSTRUCTION = 'Write a git commit message for the current staging area and commit the changes.'
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        # git init (NOTE(review): no cd first; assumes the runtime's cwd is /workspace)
+        action = CmdRunAction(command='git init', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        # create hello.py containing a single print statement
+        action = CmdRunAction(
+            command='echo \'print("hello world")\' > hello.py', keep_prompt=False
+        )
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        # stage hello.py so the agent starts with a non-empty staging area
+        action = CmdRunAction(command='git add hello.py', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        # check if the file /workspace/hello.py exists
+        action = CmdRunAction(command='cat /workspace/hello.py', keep_prompt=False)
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False,
+                reason=f'Failed to cat /workspace/hello.py: {obs.content}.',
+            )
+
+        # success only if git status reports a clean tree with nothing left to commit
+        action = CmdRunAction(command='git status', keep_prompt=False)
+        obs = runtime.run_action(action)
+        if obs.exit_code != 0:
+            return TestResult(
+                success=False, reason=f'Failed to git status: {obs.content}.'
+            )
+        if 'nothing to commit, working tree clean' in obs.content.strip():
+            return TestResult(success=True)
+
+        return TestResult(
+            success=False,
+            reason=f'Failed to check for "nothing to commit, working tree clean": {obs.content}.',
+        )
--- a/evaluation/integration_tests/tests/t05_simple_browsing.py
+++ b/evaluation/integration_tests/tests/t05_simple_browsing.py
@@ -0,0 +1,134 @@
+import os
+import tempfile
+
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from evaluation.utils.shared import assert_and_raise
+from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
+from openhands.events.event import Event
+from openhands.events.observation import AgentDelegateObservation
+from openhands.runtime.base import Runtime
+
+HTML_FILE = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>The Ultimate Answer</title>
+    <style>
+        body {
+            display: flex;
+            justify-content: center;
+            align-items: center;
+            height: 100vh;
+            margin: 0;
+            background: linear-gradient(to right, #1e3c72, #2a5298);
+            color: #fff;
+            font-family: 'Arial', sans-serif;
+            text-align: center;
+        }
+        .container {
+            text-align: center;
+            padding: 20px;
+            background: rgba(255, 255, 255, 0.1);
+            border-radius: 10px;
+            box-shadow: 0 0 10px rgba(0, 0, 0, 0.2);
+        }
+        h1 {
+            font-size: 36px;
+            margin-bottom: 20px;
+        }
+        p {
+            font-size: 18px;
+            margin-bottom: 30px;
+        }
+        #showButton {
+            padding: 10px 20px;
+            font-size: 16px;
+            color: #1e3c72;
+            background: #fff;
+            border: none;
+            border-radius: 5px;
+            cursor: pointer;
+            transition: background 0.3s ease;
+        }
+        #showButton:hover {
+            background: #f0f0f0;
+        }
+        #result {
+            margin-top: 20px;
+            font-size: 24px;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>The Ultimate Answer</h1>
+        <p>Click the button to reveal the answer to life, the universe, and everything.</p>
+        <button id="showButton">Click me</button>
+        <div id="result"></div>
+    </div>
+    <script>
+        document.getElementById('showButton').addEventListener('click', function() {
+            document.getElementById('result').innerText = 'The answer is OpenHands is all you need!';
+        });
+    </script>
+</body>
+</html>
+"""
+
+
+class Test(BaseIntegrationTest):
+    INSTRUCTION = 'Browse localhost:8000, and tell me the ultimate answer to life.'
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        action = CmdRunAction(command='mkdir -p /tmp/server', keep_prompt=False)
+        obs = runtime.run_action(action)
+        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
+
+        # write HTML_FILE to index.html in a local temp dir, then copy it to /tmp/server
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_file_path = os.path.join(temp_dir, 'index.html')
+            with open(temp_file_path, 'w') as f:
+                f.write(HTML_FILE)
+            # Copy the file to the desired location
+            runtime.copy_to(temp_file_path, '/tmp/server')
+
+        # serve /tmp/server on port 8000 in the background (NOTE(review): obs is not checked)
+        action = CmdRunAction(
+            command='cd /tmp/server && nohup python3 -m http.server 8000 &',
+            keep_prompt=False,
+        )
+        obs = runtime.run_action(action)
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        # check if "OpenHands is all you need!" appears in any message, finish or delegate event
+        message_actions = [
+            event
+            for event in histories
+            if isinstance(
+                event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
+            )
+        ]
+        for event in message_actions:
+            if isinstance(event, AgentDelegateObservation):
+                content = event.content
+            elif isinstance(event, AgentFinishAction):
+                content = event.outputs.get('content', '')
+            elif isinstance(event, MessageAction):
+                content = event.content
+            else:
+                raise ValueError(f'Unknown event type: {type(event)}')
+
+            if 'OpenHands is all you need!' in content:
+                return TestResult(success=True)
+        return TestResult(
+            success=False,
+            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
+        )
--- a/evaluation/logic_reasoning/Dockerfile
+++ b/evaluation/logic_reasoning/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11-bookworm
+FROM python:3.12-bookworm

 RUN pip install scitools-pyke

--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -29,7 +29,8 @@ from openhands.events.action import (
    MessageAction,
 )
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
@@ -201,17 +202,15 @@ def process_instance(
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    # use a session id for concurrent evaluation
-    sid = instance['instance_id']
-
-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                metadata.agent_class
--- a/evaluation/logic_reasoning/scripts/run_infer.sh
+++ b/evaluation/logic_reasoning/scripts/run_infer.sh
@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/logic_reasoning/run_infer.py \
  --llm-config $MODEL_CONFIG \
  --dataset $DATASET \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION"

--- a/evaluation/miniwob/Dockerfile
+++ b/evaluation/miniwob/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11-bookworm
+FROM python:3.12-bookworm

 RUN apt-get update && apt-get install -y python3 python3-pip git

--- a/evaluation/miniwob/README.md
+++ b/evaluation/miniwob/README.md
@@ -1,4 +1,4 @@
-# WebArena Evaluation with OpenHands Browsing Agents
+# Mini-World of Bits Evaluation with OpenHands Browsing Agents

 This folder contains evaluation for [MiniWoB++](https://miniwob.farama.org/) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym) for easy evaluation of how well an agent capable of browsing can perform on synthetic web browsing tasks.

--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -30,11 +30,12 @@ from openhands.events.action import (
    MessageAction,
 )
 from openhands.events.observation import CmdOutputObservation
+from openhands.runtime.base import Runtime
 from openhands.runtime.browser.browser_env import (
    BROWSER_EVAL_GET_GOAL_ACTION,
    BROWSER_EVAL_GET_REWARDS_ACTION,
 )
-from openhands.runtime.runtime import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 SUPPORTED_AGENT_CLS = {'BrowsingAgent'}

@@ -126,13 +127,15 @@ def process_instance(
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

-    runtime = create_runtime(config, sid=env_id)
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    task_str = initialize_runtime(runtime)
-
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=task_str,  # take output from initialize_runtime
+            initial_user_action=MessageAction(
+                content=task_str
+            ),  # take output from initialize_runtime
            runtime=runtime,
        )
    )
--- a/evaluation/miniwob/scripts/run_infer.sh
+++ b/evaluation/miniwob/scripts/run_infer.sh
@@ -37,7 +37,6 @@ COMMAND="poetry run python evaluation/miniwob/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS"

 if [ -n "$EVAL_LIMIT" ]; then
--- a/evaluation/mint/Dockerfile
+++ b/evaluation/mint/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11-bookworm
+FROM python:3.12-bookworm

 RUN apt-get update && apt-get install -y python3 python3-pip git gcc

--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -29,9 +29,11 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import (
    CmdRunAction,
+    MessageAction,
 )
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync


 def codeact_user_response_mint(state: State, task: Task, task_config: dict[str, int]):
@@ -174,13 +176,14 @@ def process_instance(
        },
    )

-    runtime = create_runtime(config, sid=instance.instance_id)
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime)

    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=fake_user_response_fn,
        )
--- a/evaluation/mint/tasks/reasoning.py
+++ b/evaluation/mint/tasks/reasoning.py
@@ -131,11 +131,9 @@ class MultipleChoiceTask(Task):


 def compare_two_numbers(p, gt):
-    if isinstance(p, int) or isinstance(p, float):
+    if isinstance(p, (int, float)):
        pass
-    elif isinstance(p, list) or isinstance(p, bool) or isinstance(p, str):
-        return False
-    elif isinstance(p, tuple) or isinstance(p, complex) or isinstance(p, dict):
+    elif isinstance(p, (bool, complex, dict, list, str, tuple)):
        return False
    else:
        raise ValueError(p)
@@ -227,8 +225,8 @@ class TheoremqaTask(Task):
            prediction = prediction.replace('°', '')

        # Detect the boolean keyword in the generation
-        if prediction in ['true', 'yes', 'false', 'no']:
-            if prediction == 'true' or prediction == 'yes':
+        if prediction in ('true', 'yes', 'false', 'no'):
+            if prediction in ('true', 'yes'):
                prediction = 'True'
            else:
                prediction = 'False'
@@ -342,7 +340,7 @@ class TheoremqaTask(Task):
        answer_type = self._answer_type
        gt = self.extract_answer(self.reference)

-        if isinstance(prediction, (str, int, float)) or isinstance(prediction, list):
+        if isinstance(prediction, (str, int, float, list)):
            # Comparing prediction against the reference
            if answer_type in ['bool', 'option', 'Option']:
                cur_correct = int(prediction == f'({gt})') or int(prediction == gt)
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -39,9 +39,10 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 config = load_app_config()

@@ -211,9 +212,6 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
    else:
        logger.info(f'Starting evaluation for instance {instance["instance_id"]}.')

-    # Create a sandbox, using the instance ID and PID as the session ID to avoid conflicts
-    sid = str(instance['instance_id'])
-
    repo_url = instance['github']
    repo_name = repo_url.split('/')[-1]
    task_path = os.path.join('/workspace', repo_name, instance['path'][2:])
@@ -235,14 +233,15 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
    )
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance)

    # Run the agent
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                metadata.agent_class
--- a/evaluation/regression/conftest.py
+++ b/evaluation/regression/conftest.py
@@ -8,7 +8,7 @@ import pytest

 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 CASES_DIR = os.path.join(SCRIPT_DIR, 'cases')
-AGENTHUB_DIR = os.path.join(SCRIPT_DIR, '../../', 'agenthub')
+AGENTHUB_DIR = os.path.join(SCRIPT_DIR, '../', 'agenthub')


 def agents():
--- a/Show More
+++ b/Show More