Setup basic auth guard

Fix for lockup - create the runtime in a background thread (#4412)
Co-authored-by: Robert Brennan <contact@rbren.io>
2026-04-29 03:00:45 -04:00 · 2024-10-16 12:25:48 +04:00 · 2024-10-15 23:52:21 +00:00 · 2024-10-15 22:45:08 +00:00 · 2024-10-15 19:31:49 +00:00 · 2024-10-15 19:06:40 +02:00
558 changed files with 29810 additions and 11973 deletions
@@ -2,7 +2,5 @@
 sudo apt update
 sudo apt install -y netcat
 sudo add-apt-repository -y ppa:deadsnakes/ppa
-sudo apt install -y python3.11
-curl -sSL https://install.python-poetry.org | python3.11 -
-# chromadb requires SQLite > 3.35 but SQLite in Python3.11.9 comes with 3.31.1
-sudo cp /opt/conda/lib/libsqlite3.so.0 /lib/x86_64-linux-gnu/libsqlite3.so.0
+sudo apt install -y python3.12
+curl -sSL https://install.python-poetry.org | python3.12 -
@@ -5,71 +5,55 @@ labels: ['bug']
 body:
  - type: markdown
    attributes:
-      value: Thank you for taking the time to fill out this bug report. We greatly appreciate your effort to complete this template fully. Please provide as much information as possible to help us understand and address the issue effectively.
+      value: Thank you for taking the time to fill out this bug report. Please provide as much information as possible to help us understand and address the issue effectively.

  - type: checkboxes
    attributes:
      label: Is there an existing issue for the same bug?
      description: Please check if an issue already exists for the bug you encountered.
      options:
-      - label: I have checked the troubleshooting document at https://docs.all-hands.dev/modules/usage/troubleshooting
-        required: true
      - label: I have checked the existing issues.
        required: true

  - type: textarea
    id: bug-description
    attributes:
-      label: Describe the bug
-      description: Provide a short description of the problem.
+      label: Describe the bug and reproduction steps
+      description: Provide a description of the issue along with any reproduction steps.
    validations:
      required: true

-  - type: textarea
-    id: current-version
+  - type: dropdown
+    id: installation
    attributes:
-      label: Current OpenHands version
-      description: What version of OpenHands are you using? If you're running in docker, tell us the tag you're using (e.g. ghcr.io/all-hands-ai/openhands:0.3.1).
-      render: bash
-    validations:
-      required: true
+      label: OpenHands Installation
+      description: How are you running OpenHands?
+      options:
+        - Docker command in README
+        - Development workflow
+      default: 0

-  - type: textarea
-    id: config
+  - type: input
+    id: openhands-version
    attributes:
-      label: Installation and Configuration
-      description: Please provide any commands you ran and any configuration (redacting API keys)
-      render: bash
-    validations:
-      required: true
+      label: OpenHands Version
+      description: What version of OpenHands are you using?
+      placeholder: ex. 0.9.8, main, etc.

-  - type: textarea
-    id: model-agent
-    attributes:
-      label: Model and Agent
-      description: What model and agent are you using? You can see these settings in the UI by clicking the settings wheel.
-      placeholder: |
-        - Model:
-        - Agent:
-
-  - type: textarea
-    id: os-version
+  - type: dropdown
+    id: os
    attributes:
      label: Operating System
-      description: What Operating System are you using? Linux, Mac OS, WSL on Windows
-
-  - type: textarea
-    id: repro-steps
-    attributes:
-      label: Reproduction Steps
-      description: Please list the steps to reproduce the issue.
-      placeholder: |
-        1.
-        2.
-        3.
+      options:
+        - MacOS
+        - Linux
+        - WSL on Windows

  - type: textarea
    id: additional-context
    attributes:
      label: Logs, Errors, Screenshots, and Additional Context
-      description: If you want to share the chat history you can click the thumbs-down (👎) button above the input field and you will get a shareable link (you can also click thumbs up when things are going well of course!). LLM logs will be stored in the `logs/llm/default` folder. Please add any additional context about the problem here.
+      description: Please provide any additional information you think might help. If you want to share the chat history
+        you can click the thumbs-down (👎) button above the input field and you will get a shareable link
+        (you can also click thumbs up when things are going well of course!). LLM logs will be stored in the
+        `logs/llm/default` folder. Please add any additional context about the problem here.
@@ -1,6 +1,6 @@
-**Short description of the problem this fixes or functionality that this introduces. This may be used for the CHANGELOG**
-
+**End-user friendly description of the problem this fixes or functionality that this introduces**

+- [ ] Include this change in the Release Notes. If checked, you must provide an **end-user friendly** description for your change below

 ---
 **Give a summary of what the PR does, explaining any non-trivial design decisions**
@@ -14,6 +14,11 @@ on:
    branches:
      - main

+# Runs for the same PR share one concurrency group, so a newer run cancels the older one; any other run (e.g. each commit on main) gets its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  # Build the documentation website
  build:
@@ -32,7 +37,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: '3.12'
      - name: Generate Python Docs
        run: rm -rf docs/modules/python && pip install pydoc-markdown && pydoc-markdown
      - name: Install dependencies
@@ -9,25 +9,48 @@ on:
    - main
  pull_request:

+# Runs for the same PR share one concurrency group, so a newer run cancels the older one; any other run (e.g. each commit on main) gets its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Install poetry via pipx
+        run: pipx install poetry
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
-      - name: Set up environment
-        run: |
-          curl -sSL https://install.python-poetry.org | python3 -
-          poetry install --without evaluation,llama-index
-          poetry run playwright install --with-deps chromium
-          wget https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json -P /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/1_Pooling/
+          python-version: '3.12'
+          cache: 'poetry'
+      - name: Install Python dependencies using Poetry
+        run: poetry install --without evaluation,llama-index
+      - name: Build Environment
+        run: make build
      - name: Run tests
        run: |
          set -e
-          poetry run python openhands/core/main.py -t "do a flip" -d ./workspace/ -c DummyAgent
+          SANDBOX_FORCE_REBUILD_RUNTIME=True poetry run python3 openhands/core/main.py -t "do a flip" -d ./workspace/ -c DummyAgent
      - name: Check exit code
        run: |
          if [ $? -ne 0 ]; then
@@ -12,6 +12,11 @@ on:
      - 'frontend/**'
      -  '.github/workflows/fe-unit-tests.yml'

+# Runs for the same PR share one concurrency group, so a newer run cancels the older one; any other run (e.g. each commit on main) gets its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  # Run frontend unit tests
  fe-test:
@@ -1,12 +1,6 @@
-# Workflow that builds, tests and then pushes the runtime docker images to the ghcr.io repository
+# Workflow that builds, tests and then pushes the OpenHands and runtime docker images to the ghcr.io repository
 name: Build, Test and Publish RT Image

-# Only run one workflow of the same group at a time.
-# There can be at most one running and one pending job in a concurrency group at any time.
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-
 # Always run on "main"
 # Always run on tags
 # Always run on PRs
@@ -25,7 +19,84 @@ on:
        required: true
        default: ''

+# Runs for the same PR share one concurrency group, so a newer run cancels the older one; any other run (e.g. each commit on main) gets its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST: nikolaik/python-nodejs:python3.12-nodejs22
+  RELEVANT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+
 jobs:
+  # Builds the OpenHands Docker images
+  ghcr_build_app:
+    name: Build App Image
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    outputs:
+      hash_from_app_image: ${{ steps.get_hash_in_app_image.outputs.hash_from_app_image }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3.0.0
+        with:
+          image: tonistiigi/binfmt:latest
+      - name: Login to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Build and push app image
+        if: "!github.event.pull_request.head.repo.fork"
+        run: |
+          ./containers/build.sh -i openhands -o ${{ github.repository_owner }} --push
+      - name: Build app image
+        if: "github.event.pull_request.head.repo.fork"
+        run: |
+          ./containers/build.sh -i openhands -o ${{ github.repository_owner }} --load
+      - name: Get hash in App Image
+        id: get_hash_in_app_image
+        run: |
+          # Lowercase the repository owner
+          export REPO_OWNER=${{ github.repository_owner }}
+          REPO_OWNER=$(echo $REPO_OWNER | tr '[:upper:]' '[:lower:]')
+          # Run the build script in the app image
+          docker run -e SANDBOX_USER_ID=0 -v /var/run/docker.sock:/var/run/docker.sock ghcr.io/${REPO_OWNER}/openhands:${{ env.RELEVANT_SHA }} /bin/bash -c "mkdir -p containers/runtime; python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild" 2>&1 | tee docker-outputs.txt
+          # Get the hash from the build script
+          hash_from_app_image=$(cat docker-outputs.txt | grep "Hash for docker build directory" | awk -F "): " '{print $2}' | uniq | head -n1)
+          echo "hash_from_app_image=$hash_from_app_image" >> $GITHUB_OUTPUT
+          echo "Hash from app image: $hash_from_app_image"
+      # This test should move when we have a test suite for the app image
+      - name: Test docker in App Image
+        run: |
+          # Lowercase the repository owner
+          export REPO_OWNER=${{ github.repository_owner }}
+          REPO_OWNER=$(echo $REPO_OWNER | tr '[:upper:]' '[:lower:]')
+
+          docker run -e SANDBOX_USER_ID=0 -v /var/run/docker.sock:/var/run/docker.sock ghcr.io/${REPO_OWNER}/openhands:${{ env.RELEVANT_SHA }} /bin/bash -c "docker run hello-world"
+
  # Builds the runtime Docker images
  ghcr_build_runtime:
    name: Build Image
@@ -36,7 +107,7 @@ jobs:
    strategy:
      matrix:
        base_image:
-          - image: 'nikolaik/python-nodejs:python3.11-nodejs22'
+          - image: 'nikolaik/python-nodejs:python3.12-nodejs22'
            tag: nikolaik
    steps:
      - name: Checkout
@@ -56,7 +127,9 @@ jobs:
          docker-images: false
          swap-storage: true
      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
+        uses: docker/setup-qemu-action@v3.0.0
+        with:
+          image: tonistiigi/binfmt:latest
      - name: Login to GHCR
        uses: docker/login-action@v3
        with:
@@ -69,7 +142,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: '3.12'
      - name: Cache Poetry dependencies
        uses: actions/cache@v4
        with:
@@ -88,13 +161,13 @@ jobs:
      - name: Build and push runtime image ${{ matrix.base_image.image }}
        if: github.event.pull_request.head.repo.fork != true
        run: |
-          ./containers/build.sh runtime ${{ github.repository_owner }} --push ${{ matrix.base_image.tag }}
+          ./containers/build.sh -i runtime -o ${{ github.repository_owner }} --push -t ${{ matrix.base_image.tag }}
      # Forked repos can't push to GHCR, so we need to upload the image as an artifact
      - name: Build runtime image ${{ matrix.base_image.image }} for fork
        if: github.event.pull_request.head.repo.fork
        uses: docker/build-push-action@v6
        with:
-          tags: ghcr.io/all-hands-ai/runtime:${{ github.sha }}-${{ matrix.base_image.tag }}
+          tags: ghcr.io/all-hands-ai/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image.tag }}
          outputs: type=docker,dest=/tmp/runtime-${{ matrix.base_image.tag }}.tar
          context: containers/runtime
      - name: Upload runtime image for fork
@@ -104,6 +177,56 @@ jobs:
          name: runtime-${{ matrix.base_image.tag }}
          path: /tmp/runtime-${{ matrix.base_image.tag }}.tar

+  verify_hash_equivalence_in_runtime_and_app:
+    name: Verify Hash Equivalence in Runtime and Docker images
+    runs-on: ubuntu-latest
+    needs: [ghcr_build_runtime, ghcr_build_app]
+    strategy:
+      fail-fast: false
+      matrix:
+        base_image: ['nikolaik']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.virtualenvs
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Install Python dependencies using Poetry
+        run: make install-python-dependencies
+      - name: Get hash in App Image
+        run: |
+          echo "Hash from app image: ${{ needs.ghcr_build_app.outputs.hash_from_app_image }}"
+          echo "hash_from_app_image=${{ needs.ghcr_build_app.outputs.hash_from_app_image }}" >> $GITHUB_ENV
+
+      - name: Get hash using code (development mode)
+        run: |
+          mkdir -p containers/runtime
+          poetry run python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild > output.txt 2>&1
+          hash_from_code=$(cat output.txt | grep "Hash for docker build directory" | awk -F "): " '{print $2}' | uniq | head -n1)
+          echo "hash_from_code=$hash_from_code" >> $GITHUB_ENV
+
+      - name: Compare hashes
+        run: |
+          echo "Hash from App Image: ${{ env.hash_from_app_image }}"
+          echo "Hash from Code: ${{ env.hash_from_code }}"
+          if [ "${{ env.hash_from_app_image }}" = "${{ env.hash_from_code }}" ]; then
+            echo "Hashes match!"
+          else
+            echo "Hashes do not match!"
+            exit 1
+          fi
+
  # Run unit tests with the EventStream runtime Docker images as root
  test_runtime_root:
    name: RT Unit Tests (Root)
@@ -115,6 +238,23 @@ jobs:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
@@ -138,7 +278,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: '3.12'
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Install Python dependencies using Poetry
@@ -151,7 +291,7 @@ jobs:
          # Install to be able to retry on failures for flaky tests
          poetry run pip install pytest-rerunfailures

-          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
+          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')

          SKIP_CONTAINER_LOGS=true \
@@ -160,7 +300,7 @@ jobs:
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=false \
-          poetry run pytest -n 3 -raR --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -176,6 +316,23 @@ jobs:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
@@ -199,7 +356,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: '3.12'
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Install Python dependencies using Poetry
@@ -212,7 +369,7 @@ jobs:
          # Install to be able to retry on failures for flaky tests
          poetry run pip install pytest-rerunfailures

-          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
+          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')

          SKIP_CONTAINER_LOGS=true \
@@ -221,7 +378,7 @@ jobs:
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=true \
-          poetry run pytest -n 3 -raR --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -238,6 +395,23 @@ jobs:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
@@ -261,14 +435,14 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: '3.12'
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
      - name: Run integration tests
        run: |
-          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
+          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')

          TEST_RUNTIME=eventstream \
@@ -290,7 +464,7 @@ jobs:
    name: All Runtime Tests Passed
    if: ${{ !cancelled() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
+    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: All tests passed
        run: echo "All runtime tests have passed successfully!"
@@ -299,7 +473,7 @@ jobs:
    name: All Runtime Tests Passed
    if: ${{ cancelled() || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
    runs-on: ubuntu-latest
-    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
+    needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: Some tests failed
        run: |
@@ -1,65 +0,0 @@
-# Workflow that builds, tests and then pushes the app docker images to the ghcr.io repository
-name: Build and Publish App Image
-
-# Always run on "main"
-# Always run on tags
-# Always run on PRs
-# Can also be triggered manually
-on:
-  push:
-    branches:
-      - main
-    tags:
-      - '*'
-  pull_request:
-  workflow_dispatch:
-    inputs:
-      reason:
-        description: 'Reason for manual trigger'
-        required: true
-        default: ''
-
-jobs:
-  # Builds the OpenHands Docker images
-  ghcr_build:
-    name: Build App Image
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: true
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: false
-          swap-storage: true
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-      - name: Login to GHCR
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build and push app image
-        if: "!github.event.pull_request.head.repo.fork"
-        run: |
-          ./containers/build.sh openhands ${{ github.repository_owner }} --push
-      - name: Build app image
-        if: "github.event.pull_request.head.repo.fork"
-        run: |
-          ./containers/build.sh openhands image ${{ github.repository_owner }}
@@ -10,6 +10,11 @@ on:
    - main
  pull_request:

+# Runs for the same PR share one concurrency group, so a newer run cancels the older one; any other run (e.g. each commit on main) gets its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  # Run lint on the frontend code
  lint-frontend:
@@ -41,9 +46,9 @@ jobs:
      - name: Set up python
        uses: actions/setup-python@v5
        with:
-          python-version: 3.11
+          python-version: 3.12
          cache: 'pip'
      - name: Install pre-commit
        run: pip install pre-commit==3.7.0
      - name: Run pre-commit hooks
-        run: pre-commit run --files openhands/**/* agenthub/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
+        run: pre-commit run --files openhands/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
@@ -0,0 +1,13 @@
+name: Resolve Issues with OpenHands
+
+on:
+  issues:
+    types: [labeled]
+
+jobs:
+  call-openhands-resolver:
+    uses: All-Hands-AI/openhands-resolver/.github/workflows/openhands-resolver.yml@main
+    if: github.event.label.name == 'fix-me'
+    with:
+      issue_number: ${{ github.event.issue.number }}
+    secrets: inherit
@@ -10,6 +10,11 @@ on:
      - main
  pull_request:

+# Runs for the same PR share one concurrency group, so a newer run cancels the older one; any other run (e.g. each commit on main) gets its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  # Run python unit tests on macOS
  test-on-macos:
@@ -19,7 +24,7 @@ jobs:
      INSTALL_DOCKER: '1' # Set to '0' to skip Docker installation
    strategy:
      matrix:
-        python-version: ['3.11']
+        python-version: ['3.12']
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
@@ -89,8 +94,11 @@ jobs:
          sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
      - name: Build Environment
        run: make build
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=openhands --cov-report=xml ./tests/unit
+        run: poetry run pytest --forked --cov=openhands --cov-report=xml ./tests/unit --ignore=tests/unit/test_memory.py
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -104,9 +112,12 @@ jobs:
      INSTALL_DOCKER: '0' # Set to '0' to skip Docker installation
    strategy:
      matrix:
-        python-version: ['3.11']
+        python-version: ['3.12']
    steps:
      - uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Set up Python
@@ -119,7 +130,7 @@ jobs:
      - name: Build Environment
        run: make build
      - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=openhands --cov-report=xml -svv ./tests/unit
+        run: poetry run pytest --forked --cov=openhands --cov-report=xml -svv ./tests/unit --ignore=tests/unit/test_memory.py
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -17,7 +17,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
-          python-version: 3.11
+          python-version: 3.12
      - name: Install Poetry
        uses: snok/install-poetry@v1.4.1
        with:
@@ -26,6 +26,6 @@ jobs:
      - name: Install Poetry Dependencies
        run: poetry install --no-interaction --no-root
      - name: Build poetry project
-        run: poetry build -v
+        run: ./build.sh
      - name: publish
        run: poetry publish -u __token__ -p ${{ secrets.PYPI_TOKEN }}
@@ -29,10 +29,13 @@ jobs:
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
+    - name: Set up Docker Buildx
+      id: buildx
+      uses: docker/setup-buildx-action@v3
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
-        python-version: "3.11"
+        python-version: "3.12"
    - name: Cache Poetry dependencies
      uses: actions/cache@v4
      with:
@@ -52,7 +55,7 @@ jobs:
      run: |
        DEBUG=${{ inputs.debug }} \
        LOG_TO_FILE=${{ inputs.log_to_file }} \
-        FORCE_REGENERATE_TESTS=${{ inputs.force_regenerate_tests }} \
+        FORCE_REGENERATE=${{ inputs.force_regenerate_tests }} \
        FORCE_USE_LLM=${{ inputs.force_use_llm }} \
        ./tests/integration/regenerate.sh
    - name: Commit changes
@@ -15,10 +15,13 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
+    - name: Set up Docker Buildx
+      id: buildx
+      uses: docker/setup-buildx-action@v3
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
-        python-version: '3.11'
+        python-version: '3.12'
    - name: install git, github cli
      run: |
        sudo apt-get install -y git gh
@@ -1,113 +0,0 @@
-# Workflow that uses OpenHands to resolve a GitHub issue. Issue must be labeled 'solve-this'
-name: Use OpenHands to Resolve GitHub Issue
-
-on:
-  issues:
-    types: [labeled]
-
-permissions:
-  contents: write
-  pull-requests: write
-  issues: write
-
-jobs:
-  dogfood:
-    if: github.event.label.name == 'solve-this'
-    runs-on: ubuntu-latest
-    container:
-      image: ghcr.io/all-hands-ai/openhands
-      volumes:
-        - /var/run/docker.sock:/var/run/docker.sock
-    steps:
-    - name: install git, github cli
-      run: apt-get install -y git gh
-    - name: Checkout Repository
-      uses: actions/checkout@v4
-    - name: Write Task File
-      env:
-        ISSUE_TITLE: ${{ github.event.issue.title }}
-        ISSUE_BODY: ${{ github.event.issue.body }}
-      run: |
-        echo "TITLE:" > task.txt
-        echo "${ISSUE_TITLE}" >> task.txt
-        echo "" >> task.txt
-        echo "BODY:" >> task.txt
-        echo "${ISSUE_BODY}" >> task.txt
-    - name: Set up environment
-      run: |
-        curl -sSL https://install.python-poetry.org | python3 -
-        export PATH="/github/home/.local/bin:$PATH"
-        poetry install --without evaluation,llama-index
-        poetry run playwright install --with-deps chromium
-    - name: Run OpenHands
-      env:
-        ISSUE_TITLE: ${{ github.event.issue.title }}
-        ISSUE_BODY: ${{ github.event.issue.body }}
-        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-      run: |
-        # Append path to launch poetry
-        export PATH="/github/home/.local/bin:$PATH"
-        # Append path to correctly import package, note: must set pwd at first
-        export PYTHONPATH=$(pwd):$PYTHONPATH
-        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./openhands/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
-        rm task.txt
-    - name: Setup Git, Create Branch, and Commit Changes
-      run: |
-        # Setup Git configuration
-        git config --global --add safe.directory $PWD
-        git config --global user.name 'OpenHands'
-        git config --global user.email 'OpenHands@users.noreply.github.com'
-
-        # Create a unique branch name with a timestamp
-        BRANCH_NAME="fix/${{ github.event.issue.number }}-$(date +%Y%m%d%H%M%S)"
-
-        # Checkout new branch
-        git checkout -b $BRANCH_NAME
-
-        # Add all changes to staging, except task.txt
-        git add --all -- ':!task.txt'
-
-        # Commit the changes, if any
-        git commit -m "OpenHands: Resolve Issue #${{ github.event.issue.number }}"
-        if [ $? -ne 0 ]; then
-          echo "No changes to commit."
-          exit 0
-        fi
-
-        # Push changes
-        git push --set-upstream origin $BRANCH_NAME
-    - name: Fetch Default Branch
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        # Fetch the default branch using gh cli
-        DEFAULT_BRANCH=$(gh repo view --json defaultBranchRef --jq .defaultBranchRef.name)
-        echo "Default branch is $DEFAULT_BRANCH"
-        echo "DEFAULT_BRANCH=$DEFAULT_BRANCH" >> $GITHUB_ENV
-    - name: Generate PR
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        # Create PR and capture URL
-        PR_URL=$(gh pr create \
-          --title "OpenHands: Resolve Issue #2" \
-          --body "This PR was generated by OpenHands to resolve issue #2" \
-          --repo "foragerr/OpenHands" \
-          --head "${{ github.head_ref }}" \
-          --base "${{ env.DEFAULT_BRANCH }}" \
-          | grep -o 'https://github.com/[^ ]*')
-
-        # Extract PR number from URL
-        PR_NUMBER=$(echo "$PR_URL" | grep -o '[0-9]\+$')
-
-        # Set environment vars
-        echo "PR_URL=$PR_URL" >> $GITHUB_ENV
-        echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV
-
-    - name: Post Comment
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        gh issue comment ${{ github.event.issue.number }} \
-          -b "OpenHands raised [PR #${{ env.PR_NUMBER }}](${{ env.PR_URL }}) to resolve this issue."
@@ -121,6 +121,7 @@ celerybeat.pid

 # Environments
 .env
+frontend/.env
 .venv
 env/
 venv/
@@ -217,8 +218,6 @@ config.toml
 config.toml_
 config.toml.bak

-containers/agnostic_sandbox
-
 # swe-bench-eval
 image_build_logs
 run_instance_logs
@@ -0,0 +1,28 @@
+OpenHands is an automated AI software engineer. It is a repo with a Python backend
+(in the `openhands` directory) and TypeScript frontend (in the `frontend` directory).
+
+General Setup:
+- To set up the entire repo, including frontend and backend, run `make build`
+- To run linting and type-checking before finishing the job, run `poetry run pre-commit run --all-files --config ./dev_config/python/.pre-commit-config.yaml`
+
+Backend:
+- Located in the `openhands` directory
+- Testing:
+  - All tests are in `tests/unit/test_*.py`
+  - To test new code, run `poetry run pytest tests/unit/test_xxx.py` where `xxx` is the appropriate file for the current functionality
+  - Write all tests with pytest
+
+Frontend:
+- Located in the `frontend` directory
+- Prerequisites: A recent version of NodeJS / NPM
+- Setup: Run `npm install` in the frontend directory
+- Testing:
+  - Run tests: `npm run test`
+  - To run specific tests: `npm run test -- -t "TestName"`
+- Building:
+  - Build for production: `npm run build`
+- Environment Variables:
+  - Set in `frontend/.env` or as environment variables
+  - Available variables: VITE_BACKEND_HOST, VITE_USE_TLS, VITE_INSECURE_SKIP_VERIFY, VITE_FRONTEND_PORT
+- Internationalization:
+  - Generate i18n declaration file: `npm run make-i18n`
@@ -8,16 +8,17 @@ There are many ways that you can contribute:

 1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
 2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
-3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issue](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) issues that may be ones to start on.
+3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that may be ones to start on.

 ## Understanding OpenHands's CodeBase

 To understand the codebase, please refer to the README in each module:
 - [frontend](./frontend/README.md)
- [agenthub](./agenthub/README.md)
 - [evaluation](./evaluation/README.md)
 - [openhands](./openhands/README.md)
-    - [server](./openhands/server/README.md)
+   - [agenthub](./openhands/agenthub/README.md)
+   - [server](./openhands/server/README.md)
+

 When you write code, it is also good to write tests. Please navigate to the `tests` folder to see existing test suites.
 At the moment, we have two kinds of tests: `unit` and `integration`. Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.
@@ -2,7 +2,7 @@

 ## Contributors

-We would like to thank all the [contributors](https://github.com/All-Hands-AI/OpenHands/graphs/contributors) who have helped make OpenHands possible. Your dedication and hard work are greatly appreciated.
+We would like to thank all the [contributors](https://github.com/All-Hands-AI/OpenHands/graphs/contributors) who have helped make OpenHands possible. We greatly appreciate your dedication and hard work.

 ## Open Source Projects

@@ -10,7 +10,7 @@ OpenHands includes and adapts the following open source projects. We are gratefu

 #### [SWE Agent](https://github.com/princeton-nlp/swe-agent)
   - License: MIT License
-   - Description: Adapted for use in OpenHands's agenthub
+   - Description: Adapted for use in OpenHands's agent hub

 #### [Aider](https://github.com/paul-gauthier/aider)
   - License: Apache License 2.0
@@ -7,7 +7,7 @@ Otherwise, you can clone the OpenHands project directly.
 ### 1. Requirements
 * Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install)  [ Ubuntu <= 22.04]
 * [Docker](https://docs.docker.com/engine/install/) (For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
-* [Python](https://www.python.org/downloads/) = 3.11
+* [Python](https://www.python.org/downloads/) = 3.12
 * [NodeJS](https://nodejs.org/en/download/package-manager) >= 18.17.1
 * [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8
 * netcat => sudo apt-get install netcat
@@ -22,8 +22,8 @@ If you want to develop without system admin/sudo access to upgrade/install `Pyth
 curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
 bash Miniforge3-$(uname)-$(uname -m).sh

-# Install Python 3.11, nodejs, and poetry
-mamba install python=3.11
+# Install Python 3.12, nodejs, and poetry
+mamba install python=3.12
 mamba install conda-forge::nodejs
 mamba install conda-forge::poetry
 ```
@@ -98,6 +98,11 @@ Please refer to [this README](./tests/integration/README.md) for details.
 1. Add your dependency in `pyproject.toml` or use `poetry add xxx`
 2. Update the poetry.lock file via `poetry lock --no-update`

+### 9. Use existing Docker image
+To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image. Follow these steps:
+1. Set the `SANDBOX_RUNTIME_CONTAINER_IMAGE` environment variable to the desired Docker image.
+2. For example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik`
+
 ## Develop inside Docker container

 TL;DR
@@ -10,7 +10,7 @@ DEFAULT_WORKSPACE_DIR = "./workspace"
 DEFAULT_MODEL = "gpt-4o"
 CONFIG_FILE = config.toml
 PRE_COMMIT_CONFIG_PATH = "./dev_config/python/.pre-commit-config.yaml"
-PYTHON_VERSION = 3.11
+PYTHON_VERSION = 3.12

 # ANSI color codes
 GREEN=$(shell tput -Txterm setaf 2)
@@ -190,12 +190,12 @@ build-frontend:
 # Start backend
 start-backend:
 	@echo "$(YELLOW)Starting backend...$(RESET)"
-	@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) --reload --reload-exclude "workspace/*"
+	@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) --reload --reload-exclude "$(shell pwd)/workspace"

 # Start frontend
 start-frontend:
 	@echo "$(YELLOW)Starting frontend...$(RESET)"
-	@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST_PORT) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run start
+	@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST_PORT) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run start -- --port $(FRONTEND_PORT)

 # Common setup for running the app (non-callable)
 _run_setup:
@@ -36,12 +36,14 @@ Learn more at [docs.all-hands.dev](https://docs.all-hands.dev), or jump to the [
 The easiest way to run OpenHands is in Docker. You can change `WORKSPACE_BASE` below to
 point OpenHands to existing code that you'd like to modify.

-See the [Getting Started](https://docs.all-hands.dev/modules/usage/getting-started) guide for
+See the [Installation](https://docs.all-hands.dev/modules/usage/installation) guide for
 system requirements and more information.

 ```bash
 export WORKSPACE_BASE=$(pwd)/workspace

+docker pull ghcr.io/all-hands-ai/runtime:0.9-nikolaik
+
 docker run -it --pull=always \
    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
@@ -56,10 +58,14 @@ docker run -it --pull=always \

 You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!

+You'll need a model provider and API key. One option that works well: [Claude 3.5 Sonnet](https://www.anthropic.com/api), but you have [many options](https://docs.all-hands.dev/modules/usage/llms).
+
+---
+
 You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
 or as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode).

-Visit [Getting Started](https://docs.all-hands.dev/modules/usage/getting-started) for more information and setup instructions.
+Visit [Installation](https://docs.all-hands.dev/modules/usage/installation) for more information and setup instructions.

 If you want to modify the OpenHands source code, check out [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).

@@ -114,8 +120,8 @@ For a list of open source projects and licenses used in OpenHands, please see ou
 ## 📚 Cite

 ```
-@misc{opendevin,
-      title={{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}},
+@misc{openhands,
+      title={{OpenHands: An Open Platform for AI Software Developers as Generalist Agents}},
      author={Xingyao Wang and Boxuan Li and Yufan Song and Frank F. Xu and Xiangru Tang and Mingchen Zhuge and Jiayi Pan and Yueqi Song and Bowen Li and Jaskirat Singh and Hoang H. Tran and Fuqiang Li and Ren Ma and Mingzhang Zheng and Bill Qian and Yanjun Shao and Niklas Muennighoff and Yizhe Zhang and Binyuan Hui and Junyang Lin and Robert Brennan and Hao Peng and Heng Ji and Graham Neubig},
      year={2024},
      eprint={2407.16741},
@@ -1,88 +0,0 @@
-import ast
-
-from openhands.controller.action_parser import ActionParser, ResponseParser
-from openhands.core.logger import openhands_logger as logger
-from openhands.events.action import (
-    Action,
-    BrowseInteractiveAction,
-)
-
-
-class BrowsingResponseParser(ResponseParser):
-    def __init__(self):
-        # Need to pay attention to the item order in self.action_parsers
-        super().__init__()
-        self.action_parsers = [BrowsingActionParserMessage()]
-        self.default_parser = BrowsingActionParserBrowseInteractive()
-
-    def parse(self, response: str) -> Action:
-        action_str = self.parse_response(response)
-        return self.parse_action(action_str)
-
-    def parse_response(self, response) -> str:
-        action_str = response['choices'][0]['message']['content']
-        if action_str is None:
-            return ''
-        action_str = action_str.strip()
-        if action_str and not action_str.endswith('```'):
-            action_str = action_str + ')```'
-        logger.debug(action_str)
-        return action_str
-
-    def parse_action(self, action_str: str) -> Action:
-        for action_parser in self.action_parsers:
-            if action_parser.check_condition(action_str):
-                return action_parser.parse(action_str)
-        return self.default_parser.parse(action_str)
-
-
-class BrowsingActionParserMessage(ActionParser):
-    """Parser action:
-    - BrowseInteractiveAction(browser_actions) - unexpected response format, message back to user
-    """
-
-    def __init__(
-        self,
-    ):
-        pass
-
-    def check_condition(self, action_str: str) -> bool:
-        return '```' not in action_str
-
-    def parse(self, action_str: str) -> Action:
-        msg = f'send_msg_to_user("""{action_str}""")'
-        return BrowseInteractiveAction(
-            browser_actions=msg,
-            thought=action_str,
-            browsergym_send_msg_to_user=action_str,
-        )
-
-
-class BrowsingActionParserBrowseInteractive(ActionParser):
-    """Parser action:
-    - BrowseInteractiveAction(browser_actions) - handle send message to user function call in BrowserGym
-    """
-
-    def __init__(
-        self,
-    ):
-        pass
-
-    def check_condition(self, action_str: str) -> bool:
-        return True
-
-    def parse(self, action_str: str) -> Action:
-        thought = action_str.split('```')[0].strip()
-        action_str = action_str.split('```')[1].strip()
-        msg_content = ''
-        for sub_action in action_str.split('\n'):
-            if 'send_msg_to_user(' in sub_action:
-                tree = ast.parse(sub_action)
-                args = tree.body[0].value.args  # type: ignore
-                msg_content = args[0].value
-
-        return BrowseInteractiveAction(
-            browser_actions=action_str,
-            thought=thought,
-            browsergym_send_msg_to_user=msg_content,
-        )
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -e  # abort on the first failing command
+
+cp pyproject.toml poetry.lock openhands  # copy project metadata into openhands/ before building (presumably so it ships inside the package -- confirm)
+poetry build -v  # build the distribution with verbose output
@@ -13,6 +13,10 @@
 # API key for E2B
 #e2b_api_key = ""

+# API key for Modal
+#modal_api_token_id = ""
+#modal_api_token_secret = ""
+
 # Base path for the workspace
 workspace_base = "./workspace"

@@ -28,6 +32,9 @@ workspace_base = "./workspace"
 # Enable saving and restoring the session when run from CLI
 #enable_cli_session = false

+# Path to store trajectories
+#trajectories_path = "./trajectories"
+
 # File store path
 #file_store_path = "/tmp/file_store"

@@ -112,7 +119,7 @@ api_key = "your-api-key"
 #embedding_deployment_name = ""

 # Embedding model to use
-embedding_model = ""
+embedding_model = "local"

 # Maximum number of characters in an observation's content
 #max_message_chars = 10000
@@ -146,8 +153,8 @@ model = "gpt-4o"
 # Drop any unmapped (unsupported) params without causing an exception
 #drop_params = false

-# Using the prompt caching feature provided by the LLM
-#caching_prompt = false
+# Use the prompt caching feature when the LLM provider offers and supports it
+#caching_prompt = true

 # Base URL for the OLLAMA API
 #ollama_base_url = ""
@@ -185,10 +192,10 @@ model = "gpt-4o-mini"
 #memory_enabled = false

 # Memory maximum threads
-#memory_max_threads = 2
+#memory_max_threads = 3

 # LLM config group to use
-#llm_config = 'llm'
+#llm_config = 'your-llm-config-group'

 [agent.RepoExplorerAgent]
 # Example: use a cheaper model for RepoExplorerAgent to reduce cost, especially
@@ -206,7 +213,7 @@ llm_config = 'gpt3'
 #user_id = 1000

 # Container image to use for the sandbox
-#base_container_image = "nikolaik/python-nodejs:python3.11-nodejs22"
+#base_container_image = "nikolaik/python-nodejs:python3.12-nodejs22"

 # Use host network
 #use_host_network = false
@@ -232,7 +239,7 @@ llm_config = 'gpt3'
 [security]

 # Enable confirmation mode
-#confirmation_mode = true
+#confirmation_mode = false

 # The security analyzer to use
 #security_analyzer = ""
@@ -28,7 +28,7 @@ COPY ./pyproject.toml ./poetry.lock ./
 RUN touch README.md
 RUN export POETRY_CACHE_DIR && poetry install --without evaluation,llama-index --no-root && rm -rf $POETRY_CACHE_DIR

-FROM python:3.12.3-slim AS runtime
+FROM python:3.12.3-slim AS openhands-app

 WORKDIR /app

@@ -37,7 +37,7 @@ ARG OPENHANDS_BUILD_VERSION #re-declare for this section
 ENV RUN_AS_OPENHANDS=true
 # A random number--we need this to be different from the user's UID on the host machine
 ENV OPENHANDS_USER_ID=42420
-ENV SANDBOX_API_HOSTNAME=host.docker.internal
+ENV SANDBOX_LOCAL_RUNTIME_URL=http://host.docker.internal
 ENV USE_HOST_NETWORK=false
 ENV WORKSPACE_BASE=/opt/workspace_base
 ENV OPENHANDS_BUILD_VERSION=$OPENHANDS_BUILD_VERSION
@@ -46,6 +46,14 @@ RUN mkdir -p $WORKSPACE_BASE
 RUN apt-get update -y \
    && apt-get install -y curl ssh sudo

+# Install Docker non-interactively (build runs as root, so no sudo) - https://docs.docker.com/engine/install/debian/
+RUN apt-get install -y ca-certificates curl \
+    && curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc \
+    && chmod a+r /etc/apt/keyrings/docker.asc \
+    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian bookworm stable" > /etc/apt/sources.list.d/docker.list \
+    && apt-get update \
+    && apt-get install -y docker-ce
+
 # Default is 1000, but OSX is often 501
 RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs
 # Default is 60000, but we've seen up to 200000
@@ -69,11 +77,12 @@ RUN playwright install --with-deps chromium

 COPY --chown=openhands:app --chmod=770 ./openhands ./openhands
 COPY --chown=openhands:app --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
-COPY --chown=openhands:app --chmod=770 ./agenthub ./agenthub
-COPY --chown=openhands:app --chmod=770 ./pyproject.toml ./pyproject.toml
-COPY --chown=openhands:app --chmod=770 ./poetry.lock ./poetry.lock
-COPY --chown=openhands:app --chmod=770 ./README.md ./README.md
-COPY --chown=openhands:app --chmod=770 ./MANIFEST.in ./MANIFEST.in
+COPY --chown=openhands:app --chmod=770 ./openhands/agenthub ./openhands/agenthub
+COPY --chown=openhands:app ./pyproject.toml ./pyproject.toml
+COPY --chown=openhands:app ./poetry.lock ./poetry.lock
+COPY --chown=openhands:app ./README.md ./README.md
+COPY --chown=openhands:app ./MANIFEST.in ./MANIFEST.in
+COPY --chown=openhands:app ./LICENSE ./LICENSE

 # This is run as "openhands" user, and will create __pycache__ with openhands:openhands ownership
 RUN python openhands/core/download.py # No-op to download assets
@@ -81,7 +90,7 @@ RUN python openhands/core/download.py # No-op to download assets
 # openhands:openhands -> openhands:app
 RUN find /app \! -group app -exec chgrp app {} +

-COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/dist ./frontend/dist
+COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/build/client ./frontend/build
 COPY --chown=openhands:app --chmod=770 ./containers/app/entrypoint.sh /app/entrypoint.sh

 USER root
@@ -1,13 +1,40 @@
 #!/bin/bash
 set -eo pipefail

-image_name=$1
-org_name=$2
+# Initialize variables with default values
+image_name=""
+org_name=""
 push=0
-if [[ $3 == "--push" ]]; then
-  push=1
+load=0
+tag_suffix=""
+
+# Function to display usage information
+usage() {
+    echo "Usage: $0 -i <image_name> [-o <org_name>] [--push] [--load] [-t <tag_suffix>]"
+    echo "  -i: Image name (required)"
+    echo "  -o: Organization name"
+    echo "  --push: Push the image"
+    echo "  --load: Load the image"
+    echo "  -t: Tag suffix"
+    exit 1
+}
+
+# Parse command-line options; a value-taking flag given without its value prints usage
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -i) [[ $# -ge 2 ]] || usage; image_name="$2"; shift 2 ;;
+        -o) [[ $# -ge 2 ]] || usage; org_name="$2"; shift 2 ;;
+        --push) push=1; shift ;;
+        --load) load=1; shift ;;
+        -t) [[ $# -ge 2 ]] || usage; tag_suffix="$2"; shift 2 ;;
+        *) usage ;;
+    esac
+done
+# Check if required arguments are provided
+if [[ -z "$image_name" ]]; then
+    echo "Error: Image name is required."
+    usage
 fi
-tag_suffix=$4

 echo "Building: $image_name"
 tags=()
@@ -17,10 +44,10 @@ OPENHANDS_BUILD_VERSION="dev"
 cache_tag_base="buildcache"
 cache_tag="$cache_tag_base"

-if [[ -n $GITHUB_SHA ]]; then
-  git_hash=$(git rev-parse --short "$GITHUB_SHA")
+if [[ -n $RELEVANT_SHA ]]; then
+  git_hash=$(git rev-parse --short "$RELEVANT_SHA")
  tags+=("$git_hash")
-  tags+=("$GITHUB_SHA")
+  tags+=("$RELEVANT_SHA")
 fi

 if [[ -n $GITHUB_REF_NAME ]]; then
@@ -95,14 +122,35 @@ if [[ $push -eq 1 ]]; then
  args+=" --cache-to=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag,mode=max"
 fi

+if [[ $load -eq 1 ]]; then
+  args+=" --load"
+fi
+
 echo "Args: $args"

+# Modify the platform selection based on --load flag
+if [[ $load -eq 1 ]]; then
+  # When loading, build only for the current platform
+  platform=$(docker version -f '{{.Server.Os}}/{{.Server.Arch}}')
+else
+  # For push or without load, build for multiple platforms
+  platform="linux/amd64,linux/arm64"
+fi
+
+echo "Building for platform(s): $platform"
+
 docker buildx build \
  $args \
  --build-arg OPENHANDS_BUILD_VERSION="$OPENHANDS_BUILD_VERSION" \
  --cache-from=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag \
  --cache-from=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag_base-main \
-  --platform linux/amd64,linux/arm64 \
+  --platform $platform \
  --provenance=false \
  -f "$dir/Dockerfile" \
  "$DOCKER_BASE_DIR"
+
+# If load was requested, print the loaded images
+if [[ $load -eq 1 ]]; then
+  echo "Local images built:"
+  docker images "$DOCKER_REPOSITORY" --format "{{.Repository}}:{{.Tag}}"
+fi
@@ -55,18 +55,18 @@ RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | d
  && apt-get clean \
  && apt-get autoremove -y

-# Python 3.11
+# Python 3.12
 RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
-    && apt-get install -y python3.11 python3.11-venv python3.11-dev python3-pip \
-    && ln -s /usr/bin/python3.11 /usr/bin/python
+    && apt-get install -y python3.12 python3.12-venv python3.12-dev python3-pip \
+    && ln -s /usr/bin/python3.12 /usr/bin/python

 # NodeJS >= 18.17.1
 RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
    && apt-get install -y nodejs

 # Poetry >= 1.8
-RUN curl -fsSL https://install.python-poetry.org | python3.11 - \
+RUN curl -fsSL https://install.python-poetry.org | python3.12 - \
    && ln -s ~/.local/bin/poetry /usr/local/bin/poetry

 #
@@ -3,10 +3,10 @@
 This folder builds a runtime image (sandbox), which will use a dynamically generated `Dockerfile`
 that depends on the `base_image` **AND** a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that is based on the current commit of `openhands`.

-The following command will generate a `Dockerfile` file for `nikolaik/python-nodejs:python3.11-nodejs22` (the default base image), an updated `config.sh` and the runtime source distribution files/folders into `containers/runtime`:
+The following command will generate a `Dockerfile` file for `nikolaik/python-nodejs:python3.12-nodejs22` (the default base image), an updated `config.sh` and the runtime source distribution files/folders into `containers/runtime`:

 ```bash
 poetry run python3 openhands/runtime/utils/runtime_build.py \
-    --base_image nikolaik/python-nodejs:python3.11-nodejs22 \
+    --base_image nikolaik/python-nodejs:python3.12-nodejs22 \
    --build_folder containers/runtime
 ```
@@ -1,44 +0,0 @@
-FROM ubuntu:22.04
-
-# install basic packages
-RUN apt-get update && apt-get install -y \
-    curl \
-    wget \
-    git \
-    vim \
-    nano \
-    unzip \
-    zip \
-    python3 \
-    python3-pip \
-    python3-venv \
-    python3-dev \
-    build-essential \
-    openssh-server \
-    sudo \
-    gcc \
-    jq \
-    g++ \
-    make \
-    iproute2 \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN mkdir -p -m0755 /var/run/sshd
-
-# symlink python3 to python
-RUN ln -s /usr/bin/python3 /usr/bin/python
-
-# ==== OpenHands Runtime Client ====
-RUN mkdir -p /openhands && mkdir -p /openhands/logs && chmod 777 /openhands/logs
-RUN wget --progress=bar:force -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
-RUN bash Miniforge3.sh -b -p /openhands/miniforge3
-RUN chmod -R g+w /openhands/miniforge3
-RUN bash -c ". /openhands/miniforge3/etc/profile.d/conda.sh && conda config --set changeps1 False && conda config --append channels conda-forge"
-RUN echo "" > /openhands/bash.bashrc
-RUN rm -f Miniforge3.sh
-
-# - agentskills dependencies
-RUN /openhands/miniforge3/bin/pip install --upgrade pip
-RUN /openhands/miniforge3/bin/pip install jupyterlab notebook jupyter_kernel_gateway flake8
-RUN /openhands/miniforge3/bin/pip install python-docx PyPDF2 python-pptx pylatexenc openai
-RUN /openhands/miniforge3/bin/pip install python-dotenv toml termcolor pydantic python-docx pyyaml docker pexpect tenacity e2b browsergym minio
@@ -1,4 +0,0 @@
-DOCKER_REGISTRY=ghcr.io
-DOCKER_ORG=all-hands-ai
-DOCKER_IMAGE=sandbox
-DOCKER_BASE_DIR="."
@@ -38,6 +38,6 @@ repos:
      - id: mypy
        additional_dependencies:
          [types-requests, types-setuptools, types-pyyaml, types-toml]
-        entry: mypy --config-file dev_config/python/mypy.ini openhands/ agenthub/
+        entry: mypy --config-file dev_config/python/mypy.ini openhands/
        always_run: true
        pass_filenames: false
@@ -1,7 +1,3 @@
---
-sidebar_position: 8
---
-
 # 📚 Misc

 ## ⭐️ Research Strategy
@@ -1,7 +1,3 @@
---
-sidebar_position: 3
---
-
 # 🧠 Main Agent and Capabilities

 ## CodeActAgent
@@ -1,7 +1,3 @@
---
-sidebar_position: 7
---
-
 # 🏛️ System Architecture

 <div style={{ textAlign: 'center' }}>
@@ -1,7 +1,3 @@
---
-sidebar_position: 5
---
-
 # ✅ Providing Feedback

 When using OpenHands, you will encounter cases where things work well, and others where they don't. We encourage you to provide feedback whenever you use OpenHands: it helps the development team and, perhaps more importantly, builds an open corpus of coding agent training examples -- Share-OpenHands!
@@ -1,66 +1,111 @@
---
-sidebar_position: 2
---
+# Getting Started with OpenHands

-# Getting Started
+So you've [installed OpenHands](./installation) and have
+[set up your LLM](./installation#setup). Now what?

-## System Requirements
+OpenHands can help you tackle a wide variety of engineering tasks. But the technology
+is still new, and we're a long way off from having agents that can take on large, complicated
+engineering tasks without any guidance. So it's important to get a feel for what the agent
+does well, and where it might need some help.

-* Docker version 26.0.0+ or Docker Desktop 4.31.0+
-* You must be using Linux or Mac OS
-  * If you are on Windows, you must use [WSL](https://learn.microsoft.com/en-us/windows/wsl/install)
+## Hello World

-## Installation
+The first thing you might want to try is a simple "hello world" example.
+This can be more complicated than it sounds!

-The easiest way to run OpenHands is in Docker. You can change `WORKSPACE_BASE` below to point OpenHands to
-existing code that you'd like to modify.
+Try prompting the agent with:
+> Please write a bash script hello.sh that prints "hello world!"

-```bash
-export WORKSPACE_BASE=$(pwd)/workspace
+You should see that the agent not only writes the script, it sets the correct
+permissions and runs the script to check the output.

-docker run -it --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
-    -e SANDBOX_USER_ID=$(id -u) \
-    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-    -v $WORKSPACE_BASE:/opt/workspace_base \
-    -v /var/run/docker.sock:/var/run/docker.sock \
-    -p 3000:3000 \
-    --add-host host.docker.internal:host-gateway \
-    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9
-```
+You can continue prompting the agent to refine your code. This is a great way to
+work with agents. Start simple, and iterate.

-You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
-or as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode).
+> Please modify hello.sh so that it accepts a name as the first argument, but defaults to "world"

-## Setup
+You can also work in any language you need, though the agent might need to spend some
+time setting up its environment!

-After running the command above, you'll find OpenHands running at [http://localhost:3000](http://localhost:3000).
+> Please convert hello.sh to a Ruby script, and run it

-The agent will have access to the `./workspace` folder to do its work. You can copy existing code here, or change `WORKSPACE_BASE` in the
-command to point to an existing folder.
+## Building From Scratch

-Upon launching OpenHands, you'll see a settings modal. You **must** select an `LLM Provider` and `LLM Model` and enter a corresponding `API Key`.
-These can be changed at any time by selecting the `Settings` button (gear icon) in the UI.
+Agents do exceptionally well at "greenfield" tasks: tasks where they don't need
+any context about an existing codebase and can just start from scratch.

-If the required `LLM Model` does not exist in the list, you can toggle `Advanced Options` and manually enter it with the correct prefix
-in the `Custom Model` text box.
-The `Advanced Options` also allow you to specify a `Base URL` if required.
+It's best to start with a simple task, and then iterate on it. It's also best to be
+as specific as possible about what you want, what the tech stack should be, etc.

-<div style={{ display: 'flex', justifyContent: 'center', gap: '20px' }}>
-  <img src="/img/settings-screenshot.png" alt="settings-modal" width="340" />
-  <img src="/img/settings-advanced.png" alt="settings-modal" width="335" />
-</div>
+For example, we might build a TODO app:

-## Versions
+> Please build a basic TODO list app in React. It should be frontend-only, and all state
+> should be kept in localStorage.

-The command above pulls the `0.9` tag, which represents the most recent stable release of OpenHands. You have other options as well:
- For a specific release, use `ghcr.io/all-hands-ai/openhands:$VERSION`, replacing $VERSION with the version number.
- We use semver, and release major, minor, and patch tags. So `0.9` will automatically point to the latest `0.9.x` release, and `0` will point to the latest `0.x.x` release.
- For the most up-to-date development version, you can use `ghcr.io/all-hands-ai/openhands:main`. This version is unstable and is recommended for testing or development purposes only.
+We can keep iterating on the app once the skeleton is there:

-You can choose the tag that best suits your needs based on stability requirements and desired features.
+> Please allow adding an optional due date to every task

-For the development workflow, see [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
+Just like with normal development, it's good to commit and push your code frequently.
+This way you can always revert back to an old state if the agent goes off track.
+You can ask the agent to commit and push for you:

-Are you having trouble? Check out our [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting).
+> Please commit the changes and push them to a new branch called "feature/due-dates"
+
+
+## Adding New Code
+
+OpenHands can also do a great job adding new code to an existing code base.
+
+For example, you can ask OpenHands to add a new GitHub action to your project
+which lints your code. OpenHands may take a peek at your codebase to see what language
+it should use, but then it can just drop a new file into `.github/workflows/lint.yml`.
+
+> Please add a GitHub action that lints the code in this repository
+
+Some tasks might require a bit more context. While OpenHands can use `ls` and `grep`
+to search through your codebase, providing context up front allows it to move faster,
+and more accurately. And it'll cost you fewer tokens!
+
+> Please modify ./backend/api/routes.js to add a new route that returns a list of all tasks
+
+> Please add a new React component that displays a list of Widgets to the ./frontend/components
+> directory. It should use the existing Widget component.
+
+## Refactoring
+
+OpenHands does great at refactoring existing code, especially in small chunks.
+You probably don't want to try rearchitecting your whole codebase, but breaking up
+long files and functions, renaming variables, etc. tend to work very well.
+
+> Please rename all the single-letter variables in ./app.go
+
+> Please break the function `build_and_deploy_widgets` into two functions, `build_widgets` and `deploy_widgets` in widget.php
+
+> Please break ./api/routes.js into separate files for each route
+
+## Bug Fixes
+
+OpenHands can also help you track down and fix bugs in your code. But, as any
+developer knows, bug fixing can be extremely tricky, and often OpenHands will need more context.
+It helps if you've diagnosed the bug, but want OpenHands to figure out the logic.
+
+> Currently the email field in the `/subscribe` endpoint is rejecting .io domains. Please fix this.
+
+> The `search_widgets` function in ./app.py is doing a case-sensitive search. Please make it case-insensitive.
+
+It often helps to do test-driven development when bugfixing with an agent.
+You can ask the agent to write a new test, and then iterate until it fixes the bug:
+
+> The `hello` function crashes on the empty string. Please write a test that reproduces this bug, then fix the code so it passes.
+
+## More
+
+OpenHands is capable of helping out on just about any coding task. But it takes some practice
+to get the most out of it. Remember to:
+* Keep your tasks small
+* Be as specific as possible
+* Provide as much context as possible
+* Commit and push frequently
+
+See [Prompting Best Practices](./prompting-best-practices) for more tips on how to get the most out of OpenHands.
@@ -8,7 +8,7 @@ This mode is different from the [headless mode](headless-mode), which is non-int

 To start an interactive OpenHands session via the command line, follow these steps:

-1. Ensure you have followed the [Development setup instructions](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md)
+1. Ensure you have followed the [Development setup instructions](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).

 2. Run the following command:

@@ -3,14 +3,14 @@
 The sandbox is where the agent does its work. Instead of running commands directly on your computer
 (which could be dangerous), the agent runs them inside of a Docker container.

-The default OpenHands sandbox (`python-nodejs:python3.11-nodejs22`
+The default OpenHands sandbox (`python-nodejs:python3.12-nodejs22`
 from [nikolaik/python-nodejs](https://hub.docker.com/r/nikolaik/python-nodejs)) comes with some packages installed such
 as python and Node.js but your use case may need additional software installed by default.

 There are two ways you can do so:

-1. Use an existing image from docker hub
-2. Creating your own custom docker image and using it
+1. Use an existing image from Docker Hub.
+2. Create your own custom Docker image and use it.

 If you want to take the first approach, you can skip the `Create Your Docker Image` section.

@@ -0,0 +1,71 @@
+# Debugging
+
+The following is intended as a primer on debugging OpenHands for development purposes.
+
+## Server / VSCode
+
+The following `launch.json` will allow debugging the agent, controller, and server elements, but not the sandbox (which runs inside Docker). It will ignore any changes inside the `workspace/` directory:
+
+```
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "OpenHands CLI",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "openhands.core.cli",
+            "justMyCode": false
+        },
+        {
+            "name": "OpenHands WebApp",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "uvicorn",
+            "args": [
+                "openhands.server.listen:app",
+                "--reload",
+                "--reload-exclude",
+                "${workspaceFolder}/workspace",
+                "--port",
+                "3000"
+            ],
+            "justMyCode": false
+        }
+    ]
+}
+```
+
+More specific debugging configurations which include more parameters may be specified:
+
+```
+    ...
+    {
+      "name": "Debug CodeAct",
+      "type": "debugpy",
+      "request": "launch",
+      "module": "openhands.core.main",
+      "args": [
+        "-t",
+        "Ask me what your task is.",
+        "-d",
+        "${workspaceFolder}/workspace",
+        "-c",
+        "CodeActAgent",
+        "-l",
+        "llm.o1",
+        "-n",
+        "prompts"
+      ],
+      "justMyCode": false
+    }
+    ...
+```
+
+The argument values in the snippet above can be changed as follows:
+
+* `-t`: the task
+* `-d`: the OpenHands workspace directory
+* `-c`: the agent
+* `-l`: the LLM config (pre-defined in `config.toml`)
+* `-n`: the session name (e.g. the event stream name)
@@ -84,7 +84,7 @@ To create an evaluation workflow for your benchmark, follow these steps:

 1. Import relevant OpenHands utilities:
   ```python
-    import agenthub
+    import openhands.agenthub
    from evaluation.utils.shared import (
        EvalMetadata,
        EvalOutput,
@@ -136,7 +136,7 @@ To create an evaluation workflow for your benchmark, follow these steps:
   ```python
   def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
       config = get_config(instance, metadata)
-       runtime = create_runtime(config, sid=instance.instance_id)
+       runtime = create_runtime(config)
       initialize_runtime(runtime, instance)

       instruction = get_instruction(instance, metadata)
@@ -0,0 +1,15 @@
+# Using the OpenHands GitHub Action
+
+This guide explains how to use the OpenHands GitHub Action, both within the OpenHands repository and in your own projects.
+
+## Using the Action in the OpenHands Repository
+
+To use the OpenHands GitHub Action in the OpenHands repository, an OpenHands maintainer can:
+
+1. Create an issue in the repository.
+2. Add the `fix-me` label to the issue.
+3. The action will automatically trigger and attempt to resolve the issue.
+
+## Installing the Action in a New Repository
+
+To install the OpenHands GitHub Action in your own repository, follow the [directions in the OpenHands Resolver repo](https://github.com/All-Hands-AI/OpenHands-resolver?tab=readme-ov-file#using-the-github-actions-workflow).
@@ -0,0 +1,51 @@
+# GUI Mode
+
+## Introduction
+
+OpenHands provides a user-friendly Graphical User Interface (GUI) mode for interacting with the AI assistant. This mode offers an intuitive way to set up the environment, manage settings, and communicate with the AI.
+
+## Installation and Setup
+
+1. Follow the instructions in the [Installation](../installation) guide to install OpenHands.
+
+2. After running the command, access OpenHands at [http://localhost:3000](http://localhost:3000).
+
+## Interacting with the GUI
+
+### Initial Setup
+
+1. Upon first launch, you'll see a settings modal.
+2. Select an `LLM Provider` and `LLM Model` from the dropdown menus.
+3. Enter the corresponding `API Key` for your chosen provider.
+4. Click "Save" to apply the settings.
+
+### Advanced Settings
+
+1. Toggle `Advanced Options` to access additional settings.
+2. Use the `Custom Model` text box to manually enter a model if it's not in the list.
+3. Specify a `Base URL` if required by your LLM provider.
+
+### Main Interface
+
+The main interface consists of several key components:
+
+1. **Chat Window**: The central area where you can view the conversation history with the AI assistant.
+2. **Input Box**: Located at the bottom of the screen, use this to type your messages or commands to the AI.
+3. **Send Button**: Click this to send your message to the AI.
+4. **Settings Button**: A gear icon that opens the settings modal, allowing you to adjust your configuration at any time.
+5. **Workspace Panel**: Displays the files and folders in your workspace, allowing you to navigate and view files, as well as the agent's past commands and web browsing history.
+
+### Interacting with the AI
+
+1. Type your question, request, or task description in the input box.
+2. Click the send button or press Enter to submit your message.
+3. The AI will process your input and provide a response in the chat window.
+4. You can continue the conversation by asking follow-up questions or providing additional information.
+
+## Tips for Effective Use
+
+1. Be specific in your requests to get the most accurate and helpful responses, as described in the [prompting best practices](../prompting-best-practices).
+2. Use the workspace panel to explore your project structure.
+3. Use one of the recommended models, as described in the [LLMs section](../llms/llms.md).
+
+Remember, the GUI mode of OpenHands is designed to make your interaction with the AI assistant as smooth and intuitive as possible. Don't hesitate to explore its features to maximize your productivity.
@@ -177,6 +177,7 @@ spec:
      claimName: docker-pvc
 ```

+
 ```bash
 # create the pod
 $ oc create -f pod.yaml
@@ -262,3 +263,167 @@ Events:                   <none>
 6. Connect to OpenHands UI, configure the Agent, then test:

 ![image](https://github.com/user-attachments/assets/12f94804-a0c7-4744-b873-e003c9caf40e)
+
+
+
+## GCP GKE OpenHands deployment
+
+**Warning**: this deployment grants the OpenHands application access to the Kubernetes docker socket, which creates a security risk. Use at your own discretion.
+
+1. Create a policy for privileged access
+2. Create GKE credentials (optional)
+3. Create the OpenHands deployment
+4. Verification and UI access commands
+5. Troubleshooting pod to verify the internal container
+
+1. Create a policy for privileged access:
+```yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: privileged-role
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["create", "get", "list", "watch", "delete"]
+- apiGroups: ["apps"]
+  resources: ["deployments"]
+  verbs: ["create", "get", "list", "watch", "delete"]
+- apiGroups: [""]
+  resources: ["pods/exec"]
+  verbs: ["create"]
+- apiGroups: [""]
+  resources: ["pods/log"]
+  verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: privileged-role-binding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: privileged-role
+subjects:
+- kind: ServiceAccount
+  name: default  # Change to your service account name
+  namespace: default
+```
+2. Create GKE credentials (optional):
+```bash
+kubectl create secret generic google-cloud-key \
+  --from-file=key.json=/path/to/your/google-cloud-key.json
+```
+3. Create the OpenHands deployment:
+
+Note: this was tested on a cluster with a single worker node. If you have multiple worker nodes, configure the deployment to run on a single worker.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: openhands-app-2024
+  labels:
+    app: openhands-app-2024
+spec:
+  replicas: 1  # You can increase this number for multiple replicas
+  selector:
+    matchLabels:
+      app: openhands-app-2024
+  template:
+    metadata:
+      labels:
+        app: openhands-app-2024
+    spec:
+      containers:
+      - name: openhands-app-2024
+        image: ghcr.io/all-hands-ai/openhands:main
+        env:
+        - name: SANDBOX_USER_ID
+          value: "1000"
+        - name: SANDBOX_API_HOSTNAME
+          value: '10.164.0.4'
+        - name: WORKSPACE_MOUNT_PATH
+          value: "/tmp/workspace_base"
+        - name: GOOGLE_APPLICATION_CREDENTIALS
+          value: "/tmp/workspace_base/google-cloud-key.json"
+        volumeMounts:
+        - name: workspace-volume
+          mountPath: /tmp/workspace_base
+        - name: docker-sock
+          mountPath: /var/run/docker.sock
+        - name: google-credentials
+          mountPath: "/tmp/workspace_base/google-cloud-key.json"
+        securityContext:
+          privileged: true  # Add this to allow privileged access
+        ports:
+        - containerPort: 3000
+      - name: openhands-sandbox-2024
+        image: ghcr.io/opendevin/sandbox:main
+    #    securityContext:
+    #      privileged: true  # Add this to allow privileged access
+        ports:
+        - containerPort: 51963
+        command: ["/usr/sbin/sshd", "-D", "-p 51963", "-o", "PermitRootLogin=yes"]
+      volumes:
+      #- name: workspace-volume
+      #  persistentVolumeClaim:
+      #    claimName: workspace-pvc
+      - name: workspace-volume
+        emptyDir: {}
+      - name: docker-sock
+        hostPath:
+          path: /var/run/docker.sock       # Use host's Docker socket
+          type: Socket
+      - name: google-credentials
+        secret:
+          secretName: google-cloud-key
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: openhands-app-2024-svc
+spec:
+  selector:
+    app: openhands-app-2024
+  ports:
+  - name: http
+    protocol: TCP
+    port: 80
+    targetPort: 3000
+  - name: ssh
+    protocol: TCP
+    port: 51963
+    targetPort: 51963
+  type: LoadBalancer
+  ```
+
+5. Troubleshooting pod to verify the internal container
+
+To learn more about the internal container runtime, deploy the pod below, use `kubectl exec -it` to enter the container, and inspect the container runtime with normal docker commands such as `docker ps -a`.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: docker-in-docker
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: docker-in-docker
+  template:
+    metadata:
+      labels:
+        app: docker-in-docker
+    spec:
+      containers:
+      - name: dind
+        image: docker:20.10-dind
+        securityContext:
+          privileged: true
+        volumeMounts:
+        - name: docker-sock
+          mountPath: /var/run/docker.sock
+      volumes:
+      - name: docker-sock
+        hostPath:
+          path: /var/run/docker.sock
+          type: Socket
+```
@@ -0,0 +1,63 @@
+# Installation
+
+## System Requirements
+
+* Docker version 26.0.0+ or Docker Desktop 4.31.0+.
+* You must be using Linux or Mac OS.
+  * If you are on Windows, you must use [WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
+
+## Start the app
+
+The easiest way to run OpenHands is in Docker. You can change `WORKSPACE_BASE` below to point OpenHands to
+existing code that you'd like to modify.
+
+```bash
+export WORKSPACE_BASE=$(pwd)/workspace
+
+docker pull ghcr.io/all-hands-ai/runtime:0.9-nikolaik
+
+docker run -it --pull=always \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik \
+    -e SANDBOX_USER_ID=$(id -u) \
+    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
+    -v $WORKSPACE_BASE:/opt/workspace_base \
+    -v /var/run/docker.sock:/var/run/docker.sock \
+    -p 3000:3000 \
+    --add-host host.docker.internal:host-gateway \
+    --name openhands-app-$(date +%Y%m%d%H%M%S) \
+    ghcr.io/all-hands-ai/openhands:0.9
+```
+
+You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), or using the [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action).
+
+## Setup
+
+After running the command above, you'll find OpenHands running at [http://localhost:3000](http://localhost:3000).
+
+The agent will have access to the `./workspace` folder to do its work. You can copy existing code here, or change `WORKSPACE_BASE` in the
+command to point to an existing folder.
+
+Upon launching OpenHands, you'll see a settings modal. You **must** select an `LLM Provider` and `LLM Model` and enter a corresponding `API Key`.
+These can be changed at any time by selecting the `Settings` button (gear icon) in the UI.
+
+If the required `LLM Model` does not exist in the list, you can toggle `Advanced Options` and manually enter it with the correct prefix
+in the `Custom Model` text box.
+The `Advanced Options` also allow you to specify a `Base URL` if required.
+
+<div style={{ display: 'flex', justifyContent: 'center', gap: '20px' }}>
+  <img src="/img/settings-screenshot.png" alt="settings-modal" width="340" />
+  <img src="/img/settings-advanced.png" alt="settings-modal" width="335" />
+</div>
+
+## Versions
+
+The command above pulls the most recent stable release of OpenHands. You have other options as well:
+- For a specific release, use `ghcr.io/all-hands-ai/openhands:$VERSION`, replacing $VERSION with the version number.
+- We use semver, and release major, minor, and patch tags. So `0.9` will automatically point to the latest `0.9.x` release, and `0` will point to the latest `0.x.x` release.
+- For the most up-to-date development version, you can use `ghcr.io/all-hands-ai/openhands:main`. This version is unstable and is recommended for testing or development purposes only.
+
+You can choose the tag that best suits your needs based on stability requirements and desired features.
+
+For the development workflow, see [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
+
+Are you having trouble? Check out our [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting).
@@ -5,7 +5,7 @@ OpenHands uses LiteLLM to make calls to Azure's chat models. You can find their
 ## Azure OpenAI Configuration

 When running OpenHands, you'll need to set the following environment variable using `-e` in the
-[docker run command](/modules/usage/getting-started#installation):
+[docker run command](/modules/usage/installation):

 ```
 LLM_API_VERSION="<api-version>"              # e.g. "2023-05-15"
@@ -37,7 +37,7 @@ OpenHands uses llama-index for embeddings. You can find their documentation on A
 ### Azure OpenAI Configuration

 When running OpenHands, set the following environment variables using `-e` in the
-[docker run command](/modules/usage/getting-started#installation):
+[docker run command](/modules/usage/installation):

 ```
 LLM_EMBEDDING_MODEL="azureopenai"
@@ -16,7 +16,7 @@ If the model is not in the list, toggle `Advanced Options`, and enter it in `Cus
 ## VertexAI - Google Cloud Platform Configs

 To use Vertex AI through Google Cloud Platform when running OpenHands, you'll need to set the following environment
-variables using `-e` in the [docker run command](/modules/usage/getting-started#installation):
+variables using `-e` in the [docker run command](/modules/usage/installation):

 ```
 GOOGLE_APPLICATION_CREDENTIALS="<json-dump-of-gcp-service-account-json>"
@@ -8,8 +8,8 @@ When running OpenHands, you'll need to set the following in the OpenHands UI thr
 * `LLM Provider` to `Groq`
 * `LLM Model` to the model you will be using. [Visit here to see the list of
 models that Groq hosts](https://console.groq.com/docs/models). If the model is not in the list, toggle
-`Advanced Options`, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`)
-* `API key` to your Groq API key. To find or create your Groq API Key, [see here](https://console.groq.com/keys)
+`Advanced Options`, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`).
+* `API key` to your Groq API key. To find or create your Groq API Key, [see here](https://console.groq.com/keys).



@@ -1,17 +1,25 @@
---
-sidebar_position: 3
---
-
 # 🤖 LLM Backends

 OpenHands can connect to any LLM supported by LiteLLM. However, it requires a powerful model to work.
-The following are verified by the community to work with OpenHands:

-* claude-3-5-sonnet
-* gemini-1.5-pro / gemini-1.5-flash
-* gpt-4 / gpt-4o
-* llama-3.1-405b / hermes-3-llama-3.1-405b
-* wizardlm-2-8x22b
+## Model Recommendations
+
+Based on a recent evaluation of language models for coding tasks (using the SWE-bench dataset), we can provide some recommendations for model selection. The full analysis can be found in [this blog article](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed).
+
+When choosing a model, consider both the quality of outputs and the associated costs. Here's a summary of the findings:
+
+- Claude 3.5 Sonnet is the best by a fair amount, achieving a 27% resolve rate with the default agent in OpenHands.
+- GPT-4o lags behind, and o1-mini actually performed somewhat worse than GPT-4o. A brief analysis of the results suggested that o1 was sometimes "overthinking" things, performing extra environment configuration tasks when it could have just gone ahead and finished the task.
+- Finally, the strongest open models were Llama 3.1 405B and deepseek-v2.5, and they performed reasonably, even besting some of the closed models.
+
+Please refer to the [full article](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) for more details.
+
+Based on these findings and community feedback, the following models have been verified to work reasonably well with OpenHands:
+
+- claude-3-5-sonnet (recommended)
+- gpt-4 / gpt-4o
+- llama-3.1-405b
+- deepseek-v2.5

 :::warning
 OpenHands will issue many prompts to the LLM you configure. Most of these LLMs cost money, so be sure to set spending
@@ -33,35 +41,48 @@ models driving it. However, if you do find ones that work, please add them to th
 ## LLM Configuration

 The following can be set in the OpenHands UI through the Settings:
-* `LLM Provider`
-* `LLM Model`
-* `API Key`
-* `Base URL` (through `Advanced Settings`)
+
+- `LLM Provider`
+- `LLM Model`
+- `API Key`
+- `Base URL` (through `Advanced Options`)

 There are some settings that may be necessary for some LLMs/providers that cannot be set through the UI. Instead, these
-can be set through environment variables passed to the [docker run command](/modules/usage/getting-started#installation)
+can be set through environment variables passed to the [docker run command](/modules/usage/installation)
 using `-e`:

-* `LLM_API_VERSION`
-* `LLM_EMBEDDING_MODEL`
-* `LLM_EMBEDDING_DEPLOYMENT_NAME`
-* `LLM_DROP_PARAMS`
-* `LLM_DISABLE_VISION`
-* `LLM_CACHING_PROMPT`
+- `LLM_API_VERSION`
+- `LLM_EMBEDDING_MODEL`
+- `LLM_EMBEDDING_DEPLOYMENT_NAME`
+- `LLM_DROP_PARAMS`
+- `LLM_DISABLE_VISION`
+- `LLM_CACHING_PROMPT`

 We have a few guides for running OpenHands with specific model providers:

-* [Azure](llms/azure-llms)
-* [Google](llms/google-llms)
-* [Groq](llms/groq)
-* [OpenAI](llms/openai-llms)
-* [OpenRouter](llms/openrouter)
+- [Azure](llms/azure-llms)
+- [Google](llms/google-llms)
+- [Groq](llms/groq)
+- [OpenAI](llms/openai-llms)
+- [OpenRouter](llms/openrouter)

 ### API retries and rate limits

-Some LLMs have rate limits and may require retries. OpenHands will automatically retry requests if it receives a 429 error or API connection error.
-You can set the following environment variables to control the number of retries and the time between retries:
+LLM providers typically have rate limits, sometimes very low, and may require retries. OpenHands will automatically retry requests if it receives a Rate Limit Error (429 error code), API connection error, or other transient errors.

-* `LLM_NUM_RETRIES` (Default of 8)
-* `LLM_RETRY_MIN_WAIT` (Default of 15 seconds)
-* `LLM_RETRY_MAX_WAIT` (Default of 120 seconds)
+You can customize these options as you need for the provider you're using. Check their documentation, and set the following environment variables to control the number of retries and the time between retries:
+
+- `LLM_NUM_RETRIES` (Default of 8)
+- `LLM_RETRY_MIN_WAIT` (Default of 15 seconds)
+- `LLM_RETRY_MAX_WAIT` (Default of 120 seconds)
+- `LLM_RETRY_MULTIPLIER` (Default of 2)
+
+If you are running OpenHands in development mode, you can also set these options in the `config.toml` file:
+
+```toml
+[llm]
+num_retries = 8
+retry_min_wait = 15
+retry_max_wait = 120
+retry_multiplier = 2
+```
@@ -0,0 +1,41 @@
+# Prompting Best Practices
+
+When working with the OpenHands AI software developer, it's crucial to provide clear and effective prompts. This guide outlines best practices for creating prompts that will yield the most accurate and useful responses.
+
+## Characteristics of Good Prompts
+
+Good prompts are:
+
+1. **Concrete**: They explain exactly what functionality should be added or what error needs to be fixed.
+2. **Location-specific**: If known, they explain the locations in the code base that should be modified.
+3. **Appropriately scoped**: They should be the size of a single feature, typically not exceeding 100 lines of code.
+
+## Examples
+
+### Good Prompt Examples
+
+1. "Add a function `calculate_average` in `utils/math_operations.py` that takes a list of numbers as input and returns their average."
+
+2. "Fix the TypeError in `frontend/src/components/UserProfile.tsx` occurring on line 42. The error suggests we're trying to access a property of undefined."
+
+3. "Implement input validation for the email field in the registration form. Update `frontend/src/components/RegistrationForm.tsx` to check if the email is in a valid format before submission."
+
+### Bad Prompt Examples
+
+1. "Make the code better." (Too vague, not concrete)
+
+2. "Rewrite the entire backend to use a different framework." (Not appropriately scoped)
+
+3. "There's a bug somewhere in the user authentication. Can you find and fix it?" (Lacks specificity and location information)
+
+## Tips for Effective Prompting
+
+1. Be as specific as possible about the desired outcome or the problem to be solved.
+2. Provide context, including relevant file paths and line numbers if available.
+3. Break down large tasks into smaller, manageable prompts.
+4. Include any relevant error messages or logs.
+5. Specify the programming language or framework if it's not obvious from the context.
+
+Remember, the more precise and informative your prompt is, the better the AI can assist you in developing or modifying your software.
+
+See [Getting Started with OpenHands](./getting-started) for more examples of helpful prompts.
@@ -1,7 +1,3 @@
---
-sidebar_position: 4
---
-
 # 🚧 Troubleshooting

 There are some error messages that frequently get reported by users.
@@ -1,7 +1,3 @@
---
-sidebar_position: 8
---
-
 # ⬆️ Upgrade Guide

 ## 0.8.0 (2024-07-13)
@@ -24,7 +24,7 @@
        "@docusaurus/module-type-aliases": "^3.5.1",
        "@docusaurus/tsconfig": "^3.5.2",
        "@docusaurus/types": "^3.5.1",
-        "typescript": "~5.6.2"
+        "typescript": "~5.6.3"
      },
      "engines": {
        "node": ">=18.0"
@@ -14853,9 +14853,9 @@
      }
    },
    "node_modules/typescript": {
-      "version": "5.6.2",
-      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.2.tgz",
-      "integrity": "sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==",
+      "version": "5.6.3",
+      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.3.tgz",
+      "integrity": "sha512-hjcS1mhfuyi4WW8IWtjP7brDrG2cuDZukyrYrSauoXGNgx0S7zceP07adYkJycEr56BOUTNPzbInooiN3fn1qw==",
      "bin": {
        "tsc": "bin/tsc",
        "tsserver": "bin/tsserver"
@@ -31,7 +31,7 @@
    "@docusaurus/module-type-aliases": "^3.5.1",
    "@docusaurus/tsconfig": "^3.5.2",
    "@docusaurus/types": "^3.5.1",
-    "typescript": "~5.6.2"
+    "typescript": "~5.6.3"
  },
  "browserslist": {
    "production": [
@@ -3,51 +3,98 @@ import type { SidebarsConfig } from "@docusaurus/plugin-content-docs";
 const sidebars: SidebarsConfig = {
  apiSidebar: [require("./modules/python/sidebar.json")],
  docsSidebar: [
+    {
+      type: 'doc',
+      label: 'Installation',
+      id: 'usage/installation',
+    },
    {
      type: 'doc',
      label: 'Getting Started',
      id: 'usage/getting-started',
    },
+    {
+      type: 'doc',
+      label: 'Prompting Best Practices',
+      id: 'usage/prompting-best-practices',
+    },
    {
      type: 'category',
-      label: 'LLMs',
+      label: 'Usage Methods',
      items: [
        {
          type: 'doc',
-          label: 'Overview',
-          id: 'usage/llms/llms',
+          label: 'GUI Mode',
+          id: 'usage/how-to/gui-mode',
        },
+        {
+          type: 'doc',
+          label: 'CLI Mode',
+          id: 'usage/how-to/cli-mode',
+        },
+        {
+          type: 'doc',
+          label: 'Headless Mode',
+          id: 'usage/how-to/headless-mode',
+        },
+        {
+          type: 'doc',
+          label: 'Github Actions',
+          id: 'usage/how-to/github-action',
+        },
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Advanced Configuration',
+      items: [
        {
          type: 'category',
-          label: 'Providers',
+          label: 'LLM Configuration',
          items: [
            {
              type: 'doc',
-              label: 'Azure',
-              id: 'usage/llms/azure-llms',
+              label: 'Overview',
+              id: 'usage/llms/llms',
            },
            {
-              type: 'doc',
-              label: 'Google',
-              id: 'usage/llms/google-llms',
-            },
-            {
-              type: 'doc',
-              label: 'Groq',
-              id: 'usage/llms/groq',
-            },
-            {
-              type: 'doc',
-              label: 'OpenAI',
-              id: 'usage/llms/openai-llms',
-            },
-            {
-              type: 'doc',
-              label: 'OpenRouter',
-              id: 'usage/llms/openrouter',
+              type: 'category',
+              label: 'Providers',
+              items: [
+                {
+                  type: 'doc',
+                  label: 'Azure',
+                  id: 'usage/llms/azure-llms',
+                },
+                {
+                  type: 'doc',
+                  label: 'Google',
+                  id: 'usage/llms/google-llms',
+                },
+                {
+                  type: 'doc',
+                  label: 'Groq',
+                  id: 'usage/llms/groq',
+                },
+                {
+                  type: 'doc',
+                  label: 'OpenAI',
+                  id: 'usage/llms/openai-llms',
+                },
+                {
+                  type: 'doc',
+                  label: 'OpenRouter',
+                  id: 'usage/llms/openrouter',
+                },
+              ],
            },
          ],
        },
+        {
+          type: 'doc',
+          label: 'Custom Sandbox',
+          id: 'usage/how-to/custom-sandbox-guide',
+        },
      ],
    },
    {
@@ -62,44 +109,39 @@ const sidebars: SidebarsConfig = {
    },
    {
      type: 'category',
-      label: 'How-to Guides',
+      label: 'For OpenHands Developers',
      items: [
        {
-          type: 'doc',
-          id: 'usage/how-to/cli-mode',
+          type: 'category',
+          label: 'Architecture',
+          items: [
+            {
+              type: 'doc',
+              label: 'Backend',
+              id: 'usage/architecture/backend',
+            },
+            {
+              type: 'doc',
+              label: 'Runtime',
+              id: 'usage/architecture/runtime',
+            },
+          ],
        },
        {
          type: 'doc',
-          id: 'usage/how-to/headless-mode',
-        },
-        {
-          type: 'doc',
-          id: 'usage/how-to/custom-sandbox-guide',
+          label: 'Debugging',
+          id: 'usage/how-to/debugging',
        },
        {
          type: 'doc',
+          label: 'Evaluation',
          id: 'usage/how-to/evaluation-harness',
        },
        {
          type: 'doc',
+          label: 'Kubernetes Deployment',
          id: 'usage/how-to/openshift-example',
-        }
-      ]
-    },
-    {
-      type: 'category',
-      label: 'Architecture',
-      items: [
-        {
-          type: 'doc',
-          label: 'Backend',
-          id: 'usage/architecture/backend',
        },
-        {
-          type: 'doc',
-          label: 'Runtime',
-          id: 'usage/architecture/runtime',
-        }
      ],
    },
    {
@@ -22,6 +22,7 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction

 game = None

@@ -62,7 +63,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=False,
            use_host_network=False,
        ),
@@ -117,12 +118,12 @@ def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = create_runtime(config, sid=instance['text'].strip())
+    runtime = create_runtime(config)

    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
@@ -36,7 +36,7 @@ fi

 # IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
 # We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -50,7 +50,6 @@ COMMAND="poetry run python evaluation/EDA/run_infer.py \
  --data-split test \
  --max-iterations 20 \
  --OPENAI_API_KEY $OPENAI_API_KEY \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${DATASET}"

@@ -44,7 +44,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -209,7 +209,7 @@ def process_instance(
    # create sandbox and run the agent
    # =============================================

-    runtime: Runtime = create_runtime(config, sid=instance.instance_id)
+    runtime: Runtime = create_runtime(config)

    initialize_runtime(runtime, instance=instance)

@@ -217,7 +217,7 @@ def process_instance(
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
        )
@@ -30,7 +30,6 @@ COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run pyt
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION"

@@ -59,15 +59,13 @@ You can update the arguments in the script
 ## Summarize Results

 ```bash
-poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] [model_name]
-# with optional SKIP_NUM
-poetry run python SKIP_NUM=12 ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] [model_name]
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
 ```

 Full example:

 ```bash
-poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl claude-3-5-sonnet@20240620
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl
 ```

 This will list the instances that passed and the instances that failed. For each
@@ -81,21 +79,3 @@ outcome of the tests. If there are no syntax or indentation errors, you can
 expect to see something like "`..F...EF..`", where "`.`" means the test case
 passed, "`E`" means there was an error while executing the test case and "`F`"
 means some assertion failed and some returned output was not as expected.
-
-## Visualization
-
-If the required Python libraries are installed (`matplotlib.pyplot` and `seaborn`),
-the `summarize_results.py` script will also generate two histograms to
-the output folder.
-
-### Cost Histogram
-
-The cost histogram shows the number of successful and failed instances per cost point.
-
-![Cost Histogram](./examples/cost_histogram.png)
-
-### Actions Histogram
-
-The actions histogram shows per number of actions the number of successful and failed instances.
-
-![Actions Histogram](./examples/actions_histogram.png)
@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import os
 import tempfile
 from typing import Any
@@ -24,11 +25,12 @@ from openhands.core.config import (
    AppConfig,
    SandboxConfig,
    get_llm_config_arg,
+    load_from_toml,
    parse_arguments,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.runtime import Runtime

@@ -59,6 +61,13 @@ def get_config(
        workspace_mount_path=None,
    )
    config.set_llm_config(metadata.llm_config)
+
+    # copy 'draft_editor' config if exists
+    config_copy = copy.deepcopy(config)
+    load_from_toml(config_copy)
+    if 'draft_editor' in config_copy.llms:
+        config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor')
+
    return config


@@ -129,7 +138,7 @@ def complete_runtime(
        logger.info(f'Running test file: {script_name}')

    action = CmdRunAction(
-        command=f'python -m unittest {script_name}',
+        command=f'python3 -m unittest {script_name}',
        keep_prompt=False,
    )
    logger.info(action, extra={'msg_type': 'ACTION'})
@@ -194,7 +203,7 @@ def process_instance(
    # create sandbox and run the agent
    # =============================================

-    runtime: Runtime = create_runtime(config, sid=str(instance.instance_id))
+    runtime: Runtime = create_runtime(config)

    initialize_runtime(runtime, instance=instance)

@@ -202,7 +211,7 @@ def process_instance(
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
        )
@@ -256,7 +265,6 @@ def process_instance(
 if __name__ == '__main__':
    args = parse_arguments()
    dataset = load_dataset('RajMaheshwari/Exercism-Python')
-    dataset = dataset.shuffle(seed=42)
    aider_bench_tests = dataset['train'].to_pandas()

    llm_config = None
@@ -27,19 +27,24 @@ echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

-COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run python evaluation/aider_bench/run_infer.py \
-  --agent-cls $AGENT \
-  --llm-config $MODEL_CONFIG \
-  --max-iterations 30 \
-  --max-chars 10000000 \
-  --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+EVAL_NOTE=$AGENT_VERSION

 # Default to NOT use unit tests.
 if [ -z "$USE_UNIT_TESTS" ]; then
  export USE_UNIT_TESTS=false
 fi
 echo "USE_UNIT_TESTS: $USE_UNIT_TESTS"
+# If unit tests are used, append "-w-test" to EVAL_NOTE so those runs are labeled separately
+if [ "$USE_UNIT_TESTS" = true ]; then
+  EVAL_NOTE=$EVAL_NOTE-w-test
+fi
+
+COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run python evaluation/aider_bench/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 30 \
+  --eval-num-workers $NUM_WORKERS \
+  --eval-note $EVAL_NOTE"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -1,61 +1,25 @@
-import json
-import os
-import sys
+import argparse

 import numpy as np
 import pandas as pd

-# Try to import visualization libraries
-visualization_available = False
-try:
-    import matplotlib.pyplot as plt
-    import seaborn as sns

-    visualization_available = True
-except ImportError:
-    print(
-        '\n*** WARNING: libraries matplotlib and/or seaborn are not installed.\n*** Visualization will not be available!\n'
-    )
-
-
-def show_usage():
-    print(
-        'Usage: poetry run python summarize_results.py <path_to_output_jsonl_file> <model_name>'
-    )
-    print(
-        'Example:\npoetry run python summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl claude-3-5-sonnet@20240620\n'
-    )
-
-
-def print_error(message: str):
-    print(f'\n***\n*** ERROR: {message}\n***\n')
-    show_usage()
-
-
-def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
+def extract_test_results(df: pd.DataFrame) -> tuple[list[str], list[str]]:
    passed = []
    failed = []
-    with open(res_file_path, 'r') as file:
-        for line in file:
-            data = json.loads(line.strip())
-            instance_id = data['instance_id']
-            resolved = False
-            if 'test_result' in data and 'exit_code' in data['test_result']:
-                resolved = data['test_result']['exit_code'] == 0
-            if resolved:
-                passed.append(instance_id)
-            else:
-                failed.append(instance_id)
+    for _, row in df.iterrows():
+        instance_id = row['instance_id']
+        resolved = False
+        if 'test_result' in row and 'exit_code' in row['test_result']:
+            resolved = row['test_result']['exit_code'] == 0
+        if resolved:
+            passed.append(instance_id)
+        else:
+            failed.append(instance_id)
    return passed, failed


-def visualize_results(json_file_path: str, model: str, output_dir: str):
-    # based on a Colab notebook by RajMaheshwari
-    with open(json_file_path, 'r') as f:
-        data = [json.loads(line) for line in f]
-
-    df = pd.DataFrame.from_records(data)
-
+def visualize_results(df: pd.DataFrame):
    df1 = pd.DataFrame()
    df1['cost'] = df['metrics'].apply(pd.Series)['accumulated_cost']
    df1['result'] = (
@@ -67,60 +31,35 @@ def visualize_results(json_file_path: str, model: str, output_dir: str):
    total = df.shape[0]
    resolve_rate = round((passed / total) * 100, 2)

-    print('Number of passed tests:', f'{passed}/{total}')
+    print('Number of passed tests:', f'{passed}/{total} {resolve_rate:.2f}%')
+    print('\nDescriptive statistics for number of actions:')
+    print(df1['actions'].describe())
+    print('\nDescriptive statistics for costs:')
+    print(df1['cost'].describe())

-    if not visualization_available:
-        return resolve_rate
+    # Bin counts for actions
+    action_bins = pd.cut(df1['actions'], bins=range(0, 32, 2))
+    print('\nAction bin counts:')
+    print(action_bins.value_counts().sort_index())

-    # Cost histogram
-    plt.figure(figsize=(10, 6))
-    bins = 10
-    mx = pd.Series.max(df1['cost'])
-    g = sns.histplot(df1, x='cost', bins=bins, hue='result', multiple='stack')
-    x_ticks = np.around(np.linspace(0, mx, bins + 1), 3)
-    g.set_xticks(x_ticks)
-    g.set_xlabel('Cost in $')
-    g.set_title(f'MODEL: {model}, RESOLVE_RATE: {resolve_rate}%', size=9)
-    plt.tight_layout()
-    plt.savefig(os.path.join(output_dir, 'cost_histogram.png'))
-    plt.close()
-
-    # Actions histogram
-    plt.figure(figsize=(10, 6))
-    bins = np.arange(0, 31, 2)
-    g = sns.histplot(df1, x='actions', bins=bins, hue='result', multiple='stack')
-    g.set_xticks(bins)
-    g.set_xlabel('# of actions')
-    g.set_title(f'MODEL: {model}, RESOLVE_RATE: {resolve_rate}%', size=9)
-    plt.tight_layout()
-    plt.savefig(os.path.join(output_dir, 'actions_histogram.png'))
-    plt.close()
+    # Bin counts for costs
+    cost_bins = pd.cut(df1['cost'], bins=10)
+    print('\nCost bin counts:')
+    print(cost_bins.value_counts().sort_index())

    return resolve_rate


 if __name__ == '__main__':
-    if len(sys.argv) != 3:
-        print_error('Argument(s) missing!')
-        sys.exit(1)
+    parser = argparse.ArgumentParser(description='Summarize AiderBench results')
+    parser.add_argument('input_filepath', type=str, help='Path to the JSONL file')
+    args = parser.parse_args()

-    json_file_path = sys.argv[1]
-    model_name = sys.argv[2]
+    # Create DataFrame from JSONL file
+    df = pd.read_json(args.input_filepath, lines=True)

-    if not os.path.exists(json_file_path):
-        print_error('Output file does not exist!')
-        sys.exit(1)
-    if not os.path.isfile(json_file_path):
-        print_error('Path-to-output-file is not a file!')
-        sys.exit(1)
-
-    output_dir = os.path.dirname(json_file_path)
-    if not os.access(output_dir, os.W_OK):
-        print_error('Output folder is not writable!')
-        sys.exit(1)
-
-    passed_tests, failed_tests = extract_test_results(json_file_path)
-    resolve_rate = visualize_results(json_file_path, model_name, output_dir)
+    passed_tests, failed_tests = extract_test_results(df)
+    resolve_rate = visualize_results(df)

    print(
        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {resolve_rate:.2f}%'
@@ -129,7 +68,3 @@ if __name__ == '__main__':
    print(passed_tests)
    print('FAILED TESTS:')
    print(failed_tests)
-    print(
-        '\nVisualization results were saved as cost_histogram.png and actions_histogram.png'
-    )
-    print('in folder: ', output_dir)
@@ -27,7 +27,7 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.runtime import Runtime

@@ -274,10 +274,7 @@ def process_instance(
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    # use a session id for concurrent evaluation
-    sid = instance.instance_id.replace('/', '__')
-
-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)

    initialize_runtime(runtime, instance)

@@ -285,7 +282,7 @@ def process_instance(
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
@@ -32,7 +32,6 @@ COMMAND="poetry run python evaluation/biocoder/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${DATASET}"

@@ -75,7 +75,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -402,14 +402,14 @@ def process_instance(
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    runtime = create_runtime(config, sid=instance_id)
+    runtime = create_runtime(config)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
            ],
@@ -30,7 +30,6 @@ COMMAND="poetry run python evaluation/bird/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 5 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION" \

@@ -23,6 +23,7 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction

 # Only CodeActAgent can delegate to BrowsingAgent
 SUPPORTED_AGENT_CLS = {'CodeActAgent'}
@@ -40,7 +41,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=False,
            use_host_network=False,
        ),
@@ -71,12 +72,12 @@ def process_instance(
        f'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
    )

-    runtime = create_runtime(config, sid=instance.instance_id)
+    runtime = create_runtime(config)

    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
        )
    )
@@ -32,7 +32,6 @@ COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 1 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $EVAL_NOTE"

@@ -51,7 +51,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -141,14 +141,14 @@ def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX.get(metadata.agent_class, '')
    logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

-    runtime = create_runtime(config, sid=instance['instance_id'])
+    runtime = create_runtime(config)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
@@ -41,7 +41,6 @@ COMMAND="poetry run python ./evaluation/gaia/run_infer.py \
  --max-iterations 30 \
  --level $LEVELS \
  --data-split validation \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${LEVELS}"

@@ -24,6 +24,7 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
@@ -43,7 +44,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -79,11 +80,11 @@ def process_instance(
    # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = create_runtime(config, sid=instance_id)
+    runtime = create_runtime(config)
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                metadata.agent_class
@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/gorilla/run_infer.py \
  --max-iterations 30 \
  --hubs $HUBS \
  --data-split validation \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${LEVELS}"

@@ -65,7 +65,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -214,12 +214,12 @@ Again do not quit without reporting the answer first.
 Ok now its time to start solving the question. Good luck!
 """

-    runtime = create_runtime(config, sid=f'gptq_{str(instance.instance_id)}')
+    runtime = create_runtime(config)

    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                metadata.agent_class
@@ -37,7 +37,6 @@ COMMAND="poetry run python evaluation/gpqa/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --data-split $DATA_SPLIT \
  --eval-note $AGENT_VERSION"
@@ -35,7 +35,7 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.runtime import Runtime

@@ -86,7 +86,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -99,7 +99,7 @@ def get_config(


 def _get_instance_id(instance: pd.Series) -> str:
-    return instance.task_id.replace('/', '__')
+    return instance.instance_id.replace('/', '__')


 def initialize_runtime(
@@ -206,9 +206,9 @@ def process_instance(
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
-        reset_logger_for_multiprocessing(logger, instance.task_id, log_dir)
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
-        logger.info(f'Starting evaluation for instance {instance.task_id}.')
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # Create file with HumanEvalFix problem
    # Prompt reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L509
@@ -232,12 +232,12 @@ def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
    initialize_runtime(runtime, instance)
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                metadata.agent_class
@@ -257,7 +257,7 @@ def process_instance(

    # Save the output
    output = EvalOutput(
-        instance_id=instance.task_id,
+        instance_id=instance.instance_id,
        instruction=instruction,
        metadata=metadata,
        history=histories,
@@ -68,7 +68,6 @@ COMMAND="poetry run python evaluation/humanevalfix/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION"

@@ -1,4 +1,4 @@
-FROM python:3.11-bookworm
+FROM python:3.12-bookworm

 RUN pip install scitools-pyke

@@ -201,17 +201,14 @@ def process_instance(
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    # use a session id for concurrent evaluation
-    sid = instance['instance_id']
-
-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                metadata.agent_class
@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/logic_reasoning/run_infer.py \
  --llm-config $MODEL_CONFIG \
  --dataset $DATASET \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION"

@@ -1,4 +1,4 @@
-FROM python:3.11-bookworm
+FROM python:3.12-bookworm

 RUN apt-get update && apt-get install -y python3 python3-pip git

@@ -126,13 +126,14 @@ def process_instance(
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

-    runtime = create_runtime(config, sid=env_id)
+    runtime = create_runtime(config)
    task_str = initialize_runtime(runtime)
-
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=task_str,  # take output from initialize_runtime
+            initial_user_action=MessageAction(
+                content=task_str
+            ),  # take output from initialize_runtime
            runtime=runtime,
        )
    )
@@ -37,7 +37,6 @@ COMMAND="poetry run python evaluation/miniwob/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS"

 if [ -n "$EVAL_LIMIT" ]; then
@@ -1,4 +1,4 @@
-FROM python:3.11-bookworm
+FROM python:3.12-bookworm

 RUN apt-get update && apt-get install -y python3 python3-pip git gcc

@@ -29,6 +29,7 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import (
    CmdRunAction,
+    MessageAction,
 )
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.runtime import Runtime
@@ -174,13 +175,13 @@ def process_instance(
        },
    )

-    runtime = create_runtime(config, sid=instance.instance_id)
+    runtime = create_runtime(config)
    initialize_runtime(runtime)

    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=fake_user_response_fn,
        )
@@ -39,7 +39,7 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.runtime import Runtime

@@ -211,9 +211,6 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
    else:
        logger.info(f'Starting evaluation for instance {instance["instance_id"]}.')

-    # Create a sandbox, using the instance ID and PID as the session ID to avoid conflicts
-    sid = str(instance['instance_id'])
-
    repo_url = instance['github']
    repo_name = repo_url.split('/')[-1]
    task_path = os.path.join('/workspace', repo_name, instance['path'][2:])
@@ -235,14 +232,14 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
    )
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    runtime = create_runtime(config, sid=sid)
+    runtime = create_runtime(config)
    initialize_runtime(runtime, instance)

    # Run the agent
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                metadata.agent_class
@@ -8,7 +8,7 @@ import pytest

 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 CASES_DIR = os.path.join(SCRIPT_DIR, 'cases')
-AGENTHUB_DIR = os.path.join(SCRIPT_DIR, '../../', 'agenthub')
+AGENTHUB_DIR = os.path.join(SCRIPT_DIR, '../', 'agenthub')


 def agents():
@@ -28,7 +28,9 @@ When the `run_infer.sh` script is started, it will automatically pull the releva

 ```bash
 ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
-# e.g., ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test
+
+# Example
+./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test
 ```

 where `model_config` is mandatory, and the rest are optional.
@@ -68,10 +70,11 @@ then your command would be:
 This is in limited beta. Contact Xingyao over slack if you want to try this out!

 ```bash
-# ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" \
+./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
+
+# Example - This runs evaluation on CodeActAgent for 300 instances of the "princeton-nlp/SWE-bench_Lite" test set, with at most 30 iterations per instance and 16 workers running in parallel
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test
-# This example runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel
 ```

 To clean up all existing runtimes you've already started, run:
@@ -133,8 +136,9 @@ NOTE, you should have already pulled the instance-level OR env-level docker imag
 Then you can run the following:

 ```bash
-# ./evaluation/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split]
-# For example:
+./evaluation/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split]
+
+# Example
 ./evaluation/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl
 ```

@@ -162,9 +166,11 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 This is in limited beta. Contact Xingyao over slack if you want to try this out!

 ```bash
-# ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
-# This example evaluate patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel
+./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers] [dataset] [dataset_split]
+
+# Example - This evaluates patches generated by CodeActAgent with Llama-3.1-70B-Instruct-Turbo on the "princeton-nlp/SWE-bench_Lite" test set, using 16 workers running in parallel
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
+evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
 ```

 To clean-up all existing runtimes that you've already started, run:
@@ -81,6 +81,7 @@ def get_config(instance: pd.Series) -> AppConfig:
            # large enough timeout, since some testcases take very long to run
            timeout=1800,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
        ),
        # do not mount workspace
        workspace_base=None,
@@ -126,7 +127,7 @@ def process_instance(
            test_result=instance['test_result'],
        )

-    runtime = create_runtime(config, sid=instance_id)
+    runtime = create_runtime(config)

    # Get patch and save it to /tmp/patch.diff
    with tempfile.TemporaryDirectory() as temp_dir:
@@ -8,11 +8,12 @@ import pandas as pd
 import toml
 from datasets import load_dataset

-import agenthub
+import openhands.agenthub
 from evaluation.swe_bench.prompt import CODEACT_SWE_PROMPT
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    assert_and_raise,
    codeact_user_response,
    make_metadata,
    prepare_dataset,
@@ -28,7 +29,7 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation, ErrorObservation
 from openhands.events.serialization.event import event_to_dict
 from openhands.runtime.runtime import Runtime
@@ -131,6 +132,8 @@ def get_config(
            # large enough timeout, since some testcases take very long to run
            timeout=300,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_remote_runtime_alive=False,
        ),
        # do not mount workspace
        workspace_base=None,
@@ -162,14 +165,16 @@ def initialize_runtime(
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0
+    assert_and_raise(
+        obs.exit_code == 0, f'Failed to export SWE_INSTANCE_ID: {obs.content}'
+    )

    action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0
+    assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {obs.content}')

    if USE_INSTANCE_IMAGE:
        # inject the init script
@@ -181,9 +186,10 @@ def initialize_runtime(
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert (
-            obs.exit_code == 0
-        ), f'Failed to create /swe_util/eval_data/instances: {obs.content}'
+        assert_and_raise(
+            obs.exit_code == 0,
+            f'Failed to create /swe_util/eval_data/instances: {obs.content}',
+        )

        swe_instance_json_name = 'swe-bench-instance.json'
        with tempfile.TemporaryDirectory() as temp_dir:
@@ -209,44 +215,53 @@ def initialize_runtime(
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert obs.exit_code == 0
+        assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {obs.content}')

        action = CmdRunAction(command='source ~/.bashrc')
        action.timeout = 600
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert obs.exit_code == 0
+        assert_and_raise(
+            obs.exit_code == 0, f'Failed to source ~/.bashrc: {obs.content}'
+        )

        action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
        action.timeout = 3600
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert obs.exit_code == 0
+        assert_and_raise(
+            obs.exit_code == 0,
+            f'Failed to source /swe_util/instance_swe_entry.sh: {obs.content}',
+        )
    else:
        action = CmdRunAction(command='source /swe_util/swe_entry.sh')
        action.timeout = 1800
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert (
-            obs.exit_code == 0
-        ), f'Failed to source /swe_util/swe_entry.sh: {obs.content}'
+        assert_and_raise(
+            obs.exit_code == 0,
+            f'Failed to source /swe_util/swe_entry.sh: {obs.content}',
+        )

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0
+    assert_and_raise(
+        obs.exit_code == 0,
+        f'Failed to cd to /workspace/{workspace_dir_name}: {obs.content}',
+    )

    action = CmdRunAction(command='git reset --hard')
    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0
+    assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {obs.content}')

    action = CmdRunAction(
        command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
@@ -255,7 +270,7 @@ def initialize_runtime(
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0
+    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {obs.content}')

    logger.info('-' * 30)
    logger.info('END Runtime Initialization Fn')
@@ -283,21 +298,27 @@ def complete_runtime(
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0
+    assert_and_raise(
+        obs.exit_code == 0,
+        f'Failed to cd to /workspace/{workspace_dir_name}: {obs.content}',
+    )

    action = CmdRunAction(command='git config --global core.pager ""')
    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0
+    assert_and_raise(
+        obs.exit_code == 0,
+        f'Failed to git config --global core.pager "": {obs.content}',
+    )

    action = CmdRunAction(command='git add -A')
    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0
+    assert_and_raise(obs.exit_code == 0, f'Failed to git add -A: {obs.content}')

    n_retries = 0
    git_patch = None
@@ -322,7 +343,7 @@ def complete_runtime(
            logger.error(f'Error occurred: {obs.content}. Retrying...')
            sleep_if_should_continue(10)
        else:
-            raise ValueError(f'Unexpected observation type: {type(obs)}')
+            assert_and_raise(False, f'Unexpected observation type: {type(obs)}')

    logger.info('-' * 30)
    logger.info('END Runtime Completion Fn')
@@ -344,32 +365,34 @@ def process_instance(
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

-    runtime = create_runtime(config, sid=instance.instance_id)
-    initialize_runtime(runtime, instance)
+    runtime = create_runtime(config)

-    instruction = get_instruction(instance, metadata)
+    try:
+        initialize_runtime(runtime, instance)

-    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State | None = asyncio.run(
-        run_controller(
-            config=config,
-            task_str=instruction,
-            runtime=runtime,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
-                metadata.agent_class
-            ],
+        instruction = get_instruction(instance, metadata)
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State | None = asyncio.run(
+            run_controller(
+                config=config,
+                initial_user_action=MessageAction(content=instruction),
+                runtime=runtime,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                    metadata.agent_class
+                ],
+            )
        )
-    )

-    # ======= THIS IS SWE-Bench specific =======
-    # Get git patch
-    return_val = complete_runtime(runtime, instance)
-    git_patch = return_val['git_patch']
-    logger.info(
-        f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
-    )
-
-    runtime.close()
+        # ======= THIS IS SWE-Bench specific =======
+        # Get git patch
+        return_val = complete_runtime(runtime, instance)
+        git_patch = return_val['git_patch']
+        logger.info(
+            f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
+        )
+    finally:
+        runtime.close()
    # ==========================================

    # ======= Attempt to evaluate the agent's edits =======
@@ -448,7 +471,7 @@ if __name__ == '__main__':
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    details = {}
-    _agent_cls = agenthub.Agent.get_cls(args.agent_cls)
+    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
    if hasattr(_agent_cls, 'system_message'):
        details['system_message'] = _agent_cls.system_message
    if hasattr(_agent_cls, 'in_context_example'):
@@ -2,26 +2,32 @@


 # API base URL
-BASE_URL="https://api.all-hands.dev/v0"
+BASE_URL="https://runtime.eval.all-hands.dev"

 # Get the list of runtimes
-response=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
+response=$(curl --silent --location --request GET "${BASE_URL}/list" \
  --header "X-API-Key: ${ALLHANDS_API_KEY}")

 n_runtimes=$(echo $response | jq -r '.total')
 echo "Found ${n_runtimes} runtimes. Stopping them..."

 runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')
-# Loop through each runtime and stop it
-counter=1
-for runtime_id in $runtime_ids; do
+
+# Function to stop a single runtime
+stop_runtime() {
+  local runtime_id=$1
+  local counter=$2
  echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
-  curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
+  curl --silent --location --request POST "${BASE_URL}/stop" \
    --header "X-API-Key: ${ALLHANDS_API_KEY}" \
    --header "Content-Type: application/json" \
    --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
  echo
-  ((counter++))
-done
+}
+export -f stop_runtime
+export BASE_URL ALLHANDS_API_KEY n_runtimes
+
+# Use GNU Parallel to stop runtimes in parallel.
+# --no-run-if-empty: when the runtime list is empty, `echo` still emits one
+# blank line; without this flag parallel would invoke stop_runtime once with
+# an empty runtime_id and POST a bogus stop request.
+echo "$runtime_ids" | parallel --no-run-if-empty -j 16 --progress stop_runtime {} {#}

 echo "All runtimes have been stopped."
@@ -22,11 +22,10 @@ if [ -z "$SET" ]; then
    SET="lite"
 fi

-NAMESPACE=$2 # xingyaoww
-if [ -z "$NAMESPACE" ]; then
-    echo "Default to namespace: xingyaoww"
-    NAMESPACE="xingyaoww"
-fi
+# Check if namespace is provided via argument $3, otherwise default to 'xingyaoww'
+NAMESPACE=${3:-xingyaoww}
+
+echo "Using namespace: $NAMESPACE"

 if [ "$SET" == "lite" ]; then
    IMAGE_FILE="$(dirname "$0")/all-swebench-lite-instance-images.txt"
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""Convert OpenHands output to a readable markdown format for visualization."""
+
+import argparse
+import json
+import os
+
+import pandas as pd
+from tqdm import tqdm
+
+from evaluation.swe_bench.eval_infer import process_git_patch
+from openhands.events.serialization import event_from_dict
+
+tqdm.pandas()
+
+parser = argparse.ArgumentParser()
+parser.add_argument('oh_output_file', type=str)
+args = parser.parse_args()
+output_md_folder = args.oh_output_file.replace('.jsonl', '.viz')
+print(f'Converting {args.oh_output_file} to markdown files in {output_md_folder}')
+
+oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
+# model name is the folder name of oh_output_file
+model_name = os.path.basename(os.path.dirname(args.oh_output_file))
+
+
+def convert_history_to_str(history):
+    ret = ''
+    separator = '\n\n' + '-' * 100 + '\n'
+
+    for i, event in enumerate(history):
+        if i != 0:
+            ret += separator
+
+        if isinstance(event, list):
+            # "event" is a legacy pair of (action, observation)
+            event_obj = event_from_dict(event[0])
+            ret += f'## {i+1}| {event_obj.__class__.__name__}\n\n'
+            ret += str(event_obj)
+            ret += separator
+
+            event_obj = event_from_dict(event[1])
+            ret += f'## {i+1}| {event_obj.__class__.__name__}\n\n'
+            ret += str(event_obj)
+        else:
+            # "event" is a single event
+            event_obj = event_from_dict(event)
+            ret += f'## {i+1}| {event_obj.__class__.__name__}\n\n'
+            ret += str(event_obj)
+    return ret
+
+
+def write_row_to_md_file(row):
+    if 'git_patch' in row:
+        model_patch = row['git_patch']
+    elif 'test_result' in row and 'git_patch' in row['test_result']:
+        model_patch = row['test_result']['git_patch']
+    else:
+        raise ValueError(f'Row {row} does not have a git_patch')
+
+    if 'report' in row:
+        resolved = row['report'].get('resolved', False)
+    else:
+        resolved = None
+
+    instance_id = row['instance_id']
+    filename = f'{str(resolved).lower()}.{instance_id}.md'
+    os.makedirs(output_md_folder, exist_ok=True)
+    filepath = os.path.join(output_md_folder, filename)
+
+    with open(filepath, 'w') as f:
+        f.write(f'# {instance_id} (resolved: {resolved})\n')
+
+        # MetaData
+        f.write('## MetaData\n')
+        f.write('```json\n')
+        f.write(json.dumps(row['metadata'], indent=2))
+        f.write('\n```\n')
+
+        # Trajectory
+        f.write('## History\n')
+        f.write(convert_history_to_str(row['history']))
+
+        f.write('## Model Patch\n')
+        f.write(f'{process_git_patch(model_patch)}\n')
+
+
+oh_format.progress_apply(write_row_to_md_file, axis=1)
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+import argparse
+import json
+from collections import Counter
+
+ERROR_KEYWORDS = [
+    'Agent encountered an error while processing the last action',
+    'APIError',
+    'Action execution failed',
+]
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('output_file', type=str, help='The file to summarize')
+    args = parser.parse_args()
+
+    with open(args.output_file, 'r') as file:
+        lines = file.readlines()
+
+    num_lines = len(lines)
+    num_error_lines = 0
+    num_agent_stuck_in_loop = 0
+
+    num_resolved = 0
+    num_empty_patch = 0
+
+    error_counter = Counter()
+
+    for line in lines:
+        _d = json.loads(line)
+        patch = _d.get('test_result', {}).get('git_patch', '')
+        if patch == '':
+            num_empty_patch += 1
+            continue
+
+        report = _d.get('report', {}) or {}
+        resolved = report.get('resolved', False)
+        if resolved:
+            num_resolved += 1
+
+        error = _d.get('error', None)
+
+        if error is not None and isinstance(error, str):
+            agent_stuck_in_loop = 'Agent got stuck in a loop' in error
+            contains_error = bool(error) and not agent_stuck_in_loop
+            if agent_stuck_in_loop:
+                error_counter['Agent got stuck in a loop'] += 1
+                num_agent_stuck_in_loop += 1
+            elif contains_error:
+                error_counter[error] += 1
+            continue
+
+        for keyword in ERROR_KEYWORDS:
+            if keyword in line:
+                error_counter[keyword] += 1
+                num_error_lines += 1
+                break
+
+    # print the error counter (with percentage)
+    print('-' * 100)
+    print(
+        f'# of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
+    )
+    print(
+        f'# of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
+    )
+    print(
+        f'# of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
+    )
+    print(
+        f'# of loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
+    )
+    print('-' * 100)
+    print('Detailed error breakdown:')
+    for error, count in error_counter.items():
+        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
@@ -10,16 +10,36 @@ parser.add_argument('input_file', type=str)
 args = parser.parse_args()

 dirname = os.path.dirname(args.input_file)
-report_json = os.path.join(dirname, 'report.json')

 df = pd.read_json(args.input_file, lines=True)

-output_md_filepath = os.path.join(dirname, 'README.md')
 instance_id_to_status = defaultdict(
-    lambda: {'resolved': False, 'empty_generation': False}
+    lambda: {
+        'empty_generation': False,
+        'resolved': False,
+        'failed_apply_patch': False,
+        'error_eval': False,
+        'test_timeout': False,
+    }
 )
-if os.path.exists(report_json):
-    with open(report_json, 'r') as f:
+
+
+# Apply the status to the dataframe
+def apply_report(row):
+    instance_id = row['instance_id']
+    if instance_id in instance_id_to_status:
+        return dict(instance_id_to_status[instance_id])
+    return row.get('report', {})
+
+
+swebench_official_report_json = os.path.join(dirname, 'report.json')
+openhands_remote_report_jsonl = args.input_file.replace(
+    '.jsonl', '.swebench_eval.jsonl'
+)
+
+if os.path.exists(swebench_official_report_json):
+    output_md_filepath = os.path.join(dirname, 'README.md')
+    with open(swebench_official_report_json, 'r') as f:
        report = json.load(f)

    output_md = (
@@ -70,15 +90,101 @@ if os.path.exists(report_json):
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

-    # Apply the status to the dataframe
-    def apply_report(row):
-        instance_id = row['instance_id']
-        if instance_id in instance_id_to_status:
-            return dict(instance_id_to_status[instance_id])
-        return row.get('report', {})
-
    df['report'] = df.apply(apply_report, axis=1)

+    with open(output_md_filepath, 'w') as f:
+        f.write(output_md)
+
+elif os.path.exists(openhands_remote_report_jsonl):
+    output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md')
+
+    df_eval = pd.read_json(openhands_remote_report_jsonl, lines=True, orient='records')
+
+    assert len(df['instance_id'].unique()) == len(
+        df
+    ), 'There are duplicate instance ids in the original output which is not allowed'
+    assert len(df_eval['instance_id'].unique()) == len(
+        df_eval
+    ), 'There are duplicate instance ids in the eval report which is not allowed'
+
+    for _, row in df_eval.iterrows():
+        instance_id_to_status[row['instance_id']] = row['test_result']['report']
+    df['report'] = df.apply(apply_report, axis=1)
+
+    _n_instances = len(df)
+    _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
+    _n_unresolved = _n_instances - _n_resolved
+    _n_empty_patch = len(
+        df[df['report'].apply(lambda x: x.get('empty_generation', False))]
+    )
+    _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))])
+    output_md = (
+        '# SWE-bench Report\n'
+        'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
+        '## Summary\n'
+        f'- submitted instances: {_n_instances}\n'
+        f'- empty patch instances: {_n_empty_patch}\n'
+        f'- resolved instances: {_n_resolved}\n'
+        f'- unresolved instances: {_n_unresolved}\n'
+        f'- error instances: {_n_error}\n'
+    )
+
+    def _instance_id_to_log_path(instance_id):
+        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
+        # make it relative path
+        path = os.path.relpath(path, start=dirname)
+        return path
+
+    output_md += '\n## Resolved Instances\n'
+    # instance_id to status
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('resolved', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['resolved'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Unresolved Instances\n'
+    for instance_id in sorted(
+        df[~df['report'].apply(lambda x: x.get('resolved', False))][
+            'instance_id'
+        ].unique()
+    ):
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Error Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('error_eval', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['error_eval'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Empty Patch Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('empty_generation', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['empty_generation'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Incomplete Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('test_timeout', False))][
+            'instance_id'
+        ].unique()
+    ):
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+    with open(output_md_filepath, 'w') as f:
+        f.write(output_md)
+else:
+    print(
+        f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.'
+    )
+    exit()

 if os.path.exists(args.input_file + '.bak'):
    conf = input('Existing backup file found. Do you want to overwrite it? (y/n)')
@@ -89,6 +195,3 @@ if os.path.exists(args.input_file + '.bak'):
 # backup the original file
 os.rename(args.input_file, args.input_file + '.bak')
 df.to_json(args.input_file, orient='records', lines=True)
-
-with open(output_md_filepath, 'w') as f:
-    f.write(output_md)
--- a/Show More
+++ b/Show More