random changes to agent

Simplify makefile (#4591 )
fix(frontend): Record events sent to WS (#4596 )
2026-04-29 03:00:45 -04:00 · 2024-10-28 16:19:15 -04:00 · 2024-10-28 13:10:32 -04:00 · 2024-10-28 15:53:31 +00:00 · 2024-10-28 17:26:28 +04:00 · 2024-10-28 16:42:17 +04:00
997 changed files with 56796 additions and 64322 deletions
@@ -1 +1 @@
-The files in this directory configure a development container for GitHub Codespaces.
+The files in this directory configure a development container for GitHub Codespaces.
@@ -2,7 +2,5 @@
 sudo apt update
 sudo apt install -y netcat
 sudo add-apt-repository -y ppa:deadsnakes/ppa
-sudo apt install -y python3.11
-curl -sSL https://install.python-poetry.org | python3.11 -
-# chromadb requires SQLite > 3.35 but SQLite in Python3.11.9 comes with 3.31.1
-sudo cp /opt/conda/lib/libsqlite3.so.0 /lib/x86_64-linux-gnu/libsqlite3.so.0
+sudo apt install -y python3.12
+curl -sSL https://install.python-poetry.org | python3.12 -
@@ -5,71 +5,55 @@ labels: ['bug']
 body:
  - type: markdown
    attributes:
-      value: Thank you for taking the time to fill out this bug report. We greatly appreciate your effort to complete this template fully. Please provide as much information as possible to help us understand and address the issue effectively.
+      value: Thank you for taking the time to fill out this bug report. Please provide as much information as possible to help us understand and address the issue effectively.

  - type: checkboxes
    attributes:
      label: Is there an existing issue for the same bug?
      description: Please check if an issue already exists for the bug you encountered.
      options:
-      - label: I have checked the troubleshooting document at https://docs.all-hands.dev/modules/usage/troubleshooting
-        required: true
      - label: I have checked the existing issues.
        required: true

  - type: textarea
    id: bug-description
    attributes:
-      label: Describe the bug
-      description: Provide a short description of the problem.
+      label: Describe the bug and reproduction steps
+      description: Provide a description of the issue along with any reproduction steps.
    validations:
      required: true

-  - type: textarea
-    id: current-version
+  - type: dropdown
+    id: installation
    attributes:
-      label: Current OpenHands version
-      description: What version of OpenHands are you using? If you're running in docker, tell us the tag you're using (e.g. ghcr.io/all-hands-ai/openhands:0.3.1).
-      render: bash
-    validations:
-      required: true
+      label: OpenHands Installation
+      description: How are you running OpenHands?
+      options:
+        - Docker command in README
+        - Development workflow
+      default: 0

-  - type: textarea
-    id: config
+  - type: input
+    id: openhands-version
    attributes:
-      label: Installation and Configuration
-      description: Please provide any commands you ran and any configuration (redacting API keys)
-      render: bash
-    validations:
-      required: true
+      label: OpenHands Version
+      description: What version of OpenHands are you using?
+      placeholder: ex. 0.9.8, main, etc.

-  - type: textarea
-    id: model-agent
-    attributes:
-      label: Model and Agent
-      description: What model and agent are you using? You can see these settings in the UI by clicking the settings wheel.
-      placeholder: |
-        - Model:
-        - Agent:
-
-  - type: textarea
-    id: os-version
+  - type: dropdown
+    id: os
    attributes:
      label: Operating System
-      description: What Operating System are you using? Linux, Mac OS, WSL on Windows
-
-  - type: textarea
-    id: repro-steps
-    attributes:
-      label: Reproduction Steps
-      description: Please list the steps to reproduce the issue.
-      placeholder: |
-        1.
-        2.
-        3.
+      options:
+        - MacOS
+        - Linux
+        - WSL on Windows

  - type: textarea
    id: additional-context
    attributes:
      label: Logs, Errors, Screenshots, and Additional Context
-      description: If you want to share the chat history you can click the thumbs-down (👎) button above the input field and you will get a shareable link (you can also click thumbs up when things are going well of course!). LLM logs will be stored in the `logs/llm/default` folder. Please add any additional context about the problem here.
+      description: Please provide any additional information you think might help. If you want to share the chat history
+        you can click the thumbs-down (👎) button above the input field and you will get a shareable link
+        (you can also click thumbs up when things are going well of course!). LLM logs will be stored in the
+        `logs/llm/default` folder. Please add any additional context about the problem here.
@@ -1,21 +1,35 @@
-# To get started with Dependabot version updates, you'll need to specify which
-# package ecosystems to update and where the package manifests are located.
-# Please see the documentation for all configuration options:
-# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
-
 version: 2
 updates:
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "daily"
-    open-pull-requests-limit: 20
+    open-pull-requests-limit: 1
+    groups:
+      # put packages in their own group if they have a history of breaking the build or needing to be reverted
+      pre-commit:
+        patterns:
+          - "pre-commit"
+      llama:
+        patterns:
+          - "llama*"
+      chromadb:
+        patterns:
+          - "chromadb"
+      security-all:
+        applies-to: "security-updates"
+        patterns:
+          - "*"
+      version-all:
+        applies-to: "version-updates"
+        patterns:
+          - "*"

  - package-ecosystem: "npm"
    directory: "/frontend"
    schedule:
      interval: "daily"
-    open-pull-requests-limit: 20
+    open-pull-requests-limit: 1
    groups:
      docusaurus:
        patterns:
@@ -23,12 +37,21 @@ updates:
      eslint:
        patterns:
          - "*eslint*"
+      security-all:
+        applies-to: "security-updates"
+        patterns:
+          - "*"
+      version-all:
+        applies-to: "version-updates"
+        patterns:
+          - "*"

  - package-ecosystem: "npm"
    directory: "/docs"
    schedule:
-      interval: "daily"
-    open-pull-requests-limit: 20
+      interval: "weekly"
+      day: "wednesday"
+    open-pull-requests-limit: 1
    groups:
      docusaurus:
        patterns:
@@ -36,3 +59,11 @@ updates:
      eslint:
        patterns:
          - "*eslint*"
+      security-all:
+        applies-to: "security-updates"
+        patterns:
+          - "*"
+      version-all:
+        applies-to: "version-updates"
+        patterns:
+          - "*"
@@ -1,6 +1,6 @@
-**What is the problem that this fixes or functionality that this introduces? Does it fix any open issues?**
-
+**End-user friendly description of the problem this fixes or functionality that this introduces**

+- [ ] Include this change in the Release Notes. If checked, you must provide an **end-user friendly** description for your change below

 ---
 **Give a summary of what the PR does, explaining any non-trivial design decisions**
@@ -8,4 +8,4 @@


 ---
-**Other references**
+**Link of any specific issues this addresses**
@@ -14,6 +14,11 @@ on:
    branches:
      - main

+# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  # Build the documentation website
  build:
@@ -32,7 +37,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: '3.12'
      - name: Generate Python Docs
        run: rm -rf docs/modules/python && pip install pydoc-markdown && pydoc-markdown
      - name: Install dependencies
@@ -9,25 +9,48 @@ on:
    - main
  pull_request:

+# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Install poetry via pipx
+        run: pipx install poetry
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
-      - name: Set up environment
-        run: |
-          curl -sSL https://install.python-poetry.org | python3 -
-          poetry install --without evaluation,llama-index
-          poetry run playwright install --with-deps chromium
-          wget https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json -P /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/1_Pooling/
+          python-version: '3.12'
+          cache: 'poetry'
+      - name: Install Python dependencies using Poetry
+        run: poetry install --without evaluation,llama-index
+      - name: Build Environment
+        run: make build
      - name: Run tests
        run: |
          set -e
-          poetry run python openhands/core/main.py -t "do a flip" -d ./workspace/ -c DummyAgent
+          SANDBOX_FORCE_REBUILD_RUNTIME=True poetry run python3 openhands/core/main.py -t "do a flip" -d ./workspace/ -c DummyAgent
      - name: Check exit code
        run: |
          if [ $? -ne 0 ]; then
@@ -12,6 +12,11 @@ on:
      - 'frontend/**'
      -  '.github/workflows/fe-unit-tests.yml'

+# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  # Run frontend unit tests
  fe-test:
@@ -0,0 +1,401 @@
+# Builds the OpenHands and runtime Docker images, tests them, and then publishes them
+# to the ghcr.io repository
+name: 'Build, Test and Publish RT Image'
+
+# Triggers:
+#   - every push to "main"
+#   - every tag push
+#   - every pull request
+#   - manual dispatch (a reason must be supplied)
+on:
+  push:
+    branches: [main]
+    tags: ['*']
+  pull_request:
+  workflow_dispatch:
+    inputs:
+      reason:
+        description: "Reason for manual trigger"
+        required: true
+        default: ""
+
+# Runs triggered by the same PR share one group (keyed on the ref), so a newer run cancels the older one.
+# Runs without a head ref (e.g. commits on main) are keyed on the run id, so each lands in its own group.
+concurrency:
+  group: "${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}"
+  cancel-in-progress: true
+
+env:
+  BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST: "nikolaik/python-nodejs:python3.12-nodejs22"
+  RELEVANT_SHA: "${{ github.event.pull_request.head.sha || github.sha }}"
+
+jobs:
+  # Builds the OpenHands Docker images
+  # Non-fork runs push the app image; fork PRs only load it into the local Docker daemon.
+  # The job also runs runtime_build.py inside the freshly built app image and exposes the
+  # build-directory hash it prints as the `hash_from_app_image` job output.
+  ghcr_build_app:
+    name: Build App Image
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    outputs:
+      hash_from_app_image: ${{ steps.get_hash_in_app_image.outputs.hash_from_app_image }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3.0.0
+        with:
+          image: tonistiigi/binfmt:latest
+      - name: Login to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      # Exactly one of the two build steps below runs, depending on whether the PR comes from a fork
+      - name: Build and push app image
+        if: "!github.event.pull_request.head.repo.fork"
+        run: |
+          ./containers/build.sh -i openhands -o ${{ github.repository_owner }} --push
+      - name: Build app image
+        if: "github.event.pull_request.head.repo.fork"
+        run: |
+          ./containers/build.sh -i openhands -o ${{ github.repository_owner }} --load
+      - name: Get hash in App Image
+        id: get_hash_in_app_image
+        run: |
+          # Lowercase the repository owner
+          export REPO_OWNER=${{ github.repository_owner }}
+          REPO_OWNER=$(echo "$REPO_OWNER" | tr '[:upper:]' '[:lower:]')
+          # Run the build script in the app image.
+          # The default step shell does not enable pipefail, so without it a failing
+          # `docker run` would be hidden behind the exit status of `tee`.
+          set -o pipefail
+          docker run -e SANDBOX_USER_ID=0 -v /var/run/docker.sock:/var/run/docker.sock ghcr.io/${REPO_OWNER}/openhands:${{ env.RELEVANT_SHA }} /bin/bash -c "mkdir -p containers/runtime; python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild" 2>&1 | tee docker-outputs.txt
+          # pipefail is switched off again: a non-matching grep below must reach the explicit check, not abort silently
+          set +o pipefail
+          # Get the hash from the build script
+          hash_from_app_image=$(grep "Hash for docker build directory" docker-outputs.txt | awk -F "): " '{print $2}' | uniq | head -n1)
+          # Fail here with a clear message instead of exporting an empty hash to downstream jobs
+          if [ -z "$hash_from_app_image" ]; then
+            echo "::error::Could not find a build-directory hash in the app image output"
+            exit 1
+          fi
+          echo "hash_from_app_image=$hash_from_app_image" >> "$GITHUB_OUTPUT"
+          echo "Hash from app image: $hash_from_app_image"
+
+  # Builds the runtime Docker images
+  # Each matrix entry pairs a base image reference (`image`) with a short `tag`; the tag is handed to
+  # build.sh via -t and is also the suffix of the fork image tag and of the uploaded artifact name.
+  ghcr_build_runtime:
+    name: Build Image
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    strategy:
+      matrix:
+        base_image:
+          - image: 'nikolaik/python-nodejs:python3.12-nodejs22'
+            tag: nikolaik
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3.0.0
+        with:
+          image: tonistiigi/binfmt:latest
+      - name: Login to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      # Restores Poetry's cache and virtualenv directories; the key changes whenever any poetry.lock changes
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.virtualenvs
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Install Python dependencies using Poetry
+        run: make install-python-dependencies
+      # Writes into containers/runtime, the same folder the fork build below uses as its build context
+      - name: Create source distribution and Dockerfile
+        run: poetry run python3 openhands/runtime/utils/runtime_build.py --base_image ${{ matrix.base_image.image }} --build_folder containers/runtime --force_rebuild
+      # `!= true` also holds when there is no pull_request payload at all (push, tag, manual dispatch)
+      - name: Build and push runtime image ${{ matrix.base_image.image }}
+        if: github.event.pull_request.head.repo.fork != true
+        run: |
+          ./containers/build.sh -i runtime -o ${{ github.repository_owner }} --push -t ${{ matrix.base_image.tag }}
+      # Forked repos can't push to GHCR, so we need to upload the image as an artifact
+      # The image is exported to a tarball under /tmp instead of being pushed.
+      # NOTE(review): the tag hard-codes the `all-hands-ai` owner, while the test jobs derive the owner
+      # from github.repository_owner (lowercased) - confirm the two always agree.
+      - name: Build runtime image ${{ matrix.base_image.image }} for fork
+        if: github.event.pull_request.head.repo.fork
+        uses: docker/build-push-action@v6
+        with:
+          tags: ghcr.io/all-hands-ai/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image.tag }}
+          outputs: type=docker,dest=/tmp/runtime-${{ matrix.base_image.tag }}.tar
+          context: containers/runtime
+      - name: Upload runtime image for fork
+        if: github.event.pull_request.head.repo.fork
+        uses: actions/upload-artifact@v4
+        with:
+          name: runtime-${{ matrix.base_image.tag }}
+          path: /tmp/runtime-${{ matrix.base_image.tag }}.tar
+
+  # Checks that runtime_build.py produces the same build-directory hash when run from the checked-out
+  # code as it did when run inside the app image (the hash exported by ghcr_build_app)
+  verify_hash_equivalence_in_runtime_and_app:
+    name: Verify Hash Equivalence in Runtime and Docker images
+    runs-on: ubuntu-latest
+    needs: [ghcr_build_runtime, ghcr_build_app]
+    strategy:
+      fail-fast: false
+      matrix:
+        base_image: ['nikolaik']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.virtualenvs
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Install Python dependencies using Poetry
+        run: make install-python-dependencies
+      - name: Get hash in App Image
+        run: |
+          echo "Hash from app image: ${{ needs.ghcr_build_app.outputs.hash_from_app_image }}"
+          echo "hash_from_app_image=${{ needs.ghcr_build_app.outputs.hash_from_app_image }}" >> $GITHUB_ENV
+
+      - name: Get hash using code (development mode)
+        run: |
+          mkdir -p containers/runtime
+          poetry run python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild > output.txt 2>&1
+          hash_from_code=$(grep "Hash for docker build directory" output.txt | awk -F "): " '{print $2}' | uniq | head -n1)
+          echo "hash_from_code=$hash_from_code" >> $GITHUB_ENV
+
+      - name: Compare hashes
+        run: |
+          echo "Hash from App Image: ${{ env.hash_from_app_image }}"
+          echo "Hash from Code: ${{ env.hash_from_code }}"
+          # Two empty strings compare equal, so a failed extraction on both sides must not count as a match
+          if [ -z "${{ env.hash_from_app_image }}" ] || [ -z "${{ env.hash_from_code }}" ]; then
+            echo "At least one hash is empty, cannot verify equivalence!"
+            exit 1
+          fi
+          if [ "${{ env.hash_from_app_image }}" = "${{ env.hash_from_code }}" ]; then
+            echo "Hashes match!"
+          else
+            echo "Hashes do not match!"
+            exit 1
+          fi
+
+  # Run unit tests with the EventStream runtime Docker images as root
+  # The matrix values are the short image tags: they select the artifact (`runtime-<tag>`) on fork PRs
+  # and form the suffix of the image name under test.
+  test_runtime_root:
+    name: RT Unit Tests (Root)
+    needs: [ghcr_build_runtime]
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        base_image: ['nikolaik']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      # Forked repos can't push to GHCR, so we need to download the image as an artifact
+      - name: Download runtime image for fork
+        if: github.event.pull_request.head.repo.fork
+        uses: actions/download-artifact@v4
+        with:
+          name: runtime-${{ matrix.base_image }}
+          path: /tmp
+      # Imports the downloaded tarball into the local Docker daemon
+      - name: Load runtime image for fork
+        if: github.event.pull_request.head.repo.fork
+        run: |
+          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
+      # Restores Poetry's cache and virtualenv directories; the key changes whenever any poetry.lock changes
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.virtualenvs
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Install Python dependencies using Poetry
+        run: make install-python-dependencies
+      # Runs ./tests/runtime with 3 parallel workers; failed tests are re-run up to 2 times, 5 seconds apart.
+      # The image name is lowercased before being passed in SANDBOX_RUNTIME_CONTAINER_IMAGE.
+      # RUN_AS_OPENHANDS=false presumably selects the root variant of the tests - confirm in tests/runtime.
+      - name: Run runtime tests
+        run: |
+          # We install pytest-xdist in order to run tests across CPUs
+          poetry run pip install pytest-xdist
+
+          # Install to be able to retry on failures for flaky tests
+          poetry run pip install pytest-rerunfailures
+
+          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
+          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
+
+          SKIP_CONTAINER_LOGS=true \
+          TEST_RUNTIME=eventstream \
+          SANDBOX_USER_ID=$(id -u) \
+          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
+          TEST_IN_CI=true \
+          RUN_AS_OPENHANDS=false \
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+
+  # Run unit tests with the EventStream runtime Docker images as openhands user
+  # The matrix values are the short image tags: they select the artifact (`runtime-<tag>`) on fork PRs
+  # and form the suffix of the image name under test.
+  test_runtime_oh:
+    name: RT Unit Tests (openhands)
+    runs-on: ubuntu-latest
+    needs: [ghcr_build_runtime]
+    strategy:
+      # Same setting as the root test job: one failing base image must not cancel the others
+      fail-fast: false
+      matrix:
+        base_image: ['nikolaik']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      # Forked repos can't push to GHCR, so we need to download the image as an artifact
+      - name: Download runtime image for fork
+        if: github.event.pull_request.head.repo.fork
+        uses: actions/download-artifact@v4
+        with:
+          name: runtime-${{ matrix.base_image }}
+          path: /tmp
+      # Imports the downloaded tarball into the local Docker daemon
+      - name: Load runtime image for fork
+        if: github.event.pull_request.head.repo.fork
+        run: |
+          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
+      # Restores Poetry's cache and virtualenv directories; the key changes whenever any poetry.lock changes
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.virtualenvs
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Install Python dependencies using Poetry
+        run: make install-python-dependencies
+      # Runs ./tests/runtime with 3 parallel workers; failed tests are re-run up to 2 times, 5 seconds apart.
+      # RUN_AS_OPENHANDS=true presumably selects the non-root (openhands user) variant - confirm in tests/runtime.
+      - name: Run runtime tests
+        run: |
+          # We install pytest-xdist in order to run tests across CPUs
+          poetry run pip install pytest-xdist
+
+          # Install to be able to retry on failures for flaky tests
+          poetry run pip install pytest-rerunfailures
+
+          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
+          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
+
+          SKIP_CONTAINER_LOGS=true \
+          TEST_RUNTIME=eventstream \
+          SANDBOX_USER_ID=$(id -u) \
+          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
+          TEST_IN_CI=true \
+          RUN_AS_OPENHANDS=true \
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+
+  # The two following jobs (named identically) report whether all the runtime tests have passed, because
+  # "All Runtime Tests Passed" is a required job for PRs to merge.
+  # Due to this bug: https://github.com/actions/runner/issues/2566, we want to create a job that runs when the
+  # prerequisites have been cancelled or failed so merging is disallowed, otherwise Github considers "skipped" as "success".
+  # A prerequisite reports "skipped" (not "failure") when one of its own upstream jobs failed, e.g. an image
+  # build. None of the prerequisites is skipped on purpose, so a "skipped" result is treated as a failure too.
+  runtime_tests_check_success:
+    name: All Runtime Tests Passed
+    if: ${{ !cancelled() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && !contains(needs.*.result, 'skipped') }}
+    runs-on: ubuntu-latest
+    needs: [test_runtime_root, test_runtime_oh, verify_hash_equivalence_in_runtime_and_app]
+    steps:
+      - name: All tests passed
+        run: echo "All runtime tests have passed successfully!"
+
+  # Exact logical negation of the success job's condition, so exactly one of the two jobs runs
+  runtime_tests_check_fail:
+    name: All Runtime Tests Passed
+    if: ${{ cancelled() || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') || contains(needs.*.result, 'skipped') }}
+    runs-on: ubuntu-latest
+    needs: [test_runtime_root, test_runtime_oh, verify_hash_equivalence_in_runtime_and_app]
+    steps:
+      - name: Some tests failed
+        run: |
+          echo "Some runtime tests failed, were cancelled or were skipped"
+          exit 1
@@ -1,61 +0,0 @@
-# Workflow that builds, tests and then pushes the app docker images to the ghcr.io repository
-name: Build and Publish App Image
-
-
-# Always run on "main"
-# Always run on tags
-# Always run on PRs
-# Can also be triggered manually
-on:
-  push:
-    branches:
-      - main
-    tags:
-      - '*'
-  pull_request:
-  workflow_dispatch:
-    inputs:
-      reason:
-        description: 'Reason for manual trigger'
-        required: true
-        default: ''
-
-jobs:
-  # Builds the OpenHands Docker images
-  ghcr_build:
-    name: Build App Image
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: true
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: false
-          swap-storage: true
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-      - name: Login to GHCR
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build and export image
-        id: build
-        run: ./containers/build.sh openhands ${{ github.repository_owner }} --push
@@ -1,168 +0,0 @@
-# Workflow that builds, tests and then pushes the runtime docker images to the ghcr.io repository
-name: Build, Test and Publish Runtime Image
-
-# Only run one workflow of the same group at a time.
-# There can be at most one running and one pending job in a concurrency group at any time.
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-
-on:
-  push:
-    branches:
-      - main
-    tags:
-      - '*'
-  pull_request:
-  workflow_dispatch:
-    inputs:
-      reason:
-        description: 'Reason for manual trigger'
-        required: true
-        default: ''
-
-jobs:
-  # Builds the runtime Docker images
-  ghcr_build_runtime:
-    name: Build Image
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-    strategy:
-      matrix:
-        base_image: ['nikolaik/python-nodejs:python3.11-nodejs22', 'python:3.11-bookworm', 'node:22-bookworm']
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: true
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: false
-          swap-storage: true
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-      - name: Login to GHCR
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Install poetry via pipx
-        run: pipx install poetry
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-          cache: 'poetry'
-      - name: Install Python dependencies using Poetry
-        run: make install-python-dependencies
-      - name: Create source distribution and Dockerfile
-        run: poetry run python3 openhands/runtime/utils/runtime_build.py --base_image ${{ matrix.base_image }} --build_folder containers/runtime --force_rebuild
-      - name: Build and export image
-        id: build
-        run: |
-          suffix=$(echo "${{ matrix.base_image }}" | cut -d ':' -f 1 | cut -d '/' -f 1)
-          ./containers/build.sh runtime ${{ github.repository_owner }} --push $suffix
-
-  # Run unit tests with the EventStream runtime Docker images
-  test_runtime:
-    name: Test Runtime
-    runs-on: ubuntu-latest
-    needs: [ghcr_build_runtime]
-    strategy:
-      matrix:
-        base_image: ['nikolaik', 'python', 'node']
-    steps:
-      - uses: actions/checkout@v4
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          tool-cache: true
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          swap-storage: true
-      - name: Install poetry via pipx
-        run: pipx install poetry
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-          cache: 'poetry'
-      - name: Install Python dependencies using Poetry
-        run: make install-python-dependencies
-      - name: Run runtime tests
-        run: |
-          git_hash=$(git rev-parse --short "$GITHUB_SHA")
-          image_name=ghcr.io/${{ github.repository_owner }}/runtime:$git_hash-${{ matrix.base_image }}
-          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
-
-          TEST_RUNTIME=eventstream \
-          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
-          TEST_IN_CI=true \
-          poetry run pytest --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v4
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-
-  # Run integration tests with the eventstream runtime Docker image
-  runtime_integration_tests_on_linux:
-    name: Runtime Integration Tests on Linux
-    runs-on: ubuntu-latest
-    needs: [ghcr_build_runtime]
-    strategy:
-      fail-fast: false
-      matrix:
-        base_image: ['nikolaik', 'python', 'node']
-    steps:
-      - uses: actions/checkout@v4
-      - name: Install poetry via pipx
-        run: pipx install poetry
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-          cache: 'poetry'
-      - name: Install Python dependencies using Poetry
-        run: make install-python-dependencies
-      - name: Run integration tests
-        run: |
-          git_hash=$(git rev-parse --short "$GITHUB_SHA")
-          image_name=ghcr.io/${{ github.repository_owner }}/runtime:$git_hash-${{ matrix.base_image }}
-          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
-
-          TEST_RUNTIME=eventstream \
-          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
-          TEST_IN_CI=true \
-          TEST_ONLY=true \
-          ./tests/integration/regenerate.sh
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v4
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-
-  # Checks that all runtime tests have passed
-  all_runtime_tests_passed:
-    name: All Runtime Tests Passed
-    runs-on: ubuntu-latest
-    needs: [test_runtime, runtime_integration_tests_on_linux]
-    steps:
-      - name: All tests passed
-        run: echo "All runtime tests have passed successfully!"
@@ -10,6 +10,11 @@ on:
    - main
  pull_request:

+# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  # Run lint on the frontend code
  lint-frontend:
@@ -41,9 +46,9 @@ jobs:
      - name: Set up python
        uses: actions/setup-python@v5
        with:
-          python-version: 3.11
+          python-version: 3.12
          cache: 'pip'
      - name: Install pre-commit
        run: pip install pre-commit==3.7.0
      - name: Run pre-commit hooks
-        run: pre-commit run --files openhands/**/* agenthub/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
+        run: pre-commit run --files openhands/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
@@ -0,0 +1,13 @@
+name: Resolve Issues with OpenHands
+
+on:
+  issues:
+    types: [labeled]  # runs each time a label is added to an issue
+
+jobs:
+  call-openhands-resolver:
+    uses: All-Hands-AI/openhands-resolver/.github/workflows/openhands-resolver.yml@main  # NOTE(review): follows the moving `main` ref; consider pinning to a tag or SHA
+    if: github.event.label.name == 'fix-me'  # skip unless the label just added is 'fix-me'
+    with:
+      issue_number: ${{ github.event.issue.number }}  # number of the issue that received the label
+    secrets: inherit  # forwards this repository's secrets to the called workflow
@@ -0,0 +1,97 @@
+# Workflow that runs python unit tests on mac
+name: Run Python Unit Tests Mac
+
+# This job is flaky so only run it nightly
+on:
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  # Run python unit tests on macOS
+  test-on-macos:
+    name: Python Unit Tests on macOS
+    runs-on: macos-14
+    env:
+      INSTALL_DOCKER: '1' # Set to '0' to skip Docker installation
+    strategy:
+      matrix:
+        python-version: ['3.12']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          # ~/Library/Caches/pypoetry is Poetry's default cache directory on macOS
+          path: |
+            ~/Library/Caches/pypoetry
+            ~/.cache/pypoetry
+            ~/.virtualenvs
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+      - name: Install poetry via pipx
+        run: pipx install poetry
+      - name: Install Python dependencies using Poetry
+        run: poetry install --without evaluation,llama-index
+      - name: Install & Start Docker
+        if: env.INSTALL_DOCKER == '1'
+        run: |
+          # Uninstall colima to upgrade to the latest version
+          if brew list colima &>/dev/null; then
+            brew uninstall colima
+            # unlinking colima dependency: go
+            brew uninstall go@1.21
+          fi
+          rm -rf ~/.colima ~/.lima
+          brew install --HEAD colima
+          brew install docker
+
+          # Start the Colima VM; returns non-zero on failure so the caller can retry
+          start_colima() {
+            # Pick a random SSH port in the range 10000-20000 (availability is not checked)
+            RANDOM_PORT=$((RANDOM % 10001 + 10000))
+
+            if ! colima start --network-address --arch x86_64 --cpu=1 --memory=1 --verbose --ssh-port "$RANDOM_PORT"; then
+              echo "Failed to start Colima."
+              return 1
+            fi
+            return 0
+          }
+
+          # Attempt to start Colima for 5 total attempts:
+          ATTEMPT_LIMIT=5
+          for ((i=1; i<=ATTEMPT_LIMIT; i++)); do
+
+            if start_colima; then
+              echo "Colima started successfully."
+              break
+            else
+              # Tear down the failed instance before retrying; give up after the last attempt
+              colima stop -f
+              sleep 10
+              colima delete -f
+              if [ "$i" -eq "$ATTEMPT_LIMIT" ]; then
+                exit 1
+              fi
+              sleep 10
+            fi
+          done
+
+          # For testcontainers to find the Colima socket
+          # https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
+          sudo ln -sf "$HOME/.colima/default/docker.sock" /var/run/docker.sock
+      - name: Build Environment
+        run: make build
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Run Tests
+        run: poetry run pytest --forked --cov=openhands --cov-report=xml ./tests/unit --ignore=tests/unit/test_memory.py
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -10,84 +10,12 @@ on:
      - main
  pull_request:

+# If triggered by a PR, it will be in the same group. However, each commit on main will be in its own unique group
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
-  # Run python unit tests on macOS
-  test-on-macos:
-    name: Python Unit Tests on macOS
-    runs-on: macos-12
-    env:
-      INSTALL_DOCKER: '1' # Set to '0' to skip Docker installation
-    strategy:
-      matrix:
-        python-version: ['3.11']
-    steps:
-      - uses: actions/checkout@v4
-      - name: Install poetry via pipx
-        run: pipx install poetry
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-          cache: 'poetry'
-      - name: Install Python dependencies using Poetry
-        run: poetry install --without evaluation,llama-index
-      - name: Install & Start Docker
-        if: env.INSTALL_DOCKER == '1'
-        run: |
-          INSTANCE_NAME="colima-${GITHUB_RUN_ID}"
-
-          # Uninstall colima to upgrade to the latest version
-          if brew list colima &>/dev/null; then
-            brew uninstall colima
-            # unlinking colima dependency: go
-            brew uninstall go@1.21
-          fi
-          rm -rf ~/.colima ~/.lima
-          brew install --HEAD colima
-          brew install docker
-
-          start_colima() {
-            # Find a free port in the range 10000-20000
-            RANDOM_PORT=$((RANDOM % 10001 + 10000))
-
-            # Original line:
-            if ! colima start --network-address --arch x86_64 --cpu=1 --memory=1 --verbose --ssh-port $RANDOM_PORT; then
-              echo "Failed to start Colima."
-              return 1
-            fi
-            return 0
-          }
-
-          # Attempt to start Colima for 5 total attempts:
-          ATTEMPT_LIMIT=5
-          for ((i=1; i<=ATTEMPT_LIMIT; i++)); do
-
-            if start_colima; then
-              echo "Colima started successfully."
-              break
-            else
-              colima stop -f
-              sleep 10
-              colima delete -f
-              if [ $i -eq $ATTEMPT_LIMIT ]; then
-                exit 1
-              fi
-              sleep 10
-            fi
-          done
-
-          # For testcontainers to find the Colima socket
-          # https://github.com/abiosoft/colima/blob/main/docs/FAQ.md#cannot-connect-to-the-docker-daemon-at-unixvarrundockersock-is-the-docker-daemon-running
-          sudo ln -sf $HOME/.colima/default/docker.sock /var/run/docker.sock
-      - name: Build Environment
-        run: make build
-      - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=openhands --cov-report=xml ./tests/unit
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v4
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-
  # Run python unit tests on Linux
  test-on-linux:
    name: Python Unit Tests on Linux
@@ -96,9 +24,12 @@ jobs:
      INSTALL_DOCKER: '0' # Set to '0' to skip Docker installation
    strategy:
      matrix:
-        python-version: ['3.11']
+        python-version: ['3.12']
    steps:
      - uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Set up Python
@@ -111,7 +42,7 @@ jobs:
      - name: Build Environment
        run: make build
      - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=openhands --cov-report=xml ./tests/unit
+        run: poetry run pytest --forked --cov=openhands --cov-report=xml -svv ./tests/unit --ignore=tests/unit/test_memory.py
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -0,0 +1,35 @@
+# Publishes the OpenHands PyPi package
+name: Publish PyPi Package
+
+# Triggered manually
+on:
+  workflow_dispatch:
+    inputs:
+      reason:
+        description: 'Reason for manual trigger'
+        required: true
+        default: ''
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      - name: Install Poetry
+        uses: snok/install-poetry@v1.4.1
+        with:
+          virtualenvs-in-project: true
+          virtualenvs-path: ~/.virtualenvs
+      - name: Install Poetry Dependencies
+        run: poetry install --no-interaction --no-root
+      - name: Build poetry project
+        run: ./build.sh
+      - name: publish
+        # Hand the token to Poetry through its documented environment variable
+        # rather than expanding the secret into the shell command line.
+        env:
+          POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }}
+        run: poetry publish
@@ -15,10 +15,13 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
+    - name: Set up Docker Buildx
+      id: buildx
+      uses: docker/setup-buildx-action@v3
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
-        python-version: '3.11'
+        python-version: '3.12'
    - name: install git, github cli
      run: |
        sudo apt-get install -y git gh
@@ -1,113 +0,0 @@
-# Workflow that uses OpenHands to resolve a GitHub issue. Issue must be labeled 'solve-this'
-name: Use OpenHands to Resolve GitHub Issue
-
-on:
-  issues:
-    types: [labeled]
-
-permissions:
-  contents: write
-  pull-requests: write
-  issues: write
-
-jobs:
-  dogfood:
-    if: github.event.label.name == 'solve-this'
-    runs-on: ubuntu-latest
-    container:
-      image: ghcr.io/all-hands-ai/openhands
-      volumes:
-        - /var/run/docker.sock:/var/run/docker.sock
-    steps:
-    - name: install git, github cli
-      run: apt-get install -y git gh
-    - name: Checkout Repository
-      uses: actions/checkout@v4
-    - name: Write Task File
-      env:
-        ISSUE_TITLE: ${{ github.event.issue.title }}
-        ISSUE_BODY: ${{ github.event.issue.body }}
-      run: |
-        echo "TITLE:" > task.txt
-        echo "${ISSUE_TITLE}" >> task.txt
-        echo "" >> task.txt
-        echo "BODY:" >> task.txt
-        echo "${ISSUE_BODY}" >> task.txt
-    - name: Set up environment
-      run: |
-        curl -sSL https://install.python-poetry.org | python3 -
-        export PATH="/github/home/.local/bin:$PATH"
-        poetry install --without evaluation,llama-index
-        poetry run playwright install --with-deps chromium
-    - name: Run OpenHands
-      env:
-        ISSUE_TITLE: ${{ github.event.issue.title }}
-        ISSUE_BODY: ${{ github.event.issue.body }}
-        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-      run: |
-        # Append path to launch poetry
-        export PATH="/github/home/.local/bin:$PATH"
-        # Append path to correctly import package, note: must set pwd at first
-        export PYTHONPATH=$(pwd):$PYTHONPATH
-        WORKSPACE_MOUNT_PATH=$GITHUB_WORKSPACE poetry run python ./openhands/core/main.py -i 50 -f task.txt -d $GITHUB_WORKSPACE
-        rm task.txt
-    - name: Setup Git, Create Branch, and Commit Changes
-      run: |
-        # Setup Git configuration
-        git config --global --add safe.directory $PWD
-        git config --global user.name 'OpenHands'
-        git config --global user.email 'OpenHands@users.noreply.github.com'
-
-        # Create a unique branch name with a timestamp
-        BRANCH_NAME="fix/${{ github.event.issue.number }}-$(date +%Y%m%d%H%M%S)"
-
-        # Checkout new branch
-        git checkout -b $BRANCH_NAME
-
-        # Add all changes to staging, except task.txt
-        git add --all -- ':!task.txt'
-
-        # Commit the changes, if any
-        git commit -m "OpenHands: Resolve Issue #${{ github.event.issue.number }}"
-        if [ $? -ne 0 ]; then
-          echo "No changes to commit."
-          exit 0
-        fi
-
-        # Push changes
-        git push --set-upstream origin $BRANCH_NAME
-    - name: Fetch Default Branch
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        # Fetch the default branch using gh cli
-        DEFAULT_BRANCH=$(gh repo view --json defaultBranchRef --jq .defaultBranchRef.name)
-        echo "Default branch is $DEFAULT_BRANCH"
-        echo "DEFAULT_BRANCH=$DEFAULT_BRANCH" >> $GITHUB_ENV
-    - name: Generate PR
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        # Create PR and capture URL
-        PR_URL=$(gh pr create \
-          --title "OpenHands: Resolve Issue #2" \
-          --body "This PR was generated by OpenHands to resolve issue #2" \
-          --repo "foragerr/OpenHands" \
-          --head "${{ github.head_ref }}" \
-          --base "${{ env.DEFAULT_BRANCH }}" \
-          | grep -o 'https://github.com/[^ ]*')
-
-        # Extract PR number from URL
-        PR_NUMBER=$(echo "$PR_URL" | grep -o '[0-9]\+$')
-
-        # Set environment vars
-        echo "PR_URL=$PR_URL" >> $GITHUB_ENV
-        echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV
-
-    - name: Post Comment
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        gh issue comment ${{ github.event.issue.number }} \
-          -b "OpenHands raised [PR #${{ env.PR_NUMBER }}](${{ env.PR_URL }}) to resolve this issue."
@@ -15,6 +15,7 @@ jobs:
          stale-issue-message: 'This issue is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
          stale-pr-message: 'This PR is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
          days-before-stale: 30
+          exempt-issue-labels: 'tracked'
          close-issue-message: 'This issue was closed because it has been stalled for over 30 days with no activity.'
          close-pr-message: 'This PR was closed because it has been stalled for over 30 days with no activity.'
          days-before-close: 7
@@ -121,6 +121,7 @@ celerybeat.pid

 # Environments
 .env
+frontend/.env
 .venv
 env/
 venv/
@@ -177,7 +178,6 @@ evaluation/toolqa/data
 # frontend

 # dependencies
-frontend/node_modules
 frontend/.pnp
 frontend/bun.lockb
 frontend/yarn.lock
@@ -217,8 +217,6 @@ config.toml
 config.toml_
 config.toml.bak

-containers/agnostic_sandbox
-
 # swe-bench-eval
 image_build_logs
 run_instance_logs
@@ -228,3 +226,5 @@ runtime_*.tar
 # docker build
 containers/runtime/Dockerfile
 containers/runtime/project.tar.gz
+containers/runtime/code
+**/node_modules/
@@ -0,0 +1,28 @@
+OpenHands is an automated AI software engineer. It is a repo with a Python backend
+(in the `openhands` directory) and TypeScript frontend (in the `frontend` directory).
+
+General Setup:
+- To set up the entire repo, including frontend and backend, run `make build`
+- To run linting and type-checking before finishing the job, run `poetry run pre-commit run --all-files --config ./dev_config/python/.pre-commit-config.yaml`
+
+Backend:
+- Located in the `openhands` directory
+- Testing:
+  - All tests are in `tests/unit/test_*.py`
+  - To test new code, run `poetry run pytest tests/unit/test_xxx.py` where `xxx` is the appropriate file for the current functionality
+  - Write all tests with pytest
+
+Frontend:
+- Located in the `frontend` directory
+- Prerequisites: A recent version of NodeJS / NPM
+- Setup: Run `npm install` in the frontend directory
+- Testing:
+  - Run tests: `npm run test`
+  - To run specific tests: `npm run test -- -t "TestName"`
+- Building:
+  - Build for production: `npm run build`
+- Environment Variables:
+  - Set in `frontend/.env` or as environment variables
+  - Available variables: VITE_BACKEND_HOST, VITE_USE_TLS, VITE_INSECURE_SKIP_VERIFY, VITE_FRONTEND_PORT
+- Internationalization:
+  - Generate i18n declaration file: `npm run make-i18n`
@@ -2,95 +2,70 @@

 Thanks for your interest in contributing to OpenHands! We welcome and appreciate contributions.

-## How Can I Contribute?
-
-There are many ways that you can contribute:
-
-1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
-2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
-3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issue](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) issues that may be ones to start on.
-
 ## Understanding OpenHands's CodeBase

 To understand the codebase, please refer to the README in each module:
 - [frontend](./frontend/README.md)
- [agenthub](./agenthub/README.md)
 - [evaluation](./evaluation/README.md)
 - [openhands](./openhands/README.md)
-    - [server](./openhands/server/README.md)
+   - [agenthub](./openhands/agenthub/README.md)
+   - [server](./openhands/server/README.md)

+## Setting up your development environment
+
+We have a separate doc [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) that tells you how to set up a development workflow.
+
+## How can I contribute?
+
+There are many ways that you can contribute:
+
+1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
+2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
+3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that may be ones to start on.
+
+## What can I build?
+Here are a few ways you can help improve the codebase.
+
+#### UI/UX
+We're always looking to improve the look and feel of the application. If you've got a small fix
+for something that's bugging you, feel free to open up a PR that changes the `./frontend` directory.
+
+If you're looking to make a bigger change, add a new UI element, or significantly alter the style
+of the application, please open an issue first or, better yet, join the #frontend channel in our Slack
+to gather consensus from our design team.
+
+#### Improving the agent
+Our main agent is the CodeAct agent. You can [see its prompts here](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub/codeact_agent).
+
+Changes to these prompts, and to the underlying behavior in Python, can have a huge impact on user experience.
+You can try modifying the prompts to see how they change the behavior of the agent as you use the app
+locally, but we will need to do an end-to-end evaluation of any changes here to ensure that the agent
+is getting better over time.
+
+We use the [SWE-bench](https://www.swebench.com/) benchmark to test our agent. You can join the #evaluation
+channel in Slack to learn more.
+
+#### Adding a new agent
+You may want to experiment with building new types of agents. You can add an agent to `openhands/agenthub`
+to help expand the capabilities of OpenHands.
+
+#### Adding a new runtime
+The agent needs a place to run code and commands. When you run OpenHands on your laptop, it uses a Docker container
+to do this by default. But there are other ways of creating a sandbox for the agent.
+
+If you work for a company that provides a cloud-based runtime, you could help us add support for that runtime
+by implementing the [interface specified here](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/runtime.py).
+
+#### Testing
 When you write code, it is also good to write tests. Please navigate to the `tests` folder to see existing test suites.
 At the moment, we have two kinds of tests: `unit` and `integration`. Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.

 ## Sending Pull Requests to OpenHands

-### 1. Fork the Official Repository
-Fork the [OpenHands repository](https://github.com/All-Hands-AI/OpenHands) into your own account.
-Clone your own forked repository into your local environment:
+You'll need to fork our repository to send us a Pull Request. You can learn more
+about how to fork a GitHub repo and open a PR with your changes in [this article](https://medium.com/swlh/forks-and-pull-requests-how-to-contribute-to-github-repos-8843fac34ce8).

-```shell
-git clone git@github.com:<YOUR-USERNAME>/OpenHands.git
-```
-
-### 2. Configure Git
-
-Set the official repository as your [upstream](https://www.atlassian.com/git/tutorials/git-forks-and-upstreams) to synchronize with the latest update in the official repository.
-Add the original repository as upstream:
-
-```shell
-cd OpenHands
-git remote add upstream git@github.com:All-Hands-AI/OpenHands.git
-```
-
-Verify that the remote is set:
-
-```shell
-git remote -v
-```
-
-You should see both `origin` and `upstream` in the output.
-
-### 3. Synchronize with Official Repository
-Synchronize latest commit with official repository before coding:
-
-```shell
-git fetch upstream
-git checkout main
-git merge upstream/main
-git push origin main
-```
-
-### 4. Set up the Development Environment
-
-We have a separate doc [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) that tells you how to set up a development workflow.
-
-### 5. Write Code and Commit It
-
-Once you have done this, you can write code, test it, and commit it to a branch (replace `my_branch` with an appropriate name):
-
-```shell
-git checkout -b my_branch
-git add .
-git commit
-git push origin my_branch
-```
-
-### 6. Open a Pull Request
-
-* On GitHub, go to the page of your forked repository, and create a Pull Request:
-   - Click on `Branches`
-   - Click on the `...` beside your branch and click on `New pull request`
-   - Set `base repository` to `All-Hands-AI/OpenHands`
-   - Set `base` to `main`
-   - Click `Create pull request`
-
-The PR should appear in [OpenHands PRs](https://github.com/All-Hands-AI/OpenHands/pulls).
-
-Then the OpenHands team will review your code.
-
-## PR Rules
-
-### 1. Pull Request title
+### Pull Request title
 As described [here](https://github.com/commitizen/conventional-commit-types/blob/master/index.json), a valid PR title should begin with one of the following prefixes:

 - `feat`: A new feature
@@ -111,6 +86,9 @@ For example, a PR title could be:

 You may also check out previous PRs in the [PR list](https://github.com/All-Hands-AI/OpenHands/pulls).

-### 2. Pull Request description
+### Pull Request description
 - If your PR is small (such as a typo fix), you can go brief.
 - If it contains a lot of changes, it's better to write more details.
+
+If your changes are user-facing (e.g. a new feature in the UI, a change in behavior, or a bugfix),
+please include a short message that we can add to our changelog.
@@ -2,7 +2,7 @@

 ## Contributors

-We would like to thank all the [contributors](https://github.com/All-Hands-AI/OpenHands/graphs/contributors) who have helped make OpenHands possible. Your dedication and hard work are greatly appreciated.
+We would like to thank all the [contributors](https://github.com/All-Hands-AI/OpenHands/graphs/contributors) who have helped make OpenHands possible. We greatly appreciate your dedication and hard work.

 ## Open Source Projects

@@ -10,7 +10,7 @@ OpenHands includes and adapts the following open source projects. We are gratefu

 #### [SWE Agent](https://github.com/princeton-nlp/swe-agent)
   - License: MIT License
-   - Description: Adapted for use in OpenHands's agenthub
+   - Description: Adapted for use in OpenHands's agent hub

 #### [Aider](https://github.com/paul-gauthier/aider)
   - License: Apache License 2.0
@@ -5,12 +5,14 @@ Otherwise, you can clone the OpenHands project directly.

 ## Start the server for development
 ### 1. Requirements
-* Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install)  [ Ubuntu <= 22.04]
+* Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install)  [Ubuntu <= 22.04]
 * [Docker](https://docs.docker.com/engine/install/) (For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
-* [Python](https://www.python.org/downloads/) = 3.11
+* [Python](https://www.python.org/downloads/) = 3.12
 * [NodeJS](https://nodejs.org/en/download/package-manager) >= 18.17.1
 * [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8
-* netcat => sudo apt-get install netcat
+* OS-specific dependencies:
+  - Ubuntu: build-essential => `sudo apt-get install build-essential`
+  - WSL: netcat => `sudo apt-get install netcat`

 Make sure you have all these dependencies installed before moving on to `make build`.

@@ -22,8 +24,8 @@ If you want to develop without system admin/sudo access to upgrade/install `Pyth
 curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
 bash Miniforge3-$(uname)-$(uname -m).sh

-# Install Python 3.11, nodejs, and poetry
-mamba install python=3.11
+# Install Python 3.12, nodejs, and poetry
+mamba install python=3.12
 mamba install conda-forge::nodejs
 mamba install conda-forge::poetry
 ```
@@ -91,9 +93,36 @@ To run tests, refer to the following:
 poetry run pytest ./tests/unit/test_*.py
 ```

-#### Integration tests
-Please refer to [this README](./tests/integration/README.md) for details.
-
 ### 9. Add or update dependency
 1. Add your dependency in `pyproject.toml` or use `poetry add xxx`
 2. Update the poetry.lock file via `poetry lock --no-update`
+
+### 10. Use existing Docker image
+To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image. Follow these steps:
+1. Set the `SANDBOX_RUNTIME_CONTAINER_IMAGE` environment variable to the desired Docker image.
+2. Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.9-nikolaik`
+
+## Develop inside Docker container
+
+TL;DR
+
+```bash
+make docker-dev
+```
+
+See more details [here](./containers/dev/README.md)
+
+If you are just interested in running `OpenHands` without installing all the required tools on your host, run:
+
+```bash
+make docker-run
+```
+
+If you do not have `make` on your host, run:
+
+```bash
+cd ./containers/dev
+./dev.sh
+```
+
+You do need [Docker](https://docs.docker.com/engine/install/) installed on your host though.
@@ -2,14 +2,15 @@ SHELL=/bin/bash
 # Makefile for OpenHands project

 # Variables
+BACKEND_HOST ?= "127.0.0.1"
 BACKEND_PORT = 3000
-BACKEND_HOST = "127.0.0.1:$(BACKEND_PORT)"
+BACKEND_HOST_PORT = "$(BACKEND_HOST):$(BACKEND_PORT)"
 FRONTEND_PORT = 3001
 DEFAULT_WORKSPACE_DIR = "./workspace"
 DEFAULT_MODEL = "gpt-4o"
 CONFIG_FILE = config.toml
 PRE_COMMIT_CONFIG_PATH = "./dev_config/python/.pre-commit-config.yaml"
-PYTHON_VERSION = 3.11
+PYTHON_VERSION = 3.12

 # ANSI color codes
 GREEN=$(shell tput -Txterm setaf 2)
@@ -189,12 +190,12 @@ build-frontend:
 # Start backend
 start-backend:
 	@echo "$(YELLOW)Starting backend...$(RESET)"
-	@poetry run uvicorn openhands.server.listen:app --port $(BACKEND_PORT) --reload --reload-exclude "workspace/*"
+	@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) --reload --reload-exclude "$(shell pwd)/workspace"

 # Start frontend
 start-frontend:
 	@echo "$(YELLOW)Starting frontend...$(RESET)"
-	@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run start
+	@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST_PORT) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run dev -- --port $(FRONTEND_PORT) --host $(BACKEND_HOST)

 # Common setup for running the app (non-callable)
 _run_setup:
@@ -204,7 +205,7 @@ _run_setup:
 	fi
 	@mkdir -p logs
 	@echo "$(YELLOW)Starting backend server...$(RESET)"
-	@poetry run uvicorn openhands.server.listen:app --port $(BACKEND_PORT) &
+	@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) &
 	@echo "$(YELLOW)Waiting for the backend to start...$(RESET)"
 	@until nc -z localhost $(BACKEND_PORT); do sleep 0.1; done
 	@echo "$(GREEN)Backend started successfully.$(RESET)"
@@ -213,9 +214,23 @@ _run_setup:
 run:
 	@echo "$(YELLOW)Running the app...$(RESET)"
 	@$(MAKE) -s _run_setup
-	@cd frontend && echo "$(BLUE)Starting frontend with npm...$(RESET)" && npm run start -- --port $(FRONTEND_PORT)
+	@$(MAKE) -s start-frontend
 	@echo "$(GREEN)Application started successfully.$(RESET)"

+# Run the app in Docker via "docker compose up" (extra arguments come from OPTIONS); does nothing when already inside a container
+docker-run: WORKSPACE_BASE ?= $(PWD)/workspace
+docker-run:
+	@if [ -f /.dockerenv ]; then \
+		echo "Running inside a Docker container. Exiting..."; \
+		exit 0; \
+	else \
+		echo "$(YELLOW)Running the app in Docker $(OPTIONS)...$(RESET)"; \
+		export WORKSPACE_BASE=${WORKSPACE_BASE}; \
+		export SANDBOX_USER_ID=$(shell id -u); \
+		export DATE=$(shell date +%Y%m%d%H%M%S); \
+		docker compose up $(OPTIONS); \
+	fi
+
 # Run the app (WSL mode)
 run-wsl:
 	@echo "$(YELLOW)Running the app in WSL mode...$(RESET)"
@@ -260,6 +275,10 @@ setup-config-prompts:
 		echo "    - nomic-embed-text"; \
 		echo "    - all-minilm"; \
 		echo "    - stable-code"; \
+		echo "    - bge-m3"; \
+		echo "    - bge-large"; \
+		echo "    - paraphrase-multilingual"; \
+		echo "    - snowflake-arctic-embed"; \
 		echo "  - Leave blank to default to 'BAAI/bge-small-en-v1.5' via huggingface"; \
 		read -p "> " llm_embedding_model; \
 		echo "embedding_model=\"$$llm_embedding_model\"" >> $(CONFIG_FILE).tmp; \
@@ -276,6 +295,16 @@ setup-config-prompts:
 		fi


+# Develop in a container: runs ./containers/dev/dev.sh with OPTIONS; does nothing when already inside a container
+docker-dev:
+	@if [ -f /.dockerenv ]; then \
+		echo "Running inside a Docker container. Exiting..."; \
+		exit 0; \
+	else \
+		echo "$(YELLOW)Build and run in Docker $(OPTIONS)...$(RESET)"; \
+		./containers/dev/dev.sh $(OPTIONS); \
+	fi
+
 # Clean up all caches
 clean:
 	@echo "$(YELLOW)Cleaning up caches...$(RESET)"
@@ -294,7 +323,10 @@ help:
 	@echo "  $(GREEN)start-frontend$(RESET)      - Start the frontend server for the OpenHands project."
 	@echo "  $(GREEN)run$(RESET)                 - Run the OpenHands application, starting both backend and frontend servers."
 	@echo "                        Backend Log file will be stored in the 'logs' directory."
+	@echo "  $(GREEN)docker-dev$(RESET)          - Build and run the OpenHands application in Docker."
+	@echo "  $(GREEN)docker-run$(RESET)          - Run the OpenHands application, starting both backend and frontend servers in Docker."
 	@echo "  $(GREEN)help$(RESET)                - Display this help message, providing information on available targets."

 # Phony targets
 .PHONY: build check-dependencies check-python check-npm check-docker check-poetry install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint start-backend start-frontend run run-wsl setup-config setup-config-prompts help
+.PHONY: docker-dev docker-run
@@ -1,65 +1,51 @@
 <a name="readme-top"></a>

-<!--
-*** Thanks for checking out the Best-README-Template. If you have a suggestion
-*** that would make this better, please fork the repo and create a pull request
-*** or simply open an issue with the tag "enhancement".
-*** Don't forget to give the project a star!
-*** Thanks again! Now go create something AMAZING! :D
-->
+<div align="center">
+  <img src="./docs/static/img/logo.png" alt="Logo" width="200">
+  <h1 align="center">OpenHands: Code Less, Make More</h1>
+</div>

-<!-- PROJECT SHIELDS -->
-<!--
-*** I'm using markdown "reference style" links for readability.
-*** Reference links are enclosed in brackets [ ] instead of parentheses ( ).
-*** See the bottom of this document for the declaration of the reference variables
-*** for contributors-url, forks-url, etc. This is an optional, concise syntax you may use.
-*** https://www.markdownguide.org/basic-syntax/#reference-style-links
-->

 <div align="center">
  <a href="https://github.com/All-Hands-AI/OpenHands/graphs/contributors"><img src="https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Contributors"></a>
-  <a href="https://github.com/All-Hands-AI/OpenHands/network/members"><img src="https://img.shields.io/github/forks/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Forks"></a>
  <a href="https://github.com/All-Hands-AI/OpenHands/stargazers"><img src="https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Stargazers"></a>
-  <a href="https://github.com/All-Hands-AI/OpenHands/issues"><img src="https://img.shields.io/github/issues/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Issues"></a>
+  <a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue"></a>
  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License"></a>
-  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=blue" alt="Credits"></a>
  <br/>
  <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
  <a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
-  <a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge"></a>
-</div>
-
-<!-- PROJECT LOGO -->
-<div align="center">
-  <img src="./docs/static/img/logo.png" alt="Logo" width="200" height="200">
-  <h1 align="center">OpenHands: Code Less, Make More</h1>
-  <a href="https://docs.all-hands.dev/modules/usage/intro"><img src="https://img.shields.io/badge/Documentation-OpenHands-blue?logo=googledocs&logoColor=white&style=for-the-badge" alt="Check out the documentation"></a>
-  <a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv"></a>
+  <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits"></a>
  <br/>
-  <a href="https://huggingface.co/spaces/OpenHands/evaluation"><img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark"></a>
+  <a href="https://docs.all-hands.dev/modules/usage/getting-started"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation"></a>
+  <a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper%20on%20Arxiv-000?logoColor=FFE165&logo=arxiv&style=for-the-badge" alt="Paper on Arxiv"></a>
+  <a href="https://huggingface.co/spaces/OpenHands/evaluation"><img src="https://img.shields.io/badge/Benchmark%20score-000?logoColor=FFE165&logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark Score"></a>
+  <hr>
 </div>
-<hr>

-Welcome to OpenHands, a platform for autonomous software engineers, powered by AI and LLMs (previously called "OpenDevin").
+Welcome to OpenHands (formerly OpenDevin), a platform for software development agents powered by AI.

-OpenHands agents collaborate with human developers to write code, fix bugs, and ship features.
+OpenHands agents can do anything a human developer can: modify code, run commands, browse the web,
+call APIs, and yes—even copy code snippets from StackOverflow.
+
+Learn more at [docs.all-hands.dev](https://docs.all-hands.dev), or jump to the [Quick Start](#-quick-start).

 ![App screenshot](./docs/static/img/screenshot.png)

-## ⚡ Getting Started
-OpenHands works best with Docker version 26.0.0+ (Docker Desktop 4.31.0+).
-You must be using Linux, Mac OS, or WSL on Windows.
+## ⚡ Quick Start

-To start OpenHands in a docker container, run the following commands in your terminal:
+The easiest way to run OpenHands is in Docker. You can change `WORKSPACE_BASE` below to
+point OpenHands to existing code that you'd like to modify.

-> [!WARNING]
-> When you run the following command, files in `./workspace` may be modified or deleted.
+See the [Installation](https://docs.all-hands.dev/modules/usage/installation) guide for
+system requirements and more information.

 ```bash
-WORKSPACE_BASE=$(pwd)/workspace
-docker run -it \
-    --pull=always \
+export WORKSPACE_BASE=$(pwd)/workspace
+
+docker pull ghcr.io/all-hands-ai/runtime:0.11-nikolaik
+
+docker run -it --pull=always \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.11-nikolaik \
    -e SANDBOX_USER_ID=$(id -u) \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
@@ -67,32 +53,30 @@ docker run -it \
    -p 3000:3000 \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.9
+    ghcr.io/all-hands-ai/openhands:0.11
 ```

-> [!NOTE]
-> This command pulls the `0.9` tag, which represents the most recent stable release of OpenHands. You have other options as well:
-> - For a specific release version, use `ghcr.io/all-hands-ai/openhands:<OpenHands_version>` (replace <OpenHands_version> with the desired version number).
-> - For the most up-to-date development version, use `ghcr.io/all-hands-ai/openhands:main`. This version may be **(unstable!)** and is recommended for testing or development purposes only.
->
-> Choose the tag that best suits your needs based on stability requirements and desired features.
+You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!

-You'll find OpenHands running at [http://localhost:3000](http://localhost:3000) with access to `./workspace`. To have OpenHands operate on your code, place it in `./workspace`.
-OpenHands will only have access to this workspace folder. The rest of your system will not be affected as it runs in a secured docker sandbox.
+You'll need a model provider and API key. One option that works well: [Claude 3.5 Sonnet](https://www.anthropic.com/api), but you have [many options](https://docs.all-hands.dev/modules/usage/llms).

-Upon opening OpenHands, you must select the appropriate `Model` and enter the `API Key` within the settings that should pop up automatically. These can be set at any time by selecting
-the `Settings` button (gear icon) in the UI. If the required `Model` does not exist in the list, you can manually enter it in the text box.
+---

-For the development workflow, see [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
+You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
+or as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode).

-Are you having trouble? Check out our [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting).
+Visit [Installation](https://docs.all-hands.dev/modules/usage/installation) for more information and setup instructions.

-## 🚀 Documentation
+If you want to modify the OpenHands source code, check out [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
+
+Having issues? The [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting) can help.
+
+## 📖 Documentation

 To learn more about the project, and for tips on using OpenHands,
-**check out our [documentation](https://docs.all-hands.dev/modules/usage/intro)**.
+**check out our [documentation](https://docs.all-hands.dev/modules/usage/getting-started)**.

-There you'll find resources on how to use different LLM providers (like ollama and Anthropic's Claude),
+There you'll find resources on how to use different LLM providers,
 troubleshooting resources, and advanced configuration options.

 ## 🤝 How to Contribute
@@ -127,17 +111,6 @@ Let's make software engineering better together!

 Distributed under the MIT License. See [`LICENSE`](./LICENSE) for more information.

-[contributors-shield]: https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge
-[contributors-url]: https://github.com/All-Hands-AI/OpenHands/graphs/contributors
-[forks-shield]: https://img.shields.io/github/forks/All-Hands-AI/OpenHands?style=for-the-badge
-[forks-url]: https://github.com/All-Hands-AI/OpenHands/network/members
-[stars-shield]: https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge
-[stars-url]: https://github.com/All-Hands-AI/OpenHands/stargazers
-[issues-shield]: https://img.shields.io/github/issues/All-Hands-AI/OpenHands?style=for-the-badge
-[issues-url]: https://github.com/All-Hands-AI/OpenHands/issues
-[license-shield]: https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge
-[license-url]: https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE
-
 ## 🙏 Acknowledgements

 OpenHands is built by a large number of contributors, and every contribution is greatly appreciated! We also build upon other open source projects, and we are deeply thankful for their work.
@@ -147,8 +120,8 @@ For a list of open source projects and licenses used in OpenHands, please see ou
 ## 📚 Cite

 ```
-@misc{opendevin,
-      title={{OpenDevin: An Open Platform for AI Software Developers as Generalist Agents}},
+@misc{openhands,
+      title={{OpenHands: An Open Platform for AI Software Developers as Generalist Agents}},
      author={Xingyao Wang and Boxuan Li and Yufan Song and Frank F. Xu and Xiangru Tang and Mingchen Zhuge and Jiayi Pan and Yueqi Song and Bowen Li and Jaskirat Singh and Hoang H. Tran and Fuqiang Li and Ren Ma and Mingzhang Zheng and Bill Qian and Yanjun Shao and Niklas Muennighoff and Yizhe Zhang and Binyuan Hui and Junyang Lin and Robert Brennan and Hao Peng and Heng Ji and Graham Neubig},
      year={2024},
      eprint={2407.16741},
@@ -1,88 +0,0 @@
-import ast
-
-from openhands.controller.action_parser import ActionParser, ResponseParser
-from openhands.core.logger import openhands_logger as logger
-from openhands.events.action import (
-    Action,
-    BrowseInteractiveAction,
-)
-
-
-class BrowsingResponseParser(ResponseParser):
-    def __init__(self):
-        # Need to pay attention to the item order in self.action_parsers
-        super().__init__()
-        self.action_parsers = [BrowsingActionParserMessage()]
-        self.default_parser = BrowsingActionParserBrowseInteractive()
-
-    def parse(self, response: str) -> Action:
-        action_str = self.parse_response(response)
-        return self.parse_action(action_str)
-
-    def parse_response(self, response) -> str:
-        action_str = response['choices'][0]['message']['content']
-        if action_str is None:
-            return ''
-        action_str = action_str.strip()
-        if not action_str.endswith('```'):
-            action_str = action_str + ')```'
-        logger.info(action_str)
-        return action_str
-
-    def parse_action(self, action_str: str) -> Action:
-        for action_parser in self.action_parsers:
-            if action_parser.check_condition(action_str):
-                return action_parser.parse(action_str)
-        return self.default_parser.parse(action_str)
-
-
-class BrowsingActionParserMessage(ActionParser):
-    """Parser action:
-    - BrowseInteractiveAction(browser_actions) - unexpected response format, message back to user
-    """
-
-    def __init__(
-        self,
-    ):
-        pass
-
-    def check_condition(self, action_str: str) -> bool:
-        return '```' not in action_str
-
-    def parse(self, action_str: str) -> Action:
-        msg = f'send_msg_to_user("""{action_str}""")'
-        return BrowseInteractiveAction(
-            browser_actions=msg,
-            thought=action_str,
-            browsergym_send_msg_to_user=action_str,
-        )
-
-
-class BrowsingActionParserBrowseInteractive(ActionParser):
-    """Parser action:
-    - BrowseInteractiveAction(browser_actions) - handle send message to user function call in BrowserGym
-    """
-
-    def __init__(
-        self,
-    ):
-        pass
-
-    def check_condition(self, action_str: str) -> bool:
-        return True
-
-    def parse(self, action_str: str) -> Action:
-        thought = action_str.split('```')[0].strip()
-        action_str = action_str.split('```')[1].strip()
-        msg_content = ''
-        for sub_action in action_str.split('\n'):
-            if 'send_msg_to_user(' in sub_action:
-                tree = ast.parse(sub_action)
-                args = tree.body[0].value.args  # type: ignore
-                msg_content = args[0].value
-
-        return BrowseInteractiveAction(
-            browser_actions=action_str,
-            thought=thought,
-            browsergym_send_msg_to_user=msg_content,
-        )
@@ -1,41 +0,0 @@
-{% set MINIMAL_SYSTEM_PREFIX %}
-A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
-The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_ipython>.
-<execute_ipython>
-print("Hello World!")
-</execute_ipython>
-The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
-
-For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
-Important, however: do not run interactive commands. You do not have access to stdin.
-Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
-For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
-Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
-{% endset %}
-{% set BROWSING_PREFIX %}
-The assistant can browse the Internet with <execute_browse> and </execute_browse>.
-For example, <execute_browse> Tell me the usa's president using google search </execute_browse>.
-Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
-{% endset %}
-{% set PIP_INSTALL_PREFIX %}
-The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
-{% endset %}
-{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX %}
-{% set COMMAND_DOCS %}
-Apart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:
-{{ agent_skills_docs }}
-Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
-{% endset %}
-{% set SYSTEM_SUFFIX %}
-Responses should be concise.
-The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
-Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
-If the assistant is finished with the task you MUST include <finish></finish> in your response.
-IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_browse> whenever possible.
-The assistant should utilize full file paths and the 'pwd' command to prevent path-related errors. The assistant should refrain from excessive apologies in its responses.
-
-{% endset %}
-{# Combine all parts without newlines between them #}
-{{ SYSTEM_PREFIX -}}
-{{- COMMAND_DOCS -}}
-{{- SYSTEM_SUFFIX }}
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -e
+
+cp pyproject.toml poetry.lock openhands
+poetry build -v
@@ -0,0 +1,22 @@
+# Docker Compose service that builds and runs the OpenHands app from this checkout.
+services:
+  openhands:
+    build:
+      context: ./
+      dockerfile: ./containers/app/Dockerfile
+    image: openhands:latest
+    container_name: openhands-app-${DATE:-}
+    environment:
+      # Default runtime tag kept in step with the README quick start (0.11).
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.11-nikolaik}
+      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
+      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
+    ports:
+      - "3000:3000"
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - ${WORKSPACE_BASE:-$PWD/workspace}:/opt/workspace_base
+    pull_policy: build  # build from the Dockerfile above instead of pulling
+    stdin_open: true
@@ -13,6 +13,10 @@
 # API key for E2B
 #e2b_api_key = ""

+# API token ID and secret for Modal
+#modal_api_token_id = ""
+#modal_api_token_secret = ""
+
 # Base path for the workspace
 workspace_base = "./workspace"

@@ -28,6 +32,9 @@ workspace_base = "./workspace"
 # Enable saving and restoring the session when run from CLI
 #enable_cli_session = false

+# Path to store trajectories
+#trajectories_path = "./trajectories"
+
 # File store path
 #file_store_path = "/tmp/file_store"

@@ -64,6 +71,15 @@ workspace_base = "./workspace"
 # Name of the default agent
 #default_agent = "CodeActAgent"

+# JWT secret for authentication
+#jwt_secret = ""
+
+# Restrict file types for file uploads
+#file_uploads_restrict_file_types = false
+
+# List of allowed file extensions for uploads
+#file_uploads_allowed_extensions = [".*"]
+
 #################################### LLM #####################################
 # Configuration for LLM models (group name starts with 'llm')
 # use 'llm' for the default LLM config
@@ -103,7 +119,7 @@ api_key = "your-api-key"
 #embedding_deployment_name = ""

 # Embedding model to use
-embedding_model = ""
+embedding_model = "local"

 # Maximum number of characters in an observation's content
 #max_message_chars = 10000
@@ -117,14 +133,31 @@ embedding_model = ""
 # Model to use
 model = "gpt-4o"

-# Number of retries to attempt
-#num_retries = 5
+# Number of retries to attempt when an operation fails with the LLM.
+# Increase this value to allow more attempts before giving up
+#num_retries = 8

-# Retry maximum wait time
-#retry_max_wait = 60
+# Maximum wait time (in seconds) between retry attempts
+# This caps the exponential backoff to prevent excessively long waits
+#retry_max_wait = 120

-# Retry minimum wait time
-#retry_min_wait = 3
+# Minimum wait time (in seconds) between retry attempts
+# This sets the initial delay before the first retry
+#retry_min_wait = 15
+
+# Multiplier for exponential backoff calculation
+# The wait time increases by this factor after each failed attempt
+# A value of 2.0 means each retry waits twice as long as the previous one
+#retry_multiplier = 2.0
+
+# Drop any unmapped (unsupported) params without causing an exception
+#drop_params = false
+
+# Use the prompt caching feature if it is provided by the LLM and supported
+#caching_prompt = true
+
+# Base URL for the OLLAMA API
+#ollama_base_url = ""

 # Temperature for the API
 #temperature = 0.0
@@ -133,14 +166,15 @@ model = "gpt-4o"
 #timeout = 0

 # Top p for the API
-#top_p = 0.5
+#top_p = 1.0

-[llm.gpt3]
-# API key to use
+# If the model is vision-capable, this option disables image processing (useful for cost reduction).
+#disable_vision = true
+
+[llm.gpt4o-mini]  # example of an additional named LLM config group
 api_key = "your-api-key"
+model = "gpt-4o-mini"  # model matches the group name (was "gpt-4o", same as the default [llm])

-# Model to use
-model = "gpt-3.5"

 #################################### Agent ###################################
 # Configuration for agents (group name starts with 'agent')
@@ -149,14 +183,17 @@ model = "gpt-3.5"
 # agent.CodeActAgent
 ##############################################################################
 [agent]
+# Name of the micro agent to use for this agent
+#micro_agent_name = ""
+
 # Memory enabled
 #memory_enabled = false

 # Memory maximum threads
-#memory_max_threads = 2
+#memory_max_threads = 3

 # LLM config group to use
-#llm_config = 'llm'
+#llm_config = 'your-llm-config-group'

 [agent.RepoExplorerAgent]
 # Example: use a cheaper model for RepoExplorerAgent to reduce cost, especially
@@ -174,7 +211,7 @@ llm_config = 'gpt3'
 #user_id = 1000

 # Container image to use for the sandbox
-#base_container_image = "nikolaik/python-nodejs:python3.11-nodejs22"
+#base_container_image = "nikolaik/python-nodejs:python3.12-nodejs22"

 # Use host network
 #use_host_network = false
@@ -182,13 +219,25 @@ llm_config = 'gpt3'
 # Enable auto linting after editing
 #enable_auto_lint = false

+# Whether to initialize plugins
+#initialize_plugins = true
+
+# Extra dependencies to install in the runtime image
+#runtime_extra_deps = ""
+
+# Environment variables to set at the launch of the runtime
+#runtime_startup_env_vars = {}
+
+# BrowserGym environment to use for evaluation
+#browsergym_eval_env = ""
+
 #################################### Security ###################################
 # Configuration for security features
 ##############################################################################
 [security]

 # Enable confirmation mode
-#confirmation_mode = true
+#confirmation_mode = false

 # The security analyzer to use
 #security_analyzer = ""
@@ -8,7 +8,7 @@ RUN npm install -g npm@10.5.1
 RUN npm ci

 COPY ./frontend ./
-RUN npm run make-i18n && npm run build
+RUN npm run build

 FROM python:3.12.3-slim AS backend-builder

@@ -28,7 +28,7 @@ COPY ./pyproject.toml ./poetry.lock ./
 RUN touch README.md
 RUN export POETRY_CACHE_DIR && poetry install --without evaluation,llama-index --no-root && rm -rf $POETRY_CACHE_DIR

-FROM python:3.12.3-slim AS runtime
+FROM python:3.12.3-slim AS openhands-app

 WORKDIR /app

@@ -37,7 +37,7 @@ ARG OPENHANDS_BUILD_VERSION #re-declare for this section
 ENV RUN_AS_OPENHANDS=true
 # A random number--we need this to be different from the user's UID on the host machine
 ENV OPENHANDS_USER_ID=42420
-ENV SANDBOX_API_HOSTNAME=host.docker.internal
+ENV SANDBOX_LOCAL_RUNTIME_URL=http://host.docker.internal
 ENV USE_HOST_NETWORK=false
 ENV WORKSPACE_BASE=/opt/workspace_base
 ENV OPENHANDS_BUILD_VERSION=$OPENHANDS_BUILD_VERSION
@@ -69,11 +69,12 @@ RUN playwright install --with-deps chromium

 COPY --chown=openhands:app --chmod=770 ./openhands ./openhands
 COPY --chown=openhands:app --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
-COPY --chown=openhands:app --chmod=770 ./agenthub ./agenthub
-COPY --chown=openhands:app --chmod=770 ./pyproject.toml ./pyproject.toml
-COPY --chown=openhands:app --chmod=770 ./poetry.lock ./poetry.lock
-COPY --chown=openhands:app --chmod=770 ./README.md ./README.md
-COPY --chown=openhands:app --chmod=770 ./MANIFEST.in ./MANIFEST.in
+COPY --chown=openhands:app --chmod=770 ./openhands/agenthub ./openhands/agenthub
+COPY --chown=openhands:app ./pyproject.toml ./pyproject.toml
+COPY --chown=openhands:app ./poetry.lock ./poetry.lock
+COPY --chown=openhands:app ./README.md ./README.md
+COPY --chown=openhands:app ./MANIFEST.in ./MANIFEST.in
+COPY --chown=openhands:app ./LICENSE ./LICENSE

 # This is run as "openhands" user, and will create __pycache__ with openhands:openhands ownership
 RUN python openhands/core/download.py # No-op to download assets
@@ -81,7 +82,7 @@ RUN python openhands/core/download.py # No-op to download assets
 # openhands:openhands -> openhands:app
 RUN find /app \! -group app -exec chgrp app {} +

-COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/dist ./frontend/dist
+COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/build ./frontend/build
 COPY --chown=openhands:app --chmod=770 ./containers/app/entrypoint.sh /app/entrypoint.sh

 USER root
@@ -1,13 +1,40 @@
 #!/bin/bash
 set -eo pipefail

-image_name=$1
-org_name=$2
+# Initialize variables with default values
+image_name=""
+org_name=""
 push=0
-if [[ $3 == "--push" ]]; then
-  push=1
+load=0
+tag_suffix=""
+
+# Function to display usage information
+usage() {
+    echo "Usage: $0 -i <image_name> [-o <org_name>] [--push] [--load] [-t <tag_suffix>]"
+    echo "  -i: Image name (required)"
+    echo "  -o: Organization name"
+    echo "  --push: Push the image"
+    echo "  --load: Load the image"
+    echo "  -t: Tag suffix"
+    exit 1
+}
+
+# Parse command-line options
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -i) image_name="$2"; shift 2 ;;
+        -o) org_name="$2"; shift 2 ;;
+        --push) push=1; shift ;;
+        --load) load=1; shift ;;
+        -t) tag_suffix="$2"; shift 2 ;;
+        *) usage ;;
+    esac
+done
+# Check if required arguments are provided
+if [[ -z "$image_name" ]]; then
+    echo "Error: Image name is required."
+    usage
 fi
-tag_suffix=$4

 echo "Building: $image_name"
 tags=()
@@ -17,9 +44,10 @@ OPENHANDS_BUILD_VERSION="dev"
 cache_tag_base="buildcache"
 cache_tag="$cache_tag_base"

-if [[ -n $GITHUB_SHA ]]; then
-  git_hash=$(git rev-parse --short "$GITHUB_SHA")
+if [[ -n $RELEVANT_SHA ]]; then
+  git_hash=$(git rev-parse --short "$RELEVANT_SHA")
  tags+=("$git_hash")
+  tags+=("$RELEVANT_SHA")
 fi

 if [[ -n $GITHUB_REF_NAME ]]; then
@@ -94,14 +122,35 @@ if [[ $push -eq 1 ]]; then
  args+=" --cache-to=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag,mode=max"
 fi

+if [[ $load -eq 1 ]]; then
+  args+=" --load"
+fi
+
 echo "Args: $args"

+# Modify the platform selection based on --load flag
+if [[ $load -eq 1 ]]; then
+  # When loading, build only for the current platform
+  platform=$(docker version -f '{{.Server.Os}}/{{.Server.Arch}}')
+else
+  # For push or without load, build for multiple platforms
+  platform="linux/amd64,linux/arm64"
+fi
+
+echo "Building for platform(s): $platform"
+
 docker buildx build \
  $args \
  --build-arg OPENHANDS_BUILD_VERSION="$OPENHANDS_BUILD_VERSION" \
  --cache-from=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag \
  --cache-from=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag_base-main \
-  --platform linux/amd64,linux/arm64 \
+  --platform $platform \
  --provenance=false \
  -f "$dir/Dockerfile" \
  "$DOCKER_BASE_DIR"
+
+# If load was requested, print the loaded images
+if [[ $load -eq 1 ]]; then
+  echo "Local images built:"
+  docker images "$DOCKER_REPOSITORY" --format "{{.Repository}}:{{.Tag}}"
+fi
@@ -0,0 +1,124 @@
+# syntax=docker/dockerfile:1
+
+###
+FROM ubuntu:22.04 AS dind
+
+# https://docs.docker.com/engine/install/ubuntu/
+RUN apt-get update && apt-get install -y \
+	ca-certificates \
+	curl \
+	&& install -m 0755 -d /etc/apt/keyrings \
+	&& curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \
+	&& chmod a+r /etc/apt/keyrings/docker.asc \
+	&& echo \
+		"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+		$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
+
+RUN apt-get update && apt-get install -y \
+	docker-ce \
+	docker-ce-cli \
+	containerd.io \
+	docker-buildx-plugin \
+	docker-compose-plugin \
+	&& rm -rf /var/lib/apt/lists/* \
+	&& apt-get clean \
+	&& apt-get autoremove -y
+
+###
+FROM dind AS openhands
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Base tooling: shell, compiler toolchain, git, make and network utilities
+RUN apt-get update && apt-get install -y \
+	bash \
+    build-essential \
+    curl \
+	git \
+	git-lfs \
+	software-properties-common \
+	make \
+    netcat \
+    sudo \
+	wget \
+	&& rm -rf /var/lib/apt/lists/* \
+	&& apt-get clean \
+	&& apt-get autoremove -y
+
+# https://github.com/cli/cli/blob/trunk/docs/install_linux.md
+RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg \
+	&& chmod go+r /usr/share/keyrings/githubcli-archive-keyring.gpg \
+	&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
+	&& apt-get update && apt-get -y install \
+    gh \
+  && rm -rf /var/lib/apt/lists/* \
+  && apt-get clean \
+  && apt-get autoremove -y
+
+# Python 3.12
+RUN add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update \
+    && apt-get install -y python3.12 python3.12-venv python3.12-dev python3-pip \
+    && ln -s /usr/bin/python3.12 /usr/bin/python
+
+# NodeJS >= 18.17.1
+RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
+    && apt-get install -y nodejs
+
+# Poetry >= 1.8
+RUN curl -fsSL https://install.python-poetry.org | python3.12 - \
+    && ln -s ~/.local/bin/poetry /usr/local/bin/poetry
+
+# Write /version.sh, a helper script that prints the versions of the installed tools
+RUN <<EOF
+#!/bin/bash
+printf "#!/bin/bash
+set +x
+uname -a
+docker --version
+gh --version | head -n 1
+git --version
+#
+python --version
+echo node `node --version`
+echo npm `npm --version`
+poetry --version
+netcat -h 2>&1 | head -n 1
+" > /version.sh
+chmod a+x /version.sh
+EOF
+
+###
+FROM openhands AS dev
+
+RUN apt-get update && apt-get install -y \
+	dnsutils \
+	file \
+	iproute2 \
+	jq \
+	lsof \
+	ripgrep \
+	silversearcher-ag \
+	vim \
+	&& rm -rf /var/lib/apt/lists/* \
+	&& apt-get clean \
+	&& apt-get autoremove -y
+
+WORKDIR /app
+
+# cache build dependencies
+RUN \
+  --mount=type=bind,source=./,target=/app/ \
+  <<EOF
+#!/bin/bash
+make -s clean
+make -s check-dependencies
+make -s install-python-dependencies
+
+# NOTE
+# node_modules are .dockerignore-d therefore not mountable
+# make -s install-frontend-dependencies
+EOF
+
+# Default command: start a bash shell
+CMD ["bash"]
@@ -0,0 +1,54 @@
+# Develop in Docker
+
+Install [Docker](https://docs.docker.com/engine/install/) on your host machine and run:
+
+```bash
+make docker-dev
+# same as:
+cd ./containers/dev
+./dev.sh
+```
+
+The first run can take some time, because Docker has to pull all the tools required for building OpenHands. Subsequent runs should start almost instantly.
+
+## Build and run
+
+If everything goes well, Docker finishes building the `openhands:dev` image and drops you into a shell inside the container, similar to the following:
+
+```bash
+Build and run in Docker ...
+root@93fc0005fcd2:/app#
+```
+
+You may now proceed with the normal [build and run](../../Development.md) workflow as if you were on the host.
+
+## Make changes
+
+The source code on the host is mounted as `/app` inside the Docker container. You may edit the files as usual, either inside the container or on your host with your favorite IDE or editor.
+
+The following files are also mapped read-only from your host:
+
+```yaml
+# host credentials
+- $HOME/.git-credentials:/root/.git-credentials:ro
+- $HOME/.gitconfig:/root/.gitconfig:ro
+- $HOME/.npmrc:/root/.npmrc:ro
+```
+
+## VSCode
+
+Alternatively, if you use VSCode, you could also [attach to the running container](https://code.visualstudio.com/docs/devcontainers/attach-container).
+
+See details for [developing in docker](https://code.visualstudio.com/docs/devcontainers/containers) or simply ask `OpenHands` ;-)
+
+## Rebuild dev image
+
+You could optionally pass additional options to the build script.
+
+```bash
+make docker-dev OPTIONS="--build"
+# or
+./containers/dev/dev.sh --build
+```
+
+See [docker compose run](https://docs.docker.com/reference/cli/docker/compose/run/) for more options.
@@ -0,0 +1,38 @@
+# Compose definition of the OpenHands development container (service `dev`)
+services:
+  dev:
+    privileged: true
+    build:
+      context: ${OPENHANDS_WORKSPACE:-../../}
+      dockerfile: ./containers/dev/Dockerfile
+    image: openhands:dev
+    container_name: openhands-dev
+    environment:
+      # defaults are written without quotes: quotes inside ${VAR:-...} would become part of the value
+      - BACKEND_HOST=${BACKEND_HOST:-0.0.0.0}
+      - SANDBOX_API_HOSTNAME=host.docker.internal
+      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.9-nikolaik}
+      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
+      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
+    ports:
+      - "3000:3000"
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - ${WORKSPACE_BASE:-$PWD/workspace}:/opt/workspace_base
+      # source code
+      - ${OPENHANDS_WORKSPACE:-../../}:/app
+      # host credentials
+      - $HOME/.git-credentials:/root/.git-credentials:ro
+      - $HOME/.gitconfig:/root/.gitconfig:ro
+      - $HOME/.npmrc:/root/.npmrc:ro
+      # cache
+      - cache-data:/root/.cache
+    pull_policy: never
+    stdin_open: true
+    tty: true
+
+# Named volume mounted at /root/.cache in the dev service
+volumes:
+  cache-data:
@@ -0,0 +1,39 @@
+#!/bin/bash
+set -o pipefail
+
+# Tell the user where to get Docker, then abort the script with exit status 1.
+function get_docker() {
+    printf '%s\n' "Docker is required to build and run OpenHands." "https://docs.docker.com/get-started/get-docker/"
+    exit 1
+}
+
+function check_tools() {  # require the docker CLI on PATH, otherwise bail out via get_docker
+    if ! command -v docker >/dev/null 2>&1; then get_docker; fi
+}
+
+# Abort with status 1 if /.dockerenv exists, i.e. we are already inside a Docker container.
+function exit_if_indocker() {
+    [ -f /.dockerenv ] || return 0
+    echo "Running inside a Docker container. Exiting..."
+    exit 1
+}
+
+#
+exit_if_indocker
+
+check_tools
+
+##
+OPENHANDS_WORKSPACE=$(git rev-parse --show-toplevel)
+
+cd "$OPENHANDS_WORKSPACE/containers/dev/" || exit 1
+
+##
+export BACKEND_HOST="0.0.0.0"
+#
+export SANDBOX_USER_ID=$(id -u)
+export WORKSPACE_BASE=${WORKSPACE_BASE:-$OPENHANDS_WORKSPACE/workspace}
+
+docker compose run --rm --service-ports "$@" dev
+
+##
@@ -1,11 +1,12 @@
-# Dynamic constructed Dockerfile
+# Dynamically constructed Dockerfile

-This folder builds runtime image (sandbox), which will use a `Dockerfile` that is dynamically generated depends on the `base_image` AND a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that's based on the current commit of `openhands`.
+This folder builds a runtime image (sandbox), which will use a dynamically generated `Dockerfile`
+that depends on the `base_image` **AND** a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that is based on the current commit of `openhands`.

-The following command will generate Dockerfile for `ubuntu:22.04` and the source distribution `.tar` into `containers/runtime`.
+The following command will generate a `Dockerfile` file for `nikolaik/python-nodejs:python3.12-nodejs22` (the default base image), an updated `config.sh` and the runtime source distribution files/folders into `containers/runtime`:

 ```bash
 poetry run python3 openhands/runtime/utils/runtime_build.py \
-    --base_image ubuntu:22.04 \
+    --base_image nikolaik/python-nodejs:python3.12-nodejs22 \
    --build_folder containers/runtime
 ```
@@ -1,44 +0,0 @@
-FROM ubuntu:22.04
-
-# install basic packages
-RUN apt-get update && apt-get install -y \
-    curl \
-    wget \
-    git \
-    vim \
-    nano \
-    unzip \
-    zip \
-    python3 \
-    python3-pip \
-    python3-venv \
-    python3-dev \
-    build-essential \
-    openssh-server \
-    sudo \
-    gcc \
-    jq \
-    g++ \
-    make \
-    iproute2 \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN mkdir -p -m0755 /var/run/sshd
-
-# symlink python3 to python
-RUN ln -s /usr/bin/python3 /usr/bin/python
-
-# ==== OpenHands Runtime Client ====
-RUN mkdir -p /openhands && mkdir -p /openhands/logs && chmod 777 /openhands/logs
-RUN wget --progress=bar:force -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
-RUN bash Miniforge3.sh -b -p /openhands/miniforge3
-RUN chmod -R g+w /openhands/miniforge3
-RUN bash -c ". /openhands/miniforge3/etc/profile.d/conda.sh && conda config --set changeps1 False && conda config --append channels conda-forge"
-RUN echo "" > /openhands/bash.bashrc
-RUN rm -f Miniforge3.sh
-
-# - agentskills dependencies
-RUN /openhands/miniforge3/bin/pip install --upgrade pip
-RUN /openhands/miniforge3/bin/pip install jupyterlab notebook jupyter_kernel_gateway flake8
-RUN /openhands/miniforge3/bin/pip install python-docx PyPDF2 python-pptx pylatexenc openai
-RUN /openhands/miniforge3/bin/pip install python-dotenv toml termcolor pydantic python-docx pyyaml docker pexpect tenacity e2b browsergym minio
@@ -1,4 +0,0 @@
-DOCKER_REGISTRY=ghcr.io
-DOCKER_ORG=all-hands-ai
-DOCKER_IMAGE=sandbox
-DOCKER_BASE_DIR="."
@@ -38,6 +38,6 @@ repos:
      - id: mypy
        additional_dependencies:
          [types-requests, types-setuptools, types-pyyaml, types-toml]
-        entry: mypy --config-file dev_config/python/mypy.ini openhands/ agenthub/
+        entry: mypy --config-file dev_config/python/mypy.ini openhands/
        always_run: true
        pass_filenames: false
@@ -4,8 +4,8 @@ import { themes as prismThemes } from "prism-react-renderer";

 const config: Config = {
  title: "OpenHands",
-  tagline: "An Open Platform for AI Software Developers as Generalist Agents",
-  favicon: "img/logo.png",
+  tagline: "Code Less, Make More",
+  favicon: "img/logo-square.png",

  // Set the production url of your site here
  url: "https://docs.all-hands.dev",
@@ -73,23 +73,28 @@ const config: Config = {
          type: "docSidebar",
          sidebarId: "docsSidebar",
          position: "left",
-          label: "Docs",
+          label: "User Guides",
        },
        {
          type: "docSidebar",
          sidebarId: "apiSidebar",
          position: "left",
-          label: "Codebase",
+          label: "Python API",
+        },
+        {
+          type: 'localeDropdown',
+          position: 'left',
+        },
+        {
+          href: "https://all-hands.dev",
+          label: "Company",
+          position: "right",
        },
        {
          href: "https://github.com/All-Hands-AI/OpenHands",
          label: "GitHub",
          position: "right",
        },
-        {
-          type: 'localeDropdown',
-          position: 'left',
-        },
      ],
    },
    prism: {
@@ -59,10 +59,6 @@ Félicitations !

 ## Explication technique

-Le code pertinent est défini dans [ssh_box.py](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/ssh_box.py) et [image_agnostic_util.py](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py).
-
-En particulier, ssh_box.py vérifie l'objet config pour ```config.sandbox.base_container_image``` et ensuite tente de récupérer l'image à l'aide de [get_openhands_sandbox_image](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py#L72), qui est défini dans image_agnostic_util.py.
-
 Lorsqu'une image personnalisée est utilisée pour la première fois, elle ne sera pas trouvée et donc elle sera construite (à l'exécution ultérieure, l'image construite sera trouvée et renvoyée).

 L'image personnalisée est construite avec [_build_sandbox_image()](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py#L29), qui crée un fichier docker en utilisant votre image personnalisée comme base et configure ensuite l'environnement pour OpenHands, comme ceci:
@@ -41,4 +41,4 @@ ne peut être aussi puissant que les modèles qui le pilotent -- heureusement, l

 Certains LLM ont des limites de taux et peuvent nécessiter des réessais. OpenHands réessaiera automatiquement les demandes s'il reçoit une erreur 429 ou une erreur de connexion API.
 Vous pouvez définir les variables d'environnement `LLM_NUM_RETRIES`, `LLM_RETRY_MIN_WAIT`, `LLM_RETRY_MAX_WAIT` pour contrôler le nombre de réessais et le temps entre les réessais.
-Par défaut, `LLM_NUM_RETRIES` est 5 et `LLM_RETRY_MIN_WAIT`, `LLM_RETRY_MAX_WAIT` sont respectivement de 3 secondes et 60 secondes.
+Par défaut, `LLM_NUM_RETRIES` est 8 et `LLM_RETRY_MIN_WAIT`, `LLM_RETRY_MAX_WAIT` sont respectivement de 15 secondes et 120 secondes.
@@ -2,13 +2,30 @@

 默认的 OpenHands 沙箱包含一个[最小化 ubuntu 配置](https://github.com/All-Hands-AI/OpenHands/blob/main/containers/sandbox/Dockerfile)。您的应用场景可能需要在默认状态下安装额外的软件。本指南将教您如何通过使用自定义 Docker 映像来实现这一目标。

+目前提供两种实现方案：
+1. 从 Docker Hub 拉取已有镜像。例如，如果您想安装 `nodejs` ，您可以通过使用 `node:20` 镜像来实现。
+2. 创建并使用您自定义的 Docker 镜像。
+
+若选择第一种方案，您可以直接略过“创建您的 Docker 映像”部分。
+
+为了获得功能更丰富的环境，您可能想要考虑使用预构建的镜像，比如 [nikolaik/python-nodejs](https://hub.docker.com/r/nikolaik/python-nodejs)，这个镜像预装了 Python 和 Node.js，同时还包含了许多其他有用的工具和库，比如：
+
+- Node.js: 22.x
+- npm: 10.x
+- yarn: stable
+- Python: latest
+- pip: latest
+- pipenv: latest
+- poetry: latest
+- uv: latest
+
 ## 环境设置

 确保您能够首先通过 [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) 运行 OpenHands。

 ## 创建您的 Docker 映像

-接下来，您必须创建一个自定义的 Docker 映像，该映像是基于 Debian 或 Ubuntu 的。例如，如果我们希望 OpenHands 能够访问 "node" 可执行文件，我们可以使用以下 `Dockerfile`:
+接下来，您可以开始创建一个自定义的 Docker 映像，该映像必须是基于 Debian 或 Ubuntu 的。例如，如果我们希望 OpenHands 能够访问 `node` 可执行文件，我们可以使用以下 `Dockerfile`:

 ```bash
 # 从最新版 ubuntu 开始
@@ -21,7 +38,7 @@ RUN apt-get update && apt-get install
 RUN apt-get install -y nodejs
 ```

-然后构建您选择的映像，例如“custom_image”。为此可以在目录中创建文件夹并将 `Dockerfile` 放入其中，并在该目录内运行以下命令：
+然后命名并构建您选择的映像，例如“custom_image”。为此可以创建一个文件夹并将 `Dockerfile` 放入其中，并在该文件夹内运行以下命令：

 ```bash
 docker build -t custom_image .
@@ -31,7 +48,7 @@ docker build -t custom_image .

 > 注意：在本文档描述的配置中，OpenHands 将在沙箱内部以“openhands”用户身份运行。因此，通过 Dockerfile 安装的所有包应可供系统上的所有用户使用，而不仅仅是 root 用户。

-> 使用 `apt-get` 上面安装的 node 是为所有用户安装的。
+> `Dockerfile`中，使用 `apt-get` 安装的 node 是为所有用户安装的。

 ## 在 config.toml 文件中指定自定义映像

@@ -41,46 +58,26 @@ docker build -t custom_image .
 [core]
 workspace_base="./workspace"
 run_as_openhands=true
-base_container_image="custom_image"
+sandbox_base_container_image="custom_image"
 ```

-> 确保 `sandbox_base_container_image` 设置为您前一步中自定义映像的名称。
+对于 `sandbox_base_container_image` 的值，您可以选择以下任意一项：
+1. 在上一步中您构建的自定义镜像的名称（例如，`"custom_image"`）
+2. 从 Docker Hub 拉取的镜像（例如，`"node:20"`，如果您需要一个预装 `Node.js` 的沙箱环境）

 ## 运行

-通过运行 ```make run``` 在顶层目录下运行 OpenHands。
+在顶层目录下通过执行 ```make run``` 运行 OpenHands。

 导航至 ```localhost:3001``` 并检查所需依赖是否可用。

-在上述示例的情况下，终端中运行 `node -v` 会输出 `v18.19.1`。
+在上述示例的情况下，终端中运行 `node -v` 会输出 `v20.15.0`。

 恭喜您！

 ## 技术解释

-相关代码定义在 [ssh_box.py](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/ssh_box.py) 和 [image_agnostic_util.py](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py) 中。
-
-特别是 ssh_box.py 检查配置对象中的 ```config.sandbox.base_container_image```，然后尝试使用 [get_openhands_sandbox_image](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py#L72)，在 image_agnostic_util.py 定义中进行检索。
-
-初次使用自定义映像时，该映像将不会被找到，因此将被构建（在后续运行中已构建的映像将被查找并返回）。
-
-自定义映像是通过 `_build_sandbox_image()` 构建的，在 [image_agnostic_util.py](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py#L29) 中，使用您的 custom_image 作为基础，并为 OpenHands 配置环境。例如：
-
-```python
-dockerfile_content = (
-        f'FROM {base_image}\n'
-        'RUN apt update && apt install -y openssh-server wget sudo\n'
-        'RUN mkdir -p -m0755 /var/run/sshd\n'
-        'RUN mkdir -p /openhands && mkdir -p /openhands/logs && chmod 777 /openhands/logs\n'
-        'RUN wget "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"\n'
-        'RUN bash Miniforge3-$(uname)-$(uname -m).sh -b -p /openhands/miniforge3\n'
-        'RUN bash -c ". /openhands/miniforge3/etc/profile.d/conda.sh && conda config --set changeps1 False && conda config --append channels conda-forge"\n'
-        'RUN echo "export PATH=/openhands/miniforge3/bin:$PATH" >> ~/.bashrc\n'
-        'RUN echo "export PATH=/openhands/miniforge3/bin:$PATH" >> /openhands/bash.bashrc\n'
-    ).strip()
-```
-
-> 注意：映像名称通过 [_get_new_image_name()](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py#L63) 修改，并且是后续运行中搜索的修改后的名称。
+请参考[运行时文档中自定义 Docker 镜像的章节](https://docs.all-hands.dev/modules/usage/architecture/runtime#how-openhands-builds-and-maintains-oh-runtime-images)获取更详细的解释。

 ## 故障排除 / 错误

@@ -98,8 +95,8 @@ sandbox_user_id="1001"

 ### 端口使用错误

-如果您看到关于端口被占用或不可用的错误，请尝试删除所有正在运行的 Docker 容器（通过运行 `docker ps` 和 `docker rm` 相关容器），然后重新运行 ```make run```。
+如果您遇到端口被占用或不可用的错误提示，可以尝试先用`docker ps`命令列出所有运行中的 Docker 容器，然后使用`docker rm`命令删除相关容器，最后再重新执行```make run```命令。

 ## 讨论

-对于其他问题或疑问，请加入 [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA) 或 [Discord](https://discord.gg/ESHStjSjD4)，并提问！
+对于其他问题或疑问，请加入 [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA) 或 [Discord](https://discord.gg/ESHStjSjD4) 提问！
@@ -43,4 +43,4 @@ OpenHands 将向你配置的 LLM 发出许多提示。大多数这些 LLM 都是

 一些 LLM 有速率限制，可能需要重试操作。OpenHands 会在收到 429 错误或 API 连接错误时自动重试请求。
 你可以设置 `LLM_NUM_RETRIES`，`LLM_RETRY_MIN_WAIT`，`LLM_RETRY_MAX_WAIT` 环境变量来控制重试次数和重试之间的时间。
-默认情况下，`LLM_NUM_RETRIES` 为 5，`LLM_RETRY_MIN_WAIT` 和 `LLM_RETRY_MAX_WAIT` 分别为 3 秒和 60 秒。
+默认情况下，`LLM_NUM_RETRIES` 为 8，`LLM_RETRY_MIN_WAIT` 和 `LLM_RETRY_MAX_WAIT` 分别为 15 秒和 120 秒。
@@ -74,7 +74,7 @@ WORKSPACE_DIR="$(pwd)/workspace"

 如有需要，可以替换您选择的 `LLM_MODEL`。

-完成！现在您可以通过 `make run` 启动 Devin 而无需 Docker。现在您应该可以连接到 `http://localhost:3000/`
+完成！现在您可以通过 `make run` 启动 OpenHands 而无需 Docker。现在您应该可以连接到 `http://localhost:3000/`

 ## 选择您的模型

@@ -1,29 +1,25 @@
---
-sidebar_position: 8
---
-
 # 📚 Misc

 ## ⭐️ Research Strategy

 Achieving full replication of production-grade applications with LLMs is a complex endeavor. Our strategy involves:

-1. **Core Technical Research:** Focusing on foundational research to understand and improve the technical aspects of code generation and handling.
-2. **Specialist Abilities:** Enhancing the effectiveness of core components through data curation, training methods, and more.
-3. **Task Planning:** Developing capabilities for bug detection, codebase management, and optimization.
-4. **Evaluation:** Establishing comprehensive evaluation metrics to better understand and improve our models.
+1. **Core Technical Research:** Focusing on foundational research to understand and improve the technical aspects of code generation and handling
+2. **Specialist Abilities:** Enhancing the effectiveness of core components through data curation, training methods, and more
+3. **Task Planning:** Developing capabilities for bug detection, codebase management, and optimization
+4. **Evaluation:** Establishing comprehensive evaluation metrics to better understand and improve our models

 ## 🚧 Default Agent

- Our default Agent is currently the CodeActAgent, which is capable of generating code and handling files. We're working on other Agent implementations, including [SWE Agent](https://swe-agent.com/). You can [read about our current set of agents here](./agents).
+Our default Agent is currently the [CodeActAgent](agents), which is capable of generating code and handling files.

 ## 🤝 How to Contribute

 OpenHands is a community-driven project, and we welcome contributions from everyone. Whether you're a developer, a researcher, or simply enthusiastic about advancing the field of software engineering with AI, there are many ways to get involved:

- **Code Contributions:** Help us develop the core functionalities, frontend interface, or sandboxing solutions.
- **Research and Evaluation:** Contribute to our understanding of LLMs in software engineering, participate in evaluating the models, or suggest improvements.
- **Feedback and Testing:** Use the OpenHands toolset, report bugs, suggest features, or provide feedback on usability.
+- **Code Contributions:** Help us develop the core functionalities, frontend interface, or sandboxing solutions
+- **Research and Evaluation:** Contribute to our understanding of LLMs in software engineering, participate in evaluating the models, or suggest improvements
+- **Feedback and Testing:** Use the OpenHands toolset, report bugs, suggest features, or provide feedback on usability

 For details, please check [this document](https://github.com/All-Hands-AI/OpenHands/blob/main/CONTRIBUTING.md).

@@ -1,14 +1,11 @@
---
-sidebar_position: 3
---
+# 🧠 Main Agent and Capabilities

-# 🧠 Agents and Capabilities
-
-## CodeAct Agent
+## CodeActAgent

 ### Description

-This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both _simplicity_ and _performance_ (see paper for more details).
+This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a
+unified **code** action space for both _simplicity_ and _performance_.

 The conceptual idea is illustrated below. At each turn, the agent can:

@@ -20,74 +17,8 @@ The conceptual idea is illustrated below. At each turn, the agent can:

 ![image](https://github.com/All-Hands-AI/OpenHands/assets/38853559/92b622e3-72ad-4a61-8f41-8c040b6d5fb3)

-### Plugin System
-
-To make the CodeAct agent more powerful with only access to `bash` action space, CodeAct agent leverages OpenHands&#x27;s plugin system:
-
- [Jupyter plugin](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/runtime/plugins/jupyter): for IPython execution via bash command
- [SWE-agent tool plugin](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/runtime/plugins/swe_agent_commands): Powerful bash command line tools for software development tasks introduced by [swe-agent](https://github.com/princeton-nlp/swe-agent).
-
 ### Demo

 https://github.com/All-Hands-AI/OpenHands/assets/38853559/f592a192-e86c-4f48-ad31-d69282d5f6ac

-_Example of CodeActAgent with `gpt-4-turbo-2024-04-09` performing a data science task (linear regression)_
-
-### Actions
-
-`Action`,
-`CmdRunAction`,
-`IPythonRunCellAction`,
-`AgentEchoAction`,
-`AgentFinishAction`,
-`AgentTalkAction`
-
-### Observations
-
-`CmdOutputObservation`,
-`IPythonRunCellObservation`,
-`AgentMessageObservation`,
-`UserMessageObservation`
-
-### Methods
-
-| Method          | Description                                                                                                                                     |
-| --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
-| `__init__`      | Initializes an agent with `llm` and a list of messages `list[Mapping[str, str]]`                                                                |
-| `step`          | Performs one step using the CodeAct Agent. This includes gathering info on previous steps and prompting the model to make a command to execute. |
-
-## Planner Agent
-
-### Description
-
-The planner agent utilizes a special prompting strategy to create long term plans for solving problems.
-The agent is given its previous action-observation pairs, current task, and hint based on last action taken at every step.
-
-### Actions
-
-`NullAction`,
-`CmdRunAction`,
-`BrowseURLAction`,
-`GithubPushAction`,
-`FileReadAction`,
-`FileWriteAction`,
-`AgentThinkAction`,
-`AgentFinishAction`,
-`AgentSummarizeAction`,
-`AddTaskAction`,
-`ModifyTaskAction`,
-
-### Observations
-
-`Observation`,
-`NullObservation`,
-`CmdOutputObservation`,
-`FileReadObservation`,
-`BrowserOutputObservation`
-
-### Methods
-
-| Method          | Description                                                                                                                                                                               |
-| --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `__init__`      | Initializes an agent with `llm`                                                                                                                                                           |
-| `step`          | Checks to see if current step is completed, returns `AgentFinishAction` if True. Otherwise, creates a plan prompt and sends to model for inference, adding the result as the next action. |
+_Example of CodeActAgent with `gpt-4-turbo-2024-04-09` performing a data science task (linear regression)_.
@@ -1,7 +1,3 @@
---
-sidebar_position: 7
---
-
 # 🏛️ System Architecture

 <div style={{ textAlign: 'center' }}>
@@ -3,30 +3,25 @@
 The OpenHands EventStream Runtime is the core component that enables secure and flexible execution of AI agent's action.
 It creates a sandboxed environment using Docker, where arbitrary code can be run safely without risking the host system.

-
 ## Why do we need a sandboxed runtime?

 OpenHands needs to execute arbitrary code in a secure, isolated environment for several reasons:

-1. Security: Executing untrusted code can pose significant risks to the host system. A sandboxed environment prevents malicious code from accessing or modifying the host system's resources.
+1. Security: Executing untrusted code can pose significant risks to the host system. A sandboxed environment prevents malicious code from accessing or modifying the host system's resources
+2. Consistency: A sandboxed environment ensures that code execution is consistent across different machines and setups, eliminating "it works on my machine" issues
+3. Resource Control: Sandboxing allows for better control over resource allocation and usage, preventing runaway processes from affecting the host system
+4. Isolation: Different projects or users can work in isolated environments without interfering with each other or the host system
+5. Reproducibility: Sandboxed environments make it easier to reproduce bugs and issues, as the execution environment is consistent and controllable

-2. Consistency: A sandboxed environment ensures that code execution is consistent across different machines and setups, eliminating "it works on my machine" issues.
-
-3. Resource Control: Sandboxing allows for better control over resource allocation and usage, preventing runaway processes from affecting the host system.
-
-4. Isolation: Different projects or users can work in isolated environments without interfering with each other or the host system.
-
-5. Reproducibility: Sandboxed environments make it easier to reproduce bugs and issues, as the execution environment is consistent and controllable.
-
-## How does our Runtime work?
+## How does the Runtime work?

 The OpenHands Runtime system uses a client-server architecture implemented with Docker containers. Here's an overview of how it works:

 ```mermaid
 graph TD
    A[User-provided Custom Docker Image] --> B[OpenHands Backend]
-    B -->|Builds| C[OD Runtime Image]
-    C -->|Launches| D[Runtime Client]
+    B -->|Builds| C[OH Runtime Image]
+    C -->|Launches| D[Action Executor]
    D -->|Initializes| E[Browser]
    D -->|Initializes| F[Bash Shell]
    D -->|Initializes| G[Plugins]
@@ -51,127 +46,83 @@ graph TD
    end
 ```

-1. User Input: The user provides a custom base Docker image.
-
-2. Image Building: OpenHands builds a new Docker image (the "OD runtime image") based on the user-provided image. This new image includes OpenHands-specific code, primarily the "runtime client."
-
-3. Container Launch: When OpenHands starts, it launches a Docker container using the OD runtime image.
-
-4. Client Initialization: The runtime client initializes inside the container, setting up necessary components like a bash shell and loading any specified plugins.
-
-5. Communication: The OpenHands backend (`runtime.py`) communicates with the runtime client over RESTful API, sending actions and receiving observations.
-
-6. Action Execution: The runtime client receives actions from the backend, executes them in the sandboxed environment, and sends back observations.
-
-7. Observation Return: The client sends execution results back to the OpenHands backend as observations.
+1. User Input: The user provides a custom base Docker image
+2. Image Building: OpenHands builds a new Docker image (the "OH runtime image") based on the user-provided image. This new image includes OpenHands-specific code, primarily the "runtime client"
+3. Container Launch: When OpenHands starts, it launches a Docker container using the OH runtime image
+4. Action Execution Server Initialization: The action execution server initializes an `ActionExecutor` inside the container, setting up necessary components like a bash shell and loading any specified plugins
+5. Communication: The OpenHands backend (`openhands/runtime/impl/eventstream/eventstream_runtime.py`) communicates with the action execution server over RESTful API, sending actions and receiving observations
+6. Action Execution: The action execution server receives actions from the backend, executes them in the sandboxed environment, and sends back observations
+7. Observation Return: The action execution server sends execution results back to the OpenHands backend as observations


-The role of the client is crucial:
- It acts as an intermediary between the OpenHands backend and the sandboxed environment.
- It executes various types of actions (shell commands, file operations, Python code, etc.) safely within the container.
- It manages the state of the sandboxed environment, including the current working directory and loaded plugins.
- It formats and returns observations to the backend, ensuring a consistent interface for processing results.
+The role of the client:
+- It acts as an intermediary between the OpenHands backend and the sandboxed environment
+- It executes various types of actions (shell commands, file operations, Python code, etc.) safely within the container
+- It manages the state of the sandboxed environment, including the current working directory and loaded plugins
+- It formats and returns observations to the backend, ensuring a consistent interface for processing results


-## Advanced: How OpenHands builds and maintains OD Runtime images
+## How OpenHands builds and maintains OH Runtime images

-OpenHands uses a sophisticated approach to build and manage runtime images. This process ensures efficiency, consistency, and flexibility in creating and maintaining Docker images for both production and development environments.
+OpenHands' approach to building and managing runtime images ensures efficiency, consistency, and flexibility in creating and maintaining Docker images for both production and development environments.

-Check out [relavant code](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/utils/runtime_build.py) if you are interested in more details.
+Check out the [relevant code](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/utils/runtime_build.py) if you are interested in more details.

 ### Image Tagging System

-OpenHands uses a dual-tagging system for its runtime images to balance reproducibility with flexibility:
+OpenHands uses a dual-tagging system for its runtime images to balance reproducibility with flexibility.
+Tags may be in one of 2 formats:

-1. Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`
-   Example: `runtime:abc123def456`
+- **Generic**: `oh_v{openhands_version}_{16_digit_lock_hash}` (e.g.: `oh_v0.9.9_1234567890abcdef`)
+- **Specific**: `oh_v{openhands_version}_{16_digit_lock_hash}_{16_digit_source_hash}`
+  (e.g.: `oh_v0.9.9_1234567890abcdef_1234567890abcdef`)

-   - This tag is based on the MD5 hash of the Docker build folder, which includes the source code (of runtime client and related dependencies) and Dockerfile.
-   - Identical hash tags guarantee that the images were built with exactly the same source code and Dockerfile.
-   - This ensures reproducibility: the same hash always means the same image contents.
+#### Lock Hash

-2. Generic tag: `{target_image_repo}:{target_image_tag}`
-   Example: `runtime:openhands_v0.8.3_ubuntu_tag_22.04`
+This hash is built from the first 16 digits of the MD5 of:
+- The name of the base image upon which the image was built (e.g.: `nikolaik/python-nodejs:python3.12-nodejs22`)
+- The content of the `pyproject.toml` included in the image.
+- The content of the `poetry.lock` included in the image.

-   - This tag follows the format: `runtime:openhands_v{OPENHANDS_VERSION}_{BASE_IMAGE_NAME}_tag_{BASE_IMAGE_TAG}`
-   - It represents the latest build for a particular base image and OpenHands version combination.
-   - This tag is updated whenever a new image is built from the same base image, even if the source code changes.
+This effectively gives a hash for the dependencies of OpenHands, independent of the source code.

-The hash-based tag ensures exact reproducibility, while the generic tag provides a stable reference to the latest version of a particular configuration. This dual-tagging approach allows OpenHands to efficiently manage both development and production environments.
+#### Source Hash

-### Build Process
+This is the first 16 digits of the MD5 of the directory hash for the source directory. This gives a hash
+for only the OpenHands source.

-1. Image Naming Convention:
-   - Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`
-     Example: `runtime:abc123def456`
-   - Generic tag: `{target_image_repo}:{target_image_tag}`
-     Example: `runtime:openhands_v0.8.3_ubuntu_tag_22.04`
+#### Build Process

-2. Build Process:
-   - a. Convert the base image name to an OD runtime image name.
-      Example: `ubuntu:22.04` -> `runtime:openhands_v0.8.3_ubuntu_tag_22.04`
-   - b. Generate a build context (Dockerfile and OpenHands source code) and calculate its hash.
-   - c. Check for an existing image with the calculated hash.
-   - d. If not found, check for a recent compatible image to use as a base.
-   - e. If no compatible image exists, build from scratch using the original base image.
-   - f. Tag the new image with both hash-based and generic tags.
+When generating an image...

-3. Image Reuse and Rebuilding Logic:
-   The system follows these steps to determine whether to build a new image or use an existing one from a user-provided (base) image (e.g., `ubuntu:22.04`):
+- OpenHands first checks whether an image with the same **Specific** tag exists. If there is such an image,
+  no build is performed - the existing image is used.
+- OpenHands next checks whether an image with the **Generic** tag exists. If there is such an image,
+  OpenHands builds a new image based upon it, bypassing all installation steps (like `poetry install` and
+  `apt-get`) except a final operation to copy the current source code. The new image is tagged with a
+  **Specific** tag only.
+- If neither a **Specific** nor **Generic** tag exists, a brand new image is built based upon the base
+  image (which is a slower operation). This new image is tagged with both the **Generic** and **Specific**
+  tags.

-   a. If an image exists with the same hash (e.g., `runtime:abc123def456`), it will be reused as is.
+This dual-tagging approach allows OpenHands to efficiently manage both development and production environments.

-   b. If the exact hash is not found, the system will try to rebuild using the latest generic image (e.g., `runtime:openhands_v0.8.3_ubuntu_tag_22.04`) as a base. This saves time by leveraging existing dependencies.
+1. Identical source code and Dockerfile always produce the same image (via hash-based tags)
+2. The system can quickly rebuild images when minor changes occur (by leveraging recent compatible images)
+3. The generic tag (e.g., `runtime:oh_v0.9.3_1234567890abcdef`) always points to the latest build for a particular base image and OpenHands version combination

-   c. If neither the hash-tagged nor the generic-tagged image is found, the system will build the image completely from scratch.
-
-4. Caching and Efficiency:
-   - The system attempts to reuse existing images when possible to save build time.
-   - If an exact match (by hash) is found, it's used without rebuilding.
-   - If a compatible image is found, it's used as a base for rebuilding, saving time on dependency installation.
-
-Here's a flowchart illustrating the build process:
-
-```mermaid
-flowchart TD
-    A[Start] --> B{Convert base image name}
-    B --> |ubuntu:22.04 -> runtime:openhands_v0.8.3_ubuntu_tag_22.04| C[Generate build context and hash]
-    C --> D{Check for existing image with hash}
-    D -->|Found runtime:abc123def456| E[Use existing image]
-    D -->|Not found| F{Check for runtime:openhands_v0.8.3_ubuntu_tag_22.04}
-    F -->|Found| G[Rebuild based on recent image]
-    F -->|Not found| H[Build from scratch]
-    G --> I[Tag with hash and generic tags]
-    H --> I
-    E --> J[End]
-    I --> J
-```
-
-This approach ensures that:
-
-1. Identical source code and Dockerfile always produce the same image (via hash-based tags).
-2. The system can quickly rebuild images when minor changes occur (by leveraging recent compatible images).
-3. The generic tag (e.g., `runtime:openhands_v0.8.3_ubuntu_tag_22.04`) always points to the latest build for a particular base image and OpenHands version combination.
-
-By using this method, OpenHands maintains an efficient and flexible system for building and managing runtime images, adapting to both development needs and production requirements.
-
-
-## Advanced: Runtime Plugin System
+## Runtime Plugin System

 The OpenHands Runtime supports a plugin system that allows for extending functionality and customizing the runtime environment. Plugins are initialized when the runtime client starts up.

-Check [an example of Jupyter plugin here](https://github.com/All-Hands-AI/OpenHands/blob/9c44d94cef32e6426ebd8deeeb52963153b2348a/openhands/runtime/plugins/jupyter/__init__.py#L30-L63) if you want to implement your own plugin.
+Check [an example of Jupyter plugin here](https://github.com/All-Hands-AI/OpenHands/blob/ecf4aed28b0cf7c18d4d8ff554883ba182fc6bdd/openhands/runtime/plugins/jupyter/__init__.py#L21-L55) if you want to implement your own plugin.

 *More details about the Plugin system are still under construction - contributions are welcomed!*

 Key aspects of the plugin system:

-1. Plugin Definition: Plugins are defined as Python classes that inherit from a base `Plugin` class.
-
-2. Plugin Registration: Available plugins are registered in an `ALL_PLUGINS` dictionary.
-
-3. Plugin Specification: Plugins are associate with `Agent.sandbox_plugins: list[PluginRequirement]`. Users can specify which plugins to load when initializing the runtime.
-
-4. Initialization: Plugins are initialized asynchronously when the runtime client starts.
-
-5. Usage: The runtime client can use initialized plugins to extend its capabilities (e.g., the JupyterPlugin for running IPython cells).
+1. Plugin Definition: Plugins are defined as Python classes that inherit from a base `Plugin` class
+2. Plugin Registration: Available plugins are registered in an `ALL_PLUGINS` dictionary
+3. Plugin Specification: Plugins are associated with `Agent.sandbox_plugins: list[PluginRequirement]`. Users can specify which plugins to load when initializing the runtime
+4. Initialization: Plugins are initialized asynchronously when the runtime client starts
+5. Usage: The runtime client can use initialized plugins to extend its capabilities (e.g., the JupyterPlugin for running IPython cells)
@@ -1,14 +1,10 @@
---
-sidebar_position: 5
---
-
 # ✅ Providing Feedback

-When using OpenHands, you will undoubtably encounter cases where things work well, and others where they don't. We encourage you to provide feedback when you use OpenHands to help give feedback to the development team, and perhaps more importantly, create an open corpus of coding agent training examples -- Share-OpenHands!
+When using OpenHands, you will encounter cases where things work well, and others where they don't. We encourage you to provide feedback when you use OpenHands to help the development team, and perhaps more importantly, create an open corpus of coding agent training examples -- Share-OpenHands!

 ## 📝 How to Provide Feedback

-Providing feedback is easy! When you are using OpenHands, you can press the thumbs-up or thumbs-down button at any point during your interaction with. You will be prompted to provide your email address (e.g. so we can contact you if we want to ask any follow-up questions), and you can choose whether you want to provide feedback publicly or privately.
+Providing feedback is easy! When you are using OpenHands, you can press the thumbs-up or thumbs-down button at any point during your interaction. You will be prompted to provide your email address (e.g. so we can contact you if we want to ask any follow-up questions), and you can choose whether you want to provide feedback publicly or privately.

 <iframe width="560" height="315" src="https://www.youtube.com/embed/5rFx-StMVV0?si=svo7xzp6LhGK_GXr" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>

@@ -31,7 +27,7 @@ The public data will be released when we hit fixed milestones, such as 1,000 pub
 At this time, we will follow the following release process:

 1. All people who contributed public feedback will receive an email describing the data release and being given an opportunity to opt out.
-2. The person or people in charge of the data release will perform quality control of the data, removing low-quality feedback, removing email submitter email addresses, and attempting to remove any sensitive information such as API keys.
+2. The person or people in charge of the data release will perform quality control of the data, removing low-quality feedback, removing submitter email addresses, and attempting to remove any sensitive information.
 3. The data will be released publicly under the MIT license through commonly used sites such as github or Hugging Face.

 ### What if I want my data deleted?
@@ -0,0 +1,111 @@
+# Getting Started with OpenHands
+
+So you've [installed OpenHands](./installation) and have
+[set up your LLM](./installation#setup). Now what?
+
+OpenHands can help you tackle a wide variety of engineering tasks. But the technology
+is still new, and we're a long way off from having agents that can take on large, complicated
+engineering tasks without any guidance. So it's important to get a feel for what the agent
+does well, and where it might need some help.
+
+## Hello World
+
+The first thing you might want to try is a simple "hello world" example.
+This can be more complicated than it sounds!
+
+Try prompting the agent with:
+> Please write a bash script hello.sh that prints "hello world!"
+
+You should see that the agent not only writes the script, it sets the correct
+permissions and runs the script to check the output.
+
+You can continue prompting the agent to refine your code. This is a great way to
+work with agents. Start simple, and iterate.
+
+> Please modify hello.sh so that it accepts a name as the first argument, but defaults to "world"
+
+You can also work in any language you need, though the agent might need to spend some
+time setting up its environment!
+
+> Please convert hello.sh to a Ruby script, and run it
+
+## Building From Scratch
+
+Agents do exceptionally well at "greenfield" tasks (tasks where they don't need
+any context about an existing codebase) and they can just start from scratch.
+
+It's best to start with a simple task, and then iterate on it. It's also best to be
+as specific as possible about what you want, what the tech stack should be, etc.
+
+For example, we might build a TODO app:
+
+> Please build a basic TODO list app in React. It should be frontend-only, and all state
+> should be kept in localStorage.
+
+We can keep iterating on the app once the skeleton is there:
+
+> Please allow adding an optional due date to every task
+
+Just like with normal development, it's good to commit and push your code frequently.
+This way you can always revert back to an old state if the agent goes off track.
+You can ask the agent to commit and push for you:
+
+> Please commit the changes and push them to a new branch called "feature/due-dates"
+
+
+## Adding New Code
+
+OpenHands can also do a great job adding new code to an existing code base.
+
+For example, you can ask OpenHands to add a new GitHub action to your project
+which lints your code. OpenHands may take a peek at your codebase to see what language
+it should use, but then it can just drop a new file into `./.github/workflows/lint.yml`.
+
+> Please add a GitHub action that lints the code in this repository
+
+Some tasks might require a bit more context. While OpenHands can use `ls` and `grep`
+to search through your codebase, providing context up front allows it to move faster,
+and more accurately. And it'll cost you fewer tokens!
+
+> Please modify ./backend/api/routes.js to add a new route that returns a list of all tasks
+
+> Please add a new React component that displays a list of Widgets to the ./frontend/components
+> directory. It should use the existing Widget component.
+
+## Refactoring
+
+OpenHands does great at refactoring existing code, especially in small chunks.
+You probably don't want to try rearchitecting your whole codebase, but breaking up
+long files and functions, renaming variables, etc. tend to work very well.
+
+> Please rename all the single-letter variables in ./app.go
+
+> Please break the function `build_and_deploy_widgets` into two functions, `build_widgets` and `deploy_widgets` in widget.php
+
+> Please break ./api/routes.js into separate files for each route
+
+## Bug Fixes
+
+OpenHands can also help you track down and fix bugs in your code. But, as any
+developer knows, bug fixing can be extremely tricky, and often OpenHands will need more context.
+It helps if you've diagnosed the bug, but want OpenHands to figure out the logic.
+
+> Currently the email field in the `/subscribe` endpoint is rejecting .io domains. Please fix this.
+
+> The `search_widgets` function in ./app.py is doing a case-sensitive search. Please make it case-insensitive.
+
+It often helps to do test-driven development when bugfixing with an agent.
+You can ask the agent to write a new test, and then iterate until it fixes the bug:
+
+> The `hello` function crashes on the empty string. Please write a test that reproduces this bug, then fix the code so it passes.
+
+## More
+
+OpenHands is capable of helping out on just about any coding task. But it takes some practice
+to get the most out of it. Remember to:
+* Keep your tasks small
+* Be as specific as possible
+* Provide as much context as possible
+* Commit and push frequently
+
+See [Prompting Best Practices](./prompting-best-practices) for more tips on how to get the most out of OpenHands.
@@ -0,0 +1,108 @@
+# CLI Mode
+
+OpenHands can be run in an interactive CLI mode, which allows users to start an interactive session via the command line.
+
+This mode is different from the [headless mode](headless-mode), which is non-interactive and better for scripting.
+
+## With Python
+
+To start an interactive OpenHands session via the command line, follow these steps:
+
+1. Ensure you have followed the [Development setup instructions](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
+
+2. Run the following command:
+
+```bash
+poetry run python -m openhands.core.cli
+```
+
+This command will start an interactive session where you can input tasks and receive responses from OpenHands.
+
+You'll need to be sure to set your model, API key, and other settings via environment variables
+[or the `config.toml` file](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml).
+
+
+## With Docker
+
+To run OpenHands in CLI mode with Docker, follow these steps:
+
+1. Set `WORKSPACE_BASE` to the directory you want OpenHands to edit:
+
+```bash
+WORKSPACE_BASE=$(pwd)/workspace
+```
+
+2. Set `LLM_MODEL` to the model you want to use:
+
+```bash
+LLM_MODEL="anthropic/claude-3-5-sonnet-20240620"
+```
+
+3. Set `LLM_API_KEY` to your API key:
+
+```bash
+LLM_API_KEY="sk_test_12345"
+```
+
+4. Run the following Docker command:
+
+```bash
+docker run -it \
+    --pull=always \
+    -e SANDBOX_USER_ID=$(id -u) \
+    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
+    -e LLM_API_KEY=$LLM_API_KEY \
+    -e LLM_MODEL=$LLM_MODEL \
+    -v $WORKSPACE_BASE:/opt/workspace_base \
+    -v /var/run/docker.sock:/var/run/docker.sock \
+    --add-host host.docker.internal:host-gateway \
+    --name openhands-app-$(date +%Y%m%d%H%M%S) \
+    ghcr.io/all-hands-ai/openhands:0.11 \
+    python -m openhands.core.cli
+```
+
+This command will start an interactive session in Docker where you can input tasks and receive responses from OpenHands.
+
+## Examples of CLI Commands and Expected Outputs
+
+Here are some examples of CLI commands and their expected outputs:
+
+### Example 1: Simple Task
+
+```bash
+How can I help? >> Write a Python script that prints "Hello, World!"
+```
+
+Expected Output:
+
+```bash
+🤖 Sure! Here is a Python script that prints "Hello, World!":
+
+❯ print("Hello, World!")
+```
+
+### Example 2: Bash Command
+
+```bash
+How can I help? >> Create a directory named "test_dir"
+```
+
+Expected Output:
+
+```bash
+🤖 Creating a directory named "test_dir":
+
+❯ mkdir test_dir
+```
+
+### Example 3: Error Handling
+
+```bash
+How can I help? >> Delete a non-existent file
+```
+
+Expected Output:
+
+```bash
+🤖 An error occurred. Please try again.
+```
@@ -1,89 +1,64 @@
-# Create and Use a Custom Docker Sandbox
+# Custom Sandbox

-The default OpenHands sandbox comes with a [minimal ubuntu configuration](https://github.com/All-Hands-AI/OpenHands/blob/main/containers/sandbox/Dockerfile).
+The sandbox is where the agent performs its tasks. Instead of running commands directly on your computer
+(which could be risky), the agent runs them inside a Docker container.

-Your use case may need additional software installed by default.
+The default OpenHands sandbox (`python-nodejs:python3.12-nodejs22`
+from [nikolaik/python-nodejs](https://hub.docker.com/r/nikolaik/python-nodejs)) comes with some packages installed such
+as Python and Node.js but may need other software installed by default.

-There are two ways you can do so:
+You have two options for customization:

-1. Use an existing image from docker hub. For instance, if you want to have `nodejs` installed, you can do so by using the `node:20` image
-2. Creating your own custom docker image and using it
+1. Use an existing image with the required software.
+2. Create your own custom Docker image.

-If you want to take the first approach, you can skip the `Create Your Docker Image` section.
-
-For a more feature-rich environment, you might consider using pre-built images like **[nikolaik/python-nodejs](https://hub.docker.com/r/nikolaik/python-nodejs)**, which comes with both Python and Node.js pre-installed, along with many other useful tools and libraries, like:
-
- Node.js: 22.x
- npm: 10.x
- yarn: stable
- Python: latest
- pip: latest
- pipenv: latest
- poetry: latest
- uv: latest
-
-## Setup
-
-Make sure you are able to run OpenHands using the [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) first.
+If you choose the first option, you can skip the `Create Your Docker Image` section.

 ## Create Your Docker Image

-To create a custom docker image, it must be debian/ubuntu based.
+To create a custom Docker image, it must be Debian based.

-For example, if we want OpenHands to have access to the `node` binary, we would use the following Dockerfile:
+For example, if you want OpenHands to have `ruby` installed, create a `Dockerfile` with the following content:

 ```dockerfile
-# Start with latest ubuntu image
-FROM ubuntu:latest
+FROM debian:latest

-# Run needed updates
-RUN apt-get update && apt-get install -y
-
-# Install node
-RUN apt-get install -y nodejs
+# Install required packages
+RUN apt-get update && apt-get install -y ruby
 ```

-Next build your docker image with the name of your choice, for example `custom_image`.
+Save this file in a folder. Then, build your Docker image (e.g., named `custom-image`) by navigating to the folder in
+the terminal and running:
+```bash
+docker build -t custom-image .
+```

-To do this you can create a directory and put your file inside it with the name `Dockerfile`, and inside the directory run the following command:
+This will produce a new image called `custom-image`, which will be available in Docker.
+
+> Note that in the configuration described in this document, OpenHands will run as user "openhands" inside the
+> sandbox and thus all packages installed via the docker file should be available to all users on the system, not just root.
+
+## Using the Development Workflow
+
+### Setup
+
+First, ensure you can run OpenHands by following the instructions in [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
+
+### Specify the Base Sandbox Image
+
+In the `config.toml` file within the OpenHands directory, set the `sandbox_base_container_image` to the image you want to use.
+This can be an image you’ve already pulled or one you’ve built:

 ```bash
-docker build -t custom_image .
-```
-
-This will produce a new image called ```custom_image``` that will be available in Docker Engine.
-
-> Note that in the configuration described in this document, OpenHands will run as user "openhands" inside the sandbox and thus all packages installed via the docker file should be available to all users on the system, not just root.
->
-> Installing with apt-get above installs node for all users.
-
-## Specify your sandbox image in config.toml file
-
-OpenHands configuration occurs via the top-level `config.toml` file.
-
-Create a `config.toml` file in the OpenHands directory and enter these contents:
-
-```toml
 [core]
-workspace_base="./workspace"
-run_as_openhands=true
-sandbox_base_container_image="custom_image"
+...
+sandbox_base_container_image="custom-image"
 ```

-For `sandbox_base_container_image`, you can specify either:
+### Run

-1. The name of your custom image that you built in the previous step (e.g., `”custom_image”`)
-2. A pre-existing image from Docker Hub (e.g., `”node:20”` if you want a sandbox with Node.js pre-installed)
-
-## Run
 Run OpenHands by running ```make run``` in the top level directory.

-Navigate to ```localhost:3001``` and check if your desired dependencies are available.
-
-In the case of the example above, running ```node -v``` in the terminal produces ```v20.15.0```
-
-Congratulations!
-
 ## Technical Explanation

 Please refer to [custom docker image section of the runtime documentation](https://docs.all-hands.dev/modules/usage/architecture/runtime#advanced-how-openhands-builds-and-maintains-od-runtime-images) for more details.
@@ -0,0 +1,71 @@
+# Debugging
+
+The following is intended as a primer on debugging OpenHands for development purposes.
+
+## Server / VSCode
+
+The following `launch.json` will allow debugging the agent, controller and server elements, but not the sandbox (which runs inside Docker). It will ignore any changes inside the `workspace/` directory:
+
+```
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "OpenHands CLI",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "openhands.core.cli",
+            "justMyCode": false
+        },
+        {
+            "name": "OpenHands WebApp",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "uvicorn",
+            "args": [
+                "openhands.server.listen:app",
+                "--reload",
+                "--reload-exclude",
+                "${workspaceFolder}/workspace",
+                "--port",
+                "3000"
+            ],
+            "justMyCode": false
+        }
+    ]
+}
+```
+
+More specific debugging configurations which include more parameters may be specified:
+
+```
+    ...
+    {
+      "name": "Debug CodeAct",
+      "type": "debugpy",
+      "request": "launch",
+      "module": "openhands.core.main",
+      "args": [
+        "-t",
+        "Ask me what your task is.",
+        "-d",
+        "${workspaceFolder}/workspace",
+        "-c",
+        "CodeActAgent",
+        "-l",
+        "llm.o1",
+        "-n",
+        "prompts"
+      ],
+      "justMyCode": false
+    }
+    ...
+```
+
+Values in the snippet above can be updated such that:
+
+    * *t*: the task
+    * *d*: the openhands workspace directory
+    * *c*: the agent
+    * *l*: the LLM config (pre-defined in config.toml)
+    * *n*: session name (e.g. eventstream name)
@@ -1,11 +1,10 @@
-# Contribute to OpenHands Evaluation Harness
+# Evaluation

 This guide provides an overview of how to integrate your own evaluation benchmark into the OpenHands framework.

-## Before everything begins: Setup Environment and LLM Configuration
-
-Please follow instruction [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to setup your local development environment and LLM.
+## Setup Environment and LLM Configuration

+Please follow instructions [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to set up your local development environment.
 OpenHands in development mode uses `config.toml` to keep track of most configurations.

 Here's an example configuration file you can use to define and use multiple LLMs:
@@ -13,7 +12,7 @@ Here's an example configuration file you can use to define and use multiple LLMs
 ```toml
 [llm]
 # IMPORTANT: add your API key here, and set the model to the one you want to evaluate
-model = "gpt-4o-2024-05-13"
+model = "claude-3-5-sonnet-20240620"
 api_key = "sk-XXX"

 [llm.eval_gpt4_1106_preview_llm]
@@ -61,9 +60,9 @@ This command runs OpenHands with:

 The main entry point for OpenHands is in `openhands/core/main.py`. Here's a simplified flow of how it works:

-1. Parse command-line arguments and load the configuration.
-2. Create a runtime environment using `create_runtime()`.
-3. Initialize the specified agent.
+1. Parse command-line arguments and load the configuration
+2. Create a runtime environment using `create_runtime()`
+3. Initialize the specified agent
 4. Run the controller using `run_controller()`, which:
   - Attaches the runtime to the agent
   - Executes the agent's task
@@ -85,7 +84,7 @@ To create an evaluation workflow for your benchmark, follow these steps:

 1. Import relevant OpenHands utilities:
   ```python
-    import agenthub
+    import openhands.agenthub
    from evaluation.utils.shared import (
        EvalMetadata,
        EvalOutput,
@@ -127,7 +126,7 @@ To create an evaluation workflow for your benchmark, follow these steps:

 3. Initialize the runtime and set up the evaluation environment:
   ```python
-   async def initialize_runtime(runtime: Runtime, instance: pd.Series):
+   def initialize_runtime(runtime: Runtime, instance: pd.Series):
       # Set up your evaluation environment here
       # For example, setting environment variables, preparing files, etc.
       pass
@@ -135,14 +134,16 @@ To create an evaluation workflow for your benchmark, follow these steps:

 4. Create a function to process each instance:
   ```python
-   async def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
+   from openhands.utils.async_utils import call_async_from_sync
+   def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
       config = get_config(instance, metadata)
-       runtime = await create_runtime(config, sid=instance.instance_id)
-       await initialize_runtime(runtime, instance)
+       runtime = create_runtime(config)
+       call_async_from_sync(runtime.connect)
+       initialize_runtime(runtime, instance)

       instruction = get_instruction(instance, metadata)

-       state = await run_controller(
+       state = run_controller(
           config=config,
           task_str=instruction,
           runtime=runtime,
@@ -234,9 +235,9 @@ Here's a more accurate visual representation:

 In this workflow:

- Executable actions (like running commands or executing code) are handled directly by the Runtime.
- Non-executable actions (typically when the agent wants to communicate or ask for clarification) are handled by the `user_response_fn`.
- The agent then processes the feedback, whether it's an Observation from the Runtime or a simulated response from the `user_response_fn`.
+- Executable actions (like running commands or executing code) are handled directly by the Runtime
+- Non-executable actions (typically when the agent wants to communicate or ask for clarification) are handled by the `user_response_fn`
+- The agent then processes the feedback, whether it's an Observation from the Runtime or a simulated response from the `user_response_fn`

 This approach allows for automated handling of both concrete actions and simulated user interactions, making it suitable for evaluation scenarios where you want to test the agent's ability to complete tasks with minimal human intervention.

@@ -270,8 +271,8 @@ def codeact_user_response(state: State | None) -> str:

 This function does the following:

-1. Provides a standard message encouraging the agent to continue working.
-2. Checks how many times the agent has attempted to communicate with the user.
-3. If the agent has made multiple attempts, it provides an option to give up.
+1. Provides a standard message encouraging the agent to continue working
+2. Checks how many times the agent has attempted to communicate with the user
+3. If the agent has made multiple attempts, it provides an option to give up

 By using this function, you can ensure consistent behavior across multiple evaluation runs and prevent the agent from getting stuck waiting for human input.
@@ -0,0 +1,15 @@
+# Using the OpenHands GitHub Action
+
+This guide explains how to use the OpenHands GitHub Action, both within the OpenHands repository and in your own projects.
+
+## Using the Action in the OpenHands Repository
+
+To use the OpenHands GitHub Action in the OpenHands repository, an OpenHands maintainer can:
+
+1. Create an issue in the repository.
+2. Add the `fix-me` label to the issue.
+3. The action will automatically trigger and attempt to resolve the issue.
+
+## Installing the Action in a New Repository
+
+To install the OpenHands GitHub Action in your own repository, follow the [directions in the OpenHands Resolver repo](https://github.com/All-Hands-AI/OpenHands-resolver?tab=readme-ov-file#using-the-github-actions-workflow).
@@ -0,0 +1,51 @@
+# GUI Mode
+
+## Introduction
+
+OpenHands provides a user-friendly Graphical User Interface (GUI) mode for interacting with the AI assistant. This mode offers an intuitive way to set up the environment, manage settings, and communicate with the AI.
+
+## Installation and Setup
+
+1. Follow the instructions in the [Installation](../installation) guide to install OpenHands.
+
+2. After running the command, access OpenHands at [http://localhost:3000](http://localhost:3000).
+
+## Interacting with the GUI
+
+### Initial Setup
+
+1. Upon first launch, you'll see a settings modal.
+2. Select an `LLM Provider` and `LLM Model` from the dropdown menus.
+3. Enter the corresponding `API Key` for your chosen provider.
+4. Click "Save" to apply the settings.
+
+### Advanced Settings
+
+1. Toggle `Advanced Options` to access additional settings.
+2. Use the `Custom Model` text box to manually enter a model if it's not in the list.
+3. Specify a `Base URL` if required by your LLM provider.
+
+### Main Interface
+
+The main interface consists of several key components:
+
+1. **Chat Window**: The central area where you can view the conversation history with the AI assistant.
+2. **Input Box**: Located at the bottom of the screen, use this to type your messages or commands to the AI.
+3. **Send Button**: Click this to send your message to the AI.
+4. **Settings Button**: A gear icon that opens the settings modal, allowing you to adjust your configuration at any time.
+5. **Workspace Panel**: Displays the files and folders in your workspace, allowing you to navigate and view files, or the agent's past commands or web browsing history.
+
+### Interacting with the AI
+
+1. Type your question, request, or task description in the input box.
+2. Click the send button or press Enter to submit your message.
+3. The AI will process your input and provide a response in the chat window.
+4. You can continue the conversation by asking follow-up questions or providing additional information.
+
+## Tips for Effective Use
+
+1. Be specific in your requests to get the most accurate and helpful responses, as described in the [prompting best practices](../prompting-best-practices).
+2. Use the workspace panel to explore your project structure.
+3. Use one of the recommended models, as described in the [LLMs section](usage/llms/llms.md).
+
+Remember, the GUI mode of OpenHands is designed to make your interaction with the AI assistant as smooth and intuitive as possible. Don't hesitate to explore its features to maximize your productivity.
@@ -1,9 +1,12 @@
-# Running in Headless Mode
+# Headless Mode

-You can run OpenHands via a CLI, without starting the web application. This makes it easy
-to automate tasks with OpenHands.
+You can run OpenHands with a single command, without starting the web application.
+This makes it easy to write scripts and automate tasks with OpenHands.
+
+This is different from [CLI Mode](cli-mode), which is interactive, and better for active development.

 ## With Python
+
 To run OpenHands in headless mode with Python,
 [follow the Development setup instructions](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md),
 and then run:
@@ -12,19 +15,32 @@ and then run:
 poetry run python -m openhands.core.main -t "write a bash script that prints hi"
 ```

+Be sure to set your model, API key, and other settings via environment variables
+[or the `config.toml` file](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml).
+
 ## With Docker
-To run OpenHands in headless mode with Docker, run:
+
+1. Set `WORKSPACE_BASE` to the directory you want OpenHands to edit:

 ```bash
-# Set WORKSPACE_BASE to the directory you want OpenHands to edit
 WORKSPACE_BASE=$(pwd)/workspace
+```

-# Set LLM_API_KEY to an API key, e.g. for OpenAI or Anthropic
-LLM_API_KEY="abcde"
+2. Set `LLM_MODEL` to the model you want to use:

-# Set LLM_MODEL to the model you want to use
-LLM_MODEL="gpt-4o"
+```bash
+LLM_MODEL="anthropic/claude-3-5-sonnet-20240620"
+```

+3. Set `LLM_API_KEY` to your API key:
+
+```bash
+LLM_API_KEY="sk_test_12345"
+```
+
+4. Run the following Docker command:
+
+```bash
 docker run -it \
    --pull=always \
    -e SANDBOX_USER_ID=$(id -u) \
@@ -35,7 +51,6 @@ docker run -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    --add-host host.docker.internal:host-gateway \
    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:main \ # TODO: pin a version here
-    python -m openhands.core.main \
-    -t "Write a bash script that prints Hello World"
+    ghcr.io/all-hands-ai/openhands:0.11 \
+    python -m openhands.core.main -t "write a bash script that prints hi"
 ```
@@ -1,5 +0,0 @@
---
-sidebar_position: 6
---
-
-# 🔎 How To Section
@@ -1,11 +1,11 @@
-# Use OpenHands in OpenShift/K8S
+# Kubernetes

-There are different ways and scenarios that you can do, we're just mentioning one example here:
-1. Create a PV "as a cluster admin" to map workspace_base data and docker directory to the pod through the worker node.
-2. Create a PVC to be able to mount those PVs to the POD
-3. Create a POD which contains two containers; the OpenHands and Sandbox containers.
+There are different ways you might run OpenHands on Kubernetes or OpenShift. This guide goes through one possible way:
+1. Create a PV "as a cluster admin" to map workspace_base data and docker directory to the pod through the worker node
+2. Create a PVC to be able to mount those PVs to the pod
+3. Create a pod which contains two containers; the OpenHands and Sandbox containers

-## Steps to follow the above example.
+## Detailed Steps for the Example Above

 > Note: Make sure you are logged in to the cluster first with the proper account for each step. PV creation requires cluster administrator!

@@ -135,8 +135,8 @@ LAST SEEN   TYPE     REASON                 OBJECT
 10s         Normal   WaitForFirstConsumer   persistentvolumeclaim/workspace-pvc   waiting for first consumer to be created before binding
 ```

-3. Create the POD yaml file:
-Sample POD yaml file below:
+3. Create the pod yaml file:
+Sample pod yaml file below:

 - pod.yaml

@@ -177,6 +177,7 @@ spec:
      claimName: docker-pvc
 ```

+
 ```bash
 # create the pod
 $ oc create -f pod.yaml
@@ -264,33 +265,165 @@ Events:                   <none>
 ![image](https://github.com/user-attachments/assets/12f94804-a0c7-4744-b873-e003c9caf40e)


-## Challenges
-Some of the challenages that would be needed to improve:

-1. Install GIT into the container:
-   This can be resolved by building a custom image which includes GIT software and use that image during pod deplyment.
+## GCP GKE OpenHands Deployment

-Example below: "to be tested!"
+**Warning**: This deployment grants the OpenHands application access to the Kubernetes Docker socket, which creates a security risk. Use at your own discretion.
+1. Create a policy for privileged access
+2. Create GKE credentials (optional)
+3. Create the OpenHands deployment
+4. Verification and UI access commands
+5. Troubleshoot the pod to verify the internal container

-```dockerfile
-FROM ghcr.io/all-hands-ai/openhands:main
-
-# Install Git
-RUN apt-get update && apt-get install -y git
-
-# Ensure /opt/workspace_base is writable
-RUN mkdir -p /opt/workspace_base && chown -R 1000:1000 /opt/workspace_base
-
-# Verify Git installation
-RUN git --version
+1. Create a policy for privileged access
+```yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: privileged-role
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["create", "get", "list", "watch", "delete"]
+- apiGroups: ["apps"]
+  resources: ["deployments"]
+  verbs: ["create", "get", "list", "watch", "delete"]
+- apiGroups: [""]
+  resources: ["pods/exec"]
+  verbs: ["create"]
+- apiGroups: [""]
+  resources: ["pods/log"]
+  verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: privileged-role-binding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: privileged-role
+subjects:
+- kind: ServiceAccount
+  name: default  # Change to your service account name
+  namespace: default
 ```
+2. Create GKE credentials (optional)
+```bash
+kubectl create secret generic google-cloud-key \
+  --from-file=key.json=/path/to/your/google-cloud-key.json
+  ```
+3. Create the OpenHands deployment
+> Note: This was tested on a cluster with a single worker node. If you have multiple worker nodes, specify the flag for the single worker.

-2. Mount a shared development directory "i.e. one hosted in EC2 instance" to the POD:
-   This can be also done by sharing the developement directory to the worker node through a sharing software (NFS), then creating a pv and pvc as described above to access that directory.
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: openhands-app-2024
+  labels:
+    app: openhands-app-2024
+spec:
+  replicas: 1  # You can increase this number for multiple replicas
+  selector:
+    matchLabels:
+      app: openhands-app-2024
+  template:
+    metadata:
+      labels:
+        app: openhands-app-2024
+    spec:
+      containers:
+      - name: openhands-app-2024
+        image: ghcr.io/all-hands-ai/openhands:main
+        env:
+        - name: SANDBOX_USER_ID
+          value: "1000"
+        - name: SANDBOX_API_HOSTNAME
+          value: '10.164.0.4'
+        - name: WORKSPACE_MOUNT_PATH
+          value: "/tmp/workspace_base"
+        - name: GOOGLE_APPLICATION_CREDENTIALS
+          value: "/tmp/workspace_base/google-cloud-key.json"
+        volumeMounts:
+        - name: workspace-volume
+          mountPath: /tmp/workspace_base
+        - name: docker-sock
+          mountPath: /var/run/docker.sock
+        - name: google-credentials
+          mountPath: "/tmp/workspace_base/google-cloud-key.json"
+        securityContext:
+          privileged: true  # Add this to allow privileged access
+        ports:
+        - containerPort: 3000
+      - name: openhands-sandbox-2024
+        image: ghcr.io/opendevin/sandbox:main
+    #    securityContext:
+    #      privileged: true  # Add this to allow privileged access
+        ports:
+        - containerPort: 51963
+        command: ["/usr/sbin/sshd", "-D", "-p 51963", "-o", "PermitRootLogin=yes"]
+      volumes:
+      #- name: workspace-volume
+      #  persistentVolumeClaim:
+      #    claimName: workspace-pvc
+      - name: workspace-volume
+        emptyDir: {}
+      - name: docker-sock
+        hostPath:
+          path: /var/run/docker.sock       # Use host's Docker socket
+          type: Socket
+      - name: google-credentials
+        secret:
+          secretName: google-cloud-key
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: openhands-app-2024-svc
+spec:
+  selector:
+    app: openhands-app-2024
+  ports:
+  - name: http
+    protocol: TCP
+    port: 80
+    targetPort: 3000
+  - name: ssh
+    protocol: TCP
+    port: 51963
+    targetPort: 51963
+  type: LoadBalancer
+  ```

-3. Not all Agents working! Just tested CoderAgent with an openai API key and produced results.
+5. Troubleshoot the pod to verify the internal container
+> Note: To learn more about the internal container runtime, deploy the pod below, use `kubectl exec -it` to enter the container, and then inspect the container runtime with normal Docker commands such as `docker ps -a`.

-
-## Discuss
-
-For other issues or questions join the [Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA) or [Discord](https://discord.gg/ESHStjSjD4) and ask!
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: docker-in-docker
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: docker-in-docker
+  template:
+    metadata:
+      labels:
+        app: docker-in-docker
+    spec:
+      containers:
+      - name: dind
+        image: docker:20.10-dind
+        securityContext:
+          privileged: true
+        volumeMounts:
+        - name: docker-sock
+          mountPath: /var/run/docker.sock
+      volumes:
+      - name: docker-sock
+        hostPath:
+          path: /var/run/docker.sock
+          type: Socket
+```
@@ -0,0 +1,63 @@
+# Installation
+
+## System Requirements
+
+* Docker version 26.0.0+ or Docker Desktop 4.31.0+.
+* You must be using Linux or macOS.
+  * If you are on Windows, you must use [WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
+
+## Start the app
+
+The easiest way to run OpenHands is in Docker. You can change `WORKSPACE_BASE` below to point OpenHands to
+existing code that you'd like to modify.
+
+```bash
+export WORKSPACE_BASE=$(pwd)/workspace
+
+docker pull ghcr.io/all-hands-ai/runtime:0.11-nikolaik
+
+docker run -it --pull=always \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.11-nikolaik \
+    -e SANDBOX_USER_ID=$(id -u) \
+    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
+    -v $WORKSPACE_BASE:/opt/workspace_base \
+    -v /var/run/docker.sock:/var/run/docker.sock \
+    -p 3000:3000 \
+    --add-host host.docker.internal:host-gateway \
+    --name openhands-app-$(date +%Y%m%d%H%M%S) \
+    ghcr.io/all-hands-ai/openhands:0.11
+```
+
+You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), or using the [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action).
+
+## Setup
+
+After running the command above, you'll find OpenHands running at [http://localhost:3000](http://localhost:3000).
+
+The agent will have access to the `./workspace` folder to do its work. You can copy existing code here, or change `WORKSPACE_BASE` in the
+command to point to an existing folder.
+
+Upon launching OpenHands, you'll see a settings modal. You **must** select an `LLM Provider` and `LLM Model` and enter a corresponding `API Key`.
+These can be changed at any time by selecting the `Settings` button (gear icon) in the UI.
+
+If the required `LLM Model` does not exist in the list, you can toggle `Advanced Options` and manually enter it with the correct prefix
+in the `Custom Model` text box.
+The `Advanced Options` also allow you to specify a `Base URL` if required.
+
+<div style={{ display: 'flex', justifyContent: 'center', gap: '20px' }}>
+  <img src="/img/settings-screenshot.png" alt="settings-modal" width="340" />
+  <img src="/img/settings-advanced.png" alt="settings-modal" width="335" />
+</div>
+
+## Versions
+
+The command above pulls the most recent stable release of OpenHands. You have other options as well:
+- For a specific release, use `ghcr.io/all-hands-ai/openhands:$VERSION`, replacing $VERSION with the version number.
+- We use semver, and release major, minor, and patch tags. So `0.9` will automatically point to the latest `0.9.x` release, and `0` will point to the latest `0.x.x` release.
+- For the most up-to-date development version, you can use `ghcr.io/all-hands-ai/openhands:main`. This version is unstable and is recommended for testing or development purposes only.
+
+You can choose the tag that best suits your needs based on stability requirements and desired features.
+
+For the development workflow, see [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md).
+
+Are you having trouble? Check out our [Troubleshooting Guide](https://docs.all-hands.dev/modules/usage/troubleshooting).
@@ -1,71 +0,0 @@
---
-sidebar_position: 1
---
-
-# 💻 OpenHands
-
-OpenHands is an **autonomous AI software engineer** capable of executing complex engineering tasks and collaborating actively with users on software development projects.
-This project is fully open-source, so you can use and modify it however you like.
-
-:::tip
-Explore the codebase of OpenHands on [GitHub](https://github.com/All-Hands-AI/OpenHands) or join one of our communities!
-
-<a href="https://github.com/All-Hands-AI/OpenHands/graphs/contributors">
-  <img
-    src="https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge"
-    alt="Contributors"
-  />
-</a>
-<a href="https://github.com/All-Hands-AI/OpenHands/network/members">
-  <img
-    src="https://img.shields.io/github/forks/All-Hands-AI/OpenHands?style=for-the-badge"
-    alt="Forks"
-  />
-</a>
-<a href="https://github.com/All-Hands-AI/OpenHands/stargazers">
-  <img
-    src="https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge"
-    alt="Stargazers"
-  />
-</a>
-<a href="https://github.com/All-Hands-AI/OpenHands/issues">
-  <img
-    src="https://img.shields.io/github/issues/All-Hands-AI/OpenHands?style=for-the-badge"
-    alt="Issues"
-  />
-</a>
-<br></br>
-<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE">
-  <img
-    src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge"
-    alt="MIT License"
-  />
-</a>
-<br></br>
-<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA">
-  <img
-    src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge"
-    alt="Join our Slack community"
-  />
-</a>
-<a href="https://discord.gg/ESHStjSjD4">
-  <img
-    src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge"
-    alt="Join our Discord community"
-  />
-</a>
-:::
-
-## 🛠️ Getting Started
-[Check out the getting started guide on Github](https://github.com/All-Hands-AI/OpenHands?tab=readme-ov-file#-getting-started)
-
-[contributors-shield]: https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge
-[contributors-url]: https://github.com/All-Hands-AI/OpenHands/graphs/contributors
-[forks-shield]: https://img.shields.io/github/forks/All-Hands-AI/OpenHands?style=for-the-badge
-[forks-url]: https://github.com/All-Hands-AI/OpenHands/network/members
-[stars-shield]: https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge
-[stars-url]: https://github.com/All-Hands-AI/OpenHands/stargazers
-[issues-shield]: https://img.shields.io/github/issues/All-Hands-AI/OpenHands?style=for-the-badge
-[issues-url]: https://github.com/All-Hands-AI/OpenHands/issues
-[license-shield]: https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge
-[license-url]: https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE
@@ -1,55 +1,46 @@
-# Azure OpenAI LLM
+# Azure

-## Completion
+OpenHands uses LiteLLM to make calls to Azure's chat models. You can find their documentation on using Azure as a provider [here](https://docs.litellm.ai/docs/providers/azure).

-OpenHands uses LiteLLM for completion calls. You can find their documentation on Azure [here](https://docs.litellm.ai/docs/providers/azure)
+## Azure OpenAI Configuration

-### Azure openai configs
-
-When running the OpenHands Docker image, you'll need to set the following environment variables using `-e`:
+When running OpenHands, you'll need to set the following environment variable using `-e` in the
+[docker run command](/modules/usage/installation#start-the-app):

 ```
-LLM_BASE_URL="<azure-api-base-url>"          # e.g. "https://openai-gpt-4-test-v-1.openai.azure.com/"
-LLM_API_KEY="<azure-api-key>"
-LLM_MODEL="azure/<your-gpt-deployment-name>"
-LLM_API_VERSION="<api-version>"          # e.g. "2024-02-15-preview"
+LLM_API_VERSION="<api-version>"              # e.g. "2023-05-15"
 ```

 Example:
 ```bash
-docker run -it \
--pull=always \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_BASE_URL="x.openai.azure.com" \
-e LLM_API_VERSION="2024-02-15-preview" \
-v $WORKSPACE_BASE:/opt/workspace_base \
-v /var/run/docker.sock:/var/run/docker.sock \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
-ghcr.io/all-hands-ai/openhands:main
+docker run -it --pull=always \
+    -e LLM_API_VERSION="2023-05-15" \
+    ...
 ```

-You can set the LLM_MODEL and LLM_API_KEY in the OpenHands UI itself.
+Then set the following in the OpenHands UI through the Settings:

 :::note
-You can find your ChatGPT deployment name on the deployments page in Azure. It could be the same with the chat model name (e.g. 'GPT4-1106-preview'), by default or initially set, but it doesn't have to be the same. Run openhands, and when you load it in the browser, go to Settings and set model as above: "azure/&lt;your-actual-gpt-deployment-name&gt;". If it's not in the list, enter your own text and save it.
+You will need your ChatGPT deployment name which can be found on the deployments page in Azure. This is referenced as
+&lt;deployment-name&gt; below.
 :::

+* Enable `Advanced Options`
+* `Custom Model` to azure/&lt;deployment-name&gt;
+* `Base URL` to your Azure API Base URL (e.g. `https://example-endpoint.openai.azure.com`)
+* `API Key` to your Azure API key
+
 ## Embeddings

-OpenHands uses llama-index for embeddings. You can find their documentation on Azure [here](https://docs.llamaindex.ai/en/stable/api_reference/embeddings/azure_openai/)
+OpenHands uses llama-index for embeddings. You can find their documentation on Azure [here](https://docs.llamaindex.ai/en/stable/api_reference/embeddings/azure_openai/).

-### Azure openai configs
+### Azure OpenAI Configuration

-The model used for Azure OpenAI embeddings is "text-embedding-ada-002".
-You need the correct deployment name for this model in your Azure account.
-
-When running OpenHands in Docker, set the following environment variables using `-e`:
+When running OpenHands, set the following environment variables using `-e` in the
+[docker run command](/modules/usage/installation#start-the-app):

 ```
 LLM_EMBEDDING_MODEL="azureopenai"
-LLM_EMBEDDING_DEPLOYMENT_NAME="<your-embedding-deployment-name>"        # e.g. "TextEmbedding...<etc>"
-LLM_API_VERSION="<api-version>"         # e.g. "2024-02-15-preview"
+LLM_EMBEDDING_DEPLOYMENT_NAME="<your-embedding-deployment-name>"   # e.g. "TextEmbedding...<etc>"
+LLM_API_VERSION="<api-version>"                                    # e.g. "2024-02-15-preview"
 ```
@@ -1,28 +1,30 @@
-# Google Gemini/Vertex LLM
+# Google Gemini/Vertex

-## Completion
-
-OpenHands uses LiteLLM for completion calls. The following resources are relevant for using OpenHands with Google's LLMs
+OpenHands uses LiteLLM to make calls to Google's chat models. You can find their documentation on using Google as a provider:

 - [Gemini - Google AI Studio](https://docs.litellm.ai/docs/providers/gemini)
 - [VertexAI - Google Cloud Platform](https://docs.litellm.ai/docs/providers/vertex)

-### Gemini - Google AI Studio Configs
+## Gemini - Google AI Studio Configs

-To use Gemini through Google AI Studio when running the OpenHands Docker image, you'll need to set the following environment variables using `-e`:
+When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
+* `LLM Provider` to `Gemini`
+* `LLM Model` to the model you will be using.
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. gemini/&lt;model-name&gt; like `gemini/gemini-1.5-pro`).
+* `API Key` to your Gemini API key

-```
-GEMINI_API_KEY="<your-google-api-key>"
-LLM_MODEL="gemini/gemini-1.5-pro"
-```
+## VertexAI - Google Cloud Platform Configs

-### Vertex AI - Google Cloud Platform Configs
-
-To use Vertex AI through Google Cloud Platform when running the OpenHands Docker image, you'll need to set the following environment variables using `-e`:
+To use Vertex AI through Google Cloud Platform when running OpenHands, you'll need to set the following environment
+variables using `-e` in the [docker run command](/modules/usage/installation#start-the-app):

 ```
 GOOGLE_APPLICATION_CREDENTIALS="<json-dump-of-gcp-service-account-json>"
 VERTEXAI_PROJECT="<your-gcp-project-id>"
 VERTEXAI_LOCATION="<your-gcp-location>"
-LLM_MODEL="vertex_ai/<desired-llm-model>"
 ```
+
+Then set the following in the OpenHands UI through the Settings:
+* `LLM Provider` to `VertexAI`
+* `LLM Model` to the model you will be using.
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. vertex_ai/&lt;model-name&gt;).
@@ -0,0 +1,23 @@
+# Groq
+
+OpenHands uses LiteLLM to make calls to chat models on Groq. You can find their documentation on using Groq as a provider [here](https://docs.litellm.ai/docs/providers/groq).
+
+## Configuration
+
+When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
+* `LLM Provider` to `Groq`
+* `LLM Model` to the model you will be using. [Visit here to see the list of
+models that Groq hosts](https://console.groq.com/docs/models). If the model is not in the list, toggle
+`Advanced Options`, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`).
+* `API Key` to your Groq API key. To find or create your Groq API Key, [see here](https://console.groq.com/keys).
+
+
+
+## Using Groq as an OpenAI-Compatible Endpoint
+
+The Groq endpoint for chat completion is [mostly OpenAI-compatible](https://console.groq.com/docs/openai). Therefore, you can access Groq models as you
+would access any OpenAI-compatible endpoint. You can set the following in the OpenHands UI through the Settings:
+* Enable `Advanced Options`
+* `Custom Model` to the prefix `openai/` + the model you will be using (e.g. `openai/llama3-70b-8192`)
+* `Base URL` to `https://api.groq.com/openai/v1`
+* `API Key` to your Groq API key
@@ -1,49 +1,88 @@
---
-sidebar_position: 2
---
-
 # 🤖 LLM Backends

-OpenHands can work with any LLM backend.
-For a full list of the LM providers and models available, please consult the
-[litellm documentation](https://docs.litellm.ai/docs/providers).
+OpenHands can connect to any LLM supported by LiteLLM. However, it requires a powerful model to work.
+
+## Model Recommendations
+
+Based on a recent evaluation of language models for coding tasks (using the SWE-bench dataset), we can provide some recommendations for model selection. The full analysis can be found in [this blog article](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed).
+
+When choosing a model, consider both the quality of outputs and the associated costs. Here's a summary of the findings:
+
+- Claude 3.5 Sonnet is the best by a fair amount, achieving a 27% resolve rate with the default agent in OpenHands.
+- GPT-4o lags behind, and o1-mini actually performed somewhat worse than GPT-4o. We went in and analyzed the results a little, and briefly it seemed like o1 was sometimes "overthinking" things, performing extra environment configuration tasks when it could just go ahead and finish the task.
+- Finally, the strongest open models were Llama 3.1 405B and deepseek-v2.5, and they performed reasonably, even besting some of the closed models.
+
+Please refer to the [full article](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) for more details.
+
+Based on these findings and community feedback, the following models have been verified to work reasonably well with OpenHands:
+
+- claude-3-5-sonnet (recommended)
+- gpt-4 / gpt-4o
+- llama-3.1-405b
+- deepseek-v2.5

 :::warning
-OpenHands will issue many prompts to the LLM you configure. Most of these LLMs cost money--be sure to set spending limits and monitor usage.
+OpenHands will issue many prompts to the LLM you configure. Most of these LLMs cost money, so be sure to set spending
+limits and monitor usage.
 :::

-The `LLM_MODEL` environment variable controls which model is used in programmatic interactions.
-But when using the OpenHands UI, you'll need to choose your model in the settings window.
+If you have successfully run OpenHands with specific LLMs not in the list, please add them to the verified list. We
+also encourage you to open a PR to share your setup process to help others using the same provider and LLM!

-The following environment variables might be necessary for some LLMs/providers:
+For a full list of the providers and models available, please consult the
+[litellm documentation](https://docs.litellm.ai/docs/providers).

- `LLM_API_KEY`
- `LLM_BASE_URL`
+:::note
+Most current local and open source models are not as powerful. When using such models, you may see long
+wait times between messages, poor responses, or errors about malformed JSON. OpenHands can only be as powerful as the
+models driving it. However, if you do find ones that work, please add them to the verified list above.
+:::
+
+## LLM Configuration
+
+The following can be set in the OpenHands UI through the Settings:
+
+- `LLM Provider`
+- `LLM Model`
+- `API Key`
+- `Base URL` (through `Advanced Options`)
+
+There are some settings that may be necessary for some LLMs/providers that cannot be set through the UI. Instead, these
+can be set through environment variables passed to the [docker run command](/modules/usage/installation#start-the-app)
+using `-e`:
+
+- `LLM_API_VERSION`
 - `LLM_EMBEDDING_MODEL`
 - `LLM_EMBEDDING_DEPLOYMENT_NAME`
- `LLM_API_VERSION`
 - `LLM_DROP_PARAMS`
+- `LLM_DISABLE_VISION`
+- `LLM_CACHING_PROMPT`

 We have a few guides for running OpenHands with specific model providers:

- [OpenAI](llms/openai-llms)
- [ollama](llms/local-llms)
 - [Azure](llms/azure-llms)
 - [Google](llms/google-llms)
+- [Groq](llms/groq)
+- [OpenAI](llms/openai-llms)
+- [OpenRouter](llms/openrouter)

-If you're using another provider, we encourage you to open a PR to share your setup!
+### API retries and rate limits

-## Note on Alternative Models
+LLM providers typically have rate limits, sometimes very low, and may require retries. OpenHands will automatically retry requests if it receives a Rate Limit Error (429 error code), API connection error, or other transient errors.

-The best models are GPT-4 and Claude 3. Current local and open source models are
-not nearly as powerful. When using an alternative model,
-you may see long wait times between messages,
-poor responses, or errors about malformed JSON. OpenHands
-can only be as powerful as the models driving it--fortunately folks on our team
-are actively working on building better open source models!
+You can customize these options as you need for the provider you're using. Check their documentation, and set the following environment variables to control the number of retries and the time between retries:

-## API retries and rate limits
+- `LLM_NUM_RETRIES` (Default of 8)
+- `LLM_RETRY_MIN_WAIT` (Default of 15 seconds)
+- `LLM_RETRY_MAX_WAIT` (Default of 120 seconds)
+- `LLM_RETRY_MULTIPLIER` (Default of 2)

-Some LLMs have rate limits and may require retries. OpenHands will automatically retry requests if it receives a 429 error or API connection error.
-You can set `LLM_NUM_RETRIES`, `LLM_RETRY_MIN_WAIT`, `LLM_RETRY_MAX_WAIT` environment variables to control the number of retries and the time between retries.
-By default, `LLM_NUM_RETRIES` is 5 and `LLM_RETRY_MIN_WAIT`, `LLM_RETRY_MAX_WAIT` are 3 seconds and 60 seconds respectively.
+If you are running OpenHands in development mode, you can also set these options in the `config.toml` file:
+
+```toml
+[llm]
+num_retries = 8
+retry_min_wait = 15
+retry_max_wait = 120
+retry_multiplier = 2
+```
@@ -1,7 +1,11 @@
 # Local LLM with Ollama

+:::warning
+When using a Local LLM, OpenHands may have limited functionality.
+:::
+
 Ensure that you have the Ollama server up and running.
-For detailed startup instructions, refer to [here](https://github.com/ollama/ollama)
+For detailed startup instructions, refer to [here](https://github.com/ollama/ollama).

 This guide assumes you've started ollama with `ollama serve`. If you're running ollama differently (e.g. inside docker), the instructions might need to be modified. Please note that if you're running WSL the default ollama configuration blocks requests from docker containers. See [here](#configuring-ollama-service-wsl-en).

@@ -24,17 +28,14 @@ mistral:7b-instruct-v0.2-q4_K_M eb14864c7427    4.4 GB  2 weeks ago
 starcoder2:latest               f67ae0f64584    1.7 GB  19 hours ago
 ```

-## Start OpenHands
+## Run OpenHands with Docker

-### Docker
-
-Use the instructions [here](../intro) to start OpenHands using Docker.
+### Start OpenHands
+Use the instructions [here](../getting-started) to start OpenHands using Docker.
 But when running `docker run`, you'll need to add a few more arguments:

 ```bash
 --add-host host.docker.internal:host-gateway \
-e LLM_API_KEY="ollama" \
-e LLM_BASE_URL="http://host.docker.internal:11434" \
 -e LLM_OLLAMA_BASE_URL="http://host.docker.internal:11434" \
 ```

@@ -51,8 +52,6 @@ docker run \
    --pull=always \
    --add-host host.docker.internal:host-gateway \
    -e SANDBOX_USER_ID=$(id -u) \
-    -e LLM_API_KEY="ollama" \
-    -e LLM_BASE_URL="http://host.docker.internal:11434" \
    -e LLM_OLLAMA_BASE_URL="http://host.docker.internal:11434" \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
@@ -63,6 +62,16 @@ docker run \

 You should now be able to connect to `http://localhost:3000/`

+### Configure the Web Application
+
+When running `openhands`, you'll need to set the following in the OpenHands UI through the Settings:
+- the model to "ollama/&lt;model-name&gt;"
+- the base url to `http://host.docker.internal:11434`
+- the API key is optional; you can use any string, such as `ollama`.
+
+
+## Run OpenHands in Development Mode
+
 ### Build from Source

 Use the instructions in [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to build OpenHands.
@@ -73,23 +82,22 @@ Make sure `config.toml` is there by running `make setup-config` which will creat
 workspace_base="./workspace"

 [llm]
-model="ollama/codellama:7b"
-api_key="ollama"
 embedding_model="local"
-base_url="http://localhost:11434"
 ollama_base_url="http://localhost:11434"

 ```

-Replace `LLM_MODEL` of your choice if you need to.
+Done! Now you can start OpenHands by: `make run`. You now should be able to connect to `http://localhost:3000/`

-Done! Now you can start OpenHands by: `make run` without Docker. You now should be able to connect to `http://localhost:3000/`
-
-## Select your Model
+### Configure the Web Application

 In the OpenHands UI, click on the Settings wheel in the bottom-left corner.
 Then in the `Model` input, enter `ollama/codellama:7b`, or the name of the model you pulled earlier.
-If it doesn’t show up in a dropdown, that’s fine, just type it in. Click Save when you’re done.
+If it doesn’t show up in the dropdown, enable `Advanced Options` and type it in. Please note: you need the model name as listed by `ollama list`, with the prefix `ollama/`.
+
+In the API Key field, enter `ollama` or any value, since you don't need a particular key.
+
+In the Base URL field, enter `http://localhost:11434`.

 And now you're ready to go!

@@ -196,9 +204,9 @@ base_url="http://localhost:1234/v1"
 custom_llm_provider="openai"
 ```

-Done! Now you can start Devin by: `make run` without Docker. You now should be able to connect to `http://localhost:3000/`
+Done! Now you can start OpenHands by: `make run` without Docker. You now should be able to connect to `http://localhost:3000/`

-# Note:
+# Note

 For WSL, run the following commands in cmd to set up the networking mode to mirrored:

@@ -1,75 +1,24 @@
 # OpenAI

-OpenHands uses [LiteLLM](https://www.litellm.ai/) to make calls to OpenAI's chat models. You can find their full documentation on OpenAI chat calls [here](https://docs.litellm.ai/docs/providers/openai).
+OpenHands uses LiteLLM to make calls to OpenAI's chat models. You can find their documentation on using OpenAI as a provider [here](https://docs.litellm.ai/docs/providers/openai).

 ## Configuration

-### Manual Configuration
-
-When running the OpenHands Docker image, you'll need to set the following environment variables:
-
-```sh
-LLM_MODEL="openai/<gpt-model-name>" # e.g. "openai/gpt-4o"
-LLM_API_KEY="<your-openai-project-api-key>"
-```
-
-To see a full list of OpenAI models that LiteLLM supports, please visit https://docs.litellm.ai/docs/providers/openai#openai-chat-completion-models.
-
-To find or create your OpenAI Project API Key, please visit https://platform.openai.com/api-keys.
-
-**Example**:
-
-```sh
-export WORKSPACE_BASE=$(pwd)/workspace
-
-docker run -it \
-    --pull=always \
-    -e SANDBOX_USER_ID=$(id -u) \
-    -e LLM_MODEL="openai/<gpt-model-name>" \
-    -e LLM_API_KEY="<your-openai-project-api-key>" \
-    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-    -v $WORKSPACE_BASE:/opt/workspace_base \
-    -v /var/run/docker.sock:/var/run/docker.sock \
-    -p 3000:3000 \
-    --add-host host.docker.internal:host-gateway \
-    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/opendevin/opendevin:0.8
-```
-
-### UI Configuration
-
-You can also directly set the `LLM_MODEL` and `LLM_API_KEY` in the OpenHands client itself. Follow this guide to get up and running with the OpenHands client.
-
-From there, you can set your model and API key in the settings window.
+When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
+* `LLM Provider` to `OpenAI`
+* `LLM Model` to the model you will be using.
+[Visit here to see a full list of OpenAI models that LiteLLM supports.](https://docs.litellm.ai/docs/providers/openai#openai-chat-completion-models)
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. openai/&lt;model-name&gt; like `openai/gpt-4o`).
+* `API Key` to your OpenAI API key. To find or create your OpenAI Project API Key, [see here](https://platform.openai.com/api-keys).

 ## Using OpenAI-Compatible Endpoints

 Just as for OpenAI Chat completions, we use LiteLLM for OpenAI-compatible endpoints. You can find their full documentation on this topic [here](https://docs.litellm.ai/docs/providers/openai_compatible).

-When running the OpenHands Docker image, you'll need to set the following environment variables:
+## Using an OpenAI Proxy

-```sh
-LLM_BASE_URL="<api-base-url>" # e.g. "http://0.0.0.0:3000"
-LLM_MODEL="openai/<model-name>" # e.g. "openai/mistral"
-LLM_API_KEY="<your-api-key>"
-```
-
-**Example**:
-
-```sh
-export WORKSPACE_BASE=$(pwd)/workspace
-
-docker run -it \
-    --pull=always \
-    -e SANDBOX_USER_ID=$(id -u) \
-    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-    -e LLM_BASE_URL="<api-base-url>" \
-    -e LLM_MODEL="openai/<model-name>" \
-    -e LLM_API_KEY="<your-api-key>" \
-    -v $WORKSPACE_BASE:/opt/workspace_base \
-    -v /var/run/docker.sock:/var/run/docker.sock \
-    -p 3000:3000 \
-    --add-host host.docker.internal:host-gateway \
-    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/opendevin/opendevin:0.8
-```
+If you're using an OpenAI proxy, you'll need to set the following in the OpenHands UI through the Settings:
+* Enable `Advanced Options`
+* `Custom Model` to openai/&lt;model-name&gt; (e.g. `openai/gpt-4o` or openai/&lt;proxy-prefix&gt;/&lt;model-name&gt;)
+* `Base URL` to the URL of your OpenAI proxy
+* `API Key` to your OpenAI API key
@@ -0,0 +1,12 @@
+# OpenRouter
+
+OpenHands uses LiteLLM to make calls to chat models on OpenRouter. You can find their documentation on using OpenRouter as a provider [here](https://docs.litellm.ai/docs/providers/openrouter).
+
+## Configuration
+
+When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
+* `LLM Provider` to `OpenRouter`
+* `LLM Model` to the model you will be using.
+[Visit here to see a full list of OpenRouter models](https://openrouter.ai/models).
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. openrouter/&lt;model-name&gt; like `openrouter/anthropic/claude-3.5-sonnet`).
+* `API Key` to your OpenRouter API key.
@@ -0,0 +1,41 @@
+# Prompting Best Practices
+
+When working with the OpenHands AI software developer, it's crucial to provide clear and effective prompts. This guide outlines best practices for creating prompts that will yield the most accurate and useful responses.
+
+## Characteristics of Good Prompts
+
+Good prompts are:
+
+1. **Concrete**: They explain exactly what functionality should be added or what error needs to be fixed.
+2. **Location-specific**: If known, they explain the locations in the code base that should be modified.
+3. **Appropriately scoped**: They should be the size of a single feature, typically not exceeding 100 lines of code.
+
+## Examples
+
+### Good Prompt Examples
+
+1. "Add a function `calculate_average` in `utils/math_operations.py` that takes a list of numbers as input and returns their average."
+
+2. "Fix the TypeError in `frontend/src/components/UserProfile.tsx` occurring on line 42. The error suggests we're trying to access a property of undefined."
+
+3. "Implement input validation for the email field in the registration form. Update `frontend/src/components/RegistrationForm.tsx` to check if the email is in a valid format before submission."
+
+### Bad Prompt Examples
+
+1. "Make the code better." (Too vague, not concrete)
+
+2. "Rewrite the entire backend to use a different framework." (Not appropriately scoped)
+
+3. "There's a bug somewhere in the user authentication. Can you find and fix it?" (Lacks specificity and location information)
+
+## Tips for Effective Prompting
+
+1. Be as specific as possible about the desired outcome or the problem to be solved.
+2. Provide context, including relevant file paths and line numbers if available.
+3. Break down large tasks into smaller, manageable prompts.
+4. Include any relevant error messages or logs.
+5. Specify the programming language or framework if it's not obvious from the context.
+
+Remember, the more precise and informative your prompt is, the better the AI can assist you in developing or modifying the OpenHands software.
+
+See [Getting Started with OpenHands](./getting-started) for more examples of helpful prompts.
@@ -1,28 +1,18 @@
---
-sidebar_position: 4
---
-
 # 🚧 Troubleshooting

 There are some error messages that frequently get reported by users.
-
-We'll try to make the install process easier and these error messages
-better in the future. But for now, you can look for your error message below and see if there are any workarounds.
-
-For each of these error messages **there is an existing issue**. Please do not
-open a new issue--just comment there.
-
-If you find more information or a workaround for one of these issues, please
-open a *PR* to add details to this file.
+We'll try to make the install process easier, but for now you can look for your error message below and see if there are any workarounds.
+If you find more information or a workaround for one of these issues, please open a *PR* to add details to this file.

 :::tip
-If you're running on Windows and having trouble, check out our [guide for Windows (WSL) users](troubleshooting/windows).
+OpenHands only supports Windows via [WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
+Please be sure to run all commands inside your WSL terminal.
+Check out [Notes for WSL on Windows Users](troubleshooting/windows) for some troubleshooting guides.
 :::

 ## Common Issues

 * [Unable to connect to Docker](#unable-to-connect-to-docker)
-* [Unable to connect to SSH box](#unable-to-connect-to-ssh-box)
 * [404 Resource not found](#404-resource-not-found)
 * [`make build` getting stuck on package installations](#make-build-getting-stuck-on-package-installations)
 * [Sessions are not restored](#sessions-are-not-restored)
@@ -52,58 +42,6 @@ OpenHands uses a Docker container to do its work safely, without potentially bre
 * If you are on a Mac, check the [permissions requirements](https://docs.docker.com/desktop/mac/permission-requirements/) and in particular consider enabling the `Allow the default Docker socket to be used` under `Settings > Advanced` in Docker Desktop.
 * In addition, upgrade your Docker to the latest version under `Check for Updates`

---
-### Unable to connect to SSH box
-
-[GitHub Issue](https://github.com/All-Hands-AI/OpenHands/issues/1156)
-
-**Symptoms**
-
-```python
-self.shell = DockerSSHBox(
-...
-pexpect.pxssh.ExceptionPxssh: Could not establish connection to host
-```
-
-**Details**
-
-By default, OpenHands connects to a running container using SSH. On some machines,
-especially Windows, this seems to fail.
-
-**Workarounds**
-
-* Restart your computer (sometimes it does work)
-* Be sure to have the latest versions of WSL and Docker
-* Check that your distribution in WSL is up to date as well
-* Try [this reinstallation guide](https://github.com/All-Hands-AI/OpenHands/issues/1156#issuecomment-2064549427)
-
---
-### Unable to connect to LLM
-
-[GitHub Issue](https://github.com/All-Hands-AI/OpenHands/issues/1208)
-
-**Symptoms**
-
-```python
-  File "/app/.venv/lib/python3.12/site-packages/openai/_exceptions.py", line 81, in __init__
-    super().__init__(message, response.request, body=body)
-                              ^^^^^^^^^^^^^^^^
-AttributeError: 'NoneType' object has no attribute 'request'
-```
-
-**Details**
-
-[GitHub Issues](https://github.com/All-Hands-AI/OpenHands/issues?q=is%3Aissue+is%3Aopen+404)
-
-This usually happens with *local* LLM setups, when OpenHands can't connect to the LLM server.
-See our guide for [local LLMs](llms/local-llms) for more information.
-
-**Workarounds**
-
-* Check your `base_url` in your config.toml (if it exists) under the "llm" section
-* Check that ollama (or whatever LLM you're using) is running OK
-* Make sure you're using `--add-host host.docker.internal:host-gateway` when running in Docker
-
 ---
 ### `404 Resource not found`

@@ -141,11 +79,10 @@ the API endpoint you're trying to connect to. Most often this happens for Azure
 **Workarounds**

 * Check that you've set `LLM_BASE_URL` properly
-* Check that model is set properly, based on the [LiteLLM docs](https://docs.litellm.ai/docs/providers)
+* Check that the model is set properly, based on the [LiteLLM docs](https://docs.litellm.ai/docs/providers)
  * If you're running inside the UI, be sure to set the `model` in the settings modal
  * If you're running headless (via main.py) be sure to set `LLM_MODEL` in your env/config
 * Make sure you've followed any special instructions for your LLM provider
-  * [ollama](/modules/usage/llms/local-llms)
  * [Azure](/modules/usage/llms/azure-llms)
  * [Google](/modules/usage/llms/google-llms)
 * Make sure your API key is correct
@@ -1,4 +1,4 @@
-# Notes for Windows and WSL Users
+# Notes for WSL on Windows Users

 OpenHands only supports Windows via [WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
 Please be sure to run all commands inside your WSL terminal.
@@ -8,12 +8,11 @@ Please be sure to run all commands inside your WSL terminal.
 ### Recommendation: Do not run as root user

 For security reasons, it is highly recommended to not run OpenHands as the root user, but a user with a non-zero UID.
-In addition, persistent sandboxes won't be supported when running as root and during start of OpenHands an appropriate message may appear.

 References:

 * [Why it is bad to login as root](https://askubuntu.com/questions/16178/why-is-it-bad-to-log-in-as-root)
-* [Set default user in WSL](https://www.tenforums.com/tutorials/128152-set-default-user-windows-subsystem-linux-distro-windows-10-a.html#option2)  
+* [Set default user in WSL](https://www.tenforums.com/tutorials/128152-set-default-user-windows-subsystem-linux-distro-windows-10-a.html#option2)
 Hint about the 2nd reference: for Ubuntu users, the command could actually be "ubuntupreview" instead of "ubuntu".

 ---
@@ -22,21 +21,6 @@ Hint about the 2nd reference: for Ubuntu users, the command could actually be "u
 If you are using Docker Desktop, make sure to start it before calling any docker command from inside WSL.
 Docker also needs to have the WSL integration option activated.

---
-### Failed to create openhands user
-
-If you encounter the following error during setup:
-
-```sh
-Exception: Failed to create openhands user in sandbox: 'useradd: UID 0 is not unique'
- ```
-
-You can resolve it by running:
-
-```sh
-export SANDBOX_USER_ID=1000
-```
-
 ---
 ### Poetry Installation

@@ -76,5 +60,5 @@ localhostForwarding=true

 * Save the `.wslconfig` file.
 * Restart WSL2 completely by exiting any running WSL2 instances and executing the command `wsl --shutdown` in your command prompt or terminal.
-* After restarting WSL, attempt to execute `make run` again.  
-The networking issue should be resolved.
+* After restarting WSL, attempt to execute `make run` again.
+The networking issue should be resolved.
@@ -1,7 +1,3 @@
---
-sidebar_position: 8
---
-
 # ⬆️ Upgrade Guide

 ## 0.8.0 (2024-07-13)
@@ -12,9 +12,9 @@
        "@docusaurus/plugin-content-pages": "^3.5.2",
        "@docusaurus/preset-classic": "^3.5.2",
        "@docusaurus/theme-mermaid": "^3.5.2",
-        "@mdx-js/react": "^3.0.0",
+        "@mdx-js/react": "^3.1.0",
        "clsx": "^2.0.0",
-        "prism-react-renderer": "^2.3.0",
+        "prism-react-renderer": "^2.4.0",
        "react": "^18.3.1",
        "react-dom": "^18.3.1",
        "react-icons": "^5.3.0",
@@ -24,7 +24,7 @@
        "@docusaurus/module-type-aliases": "^3.5.1",
        "@docusaurus/tsconfig": "^3.5.2",
        "@docusaurus/types": "^3.5.1",
-        "typescript": "~5.5.4"
+        "typescript": "~5.6.3"
      },
      "engines": {
        "node": ">=18.0"
@@ -2883,9 +2883,9 @@
      }
    },
    "node_modules/@mdx-js/react": {
-      "version": "3.0.1",
-      "resolved": "https://registry.npmjs.org/@mdx-js/react/-/react-3.0.1.tgz",
-      "integrity": "sha512-9ZrPIU4MGf6et1m1ov3zKf+q9+deetI51zprKB1D/z3NOb+rUxxtEl3mCjW5wTGh6VhRdwPueh1oRzi6ezkA8A==",
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/@mdx-js/react/-/react-3.1.0.tgz",
+      "integrity": "sha512-QjHtSaoameoalGnKDT3FoIl4+9RwyTmo9ZJGBdLOks/YOiWHoRDI3PUwEzOE7kEmGcV3AFcp9K6dYu9rEuKLAQ==",
      "dependencies": {
        "@types/mdx": "^2.0.0"
      },
@@ -12640,9 +12640,9 @@
      }
    },
    "node_modules/prism-react-renderer": {
-      "version": "2.3.1",
-      "resolved": "https://registry.npmjs.org/prism-react-renderer/-/prism-react-renderer-2.3.1.tgz",
-      "integrity": "sha512-Rdf+HzBLR7KYjzpJ1rSoxT9ioO85nZngQEoFIhL07XhtJHlCU3SOz0GJ6+qvMyQe0Se+BV3qpe6Yd/NmQF5Juw==",
+      "version": "2.4.0",
+      "resolved": "https://registry.npmjs.org/prism-react-renderer/-/prism-react-renderer-2.4.0.tgz",
+      "integrity": "sha512-327BsVCD/unU4CNLZTWVHyUHKnsqcvj2qbPlQ8MiBE2eq2rgctjigPA1Gp9HLF83kZ20zNN6jgizHJeEsyFYOw==",
      "dependencies": {
        "@types/prismjs": "^1.26.0",
        "clsx": "^2.0.0"
@@ -14853,9 +14853,9 @@
      }
    },
    "node_modules/typescript": {
-      "version": "5.5.4",
-      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.4.tgz",
-      "integrity": "sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==",
+      "version": "5.6.3",
+      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.3.tgz",
+      "integrity": "sha512-hjcS1mhfuyi4WW8IWtjP7brDrG2cuDZukyrYrSauoXGNgx0S7zceP07adYkJycEr56BOUTNPzbInooiN3fn1qw==",
      "bin": {
        "tsc": "bin/tsc",
        "tsserver": "bin/tsserver"
@@ -19,9 +19,9 @@
    "@docusaurus/plugin-content-pages": "^3.5.2",
    "@docusaurus/preset-classic": "^3.5.2",
    "@docusaurus/theme-mermaid": "^3.5.2",
-    "@mdx-js/react": "^3.0.0",
+    "@mdx-js/react": "^3.1.0",
    "clsx": "^2.0.0",
-    "prism-react-renderer": "^2.3.0",
+    "prism-react-renderer": "^2.4.0",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",
    "react-icons": "^5.3.0",
@@ -31,7 +31,7 @@
    "@docusaurus/module-type-aliases": "^3.5.1",
    "@docusaurus/tsconfig": "^3.5.2",
    "@docusaurus/types": "^3.5.1",
-    "typescript": "~5.5.4"
+    "typescript": "~5.6.3"
  },
  "browserslist": {
    "production": [
@@ -1,8 +1,155 @@
 import type { SidebarsConfig } from "@docusaurus/plugin-content-docs";

 const sidebars: SidebarsConfig = {
-  docsSidebar: [{ type: "autogenerated", dirName: "usage" }],
  apiSidebar: [require("./modules/python/sidebar.json")],
+  docsSidebar: [
+    {
+      type: 'doc',
+      label: 'Installation',
+      id: 'usage/installation',
+    },
+    {
+      type: 'doc',
+      label: 'Getting Started',
+      id: 'usage/getting-started',
+    },
+    {
+      type: 'doc',
+      label: 'Prompting Best Practices',
+      id: 'usage/prompting-best-practices',
+    },
+    {
+      type: 'category',
+      label: 'Usage Methods',
+      items: [
+        {
+          type: 'doc',
+          label: 'GUI Mode',
+          id: 'usage/how-to/gui-mode',
+        },
+        {
+          type: 'doc',
+          label: 'CLI Mode',
+          id: 'usage/how-to/cli-mode',
+        },
+        {
+          type: 'doc',
+          label: 'Headless Mode',
+          id: 'usage/how-to/headless-mode',
+        },
+        {
+          type: 'doc',
+          label: 'Github Actions',
+          id: 'usage/how-to/github-action',
+        },
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Advanced Configuration',
+      items: [
+        {
+          type: 'category',
+          label: 'LLM Configuration',
+          items: [
+            {
+              type: 'doc',
+              label: 'Overview',
+              id: 'usage/llms/llms',
+            },
+            {
+              type: 'category',
+              label: 'Providers',
+              items: [
+                {
+                  type: 'doc',
+                  label: 'Azure',
+                  id: 'usage/llms/azure-llms',
+                },
+                {
+                  type: 'doc',
+                  label: 'Google',
+                  id: 'usage/llms/google-llms',
+                },
+                {
+                  type: 'doc',
+                  label: 'Groq',
+                  id: 'usage/llms/groq',
+                },
+                {
+                  type: 'doc',
+                  label: 'OpenAI',
+                  id: 'usage/llms/openai-llms',
+                },
+                {
+                  type: 'doc',
+                  label: 'OpenRouter',
+                  id: 'usage/llms/openrouter',
+                },
+              ],
+            },
+          ],
+        },
+        {
+          type: 'doc',
+          label: 'Custom Sandbox',
+          id: 'usage/how-to/custom-sandbox-guide',
+        },
+      ],
+    },
+    {
+      type: 'doc',
+      label: 'Troubleshooting',
+      id: 'usage/troubleshooting/troubleshooting',
+    },
+    {
+      type: 'doc',
+      label: 'Feedback',
+      id: 'usage/feedback',
+    },
+    {
+      type: 'category',
+      label: 'For OpenHands Developers',
+      items: [
+        {
+          type: 'category',
+          label: 'Architecture',
+          items: [
+            {
+              type: 'doc',
+              label: 'Backend',
+              id: 'usage/architecture/backend',
+            },
+            {
+              type: 'doc',
+              label: 'Runtime',
+              id: 'usage/architecture/runtime',
+            },
+          ],
+        },
+        {
+          type: 'doc',
+          label: 'Debugging',
+          id: 'usage/how-to/debugging',
+        },
+        {
+          type: 'doc',
+          label: 'Evaluation',
+          id: 'usage/how-to/evaluation-harness',
+        },
+        {
+          type: 'doc',
+          label: 'Kubernetes Deployment',
+          id: 'usage/how-to/openshift-example',
+        },
+      ],
+    },
+    {
+      type: 'doc',
+      label: 'About',
+      id: 'usage/about',
+    }
+  ],
 };

 export default sidebars;
@@ -7,17 +7,6 @@ function CustomFooter() {
  return (
    <footer className="custom-footer">
      <div className="footer-content">
-        <div className="footer-top">
-          <div className="footer-title">
-            <Translate id="footer.title">OpenHands</Translate>
-          </div>
-          <div className="footer-link">
-            <a href="/modules/usage/intro">
-              <Translate id="footer.docs">Docs</Translate>
-            </a>
-          </div>
-        </div>
-
        <div className="footer-icons">
          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA" target="_blank" rel="noopener noreferrer">
            <FaSlack />
@@ -32,7 +21,7 @@ function CustomFooter() {
        <div className="footer-bottom">
          <p>
            <Translate id="footer.copyright" values={{ year: new Date().getFullYear() }}>
-              {'Copyright © {year} OpenHands'}
+              {'Copyright © {year} All Hands AI, Inc'}
            </Translate>
          </p>
        </div>
@@ -17,23 +17,19 @@ export function HomepageHeader() {

        <p className="header-subtitle">{siteConfig.tagline}</p>

-        <div className="header-links">
-          <a href="https://github.com/All-Hands-AI/OpenHands">
-            <img src="https://img.shields.io/badge/Code-Github-purple?logo=github&logoColor=white&style=for-the-badge" alt="Code" />
-          </a>
-          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA">
-            <img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" />
-          </a>
-          <a href="https://discord.gg/ESHStjSjD4">
-            <img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community" />
-          </a>
-
-          <a href="https://arxiv.org/abs/2407.16741">
-            <img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv" />
-          </a>
-          <a href="https://huggingface.co/spaces/OpenHands/evaluation">
-            <img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark" />
-          </a>
+        <div align="center" className="header-links">
+          <a href="https://github.com/All-Hands-AI/OpenHands/graphs/contributors"><img src="https://img.shields.io/github/contributors/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Contributors" /></a>
+          <a href="https://github.com/All-Hands-AI/OpenHands/stargazers"><img src="https://img.shields.io/github/stars/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="Stargazers" /></a>
+          <a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" /></a>
+          <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License" /></a>
+          <br/>
+          <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
+          <a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community" /></a>
+          <a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits" /></a>
+          <br/>
+          <a href="https://docs.all-hands.dev/modules/usage/getting-started"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation" /></a>
+          <a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper%20on%20Arxiv-000?logoColor=FFE165&logo=arxiv&style=for-the-badge" alt="Paper on Arxiv" /></a>
+          <a href="https://huggingface.co/spaces/OpenHands/evaluation"><img src="https://img.shields.io/badge/Benchmark%20score-000?logoColor=FFE165&logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark Score" /></a>
        </div>

        <Demo />
@@ -7,8 +7,10 @@
 /* You can override the default Infima variables here. */

 :root {
-  --ifm-color-primary: #4465db;
  --ifm-code-font-size: 95%;
+  --ifm-color-primary: #000;
+  --ifm-background-color: #F1EAE0;
+  --ifm-navbar-background-color: #F1EAE0;
  --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1);
  --secondary: #171717;
  --secondary-dark: #0a0a0a;
@@ -17,21 +19,15 @@

 /* For readability concerns, you should choose a lighter palette in dark mode. */
 [data-theme="dark"] {
-  --ifm-color-primary: #4465db;
+  --ifm-color-primary: #FFF;
+  --ifm-background-color: #000;
+  --ifm-navbar-background-color: #000;
  --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3);
  --secondary: #737373;
  --secondary-dark: #171717;
-  --secondary-light: #d4d4d4;
+  --secondary-light: #ccc;
 }

-.footer--dark {
-  background-image: linear-gradient(
-    140deg,
-    var(--secondary) 20%,
-    var(--secondary-light) 100%
-  );
-}
-
-.a {
+article a, .a {
  text-decoration: underline;
-}
+}
@@ -2,13 +2,19 @@

 .custom-footer {
    background-color: dark;
-    color: white;
-    height: 200px;
+    color: #000;
+    height: 100px;
    /* background: linear-gradient(to bottom, #1a1a1a, #1a1a1a); */
-    background: linear-gradient(to bottom, #1f2937, #000000);
+    background-color: #F1EAE0;

  }

+[data-theme="dark"] .custom-footer {
+  background-color: #000;
+  color: #fff;
+}
+
+
  .footer-content {
    display: flex;
    flex-direction: column;
@@ -47,7 +53,6 @@
  }

  .footer-community {
-    text-transform: uppercase;
    font-weight: 300;
  }

@@ -65,7 +70,3 @@
  .footer-icons a:hover {
    color: white;
  }
-
-  .footer-bottom {
-    text-transform: uppercase;
-  }
@@ -2,8 +2,6 @@

 .homepage-header {
  height: 800px;
-  color: white;
-  background: linear-gradient(to top, #64748b, #000000);
 }

 .header-content {
@@ -20,8 +20,7 @@ export default function Home(): JSX.Element {
      title={`${siteConfig.title}`}
      description={translate({
        id: 'homepage.description',
-        message: 'An Open Platform for AI Software Developers as Generalist Agents',
-        description: 'The homepage description',
+        message: 'Code Less, Make More',
      })}
    >
    <HomepageHeader />
@@ -22,6 +22,8 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction
+from openhands.utils.async_utils import call_async_from_sync

 game = None

@@ -62,7 +64,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=False,
            use_host_network=False,
        ),
@@ -74,7 +76,7 @@ def get_config(
    return config


-async def process_instance(
+def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
@@ -117,13 +119,18 @@ async def process_instance(
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    runtime = await create_runtime(config, sid=instance['text'].strip())
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)

-    state: State | None = await run_controller(
-        config=config,
-        task_str=instruction,
-        runtime=runtime,
-        fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
+    state: State | None = asyncio.run(
+        run_controller(
+            config=config,
+            initial_user_action=MessageAction(content=instruction),
+            runtime=runtime,
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                metadata.agent_class
+            ],
+        )
    )
    # ======= Attempt to evaluate the agent's edits =======
    # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
@@ -214,12 +221,10 @@ if __name__ == '__main__':
        eda_dataset.to_pandas(), output_file, args.eval_n_limit
    )

-    asyncio.run(
-        run_evaluation(
-            prepared_dataset,
-            metadata,
-            output_file,
-            args.eval_num_workers,
-            process_instance,
-        )
+    run_evaluation(
+        prepared_dataset,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
    )
@@ -36,7 +36,7 @@ fi

 # IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
 # We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -50,7 +50,6 @@ COMMAND="poetry run python evaluation/EDA/run_infer.py \
  --data-split test \
  --max-iterations 20 \
  --OPENAI_API_KEY $OPENAI_API_KEY \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${DATASET}"

@@ -2,49 +2,24 @@

 This folder contains code and resources to run experiments and evaluations.

-## Logistics
-To better organize the evaluation folder, we should follow the rules below:
-  - Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain
-all the preprocessing/evaluation/analysis scripts.
-  - Raw data and experimental records should not be stored within this repo.
-    - For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
-  - Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.
+## For Benchmark Users

-## Supported Benchmarks
+### Setup

-To learn more about how to integrate your benchmark into OpenHands, check out [tutorial here](https://docs.all-hands.dev/modules/usage/evaluation_harness).
+Before starting evaluation, follow the instructions [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to set up your local development environment and LLM.

-### Software Engineering
+Once you are done with setup, you can follow the benchmark-specific instructions in each subdirectory of the evaluation directory.
+Generally these will involve running `run_infer.py` to perform inference with the agents.

- SWE-Bench: [`evaluation/swe_bench`](./swe_bench)
- HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
- BIRD: [`evaluation/bird`](./bird)
- BioCoder: [`evaluation/ml_bench`](./ml_bench)
- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
- APIBench: [`evaluation/gorilla`](./gorilla/)
- ToolQA: [`evaluation/toolqa`](./toolqa/)
+### Implementing and Evaluating an Agent

-### Web Browsing
+To add an agent to OpenHands, you will need to implement it in the [agenthub directory](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub). There is a README there with more information.

- WebArena: [`evaluation/webarena`](./webarena/)
- MiniWob++: [`evaluation/miniwob`](./miniwob/)
+To evaluate an agent, you can provide the agent's name to the `run_infer.py` program.

-### Misc. Assistance
-
- GAIA: [`evaluation/gaia`](./gaia)
- GPQA: [`evaluation/gpqa`](./gpqa)
- AgentBench: [`evaluation/agent_bench`](./agent_bench)
- MINT: [`evaluation/mint`](./mint)
- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
- ProofWriter: [`evaluation/logic_reasoning`](./logic_reasoning)
-
-
-## Before everything begins: Setup Environment and LLM Configuration
-
-Please follow instruction [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to setup your local development environment and LLM.
-
-OpenHands in development mode uses `config.toml` to keep track of most configurations.
+### Evaluating Different LLMs

+OpenHands in development mode uses `config.toml` to keep track of most configuration.
 Here's an example configuration file you can use to define and use multiple LLMs:

 ```toml
@@ -65,12 +40,48 @@ api_key = "XXX"
 temperature = 0.0
 ```

+## Supported Benchmarks

-### Result Visualization
+The OpenHands evaluation harness supports a wide variety of benchmarks across software engineering, web browsing, and miscellaneous assistance tasks.
+
+### Software Engineering
+
+- SWE-Bench: [`evaluation/swe_bench`](./swe_bench)
+- HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
+- BIRD: [`evaluation/bird`](./bird)
+- BioCoder: [`evaluation/ml_bench`](./ml_bench)
+- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
+- APIBench: [`evaluation/gorilla`](./gorilla/)
+- ToolQA: [`evaluation/toolqa`](./toolqa/)
+- AiderBench: [`evaluation/aider_bench`](./aider_bench/)
+
+### Web Browsing
+
+- WebArena: [`evaluation/webarena`](./webarena/)
+- MiniWob++: [`evaluation/miniwob`](./miniwob/)
+
+### Misc. Assistance
+
+- GAIA: [`evaluation/gaia`](./gaia)
+- GPQA: [`evaluation/gpqa`](./gpqa)
+- AgentBench: [`evaluation/agent_bench`](./agent_bench)
+- MINT: [`evaluation/mint`](./mint)
+- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
+- ProofWriter: [`evaluation/logic_reasoning`](./logic_reasoning)
+
+## Result Visualization

 Check [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization of existing experimental results.

-
-### Upload your results
-
 You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results to our hosted huggingface repo via PR following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
+
+## For Benchmark Developers
+
+To learn more about how to integrate your benchmark into OpenHands, check out the [tutorial here](https://docs.all-hands.dev/modules/usage/how-to/evaluation-harness). Briefly:
+
+- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain
+all the preprocessing/evaluation/analysis scripts.
+- Raw data and experimental records should not be stored within this repo.
+- Model outputs should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
+- Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.
+
@@ -32,7 +32,8 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync


 def get_config(
@@ -44,7 +45,7 @@ def get_config(
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
-            base_container_image='python:3.11-bookworm',
+            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
@@ -56,7 +57,7 @@ def get_config(
    return config


-async def initialize_runtime(
+def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required
 ):
@@ -70,12 +71,12 @@ async def initialize_runtime(
    # Set instance id
    action = CmdRunAction(command='mkdir -p /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    action = CmdRunAction(command='cd /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    init_cmd = instance.init
@@ -85,7 +86,7 @@ async def initialize_runtime(
        with tempfile.TemporaryDirectory() as tmpdir:
            host_script_path = os.path.join(tmpdir, script_name)
            create_sh_file(host_script_path, init_cmd)
-            await runtime.copy_to(
+            runtime.copy_to(
                host_script_path,
                '/workspace',
            )
@@ -93,14 +94,14 @@ async def initialize_runtime(
        logger.info(f'Running init script: {script_name}')
        action = CmdRunAction(command=f'chmod +x ./{script_name} && ./{script_name}')
        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = await runtime.run_action(action)
+        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")


-async def complete_runtime(
+def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
 ) -> dict[str, Any]:
@@ -121,7 +122,7 @@ async def complete_runtime(
        with tempfile.TemporaryDirectory() as tmpdir:
            host_script_path = os.path.join(tmpdir, script_name)
            create_sh_file(host_script_path, get_agent_result_cmd)
-            await runtime.copy_to(
+            runtime.copy_to(
                host_script_path,
                '/workspace',
            )
@@ -132,7 +133,7 @@ async def complete_runtime(
            keep_prompt=False,
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = await runtime.run_action(action)
+        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0
        agent_answer = obs.content
@@ -149,7 +150,7 @@ async def complete_runtime(
            with tempfile.TemporaryDirectory() as tmpdir:
                host_script_path = os.path.join(tmpdir, script_name)
                create_sh_file(host_script_path, get_ground_truth_cmd)
-                await runtime.copy_to(
+                runtime.copy_to(
                    host_script_path,
                    '/workspace',
                )
@@ -160,7 +161,7 @@ async def complete_runtime(
                keep_prompt=False,
            )
            logger.info(action, extra={'msg_type': 'ACTION'})
-            obs = await runtime.run_action(action)
+            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
            final_ans = obs.content

@@ -171,7 +172,7 @@ async def complete_runtime(
    }


-async def process_instance(
+def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
@@ -209,16 +210,19 @@ async def process_instance(
    # create sandbox and run the agent
    # =============================================

-    runtime: Runtime = await create_runtime(config, sid=instance.instance_id)
+    runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)

-    await initialize_runtime(runtime, instance=instance)
+    initialize_runtime(runtime, instance=instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State | None = await run_controller(
-        config=config,
-        task_str=instruction,
-        runtime=runtime,
-        fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
+    state: State | None = asyncio.run(
+        run_controller(
+            config=config,
+            initial_user_action=MessageAction(content=instruction),
+            runtime=runtime,
+            fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
+        )
    )
    if state is None:
        raise ValueError('State should not be None.')
@@ -227,7 +231,7 @@ async def process_instance(
    # result evaluation
    # =============================================

-    return_val = await complete_runtime(runtime, instance)
+    return_val = complete_runtime(runtime, instance)
    agent_answer = return_val['agent_answer']
    final_ans = return_val['final_ans']

@@ -313,8 +317,6 @@ if __name__ == '__main__':
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    instances = prepare_dataset(agent_bench_tests, output_file, args.eval_n_limit)

-    asyncio.run(
-        run_evaluation(
-            instances, metadata, output_file, args.eval_num_workers, process_instance
-        )
+    run_evaluation(
+        instances, metadata, output_file, args.eval_num_workers, process_instance
    )
@@ -30,7 +30,6 @@ COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run pyt
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION"

@@ -16,52 +16,66 @@ development environment and LLM.
 ## Start the evaluation

 ```bash
-./evaluation/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
+./evaluation/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
 ```

-   `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
    your LLM settings, as defined in your `config.toml`.
-   `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version
-    you would like to evaluate. It could also be a release tag like `0.6.2`.
-   `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks,
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version
+    you would like to evaluate. It could also be a release tag like `0.9.0`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks,
    defaulting to `CodeActAgent`.
-   `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit`
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit`
    instances. By default, the script evaluates the entire Exercism test set
    (133 issues). Note: in order to use `eval_limit`, you must also set `agent`.
+- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
+- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the
+    given IDs (comma separated).
+
+There are also the following optional environment variables you can set:
+
+```bash
+export USE_UNIT_TESTS=true # if you want to allow the Agent to verify correctness using unit tests. Defaults to false.
+export SKIP_NUM=12 # skip the first 12 instances from the dataset
+```

 Following is the basic command to start the evaluation.

 You can update the arguments in the script
-`evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`,
-`--eval-num-workers` and so on.
+`evaluation/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`,
+`--eval-num-workers` and so on:

-   `--agent-cls`, the agent to use. For example, `CodeActAgent`.
-   `--llm-config`: the LLM configuration to use. For example,
-    `eval_gpt4_1106_preview`.
-   `--max-iterations`: the number of iterations to run the evaluation. For
-    example, `30`.
-   `--eval-num-workers`: the number of workers to use for evaluation. For
-    example, `5`.
-   `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
+- `--agent-cls`, the agent to use. For example, `CodeActAgent`.
+- `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`.
+- `--max-iterations`: the max allowed number of iterations to run the evaluation. Default: `30`.
+- `--eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
+- `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
+- `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`.

 ```bash
-./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1
+./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10"
 ```

 ## Summarize Results

 ```bash
-poetry run python ./evaluation/agent_bench/scripts/summarise_results.py [path_to_output_jsonl_file]
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
+```
+
+Full example:
+
+```bash
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl
 ```

 This will list the instances that passed and the instances that failed. For each
 instance, the corresponding set of test cases (which can vary for each instance)
 are run on the file edited by the agent. We consider an instance to be passed
 only if ALL test cases are passed. Sometimes even a single failed test case will
-cause the entire instance to be marked as filed.
+cause the entire instance to be marked as failed.

-You can inspect the test_results field in the output json file to know the exact
+You can inspect the `test_results` field in the `output.jsonl` file to find the exact
 outcome of the tests. If there are no syntax or indentation errors, you can
-expect to see something like "..F...EF..", where "." means the test case
-passed, "E" means there was an error while executing the test case and "F"
-means some assertion failed and returned output was not as expected.
+expect to see something like "`..F...EF..`", where "`.`" means the test case
+passed, "`E`" means there was an error while executing the test case and "`F`"
+means some assertion failed and some returned output was not as expected.
@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import os
 import tempfile
 from typing import Any
@@ -24,13 +25,22 @@ from openhands.core.config import (
    AppConfig,
    SandboxConfig,
    get_llm_config_arg,
+    load_from_toml,
    parse_arguments,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+
+# Configure visibility of unit tests to the Agent.
+USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true'
+SKIP_NUM = os.environ.get('SKIP_NUM')
+SKIP_NUM = (
+    int(SKIP_NUM) if SKIP_NUM and SKIP_NUM.isdigit() and int(SKIP_NUM) >= 0 else None
+)


 def get_config(
@@ -39,23 +49,31 @@ def get_config(
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
-        runtime='eventstream',
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
            base_container_image='python:3.11-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
            timeout=100,
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
        ),
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    config.set_llm_config(metadata.llm_config)
+
+    # copy 'draft_editor' config if exists
+    config_copy = copy.deepcopy(config)
+    load_from_toml(config_copy)
+    if 'draft_editor' in config_copy.llms:
+        config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor')
+
    return config


-async def initialize_runtime(
+def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,
 ):
@@ -63,32 +81,40 @@ async def initialize_runtime(

    This function is called before the runtime is used to run the agent.
    """
-    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    logger.info(f"\n{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}\n")
    obs: CmdOutputObservation

    # Set instance id
    action = CmdRunAction(command='mkdir -p /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    action = CmdRunAction(command='cd /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = os.path.join(tmpdir, f'{instance.instance_name}.py')
        with open(file_path, 'w') as f:
            f.write(instance.signature)
-        await runtime.copy_to(
+        runtime.copy_to(
            file_path,
            '/workspace',
        )
-    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+        if USE_UNIT_TESTS:
+            file_path = os.path.join(tmpdir, f'{instance.instance_name}_test.py')
+            with open(file_path, 'w') as f:
+                f.write(instance.test)
+            runtime.copy_to(
+                file_path,
+                '/workspace',
+            )
+    logger.info(f"\n{'-' * 50} END Runtime Initialization Fn {'-' * 50}\n")


-async def complete_runtime(
+def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,
 ) -> dict[str, Any]:
@@ -98,33 +124,36 @@ async def complete_runtime(
    If you need to do something in the sandbox to get the correctness metric after
    the agent has run, modify this function.
    """
-    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
+    logger.info(f"\n{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}\n")
    obs: CmdOutputObservation

+    # Rewriting the test file to ignore any changes Agent may have made.
    script_name = f'{instance.instance_name}_test.py'
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = os.path.join(tmpdir, script_name)
        with open(file_path, 'w') as f:
            f.write(instance.test)
-        await runtime.copy_to(
+        runtime.copy_to(
            file_path,
            '/workspace',
        )
        logger.info(f'Running test file: {script_name}')

    action = CmdRunAction(
-        command=f'python -m unittest {script_name}',
+        command=f'python3 -m unittest {script_name}',
        keep_prompt=False,
    )
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    exit_code = 1
    if isinstance(obs, CmdOutputObservation):
        exit_code = obs.exit_code

-    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
+    logger.info(f"\n{'-' * 50} END Runtime Completion Fn {'-' * 50}\n")
+
+    runtime.close()

    return {
        'test_output': obs.content,
@@ -132,7 +161,7 @@ async def complete_runtime(
    }


-async def process_instance(
+def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
@@ -144,7 +173,9 @@ async def process_instance(
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir)
    else:
-        logger.info(f'Starting evaluation for instance {str(instance.instance_id)}.')
+        logger.info(
+            f'\nStarting evaluation for instance {str(instance.instance_id)}.\n'
+        )

    # =============================================
    # build instruction
@@ -156,6 +187,15 @@ async def process_instance(
    instruction += INSTRUCTIONS_ADDENDUM.format(
        signature_file=f'{instance.instance_name}.py',
    )
+    if USE_UNIT_TESTS:
+        logger.info(
+            f'\nInstruction to run test_file: {instance.instance_name}_test.py\n'
+        )
+        instruction += (
+            f'Use `python -m unittest {instance.instance_name}_test.py` to run the test_file '
+            'and verify the correctness of your solution. DO NOT EDIT the test file.\n\n'
+        )
+
    instruction += (
        'IMPORTANT: You should ONLY interact with the environment provided '
        'to you AND NEVER ASK FOR HUMAN HELP.\n'
@@ -167,16 +207,19 @@ async def process_instance(
    # create sandbox and run the agent
    # =============================================

-    runtime: Runtime = await create_runtime(config, sid=str(instance.instance_id))
+    runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)

-    await initialize_runtime(runtime, instance=instance)
+    initialize_runtime(runtime, instance=instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State | None = await run_controller(
-        config=config,
-        task_str=instruction,
-        runtime=runtime,
-        fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
+    state: State | None = asyncio.run(
+        run_controller(
+            config=config,
+            initial_user_action=MessageAction(content=instruction),
+            runtime=runtime,
+            fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
+        )
    )
    if state is None:
        raise ValueError('State should not be None.')
@@ -185,7 +228,7 @@ async def process_instance(
    # # result evaluation
    # # =============================================

-    return_val = await complete_runtime(runtime, instance)
+    return_val = complete_runtime(runtime, instance)
    exit_code = return_val['exit_code']
    test_output = return_val['test_output']

@@ -245,14 +288,25 @@ if __name__ == '__main__':
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    instances = prepare_dataset(aider_bench_tests, output_file, args.eval_n_limit)

-    asyncio.run(
-        run_evaluation(
-            instances,
-            metadata,
-            output_file,
-            args.eval_num_workers,
-            process_instance,
-        )
+    # Parse dataset IDs if provided
+    eval_ids = None
+    if args.eval_ids:
+        eval_ids = str(args.eval_ids).split(',')
+        logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n')
+
+    instances = prepare_dataset(
+        aider_bench_tests,
+        output_file,
+        args.eval_n_limit,
+        eval_ids=eval_ids,
+        skip_num=SKIP_NUM,
+    )
+
+    run_evaluation(
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
    )
@@ -8,6 +8,7 @@ COMMIT_HASH=$2
 AGENT=$3
 EVAL_LIMIT=$4
 NUM_WORKERS=$5
+EVAL_IDS=$6

 if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
@@ -26,18 +27,34 @@ echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"

+EVAL_NOTE=$AGENT_VERSION
+
+# Default to NOT use unit tests.
+if [ -z "$USE_UNIT_TESTS" ]; then
+  export USE_UNIT_TESTS=false
+fi
+echo "USE_UNIT_TESTS: $USE_UNIT_TESTS"
+# If unit tests are used, append "-w-test" to EVAL_NOTE
+if [ "$USE_UNIT_TESTS" = true ]; then
+  EVAL_NOTE=$EVAL_NOTE-w-test
+fi
+
 COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run python evaluation/aider_bench/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+  --eval-note $EVAL_NOTE"

 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
 fi

+if [ -n "$EVAL_IDS" ]; then
+  echo "EVAL_IDS: $EVAL_IDS"
+  COMMAND="$COMMAND --eval-ids $EVAL_IDS"
+fi
+
 # Run the command
 eval $COMMAND
@@ -1,35 +1,68 @@
-import json
-import sys
+import argparse
+
+import numpy as np
+import pandas as pd


-def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
+def extract_test_results(df: pd.DataFrame) -> tuple[list[str], list[str]]:
    passed = []
    failed = []
-    with open(res_file_path, 'r') as file:
-        for line in file:
-            data = json.loads(line.strip())
-            instance_id = data['instance_id']
-            resolved = False
-            if 'test_result' in data and 'exit_code' in data['test_result']:
-                resolved = data['test_result']['exit_code'] == 0
-            if resolved:
-                passed.append(instance_id)
-            else:
-                failed.append(instance_id)
+    for _, row in df.iterrows():
+        instance_id = row['instance_id']
+        resolved = False
+        if 'test_result' in row and 'exit_code' in row['test_result']:
+            resolved = row['test_result']['exit_code'] == 0
+        if resolved:
+            passed.append(instance_id)
+        else:
+            failed.append(instance_id)
    return passed, failed


+def visualize_results(df: pd.DataFrame):
+    df1 = pd.DataFrame()
+    df1['cost'] = df['metrics'].apply(pd.Series)['accumulated_cost']
+    df1['result'] = (
+        df['test_result'].apply(pd.Series)['exit_code'].map({0: 'Pass', 1: 'Fail'})
+    )
+    df1['actions'] = pd.Series([len(a) - 1 for a in df['history']])
+
+    passed = np.sum(df1['result'] == 'Pass')
+    total = df.shape[0]
+    resolve_rate = round((passed / total) * 100, 2)
+
+    print('Number of passed tests:', f'{passed}/{total} {resolve_rate:.2f}%')
+    print('\nDescriptive statistics for number of actions:')
+    print(df1['actions'].describe())
+    print('\nDescriptive statistics for costs:')
+    print(df1['cost'].describe())
+
+    # Bin counts for actions
+    action_bins = pd.cut(df1['actions'], bins=range(0, 32, 2))
+    print('\nAction bin counts:')
+    print(action_bins.value_counts().sort_index())
+
+    # Bin counts for costs
+    cost_bins = pd.cut(df1['cost'], bins=10)
+    print('\nCost bin counts:')
+    print(cost_bins.value_counts().sort_index())
+
+    return resolve_rate
+
+
 if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print(
-            'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
-        )
-        sys.exit(1)
-    json_file_path = sys.argv[1]
-    passed_tests, failed_tests = extract_test_results(json_file_path)
-    succ_rate = len(passed_tests) / (len(passed_tests) + len(failed_tests))
+    parser = argparse.ArgumentParser(description='Summarize AiderBench results')
+    parser.add_argument('input_filepath', type=str, help='Path to the JSONL file')
+    args = parser.parse_args()
+
+    # Create DataFrame from JSONL file
+    df = pd.read_json(args.input_filepath, lines=True)
+
+    passed_tests, failed_tests = extract_test_results(df)
+    resolve_rate = visualize_results(df)
+
    print(
-        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
+        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {resolve_rate:.2f}%'
    )
    print('PASSED TESTS:')
    print(passed_tests)
@@ -27,9 +27,10 @@ from openhands.core.config import (
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
-from openhands.runtime.runtime import Runtime
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': functools.partial(
@@ -74,7 +75,7 @@ def get_config(
    return config


-async def initialize_runtime(
+def initialize_runtime(
    runtime: Runtime,
    instance: BiocoderData,  # this argument is not required
 ):
@@ -89,19 +90,19 @@ async def initialize_runtime(

    action = CmdRunAction(command='mkdir -p /workspace && mkdir -p /testing_files')
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    with tempfile.TemporaryDirectory() as tmpdir:
        context_path = os.path.join(tmpdir, 'context.' + file_ext)
        with open(context_path, 'w') as f:
            f.write(instance.contextCode)
-        await runtime.copy_to(context_path, '/testing_files')
+        runtime.copy_to(context_path, '/testing_files')

        golden_path = os.path.join(tmpdir, 'golden.' + file_ext)
        with open(golden_path, 'w') as f:
            f.write(instance.goldenCode)
-        await runtime.copy_to(golden_path, '/testing_files')
+        runtime.copy_to(golden_path, '/testing_files')

        testcase_json = {
            'test_case_id': instance.test_case_id,
@@ -112,36 +113,36 @@ async def initialize_runtime(
        with open(testcase_path, 'w') as f:
            f.write(json.dumps(testcase_json, indent=4))

-        await runtime.copy_to(testcase_path, '/testing_files')
+        runtime.copy_to(testcase_path, '/testing_files')

    # setup paths
    remove_code_script = os.path.join(
        os.path.dirname(__file__), 'scripts', 'setup', 'remove_code.py'
    )
-    await runtime.copy_to(remove_code_script, '/testing_files')
+    runtime.copy_to(remove_code_script, '/testing_files')

    action = CmdRunAction(command='cd /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    # download repository archive
    repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split('/')[1]}.zip"
    action = CmdRunAction(command='wget -O repo.zip ' + repository_url)
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    assert obs.exit_code == 0, f'Failed to download the repository: {obs.content}'

    # unzip the repository
    action = CmdRunAction(command='unzip -o -q repo.zip && rm repo.zip')
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    assert obs.exit_code == 0, f'Failed to unzip the repository: {obs.content}'

    # chmod 777
    action = CmdRunAction(command='chmod -R 777 /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    assert obs.exit_code == 0, f'Failed to chmod the files: {obs.content}'

    # remove code for evaluation instance
@@ -155,13 +156,13 @@ async def initialize_runtime(
        command=f'python3 /testing_files/remove_code.py --target_filepath {target_filepath} --line_start {line_start} --line_end {line_end} --language {language}'
    )
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    assert obs.exit_code == 0, f'Failed to remove the code: {obs.content}'

    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")


-async def complete_runtime(
+def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
 ) -> dict[str, Any]:
@@ -179,7 +180,7 @@ async def complete_runtime(
    copy_changed_code_script = os.path.join(
        os.path.dirname(__file__), 'scripts', 'setup', 'copy_changed_code.py'
    )
-    await runtime.copy_to(copy_changed_code_script, '/testing_files')
+    runtime.copy_to(copy_changed_code_script, '/testing_files')

    file_ext = FILE_EXT_MAP[instance.language.lower()]
    target_filepath = os.path.join(
@@ -191,13 +192,13 @@ async def complete_runtime(
        command=f'python3 /testing_files/copy_changed_code.py --target_filepath {target_filepath} --generated_code_filepath {generated_path} --line_start {instance.lineStart} --include_signature'
    )
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    if obs.exit_code == 0:
        test_result['metadata']['1_copy_change_success'] = True

        action = CmdRunAction(command=f'cat {generated_path}', keep_prompt=False)
        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = await runtime.run_action(action)
+        obs = runtime.run_action(action)
        assert obs.exit_code == 0

        code = obs.content
@@ -208,14 +209,14 @@ async def complete_runtime(

    action = CmdRunAction(command='cd /testing_files')
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    action = CmdRunAction(
        command='/home/openhands/mambaforge/bin/mamba run -n test python3 /testing/start_test_openhands.py'
    )
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

@@ -223,7 +224,7 @@ async def complete_runtime(
        command='cat /testing_files/results_biocoder.json', keep_prompt=False
    )
    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = await runtime.run_action(action)
+    obs = runtime.run_action(action)
    if obs.exit_code == 0:
        test_result['metadata']['2_run_test_success'] = True
        test_result['metadata']['2_run_test_result'] = str(obs.content)
@@ -237,7 +238,7 @@ async def complete_runtime(
    return test_result


-async def process_instance(
+def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
@@ -274,25 +275,26 @@ async def process_instance(
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

-    # use a session id for concurrent evaluation
-    sid = instance.instance_id.replace('/', '__')
-
-    runtime = await create_runtime(config, sid=sid)
-
-    await initialize_runtime(runtime, instance)
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+    initialize_runtime(runtime, instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State | None = await run_controller(
-        config=config,
-        task_str=instruction,
-        runtime=runtime,
-        fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[metadata.agent_class],
+    state: State | None = asyncio.run(
+        run_controller(
+            config=config,
+            initial_user_action=MessageAction(content=instruction),
+            runtime=runtime,
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                metadata.agent_class
+            ],
+        )
    )

    if state is None:
        raise ValueError('State should not be None.')

-    test_result = await complete_runtime(runtime, instance)
+    test_result = complete_runtime(runtime, instance)
    metrics = state.metrics.get() if state.metrics else None
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
@@ -340,8 +342,6 @@ if __name__ == '__main__':
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    instances = prepare_dataset(biocoder_tests, output_file, args.eval_n_limit)

-    asyncio.run(
-        run_evaluation(
-            instances, metadata, output_file, args.eval_num_workers, process_instance
-        )
+    run_evaluation(
+        instances, metadata, output_file, args.eval_num_workers, process_instance
    )
@@ -32,7 +32,6 @@ COMMAND="poetry run python evaluation/biocoder/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${DATASET}"

--- a/Show More
+++ b/Show More